author    Aaron LI <aly@aaronly.me>    2017-08-10 14:38:06 +0800
committer Aaron LI <aly@aaronly.me>    2017-08-10 14:38:06 +0800
commit    f34c91a6969567b23ad880dc43a0346cc5a5b513 (patch)
tree      bbd70bde6f1d22c357ecc65e320bb283d9ff8022
parent    52e9ec5c2863e908710244491df2039e58291421 (diff)
download  atoolbox-f34c91a6969567b23ad880dc43a0346cc5a5b513.tar.bz2
Add get_arxiv.py to download PDF from arXiv
-rwxr-xr-x  cli/get_arxiv.py  124
1 file changed, 124 insertions, 0 deletions
diff --git a/cli/get_arxiv.py b/cli/get_arxiv.py
new file mode 100755
index 0000000..edbe320
--- /dev/null
+++ b/cli/get_arxiv.py
+@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Get the arXiv abstract data and PDF for a given arXiv ID.
+#
+# Weitian LI <liweitianux@gmail.com>
+# 2015/01/23
+#
+
+import sys
+import re
+import urllib.request
+import subprocess
+import time
+
+from bs4 import BeautifulSoup
+
+mirror = "http://jp.arxiv.org/"
+
+def get_url(arxiv_id):
+ """
+ Determine the full arxiv URL from the given ID/URL.
+ """
+ if re.match(r'^[0-9]{7}$', arxiv_id):
+ print("ERROR: 7-digit ID not supported, please use the full URL")
+ sys.exit(2)
+ elif re.match(r'^[0-9]{4}\.[0-9]{4,5}$', arxiv_id):
+ arxiv_url = mirror + "abs/" + arxiv_id
+    elif re.match(r'^https?://.*arxiv.*/([0-9]{7}|[0-9]{4}\.[0-9]{4,5})$',
+ arxiv_id):
+ arxiv_url = arxiv_id
+ elif re.match(r'[a-zA-Z0-9.-]*arxiv.*/([0-9]{7}|[0-9]{4}\.[0-9]{4,5})$',
+ arxiv_id):
+ arxiv_url = "http://" + arxiv_id
+ else:
+ print("ERROR: unknown arxiv ID: %s" % arxiv_id)
+        sys.exit(3)
+
+ return arxiv_url
+
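+# Examples of get_url() behavior (the ID below is hypothetical, shown
+# for documentation only):
+#   "1501.00001"                       -> "http://jp.arxiv.org/abs/1501.00001"
+#   "https://arxiv.org/abs/1501.00001" -> returned unchanged
+#   "arxiv.org/abs/1501.00001"         -> "http://arxiv.org/abs/1501.00001"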
+
+def get_id(arxiv_url):
+ """
+ Extract the ID from the URL.
+ """
+ return arxiv_url.split('/')[-1]
+
+
+def get_arxiv_abstract(arxiv_url):
+ """
+ Get the arxiv abstract data and save to file '${id}.txt'.
+ """
+ request = urllib.request.urlopen(arxiv_url)
+ arxiv_html = request.read()
+    soup = BeautifulSoup(arxiv_html, 'html.parser')
+ title = soup.body.find('h1', attrs={'class': 'title'}).text\
+ .replace('\n', ' ')
+ authors = soup.body.find('div', attrs={'class': 'authors'}).text\
+ .replace('\n', ' ')
+ date = soup.body.find('div', attrs={'class': 'dateline'}).text\
+ .strip('()')
+ abstract = soup.body.find('blockquote', attrs={'class': 'abstract'})\
+ .text.replace('\n', ' ')[1:]
+ comments = soup.body.find('td', attrs={'class': 'comments'}).text
+ subjects = soup.body.find('td', attrs={'class': 'subjects'}).text
+
+ arxiv_id = get_id(arxiv_url)
+ filename = arxiv_id + '.txt'
+    with open(filename, 'w') as f:
+        f.write("URL: %s\n" % arxiv_url)
+        f.write("arXiv: %s\n" % arxiv_id)
+        f.write("%s\n\n" % date)
+        f.write("%s\n%s\n\n" % (title, authors))
+        f.write("%s\n\n" % abstract)
+        f.write("Comments: %s\n" % comments)
+        f.write("Subjects: %s\n" % subjects)
+
+
+def get_arxiv_pdf(arxiv_url):
+ """
+ Get the arxiv PDF with cURL.
+ If the PDF is not generated yet, then retry after 10 seconds.
+ """
+    arxiv_pdf_url = arxiv_url.replace('/abs/', '/pdf/')
+    arxiv_id = get_id(arxiv_url)
+    filename = arxiv_id + '.pdf'
+    cmd = ['curl', '-o', filename, arxiv_pdf_url]
+    print("CMD: %s" % ' '.join(cmd))
+    subprocess.call(cmd)
+    # The downloaded file may not be a real PDF if arXiv has not yet
+    # finished generating it; check the MIME type with file(1).
+    output = subprocess.check_output(['file', '-ib', filename])
+    filetype = output.decode(encoding='UTF-8').split(';')[0]
+    pdftype = 'application/pdf'
+    while filetype != pdftype:
+        # Not a PDF yet: wait a bit, then retry the download.
+        time.sleep(10)
+        subprocess.call(cmd)
+        output = subprocess.check_output(['file', '-ib', filename])
+        filetype = output.decode(encoding='UTF-8').split(';')[0]
+
+
+def main():
+ if len(sys.argv) != 2:
+        print("Usage: %s <arxiv_id | arxiv_url>\n" % sys.argv[0])
+ sys.exit(1)
+
+ arxiv_url = get_url(sys.argv[1])
+ arxiv_id = get_id(arxiv_url)
+ print("arxiv_url: %s" % arxiv_url)
+ print("arxiv_id: %s" % arxiv_id)
+ get_arxiv_abstract(arxiv_url)
+ print("downloading pdf ...")
+ get_arxiv_pdf(arxiv_url)
+
+
+if __name__ == '__main__':
+ main()
+
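
For reference, a typical run of the new script might look like the
following (the arXiv ID is hypothetical); it saves the abstract to
1501.00001.txt and the paper to 1501.00001.pdf in the current directory:

    $ ./cli/get_arxiv.py 1501.00001
    arxiv_url: http://jp.arxiv.org/abs/1501.00001
    arxiv_id: 1501.00001
    downloading pdf ...
    CMD: curl -o 1501.00001.pdf http://jp.arxiv.org/pdf/1501.00001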