author    Aaron LI <aly@aaronly.me>    2017-08-10 14:38:06 +0800
committer Aaron LI <aly@aaronly.me>    2017-08-10 14:38:06 +0800
commit    f34c91a6969567b23ad880dc43a0346cc5a5b513 (patch)
tree      bbd70bde6f1d22c357ecc65e320bb283d9ff8022
parent    52e9ec5c2863e908710244491df2039e58291421 (diff)
download  atoolbox-f34c91a6969567b23ad880dc43a0346cc5a5b513.tar.bz2
Add get_arxiv.py to download PDF from arXiv
-rwxr-xr-x  cli/get_arxiv.py  124
1 file changed, 124 insertions, 0 deletions
diff --git a/cli/get_arxiv.py b/cli/get_arxiv.py
new file mode 100755
index 0000000..edbe320
--- /dev/null
+++ b/cli/get_arxiv.py
+@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Get the arXiv abstract data and PDF for a given arXiv ID.
+#
+# Weitian LI <liweitianux@gmail.com>
+# 2015/01/23
+#
+
+import sys
+import re
+import urllib.request
+import subprocess
+import time
+
+from bs4 import BeautifulSoup
+
+mirror = "http://jp.arxiv.org/"
+
+def get_url(arxiv_id):
+ """
+ Determine the full arxiv URL from the given ID/URL.
+ """
+ if re.match(r'^[0-9]{7}$', arxiv_id):
+ print("ERROR: 7-digit ID not supported, please use the full URL")
+ sys.exit(2)
+ elif re.match(r'^[0-9]{4}\.[0-9]{4,5}$', arxiv_id):
+ arxiv_url = mirror + "abs/" + arxiv_id
+    elif re.match(r'^https?://.*arxiv.*/([0-9]{7}|[0-9]{4}\.[0-9]{4,5})$',
+ arxiv_id):
+ arxiv_url = arxiv_id
+ elif re.match(r'[a-zA-Z0-9.-]*arxiv.*/([0-9]{7}|[0-9]{4}\.[0-9]{4,5})$',
+ arxiv_id):
+ arxiv_url = "http://" + arxiv_id
+ else:
+ print("ERROR: unknown arxiv ID: %s" % arxiv_id)
+        sys.exit(3)
+
+ return arxiv_url
+
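+# Examples of get_url() behavior (the ID below is hypothetical, shown
+# for documentation only):
+#   "1501.00001"                       -> "http://jp.arxiv.org/abs/1501.00001"
+#   "https://arxiv.org/abs/1501.00001" -> returned unchanged
+#   "arxiv.org/abs/1501.00001"         -> "http://arxiv.org/abs/1501.00001"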
+
+def get_id(arxiv_url):
+ """
+ Extract the ID from the URL.
+ """
+ return arxiv_url.split('/')[-1]
+
+
+def get_arxiv_abstract(arxiv_url):
+ """
+ Get the arxiv abstract data and save to file '${id}.txt'.
+ """
+ request = urllib.request.urlopen(arxiv_url)
+ arxiv_html = request.read()
+    soup = BeautifulSoup(arxiv_html, 'html.parser')
+ title = soup.body.find('h1', attrs={'class': 'title'}).text\
+ .replace('\n', ' ')
+ authors = soup.body.find('div', attrs={'class': 'authors'}).text\
+ .replace('\n', ' ')
+ date = soup.body.find('div', attrs={'class': 'dateline'}).text\
+ .strip('()')
+ abstract = soup.body.find('blockquote', attrs={'class': 'abstract'})\
+ .text.replace('\n', ' ')[1:]
+ comments = soup.body.find('td', attrs={'class': 'comments'}).text
+ subjects = soup.body.find('td', attrs={'class': 'subjects'}).text
+
+ arxiv_id = get_id(arxiv_url)
+ filename = arxiv_id + '.txt'
+    with open(filename, 'w') as f:
+        f.write("URL: %s\n" % arxiv_url)
+        f.write("arXiv: %s\n" % arxiv_id)
+        f.write("%s\n\n" % date)
+        f.write("%s\n%s\n\n" % (title, authors))
+        f.write("%s\n\n" % abstract)
+        f.write("Comments: %s\n" % comments)
+        f.write("Subjects: %s\n" % subjects)
+
+
+def get_arxiv_pdf(arxiv_url):
+ """
+ Get the arxiv PDF with cURL.
+ If the PDF is not generated yet, then retry after 10 seconds.
+ """
+    arxiv_pdf_url = arxiv_url.replace('/abs/', '/pdf/')
+    arxiv_id = get_id(arxiv_url)
+    filename = arxiv_id + '.pdf'
+    cmd = ['curl', '-o', filename, arxiv_pdf_url]
+    print("CMD: %s" % ' '.join(cmd))
+    subprocess.call(cmd)
+    # The downloaded file may not be a real PDF if arXiv has not yet
+    # finished generating it; check the MIME type with file(1).
+    output = subprocess.check_output(['file', '-ib', filename])
+    filetype = output.decode(encoding='UTF-8').split(';')[0]
+    pdftype = 'application/pdf'
+    while filetype != pdftype:
+        # Not a PDF yet: wait a bit, then retry the download.
+        time.sleep(10)
+        subprocess.call(cmd)
+        output = subprocess.check_output(['file', '-ib', filename])
+        filetype = output.decode(encoding='UTF-8').split(';')[0]
+
+
+def main():
+ if len(sys.argv) != 2:
+        print("Usage: %s <arxiv_id | arxiv_url>\n" % sys.argv[0])
+ sys.exit(1)
+
+ arxiv_url = get_url(sys.argv[1])
+ arxiv_id = get_id(arxiv_url)
+ print("arxiv_url: %s" % arxiv_url)
+ print("arxiv_id: %s" % arxiv_id)
+ get_arxiv_abstract(arxiv_url)
+ print("downloading pdf ...")
+ get_arxiv_pdf(arxiv_url)
+
+
+if __name__ == '__main__':
+ main()
+
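
For reference, a typical run of the new script might look like the
following (the arXiv ID is hypothetical); it saves the abstract to
1501.00001.txt and the paper to 1501.00001.pdf in the current directory:

    $ ./cli/get_arxiv.py 1501.00001
    arxiv_url: http://jp.arxiv.org/abs/1501.00001
    arxiv_id: 1501.00001
    downloading pdf ...
    CMD: curl -o 1501.00001.pdf http://jp.arxiv.org/pdf/1501.00001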