author | Aaron LI <aly@aaronly.me> | 2017-08-10 14:38:06 +0800 |
---|---|---|
committer | Aaron LI <aly@aaronly.me> | 2017-08-10 14:38:06 +0800 |
commit | f34c91a6969567b23ad880dc43a0346cc5a5b513 (patch) | |
tree | bbd70bde6f1d22c357ecc65e320bb283d9ff8022 /cli | |
parent | 52e9ec5c2863e908710244491df2039e58291421 (diff) | |
download | atoolbox-f34c91a6969567b23ad880dc43a0346cc5a5b513.tar.bz2 |
Add get_arxiv.py to download PDF from arXiv
Diffstat (limited to 'cli')
-rwxr-xr-x | cli/get_arxiv.py | 119 |
1 files changed, 119 insertions, 0 deletions
diff --git a/cli/get_arxiv.py b/cli/get_arxiv.py
new file mode 100755
index 0000000..edbe320
--- /dev/null
+++ b/cli/get_arxiv.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Get the arXiv abstract data and PDF for a given arXiv ID.
+#
+# Weitian LI <liweitianux@gmail.com>
+# 2015/01/23
+#
+
+import sys
+import re
+import urllib.request
+import subprocess
+import time
+
+from bs4 import BeautifulSoup
+
+mirror = "http://jp.arxiv.org/"
+
+
+def get_url(arxiv_id):
+    """
+    Determine the full arXiv URL from the given ID/URL.
+    """
+    if re.match(r'^[0-9]{7}$', arxiv_id):
+        print("ERROR: 7-digit ID not supported, please use the full URL")
+        sys.exit(2)
+    elif re.match(r'^[0-9]{4}\.[0-9]{4,5}$', arxiv_id):
+        arxiv_url = mirror + "abs/" + arxiv_id
+    elif re.match(r'^https?://.*arxiv.*/([0-9]{7}|[0-9]{4}\.[0-9]{4,5})$',
+                  arxiv_id):
+        arxiv_url = arxiv_id
+    elif re.match(r'[a-zA-Z0-9.-]*arxiv.*/([0-9]{7}|[0-9]{4}\.[0-9]{4,5})$',
+                  arxiv_id):
+        arxiv_url = "http://" + arxiv_id
+    else:
+        print("ERROR: unknown arXiv ID: %s" % arxiv_id)
+        sys.exit(3)
+
+    return arxiv_url
+
+
+def get_id(arxiv_url):
+    """
+    Extract the ID from the URL.
+    """
+    return arxiv_url.split('/')[-1]
+
+
+def get_arxiv_abstract(arxiv_url):
+    """
+    Get the arXiv abstract data and save it to the file '<id>.txt'.
+    """
+    request = urllib.request.urlopen(arxiv_url)
+    arxiv_html = request.read()
+    soup = BeautifulSoup(arxiv_html, 'html.parser')
+    title = soup.body.find('h1', attrs={'class': 'title'}).text\
+            .replace('\n', ' ')
+    authors = soup.body.find('div', attrs={'class': 'authors'}).text\
+            .replace('\n', ' ')
+    date = soup.body.find('div', attrs={'class': 'dateline'}).text\
+            .strip('()')
+    abstract = soup.body.find('blockquote', attrs={'class': 'abstract'})\
+            .text.replace('\n', ' ')[1:]
+    comments = soup.body.find('td', attrs={'class': 'comments'}).text
+    subjects = soup.body.find('td', attrs={'class': 'subjects'}).text
+
+    arxiv_id = get_id(arxiv_url)
+    filename = arxiv_id + '.txt'
+    with open(filename, 'w') as f:
+        f.write("URL: %s\n" % arxiv_url)
+        f.write("arXiv: %s\n" % arxiv_id)
+        f.write("%s\n\n" % date)
+        f.write("%s\n%s\n\n" % (title, authors))
+        f.write("%s\n\n" % abstract)
+        f.write("Comments: %s\n" % comments)
+        f.write("Subjects: %s\n" % subjects)
+
+
+def get_arxiv_pdf(arxiv_url):
+    """
+    Get the arXiv PDF with cURL.
+    If the PDF is not generated yet, then retry after 10 seconds.
+    """
+    arxiv_pdf_url = re.sub(r'/abs/', '/pdf/', arxiv_url)
+    arxiv_id = get_id(arxiv_url)
+    filename = arxiv_id + '.pdf'
+    cmd = 'curl -o %(filename)s %(url)s' %\
+          {'filename': filename, 'url': arxiv_pdf_url}
+    print("CMD: %(cmd)s" % {'cmd': cmd})
+    subprocess.call(cmd, shell=True)
+    # Check the MIME type of the downloaded file; a non-PDF response
+    # means the PDF was not generated yet, so wait and retry.
+    output = subprocess.check_output(['file', '-ib', filename])
+    filetype = output.decode(encoding='UTF-8').split(';')[0]
+    pdftype = 'application/pdf'
+    while filetype != pdftype:
+        time.sleep(10)
+        subprocess.call(cmd, shell=True)
+        output = subprocess.check_output(['file', '-ib', filename])
+        filetype = output.decode(encoding='UTF-8').split(';')[0]
+
+
+def main():
+    if len(sys.argv) != 2:
+        print("Usage: %s <arxiv_id | arxiv_url>\n" % sys.argv[0])
+        sys.exit(1)
+
+    arxiv_url = get_url(sys.argv[1])
+    arxiv_id = get_id(arxiv_url)
+    print("arxiv_url: %s" % arxiv_url)
+    print("arxiv_id: %s" % arxiv_id)
+    get_arxiv_abstract(arxiv_url)
+    print("downloading pdf ...")
+    get_arxiv_pdf(arxiv_url)
+
+
+if __name__ == '__main__':
+    main()
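For reference, a sketch of how the new script would be invoked; the arXiv ID 1501.01234 below is a placeholder, not taken from this commit, and the output lines mirror the script's print statements:

    $ ./cli/get_arxiv.py 1501.01234
    arxiv_url: http://jp.arxiv.org/abs/1501.01234
    arxiv_id: 1501.01234
    downloading pdf ...
    CMD: curl -o 1501.01234.pdf http://jp.arxiv.org/pdf/1501.01234

On success this leaves two files in the current directory: 1501.01234.txt (title, authors, dateline, abstract, comments, and subjects scraped from the abstract page) and 1501.01234.pdf (fetched via cURL, re-requested every 10 seconds until the server returns a real PDF).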