#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Get the arxiv abstract data and PDF for a given arxiv id.
#
# Weitian LI <liweitianux@gmail.com>
# 2015/01/23
#
import sys
import re
import urllib.request
import subprocess
import time
from bs4 import BeautifulSoup
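
# arXiv mirror used to build the abstract/PDF URLs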
mirror = "http://jp.arxiv.org/"


def get_url(arxiv_id):
"""
Determine the full arxiv URL from the given ID/URL.
"""
if re.match(r'^[0-9]{7}$', arxiv_id):
print("ERROR: 7-digit ID not supported, please use the full URL")
sys.exit(2)
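    # New-style bare ID, e.g., "1501.01234": build the mirror /abs/ URL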
elif re.match(r'^[0-9]{4}\.[0-9]{4,5}$', arxiv_id):
arxiv_url = mirror + "abs/" + arxiv_id
    elif re.match(r'^https?://.*arxiv.*/([0-9]{7}|[0-9]{4}\.[0-9]{4,5})$',
                  arxiv_id):
arxiv_url = arxiv_id
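    # Same pattern but without the http(s):// scheme: prepend it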
elif re.match(r'[a-zA-Z0-9.-]*arxiv.*/([0-9]{7}|[0-9]{4}\.[0-9]{4,5})$',
arxiv_id):
arxiv_url = "http://" + arxiv_id
    else:
        print("ERROR: unknown arxiv ID: %s" % arxiv_id)
        sys.exit(3)
    return arxiv_url


def get_id(arxiv_url):
"""
Extract the ID from the URL.
"""
    return arxiv_url.split('/')[-1]


def get_arxiv_abstract(arxiv_url):
"""
Get the arxiv abstract data and save to file '${id}.txt'.
"""
    response = urllib.request.urlopen(arxiv_url)
    arxiv_html = response.read()
    soup = BeautifulSoup(arxiv_html, 'html.parser')
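    # Scrape the metadata blocks; the CSS classes below match the arXiv
    # abs-page layout as of this script's writing (2015)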
title = soup.body.find('h1', attrs={'class': 'title'}).text\
.replace('\n', ' ')
authors = soup.body.find('div', attrs={'class': 'authors'}).text\
.replace('\n', ' ')
date = soup.body.find('div', attrs={'class': 'dateline'}).text\
.strip('()')
abstract = soup.body.find('blockquote', attrs={'class': 'abstract'})\
.text.replace('\n', ' ')[1:]
    # The "Comments" field is optional, so guard against a missing tag
    comments_td = soup.body.find('td', attrs={'class': 'comments'})
    comments = comments_td.text if comments_td else ''
    subjects = soup.body.find('td', attrs={'class': 'subjects'}).text
arxiv_id = get_id(arxiv_url)
filename = arxiv_id + '.txt'
    with open(filename, 'w') as f:
        f.write("URL: %s\n" % arxiv_url)
        f.write("arXiv: %s\n" % arxiv_id)
        f.write("%s\n\n" % date)
        f.write("%s\n%s\n\n" % (title, authors))
        f.write("%s\n\n" % abstract)
        f.write("Comments: %s\n" % comments)
        f.write("Subjects: %s\n" % subjects)


def get_arxiv_pdf(arxiv_url):
"""
Get the arxiv PDF with cURL.
If the PDF is not generated yet, then retry after 10 seconds.
"""
p = re.compile(r'/abs/')
arxiv_pdf_url = p.sub('/pdf/', arxiv_url)
arxiv_id = get_id(arxiv_url)
filename = arxiv_id + '.pdf'
cmd = 'curl -o %(filename)s %(url)s' %\
{'filename': filename, 'url': arxiv_pdf_url}
print("CMD: %(cmd)s" % {'cmd': cmd})
subprocess.call(cmd, shell=True)
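    # arXiv serves an HTML placeholder until the PDF has been generated,
    # so check the MIME type with file(1) ("-i": MIME output, "-b": brief;
    # GNU file options) and retry until we actually have a PDF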
output = subprocess.check_output(['file', '-ib', filename])
filetype = output.decode(encoding='UTF-8').split(';')[0]
pdftype = 'application/pdf'
while filetype != pdftype:
time.sleep(10)
subprocess.call(cmd, shell=True)
output = subprocess.check_output(['file', '-ib', filename])
        filetype = output.decode(encoding='UTF-8').split(';')[0]


def main():
if len(sys.argv) != 2:
print("Usage: %s <arxiv_id | arxiv_url>\n")
sys.exit(1)
arxiv_url = get_url(sys.argv[1])
arxiv_id = get_id(arxiv_url)
print("arxiv_url: %s" % arxiv_url)
print("arxiv_id: %s" % arxiv_id)
get_arxiv_abstract(arxiv_url)
print("downloading pdf ...")
    get_arxiv_pdf(arxiv_url)


if __name__ == '__main__':
main()
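
# Usage sketch (script name and ID are illustrative):
#   $ python3 getarxiv.py 1501.01234
#   -> writes 1501.01234.txt (abstract) and 1501.01234.pdf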