cli/get_arxiv.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Get the arxiv abstract data and PDF for a given arxiv id.
#
# Weitian LI <liweitianux@gmail.com>
# 2015/01/23
#

import mimetypes
import re
import subprocess
import sys
import time
import urllib
import urllib.request

from bs4 import BeautifulSoup

# Base URL of the arXiv mirror.  get_url() builds abstract URLs as
# `mirror + "abs/" + <id>`, so the value must end with a trailing slash.
mirror = "http://jp.arxiv.org/"

def get_url(arxiv_id):
    """
    Determine the full arxiv URL from the given ID/URL.

    Accepted inputs:
      * a new-style ID (4 digits, a dot, 4-5 digits): expanded to
        `mirror + "abs/" + arxiv_id`;
      * a full http:// or https:// URL containing "arxiv" and ending in
        an ID: returned unchanged;
      * the same kind of URL without a scheme: prefixed with "http://".

    A bare 7-digit (old-style) ID is rejected because it cannot be turned
    into a URL without its archive name.

    Returns the abstract page URL as a string.
    Exits the process with status 2 (bare 7-digit ID) or 3 (unrecognized
    input) after printing an error message to stderr.
    """
    if re.match(r'^[0-9]{7}$', arxiv_id):
        print("ERROR: 7-digit ID not supported, please use the full URL",
              file=sys.stderr)
        sys.exit(2)
    elif re.match(r'^[0-9]{4}\.[0-9]{4,5}$', arxiv_id):
        arxiv_url = mirror + "abs/" + arxiv_id
    elif re.match(r'^https{0,1}://.*arxiv.*/([0-9]{7}|[0-9]{4}\.[0-9]{4,5})$',
            arxiv_id):
        arxiv_url = arxiv_id
    elif re.match(r'[a-zA-Z0-9.-]*arxiv.*/([0-9]{7}|[0-9]{4}\.[0-9]{4,5})$',
            arxiv_id):
        arxiv_url = "http://" + arxiv_id
    else:
        print("ERROR: unknown arxiv ID: %s" % arxiv_id, file=sys.stderr)
        # Use sys.exit() rather than the site-provided exit() builtin,
        # which is not guaranteed to exist (e.g. with `python -S`).
        sys.exit(3)

    return arxiv_url


def get_id(arxiv_url):
    """
    Return the last '/'-separated component of the URL, i.e. the arxiv ID.
    A string without any '/' is returned unchanged.
    """
    _head, _sep, tail = arxiv_url.rpartition('/')
    return tail


def _find_text(root, tag, css_class):
    """
    Return the text of the first `tag` element with the given class under
    `root`, or an empty string when no such element exists.
    """
    node = root.find(tag, attrs={'class': css_class})
    return node.text if node is not None else ''


def get_arxiv_abstract(arxiv_url):
    """
    Get the arxiv abstract data and save to file '${id}.txt'.

    Downloads the abstract page at `arxiv_url`, extracts the title,
    authors, dateline, abstract, comments and subjects, and writes them
    as UTF-8 text to '<id>.txt' in the current directory.  Fields that
    are absent from the page are written as empty strings.

    Raises whatever `urllib.request.urlopen` raises on network/HTTP
    failure and `OSError` if the output file cannot be written.
    """
    # Close the HTTP response deterministically once the body is read.
    with urllib.request.urlopen(arxiv_url) as response:
        arxiv_html = response.read()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(arxiv_html, 'html.parser')
    root = soup.body if soup.body is not None else soup

    title = _find_text(root, 'h1', 'title').replace('\n', ' ')
    authors = _find_text(root, 'div', 'authors').replace('\n', ' ')
    date = _find_text(root, 'div', 'dateline').strip('()')
    # Drop the first character of the flattened abstract text.
    abstract = _find_text(root, 'blockquote', 'abstract')\
            .replace('\n', ' ')[1:]
    comments = _find_text(root, 'td', 'comments')
    subjects = _find_text(root, 'td', 'subjects')

    arxiv_id = get_id(arxiv_url)
    filename = arxiv_id + '.txt'
    # Explicit UTF-8: author names and abstracts often contain non-ASCII
    # characters that a locale-dependent default encoding may reject.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write("URL: %s\n" % arxiv_url)
        f.write("arXiv: %s\n" % arxiv_id)
        f.write("%s\n\n" % date)
        f.write("%s\n%s\n\n" % (title, authors))
        f.write("%s\n\n" % abstract)
        f.write("Comments: %s\n" % comments)
        f.write("Subjects: %s\n" % subjects)


def get_arxiv_pdf(arxiv_url, retry_interval=10, max_retries=None):
    """
    Get the arxiv PDF with cURL.
    If the PDF is not generated yet, then retry after 10 seconds.

    The PDF URL is derived from `arxiv_url` by replacing '/abs/' with
    '/pdf/'; the result is saved as '<id>.pdf' in the current directory.
    After each download the file type is checked with `file -ib`; the
    download is repeated until the type is 'application/pdf'.

    Parameters:
      arxiv_url      -- URL of the abstract page.
      retry_interval -- seconds to wait between attempts (default 10).
      max_retries    -- maximum number of retries after the first
                        attempt; None (default) retries indefinitely.

    Raises RuntimeError when `max_retries` is exhausted without obtaining
    a PDF, and subprocess.CalledProcessError if `file` itself fails.
    """
    arxiv_pdf_url = re.sub(r'/abs/', '/pdf/', arxiv_url)
    arxiv_id = get_id(arxiv_url)
    filename = arxiv_id + '.pdf'
    # Argument list without a shell: the URL originates from the command
    # line and must not be interpreted by the shell.
    cmd = ['curl', '-o', filename, arxiv_pdf_url]
    print("CMD: %s" % ' '.join(cmd))
    pdftype = 'application/pdf'

    retries = 0
    while True:
        subprocess.call(cmd)
        output = subprocess.check_output(['file', '-ib', filename])
        # `file -ib` prints "<mime-type>; charset=..."; keep the type only.
        filetype = output.decode(encoding='UTF-8').split(';')[0]
        if filetype == pdftype:
            return
        if max_retries is not None and retries >= max_retries:
            raise RuntimeError(
                "failed to download a PDF from %s (got %s)"
                % (arxiv_pdf_url, filetype.strip()))
        retries += 1
        time.sleep(retry_interval)

def main():
    """
    Command-line entry point.

    Expects exactly one argument (an arxiv ID or URL), saves the abstract
    data to '<id>.txt' and then downloads the PDF to '<id>.pdf'.
    Prints a usage message to stderr and exits with status 1 when the
    argument count is wrong.
    """
    if len(sys.argv) != 2:
        # The program name must actually be substituted into the message.
        print("Usage: %s <arxiv_id | arxiv_url>\n" % sys.argv[0],
              file=sys.stderr)
        sys.exit(1)

    arxiv_url = get_url(sys.argv[1])
    arxiv_id = get_id(arxiv_url)
    print("arxiv_url: %s" % arxiv_url)
    print("arxiv_id: %s" % arxiv_id)
    get_arxiv_abstract(arxiv_url)
    print("downloading pdf ...")
    get_arxiv_pdf(arxiv_url)


# Run the command-line entry point only when executed as a script,
# not when this file is imported as a module.
if __name__ == '__main__':
    main()