修订版 | 2d227eed6e79d0e063b7ffabe8b0dd27cb7bcfdc (tree) |
---|---|
时间 | 2009-04-02 00:31:35 |
作者 | iselllo |
Committer | iselllo |
A useful script to download arxiv preprints or to see the abstract or the authors. Try it with -h
to see the help.
@@ -0,0 +1,252 @@ | ||
1 | +#! /usr/bin/python | |
2 | + | |
3 | +## arXiv script version 0.2 | |
4 | + | |
5 | +## Copyright 2008 Tom Brown | |
6 | + | |
7 | +## This program is free software; you can redistribute it and/or | |
8 | +## modify it under the terms of the GNU General Public License as | |
9 | +## published by the Free Software Foundation; either version 3 of the | |
10 | +## License, or (at your option) any later version. | |
11 | + | |
12 | +## This program is distributed in the hope that it will be useful, | |
13 | +## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | +## GNU General Public License for more details. | |
16 | + | |
17 | +## You should have received a copy of the GNU General Public License | |
18 | +## along with this program. If not, see <http://www.gnu.org/licenses/>. | |
19 | + | |
20 | +## See http://www.stringwiki.org/wiki/ArXiv_script for more usage | |
21 | +## instructions | |
22 | + | |
23 | +'''arXiv script | |
24 | +Usage: | |
25 | +python arxiv.py reference [ -htabcjdps ] [ --help ] | |
26 | +"reference" must be a standard arXiv reference, e.g. hep-th/9711200, 0705.0303. | |
27 | +Options: | |
28 | +-h, --help | |
29 | +displays this help message | |
30 | +-t | |
31 | +displays the title | |
32 | +-a | |
33 | +displays the author(s) | |
34 | +-b | |
35 | +displays the aBstract | |
36 | +-c | |
37 | +displays the comments | |
38 | +-j | |
39 | +displays the journal reference | |
40 | +-d | |
41 | +downloads the PDF | |
42 | +-p | |
43 | +downloads the PS | |
44 | +-s | |
45 | +downloads the source file | |
46 | +''' | |
47 | + | |
48 | +__version__ = "0.2" | |
49 | +__author__ = "Tom Brown" | |
50 | +__copyright__ = "Copyright 2008 Tom Brown, GNU GPL 3" | |
51 | + | |
52 | + | |
53 | +import sys, os, getopt, re, urllib,gzip | |
54 | + | |
55 | + | |
def findRefType(ref):
    '''Classify an arXiv reference and return it in normalised form.

    Parameters:
        ref -- the reference as typed by the user, optionally carrying an
               "arXiv:" prefix in any letter case.

    Returns:
        A tuple (refType, ref).  refType is one of 'old-style eprint',
        'new-style eprint' or 'not arXiv'.  ref is the reference with the
        "arXiv:" prefix removed; a bare seven-digit number is assumed to
        belong to hep-th and gets 'hep-th/' prepended.
    '''
    # The canonical prefix is spelled "arXiv:", so the match must ignore case.
    ref = re.sub(r'(?i)arxiv:', '', ref)

    if re.search(r'^[a-zA-Z\-]+(\.[A-Z]{2})?/\d{7}$', ref):
        # archive/YYMMNNN, optionally with a subject class: math.GT/0309136
        refType = 'old-style eprint'
    elif re.search(r'^\d{7}$', ref):
        # Bare old-style number: default to the hep-th archive.
        refType = 'old-style eprint'
        ref = 'hep-th/' + ref
    elif re.search(r'^\d{4}\.\d{4,5}$', ref):
        # YYMM.NNNN (2007-2014) or YYMM.NNNNN (2015 onwards)
        refType = 'new-style eprint'
    else:
        refType = 'not arXiv'

    return refType, ref
69 | + | |
70 | + | |
71 | + | |
72 | + | |
def downloadPDF(ref, type, downloadPath):
    '''Download the PDF of an eprint into downloadPath.

    Parameters:
        ref          -- normalised arXiv reference, e.g. 'hep-th/9711200'.
        type         -- reference type as returned by findRefType(); nothing
                        is downloaded unless it is an old- or new-style eprint.
        downloadPath -- target directory ('' for the current directory);
                        a leading '~' is expanded.

    The file is saved as <ref>.pdf with any '/' in ref replaced by '-'.
    '''
    # urlretrieve moved to urllib.request in Python 3.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    if type not in ('old-style eprint', 'new-style eprint'):
        return

    downloadPath = os.path.expanduser(downloadPath)
    # New-style references contain no '/', so one filename rule covers both
    # kinds.  os.path.join supplies the separator when downloadPath lacks one.
    target = os.path.join(downloadPath, ref.replace('/', '-') + '.pdf')
    urlretrieve('http://arxiv.org/pdf/' + ref, target)
79 | + | |
80 | + | |
def downloadPS(ref, type, downloadPath):
    '''Download the gzipped PostScript of an eprint and unpack it.

    Parameters:
        ref          -- normalised arXiv reference, e.g. 'hep-th/9711200'.
        type         -- reference type; accepted for symmetry with
                        downloadPDF() but not used here.
        downloadPath -- target directory ('' for the current directory);
                        a leading '~' is expanded.

    The result is saved as <ref>.ps with any '/' in ref replaced by '-'.
    The compressed download is stored temporarily under the same name
    without extension and is removed afterwards, even if unpacking fails.
    '''
    import shutil
    # urlretrieve moved to urllib.request in Python 3.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    downloadPath = os.path.expanduser(downloadPath)
    filename = os.path.join(downloadPath, ref.replace('/', '-'))
    urlretrieve('http://arxiv.org/ps/' + ref, filename)
    try:
        gzipFile = gzip.GzipFile(filename, 'rb')
        try:
            # Binary mode: the unpacked data are bytes, and text mode would
            # corrupt them on platforms that translate line endings.
            with open(filename + ".ps", "wb") as psFile:
                shutil.copyfileobj(gzipFile, psFile)
        finally:
            gzipFile.close()
    finally:
        os.remove(filename)
91 | + | |
def downloadSource(ref, type, downloadPath):
    '''Download the gzipped source of an eprint and unpack it.

    Parameters:
        ref          -- normalised arXiv reference, e.g. 'hep-th/9711200'.
        type         -- reference type; accepted for symmetry with
                        downloadPDF() but not used here.
        downloadPath -- target directory ('' for the current directory);
                        a leading '~' is expanded.

    The unpacked data are saved as <ref> (no extension) with any '/' in ref
    replaced by '-'.  The compressed download is stored temporarily as
    <ref>.dum and is removed afterwards, even if unpacking fails.
    '''
    import shutil
    # urlretrieve moved to urllib.request in Python 3.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    downloadPath = os.path.expanduser(downloadPath)
    filename = os.path.join(downloadPath, ref.replace('/', '-'))
    tempName = filename + ".dum"
    urlretrieve('http://arxiv.org/e-print/' + ref, tempName)
    try:
        gzipFile = gzip.GzipFile(tempName, 'rb')
        try:
            # Binary mode: the unpacked source may itself be a tar archive.
            with open(filename, "wb") as sourceFile:
                shutil.copyfileobj(gzipFile, sourceFile)
        finally:
            gzipFile.close()
    finally:
        os.remove(tempName)
102 | + | |
103 | + | |
def getTitle(html):
    '''Extract the paper title from the HTML of an arXiv abstract page.

    The title is the text between the '>Title:</span>' marker and the next
    '</h1>', with surrounding whitespace removed.  Returns '' when the
    marker is not present.
    '''
    marker = ">Title:</span>"
    start = html.find(marker)
    if start == -1:
        return ""
    title = html[start + len(marker):]
    end = title.find("</h1>")
    if end != -1:
        title = title[:end]
    # Strip instead of skipping a fixed one-character separator, so the
    # result is right whether or not whitespace follows the marker.
    return title.strip()
108 | + | |
109 | + | |
def getAuthors(html):
    '''Extract the author list from the HTML of an arXiv abstract page.

    Takes the text between the '>Authors:</span>' marker and the next
    '</div>', drops all HTML tags and newlines, and returns what is left
    (normally the author names separated by commas).  Returns '' when the
    marker is not present.
    '''
    marker = ">Authors:</span>"
    start = html.find(marker)
    if start == -1:
        return ""
    authors = html[start + len(marker):]

    # Limit the search to the authors block first, so that a '">' from
    # some later element can never be mistaken for the first author link.
    end = authors.find("</div>")
    if end != -1:
        authors = authors[:end]

    # Skip to the end of the first opening tag if the names are links.
    firstLink = authors.find("\">")
    if firstLink != -1:
        authors = authors[firstLink + 2:]

    authors = re.sub(r'<[^>]*>', '', authors)
    return authors.replace("\n", "").strip()
117 | + | |
118 | + | |
def getAbstract(html):
    '''Extract the abstract from the HTML of an arXiv abstract page.

    The abstract is the text between the 'Abstract:</span>' marker and the
    next '</blockquote>', with surrounding whitespace removed (line breaks
    inside the abstract are kept).  Returns '' when the marker is not
    present.
    '''
    marker = "Abstract:</span>"
    start = html.find(marker)
    if start == -1:
        return ""
    abstract = html[start + len(marker):]
    end = abstract.find("</blockquote>")
    if end != -1:
        abstract = abstract[:end]
    # Strip instead of trimming a fixed one character at each end.
    return abstract.strip()
123 | + | |
def getComments(html):
    '''Extract the comments field from the HTML of an arXiv abstract page.

    Returns the text between the first 'comments">' and the following
    '</td>', or the string "no comments" when the page has no such cell.
    '''
    marker = "comments\">"
    start = html.find(marker)
    # Test for the exact marker that is sliced on: the bare word "comments"
    # can occur elsewhere in the page without a comments cell being present.
    if start == -1:
        return "no comments"
    comments = html[start + len(marker):]
    end = comments.find("</td>")
    if end != -1:
        comments = comments[:end]
    return comments
131 | + | |
132 | + | |
def getJref(html):
    '''Extract the journal reference from the HTML of an arXiv abstract page.

    Returns the text between the first 'jref">' and the following '</td>',
    or the string "no journal reference" when the page has no such cell.
    '''
    marker = "jref\">"
    start = html.find(marker)
    # Test for the exact marker that is sliced on: the bare string "jref"
    # can occur elsewhere in the page without a journal-reference cell.
    if start == -1:
        return "no journal reference"
    jref = html[start + len(marker):]
    end = jref.find("</td>")
    if end != -1:
        jref = jref[:end]
    return jref
140 | + | |
141 | + | |
142 | + | |
143 | + | |
def main(argv=None):
    '''Run the command-line interface.

    Parameters:
        argv -- list of command-line arguments without the program name;
                defaults to sys.argv[1:].

    Returns:
        The process exit status: 0 on success (including -h/--help),
        2 for a usage error or a reference that is not of arXiv form.

    With no options at all, title, authors, abstract, comments and journal
    reference are printed.  Otherwise only what was asked for is printed
    or downloaded (downloads go to the current directory).
    '''
    # urlopen moved to urllib.request in Python 3.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    if argv is None:
        argv = sys.argv[1:]

    try:
        # NOTE: 'v' is accepted for backward compatibility but has no effect.
        options, arguments = getopt.gnu_getopt(argv, 'hatbcjdpsv', ['help'])
    except getopt.error:
        sys.stderr.write('error: you tried to use an unknown option or the argument for an option that requires it was missing; try \'arxiv.py -h\' for more information\n')
        return 2

    optionNames = {
        '-a': 'authors',
        '-t': 'title',
        '-b': 'abstract',
        '-c': 'comments',
        '-j': 'jref',
        '-d': 'pdf',
        '-p': 'ps',
        '-s': 'source',
    }
    selected = set()
    for o, a in options:
        if o in ('-h', '--help'):
            print(__doc__)
            return 0
        if o in optionNames:
            selected.add(optionNames[o])

    # No options at all: show every piece of metadata.
    if not options:
        selected.update(['authors', 'title', 'abstract', 'comments', 'jref'])

    if not arguments:
        sys.stderr.write('you didn\'t specify an arXiv reference; try \'arxiv.py -h\' for more information\n')
        return 2
    if len(arguments) > 1:
        sys.stderr.write('error: expected exactly one arXiv reference; try \'arxiv.py -h\' for more information\n')
        return 2

    refType, ref = findRefType(arguments[0])
    if refType == "not arXiv":
        sys.stderr.write("type not of arXiv form\n")
        return 2

    # Output order is fixed, independent of the order of the options.
    extractors = [
        ('title', getTitle),
        ('authors', getAuthors),
        ('abstract', getAbstract),
        ('comments', getComments),
        ('jref', getJref),
    ]
    wanted = [extract for name, extract in extractors if name in selected]
    if wanted:
        # Fetch the abstract page once and reuse it for every field.
        htmlObject = urlopen('http://arxiv.org/abs/' + ref)
        try:
            html = htmlObject.read()
        finally:
            htmlObject.close()
        # Python 3 returns bytes; the extractors work on text.
        if not isinstance(html, str):
            html = html.decode('utf-8', 'replace')
        for extract in wanted:
            print(extract(html))

    if 'pdf' in selected:
        downloadPDF(ref, refType, "")

    if 'ps' in selected:
        downloadPS(ref, refType, "")

    if 'source' in selected:
        downloadSource(ref, refType, "")

    return 0


if __name__ == "__main__":
    sys.exit(main())
252 | + |