-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathopenlibraryDownloader.py
executable file
·421 lines (349 loc) · 17.4 KB
/
openlibraryDownloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
#!/bin/python3
######################################################
# openlibraryDownloader
# Script to download all books from digi4school.at open library
#
# Author: Kippi
# Version: 1.2
######################################################
# --- BEGIN IMPORTS (DO NOT EDIT) ---
import subprocess
import requests
import argparse
import signal
import time
import os
from pyquery import PyQuery
from pathlib import Path
# --- END IMPORTS
# --- BEGIN CONFIG ---
# Directory the books are downloaded into. If left empty (and no
# -o/--output-directory option is given) the script asks for it at startup.
output_dir = ''
# Whether the downloaded files should additionally be combined into a PDF file
# (overridable with -g/--generate-pdf and -ng/--no-generate-pdf)
generate_pdf = False
# Whether extra files that already exist on disk and are non-empty are reused
# instead of being downloaded again (-u/--use-cache, -nu/--no-use-cache)
use_cache = True
# Whether the script waits for [ENTER] before starting the download
# (-i/--interactive, -ni/--no-interactive)
interactive = True
# How many times a failed book download is retried before the book is skipped
max_retries = 3
# How many seconds the script waits after an error before it retries
error_timeout = 30
# --- END CONFIG ---
# DO NOT EDIT BELOW THIS LINE UNLESS YOU KNOW WHAT YOU ARE DOING
def send_form(form_request):
    """Submit the HTML form contained in a response and return the new response.

    The first ``<form>`` found in ``form_request.text`` is read: its ``action``
    attribute is the POST target and every ``<input>`` contributes one
    name/value pair to the payload. Redirects are not followed, so the caller
    can inspect the cookies and headers of the direct answer.

    :param form_request: response object whose ``text`` holds the form markup
    :return: the response of the POST request, with its encoding set to the
        module-wide ``encoding``
    """
    form = PyQuery(form_request.text)('form')
    target_url = form.attr['action']
    payload = {
        field.attr['name']: field.attr['value']
        for field in form('input').items()
    }

    response = requests.post(target_url, payload, headers=headers, allow_redirects=False)
    response.encoding = encoding
    return response
def download_content(content_node, book_url, cookies, dl_directory):
    """Download the extra material referenced by a single ``<a>`` HTML node.

    The target file name is the text of the node's ``<h1>`` with every ``/``
    replaced by ``-``. If ``use_cache`` is enabled and a non-empty file of that
    name already exists, nothing is downloaded.

    :param content_node: PyQuery node of the ``<a>`` element; its ``href`` is
        appended to ``book_url`` to form the download URL
    :param book_url: base URL of the book the material belongs to
    :param cookies: cookies sent along with the download request
    :param dl_directory: directory the file is written into
    :raises Exception: whatever the request or the file write raised; a
        partially written file is removed before the error is re-raised
    """
    extra_file = os.path.join(dl_directory, content_node('h1').text().replace('/', '-'))
    if use_cache and os.path.isfile(extra_file) and os.path.getsize(extra_file) > 0:
        return

    try:
        # The context manager releases the streamed connection even on errors.
        with requests.get(book_url + content_node.attr['href'], headers=headers,
                          cookies=cookies, stream=True) as r:
            r.raise_for_status()
            with open(extra_file, 'wb') as f:
                for chunk in r.iter_content(chunk_size=16384):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
    except Exception:
        # Drop a partial download so the cache check never accepts it later.
        # The file may not exist yet (e.g. the request itself failed); removing
        # it unconditionally would replace the real error with FileNotFoundError.
        if os.path.isfile(extra_file):
            os.remove(extra_file)
        raise
def download_content_from_directory(content_tree, directory_id, book_url, cookies, base_dir):
    """Recursively download everything listed under one directory node.

    Files are the ``<a>`` nodes carrying the classes ``sub`` and the directory
    id but not ``directory``; they are downloaded into ``base_dir``. Nodes that
    also carry ``directory`` are sub-directories: a folder named after their
    ``<h1>`` text (``/`` replaced by ``-``) is created and processed the same way.

    :param content_tree: PyQuery object that is queried for the nodes
    :param directory_id: id of the directory, used as a CSS class in the selectors
    :param book_url: base URL handed on to ``download_content``
    :param cookies: cookies handed on to ``download_content``
    :param base_dir: local directory that receives the files of this level
    """
    # A '-' inside the id has to be escaped to be usable in the selector
    escaped_id = directory_id.replace('-', '\\-')
    file_selector = 'a:not(.directory).sub.' + escaped_id
    subdir_selector = 'a.directory.sub.' + escaped_id

    for file_node in content_tree(file_selector).items():
        download_content(file_node, book_url, cookies, base_dir)

    for subdir_node in content_tree(subdir_selector).items():
        subdir_name = subdir_node('h1').text().replace('/', '-')
        subdir_path = os.path.join(base_dir, subdir_name)
        Path(subdir_path).mkdir(parents=True, exist_ok=True)
        download_content_from_directory(content_tree, subdir_node.attr['id'], book_url, cookies, subdir_path)
def download_book(book_title, book_id, cookie_str, book_dir, book_response):
    """Append the publisher metadata to ``info.txt`` and run ``./digiRipper.sh``.

    The metadata is read from ``<meta>`` tags in the ``<head>`` of the book
    page and appended to ``info.txt`` inside ``book_dir`` (the directory is
    created if necessary). Afterwards the ripper script is started relative to
    the current working directory.

    :param book_title: title handed to the script via ``-n``
    :param book_id: book id handed to the script via ``-i``
    :param cookie_str: cookie string handed to the script via ``-c``
    :param book_dir: output directory handed to the script via ``-o``
    :param book_response: response holding the HTML of the book page
    :return: the ``subprocess.CompletedProcess`` of the script run
    """
    book_response.encoding = encoding
    Path(book_dir).mkdir(parents=True, exist_ok=True)
    head = PyQuery(book_response.text)('head')

    # (line format, selector of the <meta> tag that provides the value)
    metadata_lines = (
        (u"Publisher Homepage: %s", 'meta[name="publisherweb"]'),
        (u"Publisher Address: %s", 'meta[name="publisheradr"]'),
        (u"Publisher Phone Number: %s", 'meta[name="publishertel"]'),
        (u"Publisher E-Mail Address: %s", 'meta[name="publishermail"]'),
        (u"Meta Title: %s", 'meta[name="title"]'),
        (u"SBNr: %s", 'meta[name="sbnr"]'),
    )
    with open(os.path.join(book_dir, "info.txt"), "a", encoding=encoding) as info_file:
        for line_format, selector in metadata_lines:
            info_file.write(os.linesep + line_format % str(head(selector).attr['content']))

    # NOTE(review): without generate_pdf the script receives a bare 'g' (no
    # leading dash) - confirm against digiRipper.sh that this is intended.
    mode_argument = '-d' if generate_pdf else 'g'
    command = [
        './digiRipper.sh', '-s', mode_argument,
        '-n', book_title,
        '-i', book_id,
        '-c', cookie_str,
        '-o', book_dir,
    ]
    return subprocess.run(command)
def handle_usr1(signal_num, frame):
    """Signal handler: request a stop by setting the module-level ``stop`` flag.

    The handler itself does not terminate anything; the download loop checks
    the flag and calls ``stop_program`` when it is set.

    :param signal_num: number of the received signal (unused)
    :param frame: stack frame active when the signal arrived (unused)
    """
    global stop
    stop = True
def stop_program():
    """Announce the user-requested stop and terminate with exit status 0.

    :raises SystemExit: always, with code 0
    """
    print("Program stopped by user!")
    # exit() is a helper injected by the site module for interactive use and is
    # not guaranteed to exist; raising SystemExit is the portable equivalent.
    raise SystemExit(0)
def handle_error(error_msg, retry_count, retry_end):
    """Report an error and decide whether the caller gets another attempt.

    While ``retry_count`` does not exceed ``max_retries`` the function waits
    ``error_timeout`` seconds and returns ``retry_end + 1``, which extends the
    caller's retry loop by one iteration. Once the retries are used up,
    ``retry_end`` is returned unchanged so the loop runs out.

    :param error_msg: message printed before anything else
    :param retry_count: number of the attempt that just failed (1-based)
    :param retry_end: current upper bound of the caller's retry loop
    :return: the new upper bound for the caller's retry loop
    """
    print(error_msg)

    if retry_count > max_retries:
        print("Error after %d retries, skipping book..." % (retry_count - 1))
        return retry_end

    print("Waiting %d s and retrying (%d of %d)" % (error_timeout, retry_count, max_retries))
    time.sleep(error_timeout)
    return retry_end + 1
def _build_arg_parser():
    """Build the command-line parser; option defaults mirror the CONFIG globals."""
    parser = argparse.ArgumentParser(description='A downloader for the digi4school open library')
    parser.add_argument('-s', '--start', type=float, dest='start', default=0,
                        help='Start the download at START percent')
    parser.add_argument('-e', '--end', type=float, dest='end', default=100,
                        help='Stop the download at END percent')
    parser.add_argument('-o', '--output-directory', type=str, dest='output_dir', default=output_dir,
                        help='The directory into which the books should be downloaded')
    # The on/off switches default to None so that main() can tell "flag not
    # given" apart from an explicit choice and keep the CONFIG value then.
    parser.add_argument('-g', '--generate-pdf', action='store_true', dest='generate_pdf', default=None,
                        help='Generate a pdf when all files are downloaded')
    parser.add_argument('-ng', '--no-generate-pdf', action='store_false', dest='generate_pdf', default=None,
                        help='Do NOT generate a pdf when all files are downloaded')
    parser.add_argument('-u', '--use-cache', action='store_true', dest='use_cache', default=None,
                        help='Use already downloaded (cached) extra files')
    parser.add_argument('-nu', '--no-use-cache', action='store_false', dest='use_cache', default=None,
                        help='Download extra files again')
    parser.add_argument('-i', '--interactive', action='store_true', dest='interactive', default=None,
                        help='Prompt before starting download')
    parser.add_argument('-ni', '--no-interactive', action='store_false', dest='interactive', default=None,
                        help='Do not prompt before starting download, start right away')
    parser.add_argument('-m', '--max-retries', type=int, dest='max_retries', default=max_retries,
                        help='Retry downloading MAX_RETRIES times before skipping the book')
    parser.add_argument('-t', '--error-timeout', type=float, dest='error_timeout', default=error_timeout,
                        help='Wait ERROR_TIMEOUT seconds before retrying the download')
    return parser


def main():
    """Download every book of the open shelf into the output directory.

    Steps:
      1. Parse the command line and store the result in the CONFIG globals.
      2. Fetch the shelf page and collect the book links inside ``#shelf``.
      3. For each book inside the START..END percent window: write
         ``info.txt``, obtain the book cookie by submitting the two token
         forms, download extra material and extra books if the book page
         lists any, and finally run the ripper for the book itself.

    Each book is attempted in a retry loop whose upper bound ``end`` is
    extended by ``handle_error`` while retries remain. Setting the ``stop``
    flag (done by the SIGUSR1 handler) ends the program at the next check.
    """
    global output_dir, generate_pdf, use_cache, max_retries, error_timeout, interactive

    args = _build_arg_parser().parse_args()
    start = args.start
    end_percent = args.end
    output_dir = args.output_dir
    max_retries = args.max_retries
    error_timeout = args.error_timeout
    if args.generate_pdf is not None:
        generate_pdf = args.generate_pdf
    if args.use_cache is not None:
        use_cache = args.use_cache
    if args.interactive is not None:
        interactive = args.interactive

    if len(output_dir) < 1:
        output_dir = input("Output directory: ")

    Path(output_dir).mkdir(parents=True, exist_ok=True)

    signal.signal(signal.SIGUSR1, handle_usr1)

    r = requests.post(base_url + openshelf_path, get_all_books_payload, headers=headers)
    r.encoding = encoding
    books = PyQuery(r.text)('#shelf').find('a')
    book_count = len(books)

    print()
    print(str(book_count) + ' books')
    print("Output directory: " + output_dir)

    if interactive:
        input('Press [ENTER] to start the download')

    if start > 0:
        print("\nSkipping first %.2f %%..." % start)

    for index, book in enumerate(books.items(), start=1):
        if stop:
            stop_program()

        # Position of this book within the shelf, in percent
        percent = index * 100 / book_count

        if percent <= start:
            continue
        if percent > end_percent:
            print("Stopping at %.2f %% ..." % end_percent)
            break

        book_id = book.attr['data-id']
        title = book('h1').text().replace('/', '-')
        current_path = os.path.join(output_dir, book_id)
        Path(current_path).mkdir(parents=True, exist_ok=True)
        print('\n\nDownloading book "' + book_id + "\" (%.2f %%)" % percent)

        # Skip books that a previous run already completed
        if generate_pdf:
            if os.path.isfile(os.path.join(current_path, title + '.pdf')):
                print('Found existing PDF skipping...')
                continue
        else:
            if os.path.isfile(os.path.join(current_path, 'generate-pdf.sh')):
                print('Found PDF generation script, skipping...')
                continue

        # Writing info about book
        with open(os.path.join(current_path, 'info.txt'), 'w', encoding=encoding) as f:
            f.write(os.linesep.join([
                u"Thumbnail: %s" % str(book('img').attr['src']),
                u"Title: %s" % str(book('h1').text()),
                u"Publisher: %s" % str(book('h2').text())
            ]))

        if stop:
            stop_program()

        # Retry loop: one pass per attempt; handle_error() raises `end` by one
        # for every retry it grants, so the loop runs out once it stops doing so.
        count = 0
        end = 1
        orig_book_id = book_id
        while count < end:
            # book_id may have been rewritten by the previous attempt
            book_id = orig_book_id
            count += 1
            try:
                # The token page answers with two chained auto-submit forms
                cookie_request = send_form(send_form(requests.get(base_url + token_path + book_id, headers=headers)))
                cookie_str = ''
                for cookie in cookie_request.cookies:
                    if cookie.name == target_cookie_name:
                        cookie_str = cookie.name + '=' + cookie.value + '; '

                if len(cookie_str) < 1:
                    end = handle_error('ERROR: Cookie not found!', count, end)
                    continue

                if stop:
                    stop_program()

                # .get() so that a missing header reaches the fallback below
                # instead of raising KeyError.
                location = cookie_request.headers.get('Location', '')
                if not location:
                    location = book_base_url + book_path + book_id + '/'
                    print('WARNING: Can\'t find book location in header, assuming ' + location)

                r = requests.get(location, headers=headers, cookies=cookie_request.cookies, allow_redirects=False)
                r.encoding = encoding
                if r.status_code == 200:
                    # No viewer but a content listing: the page is an index of
                    # extra material (and possibly several books)
                    if 'IDRViewer' not in r.text and '<div id="content">' in r.text:
                        print('Found extra content!')
                        print('Downloading extra content...')

                        book_content = PyQuery(r.text)("#content")
                        extra_path = os.path.join(current_path, 'extra')
                        extra_books = []
                        Path(extra_path).mkdir(parents=True, exist_ok=True)

                        # Download root files
                        for node in book_content('a:not(.sub):not(.directory)').items():
                            if not str(node.attr['href']).startswith('1/'):
                                thumbnail_location = str(node('img').attr['src'])
                                # A relative page-1 thumbnail marks an entry that is a book itself
                                if thumbnail_location.endswith('thumbnails/1.jpg') and not thumbnail_location.startswith('http'):
                                    extra_books.append([str(node.attr['href']).replace('/index.html', ''), node('h1').text().replace('/', '-')])
                                else:
                                    download_content(node, location, cookie_request.cookies, extra_path)

                        if stop:
                            stop_program()

                        # Download content of all root directories
                        for root_dir_node in book_content('a:not(.sub).directory').items():
                            root_dir = os.path.join(extra_path, root_dir_node('h1').text().replace('/', '-'))
                            Path(root_dir).mkdir(parents=True, exist_ok=True)
                            download_content_from_directory(book_content, root_dir_node.attr['id'], location, cookie_request.cookies, root_dir)

                        print('Checking book_id ' + str(book_id) + '/1...')
                        r = requests.get(location + "1/", headers=headers, cookies=cookie_request.cookies)
                        if 'IDRViewer' not in r.text:
                            print('WARNING: Book "' + book_id + '/1" looks weird (contains no "IDRViewer"! Checking extra books...')
                            if len(extra_books) == 1:
                                # COMPATIBILITY
                                if os.path.exists(os.path.join(extra_path, extra_books[0][1])):
                                    print('INFO: For compatibility reasons, normal book download will be skipped!')
                                    book_id = None
                                else:
                                    # Treat the single extra book as the main book
                                    book_id = orig_book_id + '/' + extra_books[0][0]
                                    print('Found one extra book, setting book_id to ' + str(book_id))
                                    r = requests.get(location + extra_books[0][0] + '/', headers=headers, cookies=cookie_request.cookies)
                                    extra_books.pop(0)
                            elif len(extra_books) > 1:
                                print('WARNING: Found more than one extra book, skipping normal book download!')
                                book_id = None
                            else:
                                print('WARNING: Found no extra book, skipping normal book download!')
                                book_id = None
                        else:
                            book_id += '/1'
                            print('Setting book_id to ' + str(book_id))
                            r = requests.get(location + '1/', headers=headers, cookies=cookie_request.cookies)

                        for extra_book in extra_books:
                            if stop:
                                stop_program()

                            print('Downloading extra book "' + extra_book[0] + '"...')
                            er = requests.get(location + extra_book[0] + '/', headers=headers, cookies=cookie_request.cookies)
                            if 'IDRViewer' not in er.text:
                                print('WARNING: Extra book "' + extra_book[0] + '" looks weird (contains no "IDRViewer"! Skipping...')
                                continue

                            if download_book(extra_book[1], orig_book_id + '/' + extra_book[0], cookie_str, os.path.join(extra_path, extra_book[1]), er).returncode != 0:
                                # Only report here; the except clause below does
                                # the single handle_error() call for this attempt.
                                # Calling it here as well would wait twice and
                                # grant two retries per failure.
                                print('ERROR: Error running digiRipper!')
                                raise ConnectionError('Error downloading extra book "' + extra_book[0] + '"!')

                        print('Downloaded extra content to "' + extra_path + '"')
                else:
                    print('WARNING: Got wrong response code from book page, skipping check for extra material!')

                if stop:
                    stop_program()

                if book_id is not None:
                    if download_book(title, book_id, cookie_str, current_path, r).returncode != 0:
                        end = handle_error('ERROR: Error running digiRipper!', count, end)
                        continue
                else:
                    # Normal book skipped: leave the marker file that the
                    # "already done" check above looks for
                    Path(os.path.join(current_path, 'generate-pdf.sh')).touch()

            except Exception as e:
                end = handle_error('ERROR: An exception was thrown: ' + str(e), count, end)
# Site endpoints: shelf listing and token pages live below base_url
base_url = 'https://digi4school.at/'
openshelf_path = 'br/openshelf'
token_path = 'token/'

# Fallback book location, used when the token response carries no Location
book_base_url = 'https://a.digi4school.at/'
book_path = 'ebook/'

# Name of the cookie that is forwarded to the ripper script
target_cookie_name = 'digi4b'

# Encoding applied to responses and to the info.txt files
encoding = 'utf-8'

# Set to True by the SIGUSR1 handler to request a stop
stop = False

# Form payload posted to the shelf page to list all books
get_all_books_payload = {
    'title': '%%%',
    'publisher_id': '',
    'level_of_education': '',
}

# Headers sent with every request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:68.0) Gecko/20100101 Firefox/68.0',
    'Accept': '*/*',
}
# Start the downloader only when the file is executed as a script
if __name__ == '__main__':
    main()