-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathepub2html.py
executable file
·324 lines (259 loc) · 10.8 KB
/
epub2html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
#!/usr/bin/env python
# The MIT License (MIT)
#
# Copyright (c) 2013 Adrian Vollmer
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
version = "0.1"
# Convention: all *_dir and *_file variables must contain the full path to
# the directory or file they are referring to
import os
import sys
import subprocess
import argparse
from shutil import copyfile
from xml.dom.minidom import parseString, Document
search_dirs = [ os.path.dirname(os.path.realpath(__file__)) + "/",
os.path.expanduser("~") + '/.epub2html/',
'/usr/share/epub2html/' ]
def get_xml(filename):
"""Read an XML file and return the DOM tree."""
f = file(filename)
data = f.read()
f.close()
return parseString(data)
def parse_cli_options():
"""Process the command line arguments."""
parser = argparse.ArgumentParser(description="Convert an ebook from" +
"epub format to html", version="%prog version " + version)
parser.add_argument("-b", "--browser", dest="browser",
help="starts a browser to display the ebook")
parser.add_argument("-c", "--css", dest="css",
help="include a custom CSS file")
parser.add_argument("-j", "--jquery", dest="jquery",
help="include a custom jquery.js file")
parser.add_argument("-d", "--dropdown", dest="dropdown",
help="include a custom dropdown.js file")
parser.add_argument("epub", help="name of the .epub file")
return parser.parse_args()
def get_input_file(book_file, filename, path=None):
"""Copies a file from one of the default paths to the ebook path and returns the new path."""
if (path == None):
try:
input_file = [f + filename for f in search_dirs
if os.path.exists(f + filename )][0]
except IndexError:
print "File '" + filename + "' not found in either of these directories:"
for d in search_dirs:
print d
exit()
else:
input_file = path + filename
dest = os.path.dirname(book_file) + "/" + filename
copyfile(input_file, dest)
return filename
def get_opf(epub_dir):
"""Read the container XML file at epub_dir and return the path of the *.opf file."""
dom = get_xml(epub_dir + 'META-INF/container.xml')
xmlTag = dom.getElementsByTagName('rootfile')[0]
return xmlTag.attributes['full-path'].value
# Get meta info: Author, Title, ...
# TODO
def get_toc(opf_file):
"""Return the filename of the file containing the table of contents."""
dom = get_xml(opf_file)
xmlTag = dom.getElementsByTagName('manifest')[0].getElementsByTagName('item')
for node in xmlTag:
if (node.attributes['id'].value == "ncx"):
toc_file = node.attributes['href'].value
break
root_dir = os.path.dirname(opf_file) + "/"
return root_dir + toc_file
def extract_content(root_dir, opf_file):
dom = get_xml(opf_file)
xmlTag = dom.getElementsByTagName('package')[0].\
getElementsByTagName('spine')[0].getElementsByTagName('itemref')
spinelist = [node.attributes['idref'].value for node in xmlTag]
xmlTag = dom.getElementsByTagName('package')[0].\
getElementsByTagName('manifest')[0].getElementsByTagName('item')
manifest = {}
for node in xmlTag:
manifest[node.attributes['id'].value] = node.attributes['href'].value
content = []
for ref in spinelist:
chap = [root_dir + manifest[ref]]
xmlTag = get_xml(chap[0])
for node in xmlTag.getElementsByTagName('body')[0].childNodes:
chap.append(node)
content.append(chap)
return content
def get_chapter_list(epub_dir, root_dir, toc_file):
"""Build a list containing chapter names, their filenames, etc."""
table_of_contents = []
dom = get_xml(toc_file)
xmlTag = dom.getElementsByTagName('navMap')[0].getElementsByTagName('navPoint')
for node in xmlTag:
chapter_title = node.getElementsByTagName('navLabel')[0].\
getElementsByTagName('text')[0].childNodes[0].nodeValue
chapter_link = node.getElementsByTagName('content')[0].\
attributes['src'].value
if ('#' in chapter_link):
(chapter_file, chapter_anchor) = (root_dir + chapter_link).split('#')
else:
chapter_file = root_dir + chapter_link
chapter_anchor = ''
table_of_contents.append([chapter_title,
chapter_file, chapter_anchor])
return table_of_contents
# Compile chapters
# The structure is going to be:
#
# <html>
# <head>
# [insert header from the ebook (from the first file)]
# [insert own css]
# [insert own js]
# </head>
# <body>
# <div class="header">
# [TABLE OF CONTENTS dropdown menu]
# </div>
# <div class=content>
# [insert invidiual chapters from the ebook]
# </div></body></html>
def make_head(doc, args, table_of_contents, epub_dir, book_file):
"""Build the XML "head" tag."""
head_tag = doc.createElement('head')
# get the head of the second file (the first file is sometimes just
# a picture of the cover and doesn't have a CSS)
xmlTag = get_xml(table_of_contents[1][1])
head_tag = xmlTag.getElementsByTagName('head')[0]
# now attach css tag
css_tag = doc.createElement('link')
css_tag.attributes['type'] = 'text/css'
css_file = get_input_file(book_file, "default.css", args.css)
css_tag.attributes['href'] = css_file
css_tag.attributes['rel'] = "stylesheet"
head_tag.appendChild(css_tag)
# now attach jquery script tag
script1_tag = doc.createElement('script')
script1_tag.attributes['type'] = 'text/javascript'
jquery_file = get_input_file(book_file, "jquery.js", args.jquery)
script1_tag.attributes['src'] = jquery_file
script1_tag.appendChild(doc.createTextNode(' '))
head_tag.appendChild(script1_tag)
# now attach dropdown script tag
script2_tag = doc.createElement('script')
script2_tag.attributes['type'] = 'text/javascript'
dropdown_file = get_input_file(book_file, "dropdown.js", args.dropdown)
script2_tag.attributes['src'] = dropdown_file
script2_tag.appendChild(doc.createTextNode(' '))
head_tag.appendChild(script2_tag)
return head_tag
def make_body(doc, table_of_contents, contents):
"""Build the XML "body" tag."""
body_tag = doc.createElement('body')
# build a TOC menu
header_tag = doc.createElement('div')
header_content_tag = doc.createElement('div')
header_tag.attributes['class'] = 'header'
header_content_tag.attributes['class'] = 'header-cont'
ul1_tag = doc.createElement('ul')
ul1_tag.attributes['class'] = 'dropdown'
li1_tag = doc.createElement('li')
a1_tag = doc.createElement('a')
a1_tag.attributes['href'] = '#'
a1_tag.appendChild(doc.createTextNode('Contents'))
li1_tag.appendChild(a1_tag)
ul2_tag = doc.createElement('ul')
ul2_tag.attributes['class'] = 'sub_menu'
# add the links to each individual chapter to the TOC menu
for chap in table_of_contents:
li2_tag = doc.createElement('li')
a2_tag = doc.createElement('a')
if (len(chap[2]) > 0):
a2_tag.attributes['href'] = '#' + chap[2]
else:
a2_tag.attributes['href'] = '#' + chap[1] + chap[2]
a2_tag.appendChild(doc.createTextNode(chap[0]))
li2_tag.appendChild(a2_tag)
ul2_tag.appendChild(li2_tag)
li1_tag.appendChild(ul2_tag)
ul1_tag.appendChild(li1_tag)
header_content_tag.appendChild(ul1_tag)
header_tag.appendChild(header_content_tag)
body_tag.appendChild(header_tag)
# now insert the chapters
content_tag = doc.createElement('div')
content_tag.attributes['class'] = 'content'
# Fill in the content
inserted_chapters = []
for chap in contents:
# insert anchor here
anchor_tag = doc.createElement('a')
anchor_tag.attributes['id'] = chap[0]
content_tag.appendChild(anchor_tag)
# now append all the <p>'s and so on
for node in chap[1:]:
content_tag.appendChild(node)
body_tag.appendChild(content_tag)
return body_tag
def fix_links(doc):
all_links = doc.getElementsByTagName('a')
for link in all_links:
try:
link.attributes['href'].value = '#' + \
link.attributes['href'].value.split('#')[-1]
except KeyError, IndexError:
pass
def main():
args = parse_cli_options()
epub_file = args.epub
epub_dir = "epub-" + str(abs(hash(epub_file))) + "/"
# Extract files from the epub
subprocess.call(["unzip", "-qo", epub_file, "-d", epub_dir])
# Extract information from the epub files
opf_file = epub_dir + get_opf(epub_dir)
root_dir = os.path.dirname(opf_file) + "/"
toc_file = get_toc(opf_file)
contents = extract_content(root_dir, opf_file)
table_of_contents = get_chapter_list(epub_dir, root_dir, toc_file)
book_file = os.path.dirname(table_of_contents[0][1]) + "/book.html"
# lets hope all chapter files are in the same directory
# Now build the new document
doc = Document()
html_tag = doc.createElement('html')
doc.appendChild(html_tag)
html_tag.appendChild(make_head(doc, args, table_of_contents, epub_dir, book_file))
html_tag.appendChild(make_body(doc, table_of_contents, contents))
fix_links(doc)
# write DOM to file
with open(book_file, 'w') as f:
f.write(doc.toprettyxml().encode('utf8'))
# delete the old html files. they only take up space
for chap in contents:
try:
os.remove(chap[0])
except OSError:
pass
# pass the html page to the browser if one was specified
if (args.browser!=None):
subprocess.call([args.browser, book_file])
if __name__ == "__main__":
main()