-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathcollection2xhtml.py
executable file
·120 lines (94 loc) · 3.84 KB
/
collection2xhtml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# Usage: python collection2xhtml.py ./tests -o result.xhtml
import sys
import os
import Image
from StringIO import StringIO
from tempfile import mkdtemp
import subprocess
from lxml import etree
import urllib2
import module2dbk
import collection2dbk
import util
# Debug mode is enabled by the mere presence of a DEBUG environment variable;
# its value is not inspected.
DEBUG = 'DEBUG' in os.environ
# Working directory, captured once when this module is imported.
BASE_PATH = os.getcwd()

# XSL files
DOCBOOK2XHTML_XSL = util.makeXsl('dbk2xhtml.xsl')
DOCBOOK_CLEANUP_XSL = util.makeXsl('dbk-clean-whole.xsl')

# Precompiled XPath: the @document attribute of every col:module element.
MODULES_XPATH = etree.XPath('//col:module/@document', namespaces=util.NAMESPACES)
# Precompiled XPath: every @src attribute on a c:* element whose value does
# not start with "http:".
# NOTE(review): "https:" values are not excluded by this test -- confirm
# whether that is intended.
IMAGES_XPATH = etree.XPath('//c:*/@src[not(starts-with(.,"http:"))]', namespaces=util.NAMESPACES)
def __doStuff(dir):
    """ Load the collection stored in `dir` and convert it to XHTML.

    Reads `dir`/collection.xml, loads every module it references whose
    sub-directory exists (referenced modules without a directory are skipped
    after the log line), turns the collection into Docbook with
    collection2dbk.convert() and passes the result to convert().

    Returns whatever convert() returns: an (xhtml, files) tuple, where the
    file names of module files are prefixed with their module id.
    """
    collection_doc = etree.parse(os.path.join(dir, 'collection.xml'))

    # {'m1000': (parsed cnxml, {'file.jpg': <file contents>})}
    loaded_modules = {}
    collected_files = {}
    for module_id in MODULES_XPATH(collection_doc):
        print >> sys.stderr, "LOG: Starting on %s" % (module_id)
        module_path = os.path.join(dir, module_id)
        if not os.path.isdir(module_path):
            continue
        cnxml, module_files = loadModule(module_path)
        # Key each file by "<module id>/<name>" so names from different
        # modules cannot collide in the combined dictionary.
        for name, data in module_files.items():
            collected_files[os.path.join(module_id, name)] = data
        loaded_modules[module_id] = (cnxml, module_files)

    dbk, generated_files = collection2dbk.convert(
        collection_doc, loaded_modules, svg2png=False, math2svg=True)
    collected_files.update(generated_files)
    return convert(dbk, collected_files)
def loadModule(moduleDir):
    """ Given a directory of files (containing an index.cnxml)
    load it into memory.

    moduleDir -- path of the module directory.

    Returns a (cnxml, files) tuple: the parsed CNXML document and a dict
    mapping each locally referenced file name to its contents. When the
    directory holds an 'index.included.dbk' file, its contents are added
    under that key as well.

    An IOError raised while opening the CNXML file propagates to the caller;
    a referenced image that cannot be read is only logged to stderr.
    """
    # Try autogenerated CNXML 1st
    cnxmlPath = os.path.join(moduleDir, 'index_auto_generated.cnxml')
    if not os.path.exists(cnxmlPath):
        cnxmlPath = os.path.join(moduleDir, 'index.cnxml')
    # `with` closes each handle deterministically instead of leaving it to
    # the garbage collector.
    with open(cnxmlPath) as cnxmlFile:
        cnxmlStr = cnxmlFile.read()
    cnxml = etree.parse(StringIO(cnxmlStr))

    files = {}
    for f in IMAGES_XPATH(cnxml):
        try:
            # Binary mode: referenced files are typically images, and text
            # mode would corrupt them on platforms that translate newlines.
            with open(os.path.join(moduleDir, f), 'rb') as imageFile:
                files[f] = imageFile.read()
        except IOError:
            print >> sys.stderr, "LOG: Image not found %s %s" % (os.path.basename(moduleDir), f)

    # If the dbk file has already been generated, include it
    dbkPath = os.path.join(moduleDir, 'index.included.dbk')
    if os.path.exists(dbkPath):
        with open(dbkPath) as dbkFile:
            files['index.included.dbk'] = dbkFile.read()
    return (cnxml, files)
def convert(dbk1, files):
    """ Converts a Docbook Element and a dictionary of files into XHTML.

    dbk1  -- the Docbook document to transform.
    files -- dictionary of files belonging to the document; it is returned
             unchanged alongside the result.

    Returns an (xhtml, files) tuple.

    When DEBUG is set, the intermediate documents are written to
    'temp-collection1.dbk', 'temp-collection2.dbk' and
    'temp-collection3.xhtml' relative to the current working directory.
    """
    # The directory is handed to the stylesheets as a parameter and is NOT
    # removed here.
    tempdir = mkdtemp(suffix='-fo2pdf')
    # The path is wrapped in single quotes so the stylesheet receives it as a
    # string literal (presumably the stylesheets are lxml XSLT objects, whose
    # parameters are XPath expressions -- confirm against util.makeXsl).
    xslParams = {'cnx.tempdir.path': "'%s'" % tempdir}

    def transform(xslDoc, xmlDoc):
        """ Performs an XSLT transform and parses the <xsl:message /> text """
        ret = xslDoc(xmlDoc, **xslParams)
        for entry in xslDoc.error_log:
            # TODO: Log the errors (and convert JSON to python) instead of just printing
            print >> sys.stderr, entry.message.encode('utf-8')
        return ret

    def debugDump(path, doc, **tostringArgs):
        """ Serializes `doc` to `path`, closing the file when done """
        with open(path, 'w') as out:
            out.write(etree.tostring(doc, **tostringArgs))

    # Step 0 (Sprinkle in some index hints whenever terms are used)
    # termsprinkler.py $DOCBOOK > $DOCBOOK2
    if DEBUG:
        debugDump('temp-collection1.dbk', dbk1, pretty_print=True)

    # Step 1 (Cleaning up Docbook)
    dbk2 = transform(DOCBOOK_CLEANUP_XSL, dbk1)
    if DEBUG:
        debugDump('temp-collection2.dbk', dbk2, pretty_print=True)

    # Step 2 (Docbook to XHTML)
    xhtml = transform(DOCBOOK2XHTML_XSL, dbk2)
    if DEBUG:
        debugDump('temp-collection3.xhtml', xhtml)

    return xhtml, files
def main():
    """ Command-line entry point.

    Parses the command line (a collection directory and an optional
    `-o OUTPUT` file), converts the collection and writes the serialized
    XHTML to the output, which defaults to stdout.

    Returns 0 on success and 1 when argparse is not available, so the value
    can be used as the process exit status.
    """
    # Only the argparse import is guarded, so an ImportError raised later
    # during the conversion is not misreported as a missing argparse.
    try:
        import argparse
    except ImportError:
        # stderr, because stdout is the default destination for the XHTML.
        print >> sys.stderr, "argparse is needed for commandline"
        return 1

    parser = argparse.ArgumentParser(description='Converts a a collection directory to an xhtml file and additional images')
    parser.add_argument('directory')
    parser.add_argument('-o', dest='output', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
    args = parser.parse_args()

    # With nargs='?', a bare "-o" (no file name) yields None; fall back to
    # stdout instead of failing on None.write().
    output = args.output
    if output is None:
        output = sys.stdout

    xhtml, _files = __doStuff(args.directory)
    output.write(etree.tostring(xhtml))
    return 0
# Run the command-line interface only when executed as a script; main()'s
# return value becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())