-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild.xml
271 lines (234 loc) · 10.9 KB
/
build.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
<?xml version="1.0" encoding="UTF-8"?>
<project basedir="." default="fullBuild" name="rescueTagSoup" xmlns:if="ant:if"
xmlns:unless="ant:unless">
<description>
#####################################################################
# Project build file by Martin Holmes ([email protected]), 2024. #
This build file does these basic tasks:
* Processes a folder of tagsoup HTML to create a parallel folder of
well-formed XML in the XHTML namespace (using htmlparser-1.4.jar).
* Processes each of those files with XSLT to fix a range of
anticipated problems and forms not compatible with XHTML5.
* Validates the resulting HTML with the vnu.jar validator.
It requires java libraries that are in the /lib/ folder.
Just type:
ant -Dsource=pathToSourceFolder -Doutput=pathToOutputFolder
at the command line to run the build.
# #
#####################################################################
</description>
<property name="echo.separator" value="************************************************"/>
<!-- ISO 8601 date for injection where appropriate. -->
<tstamp>
    <format property="today" pattern="yyyy-MM-dd" locale="en,CA"/>
</tstamp>
<!-- Accept the short property names "source" and "output" as aliases for
     sourceFolder and outputFolder. These must run before the defaults
     below: Ant properties are immutable, so whichever assignment happens
     first wins, and anything given with -D on the command line wins
     over all of them. -->
<condition property="sourceFolder" value="${source}">
    <isset property="source"/>
</condition>
<condition property="outputFolder" value="${output}">
    <isset property="output"/>
</condition>
<!-- Folder for incoming tagsoup HTML. -->
<property name="sourceFolder" value="${basedir}/tests/source"/>
<!-- Folder for outgoing rescued XHTML5 -->
<property name="outputFolder" value="${basedir}/tests/output"/>
<!-- Some targets read ${output}; keep it defined and in step with
     outputFolder when it was not supplied explicitly. -->
<property name="output" value="${outputFolder}"/>
<!-- Folder for temporary in-process files. -->
<property name="temp" value="${basedir}/temp"/>
<!-- ant-contrib supplies the foreach and propertyregex tasks used by the
     targets in this file. No classpath is given here, so the ant-contrib
     jar has to be visible to Ant itself. -->
<taskdef resource="net/sf/antcontrib/antcontrib.properties"/>
<!-- Location of Saxon jar. -->
<property name="saxon" value="${basedir}/lib/saxon-he-12.5.jar"/>
<!-- Location of htmlparser jar. -->
<property name="htmlparser" value="${basedir}/lib/htmlparser-1.4.jar"/>
<!-- Source HTML documents. Matching ignores case so that mixed-case
     extensions (for example .Html) are selected along with
     .htm, .html, .HTM and .HTML. -->
<fileset id="sourceHtml" dir="${sourceFolder}" casesensitive="false"
         includes="**/*.htm,**/*.html"/>
<!-- Half-processed temp documents (same extension rules as the source set). -->
<fileset id="parsedHtml" dir="${temp}" casesensitive="false"
         includes="**/*.htm,**/*.html"/>
<!-- Output XHTML documents (same extension rules as the source set). -->
<fileset id="outputHtml" dir="${outputFolder}" casesensitive="false"
         includes="**/*.htm,**/*.html"/>
<!-- All output files. -->
<fileset id="allOutput" dir="${outputFolder}" includes="**/*"/>
<target name="clean">
    <description>
        TARGET clean:
        Removes old products in case this is required.
    </description>
    <echo message="${echo.separator}"/>
    <echo message="Removing all existing content from the output folder..."/>
    <!-- Create the output folder first so that the allOutput fileset has
         an existing directory to scan, even on a first run. -->
    <mkdir dir="${outputFolder}"/>
    <delete includeemptydirs="true">
        <fileset refid="allOutput"/>
    </delete>
    <!-- Reset the temp folder through the temp property, so that an
         overridden temp location is the one that gets cleaned. -->
    <delete dir="${temp}"/>
    <mkdir dir="${temp}"/>
</target>
<target name="buildRenamingMap">
    <description>
        TARGET buildRenamingMap
        The first phase of the process involves fixing any
        number of horrible file and folder names resulting
        from the scraping process or from the source host
        process.
    </description>
    <echo message="${echo.separator}"/>
    <echo message="Building a map to rename all bad file and folder names..."/>
    <!-- The file listing is written into the temp folder; make sure it
         exists when this target is run on its own. -->
    <mkdir dir="${temp}"/>
    <!-- List every regular file under the source folder. logError sends
         anything find prints on stderr to the Ant log instead of mixing
         it into the listing that the stylesheet reads. -->
    <exec executable="find" output="${temp}/sourceFiles.txt" logError="true">
        <arg value="${sourceFolder}"/>
        <arg line="-type f"/>
    </exec>
    <java fork="true" classname="net.sf.saxon.Transform" classpath="${saxon}" failonerror="true">
        <jvmarg value="-Xmx1024m"/>
        <!-- The stylesheet is supplied as its own source document;
             presumably it takes its real input from the parameters below. -->
        <arg value="-s:${basedir}/xsl/map-renames.xsl"/>
        <arg value="-xsl:${basedir}/xsl/map-renames.xsl"/>
        <arg value="sourceListing=${temp}/sourceFiles.txt"/>
        <arg value="sourceFolder=${sourceFolder}"/>
        <arg value="outputFolder=${outputFolder}"/>
        <arg value="outputScript=${outputFolder}/rename.sh"/>
        <arg value="--suppressXsltNamespaceCheck:on"/>
    </java>
</target>
<target name="parseAllFiles">
    <description>
        TARGET parseAllFiles
        This target runs every file found in the source folder
        through the parseOneFile target.
    </description>
    <!-- Make sure the output folder is present before any per-file work starts. -->
    <mkdir dir="${outputFolder}"/>
    <!-- One parseOneFile invocation per document in the sourceHtml
         fileset; each path is handed over as the sourceFile property. -->
    <foreach param="sourceFile" target="parseOneFile">
        <path>
            <fileset refid="sourceHtml"/>
        </path>
    </foreach>
</target>
<target name="parseOneFile">
    <description>
        TARGET parseOneFile
        This target is passed the path to a source tagsoup file and
        calculates an output path for it, then calls the htmlparser
        jar to process the file. The command-line would be:
        java -cp htmlparser-1.4.jar nu.validator.htmlparser.tools.HTML2XML inputsoup.html output.html
    </description>
    <echo message="${echo.separator}"/>
    <!-- Mirror the source path under the temp folder. Note that the
         sourceFolder value is interpreted as a regular expression here. -->
    <propertyregex property="outputFile" input="${sourceFile}" regexp="${sourceFolder}" replace="${temp}"/>
    <dirname property="outputDir" file="${outputFile}"/>
    <echo message="Parsing the file ${sourceFile} to create the file ${outputFile}..."/>
    <!-- The parser is given a file path to write to, so create the parent folder up front. -->
    <mkdir dir="${outputDir}"/>
    <!-- Use the project-level htmlparser property rather than repeating
         the jar path, so the jar location is defined in one place. -->
    <java classpath="${htmlparser}"
          classname="nu.validator.htmlparser.tools.HTML2XML"
          fork="true">
        <arg value="${sourceFile}"/>
        <arg value="${outputFile}"/>
    </java>
</target>
<target name="findExternalResources">
    <description>
        TARGET findExternalResources
        This target parses through all the now-XHTML temporary
        files and compiles a lookup table of all the resources
        which have been externally referenced (CSS, JS, images
        and so on) using full http requests in the HTML and CSS.
        The lookup is in the form of an XHTML file which can
        be used in a process to replace external links with local
        links. The process also compiles a command-line script which
        will retrieve all of these files and place them in a
        folder called "externals" in the output folder.
    </description>
    <echo message="${echo.separator}"/>
    <echo message="Mapping external references..."/>
    <java fork="true" classname="net.sf.saxon.Transform" classpath="${saxon}" failonerror="true">
        <jvmarg value="-Xmx1024m"/>
        <!-- The stylesheet is supplied as its own source document;
             presumably it locates the XHTML files through xhtmlFolder. -->
        <arg value="-s:${basedir}/xsl/map-externals.xsl"/>
        <arg value="-xsl:${basedir}/xsl/map-externals.xsl"/>
        <!-- Paths are built from the temp property so they follow the
             same temp location the parsing stage writes to. -->
        <arg value="xhtmlFolder=${temp}"/>
        <arg value="sourceFolder=${sourceFolder}"/>
        <!-- NOTE(review): the description above places "externals" in the
             output folder, but this parameter points inside the temp
             folder. Confirm which one the stylesheet expects. -->
        <arg value="outputFolder=${temp}/externals"/>
        <arg value="--suppressXsltNamespaceCheck:on"/>
    </java>
</target>
<target name="fixAllFiles">
    <description>
        TARGET fixAllFiles
        This target runs every parsed file found in the temp folder
        through the fixOneFile target.
    </description>
    <!-- One fixOneFile invocation per document in the parsedHtml fileset
         (rooted at the temp folder); each path is handed over as the
         wellFormedFile property. -->
    <foreach target="fixOneFile" param="wellFormedFile">
        <path>
            <fileset refid="parsedHtml"/>
        </path>
    </foreach>
</target>
<target name="fixOneFile">
    <description>
        TARGET fixOneFile
        This target is passed the path to a parsed and now well-formed XHTML
        file, and processes it with Saxon to perform a range of fixes for
        known issues. We pre-calculate and supply a range of useful filename
        and path-related parameters because these are a little easier to calculate in
        the Ant context than in the XSLT.
    </description>
    <echo message="${echo.separator}"/>
    <!-- Mirror the temp path under the output folder. The replacement is
         outputFolder, the same property the clean, renaming and parsing
         targets use, so every stage agrees on the destination. Note that
         the temp value is interpreted as a regular expression here. -->
    <propertyregex property="outputFile" input="${wellFormedFile}" regexp="${temp}" replace="${outputFolder}"/>
    <echo message="Remediating known possible issues in the file ${wellFormedFile} to create ${outputFile}..."/>
    <!-- outputFolder is already defined at project level and Ant
         properties cannot be reassigned, so a dirname task targeting
         that property has no effect; it has been dropped. The stylesheet
         therefore receives the root output folder in its outputFolder
         parameter, exactly as before.
         NOTE(review): if the stylesheet really needs the directory that
         contains this particular output file, compute it with dirname
         under a different property name and pass that instead. -->
    <basename property="outputFileName" file="${outputFile}"/>
    <!-- Strip the extension: a dot followed by any run of the letters in htm/html, in either case. -->
    <propertyregex property="outputNameNoSuffix" input="${outputFileName}" regexp="\.[HhTtMmLl]+$" replace=""/>
    <java fork="true" classname="net.sf.saxon.Transform" classpath="${saxon}" failonerror="true">
        <jvmarg value="-Xmx1024m"/>
        <arg value="-s:${wellFormedFile}"/>
        <arg value="-xsl:${basedir}/xsl/xhtml5-fixes.xsl"/>
        <!-- NOTE(review): the trailing slash after the output file name
             looks unintentional; left unchanged pending confirmation. -->
        <arg value="-o:${outputFile}/"/>
        <arg value="outputFile=${outputFile}"/>
        <arg value="outputFileName=${outputFileName}"/>
        <arg value="outputFolder=${outputFolder}"/>
        <arg value="outputNameNoSuffix=${outputNameNoSuffix}"/>
        <arg value="--suppressXsltNamespaceCheck:on"/>
    </java>
</target>
<target name="validateOutput">
    <description>
        TARGET validateOutput
        This target validates the collection of XHTML5-adjacent documents
        comprising the output site, using the VNU validator (the same
        validator used by the W3C's online validation service).
    </description>
    <echo message="${echo.separator}"/>
    <echo message="Validating the HTML pages produced in the build using the VNU validator."/>
    <!-- failonerror is false, so validation problems are reported without stopping the build. -->
    <java jar="${basedir}/lib/vnu.jar" failonerror="false" fork="true">
        <!-- The format option and its value are two separate command-line
             tokens. A single arg value containing a space would reach the
             validator as one unrecognisable token. -->
        <arg value="--format"/>
        <arg value="text"/>
        <arg value="--skip-non-html"/>
        <arg value="--xml"/>
        <arg value="--also-check-css"/>
        <arg value="--errors-only"/>
        <!-- Validate the folder the earlier stages write to, through the
             project-level outputFolder property. -->
        <arg value="${outputFolder}"/>
    </java>
</target>
<!-- The stages are expressed as dependencies, which Ant runs left to right:
     clean, build the renaming map, do the initial conversion, map any
     external links, run the remediation XSLT, then validate. This keeps
     the original order without re-parsing the project for each stage. -->
<target name="fullBuild"
        depends="clean, buildRenamingMap, parseAllFiles, findExternalResources, fixAllFiles, validateOutput">
    <description>
        TARGET fullBuild
        This runs the entire build process.
    </description>
    <echo message="${echo.separator}"/>
    <echo message="Done!"/>
</target>
</project>