fixed a crash during html post-processing
also:
- fixed `implementation_headers` not working when paths use backslashes
- added warnings when `implementation_headers` doesn't match anything
- added `sources.ignore`
marzer committed Sep 11, 2021
1 parent 292cf78 commit 1712253
Showing 6 changed files with 77 additions and 34 deletions.
2 changes: 1 addition & 1 deletion poxy/data/version.txt
@@ -1 +1 @@
-0.4.5
+0.5.0
1 change: 1 addition & 0 deletions poxy/fixers.py
@@ -680,6 +680,7 @@ def __call__(self, doc, context):
         strings = []
         for tag in tags:
             strings = strings + soup.string_descendants(tag, lambda t: soup.find_parent(t, 'a', tag) is None)
+        strings = [s for s in strings if s.parent is not None]
         for expr, uri in context.autolinks:
             if uri == doc.path.name: # don't create unnecessary self-links
                 continue
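The one-line guard in `fixers.py` is the heart of the crash fix: a fixer that replaces a tag can detach previously collected text nodes, leaving them with `parent == None`, and a later pass that tries to re-parent such a node dies. A minimal sketch of the failure mode (hypothetical markup; assumes `beautifulsoup4` is installed):

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>foo <b>bar</b></p>', 'html.parser')
strings = list(soup.strings)   # snapshot of text nodes, as the fixer does
strings[0].extract()           # an earlier replacement rips this node out
print(strings[0].parent)       # None -- re-parenting it now has no anchor
strings = [s for s in strings if s.parent is not None]   # the new guard
```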
2 changes: 1 addition & 1 deletion poxy/main.py
@@ -57,7 +57,7 @@ def _run(invoker=True):
     help=r'path to poxy.toml or a directory containing it (default: %(default)s)'
 )
 args.add_argument(
-    r'-v', r'--verbose', 
+    r'-v', r'--verbose',
     action=r'store_true',
     help=r"enable very noisy diagnostic output"
 )
20 changes: 15 additions & 5 deletions poxy/project.py
@@ -740,6 +740,7 @@ class _Inputs(object):
     schema = {
         Optional(r'paths') : ValueOrArray(str, name=r'paths'),
         Optional(r'recursive_paths') : ValueOrArray(str, name=r'recursive_paths'),
+        Optional(r'ignore') : ValueOrArray(str, name=r'ignore'),
     }

def __init__(self, config, key, input_dir, additional_inputs=None, additional_recursive_inputs=None):
@@ -761,7 +762,7 @@ def __init__(self, config, key, input_dir, additional_inputs=None, additional_recursive_inputs=None):
         if config is not None and key in config:
             paths = paths + [p for p in coerce_collection(config[key])]
         paths = [p for p in paths if p]
-        paths = [str(p).strip() for p in paths]
+        paths = [str(p).strip().replace('\\', '/') for p in paths]
         paths = [Path(p) for p in paths if p]
         paths = [Path(input_dir, p) if not p.is_absolute() else p for p in paths]
         paths = [p.resolve() for p in paths]
@@ -774,6 +775,15 @@ def __init__(self, config, key, input_dir, additional_inputs=None, additional_recursive_inputs=None):
             if recursive and path.is_dir():
                 for subdir in enum_subdirs(path, filter=lambda p: not p.name.startswith(r'.'), recursive=True):
                     all_paths.add(subdir)
+
+        ignores = set()
+        if config is not None and r'ignore' in config:
+            for s in coerce_collection(config[r'ignore']):
+                ignores.add(s.strip())
+            ignores = [re.compile(i) for i in ignores if i]
+            for ignore in ignores:
+                all_paths = [p for p in all_paths if not ignore.search(str(p))]
+
         self.paths = list(all_paths)
         self.paths.sort()
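Each `sources.ignore` entry is compiled as a regular expression and matched against the string form of every resolved input path; anything matching any pattern is dropped. A rough sketch with made-up patterns and paths (not from the poxy sources):

```python
import re
from pathlib import Path

# hypothetical stand-ins for config[r'ignore'] and the collected inputs
ignore_patterns = [r'\.test\.h$', r'detail']
all_paths = {Path('include/lib/a.h'), Path('include/lib/detail/b.h')}

ignores = [re.compile(i) for i in ignore_patterns]
for ignore in ignores:
    all_paths = [p for p in all_paths if not ignore.search(str(p))]
print(all_paths)   # only include/lib/a.h survives
```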

@@ -811,7 +821,7 @@ class _Sources(_FilteredInputs):
     schema = combine_dicts(_FilteredInputs.schema, {
         Optional(r'strip_paths') : ValueOrArray(str, name=r'strip_paths'),
         Optional(r'strip_includes') : ValueOrArray(str, name=r'strip_includes'),
-        Optional(r'extract_all') : bool,
+        Optional(r'extract_all') : bool
     })

def __init__(self, config, key, input_dir, additional_inputs=None, additional_recursive_inputs=None):
@@ -1447,12 +1457,12 @@ def __init__(self, config_path, output_dir, threads, cleanup, verbose, mcss_dir,
         self.implementation_headers = []
         if 'implementation_headers' in config:
             for k, v in config['implementation_headers'].items():
-                header = k.strip()
+                header = k.strip().replace('\\', '/')
                 impls = coerce_collection(v)
-                impls = [i.strip() for i in impls]
+                impls = [i.strip().replace('\\', '/') for i in impls]
                 impls = [i for i in impls if i]
                 if header and impls:
-                    self.implementation_headers .append((header, impls))
+                    self.implementation_headers.append((header, impls))
         self.implementation_headers = tuple(self.implementation_headers)
         self.verbose_value(r'Context.implementation_headers', self.implementation_headers)
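The backslash fix is plain eager normalization: keys and values from the `implementation_headers` table are rewritten with forward slashes before ever being compared against Doxygen's output, so Windows-style paths in poxy.toml no longer silently fail to match. Distilled to a sketch (the config dict here is hypothetical):

```python
# hypothetical implementation_headers table as parsed from poxy.toml
config = {'src\\lib.h': ['src\\lib_impl.h', '']}

implementation_headers = []
for k, v in config.items():
    header = k.strip().replace('\\', '/')
    impls = [i.strip().replace('\\', '/') for i in v]
    impls = [i for i in impls if i]   # drop empty entries
    if header and impls:
        implementation_headers.append((header, impls))
print(implementation_headers)   # [('src/lib.h', ['src/lib_impl.h'])]
```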

80 changes: 56 additions & 24 deletions poxy/run.py
@@ -421,6 +421,8 @@ def _postprocess_xml(context):
     implementation_header_mappings = None
     implementation_header_innernamespaces = None
     implementation_header_sectiondefs = None
+    implementation_header_unused_keys = None
+    implementation_header_unused_values = None
     if context.implementation_headers:
         implementation_header_data = [
             (
@@ -431,6 +433,13 @@
             )
             for hp, impl in context.implementation_headers
         ]
+        implementation_header_unused_keys = set()
+        for hp, impl in context.implementation_headers:
+            implementation_header_unused_keys.add(hp)
+        implementation_header_unused_values = dict()
+        for hdata in implementation_header_data:
+            for (ip, ifn, iid) in hdata[3]:
+                implementation_header_unused_values[iid] = (ip, hdata[0])
         implementation_header_mappings = dict()
         implementation_header_innernamespaces = dict()
         implementation_header_sectiondefs = dict()
@@ -440,23 +449,23 @@
             for (ip, ifn, iid) in hdata[3]:
                 implementation_header_mappings[iid] = hdata

     # process xml files
     if 1:

         # pre-pass to delete junk files
         if 1:
-            # delete the new Doxyfile.xml (https://github.com/doxygen/doxygen/pull/8463)
+            # delete Doxyfile.xml (https://github.com/doxygen/doxygen/pull/8463)
             # (it breaks m.css otherwise)
             if not context.xml_only:
                 delete_file(Path(context.xml_dir, r'Doxyfile.xml'), logger=context.verbose_logger)

             # 'file' entries for markdown and dox files
-            dox_files = (r'.dox', r'.md')
-            dox_files = [rf'*{doxygen.mangle_name(ext)}.xml' for ext in dox_files]
+            dox_files = [rf'*{doxygen.mangle_name(ext)}.xml' for ext in (r'.dox', r'.md')]
             dox_files.append(r'md_home.xml')
             for xml_file in get_all_files(context.xml_dir, any=dox_files):
                 delete_file(xml_file, logger=context.verbose_logger)

-            # 'dir' entries which contain nothing
+            # 'dir' entries for empty directories
             deleted = True
             while deleted:
                 deleted = False
@@ -697,19 +706,24 @@ def _postprocess_xml(context):

                 # rip the good bits out of implementation headers
                 if context.implementation_headers:
-                    if compounddef.get(r'id') in implementation_header_mappings:
-                        hid = implementation_header_mappings[compounddef.get("id")][2]
+                    iid = compounddef.get(r'id')
+                    if iid in implementation_header_mappings:
+                        hid = implementation_header_mappings[iid][2]
                         innernamespaces = compounddef.findall(r'innernamespace')
                         if innernamespaces:
                             implementation_header_innernamespaces[hid] = implementation_header_innernamespaces[hid] + innernamespaces
                             extracted_implementation = True
+                            if iid in implementation_header_unused_values:
+                                del implementation_header_unused_values[iid]
                             for tag in innernamespaces:
                                 compounddef.remove(tag)
                             changed = True
                         sectiondefs = compounddef.findall(r'sectiondef')
                         if sectiondefs:
                             implementation_header_sectiondefs[hid] = implementation_header_sectiondefs[hid] + sectiondefs
                             extracted_implementation = True
+                            if iid in implementation_header_unused_values:
+                                del implementation_header_unused_values[iid]
                             for tag in sectiondefs:
                                 compounddef.remove(tag)
                             changed = True
@@ -787,8 +801,18 @@
                             changed = True

         if changed:
+            implementation_header_unused_keys.remove(hp)
             write_xml_to_file(xml, xml_file)

+    # sanity-check implementation header state
+    if implementation_header_unused_keys:
+        for key in implementation_header_unused_keys:
+            context.warning(rf"implementation_header: nothing extracted for '{key}'")
+    if implementation_header_unused_values:
+        for iid, idata in implementation_header_unused_values.items():
+            context.warning(rf"implementation_header: nothing extracted from '{idata[0]}' for '{idata[1]}'")

     # delete the impl header xml files
     if 1 and context.implementation_headers:
         for hdata in implementation_header_data:
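The new warnings follow a consume-and-report pattern: seed "unused" collections with every configured header and implementation file up front, discard entries as extraction touches them, and warn about whatever is left at the end. Reduced to a sketch (names simplified, data hypothetical):

```python
# hypothetical mapping: implementation header -> implementation files
mapping = {'lib.h': ['lib_impl.h', 'lib_impl2.h']}

unused_keys = set(mapping)
unused_values = {impl: header for header, impls in mapping.items() for impl in impls}

def mark_extracted(header, impl):
    unused_keys.discard(header)
    unused_values.pop(impl, None)

mark_extracted('lib.h', 'lib_impl.h')   # extraction only touched this pair
for key in unused_keys:
    print(f"warning: nothing extracted for '{key}'")
for impl, header in unused_values.items():
    print(f"warning: nothing extracted from '{impl}' for '{header}'")   # lib_impl2.h
```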
@@ -836,25 +860,33 @@ def _postprocess_html_file(path, context=None):

     context.verbose(rf'Post-processing {path}')
     html_changed = False
-    if html_fixers:
-        doc = soup.HTMLDocument(path, logger=context.verbose_logger)
-        for fix in html_fixers:
-            if fix(doc, context):
-                doc.smooth()
-                html_changed = True
-        if html_changed:
-            doc.flush()
-
     plain_text_changed = False
-    if plain_text_fixers:
-        doc = [ read_all_text_from_file(path, logger=context.verbose_logger) ]
-        for fix in plain_text_fixers:
-            if fix(doc, context):
-                plain_text_changed = True
-        if plain_text_changed:
-            context.verbose(rf'Writing {path}')
-            with open(path, 'w', encoding='utf-8', newline='\n') as f:
-                f.write(doc[0])

+    try:
+        if html_fixers:
+            doc = soup.HTMLDocument(path, logger=context.verbose_logger)
+            for fix in html_fixers:
+                if fix(doc, context):
+                    doc.smooth()
+                    html_changed = True
+            if html_changed:
+                doc.flush()
+
+        if plain_text_fixers:
+            doc = [ read_all_text_from_file(path, logger=context.verbose_logger) ]
+            for fix in plain_text_fixers:
+                if fix(doc, context):
+                    plain_text_changed = True
+            if plain_text_changed:
+                context.verbose(rf'Writing {path}')
+                with open(path, 'w', encoding='utf-8', newline='\n') as f:
+                    f.write(doc[0])
+    except Exception as e:
+        context.info(rf'{type(e).__name__} raised while post-processing {path}')
+        raise
+    except:
+        context.info(rf'Error occurred while post-processing {path}')
+        raise

     return html_changed or plain_text_changed
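The restructuring above doesn't change what the fixers do; it only wraps them so any exception gets logged with the offending file's path before propagating, making a crash in a large batch attributable. The shape of it, with `run_fixers` as a hypothetical stand-in for the two fixer loops:

```python
def postprocess_file(path, context):
    try:
        run_fixers(path, context)   # hypothetical: the html + plain-text passes
    except Exception as e:
        # ordinary errors: name the exception and the file, then re-raise
        context.info(f'{type(e).__name__} raised while post-processing {path}')
        raise
    except:
        # bare except still sees BaseExceptions (e.g. KeyboardInterrupt)
        context.info(f'Error occurred while post-processing {path}')
        raise
```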

6 changes: 3 additions & 3 deletions poxy/soup.py
@@ -42,12 +42,12 @@ def destroy_node(node):


 def replace_tag(tag, new_tag_str):
+    assert tag.parent is not None
     newTags = []
     if new_tag_str:
         doc = bs4.BeautifulSoup(new_tag_str, 'html5lib')
-        if (len(doc.body.contents) > 0):
-            newTags = [f for f in doc.body.contents]
-            newTags = [f.extract() for f in newTags]
+        if len(doc.body.contents) > 0:
+            newTags = [f.extract() for f in doc.body.contents]
     prev = tag
     for newTag in newTags:
         prev.insert_after(newTag)
