From c79ce043c569c04ecb8e463ec071467652d5be92 Mon Sep 17 00:00:00 2001
From: Yuliia Teslia
Date: Thu, 9 May 2024 09:41:22 +0300
Subject: [PATCH 1/4] Scrape FIPS algo data issue

---
 src/sec_certs/__main__.py               |  2 +-
 src/sec_certs/dataset/fips_algorithm.py | 66 ++++++++++++++++++++++++-
 src/sec_certs/sample/fips_algorithm.py  | 13 +++++
 3 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/src/sec_certs/__main__.py b/src/sec_certs/__main__.py
index 1d699b85..443fd9cc 100644
--- a/src/sec_certs/__main__.py
+++ b/src/sec_certs/__main__.py
@@ -3,4 +3,4 @@
 from sec_certs.cli import main
 
 if __name__ == "__main__":
-    sys.exit(main())
+    sys.exit(main())
\ No newline at end of file

diff --git a/src/sec_certs/dataset/fips_algorithm.py b/src/sec_certs/dataset/fips_algorithm.py
index 4a87b3ef..2b70d3d2 100644
--- a/src/sec_certs/dataset/fips_algorithm.py
+++ b/src/sec_certs/dataset/fips_algorithm.py
@@ -87,20 +87,84 @@ def download_alg_list_htmls(output_dir: Path) -> list[Path]:
 
         return paths
 
+    @staticmethod
+    def download_algs_data(output_dir: Path, alg_links: list[str]) -> list[Path]:
+        n_pages = len(alg_links)
+
+        urls = [constants.FIPS_CAVP_URL + "/" + i for i in alg_links]
+        # doesn't work because of SHA-3 8
+        # urls = [constants.FIPS_ALG_URL.format(alg_type, alg_number) for alg_type, alg_number in algs]
+        paths = [output_dir / f"alg_page{i}.html" for i in range(0, n_pages)]
+        responses = helpers.download_parallel(urls, paths, progress_bar_desc="Downloading FIPS Algorithm data")
+
+        failed_tuples = [
+            (url, path) for url, path, resp in zip(urls, paths, responses) if resp != constants.RESPONSE_OK
+        ]
+        if failed_tuples:
+            failed_urls, failed_paths = zip(*failed_tuples)
+            responses = helpers.download_parallel(failed_urls, failed_paths)
+            if any(x != constants.RESPONSE_OK for x in responses):
+                raise ValueError("Failed to download the algorithms data, the dataset won't be constructed.")
+
+        return paths
+
     @staticmethod
     def get_number_of_html_pages(html_path: Path) -> int:
         with html_path.open("r") as handle:
             soup = BeautifulSoup(handle, "html5lib")
         return int(soup.select("span[data-total-pages]")[0].attrs["data-total-pages"])
 
+    @staticmethod
+    def parse_alg_data_from_html(html_path: Path) -> tuple[str, str, str, str]:
+        fields = []
+        with html_path.open("r") as handle:
+            soup = BeautifulSoup(handle, "html5lib")
+            for field in ["Description", "Version", "Type"]:
+                div = soup.find("div", text=field)
+                fields.append("" if div is None else div.find_next_sibling("div").get_text())
+            capability_trs = soup.find("table").find("tbody").findAll("tr")
+            capabilities = [c.findAll("td")[1].find(["b", "s"]).get_text().strip() for c in capability_trs]
+        return fields[0], fields[1], fields[2], ",".join(capabilities)
+
     @staticmethod
     def parse_algorithms_from_html(html_path: Path) -> set[FIPSAlgorithm]:
         df = pd.read_html(html_path)[0]
         df["alg_type"] = df["Validation Number"].map(lambda x: re.sub(r"[0-9\s]", "", x))
         df["alg_number"] = df["Validation Number"].map(lambda x: re.sub(r"[^0-9]", "", x))
+        #links = list(zip(df['alg_type'], df['alg_number']))
+        links = []
+        with html_path.open("r") as handle:
+            soup = BeautifulSoup(handle, "html5lib")
+            table = soup.find('table').find('tbody')
+            for tr in table.findAll("tr"):
+                if len(tr.findAll("td")) != 4:
+                    td = tr.findAll("td")[0]
+                    links.append(td.find('a')['href'])
+                else:
+                    td = tr.findAll("td")[2]
+                    links.append(td.find('a')['href'])
+
+        with TemporaryDirectory() as tmp_dir:
+            alg_pages = FIPSAlgorithmDataset.download_algs_data(Path(tmp_dir), links)
+
+            descriptions = []
+            versions = []
+            types = []
+            capabilities = []
+            for page in alg_pages:
+                d, v, t, c = FIPSAlgorithmDataset.parse_alg_data_from_html(page)
+                descriptions.append(d)
+                versions.append(v)
+                types.append(t)
+                capabilities.append(c)
+        df = df.assign(description=descriptions)
+        df = df.assign(version=versions)
+        df = df.assign(type=types)
+        df = df.assign(algorithm_capabilities=capabilities)
+
         df["alg"] = df.apply(
             lambda row: FIPSAlgorithm(
-                row["alg_number"], row["alg_type"], row["Vendor"], row["Implementation"], row["Validation Date"]
+                row["alg_number"], row["alg_type"], row["Vendor"], row["Implementation"], row["Validation Date"],
+                row["description"], row["version"], row["type"], row["algorithm_capabilities"]
             ),
             axis=1,
         )

diff --git a/src/sec_certs/sample/fips_algorithm.py b/src/sec_certs/sample/fips_algorithm.py
index 16f19e64..84491073 100644
--- a/src/sec_certs/sample/fips_algorithm.py
+++ b/src/sec_certs/sample/fips_algorithm.py
@@ -21,6 +21,11 @@ class FIPSAlgorithm(PandasSerializableType, ComplexSerializableType):
     implementation_name: str
     validation_date: date
 
+    description: str
+    version: str
+    type: str
+    algorithm_capabilities: str
+
     pandas_columns: ClassVar[list[str]] = [
         "dgst",
         "alg_number",
@@ -28,6 +33,10 @@ class FIPSAlgorithm(PandasSerializableType, ComplexSerializableType):
         "vendor",
         "implementation_name",
         "validation_date",
+        "description",
+        "version",
+        "type",
+        "algorithm_capabilities"
     ]
 
     @property
@@ -39,6 +48,10 @@ def pandas_tuple(self) -> tuple:
             self.vendor,
             self.implementation_name,
             self.validation_date,
+            self.description,
+            self.version,
+            self.type,
+            self.algorithm_capabilities
         )
 
     @property
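Note: the `download_algs_data` helper added above fetches every algorithm detail page in parallel, collects the (url, path) pairs that did not come back with `constants.RESPONSE_OK`, and retries just those once before failing hard. A minimal self-contained sketch of the same pattern, with `requests` and `ThreadPoolExecutor` standing in for the project's `helpers.download_parallel` (the URLs, paths, and plain HTTP 200 check are illustrative assumptions, not the project's API):

    # Sketch of the "download in parallel, retry only the failures once" pattern.
    from concurrent.futures import ThreadPoolExecutor
    from pathlib import Path

    import requests


    def fetch(url: str, path: Path) -> int:
        """Download one page to disk and return the HTTP status code."""
        resp = requests.get(url, timeout=30)
        if resp.status_code == 200:
            path.write_bytes(resp.content)
        return resp.status_code


    def download_with_retry(urls: list[str], paths: list[Path]) -> None:
        with ThreadPoolExecutor() as pool:
            statuses = list(pool.map(fetch, urls, paths))

        # Keep only the pairs that failed and give them one more chance.
        failed = [(u, p) for u, p, s in zip(urls, paths, statuses) if s != 200]
        if failed:
            failed_urls, failed_paths = zip(*failed)
            with ThreadPoolExecutor() as pool:
                statuses = list(pool.map(fetch, failed_urls, failed_paths))
            if any(s != 200 for s in statuses):
                raise ValueError("Failed to download the algorithms data.")

Retrying only the failed subset keeps the second pass cheap, and the commented-out `FIPS_ALG_URL` template in the patch documents why the scraped hrefs are used instead: at least one validation (SHA-3 8) does not fit the URL template.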
From b2e9e4ebae57d829413e37e8965b4636514f1701 Mon Sep 17 00:00:00 2001
From: Yuliia Teslia
Date: Sat, 25 May 2024 18:34:09 +0300
Subject: [PATCH 2/4] optimize pd.read_html() parameters to extract links

---
 src/sec_certs/__main__.py               |  2 +-
 src/sec_certs/dataset/fips_algorithm.py | 37 ++++++++++++------------
 src/sec_certs/sample/fips_algorithm.py  |  4 +--
 3 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/src/sec_certs/__main__.py b/src/sec_certs/__main__.py
index 443fd9cc..1d699b85 100644
--- a/src/sec_certs/__main__.py
+++ b/src/sec_certs/__main__.py
@@ -3,4 +3,4 @@
 from sec_certs.cli import main
 
 if __name__ == "__main__":
-    sys.exit(main())
\ No newline at end of file
+    sys.exit(main())

diff --git a/src/sec_certs/dataset/fips_algorithm.py b/src/sec_certs/dataset/fips_algorithm.py
index 2b70d3d2..57712e40 100644
--- a/src/sec_certs/dataset/fips_algorithm.py
+++ b/src/sec_certs/dataset/fips_algorithm.py
@@ -92,8 +92,6 @@ def download_algs_data(output_dir: Path, alg_links: list[str]) -> list[Path]:
         n_pages = len(alg_links)
 
         urls = [constants.FIPS_CAVP_URL + "/" + i for i in alg_links]
-        # doesn't work because of SHA-3 8
-        # urls = [constants.FIPS_ALG_URL.format(alg_type, alg_number) for alg_type, alg_number in algs]
         paths = [output_dir / f"alg_page{i}.html" for i in range(0, n_pages)]
         responses = helpers.download_parallel(urls, paths, progress_bar_desc="Downloading FIPS Algorithm data")
 
@@ -124,25 +122,17 @@ def parse_alg_data_from_html(html_path: Path) -> tuple[str, str, str, str]:
                 fields.append("" if div is None else div.find_next_sibling("div").get_text())
             capability_trs = soup.find("table").find("tbody").findAll("tr")
             capabilities = [c.findAll("td")[1].find(["b", "s"]).get_text().strip() for c in capability_trs]
-        return fields[0], fields[1], fields[2], ",".join(capabilities)
+        return fields[0], fields[1], fields[2], ", ".join(capabilities)
 
     @staticmethod
     def parse_algorithms_from_html(html_path: Path) -> set[FIPSAlgorithm]:
-        df = pd.read_html(html_path)[0]
-        df["alg_type"] = df["Validation Number"].map(lambda x: re.sub(r"[0-9\s]", "", x))
-        df["alg_number"] = df["Validation Number"].map(lambda x: re.sub(r"[^0-9]", "", x))
-        #links = list(zip(df['alg_type'], df['alg_number']))
-        links = []
-        with html_path.open("r") as handle:
-            soup = BeautifulSoup(handle, "html5lib")
-            table = soup.find('table').find('tbody')
-            for tr in table.findAll("tr"):
-                if len(tr.findAll("td")) != 4:
-                    td = tr.findAll("td")[0]
-                    links.append(td.find('a')['href'])
-                else:
-                    td = tr.findAll("td")[2]
-                    links.append(td.find('a')['href'])
+        df = pd.read_html(html_path, extract_links="body")[0]
+        df["alg_type"] = df["Validation Number"].map(lambda x: re.sub(r"[0-9\s]", "", x[0]))
+        df["alg_number"] = df["Validation Number"].map(lambda x: re.sub(r"[^0-9]", "", x[0]))
+        links = [x[1] for x in df["Validation Number"]]
+        df["Vendor"] = df["Vendor"].map(lambda x: x[0])
+        df["Implementation"] = df["Implementation"].map(lambda x: x[0])
+        df["Validation Date"] = df["Validation Date"].map(lambda x: x[0])
 
         with TemporaryDirectory() as tmp_dir:
             alg_pages = FIPSAlgorithmDataset.download_algs_data(Path(tmp_dir), links)
@@ -163,8 +153,15 @@ def parse_algorithms_from_html(html_path: Path) -> set[FIPSAlgorithm]:
 
         df["alg"] = df.apply(
             lambda row: FIPSAlgorithm(
-                row["alg_number"], row["alg_type"], row["Vendor"], row["Implementation"], row["Validation Date"],
-                row["description"], row["version"], row["type"], row["algorithm_capabilities"]
+                row["alg_number"],
+                row["alg_type"],
+                row["Vendor"],
+                row["Implementation"],
+                row["Validation Date"],
+                row["description"],
+                row["version"],
+                row["type"],
+                row["algorithm_capabilities"],
             ),
             axis=1,
         )

diff --git a/src/sec_certs/sample/fips_algorithm.py b/src/sec_certs/sample/fips_algorithm.py
index 84491073..a1d49739 100644
--- a/src/sec_certs/sample/fips_algorithm.py
+++ b/src/sec_certs/sample/fips_algorithm.py
@@ -36,7 +36,7 @@ class FIPSAlgorithm(PandasSerializableType, ComplexSerializableType):
         "description",
         "version",
         "type",
-        "algorithm_capabilities"
+        "algorithm_capabilities",
     ]
 
     @property
@@ -51,7 +51,7 @@ def pandas_tuple(self) -> tuple:
             self.description,
             self.version,
             self.type,
-            self.algorithm_capabilities
+            self.algorithm_capabilities,
         )
 
     @property
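Note: the core change here is `pd.read_html(html_path, extract_links="body")`, available since pandas 1.5. Every body cell then arrives as a `(text, href)` tuple, with `href` equal to `None` when the cell contains no `<a>` tag, so the per-algorithm links can be read straight off the `Validation Number` column and the whole BeautifulSoup table walk from the previous patch goes away. A small sketch against made-up HTML (assumes pandas >= 1.5 and an installed HTML parser such as lxml):

    # extract_links="body" turns each body cell into a (text, href) tuple.
    from io import StringIO

    import pandas as pd

    html = """
    <table>
      <thead><tr><th>Validation Number</th><th>Vendor</th></tr></thead>
      <tbody>
        <tr><td><a href="/projects/validation/details/1">AES 1</a></td><td>Acme</td></tr>
      </tbody>
    </table>
    """

    df = pd.read_html(StringIO(html), extract_links="body")[0]
    print(df["Validation Number"][0])  # ('AES 1', '/projects/validation/details/1')
    print(df["Vendor"][0])             # ('Acme', None)

This is also why `Vendor`, `Implementation`, and `Validation Date` are mapped through `x[0]` in the patch: the text half of each tuple has to be unpacked back into a plain string column.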
".join(capabilities) @staticmethod def parse_algorithms_from_html(html_path: Path) -> set[FIPSAlgorithm]: - df = pd.read_html(html_path)[0] - df["alg_type"] = df["Validation Number"].map(lambda x: re.sub(r"[0-9\s]", "", x)) - df["alg_number"] = df["Validation Number"].map(lambda x: re.sub(r"[^0-9]", "", x)) - #links = list(zip(df['alg_type'], df['alg_number'])) - links = [] - with html_path.open("r") as handle: - soup = BeautifulSoup(handle, "html5lib") - table = soup.find('table').find('tbody') - for tr in table.findAll("tr"): - if len(tr.findAll("td")) != 4: - td = tr.findAll("td")[0] - links.append(td.find('a')['href']) - else: - td = tr.findAll("td")[2] - links.append(td.find('a')['href']) + df = pd.read_html(html_path, extract_links="body")[0] + df["alg_type"] = df["Validation Number"].map(lambda x: re.sub(r"[0-9\s]", "", x[0])) + df["alg_number"] = df["Validation Number"].map(lambda x: re.sub(r"[^0-9]", "", x[0])) + links = [x[1] for x in df["Validation Number"]] + df["Vendor"] = df["Vendor"].map(lambda x: x[0]) + df["Implementation"] = df["Implementation"].map(lambda x: x[0]) + df["Validation Date"] = df["Validation Date"].map(lambda x: x[0]) with TemporaryDirectory() as tmp_dir: alg_pages = FIPSAlgorithmDataset.download_algs_data(Path(tmp_dir), links) @@ -163,8 +153,15 @@ def parse_algorithms_from_html(html_path: Path) -> set[FIPSAlgorithm]: df["alg"] = df.apply( lambda row: FIPSAlgorithm( - row["alg_number"], row["alg_type"], row["Vendor"], row["Implementation"], row["Validation Date"], - row["description"], row["version"], row["type"], row["algorithm_capabilities"] + row["alg_number"], + row["alg_type"], + row["Vendor"], + row["Implementation"], + row["Validation Date"], + row["description"], + row["version"], + row["type"], + row["algorithm_capabilities"], ), axis=1, ) diff --git a/src/sec_certs/sample/fips_algorithm.py b/src/sec_certs/sample/fips_algorithm.py index 84491073..a1d49739 100644 --- a/src/sec_certs/sample/fips_algorithm.py +++ b/src/sec_certs/sample/fips_algorithm.py @@ -36,7 +36,7 @@ class FIPSAlgorithm(PandasSerializableType, ComplexSerializableType): "description", "version", "type", - "algorithm_capabilities" + "algorithm_capabilities", ] @property @@ -51,7 +51,7 @@ def pandas_tuple(self) -> tuple: self.description, self.version, self.type, - self.algorithm_capabilities + self.algorithm_capabilities, ) @property From f01059cd43e2158e2b1d9d91dd37644c84323f13 Mon Sep 17 00:00:00 2001 From: Yuliia Teslia Date: Thu, 5 Sep 2024 18:38:17 +0300 Subject: [PATCH 3/4] review changes --- src/sec_certs/dataset/fips_algorithm.py | 70 ++++++++++++++++++++--- tests/fips/test_fips_algorithm_dataset.py | 4 ++ 2 files changed, 65 insertions(+), 9 deletions(-) diff --git a/src/sec_certs/dataset/fips_algorithm.py b/src/sec_certs/dataset/fips_algorithm.py index 56163078..62c432ac 100644 --- a/src/sec_certs/dataset/fips_algorithm.py +++ b/src/sec_certs/dataset/fips_algorithm.py @@ -87,27 +87,79 @@ def download_alg_list_htmls(output_dir: Path) -> list[Path]: return paths + @staticmethod + def download_algs_data(output_dir: Path, alg_links: list[str]) -> list[Path]: + urls = [constants.FIPS_CAVP_URL + "/" + i for i in alg_links] + paths = [output_dir / f"alg_page{i}.html" for i in range(0, len(alg_links))] + responses = helpers.download_parallel(urls, paths, progress_bar_desc="Downloading FIPS Algorithm data") + + failed_tuples = [ + (url, path) for url, path, resp in zip(urls, paths, responses) if resp != constants.RESPONSE_OK + ] + if failed_tuples: + failed_urls, failed_paths = 
zip(*failed_tuples) + responses = helpers.download_parallel(failed_urls, failed_paths) + if any(x != constants.RESPONSE_OK for x in responses): + raise ValueError("Failed to download the algorithms data, the dataset won't be constructed.") + + return paths + @staticmethod def get_number_of_html_pages(html_path: Path) -> int: with html_path.open("r") as handle: soup = BeautifulSoup(handle, "html5lib") return int(soup.select("span[data-total-pages]")[0].attrs["data-total-pages"]) + @staticmethod + def parse_alg_data_from_html(html_path: Path) -> tuple[str, str, str, str]: + fields = [] + with html_path.open("r") as handle: + soup = BeautifulSoup(handle, "html5lib") + for field in ["Description", "Version", "Type"]: + div = soup.find("div", text=field) + fields.append("" if div is None else div.find_next_sibling("div").get_text()) + capability_trs = soup.find("table").find("tbody").findAll("tr") + capabilities = [c.findAll("td")[1].find(["b", "s"]).get_text().strip() for c in capability_trs] + return fields[0], fields[1], fields[2], ", ".join(capabilities) + @staticmethod def parse_algorithms_from_html(html_path: Path) -> set[FIPSAlgorithm]: - df = pd.read_html(html_path)[0] + df = pd.read_html(html_path, extract_links="body")[0] for col in df.columns: if "Order by" in col: df.rename(columns={col: col.split("Order by")[0]}, inplace=True) - df["alg_type"] = df["Validation Number"].map(lambda x: re.sub(r"[0-9\s]", "", x)) - df["alg_number"] = df["Validation Number"].map(lambda x: re.sub(r"[^0-9]", "", x)) - df["alg"] = df.apply( - lambda row: FIPSAlgorithm( - row["alg_number"], row["alg_type"], row["Vendor"], row["Implementation"], row["Validation Date"] - ), - axis=1, + df = df.assign( + alg_type=df["Validation Number"].map(lambda x: re.sub(r"[0-9\s]", "", x[0])), + alg_number=df["Validation Number"].map(lambda x: re.sub(r"[^0-9]", "", x[0])), + Vendor=df["Vendor"].map(lambda x: x[0]), + Implementation=df["Implementation"].map(lambda x: x[0]), + Validation_Date=df["Validation Date"].map(lambda x: x[0]) ) - return set(df["alg"]) + links = [x[1] for x in df["Validation Number"]] + + with TemporaryDirectory() as tmp_dir: + alg_pages = FIPSAlgorithmDataset.download_algs_data(Path(tmp_dir), links) + parsed_data = [FIPSAlgorithmDataset.parse_alg_data_from_html(page) for page in alg_pages] + descriptions, versions, types, capabilities = zip(*parsed_data) + df = df.assign(description=descriptions, version=versions, type=types, algorithm_capabilities=capabilities) + + return set( + df.apply( + lambda row: FIPSAlgorithm( + row["alg_number"], + row["alg_type"], + row["Vendor"], + row["Implementation"], + row["Validation Date"], + row["description"], + row["version"], + row["type"], + row["algorithm_capabilities"], + ), + axis=1, + ) +) + def to_pandas(self) -> pd.DataFrame: return pd.DataFrame([x.pandas_tuple for x in self], columns=FIPSAlgorithm.pandas_columns).set_index("dgst") diff --git a/tests/fips/test_fips_algorithm_dataset.py b/tests/fips/test_fips_algorithm_dataset.py index 0ef9a177..63d25b7a 100644 --- a/tests/fips/test_fips_algorithm_dataset.py +++ b/tests/fips/test_fips_algorithm_dataset.py @@ -36,6 +36,10 @@ def alg_dict() -> dict[str, Any]: "vendor": "Hewlett-Packard Development Company, L.P.", "implementation_name": "HP Secure Encryption Engine v1.0", "validation_date": "7/10/2014", + "description": "HP Secure Encryption is a controller-based data encryption solution for HP ProLiant Gen8 or newer servers that protects data at rest on any bulk storage attached to the HP Smart Array 
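Note: `parse_alg_data_from_html` assumes the CAVP detail page renders each value in a `<div>` that immediately follows a labelled `<div>` ("Description", "Version", "Type") and lists the capabilities in the second column of a table. A simplified sketch of that traversal on stand-in markup (requires beautifulsoup4 and html5lib; it uses the modern `string=` keyword where the patch keeps the older, deprecated `text=` alias):

    # Label <div>s are followed by value <div>s; capabilities sit in column 2.
    from bs4 import BeautifulSoup

    html = """
    <div>Description</div><div>Example implementation</div>
    <div>Version</div><div>1.0</div>
    <div>Type</div><div>SOFTWARE</div>
    <table><tbody>
      <tr><td>1</td><td><b>AES-ECB</b></td></tr>
      <tr><td>2</td><td><b>AES-XTS</b></td></tr>
    </tbody></table>
    """

    soup = BeautifulSoup(html, "html5lib")
    fields = []
    for field in ["Description", "Version", "Type"]:
        div = soup.find("div", string=field)
        fields.append("" if div is None else div.find_next_sibling("div").get_text())
    rows = soup.find("table").find("tbody").find_all("tr")
    capabilities = [r.find_all("td")[1].find(["b", "s"]).get_text().strip() for r in rows]
    print(fields, ", ".join(capabilities))  # ['Example implementation', '1.0', 'SOFTWARE'] AES-ECB, AES-XTS

One thing to watch in the `df.assign` rewrite: keyword arguments cannot contain spaces, so the unpacked date lands in a new `Validation_Date` column while the tuple-valued `Validation Date` column is what the `FIPSAlgorithm` constructor reads a few lines later.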
From 258912491eab34b283f38c50818ca013162fd47f Mon Sep 17 00:00:00 2001
From: Yuliia Teslia
Date: Thu, 5 Sep 2024 18:45:51 +0300
Subject: [PATCH 4/4] review changes

---
 src/sec_certs/dataset/fips_algorithm.py   | 31 +++++++++++------------
 tests/fips/test_fips_algorithm_dataset.py | 11 ++++++--
 2 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/src/sec_certs/dataset/fips_algorithm.py b/src/sec_certs/dataset/fips_algorithm.py
index 62c432ac..c7b848a6 100644
--- a/src/sec_certs/dataset/fips_algorithm.py
+++ b/src/sec_certs/dataset/fips_algorithm.py
@@ -144,22 +144,21 @@ def parse_algorithms_from_html(html_path: Path) -> set[FIPSAlgorithm]:
         df = df.assign(description=descriptions, version=versions, type=types, algorithm_capabilities=capabilities)
 
         return set(
-        df.apply(
-            lambda row: FIPSAlgorithm(
-                row["alg_number"],
-                row["alg_type"],
-                row["Vendor"],
-                row["Implementation"],
-                row["Validation Date"],
-                row["description"],
-                row["version"],
-                row["type"],
-                row["algorithm_capabilities"],
-            ),
-            axis=1,
-        )
-)
-
+            df.apply(
+                lambda row: FIPSAlgorithm(
+                    row["alg_number"],
+                    row["alg_type"],
+                    row["Vendor"],
+                    row["Implementation"],
+                    row["Validation Date"],
+                    row["description"],
+                    row["version"],
+                    row["type"],
+                    row["algorithm_capabilities"],
+                ),
+                axis=1,
+            )
+        )
 
     def to_pandas(self) -> pd.DataFrame:
         return pd.DataFrame([x.pandas_tuple for x in self], columns=FIPSAlgorithm.pandas_columns).set_index("dgst")

diff --git a/tests/fips/test_fips_algorithm_dataset.py b/tests/fips/test_fips_algorithm_dataset.py
index 63d25b7a..1f1e9ad8 100644
--- a/tests/fips/test_fips_algorithm_dataset.py
+++ b/tests/fips/test_fips_algorithm_dataset.py
@@ -36,10 +36,17 @@ def alg_dict() -> dict[str, Any]:
         "vendor": "Hewlett-Packard Development Company, L.P.",
         "implementation_name": "HP Secure Encryption Engine v1.0",
         "validation_date": "7/10/2014",
-        "description": "HP Secure Encryption is a controller-based data encryption solution for HP ProLiant Gen8 or newer servers that protects data at rest on any bulk storage attached to the HP Smart Array controller. The solution comprises our 12G family of HP Smart Array controllers, the HP Physical Security Kit, and the HP Secure Encryption licensing.",
+        "description": "HP Secure Encryption is a controller-based data "
+        "encryption solution for HP ProLiant Gen8 or newer "
+        "servers that protects data at rest on any bulk storage"
+        " attached to the HP Smart Array controller. The "
+        "solution comprises our 12G family of HP Smart Array "
+        "controllers, the HP Physical Security Kit, and the HP "
+        "Secure Encryption licensing.",
         "version": "PM8061",
         "type": "HARDWARE",
-        "algorithm_capabilities": "HMAC-SHA2-256, Counter DRBG, AES-ECB, AES-XTS, SHA2-256",
+        "algorithm_capabilities": "HMAC-SHA2-256, Counter DRBG, "
+        "AES-ECB, AES-XTS, SHA2-256",
     }
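Note: the fixture change in this last patch only re-wraps two long values; Python concatenates adjacent string literals at compile time, so the split form is byte-for-byte identical to the original one-line string. A quick illustration:

    # Adjacent string literals are joined at compile time, so re-wrapping the
    # long "description" value in the test fixture does not change its content.
    one_line = "HP Secure Encryption is a controller-based data encryption solution."
    wrapped = (
        "HP Secure Encryption is a controller-based "
        "data encryption solution."
    )
    assert wrapped == one_line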