diff --git a/src/sec_certs/dataset/fips_algorithm.py b/src/sec_certs/dataset/fips_algorithm.py
index 56163078..c7b848a6 100644
--- a/src/sec_certs/dataset/fips_algorithm.py
+++ b/src/sec_certs/dataset/fips_algorithm.py
@@ -87,27 +87,78 @@ def download_alg_list_htmls(output_dir: Path) -> list[Path]:
         return paths

+    @staticmethod
+    def download_algs_data(output_dir: Path, alg_links: list[str]) -> list[Path]:
+        urls = [constants.FIPS_CAVP_URL + "/" + i for i in alg_links]
+        paths = [output_dir / f"alg_page{i}.html" for i in range(len(alg_links))]
+        responses = helpers.download_parallel(urls, paths, progress_bar_desc="Downloading FIPS Algorithm data")
+
+        failed_tuples = [
+            (url, path) for url, path, resp in zip(urls, paths, responses) if resp != constants.RESPONSE_OK
+        ]
+        if failed_tuples:
+            # Retry the failed downloads once before giving up.
+            failed_urls, failed_paths = zip(*failed_tuples)
+            responses = helpers.download_parallel(failed_urls, failed_paths)
+            if any(x != constants.RESPONSE_OK for x in responses):
+                raise ValueError("Failed to download the algorithms data, the dataset won't be constructed.")
+
+        return paths
+
     @staticmethod
     def get_number_of_html_pages(html_path: Path) -> int:
         with html_path.open("r") as handle:
             soup = BeautifulSoup(handle, "html5lib")
         return int(soup.select("span[data-total-pages]")[0].attrs["data-total-pages"])

+    @staticmethod
+    def parse_alg_data_from_html(html_path: Path) -> tuple[str, str, str, str]:
+        fields = []
+        with html_path.open("r") as handle:
+            soup = BeautifulSoup(handle, "html5lib")
+            for field in ["Description", "Version", "Type"]:
+                div = soup.find("div", text=field)
+                fields.append("" if div is None else div.find_next_sibling("div").get_text())
+            capability_trs = soup.find("table").find("tbody").findAll("tr")
+            capabilities = [c.findAll("td")[1].find(["b", "s"]).get_text().strip() for c in capability_trs]
+        return fields[0], fields[1], fields[2], ", ".join(capabilities)
+
     @staticmethod
     def parse_algorithms_from_html(html_path: Path) -> set[FIPSAlgorithm]:
-        df = pd.read_html(html_path)[0]
+        df = pd.read_html(html_path, extract_links="body")[0]
         for col in df.columns:
             if "Order by" in col:
                 df.rename(columns={col: col.split("Order by")[0]}, inplace=True)
-        df["alg_type"] = df["Validation Number"].map(lambda x: re.sub(r"[0-9\s]", "", x))
-        df["alg_number"] = df["Validation Number"].map(lambda x: re.sub(r"[^0-9]", "", x))
-        df["alg"] = df.apply(
-            lambda row: FIPSAlgorithm(
-                row["alg_number"], row["alg_type"], row["Vendor"], row["Implementation"], row["Validation Date"]
-            ),
-            axis=1,
+        # With extract_links="body", every body cell is a (text, href) tuple.
+        df = df.assign(
+            alg_type=df["Validation Number"].map(lambda x: re.sub(r"[0-9\s]", "", x[0])),
+            alg_number=df["Validation Number"].map(lambda x: re.sub(r"[^0-9]", "", x[0])),
+            Vendor=df["Vendor"].map(lambda x: x[0]),
+            Implementation=df["Implementation"].map(lambda x: x[0]),
+            **{"Validation Date": df["Validation Date"].map(lambda x: x[0])},
+        )
+        links = [x[1] for x in df["Validation Number"]]
+
+        with TemporaryDirectory() as tmp_dir:
+            alg_pages = FIPSAlgorithmDataset.download_algs_data(Path(tmp_dir), links)
+            # Parse while still inside the context manager, before the pages are deleted.
+            parsed_data = [FIPSAlgorithmDataset.parse_alg_data_from_html(page) for page in alg_pages]
+        descriptions, versions, types, capabilities = zip(*parsed_data)
+        df = df.assign(description=descriptions, version=versions, type=types, algorithm_capabilities=capabilities)
+
+        return set(
+            df.apply(
+                lambda row: FIPSAlgorithm(
+                    row["alg_number"],
+                    row["alg_type"],
+                    row["Vendor"],
+                    row["Implementation"],
+                    row["Validation Date"],
+                    row["description"],
+                    row["version"],
+                    row["type"],
+                    row["algorithm_capabilities"],
+                ),
+                axis=1,
+            )
         )
-        return set(df["alg"])

     def to_pandas(self) -> pd.DataFrame:
         return pd.DataFrame([x.pandas_tuple for x in self], columns=FIPSAlgorithm.pandas_columns).set_index("dgst")
diff --git a/src/sec_certs/sample/fips_algorithm.py b/src/sec_certs/sample/fips_algorithm.py
index 16f19e64..a1d49739 100644
--- a/src/sec_certs/sample/fips_algorithm.py
+++ b/src/sec_certs/sample/fips_algorithm.py
@@ -21,6 +21,11 @@ class FIPSAlgorithm(PandasSerializableType, ComplexSerializableType):
     implementation_name: str
     validation_date: date

+    description: str
+    version: str
+    type: str
+    algorithm_capabilities: str
+
     pandas_columns: ClassVar[list[str]] = [
         "dgst",
         "alg_number",
@@ -28,6 +33,10 @@ class FIPSAlgorithm(PandasSerializableType, ComplexSerializableType):
         "vendor",
         "implementation_name",
         "validation_date",
+        "description",
+        "version",
+        "type",
+        "algorithm_capabilities",
     ]

     @property
@@ -39,6 +48,10 @@ def pandas_tuple(self) -> tuple:
             self.vendor,
             self.implementation_name,
             self.validation_date,
+            self.description,
+            self.version,
+            self.type,
+            self.algorithm_capabilities,
         )

     @property
diff --git a/tests/fips/test_fips_algorithm_dataset.py b/tests/fips/test_fips_algorithm_dataset.py
index 0ef9a177..1f1e9ad8 100644
--- a/tests/fips/test_fips_algorithm_dataset.py
+++ b/tests/fips/test_fips_algorithm_dataset.py
@@ -36,6 +36,17 @@ def alg_dict() -> dict[str, Any]:
         "vendor": "Hewlett-Packard Development Company, L.P.",
         "implementation_name": "HP Secure Encryption Engine v1.0",
         "validation_date": "7/10/2014",
+        "description": "HP Secure Encryption is a controller-based data "
+        "encryption solution for HP ProLiant Gen8 or newer "
+        "servers that protects data at rest on any bulk storage"
+        " attached to the HP Smart Array controller. The "
+        "solution comprises our 12G family of HP Smart Array "
+        "controllers, the HP Physical Security Kit, and the HP "
+        "Secure Encryption licensing.",
+        "version": "PM8061",
+        "type": "HARDWARE",
+        "algorithm_capabilities": "HMAC-SHA2-256, Counter DRBG, "
+        "AES-ECB, AES-XTS, SHA2-256",
     }