Commit: Add rudimentary profiling.

J08nY committed Jul 26, 2023
1 parent de6bfaf commit 9e154bf
Showing 4 changed files with 93 additions and 49 deletions.
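Only two of the four changed files are expanded in this view; notably, src/sec_certs/utils/profiling.py, which defines the new staged decorator, is not shown. Judging solely from how the decorator is applied in the hunks below (as @staged(logger, "message") on dataset-processing methods, replacing inline logger.info calls), a minimal sketch of a compatible implementation might look as follows. The timing behaviour and log format here are assumptions, not the actual module:

# Hypothetical sketch of sec_certs.utils.profiling.staged; the real
# module is part of this commit but not expanded in this view.
# Assumes @staged(logger, msg) logs the stage message and records the
# elapsed wall-clock time of the wrapped call.
from __future__ import annotations

import functools
import logging
import time
from typing import Any, Callable, TypeVar

F = TypeVar("F", bound=Callable[..., Any])


def staged(logger: logging.Logger, message: str) -> Callable[[F], F]:
    """Mark a method as a named processing stage with rudimentary profiling."""

    def decorator(func: F) -> F:
        @functools.wraps(func)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            logger.info(message)  # announce the stage once, on every call
            start = time.perf_counter()
            try:
                return func(*args, **kwargs)
            finally:
                elapsed = time.perf_counter() - start
                logger.info("%s took %.2f s", func.__name__, elapsed)

        return wrapper  # type: ignore[return-value]

    return decorator

Using functools.wraps preserves the wrapped method's name and metadata, which should keep the @serialize decorator stacked above @staged in several hunks working unchanged.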
66 changes: 32 additions & 34 deletions src/sec_certs/dataset/cc.py
@@ -33,6 +33,7 @@
from sec_certs.serialization.json import ComplexSerializableType, serialize
from sec_certs.utils import helpers
from sec_certs.utils import parallel_processing as cert_processing
+from sec_certs.utils.profiling import staged


@dataclass
@@ -270,6 +271,7 @@ def _download_csv_html_resources(self, get_active: bool = True, get_archived: bo
helpers.download_parallel(csv_urls, csv_paths)

@serialize
+@staged(logger, "Downloading and processing CSV and HTML files of certificates.")
def get_certs_from_web(
self, to_download: bool = True, keep_metadata: bool = True, get_active: bool = True, get_archived: bool = True
) -> None:
@@ -520,12 +522,11 @@ def _download_all_artifacts_body(self, fresh: bool = True) -> None:
self._download_reports(fresh)
self._download_targets(fresh)

+@staged(logger, "Downloading PDFs of CC certification reports.")
def _download_reports(self, fresh: bool = True) -> None:
self.reports_pdf_dir.mkdir(parents=True, exist_ok=True)
certs_to_process = [x for x in self if x.state.report_is_ok_to_download(fresh) and x.report_link]

-if fresh:
-    logger.info("Downloading PDFs of CC certification reports.")
if not fresh and certs_to_process:
logger.info(
f"Downloading {len(certs_to_process)} PDFs of CC certification reports for which previous download failed."
@@ -537,12 +538,11 @@ def _download_reports(self, fresh: bool = True) -> None:
progress_bar_desc="Downloading PDFs of CC certification reports",
)

+@staged(logger, "Downloading PDFs of CC security targets.")
def _download_targets(self, fresh: bool = True) -> None:
self.targets_pdf_dir.mkdir(parents=True, exist_ok=True)
certs_to_process = [x for x in self if x.state.report_is_ok_to_download(fresh)]

-if fresh:
-    logger.info("Downloading PDFs of CC security targets.")
if not fresh and certs_to_process:
logger.info(
f"Downloading {len(certs_to_process)} PDFs of CC security targets for which previous download failed.."
@@ -554,12 +554,11 @@ def _download_targets(self, fresh: bool = True) -> None:
progress_bar_desc="Downloading PDFs of CC security targets",
)

+@staged(logger, "Converting PDFs of certification reports to txt.")
def _convert_reports_to_txt(self, fresh: bool = True) -> None:
self.reports_txt_dir.mkdir(parents=True, exist_ok=True)
certs_to_process = [x for x in self if x.state.report_is_ok_to_convert(fresh)]

-if fresh:
-    logger.info("Converting PDFs of certification reports to txt.")
if not fresh and certs_to_process:
logger.info(
f"Converting {len(certs_to_process)} PDFs of certification reports to txt for which previous conversion failed."
@@ -571,6 +570,7 @@ def _convert_reports_to_txt(self, fresh: bool = True) -> None:
progress_bar_desc="Converting PDFs of certification reports to txt",
)

+@staged(logger, "Converting PDFs of security targets to txt.")
def _convert_targets_to_txt(self, fresh: bool = True) -> None:
self.targets_txt_dir.mkdir(parents=True, exist_ok=True)
certs_to_process = [x for x in self if x.state.st_is_ok_to_convert(fresh)]
@@ -592,8 +592,8 @@ def _convert_all_pdfs_body(self, fresh: bool = True) -> None:
self._convert_reports_to_txt(fresh)
self._convert_targets_to_txt(fresh)

+@staged(logger, "Extracting report metadata")
def _extract_report_metadata(self) -> None:
-logger.info("Extracting report metadata")
certs_to_process = [x for x in self if x.state.report_is_ok_to_analyze()]
processed_certs = cert_processing.process_parallel(
CCCertificate.extract_report_pdf_metadata,
@@ -603,8 +603,8 @@ def _extract_report_metadata(self) -> None:
)
self.update_with_certs(processed_certs)

-def _extract_targets_metadata(self) -> None:
-    logger.info("Extracting target metadata")
+@staged(logger, "Extracting target metadata")
+def _extract_target_metadata(self) -> None:
certs_to_process = [x for x in self if x.state.st_is_ok_to_analyze()]
processed_certs = cert_processing.process_parallel(
CCCertificate.extract_st_pdf_metadata,
@@ -616,10 +616,10 @@ def _extract_targets_metadata(self) -> None:

def _extract_pdf_metadata(self) -> None:
self._extract_report_metadata()
-self._extract_targets_metadata()
+self._extract_target_metadata()

+@staged(logger, "Extracting report frontpages")
def _extract_report_frontpage(self) -> None:
-logger.info("Extracting report frontpages")
certs_to_process = [x for x in self if x.state.report_is_ok_to_analyze()]
processed_certs = cert_processing.process_parallel(
CCCertificate.extract_report_pdf_frontpage,
@@ -629,8 +629,8 @@ def _extract_report_frontpage(self) -> None:
)
self.update_with_certs(processed_certs)

-def _extract_targets_frontpage(self) -> None:
-    logger.info("Extracting target frontpages")
+@staged(logger, "Extracting target frontpages")
+def _extract_target_frontpage(self) -> None:
certs_to_process = [x for x in self if x.state.st_is_ok_to_analyze()]
processed_certs = cert_processing.process_parallel(
CCCertificate.extract_st_pdf_frontpage,
@@ -642,10 +642,10 @@ def _extract_targets_frontpage(self) -> None:

def _extract_pdf_frontpage(self) -> None:
self._extract_report_frontpage()
-self._extract_targets_frontpage()
+self._extract_target_frontpage()

+@staged(logger, "Extracting report keywords")
def _extract_report_keywords(self) -> None:
-logger.info("Extracting report keywords")
certs_to_process = [x for x in self if x.state.report_is_ok_to_analyze()]
processed_certs = cert_processing.process_parallel(
CCCertificate.extract_report_pdf_keywords,
@@ -655,8 +655,8 @@ def _extract_report_keywords(self) -> None:
)
self.update_with_certs(processed_certs)

-def _extract_targets_keywords(self) -> None:
-    logger.info("Extracting target keywords")
+@staged(logger, "Extracting target keywords")
+def _extract_target_keywords(self) -> None:
certs_to_process = [x for x in self if x.state.st_is_ok_to_analyze()]
processed_certs = cert_processing.process_parallel(
CCCertificate.extract_st_pdf_keywords,
@@ -668,27 +668,27 @@ def _extract_targets_keywords(self) -> None:

def _extract_pdf_keywords(self) -> None:
self._extract_report_keywords()
-self._extract_targets_keywords()
+self._extract_target_keywords()

def extract_data(self) -> None:
logger.info("Extracting various data from certification artifacts")
self._extract_pdf_metadata()
self._extract_pdf_frontpage()
self._extract_pdf_keywords()

+@staged(logger, "Computing heuristics: Deriving information about laboratories involved in certification.")
def _compute_cert_labs(self) -> None:
-logger.info("Computing heuristics: Deriving information about laboratories involved in certification.")
certs_to_process = [x for x in self if x.state.report_is_ok_to_analyze()]
for cert in certs_to_process:
cert.compute_heuristics_cert_lab()

+@staged(logger, "Computing heuristics: Deriving information about certificate ids from artifacts.")
def _compute_normalized_cert_ids(self) -> None:
-logger.info("Computing heuristics: Deriving information about certificate ids from artifacts.")
for cert in self:
cert.compute_heuristics_cert_id()

+@staged(logger, "Computing heuristics: Transitive vulnerabilities in referenc(ed/ing) certificates.")
def _compute_transitive_vulnerabilities(self):
-logger.info("omputing heuristics: computing transitive vulnerabilities in referenc(ed/ing) certificates.")
transitive_cve_finder = TransitiveVulnerabilityFinder(lambda cert: cert.heuristics.cert_id)
transitive_cve_finder.fit(self.certs, lambda cert: cert.heuristics.report_references)

@@ -698,9 +698,9 @@ def _compute_transitive_vulnerabilities(self):
self.certs[dgst].heuristics.direct_transitive_cves = transitive_cve.direct_transitive_cves
self.certs[dgst].heuristics.indirect_transitive_cves = transitive_cve.indirect_transitive_cves

+@staged(logger, "Computing heuristics: Matching scheme data.")
def _compute_scheme_data(self):
if self.auxiliary_datasets.scheme_dset:
-print("here")
for scheme in self.auxiliary_datasets.scheme_dset:
if certified := scheme.lists.get(EntryType.Certified):
certs = [cert for cert in self if cert.status == "active"]
@@ -713,19 +713,20 @@ def _compute_scheme_data(self):
for dgst, match in matches.items():
self[dgst].heuristics.scheme_data = match

+@staged(logger, "Computing heuristics: SARs")
+def _compute_sars(self) -> None:
+    transformer = SARTransformer().fit(self.certs.values())
+    for cert in self:
+        cert.heuristics.extracted_sars = transformer.transform_single_cert(cert)
+
def _compute_heuristics(self) -> None:
self._compute_normalized_cert_ids()
super()._compute_heuristics()
self._compute_scheme_data()
self._compute_cert_labs()
self._compute_sars()

-def _compute_sars(self) -> None:
-    logger.info("Computing heuristics: Computing SARs")
-    transformer = SARTransformer().fit(self.certs.values())
-    for cert in self:
-        cert.heuristics.extracted_sars = transformer.transform_single_cert(cert)
-
+@staged(logger, "Computing heuristics: references between certificates.")
def _compute_references(self) -> None:
def ref_lookup(kw_attr):
def func(cert):
@@ -744,7 +745,6 @@ def func(cert):

return func

-logger.info("omputing heuristics: references between certificates.")
for ref_source in ("report", "st"):
kw_source = f"{ref_source}_keywords"
dep_attr = f"{ref_source}_references"
@@ -768,6 +768,7 @@ def process_auxiliary_datasets(self, download_fresh: bool = False) -> None:
to_download=download_fresh, only_schemes={cert.scheme for cert in self}
)

+@staged(logger, "Processing protection profiles.")
def process_protection_profiles(
self, to_download: bool = True, keep_metadata: bool = True
) -> ProtectionProfileDataset:
@@ -779,7 +780,6 @@ def process_protection_profiles(
:param bool keep_metadata: If json related to the PP dataset should be kept on drive, defaults to True
:raises RuntimeError: When building of PPDataset fails
"""
-logger.info("Processing protection profiles.")

self.auxiliary_datasets_dir.mkdir(parents=True, exist_ok=True)

@@ -798,13 +798,12 @@ def process_protection_profiles(

return pp_dataset

+@staged(logger, "Processing maintenance updates.")
def process_maintenance_updates(self, to_download: bool = True) -> CCDatasetMaintenanceUpdates:
"""
Downloads or loads from json a dataset of maintenance updates. Runs analysis on that dataset if it's not completed.
:return CCDatasetMaintenanceUpdates: the resulting dataset of maintenance updates
"""

-logger.info("Processing maintenace updates")
self.mu_dataset_dir.mkdir(parents=True, exist_ok=True)

if to_download or not self.mu_dataset_path.exists():
@@ -827,12 +826,11 @@ def process_maintenance_updates(self, to_download: bool = True) -> CCDatasetMain

return update_dset

+@staged(logger, "Processing CC scheme dataset.")
def process_schemes(self, to_download: bool = True, only_schemes: set[str] | None = None) -> CCSchemeDataset:
"""
Downloads or loads from json a dataset of CC scheme data.
"""
-logger.info("Processing CC schemes")

self.auxiliary_datasets_dir.mkdir(parents=True, exist_ok=True)

if to_download or not self.scheme_dataset_path.exists():
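The edits to cc.py all follow one pattern: the stage banner moves out of the method body (where, for the download and conversion stages, it was gated on the fresh flag) into the decorator. Schematically, reusing the hypothetical staged sketch above:

# Schematic before/after of the pattern applied throughout cc.py
# (methods shown as free functions for brevity). `staged` is assumed
# to behave like the sketch above; the import is the one the diff adds.
import logging

from sec_certs.utils.profiling import staged

logger = logging.getLogger(__name__)

# Before: the stage announced itself inline, and only on fresh runs.
def download_reports_before(fresh: bool = True) -> None:
    if fresh:
        logger.info("Downloading PDFs of CC certification reports.")
    ...  # download work elided

# After: the decorator owns the stage banner (and can time the call).
@staged(logger, "Downloading PDFs of CC certification reports.")
def download_reports_after(fresh: bool = True) -> None:
    ...  # download work elided

One side effect visible in the hunks: with the if fresh: gate deleted, the banner is logged on every invocation, not only on fresh runs.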
8 changes: 6 additions & 2 deletions src/sec_certs/dataset/dataset.py
@@ -25,6 +25,7 @@
from sec_certs.serialization.json import ComplexSerializableType, get_class_fullname, serialize
from sec_certs.utils import helpers
from sec_certs.utils.nvd_dataset_builder import CpeMatchNvdDatasetBuilder, CpeNvdDatasetBuilder, CveNvdDatasetBuilder
+from sec_certs.utils.profiling import staged
from sec_certs.utils.tqdm import tqdm

logger = logging.getLogger(__name__)
@@ -348,6 +349,7 @@ def _compute_references(self) -> None:
def _compute_transitive_vulnerabilities(self) -> None:
raise NotImplementedError("Not meant to be implemented by the base class.")

+@staged(logger, "Processing CPEDataset.")
def _prepare_cpe_dataset(self, download_fresh: bool = False) -> CPEDataset:
if not self.auxiliary_datasets_dir.exists():
self.auxiliary_datasets_dir.mkdir(parents=True)
@@ -371,6 +373,7 @@ def _prepare_cpe_dataset(self, download_fresh: bool = False) -> CPEDataset:

return cpe_dataset

+@staged(logger, "Processing CVEDataset.")
def _prepare_cve_dataset(self, download_fresh: bool = False) -> CVEDataset:
if not self.auxiliary_datasets_dir.exists():
logger.info("Loading CVEDataset from json.")
@@ -395,6 +398,7 @@ def _prepare_cve_dataset(self, download_fresh: bool = False) -> CVEDataset:

return cve_dataset

+@staged(logger, "Processing CPE match dict.")
def _prepare_cpe_match_dict(self, download_fresh: bool = False) -> dict:
if self.cpe_match_json_path.exists():
logger.info("Preparing CPE Match feed from json.")
@@ -433,6 +437,7 @@ def _prepare_cpe_match_dict(self, download_fresh: bool = False) -> dict:
return cpe_match_dict

@serialize
+@staged(logger, "Computing heuristics: Finding CPE matches for certificates")
def compute_cpe_heuristics(self) -> CPEClassifier:
"""
Computes matching CPEs for the certificates.
@@ -465,7 +470,6 @@ def filter_condition(cpe: CPE) -> bool:
return False
return True

-logger.info("Computing heuristics: Finding CPE matches for certificates")
if not self.auxiliary_datasets.cpe_dset:
self.auxiliary_datasets.cpe_dset = self._prepare_cpe_dataset()

@@ -574,11 +578,11 @@ def _get_all_cpes_in_dataset(self) -> set[CPE]:
return set(itertools.chain.from_iterable(cpe_matches))

@serialize
+@staged(logger, "Computing heuristics: CVEs in certificates.")
def compute_related_cves(self) -> None:
"""
Computes CVEs for the certificates, given their CPE matches.
"""
-logger.info("Computing heuristics: CVEs in certificates.")

if not self.auxiliary_datasets.cpe_dset:
self.auxiliary_datasets.cpe_dset = self._prepare_cpe_dataset()
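In dataset.py the new decorator is stacked beneath @serialize (on compute_cpe_heuristics and compute_related_cves), so the stage wrapper sits closest to the method body and any timing it records would exclude the serialization step. A toy usage, again under the sketch's assumptions and with purely illustrative names:

# Toy usage of a staged-style decorator; `toy_stage` and `toy_logger`
# are illustrative, not part of sec-certs.
import logging
import time

from sec_certs.utils.profiling import staged

logging.basicConfig(level=logging.INFO)
toy_logger = logging.getLogger("toy")


@staged(toy_logger, "Processing toy stage.")
def toy_stage() -> int:
    time.sleep(0.1)  # stand-in for real work
    return 42


result = toy_stage()  # expect the stage banner first, then (if the decorator times calls) a duration line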