Add report after crawling OpenML. (#549)
marcenacp authored Feb 21, 2024
1 parent d5115ea commit ba3a49d
Showing 3 changed files with 624 additions and 20 deletions.
34 changes: 15 additions & 19 deletions health/crawler/spiders/base.py
```diff
@@ -18,24 +18,25 @@
 _TIMEOUT_SECONDS = 10
 
 
+def scan_parquet_files() -> pl.LazyFrame | None:
+    """Scans cached parquet files."""
+    folder = epath.Path(__file__).parent.parent.parent / "data"
+    parquet_files = list(folder.glob("*/*.parquet"))
+    if parquet_files:
+        logging.info(f"Found {len(parquet_files)} existing Parquet files")
+        return pl.scan_parquet(parquet_files)
+    else:
+        return None
+
+
 class BaseSpider(scrapy.Spider):
     def __init__(self, *args, **kwargs):
         """Opens a connection to the local cache repository to do look-ups."""
         super().__init__(*args, **kwargs)
-        self.df = self._scan_parquet_files()
+        self.df = scan_parquet_files()
         dispatcher.connect(self.spider_closed, signals.spider_closed)
         self.date = datetime.datetime.now()
 
-    def _scan_parquet_files(self) -> pl.LazyFrame | None:
-        """Scans cached parquet files."""
-        folder = epath.Path(__file__).parent.parent.parent / "data"
-        parquet_files = list(folder.glob("*/*.parquet"))
-        if parquet_files:
-            logging.info(f"Found existing Parquet files {parquet_files}")
-            return pl.scan_parquet(parquet_files)
-        else:
-            return None
-
     def list_datasets(self) -> list[Any]:
         """Returns the list of all datasets in the repository.
@@ -73,13 +74,8 @@ def start_requests(self):
                 url=url,
                 callback=self.parse,
                 errback=self.parse_error,
-                # Only wait 20 seconds, because some requests seem to timeout
-                meta={
-                    "download_timeout": _TIMEOUT_SECONDS,
-                    "handle_httpstatus_list": self.settings.attributes[
-                        "HTTPERROR_ALLOWED_CODES"
-                    ].value,
-                },
+                # Only wait _TIMEOUT_SECONDS, because some requests seem to time out
+                meta={"download_timeout": _TIMEOUT_SECONDS},
             )
 
     def parse(self, response: http.Response) -> DownloadedItem:
@@ -108,7 +104,7 @@ def parse_error(self, failure: failure.Failure) -> DownloadedItem:
 
     def spider_closed(self, spider):
         del spider
-        df = self._scan_parquet_files()
+        df = scan_parquet_files()
        if df is None:
             logging.info("No data written to disk yet")
         else:
```
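The first hunk promotes `_scan_parquet_files` from a `BaseSpider` method to a module-level `scan_parquet_files` function, so the cached crawl results can be inspected outside a running spider, which is what a post-crawl report needs. The sketch below is only an illustration of the kind of report this refactor enables; the import path, the `report_crawl` name, and the `status` column are assumptions, not part of this commit:

```python
import logging

import polars as pl

from crawler.spiders.base import scan_parquet_files  # assumed import path


def report_crawl() -> None:
    """Logs a small summary of everything crawled so far (hypothetical)."""
    df = scan_parquet_files()  # returns pl.LazyFrame | None
    if df is None:
        logging.info("No data written to disk yet")
        return
    # LazyFrame is lazy: nothing is read from disk until .collect().
    summary = df.group_by("status").agg(pl.count()).collect()  # "status" is assumed
    logging.info("Crawl report:\n%s", summary)
```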
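The second hunk simplifies the request metadata: `handle_httpstatus_list` is dropped and only the per-request `download_timeout` remains, with the comment now referencing `_TIMEOUT_SECONDS` instead of a stale hard-coded "20 seconds". In Scrapy, a `download_timeout` key in `Request.meta` overrides the global `DOWNLOAD_TIMEOUT` setting (180 seconds by default) for that single request. A minimal self-contained sketch, with an illustrative spider name and URL:

```python
import scrapy

_TIMEOUT_SECONDS = 10  # mirrors the constant in base.py


class TimeoutDemoSpider(scrapy.Spider):
    """Illustrative spider showing a per-request timeout via Request.meta."""

    name = "timeout_demo"

    def start_requests(self):
        yield scrapy.Request(
            url="https://www.openml.org/api/v1/json/data/1",  # illustrative URL
            callback=self.parse,
            # Fail fast instead of waiting for Scrapy's default 180 s.
            meta={"download_timeout": _TIMEOUT_SECONDS},
        )

    def parse(self, response):
        yield {"url": response.url, "status": response.status}
```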
2 changes: 1 addition & 1 deletion health/crawler/spiders/openml.py
```diff
@@ -16,7 +16,7 @@ class OpenmlSpider(BaseSpider):
 
     def list_datasets(self):
         """See base class."""
-        return list(openml.datasets.list_datasets(output_format="dataframe")["did"])[:1]
+        return list(openml.datasets.list_datasets(output_format="dataframe")["did"])
 
     def get_url(self, dataset_id: str):
         """See base class."""
```
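With the `[:1]` slice removed, the spider now crawls every OpenML dataset instead of only the first one; the slice looks like leftover debugging. For context, `openml.datasets.list_datasets(output_format="dataframe")` returns a pandas DataFrame with one row per dataset, whose `did` column holds the dataset ids:

```python
import openml

# One row per dataset on OpenML; "did" is the dataset id.
datasets = openml.datasets.list_datasets(output_format="dataframe")
print(f"{len(datasets)} datasets listed")
print(datasets["did"].head())
```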
