From 6f070bc1f9b070d1d6e9e64e5f83006215955b8e Mon Sep 17 00:00:00 2001
From: Lucas Thurston
Date: Wed, 29 Mar 2023 15:22:41 -0700
Subject: [PATCH] [WIP] Implement EMS fetcher

---
 metadata_fetcher/fetchers/ems_fetcher.py | 103 +++++++++++++++++++++++
 metadata_fetcher/lambda_function.py      |   3 +
 2 files changed, 106 insertions(+)
 create mode 100644 metadata_fetcher/fetchers/ems_fetcher.py

diff --git a/metadata_fetcher/fetchers/ems_fetcher.py b/metadata_fetcher/fetchers/ems_fetcher.py
new file mode 100644
index 000000000..bf7dab81b
--- /dev/null
+++ b/metadata_fetcher/fetchers/ems_fetcher.py
@@ -0,0 +1,103 @@
+import json
+from xml.etree import ElementTree
+from .Fetcher import Fetcher
+
+
+class EmsFetcher(Fetcher):
+    """Fetch paginated XML object listings from an eMuseum (EMS) site.
+
+    [WIP] `check_page` and the initial `docs_total` still use
+    placeholder values.
+    """
+
+    def __init__(self, params):
+        """Initialize paging state from `params`.
+
+        The first fetch reads the base URL from
+        `params["harvest_data"]["url"]`. Later fetches receive a
+        `next_url` (presumably the state produced by `json()`), and
+        every key in `params` is copied onto the instance as-is.
+        """
+        super().__init__(params)
+
+        # If `next_url` is a param, we know that this is not
+        # the fetch of the first page, so skip setting those
+        # attributes
+        if "next_url" in params:
+            for key, value in params.items():
+                setattr(self, key, value)
+            return
+
+        self.base_url = params.get("harvest_data").get("url")
+        self.original_url = self.get_current_url()
+        self.next_url = self.original_url
+        # TODO: placeholder; `increment` overwrites it after a fetch
+        self.docs_total = 123
+
+    def get_current_url(self):
+        """Return the object-search URL for page `self.write_page`."""
+        query_params = (
+            "/search/*/objects/xml"
+            f"?filter=approved%3Atrue&page={self.write_page}")
+        return f"{self.base_url}{query_params}"
+
+    def build_fetch_request(self):
+        """Return the request dict (`{"url": ...}`) for the next page."""
+        request = {"url": self.next_url}
+
+        print(
+            f"[{self.collection_id}]: Fetching page {self.write_page} "
+            f"at {request.get('url')}")
+
+        return request
+
+    def get_text_from_response(self, response):
+        """Return the response body, `response.content`, unmodified."""
+        return response.content
+
+    def check_page(self, http_resp):
+        """Log the fetched page and report it as valid.
+
+        TODO: review other fetchers, do what they do. The hit count
+        is a placeholder and the page is always accepted.
+        """
+        hits = 345
+
+        print(
+            f"[{self.collection_id}]: Fetched page {self.write_page} "
+            f"at {http_resp.url} with {hits} hits"
+        )
+
+        return True
+
+    def increment(self, http_resp):
+        """Advance paging state from the page that was just fetched.
+
+        Counts the `objects/object` elements in the response and
+        clears `next_url` when there are none, which `json()` then
+        reports as finished.
+        """
+        super().increment(http_resp)
+        # Parse the body as-is: `content` on a requests response is
+        # bytes (no `.encode`), and ElementTree accepts bytes or str.
+        tree = ElementTree.fromstring(http_resp.content)
+        self.docs_total = len(tree.findall("objects/object"))
+        self.next_url = self.get_current_url() if self.docs_total > 0 else None
+
+    def json(self):
+        """Return the paging state as a JSON string.
+
+        Includes `"finished": true` once `next_url` is empty.
+        """
+        current_state = {
+            "harvest_type": self.harvest_type,
+            "collection_id": self.collection_id,
+            "next_url": self.next_url,
+            "write_page": self.write_page,
+            "base_url": self.base_url
+        }
+
+        if not self.next_url:
+            current_state["finished"] = True
+
+        return json.dumps(current_state)
diff --git a/metadata_fetcher/lambda_function.py b/metadata_fetcher/lambda_function.py
index 9f4b5cef6..d55b40a59 100644
--- a/metadata_fetcher/lambda_function.py
+++ b/metadata_fetcher/lambda_function.py
@@ -8,6 +8,9 @@
 
 
 def import_fetcher(harvest_type):
+    if harvest_type == "emuseum":
+        harvest_type = "ems"
+
     fetcher_module = importlib.import_module(
         f"fetchers.{harvest_type}_fetcher", package="metadata_fetcher")
     fetcher_module_words = harvest_type.split('_')