Add rate limiting to fix issue 77 #81

Merged · 2 commits · Jan 21, 2025
25 changes: 13 additions & 12 deletions app/scraper/other/institutions.py
@@ -1,8 +1,8 @@
-import requests
 from flask import current_app

 from app.models import Institution
 from app.scraper.other.config import INSTITUTIONS_URL
+from app.scraper.rechtspraak_session import RechtspraakScrapeSession
 from app.scraper.soup_parsing import safe_find_text, to_soup


@@ -26,17 +26,18 @@ def institution_exists(institution_dict):


 def import_institutions_handler():
-    r = requests.get(INSTITUTIONS_URL)
-    r.raise_for_status()
+    with RechtspraakScrapeSession() as session:
+        r = session.get(INSTITUTIONS_URL)
+        r.raise_for_status()

-    institutions = to_soup(r.content).find_all("Instantie")
-    current_app.logger.info(f"Found {len(institutions)} institutions")
+        institutions = to_soup(r.content).find_all("Instantie")
+        current_app.logger.info(f"Found {len(institutions)} institutions")

-    for institution in institutions:
-        institution_dict = transform_institution_xml_to_dict(institution)
+        for institution in institutions:
+            institution_dict = transform_institution_xml_to_dict(institution)

-        if not institution_exists(institution_dict):
-            Institution.create(**institution_dict)
-            current_app.logger.info(
-                f"New institution {institution_dict.get('name')} added"
-            )
+            if not institution_exists(institution_dict):
+                Institution.create(**institution_dict)
+                current_app.logger.info(
+                    f"New institution {institution_dict.get('name')} added"
+                )
49 changes: 25 additions & 24 deletions app/scraper/other/legal_areas.py
@@ -1,8 +1,8 @@
-import requests
 from flask import current_app

 from app.models import LegalArea
 from app.scraper.other.config import LEGAL_AREAS_URL
+from app.scraper.rechtspraak_session import RechtspraakScrapeSession
 from app.scraper.soup_parsing import safe_find_text, to_soup


@@ -22,29 +22,30 @@ def legal_area_exists(legal_area_dict):


 def import_legal_areas_handler():
-    r = requests.get(LEGAL_AREAS_URL)
-    r.raise_for_status()
-
-    main_areas = (
-        to_soup(r.content)
-        .find("Rechtsgebieden")
-        .findChildren("Rechtsgebied", recursive=False)
-    )
-    current_app.logger.info(f"Found {len(main_areas)} main legal areas")
-
-    for main_area in main_areas:
-        legal_area_dict = transform_legal_area_xml_to_dict(main_area)
-        if not legal_area_exists(legal_area_dict):
-            LegalArea.create(**legal_area_dict)
+    with RechtspraakScrapeSession() as session:
+        r = session.get(LEGAL_AREAS_URL)
+        r.raise_for_status()
+
+        main_areas = (
+            to_soup(r.content)
+            .find("Rechtsgebieden")
+            .findChildren("Rechtsgebied", recursive=False)
+        )
+        current_app.logger.info(f"Found {len(main_areas)} main legal areas")
+
+        for main_area in main_areas:
+            legal_area_dict = transform_legal_area_xml_to_dict(main_area)
+            if not legal_area_exists(legal_area_dict):
+                LegalArea.create(**legal_area_dict)

-        sub_areas = main_area.find_all("Rechtsgebied")
-        current_app.logger.info(f"Found {len(sub_areas)} sub areas")
+            sub_areas = main_area.find_all("Rechtsgebied")
+            current_app.logger.info(f"Found {len(sub_areas)} sub areas")

-        for sub_area in sub_areas:
-            legal_area_dict = transform_legal_area_xml_to_dict(sub_area)
+            for sub_area in sub_areas:
+                legal_area_dict = transform_legal_area_xml_to_dict(sub_area)

-            if not legal_area_exists(legal_area_dict):
-                LegalArea.create(**legal_area_dict)
-                current_app.logger.info(
-                    f"New legal area {legal_area_dict.get('legal_area_name')} added"
-                )
+                if not legal_area_exists(legal_area_dict):
+                    LegalArea.create(**legal_area_dict)
+                    current_app.logger.info(
+                        f"New legal area {legal_area_dict.get('legal_area_name')} added"
+                    )
25 changes: 13 additions & 12 deletions app/scraper/other/procedure_types.py
@@ -1,8 +1,8 @@
-import requests
 from flask import current_app

 from app.models import ProcedureType
 from app.scraper.other.config import PROCEDURE_TYPES_URL
+from app.scraper.rechtspraak_session import RechtspraakScrapeSession
 from app.scraper.soup_parsing import safe_find_text, to_soup


@@ -22,17 +22,18 @@ def procedure_type_exists(procedure_type_dict):


 def import_procedure_types_handler():
-    r = requests.get(PROCEDURE_TYPES_URL)
-    r.raise_for_status()
+    with RechtspraakScrapeSession() as session:
+        r = session.get(PROCEDURE_TYPES_URL)
+        r.raise_for_status()

-    procedure_types = to_soup(r.content).find_all("Proceduresoort")
-    current_app.logger.info(f"Found {len(procedure_types)} procedure types")
+        procedure_types = to_soup(r.content).find_all("Proceduresoort")
+        current_app.logger.info(f"Found {len(procedure_types)} procedure types")

-    for procedure_type in procedure_types:
-        procedure_type_dict = transform_procedure_type_xml_to_dict(procedure_type)
+        for procedure_type in procedure_types:
+            procedure_type_dict = transform_procedure_type_xml_to_dict(procedure_type)

-        if not procedure_type_exists(procedure_type_dict):
-            ProcedureType.create(**procedure_type_dict)
-            current_app.logger.info(
-                f"New procedure type {procedure_type_dict.get('name')} added"
-            )
+            if not procedure_type_exists(procedure_type_dict):
+                ProcedureType.create(**procedure_type_dict)
+                current_app.logger.info(
+                    f"New procedure type {procedure_type_dict.get('name')} added"
+                )
113 changes: 57 additions & 56 deletions app/scraper/people/enrich_people.py
@@ -15,73 +15,74 @@
 def enrich_people_handler():
     people = Person.query.all()

-    for person in people:
-        enrich_person(person)
+    # Rate limit to default requests p/s, which means that enriching 5.000 judges will take a little less than 3 hours
+    with RechtspraakScrapeSession() as session:
+        for person in people:
+            enrich_person(session, person)


 def person_details_url(rechtspraak_id: str) -> str:
     return DETAILS_ENDPOINT + rechtspraak_id


-def enrich_person(person: Person) -> None:
-    with RechtspraakScrapeSession() as session:
-        r = session.get(person_details_url(person.rechtspraak_id))
-        current_app.logger.info(
-            f"Enriching person {person.id} with information from {r.url}"
-        )
+def enrich_person(session: RechtspraakScrapeSession, person: Person) -> None:
+    r = session.get(person_details_url(person.rechtspraak_id))
+    current_app.logger.info(
+        f"Enriching person {person.id} with information from {r.url}"
+    )

-        if not r.ok or r.url == FAULTY_URL:
-            current_app.logger.warning(
-                f"Enrichtment of person {person.id} failed with status {r.status_code}, url {r.url}",
-                extra={"id": person.id},
-            )
-            person.removed_from_rechtspraak_at = datetime.now()
-            person.last_scraped_at = datetime.now()
-            person.save()
-            return
+    if not r.ok or r.url == FAULTY_URL:
+        current_app.logger.warning(
+            f"Enrichtment of person {person.id} failed with status {r.status_code}, url {r.url}",
+            extra={"id": person.id},
+        )
+        person.removed_from_rechtspraak_at = datetime.now()
+        person.last_scraped_at = datetime.now()
+        person.save()
+        return

-        person_json = r.json().get("model", {})
+    person_json = r.json().get("model", {})

-        for beroepsgegeven in person_json.get("beroepsgegevens", []):
-            pd_kwargs = ProfessionalDetail.transform_beroepsgegevens_dict(
-                beroepsgegeven
-            )
-            if not professional_detail_already_exists(person, pd_kwargs):
-                institution = find_institution_for_professional_detail(
-                    pd_kwargs.get("organisation")
-                )
-                ProfessionalDetail.create(
-                    **{"person_id": person.id, **pd_kwargs}, institution=institution
-                )
+    for beroepsgegeven in person_json.get("beroepsgegevens", []):
+        pd_kwargs = ProfessionalDetail.transform_beroepsgegevens_dict(
+            beroepsgegeven
+        )
+        if not professional_detail_already_exists(person, pd_kwargs):
+            institution = find_institution_for_professional_detail(
+                pd_kwargs.get("organisation")
+            )
+            ProfessionalDetail.create(
+                **{"person_id": person.id, **pd_kwargs}, institution=institution
+            )

-        for historisch_beroepsgegeven in person_json.get("historieBeroepsgegevens", []):
-            pd_kwargs = ProfessionalDetail.transform_historisch_beroepsgegevens_dict(
-                historisch_beroepsgegeven
-            )
-            if not professional_detail_already_exists(person, pd_kwargs):
-                institution = find_institution_for_professional_detail(
-                    pd_kwargs.get("organisation")
-                )
-                ProfessionalDetail.create(
-                    **{"person_id": person.id, **pd_kwargs}, institution=institution
-                )
+    for historisch_beroepsgegeven in person_json.get("historieBeroepsgegevens", []):
+        pd_kwargs = ProfessionalDetail.transform_historisch_beroepsgegevens_dict(
+            historisch_beroepsgegeven
+        )
+        if not professional_detail_already_exists(person, pd_kwargs):
+            institution = find_institution_for_professional_detail(
+                pd_kwargs.get("organisation")
+            )
+            ProfessionalDetail.create(
+                **{"person_id": person.id, **pd_kwargs}, institution=institution
+            )

-        for nevenbetrekking in person_json.get("huidigeNevenbetrekkingen", []):
-            nb_kwargs = SideJob.transform_huidige_nevenbetrekkingen_dict(
-                nevenbetrekking
-            )
-            if not side_job_already_exists(person, nb_kwargs):
-                SideJob.create(**{"person_id": person.id, **nb_kwargs})
+    for nevenbetrekking in person_json.get("huidigeNevenbetrekkingen", []):
+        nb_kwargs = SideJob.transform_huidige_nevenbetrekkingen_dict(
+            nevenbetrekking
+        )
+        if not side_job_already_exists(person, nb_kwargs):
+            SideJob.create(**{"person_id": person.id, **nb_kwargs})

-        for voorgaande_nevenbetrekking in person_json.get(
-            "voorgaandeNevenbetrekkingen", []
-        ):
-            nb_kwargs = SideJob.transform_voorgaande_nevenbetrekkingen_dict(
-                voorgaande_nevenbetrekking
-            )
-            if not side_job_already_exists(person, nb_kwargs):
-                SideJob.create(**{"person_id": person.id, **nb_kwargs})
+    for voorgaande_nevenbetrekking in person_json.get(
+        "voorgaandeNevenbetrekkingen", []
+    ):
+        nb_kwargs = SideJob.transform_voorgaande_nevenbetrekkingen_dict(
+            voorgaande_nevenbetrekking
+        )
+        if not side_job_already_exists(person, nb_kwargs):
+            SideJob.create(**{"person_id": person.id, **nb_kwargs})

-        person.removed_from_rechtspraak_at = None
-        person.last_scraped_at = datetime.now()
-        person.save()
+    person.removed_from_rechtspraak_at = None
+    person.last_scraped_at = datetime.now()
+    person.save()
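A quick sanity check on the timing estimate in the new comment: the limiter added in app/scraper/rechtspraak_session.py (see below) is configured with per_second=0.5, i.e. one request every two seconds, which per_minute=30 matches exactly. Passing one session into enrich_person, rather than opening a fresh session per person as before, is presumably what makes the estimate hold: each LimiterAdapter keeps its own in-memory bucket, so a per-person session would start with a clean bucket every time. A minimal, illustrative calculation (not part of the PR):

# Back-of-the-envelope check of the "a little less than 3 hours" comment.
# Numbers taken from the LimiterAdapter settings in rechtspraak_session.py.
judges = 5_000
seconds_per_request = 1 / 0.5  # per_second=0.5 -> one request every 2 s
total_hours = judges * seconds_per_request / 3600
print(f"{total_hours:.2f} h")  # ~2.78 h, i.e. a little less than 3 hours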
10 changes: 7 additions & 3 deletions app/scraper/people/tests/test_enrich.py
@@ -1,5 +1,6 @@
 from datetime import datetime

+from app.scraper.rechtspraak_session import RechtspraakScrapeSession
 from app.scraper.people.enrich_people import enrich_person, person_details_url
 from app.tests.factories import PersonFactory

@@ -10,15 +11,17 @@ def test_removed_at_is_not_set(requests_mock, person):
     )

     assert person.removed_from_rechtspraak_at is None
-    enrich_person(person)
+    with RechtspraakScrapeSession() as session:
+        enrich_person(session, person)
     assert person.removed_from_rechtspraak_at is None


 def test_removed_at_is_set_on_http_error(requests_mock, person):
     requests_mock.get(person_details_url(person.rechtspraak_id), status_code=500)

     assert person.removed_from_rechtspraak_at is None
-    enrich_person(person)
+    with RechtspraakScrapeSession() as session:
+        enrich_person(session, person)
     assert person.removed_from_rechtspraak_at is not None


@@ -30,5 +33,6 @@ def test_removed_at_is_removed_on_successful_scrape(requests_mock):
     )

     assert person.removed_from_rechtspraak_at == dt
-    enrich_person(person)
+    with RechtspraakScrapeSession() as session:
+        enrich_person(session, person)
     assert person.removed_from_rechtspraak_at is None
10 changes: 7 additions & 3 deletions app/scraper/rechtspraak_session.py
@@ -1,5 +1,7 @@
 from requests import Session
-from requests.adapters import HTTPAdapter, Retry
+from requests.adapters import Retry
+
+from requests_ratelimiter import LimiterAdapter


class RechtspraakScrapeSession(Session):
@@ -14,14 +16,16 @@ class RechtspraakScrapeSession(Session):

     def __init__(self):
         super().__init__()
+
         retries = Retry(
             total=3,
             backoff_factor=2,
             raise_on_status=False,
             status_forcelist=tuple(range(401, 600)),
         )
-        self.mount("http://", HTTPAdapter(max_retries=retries))
-        self.mount("https://", HTTPAdapter(max_retries=retries))
+        adapter = LimiterAdapter(per_second=0.5, per_minute=30, burst=1, max_retries=retries)
+        self.mount("http://", adapter)
+        self.mount("https://", adapter)

     def request(self, *args, **kwargs):
         kwargs.setdefault("timeout", 2)
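For context on the new adapter: LimiterAdapter from requests-ratelimiter subclasses requests' HTTPAdapter, which is why it can carry the existing max_retries configuration while also throttling outgoing requests. A minimal usage sketch of the resulting session (the URL is a placeholder, not one from this PR):

from app.scraper.rechtspraak_session import RechtspraakScrapeSession

with RechtspraakScrapeSession() as session:
    # Each call blocks until the limiter grants a slot (at most one
    # request per 2 seconds), retries up to 3 times on statuses
    # 401-599 with exponential backoff, and uses the 2-second default
    # timeout set in request().
    response = session.get("https://example.com/resource")  # placeholder URL
    response.raise_for_status()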