updated uninotes scraper to timeout
eaglethrost committed Nov 15, 2024
1 parent b66aa06 commit ad2b3ca
Showing 1 changed file with 20 additions and 7 deletions.
27 changes: 20 additions & 7 deletions scraper/uniNotes.py
@@ -1,6 +1,8 @@
 from bs4 import BeautifulSoup
 import re
 import requests
+import signal
+import time
 
 website = "UniNotes"
 
@@ -11,20 +13,26 @@
 # Get latest studentvip review ID
 start_id = requests.get(f"http://localhost:3030/api/v1/reviews/scraped/maxId/{website}").json()['maxId']
 
 course_codes = courses
 url_prefix = "https://uninotes.com/university-subjects/university-of-new-south-wales-unsw/"
 review_values = []
 count = 0
 
-for course_code in course_codes:
-    print(f"Processing reviews for {course_code}...")
-    page = requests.get(url_prefix + course_code)
-    soup = BeautifulSoup(page.content, "html.parser")
+# Define a timeout handler
+def handler(signum, frame):
+    raise TimeoutError("Loop iteration timed out")
+
+# Set the timeout duration (in seconds) in case the scraping takes too long
+timeout_duration = 10
+
+for course_code in course_codes:
+    signal.signal(signal.SIGALRM, handler)
+    signal.alarm(timeout_duration)
+
+    try:
+        page = requests.get(url_prefix + course_code)
+        soup = BeautifulSoup(page.content, "html.parser")
+        res = soup.find_all(attrs={"id": re.compile(r'^review')})
 
-    for review in res:
-        author_name = review.find("h2").get_text(strip=True).replace("'", "''")
+        for review in res:
+            author_name = review.find("h2").get_text(strip=True).replace("'", "''")

@@ -47,9 +55,14 @@
             review_values.append(
                 f"('{course_code}', '{website}', '{cur_id}', 'Review #{cur_id}', {rating}, '{description}', '{author_name}', '{term_taken}', '{{}}')"
             )
 
         if len(res) > 0:
             print(f"Scraped reviews for {course_code}...")
+    except TimeoutError:
+        print(f"Iteration timed out, moving to next iteration.")
+    except Exception as e:
+        print(f"Could not process reviews for {course_code}: {str(e)}")
+    finally:
+        signal.alarm(0)  # Disable the alarm after each iteration
 
 # Write SQL statements to a file
 with open('../backend/data/uninotes_reviews.sql', 'w') as f:
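A note on the pattern this commit introduces: signal.alarm(n) asks the OS to deliver SIGALRM after n seconds, and the registered handler converts that signal into a TimeoutError, which the try/except around each loop iteration catches. This only works on Unix and only in the main thread. Below is a minimal self-contained sketch of the same pattern; the slow_fetch stand-in and the 2-second budget are illustrative, not from the repository.

    import signal
    import time

    def handler(signum, frame):
        # SIGALRM fires when the alarm expires; surface it as an exception
        raise TimeoutError("Loop iteration timed out")

    signal.signal(signal.SIGALRM, handler)

    def slow_fetch(seconds):
        # Stand-in (hypothetical) for a network request that may hang
        time.sleep(seconds)
        return "ok"

    for duration in (1, 5):
        signal.alarm(2)  # arm a 2-second alarm before the iteration
        try:
            print(slow_fetch(duration))
        except TimeoutError:
            print(f"Fetch of {duration}s timed out, moving on")
        finally:
            signal.alarm(0)  # disarm so a pending alarm cannot fire later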

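Also worth knowing when reading this change: requests.get accepts a timeout argument, which bounds the connection attempt and each socket read portably and without signals, though it does not cap total elapsed time the way the alarm does. A sketch reusing the scraper's url_prefix and course_code names:

    import requests

    try:
        # timeout=10 limits the connect phase and each read to 10 seconds
        page = requests.get(url_prefix + course_code, timeout=10)
    except requests.exceptions.Timeout:
        print(f"Request for {course_code} timed out, skipping")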