Skip to content

Commit

Permalink
Implement scheduling
Browse files Browse the repository at this point in the history
  • Loading branch information
steffsas committed Jun 18, 2024
1 parent 8f0aaf8 commit b0b6a1a
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 13 deletions.
4 changes: 3 additions & 1 deletion .env
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@
URL=""
DOWNLOAD_FOLDER="downloads"
OUTPUT_FOLDER="output"
LOG_FOLDER="logs"
LOG_FOLDER="logs"
SCHEDULE="true"
SCHEDULE_DAILY_AT="00:15:00"
4 changes: 4 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
FROM python:3.12-alpine

# set the container time zone so the daily schedule (SCHEDULE_DAILY_AT) runs in local time
RUN apk add --no-cache tzdata
ENV TZ=Europe/Berlin

# first layer should contain the dependencies
COPY ./requirements.txt /app/requirements.txt

Expand Down
72 changes: 60 additions & 12 deletions main.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import time
import httpx
import os
import lzma
import re
import pandas as pd
import logging
import sys
import schedule

from pathlib import Path
from datetime import datetime
from bs4 import BeautifulSoup
from dotenv import load_dotenv
Expand All @@ -14,26 +17,46 @@
# Names of environment variables; the same keys appear in the project's .env file.
LOG_FOLDER_ENV = "LOG_FOLDER"  # folder the dated log file is written to
DOWNLOAD_FOLDER_ENV = "DOWNLOAD_FOLDER"  # presumably where downloaded files land -- usage not shown here
OUTPUT_FOLDER_ENV = "OUTPUT_FOLDER"  # presumably where processed output is written -- usage not shown here
SCHEDULE_ENV = "SCHEDULE"  # when set, main() schedules the download instead of running it once
SCHEDULE_DAILY_AT_ENV = "SCHEDULE_DAILY_AT"  # time of day for the daily run, e.g. "00:15:00"

# Fallbacks used when the corresponding environment variable is not set.
DEFAULT_SCHEDULE_TIME = "00:30:00"
DEFAULT_LOG_FOLDER = "logs"

# Module-level logger; handlers and format are configured by prepareLogger().
logger = logging.getLogger(__name__)

def main() -> None:
    """Entry point: run the download once, or keep running it daily.

    Configuration comes from environment variables, optionally supplied
    through a local ``.env`` file. Variables already present in the process
    environment win, because the file is loaded with ``override=False``.

    * ``SCHEDULE`` -- when unset, or set to an explicit "off" value
      (empty, ``0``, ``false``, ``no``, ``off``; case-insensitive), the
      download runs exactly once and this function returns. Any other
      value enables daily scheduling.
    * ``SCHEDULE_DAILY_AT`` -- time of day handed to ``schedule``; when
      unset or empty, ``DEFAULT_SCHEDULE_TIME`` is used and a warning is
      logged.

    In scheduling mode this function loops forever and never returns.
    """
    # Load the .env variables first: prepareLogger() reads LOG_FOLDER from
    # the environment, so configuring logging before this call would ignore
    # a LOG_FOLDER that is only defined in .env.
    load_dotenv(".env", override=False)

    # set logger config
    prepareLogger()

    logger.info("start")

    shouldSchedule = os.getenv(SCHEDULE_ENV)
    # Only an explicit "off" value (or no value at all) disables scheduling;
    # previously SCHEDULE="false" still enabled it because only None was checked.
    scheduleDisabled = (
        shouldSchedule is None
        or shouldSchedule.strip().lower() in ("", "0", "false", "no", "off")
    )
    if scheduleDisabled:
        download()
        return

    scheduleDailyAt = os.getenv(SCHEDULE_DAILY_AT_ENV)
    # Treat an empty value like a missing one instead of passing "" to schedule.
    if not scheduleDailyAt:
        logger.warning(f"no schedule time found, fallback to schedule at {DEFAULT_SCHEDULE_TIME} every day")
        scheduleDailyAt = DEFAULT_SCHEDULE_TIME

    logger.info(f"start scheduling download daily at {scheduleDailyAt}")

    # let's schedule the download every day at the specified time
    schedule.every().day.at(scheduleDailyAt).do(download)

    # Poll once per second for due jobs; runs until the process is stopped.
    while True:
        schedule.run_pending()
        time.sleep(1)

def download():
logging.info("start download")

today = datetime.now().strftime("%Y-%m-%d")

url = os.getenv(URL_ENV)
if url == None:
Expand Down Expand Up @@ -140,6 +163,32 @@ def main() -> None:

logger.info("done")

def prepareLogger():
    """Configure root logging to write to a dated log file and to stdout.

    The log folder is read from the environment variable named by
    ``LOG_FOLDER_ENV``; when that variable is unset or empty,
    ``DEFAULT_LOG_FOLDER`` is used. The folder is created if missing and
    records are written to ``<folder>/<YYYY-MM-DD>.log`` as well as stdout.

    NOTE: the date in the file name is fixed when this function is called,
    so a long-running process keeps writing to the file of its start day.
    """
    today = datetime.now().strftime("%Y-%m-%d")

    # `or` also covers an empty LOG_FOLDER, which would make makedirs("") fail.
    logFolder = os.getenv(LOG_FOLDER_ENV) or DEFAULT_LOG_FOLDER

    # create log folder if not exists (exist_ok avoids the check-then-create race)
    os.makedirs(logFolder, exist_ok=True)

    logFileName = Path(logFolder) / f"{today}.log"

    # FileHandler opens (and thereby creates) the file itself, so no
    # separate touch() is needed.
    fileHandler = logging.FileHandler(filename=logFileName)
    stdoutHandler = logging.StreamHandler(stream=sys.stdout)

    logging.basicConfig(
        handlers=[fileHandler, stdoutHandler],
        level=logging.INFO,
        format='%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
        # Without an explicit datefmt, %(asctime)s already ends in ",mmm",
        # so the %(msecs)03d above would print the milliseconds twice.
        datefmt='%Y-%m-%d %H:%M:%S',
    )

def receiveSoup(url: str) -> BeautifulSoup:
response = httpx.get(url, follow_redirects=True)
if response.status_code != 200:
Expand All @@ -158,7 +207,6 @@ def getSortedLinks(soup: BeautifulSoup) -> list[tuple[str, str]]:
# let's get the latest link
sortedLinks = sorted(links, key=lambda x: x[1], reverse=True)
return sortedLinks


def extractDateFromFilename(filename: str) -> str:
# Define the regular expression pattern to match YYYY-MM-DD format
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ pandas==2.2.2
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
pytz==2024.1
schedule==1.2.2
six==1.16.0
sniffio==1.3.1
soupsieve==2.5
Expand Down

0 comments on commit b0b6a1a

Please sign in to comment.