Week3 submission #22

Open
wants to merge 8 commits into master
10 changes: 6 additions & 4 deletions week_1/project/week_1.py
@@ -1,6 +1,7 @@
import csv
from datetime import datetime
from typing import List
from operator import attrgetter

from dagster import In, Nothing, Out, job, op, usable_as_dagster_type
from pydantic import BaseModel
@@ -51,15 +52,16 @@ def get_s3_data(context):


@op
def process_data():
pass
def process_data(stock_list: List[Stock]):
max_high = max(stock_list, key=attrgetter("high"))
return Aggregation(date=max_high.date, high=max_high.high)


@op
def put_redis_data():
def put_redis_data(aggregation: Aggregation):
pass


@job
def week_1_pipeline():
pass
put_redis_data(process_data(get_s3_data()))
42 changes: 36 additions & 6 deletions week_1/project/week_1_challenge.py
@@ -1,7 +1,9 @@
import csv
from datetime import datetime
import heapq
from typing import List
from operator import attrgetter

from dagster import (
DynamicOut,
@@ -60,16 +62,44 @@ def get_s3_data(context):
return output


@op
def process_data():
pass
@op(
    config_schema={"nlargest": int},
    ins={"stock_list": In(dagster_type=List[Stock])},
    out=DynamicOut(),
    tags={"kind": "Aggregation"},
    description="Yield an Aggregation for each of the nlargest stocks, per the op config",
)
def process_data(context, stock_list):
    n_items = context.op_config["nlargest"]
    # Another way to find the n largest items in a list:
    # largest_stocks = heapq.nlargest(n_items, stock_list,
    #                                 key=lambda stock: stock.high)
    largest_stocks = heapq.nlargest(n_items, stock_list, key=sortkey)
    aggr_list = [Aggregation(date=stock.date, high=stock.high)
                 for stock in largest_stocks]
    context.log.info(aggr_list)
    for idx, aggregation in enumerate(aggr_list):
        yield DynamicOutput(aggregation, mapping_key=str(idx))


@op
def put_redis_data():
@op(description="Upload an Aggregation to Redis",
    tags={"kind": "redis"})
def put_redis_data(context, aggregation: Aggregation) -> None:
    context.log.info(aggregation)


# Define a function that returns a comparison key for Stock objects
def sortkey(stock):
    return stock.high


@job
def week_1_pipeline():
pass
aggregations = process_data(get_s3_data())
aggr_data = aggregations.map(put_redis_data)
# Why do we need this step?
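    # Note: collect() gathers the fan-out from map() into a single fan-in value
    # intended to be passed to a downstream op; since nothing here consumes the
    # collected list, the map() call above would be enough on its own.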
aggr_data.collect()
47 changes: 35 additions & 12 deletions week_2/dagster_ucr/project/week_2.py
@@ -1,30 +1,52 @@
import csv
from operator import attrgetter
from typing import List

from dagster import In, Nothing, Out, ResourceDefinition, graph, op
from dagster_ucr.project.types import Aggregation, Stock
from dagster_ucr.resources import mock_s3_resource, redis_resource, s3_resource


@op
def get_s3_data():
pass
@op(
config_schema={"s3_key": str},
required_resource_keys={"s3"},
out={"stocks": Out(dagster_type=List[Stock],
description="List of Stocks")},
)
def get_s3_data(context):
output = list()
for csv_row in context.resources.s3.get_data(context.op_config["s3_key"]):
stock = Stock.from_list(csv_row)
output.append(stock)
return output


@op
def process_data():
# Use your op from week 1
pass
@op(
ins={"stocks": In(dagster_type=List[Stock])},
out={"highest_stock": Out(dagster_type=Aggregation)},
description="Given a list of stocks, return an Aggregation with the highest value"
)
def process_data(stocks):
highest_stock = max(stocks, key=attrgetter("high"))
aggregation = Aggregation(date=highest_stock.date, high=highest_stock.high)
return aggregation


@op
def put_redis_data():
pass
@op(
ins={"aggregation": In(dagster_type=Aggregation)},
required_resource_keys={"redis"},
description="Given a Aggregation, Upload to Redis"
)
def put_redis_data(context, aggregation):
context.resources.redis.put_data(aggregation.date, str(aggregation.high))


@graph
def week_2_pipeline():
# Use your graph from week 1
pass
put_redis_data(process_data(get_s3_data()))


local = {
@@ -54,7 +76,8 @@ def week_2_pipeline():
local_week_2_pipeline = week_2_pipeline.to_job(
name="local_week_2_pipeline",
config=local,
resource_defs={"s3": mock_s3_resource, "redis": ResourceDefinition.mock_resource()},
resource_defs={"s3": mock_s3_resource,
"redis": ResourceDefinition.mock_resource()},
)

docker_week_2_pipeline = week_2_pipeline.to_job(
31 changes: 24 additions & 7 deletions week_2/dagster_ucr/resources.py
@@ -91,13 +91,30 @@ def mock_s3_resource():
return s3_mock


@resource
def s3_resource():
"""This resource defines a S3 client"""
pass
@resource(
config_schema={
"bucket": Field(String),
"access_key": Field(String),
"secret_key": Field(String),
"endpoint_url": Field(String),
},
description="A resource that can run S3")
def s3_resource(context) -> S3:

return S3(
bucket=context.resource_config["bucket"],
access_key=context.resource_config["access_key"],
secret_key=context.resource_config["secret_key"],
endpoint_url=context.resource_config["endpoint_url"],
)


@resource
def redis_resource():
@resource(
config_schema={
"host": Field(String),
"port": Field(Int),
},
description="A resource that can run Redis")
def redis_resource(context) -> Redis:
"""This resource defines a Redis client"""
pass
return Redis(host=context.resource_config["host"], port=context.resource_config["port"])
110 changes: 90 additions & 20 deletions week_3/project/week_3.py
@@ -1,3 +1,4 @@
from copy import deepcopy
from operator import attrgetter
from typing import List

from dagster import (
@@ -8,6 +9,9 @@
RetryPolicy,
RunRequest,
ScheduleDefinition,
    DefaultScheduleStatus,
    static_partitioned_config,
SkipReason,
graph,
op,
@@ -19,28 +23,47 @@
from project.types import Aggregation, Stock


@op
def get_s3_data():
@op(
config_schema={"s3_key": str},
required_resource_keys={"s3"},
out={"stocks": Out(dagster_type=List[Stock],
description="List of Stocks")},
)
def get_s3_data(context):
# Use your ops from week 2
pass
output = list()
for csv_row in context.resources.s3.get_data(context.op_config["s3_key"]):
stock = Stock.from_list(csv_row)
output.append(stock)
return output


@op
def process_data():
@op(
ins={"stocks": In(dagster_type=List[Stock])},
out={"highest_stock": Out(dagster_type=Aggregation)},
description="Given a list of stocks, return an Aggregation with the highest value"
)
def process_data(stocks):
# Use your ops from week 2
pass
highest_stock = max(stocks, key=attrgetter("high"))
aggregation = Aggregation(date=highest_stock.date, high=highest_stock.high)
return aggregation


@op
def put_redis_data():
@op(
ins={"aggregation": In(dagster_type=Aggregation)},
required_resource_keys={"redis"},
description="Given a Aggregation, Upload to Redis"
)
def put_redis_data(context, aggregation):
# Use your ops from week 2
pass
context.resources.redis.put_data(aggregation.date, str(aggregation.high))


@graph
def week_3_pipeline():
# Use your graph from week 2
pass
put_redis_data(process_data(get_s3_data()))


local = {
@@ -69,8 +92,12 @@ def week_3_pipeline():
}


def docker_config():
pass
@static_partitioned_config(partition_keys=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"])
def docker_config(partition: str):
    # Deep-copy the shared docker config so the module-level dict is not mutated
    update_docker_ops = deepcopy(docker)
    update_docker_ops["ops"]["get_s3_data"]["config"][
        "s3_key"] = f"prefix/stock_{partition}.csv"
    return update_docker_ops
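# Note: with @static_partitioned_config, launching the partitioned job for a
# given partition key calls docker_config(key) to build that run's config, so
# partition "3" reads prefix/stock_3.csv, and so on.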


local_week_3_pipeline = week_3_pipeline.to_job(
@@ -89,14 +116,57 @@ def docker_config():
"s3": s3_resource,
"redis": redis_resource,
},
    op_retry_policy=RetryPolicy(
        max_retries=10,
        delay=1,  # 1 second between retries
    ),
)


local_week_3_schedule = None  # Add your schedule

docker_week_3_schedule = None  # Add your schedule


@sensor
def docker_week_3_sensor():
pass
local_week_3_schedule = ScheduleDefinition(
    job=local_week_3_pipeline, cron_schedule="*/15 * * * *",  # every 15 minutes
    default_status=DefaultScheduleStatus.RUNNING)

docker_week_3_schedule = ScheduleDefinition(
    job=docker_week_3_pipeline, cron_schedule="0 * * * *",  # hourly, on the hour
    default_status=DefaultScheduleStatus.RUNNING)


# Struggled to figure out what this job is
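# The job argument is the pipeline this sensor targets: each RunRequest yielded
# below launches one run of docker_week_3_pipeline for a newly found S3 key.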
@sensor(job=docker_week_3_pipeline, minimum_interval_seconds=30)
def docker_week_3_sensor(context):
new_files = get_s3_keys(
bucket="dagster",
prefix="prefix",
endpoint_url="http://localstack:4566",
since_key=".",
max_keys=1000,
)
    # updated_config = docker_config  # first attempt, see the note inside RunRequest below
if not new_files:
yield SkipReason("No new s3 files found in bucket.")
return
for new_file in new_files:

yield RunRequest(
run_key=new_file,
            # First attempt: updated_config["ops"]["get_s3_data"]["config"]["s3_key"] = new_file
            # with run_config=updated_config -- this fails with a "not subscriptable" error
            # because docker_config is the decorated config object, not a plain dict; see the
            # workaround sketched after this sensor.
run_config={"resources": {
"s3": {
"config": {
"bucket": "dagster",
"access_key": "test",
"secret_key": "test",
"endpoint_url": "http://localstack:4566",
}
},
"redis": {
"config": {
"host": "redis",
"port": 6379,
}
},
},
"ops": {"get_s3_data": {"config": {"s3_key": new_file}}},
},
)
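# A possible workaround for the "not subscriptable" issue noted above (a sketch,
# not verified against this repo): build the run config from the plain `docker`
# dict defined in this module instead of the decorated docker_config, deep-copying
# it so the shared dict is never mutated:
#
#     run_config = deepcopy(docker)
#     run_config["ops"]["get_s3_data"]["config"]["s3_key"] = new_file
#     yield RunRequest(run_key=new_file, run_config=run_config)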
6 changes: 3 additions & 3 deletions week_4/project/repo.py
@@ -15,6 +15,6 @@ def repo():
return [get_s3_data_docker, process_data_docker, put_redis_data_docker]


@repository
def assets_dbt():
pass
# @repository
# def assets_dbt():
# pass