Skip to content

Commit

Permalink
NYT Update: Add statemachine
Browse files Browse the repository at this point in the history
- Add statemachine to update nyt articles and their links to assets
- Fix bug on nyt links
- Clean template.yaml
  • Loading branch information
nico-corthorn committed Nov 3, 2024
1 parent 738ee9a commit 591cb3c
Show file tree
Hide file tree
Showing 4 changed files with 194 additions and 42 deletions.
3 changes: 2 additions & 1 deletion db/nyt/nyt_news_to_asset.sql
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
CREATE TABLE nyt_news_to_asset
(
web_url text NOT NULL,
year_month varchar(6) NOT NULL,
organization text NOT NULL,
asset_ids text NOT NULL,
PRIMARY KEY (web_url)
PRIMARY KEY (web_url, organization)
)
21 changes: 18 additions & 3 deletions esgtools/nyt/nyt_link.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,27 @@
["YouTube.com", ["GOOG-14542","GOOGL-90319"]],
["Facebook Inc", ["META-13407"]],
["Facebook.com", ["META-13407"]],
["WhatsApp Inc", ["META-13407"]],
["Lehman Brothers Holdings Inc", ["LEH-80599"]],
["Wal-Mart Stores Inc", ["WMT-55976"]],
["Tesla Motors Inc", ["TSLA-93436"]],
["Tesla Motors", ["TSLA-93436"]],
["Bear Stearns Cos", ["BSC-68304"]],
["Bear Stearns Companies Incorporated", ["BSC-68304"]],
["Fox News Channel", ["FOXA-18420"]],
["Fox Broadcasting Co", ["FOXA-18420"]],
["Wall Street Journal", ["FOXA-18420"]],
["CNN", ["WBD-0"]],
["Warner Brothers", ["WBD-0"]]
["Nissan Motor Co", ["NSANY-57681"]],
["Berkshire Hathaway Inc", ["BRK-83443"]],
["BlackRock Inc", ["BLK-87267"]],
["Disney, Walt, Co", ["DIS-26403"]],
["iTunes", ["AAPL-14593"]],
["Morgan, J P, Chase & Co", ["JPM-47896"]],
["Bain Capital", ["BCSF-18222"]],
["Nokia Oyj", ["NOK-87128"]],
["Enron Corporation", ["ENE-23317"]]
]


Expand Down Expand Up @@ -187,13 +202,13 @@ def get_news_to_asset_link(nyt_news, org_merged) -> pd.DataFrame:
)

news_to_asset = (
nyt_news_expanded[["web_url", "organization"]]
nyt_news_expanded[["web_url", "year_month", "organization"]]
.merge(
org_merged[["organization", "asset_ids"]],
how="left",
on=["organization"]
)
.drop_duplicates(["web_url", "organization"], keep="first", inplace=True)
.drop_duplicates(["web_url", "organization"], keep="first")
)

return news_to_asset
Expand All @@ -204,6 +219,6 @@ def upload_org_to_asset(self, org_merged: pd.DataFrame):
self.sql_manager.upload_df_chunks(self.org_to_asset_table, org_merged[org_to_asset_cols])

def upload_news_to_asset(self, news_to_asset: pd.DataFrame):
news_to_asset_cols = ["web_url", "organization", "asset_ids"]
news_to_asset_cols = ["web_url", "year_month", "organization", "asset_ids"]
self.sql_manager.query(f"delete from {self.news_to_asset_table}")
self.sql_manager.upload_df_chunks(self.news_to_asset_table, news_to_asset[news_to_asset_cols])
32 changes: 32 additions & 0 deletions statemachine/update_nyt.asl.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"Comment": "State machine to update NYT articles and links",
"StartAt": "UpdateNytArticles",
"States": {
"UpdateNytArticles": {
"Type": "Task",
"Resource": "${UpdateNytArticlesFunctionArn}",
"Next": "UpdateNytLinks",
"Retry": [
{
"ErrorEquals": ["States.ALL"],
"IntervalSeconds": 1,
"MaxAttempts": 3,
"BackoffRate": 2.0
}
]
},
"UpdateNytLinks": {
"Type": "Task",
"Resource": "${UpdateNytLinksFunctionArn}",
"End": true,
"Retry": [
{
"ErrorEquals": ["States.ALL"],
"IntervalSeconds": 1,
"MaxAttempts": 3,
"BackoffRate": 2.0
}
]
}
}
}
180 changes: 142 additions & 38 deletions template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,34 @@ Globals:
Function:
Timeout: 3
Tracing: Active
Runtime: python3.9
CodeUri: esgtools/
Architectures:
- x86_64
Api:
TracingEnabled: True

Resources:
# Log Groups
UpdatePricesStatesLogGroup:
Type: AWS::Logs::LogGroup
Properties:
LogGroupName: /aws/vendedlogs/states/UpdatePrices
RetentionInDays: 30

UpdateAccountingStatesLogGroup:
Type: AWS::Logs::LogGroup
Properties:
LogGroupName: /aws/vendedlogs/states/UpdateAccounting
RetentionInDays: 30

UpdateNytStatesLogGroup:
Type: AWS::Logs::LogGroup
Properties:
LogGroupName: /aws/vendedlogs/states/UpdateNyt
RetentionInDays: 30

# State Machines
UpdatePricesStateMachine:
Type: AWS::Serverless::StateMachine
Properties:
Expand All @@ -31,6 +55,14 @@ Resources:
FunctionName: !Ref UpdatePricesAlphaMonthlyFunction
- LambdaInvokePolicy:
FunctionName: !Ref UpdateReturnsMonthlyFunction
- CloudWatchLogsFullAccess
Type: STANDARD
Logging:
Level: ALL
IncludeExecutionData: true
Destinations:
- CloudWatchLogsLogGroup:
LogGroupArn: !GetAtt UpdatePricesStatesLogGroup.Arn

UpdateAccountingStateMachine:
Type: AWS::Serverless::StateMachine
Expand All @@ -44,105 +76,177 @@ Resources:
FunctionName: !Ref GetAssetsFunction
- LambdaInvokePolicy:
FunctionName: !Ref UpdateAccountingFunction
- CloudWatchLogsFullAccess
Type: STANDARD
Logging:
Level: ALL
IncludeExecutionData: true
Destinations:
- CloudWatchLogsLogGroup:
LogGroupArn: !GetAtt UpdateAccountingStatesLogGroup.Arn

UpdateNytStateMachine:
Type: AWS::Serverless::StateMachine
Properties:
DefinitionUri: statemachine/update_nyt.asl.json
DefinitionSubstitutions:
UpdateNytArticlesFunctionArn: !GetAtt UpdateNytArticlesFunction.Arn
UpdateNytLinksFunctionArn: !GetAtt UpdateNytLinksFunction.Arn
Policies:
- LambdaInvokePolicy:
FunctionName: !Ref UpdateNytArticlesFunction
- LambdaInvokePolicy:
FunctionName: !Ref UpdateNytLinksFunction
- CloudWatchLogsFullAccess
Type: STANDARD
Logging:
Level: ALL
IncludeExecutionData: true
Destinations:
- CloudWatchLogsLogGroup:
LogGroupArn: !GetAtt UpdateNytStatesLogGroup.Arn

# Lambdas
UpdateAssetsFunction:
Type: AWS::Serverless::Function
Properties:
CodeUri: esgtools/
Handler: update_assets.lambda_handler
Runtime: python3.9
Role: arn:aws:iam::654580413909:role/LambdaSecretsManagerReadAccess
Timeout: 600
MemorySize: 500
Architectures:
- x86_64

GetAssetsFunction:
Type: AWS::Serverless::Function
Properties:
CodeUri: esgtools/
Handler: get_assets.lambda_handler
Runtime: python3.9
Role: arn:aws:iam::654580413909:role/LambdaSecretsManagerReadAccess
Timeout: 300
MemorySize: 500
Architectures:
- x86_64
Events:
UpdatePrices:
Type: Api
Properties:
Path: /get-assets
Method: get

UpdatePricesAlphaFunction:
Type: AWS::Serverless::Function
Properties:
CodeUri: esgtools/
Handler: update_prices_alpha.lambda_handler
Runtime: python3.9
Role: arn:aws:iam::654580413909:role/LambdaSecretsManagerReadAccess
Timeout: 900
MemorySize: 1000
Architectures:
- x86_64

UpdatePricesAlphaMonthlyFunction:
Type: AWS::Serverless::Function
Properties:
CodeUri: esgtools/
Handler: update_prices_alpha_monthly.lambda_handler
Runtime: python3.9
Role: arn:aws:iam::654580413909:role/LambdaSecretsManagerReadAccess
Timeout: 900
MemorySize: 1000
Architectures:
- x86_64

UpdateReturnsMonthlyFunction:
Type: AWS::Serverless::Function
Properties:
CodeUri: esgtools/
Handler: update_returns_monthly.lambda_handler
Runtime: python3.9
Role: arn:aws:iam::654580413909:role/LambdaSecretsManagerReadAccess
Timeout: 300
MemorySize: 500
Architectures:
- x86_64

UpdateAccountingFunction:
Type: AWS::Serverless::Function
Properties:
CodeUri: esgtools/
Handler: update_accounting.lambda_handler
Runtime: python3.9
Role: arn:aws:iam::654580413909:role/LambdaSecretsManagerReadAccess
Timeout: 900
MemorySize: 1000
Architectures:
- x86_64

UpdateNytArticlesFunction:
Type: AWS::Serverless::Function
Properties:
CodeUri: esgtools/
Handler: update_nyt_articles.lambda_handler
Runtime: python3.9
Role: arn:aws:iam::654580413909:role/LambdaSecretsManagerReadAccess
Timeout: 600
MemorySize: 500
Architectures:
- x86_64

UpdateNytLinksFunction:
Type: AWS::Serverless::Function
Properties:
CodeUri: esgtools/
Handler: update_nyt_links.lambda_handler
Runtime: python3.9
Role: arn:aws:iam::654580413909:role/LambdaSecretsManagerReadAccess
Timeout: 600
MemorySize: 500
Architectures:
- x86_64

# Schedules
# For UpdatePrices (weekly on Saturday)
UpdatePricesScheduleRule:
Type: AWS::Scheduler::Schedule
Properties:
Name: UpdatePricesWeeklySchedule
Description: "Triggers UpdatePrices Step Function every Saturday"
FlexibleTimeWindow:
Mode: "OFF"
ScheduleExpression: "cron(0 0 ? * SAT *)"
Target:
Arn: !GetAtt UpdatePricesStateMachine.Arn
RoleArn: !GetAtt UpdatePricesScheduleRole.Arn
Input: |
{
"queryStringParameters": {
"ref_table": "prices_alpha",
"size": "compact"
}
}
UpdatePricesScheduleRole:
Type: AWS::IAM::Role
Properties:
AssumeRolePolicyDocument:
Version: "2012-10-17"
Statement:
- Effect: Allow
Principal:
Service: scheduler.amazonaws.com
Action: sts:AssumeRole
Policies:
- PolicyName: StepFunctionExecutionPolicy
PolicyDocument:
Version: "2012-10-17"
Statement:
- Effect: Allow
Action:
- states:StartExecution
Resource: !GetAtt UpdatePricesStateMachine.Arn

# For UpdateNyt (monthly on 2nd)
UpdateNytScheduleRule:
Type: AWS::Scheduler::Schedule
Properties:
Name: UpdateNytMonthlySchedule
Description: "Triggers UpdateNyt Step Function on the 2nd day of every month"
FlexibleTimeWindow:
Mode: "OFF"
ScheduleExpression: "cron(0 0 2 * ? *)"
Target:
Arn: !GetAtt UpdateNytStateMachine.Arn
RoleArn: !GetAtt UpdateNytScheduleRole.Arn
Input: |
{
"queryStringParameters": {
"year_start": 2024,
"clean_table": "False"
}
}
UpdateNytScheduleRole:
Type: AWS::IAM::Role
Properties:
AssumeRolePolicyDocument:
Version: "2012-10-17"
Statement:
- Effect: Allow
Principal:
Service: scheduler.amazonaws.com
Action: sts:AssumeRole
Policies:
- PolicyName: StepFunctionExecutionPolicy
PolicyDocument:
Version: "2012-10-17"
Statement:
- Effect: Allow
Action:
- states:StartExecution
Resource: !GetAtt UpdateNytStateMachine.Arn

0 comments on commit 591cb3c

Please sign in to comment.