-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstore_data.py
236 lines (205 loc) · 8.57 KB
/
store_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
import os
import warnings
import numpy as np
import pandas as pd
import espn_api as epi
# Race-size labels. The selected label is used as the per-year sub-directory
# name, inside the summary-statistics file name, and (title-cased) inside the
# summary table's index-column header.
quad = "4_team"  # chosen when a four-team race is requested
pent = "5_team"  # chosen otherwise; also the default for `summarize_data`
def prep_weekly_results(weekly_result_dict: dict) -> pd.DataFrame:
    """Label every conference column with that conference's weekly score.

    The input is expected to be the dict returned by
    ``espn_api.full_ap_xc_run()`` and must contain two keys:

    * ``'conference_teams_df'``    - DataFrame with one column per conference.
    * ``'conference_scores_dict'`` - mapping of column name -> score.

    Each column name ``col`` is replaced by the tuple ``(col, score)``.
    (pandas stores a list of tuples assigned to ``columns`` as a two-level
    column index.)

    The relabelling is done on a copy: the DataFrame held inside
    ``weekly_result_dict`` keeps its original column names, so the same dict
    can safely be prepared more than once.

    Returns:
        A new DataFrame with ``(column, score)`` tuples as column labels.

    Raises:
        KeyError: If a required key is missing from ``weekly_result_dict`` or
            a column has no entry in the scores mapping.
    """
    teams_df = weekly_result_dict["conference_teams_df"]
    conf_scores = weekly_result_dict["conference_scores_dict"]

    # Fail with a message that names the offending columns instead of a bare
    # KeyError raised from the middle of the comprehension below.
    missing = [col for col in teams_df.columns if col not in conf_scores]
    if missing:
        raise KeyError(f"No conference score found for column(s): {missing}")

    # Work on a copy so the caller's DataFrame is not relabelled as a side effect.
    labelled_df = teams_df.copy()
    labelled_df.columns = [(col, conf_scores[col]) for col in teams_df.columns]
    return labelled_df
def what_week_is_current(week, year):
    """Turn a 'current' week label into a concrete week value.

    The inputs are first normalised with ``epi.date_processing``. Anything
    other than the 'current' marker is returned as-is. For the 'current'
    marker, ``epi.what_week_is_it()`` is consulted (per the original author's
    note, this may report the season's final ranking instead); if it still
    answers 'current', the week is extracted from the ESPN API URL generated
    for that year/week.
    """
    year, resolved_week = epi.date_processing(year, week)

    # Explicit weeks need no further resolution.
    if resolved_week != epi.current:
        return resolved_week

    # 'current' may really be the final ranking of the season.
    year, resolved_week = epi.what_week_is_it()
    if resolved_week == epi.current:
        # Still 'current': pull the week out of the generated URL so the data
        # can be stored under a concrete week.
        api_url = epi.espn_api_url_generator(year, resolved_week)
        resolved_week = epi.extract_week_from_url(api_url)
    return resolved_week
def summarize_data(
    week,
    conference_score_tuple: list,
    n_teams_str: str = pent,
    existing_summary_df: pd.DataFrame = None,
):
    """Insert one week's conference scores into the season summary table.

    The week is normalised to one of the summary row labels (the preseason
    label, ``"Week 2"`` .. ``"Week 16"``, or the final label) and every
    conference score is written into that row. Nothing is written to disk
    here; persisting the returned table is the caller's job.

    Args:
        week: Week identifier; passed through ``epi.date_processing`` for
            normalisation. ``1`` maps to the preseason row, anything above
            16 (or a resolved week of ``"17"``) maps to the final row.
        conference_score_tuple: Sequence of ``(conference_name, score)``
            pairs. A score of ``"DNS"`` or ``"'DNS'"`` is stored as missing.
        n_teams_str: Race-size label; title-cased into the index column
            header ``AP_XC_<label>_Race``.
        existing_summary_df: Summary table to update. It is modified in place
            and returned. When ``None``, a fresh table with one row per
            potential week is created.

    Returns:
        The updated summary DataFrame, or ``None`` (after printing a message)
        when the normalised week is not one of the known row labels.
    """
    preseason_title = epi.preseason.title()
    final_title = epi.final.title()

    # Row labels of the summary table: preseason, weeks 2-16, final.
    potential_weeks = (
        [preseason_title]
        + [f"Week {n}" for n in range(2, 17)]
        + [final_title]
    )

    # date_processing wants a year too; only the week half of its result is
    # used here, so any valid year will do.
    dummy_year = 2014
    week_str = epi.date_processing(dummy_year, week)[1]
    if week_str == epi.current:
        # A 'current' week has to be resolved into a concrete week.
        week_str = epi.extract_week_from_url(
            epi.espn_api_url_generator(dummy_year, week_str)
        )
    # NOTE(review): presumably already a string; coerce so the string
    # comparisons and .title() below cannot fail on a numeric week.
    week_str = str(week_str)

    # Numeric view of the raw week, if it has one. Non-numeric labels and
    # None simply have no numeric value; they must not short-circuit the
    # "17" check below.
    try:
        week_num = int(week)
    except (TypeError, ValueError):
        week_num = None

    # Book-end weeks: week 1 is the preseason poll, anything past 16 is final.
    if week_str == "1" or str(week) == "1":
        week_str = preseason_title
    if (week_num is not None and week_num > 16) or week_str == "17":
        week_str = final_title

    week_str = week_str.title()
    if week_str not in (preseason_title, final_title):
        week_str = f"Week {week_str}"

    if week_str not in potential_weeks:
        print(f"Week '{week_str}' not found in potential weeks. Cannot insert data.")
        return None

    idx_header = f"AP_XC_{n_teams_str.title()}_Race"
    if existing_summary_df is None:
        summary_df = pd.DataFrame({idx_header: potential_weeks})
    else:
        summary_df = existing_summary_df

    # Boolean mask selecting the row for this week.
    week_rows = summary_df[idx_header] == week_str

    for header in conference_score_tuple:
        conf_name = header[0]
        conf_score = header[1]
        # First time this conference is seen: start with an all-missing column.
        if conf_name not in summary_df.columns:
            summary_df[conf_name] = np.nan
        # "Did not score" is recorded as a missing value.
        if conf_score in ("'DNS'", "DNS"):
            conf_score = None
        summary_df.loc[week_rows, conf_name] = conf_score

    return summary_df
def write_weekly_results(
    year, week, prepped_result_df: pd.DataFrame, four_team_race: bool = False
) -> pd.DataFrame:
    """Write one week's results to CSV and fold them into the season summary.

    Files are placed under ``<cwd>/data/<year>/<race label>/`` (the directory
    is created when missing):

    * ``<year>_week_<week>.csv`` - the weekly results as given.
    * ``<year>_<race label>_summary_statistics.csv`` - the season summary,
      loaded if present, updated through ``summarize_data`` and rewritten.

    Args:
        year: Season year; normalised with ``epi.date_processing``.
        week: Week identifier; normalised with ``epi.date_processing`` and
            ``what_week_is_current``.
        prepped_result_df: Weekly results whose column labels are
            ``(conference, score)`` tuples. ``None`` is tolerated: a warning
            is issued and nothing is written.
        four_team_race: Selects the four-team race label instead of the
            five-team one.

    Returns:
        The updated summary DataFrame, or ``None`` when no results were
        supplied or the week could not be placed in the summary. A failure
        while writing the summary file is reported as a warning and the
        in-memory summary is still returned.
    """
    # Manage input dates
    year, week = epi.date_processing(year, week)
    week = what_week_is_current(week=week, year=year)

    # <cwd>/data/<year>/<race label>
    team_dir = quad if four_team_race else pent
    year_dir = os.path.join(os.getcwd(), "data", str(year), team_dir)
    os.makedirs(year_dir, exist_ok=True)

    # Without results there is nothing to store or summarise.
    if prepped_result_df is None:
        warnings.warn(
            f"No results supplied for {year} week {week}; nothing was written."
        )
        return None

    # Weekly results CSV
    week_file = os.path.join(year_dir, f"{year}_week_{week}.csv")
    prepped_result_df.to_csv(week_file, index=False)

    # Season summary CSV: reuse it when it exists, otherwise let
    # summarize_data build a fresh table.
    summary_file = os.path.join(
        year_dir, f"{year}_{team_dir}_summary_statistics.csv"
    )
    existing_summary = (
        pd.read_csv(summary_file) if os.path.exists(summary_file) else None
    )

    the_summary_data = summarize_data(
        week,
        conference_score_tuple=prepped_result_df.columns.to_list(),
        n_teams_str=team_dir,
        existing_summary_df=existing_summary,
    )

    # summarize_data returns None when it cannot map the week to a row.
    if the_summary_data is None:
        warnings.warn(
            f"Week {week!r} could not be placed in the summary; "
            f"{summary_file} was left unchanged."
        )
        return None

    # Best effort: a failed write must not lose the in-memory summary.
    try:
        the_summary_data.to_csv(summary_file, index=False)
    except Exception as e:
        warnings.warn(
            "Unable to write summary data to {}. Error: {}".format(summary_file, str(e))
        )
    else:
        print("Summary data has been successfully written to", summary_file)
    return the_summary_data
def store_weekly_results(
    year: int = None, week=None, four_team_score: bool = False
) -> pd.DataFrame:
    """Fetch, prepare and persist one week's results.

    Runs ``epi.full_ap_xc_run`` for the requested year/week, relabels the
    result with ``prep_weekly_results`` and hands it to
    ``write_weekly_results``, whose return value is passed back unchanged.
    """
    # The flag is passed through epi.string_to_bool before use.
    use_four_teams = epi.string_to_bool(four_team_score)

    weekly_frame = prep_weekly_results(
        epi.full_ap_xc_run(year, week, four_team_score=use_four_teams)
    )

    return write_weekly_results(
        year=year,
        week=week,
        four_team_race=use_four_teams,
        prepped_result_df=weekly_frame,
    )
def store_all_data_2014_to_present():
    """Store weekly results for every season from 2014 through the present year.

    For each year, weeks 1 through 17 are stored twice: once for the
    five-team race and once for the four-team race. A failure for a single
    year/week/race combination is reported and the loop carries on, since
    weeks that have not been played yet are expected to fail.

    Only ``Exception`` subclasses are absorbed, so Ctrl-C (``KeyboardInterrupt``)
    and ``SystemExit`` still stop the run.
    """
    # Only the year is needed to bound the loop.
    year, _ = epi.what_week_is_it()

    for y in range(2014, year + 1):
        for w in range(1, 18):
            for four_team_score in (False, True):
                try:
                    store_weekly_results(
                        year=y, week=w, four_team_score=four_team_score
                    )
                except Exception as err:
                    print(
                        f"Error running {y}'s week {w} AP XC ranking.\n Likely hasn't occured yet."
                    )
                    # Surface the real cause so genuine failures are not
                    # mistaken for "week not played yet".
                    print(f" ({type(err).__name__}: {err})")
if __name__ == "__main__":
    # Other ways to call the storage routine:
    #   store_weekly_results(2021, 1, four_team_score=False)
    #   store_weekly_results()
    #   store_weekly_results(2023, 'preseason', four_team_score=False)
    #   store_weekly_results(2023, 'final', four_team_score=False)
    #
    # To store everything ESPN has on AP Rankings instead:
    #   store_all_data_2014_to_present()
    #
    # Default run: store the most recent results for the four-team race and
    # then the five-team race, and show the five-team summary.
    for four_team in (True, False):
        stored = store_weekly_results(four_team_score=four_team)
    print(stored)