-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfootball_match_winners.py
122 lines (72 loc) · 3.33 KB
/
football_match_winners.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# -*- coding: utf-8 -*-
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score
url = 'https://raw.githubusercontent.com/dataquestio/project-walkthroughs/master/football_matches/matches.csv'
matches = pd.read_csv(url, index_col=0)
matches.shape
matches['team'].value_counts()
matches[matches['team'] == 'Fulham']
matches["round"].value_counts()
matches.dtypes
matches['date'] = pd.to_datetime(matches['date'])
matches.dtypes
matches['venue_code'] = matches['venue'].astype('category').cat.codes
matches['opp_code'] = matches['opponent'].astype('category').cat.codes
matches['hour'] = matches['time'].str.replace(':.+', "", regex=True).astype('int')
matches['day_code'] = matches['date'].dt.dayofweek
matches['target'] = (matches['result'] == 'W').astype('int')
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
train = matches[matches["date"] < '2022-01-01']
test = matches[matches["date"] > '2022-01-01']
predictors = ['venue_code', 'opp_code', 'hour', 'day_code']
rf.fit(train[predictors], train['target'])
preds = rf.predict(test[predictors])
acc = accuracy_score(test['target'], preds)
combined = pd.DataFrame(dict(actual=test['target'], prediction=preds), index=test.index)
pd.crosstab(index=combined['actual'], columns=combined['prediction'])
precision_score(test['target'], preds)
group_matches = matches.groupby('team')
group = group_matches.get_group('Manchester City')
def rolling_averages(group, cols, new_cols):
group = group.sort_values('date')
rolling_stats = group[cols].rolling(3, closed='left').mean()
group[new_cols] = rolling_stats
group = group.dropna(subset=new_cols)
return group
cols = ['gf', 'ga', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt']
new_cols = [f'{c}rolling' for c in cols]
rolling_averages(group, cols, new_cols)
matches_rolling = matches.groupby('team').apply(lambda x: rolling_averages(x, cols, new_cols))
matches_rolling = matches_rolling.droplevel('team')
matches_rolling.index = range(matches_rolling.shape[0])
matches_rolling
def make_predictions(data, predictors):
train = data[data["date"] < '2022-01-01']
test = data[data["date"] > '2022-01-01']
rf.fit(train[predictors], train['target'])
preds = rf.predict(test[predictors])
combined = pd.DataFrame(dict(actual=test['target'], prediction=preds), index=test.index)
precision = precision_score(test['target'], preds)
return combined, precision
combined, precision = make_predictions(matches_rolling, predictors + new_cols)
precision
combined
combined = combined.merge(matches_rolling[['date', 'team', 'opponent', 'result']], left_index=True, right_index=True)
class MissingDict(dict):
__missing__ = lambda self, key: key
map_values = {
'Brighton and Hove Albion': 'Brighton',
'Manchester United': 'Manchester Utd',
'Newcastle United': 'Newcastle Utd',
'Tottenham Hotspur': 'Tottenham',
'West Ham United': 'West Ham',
'Wolverhampton Wanderers': 'Wolves'
}
mapping = MissingDict(**map_values)
combined['new_team'] = combined['team'].map(mapping)
combined
merged = combined.merge(combined, left_on=['date', 'new_team'], right_on=['date', 'opponent'])
merged
merged[(merged["prediction_x"] == 1) & (merged['prediction_y'] == 0)]['actual_x'].value_counts()
print("Accuracy of the model:", acc)