run.py
#
# Feature Engineering - Handling Cyclical Features
# Post: http://blog.davidkaleko.com/feature-engineering-cyclical-features.html
# Data: https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip
#
# 1. Packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
# 2. Load data
df = pd.read_csv('data/hour.csv')
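# (Note: hour.csv ships inside the Bike-Sharing zip linked above; this script
# assumes it has been extracted into the local data/ directory.)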
# 3. EDA
print(df.columns.values)
# 4. Keep only the 'mnth' and 'hr' features, plus the target 'cnt'
df = df[['mnth', 'hr', 'cnt']].copy()
# 5. Number of unique values
print("Unique values of month: ", df.mnth.unique())
print('Unique values of hour:', df.hr.unique())
# 6. The 'magic': map each cyclical feature onto the unit circle as a (sin, cos) pair
df['hr_sin'] = np.sin(df.hr * (2. * np.pi/24))
df['hr_cos'] = np.cos(df.hr * (2. * np.pi/24))
df['mnth_sin'] = np.sin((df.mnth-1) * (2. * np.pi/12))
df['mnth_cos'] = np.cos((df.mnth-1) * (2. * np.pi/12))
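# Quick sanity check (an illustrative addition, not part of the original script):
# in the (sin, cos) encoding, hour 23 and hour 0 sit next to each other on the
# circle, whereas the raw 'hr' column puts them 23 units apart.
hr23 = np.array([np.sin(23 * (2. * np.pi / 24)), np.cos(23 * (2. * np.pi / 24))])
hr0 = np.array([np.sin(0.), np.cos(0.)])
print('Encoded distance between hour 23 and hour 0:', np.linalg.norm(hr23 - hr0))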
# 7. Impact on Model Performance
# 7.1. 'Non-cyclic' features
# 7.1.1. Construct a pipeline with a standard scaler and a small neural network
# (scaling matters here: MLPRegressor converges poorly on unscaled inputs)
model = Pipeline([
    ('standardize', StandardScaler()),
    ('nn', MLPRegressor(hidden_layer_sizes=(5,), max_iter=1000)),
])
# 7.1.2. To begin, let's use only these two features to predict 'cnt' (bicycle count)
features = ['mnth', 'hr']
X = df[features].values
y = df.cnt
# 7.1.3. We'll use 5-fold cross-validation: the data is split into five folds,
# and each fold takes a turn as the 20% test set while the model trains on the
# remaining 80%. The five test sets are mutually disjoint, so every row is
# scored exactly once. (Note that KFold does not shuffle by default.)
kfold = KFold(n_splits=5)
# 7.1.4. Train model
results = cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_squared_error')
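# Note: 'neg_mean_squared_error' returns the negated MSE, so scores closer to
# zero (less negative) are better.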
# 7.1.5. Result
print('CV Scoring Result: mean=', np.mean(results), 'std=', np.std(results))
# 7.2. 'Cyclic' features
# 7.2.1. Features
features = ['mnth_sin', 'mnth_cos', 'hr_sin', 'hr_cos']
X = df[features].values
# 7.2.2. Train model
results = cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_squared_error')
# 7.2.3. Results
print('CV Scoring Result: mean=', np.mean(results), 'std=', np.std(results))
# 8. Improvement
# The hardcoded figures below appear to be the approximate MSEs from the two
# runs above (cyclic vs. non-cyclic); their ratio is ~0.72, i.e. roughly a 28%
# reduction in MSE from the cyclical encoding.
print('MSE ratio (cyclic / non-cyclic):', 22678 / 31401)