forked from ttong-ai/lstm-for-time-series
prepareData.py
# -*- coding: utf-8 -*-
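"""Data preparation utilities for the LSTM time-series model.

Provides feature engineering (createNewFeatures), NaN imputation with
per-column medians (fillNaNs), and robust clipping of extreme values
(removeExtremeValues).
"""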
import pandas as pd
import numpy as np
def createNewFeatures(df_expanded):
    # Feature engineering: hand-crafted features are created here.
    # Typically a domain expert analyses the data and decides which derived
    # features to build; the code below follows such an analysis.
    df_expanded['y_lagged'] = df_expanded.groupby('id')['y'].shift(periods=1)
    df_expanded['technical_diff'] = df_expanded['technical_20'] - df_expanded['technical_30']
    timediffCols = ['technical_diff', 'technical_20', 'technical_30', 'technical_40']
    # Two features per column below, so 8 new features are created.
    for thisColumn in timediffCols:
        kernel = thisColumn.replace('technical_', 'krnl')
        periodicity = thisColumn.replace('technical_', 'delta5')
        grped = df_expanded[['id', thisColumn]].groupby('id')
        # Scaled difference from the previous (per-id) value.
        df_expanded[kernel] = 12.5 * (df_expanded[thisColumn] - 0.92 * grped[thisColumn].shift(periods=1))
        # Difference from the value 5 timestamps earlier (per id).
        df_expanded[periodicity] = df_expanded[thisColumn] - grped[thisColumn].shift(periods=5)
    for thisColumn in ['fundamental_29']:
        crossSectional = thisColumn.replace('fundamental_', 'fmod').replace('technical_', 'tmod')
        # Cross-sectional mean of the column at each timestamp, mapped back to every row.
        perTimestampMean = df_expanded.groupby('timestamp')[thisColumn].mean()
        df_expanded[crossSectional] = df_expanded['timestamp'].map(perTimestampMean)
    return df_expanded

def fillNaNs(df):
    # Fill the remaining NaNs (after extreme values have been clipped)
    # with the per-column median.
    COLs = [c for c in df.columns if c not in ['timestamp', 'y']]
    # Median per column; NaNs are ignored in the computation.
    COLs_median = df[COLs].median()
    print(COLs_median.head())
    # Replace the NaNs with the column medians.
    df = df.fillna(COLs_median)
    return df

def removeExtremeValues(df, insampleCutoffTimestamp):
    # Data cleaning: truncate (clip) extreme values using robust statistics
    # (quartiles), because some columns contain extreme outliers,
    # e.g. derived_1 has a value of 1.06845e+16.
    # 25th and 75th percentiles of each column, computed on the in-sample period only.
    firstQuartile = df[df.timestamp < insampleCutoffTimestamp].quantile(0.25)
    thirdQuartile = df[df.timestamp < insampleCutoffTimestamp].quantile(0.75)
    # Allowed range: 9 interquartile ranges beyond the quartiles.
    interQuartileRange = thirdQuartile - firstQuartile
    allowedFrom = firstQuartile - 9 * interQuartileRange
    allowedTo = thirdQuartile + 9 * interQuartileRange
    nRows, _ = df.shape
    for thisColumn in df.columns:
        if thisColumn not in ('id', 'timestamp', 'y', 'CntNs'):
            # Flag the unusual values.
            indexTooLow = df[thisColumn] < allowedFrom[thisColumn]
            indexTooHigh = df[thisColumn] > allowedTo[thisColumn]
            maxValue = df[thisColumn].max()
            minValue = df[thisColumn].min()
            print('Truncating %s: TooLow %s (%.1f%%), TooHigh %s (%.1f%%), range: %s to %s'
                  % (thisColumn, sum(indexTooLow), 100 * sum(indexTooLow) / nRows,
                     sum(indexTooHigh), 100 * sum(indexTooHigh) / nRows,
                     minValue, maxValue))
            # Clip the column to the allowed range.
            df[thisColumn] = df[thisColumn].clip(allowedFrom[thisColumn], allowedTo[thisColumn])
            # If the clipped column has (almost) no variance left, drop it.
            if abs(df[thisColumn].std()) < 1e-7:
                print('std = ', df[thisColumn].std(), ' dropping it')
                df.drop([thisColumn], axis=1, inplace=True)
            # else:
            #     df[thisColumn] = (df[thisColumn] - df[thisColumn].mean()) / df[thisColumn].std()
        else:
            print('skipping ', thisColumn)
    return df
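

if __name__ == '__main__':
    # Minimal usage sketch (not part of the original script): it assumes a
    # DataFrame with 'id', 'timestamp', 'y', 'technical_*' and 'fundamental_*'
    # columns, i.e. the kind of financial dataset this script expects.
    # The file path, the cutoff timestamp, and the call order (cleaning before
    # imputation before feature engineering) are placeholder assumptions.
    df = pd.read_hdf('train.h5')       # hypothetical input file
    insampleCutoffTimestamp = 906      # hypothetical in-sample cutoff
    df = removeExtremeValues(df, insampleCutoffTimestamp)
    df = fillNaNs(df)
    df = createNewFeatures(df)
    print(df.shape)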