-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_preprocessing.py
152 lines (119 loc) · 5.57 KB
/
data_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 8 09:15:20 2020
@author: rfuchs
"""
from copy import deepcopy
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import autograd.numpy as np
from autograd.numpy.random import uniform
from autograd.numpy import newaxis as n_axis
def gen_categ_as_bin_dataset(y, var_distrib):
    ''' Replace each categorical column of y by one-hot encoded binary columns

    y (numobs x p DataFrame): The observations containing categorical variables
    var_distrib (p 1darray): An array containing the types of the variables in y
    ----------------------------------------------------------------------------
    returns (tuple): The new (numobs x p_new) dataset, in which every
                     categorical column has been removed and its dummy columns
                     (encoder built with drop = 'first') appended on the right,
                     and the matching array of variable types where the new
                     columns are flagged as 'bernoulli'
    '''
    # Work on a copy with a fresh 0..numobs-1 index so that the dummy columns,
    # which carry a default index, line up with the rows when joined
    binarised = deepcopy(y).reset_index(drop = True)
    out_distrib = deepcopy(var_distrib[var_distrib != 'categorical'])

    encoder = OneHotEncoder(drop = 'first')
    categ_positions = np.where(var_distrib == 'categorical')[0]

    for pos in categ_positions:
        column = y.iloc[:, pos]
        col_name = column.name

        encoded = encoder.fit_transform(pd.DataFrame(column)).toarray()
        dummies = pd.DataFrame(encoded)
        n_dummies = len(dummies.columns)
        # Dummy columns are named <original name>_<dummy position>
        dummies.columns = [str(col_name) + '_' + str(label) for label in dummies.columns]

        # Swap the categorical column for its binary counterparts
        del binarised[col_name]
        binarised = binarised.join(dummies.astype(int))
        out_distrib = np.concatenate([out_distrib, ['bernoulli'] * n_dummies])

    return binarised, out_distrib
def ordinal_encoding(sequence, ord_labels, codes):
    ''' Perform label encoding, replacing ord_labels with codes

    Labels are looked up in the original values and each element is recoded at
    most once (by the first label it matches). A code that happens to be equal
    to another label, e.g. labels [1, 2, 3] with codes [2, 3, 4], is therefore
    never recoded a second time.

    sequence (numobs pandas Series): The sequence to encode
    ord_labels (nj_ord_j 1darray): The labels existing in sequences
    codes (nj_ord_j 1darray): The codes used to replace ord_labels
    -----------------------------------------------------------------
    returns (numobs 1darray): The encoded sequence. Elements matching none of
                              the labels are left unchanged
    '''
    raw_values = sequence.values
    # Copy so that the caller's data is never aliased by the returned array
    new_sequence = deepcopy(raw_values)
    # True for the elements that have not been recoded yet
    pending = np.ones(np.shape(raw_values), dtype = bool)

    for i, lab in enumerate(ord_labels):
        # Compare against the untouched values, not the partially encoded ones
        hit = np.logical_and(pending, raw_values == lab)
        new_sequence = np.where(hit, codes[i], new_sequence)
        pending = np.logical_and(pending, np.logical_not(hit))

    return new_sequence
def compute_nj(y, var_distrib):
    ''' Compute nj for each variable y_j

    y (numobs x p DataFrame): The original data
    var_distrib (p 1darray): The type of the variables in the data
    -------------------------------------------------------------------
    returns (tuple (p 1d array, nb_bin 1d array, nb_ord 1d array, nb_categ 1d array)):
        The number of categories of all the variables, then of the
        bernoulli/binomial variables only, of the ordinal variables only and
        of the categorical variables only. Continuous variables get np.inf
        in the first array.
    raises ValueError: If a variable type is none of 'bernoulli', 'binomial',
        'ordinal', 'categorical' or 'continuous'
    '''
    nj, nj_bin, nj_ord, nj_categ = [], [], [], []

    for i in range(len(y.columns)):
        var_type = var_distrib[i]
        column = y.iloc[:, i]

        if var_type in ('bernoulli', 'binomial'):
            # nj is the largest value observed in the column
            max_nj = int(np.max(column, axis = 0))
            nj.append(max_nj)
            nj_bin.append(max_nj)

        elif var_type == 'ordinal':
            # nj is the number of distinct observed levels
            card_nj = len(np.unique(column))
            nj.append(card_nj)
            nj_ord.append(card_nj)

        elif var_type == 'categorical':
            card_nj = len(np.unique(column))
            nj.append(card_nj)
            nj_categ.append(card_nj)

        elif var_type == 'continuous':
            nj.append(np.inf)

        else:
            # Single formatted message rather than a tuple of arguments
            raise ValueError('Data type {} is illegal'.format(var_type))

    return np.array(nj), np.array(nj_bin), np.array(nj_ord), np.array(nj_categ)
def bin_to_bern(Nj, yj_binom, zM_binom):
    ''' Split the binomial variable into Bernoulli draws, then just recopy the corresponding zM.
    It is necessary to fit binary logistic regression
    Example: yj has support in [0,10]: Then if y_ij = 3, each of the 10 Bernoulli generated
    for observation i is a success with probability 3/10 (3 successes among 10 on average).
    y_ij = 0 always gives only zeros and y_ij = Nj always gives only ones.

    Nj (int): The upper bound of the support of yj_binom
    yj_binom (numobs 1darray): The Binomial variable considered
    zM_binom (numobs x r nd-array): The continuous representation of the data
    -----------------------------------------------------------------------------------
    returns (tuple): The "Bernoullied" Binomial variable as a (numobs * Nj) 1darray and
                     zM_binom with each of its rows repeated Nj times
    '''
    numobs = len(yj_binom) # number of observations of the binomial variable

    # Draw Nj Bernoulli(y_ij / Nj) per observation: a uniform draw falls strictly
    # below the success probability p with probability p
    u = uniform(size = (numobs, Nj))
    p = (yj_binom / Nj)[..., n_axis]

    # Row-major flattening keeps the Nj draws of an observation contiguous,
    # which is the row order np.repeat produces for zM_binom
    yk_bern = (u < p).astype(int).flatten()

    return yk_bern, np.repeat(zM_binom, Nj, 0)
def data_processing(y, var_distrib, cast_types = False):
    ''' Label-encode the columns of y that are neither continuous nor binomial

    y (numobs x p DataFrame): The data to format
    var_distrib (p 1darray): The type of each variable of y
    cast_types (bool): Whether to cast every column to the dtype associated
                       with its variable type
    -----------------------------------------------------------------------
    returns (tuple (DataFrame, dict)): The formatted copy of y and the fitted
                       LabelEncoder of each encoded column, keyed by column name
    '''
    target_dtypes = {'continuous': float, 'categorical': str, 'ordinal': float,
                     'bernoulli': int, 'binomial': int}

    n_cols = y.shape[1]
    encoders = {}
    formatted = deepcopy(y)

    #===========================================#
    # Formatting the data
    #===========================================#

    for pos, col in enumerate(formatted.columns):
        # Continuous and binomial columns keep their raw values
        if var_distrib[pos] in ('continuous', 'binomial'):
            continue

        encoder = LabelEncoder()
        formatted[col] = encoder.fit_transform(formatted[col])
        encoders[col] = deepcopy(encoder)

    # Optionally enforce one dtype per variable type
    if cast_types:
        casting = {formatted.columns[j]: target_dtypes[var_distrib[j]] for j in range(n_cols)}
        formatted = formatted.astype(casting)

    return formatted, encoders