-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexploring.py
25 lines (17 loc) · 929 Bytes
/
exploring.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import pandas as pd
# Quick exploration of the training set: row count, number of cows, study
# duration, missing values per cow and number of positive labels per cow.

# Load the features and the labels. Passing `names=['y']` makes pandas treat
# the first line of the label file as data, not as a header.
# NOTE(review): assumes data/yTrain.csv has no header line - confirm.
x = pd.read_csv("data/xTrain.csv", delimiter=',')
y = pd.read_csv("data/yTrain.csv", names=['y'])

# Report the row count that was actually loaded instead of a hard-coded
# figure, so the message stays correct if the CSV changes.
print(f"There are {len(x)} lines of data")

# Put features and labels side by side, aligned on the row index.
df = pd.concat([x, y], axis=1)

# nunique(dropna=False) counts exactly what len(series.unique()) counts
# (a missing id, if any, is counted once).
n_cows = x['idCow'].nunique(dropna=False)
print("\nNumber of different cows:", n_cows)

# NOTE(review): the meaning of the +23 offset is not visible in this file;
# presumably each `data_hour` entry stands for a 24-hour window - verify.
n_hours = x['data_hour'].nunique(dropna=False) + 23
print("\nDuration of the study in hours:", n_hours)

# Per-cow count of missing cells for every non-key column. Computing the
# NaN mask once and summing it per group avoids a Python-level lambda (which
# previously also shadowed the DataFrame `x`).
missing_values = x.drop(columns='idCow').isna().groupby(x['idCow']).sum()
print("\nNumber of missing values for each cow in hours:")
print(missing_values[['all0', 'rest0', 'eat0']])

# Count the rows where at least one column has a missing value.
nb_lignes_manquantes = df.isna().any(axis=1).sum()
print(f"\nThere are {nb_lignes_manquantes} lines with at least one data missing")

# For each cow, count the rows where y equals 1.0.
count_y_by_cow = df[df['y'] == 1.0].groupby('idCow').size()
print("\nNumber of hours during each cow has been sick:")
print(count_y_by_cow)