# PenguinPredictor.py
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

class penguinData():
    """
    Reads in a csv file of penguin data and cleans it up for species prediction
    by the functions logisticRegression() or randomForestClassifier().
    """

    def __init__(self, csv):
        """
        Initialize the class with a user-supplied csv file.

        Args:
            csv: path to a .csv file
        Returns:
            None
        """
        self.csv = csv              # store the path to the csv file
        self.df = pd.read_csv(csv)  # read the csv into a dataframe

    def readData(self):
        """
        Prints a readout of the data.
        """
        print(self.df)

    def dropNAN(self):
        """
        Keeps only the columns used for prediction and drops rows with NaN values.
        """
        try:  # training data, like palmer_penguins.csv, contains a Species column
            self.variables = self.df[['Culmen Length (mm)', 'Body Mass (g)', 'Island', 'Species']]
        except KeyError:  # user data without a Species column
            self.variables = self.df[['Culmen Length (mm)', 'Body Mass (g)', 'Island']]
        self.df = self.variables.dropna(axis=0).copy()  # drop incomplete rows; copy to avoid chained-assignment warnings
    def islandToNum(self):
        """
        Encodes the categorical island names as integers.
        """
        self.le = preprocessing.LabelEncoder()  # make an instance of LabelEncoder
        self.df['Island_num'] = self.le.fit_transform(self.df['Island'])  # assign an integer code to each island
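        # Illustrative note (assumption, not from the original code): LabelEncoder
        # assigns codes in sorted label order, so with the Palmer data this would
        # map Biscoe -> 0, Dream -> 1, Torgersen -> 2 (alphabetical order).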
    def splitTrainTest(self):
        """
        Splits the penguin data into X (predictor variables) and y (target variable).
        If the data is training data (i.e. palmer_penguins.csv), generates X_train, X_test,
        y_train, and y_test; if the data has no Species column, only creates the dataframe
        of X (predictor variables).
        """
        self.dropNAN()      # drop unused columns and NaN values
        self.islandToNum()  # turn island categories into numbers
        self.X = self.df[['Culmen Length (mm)', 'Body Mass (g)', 'Island_num']]  # predictor variables
        if 'Species' in self.df:  # "complete" data with true species labels (e.g. palmer_penguins.csv)
            self.y = self.df['Species']  # target variable
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
                self.X, self.y, test_size=0.2, random_state=2022)  # train/test split of X and y
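
# Minimal usage sketch for penguinData (illustrative only; not part of the original
# workflow). 'palmer_penguins.csv' is the training file referenced elsewhere in this module:
#
#   penguins = penguinData('palmer_penguins.csv')
#   penguins.readData()        # inspect the raw dataframe
#   penguins.splitTrainTest()  # clean, encode islands, and split into train/test
#   print(penguins.X_train.shape, penguins.y_train.shape)
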
def logisticRegression(csv):
    """
    Uses the existing, complete penguin data (palmer_penguins.csv) as the training dataset
    to predict the species of penguins in a dataset without species information, using a
    Logistic Regression model.

    Args:
        csv: a .csv file of penguin data without species information
    Returns:
        Prints the train and test scores of the Logistic Regression model on the complete penguin data.
        Writes a new .csv file with species predictions appended to the data from the original csv
        and returns the predicted dataframe.
    """
    training = penguinData('palmer_penguins.csv')  # training object with known species
    training.splitTrainTest()                      # cleans, encodes islands, and splits into train/test
    actual = penguinData(csv)                      # user data without a Species column
    actual.splitTrainTest()                        # cleans and encodes; only builds actual.X
    LR = LogisticRegression(max_iter=500)          # the default max_iter=100 may not be enough to converge here
    LR.fit(training.X_train, training.y_train)     # fit the model on the training split
    LR_train_score = LR.score(training.X_train, training.y_train)  # score on the training set
    LR_test_score = LR.score(training.X_test, training.y_test)     # score on the test set (compare to spot overfitting)
    speciesPrediction = LR.predict(actual.X)       # predict species for the user data
    actual.df['Species Prediction'] = speciesPrediction  # add a column with the species predictions
    print("Trained Score:", LR_train_score, "\nTest Score:", LR_test_score)  # print trained vs test score
    actual.df.to_csv('Predicted_Species_Logistic_Regression.csv')  # write all data plus predictions to a csv
    return actual.df

def randomForestClassifier(csv):
    """
    Uses the existing, complete penguin data (palmer_penguins.csv) as the training dataset
    to predict the species of penguins in a dataset without species information, using a
    Random Forest Classifier model.

    Args:
        csv: a .csv file of penguin data without species information
    Returns:
        Prints the train and test scores of the Random Forest Classifier on the complete penguin data.
        Writes a new .csv file with species predictions appended to the data from the original csv
        and returns the predicted dataframe.
    """
    training = penguinData('palmer_penguins.csv')  # training object with known species
    training.splitTrainTest()                      # cleans, encodes islands, and splits into train/test
    actual = penguinData(csv)                      # user data without a Species column
    actual.splitTrainTest()                        # cleans and encodes; only builds actual.X
    clf = RandomForestClassifier(max_depth=2, random_state=0)
    clf.fit(training.X_train, training.y_train)    # fit the forest on the training split
    rfc_train_score = clf.score(training.X_train, training.y_train)  # score on the training set
    rfc_test_score = clf.score(training.X_test, training.y_test)     # score on the test set (compare to spot overfitting)
    speciesPrediction = clf.predict(actual.X)      # predict species for the user data
    actual.df['Species Prediction'] = speciesPrediction  # add a column with the species predictions
    print("Trained Score:", rfc_train_score, "\nTest Score:", rfc_test_score)  # print trained vs test score
    actual.df.to_csv('Predicted_Species_Random_Forest_Classifier.csv')  # write all data plus predictions to a csv
    return actual.df
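
# Example entry point (a sketch, not part of the original script). The file name
# 'my_penguins.csv' is a placeholder for a user-supplied csv with the same column
# names as palmer_penguins.csv but without a 'Species' column.
if __name__ == "__main__":
    logisticRegression('my_penguins.csv')      # placeholder input file (assumption)
    randomForestClassifier('my_penguins.csv')  # placeholder input file (assumption)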