-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathdata_generator.py
91 lines (71 loc) · 3.23 KB
/
data_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import pandas as pd
import numpy as np
import scipy.stats
from sklearn.datasets import load_iris
iris = load_iris()
df = pd.DataFrame(iris['data'], columns = iris['feature_names'])
df['target'] = pd.Series(iris['target'], name = 'target_values')
df['target_name'] = df['target'].replace([0,1,2],
['iris-' + species for species in iris['target_names'].tolist()])
def my_distribution(min_val, max_val, mean, std):
scale = max_val - min_val
location = min_val
# Mean and standard deviation of the unscaled beta distribution
unscaled_mean = (mean - min_val) / scale
unscaled_var = (std / scale) ** 2
# Computation of alpha and beta can be derived from mean and variance formulas
t = unscaled_mean / (1 - unscaled_mean)
beta = ((t / unscaled_var) - (t * t) - (2 * t) - 1) / ((t * t * t) + (3 * t * t) + (3 * t) + 1)
alpha = beta * t
# Not all parameters may produce a valid distribution
if alpha <= 0 or beta <= 0:
raise ValueError('Cannot create distribution for the given parameters.')
# Make scaled beta distribution with computed parameters
return scipy.stats.beta(alpha, beta, scale=scale, loc=location)
np.random.seed(100)
def iris_data_creator(iteration=1):
sl_list = []
sw_list = []
pl_list = []
pw_list = []
for i in range(iteration):
min_val_sl = df["sepal length (cm)"].min()
max_val_sl = df["sepal length (cm)"].max()
mean_sl = df["sepal length (cm)"].mean(skipna=True)
std_sl = df["sepal length (cm)"].std(skipna=True)
my_dist_sl = my_distribution(min_val_sl, max_val_sl, mean_sl, std_sl)
sample_sl = my_dist_sl.rvs(size=1).round(1)[0]
sl_list.append(sample_sl)
for i in range(iteration):
min_val_sw = df['sepal width (cm)'].min()
max_val_sw = df['sepal width (cm)'].max()
mean_sw = df['sepal width (cm)'].mean(skipna=True)
std_sw = df['sepal width (cm)'].std(skipna=True)
my_dist_sw = my_distribution(min_val_sw, max_val_sw, mean_sw, std_sw)
sample_sw = my_dist_sw.rvs(size=1).round(1)[0]
sw_list.append(sample_sw)
for i in range(iteration):
min_val_pl = df["petal length (cm)"].min()
max_val_pl = df["petal length (cm)"].max()
mean_pl = df["petal length (cm)"].mean(skipna=True)
std_pl = df["petal length (cm)"].std(skipna=True)
my_dist_pl = my_distribution(min_val_pl, max_val_pl, mean_pl, std_pl)
sample_pl = my_dist_pl.rvs(size=1).round(1)[0]
pl_list.append(sample_pl)
for i in range(iteration):
min_val_pw = df["petal width (cm)"].min()
max_val_pw = df["petal width (cm)"].max()
mean_pw = df["petal width (cm)"].mean(skipna=True)
std_pw = df["petal width (cm)"].std(skipna=True)
my_dist_pw = my_distribution(min_val_pw, max_val_pw, mean_pw, std_pw)
sample_pw = my_dist_pw.rvs(size=1).round(1)[0]
pw_list.append(sample_pw)
random_df = pd.DataFrame()
random_df['sepal_length'] = sl_list
random_df['sepal_width'] = sw_list
random_df['petal_length'] = pl_list
random_df['petal_width'] = pw_list
return random_df.to_json()
# for i in range(1):
# if __name__ == '__main__':
# print(iris_data_creator(iteration=1))