-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCDF.py
50 lines (37 loc) · 1.27 KB
/
CDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import ks_2samp
# Column compared between the original and the synthetic table.
# Columns this script has been pointed at:
#   GPS_longitude, crop_count, age_malehead
attrib = "GPS_longitude"


def _binned_cdf(values, bins=10):
    """Return the histogram-based empirical CDF of ``values``.

    Args:
        values: 1-D numeric data (here, a single DataFrame column).
        bins: Number of equal-width histogram bins.

    Returns:
        A ``(bin_edges, cdf)`` tuple. ``bin_edges`` has ``bins + 1`` entries
        and ``cdf`` has ``bins`` entries; ``cdf[i]`` is the fraction of
        values at or below ``bin_edges[i + 1]``.
    """
    count, bin_edges = np.histogram(values, bins=bins)
    # Normalised counts are the per-bin probability mass; their running
    # sum is the CDF evaluated at each bin's right edge.
    pdf = count / count.sum()
    return bin_edges, np.cumsum(pdf)


# NOTE(review): missing values are replaced with 0 before the comparison, so
# they are counted as real observations at 0 in both the histograms and the
# KS test below -- confirm this is intended for the chosen column.
df = pd.read_csv("./data/farmer_survey.csv")
df = df.fillna(0)
bins_count, cdf = _binned_cdf(df[attrib])

df_1 = pd.read_csv("./synthetic_table/farmer_survey_synthetic_with_privacy.csv")
df_1 = df_1.fillna(0)
bins_count_1, cdf_1 = _binned_cdf(df_1[attrib])

# Two-sample Kolmogorov-Smirnov test on the raw (unbinned) column values.
print(ks_2samp(df[attrib], df_1[attrib]))

# Plot the two CDFs. Feature value goes on the x-axis and cumulative
# probability on the y-axis so the curves agree with the axis labels set
# below. Each table is binned over its own value range, so each curve is
# drawn against its own right bin edges.
plt.plot(bins_count[1:], cdf, label="Original data")
plt.plot(bins_count_1[1:], cdf_1, label="Synthetic data")
plt.legend()
plt.ylabel("CDF")
plt.xlabel("Feature value")
plt.title("Attribute with High Privacy Risk")
plt.savefig("./output/cdf/cdf_high.png")