# HW2.py
import pandas as pd
import numpy as np
import json
from scipy import stats
from collections import Counter
from scipy.stats import entropy
# Randomly sample 10 videos and keep only the attributes this exercise needs.
data = pd.read_csv('Data/USvideos.csv')
samples = data.sample(n=10)
new_df = samples[['views', 'likes', 'comments_disabled', 'ratings_disabled', 'category_id']].copy()
print(new_df)
# Build a lookup from category_id to its human-readable title.
with open('Data/US_category_id.json') as f:
    cat_data = json.load(f)  # renamed so it does not shadow the CSV DataFrame
categories = {}
for item in cat_data['items']:
    categories[item['id']] = item['snippet']['title']
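# Usage sketch: the JSON stores ids as strings while the CSV column is
# numeric, so cast before lookup; 'Unknown' is a placeholder of ours.
for cid in new_df['category_id']:
    print(cid, categories.get(str(cid), 'Unknown'))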
# Pearson correlation between the two numeric attributes.
num_attrib = new_df[['views', 'likes']]
print(num_attrib.corr(method='pearson'))
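# Cross-check sketch: NumPy's corrcoef should agree with pandas' Pearson r,
# r = cov(views, likes) / (std(views) * std(likes)).
print(np.corrcoef(num_attrib['views'], num_attrib['likes'])[0, 1])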
# Contingency table of the two boolean attributes.
crosstab = pd.crosstab(new_df['comments_disabled'], new_df['ratings_disabled'])
print(crosstab)
chi2, p_value, dof, freqs = stats.chi2_contingency(crosstab)
print(chi2)
# The chi-squared statistic measures the discrepancy between the observed and
# the expected (under independence) frequencies; a value at or near 0 suggests
# little or no association between these two attributes.
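# The p-value is usually easier to read than the raw statistic: with the
# conventional 0.05 threshold, p > 0.05 means independence cannot be rejected.
print("p-value = {0}".format(p_value))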
def normalize(col):
    """Min-max normalize a column to [0, 1]; assumes max > min."""
    col_min = np.min(col)  # avoid shadowing the built-ins min/max/range
    col_max = np.max(col)
    col_range = col_max - col_min
    return [(item - col_min) / col_range for item in col]

new_df['views'], new_df['likes'] = normalize(new_df['views']), normalize(new_df['likes'])
print(new_df)
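# Equivalent vectorized sketch (assumes max > min, otherwise division by zero):
# new_df['views'] = (new_df['views'] - new_df['views'].min()) / (new_df['views'].max() - new_df['views'].min())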
# Discretize into three equal-width bins (pd.cut splits the value range into
# equal widths; equal-frequency binning would be pd.qcut).
new_df['views'] = pd.cut(new_df['views'], 3, labels=["Low", "Medium", "High"])
new_df['likes'] = pd.cut(new_df['likes'], 3, labels=["Low", "Medium", "High"])
print(new_df)
# Entropy of the target attribute category_id: H(C) = -sum_i p_i * log2(p_i).
pd_target_attr_category_id = Counter(new_df['category_id'])
print(pd_target_attr_category_id)
print(len(pd_target_attr_category_id))
p = []
for key, val in pd_target_attr_category_id.items():
    p.append(val / 10)  # 10 sampled rows, so count / 10 is a probability
print(p)
entropy_target_attr_category_id = entropy(p, base=2)
print(entropy_target_attr_category_id)
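# Upper-bound check: with 10 samples the entropy is at most log2(10) ~= 3.32
# bits, reached only when all ten category_ids are distinct.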
# Crosstab of the discretized views against category_id.
ct = pd.crosstab(new_df['views'], new_df['category_id'])
print(ct)
num_view_labels = len(ct.index)  # number of views levels in the table
print(num_view_labels)
view_level_Pds = []
for view_label in range(num_view_labels):
    temp = []
    for label, content in ct.items():  # DataFrame.iteritems() was removed in pandas 2.0
        print(label, content.iloc[view_label])
        temp.append(content.iloc[view_label] / len(pd_target_attr_category_id))
    view_level_Pds.append(temp)
print(view_level_Pds)
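# Note: scipy.stats.entropy normalizes its input to sum to 1, so dividing by
# the constant len(pd_target_attr_category_id) above does not change H.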
# Entropy of category_id within each views level: H(C | views = v).
entropy_category_vs_views = []
for arr in view_level_Pds:
    entropy_category_vs_views.append(entropy(arr, base=2))
print(entropy_category_vs_views)
# Information gain: IG(views) = H(C) - sum_v P(views=v) * H(C | views=v).
# Iterate the crosstab rows so each weight lines up with the matching entry of
# entropy_category_vs_views (Counter iteration follows first appearance in the
# data, which need not match the crosstab's row order).
sum_views_entropy = 0
for idx, (view_label, row) in enumerate(ct.iterrows()):
    count = row.sum()
    if count == 0:
        continue  # an unobserved level contributes nothing to the sum
    print("{0}: {1}/10".format(view_label, count))
    sum_views_entropy = sum_views_entropy + (count / 10) * entropy_category_vs_views[idx]
print("sum_views_entropy = {0}".format(sum_views_entropy))
print("information gain = {0} - {1}".format(entropy_target_attr_category_id, sum_views_entropy))
information_gain = entropy_target_attr_category_id - sum_views_entropy
print("information gain = {0}".format(information_gain))
# Repeat the procedure for the likes attribute.
ct2 = pd.crosstab(new_df['likes'], new_df['category_id'])
num_likes_labels = len(ct2.index)
likes_level_Pds = []
for likes_label in range(num_likes_labels):
    temp = []
    for label, content in ct2.items():
        temp.append(content.iloc[likes_label] / len(pd_target_attr_category_id))
    likes_level_Pds.append(temp)
# Entropy of category_id within each likes level.
entropy_category_vs_likes = []
for arr in likes_level_Pds:
    entropy_category_vs_likes.append(entropy(arr, base=2))
print(entropy_category_vs_likes)
# Weight each likes level by its share of the 10 samples, again iterating the
# crosstab rows so weights and entropies stay aligned.
sum_likes_entropy = 0
for idx, (likes_label, row) in enumerate(ct2.iterrows()):
    count = row.sum()
    if count == 0:
        continue
    sum_likes_entropy = sum_likes_entropy + (count / 10) * entropy_category_vs_likes[idx]
information_gain = entropy_target_attr_category_id - sum_likes_entropy
print(information_gain)
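# Comparison sketch (ours, not required by the assignment): the attribute with
# the larger information gain is the better first split for a decision tree.
gain_views = entropy_target_attr_category_id - sum_views_entropy
gain_likes = entropy_target_attr_category_id - sum_likes_entropy
print("better split: {0}".format("views" if gain_views >= gain_likes else "likes"))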