-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetCategories.py
37 lines (31 loc) · 1.26 KB
/
getCategories.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import operator
import readYelp
import justTry
from sklearn.cluster import KMeans
def getCategoriesGT(i=0, k=300):
    """Return the categories that are named by more than ``k`` businesses.

    Args:
        i: Forwarded unchanged as the second argument of ``readYelp.readY``
            (presumably a limit on the records read -- TODO confirm
            against readYelp).
        k: Exclusive lower bound on how many businesses must name a
            category for it to be returned.

    Returns:
        A list of category names, each encoded as UTF-8, ordered from the
        least to the most frequently named category.
    """
    # readY returns a pair; only its second element (the business records)
    # is needed here.
    (_, businesses) = readYelp.readY(
        '../yelp/yelp_academic_dataset_business.json', i)

    counts = {}
    for business in businesses:
        # "or ()" also covers records whose "categories" value is None,
        # which would otherwise make the loop raise a TypeError.
        for cat in business.get("categories") or ():
            counts[cat] = counts.get(cat, 0) + 1

    # dict.items() exists on both Python 2 and Python 3, whereas
    # dict.iteritems() is Python 2 only.
    ranked = sorted(counts.items(), key=operator.itemgetter(1))
    return [cat.encode('utf-8') for (cat, num) in ranked if num > k]
def getCategoriesTOP(i=0, n=32):
    """Return the ``n`` most frequently named categories.

    Args:
        i: Forwarded unchanged as the second argument of ``readYelp.readY``
            (presumably a limit on the records read -- TODO confirm
            against readYelp).
        n: Number of categories to return.

    Returns:
        A list of category names ordered from the least to the most
        frequently named. When the data contains 100 or fewer distinct
        categories, all of them are returned regardless of ``n``.
    """
    # readY returns a pair; only its second element (the business records)
    # is needed here.
    (_, businesses) = readYelp.readY(
        '../yelp/yelp_academic_dataset_business.json', i)

    counts = {}
    for business in businesses:
        # "or ()" also covers records whose "categories" value is None,
        # which would otherwise make the loop raise a TypeError.
        for cat in business.get("categories") or ():
            counts[cat] = counts.get(cat, 0) + 1

    # dict.items() exists on both Python 2 and Python 3, whereas
    # dict.iteritems() is Python 2 only.
    ranked = sorted(counts.items(), key=operator.itemgetter(1))
    names = [cat for (cat, num) in ranked]

    # NOTE(review): small category sets are returned whole; the threshold
    # of 100 is kept from the previous implementation -- confirm intent.
    if len(names) <= 100:
        return names

    # A slice starting at -0 would select the whole list, so handle
    # non-positive n explicitly.
    if n <= 0:
        return []

    # The last n entries are the n highest counts. The earlier slice
    # started at -n + 1 and therefore yielded only n - 1 categories.
    return names[-n:]