-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathapriori.py
executable file
·79 lines (69 loc) · 1.84 KB
/
apriori.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def createCandidateSet(data):
    """Build the size-1 candidate itemsets from a collection of transactions.

    Args:
        data: Iterable of transactions, each an iterable of hashable items.
            The items must be mutually orderable because they are sorted.

    Returns:
        A list containing one single-element ``frozenset`` per distinct item,
        ordered by the sorted order of the items.
    """
    # A set de-duplicates in O(1) per item instead of rescanning a list of
    # everything seen so far for each item.
    unique_items = {itm for row in data for itm in row}
    return [frozenset([itm]) for itm in sorted(unique_items)]
def scanData(data, candidateSet, minSupport):
    """Count how many transactions contain each candidate itemset.

    Args:
        data: Iterable of transactions, each a set-like collection of items.
        candidateSet: Iterable of ``frozenset`` candidates to look for.
        minSupport: Minimum absolute number of transactions that must contain
            a candidate for it to be reported as valid.

    Returns:
        A ``(valid, subsetCount)`` tuple. ``subsetCount`` maps every candidate
        found in at least one transaction to the number of transactions
        containing it (including candidates below ``minSupport``). ``valid``
        lists the candidates whose count is at least ``minSupport``, most
        recently first-seen candidate first.
    """
    # Collapse repeated candidates so that a duplicate in candidateSet cannot
    # be counted more than once for the same transaction.
    candidates = list(dict.fromkeys(candidateSet))

    subsetCount = {}
    for curSet in data:
        for cand in candidates:
            if cand.issubset(curSet):
                subsetCount[cand] = subsetCount.get(cand, 0) + 1

    valid = [cand for cand, sup in subsetCount.items() if sup >= minSupport]
    # Callers historically received the survivors in reverse first-seen order
    # (they were prepended one by one); a single reverse keeps that order
    # without the quadratic cost of repeated insert(0, ...).
    valid.reverse()
    return valid, subsetCount
def genApriori(freqSets, k):
    """Generate size-``k`` candidate itemsets by joining smaller itemsets.

    Two itemsets are joined (unioned) when their first ``k - 2`` items, taken
    in sorted order, are equal. For ``k == 2`` the prefix is empty, so every
    pair of itemsets is joined.

    Args:
        freqSets: Sequence of ``frozenset`` itemsets, normally the frequent
            itemsets of size ``k - 1``. Items within a set must be mutually
            orderable.
        k: Size of the itemsets to generate.

    Returns:
        A list of ``frozenset`` unions, in the order the pairs are visited.
    """
    # Sort BEFORE slicing: set iteration order is arbitrary, so slicing the
    # raw iteration order would compare arbitrary items and could miss or
    # duplicate candidates. Computing each prefix once also avoids redoing
    # the work for every pair.
    prefixes = [sorted(itemset)[:k - 2] for itemset in freqSets]

    valid = []
    nFreqSets = len(freqSets)
    for i in range(nFreqSets):
        for j in range(i + 1, nFreqSets):
            # if first k-2 elements are equal
            if prefixes[i] == prefixes[j]:
                valid.append(freqSets[i] | freqSets[j])  # union
    return valid
def apriori(data, minSupport):
    """Run the level-wise Apriori search for frequent itemsets.

    Args:
        data: Sequence of transactions, each an iterable of hashable items.
        minSupport: Minimum absolute number of transactions that must contain
            an itemset for it to be considered frequent.

    Returns:
        A ``(lstCands, subsetCounts)`` tuple. ``lstCands[i]`` is the list of
        frequent itemsets of size ``i + 1``; the final entry is always an
        empty list, marking the level at which the search stopped.
        ``subsetCounts`` maps every counted candidate (frequent or not) to
        the number of transactions containing it.
    """
    candSet = createCandidateSet(data)
    setData = list(map(set, data))
    freqSets, subsetCounts = scanData(setData, candSet, minSupport)
    lstCands = [freqSets]
    k = 2
    while lstCands[k - 2]:
        candSetX = genApriori(lstCands[k - 2], k)
        lstCandsX, subsetCountsX = scanData(setData, candSetX, minSupport)
        subsetCounts.update(subsetCountsX)
        # Keep only the candidates that met minSupport. Storing the raw
        # candidates here would report infrequent itemsets as results and
        # use them to seed the next level.
        lstCands.append(lstCandsX)
        k += 1
    return lstCands, subsetCounts
# read in data: one transaction per line, items separated by commas
data = []
dataSetFilename = 'Dataset-apriori.txt'
with open(dataSetFilename, 'r') as dataFile:
    for line in dataFile:
        line = line.strip()
        # Skip blank lines (e.g. a trailing newline at the end of the file);
        # otherwise they would become a transaction holding the item ''.
        if line:
            data.append(line.split(','))

# raw_input only exists on Python 2; Python 3 renamed it to input.
try:
    readLine = raw_input
except NameError:
    readLine = input

print("What min. support do you want to use? ")
# The threshold is an absolute transaction count; int() raises ValueError
# if the reply is not a whole number.
minSupp = int(readLine())
print("\n**** Apriori with minSupport = {} ****".format(minSupp))

# call apriori
sets, counts = apriori(data, minSupp)

print("\nSets:\n")
for x in sets:
    for y in x:
        print(y)

print("\nCounts:\n")
for k, v in counts.items():
    print(k, v)