-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpageview_counter.py
78 lines (66 loc) · 1.79 KB
/
pageview_counter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import gzip, os, sys, urllib2
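'''
@author gibbons4, dvetrano
May 11, 2012
Finds a creation week for each article (edit timestamp in 604800-second
units, not a calendar month/year).
Counts pageviews per language and creation week.
Writes one tab-separated "week<TAB>views" file per language to ./views-by-lang/.
'''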
# Load creation dates: build creation_dates[(lang, name)] -> week number from
# every file in ./revision-tuples/. Each line is expected to hold at least
# five space-separated fields: lang, url-quoted name, t0, tEdit, isBot.
SECONDS_PER_WEEK = 604800  # 7 * 24 * 60 * 60

languages = []        # file names found in ./revision-tuples/ (one per language)
creation_dates = {}   # (lang, name) -> tEdit // SECONDS_PER_WEEK
for lang_file in os.listdir('./revision-tuples/'):
    languages.append(lang_file)
    print("process language " + lang_file)
    # `with` closes the input file even if a malformed line raises.
    with open('./revision-tuples/' + lang_file, 'r') as tuples_file:
        for line in tuples_file:
            parts = line.rstrip('\n').split(' ')
            lang = parts[0]
            name = urllib2.unquote(parts[1]).strip()
            # NOTE(review): t0 is parsed (so a non-integer value still raises)
            # but is not used anywhere below -- confirm it is not the value
            # that should feed creation_dates.
            t0 = int(parts[2])
            tEdit = int(parts[3])
            isBot = int(parts[4])
            # NOTE(review): rows whose flag is 0 are skipped, so only rows with
            # a non-zero isBot can set a creation date -- confirm the polarity.
            if isBot == 0:
                continue
            # Only the first qualifying row per article is kept; presumably the
            # input is ordered oldest-first -- verify against the data producer.
            if (lang, name) not in creation_dates:
                # Explicit floor division: same result as Python 2 int `/`.
                creation_dates[(lang, name)] = tEdit // SECONDS_PER_WEEK
# urllib2.unquote is urllib.unquote on Python 2; Python 3 moved it to
# urllib.parse. Resolving it here keeps this function importable on both.
try:
    from urllib import unquote as _unquote
except ImportError:
    from urllib.parse import unquote as _unquote


def parsePageviewLine(line):
    """Parse one whitespace-separated pageview line.

    The first three fields are read as project code, url-quoted page name
    and view count; any further fields are ignored.

    Returns a ``(lang, name, views)`` tuple. ``(None, '', 0)`` is returned,
    so the caller can skip the line, when:
      * the project code contains a '.',
      * the line has fewer than three fields (including blank lines), or
      * the view count is not an integer.
    """
    parts = line.split()
    # Guard against blank/truncated lines before indexing into `parts`.
    if len(parts) < 3 or '.' in parts[0]:
        return None, '', 0
    try:
        views = int(parts[2])
    except ValueError:
        return None, '', 0
    # The code has no '.', so it is used as the language key unchanged.
    return parts[0], _unquote(parts[1]).strip(), views
# Sum pageviews into total_counts[lang][creation bucket]. Views of pages with
# no entry in creation_dates are only tallied in `missed`.
total_counts = {}   # lang -> {creation bucket -> summed views}
found = 0           # views attributed to a page with a known creation date
missed = 0          # views of pages absent from creation_dates
for fd in os.listdir('./pageviews'):
    # Single-argument print() emits the same text as the Python 2 statement
    # `print 'started: ', fd` (items joined by one space).
    print("%s %s" % ('started: ', fd))
    views_file = gzip.open('./pageviews/' + fd)
    try:
        for line in views_file:
            lang, name, views = parsePageviewLine(line)
            if lang is None:
                continue
            # Register the language before the lookup so that it is present
            # in total_counts even when every one of its pages is missed.
            lang_counts = total_counts.setdefault(lang, {})
            key = (lang, name)
            if key not in creation_dates:
                missed += views
                continue
            creation_date = creation_dates[key]
            lang_counts[creation_date] = lang_counts.get(creation_date, 0) + views
            found += views
    finally:
        # Release the gzip handle even if a line fails to parse.
        views_file.close()
    print("%s %s" % ('finished: ', fd))
    # NOTE(review): this stops after the first directory entry, so only one
    # pageview file is ever counted. It looks like a debugging leftover --
    # confirm before removing.
    break
print("%s %s %s %s" % ('Found: ', found, 'Missed: ', missed))
# Write one file per language into ./views-by-lang/, each line being
# "<creation bucket>\t<summed views>".
print('Write output files')
out_dir = './views-by-lang/'
# Create the target directory up front so a missing folder does not throw
# away the counts gathered above.
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
for lang in languages:
    # Languages that never appeared in the pageview data get no file.
    if lang not in total_counts:
        continue
    # `with` closes the file even if a write fails part-way through.
    with open(out_dir + lang, 'w') as f:
        # Sorted so the output order is deterministic across runs.
        for date in sorted(total_counts[lang]):
            f.write(str(date) + '\t' + str(total_counts[lang][date]) + '\n')