-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreducer_article_age.py
68 lines (57 loc) · 1.96 KB
/
reducer_article_age.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python
"""Reducer that accumulates per-week summary statistics on the age of edits.

@author : susan_biancani

Outputs tab-delimited text: week, followed by summary stats for the age of
the edits in that week.
Summary stats = count, sum, sumsq
(Using count, sum and sumsq, you can calculate the mean and the standard
deviation.)
"""
import sys

# Number of seconds in one week: 60 s * 60 min * 24 h * 7 days.
SECONDS_PER_WEEK = 60*60*24*7
def findFirstEdit(editTimes):
    """Return the smallest value in editTimes, i.e. the earliest edit time.

    Raises ValueError (propagated from min) when editTimes is empty.
    """
    earliest = min(editTimes)
    return earliest
def collectWeekData(weeks, editTimes):
    """Fold one group of edit times into the running per-key statistics.

    The age of each edit is its distance from the earliest value in
    editTimes. For every edit, the entry of ``weeks`` keyed by that edit's
    own value is updated in place.

    weeks     -- dict mapping an edit-time value to a three-element list
                 [count, sum(age), sumsq(age)]; missing entries are created
                 as [0.0, 0.0, 0.0]. Mutated in place.
    editTimes -- sequence of numeric edit times; the script passes the week
                 numbers of all edits of one page. An empty sequence leaves
                 ``weeks`` untouched.

    Returns None.
    """
    if not editTimes:
        return
    # The earliest edit in the group is the reference point for every age.
    firstEdit = min(editTimes)
    for edit in editTimes:
        age = edit - firstEdit
        # setdefault replaces dict.has_key, which no longer exists on Python 3.
        stats = weeks.setdefault(edit, [0.0, 0.0, 0.0])
        stats[0] += 1
        stats[1] += age
        previousSumSq = stats[2]
        stats[2] += age * age
        # Sanity check: adding a square must never make the running sum of
        # squares smaller. Each report is newline-terminated so consecutive
        # messages do not run together on stderr.
        if previousSumSq > stats[2]:
            sys.stderr.write(
                "Underflow on " + str(firstEdit) + " " + str(edit) + "\n")
# Column positions of a revision record. Schema defined at:
# https://github.com/whym/RevDiffSearch/blob/master/README.rst
get = {'rev_id':0, 'page_id':1, 'namespace':2, 'title':3, 'timestamp':4,
       'comment':5, 'minor':6, 'user_id':7, 'user_text':8}

# Input rows are read one position to the right of the schema index.
# NOTE(review): presumably each row carries one extra leading column (such as
# a streaming key) ahead of the schema fields -- confirm against the mapper.
pageColumn = get['page_id'] + 1
timestampColumn = get['timestamp'] + 1

# Each week is a list: [count, sum(age of edits), sumsq(age of edits)]
weeks = {}
# Week numbers of the edits seen so far for the page currently being read.
editTimes = []
# Page id of the previous row; None until the first row has been read.
lastPage = None

# Rows of one page are expected to arrive contiguously: every change of page
# id closes the previous group, so a page that shows up again later is
# treated as a new group.
for line in sys.stdin:
    fields = line.strip('\n').split('\t')
    page = fields[pageColumn]
    timestamp = fields[timestampColumn]
    # Floor division keeps the week number an integer on Python 2 and 3 alike.
    weekTimestamp = int(timestamp) // SECONDS_PER_WEEK
    if page != lastPage:
        # A new page starts: flush the finished page, unless this is the very
        # first row and there is nothing to flush yet.
        if lastPage is not None:
            collectWeekData(weeks, editTimes)
        lastPage = page
        editTimes = []
    # Collect the edit time of this revision.
    editTimes.append(weekTimestamp)

# Flush the edits of the last page read.
collectWeekData(weeks, editTimes)

# One tab-delimited line per week: week, count, sum, sumsq.
for week in weeks:
    output = [str(week)] + [str(w) for w in weeks[week]]
    print('\t'.join(output))