-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathrun_fixer.py
149 lines (105 loc) · 3.78 KB
/
run_fixer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import os, glob, json, codecs, time
from src.word_fix import fix_repo
# Skip any search result whose hit count exceeds this threshold; such a
# large count may represent GitHub's automatic "fixes".
big_word_count = 500

# Load the blacklisted users, these users WON'T be checked.
BLACKLIST = {}
with open("blacklists/users.txt") as FIN:
    # Blank lines are dropped so the empty string is never treated as a user.
    BLACKLIST["users"] = set(line.strip() for line in FIN if line.strip())

# Use the parsed version
#f_wordlist = "wordlists/wikipedia_list.txt"
f_wordlist = "wordlists/parsed_wikipedia_list.txt"
FLAG_USING_FILTER = False

# Total number of corrections to run in one batch
max_total_corrections = 20**10
#max_total_corrections = 1

F_SEARCH = sorted(glob.glob("search_data/*"))

# Submission log: one whitespace-separated "<word> <user/repo> <unix time>"
# record per line.
f_logfile = "logs/submitted.log"

# The log directory has to exist before the log is read or appended to, so
# it is created first (and without shelling out to `mkdir -p`).
log_dir = os.path.dirname(f_logfile)
if not os.path.isdir(log_dir):
    os.makedirs(log_dir)

# Read the log once and build both views of it:
#   LOGS                   -> (word, full_name) pairs that were already handled
#   BLACKLIST["submitted"] -> full names that already received a submission
#                             (for now everybody only gets one!)
LOGS = set()
BLACKLIST["submitted"] = set()
if os.path.exists(f_logfile):
    # On a first run there is no log yet; both sets simply stay empty.
    with open(f_logfile, 'r') as FIN:
        for line in FIN:
            # A blank line carries no record and would break the unpacking.
            if not line.strip():
                continue
            word, full_name, timestamp = line.split()
            LOGS.add((word, full_name))
            BLACKLIST["submitted"].add(full_name)

# Open the logfile for appending (this creates it when it is missing).
F_LOG = open(f_logfile, 'a')
# Load the wordlist: every record has the form "<bad>-><good>" and is stored
# as corrections[bad] = good.
corrections = {}
with open(f_wordlist) as FIN:
    for line in FIN:
        line = line.strip()

        # A blank (e.g. trailing) line holds no mapping and would otherwise
        # break the two-way unpacking below.
        if not line:
            continue

        bad, good = line.split('->')

        # Skip words with multiple mappings
        if ',' in good:
            continue

        # NOTE: filtering against a "clean" word list (FLAG_USING_FILTER)
        # is currently disabled, so every single-mapping word is kept.
        corrections[bad] = good
def load_word_file(f):
    """Read a UTF-8 encoded JSON file and return the decoded object.

    Args:
        f: Path of the JSON file to read.

    Returns:
        The Python object produced by parsing the file's JSON content.

    Raises:
        IOError / OSError: If the file cannot be opened.
        ValueError: If the content is not valid JSON.
    """
    # Imported locally so the block does not depend on the file-level
    # imports.  ``io.open`` decodes text the same way on Python 2 and 3,
    # whereas ``codecs.open`` is deprecated on recent Python 3 releases.
    import io

    with io.open(f, 'r', encoding='utf-8') as FIN:
        return json.load(FIN)
# Main driver: walk every saved search result (one file per misspelled word)
# and try to fix that word in each repository listed in the result.
total_corrections = 0

# try/finally guarantees the submission log is closed even when fix_repo
# (or anything else in the loop) raises.
try:
    for f in F_SEARCH:

        # Keep track of the "no-edits", this may mark github's autocorrect
        no_edit_counter = 0

        # Stop as soon as the batch limit is reached (">=" so that a limit
        # of N never allows an (N+1)-th correction).
        if total_corrections >= max_total_corrections:
            break

        js = load_word_file(f)
        count = js["total_count"]

        # The search-result file is named after the word it was searched for.
        word = os.path.basename(f)

        if not count:
            continue

        if count > big_word_count:
            continue

        if word not in corrections:
            continue

        if len(word) <= 3:
            print("Word '{}' too short, skipping".format(word))
            continue

        print("** Starting word {} ({}) **".format(word, count))

        # Both are fixed for the whole word, so look them up only once.
        bad_word = word
        good_word = corrections[bad_word]

        for full_name in js["items"]:

            # The batch limit can also be hit in the middle of a word.
            if total_corrections >= max_total_corrections:
                break

            key = (word, full_name)
            user_name, repo_name = full_name.split('/')

            if user_name in BLACKLIST["users"]:
                msg = "Skipping {}. User on the blacklist."
                print(msg.format(user_name))
                continue

            if key in LOGS:
                print("{} {} already completed, skipping".format(*key))
                continue

            if full_name in BLACKLIST["submitted"]:
                msg = "Skipping {}. User/repo already submitted."
                print(msg.format(full_name))
                continue

            # Simple check for other spelling bots
            if "spell" in repo_name.lower():
                continue

            # This case would be an intentional "typo"
            if bad_word in repo_name or bad_word in user_name:
                continue

            print("Starting {} {} -> {}".format(full_name, bad_word, good_word))
            pull_status = fix_repo(full_name, good_word, bad_word)

            # Record the attempt right away (and flush it) so that a later
            # crash cannot lose it and cause a duplicate submission.
            log_item = "{} {} {}\n"
            F_LOG.write(log_item.format(word, full_name, int(time.time())))
            F_LOG.flush()

            # Mirror the log in memory so the "only one submission each"
            # rule also holds for the remaining words of this same run.
            LOGS.add(key)
            BLACKLIST["submitted"].add(full_name)

            # A falsy pull_status is counted as a "no-edit"; after the first
            # one the rest of this word's repositories are abandoned.
            if not pull_status:
                no_edit_counter += 1

            if no_edit_counter >= 1:
                break

            total_corrections += 1
finally:
    F_LOG.close()