forked from emeryberger/CSrankings
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvalidate_commit.py
256 lines (238 loc) · 10.5 KB
/
validate_commit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
import csv
import glob
import json
import re
import requests
import sys
import time
import unidecode
import urllib.parse
# Regular-expression patterns naming the CSV files a commit is allowed to
# modify. `is_valid_file` tests a changed path against each pattern in turn.
allowed_files = ['csrankings-[a-z0].csv', 'country-info.csv', 'old/industry.csv', 'old/other.csv', 'old/emeritus.csv', 'old/rip.csv']
def remove_suffix_and_brackets(input_string: str) -> str:
    """
    Strip a trailing bracketed part, and any whitespace before it, from a string.

    Used to handle special entries like ``[Tech]``. The pattern is anchored at
    the end of the string, so nothing is removed when the string does not end
    in a bracketed part.

    Args:
        input_string: The text to clean up.
    Returns:
        The text with the trailing bracketed part removed.
    """
    trailing_tag = re.compile(r'\s*\[.*?\]$')
    return trailing_tag.sub('', input_string)
def translate_name_to_dblp(name: str) -> str:
    """
    Converts a given name to the author term used in a DBLP search query.

    The result has the form ``"<last_name>:<new_name>"``. Note that the name
    is percent-encoded *before* it is split on spaces, so every space has
    already become ``%20`` by the time of the split. In practice the whole
    encoded name therefore ends up before the colon and the part after the
    colon is empty (e.g. ``"Jane Doe"`` -> ``"Jane%20Doe:"``).

    Args:
        name: A string containing the name to be converted.
    Returns:
        A string containing the DBLP query representation of the name.
    Raises:
        IndexError: If nothing is left of the name once periods are removed,
            or if the encoded name consists solely of a positive integer.
    """
    # Removes periods.
    name = name.replace('.', '')
    # Replaces '-' with ' ' to cope with DBLP search API issue (disabled negation operator).
    name = name.replace('-', ' ')
    # Percent-encodes everything outside the unreserved set except '='
    # (this covers diacritics, but also spaces, '&' and ';').
    name = urllib.parse.quote(name, safe='=')
    # Replaces '&' and ';' with '='. After the quoting above neither character
    # can still be present, so these are kept only for parity with the
    # original transformation order.
    name = name.replace('&', '=')
    name = name.replace(';', '=')
    # NOTE(review): spaces are already encoded as %20 here, so this split
    # yields a single element. Confirm whether the split was meant to happen
    # before quoting; changing it would alter the query sent to DBLP.
    split_name = name.split(' ')
    last_name = split_name[-1]
    # Handle disambiguation entries (a trailing positive number).
    try:
        if int(last_name) > 0:
            disambiguation = last_name
            split_name.pop()
            last_name = split_name[-1] + '_' + disambiguation
    except (ValueError, IndexError):
        # ValueError: the last component is not a number (the common case).
        # IndexError: the number was the only component.
        pass
    # Consolidate name and replace spaces with underscores.
    split_name.pop()
    new_name = ' '.join(split_name)
    new_name = new_name.replace(' ', '_')
    new_name = new_name.replace('-', '=')
    new_name = urllib.parse.quote(new_name)
    # An empty name cannot be turned into a query term. The original code
    # raised IndexError here as a side effect of reading last_name[0]; keep
    # the same exception type but make the reason explicit.
    if not last_name:
        raise IndexError('cannot translate an empty name to a DBLP query')
    return f'{last_name}:{new_name}'
def is_valid_account(account: str) -> bool:
    """Return False for account names beginning with 'anonymous', True otherwise."""
    if account.startswith('anonymous'):
        return False
    return True
def has_reasonable_title(title):
    """Return True unless the title begins with the text 'Update csrankings-'."""
    rejected_prefix = 'Update csrankings-'
    starts_with_rejected = title.startswith(rejected_prefix)
    return not starts_with_rejected
# Use richer headers to avoid 403 errors.
# From https://scrapeops.io/web-scraping-playbook/403-forbidden-error-web-scraping.
# These are sent with the homepage request made in `has_valid_homepage`.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Cache-Control": "max-age=0",
}
def has_valid_homepage(homepage: str) -> bool:
    """
    Fetch a homepage URL and report whether it answered with HTTP 200.

    Args:
        homepage: The URL to request.
    Returns:
        True if the response status code is exactly 200. False for any other
        status code (a warning is printed) or if the request raised a
        `requests` exception (an error is printed).
    """
    try:
        reply = requests.get(homepage, headers=HEADERS, timeout=15)
    except requests.exceptions.RequestException as err:
        print(f" ERROR: An exception occurred: {err}")
        return False
    status = reply.status_code
    if status == 200:
        return True
    print(f' WARNING: Received error code {status}.')
    return False
def has_valid_google_scholar_id(id):
    """
    Check whether a Google Scholar ID is acceptable.

    Args:
        id: The identifier string to check.
    Returns:
        True if the ID is the placeholder 'NOSCHOLARPAGE' or consists of
        exactly 12 characters drawn from letters, digits, '_' and '-';
        False otherwise.
    """
    # Placeholder used for entries without a Google Scholar page.
    if id == 'NOSCHOLARPAGE':
        return True
    # fullmatch (rather than match with a '$' anchor) rejects an ID followed
    # by a trailing newline, and the comparison yields a real bool instead of
    # a Match object.
    return re.fullmatch('[a-zA-Z0-9_-]{12}', id) is not None
def matching_name_with_dblp(name: str) -> int:
    """
    Check if a name matches a DBLP entry and return the number of completions.

    Args:
        name: A string representing the name to check.
    Returns:
        An integer representing the number of completions. 1 indicates an
        exact match (it is also returned whenever one of the returned hits has
        an author field equal to `name`). 0 is returned if the request itself
        fails, including on timeout.
    """
    # Translate the name to a format that can be used in DBLP queries.
    author_name = translate_name_to_dblp(name)
    # Search for up to 10 matching authors.
    dblp_url = f'https://dblp.org/search/author/api?q=author%3A{author_name}$%3A&format=json&c=10'
    try:
        # Send a request to the DBLP API, retrying for as long as it answers
        # with a rate-limit page. A loop is used instead of recursion so a
        # long throttling period cannot exhaust the call stack.
        while True:
            # Same timeout as the homepage check, so a stalled connection
            # cannot hang the validation run indefinitely.
            response = requests.get(dblp_url, timeout=15)
            if "<title>429 Too Many Requests</title>" not in response.text:
                break
            # Wait for a few seconds and try again.
            time.sleep(10)
        # Extract the number of completions from the JSON response.
        j = json.loads(response.text)
        completions = int(j['result']['completions']['@total'])
        # Print a message if there is a match.
        if completions != 0:
            print(f' Checking {dblp_url}')
        # Check for an exact name match.
        if completions > 0:
            for hit in j['result']['hits']['hit']:
                if hit['info']['author'] == name:
                    return 1
        return completions
    except requests.exceptions.RequestException as e:
        # Handle any exceptions that occur during the request.
        print(f'ERROR: Exception: {e}')
        return 0
def is_valid_file(file: str) -> bool:
    """
    Decide whether a changed file is one that a commit may modify.

    Args:
        file: Path of the changed file.
    Returns:
        True if the path does not contain '.csv' at all (such files are not
        restricted by this check), or if the entire path matches one of the
        patterns in `allowed_files`. False for any other CSV path.
    """
    # Paths without '.csv' are outside the scope of this check.
    if not re.match('.*\\.csv', file):
        return True
    # The whole path must match an allowed pattern. A prefix match would let
    # names such as 'csrankings-a.csv.extra.csv' slip through.
    return any(re.fullmatch(pattern, file) for pattern in allowed_files)
def process():
    """
    Sanity-check the lines added by a commit and exit with a status code.

    Reads `institutions.csv` from the current directory and the diff JSON file
    named by `sys.argv[1]`, then checks every added line: comma spacing, known
    institution, correct `csrankings-?.csv` file for the name's first letter,
    Google Scholar ID format, DBLP name match and homepage reachability.
    Problems are reported with `print`.

    Exits the interpreter via `sys.exit`: status 0 if every check passed,
    -1 otherwise.
    """
    # Read in the institutions dictionary.
    institutions = {}
    with open('institutions.csv', 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            institutions[row['institution']] = True

    # Read in the argument JSON file.
    json_file = sys.argv[1]
    with open(json_file, 'r') as f:
        json_data = f.read()
    data = json.loads(json_data)

    # Collect the added lines of each changed file.
    changed_lines = {}
    for d in data['files']:
        try:
            file_path = d['path']
            changed_lines[file_path] = []
            for chunk in d['chunks']:
                for change in chunk['changes']:
                    if change['type'] == 'AddedLine':
                        changed_lines[file_path].append(change)
        except (KeyError, TypeError):
            # Gracefully handle misformed / empty JSON: a missing key or a
            # value of the wrong JSON type just ends this file's collection.
            pass

    # Now process the diffs.
    valid = True
    line_valid = True
    print('Sanity checking the commit. Please check any issues raised here.')
    # Pick arbitrary thresholds; if there are more than this many diffs,
    # it is probably because of some line ending mismatch or editing with Excel.
    remaining_diffs = 500
    # TO DO: check deleted lines to see if home page still valid
    # or if moved to another file
    for file in changed_lines:
        if not is_valid_file(file):
            print(f'ERROR: Invalid file modification ({file}). Please only modify allowed CSV files.')
            valid, line_valid = (False, False)
            break
        # Check if we are processing a `csrankings-?.csv` file. The letter is
        # reset for every file so that a file which is not `csrankings-?.csv`
        # neither reuses the previous file's letter nor hits an unbound name.
        matched = re.match('csrankings-([a-z0])\\.csv', file)
        if matched:
            the_letter = unidecode.unidecode(matched.group(1))  # Convert to ASCII
        else:
            the_letter = None
        for l in changed_lines[file]:
            line_valid = True
            remaining_diffs -= 1
            if remaining_diffs <= 0:
                print('ERROR: This PR has too many diffs. Something probably went wrong.')
                valid, line_valid = (False, False)
                break
            line = l['content'].strip()
            print(f'Processing {line}')
            if re.search(',\\s', line):
                print(f' ERROR: Found a space after a comma ({line}). Please ensure there are no spaces after commas.')
                valid, line_valid = (False, False)
                continue
            try:
                name, affiliation, homepage, scholarid = line.split(',')
                name = unidecode.unidecode(remove_suffix_and_brackets(name))
                # Verify that the affiliation is already in the database
                if affiliation not in institutions:
                    print(f' ERROR: This institution ({affiliation}) was not found in `institutions.csv`.')
                    valid, line_valid = (False, False)
                # Verify that entry is in the correct file. The first letter
                # is read unconditionally so an empty name still fails here.
                first_letter = name[0].lower()
                if the_letter is not None and the_letter != '0' and first_letter != the_letter:
                    print(f' ERROR: This entry is in the wrong file. It is in `csrankings-{the_letter}.csv` but should be in `csrankings-{first_letter}.csv`.')
                    valid, line_valid = (False, False)
                # Check Google Scholar ID.
                if not has_valid_google_scholar_id(scholarid):
                    print(f' ERROR: Invalid Google Scholar ID ({scholarid}). Please provide a valid identifier.')
                    # Mark the line as failed too, like every other check,
                    # so the per-line failure banner below is printed.
                    valid, line_valid = (False, False)
                # Check name against DBLP.
                completions = matching_name_with_dblp(name)
                if completions == 0:
                    print(f' ERROR: Invalid name ({name}). Please ensure it matches the DBLP entry.')
                    valid, line_valid = (False, False)
                elif completions > 1:
                    print(f' WARNING: Possibly invalid name ({name}). This may be a disambiguation entry.')
                    valid, line_valid = (False, False)
                # Test the homepage.
                print(f" Checking homepage URL ({homepage})")
                if not has_valid_homepage(homepage):
                    print(f' WARNING: Invalid homepage URL ({homepage}). Please provide a correct URL.')
                    valid, line_valid = (False, False)
                # TODO:
                # - verify that new entry is in alphabetical order
                # - warn if there is an affiliation mismatch with DBLP
                # - warn if there is a home page mismatch with DBLP
                if not line_valid:
                    print(f'***Test failure for {name}***.')
            except Exception as e:
                # Any unexpected problem with this line (e.g. wrong number of
                # fields) fails the line but lets the remaining lines run.
                print(f'Processing failed ({e}).')
                valid, line_valid = (False, False)
    if valid:
        sys.exit(0)
    else:
        sys.exit(-1)
# Run the validation only when executed as a script, not when imported.
if __name__ == '__main__':
    process()