import os
import re
import time
from functools import reduce

import pandas as pd
import requests

# URL for the Innovation Gateway API
API_URL = "https://api.www.healthdatagateway.org/api/v2/datasets"

# Regex pattern for finding URLs in strings. Note the ^ and $ anchors: a value
# only matches when it is entirely a URL, not when a URL is embedded in text.
URL_REGEX = r"^https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)$"
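
# A minimal illustration of the anchoring (not part of the original script):
#
#   >>> bool(re.match(URL_REGEX, "https://example.com/page"))
#   True
#   >>> bool(re.match(URL_REGEX, "see https://example.com for details"))
#   False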

# Fields which contain URLs in the dataset metadata
FIELDS = [
    'name',
    'datasetv2.summary.publisher.name',
    'datasetv2.documentation.description',
    'datasetv2.documentation.associatedMedia',
    'datasetv2.accessibility.usage.investigations',
    'datasetv2.accessibility.access.accessRights',
    'datasetv2.accessibility.access.accessService',
    'datasetv2.accessibility.access.accessRequestCost',
    'datasetv2.enrichmentAndLinkage.tools'
]


def get_dataset_objects(api_url, params=None):
    '''
    Retrieves active dataset objects from the Gateway API, paging through
    the results in batches of 100.
    '''
    # Copy the caller's parameters so a shared default dict is never mutated:
    params = dict(params or {})
    params['activeflag'] = 'active'
    params['limit'] = 100
    # Datasets will be added to this list in batches of 100
    out = []
    # Loop through the API pages (capped at 99 pages as a safety limit):
    for i in range(1, 100):
        params['page'] = i
        r = requests.get(url=api_url, params=params)
        data = r.json()['datasets']
        # Add the datasets from the page to the output list:
        out.extend(data)
        # Wait 1 second before requesting the next page:
        time.sleep(1)
        # Stop when there are fewer than 100 datasets on the page (i.e. the last page):
        if len(data) < 100:
            break
    return out
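
# For orientation (inferred from the call above, not verified against the API):
# each page of the response is expected to look roughly like
#   {"datasets": [{...}, {...}, ...]}
# with up to 100 dataset objects per page.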


def recursive_get(d, path):
    '''
    Walks a nested dictionary along a list of keys, returning {} for any
    missing key.
    '''
    return reduce(lambda d, k: d.get(k, {}), path, d)


def reduce_dictionary(d, keys):
    '''
    Flattens a nested dictionary down to the requested dot-separated keys.
    '''
    return {k: recursive_get(d, k.split('.')) for k in keys}
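
# A minimal sketch of the flattening, using a made-up dataset object:
#
#   >>> d = {'name': 'Example', 'datasetv2': {'summary': {'publisher': {'name': 'NHS'}}}}
#   >>> reduce_dictionary(d, ['name', 'datasetv2.summary.publisher.name'])
#   {'name': 'Example', 'datasetv2.summary.publisher.name': 'NHS'}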


def url_test(url):
    '''
    Requests a URL and returns the HTTP status code, or a short string
    describing the failure.
    '''
    try:
        return requests.get(url, timeout=10.0).status_code
    except requests.Timeout:
        return 'Timeout'
    except requests.RequestException:
        return 'Error'


def report(dataset, regex=URL_REGEX):
    '''
    Finds the URLs in a reduced dataset object, tests each one and returns
    the results as a dictionary.
    '''
    compiled_regex = re.compile(regex)
    out = {
        'dataset': dataset['name'],
        'publisher': dataset['datasetv2.summary.publisher.name'],
        'urls': []
    }
    for k, v in dataset.items():
        if isinstance(v, str):
            urls = compiled_regex.findall(v)
        elif isinstance(v, list):
            urls = [url for url in v if compiled_regex.match(str(url))]
        else:
            urls = []
        out['urls'].extend(
            [{'url': url, 'location': k, 'status': url_test(url)} for url in urls]
        )
    return out
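
# Shape of a single report (illustrative values only):
#
#   {'dataset': 'Example',
#    'publisher': 'NHS',
#    'urls': [{'url': 'https://example.com', 'location': 'name', 'status': 200}]}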


def build_df(reports):
    '''
    Builds a dataframe with one row per tested URL from a list of reports.
    '''
    df = pd.json_normalize(reports)
    # Explode the urls column into a new row for each URL
    df = df.explode('urls')
    df = df.dropna(subset=['urls'])
    df = df.reset_index(drop=True)
    # Convert the urls column into separate url/location/status columns
    df = df.join(pd.json_normalize(df['urls'])).drop('urls', axis=1).reset_index(drop=True)
    return df
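
# The resulting dataframe has one row per tested URL, with the columns
# 'dataset', 'publisher', 'url', 'location' and 'status' (inferred from the
# normalize/explode steps above).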


def main() -> None:
    # Retrieve the dataset objects from the API
    print(f"Querying {API_URL}...")
    data = get_dataset_objects(API_URL, params={'fields': ','.join(FIELDS)})
    print(f"Retrieved {len(data)} dataset objects from gateway API.")
    # Reduce the size of the dataset objects
    print("Reducing object sizes...")
    data = [reduce_dictionary(d, FIELDS) for d in data]
    print("Object sizes reduced.")
    # Create a report of URL statuses for each dataset
    print("Creating reports...")
    reports = []
    for i, d in enumerate(data):
        print(f"Creating report for {d['name']}, {i+1}/{len(data)}")
        reports.append(report(d))
    print("Reports created.")
    # Build a dataframe from the reports and export it to CSV
    print("Building dataframe and exporting to CSV...")
    df = build_df(reports)
    # Make sure the output directory exists before writing:
    os.makedirs('reports', exist_ok=True)
    df.to_csv('reports/url_test.csv', index=False)
    print("Complete.")


if __name__ == "__main__":
    main()