-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_data.py
executable file
·64 lines (49 loc) · 1.84 KB
/
get_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/python3
import os
import re
from os import path, makedirs

import requests
from tqdm import tqdm
# Important URLs
# Index page of the bucket; requested first, before the listing.
base_url = "https://s3.amazonaws.com/tripdata/index.html"
# Address requested to obtain the XML document that names the files.
xml_url = "https://s3.amazonaws.com/tripdata"
# Prefix to which a file name (key) is appended to build its download URL.
download_url = "https://s3.amazonaws.com/tripdata/"
# Setting up the regex for the needed files: capture the text of every
# <Key> element that ends in ".zip". The dot before "zip" is escaped so it
# only matches a literal "." (unescaped, it would accept any character,
# e.g. "<Key>fooXzip</Key>"). "\w" already includes digits, so no separate
# "\d" is needed in the character class.
key_regex = r"<Key>([\w\-.]+\.zip)</Key>"
key_matcher = re.compile(key_regex)
# Ensure the directory structure exists: one output directory per city code.
# makedirs creates any missing parent ("./data") as well, and exist_ok=True
# makes the call a no-op when the directory is already there, so no separate
# existence checks (and no window between check and creation) are needed.
for city_dir in ("./data/JC", "./data/NC"):
    makedirs(city_dir, exist_ok=True)
# Seconds to wait for a connection / for the next bytes before giving up,
# so a stalled server cannot hang the script forever.
request_timeout = 30
# Archives are streamed to disk in pieces of this many bytes (1 MiB) instead
# of being held in memory as a whole.
chunk_size = 1024 * 1024

# Start an HTTP session so the same connection is reused for every request
with requests.Session() as sess:
    # Initial request to the index page; only its response headers are used.
    res = sess.get(base_url, timeout=request_timeout)

    # Headers sent along with the request for the XML file.
    # NOTE(review): these are *response* headers of the index request echoed
    # back as request headers; presumably the listing works without them --
    # confirm before removing.
    xml_req_headers = {
        "Content-Type": "application/xml",
        "Server": res.headers.get("Server"),
        "x-amz-bucket-region": res.headers.get("x-amz-bucket-region"),
        "x-amz-id-2": res.headers.get("x-amz-id-2"),
        "x-amz-request-id": res.headers.get("x-amz-request-id"),
    }

    # Get the XML file. Fail loudly on an HTTP error: otherwise an error page
    # would simply contain no keys and the script would report success
    # without downloading anything.
    xml_res = sess.get(url=xml_url,
                       headers=xml_req_headers,
                       timeout=request_timeout)
    xml_res.raise_for_status()

    # Extract the content as decoded text (rather than slicing the repr of
    # the raw bytes, which escapes non-ASCII bytes and quotes).
    # NOTE(review): only the keys present in this single response are
    # processed; if the listing is paginated, later pages are not fetched.
    xml_content = xml_res.text

    print("Download starts...")

    # Go through all unique regex matches, in a stable (sorted) order, and
    # download the files found.
    for key in tqdm(sorted(set(key_matcher.findall(xml_content)))):
        # Keys starting with "JC" go to ./data/JC, everything else to ./data/NC
        city_code = "JC" if key.startswith("JC") else "NC"
        filepath = f"./data/{city_code}/{key}"

        # Skip files that already exist
        if path.isfile(filepath):
            continue

        # Download into a temporary ".part" file and rename it only once the
        # transfer is complete. An interrupted run therefore never leaves a
        # truncated archive under the final name, which the check above
        # would otherwise treat as already downloaded.
        part_path = filepath + ".part"
        with sess.get(download_url + key,
                      stream=True,
                      timeout=request_timeout) as raw_data:
            # Do not save an HTTP error body as if it were the archive
            raw_data.raise_for_status()
            with open(part_path, 'wb') as f:
                for chunk in raw_data.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
        os.replace(part_path, filepath)

        # tqdm.write prints the message without breaking the progress bar
        tqdm.write(f"File: {key} downloaded")

print("Download finished")