# scraper.py
"""
This script downloads the `Best of` Audio files from the Watkins Marine Mammal Sound Database.
Credit: Watkins Marine Mammal Sound Database, Woods Hole Oceanographic Institution and the New Bedford Whaling Museum
URL: https://whoicf2.whoi.edu/science/B/whalesounds/index.cfm
Date of script: 30 December, 2023
"""
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
import os
import re
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
from scipy import signal
from scipy.io import wavfile
# Host that relative download and image links are resolved against.
BASE_URL = "https://whoicf2.whoi.edu"

# Species names to keep when scraping the index page.
SUBSET = ["Killer Whale", "Spinner Dolphin", "Harp Seal", "Walrus", "Narwhal"]

# Output directories, relative to the current working directory.
audio_path = Path("audio")
thumbnail_path = Path("thumbnails")
spectrograms_path = Path("spectrograms")

# Create every output directory up front; existing directories are left alone.
for _output_dir in (audio_path, thumbnail_path, spectrograms_path):
    _output_dir.mkdir(exist_ok=True)
def download_audio_from(page_url, foldername, n=20, timeout=30):
    """Download up to `n` .wav files linked from a Watkins `Best of` page.

    Every anchor on the page whose text is exactly "Download" is considered.
    Links whose href ends in a `.wav` filename are fetched from
    ``BASE_URL + href`` and written to ``audio_path / foldername``.
    Links that are not .wav files, or whose download fails, are skipped.

    Args:
        page_url (string): Watkins Best of Page with audio download links
        foldername (string): Name of species; used as the sub-folder name
            inside the audio directory.
        n (int): Maximum number of "Download" links to process.
        timeout (float): Seconds to wait for each HTTP response.

    Raises:
        requests.RequestException: If the `Best of` page itself cannot be
            fetched (connection error, timeout or HTTP error status).
    """
    print(f"Fetching {page_url}")
    page = requests.get(page_url, timeout=timeout)
    page.raise_for_status()

    folder_path = audio_path / foldername
    folder_path.mkdir(parents=True, exist_ok=True)

    soup = BeautifulSoup(page.content, "html.parser")
    # href=True drops anchors without an href, so link["href"] below is safe.
    download_links = [
        link for link in soup.find_all("a", href=True) if link.text == "Download"
    ]

    # Raw string with an escaped dot: only a real ".wav" suffix matches, and
    # a non-empty name is required so no file called just ".wav" is written.
    pattern = re.compile(r"(\w+\.wav)$")

    total_saved = 0
    for link in tqdm(download_links[:n]):
        href = link["href"]
        match = pattern.search(href)
        if match is None:
            # A "Download" link that does not point at a .wav file.
            continue
        filename = match.group(1)

        try:
            audiofile = requests.get(BASE_URL + href, timeout=timeout)
            # Without this check an error page would be saved as audio.
            audiofile.raise_for_status()
        except requests.RequestException as error:
            # tqdm.write prints without corrupting the progress bar.
            tqdm.write(f"Skipping {href}: {error}")
            continue

        with open(folder_path / filename, "wb") as f:
            f.write(audiofile.content)
        total_saved += 1

    print(f"Saved {total_saved} audio files to {folder_path}")
def download_thumbnail_from(image_url, name, timeout=30):
    """Download a thumbnail image and save it under a slugified species name.

    The image is fetched from ``BASE_URL + image_url`` and written to
    ``thumbnail_path`` as ``<name lower-cased, spaces replaced by "-">.png``.
    The response bytes are stored as received; no image conversion is done.

    Args:
        image_url (string): Path of the Marine Mammal image, appended to
            BASE_URL to form the full URL.
        name (string): Name of the animal depicted in the image.
        timeout (float): Seconds to wait for the HTTP response.

    Raises:
        requests.RequestException: If the image cannot be fetched
            (connection error, timeout or HTTP error status).
    """
    print(f"Downloading thumbnail from {image_url}")
    filename = name.lower().replace(" ", "-") + ".png"

    thumbnail = requests.get(BASE_URL + image_url, timeout=timeout)
    # Fail loudly instead of saving an HTML error page as a .png file.
    thumbnail.raise_for_status()

    with open(thumbnail_path / filename, "wb") as f:
        f.write(thumbnail.content)
    print(f"Saved thumbnail to {filename}")
def generate_spectrogram(foldername):
    """Render one spectrogram image per .wav file of a species.

    Reads every ``.wav`` file in ``audio_path / foldername`` and writes an
    axis-free spectrogram to ``spectrograms_path / foldername`` using the
    same file stem with a ``.jpg`` extension. Other files in the audio
    folder are ignored.

    Args:
        foldername (string): Name of species; the sub-folder of the audio
            directory to read and of the spectrogram directory to write.

    Raises:
        FileNotFoundError: If the species' audio folder does not exist.
    """
    source_dir = audio_path / foldername
    folder_path = spectrograms_path / foldername
    folder_path.mkdir(parents=True, exist_ok=True)

    for audio_filepath in sorted(source_dir.iterdir()):
        # Skip stray files (e.g. OS metadata) that wavfile.read cannot parse.
        if audio_filepath.suffix.lower() != ".wav":
            continue

        output_filepath = folder_path / (audio_filepath.stem + ".jpg")
        sample_rate, samples = wavfile.read(audio_filepath)
        # Multi-channel audio is read as (samples, channels); specgram needs
        # a 1-D signal, so mix the channels down to mono.
        if samples.ndim > 1:
            samples = samples.mean(axis=1)

        fig, ax = plt.subplots(figsize=(8, 8), dpi=200)
        try:
            ax.grid(False)
            ax.specgram(samples, Fs=sample_rate)
            ax.axis("off")
            fig.tight_layout()
            fig.savefig(output_filepath)
        finally:
            # pyplot keeps every figure alive until closed; without this one
            # large figure would leak per audio file.
            plt.close(fig)

    print(f"Saved spectrograms to {folder_path}.")
def get_best_of(timeout=30):
    """Collect the `Best of` species entries from the Watkins index page.

    Scans the index page for anchors whose href starts with "bestOf". For
    each one, the species name is taken from the anchor's first <h3> (with
    commas, parentheses and apostrophes removed) and the thumbnail source
    from its first <img>. Anchors missing either element are skipped.
    When SUBSET is not None, only species named in SUBSET are returned.

    Args:
        timeout (float): Seconds to wait for the HTTP response.

    Returns:
        list: ``(page_name, page_href, image_src)`` tuples, where
        ``page_href`` is the full URL of the species' `Best of` page and
        ``image_src`` is the thumbnail's ``src`` attribute as found.

    Raises:
        requests.RequestException: If the index page cannot be fetched
            (connection error, timeout or HTTP error status).
    """
    print("Fetching species...")
    url = "https://whoicf2.whoi.edu/science/B/whalesounds/index.cfm"
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()

    soup = BeautifulSoup(page.content, "html.parser")
    # href=True drops anchors without an href, which would otherwise raise
    # KeyError on link["href"].
    best_of = [
        link
        for link in soup.find_all("a", href=True)
        if link["href"].startswith("bestOf")
    ]

    # One-pass removal of the punctuation stripped from species names.
    strip_punctuation = str.maketrans("", "", ",()'")

    data = []
    print("Getting species data...")
    for link in tqdm(best_of):
        heading = link.find("h3")
        image = link.find("img")
        if heading is None or image is None or not image.get("src"):
            # Not a complete species tile; nothing usable to record.
            continue

        page_name = heading.text.translate(strip_punctuation)
        if SUBSET is not None and page_name not in SUBSET:
            continue

        page_href = f"https://whoicf2.whoi.edu/science/B/whalesounds/{link['href']}"
        data.append((page_name, page_href, image["src"]))
    return data
if __name__ == "__main__":
    # For each selected species: fetch its thumbnail, download up to 40
    # audio clips, then render their spectrograms.
    for name, url, image_src in get_best_of():
        download_thumbnail_from(image_src, name)
        download_audio_from(url, name, n=40)
        generate_spectrogram(name)
        # Blank line to separate the console output of each species.
        print()