# rec_2_detect.py

import time
import pickle
import librosa
import numpy as np
import pyaudio
import wave
import torch
import subprocess
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100
CHUNK = 1024
RECORD_SECONDS = 6
WAVE_OUTPUT_FILENAME = "file.wav" # path where you want to save your recorded sound file
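
# Note: 6 seconds of stereo audio are captured, but get_melspectrogram_db
# below pads or trims the signal to exactly 5 seconds before computing the
# spectrogram, so the extra second is simply discarded.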

def spec_to_image(spec, eps=1e-6):
    # Standardise the spectrogram, then rescale it to the 0-255 range
    # expected by the image-based ResNet classifier.
    mean = spec.mean()
    std = spec.std()
    spec_norm = (spec - mean) / (std + eps)
    spec_min, spec_max = spec_norm.min(), spec_norm.max()
    spec_scaled = 255 * (spec_norm - spec_min) / (spec_max - spec_min)
    spec_scaled = spec_scaled.astype(np.uint8)
    return spec_scaled
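
# Illustrative sanity check (not part of the original script): any real-valued
# 2-D array should come back as uint8 spanning the full 0-255 range, e.g.
#   demo = spec_to_image(np.random.randn(128, 431))
#   assert demo.dtype == np.uint8 and demo.min() == 0 and demo.max() == 255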

def get_melspectrogram_db(file_path, sr=None, n_fft=2048, hop_length=512,
                          n_mels=128, fmin=20, fmax=8300, top_db=80):
    # Load the recording; sr=None keeps the file's native sample rate.
    wav, sr = librosa.load(file_path, sr=sr)
    # Pad (reflect) or trim the waveform to exactly 5 seconds.
    if wav.shape[0] < 5 * sr:
        wav = np.pad(wav, int(np.ceil((5 * sr - wav.shape[0]) / 2)), mode='reflect')
    else:
        wav = wav[:5 * sr]
    spec = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=n_fft,
                                          hop_length=hop_length, n_mels=n_mels,
                                          fmin=fmin, fmax=fmax)
    # Convert the power spectrogram to decibel units.
    spec_db = librosa.power_to_db(spec, top_db=top_db)
    return spec_db
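
# For reference: at the 44.1 kHz rate recorded above, 5 seconds of audio with
# hop_length=512 yields roughly n_mels x 431 frames, so the tensor fed to the
# network below is about (1, 1, 128, 431).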

t1 = time.time()

audio = pyaudio.PyAudio()

# Start recording; an input stream begins capturing as soon as it is opened,
# so no separate start_stream() call is needed.
stream = audio.open(format=FORMAT, channels=CHANNELS,
                    rate=RATE, input=True,
                    frames_per_buffer=CHUNK)
print("recording...")

frames = []
for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
    data = stream.read(CHUNK)
    frames.append(data)
print("finished recording")

# Stop recording and release the audio device.
stream.stop_stream()
stream.close()
audio.terminate()

# Save the captured frames as a WAV file.
with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as waveFile:
    waveFile.setnchannels(CHANNELS)
    waveFile.setsampwidth(audio.get_sample_size(FORMAT))
    waveFile.setframerate(RATE)
    waveFile.writeframes(b''.join(frames))
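
# Illustrative check (an assumption, not part of the original script): the
# loop above reads whole chunks, so the saved file should contain exactly
# int(RATE / CHUNK * RECORD_SECONDS) * CHUNK frames (just under 6 seconds).
#   with wave.open(WAVE_OUTPUT_FILENAME, 'rb') as wf:
#       assert wf.getnframes() == int(RATE / CHUNK * RECORD_SECONDS) * CHUNK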

# Run inference on the GPU if one is available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

with open('indtocat.pkl', 'rb') as f:  # path to the index-to-category pickle file
    indtocat = pickle.load(f)

# torch.load restores the full model object, so the ResNet class definition
# must be importable; map_location lets the checkpoint load on either device.
resnet_model = torch.load("esc50resnet.pth", map_location=device)  # path to saved PyTorch model file
resnet_model.eval()

filename = WAVE_OUTPUT_FILENAME  # the sound file recorded and saved above
spec = spec_to_image(get_melspectrogram_db(filename))
spec_t = torch.tensor(spec).to(device, dtype=torch.float32)
with torch.no_grad():
    pr = resnet_model(spec_t.reshape(1, 1, *spec_t.shape))
ind = pr.argmax(dim=1).cpu().numpy().ravel()[0]

t2 = time.time()
print("{} seconds".format(t2 - t1))
print(indtocat[ind])

# If a cough was detected, hand the recording to the downstream classifier.
if indtocat[ind] == 'coughing':
    subprocess.run(['python', 'predict.py', WAVE_OUTPUT_FILENAME])
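
# predict.py is expected to live alongside this script and to take the path of
# the recorded WAV file as its first command-line argument, as invoked above.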