extract_formants.py
from typing import List, Tuple

import numpy as np
import matplotlib.pyplot as plt
from scipy.fft import fft
from scipy.io import wavfile
from scipy.interpolate import interp1d

import process_phn
from db_tools import convert_dir_to_wav
from smooth import smooth

FFT_SLICE_RADIUS = 50  # number of samples on either side of the phone's center sample used for the FFT
MAX_FREQ = 8000        # assumed maximum frequency of the voice data; TIMIT audio is sampled at 16 kHz, so the Nyquist frequency is 8 kHz
FORMANT_OVERLAP = 250  # minimum spacing in Hz between frequencies chosen as formants
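
# Note: with 2 * FFT_SLICE_RADIUS = 100 samples at TIMIT's 16 kHz rate, the raw FFT bins are
# 160 Hz apart; extract_formants() resamples the 50 positive-frequency bins onto an
# 8000-point grid, so indices into the smoothed spectrum read approximately as Hz.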


def extract_formants(wav_path: str, phn_path: str, plot: bool = False) -> List[Tuple[str, Tuple[int, int, int, int, int]]]:
    """
    Takes a path to an audio file and a path to the .PHN file describing the phones in that audio.
    Returns a list of tuples, each containing (a) the phone for which a set of formants was extracted
    and (b) a tuple of the extracted formant frequencies.
    """
    out = []
    phn_data = process_phn.extract_monophthong_times(phn_path)
    _, wav_data = wavfile.read(wav_path)
    # If the plot argument is True, print the transcript of the recording to provide context for the plots
    if plot:
        txt_data_path = phn_path.replace("PHN", "TXT")
        with open(txt_data_path) as transcript:
            print(transcript.read())
    for phn_instance in phn_data:
        # Determine start and end times of the phone, then determine the middle sample index
        vowel_phone, vowel_start, vowel_end = phn_instance
        fft_slice_middle = vowel_start + (vowel_end - vowel_start) // 2
        # Set vowel_data to be a slice of wav_data at the middle sample index ± FFT_SLICE_RADIUS
        vowel_data = wav_data[fft_slice_middle - FFT_SLICE_RADIUS : fft_slice_middle + FFT_SLICE_RADIUS]
        # Take the absolute value of the real part of the FFT of vowel_data,
        # then stretch it out to MAX_FREQ length and smooth it a bit
        vowel_data_fft = fft(vowel_data).real
        vowel_data_fft = abs(vowel_data_fft)[:len(vowel_data_fft) // 2]  # Drop the needless second half of the FFT; it's symmetric
        vowel_data_fft_interp = interp1d(np.arange(vowel_data_fft.size), vowel_data_fft)
        vowel_data_fft = vowel_data_fft_interp(np.linspace(0, vowel_data_fft.size - 1, MAX_FREQ))
        vowel_data_fft = smooth(vowel_data_fft, 20)
        # If the plot argument is True, plot the spectral slice for each vowel
        if plot:
            print(vowel_phone)
            plt.semilogy(vowel_data_fft, 'r')
            plt.suptitle(f"[{vowel_phone}]", fontsize=20)
            plt.xlabel("Frequency (Hz)")
            plt.ylabel("Log-amplitude")
            plt.show()
        # Associate the frequencies with their amplitudes and sort them in order of increasing amplitude
        formant_candidates = list(enumerate(vowel_data_fft))
        formant_candidates = sorted(formant_candidates, key=lambda x: x[1])
        # Until there are 5 chosen formants, keep adding the remaining most intense frequency,
        # unless it lies within FORMANT_OVERLAP Hz of an already chosen formant.
        # Then sort the chosen formants in order of increasing frequency.
        formants = []
        while len(formants) < 5:
            candidate = formant_candidates.pop()
            if not any(abs(freq - candidate[0]) <= FORMANT_OVERLAP for freq, _ in formants):
                formants.append(candidate)
        formants = sorted(formants, key=lambda x: x[0])
        out.append((vowel_phone, tuple(formant[0] for formant in formants)))
    return out
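

# Example usage (a minimal sketch): the TIMIT-style paths below are hypothetical placeholders,
# and the audio is assumed to already be in a standard RIFF WAV format that scipy.io.wavfile
# can read (db_tools.convert_dir_to_wav is presumably provided for that conversion, but its
# exact signature is not shown here).
if __name__ == "__main__":
    example_wav = "TIMIT/TRAIN/DR1/FCJF0/SA1.WAV"  # hypothetical path
    example_phn = "TIMIT/TRAIN/DR1/FCJF0/SA1.PHN"  # hypothetical path
    for phone, formant_freqs in extract_formants(example_wav, example_phn, plot=False):
        print(phone, formant_freqs)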