Skip to content

Commit 1d76135

Browse files
neurlangYour Name
authored and
Your Name
committed
tweak the griffin lim
1 parent a1aa593 commit 1d76135

File tree

4 files changed

+95
-89
lines changed

4 files changed

+95
-89
lines changed

cmd/tomel/main.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ func main() {
2525
m.MelFmin = 0
2626
m.MelFmax = 8000
2727
m.YReverse = true
28-
m.Window = 1024
28+
m.Window = 256
2929
m.Resolut = 8192
3030

3131
if strings.HasSuffix(filename, ".flac") {

cmd/towav/main.go

+13-3
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"fmt"
55
"github.com/neurlang/gomel/mel"
66
"os"
7+
"strconv"
78
)
89

910
func main() {
@@ -15,6 +16,12 @@ func main() {
1516

1617
// Get the filename from the command-line arguments
1718
var filename = os.Args[1]
19+
var freq = "44100"
20+
21+
if len(os.Args) > 2 {
22+
freq = os.Args[2]
23+
}
24+
frequency, _ := strconv.Atoi(freq)
1825

1926
// Create a new instance of Mel
2027
var m = mel.NewMel()
@@ -24,10 +31,13 @@ func main() {
2431
m.MelFmin = 0
2532
m.MelFmax = 8000
2633
m.YReverse = true
27-
m.Window = 1024
34+
m.Window = 256
2835
m.Resolut = 8192
29-
m.GriffinLimIterations = 5
30-
m.Spread = -13
36+
m.GriffinLimIterations = 20
37+
m.VolumeBoost = 0.0
38+
39+
m.SampleRate = frequency
40+
3141
// Generate the wave from a PNG file
3242
inputFile := filename
3343
outputFile := filename + ".wav"

mel/impl.go

+51-41
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import "github.com/faiface/beep/wav"
1010
import "github.com/mewkiz/flac"
1111
import "math"
1212
import "math/rand"
13+
import "encoding/binary"
1314

1415
func dumpbuffer(buf [][2]float64, mels int) (out []uint16) {
1516
stride := len(buf) / mels
@@ -41,7 +42,13 @@ func dumpbuffer(buf [][2]float64, mels int) (out []uint16) {
4142
return
4243
}
4344

44-
func loadpng(name string, reverse bool, spread int) (buf [][2]float64) {
45+
// unpackBytesToFloat64 decodes the first 8 bytes of bytes as a little-endian
// IEEE 754 bit pattern and returns the corresponding float64. It is the
// inverse of packFloat64ToBytes.
//
// bytes must hold at least 8 bytes; binary.LittleEndian.Uint64 panics on a
// shorter slice.
func unpackBytesToFloat64(bytes []byte) float64 {
	return math.Float64frombits(binary.LittleEndian.Uint64(bytes))
}
50+
51+
func loadpng(name string, reverse bool) (buf [][2]float64) {
4552
// Open the PNG file
4653
file, err := os.Open(name)
4754
if err != nil {
@@ -59,7 +66,7 @@ func loadpng(name string, reverse bool, spread int) (buf [][2]float64) {
5966

6067
// Get the bounds of the image
6168
bounds := img.Bounds()
62-
var mgc float64
69+
var floats []byte
6370
for y := bounds.Min.Y; y < bounds.Max.Y; y++ {
6471
for x := bounds.Min.X; x < bounds.Max.X; x++ {
6572

@@ -71,22 +78,37 @@ func loadpng(name string, reverse bool, spread int) (buf [][2]float64) {
7178
// Get the color of the pixel at (x, y)
7279
color = img.At(x, y)
7380
}
74-
r, g, b, a := color.RGBA()
81+
r, g, b, _ := color.RGBA()
7582

76-
//println(128 + int(b) - ((int(a))/2))
77-
mgc = math.Ldexp(1, -128+int(b)/int(math.Sqrt(float64(a))))
83+
if x == 0 && y < 16 {
84+
floats = append(floats, byte(b>>8))
85+
}
7886

79-
val0 := (mgc - float64(r)/float64(a)) * float64(spread)
80-
val1 := (mgc - float64(g)/float64(a)) * float64(spread)
87+
val0 := float64(r>>8) / 255
88+
val1 := float64(g>>8) / 255
8189

8290
val := [2]float64{val0, val1}
8391

8492
buf = append(buf, val)
8593
}
8694
}
95+
var mgc_max, mgc_min = unpackBytesToFloat64(floats[0:8]), unpackBytesToFloat64(floats[8:16])
96+
97+
for i := range buf {
98+
buf[i][0] = (buf[i][0]*(mgc_max-mgc_min) + mgc_min)
99+
buf[i][1] = (buf[i][1]*(mgc_max-mgc_min) + mgc_min)
100+
}
101+
//dumpimage("test.png", buf, 160, reverse)
87102
return
88103
}
89104

105+
// packFloat64ToBytes returns the IEEE 754 bit pattern of f as a newly
// allocated 8-byte slice in little-endian order.
func packFloat64ToBytes(f float64) []byte {
	bytes := make([]byte, 8)
	binary.LittleEndian.PutUint64(bytes, math.Float64bits(f))
	return bytes
}
111+
90112
func dumpimage(name string, buf [][2]float64, mels int, reverse bool) error {
91113

92114
f, err := os.Create(name)
@@ -113,16 +135,16 @@ func dumpimage(name string, buf [][2]float64, mels int, reverse bool) error {
113135
}
114136
}
115137
}
116-
_, exp := math.Frexp((mgc_max + mgc_min) / 2)
117-
exp += 128
138+
floats := append(packFloat64ToBytes(mgc_max), packFloat64ToBytes(mgc_min)...)
139+
//println(mgc_max, mgc_min)
118140
for x := 0; x < stride; x++ {
119141
for y := 0; y < mels; y++ {
120142
var col color.NRGBA
121143
val0 := (buf[stride*y+x][0] - mgc_min) / (mgc_max - mgc_min)
122144
val1 := (buf[stride*y+x][1] - mgc_min) / (mgc_max - mgc_min)
123145
col.R = uint8(int(255 * val0))
124146
col.G = uint8(int(255 * val1))
125-
col.B = uint8(int(exp))
147+
col.B = uint8(int(floats[y&15]))
126148
col.A = uint8(255)
127149
if reverse {
128150
img.SetNRGBA(x, mels-y-1, col)
@@ -255,83 +277,71 @@ func hz_to_mel(value float64) float64 {
255277
}
256278

257279
func domel(filtersize, mels int, spectrum [][2]float64, mel_fmin, mel_fmax float64) (melspectrum [][2]float64) {
258-
259-
var melbin = hz_to_mel(mel_fmax) / float64(mels)
280+
melbin := hz_to_mel(mel_fmax) / float64(mels)
260281

261282
for i := 0; i < mels; i++ {
262-
//var j = 0
263283
for j := 0; j < len(spectrum); j += filtersize {
284+
vallo := float64(filtersize) * (mel_fmin + mel_to_hz(melbin*float64(i))) / (mel_fmax + mel_fmin)
285+
valhi := float64(filtersize) * (mel_fmin + mel_to_hz(melbin*float64(i+1))) / (mel_fmax + mel_fmin)
264286

265-
var vallo = float64(filtersize) * (mel_fmin + mel_to_hz(melbin*float64(i+0))) / (mel_fmax + mel_fmin)
266-
var valhi = float64(filtersize) * (mel_fmin + mel_to_hz(melbin*float64(i+1))) / (mel_fmax + mel_fmin)
267-
268-
var inlo, modlo = math.Modf(vallo)
269-
var inhi = math.Floor(valhi)
287+
inlo, modlo := math.Modf(vallo)
288+
inhi := math.Floor(valhi)
270289
if inlo < 0 {
271290
inlo, modlo, inhi = 0, 0, 0
272291
}
292+
273293
var tot [2]float64
274294
for l := 0; l < 2; l++ {
275-
276295
var total float64
277296

278297
if int(inlo)+1 == int(inhi) {
279-
total += spectrum[j+int(inlo)][l] * float64(1-modlo)
280-
total += spectrum[j+int(inhi)][l] * float64(modlo)
298+
total += spectrum[j+int(inlo)][l] * (1 - modlo)
299+
total += spectrum[j+int(inhi)][l] * modlo
281300
} else {
282-
283301
for k := int(inlo); k < int(inhi); k++ {
284-
var sample = spectrum[j+k][l]
285-
total += sample
302+
total += spectrum[j+k][l]
286303
}
304+
total /= float64(int(inhi) - int(inlo) + 1)
287305
}
288306

289-
total /= float64(int(inhi) - int(inlo) + 1)
290-
291307
tot[l] = total
292308
}
293309
melspectrum = append(melspectrum, tot)
294-
295310
}
296311
}
297312

298313
return
299-
300314
}
301315

302316
func undomel(filtersize, mels int, melspectrum [][2]float64, mel_fmin, mel_fmax float64) (spectrum [][2]float64) {
303-
var filterbin = hz_to_mel(mel_fmax) / float64(mels)
304-
//originalLength := filtersize * mels
317+
filterbin := hz_to_mel(mel_fmax) / float64(mels)
305318
stride := len(melspectrum) / mels
306319

307320
for j := 0; j < len(melspectrum)/mels; j++ {
308-
309321
for i := 0; i < filtersize; i++ {
310-
311322
vallo := float64(hz_to_mel((float64(i)*(mel_fmax+mel_fmin)/float64(filtersize))-mel_fmin) / filterbin)
312323
valhi := float64(hz_to_mel((float64(i+1)*(mel_fmax+mel_fmin)/float64(filtersize))-mel_fmin) / filterbin)
313324

314-
var inlo, _ = math.Modf(vallo)
315-
var inhi = math.Floor(valhi)
325+
inlo, modlo := math.Modf(vallo)
326+
inhi := math.Floor(valhi)
316327
if inlo < 0 {
317-
inlo, inhi = 0, 0
328+
inlo, modlo, inhi = 0, 0, 0
318329
}
330+
319331
var tot [2]float64
320332
for l := 0; l < 2; l++ {
321333
var total float64
322334

323335
if int(inlo) == int(inhi) {
324336
total += melspectrum[j+stride*int(inlo)][l]
325337
} else if int(inlo)+1 == int(inhi) && int(inhi) < mels {
326-
total += melspectrum[j+stride*int(inlo)][l] / 2
327-
total += melspectrum[j+stride*int(inhi)][l] / 2
338+
total += melspectrum[j+stride*int(inlo)][l] * (1 - modlo)
339+
total += melspectrum[j+stride*int(inhi)][l] * modlo
328340
} else {
329-
330341
for k := int(inlo); k < int(inhi); k++ {
331-
var sample = melspectrum[j+stride*k][l]
332-
sample /= inhi - inlo
333-
total += sample
342+
total += melspectrum[j+stride*k][l]
334343
}
344+
total /= inhi - inlo + 1
335345
}
336346

337347
tot[l] = total

mel/mel.go

+30-44
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import "github.com/mjibson/go-dsp/fft"
55
import "math"
66
import "errors"
77
import "math/cmplx"
8+
import "math/rand"
89

910
// Mel represents the configuration for generating mel spectrograms.
1011
type Mel struct {
@@ -19,8 +20,11 @@ type Mel struct {
1920

2021
GriffinLimIterations int
2122

22-
// spread when loading spectrogram from image, can be a value like -10
23-
Spread int
23+
// VolumeBoost is an offset added to every spectrogram value loaded from an image; it can be a value like 1.666
24+
VolumeBoost float64
25+
26+
// sample rate for output wav
27+
SampleRate int
2428
}
2529

2630
// NewMel creates a new Mel instance with default values.
@@ -73,26 +77,18 @@ func (m *Mel) ToMel(buf []float64) ([][2]float64, error) {
7377
}
7478

7579
func ISTFT(s *stft.STFT, spectrogram [][]complex128, numIterations int) []float64 {
80+
frameShift := s.FrameShift
7681
frameLen := len(spectrogram[0])
7782
numFrames := len(spectrogram)
78-
reconstructedSignal := make([]float64, frameLen+(numFrames-1)*s.FrameShift)
79-
windowSum := make([]float64, frameLen+(numFrames-1)*s.FrameShift)
83+
reconstructedSignal := make([]float64, frameLen+(numFrames-1)*frameShift)
84+
windowSum := make([]float64, frameLen+(numFrames-1)*frameShift)
8085

81-
// Initial reconstruction
86+
// Initial reconstruction with a random phase
8287
for i := 0; i < numFrames; i++ {
83-
buf := fft.IFFT(spectrogram[i])
84-
index := 0
85-
for t := i * s.FrameShift; t < i*s.FrameShift+frameLen; t++ {
86-
reconstructedSignal[t] += real(buf[index]) * s.Window[index]
87-
windowSum[t] += s.Window[index]
88-
index++
89-
}
90-
}
91-
92-
// Normalize reconstructed signal by window sum
93-
for i := range reconstructedSignal {
94-
if windowSum[i] != 0 {
95-
reconstructedSignal[i] /= windowSum[i]
88+
for j := range spectrogram[i] {
89+
magnitude0 := cmplx.Abs(spectrogram[i][j])
90+
phase := 2 * math.Pi * rand.Float64()
91+
spectrogram[i][j] = cmplx.Rect(magnitude0, phase)
9692
}
9793
}
9894

@@ -102,8 +98,8 @@ func ISTFT(s *stft.STFT, spectrogram [][]complex128, numIterations int) []float6
10298
for i := 0; i < numFrames; i++ {
10399
frame := make([]float64, frameLen)
104100
for j := 0; j < frameLen; j++ {
105-
if i*s.FrameShift+j < len(reconstructedSignal) {
106-
frame[j] = reconstructedSignal[i*s.FrameShift+j] * s.Window[j]
101+
if i*frameShift+j < len(reconstructedSignal) {
102+
frame[j] = reconstructedSignal[i*frameShift+j] * s.Window[j]
107103
}
108104
}
109105
stftFrame := fft.FFTReal(frame)
@@ -117,12 +113,12 @@ func ISTFT(s *stft.STFT, spectrogram [][]complex128, numIterations int) []float6
117113
}
118114

119115
// Reconstruct the signal from the updated spectrogram
120-
reconstructedSignal = make([]float64, frameLen+(numFrames-1)*s.FrameShift)
121-
windowSum = make([]float64, frameLen+(numFrames-1)*s.FrameShift)
116+
reconstructedSignal = make([]float64, frameLen+(numFrames-1)*frameShift)
117+
windowSum = make([]float64, frameLen+(numFrames-1)*frameShift)
122118
for i := 0; i < numFrames; i++ {
123119
buf := fft.IFFT(spectrogram[i])
124120
index := 0
125-
for t := i * s.FrameShift; t < i*s.FrameShift+frameLen; t++ {
121+
for t := i * frameShift; t < i*frameShift+frameLen; t++ {
126122
reconstructedSignal[t] += real(buf[index]) * s.Window[index]
127123
windowSum[t] += s.Window[index]
128124
index++
@@ -142,30 +138,15 @@ func ISTFT(s *stft.STFT, spectrogram [][]complex128, numIterations int) []float6
142138

143139
// FromMel generates a wave buffer from a mel spectrogram and returns the wave buffer.
144140
func (m *Mel) FromMel(ospectrum [][2]float64) ([]float64, error) {
145-
146141
spectral_denormalize(ospectrum)
147142

148-
ospectrum = undomel(m.Resolut/2, m.NumMels, ospectrum, m.MelFmin, m.MelFmax)
149-
150-
for r := 0; r < int(math.Sqrt(float64(m.MelFmax-m.MelFmin)/float64(m.NumMels))); r++ {
151-
for l := 0; l < 2; l++ {
152-
for x := 0; x < len(ospectrum)/(m.Resolut/2); x++ {
153-
for y := 1; y+1 < m.Resolut/2; y++ {
154-
ospectrum[y+x*(m.Resolut/2)][l] = (ospectrum[y-1+x*(m.Resolut/2)][l] +
155-
ospectrum[y+0+x*(m.Resolut/2)][l] +
156-
ospectrum[y+1+x*(m.Resolut/2)][l]) / 3
157-
}
158-
}
159-
}
160-
}
161-
162-
spectrum := m.undospectrum(ospectrum)
143+
stft1 := stft.New(m.Window, m.Resolut)
163144

164-
stft := stft.New(m.Window, m.Resolut)
145+
undo := m.undospectrum(undomel(m.Resolut/2, m.NumMels, ospectrum, m.MelFmin, m.MelFmax))
165146

166-
buf := ISTFT(stft, spectrum, m.GriffinLimIterations)
147+
buf1 := ISTFT(stft1, undo, m.GriffinLimIterations)
167148

168-
return buf, nil
149+
return buf1, nil
169150
}
170151

171152
// LoadFlac loads mono flac file to sample vector
@@ -225,17 +206,22 @@ func (m *Mel) ToMelWav(inputFile, outputFile string) error {
225206

226207
func (m *Mel) ToWavPng(inputFile, outputFile string) error {
227208

228-
var buf = loadpng(inputFile, m.YReverse, m.Spread)
209+
var buf = loadpng(inputFile, m.YReverse)
229210
if len(buf) == 0 {
230211
return ErrFileNotLoaded
231212
}
232213

214+
for i := range buf {
215+
buf[i][0] += m.VolumeBoost
216+
buf[i][1] += m.VolumeBoost
217+
}
218+
233219
owave, err := m.FromMel(buf)
234220
if err != nil {
235221
return err
236222
}
237223

238-
dumpwav(outputFile, owave, 44100)
224+
dumpwav(outputFile, owave, m.SampleRate)
239225

240226
return nil
241227
}

0 commit comments

Comments
 (0)