@@ -26,6 +26,9 @@ import org.tensorflow.op.audio.Mfcc
2626import org.tensorflow.types.TFloat32
2727import org.tensorflow.types.TInt32
2828import org.tensorflow.types.TString
29+ import kotlin.Boolean
30+ import kotlin.Float
31+ import kotlin.Long
2932
3033/* *
3134 * An API for building `audio` operations as [Op][org.tensorflow.op.Op]s
@@ -47,33 +50,33 @@ public class AudioOps(
4750
4851 /* *
4952 * Produces a visualization of audio data over time.
50- *
53+ *
5154 * Spectrograms are a standard way of representing audio information as a series of
5255 * slices of frequency information, one slice for each window of time. By joining
5356 * these together into a sequence, they form a distinctive fingerprint of the sound
5457 * over time.
55- *
58+ *
5659 * This op expects to receive audio data as an input, stored as floats in the range
5760 * -1 to 1, together with a window width in samples, and a stride specifying how
5861 * far to move the window between slices. From this it generates a three
5962 * dimensional output. The first dimension is for the channels in the input, so a
6063 * stereo audio input would have two here for example. The second dimension is time,
6164 * with successive frequency slices. The third dimension has an amplitude value for
6265 * each frequency during that time slice.
63- *
66+ *
6467 * This means the layout when converted and saved as an image is rotated 90 degrees
6568 * clockwise from a typical spectrogram. Time is descending down the Y axis, and
6669 * the frequency decreases from left to right.
67- *
70+ *
6871 * Each value in the result represents the square root of the sum of the squares of the
6972 * real and imaginary parts of an FFT on the current window of samples. In this way, the
7073 * lowest dimension represents the power of each frequency in the current window,
7174 * and adjacent windows are concatenated in the next dimension.
72- *
75+ *
7376 * To get a more intuitive and visual look at what this operation does, you can run
7477 * tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
7578 * resulting spectrogram as a PNG image.
76- *
79+ *
7780 * @param input Float representation of audio data.
7881 * @param windowSize How wide the input window is in samples. For the highest efficiency
7982 * this should be a power of two, but other values are accepted.
@@ -89,33 +92,33 @@ public class AudioOps(
8992 windowSize : Long ,
9093 stride : Long ,
9194 magnitudeSquared : Boolean? = null
92- ): AudioSpectrogram = java.audioSpectrogram(
95+ ): AudioSpectrogram = java.audioSpectrogram(
9396 input,
9497 windowSize,
9598 stride,
9699 * listOfNotNull(
97- magnitudeSquared?.let { org.tensorflow.op.audio.AudioSpectrogram .magnitudeSquared(it) }
100+ magnitudeSquared?.let { org.tensorflow.op.audio.AudioSpectrogram .magnitudeSquared(it) }
98101 ).toTypedArray()
99- )
102+ )
100103
101104 /* *
102105 * Decode a 16-bit PCM WAV file to a float tensor.
103- *
106+ *
104107 * The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
105- *
108+ *
106109 * When desired_channels is set, if the input contains fewer channels than this
107110 * then the last channel will be duplicated to give the requested number, else if
108111 * the input has more channels than requested then the additional channels will be
109112 * ignored.
110- *
113+ *
111114 * If desired_samples is set, then the audio will be cropped or padded with zeroes
112115 * to the requested length.
113- *
116+ *
114117 * The first output contains a Tensor with the content of the audio samples. The
115118 * lowest dimension will be the number of channels, and the second will be the
116119 * number of samples. For example, a ten-sample-long stereo WAV file should give an
117120 * output shape of [10, 2].
118- *
121+ *
119122 * @param contents The WAV-encoded audio, usually from a file.
120123 * @param options carries optional attribute values
121124 * @return a new instance of DecodeWav
@@ -127,47 +130,47 @@ public class AudioOps(
127130 contents : Operand <TString >,
128131 desiredChannels : Long? = null,
129132 desiredSamples : Long? = null
130- ): DecodeWav = java.decodeWav(
133+ ): DecodeWav = java.decodeWav(
131134 contents,
132135 * listOfNotNull(
133- desiredChannels?.let { org.tensorflow.op.audio.DecodeWav .desiredChannels(it) },
134- desiredSamples?.let { org.tensorflow.op.audio.DecodeWav .desiredSamples(it) }
136+ desiredChannels?.let { org.tensorflow.op.audio.DecodeWav .desiredChannels(it) },
137+ desiredSamples?.let { org.tensorflow.op.audio.DecodeWav .desiredSamples(it) }
135138 ).toTypedArray()
136- )
139+ )
137140
138141 /* *
139142 * Encode audio data using the WAV file format.
140- *
143+ *
141144 * This operation will generate a string suitable to be saved out to create a .wav
142145 * audio file. It will be encoded in the 16-bit PCM format. It takes in float
143146 * values in the range -1.0f to 1.0f, and any values outside that range will be clamped to
144147 * it.
145- *
148+ *
146149 * `audio` is a 2-D float Tensor of shape `[length, channels]`.
147150 * `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
148- *
151+ *
149152 * @param audio 2-D with shape `[length, channels]`.
150153 * @param sampleRate Scalar containing the sample frequency.
151154 * @return a new instance of EncodeWav
152155 * @see org.tensorflow.op.AudioOps.encodeWav
153156 */
154157 public fun encodeWav (audio : Operand <TFloat32 >, sampleRate : Operand <TInt32 >): EncodeWav =
155- java.encodeWav(
156- audio,
157- sampleRate
158+ java.encodeWav(
159+ audio,
160+ sampleRate
158161 )
159162
160163 /* *
161164 * Transforms a spectrogram into a form that's useful for speech recognition.
162- *
165+ *
163166 * Mel Frequency Cepstral Coefficients are a way of representing audio data that's
164167 * been effective as an input feature for machine learning. They are created by
165168 * taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
166169 * higher frequencies that are less significant to the human ear. They have a long
167170 * history in the speech recognition world, and
168171 * https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
169172 * is a good resource to learn more.
170- *
173+ *
171174 * @param spectrogram Typically produced by the Spectrogram op, with magnitude_squared
172175 * set to true.
173176 * @param sampleRate How many samples per second the source audio used.
@@ -188,14 +191,14 @@ public class AudioOps(
188191 lowerFrequencyLimit : Float? = null,
189192 filterbankChannelCount : Long? = null,
190193 dctCoefficientCount : Long? = null
191- ): Mfcc = java.mfcc(
194+ ): Mfcc = java.mfcc(
192195 spectrogram,
193196 sampleRate,
194197 * listOfNotNull(
195- upperFrequencyLimit?.let { org.tensorflow.op.audio.Mfcc .upperFrequencyLimit(it) },
196- lowerFrequencyLimit?.let { org.tensorflow.op.audio.Mfcc .lowerFrequencyLimit(it) },
197- filterbankChannelCount?.let { org.tensorflow.op.audio.Mfcc .filterbankChannelCount(it) },
198- dctCoefficientCount?.let { org.tensorflow.op.audio.Mfcc .dctCoefficientCount(it) }
198+ upperFrequencyLimit?.let { org.tensorflow.op.audio.Mfcc .upperFrequencyLimit(it) },
199+ lowerFrequencyLimit?.let { org.tensorflow.op.audio.Mfcc .lowerFrequencyLimit(it) },
200+ filterbankChannelCount?.let { org.tensorflow.op.audio.Mfcc .filterbankChannelCount(it) },
201+ dctCoefficientCount?.let { org.tensorflow.op.audio.Mfcc .dctCoefficientCount(it) }
199202 ).toTypedArray()
200- )
203+ )
201204}
0 commit comments