Audio Processor

`AudioProcessor`

A class used to process audio signals and convert them into different representations.

Attributes:

Name	Type	Description
`hann_window`	`dict`	A dictionary to store the Hann window for different configurations.
`mel_basis`	`dict`	A dictionary to store the Mel basis for different configurations.

Methods:

Name	Description
`name_mel_basis`	Generate a name for the Mel basis based on the FFT size, maximum frequency, data type, and device.
`amp_to_db`	Convert amplitude to decibels (dB).
`db_to_amp`	Convert decibels (dB) to amplitude.
`wav_to_spec`	Convert a waveform to a spectrogram and compute the magnitude.
`wav_to_energy`	Convert a waveform to a spectrogram and compute the energy.
`spec_to_mel`	Convert a spectrogram to a Mel spectrogram.
`wav_to_mel`	Convert a waveform to a Mel spectrogram.

Source code in training/preprocess/audio_processor.py

class AudioProcessor:
    r"""A class used to process audio signals and convert them into different representations.

    Hann windows and Mel filter banks are created lazily on first use and
    cached on the instance, so repeated calls with the same configuration
    reuse the same tensors.

    Attributes:
        hann_window (dict): Cache of Hann windows, keyed by ``"{win_length}_{dtype}_{device}"``.
        mel_basis (dict): Cache of Mel filter banks, keyed by FFT size, maximum frequency, dtype, device,
            number of Mel bands, sample rate and minimum frequency.

    Methods:
        name_mel_basis(spec, n_fft, fmax): Generate a name for the Mel basis based on the FFT size, maximum frequency, data type, and device.
        amp_to_db(magnitudes, C=1, clip_val=1e-5): Convert amplitude to the (natural) log domain.
        db_to_amp(magnitudes, C=1): Invert ``amp_to_db`` (log domain back to amplitude).
        wav_to_spec(y, n_fft, hop_length, win_length, center=False): Convert a waveform to a spectrogram and compute the magnitude.
        wav_to_energy(y, n_fft, hop_length, win_length, center=False): Convert a waveform to a spectrogram and compute the energy.
        spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax): Convert a spectrogram to a Mel spectrogram.
        wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fmax, center=False): Convert a waveform to a Mel spectrogram.
    """

    def __init__(self):
        # Both caches start empty and are filled on demand by wav_to_spec / spec_to_mel.
        self.hann_window = {}
        self.mel_basis = {}

    @staticmethod
    def name_mel_basis(spec: torch.Tensor, n_fft: int, fmax: int) -> str:
        """Generate a name for the Mel basis based on the FFT size, maximum frequency, data type, and device.

        Args:
            spec (torch.Tensor): The spectrogram tensor; only its dtype and device are read.
            n_fft (int): The FFT size.
            fmax (int): The maximum frequency.

        Returns:
            str: A string of the form ``"{n_fft}_{fmax}_{dtype}_{device}"``.
        """
        n_fft_len = f"{n_fft}_{fmax}_{spec.dtype}_{spec.device}"
        return n_fft_len

    @staticmethod
    def amp_to_db(magnitudes: torch.Tensor, C: int = 1, clip_val: float = 1e-5) -> torch.Tensor:
        r"""Convert amplitude to the log domain.

        Note that this is a natural-logarithm compression, ``log(clamp(x, clip_val) * C)``,
        not a ``20 * log10`` decibel conversion, despite the method name.

        Args:
            magnitudes (Tensor): The amplitude magnitudes to convert.
            C (int, optional): A constant the clamped magnitudes are multiplied by before the log. Defaults to 1.
            clip_val (float, optional): A value to clamp the magnitudes to avoid taking the log of zero. Defaults to 1e-5.

        Returns:
            Tensor: The log-compressed magnitudes.
        """
        return torch.log(torch.clamp(magnitudes, min=clip_val) * C)

    @staticmethod
    def db_to_amp(magnitudes: torch.Tensor, C: int = 1) -> torch.Tensor:
        r"""Convert log-domain values back to amplitude (inverse of ``amp_to_db``).

        Computes ``exp(x) / C``. Values that were clamped by ``amp_to_db``
        come back as ``clip_val`` rather than their original amplitude.

        Args:
            magnitudes (Tensor): The log-domain magnitudes to convert.
            C (int, optional): The constant that was used in the forward conversion. Defaults to 1.

        Returns:
            Tensor: The converted magnitudes in amplitude.
        """
        return torch.exp(magnitudes) / C

    def wav_to_spec(
        self,
        y: torch.Tensor,
        n_fft: int,
        hop_length: int,
        win_length: int,
        center: bool = False,
    ) -> torch.Tensor:
        r"""Convert a waveform to a spectrogram and compute the magnitude.

        The waveform is reflect-padded by ``(n_fft - hop_length) / 2`` samples on
        each side before the STFT, independently of the ``center`` flag.

        Args:
            y (Tensor): The input waveform. A singleton dimension 1 is squeezed away,
                so both ``[B, T]`` and ``[B, 1, T]`` layouts are accepted.
            n_fft (int): The FFT size.
            hop_length (int): The hop (stride) size.
            win_length (int): The window size.
            center (bool, optional): Forwarded to ``torch.stft``; whether to pad `y` such that frames are centered. Defaults to False.

        Returns:
            Tensor: The magnitude of the computed spectrogram.
        """
        # Drop an optional channel dimension so the signal is [B, T].
        y = y.squeeze(1)

        # One window per (length, dtype, device) combination.
        dtype_device = str(y.dtype) + "_" + str(y.device)
        wnsize_dtype_device = str(win_length) + "_" + dtype_device
        if wnsize_dtype_device not in self.hann_window:
            self.hann_window[wnsize_dtype_device] = torch.hann_window(win_length).to(dtype=y.dtype, device=y.device)

        # Reflect padding needs a channel dimension, hence the unsqueeze/squeeze round trip.
        pad = int((n_fft - hop_length) / 2)
        y = torch.nn.functional.pad(
            y.unsqueeze(1),
            (pad, pad),
            mode="reflect",
        )
        y = y.squeeze(1)

        spec = torch.stft(
            y,
            n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=self.hann_window[wnsize_dtype_device],
            center=center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )

        # Split the complex result into a trailing (real, imag) axis.
        spec = torch.view_as_real(spec)

        # Compute the magnitude; the small constant keeps sqrt differentiable at zero.
        spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

        return spec

    def wav_to_energy(
        self,
        y: torch.Tensor,
        n_fft: int,
        hop_length: int,
        win_length: int,
        center: bool = False,
    ) -> torch.Tensor:
        r"""Convert a waveform to a spectrogram and compute the energy.

        The energy of a frame is the L2 norm of its magnitude spectrum (taken
        over dimension 1). The result is then standardised with the mean and
        standard deviation over all of its elements.

        NOTE: if every frame has the same energy the standard deviation is zero
        and the returned values are not finite.

        Args:
            y (Tensor): The input waveform.
            n_fft (int): The FFT size.
            hop_length (int): The hop (stride) size.
            win_length (int): The window size.
            center (bool, optional): Whether to pad `y` such that frames are centered. Defaults to False.

        Returns:
            Tensor: The normalised energy of the computed spectrogram.
        """
        spec = self.wav_to_spec(y, n_fft, hop_length, win_length, center=center)
        # squeeze(0) only removes the leading dimension when it has size 1.
        spec = torch.norm(spec, dim=1, keepdim=True).squeeze(0)

        # Normalize the energy
        return (spec - spec.mean()) / spec.std()

    def spec_to_mel(
        self,
        spec: torch.Tensor,
        n_fft: int,
        num_mels: int,
        sample_rate: int,
        fmin: int,
        fmax: int,
    ) -> torch.Tensor:
        r"""Convert a spectrogram to a Mel spectrogram.

        The Mel filter bank is built once per configuration and cached in
        ``self.mel_basis``. The result is log-compressed with ``amp_to_db``.

        Args:
            spec (torch.Tensor): The input spectrogram of shape [B, C, T].
            n_fft (int): The FFT size.
            num_mels (int): The number of Mel bands.
            sample_rate (int): The sample rate of the audio.
            fmin (int): The minimum frequency.
            fmax (int): The maximum frequency.

        Returns:
            torch.Tensor: The computed Mel spectrogram of shape [B, C, T].
        """
        # name_mel_basis() only encodes n_fft, fmax, dtype and device. The filter
        # bank also depends on num_mels, sample_rate and fmin, so they must be part
        # of the cache key; otherwise a second configuration silently reuses the
        # first configuration's basis.
        mel_basis_key = f"{self.name_mel_basis(spec, n_fft, fmax)}_{num_mels}_{sample_rate}_{fmin}"

        if mel_basis_key not in self.mel_basis:
            mel = librosa_mel_fn(sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
            self.mel_basis[mel_basis_key] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)

        # Project the frequency axis onto the Mel bands, then log-compress.
        mel = torch.matmul(self.mel_basis[mel_basis_key], spec)
        mel = self.amp_to_db(mel)

        return mel

    def wav_to_mel(
        self,
        y: torch.Tensor,
        n_fft: int,
        num_mels: int,
        sample_rate: int,
        hop_length: int,
        win_length: int,
        fmin: int,
        fmax: int,
        center: bool = False,
    ) -> torch.Tensor:
        r"""Convert a waveform to a Mel spectrogram.

        Equivalent to ``wav_to_spec`` followed by ``spec_to_mel``.

        Args:
            y (torch.Tensor): The input waveform.
            n_fft (int): The FFT size.
            num_mels (int): The number of Mel bands.
            sample_rate (int): The sample rate of the audio.
            hop_length (int): The hop (stride) size.
            win_length (int): The window size.
            fmin (int): The minimum frequency.
            fmax (int): The maximum frequency.
            center (bool, optional): Whether to pad `y` such that frames are centered. Defaults to False.

        Returns:
            torch.Tensor: The computed Mel spectrogram.
        """
        # Convert the waveform to a spectrogram
        spec = self.wav_to_spec(y, n_fft, hop_length, win_length, center=center)

        # Convert the spectrogram to a Mel spectrogram
        mel = self.spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax)

        return mel

`amp_to_db(magnitudes, C=1, clip_val=1e-05)` `staticmethod`

Convert amplitude to the log domain (natural logarithm of the clamped, scaled magnitudes; referred to as dB here).

Parameters:

Name	Type	Description	Default
`magnitudes`	`Tensor`	The amplitude magnitudes to convert.	required
`C`	`int`	A constant value used in the conversion. Defaults to 1.	`1`
`clip_val`	`float`	A value to clamp the magnitudes to avoid taking the log of zero. Defaults to 1e-5.	`1e-05`

Returns:

Name	Type	Description
`Tensor`	`Tensor`	The converted magnitudes in dB.

Source code in training/preprocess/audio_processor.py

@staticmethod
def amp_to_db(magnitudes: torch.Tensor, C: int = 1, clip_val: float = 1e-5) -> torch.Tensor:
    r"""Log-compress amplitude values.

    The magnitudes are first floored at ``clip_val``, then scaled by ``C``,
    and finally passed through the natural logarithm.

    Args:
        magnitudes (Tensor): Amplitude values to compress.
        C (int, optional): Scale factor applied after flooring. Defaults to 1.
        clip_val (float, optional): Lower bound applied before the logarithm so that zero never reaches it. Defaults to 1e-5.

    Returns:
        Tensor: The log-compressed magnitudes.
    """
    floored = magnitudes.clamp(min=clip_val)
    return (floored * C).log()

`db_to_amp(magnitudes, C=1)` `staticmethod`

Convert log-domain values (referred to as dB here) back to amplitude; the inverse of `amp_to_db`.

Parameters:

Name	Type	Description	Default
`magnitudes`	`Tensor`	The dB magnitudes to convert.	required
`C`	`int`	A constant value used in the conversion. Defaults to 1.	`1`

Returns:

Name	Type	Description
`Tensor`	`Tensor`	The converted magnitudes in amplitude.

Source code in training/preprocess/audio_processor.py

@staticmethod
def db_to_amp(magnitudes: torch.Tensor, C: int = 1) -> torch.Tensor:
    r"""Undo the log compression: exponentiate, then divide by ``C``.

    Args:
        magnitudes (Tensor): Log-domain values to expand.
        C (int, optional): Divisor applied after the exponential. Defaults to 1.

    Returns:
        Tensor: The values mapped back to amplitude.
    """
    expanded = magnitudes.exp()
    return expanded / C

`name_mel_basis(spec, n_fft, fmax)` `staticmethod`

Generate a name for the Mel basis based on the FFT size, maximum frequency, data type, and device.

Parameters:

Name	Type	Description	Default
`spec`	`Tensor`	The spectrogram tensor.	required
`n_fft`	`int`	The FFT size.	required
`fmax`	`int`	The maximum frequency.	required

Returns:

Name	Type	Description
`str`	`str`	The generated name for the Mel basis.

Source code in training/preprocess/audio_processor.py

@staticmethod
def name_mel_basis(spec: torch.Tensor, n_fft: int, fmax: int) -> str:
    """Build an identifier from the FFT size, maximum frequency, dtype and device.

    Args:
        spec (torch.Tensor): Spectrogram tensor; only its ``dtype`` and ``device`` are used.
        n_fft (int): The FFT size.
        fmax (int): The maximum frequency.

    Returns:
        str: The four values rendered as text and joined with underscores.
    """
    fields = (n_fft, fmax, spec.dtype, spec.device)
    return "_".join(str(field) for field in fields)

`spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax)`

Convert a spectrogram to a Mel spectrogram.

Parameters:

Name	Type	Description	Default
`spec`	`Tensor`	The input spectrogram of shape [B, C, T].	required
`n_fft`	`int`	The FFT size.	required
`num_mels`	`int`	The number of Mel bands.	required
`sample_rate`	`int`	The sample rate of the audio.	required
`fmin`	`int`	The minimum frequency.	required
`fmax`	`int`	The maximum frequency.	required

Returns:

Type	Description
`Tensor`	The computed Mel spectrogram of shape [B, C, T].

Source code in training/preprocess/audio_processor.py

def spec_to_mel(
        self,
        spec: torch.Tensor,
        n_fft: int,
        num_mels: int,
        sample_rate: int,
        fmin: int,
        fmax: int,
) -> torch.Tensor:
    r"""Convert a spectrogram to a Mel spectrogram.

    The Mel filter bank is built once per configuration and cached in
    ``self.mel_basis``. The result is log-compressed with ``self.amp_to_db``.

    Args:
        spec (torch.Tensor): The input spectrogram of shape [B, C, T].
        n_fft (int): The FFT size.
        num_mels (int): The number of Mel bands.
        sample_rate (int): The sample rate of the audio.
        fmin (int): The minimum frequency.
        fmax (int): The maximum frequency.

    Returns:
        torch.Tensor: The computed Mel spectrogram of shape [B, C, T].
    """
    # name_mel_basis() only encodes n_fft, fmax, dtype and device. The filter
    # bank also depends on num_mels, sample_rate and fmin, so they must be part
    # of the cache key; otherwise a second configuration silently reuses the
    # first configuration's basis.
    mel_basis_key = f"{self.name_mel_basis(spec, n_fft, fmax)}_{num_mels}_{sample_rate}_{fmin}"

    if mel_basis_key not in self.mel_basis:
        mel = librosa_mel_fn(sr=sample_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        self.mel_basis[mel_basis_key] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)

    # Project the frequency axis onto the Mel bands, then log-compress.
    mel = torch.matmul(self.mel_basis[mel_basis_key], spec)
    mel = self.amp_to_db(mel)

    return mel

`wav_to_energy(y, n_fft, hop_length, win_length, center=False)`

Convert a waveform to a spectrogram and compute the energy.

Parameters:

Name	Type	Description	Default
`y`	`Tensor`	The input waveform.	required
`n_fft`	`int`	The FFT size.	required
`hop_length`	`int`	The hop (stride) size.	required
`win_length`	`int`	The window size.	required
`center`	`bool`	Whether to pad `y` such that frames are centered. Defaults to False.	`False`

Returns:

Name	Type	Description
`Tensor`	`Tensor`	The energy of the computed spectrogram.

Source code in training/preprocess/audio_processor.py

def wav_to_energy(
    self,
    y: torch.Tensor,
    n_fft: int,
    hop_length: int,
    win_length: int,
    center: bool = False,
) -> torch.Tensor:
    r"""Compute a standardised per-frame energy contour from a waveform.

    The magnitude spectrogram is obtained from ``self.wav_to_spec``. The
    energy of a frame is the L2 norm over dimension 1 of that spectrogram.
    The contour is then shifted by its mean and divided by its standard
    deviation, both taken over all elements.

    Args:
        y (Tensor): The input waveform.
        n_fft (int): The FFT size.
        hop_length (int): The hop (stride) size.
        win_length (int): The window size.
        center (bool, optional): Passed through to ``wav_to_spec``. Defaults to False.

    Returns:
        Tensor: The standardised energy contour.
    """
    magnitude = self.wav_to_spec(y, n_fft, hop_length, win_length, center=center)

    # Collapse dimension 1 to one value per frame; squeeze(0) drops the
    # leading dimension only when it has size 1.
    energy = magnitude.norm(dim=1, keepdim=True).squeeze(0)

    centred = energy - energy.mean()
    return centred / energy.std()

`wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fmax, center=False)`

Convert a waveform to a Mel spectrogram.

Parameters:

Name	Type	Description	Default
`y`	`Tensor`	The input waveform.	required
`n_fft`	`int`	The FFT size.	required
`num_mels`	`int`	The number of Mel bands.	required
`sample_rate`	`int`	The sample rate of the audio.	required
`hop_length`	`int`	The hop (stride) size.	required
`win_length`	`int`	The window size.	required
`fmin`	`int`	The minimum frequency.	required
`fmax`	`int`	The maximum frequency.	required
`center`	`bool`	Whether to pad `y` such that frames are centered. Defaults to False.	`False`

Returns:

Type	Description
`Tensor`	The computed Mel spectrogram.

Source code in training/preprocess/audio_processor.py

def wav_to_mel(
    self,
    y: torch.Tensor,
    n_fft: int,
    num_mels: int,
    sample_rate: int,
    hop_length: int,
    win_length: int,
    fmin: int,
    fmax: int,
    center: bool = False,
) -> torch.Tensor:
    r"""Go from a waveform straight to a Mel spectrogram.

    Chains ``self.wav_to_spec`` (waveform to magnitude spectrogram) and
    ``self.spec_to_mel`` (magnitude spectrogram to Mel spectrogram).

    Args:
        y (torch.Tensor): The input waveform.
        n_fft (int): The FFT size.
        num_mels (int): The number of Mel bands.
        sample_rate (int): The sample rate of the audio.
        hop_length (int): The hop (stride) size.
        win_length (int): The window size.
        fmin (int): The minimum frequency.
        fmax (int): The maximum frequency.
        center (bool, optional): Passed through to ``wav_to_spec``. Defaults to False.

    Returns:
        torch.Tensor: The computed Mel spectrogram.
    """
    return self.spec_to_mel(
        self.wav_to_spec(y, n_fft, hop_length, win_length, center=center),
        n_fft,
        num_mels,
        sample_rate,
        fmin,
        fmax,
    )

`wav_to_spec(y, n_fft, hop_length, win_length, center=False)`

Convert a waveform to a spectrogram and compute the magnitude.

Parameters:

Name	Type	Description	Default
`y`	`Tensor`	The input waveform.	required
`n_fft`	`int`	The FFT size.	required
`hop_length`	`int`	The hop (stride) size.	required
`win_length`	`int`	The window size.	required
`center`	`bool`	Whether to pad `y` such that frames are centered. Defaults to False.	`False`

Returns:

Name	Type	Description
`Tensor`	`Tensor`	The magnitude of the computed spectrogram.

Source code in training/preprocess/audio_processor.py

def wav_to_spec(
    self,
    y: torch.Tensor,
    n_fft: int,
    hop_length: int,
    win_length: int,
    center: bool = False,
) -> torch.Tensor:
    r"""Compute the magnitude spectrogram of a waveform.

    The signal is reflect-padded by ``(n_fft - hop_length) / 2`` samples on
    both sides and run through a Hann-windowed one-sided STFT. The Hann
    window is cached in ``self.hann_window`` per window length, dtype and
    device.

    Args:
        y (Tensor): The input waveform. A singleton dimension 1 is squeezed away before processing.
        n_fft (int): The FFT size.
        hop_length (int): The hop (stride) size.
        win_length (int): The window size.
        center (bool, optional): Forwarded to ``torch.stft``. Defaults to False.

    Returns:
        Tensor: The magnitude of the computed spectrogram.
    """
    signal = y.squeeze(1)

    # Look up (or lazily create) the window for this configuration.
    cache_key = f"{win_length}_{signal.dtype}_{signal.device}"
    window = self.hann_window.get(cache_key)
    if window is None:
        window = torch.hann_window(win_length).to(dtype=signal.dtype, device=signal.device)
        self.hann_window[cache_key] = window

    # Reflect padding needs a channel dimension, so add it and remove it again.
    margin = int((n_fft - hop_length) / 2)
    padded = torch.nn.functional.pad(signal.unsqueeze(1), (margin, margin), mode="reflect").squeeze(1)

    stft_out = torch.stft(
        padded,
        n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
        return_complex=True,
    )

    # The trailing axis holds (real, imag); the magnitude is its Euclidean
    # length, with a small constant added under the square root.
    real_imag = torch.view_as_real(stft_out)
    return torch.sqrt(real_imag.pow(2).sum(-1) + 1e-6)

Audio Processor

AudioProcessor

amp_to_db(magnitudes, C=1, clip_val=1e-05) staticmethod

db_to_amp(magnitudes, C=1) staticmethod

name_mel_basis(spec, n_fft, fmax) staticmethod

spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax)

wav_to_energy(y, n_fft, hop_length, win_length, center=False)

wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fmax, center=False)

wav_to_spec(y, n_fft, hop_length, win_length, center=False)

`AudioProcessor`

`amp_to_db(magnitudes, C=1, clip_val=1e-05)` `staticmethod`

`db_to_amp(magnitudes, C=1)` `staticmethod`

`name_mel_basis(spec, n_fft, fmax)` `staticmethod`

`spec_to_mel(spec, n_fft, num_mels, sample_rate, fmin, fmax)`

`wav_to_energy(y, n_fft, hop_length, win_length, center=False)`

`wav_to_mel(y, n_fft, num_mels, sample_rate, hop_length, win_length, fmin, fmax, center=False)`

`wav_to_spec(y, n_fft, hop_length, win_length, center=False)`