Skip to content

Audio

normalize_loudness(wav)

Normalize the loudness of an audio waveform.

Parameters:

Name Type Description Default
wav Tensor

The input waveform.

required

Returns:

Type Description
Tensor

torch.Tensor: The normalized waveform.

Examples:

>>> wav = torch.tensor([1.0, 2.0, 3.0])
>>> normalize_loudness(wav)
tensor([0.3333, 0.6667, 1.0000])
Source code in training/preprocess/audio.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
def normalize_loudness(wav: torch.Tensor) -> torch.Tensor:
    r"""Peak-normalize an audio waveform.

    The waveform is divided by its largest absolute sample value, so the
    loudest sample of the result has magnitude 1.

    Args:
        wav (torch.Tensor): The input waveform.

    Returns:
        torch.Tensor: The normalized waveform. An empty waveform is returned
        as an (empty) copy, and an all-zero waveform is returned as zeros
        instead of NaNs.

    Examples:
        >>> wav = torch.tensor([1.0, 2.0, 3.0])
        >>> normalize_loudness(wav)
        tensor([0.3333, 0.6667, 1.0000])
    """
    if wav.numel() == 0:
        # torch.max raises on an empty tensor; there is nothing to scale.
        return wav.clone()

    peak = torch.max(torch.abs(wav))
    if peak == 0:
        # Silent input: dividing by a zero peak would turn every sample into NaN.
        peak = torch.ones_like(peak)
    return wav / peak

preprocess_audio(audio, sr_actual, sr)

Preprocesses audio by converting stereo to mono, resampling if necessary, and returning the audio tensor and sample rate.

Parameters:

Name Type Description Default
audio Tensor

The audio tensor to preprocess.

required
sr_actual int

The actual sample rate of the audio.

required
sr Union[int, None]

The target sample rate to resample the audio to, if necessary.

required

Returns:

Type Description
Tuple[Tensor, int]

Tuple[torch.Tensor, int]: The preprocessed audio tensor and sample rate.

Source code in training/preprocess/audio.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
def preprocess_audio(
    audio: torch.Tensor, sr_actual: int, sr: Union[int, None],
) -> Tuple[torch.Tensor, int]:
    r"""Preprocesses audio by converting stereo to mono, resampling if necessary, and returning the audio tensor and sample rate.

    Args:
        audio (torch.Tensor): The audio tensor to preprocess. A tensor with more
            than one dimension is treated as channels-first and is downmixed over
            dimension 0 when it has more than one channel; a 1-D tensor is treated
            as an already-mono waveform and is left as is.
        sr_actual (int): The actual sample rate of the audio.
        sr (Union[int, None]): The target sample rate to resample the audio to, if necessary.
            ``None`` disables resampling.

    Returns:
        Tuple[torch.Tensor, int]: The preprocessed audio tensor and sample rate.

    Raises:
        Exception: Any error raised during processing is re-raised with a
            descriptive message, using the original exception type when that type
            can be built from a single message and ``RuntimeError`` otherwise.
    """
    try:
        if audio.dim() > 1:
            # Only downmix when there really are several channels. The previous
            # `shape[0] > 0` check was true for every non-empty tensor, so a 1-D
            # waveform was averaged over its samples.
            if audio.shape[0] > 1:
                audio = stereo_to_mono(audio)
            # Drop the leading singleton channel dimension.
            audio = audio.squeeze(0)
        if sr is not None and sr_actual != sr:
            audio_np = resample(audio.numpy(), orig_sr=sr_actual, target_sr=sr)
            # Convert back to torch tensor
            audio = torch.from_numpy(audio_np)
            sr_actual = sr
    except Exception as e:
        message = f"The following error happened while processing the audio ... \n {e!s}"
        try:
            wrapped = type(e)(message)
        except Exception:
            # Some exception classes cannot be built from a single message;
            # fall back so the original error is not hidden behind a TypeError.
            wrapped = RuntimeError(message)
        raise wrapped.with_traceback(e.__traceback__) from e

    return audio, sr_actual

resample(wav, orig_sr, target_sr)

Resamples an audio waveform from the original sampling rate to the target sampling rate.

Parameters:

Name Type Description Default
wav ndarray

The audio waveform to be resampled.

required
orig_sr int

The original sampling rate of the audio waveform.

required
target_sr int

The target sampling rate to resample the audio waveform to.

required

Returns:

Type Description
ndarray

np.ndarray: The resampled audio waveform.

Source code in training/preprocess/audio.py
22
23
24
25
26
27
28
29
30
31
32
33
def resample(wav: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    r"""Convert an audio waveform from one sampling rate to another.

    This is a thin wrapper that delegates to ``librosa.resample``.

    Args:
        wav (np.ndarray): The waveform to resample.
        orig_sr (int): The sampling rate ``wav`` currently has.
        target_sr (int): The sampling rate the result should have.

    Returns:
        np.ndarray: The waveform resampled to ``target_sr``.
    """
    rate_options = {"orig_sr": orig_sr, "target_sr": target_sr}
    return librosa.resample(wav, **rate_options)

safe_load(path, sr)

Load an audio file from disk and return its content as a numpy array.

Parameters:

Name Type Description Default
path str

The path to the audio file.

required
sr int or None

The target sampling rate. If None, the original sampling rate is used.

required

Returns:

Type Description
Tuple[ndarray, int]

Tuple[np.ndarray, int]: A tuple containing the audio content as a numpy array and the actual sampling rate.

Source code in training/preprocess/audio.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def safe_load(path: str, sr: Union[int, None]) -> Tuple[np.ndarray, int]:
    r"""Load an audio file from disk and return its content as a numpy array.

    The file is read with ``torchaudio.load``, downmixed to a single channel
    when it has more than one, and resampled when a different target rate is
    requested.

    Args:
        path (str): The path to the audio file.
        sr (int or None): The target sampling rate. If None, the original sampling rate is used.

    Returns:
        Tuple[np.ndarray, int]: A tuple containing the audio content as a numpy array and the actual sampling rate.

    Raises:
        Exception: Any error raised while loading or processing is re-raised
            with a message naming ``path``, using the original exception type
            when that type can be built from a single message and
            ``RuntimeError`` otherwise.
    """
    try:
        audio, sr_actual = torchaudio.load(path)  # type: ignore
        # Downmix only when there is more than one channel; the previous
        # `> 0` check was true for every non-empty tensor.
        # NOTE(review): assumes the loaded tensor is (channels, samples) - confirm.
        if audio.shape[0] > 1:
            audio = stereo_to_mono(audio)
        # Drop the leading singleton channel dimension.
        audio = audio.squeeze(0)
        if sr is not None and sr_actual != sr:
            audio = resample(audio.numpy(), orig_sr=sr_actual, target_sr=sr)
            sr_actual = sr
        else:
            audio = audio.numpy()
    except Exception as e:
        message = f"The following error happened loading the file {path} ... \n" + str(e)
        try:
            wrapped = type(e)(message)
        except Exception:
            # Some exception classes cannot be built from a single message;
            # fall back so the original error is not hidden behind a TypeError.
            wrapped = RuntimeError(message)
        raise wrapped.with_traceback(e.__traceback__) from e

    return audio, sr_actual

stereo_to_mono(audio)

Converts a stereo audio tensor to mono by taking the mean across channels.

Parameters:

Name Type Description Default
audio Tensor

Input audio tensor of shape (channels, samples).

required

Returns:

Type Description
Tensor

torch.Tensor: Mono audio tensor of shape (1, samples).

Source code in training/preprocess/audio.py
10
11
12
13
14
15
16
17
18
19
def stereo_to_mono(audio: torch.Tensor) -> torch.Tensor:
    r"""Downmix a multi-channel audio tensor to one channel by averaging.

    The mean is taken over dimension 0 and that dimension is kept with size 1.

    Args:
        audio (torch.Tensor): Input audio tensor of shape (channels, samples).

    Returns:
        torch.Tensor: Mono audio tensor of shape (1, samples).
    """
    mono = audio.mean(dim=0, keepdim=True)
    return mono