Skip to content

Layers

ConvNorm

Bases: Module

1D convolution layer (a thin wrapper around nn.Conv1d with Kaiming-normal weight initialization). Despite the name, no normalization is applied.

Parameters:

Name Type Description Default
in_channels int

Number of input channels.

required
out_channels int

Number of output channels.

required
kernel_size int

Size of the convolving kernel. Defaults to 1.

1
stride int

Stride of the convolution. Defaults to 1.

1
padding int

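Zero-padding added to both sides of the input. Defaults to None, in which case it is computed as dilation * (kernel_size - 1) / 2 (this requires an odd kernel_size).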

None
dilation int

Spacing between kernel elements. Defaults to 1.

1
bias bool

If True, adds a learnable bias to the output. Defaults to True.

True
Source code in models/enhancer/gaussian_diffusion/layers.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
class ConvNorm(Module):
    r"""1D convolution layer with Kaiming-normal weight initialization.

    Despite the name, no normalization layer is applied: the module wraps a
    single ``nn.Conv1d`` whose weight is re-initialized with
    ``nn.init.kaiming_normal_``.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int, optional): Size of the convolving kernel. Defaults to 1.
        stride (int, optional): Stride of the convolution. Defaults to 1.
        padding (int, optional): Zero-padding added to both sides of the input.
            When None (the default) it is derived as
            ``dilation * (kernel_size - 1) // 2``, which requires an odd
            ``kernel_size``.
        dilation (int, optional): Spacing between kernel elements. Defaults to 1.
        bias (bool, optional): If True, adds a learnable bias to the output. Defaults to True.

    Raises:
        ValueError: If ``padding`` is None and ``kernel_size`` is even.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 1,
        stride: int = 1,
        padding: Optional[int] = None,
        dilation: int = 1,
        bias: bool = True,
    ):
        super().__init__()

        if padding is None:
            # Explicit raise instead of ``assert``: asserts are removed under
            # ``python -O``, which would let an even kernel slip through with
            # a padding that cannot be split evenly between both sides.
            if kernel_size % 2 != 1:
                raise ValueError(
                    f"kernel_size must be odd when padding is None, got {kernel_size}",
                )
            # (kernel_size - 1) is even here, so integer division is exact.
            padding = dilation * (kernel_size - 1) // 2

        self.conv = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            bias=bias,
        )
        nn.init.kaiming_normal_(self.conv.weight)

    def forward(self, signal: Tensor) -> Tensor:
        r"""Forward pass through the convolutional layer.

        Args:
            signal (torch.Tensor): Input signal tensor passed to ``self.conv``.

        Returns:
            torch.Tensor: Output tensor after convolution.
        """
        return self.conv(signal)

forward(signal)

Forward pass through the convolutional layer.

Parameters:

Name Type Description Default
signal Tensor

Input signal tensor.

required

Returns:

Type Description
Tensor

torch.Tensor: Output tensor after convolution.

Source code in models/enhancer/gaussian_diffusion/layers.py
69
70
71
72
73
74
75
76
77
78
79
80
def forward(self, signal: Tensor) -> Tensor:
    r"""Run ``signal`` through the wrapped convolution.

    Args:
        signal (torch.Tensor): Tensor handed to ``self.conv``.

    Returns:
        torch.Tensor: Whatever ``self.conv`` produces for ``signal``.
    """
    return self.conv(signal)

DiffusionEmbedding

Bases: Module

Diffusion Step Embedding.

This module generates diffusion step embeddings for the given input.

Parameters:

Name Type Description Default
d_denoiser int

Dimension of the denoiser.

required

Attributes:

Name Type Description
dim int

Dimension of the diffusion step embedding.

Source code in models/enhancer/gaussian_diffusion/layers.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
class DiffusionEmbedding(Module):
    r"""Diffusion Step Embedding.

    Generates sinusoidal embeddings of the diffusion step: the input is
    multiplied by ``dim // 2`` geometrically spaced frequencies, and the sine
    and cosine of those products are concatenated along the last axis.

    Args:
        d_denoiser (int): Dimension of the denoiser.

    Attributes:
        dim (int): Dimension of the diffusion step embedding.
    """

    def __init__(self, d_denoiser: int):
        super().__init__()
        self.dim = d_denoiser

    def forward(self, x: Tensor) -> Tensor:
        r"""Forward pass through the DiffusionEmbedding module.

        Args:
            x (torch.Tensor): Input tensor of diffusion steps. For a 1-D input
                of length N the result has shape ``(N, 2 * (dim // 2))``.

        Returns:
            torch.Tensor: Diffusion step embeddings. The last axis has
            ``2 * (dim // 2)`` entries, i.e. ``dim - 1`` when ``dim`` is odd.
        """
        half_dim = self.dim // 2

        # With half_dim == 1 the original denominator (half_dim - 1) is zero
        # and raises ZeroDivisionError. Clamping it to 1 leaves every
        # half_dim >= 2 untouched and yields the single frequency 1.0 otherwise.
        log_step = math.log(10000) / max(half_dim - 1, 1)
        frequencies = torch.exp(torch.arange(half_dim, device=x.device) * -log_step)

        # Outer product: one row per step value, one column per frequency.
        angles = x[:, None] * frequencies[None, :]

        return torch.cat((angles.sin(), angles.cos()), dim=-1)

forward(x)

Forward pass through the DiffusionEmbedding module.

Parameters:

Name Type Description Default
x Tensor

Input tensor.

required

Returns:

Type Description
Tensor

torch.Tensor: Diffusion step embeddings.

Source code in models/enhancer/gaussian_diffusion/layers.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
def forward(self, x: Tensor) -> Tensor:
    r"""Forward pass through the DiffusionEmbedding module.

    Multiplies ``x`` by ``self.dim // 2`` geometrically spaced frequencies
    and concatenates the sine and cosine of the products on the last axis.

    Args:
        x (torch.Tensor): Input tensor of diffusion steps. For a 1-D input
            of length N the result has shape ``(N, 2 * (self.dim // 2))``.

    Returns:
        torch.Tensor: Diffusion step embeddings.
    """
    half_dim = self.dim // 2

    # With half_dim == 1 the original denominator (half_dim - 1) is zero and
    # raises ZeroDivisionError. Clamping it to 1 leaves every half_dim >= 2
    # untouched and yields the single frequency 1.0 otherwise.
    log_step = math.log(10000) / max(half_dim - 1, 1)
    frequencies = torch.exp(torch.arange(half_dim, device=x.device) * -log_step)

    # Outer product: one row per step value, one column per frequency.
    angles = x[:, None] * frequencies[None, :]

    return torch.cat((angles.sin(), angles.cos()), dim=-1)

LinearNorm

Bases: Module

LinearNorm Projection.

This module performs a linear projection with optional bias.

Parameters:

Name Type Description Default
in_features int

Number of input features.

required
out_features int

Number of output features.

required
bias bool

If True, adds a learnable bias to the output. Default is False.

False

Attributes:

Name Type Description
linear Linear

Linear transformation module.

Source code in models/enhancer/gaussian_diffusion/layers.py
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
class LinearNorm(Module):
    r"""Linear projection with Xavier-uniform weights.

    Wraps a single ``nn.Linear``; the weight is re-initialized with
    ``nn.init.xavier_uniform_`` and, when a bias is requested, the bias is
    filled with zeros.

    Args:
        in_features (int): Number of input features.
        out_features (int): Number of output features.
        bias (bool, optional): If True, adds a learnable bias to the output. Default is False.

    Attributes:
        linear (torch.nn.Linear): Linear transformation module.

    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = False,
    ):
        super().__init__()
        projection = nn.Linear(in_features, out_features, bias)

        nn.init.xavier_uniform_(projection.weight)
        if bias:
            # Start from a zero offset when the layer has a bias term.
            nn.init.constant_(projection.bias, 0.0)

        self.linear = projection

    def forward(self, x: Tensor) -> Tensor:
        r"""Project ``x`` with the wrapped linear layer.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Output tensor after linear projection.
        """
        return self.linear(x)

forward(x)

Forward pass through the LinearNorm module.

Parameters:

Name Type Description Default
x Tensor

Input tensor.

required

Returns:

Type Description
Tensor

torch.Tensor: Output tensor after linear projection.

Source code in models/enhancer/gaussian_diffusion/layers.py
148
149
150
151
152
153
154
155
156
157
158
def forward(self, x: Tensor) -> Tensor:
    r"""Project ``x`` with the wrapped linear layer.

    Args:
        x (torch.Tensor): Input tensor.

    Returns:
        torch.Tensor: Result of ``self.linear(x)``.
    """
    return self.linear(x)

Mish

Bases: Module

Applies the Mish activation function.

Mish is a smooth, non-monotonic function that attempts to mitigate the problems of dying ReLU units in deep neural networks.

Source code in models/enhancer/gaussian_diffusion/layers.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
class Mish(Module):
    r"""Mish activation: ``x * tanh(softplus(x))``.

    Mish is a smooth, non-monotonic function that attempts to mitigate the
    problems of dying ReLU units in deep neural networks.
    """

    def forward(self, x: Tensor) -> Tensor:
        r"""Apply the Mish activation element-wise.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Output tensor after applying Mish activation.
        """
        # tanh(softplus(x)) acts as a smooth multiplicative gate on x.
        gate = torch.tanh(F.softplus(x))
        return x * gate

forward(x)

Forward pass of the Mish activation function.

Parameters:

Name Type Description Default
x Tensor

Input tensor.

required

Returns:

Type Description
Tensor

torch.Tensor: Output tensor after applying Mish activation.

Source code in models/enhancer/gaussian_diffusion/layers.py
17
18
19
20
21
22
23
24
25
26
def forward(self, x: Tensor) -> Tensor:
    r"""Apply the Mish activation, ``x * tanh(softplus(x))``, element-wise.

    Args:
        x (torch.Tensor): Input tensor.

    Returns:
        torch.Tensor: Output tensor after applying Mish activation.
    """
    # tanh(softplus(x)) acts as a smooth multiplicative gate on x.
    gate = torch.tanh(F.softplus(x))
    return x * gate

ResidualBlock

Bases: Module

Residual Block.

This module defines a residual block used in a neural network architecture. It consists of several convolutional and linear projections followed by nonlinear activations.

Parameters:

Name Type Description Default
d_encoder int

Dimension of the encoder output.

required
residual_channels int

Number of channels in the residual block.

required
dropout float

Dropout probability.

required
d_spk_prj int

Dimension of the speaker projection.

required
multi_speaker bool

Flag indicating if the model is trained with multiple speakers. Defaults to True.

True

Attributes:

Name Type Description
multi_speaker bool

Flag indicating if the model is trained with multiple speakers.

conv_layer ConvNorm

Convolutional layer in the residual block.

diffusion_projection LinearNorm

Linear projection for the diffusion step.

speaker_projection LinearNorm

Linear projection for the speaker embedding.

conditioner_projection ConvNorm

Convolutional projection for the conditioner.

output_projection ConvNorm

Convolutional projection for the output.

Source code in models/enhancer/gaussian_diffusion/layers.py
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
class ResidualBlock(Module):
    r"""Residual Block.

    Gated residual unit: the input is shifted by a projected diffusion step,
    combined with a projected conditioner (and, optionally, a projected
    speaker embedding), passed through a kernel-3 convolution, gated with
    ``sigmoid * tanh`` and finally split into a residual output and a skip
    output by a kernel-1 convolution.

    Args:
        d_encoder (int): Dimension of the encoder output.
        residual_channels (int): Number of channels in the residual block.
        dropout (float): Dropout probability. Accepted but not used by this block.
        d_spk_prj (int): Dimension of the speaker projection.
        multi_speaker (bool, optional): Flag indicating if the model is trained with multiple speakers. Defaults to True.

    Attributes:
        multi_speaker (bool): Flag indicating if the model is trained with multiple speakers.
        conv_layer (ConvNorm): Convolutional layer in the residual block.
        diffusion_projection (LinearNorm): Linear projection for the diffusion step.
        speaker_projection (LinearNorm): Linear projection for the speaker embedding.
            Only created when ``multi_speaker`` is True.
        conditioner_projection (ConvNorm): Convolutional projection for the conditioner.
        output_projection (ConvNorm): Convolutional projection for the output.
    """

    def __init__(
        self,
        d_encoder: int,
        residual_channels: int,
        dropout: float,
        d_spk_prj: int,
        multi_speaker: bool = True,
    ):
        super().__init__()
        # Gate and filter halves, and later residual and skip halves, are each
        # produced by one convolution with twice the channel count.
        doubled_channels = 2 * residual_channels

        self.multi_speaker = multi_speaker
        self.conv_layer = ConvNorm(
            residual_channels,
            doubled_channels,
            kernel_size=3,
            stride=1,
            padding=1,
            dilation=1,
        )
        self.diffusion_projection = LinearNorm(residual_channels, residual_channels)
        if multi_speaker:
            self.speaker_projection = LinearNorm(d_spk_prj, residual_channels)
        self.conditioner_projection = ConvNorm(
            d_encoder,
            residual_channels,
            kernel_size=1,
        )
        self.output_projection = ConvNorm(
            residual_channels,
            doubled_channels,
            kernel_size=1,
        )

    def forward(
        self,
        x: Tensor,
        conditioner: Tensor,
        diffusion_step: Tensor,
        speaker_emb: Tensor,
        mask: Optional[Tensor] = None,
    ):
        r"""Forward pass through the ResidualBlock module.

        Args:
            x (torch.Tensor): Input tensor.
            conditioner (torch.Tensor): Conditioner tensor.
            diffusion_step (torch.Tensor): Diffusion step tensor.
            speaker_emb (torch.Tensor): Speaker embedding tensor. Only read
                when ``multi_speaker`` is True.
            mask (torch.Tensor, optional): Mask tensor. Not used by this
                method. Defaults to None.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: Tuple containing the output tensor and skip tensor.
        """
        # Trailing singleton axis lets the step broadcast over the last axis of x.
        step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
        cond = self.conditioner_projection(conditioner)

        residual = x + step
        hidden = residual + cond
        if self.multi_speaker:
            # Repeat the projected embedding along axis 1 to the conditioner's
            # last-axis length, then swap axes 1 and 2 to line up with `cond`.
            spk = self.speaker_projection(speaker_emb)
            spk = spk.expand(-1, cond.shape[-1], -1).transpose(1, 2)
            hidden = hidden + spk

        gate, filt = torch.chunk(self.conv_layer(hidden), 2, dim=1)
        gated = torch.sigmoid(gate) * torch.tanh(filt)

        out, skip = torch.chunk(self.output_projection(gated), 2, dim=1)

        # Divide by sqrt(2) after summing the two branches.
        return (out + residual) / math.sqrt(2.0), skip

forward(x, conditioner, diffusion_step, speaker_emb, mask=None)

Forward pass through the ResidualBlock module.

Parameters:

Name Type Description Default
x Tensor

Input tensor.

required
conditioner Tensor

Conditioner tensor.

required
diffusion_step Tensor

Diffusion step tensor.

required
speaker_emb Tensor

Speaker embedding tensor.

required
mask Tensor

Mask tensor. Currently not used by the forward pass. Defaults to None.

None

Returns:

Type Description

Tuple[Tensor, Tensor]

Tuple containing the output tensor and skip tensor.

Source code in models/enhancer/gaussian_diffusion/layers.py
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
def forward(
    self,
    x: Tensor,
    conditioner: Tensor,
    diffusion_step: Tensor,
    speaker_emb: Tensor,
    mask: Optional[Tensor] = None,
):
    r"""Forward pass through the ResidualBlock module.

    Args:
        x (torch.Tensor): Input tensor.
        conditioner (torch.Tensor): Conditioner tensor.
        diffusion_step (torch.Tensor): Diffusion step tensor.
        speaker_emb (torch.Tensor): Speaker embedding tensor. Only read when
            ``self.multi_speaker`` is True.
        mask (torch.Tensor, optional): Mask tensor. Not used by this method.
            Defaults to None.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: Tuple containing the output tensor and skip tensor.
    """
    # Trailing singleton axis lets the step broadcast over the last axis of x.
    step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
    cond = self.conditioner_projection(conditioner)

    residual = x + step
    hidden = residual + cond
    if self.multi_speaker:
        # Repeat the projected embedding along axis 1 to the conditioner's
        # last-axis length, then swap axes 1 and 2 to line up with `cond`.
        spk = self.speaker_projection(speaker_emb)
        spk = spk.expand(-1, cond.shape[-1], -1).transpose(1, 2)
        hidden = hidden + spk

    gate, filt = torch.chunk(self.conv_layer(hidden), 2, dim=1)
    gated = torch.sigmoid(gate) * torch.tanh(filt)

    out, skip = torch.chunk(self.output_projection(gated), 2, dim=1)

    # Divide by sqrt(2) after summing the two branches.
    return (out + residual) / math.sqrt(2.0), skip