Aligner

Bases: Module

DEPRECATED: Aligner class represents a PyTorch module responsible for alignment tasks in a sequence-to-sequence model. It uses convolutional layers combined with LeakyReLU activation functions to project encoder and decoder inputs to a hidden representation, and applies both softmax and log-softmax along dimension 3 of the attention tensor.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| d_enc_in | int | Number of channels in the input for the encoder. | required |
| d_dec_in | int | Number of channels in the input for the decoder. | required |
| d_hidden | int | Number of channels in the output (hidden) layers. | required |
| kernel_size_enc | int | Size of the convolving kernel for the encoder. | 3 |
| kernel_size_dec | int | Size of the convolving kernel for the decoder. | 7 |
| temperature | float | Temperature value applied in the Gaussian isotropic attention mechanism. | 0.0005 |
| leaky_relu_slope | float | Controls the angle of the negative slope of the LeakyReLU activation. | LEAKY_RELU_SLOPE |
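Below is a minimal usage sketch, assuming the class is importable from the module path shown in the source listing that follows and using illustrative channel sizes and lengths. It builds dummy encoder and decoder features, a padding mask over the text axis, and a uniform placeholder prior, then runs the forward pass.

import torch

# Hypothetical import path, inferred from the source location shown below.
from models.tts.delightful_tts.acoustic_model.aligner import Aligner

aligner = Aligner(d_enc_in=256, d_dec_in=80, d_hidden=256)

B, T_text, T_mel = 2, 13, 37
enc_in = torch.randn(B, 256, T_text)   # text encoder outputs: (B, C_1, T_1)
dec_in = torch.randn(B, 80, T_mel)     # mel features to align: (B, C_2, T_2)
enc_len = torch.tensor([13, 11])       # text lengths per batch item
dec_len = torch.tensor([37, 30])       # mel lengths per batch item

# True where a text position is padding, so it is excluded from attention.
enc_mask = torch.arange(T_text)[None, :] >= enc_len[:, None]

# Uniform placeholder prior of shape (B, T_text, T_mel); forward permutes it internally.
attn_prior = torch.full((B, T_text, T_mel), 1.0 / T_mel)

attn_logprob, attn_soft, attn_hard, attn_hard_dur = aligner(
    enc_in, dec_in, enc_len, dec_len, enc_mask, attn_prior,
)
print(attn_soft.shape)      # torch.Size([2, 1, 37, 13])
print(attn_hard_dur.shape)  # torch.Size([2, 13])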
Source code in models/tts/delightful_tts/acoustic_model/aligner.py
class Aligner(Module):
    r"""DEPRECATED: Aligner class represents a PyTorch module responsible for alignment tasks
    in a sequence-to-sequence model. It uses convolutional layers combined with
    LeakyReLU activation functions to project inputs to a hidden representation.
    The class utilizes both softmax and log-softmax to calculate softmax
    along dimension 3.

    Args:
        d_enc_in (int): Number of channels in the input for the encoder.
        d_dec_in (int): Number of channels in the input for the decoder.
        d_hidden (int): Number of channels in the output (hidden layers).
        kernel_size_enc (int, optional): Size of the convolving kernel for encoder, default is 3.
        kernel_size_dec (int, optional): Size of the convolving kernel for decoder, default is 7.
        temperature (float, optional): The temperature value applied in Gaussian isotropic
            attention mechanism, default is 0.0005.
        leaky_relu_slope (float, optional): Controls the angle of the negative slope of
            LeakyReLU activation, default is LEAKY_RELU_SLOPE.

    """

    def __init__(
        self,
        d_enc_in: int,
        d_dec_in: int,
        d_hidden: int,
        kernel_size_enc: int = 3,
        kernel_size_dec: int = 7,
        temperature: float = 0.0005,
        leaky_relu_slope: float = LEAKY_RELU_SLOPE,
    ):
        super().__init__()
        self.temperature = temperature

        self.softmax = torch.nn.Softmax(dim=3)
        self.log_softmax = torch.nn.LogSoftmax(dim=3)

        self.key_proj = nn.Sequential(
            nn.Conv1d(
                d_enc_in,
                d_hidden,
                kernel_size=kernel_size_enc,
                padding=kernel_size_enc // 2,
            ),
            nn.LeakyReLU(leaky_relu_slope),
            nn.Conv1d(
                d_hidden,
                d_hidden,
                kernel_size=kernel_size_enc,
                padding=kernel_size_enc // 2,
            ),
            nn.LeakyReLU(leaky_relu_slope),
        )

        self.query_proj = nn.Sequential(
            nn.Conv1d(
                d_dec_in,
                d_hidden,
                kernel_size=kernel_size_dec,
                padding=kernel_size_dec // 2,
            ),
            nn.LeakyReLU(leaky_relu_slope),
            nn.Conv1d(
                d_hidden,
                d_hidden,
                kernel_size=kernel_size_dec,
                padding=kernel_size_dec // 2,
            ),
            nn.LeakyReLU(leaky_relu_slope),
            nn.Conv1d(
                d_hidden,
                d_hidden,
                kernel_size=kernel_size_dec,
                padding=kernel_size_dec // 2,
            ),
            nn.LeakyReLU(leaky_relu_slope),
        )

    def binarize_attention_parallel(
        self,
        attn: torch.Tensor,
        in_lens: torch.Tensor,
        out_lens: torch.Tensor,
    ) -> torch.Tensor:
        r"""For training purposes only! Binarizes attention with MAS.
        Binarizes the attention tensor using Maximum Attention Strategy (MAS).

        This process is applied for training purposes only and the resulting
        binarized attention tensor will no longer receive a gradient in the
        backpropagation process.

        Args:
            attn (Tensor): The attention tensor. Must be of shape (B, 1, max_mel_len, max_text_len),
                where B represents the batch size, max_mel_len represents the maximum length
                of the mel spectrogram, and max_text_len represents the maximum length of the text.
            in_lens (Tensor): A 1D tensor of shape (B,) that contains the input sequence lengths,
                i.e. the text sequence lengths.
            out_lens (Tensor): A 1D tensor of shape (B,) that contains the output sequence lengths,
                i.e. the mel-spectrogram lengths.

        Returns:
            Tensor: The binarized attention tensor. The output tensor has the same shape as the input `attn` tensor.
        """
        with torch.no_grad():
            attn_cpu = attn.data.cpu().numpy()
            attn_out = b_mas(
                attn_cpu,
                in_lens.cpu().numpy(),
                out_lens.cpu().numpy(),
                width=1,
            )
        return torch.from_numpy(attn_out)

    def forward(
        self,
        enc_in: torch.Tensor,
        dec_in: torch.Tensor,
        enc_len: torch.Tensor,
        dec_len: torch.Tensor,
        enc_mask: torch.Tensor,
        attn_prior: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        r"""Performs the forward pass through the Aligner module.

        Args:
            enc_in (Tensor): The text encoder outputs.
                Must be of shape (B, C_1, T_1), where B is the batch size, C_1 the number of
                channels in encoder inputs,
                and T_1 the sequence length of encoder inputs.
            dec_in (Tensor): The data to align with encoder outputs.
                Must be of shape (B, C_2, T_2), where C_2 is the number of channels in decoder inputs,
                and T_2 the sequence length of decoder inputs.
            enc_len (Tensor): 1D tensor representing the lengths of each sequence in the batch in `enc_in`.
            dec_len (Tensor): 1D tensor representing the lengths of each sequence in the batch in `dec_in`.
            enc_mask (Tensor): Binary mask tensor used to avoid attention to certain timesteps.
            attn_prior (Tensor): Previous attention values for attention calculation.

        Returns:
            Tuple[Tensor, Tensor, Tensor, Tensor]: Returns a tuple of Tensors representing the log-probability, soft attention, hard attention, and hard attention duration.
        """
        queries = dec_in.float()
        keys = enc_in.float()
        keys_enc = self.key_proj(keys)  # B x n_attn_dims x T2
        queries_enc = self.query_proj(queries)

        # Simplistic Gaussian Isotropic Attention
        attn = (
            queries_enc[:, :, :, None] - keys_enc[:, :, None]
        ) ** 2  # B x n_attn_dims x T1 x T2
        attn = -self.temperature * attn.sum(1, keepdim=True)

        if attn_prior is not None:
            # print(f"AlignmentEncoder \t| mel: {queries.shape} phone: {keys.shape}
            # mask: {mask.shape} attn: {attn.shape} attn_prior: {attn_prior.shape}")
            attn = self.log_softmax(attn) + torch.log(
                attn_prior.permute((0, 2, 1))[:, None] + 1e-8,
            )
            # print(f"AlignmentEncoder \t| After prior sum attn: {attn.shape}")"""

        attn_logprob = attn.clone()

        if enc_mask is not None:
            attn = attn.masked_fill(enc_mask.unsqueeze(1).unsqueeze(1), -float("inf"))

        attn_soft = self.softmax(attn)  # softmax along T2
        attn_hard = self.binarize_attention_parallel(attn_soft, enc_len, dec_len)
        attn_hard_dur = attn_hard.sum(2)[:, 0, :]
        return attn_logprob, attn_soft, attn_hard, attn_hard_dur

binarize_attention_parallel(attn, in_lens, out_lens)

For training purposes only! Binarizes the attention tensor using Monotonic Alignment Search (MAS).

This process is applied for training purposes only and the resulting binarized attention tensor will no longer receive a gradient in the backpropagation process.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| attn | Tensor | The attention tensor, of shape (B, 1, max_mel_len, max_text_len), where B is the batch size, max_mel_len the maximum mel-spectrogram length, and max_text_len the maximum text length. | required |
| in_lens | Tensor | A 1D tensor of shape (B,) containing the input sequence lengths, i.e. the text sequence lengths. | required |
| out_lens | Tensor | A 1D tensor of shape (B,) containing the output sequence lengths, i.e. the mel-spectrogram lengths. | required |

Returns:

| Type | Description |
| --- | --- |
| Tensor | The binarized attention tensor, with the same shape as the input attn tensor. |
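As an illustrative check of what the binarization produces (a sketch with dummy tensors; aligner is an Aligner instance as in the usage sketch above), the hard attention assigns each mel frame within the valid region to exactly one text token, so summing over the mel axis yields per-token durations that add up to the mel length.

import torch

B, T_mel, T_text = 2, 37, 13
attn_soft = torch.softmax(torch.randn(B, 1, T_mel, T_text), dim=3)
in_lens = torch.tensor([13, 11])   # text lengths
out_lens = torch.tensor([37, 30])  # mel lengths

attn_hard = aligner.binarize_attention_parallel(attn_soft, in_lens, out_lens)

# Per-token durations: sum the hard attention over the mel axis.
durations = attn_hard.sum(dim=2)[:, 0, :]  # (B, T_text)
print(durations[0])        # integer frame counts per text token
print(durations[0].sum())  # expected to equal out_lens[0] (37.)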

Source code in models/tts/delightful_tts/acoustic_model/aligner.py
def binarize_attention_parallel(
    self,
    attn: torch.Tensor,
    in_lens: torch.Tensor,
    out_lens: torch.Tensor,
) -> torch.Tensor:
    r"""For training purposes only! Binarizes attention with MAS.
    Binarizes the attention tensor using Maximum Attention Strategy (MAS).

    This process is applied for training purposes only and the resulting
    binarized attention tensor will no longer receive a gradient in the
    backpropagation process.

    Args:
        attn (Tensor): The attention tensor. Must be of shape (B, 1, max_mel_len, max_text_len),
            where B represents the batch size, max_mel_len represents the maximum length
            of the mel spectrogram, and max_text_len represents the maximum length of the text.
        in_lens (Tensor): A 1D tensor of shape (B,) that contains the input sequence lengths,
            i.e. the text sequence lengths.
        out_lens (Tensor): A 1D tensor of shape (B,) that contains the output sequence lengths,
            i.e. the mel-spectrogram lengths.

    Returns:
        Tensor: The binarized attention tensor. The output tensor has the same shape as the input `attn` tensor.
    """
    with torch.no_grad():
        attn_cpu = attn.data.cpu().numpy()
        attn_out = b_mas(
            attn_cpu,
            in_lens.cpu().numpy(),
            out_lens.cpu().numpy(),
            width=1,
        )
    return torch.from_numpy(attn_out)

forward(enc_in, dec_in, enc_len, dec_len, enc_mask, attn_prior)

Performs the forward pass through the Aligner module.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| enc_in | Tensor | The text encoder outputs, of shape (B, C_1, T_1), where B is the batch size, C_1 the number of channels in the encoder inputs, and T_1 the encoder sequence length. | required |
| dec_in | Tensor | The data to align with the encoder outputs, of shape (B, C_2, T_2), where C_2 is the number of channels in the decoder inputs and T_2 the decoder sequence length. | required |
| enc_len | Tensor | 1D tensor with the length of each sequence in the batch in enc_in. | required |
| dec_len | Tensor | 1D tensor with the length of each sequence in the batch in dec_in. | required |
| enc_mask | Tensor | Binary mask tensor used to avoid attention to certain timesteps. | required |
| attn_prior | Tensor | Previous attention values for attention calculation. | required |

Returns:

| Type | Description |
| --- | --- |
| Tuple[Tensor, Tensor, Tensor, Tensor] | A tuple of Tensors representing the log-probability, soft attention, hard attention, and hard attention duration. |
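The core of the forward pass is a distance-based (Gaussian isotropic) attention energy: the squared difference between every projected decoder frame and every projected encoder token, summed over channels and scaled by the negative temperature. A minimal sketch of that step with dummy projected features (the shapes mirror the source; the random inputs are illustrative only):

import torch

temperature = 0.0005
B, d_hidden, T_mel, T_text = 2, 256, 37, 13

queries_enc = torch.randn(B, d_hidden, T_mel)  # projected decoder (mel) features
keys_enc = torch.randn(B, d_hidden, T_text)    # projected encoder (text) features

# Pairwise squared differences: (B, d_hidden, T_mel, T_text)
dist = (queries_enc[:, :, :, None] - keys_enc[:, :, None]) ** 2

# Energy: closer query/key pairs get higher (less negative) scores: (B, 1, T_mel, T_text)
energy = -temperature * dist.sum(dim=1, keepdim=True)

# Softmax over the text axis turns each mel frame's scores into an alignment distribution.
attn_soft = torch.softmax(energy, dim=3)
print(attn_soft.shape)  # torch.Size([2, 1, 37, 13])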

Source code in models/tts/delightful_tts/acoustic_model/aligner.py
def forward(
    self,
    enc_in: torch.Tensor,
    dec_in: torch.Tensor,
    enc_len: torch.Tensor,
    dec_len: torch.Tensor,
    enc_mask: torch.Tensor,
    attn_prior: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    r"""Performs the forward pass through the Aligner module.

    Args:
        enc_in (Tensor): The text encoder outputs.
            Must be of shape (B, C_1, T_1), where B is the batch size, C_1 the number of
            channels in encoder inputs,
            and T_1 the sequence length of encoder inputs.
        dec_in (Tensor): The data to align with encoder outputs.
            Must be of shape (B, C_2, T_2), where C_2 is the number of channels in decoder inputs,
            and T_2 the sequence length of decoder inputs.
        enc_len (Tensor): 1D tensor representing the lengths of each sequence in the batch in `enc_in`.
        dec_len (Tensor): 1D tensor representing the lengths of each sequence in the batch in `dec_in`.
        enc_mask (Tensor): Binary mask tensor used to avoid attention to certain timesteps.
        attn_prior (Tensor): Previous attention values for attention calculation.

    Returns:
        Tuple[Tensor, Tensor, Tensor, Tensor]: Returns a tuple of Tensors representing the log-probability, soft attention, hard attention, and hard attention duration.
    """
    queries = dec_in.float()
    keys = enc_in.float()
    keys_enc = self.key_proj(keys)  # B x n_attn_dims x T2
    queries_enc = self.query_proj(queries)

    # Simplistic Gaussian Isotropic Attention
    attn = (
        queries_enc[:, :, :, None] - keys_enc[:, :, None]
    ) ** 2  # B x n_attn_dims x T1 x T2
    attn = -self.temperature * attn.sum(1, keepdim=True)

    if attn_prior is not None:
        # print(f"AlignmentEncoder \t| mel: {queries.shape} phone: {keys.shape}
        # mask: {mask.shape} attn: {attn.shape} attn_prior: {attn_prior.shape}")
        attn = self.log_softmax(attn) + torch.log(
            attn_prior.permute((0, 2, 1))[:, None] + 1e-8,
        )
        # print(f"AlignmentEncoder \t| After prior sum attn: {attn.shape}")"""

    attn_logprob = attn.clone()

    if enc_mask is not None:
        attn = attn.masked_fill(enc_mask.unsqueeze(1).unsqueeze(1), -float("inf"))

    attn_soft = self.softmax(attn)  # softmax along T2
    attn_hard = self.binarize_attention_parallel(attn_soft, enc_len, dec_len)
    attn_hard_dur = attn_hard.sum(2)[:, 0, :]
    return attn_logprob, attn_soft, attn_hard, attn_hard_dur