# SISU/utils/MultiResolutionSTFTLoss.py

from typing import Dict, List

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.transforms as T


class MultiResolutionSTFTLoss(nn.Module):
    """Multi-resolution STFT loss: spectral convergence plus log-magnitude L1.

    One ``torchaudio.transforms.Spectrogram`` (complex output, Hann window,
    reflect padding) is built per ``(n_fft, hop_length, win_length)`` triple.
    For every resolution the loss compares the magnitude spectrograms of the
    reference and the prediction with two terms:

    * spectral convergence: ``||mag_true - mag_pred|| / (||mag_true|| + eps)``
      using the norm over the last two spectrogram dimensions, averaged over
      the remaining dimensions;
    * log-magnitude: L1 distance between ``log(mag + eps)`` of both signals.

    Both terms are averaged over the resolutions.

    Args:
        fft_sizes: FFT size for each resolution.
        hop_sizes: Hop length for each resolution.
        win_lengths: Window length for each resolution.
        eps: Small constant added to the norm denominator and inside the
            logarithm for numerical stability.
        center: Forwarded to ``Spectrogram(center=...)``.

    Raises:
        ValueError: If the three size lists differ in length or are empty.
    """

    def __init__(
        self,
        fft_sizes: List[int] = [512, 1024, 2048, 4096, 8192],
        hop_sizes: List[int] = [64, 128, 256, 512, 1024],
        win_lengths: List[int] = [256, 512, 1024, 2048, 4096],
        eps: float = 1e-7,
        center: bool = True
    ):
        super().__init__()

        # zip() would silently drop trailing entries on a length mismatch while
        # the average below still divides by len(fft_sizes); reject it up front.
        if not (len(fft_sizes) == len(hop_sizes) == len(win_lengths)):
            raise ValueError(
                "fft_sizes, hop_sizes and win_lengths must have the same length, "
                f"got {len(fft_sizes)}, {len(hop_sizes)} and {len(win_lengths)}"
            )
        # With zero resolutions forward() would divide by zero.
        if len(fft_sizes) == 0:
            raise ValueError("at least one STFT resolution is required")

        self.eps = eps
        self.n_resolutions = len(fft_sizes)

        # The list defaults above are only read, never mutated.
        self.stft_transforms = nn.ModuleList(
            [
                T.Spectrogram(
                    n_fft=n_fft,
                    hop_length=hop_len,
                    win_length=win_len,
                    window_fn=torch.hann_window,
                    power=None,  # keep the complex spectrum; magnitude is taken in forward()
                    center=center,
                    pad_mode="reflect",
                    normalized=False,
                )
                for n_fft, hop_len, win_len in zip(fft_sizes, hop_sizes, win_lengths)
            ]
        )

    def forward(
        self, y_true: torch.Tensor, y_pred: torch.Tensor
    ) -> Dict[str, torch.Tensor]:
        """Compute the loss between a reference and a predicted signal.

        Args:
            y_true: Reference signal. A 3-D tensor whose dimension 1 has size 1
                gets that dimension squeezed (presumably a mono channel axis).
            y_pred: Predicted signal, treated the same way as ``y_true``.

        Returns:
            Dict with keys ``"sc"`` (spectral convergence), ``"mag"``
            (log-magnitude L1) and ``"total"`` (their sum), each averaged over
            all resolutions.
        """
        if y_true.dim() == 3 and y_true.size(1) == 1:
            y_true = y_true.squeeze(1)
        if y_pred.dim() == 3 and y_pred.size(1) == 1:
            y_pred = y_pred.squeeze(1)

        sc_loss = 0.0
        mag_loss = 0.0

        for stft in self.stft_transforms:
            # Follow the input's device so the loss also works when the module
            # itself was never moved with .to(device).
            if stft.window.device != y_true.device:
                stft.window = stft.window.to(y_true.device)

            stft_mag_true = torch.abs(stft(y_true))
            stft_mag_pred = torch.abs(stft(y_pred))

            # Spectral convergence: relative norm of the magnitude error,
            # taken over the last two spectrogram dimensions.
            norm_true = torch.linalg.norm(stft_mag_true, dim=(-2, -1))
            norm_diff = torch.linalg.norm(stft_mag_true - stft_mag_pred, dim=(-2, -1))
            sc_loss += torch.mean(norm_diff / (norm_true + self.eps))

            # Log-magnitude L1; eps keeps log() finite for zero-magnitude bins.
            log_mag_pred = torch.log(stft_mag_pred + self.eps)
            log_mag_true = torch.log(stft_mag_true + self.eps)
            mag_loss += F.l1_loss(log_mag_pred, log_mag_true)

        sc_loss /= self.n_resolutions
        mag_loss /= self.n_resolutions
        total_loss = sc_loss + mag_loss

        return {"total": total_loss, "sc": sc_loss, "mag": mag_loss}