from typing import Dict, List

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.transforms as T


class MultiResolutionSTFTLoss(nn.Module):
    """Multi-resolution STFT loss.

    Compares a predicted waveform against a reference waveform at several STFT
    resolutions. At each resolution the loss combines (1) a spectral-convergence
    term, the Frobenius norm of the magnitude error normalized by the norm of the
    reference magnitude, and (2) an L1 distance between log-magnitude
    spectrograms. Both terms are averaged over resolutions and returned
    separately alongside their sum.
    """

    def __init__(
        self,
        fft_sizes: List[int] = [512, 1024, 2048, 4096, 8192],
        hop_sizes: List[int] = [64, 128, 256, 512, 1024],
        win_lengths: List[int] = [256, 512, 1024, 2048, 4096],
        eps: float = 1e-7,
        center: bool = True,
    ):
        super().__init__()
        self.eps = eps
        self.n_resolutions = len(fft_sizes)

        # One complex Spectrogram transform per resolution. power=None keeps
        # the complex STFT, so magnitudes are taken explicitly in forward().
        self.stft_transforms = nn.ModuleList()
        for n_fft, hop_len, win_len in zip(fft_sizes, hop_sizes, win_lengths):
            self.stft_transforms.append(
                T.Spectrogram(
                    n_fft=n_fft,
                    hop_length=hop_len,
                    win_length=win_len,
                    window_fn=torch.hann_window,
                    power=None,
                    center=center,
                    pad_mode="reflect",
                    normalized=False,
                )
            )

    def forward(
        self,
        y_true: torch.Tensor,
        y_pred: torch.Tensor,
    ) -> Dict[str, torch.Tensor]:
        """Compute the loss.

        Args:
            y_true: reference waveform, shape (batch, samples) or (batch, 1, samples).
            y_pred: predicted waveform with the same shape as ``y_true``.

        Returns:
            Dict with scalar tensors under ``"total"``, ``"sc"`` (spectral
            convergence), and ``"mag"`` (log-magnitude L1).
        """
        # Accept (batch, 1, samples) mono input and drop the channel dimension.
        if y_true.dim() == 3 and y_true.size(1) == 1:
            y_true = y_true.squeeze(1)
        if y_pred.dim() == 3 and y_pred.size(1) == 1:
            y_pred = y_pred.squeeze(1)

        sc_loss = 0.0
        mag_loss = 0.0
        for stft in self.stft_transforms:
            # Keep the analysis window on the same device as the input, in case
            # the loss module itself was never moved with .to(device).
            stft.window = stft.window.to(y_true.device)

            # power=None yields complex STFTs; take magnitudes explicitly.
            stft_mag_true = torch.abs(stft(y_true))
            stft_mag_pred = torch.abs(stft(y_pred))

            # Spectral convergence: normalized Frobenius norm of the magnitude error.
            norm_true = torch.linalg.norm(stft_mag_true, dim=(-2, -1))
            norm_diff = torch.linalg.norm(stft_mag_true - stft_mag_pred, dim=(-2, -1))
            sc_loss += torch.mean(norm_diff / (norm_true + self.eps))

            # L1 distance between log-magnitude spectrograms.
            log_mag_true = torch.log(stft_mag_true + self.eps)
            log_mag_pred = torch.log(stft_mag_pred + self.eps)
            mag_loss += F.l1_loss(log_mag_pred, log_mag_true)

        # Average over resolutions so the scale does not depend on their count.
        sc_loss /= self.n_resolutions
        mag_loss /= self.n_resolutions
        total_loss = sc_loss + mag_loss
        return {"total": total_loss, "sc": sc_loss, "mag": mag_loss}
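

# ---------------------------------------------------------------------------
# Minimal usage sketch. Assumptions: batched mono waveforms; the batch size,
# sample count, and random tensors below are illustrative only and are not
# part of the loss definition above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    loss_fn = MultiResolutionSTFTLoss()
    reference = torch.randn(4, 24_000)  # (batch, samples)
    estimate = torch.randn(4, 24_000)   # e.g. the output of a vocoder/decoder
    losses = loss_fn(reference, estimate)
    print({name: float(value) for name, value in losses.items()})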