from typing import Dict, List, Sequence

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.transforms as T


class MultiResolutionSTFTLoss(nn.Module):
    """Multi-resolution STFT loss.

    For every configured STFT resolution, two terms are computed on the
    STFT magnitudes of the reference and predicted signals:

    * spectral convergence:
      ``|| |S_true| - |S_pred| ||_F / (|| |S_true| ||_F + eps)``
    * log STFT magnitude: L1 distance between ``log(|S| + eps)`` of both
      signals.

    Each term is summed (not averaged) over the resolutions.
    """

    def __init__(
        self,
        fft_sizes: Sequence[int] = (1024, 2048, 512),
        hop_sizes: Sequence[int] = (120, 240, 50),
        win_lengths: Sequence[int] = (600, 1200, 240),
        eps: float = 1e-7,
    ):
        """Build one complex spectrogram transform per resolution.

        Args:
            fft_sizes: FFT size of each resolution.
            hop_sizes: Hop length of each resolution.
            win_lengths: Window length of each resolution.
            eps: Small constant added to the spectral-convergence
                denominator and inside the logarithms.

        Raises:
            ValueError: If the three sequences differ in length or are
                empty.
        """
        super().__init__()

        # Immutable tuple defaults replace the former list defaults, which
        # were shared between every instantiation.
        fft_sizes = tuple(fft_sizes)
        hop_sizes = tuple(hop_sizes)
        win_lengths = tuple(win_lengths)

        # zip() stops at the shortest input, so a length mismatch would
        # silently drop resolutions; fail loudly instead.
        if not len(fft_sizes) == len(hop_sizes) == len(win_lengths):
            raise ValueError(
                "fft_sizes, hop_sizes and win_lengths must have the same "
                f"length, got {len(fft_sizes)}, {len(hop_sizes)} and "
                f"{len(win_lengths)}"
            )
        if not fft_sizes:
            raise ValueError("at least one STFT resolution is required")

        self.stft_transforms = nn.ModuleList(
            [
                # power=None makes Spectrogram return the complex STFT.
                T.Spectrogram(
                    n_fft=n_fft,
                    win_length=win_len,
                    hop_length=hop_len,
                    power=None,
                )
                for n_fft, hop_len, win_len in zip(
                    fft_sizes, hop_sizes, win_lengths
                )
            ]
        )
        self.eps = eps

    def forward(
        self, y_true: torch.Tensor, y_pred: torch.Tensor
    ) -> Dict[str, torch.Tensor]:
        """Compute the loss between a reference and a predicted signal.

        Args:
            y_true: Reference waveform(s); presumably shaped
                ``(..., time)`` as expected by ``Spectrogram`` — confirm
                against the caller.
            y_pred: Predicted waveform(s), same layout as ``y_true``.

        Returns:
            A dict with keys ``"total"`` (``sc + mag``), ``"sc"`` (summed
            spectral convergence) and ``"mag"`` (summed log-magnitude L1).
        """
        sc_loss = 0.0  # Spectral convergence, accumulated over resolutions
        mag_loss = 0.0  # Log STFT magnitude, accumulated over resolutions

        for stft in self.stft_transforms:
            # Kept for backward compatibility: callers may never have moved
            # this module to the prediction's device themselves.
            stft.to(y_pred.device)

            # Magnitudes of the complex STFTs.
            mag_true = torch.abs(stft(y_true))
            mag_pred = torch.abs(stft(y_pred))

            # Frobenius norms over the last two (spectrogram) dimensions;
            # the ratio is then averaged over the remaining leading dims.
            norm_true = torch.linalg.norm(mag_true, dim=(-2, -1))
            norm_diff = torch.linalg.norm(mag_true - mag_pred, dim=(-2, -1))
            sc_loss = sc_loss + torch.mean(norm_diff / (norm_true + self.eps))

            # eps keeps the logarithm finite for zero-magnitude bins.
            mag_loss = mag_loss + F.l1_loss(
                torch.log(mag_pred + self.eps),
                torch.log(mag_true + self.eps),
            )

        total_loss = sc_loss + mag_loss
        return {"total": total_loss, "sc": sc_loss, "mag": mag_loss}