88 lines
2.7 KiB
Python
88 lines
2.7 KiB
Python
from typing import Dict, List, Sequence

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.transforms as T
|
|
|
|
|
|
class MultiResolutionSTFTLoss(nn.Module):
    """Multi-resolution STFT loss.

    Combines spectral convergence loss and log-magnitude loss
    across multiple STFT resolutions. Each resolution is defined by one
    entry of ``fft_sizes``, ``hop_sizes`` and ``win_lengths`` taken at the
    same index; the per-resolution losses are averaged.
    """

    def __init__(
        self,
        fft_sizes: Sequence[int] = (1024, 2048, 512),
        hop_sizes: Sequence[int] = (120, 240, 50),
        win_lengths: Sequence[int] = (600, 1200, 240),
        eps: float = 1e-7,
    ):
        """Build one complex spectrogram transform per resolution.

        Args:
            fft_sizes: FFT size of each resolution.
            hop_sizes: Hop length of each resolution.
            win_lengths: Hann window length of each resolution.
            eps: Small constant added to denominators and log arguments.

        Raises:
            ValueError: If the three sequences differ in length or are empty.
        """
        super().__init__()

        # Materialize so that any iterable (list, tuple, generator) works and
        # the caller's objects are never shared with this module.
        fft_sizes = list(fft_sizes)
        hop_sizes = list(hop_sizes)
        win_lengths = list(win_lengths)

        # zip() would silently drop the surplus entries of the longer
        # sequences, and the averaging in forward() would then divide by the
        # wrong count. Fail loudly instead.
        if not (len(fft_sizes) == len(hop_sizes) == len(win_lengths)):
            raise ValueError(
                "fft_sizes, hop_sizes and win_lengths must have the same "
                f"length, got {len(fft_sizes)}, {len(hop_sizes)} and "
                f"{len(win_lengths)}"
            )
        if not fft_sizes:
            raise ValueError("at least one STFT resolution is required")

        self.eps = eps
        self.n_resolutions = len(fft_sizes)

        self.stft_transforms = nn.ModuleList(
            [
                T.Spectrogram(
                    n_fft=n_fft,
                    hop_length=hop_len,
                    win_length=win_len,
                    # Spectrogram calls window_fn(win_length) itself, which
                    # yields the same Hann window as building it by hand.
                    window_fn=torch.hann_window,
                    power=None,  # Keep complex output
                    center=True,
                    pad_mode="reflect",
                    normalized=False,
                )
                for n_fft, hop_len, win_len in zip(
                    fft_sizes, hop_sizes, win_lengths
                )
            ]
        )

    @staticmethod
    def _squeeze_channel(wave: torch.Tensor) -> torch.Tensor:
        """Drop a singleton channel axis: (B, 1, T) -> (B, T)."""
        if wave.dim() == 3 and wave.size(1) == 1:
            return wave.squeeze(1)
        return wave

    def forward(
        self, y_true: torch.Tensor, y_pred: torch.Tensor
    ) -> Dict[str, torch.Tensor]:
        """Compute the loss between a reference and a predicted waveform.

        Args:
            y_true: (B, T) or (B, 1, T) waveform
            y_pred: (B, T) or (B, 1, T) waveform

        Returns:
            Dict with scalar tensors under the keys ``"sc"`` (spectral
            convergence, averaged over resolutions), ``"mag"`` (log-magnitude
            L1, averaged over resolutions) and ``"total"`` (their sum).
        """
        # Ensure correct shape (B, T)
        y_true = self._squeeze_channel(y_true)
        y_pred = self._squeeze_channel(y_pred)

        sc_terms = []
        mag_terms = []

        for stft in self.stft_transforms:
            # Keeps the transform's window on the same device as the input
            # even when the loss module itself was never moved.
            stft = stft.to(y_pred.device)

            # power=None yields complex-valued STFTs (not a trailing
            # real/imag axis), so abs() gives the magnitude spectrogram.
            stft_mag_true = torch.abs(stft(y_true))
            stft_mag_pred = torch.abs(stft(y_pred))

            # --- Spectral Convergence Loss ---
            # Frobenius norm over the last two (frequency, frame) axes,
            # giving one ratio per batch item before the mean.
            norm_true = torch.linalg.norm(stft_mag_true, dim=(-2, -1))
            norm_diff = torch.linalg.norm(
                stft_mag_true - stft_mag_pred, dim=(-2, -1)
            )
            sc_terms.append(torch.mean(norm_diff / (norm_true + self.eps)))

            # --- Log STFT Magnitude Loss ---
            mag_terms.append(
                F.l1_loss(
                    torch.log(stft_mag_pred + self.eps),
                    torch.log(stft_mag_true + self.eps),
                )
            )

        # Average across resolutions
        sc_loss = sum(sc_terms) / self.n_resolutions
        mag_loss = sum(mag_terms) / self.n_resolutions
        total_loss = sc_loss + mag_loss

        return {"total": total_loss, "sc": sc_loss, "mag": mag_loss}
|