⚗️ | More architectural changes

This commit is contained in:
2025-11-18 21:34:59 +02:00
parent 3f23242d6f
commit 782a3bab28
8 changed files with 245 additions and 254 deletions

View File

@@ -7,18 +7,13 @@ import torchaudio.transforms as T
class MultiResolutionSTFTLoss(nn.Module):
"""
Multi-resolution STFT loss.
Combines spectral convergence loss and log-magnitude loss
across multiple STFT resolutions.
"""
def __init__(
self,
fft_sizes: List[int] = [1024, 2048, 512],
hop_sizes: List[int] = [120, 240, 50],
win_lengths: List[int] = [600, 1200, 240],
fft_sizes: List[int] = [512, 1024, 2048, 4096, 8192],
hop_sizes: List[int] = [64, 128, 256, 512, 1024],
win_lengths: List[int] = [256, 512, 1024, 2048, 4096],
eps: float = 1e-7,
center: bool = True
):
super().__init__()
@@ -26,15 +21,14 @@ class MultiResolutionSTFTLoss(nn.Module):
self.n_resolutions = len(fft_sizes)
self.stft_transforms = nn.ModuleList()
for n_fft, hop_len, win_len in zip(fft_sizes, hop_sizes, win_lengths):
window = torch.hann_window(win_len)
for i, (n_fft, hop_len, win_len) in enumerate(zip(fft_sizes, hop_sizes, win_lengths)):
stft = T.Spectrogram(
n_fft=n_fft,
hop_length=hop_len,
win_length=win_len,
window_fn=lambda _: window,
power=None, # Keep complex output
center=True,
window_fn=torch.hann_window,
power=None,
center=center,
pad_mode="reflect",
normalized=False,
)
@@ -43,12 +37,6 @@ class MultiResolutionSTFTLoss(nn.Module):
def forward(
self, y_true: torch.Tensor, y_pred: torch.Tensor
) -> Dict[str, torch.Tensor]:
"""
Args:
y_true: (B, T) or (B, 1, T) waveform
y_pred: (B, T) or (B, 1, T) waveform
"""
# Ensure correct shape (B, T)
if y_true.dim() == 3 and y_true.size(1) == 1:
y_true = y_true.squeeze(1)
if y_pred.dim() == 3 and y_pred.size(1) == 1:
@@ -58,28 +46,21 @@ class MultiResolutionSTFTLoss(nn.Module):
mag_loss = 0.0
for stft in self.stft_transforms:
stft = stft.to(y_pred.device)
# Complex STFTs: (B, F, T, 2)
stft.window = stft.window.to(y_true.device)
stft_true = stft(y_true)
stft_pred = stft(y_pred)
# Magnitudes
stft_mag_true = torch.abs(stft_true)
stft_mag_pred = torch.abs(stft_pred)
# --- Spectral Convergence Loss ---
norm_true = torch.linalg.norm(stft_mag_true, dim=(-2, -1))
norm_diff = torch.linalg.norm(stft_mag_true - stft_mag_pred, dim=(-2, -1))
sc_loss += torch.mean(norm_diff / (norm_true + self.eps))
# --- Log STFT Magnitude Loss ---
mag_loss += F.l1_loss(
torch.log(stft_mag_pred + self.eps),
torch.log(stft_mag_true + self.eps),
)
log_mag_pred = torch.log(stft_mag_pred + self.eps)
log_mag_true = torch.log(stft_mag_true + self.eps)
mag_loss += F.l1_loss(log_mag_pred, log_mag_true)
# Average across resolutions
sc_loss /= self.n_resolutions
mag_loss /= self.n_resolutions
total_loss = sc_loss + mag_loss