⚗️ | Added additional GAN training utilities and minor training-loop cleanups
This commit is contained in:
@@ -8,8 +8,9 @@ import torchaudio.transforms as T
|
||||
|
||||
class MultiResolutionSTFTLoss(nn.Module):
|
||||
"""
|
||||
Computes a loss based on multiple STFT resolutions, including both
|
||||
spectral convergence and log STFT magnitude components.
|
||||
Multi-resolution STFT loss.
|
||||
Combines spectral convergence loss and log-magnitude loss
|
||||
across multiple STFT resolutions.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
@@ -20,43 +21,67 @@ class MultiResolutionSTFTLoss(nn.Module):
|
||||
eps: float = 1e-7,
|
||||
):
|
||||
super().__init__()
|
||||
self.stft_transforms = nn.ModuleList(
|
||||
[
|
||||
T.Spectrogram(
|
||||
n_fft=n_fft, win_length=win_len, hop_length=hop_len, power=None
|
||||
)
|
||||
for n_fft, hop_len, win_len in zip(fft_sizes, hop_sizes, win_lengths)
|
||||
]
|
||||
)
|
||||
|
||||
self.eps = eps
|
||||
self.n_resolutions = len(fft_sizes)
|
||||
|
||||
self.stft_transforms = nn.ModuleList()
|
||||
for n_fft, hop_len, win_len in zip(fft_sizes, hop_sizes, win_lengths):
|
||||
window = torch.hann_window(win_len)
|
||||
stft = T.Spectrogram(
|
||||
n_fft=n_fft,
|
||||
hop_length=hop_len,
|
||||
win_length=win_len,
|
||||
window_fn=lambda _: window,
|
||||
power=None, # Keep complex output
|
||||
center=True,
|
||||
pad_mode="reflect",
|
||||
normalized=False,
|
||||
)
|
||||
self.stft_transforms.append(stft)
|
||||
|
||||
def forward(
    self, y_true: torch.Tensor, y_pred: torch.Tensor
) -> Dict[str, torch.Tensor]:
    """
    Compute the multi-resolution STFT loss between two waveforms.

    Args:
        y_true: (B, T) or (B, 1, T) reference waveform.
        y_pred: (B, T) or (B, 1, T) predicted waveform.

    Returns:
        Dict with:
            "total": sc + mag loss, averaged over resolutions.
            "sc": spectral-convergence component.
            "mag": log-STFT-magnitude (L1) component.
    """
    # Normalize shape to (B, T): drop a singleton channel dimension.
    if y_true.dim() == 3 and y_true.size(1) == 1:
        y_true = y_true.squeeze(1)
    if y_pred.dim() == 3 and y_pred.size(1) == 1:
        y_pred = y_pred.squeeze(1)

    sc_loss = 0.0
    mag_loss = 0.0

    for stft in self.stft_transforms:
        # Ensure the transform (and its window buffer) lives on the
        # same device as the inputs.
        stft = stft.to(y_pred.device)

        # Complex STFTs (power=None in the transforms keeps complex output).
        stft_true = stft(y_true)
        stft_pred = stft(y_pred)

        # Magnitude spectrograms.
        stft_mag_true = torch.abs(stft_true)
        stft_mag_pred = torch.abs(stft_pred)

        # --- Spectral convergence: || |S_true| - |S_pred| ||_F / || |S_true| ||_F ---
        norm_true = torch.linalg.norm(stft_mag_true, dim=(-2, -1))
        norm_diff = torch.linalg.norm(stft_mag_true - stft_mag_pred, dim=(-2, -1))
        sc_loss += torch.mean(norm_diff / (norm_true + self.eps))

        # --- Log STFT magnitude loss (eps guards log(0)) ---
        mag_loss += F.l1_loss(
            torch.log(stft_mag_pred + self.eps),
            torch.log(stft_mag_true + self.eps),
        )

    # Average across resolutions so the loss scale is independent of
    # how many STFT configurations are used.
    sc_loss /= self.n_resolutions
    mag_loss /= self.n_resolutions
    total_loss = sc_loss + mag_loss

    return {"total": total_loss, "sc": sc_loss, "mag": mag_loss}
|
||||
|
60
utils/TrainingTools.py
Normal file
60
utils/TrainingTools.py
Normal file
@@ -0,0 +1,60 @@
|
||||
import torch
|
||||
|
||||
# In case if needed again...
|
||||
# from utils.MultiResolutionSTFTLoss import MultiResolutionSTFTLoss
|
||||
#
|
||||
# stft_loss_fn = MultiResolutionSTFTLoss(
|
||||
# fft_sizes=[1024, 2048, 512], hop_sizes=[120, 240, 50], win_lengths=[600, 1200, 240]
|
||||
# )
|
||||
|
||||
|
||||
def signal_mae(input_one: torch.Tensor, input_two: torch.Tensor) -> torch.Tensor:
    """Return the mean absolute error between two signals as a scalar tensor."""
    return torch.mean(torch.abs(input_one - input_two))
|
||||
|
||||
|
||||
def discriminator_train(
    high_quality,
    low_quality,
    high_labels,
    low_labels,
    discriminator,
    generator,
    criterion,
):
    """
    Compute one discriminator loss for a GAN audio-enhancement setup.

    Args:
        high_quality: batch of real high-quality signals.
        low_quality: batch of degraded signals (generator input).
        high_labels: target labels for "real" decisions.
        low_labels: target labels for "fake" decisions.
        discriminator: callable mapping a signal batch to decisions.
        generator: callable mapping low-quality signals to enhanced ones.
        criterion: loss comparing decisions to labels.

    Returns:
        Scalar loss: mean of the four component losses.
    """
    # Real samples should be classified as real.
    decision_high = discriminator(high_quality)
    d_loss_high = criterion(decision_high, high_labels)

    # Raw low-quality samples should be classified as fake.
    decision_low = discriminator(low_quality)
    d_loss_low = criterion(decision_low, low_labels)

    # Generated samples: skip gradients through the generator, but keep
    # the discriminator forward differentiable so it can learn from them.
    with torch.no_grad():
        generated = generator(low_quality)
    decision_gen = discriminator(generated)
    d_loss_gen = criterion(decision_gen, low_labels)

    # Slightly noised real samples are treated as fake, as a regularizer.
    # NOTE(review): torch.rand_like is uniform in [0, 0.08), i.e. a small
    # positive bias — confirm whether zero-mean noise (randn_like) was intended.
    noise = torch.rand_like(high_quality) * 0.08
    decision_noise = discriminator(high_quality + noise)
    d_loss_noise = criterion(decision_noise, low_labels)

    # Average the four components so the loss scale stays comparable.
    return (d_loss_high + d_loss_low + d_loss_gen + d_loss_noise) / 4.0
|
||||
|
||||
|
||||
def generator_train(
    low_quality,
    high_quality,
    real_labels,
    generator,
    discriminator,
    adv_criterion,
    similarity_weight: float = 100.0,
):
    """
    Compute one generator loss for a GAN audio-enhancement setup.

    Args:
        low_quality: batch of degraded signals (generator input).
        high_quality: batch of target high-quality signals.
        real_labels: labels the generator wants the discriminator to emit.
        generator: callable mapping low-quality signals to enhanced ones.
        discriminator: callable mapping a signal batch to decisions.
        adv_criterion: adversarial loss comparing decisions to labels.
        similarity_weight: weight on the signal-similarity (MAE) term
            relative to the adversarial term (default 100.0, matching the
            previously hard-coded value).

    Returns:
        Tuple of (combined_loss, adversarial_loss).
    """
    generator_output = generator(low_quality)

    # Adversarial term: the generator is rewarded when the discriminator
    # labels its output as real.
    discriminator_decision = discriminator(generator_output)
    adversarial_loss = adv_criterion(discriminator_decision, real_labels)

    # Signal-similarity term anchoring the output to the target waveform.
    similarity_loss = signal_mae(generator_output, high_quality)

    combined_loss = adversarial_loss + similarity_loss * similarity_weight

    return combined_loss, adversarial_loss
|
Reference in New Issue
Block a user