From 54338e55a99b3ab5620b36d7b594ca5aa2b29039 Mon Sep 17 00:00:00 2001
From: NikkeDoy
Date: Tue, 25 Mar 2025 19:50:51 +0200
Subject: [PATCH] :alembic: | Tests.

---
 data.py          |  2 +-
 discriminator.py | 23 +++++++++++++----------
 generator.py     |  2 +-
 training.py      | 11 ++++++-----
 4 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/data.py b/data.py
index 9ca5ee5..bc7574f 100644
--- a/data.py
+++ b/data.py
@@ -9,7 +9,7 @@ import AudioUtils
 
 class AudioDataset(Dataset):
     audio_sample_rates = [11025]
-    MAX_LENGTH = 88200 # Define your desired maximum length here
+    MAX_LENGTH = 44100 # Define your desired maximum length here
 
     def __init__(self, input_dir, device):
         self.input_files = [os.path.join(root, f) for root, _, files in os.walk(input_dir) for f in files if f.endswith('.wav')]
diff --git a/discriminator.py b/discriminator.py
index b1ec6eb..1608199 100644
--- a/discriminator.py
+++ b/discriminator.py
@@ -28,19 +28,22 @@ class AttentionBlock(nn.Module):
         return x * attention_weights
 
 class SISUDiscriminator(nn.Module):
-    def __init__(self, layers=4): #Increased base layer count
+    def __init__(self, layers=64): #Increased base layer count
         super(SISUDiscriminator, self).__init__()
         self.model = nn.Sequential(
-            discriminator_block(1, layers, kernel_size=7, stride=4), #Aggressive downsampling
+            discriminator_block(1, layers, kernel_size=3, stride=1), #No downsampling in the first block
             discriminator_block(layers, layers * 2, kernel_size=5, stride=2),
-            discriminator_block(layers * 2, layers * 4, kernel_size=5, dilation=2),
-            discriminator_block(layers * 4, layers * 8, kernel_size=5, dilation=4),
-            AttentionBlock(layers * 8), #Added attention
-            discriminator_block(layers * 8, layers * 16, kernel_size=5, dilation=8),
-            discriminator_block(layers * 16, layers * 16, kernel_size=3, dilation=1),
-            discriminator_block(layers * 16, layers * 8, kernel_size=3, dilation=2),
-            discriminator_block(layers * 8, layers * 4, kernel_size=3, dilation=1),
-            discriminator_block(layers * 4, layers * 2, kernel_size=3, stride=1),
+            discriminator_block(layers * 2, layers * 4, kernel_size=5, dilation=4),
+
+            #AttentionBlock(layers * 4), #Added attention
+
+            #discriminator_block(layers * 4, layers * 8, kernel_size=5, dilation=4),
+            #AttentionBlock(layers * 8), #Added attention
+            #discriminator_block(layers * 8, layers * 16, kernel_size=5, dilation=8),
+            #discriminator_block(layers * 16, layers * 16, kernel_size=3, dilation=1),
+            #discriminator_block(layers * 16, layers * 8, kernel_size=3, dilation=2),
+            #discriminator_block(layers * 8, layers * 4, kernel_size=3, dilation=1),
+            discriminator_block(layers * 4, layers * 2, kernel_size=5, stride=2),
             discriminator_block(layers * 2, layers, kernel_size=3, stride=1),
             discriminator_block(layers, 1, kernel_size=3, stride=1, spectral_norm=False) #last layer, no spectral norm
         )
diff --git a/generator.py b/generator.py
index 950530a..04ac5b4 100644
--- a/generator.py
+++ b/generator.py
@@ -34,7 +34,7 @@ class ResidualInResidualBlock(nn.Module):
         return x + residual
 
 class SISUGenerator(nn.Module):
-    def __init__(self, layer=4, num_rirb=4): #increased base layer and rirb amounts
+    def __init__(self, layer=64, num_rirb=4): #increased base layer and rirb amounts
         super(SISUGenerator, self).__init__()
         self.conv1 = nn.Sequential(
             nn.Conv1d(1, layer, kernel_size=7, padding=3),
diff --git a/training.py b/training.py
index 50743be..63fc5b8 100644
--- a/training.py
+++ b/training.py
@@ -38,9 +38,9 @@ device = torch.device(args.device if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")
 
 mfcc_transform = T.MFCC(
-    sample_rate=44100, # Adjust to your sample rate
+    sample_rate=44100,
     n_mfcc=20,
-    melkwargs={'n_fft': 2048, 'hop_length': 512} # adjust n_fft and hop_length to your needs.
+    melkwargs={'n_fft': 2048, 'hop_length': 256}
 ).to(device)
 
 def gpu_mfcc_loss(y_true, y_pred):
@@ -49,7 +49,8 @@ def gpu_mfcc_loss(y_true, y_pred):
     min_len = min(mfccs_true.shape[2], mfccs_pred.shape[2])
     mfccs_true = mfccs_true[:, :, :min_len]
     mfccs_pred = mfccs_pred[:, :, :min_len]
-    return torch.mean((mfccs_true - mfccs_pred)**2)
+    loss = torch.mean((mfccs_true - mfccs_pred)**2)
+    return loss
 
 def discriminator_train(high_quality, low_quality, real_labels, fake_labels):
     optimizer_d.zero_grad()
@@ -99,7 +100,7 @@ dataset = AudioDataset(dataset_dir, device)
 
 # ========= SINGLE =========
 
-train_data_loader = DataLoader(dataset, batch_size=128, shuffle=True)
+train_data_loader = DataLoader(dataset, batch_size=16, shuffle=True)
 
 # Initialize models and move them to device
 generator = SISUGenerator()
@@ -161,7 +162,7 @@ def start_training():
         if debug:
             print(d_loss, combined_loss, adversarial_loss, mfcc_l)
         scheduler_d.step(d_loss)
-        scheduler_g.step(combined_loss)
+        #scheduler_g.step(combined_loss)
 
         # ========= SAVE LATEST AUDIO =========
         high_quality_audio = (high_quality_clip[0][0], high_quality_clip[1][0])
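
Reviewer note: the hunks above call into a discriminator_block helper whose
definition never appears in this patch. For context, here is a minimal sketch
of what such a helper might look like, inferred only from the keyword
arguments used in the calls (kernel_size, stride, dilation, spectral_norm);
the padding rule, weight normalization, and activation are assumptions, not
the repository's actual code.

    import torch.nn as nn
    import torch.nn.utils as nn_utils

    def discriminator_block(in_channels, out_channels, kernel_size=3, stride=1,
                            dilation=1, spectral_norm=True):
        # Hypothetical sketch: only the signature is taken from the patch.
        # "same"-style padding for odd kernels; matches the generator's
        # Conv1d(1, layer, kernel_size=7, padding=3) convention at stride 1.
        padding = (kernel_size // 2) * dilation
        conv = nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size,
                         stride=stride, dilation=dilation, padding=padding)
        if spectral_norm:
            # common GAN-discriminator stabilizer; skipped on the last layer
            conv = nn_utils.spectral_norm(conv)
        return nn.Sequential(conv, nn.LeakyReLU(0.2, inplace=True))

    # e.g. the new first layer of SISUDiscriminator:
    # block = discriminator_block(1, 64, kernel_size=3, stride=1)

Under this reading, the patch swaps the deep dilated stack (now commented out)
for a shallower one while widening the base (layers=4 -> 64), and presumably
shrinks the batch size (128 -> 16) to keep the wider model within memory.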