Merge new-arch because it has proven to give the best results (#1)

Merged
NikkeDoy merged 14 commits from new-arch into main 2025-04-30 23:47:41 +03:00
4 changed files with 21 additions and 17 deletions
Showing only changes of commit 54338e55a9 - Show all commits

View File

@ -9,7 +9,7 @@ import AudioUtils
class AudioDataset(Dataset): class AudioDataset(Dataset):
audio_sample_rates = [11025] audio_sample_rates = [11025]
MAX_LENGTH = 88200 # Define your desired maximum length here MAX_LENGTH = 44100 # Define your desired maximum length here
def __init__(self, input_dir, device): def __init__(self, input_dir, device):
self.input_files = [os.path.join(root, f) for root, _, files in os.walk(input_dir) for f in files if f.endswith('.wav')] self.input_files = [os.path.join(root, f) for root, _, files in os.walk(input_dir) for f in files if f.endswith('.wav')]

View File

@ -28,19 +28,22 @@ class AttentionBlock(nn.Module):
return x * attention_weights return x * attention_weights
class SISUDiscriminator(nn.Module): class SISUDiscriminator(nn.Module):
def __init__(self, layers=4): #Increased base layer count def __init__(self, layers=64): #Increased base layer count
super(SISUDiscriminator, self).__init__() super(SISUDiscriminator, self).__init__()
self.model = nn.Sequential( self.model = nn.Sequential(
discriminator_block(1, layers, kernel_size=7, stride=4), #Aggressive downsampling discriminator_block(1, layers, kernel_size=3, stride=1), #Aggressive downsampling
discriminator_block(layers, layers * 2, kernel_size=5, stride=2), discriminator_block(layers, layers * 2, kernel_size=5, stride=2),
discriminator_block(layers * 2, layers * 4, kernel_size=5, dilation=2), discriminator_block(layers * 2, layers * 4, kernel_size=5, dilation=4),
discriminator_block(layers * 4, layers * 8, kernel_size=5, dilation=4),
AttentionBlock(layers * 8), #Added attention #AttentionBlock(layers * 4), #Added attention
discriminator_block(layers * 8, layers * 16, kernel_size=5, dilation=8),
discriminator_block(layers * 16, layers * 16, kernel_size=3, dilation=1), #discriminator_block(layers * 4, layers * 8, kernel_size=5, dilation=4),
discriminator_block(layers * 16, layers * 8, kernel_size=3, dilation=2), #AttentionBlock(layers * 8), #Added attention
discriminator_block(layers * 8, layers * 4, kernel_size=3, dilation=1), #discriminator_block(layers * 8, layers * 16, kernel_size=5, dilation=8),
discriminator_block(layers * 4, layers * 2, kernel_size=3, stride=1), #discriminator_block(layers * 16, layers * 16, kernel_size=3, dilation=1),
#discriminator_block(layers * 16, layers * 8, kernel_size=3, dilation=2),
#discriminator_block(layers * 8, layers * 4, kernel_size=3, dilation=1),
discriminator_block(layers * 4, layers * 2, kernel_size=5, stride=2),
discriminator_block(layers * 2, layers, kernel_size=3, stride=1), discriminator_block(layers * 2, layers, kernel_size=3, stride=1),
discriminator_block(layers, 1, kernel_size=3, stride=1, spectral_norm=False) #last layer no spectral norm. discriminator_block(layers, 1, kernel_size=3, stride=1, spectral_norm=False) #last layer no spectral norm.
) )

View File

@ -34,7 +34,7 @@ class ResidualInResidualBlock(nn.Module):
return x + residual return x + residual
class SISUGenerator(nn.Module): class SISUGenerator(nn.Module):
def __init__(self, layer=4, num_rirb=4): #increased base layer and rirb amounts def __init__(self, layer=64, num_rirb=4): #increased base layer and rirb amounts
super(SISUGenerator, self).__init__() super(SISUGenerator, self).__init__()
self.conv1 = nn.Sequential( self.conv1 = nn.Sequential(
nn.Conv1d(1, layer, kernel_size=7, padding=3), nn.Conv1d(1, layer, kernel_size=7, padding=3),

View File

@ -38,9 +38,9 @@ device = torch.device(args.device if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}") print(f"Using device: {device}")
mfcc_transform = T.MFCC( mfcc_transform = T.MFCC(
sample_rate=44100, # Adjust to your sample rate sample_rate=44100,
n_mfcc=20, n_mfcc=20,
melkwargs={'n_fft': 2048, 'hop_length': 512} # adjust n_fft and hop_length to your needs. melkwargs={'n_fft': 2048, 'hop_length': 256}
).to(device) ).to(device)
def gpu_mfcc_loss(y_true, y_pred): def gpu_mfcc_loss(y_true, y_pred):
@ -49,7 +49,8 @@ def gpu_mfcc_loss(y_true, y_pred):
min_len = min(mfccs_true.shape[2], mfccs_pred.shape[2]) min_len = min(mfccs_true.shape[2], mfccs_pred.shape[2])
mfccs_true = mfccs_true[:, :, :min_len] mfccs_true = mfccs_true[:, :, :min_len]
mfccs_pred = mfccs_pred[:, :, :min_len] mfccs_pred = mfccs_pred[:, :, :min_len]
return torch.mean((mfccs_true - mfccs_pred)**2) loss = torch.mean((mfccs_true - mfccs_pred)**2)
return loss
def discriminator_train(high_quality, low_quality, real_labels, fake_labels): def discriminator_train(high_quality, low_quality, real_labels, fake_labels):
optimizer_d.zero_grad() optimizer_d.zero_grad()
@ -99,7 +100,7 @@ dataset = AudioDataset(dataset_dir, device)
# ========= SINGLE ========= # ========= SINGLE =========
train_data_loader = DataLoader(dataset, batch_size=128, shuffle=True) train_data_loader = DataLoader(dataset, batch_size=16, shuffle=True)
# Initialize models and move them to device # Initialize models and move them to device
generator = SISUGenerator() generator = SISUGenerator()
@ -161,7 +162,7 @@ def start_training():
if debug: if debug:
print(d_loss, combined_loss, adversarial_loss, mfcc_l) print(d_loss, combined_loss, adversarial_loss, mfcc_l)
scheduler_d.step(d_loss) scheduler_d.step(d_loss)
scheduler_g.step(combined_loss) #scheduler_g.step(combined_loss)
# ========= SAVE LATEST AUDIO ========= # ========= SAVE LATEST AUDIO =========
high_quality_audio = (high_quality_clip[0][0], high_quality_clip[1][0]) high_quality_audio = (high_quality_clip[0][0], high_quality_clip[1][0])