Compare commits

2 Commits

tiniest-ex...miniscule-
| Author | SHA1 | Date |
|---|---|---|
| | f928d8c2cf | |
| | 54338e55a9 | |
data.py
```diff
@@ -9,7 +9,7 @@ import AudioUtils

 class AudioDataset(Dataset):
     audio_sample_rates = [11025]
-    MAX_LENGTH = 88200 # Define your desired maximum length here
+    MAX_LENGTH = 44100 # Define your desired maximum length here

     def __init__(self, input_dir, device):
         self.input_files = [os.path.join(root, f) for root, _, files in os.walk(input_dir) for f in files if f.endswith('.wav')]
```
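MAX_LENGTH drops from 88200 samples (2 s at the 44.1 kHz rate used in training.py) to 44100 (1 s). The compare does not show how the dataset applies the cap; below is a minimal sketch of the usual truncate-or-pad step, where the helper name `fix_length` is an assumption of this note, not code from the repo.

```python
import torch
import torch.nn.functional as F

MAX_LENGTH = 44100  # 1 second at 44.1 kHz, matching the new value in data.py

# Hypothetical helper; the actual data.py logic is not part of this compare.
def fix_length(audio: torch.Tensor, max_length: int = MAX_LENGTH) -> torch.Tensor:
    """Truncate or zero-pad a (channels, samples) tensor to exactly max_length samples."""
    if audio.shape[-1] > max_length:
        return audio[..., :max_length]                      # keep the first max_length samples
    return F.pad(audio, (0, max_length - audio.shape[-1]))  # right-pad with silence
```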
```diff
@@ -31,16 +31,19 @@ class SISUDiscriminator(nn.Module):
     def __init__(self, layers=4): #Increased base layer count
         super(SISUDiscriminator, self).__init__()
         self.model = nn.Sequential(
-            discriminator_block(1, layers, kernel_size=7, stride=4), #Aggressive downsampling
+            discriminator_block(1, layers, kernel_size=3, stride=1), #Aggressive downsampling
             discriminator_block(layers, layers * 2, kernel_size=5, stride=2),
-            discriminator_block(layers * 2, layers * 4, kernel_size=5, dilation=2),
-            discriminator_block(layers * 4, layers * 8, kernel_size=5, dilation=4),
-            AttentionBlock(layers * 8), #Added attention
-            discriminator_block(layers * 8, layers * 16, kernel_size=5, dilation=8),
-            discriminator_block(layers * 16, layers * 16, kernel_size=3, dilation=1),
-            discriminator_block(layers * 16, layers * 8, kernel_size=3, dilation=2),
-            discriminator_block(layers * 8, layers * 4, kernel_size=3, dilation=1),
-            discriminator_block(layers * 4, layers * 2, kernel_size=3, stride=1),
+            discriminator_block(layers * 2, layers * 4, kernel_size=5, dilation=4),
+
+            #AttentionBlock(layers * 4), #Added attention
+
+            #discriminator_block(layers * 4, layers * 8, kernel_size=5, dilation=4),
+            #AttentionBlock(layers * 8), #Added attention
+            #discriminator_block(layers * 8, layers * 16, kernel_size=5, dilation=8),
+            #discriminator_block(layers * 16, layers * 16, kernel_size=3, dilation=1),
+            #discriminator_block(layers * 16, layers * 8, kernel_size=3, dilation=2),
+            #discriminator_block(layers * 8, layers * 4, kernel_size=3, dilation=1),
+            discriminator_block(layers * 4, layers * 2, kernel_size=5, stride=2),
             discriminator_block(layers * 2, layers, kernel_size=3, stride=1),
             discriminator_block(layers, 1, kernel_size=3, stride=1, spectral_norm=False) #last layer no spectral norm.
         )
```
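Read this way, the new branch shrinks the active SISUDiscriminator stack from eleven convolutions plus an attention block to six convolutions, keeping the removed layers as comments. The hunk repeatedly calls a `discriminator_block` helper whose definition is not part of this compare; a plausible sketch, assuming it wraps a 1-D convolution with optional spectral normalization and a LeakyReLU activation (the keyword arguments `kernel_size`, `stride`, `dilation`, and `spectral_norm` all appear in the calls):

```python
import torch.nn as nn
from torch.nn.utils import spectral_norm as sn

# Sketch only: the real discriminator_block may differ in padding, norm, or activation.
def discriminator_block(in_channels, out_channels, kernel_size=3, stride=1,
                        dilation=1, spectral_norm=True):
    padding = (kernel_size // 2) * dilation  # keeps the time axis roughly aligned across blocks
    conv = nn.Conv1d(in_channels, out_channels, kernel_size,
                     stride=stride, dilation=dilation, padding=padding)
    if spectral_norm:
        conv = sn(conv)  # Lipschitz constraint; the diff skips it on the final layer
    return nn.Sequential(conv, nn.LeakyReLU(0.2, inplace=True))
```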
training.py
```diff
@@ -30,7 +30,7 @@ parser.add_argument("--discriminator", type=str, default=None,
                     help="Path to the discriminator model file")
 parser.add_argument("--device", type=str, default="cpu", help="Select device")
 parser.add_argument("--epoch", type=int, default=0, help="Current epoch for model versioning")
-parser.add_argument("--verbose", action="store_true", help="Increase output verbosity")
+parser.add_argument("--debug", action="store_true", help="Print debug logs")

 args = parser.parse_args()

@@ -38,9 +38,9 @@ device = torch.device(args.device if torch.cuda.is_available() else "cpu")
 print(f"Using device: {device}")

 mfcc_transform = T.MFCC(
-    sample_rate=44100, # Adjust to your sample rate
+    sample_rate=44100,
     n_mfcc=20,
-    melkwargs={'n_fft': 2048, 'hop_length': 512} # adjust n_fft and hop_length to your needs.
+    melkwargs={'n_fft': 2048, 'hop_length': 256}
 ).to(device)

 def gpu_mfcc_loss(y_true, y_pred):
@@ -49,7 +49,8 @@ def gpu_mfcc_loss(y_true, y_pred):
     min_len = min(mfccs_true.shape[2], mfccs_pred.shape[2])
     mfccs_true = mfccs_true[:, :, :min_len]
     mfccs_pred = mfccs_pred[:, :, :min_len]
-    return torch.mean((mfccs_true - mfccs_pred)**2)
+    loss = torch.mean((mfccs_true - mfccs_pred)**2)
+    return loss

 def discriminator_train(high_quality, low_quality, real_labels, fake_labels):
     optimizer_d.zero_grad()
```
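Halving `hop_length` from 512 to 256 doubles the number of MFCC frames per clip, so the MFCC loss compares finer-grained spectral detail (and is why `gpu_mfcc_loss` crops both inputs to `min_len`). A quick shape check under the same transform settings:

```python
import torch
import torchaudio.transforms as T

wav = torch.randn(1, 44100)  # one second of dummy audio at 44.1 kHz
for hop in (512, 256):
    mfcc = T.MFCC(sample_rate=44100, n_mfcc=20,
                  melkwargs={'n_fft': 2048, 'hop_length': hop})(wav)
    print(hop, tuple(mfcc.shape))  # (1, 20, 87) for hop 512, (1, 20, 173) for hop 256
```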
```diff
@@ -79,19 +80,20 @@ def generator_train(low_quality, high_quality, real_labels):
     # Forward pass for fake samples (from generator output)
     generator_output = generator(low_quality[0])

-    mfcc_l = gpu_mfcc_loss(high_quality[0], generator_output)
+    #mfcc_l = gpu_mfcc_loss(high_quality[0], generator_output)

     discriminator_decision = discriminator(generator_output)
     adversarial_loss = criterion_g(discriminator_decision, real_labels)

-    combined_loss = adversarial_loss + 0.5 * mfcc_l
+    #combined_loss = adversarial_loss + 0.5 * mfcc_l

-    combined_loss.backward()
+    adversarial_loss.backward()
     optimizer_g.step()

-    return (generator_output, combined_loss, adversarial_loss, mfcc_l)
+    #return (generator_output, combined_loss, adversarial_loss, mfcc_l)
+    return (generator_output, adversarial_loss)

-debug = args.verbose
+debug = args.debug

 # Initialize dataset and dataloader
 dataset_dir = './dataset/good'
@@ -99,7 +101,7 @@ dataset = AudioDataset(dataset_dir, device)

 # ========= SINGLE =========

-train_data_loader = DataLoader(dataset, batch_size=128, shuffle=True)
+train_data_loader = DataLoader(dataset, batch_size=256, shuffle=True)

 # Initialize models and move them to device
 generator = SISUGenerator()
@@ -156,12 +158,13 @@ def start_training():

         # ========= GENERATOR =========
         generator.train()
-        generator_output, combined_loss, adversarial_loss, mfcc_l = generator_train(low_quality_sample, high_quality_sample, real_labels)
+        #generator_output, combined_loss, adversarial_loss, mfcc_l = generator_train(low_quality_sample, high_quality_sample, real_labels)
+        generator_output, adversarial_loss = generator_train(low_quality_sample, high_quality_sample, real_labels)

         if debug:
-            print(d_loss, combined_loss, adversarial_loss, mfcc_l)
+            print(d_loss, adversarial_loss)
         scheduler_d.step(d_loss)
-        scheduler_g.step(combined_loss)
+        scheduler_g.step(adversarial_loss)

         # ========= SAVE LATEST AUDIO =========
         high_quality_audio = (high_quality_clip[0][0], high_quality_clip[1][0])
```
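With the MFCC term disabled, `scheduler_g` now steps on `adversarial_loss` alone. Passing a metric into `step()` matches `torch.optim.lr_scheduler.ReduceLROnPlateau`; the scheduler construction is not shown in this compare, so that type is an assumption here. A minimal sketch of the pattern:

```python
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

net = torch.nn.Linear(1, 1)  # stand-in for SISUGenerator
optimizer_g = torch.optim.Adam(net.parameters(), lr=1e-4)
scheduler_g = ReduceLROnPlateau(optimizer_g, mode='min', factor=0.5, patience=5)

adversarial_loss = torch.tensor(0.7)       # stand-in for one training step's loss
scheduler_g.step(adversarial_loss.item())  # LR drops only if the tracked loss plateaus
```

One caveat on the design: a raw adversarial loss is noisy from step to step, so plateau-based scheduling on it reacts to that noise as much as to real progress.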