Plotting the Training and Validation Loss Curves for the Transformer Model

from tensorflow.keras.optimizers import Adam

from tensorflow.keras.optimizers.schedules import LearningRateSchedule

from tensorflow.keras.metrics import Mean

from tensorflow import data, train, math, reduce_sum, cast, equal, argmax, float32, GradientTape, function

from keras.losses import sparse_categorical_crossentropy

from model import TransformerModel

from prepare_dataset import PrepareDataset

from time import time

from pickle import dump

 

 

# Define the model parameters

h = 8  # Number of self-attention heads

d_k = 64  # Dimensionality of the linearly projected queries and keys

d_v = 64  # Dimensionality of the linearly projected values

d_model = 512  # Dimensionality of the model layers' outputs

d_ff = 2048  # Dimensionality of the inner fully connected layer

n = 6  # Number of layers in the encoder stack

 

# Define the training parameters

epochs = 20

batch_size = 64

beta_1 = 0.9

beta_2 = 0.98

epsilon = 1e-9

dropout_rate = 0.1

 

 

# Implementing a learning rate scheduler

class LRScheduler(LearningRateSchedule):

    def __init__(self, d_model, warmup_steps=4000, **kwargs):

        super(LRScheduler, self).__init__(**kwargs)

 

        self.d_model = cast(d_model, float32)

        self.warmup_steps = warmup_steps

 

    def __call__(self, step_num):

 

        # Linearly increasing the learning rate for the first warmup_steps, and decreasing it thereafter

        arg1 = step_num ** -0.5

        arg2 = step_num * (self.warmup_steps ** -1.5)

 

        return (self.d_model ** -0.5) * math.minimum(arg1, arg2)

 

 

# Instantiate an Adam optimizer

optimizer = Adam(LRScheduler(d_model), beta_1, beta_2, epsilon)

 

# Prepare the training dataset

dataset = PrepareDataset()

trainX, trainY, valX, valY, train_orig, val_orig, enc_seq_length, dec_seq_length, enc_vocab_size, dec_vocab_size = dataset('english-german.pkl')

 

print(enc_seq_length, dec_seq_length, enc_vocab_size, dec_vocab_size)

 

# Prepare the training dataset batches

train_dataset = data.Dataset.from_tensor_slices((trainX, trainY))

train_dataset = train_dataset.batch(batch_size)

 

# Prepare the validation dataset batches

val_dataset = data.Dataset.from_tensor_slices((valX, valY))

val_dataset = val_dataset.batch(batch_size)

 

# Create model

training_model = TransformerModel(enc_vocab_size, dec_vocab_size, enc_seq_length, dec_seq_length, h, d_k, d_v, d_model, d_ff, n, dropout_rate)

 

 

# Defining the loss function

def loss_fcn(target, prediction):

    # Create a mask so that the zero padding values are not included in the computation of loss

    padding_mask = math.logical_not(equal(target, 0))

    padding_mask = cast(padding_mask, float32)

 

    # Compute a sparse categorical cross-entropy loss on the unmasked values

    loss = sparse_categorical_crossentropy(target, prediction, from_logits=True) * padding_mask

 

    # Compute the mean loss over the unmasked values

    return reduce_sum(loss) / reduce_sum(padding_mask)

 

 

# Defining the accuracy function

def accuracy_fcn(target, prediction):

    # Create a mask so that the zero padding values are not included in the computation of accuracy

    padding_mask = math.logical_not(equal(target, 0))

 

    # Find equal prediction and target values, and apply the padding mask

    accuracy = equal(target, argmax(prediction, axis=2))

    accuracy = math.logical_and(padding_mask, accuracy)

 

    # Cast the True/False values to 32-bit-precision floating-point numbers

    padding_mask = cast(padding_mask, float32)

    accuracy = cast(accuracy, float32)

 

    # Compute the mean accuracy over the unmasked values

    return reduce_sum(accuracy) / reduce_sum(padding_mask)

 

 

# Include metrics monitoring

train_loss = Mean(name='train_loss')

train_accuracy = Mean(name='train_accuracy')

val_loss = Mean(name='val_loss')

 

# Create a checkpoint object and manager to manage multiple checkpoints

ckpt = train.Checkpoint(model=training_model, optimizer=optimizer)

ckpt_manager = train.CheckpointManager(ckpt, "./checkpoints", max_to_keep=None)

 

# Initialise dictionaries to store the training and validation losses

train_loss_dict = {}

val_loss_dict = {}

 

# Speeding up the training process

@function

def train_step(encoder_input, decoder_input, decoder_output):

    with GradientTape() as tape:

 

        # Run the forward pass of the model to generate a prediction

        prediction = training_model(encoder_input, decoder_input, training=True)

 

        # Compute the training loss

        loss = loss_fcn(decoder_output, prediction)

 

        # Compute the training accuracy

        accuracy = accuracy_fcn(decoder_output, prediction)

 

    # Retrieve gradients of the trainable variables with respect to the training loss

    gradients = tape.gradient(loss, training_model.trainable_weights)

 

    # Update the values of the trainable variables by gradient descent

    optimizer.apply_gradients(zip(gradients, training_model.trainable_weights))

 

    train_loss(loss)

    train_accuracy(accuracy)

 

 

for epoch in range(epochs):

 

    train_loss.reset_states()

    train_accuracy.reset_states()

    val_loss.reset_states()

 

    print("\nStart of epoch %d" % (epoch + 1))

 

    start_time = time()

 

    # Iterate over the dataset batches

    for step, (train_batchX, train_batchY) in enumerate(train_dataset):

 

        # Define the encoder and decoder inputs, and the decoder output

        encoder_input = train_batchX[:, 1:]

        decoder_input = train_batchY[:, :-1]

        decoder_output = train_batchY[:, 1:]

 

        train_step(encoder_input, decoder_input, decoder_output)

 

        if step % 50 == 0:

            print(f'Epoch {epoch + 1} Step {step} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

 

    # Run a validation step after every epoch of training

    for val_batchX, val_batchY in val_dataset:

 

        # Define the encoder and decoder inputs, and the decoder output

        encoder_input = val_batchX[:, 1:]

        decoder_input = val_batchY[:, :-1]

        decoder_output = val_batchY[:, 1:]

 

        # Generate a prediction

        prediction = training_model(encoder_input, decoder_input, training=False)

 

        # Compute the validation loss

        loss = loss_fcn(decoder_output, prediction)

        val_loss(loss)

 

    # Print the epoch number and the loss and accuracy values at the end of every epoch

    print("Epoch %d: Training Loss %.4f, Training Accuracy %.4f, Validation Loss %.4f" % (epoch + 1, train_loss.result(), train_accuracy.result(), val_loss.result()))

 

    # Save a checkpoint after each epoch

    if (epoch + 1) % 1 == 0:

 

        save_path = ckpt_manager.save()

        print("Saved checkpoint at epoch %d" % (epoch + 1))

 

        # Save the trained model weights

        training_model.save_weights("weights/wghts" + str(epoch + 1) + ".ckpt")

 

        train_loss_dict[epoch] = train_loss.result()

        val_loss_dict[epoch] = val_loss.result()

 

# Save the training loss values

with open('./train_loss.pkl', 'wb') as file:

    dump(train_loss_dict, file)

 

# Save the validation loss values

with open('./val_loss.pkl', 'wb') as file:

    dump(val_loss_dict, file)

 

print("Total time taken: %.2fs" % (time() - start_time))
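 

Once training completes, the pickled loss dictionaries can be loaded back and used to draw the training and validation loss curves referred to in the title. The snippet below is a minimal plotting sketch, not part of the training script above: it assumes matplotlib is available, that the two pickle files written by the script exist in the working directory, and the variable names are illustrative.

from pickle import load
import matplotlib.pyplot as plt

# Load the training and validation loss dictionaries saved by the training script
with open('./train_loss.pkl', 'rb') as file:
    train_loss_dict = load(file)

with open('./val_loss.pkl', 'rb') as file:
    val_loss_dict = load(file)

# The dictionaries map the zero-based epoch index to the mean loss for that epoch
epoch_numbers = [epoch + 1 for epoch in train_loss_dict.keys()]
train_values = [float(loss) for loss in train_loss_dict.values()]
val_values = [float(loss) for loss in val_loss_dict.values()]

# Plot and label the training and validation loss curves
plt.plot(epoch_numbers, train_values, label='Training Loss')
plt.plot(epoch_numbers, val_values, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc='best')
plt.show()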
