When I train my network with fit, the loss converges and the metric (accuracy) improves significantly.
However, when I write the training loop from scratch myself, the loss does not converge and the accuracy does not improve either.
My code is below (don't mind the network architecture; you can design a different model and try it).
import numpy as np
import tensorflow as tf
import os
from tensorflow.keras.datasets import cifar100
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Model
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.layers import Input, Conv2D, BatchNormalization, Activation, Dense, GlobalAveragePooling2D
from tensorflow.keras.regularizers import l2
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.InteractiveSession(config=config)
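# Aside: in TF2, GPU memory growth can also be enabled without the compat session.
# A minimal sketch (kept as a comment, since set_memory_growth has to run before
# any GPU is initialized, i.e. it would replace the session above, not follow it):
#
#   for gpu in tf.config.list_physical_devices('GPU'):
#       tf.config.experimental.set_memory_growth(gpu, True)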
def read_dataset(num_classes=100):
    (x_train, y_train), (x_test, y_test) = cifar100.load_data()
    x_train = x_train.astype('float32') / 255
    x_test = x_test.astype('float32') / 255
    x_train_mean = np.mean(x_train, axis=0)
    x_train -= x_train_mean
    x_test -= x_train_mean
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)
    return x_train, y_train, x_test, y_test
def basic_model(input_shape, num_classes):
    inps = Input(shape=input_shape)
    x = Conv2D(32, kernel_size=(7, 7), strides=(2, 2), padding='same', use_bias=False,
               kernel_initializer='he_normal', kernel_regularizer=l2(5e-4))(inps)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Conv2D(64, kernel_size=(3, 3), strides=(2, 2), padding='same', use_bias=False,
               kernel_initializer='he_normal', kernel_regularizer=l2(5e-4))(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Conv2D(128, kernel_size=(3, 3), strides=(1, 1), padding='same', use_bias=False,
               kernel_initializer='he_normal', kernel_regularizer=l2(5e-4))(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = GlobalAveragePooling2D()(x)
    out = Dense(num_classes, activation='softmax')(x)
    return Model(inputs=inps, outputs=out)
def train(input_shape, num_classes=100, num_epochs=20, batch_size=32, lr_init=0.001):
    X_train, y_train, X_test, y_test = read_dataset()
    print('X_train shape', X_train.shape)
    print('y_train shape', y_train.shape)
    print('X_test shape', X_test.shape)
    print('y_test shape', y_test.shape)
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)
    test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
    test_dataset = test_dataset.shuffle(buffer_size=1024).batch(batch_size)
    model = basic_model(input_shape, num_classes)
    model.summary(line_length=100)
    sgd = SGD(learning_rate=lr_init, momentum=0.99, nesterov=True)
    model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(train_dataset, epochs=num_epochs, verbose=1, validation_data=test_dataset)
class Custom_Model(Model):
    def __init__(self, model):
        super(Custom_Model, self).__init__()
        self.model = model

    def compile(
        self,
        optimizer,
        metrics,
        loss_fn
    ):
        super(Custom_Model, self).compile(optimizer=optimizer, metrics=metrics)
        self.loss_fn = loss_fn

    def train_step(self, data):
        # Unpack data
        x, y = data
        with tf.GradientTape() as tape:
            # Forward pass
            predictions = self.model(x, training=True)
            # Compute losses (data loss plus the model's regularization losses)
            loss = self.loss_fn(y, predictions)
            loss += sum(self.model.losses)
        # Compute gradients
        trainable_vars = self.model.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, predictions)
        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"loss": loss})
        return results

    def test_step(self, data):
        # Unpack the data
        x, y = data
        # Compute predictions
        y_prediction = self.model(x, training=False)
        # Calculate the loss
        loss = self.loss_fn(y, y_prediction)
        # Update the metrics.
        self.compiled_metrics.update_state(y, y_prediction)
        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"val_loss": loss})
        return results
def train_loop(input_shape, num_classes=100, num_epochs=20, batch_size=32, lr_init=0.001):
    X_train, y_train, X_test, y_test = read_dataset()
    print('X_train shape', X_train.shape)
    print('y_train shape', y_train.shape)
    print('X_test shape', X_test.shape)
    print('y_test shape', y_test.shape)
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)
    test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
    test_dataset = test_dataset.shuffle(buffer_size=1024).batch(batch_size)
    model = basic_model(input_shape, num_classes)
    model.summary(line_length=100)
    sgd = SGD(learning_rate=lr_init, momentum=0.99, nesterov=True)
    custom_model = Custom_Model(model=model)
    custom_model.compile(optimizer=sgd, loss_fn=CategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
    custom_model.fit(train_dataset, epochs=num_epochs, verbose=1, validation_data=test_dataset)
def main():
    input_shape = (32, 32, 3)
    num_classes = 100
    num_epochs = 20
    batch_size = 32
    lr_init = 0.001
    print(tf.__version__)
    print('-----------------Training with fit-----------------------------')
    train(input_shape, num_classes, num_epochs, batch_size, lr_init)
    print('-----------------Training loop from scratch-----------------------------')
    train_loop(input_shape, num_classes, num_epochs, batch_size, lr_init)

if __name__ == '__main__':
    main()
Here is the result when I use model.fit as usual:

[training log for model.fit]

Here is the result when I use the training loop from scratch:

[training log for the custom training loop]

The graph of the loss values from model.fit and from the training loop:

[loss curve comparison plot]
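For reference, the comparison plot can be produced roughly like this, assuming train() and train_loop() are modified to return the History object that fit returns (the helper name plot_losses is only for illustration):

import matplotlib.pyplot as plt

def plot_losses(history_fit, history_loop):
    # Both arguments are Keras History objects returned by fit
    plt.plot(history_fit.history['loss'], label='model.fit')
    plt.plot(history_loop.history['loss'], label='training loop from scratch')
    plt.xlabel('epoch')
    plt.ylabel('training loss')
    plt.legend()
    plt.show()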