When I train my network with fit, the loss converges and the metric (accuracy) improves significantly.
However, when I write the training loop from scratch myself, the loss does not converge and the accuracy does not improve either.
My code is below (don't mind the network architecture; you can design a different model and try it).
import numpy as np
import tensorflow as tf
import os
from tensorflow.keras.datasets import cifar100
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import Model
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.layers import Input, Conv2D, BatchNormalization, Activation, Dense, GlobalAveragePooling2D
from tensorflow.keras.regularizers import l2
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.InteractiveSession(config=config)
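# Aside: in TF2, GPU memory growth can also be enabled without the compat session.
# A minimal sketch (kept as a comment, since set_memory_growth has to run before
# any GPU is initialized, i.e. it would replace the session above, not follow it):
#
#   for gpu in tf.config.list_physical_devices('GPU'):
#       tf.config.experimental.set_memory_growth(gpu, True)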
def read_dataset(num_classes=100):
    (x_train, y_train), (x_test, y_test) = cifar100.load_data()
    x_train = x_train.astype('float32') / 255
    x_test = x_test.astype('float32') / 255
    x_train_mean = np.mean(x_train, axis=0)
    x_train -= x_train_mean
    x_test -= x_train_mean
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)
    return x_train, y_train, x_test, y_test
def basic_model(input_shape, num_classes):
    inps = Input(shape=input_shape)
    x = Conv2D(32, kernel_size=(7, 7), strides=(2, 2), padding='same', use_bias=False,
               kernel_initializer='he_normal', kernel_regularizer=l2(5e-4))(inps)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Conv2D(64, kernel_size=(3, 3), strides=(2, 2), padding='same', use_bias=False,
               kernel_initializer='he_normal', kernel_regularizer=l2(5e-4))(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Conv2D(128, kernel_size=(3, 3), strides=(1, 1), padding='same', use_bias=False,
               kernel_initializer='he_normal', kernel_regularizer=l2(5e-4))(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = GlobalAveragePooling2D()(x)
    out = Dense(num_classes, activation='softmax')(x)
    return Model(inputs=inps, outputs=out)
def train(input_shape, num_classes=100, num_epochs=20, batch_size=32, lr_init=0.001):
    X_train, y_train, X_test, y_test = read_dataset()
    print('X_train shape', X_train.shape)
    print('y_train shape', y_train.shape)
    print('X_test shape', X_test.shape)
    print('y_test shape', y_test.shape)
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)
    test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
    test_dataset = test_dataset.shuffle(buffer_size=1024).batch(batch_size)
    model = basic_model(input_shape, num_classes)
    model.summary(line_length=100)
    sgd = SGD(learning_rate=lr_init, momentum=0.99, nesterov=True)
    model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(train_dataset, epochs=num_epochs, verbose=1, validation_data=test_dataset)
class Custom_Model(Model):
    def __init__(self, model):
        super(Custom_Model, self).__init__()
        self.model = model

    def compile(
        self,
        optimizer,
        metrics,
        loss_fn
    ):
        super(Custom_Model, self).compile(optimizer=optimizer, metrics=metrics)
        self.loss_fn = loss_fn

    def train_step(self, data):
        # Unpack data
        x, y = data
        with tf.GradientTape() as tape:
            # Forward pass
            predictions = self.model(x, training=True)
            # Compute losses (data loss plus the model's regularization losses)
            loss = self.loss_fn(y, predictions)
            loss += sum(self.model.losses)
        # Compute gradients
        trainable_vars = self.model.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Update the metrics configured in `compile()`.
        self.compiled_metrics.update_state(y, predictions)
        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"loss": loss})
        return results

    def test_step(self, data):
        # Unpack the data
        x, y = data
        # Compute predictions
        y_prediction = self.model(x, training=False)
        # Calculate the loss
        loss = self.loss_fn(y, y_prediction)
        # Update the metrics.
        self.compiled_metrics.update_state(y, y_prediction)
        # Return a dict of performance
        results = {m.name: m.result() for m in self.metrics}
        results.update({"val_loss": loss})
        return results
def train_loop(input_shape, num_classes=100, num_epochs=20, batch_size=32, lr_init=0.001):
    X_train, y_train, X_test, y_test = read_dataset()
    print('X_train shape', X_train.shape)
    print('y_train shape', y_train.shape)
    print('X_test shape', X_test.shape)
    print('y_test shape', y_test.shape)
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
    train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)
    test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
    test_dataset = test_dataset.shuffle(buffer_size=1024).batch(batch_size)
    model = basic_model(input_shape, num_classes)
    model.summary(line_length=100)
    sgd = SGD(learning_rate=lr_init, momentum=0.99, nesterov=True)
    custom_model = Custom_Model(model=model)
    custom_model.compile(optimizer=sgd, loss_fn=CategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
    custom_model.fit(train_dataset, epochs=num_epochs, verbose=1, validation_data=test_dataset)
def main():
    input_shape = (32, 32, 3)
    num_classes = 100
    num_epochs = 20
    batch_size = 32
    lr_init = 0.001
    print(tf.__version__)
    print('-----------------Training with fit-----------------------------')
    train(input_shape, num_classes, num_epochs, batch_size, lr_init)
    print('-----------------Training loop from scratch-----------------------------')
    train_loop(input_shape, num_classes, num_epochs, batch_size, lr_init)

if __name__ == '__main__':
    main()
Here is the result when I use model.fit as usual:

[training log for model.fit]

Here is the result when I use the training loop from scratch:

[training log for the custom training loop]

The graph of the loss values from model.fit and from the training loop:

[loss curve comparison plot]
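For reference, the comparison plot can be produced roughly like this, assuming train() and train_loop() are modified to return the History object that fit returns (the helper name plot_losses is only for illustration):

import matplotlib.pyplot as plt

def plot_losses(history_fit, history_loop):
    # Both arguments are Keras History objects returned by fit
    plt.plot(history_fit.history['loss'], label='model.fit')
    plt.plot(history_loop.history['loss'], label='training loop from scratch')
    plt.xlabel('epoch')
    plt.ylabel('training loss')
    plt.legend()
    plt.show()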