Why can't my ConvLSTM model predict?

I have built a convolutional LSTM model using TensorFlow's ConvLSTMCell(), tf.nn.dynamic_rnn(), and tf.contrib.legacy_seq2seq.rnn_decoder(). I have 3 encoder layers and 3 decoder layers; the initial states of the decoders come from the final states of the encoders. I have 128, 64, and 64 filters for layer 1, layer 2, and layer 3 respectively. Finally, I concatenate the outputs of the decoders and pass them through a convolution layer to reduce the number of channels to one, and then I apply the loss function. My dataset is the moving MNIST dataset, in which each sequence has 20 frames. With this model I am trying to predict frames 11 to 20 from the first 10 frames. But the output, which is a sequence of 10 frames, is far from the ground truth and basically just reproduces the last input frame (the 10th frame). I put the code here; thanks for your help.
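In outline, the encoder-to-decoder handoff is the following (a condensed sketch with placeholder names like encoder_cell, decoder_cell, and decoder_inputs; the full three-layer version is in the code below):

    # encoder: consume the 10 input frames and keep the final LSTMStateTuple
    encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
        encoder_cell, x, initial_state=encoder_cell.zero_state(1, tf.float32))

    # decoder: unroll 10 steps starting from the encoder's final state;
    # loop_fn decides what to feed in at each step (previous output or ground truth)
    decoder_outputs, decoder_state = tf.contrib.legacy_seq2seq.rnn_decoder(
        decoder_inputs, encoder_state, decoder_cell, loop_function=loop_fn)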

import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
import tensorflow as tf
import numpy as np
from sklearn.metrics import confusion_matrix
import time
from datetime import timedelta
import math
import random
from random import getrandbits
from tensorflow.contrib.rnn.python.ops.rnn_cell import ConvLSTMCell
from tensorflow.python.ops.rnn_cell import LSTMStateTuple

tf.reset_default_graph()
# cell = ConvLSTMCell()
num_channels = 1
img_size = 64

# filter sizes
filter_size1 = 5
filter_size2 = 5
filter_size3 = 5

# number of filters in each layer
num_filters1 = 128
num_filters2 = 64
num_filters3 = 64

img_size_flat = img_size * img_size

y = tf.placeholder(tf.float32, shape=[None, img_size_flat], name='y')
y_image = tf.reshape(y, [-1, img_size, img_size, num_channels], name='y_image')
z = tf.placeholder(tf.float32, shape=[None, img_size_flat], name='z')
z_image = tf.reshape(z, [-1, img_size, img_size, num_channels], name='z_image')
x = tf.placeholder(tf.float32, shape=[None, None, img_size, img_size, num_channels], name='x')

with tf.variable_scope("Encoder"):
    with tf.variable_scope("Encoder_Layer1"):
        InputShape = [img_size, img_size, num_channels]
        encoder_1_KernelShape = [filter_size1, filter_size1]
        rnn_cell = ConvLSTMCell(2, InputShape, num_filters1, encoder_1_KernelShape,
                                use_bias=True, forget_bias=1.0, name='Encoder_1')
        # defining initial state
        # initial_state = rnn_cell.zero_state(batch_size, dtype=tf.float32)
        initial_state = rnn_cell.zero_state(1, dtype=tf.float32)
        encoder_1_outputs, encoder_1_state = tf.nn.dynamic_rnn(
            rnn_cell, x, initial_state=initial_state, dtype=tf.float32)
    with tf.variable_scope("Encoder_Layer2"):
        Encoder_2_InputShape = [img_size, img_size, num_filters1]
        encoder_2_KernelShape = [filter_size2, filter_size2]
        encoder_2_cell = ConvLSTMCell(2, Encoder_2_InputShape, num_filters2, encoder_2_KernelShape,
                                      use_bias=True, forget_bias=1.0, name='Encoder_2')
        initial_state_2 = encoder_2_cell.zero_state(1, dtype=tf.float32)
        encoder_2_outputs, encoder_2_state = tf.nn.dynamic_rnn(
            encoder_2_cell, encoder_1_outputs, initial_state=initial_state_2, dtype=tf.float32)
    with tf.variable_scope("Encoder_Layer3"):
        Encoder_3_InputShape = [img_size, img_size, num_filters2]
        encoder_3_KernelShape = [filter_size3, filter_size3]
        encoder_3_cell = ConvLSTMCell(2, Encoder_3_InputShape, num_filters3, encoder_3_KernelShape,
                                      use_bias=True, forget_bias=1.0, name='Encoder_3')
        initial_state_3 = encoder_3_cell.zero_state(1, dtype=tf.float32)
        encoder_3_outputs, encoder_3_state = tf.nn.dynamic_rnn(
            encoder_3_cell, encoder_2_outputs, initial_state=initial_state_3, dtype=tf.float32)

# weights function
def new_weights(shape, name):
    return tf.get_variable(name, shape,
                           initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.05))

# convolution function
def conv_layer(input,               # The previous layer.
               num_input_channels,  # Num. channels in prev. layer.
               filter_size,         # Width and height of each filter.
               num_filters):        # Number of filters.
    # with tf.variable_scope("ConvLayer") as Conv_Layer:
    filter_shape = [filter_size, filter_size, num_input_channels, num_filters]
    w = new_weights(shape=filter_shape, name='ConvLayer_Weights')
    conv_output = tf.nn.conv2d(input=input, filter=w, strides=[1, 1, 1, 1], padding='SAME')
    # relu_output = tf.nn.relu(conv_output)
    return conv_output

# loss function (per-pixel binary cross-entropy)
def loss(prediction, label):
    # with tf.variable_scope("Loss") as Loss_scope:
    log_pred = tf.log(tf.clip_by_value(prediction, 1e-10, 1.0), name='Prediction_Log')
    log_pred_2 = tf.log(tf.clip_by_value(1 - prediction, 1e-10, 1.0), name='1-Prediction_Log')
    cross_entropy = -tf.multiply(label, log_pred) - tf.multiply((1 - label), log_pred_2)
    return cross_entropy

# In[ ]:
labels = tf.reshape(y_image, [1, 10, 64, 64, 1])
w = tf.get_variable(name="decoder_1_weights", shape=[10, 5, 5, 1, num_filters1],
                    initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.05))
true_label = tf.nn.conv3d(input=labels, filter=w, strides=[1, 1, 1, 1, 1], padding='SAME')
true_label = tf.reshape(true_label, [1, 10, 64, 64, num_filters1])
true_label = tf.unstack(true_label, num=10, axis=1)

# In[ ]:
START = np.zeros((1, 10, 64, 64, num_filters1))
START = np.float32(START)
GO = tf.unstack(START, num=10, axis=1)

# In[ ]:
def loop_fn(previous_output, time):
    if previous_output is None:  # time == 0
        START = tf.placeholder(tf.float32, shape=[None, 1, img_size, img_size, 1], name='START')
        return START
    else:
        return previous_output

# In[ ]:
# loop function for the first decoder in the training phase; we randomly feed the ground truth
def loop_fn_train_1(previous_output, time):
    if previous_output is None:  # time == 0
        START = tf.placeholder(tf.float32, shape=[None, 1, img_size, img_size, 1], name='START')
        return START
    else:
        if bool(random.getrandbits(1)):
            return previous_output
        else:
            return true_label[time]

with tf.variable_scope("Decoder"):
    with tf.variable_scope("Decoder_Layer1"):
        decoder_1_InputShape = [img_size, img_size, num_filters1]
        decoder_1_KernelShape = [filter_size1, filter_size1]
        decoder_1_rnn_cell = ConvLSTMCell(2, decoder_1_InputShape, num_filters1, decoder_1_KernelShape,
                                          use_bias=True, forget_bias=1.0, name='Decoder_1')
        decoder_1_outputs, decoder_1_states = tf.contrib.legacy_seq2seq.rnn_decoder(
            true_label, encoder_1_state, decoder_1_rnn_cell, loop_fn_train_1)
    with tf.variable_scope("Decoder_Layer2"):
        decoder_2_InputShape = [img_size, img_size, num_filters2]
        decoder_2_KernelShape = [filter_size2, filter_size2]
        decoder_2_rnn_cell = ConvLSTMCell(2, decoder_2_InputShape, num_filters2, decoder_2_KernelShape,
                                          use_bias=True, forget_bias=1.0, name='Decoder_2')
        w = tf.get_variable(name="decoder_2_weights", shape=[10, 5, 5, num_filters1, num_filters2],
                            initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.05))
        decoder_2_inputs = tf.nn.conv3d(input=decoder_1_outputs, filter=w,
                                        strides=[1, 1, 1, 1, 1], padding='SAME')
        decoder_2_inputs = tf.reshape(decoder_2_inputs, [1, 10, 64, 64, num_filters2])
        decoder_2_inputs = tf.unstack(decoder_2_inputs, num=10, axis=1)

        # loop function for the second decoder in the training phase; we randomly feed the ground truth
        def loop_fn_train_2(previous_output, time):
            if previous_output is None:  # time == 0
                START = tf.placeholder(tf.float32, shape=[None, 1, img_size, img_size, 1], name='START')
                return START
            else:
                if bool(random.getrandbits(1)):
                    return previous_output
                else:
                    return decoder_2_inputs[time]

        decoder_2_outputs, decoder_2_states = tf.contrib.legacy_seq2seq.rnn_decoder(
            decoder_2_inputs, encoder_2_state, decoder_2_rnn_cell, loop_fn_train_2)
    with tf.variable_scope("Decoder_Layer3"):
        decoder_3_InputShape = [img_size, img_size, num_filters3]
        decoder_3_KernelShape = [filter_size3, filter_size3]
        decoder_3_rnn_cell = ConvLSTMCell(2, decoder_3_InputShape, num_filters3, decoder_3_KernelShape,
                                          use_bias=True, forget_bias=1.0, name='Decoder_3')
        w = tf.get_variable(name="decoder_3_weights", shape=[10, 5, 5, num_filters2, num_filters3],
                            initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.05))
        decoder_3_inputs = tf.nn.conv3d(input=decoder_2_outputs, filter=w,
                                        strides=[1, 1, 1, 1, 1], padding='SAME')
        decoder_3_inputs = tf.reshape(decoder_3_inputs, [1, 10, 64, 64, num_filters3])
        decoder_3_inputs = tf.unstack(decoder_3_inputs, num=10, axis=1)

        # loop function for the third decoder in the training phase; we randomly feed the ground truth
        def loop_fn_train_3(previous_output, time):
            if previous_output is None:  # time == 0
                START = tf.placeholder(tf.float32, shape=[None, 1, img_size, img_size, 1], name='START')
                return START
            else:
                if bool(random.getrandbits(1)):
                    return previous_output
                else:
                    return decoder_3_inputs[time]

        decoder_3_outputs, decoder_3_states = tf.contrib.legacy_seq2seq.rnn_decoder(
            decoder_3_inputs, encoder_3_state, decoder_3_rnn_cell, loop_fn_train_3)

# In[ ]:
with tf.variable_scope("Decoder", reuse=True):
    with tf.variable_scope("Decoder_Layer1"):
        decoder_1_InputShape = [img_size, img_size, num_filters1]
        decoder_1_KernelShape = [filter_size1, filter_size1]
        decoder_1_rnn_cell = ConvLSTMCell(2, decoder_1_InputShape, num_filters1, decoder_1_KernelShape,
                                          use_bias=True, forget_bias=1.0, name='Decoder_1')
        Test_decoder_1_outputs, Test_decoder_1_states = tf.contrib.legacy_seq2seq.rnn_decoder(
            GO, encoder_1_state, decoder_1_rnn_cell, loop_fn)
    with tf.variable_scope("Decoder_Layer2"):
        decoder_2_InputShape = [img_size, img_size, num_filters2]
        decoder_2_KernelShape = [filter_size2, filter_size2]
        decoder_2_rnn_cell = ConvLSTMCell(2, decoder_2_InputShape, num_filters2, decoder_2_KernelShape,
                                          use_bias=True, forget_bias=1.0, name='Decoder_2')
        w = tf.get_variable(name="decoder_2_weights", shape=[10, 5, 5, num_filters1, num_filters2],
                            initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.05))
        decoder_2_inputs = tf.nn.conv3d(input=Test_decoder_1_outputs, filter=w,
                                        strides=[1, 1, 1, 1, 1], padding='SAME')
        decoder_2_inputs = tf.reshape(decoder_2_inputs, [1, 10, 64, 64, num_filters2])
        decoder_2_inputs = tf.unstack(decoder_2_inputs, num=10, axis=1)
        Test_decoder_2_outputs, Test_decoder_2_states = tf.contrib.legacy_seq2seq.rnn_decoder(
            decoder_2_inputs, encoder_2_state, decoder_2_rnn_cell, loop_fn)
    with tf.variable_scope("Decoder_Layer3"):
        decoder_3_InputShape = [img_size, img_size, num_filters3]
        decoder_3_KernelShape = [filter_size3, filter_size3]
        decoder_3_rnn_cell = ConvLSTMCell(2, decoder_3_InputShape, num_filters3, decoder_3_KernelShape,
                                          use_bias=True, forget_bias=1.0, name='Decoder_3')
        w = tf.get_variable(name="decoder_3_weights", shape=[10, 5, 5, num_filters2, num_filters3],
                            initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.05))
        decoder_3_inputs = tf.nn.conv3d(input=Test_decoder_2_outputs, filter=w,
                                        strides=[1, 1, 1, 1, 1], padding='SAME')
        decoder_3_inputs = tf.reshape(decoder_3_inputs, [1, 10, 64, 64, num_filters3])
        decoder_3_inputs = tf.unstack(decoder_3_inputs, num=10, axis=1)
        Test_decoder_3_outputs, Test_decoder_3_states = tf.contrib.legacy_seq2seq.rnn_decoder(
            decoder_3_inputs, encoder_3_state, decoder_3_rnn_cell, loop_fn)

Conv_inputs = tf.concat([decoder_1_outputs, decoder_2_outputs, decoder_3_outputs], 4)
Conv_inputs = tf.reshape(Conv_inputs, [10, 64, 64, num_filters1 + num_filters2 + num_filters3])

# In[ ]:
Test_Conv_inputs = tf.concat([Test_decoder_1_outputs, Test_decoder_2_outputs, Test_decoder_3_outputs], 4)
Test_Conv_inputs = tf.reshape(Test_Conv_inputs, [10, 64, 64, num_filters1 + num_filters2 + num_filters3])

# In[ ]:
with tf.variable_scope("ConvLayer"):
    with tf.variable_scope("ConvLayer_Pred"):
        pred_1 = conv_layer(input=Conv_inputs,
                            num_input_channels=num_filters1 + num_filters2 + num_filters3,
                            filter_size=1,
                            num_filters=1)

# In[ ]:
with tf.variable_scope("ConvLayer", reuse=True):
    with tf.variable_scope("ConvLayer_Pred"):
        Test_pred_1 = conv_layer(input=Test_Conv_inputs,
                                 num_input_channels=num_filters1 + num_filters2 + num_filters3,
                                 filter_size=1,
                                 num_filters=1)

with tf.variable_scope("Training_Loss"):
    with tf.variable_scope("Loss_Pred"):
        Pdistance = loss(prediction=pred_1, label=y_image)
        # cost = tf.reduce_sum(distance)

with tf.variable_scope("Training_Loss", reuse=True):
    with tf.variable_scope("Loss_Pred"):
        Test_Pdistance = loss(prediction=Test_pred_1, label=y_image)
        # cost = tf.reduce_sum(distance)

cost = tf.reduce_sum(Pdistance)
Test_cost = tf.reduce_sum(Test_Pdistance)
# batch_cost += cost

with tf.variable_scope("Optimizer"):
    optimizer = tf.train.AdamOptimizer(1e-3).minimize(cost)

session = tf.Session()
session.run(tf.global_variables_initializer())

# In[ ]:
cwd = '/Users/maryamr/Tensorflow/'

# In[ ]:
data = np.load(cwd + 'mnist_test_seq.npy')
data_2 = data.reshape([20 * 10000, 64 * 64])

from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
scaler = MinMaxScaler()
scaler.fit(data_2)
print(scaler.data_max_)
data_3 = scaler.transform(data_2)

# In[ ]:
data_3 = data_3.reshape([20, 10000, 64, 64])

# In[ ]:
cost_record = np.zeros(10000)
for i in range(10000):
    x_train = data_3[0:10, i, :, :]
    x_train = x_train.flatten()
    x_train = x_train.reshape([1, 10, img_size, img_size, 1])
    x_train = np.float32(x_train)
    y_train = data_3[10:20, i, :, :]
    # print("true_y_sum: {}".format(np.sum(y_train)))
    y_train = y_train.flatten()
    y_train = y_train.reshape([10, img_size * img_size])
    y_train = np.float32(y_train)
    x_2 = np.reshape(x_train, [10, 64, 64, 1])
    x_train_reverse = np.flip(x_2, 0)
    z_train = np.reshape(x_train_reverse, [10, 64 * 64])
    feed_dict_train = {x: x_train, y: y_train, z: z_train}
    if i < 9990:
        session.run(optimizer, feed_dict=feed_dict_train)
        cost_out = session.run(cost, feed_dict=feed_dict_train)
        cost_record[i] = cost_out
    else:
        final_pred_1 = session.run(Test_pred_1, feed_dict=feed_dict_train)
        true_label = session.run(y_image, feed_dict=feed_dict_train)
        # Hid = session.run(encoder_1_state.h, feed_dict=feed_dict_train)
        Cell_1, Cell_2, Cell_3 = session.run(
            [encoder_1_state.c, encoder_2_state.c, encoder_3_state.c],
            feed_dict=feed_dict_train)
        cost_out = session.run(Test_cost, feed_dict=feed_dict_train)
        print("cost: {}".format(cost_out))
        cost_record[i] = cost_out

# In[ ]:
plt.plot(cost_record)
plt.xlabel('number of iterations')
plt.ylabel('loss')
# plt.show()
plt.savefig('/Users/maryamr/Loss_plot.png', bbox_inches='tight')

# In[ ]:
plt.imsave('/Users/maryamr/Cell_1.png', Cell_1[0, :, :, 15], cmap='gray')
plt.imsave('/Users/maryamr/Cell_2.png', Cell_2[0, :, :, 15], cmap='gray')
plt.imsave('/Users/maryamr/Cell_3.png', Cell_3[0, :, :, 15], cmap='gray')

# In[ ]:
f, axarr = plt.subplots(2, 5)
m = 0
for i in range(2):
    for j in range(5):
        axarr[i, j].imshow(final_pred_1[m, :, :, 0], cmap='gray')
        axarr[i, j].get_xaxis()
        axarr[i, j].get_yaxis()
        m += 1
plt.savefig('/Users/maryamr/final_pred_1_10.png', bbox_inches='tight')

# In[ ]:
f, axarr = plt.subplots(2, 5)
m = 0
for i in range(2):
    for j in range(5):
        axarr[i, j].imshow(true_label[m, :, :, 0], cmap='gray')
        axarr[i, j].get_xaxis()
        axarr[i, j].get_yaxis()
        m += 1
plt.savefig('/Users/maryamr/true_label_10.png', bbox_inches='tight')

These are the input, output, and loss figures (the first 10 images are fed in, and the second 10 are the ground truth for the prediction). I trained the model on 9990 sequences and started testing from sequence 9990 to 10000, which is why you see a jump in the loss plot. These results are also for sequence 10000:

enter image description here

enter image description here

enter image description here

This is because you have not saved your model: if you save it, you can restore it later and run predictions.
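A minimal sketch of how that could look with tf.train.Saver (the checkpoint path here is just an example; session, Test_pred_1, and feed_dict_train are the names from your code):

    saver = tf.train.Saver()  # create once, after the graph is built

    # at the end of (or periodically during) training: write all variables to a checkpoint
    save_path = saver.save(session, '/Users/maryamr/Tensorflow/convlstm_model.ckpt')

    # later, after rebuilding the same graph in a fresh session: restore and predict
    with tf.Session() as sess:
        saver.restore(sess, save_path)  # variables come back with their trained values
        prediction = sess.run(Test_pred_1, feed_dict=feed_dict_train)

Note that saver.restore() only restores variable values into an existing graph, so you have to rebuild the same graph before restoring (or import it from the .meta file that saver.save() writes alongside the checkpoint).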