Weights and biases are not updating in TensorFlow

I built this neural network to figure out whether a house is a good buy or a bad buy. For some reason the code is not updating the weights and biases, and my loss stays the same. This is my code:

    import pandas as pd
    import tensorflow as tf

    data = pd.read_csv("E:/workspace_py/datasets/good_bad_buy.csv")

    features = data.drop(['index', 'good buy'], axis = 1)
    lbls = data.drop(['index', 'area', 'bathrooms', 'price', 'sq_price'], axis = 1)

    features = features[0:20]
    lbls = lbls[0:20]

    print(features)
    print(lbls)

    n_examples = len(lbls)

    # Model

    # Hyper parameters
    epochs = 100
    learning_rate = 0.1
    batch_size = 1

    input_data = tf.placeholder('float', [None, 4])
    labels = tf.placeholder('float', [None, 1])

    weights = {
        'hl1': tf.Variable(tf.random_normal([4, 10])),
        'hl2': tf.Variable(tf.random_normal([10, 10])),
        'hl3': tf.Variable(tf.random_normal([10, 4])),
        'ol': tf.Variable(tf.random_normal([4, 1]))
    }

    biases = {
        'hl1': tf.Variable(tf.random_normal([10])),
        'hl2': tf.Variable(tf.random_normal([10])),
        'hl3': tf.Variable(tf.random_normal([4])),
        'ol': tf.Variable(tf.random_normal([1]))
    }

    hl1 = tf.nn.relu(tf.add(tf.matmul(input_data, weights['hl1']), biases['hl1']))
    hl2 = tf.nn.relu(tf.add(tf.matmul(hl1, weights['hl2']), biases['hl2']))
    hl3 = tf.nn.relu(tf.add(tf.matmul(hl2, weights['hl3']), biases['hl3']))
    ol = tf.nn.sigmoid(tf.add(tf.matmul(hl3, weights['ol']), biases['ol']))

    loss = tf.reduce_mean((labels - ol)**2)
    train = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    iterations = int(n_examples/batch_size)

    for epoch_no in range(epochs):
        ptr = 0
        for iteration_no in range(iterations):
            epoch_input = features[ptr:ptr+batch_size]
            epoch_label = lbls[ptr: ptr+batch_size]
            ptr = ptr + batch_size
            _, err = sess.run([train, loss], feed_dict={input_data: features, labels: lbls})
        print("Error at epoch ", epoch_no, ": ", err)

    print(sess.run(ol, feed_dict={input_data: [[2104, 3, 399900, 190.0665]]}))

This is the dataset:

    Features:
            area  bathrooms   price    sq_price
        0   2104          3  399900  190.066540
        1   1600          3  329900  206.187500
        2   2400          3  369000  153.750000
        3   1416          2  232000  163.841808
        4   3000          4  539900  179.966667
        5   1985          4  299900  151.083123
        6   1534          3  314900  205.280313
        7   1427          3  198999  139.452698
        8   1380          3  212000  153.623188
        9   1494          3  242500  162.315930
        10  1940          4  239999  123.710825
        11  2000          3  347000  173.500000
        12  1890          3  329999  174.602645
        13  4478          5  699900  156.297454
        14  1268          3  259900  204.968454
        15  2300          4  449900  195.608696
        16  1320          2  299900  227.196970
        17  1236          3  199900  161.731392
        18  2609          4  499998  191.643542
        19  3031          4  599000  197.624546

    Labels:
            good buy
        0        1.0
        1        0.0
        2        1.0
        3        0.0
        4        1.0
        5        0.0
        6        0.0
        7        1.0
        8        0.0
        9        0.0
        10       1.0
        11       1.0
        12       1.0
        13       1.0
        14       0.0
        15       1.0
        16       0.0
        17       1.0
        18       1.0
        19       1.0

Any suggestions on how to fix this? I've tried tf.reduce_sum instead of tf.reduce_mean. I've also tried a larger batch_size.

A few things to consider:

  • The minibatch is never actually evaluated, because you feed features and lbls instead of epoch_input and epoch_label.
  • You don't condition your data in any way, so it is completely out of range. My code below normalizes the features by mean and standard deviation; you could also consider using batch_normalization.
  • You never evaluate the error. You need a training set and a test set. My code below doesn't hold out any data, but it does measure accuracy as a percentage rather than just reporting the loss (which is only a weak proxy for the error, so you shouldn't call it error).
  • You initialize the biases to random normals. You probably want to start those at zero.
  • You should probably be using tf.layers or another high-level API (a rough sketch follows this list).
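To illustrate that last point, the same architecture could be written with tf.layers roughly as follows. This is only a sketch of the API (assuming TensorFlow 1.x), not the full fix; the layer sizes are copied from the question, and the loss is the stable cross-entropy variant rather than the squared error.

    import tensorflow as tf

    input_data = tf.placeholder('float', [None, 4])
    labels = tf.placeholder('float', [None, 1])

    # tf.layers.dense creates the weight and bias variables for you
    # (weights via a sensible default initializer, biases at zero)
    hl1 = tf.layers.dense(input_data, 10, activation=tf.nn.relu)
    hl2 = tf.layers.dense(hl1, 10, activation=tf.nn.relu)
    hl3 = tf.layers.dense(hl2, 4, activation=tf.nn.relu)
    logits = tf.layers.dense(hl3, 1)  # no activation on the output

    # pair the raw logits with a numerically stable cross-entropy
    loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits))
    train = tf.train.AdamOptimizer(0.01).minimize(loss)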

The full code below reaches 95% training accuracy. You would want to test on a held-out dataset that is not used for training in order to evaluate the test error.

    #!/usr/bin/env python
    import sys
    import pandas as pd
    import numpy as np
    import tensorflow as tf

    data = pd.read_csv("data.csv")

    features = data.drop(['good buy'], axis = 1)
    lbls = data.drop(['area', 'bathrooms', 'price', 'sq_price'], axis = 1)

    features = features[0:20]
    lbls = lbls[0:20]

    mu = np.mean(features, axis=0)
    sigma = (np.std(features, axis=0))
    features = (features - mu) / sigma

    n_examples = len(lbls)

    # Model

    # Hyper parameters
    epochs = 100
    learning_rate = 0.01
    batch_size = 5

    input_data = tf.placeholder('float', [None, 4])
    labels = tf.placeholder('float', [None, 1])

    weights = {
        'hl1': tf.Variable(tf.random_normal([4, 10])),
        'hl2': tf.Variable(tf.random_normal([10, 10])),
        'hl3': tf.Variable(tf.random_normal([10, 4])),
        'ol': tf.Variable(tf.random_normal([4, 1]))
    }

    biases = {
        'hl1': tf.Variable(tf.zeros([10])),
        'hl2': tf.Variable(tf.zeros([10])),
        'hl3': tf.Variable(tf.zeros([4])),
        'ol': tf.Variable(tf.zeros([1]))
    }

    hl1 = tf.nn.relu(tf.add(tf.matmul(input_data, weights['hl1']), biases['hl1']))
    hl2 = tf.nn.relu(tf.add(tf.matmul(hl1, weights['hl2']), biases['hl2']))
    hl3 = tf.nn.relu(tf.add(tf.matmul(hl2, weights['hl3']), biases['hl3']))
    ol = tf.nn.sigmoid(tf.add(tf.matmul(hl3, weights['ol']), biases['ol']))

    loss = tf.reduce_mean((labels - ol)**2)
    train = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    iterations = int(n_examples/batch_size)

    def training_accuracy():
        foo, = sess.run([ol], feed_dict={input_data: features, labels: lbls})
        return (float(np.count_nonzero(np.equal(np.round(foo), lbls))) /
                float(lbls.shape[0]))

    print("Initial training accuracy %f" % training_accuracy())

    for epoch_no in range(epochs):
        ptr = 0
        for iteration_no in range(iterations):
            epoch_input = features[ptr:ptr+batch_size]
            epoch_label = lbls[ptr: ptr+batch_size]
            ptr = (ptr + batch_size) % len(features)
            _, err = sess.run([train, loss],
                              feed_dict={input_data: epoch_input, labels: epoch_label})
        print("Error at epoch ", epoch_no, ": ", err)
        print(" Training accuracy %f" % training_accuracy())
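If you do want to hold out data, a minimal sketch of the split could look like the following (this is not part of the code above; the 15/5 split and the random seed are just illustrative choices). Note that the normalization statistics are computed on the training rows only and then applied to the held-out rows.

    import numpy as np
    import pandas as pd

    data = pd.read_csv("data.csv")
    features = data.drop(['good buy'], axis=1)[0:20]
    lbls = data[['good buy']][0:20]

    # fixed seed so the split is reproducible
    rng = np.random.RandomState(0)
    idx = rng.permutation(len(features))
    train_idx, test_idx = idx[:15], idx[15:]

    train_x, test_x = features.iloc[train_idx], features.iloc[test_idx]
    train_y, test_y = lbls.iloc[train_idx], lbls.iloc[test_idx]

    # normalize with statistics from the training split only,
    # then apply the same transform to the held-out split
    mu, sigma = np.mean(train_x, axis=0), np.std(train_x, axis=0)
    train_x = (train_x - mu) / sigma
    test_x = (test_x - mu) / sigma

Accuracy measured on test_x/test_y (which are never fed to the optimizer) is then an estimate of the test error; with only 20 rows it will be noisy, but it is the right thing to measure.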

Also, please don't post usage questions like this on GitHub; they belong here on StackOverflow.

There are several things that are not right in your code. First, you probably mean:

    epoch_input = features[ptr:ptr+batch_size]
    epoch_label = lbls[ptr: ptr+batch_size]
    ptr = ptr + batch_size

    # _, err = sess.run([train, loss], feed_dict={input_data: features, labels: lbls})
    _, err = sess.run([train, loss], feed_dict={input_data: epoch_input, labels: epoch_label})

Now you are actually using minibatches.

Debugging the gradient:

You can always check a few things by adding

 loss = tf.Print(loss, [tf.reduce_sum(weights['hl1'])]) 

This will print the elements of that list, [tf.reduce_sum(weights['hl1'])], each time loss is evaluated. To dig further into your problem, you can inspect the gradients themselves instead of just calling minimize:

    grads = tf.reduce_sum(tf.gradients(loss, ol)[0])
    sess.run(grads, {input_data: features, labels: lbls})

And finally, the loss function is unsuitable / numerically unstable for classification. With your version I get:

 variables Variable:0 Variable_1:0 Variable_2:0 Variable_3:0 Variable_4:0 Variable_5:0 Variable_6:0 Variable_7:0 I tensorflow/core/kernels/logging_ops.cc:79] [-6.2784553] ----------------------------------------- name MatMul_grad gradient [[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]] value [[-0.59977376 -0.30060738 0.55068201 0.15304407 1.39992142 0.07495346 -0.87189424 -0.22595075 -0.30094525 -1.2688272 ] [-0.44018757 1.08651936 -0.26267499 -0.54463315 0.47019768 0.69873857 0.56195319 0.20222363 0.38143152 -0.92212462] [-0.39977714 -1.07244122 0.41926911 1.4951371 -2.28751612 0.45676312 0.88010246 -0.88077509 -1.25860023 0.56874037] [-0.98260719 -1.30747247 -1.4460088 1.0717535 0.08794415 -0.53184992 -1.17537284 -0.51598179 -0.15323587 0.91142744]] ----------------------------------------- name MatMul_1_grad gradient [[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]] value [[-0.1170694 0.12174897 0.91696155 0.59427398 0.90844423 0.29010534 -0.34039831 -0.62824941 0.37833953 0.27777222] [-0.34947088 1.09264851 0.27353975 1.31722498 -0.42032316 -2.74952078 -0.66349608 -0.61844724 -0.82141227 1.21691799] [ 0.10453336 -1.68631995 0.45700032 -1.58120835 -1.23378754 -0.05648948 -1.64761281 -0.57684237 -0.06499017 -0.49623618] [ 1.47821534 -0.5329541 0.09209292 1.78089786 1.71149898 0.30547267 0.39544162 1.00369155 1.0097307 -0.92320329] [ 1.27038908 -2.17246103 -0.31276336 0.8945803 0.30964327 1.15329361 0.9711507 -0.36301252 -0.05652813 0.63399518] [-0.30909851 -0.41660413 -0.50603527 0.11735299 -0.26837045 0.16547598 -0.33875859 -0.46821991 0.25723135 -0.80380815] [-0.86255074 -1.11751068 0.01365725 0.66119182 0.48947951 1.6353699 -0.794447 0.43182942 -0.97692633 -1.62605619] [ 1.38552308 0.83679706 -0.87287223 2.59401655 -0.61855 0.38301265 1.09983373 0.49209142 1.03003716 -1.33537853] [ 0.74452382 1.57940936 -0.90974236 -1.2211293 -1.1076287 0.92846316 -0.46856263 -0.3179535 0.75120807 -0.86442506] [ 0.31622764 -0.35965034 -0.02351121 -0.0650174 0.4714573 0.35687482 1.43354905 0.39608309 0.42744714 -0.37226421]] ----------------------------------------- name MatMul_2_grad gradient [[ 0. 0. 0. 0.] [ 0. 0. 0. 0.] [ 0. 0. 0. 0.] [ 0. 0. 0. 0.] [ 0. 0. 0. 0.] [ 0. 0. 0. 0.] [ 0. 0. 0. 0.] [ 0. 0. 0. 0.] [ 0. 0. 0. 0.] [ 0. 0. 0. 0.]] value [[-1.50904143 0.00228321 1.45787132 0.68312413] [-0.16627057 1.31303644 1.16326404 0.72901946] [ 0.8004092 0.37329885 0.89361066 -0.19850619] [ 1.58354807 -1.05612624 0.69891322 -0.32565734] [-1.57602286 -0.41256282 0.69086516 -0.54095054] [ 1.72376788 -0.53928965 -0.71574098 -0.94974124] [-0.62061429 1.51380932 -0.72585452 -0.07695383] [ 0.35537818 1.49691582 0.03931179 0.93435526] [ 0.20697887 1.39266443 0.73217523 -0.64737892] [ 1.00519872 0.90984046 1.68565321 -0.28157935]] ----------------------------------------- name MatMul_3_grad gradient [[ 0.] [ 0.] [ 0.] [ 0.]] value [[ 0.94082022] [ 0.14753926] [-0.08765228] [ 1.32516992]] ----------------------------------------- name Add_grad gradient [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] 
value [ 1.71239722 1.12632215 0.75409448 0.01951236 0.32135537 -1.46281374 0.40413955 0.54653352 -0.57894999 0.2746354 ] ----------------------------------------- name Add_1_grad gradient [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] value [ 0.74800217 -0.43517059 -0.77706921 1.46858656 1.09103405 -0.46681881 0.6126743 -2.27877688 1.48809242 -1.19616997] ----------------------------------------- name Add_2_grad gradient [ 0. 0. 0. 0.] value [-0.12137324 -0.23238407 0.17909229 -0.75496733] ----------------------------------------- name Add_3_grad gradient [ 0.] value [-0.91176724] 

As you can see, almost all the gradients are zero. Why?

  • by definition, (labels - ol) lies in [-1, 1]
  • its square is therefore much smaller than one
  • the derivative of the sigmoid s(x) is s'(x) = s(x)*(1 - s(x)); the gradients get multiplied by this value, which is much smaller than one (a quick numeric check follows this list)
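You can see the last point numerically (this snippet is only an illustration, not part of the original code): for inputs anywhere near the size of the raw price column, the sigmoid saturates and its derivative is essentially zero.

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    for x in [0.0, 2.0, 10.0, 50.0]:
        s = sigmoid(x)
        # s'(x) = s(x) * (1 - s(x)); it peaks at 0.25 and vanishes for large |x|
        print(x, s * (1.0 - s))
    # prints roughly 0.25, 0.105, 4.5e-05, 1.9e-22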

But after switching to sparse_softmax_cross_entropy_with_logits, which is numerically stable and works in the log domain, I get

 variables Variable:0 Variable_1:0 Variable_2:0 Variable_3:0 Variable_4:0 Variable_5:0 Variable_6:0 Variable_7:0 ----------------------------------------- name MatMul_grad gradient [[ -1.42780918e-05 -1.96137808e-05 -2.44040220e-05 -2.25691911e-05 0.00000000e+00 2.95208647e-05 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00] [ -2.54181440e-08 -3.49168410e-08 -4.34445262e-08 -4.01781257e-08 0.00000000e+00 5.25536308e-08 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00] [ -2.45539122e-03 -3.37296468e-03 -4.19673882e-03 -3.88120394e-03 0.00000000e+00 5.07667707e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00] [ -1.42123906e-06 -1.95235293e-06 -2.42917258e-06 -2.24653377e-06 0.00000000e+00 2.93850212e-06 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]] value [[ 0.43133125 -0.40009859 -0.08456381 0.59587955 0.57171088 -0.9824872 1.18876612 0.9704771 0.74798232 0.15660612] [-1.18380785 0.22617982 -1.15734088 -0.50478351 1.43819618 1.55950046 -1.1510663 -0.88835335 0.58378232 0.56860197] [ 0.29826403 0.02192715 0.62225986 2.47716165 -0.9223454 1.70159853 -1.03968358 -0.26019615 -0.33808291 -0.30873826] [ 0.59774327 -1.28855145 -0.43420359 -0.4413566 -0.19220066 0.96984953 -0.04922202 0.32994318 -1.05539823 -0.80112725]] ----------------------------------------- name MatMul_1_grad gradient [[ 0.00000000e+00 1.15650124e-03 0.00000000e+00 0.00000000e+00 6.59449317e-04 -1.09400018e-03 0.00000000e+00 -4.02117817e-04 5.44495881e-04 -8.90314346e-04] [ 0.00000000e+00 7.24206184e-05 0.00000000e+00 0.00000000e+00 4.12950030e-05 -6.85067716e-05 0.00000000e+00 -2.51807924e-05 3.40965707e-05 -5.57518724e-05] [ 0.00000000e+00 2.38713808e-03 0.00000000e+00 0.00000000e+00 1.36117137e-03 -2.25812919e-03 0.00000000e+00 -8.30012548e-04 1.12389564e-03 -1.83770037e-03] [ 0.00000000e+00 9.52679198e-03 0.00000000e+00 0.00000000e+00 5.43227792e-03 -9.01193265e-03 0.00000000e+00 -3.31248436e-03 4.48533799e-03 -7.33405072e-03] [ 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00] [ 0.00000000e+00 6.51591457e-03 0.00000000e+00 0.00000000e+00 3.71544389e-03 -6.16377220e-03 0.00000000e+00 -2.26559630e-03 3.06777749e-03 -5.01617463e-03] [ 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00] [ 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00] [ 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00] [ 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]] value [[ 0.38902158 -2.14370036 -1.02228141 -0.6492967 1.87193418 -0.06453216 1.0013988 -1.26857054 0.59826601 0.45045251] [ 0.51465249 -1.09108925 -0.21368918 -0.49310678 -0.87893176 -0.07944249 -0.15810326 1.65703297 1.01812947 -0.95572269] [-1.76351583 -1.46950841 1.43533802 2.15617752 1.30682683 0.77409673 -1.50309181 0.81978178 0.6672287 -0.434971 ] [-0.7291944 2.16516733 -1.39850736 -1.06059277 0.40035763 1.23335707 -0.03707252 1.88107574 0.09459961 2.11439633] [-1.39152992 -1.39924514 -0.35704514 -0.71152836 -2.68857026 0.78129828 -1.0077033 -1.26149333 0.4403404 -0.10159389] [ 0.37354535 0.12654085 0.7632165 -0.76493222 0.68177891 
-0.34254205 -1.11582613 2.60665917 1.53196526 -0.867055 ] [ 0.62746197 -0.01072595 3.26629376 1.28371656 -0.88725293 3.55530715 0.67065352 -0.61927503 1.20604384 -0.87207574] [-0.68954837 1.89912283 0.90083456 0.02054735 -0.23425011 0.39949065 -0.08969283 -0.75943565 1.0924015 0.28920195] [-0.64865923 -1.29299021 -0.39945969 0.02289505 1.46024895 0.94282049 -0.99704605 -1.36124468 0.76788425 0.86770487] [ 0.63794595 1.68530416 -0.15548207 -0.22658408 -0.45446202 -0.77308726 -0.12694608 1.17369819 2.25879693 0.20346723]] ----------------------------------------- name MatMul_2_grad gradient [[ 0. 0. 0. 0. ] [-0.02205572 0. 0.00960038 0. ] [ 0. 0. 0. 0. ] [ 0. 0. 0. 0. ] [-0.01932034 0. 0.00840973 0. ] [-0.01617817 0. 0.00704201 0. ] [ 0. 0. 0. 0. ] [-0.05091252 0. 0.02216113 0. ] [-0.0189826 0. 0.00826272 0. ] [-0.01993647 0. 0.00867792 0. ]] value [[-0.18724969 -0.0544498 -0.69153035 0.47535184] [-0.75444973 -1.33321464 -0.13066645 1.56889391] [-0.6458627 1.17859495 -0.75926393 0.30138403] [ 1.0069555 -0.69344127 0.49295315 0.54917085] [-0.55954564 -1.13277721 -0.37167427 -0.64837182] [ 0.93753678 1.12197697 0.63789612 0.52438796] [ 0.77543265 -1.241382 1.78230286 -0.6928125 ] [ 0.95383584 -2.00331807 1.63409865 -0.36474878] [-0.73891008 2.066082 -0.94303596 -0.42322466] [ 0.38519588 0.03278512 -0.3487882 -1.50447905]] ----------------------------------------- name MatMul_3_grad gradient [[ 0.08460998] [ 0. ] [ 0.16564058] [ 0. ]] value [[-0.35376808] [-0.07330427] [ 0.15398768] [-0.06484076]] ----------------------------------------- name Add_grad gradient [ -8.22783885e-09 -1.13025616e-08 -1.40629695e-08 -1.30056375e-08 0.00000000e+00 1.70115797e-08 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00] value [-1.00038147 -0.56519473 0.59372097 -1.1646167 -0.16213787 -0.69313556 0.62788707 1.03768504 0.57876503 -0.5201084 ] ----------------------------------------- name Add_1_grad gradient [ 0.00000000e+00 1.28705375e-08 0.00000000e+00 0.00000000e+00 7.33891703e-09 -1.21749730e-08 0.00000000e+00 -4.47511184e-09 6.05961770e-09 -9.90818183e-09] value [ 0.02854451 -1.46039021 -0.03916361 0.40116394 0.16030532 0.88267213 -0.46328214 0.18927227 -1.7536788 -0.46590349] ----------------------------------------- name Add_2_grad gradient [ -1.84504412e-08 0.00000000e+00 8.03108247e-09 0.00000000e+00] value [ 0.94534302 -0.9080081 -1.86719894 -1.31547296] ----------------------------------------- name Add_3_grad gradient [ 0.29727879 -0.29727876] value [ 0.07999782 -0.75647992] 

This time the gradients are non-zero (although still very small). The code to reproduce this is:

    import numpy as np
    import tensorflow as tf

    features = [
        [2104, 3, 399900, 190.066540],
        [1600, 3, 329900, 206.187500],
        [2400, 3, 369000, 153.750000],
        [1416, 2, 232000, 163.841808],
        [3000, 4, 539900, 179.966667],
        [1985, 4, 299900, 151.083123],
        [1534, 3, 314900, 205.280313],
        [1427, 3, 198999, 139.452698],
        [1380, 3, 212000, 153.623188],
        [1494, 3, 242500, 162.315930],
        [1940, 4, 239999, 123.710825],
        [2000, 3, 347000, 173.500000],
        [1890, 3, 329999, 174.602645],
        [4478, 5, 699900, 156.297454],
        [1268, 3, 259900, 204.968454],
        [2300, 4, 449900, 195.608696],
        [1320, 2, 299900, 227.196970],
        [1236, 3, 199900, 161.731392],
        [2609, 4, 499998, 191.643542],
        [3031, 4, 599000, 197.624546]]
    lbls = [1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1]

    features = np.array(features, dtype=np.float32)
    lbls = np.array(lbls, dtype=np.int32)

    n_examples = len(lbls)

    epochs = 100
    learning_rate = 0.1
    batch_size = 1

    input_data = tf.placeholder('float', [None, 4])
    labels = tf.placeholder('int32', [None])

    weights = {
        'hl1': tf.Variable(tf.random_normal([4, 10])),
        'hl2': tf.Variable(tf.random_normal([10, 10])),
        'hl3': tf.Variable(tf.random_normal([10, 4])),
        'ol': tf.Variable(tf.random_normal([4, 1]))
    }

    biases = {
        'hl1': tf.Variable(tf.random_normal([10])),
        'hl2': tf.Variable(tf.random_normal([10])),
        'hl3': tf.Variable(tf.random_normal([4])),
        # 'ol': tf.Variable(tf.random_normal([1])),
        'ol': tf.Variable(tf.random_normal([2]))
    }

    hl1 = tf.nn.relu(tf.add(tf.matmul(input_data, weights['hl1']), biases['hl1']))
    hl2 = tf.nn.relu(tf.add(tf.matmul(hl1, weights['hl2']), biases['hl2']))
    hl3 = tf.nn.relu(tf.add(tf.matmul(hl2, weights['hl3']), biases['hl3']))

    # ol = tf.nn.sigmoid(tf.add(tf.matmul(hl3, weights['ol']), biases['ol']))
    logits = tf.add(tf.matmul(hl3, weights['ol']), biases['ol'])
    # ol = tf.Print(ol, [tf.reduce_sum(weights['hl1'])])

    # loss = tf.reduce_mean((labels - ol)**2)
    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
    loss = tf.reduce_mean(cost)
    optimizer = tf.train.AdamOptimizer(learning_rate)

    iterations = int(n_examples/batch_size)

    def debug_minimize(optimizer, loss, sess):
        from tensorflow.python.ops import variables
        from tensorflow.python.framework import ops
        # get all variables
        var_list = (variables.trainable_variables() +
                    ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
        print('variables')
        for v in var_list:
            print('  ', v.name)
        # get all gradients
        grads_and_vars = optimizer.compute_gradients(loss)
        train_op = optimizer.apply_gradients(grads_and_vars)

        zipped_val = sess.run(grads_and_vars, {input_data: features, labels: lbls})

        for rsl, tensor in zip(zipped_val, grads_and_vars):
            print('-----------------------------------------')
            print('name', tensor[0].name.replace('/tuple/control_dependency_1:0', '').replace('gradients/', ''))
            print('gradient', rsl[0])
            print('value', rsl[1])
        return train_op

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    debug_minimize(optimizer, loss, sess)
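As a side note (this variant is not in the reproduction above): if you prefer to keep a single output unit instead of two logits, the numerically stable counterpart for a 0/1 float label is tf.nn.sigmoid_cross_entropy_with_logits, roughly:

    # keep the question's original float labels placeholder of shape [None, 1],
    # drop the sigmoid from the output layer, and feed the raw logit instead
    logits = tf.add(tf.matmul(hl3, weights['ol']), biases['ol'])  # [None, 1]
    cost = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)
    loss = tf.reduce_mean(cost)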

I'm not sure whether this is the problem in your case, but the gradient of a sigmoid can be very small when its input is too large, and that can make the updates very slow.

To check whether this is the case, try initializing all of your weights with very small values. You can tune this by setting a standard deviation on your random normals:

 tf.Variable(tf.random_normal([4, 10], stddev=0.1))
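Applied to all of the weight matrices from the question, that could look like the following (0.1 is only an illustrative starting value; you may need to tune it):

    # small initial weights keep the pre-activation values near zero,
    # where the sigmoid gradient is largest
    weights = {
        'hl1': tf.Variable(tf.random_normal([4, 10], stddev=0.1)),
        'hl2': tf.Variable(tf.random_normal([10, 10], stddev=0.1)),
        'hl3': tf.Variable(tf.random_normal([10, 4], stddev=0.1)),
        'ol': tf.Variable(tf.random_normal([4, 1], stddev=0.1))
    }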