Tensorflow Speicherverlust mit rekurrenten neuronalen Netzwerk

Ich versuche, ein einfaches rekurrentes neuronale Netzwerkmodell mit Tensorflow unter Mac OS X zu erstellen. Es ist nur ein Spielzeugmodell und die Größe der Eingabedaten nicht 3 MB überschreiten, so sollte es nicht viel Speicher verbrauchen. Wenn ich jedoch ein Modell verwende, erhöht die Speicherauslastung jeden Trainingsstapel signifikant und geht über 10 GB hinaus. Es war nur für zwei Iterationen. Ich könnte es nicht mehr laufen lassen.Tensorflow Speicherverlust mit rekurrenten neuronalen Netzwerk

Hier ist der ganze Code.

from __future__ import absolute_import 
from __future__ import division 
from __future__ import print_function 

import tensorflow as tf 
import numpy as np 
from pympler import summary 

class RNN(): 
    """The RNN model.""" 
    #@profile 
    def inference(self): 
     """calculate outputs and loss for a single batch""" 
     total_loss = 0.0 
     outputs = [] 
     for i in range(self.batch_size): 
      state = self.init_state 
      outputs.append([]) 
      loss = 0.0 
      for j in range(self.num_steps): 
       state, output = self.next_state(self.x[i,j,:],state) 
       outputs[i].append(output) 
       loss += tf.square(self.y[i,j,:]-output) 
      total_loss+=loss 
     return outputs, total_loss/(self.batch_size*self.num_steps) 

    def __init__(self, is_training, config): 
     self.sess = sess = tf.Session() 

     self.prev_see = prev_see = config.prev_see 
     self.num_steps = num_steps = config.num_steps 
     #maybe "self.num_hidden =" part could be removed 
     self.num_hidden = num_hidden = config.num_hidden 
     self.batch_size = config.batch_size 
     self.epoch = config.epoch 
     self.learning_rate = config.learning_rate 
     self.summaries_dir = config.summaries_dir 

     with tf.name_scope('placeholders'): 
      self.x = tf.placeholder(tf.float32,[None,num_steps,config.prev_see], 
            name='input-x') 
      self.y = tf.placeholder(tf.float32, [None,num_steps,1],name='input-y') 
      default_init_state = tf.zeros([num_hidden]) 
      self.init_state = tf.placeholder_with_default(default_init_state,[num_hidden], 
                name='state_placeholder') 

     def weight_variable(self,shape): 
      """Create a weight variable with appropriate initialization.""" 
      initial = tf.truncated_normal(shape,stddev=0.1) 
      return tf.Variable(initial) 

     def bias_variable(self,shape): 
      """Create a bias variable with appropriate initialization.""" 
      initial = tf.constant(0.1,shape=shape) 
      return tf.Variable(initial) 

     def variable_summaries(self,var,name): 
      """Attach a lot of summaries to a Tensor.""" 
      with tf.name_scope('summaries'): 
       mean = tf.reduce_mean(var) 
       tf.scalar_summary('mean/'+name,mean) 
       with tf.name_scope('stddev'): 
        stddev = tf.sqrt(tf.reduce_sum(tf.square(var-mean))) 
       tf.scalar_summary('stddev/'+name,stddev) 
       tf.scalar_summary('max/'+name, tf.reduce_max(var)) 
       tf.scalar_summary('min/'+name, tf.reduce_min(var)) 
       tf.histogram_summary(name, var) 

     #declare weight variables as property 
     layer_name = 'rnn_layer' 
     with tf.name_scope(layer_name): 
      with tf.name_scope('U'): 
       self.U = U = weight_variable(self,[prev_see,num_hidden]) 
       variable_summaries(self,U,layer_name+'/U') 
      with tf.name_scope('W'): 
       self.W = W = weight_variable(self,[num_hidden,num_hidden]) 
       variable_summaries(self,W,layer_name+'/W') 
      with tf.name_scope('b_W'): 
       self.b_W = b_W = bias_variable(self,[num_hidden]) 
       variable_summaries(self,b_W,layer_name+'/b_W') 
      with tf.name_scope('V'): 
       self.V = V = weight_variable(self,[num_hidden,1]) 
       variable_summaries(self,V,layer_name+'/V') 
      with tf.name_scope('b_V'): 
       self.b_V = b_V = bias_variable(self,[1]) 
       variable_summaries(self,b_V,layer_name+'/b_V') 
     self.merged = tf.merge_all_summaries() 
     self.train_writer = tf.train.SummaryWriter(config.summaries_dir,sess.graph) 
     tf.initialize_all_variables().run(session=sess) 
     _,self.loss = self.inference() 


    def next_state(self,x,s_prev): 
     """calculate next state and output""" 
     x = tf.reshape(x,[1,-1]) 
     s_prev = tf.reshape(s_prev,[1,-1])   
     s_next = tf.tanh(tf.matmul(x,self.U)+tf.matmul(s_prev,self.W)+self.b_W) 
     output = tf.matmul(s_next,self.V)+self.b_V 
     return s_next, output 

    #@profile 
    def batch_train(self,feed_dict): 
     """train the network for a single batch""" 
     loss = self.loss 
     train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss) 
     summary,loss_value, _ = self.sess.run([self.merged,loss, train_step],feed_dict=feed_dict) 
     #self.train_writer.add_summary(summary) 
     print(loss_value) 


class TrainConfig(): 
    """Train Config.""" 
    total_steps = 245 
    test_ratio = 0.3 
    prev_see = 100 
    num_steps = int(round((total_steps-prev_see)*(1-test_ratio))) 
    num_hidden = 10 
    batch_size = 5 
    epoch = 3 
    learning_rate = 0.1 
    summaries_dir = '/Users/Kyungsu/StockPrediction/log' 

class DebugConfig(): 
    """For debugging memory leak.""" 
    total_steps = 100 
    test_ratio = 0.3 
    prev_see = 100 
    num_steps = 10 
    num_hidden = 10 
    batch_size = 5 
    epoch = 2 
    learning_rate = 0.1 
    summaries_dir = '/Users/Kyungsu/StockPrediction/log' 

#@profile 
def run_epoch(m,x_data,y_data): 
    num_batch = ((len(x_data)-1) // m.batch_size)+1 
    #num_batch = 100 
    for i in range(num_batch): 
     x_batch = x_data[i*m.batch_size:(i+1)*m.batch_size,:,:] 
     y_batch = y_data[i*m.batch_size:(i+1)*m.batch_size,:,:] 
     feed_dict = {m.x:x_batch,m.y:y_batch} 
     print("%dth/%dbatches"%(i+1,num_batch)) 
     m.batch_train(feed_dict) 

def process_data(data,config): 
    data_size = len(data) 
    prev_see = config.prev_see 
    num_steps = config.num_steps 
    x = np.zeros((data_size,num_steps,prev_see)) 
    y = np.zeros((data_size,num_steps,1)) 
    for i in range(data_size): 
     for j in range(num_steps-prev_see): 
      x[i,j,:] = data[i,i:i+prev_see] 
      y[i,j,0] = data[i,i+prev_see] 
    return x,y 

#@profile 
def main(): 
    train_config = TrainConfig() 
    debug_config = DebugConfig() 
    data = np.load('processed_data.npy') 
    x,y = process_data(data,train_config) 
    rnn_model = RNN(True,train_config) 

    #training phase 
    for i in range(rnn_model.epoch): 
     print("%dth epoch"%(i+1)) 
     run_epoch(rnn_model,x,y) 

main()

Und nach ist das Ergebnis memory_profiler. Seltsame Sache ist, ist die meisten Speicher in für Schleife zugeordnet. (Siehe Zeile 163,135) Ich denke, es bedeutet, dass Speicher undicht ist.

Line # Mem usage Increment Line Contents 
================================================ 
    11 53.062 MiB 0.000 MiB  @profile 
    12         def __init__(self, is_training, config): 
    13 53.875 MiB 0.812 MiB   self.sess = sess = tf.Session() 
    14          
    15 53.875 MiB 0.000 MiB   self.prev_see = prev_see = config.prev_see 
    16 53.875 MiB 0.000 MiB   self.num_steps = num_steps = config.num_steps 
    17          #maybe "self.num_hidden =" part could be removed 
    18 53.875 MiB 0.000 MiB   self.num_hidden = num_hidden = config.num_hidden 
    19 53.875 MiB 0.000 MiB   self.batch_size = config.batch_size 
    20 53.875 MiB 0.000 MiB   self.epoch = config.epoch 
    21 53.875 MiB 0.000 MiB   self.learning_rate = config.learning_rate 
    22 53.875 MiB 0.000 MiB   self.summaries_dir = config.summaries_dir 
    23        
    24 53.875 MiB 0.000 MiB   with tf.name_scope('input'): 
    25 53.875 MiB 0.000 MiB    self.x = tf.placeholder(tf.float32,[None,num_steps,config.prev_see], 
    26 53.957 MiB 0.082 MiB          name='input-x') 
    27 53.973 MiB 0.016 MiB    self.y = tf.placeholder(tf.float32, [None,num_steps,1],name='input-y') 
    28          
    29 55.316 MiB 1.344 MiB   def weight_variable(self,shape): 
    30           """Create a weight variable with appropriate initialization.""" 
    31 55.371 MiB 0.055 MiB    initial = tf.truncated_normal(shape,stddev=0.1) 
    32 55.414 MiB 0.043 MiB    return tf.Variable(initial) 
    33        
    34 55.707 MiB 0.293 MiB   def bias_variable(self,shape): 
    35           """Create a bias variable with appropriate initialization.""" 
    36 55.727 MiB 0.020 MiB    initial = tf.constant(0.1,shape=shape) 
    37 55.754 MiB 0.027 MiB    return tf.Variable(initial) 
    38          
    39 55.754 MiB 0.000 MiB   def variable_summaries(self,var,name): 
    40           """Attach a lot of summaries to a Tensor.""" 
    41 55.754 MiB 0.000 MiB    with tf.name_scope('summaries'): 
    42 55.801 MiB 0.047 MiB     mean = tf.reduce_mean(var) 
    43 55.824 MiB 0.023 MiB     tf.scalar_summary('mean/'+name,mean) 
    44 55.824 MiB 0.000 MiB     with tf.name_scope('stddev'): 
    45 55.883 MiB 0.059 MiB      stddev = tf.sqrt(tf.reduce_sum(tf.square(var-mean))) 
    46 55.906 MiB 0.023 MiB     tf.scalar_summary('stddev/'+name,stddev) 
    47 55.969 MiB 0.062 MiB     tf.scalar_summary('max/'+name, tf.reduce_max(var)) 
    48 56.027 MiB 0.059 MiB     tf.scalar_summary('min/'+name, tf.reduce_min(var)) 
    49 56.055 MiB 0.027 MiB     tf.histogram_summary(name, var) 
    50          
    51          #declare weight variables as property 
    52 53.973 MiB -2.082 MiB   layer_name = 'rnn_layer' 
    53 53.973 MiB 0.000 MiB   with tf.name_scope(layer_name): 
    54 53.973 MiB 0.000 MiB    with tf.name_scope('U'): 
    55 54.230 MiB 0.258 MiB     self.U = U = weight_variable(self,[prev_see,num_hidden]) 
    56 54.598 MiB 0.367 MiB     variable_summaries(self,U,layer_name+'/U') 
    57 54.598 MiB 0.000 MiB    with tf.name_scope('W'): 
    58 54.691 MiB 0.094 MiB     self.W = W = weight_variable(self,[num_hidden,num_hidden]) 
    59 54.961 MiB 0.270 MiB     variable_summaries(self,W,layer_name+'/W') 
    60 54.961 MiB 0.000 MiB    with tf.name_scope('b_W'): 
    61 55.012 MiB 0.051 MiB     self.b_W = b_W = bias_variable(self,[num_hidden]) 
    62 55.316 MiB 0.305 MiB     variable_summaries(self,b_W,layer_name+'/b_W') 
    63 55.316 MiB 0.000 MiB    with tf.name_scope('V'): 
    64 55.414 MiB 0.098 MiB     self.V = V = weight_variable(self,[num_hidden,1]) 
    65 55.707 MiB 0.293 MiB     variable_summaries(self,V,layer_name+'/V') 
    66 55.707 MiB 0.000 MiB    with tf.name_scope('b_V'): 
    67 55.754 MiB 0.047 MiB     self.b_V = b_V = bias_variable(self,[1]) 
    68 56.055 MiB 0.301 MiB     variable_summaries(self,b_V,layer_name+'/b_V') 
    69 56.055 MiB 0.000 MiB   self.merged = tf.merge_all_summaries() 
    70 60.348 MiB 4.293 MiB   self.train_writer = tf.train.SummaryWriter(config.summaries_dir,sess.graph) 
    71 62.496 MiB 2.148 MiB   tf.initialize_all_variables().run(session=sess) 


Filename: rnn.py 

Line # Mem usage Increment Line Contents 
================================================ 
    82 3013.336 MiB 0.000 MiB  @profile 
    83         def inference(self): 
    84          """calculate outputs and loss for a single batch""" 
    85 3013.336 MiB 0.000 MiB   total_loss = 0.0 
    86 3013.336 MiB 0.000 MiB   outputs = [] 
    87 3022.352 MiB 9.016 MiB   for i in range(self.batch_size): 
    88 3020.441 MiB -1.910 MiB    state = tf.zeros([self.num_hidden]) 
    89 3020.441 MiB 0.000 MiB    outputs.append([]) 
    90 3020.441 MiB 0.000 MiB    loss = 0.0 
    91 3022.348 MiB 1.906 MiB    for j in range(self.num_steps): 
    92 3022.285 MiB -0.062 MiB     state, output = self.next_state(self.x[i,j,:],state) 
    93 3022.285 MiB 0.000 MiB     outputs[i].append(output) 
    94 3022.348 MiB 0.062 MiB     loss += tf.square(self.y[i,j,:]-output) 
    95 3022.352 MiB 0.004 MiB    total_loss+=loss 
    96 3022.371 MiB 0.020 MiB   return outputs, total_loss/(self.batch_size*self.num_steps) 


Filename: rnn.py 

Line # Mem usage Increment Line Contents 
================================================ 
    97 3013.336 MiB 0.000 MiB  @profile 
    98         def batch_train(self,feed_dict): 
    99          """train the network for a single batch""" 
    100 3022.371 MiB 9.035 MiB   _, loss = self.inference() 
    101 3051.781 MiB 29.410 MiB   train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss) 
    102 3149.891 MiB 98.109 MiB   summary,loss_value, _ = self.sess.run([self.merged,loss, train_step],feed_dict=feed_dict) 
    103          #self.train_writer.add_summary(summary) 
    104 3149.891 MiB 0.000 MiB   print(loss_value) 


Filename: rnn.py 

Line # Mem usage Increment Line Contents 
================================================ 
    131 1582.758 MiB 0.000 MiB @profile 
    132        def run_epoch(m,x_data,y_data): 
    133 1582.758 MiB 0.000 MiB  num_batch = ((len(x_data)-1) // m.batch_size)+1 
    134         #num_batch = 100 
    135 3149.895 MiB 1567.137 MiB  for i in range(num_batch): 
    136 3013.336 MiB -136.559 MiB   x_batch = x_data[i*m.batch_size:(i+1)*m.batch_size,:,:] 
    137 3013.336 MiB 0.000 MiB   y_batch = y_data[i*m.batch_size:(i+1)*m.batch_size,:,:] 
    138 3013.336 MiB 0.000 MiB   feed_dict = {m.x:x_batch,m.y:y_batch} 
    139 3013.336 MiB 0.000 MiB   print("%dth/%dbatches"%(i+1,num_batch)) 
    140 3149.891 MiB 136.555 MiB   m.batch_train(feed_dict) 


Filename: rnn.py 

Line # Mem usage Increment Line Contents 
================================================ 
    154 52.914 MiB 0.000 MiB @profile 
    155        def main(): 
    156 52.914 MiB 0.000 MiB  train_config = TrainConfig() 
    157 52.914 MiB 0.000 MiB  debug_config = DebugConfig() 
    158 53.059 MiB 0.145 MiB  data = np.load('processed_data.npy') 
    159 53.062 MiB 0.004 MiB  x,y = process_data(data,debug_config) 
    160 62.496 MiB 9.434 MiB  rnn_model = RNN(True,debug_config) 
    161        
    162         #training phase 
    163 3149.898 MiB 3087.402 MiB  for i in range(rnn_model.epoch): 
    164 1582.758 MiB -1567.141 MiB   print("%dth epoch"%(i+1)) 
    165 3149.898 MiB 1567.141 MiB   run_epoch(rnn_model,x,y)

Dieses Problem nicht aufgetreten war, als ich MNIST model von tensorflow Tutorial einfach ausprobiert. Es sollte also mit dem RNN-Modell in Zusammenhang stehen. Auch konnte ich dieses Problem auf Ubuntu 14.04 reproduzieren, also glaube ich nicht, dass dieses Problem von OS X Sachen verursacht wird. Vielen Dank für das Lesen.

Quelle

2016-07-25 Kyungsu Stanley Kim

ich glaube, das Problem ist, dass diese Linie

train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss)

in Ihrer batch_train Funktion auftritt, so bei jeder Iteration ein neue GradientDescentOptimizer erstellt wird. Versuchen Sie, dies direkt nach dem Definieren des Verlustes in die Init-Funktion Ihres Modells zu verschieben und verweisen Sie stattdessen auf self.train_step in Ihrer Batch_Train-Funktion.

Quelle

2016-07-25 09:49:34

Das funktionierte für mich danke! –

Tensorflow Speicherverlust mit rekurrenten neuronalen Netzwerk

Antwort

Verwandte Themen