Tensorflow memory leak with recurrent neural network

I am trying to build a simple recurrent neural network model with TensorFlow on Mac OS X. It is only a toy model and the input data does not exceed 3 MB, so it should not use much memory. However, when I run the model, memory usage grows significantly with every training batch and goes beyond 10 GB after only two iterations. I could not run it any further.
Here is the whole code.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np
from pympler import summary


class RNN():
    """The RNN model."""

    #@profile
    def inference(self):
        """calculate outputs and loss for a single batch"""
        total_loss = 0.0
        outputs = []
        for i in range(self.batch_size):
            state = self.init_state
            outputs.append([])
            loss = 0.0
            for j in range(self.num_steps):
                state, output = self.next_state(self.x[i,j,:], state)
                outputs[i].append(output)
                loss += tf.square(self.y[i,j,:] - output)
            total_loss += loss
        return outputs, total_loss/(self.batch_size*self.num_steps)

    def __init__(self, is_training, config):
        self.sess = sess = tf.Session()
        self.prev_see = prev_see = config.prev_see
        self.num_steps = num_steps = config.num_steps
        #maybe "self.num_hidden =" part could be removed
        self.num_hidden = num_hidden = config.num_hidden
        self.batch_size = config.batch_size
        self.epoch = config.epoch
        self.learning_rate = config.learning_rate
        self.summaries_dir = config.summaries_dir

        with tf.name_scope('placeholders'):
            self.x = tf.placeholder(tf.float32, [None,num_steps,config.prev_see],
                                    name='input-x')
            self.y = tf.placeholder(tf.float32, [None,num_steps,1], name='input-y')
            default_init_state = tf.zeros([num_hidden])
            self.init_state = tf.placeholder_with_default(default_init_state, [num_hidden],
                                                          name='state_placeholder')

        def weight_variable(self, shape):
            """Create a weight variable with appropriate initialization."""
            initial = tf.truncated_normal(shape, stddev=0.1)
            return tf.Variable(initial)

        def bias_variable(self, shape):
            """Create a bias variable with appropriate initialization."""
            initial = tf.constant(0.1, shape=shape)
            return tf.Variable(initial)

        def variable_summaries(self, var, name):
            """Attach a lot of summaries to a Tensor."""
            with tf.name_scope('summaries'):
                mean = tf.reduce_mean(var)
                tf.scalar_summary('mean/'+name, mean)
                with tf.name_scope('stddev'):
                    stddev = tf.sqrt(tf.reduce_sum(tf.square(var-mean)))
                tf.scalar_summary('stddev/'+name, stddev)
                tf.scalar_summary('max/'+name, tf.reduce_max(var))
                tf.scalar_summary('min/'+name, tf.reduce_min(var))
                tf.histogram_summary(name, var)

        #declare weight variables as property
        layer_name = 'rnn_layer'
        with tf.name_scope(layer_name):
            with tf.name_scope('U'):
                self.U = U = weight_variable(self, [prev_see,num_hidden])
                variable_summaries(self, U, layer_name+'/U')
            with tf.name_scope('W'):
                self.W = W = weight_variable(self, [num_hidden,num_hidden])
                variable_summaries(self, W, layer_name+'/W')
            with tf.name_scope('b_W'):
                self.b_W = b_W = bias_variable(self, [num_hidden])
                variable_summaries(self, b_W, layer_name+'/b_W')
            with tf.name_scope('V'):
                self.V = V = weight_variable(self, [num_hidden,1])
                variable_summaries(self, V, layer_name+'/V')
            with tf.name_scope('b_V'):
                self.b_V = b_V = bias_variable(self, [1])
                variable_summaries(self, b_V, layer_name+'/b_V')

        self.merged = tf.merge_all_summaries()
        self.train_writer = tf.train.SummaryWriter(config.summaries_dir, sess.graph)
        tf.initialize_all_variables().run(session=sess)
        _, self.loss = self.inference()

    def next_state(self, x, s_prev):
        """calculate next state and output"""
        x = tf.reshape(x, [1,-1])
        s_prev = tf.reshape(s_prev, [1,-1])
        s_next = tf.tanh(tf.matmul(x,self.U) + tf.matmul(s_prev,self.W) + self.b_W)
        output = tf.matmul(s_next,self.V) + self.b_V
        return s_next, output

    #@profile
    def batch_train(self, feed_dict):
        """train the network for a single batch"""
        loss = self.loss
        train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss)
        summary, loss_value, _ = self.sess.run([self.merged, loss, train_step], feed_dict=feed_dict)
        #self.train_writer.add_summary(summary)
        print(loss_value)


class TrainConfig():
    """Train Config."""
    total_steps = 245
    test_ratio = 0.3
    prev_see = 100
    num_steps = int(round((total_steps-prev_see)*(1-test_ratio)))
    num_hidden = 10
    batch_size = 5
    epoch = 3
    learning_rate = 0.1
    summaries_dir = '/Users/Kyungsu/StockPrediction/log'


class DebugConfig():
    """For debugging memory leak."""
    total_steps = 100
    test_ratio = 0.3
    prev_see = 100
    num_steps = 10
    num_hidden = 10
    batch_size = 5
    epoch = 2
    learning_rate = 0.1
    summaries_dir = '/Users/Kyungsu/StockPrediction/log'


#@profile
def run_epoch(m, x_data, y_data):
    num_batch = ((len(x_data)-1) // m.batch_size) + 1
    #num_batch = 100
    for i in range(num_batch):
        x_batch = x_data[i*m.batch_size:(i+1)*m.batch_size,:,:]
        y_batch = y_data[i*m.batch_size:(i+1)*m.batch_size,:,:]
        feed_dict = {m.x:x_batch, m.y:y_batch}
        print("%dth/%dbatches" % (i+1, num_batch))
        m.batch_train(feed_dict)


def process_data(data, config):
    data_size = len(data)
    prev_see = config.prev_see
    num_steps = config.num_steps
    x = np.zeros((data_size,num_steps,prev_see))
    y = np.zeros((data_size,num_steps,1))
    for i in range(data_size):
        for j in range(num_steps-prev_see):
            x[i,j,:] = data[i,i:i+prev_see]
            y[i,j,0] = data[i,i+prev_see]
    return x, y


#@profile
def main():
    train_config = TrainConfig()
    debug_config = DebugConfig()
    data = np.load('processed_data.npy')
    x, y = process_data(data, train_config)
    rnn_model = RNN(True, train_config)
    #training phase
    for i in range(rnn_model.epoch):
        print("%dth epoch" % (i+1))
        run_epoch(rnn_model, x, y)

main()
And below is the memory_profiler result. The strange thing is that most of the memory is allocated in the for loops (see lines 163 and 135 of the profiler output), which I think means memory is leaking. A minimal sketch of how one could check whether the graph itself keeps growing is shown after the profiler output.
Line # Mem usage Increment Line Contents
================================================
11 53.062 MiB 0.000 MiB @profile
12 def __init__(self, is_training, config):
13 53.875 MiB 0.812 MiB self.sess = sess = tf.Session()
14
15 53.875 MiB 0.000 MiB self.prev_see = prev_see = config.prev_see
16 53.875 MiB 0.000 MiB self.num_steps = num_steps = config.num_steps
17 #maybe "self.num_hidden =" part could be removed
18 53.875 MiB 0.000 MiB self.num_hidden = num_hidden = config.num_hidden
19 53.875 MiB 0.000 MiB self.batch_size = config.batch_size
20 53.875 MiB 0.000 MiB self.epoch = config.epoch
21 53.875 MiB 0.000 MiB self.learning_rate = config.learning_rate
22 53.875 MiB 0.000 MiB self.summaries_dir = config.summaries_dir
23
24 53.875 MiB 0.000 MiB with tf.name_scope('input'):
25 53.875 MiB 0.000 MiB self.x = tf.placeholder(tf.float32,[None,num_steps,config.prev_see],
26 53.957 MiB 0.082 MiB name='input-x')
27 53.973 MiB 0.016 MiB self.y = tf.placeholder(tf.float32, [None,num_steps,1],name='input-y')
28
29 55.316 MiB 1.344 MiB def weight_variable(self,shape):
30 """Create a weight variable with appropriate initialization."""
31 55.371 MiB 0.055 MiB initial = tf.truncated_normal(shape,stddev=0.1)
32 55.414 MiB 0.043 MiB return tf.Variable(initial)
33
34 55.707 MiB 0.293 MiB def bias_variable(self,shape):
35 """Create a bias variable with appropriate initialization."""
36 55.727 MiB 0.020 MiB initial = tf.constant(0.1,shape=shape)
37 55.754 MiB 0.027 MiB return tf.Variable(initial)
38
39 55.754 MiB 0.000 MiB def variable_summaries(self,var,name):
40 """Attach a lot of summaries to a Tensor."""
41 55.754 MiB 0.000 MiB with tf.name_scope('summaries'):
42 55.801 MiB 0.047 MiB mean = tf.reduce_mean(var)
43 55.824 MiB 0.023 MiB tf.scalar_summary('mean/'+name,mean)
44 55.824 MiB 0.000 MiB with tf.name_scope('stddev'):
45 55.883 MiB 0.059 MiB stddev = tf.sqrt(tf.reduce_sum(tf.square(var-mean)))
46 55.906 MiB 0.023 MiB tf.scalar_summary('stddev/'+name,stddev)
47 55.969 MiB 0.062 MiB tf.scalar_summary('max/'+name, tf.reduce_max(var))
48 56.027 MiB 0.059 MiB tf.scalar_summary('min/'+name, tf.reduce_min(var))
49 56.055 MiB 0.027 MiB tf.histogram_summary(name, var)
50
51 #declare weight variables as property
52 53.973 MiB -2.082 MiB layer_name = 'rnn_layer'
53 53.973 MiB 0.000 MiB with tf.name_scope(layer_name):
54 53.973 MiB 0.000 MiB with tf.name_scope('U'):
55 54.230 MiB 0.258 MiB self.U = U = weight_variable(self,[prev_see,num_hidden])
56 54.598 MiB 0.367 MiB variable_summaries(self,U,layer_name+'/U')
57 54.598 MiB 0.000 MiB with tf.name_scope('W'):
58 54.691 MiB 0.094 MiB self.W = W = weight_variable(self,[num_hidden,num_hidden])
59 54.961 MiB 0.270 MiB variable_summaries(self,W,layer_name+'/W')
60 54.961 MiB 0.000 MiB with tf.name_scope('b_W'):
61 55.012 MiB 0.051 MiB self.b_W = b_W = bias_variable(self,[num_hidden])
62 55.316 MiB 0.305 MiB variable_summaries(self,b_W,layer_name+'/b_W')
63 55.316 MiB 0.000 MiB with tf.name_scope('V'):
64 55.414 MiB 0.098 MiB self.V = V = weight_variable(self,[num_hidden,1])
65 55.707 MiB 0.293 MiB variable_summaries(self,V,layer_name+'/V')
66 55.707 MiB 0.000 MiB with tf.name_scope('b_V'):
67 55.754 MiB 0.047 MiB self.b_V = b_V = bias_variable(self,[1])
68 56.055 MiB 0.301 MiB variable_summaries(self,b_V,layer_name+'/b_V')
69 56.055 MiB 0.000 MiB self.merged = tf.merge_all_summaries()
70 60.348 MiB 4.293 MiB self.train_writer = tf.train.SummaryWriter(config.summaries_dir,sess.graph)
71 62.496 MiB 2.148 MiB tf.initialize_all_variables().run(session=sess)
Filename: rnn.py
Line # Mem usage Increment Line Contents
================================================
82 3013.336 MiB 0.000 MiB @profile
83 def inference(self):
84 """calculate outputs and loss for a single batch"""
85 3013.336 MiB 0.000 MiB total_loss = 0.0
86 3013.336 MiB 0.000 MiB outputs = []
87 3022.352 MiB 9.016 MiB for i in range(self.batch_size):
88 3020.441 MiB -1.910 MiB state = tf.zeros([self.num_hidden])
89 3020.441 MiB 0.000 MiB outputs.append([])
90 3020.441 MiB 0.000 MiB loss = 0.0
91 3022.348 MiB 1.906 MiB for j in range(self.num_steps):
92 3022.285 MiB -0.062 MiB state, output = self.next_state(self.x[i,j,:],state)
93 3022.285 MiB 0.000 MiB outputs[i].append(output)
94 3022.348 MiB 0.062 MiB loss += tf.square(self.y[i,j,:]-output)
95 3022.352 MiB 0.004 MiB total_loss+=loss
96 3022.371 MiB 0.020 MiB return outputs, total_loss/(self.batch_size*self.num_steps)
Filename: rnn.py
Line # Mem usage Increment Line Contents
================================================
97 3013.336 MiB 0.000 MiB @profile
98 def batch_train(self,feed_dict):
99 """train the network for a single batch"""
100 3022.371 MiB 9.035 MiB _, loss = self.inference()
101 3051.781 MiB 29.410 MiB train_step = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(loss)
102 3149.891 MiB 98.109 MiB summary,loss_value, _ = self.sess.run([self.merged,loss, train_step],feed_dict=feed_dict)
103 #self.train_writer.add_summary(summary)
104 3149.891 MiB 0.000 MiB print(loss_value)
Filename: rnn.py
Line # Mem usage Increment Line Contents
================================================
131 1582.758 MiB 0.000 MiB @profile
132 def run_epoch(m,x_data,y_data):
133 1582.758 MiB 0.000 MiB num_batch = ((len(x_data)-1) // m.batch_size)+1
134 #num_batch = 100
135 3149.895 MiB 1567.137 MiB for i in range(num_batch):
136 3013.336 MiB -136.559 MiB x_batch = x_data[i*m.batch_size:(i+1)*m.batch_size,:,:]
137 3013.336 MiB 0.000 MiB y_batch = y_data[i*m.batch_size:(i+1)*m.batch_size,:,:]
138 3013.336 MiB 0.000 MiB feed_dict = {m.x:x_batch,m.y:y_batch}
139 3013.336 MiB 0.000 MiB print("%dth/%dbatches"%(i+1,num_batch))
140 3149.891 MiB 136.555 MiB m.batch_train(feed_dict)
Filename: rnn.py
Line # Mem usage Increment Line Contents
================================================
154 52.914 MiB 0.000 MiB @profile
155 def main():
156 52.914 MiB 0.000 MiB train_config = TrainConfig()
157 52.914 MiB 0.000 MiB debug_config = DebugConfig()
158 53.059 MiB 0.145 MiB data = np.load('processed_data.npy')
159 53.062 MiB 0.004 MiB x,y = process_data(data,debug_config)
160 62.496 MiB 9.434 MiB rnn_model = RNN(True,debug_config)
161
162 #training phase
163 3149.898 MiB 3087.402 MiB for i in range(rnn_model.epoch):
164 1582.758 MiB -1567.141 MiB print("%dth epoch"%(i+1))
165 3149.898 MiB 1567.141 MiB run_epoch(rnn_model,x,y)
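To check whether it is really the graph that keeps growing (rather than tensors leaking inside a fixed graph), one thing I could do is count the operations in the default graph around each batch. This is just a minimal diagnostic sketch under that assumption; graph_op_count and its placement are hypothetical and not part of my script:

import tensorflow as tf

def graph_op_count():
    """Return the number of operations currently in the default graph."""
    return len(tf.get_default_graph().get_operations())

# Hypothetical placement inside run_epoch: if the printed number keeps rising,
# every batch is adding new nodes to the graph instead of reusing existing ones.
# before = graph_op_count()
# m.batch_train(feed_dict)
# print("ops added this batch:", graph_op_count() - before)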
This problem did not occur when I simply tried the MNIST model from the TensorFlow tutorial, so it should be related to the RNN model. I could also reproduce the problem on Ubuntu 14.04, so I don't think it is caused by anything OS X specific. Thanks for reading.
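For comparison, the MNIST tutorial builds the whole graph, including the optimizer's minimize() op, once before the training loop, and the loop itself only calls sess.run(). A minimal sketch of that pattern with made-up toy data and names (not my model, same TensorFlow 0.x API as above):

import numpy as np
import tensorflow as tf

# Build the graph once, outside the training loop.
x = tf.placeholder(tf.float32, [None, 4], name='x')
y = tf.placeholder(tf.float32, [None, 1], name='y')
w = tf.Variable(tf.truncated_normal([4, 1], stddev=0.1))
b = tf.Variable(tf.constant(0.1, shape=[1]))
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) + b - y))
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(loss)  # created once

sess = tf.Session()
tf.initialize_all_variables().run(session=sess)
# The loop only feeds data and runs the already-built ops.
for step in range(10):
    xs = np.random.rand(5, 4).astype(np.float32)
    ys = np.random.rand(5, 1).astype(np.float32)
    _, loss_value = sess.run([train_step, loss], feed_dict={x: xs, y: ys})
    print(step, loss_value)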