diff --git a/Python/README.md b/Python/README.md
index 3a6fc84..61c7a98 100644
--- a/Python/README.md
+++ b/Python/README.md
@@ -99,6 +99,10 @@ Default: --dim 32 32 3
 ```
 Default: --seed 0
 ```
+12. **--profile**: enable profiling
+```
+Default: false
+```
 
 ### Newton Method
 
@@ -191,4 +195,4 @@ Memory | bsize 1024 | bsize 512| bsize 256
 10% sub-sampled Gv|7.2 GB |3.8 GB|2.1 GB
 5% sub-sampled Gv |7.2 GB |3.8 GB|2.1 GB
 1% sub-sampled Gv |7.2 GB |3.8 GB|2.1 GB
-SGD |7.2 GB|3.8 GB|2.1 GB|
\ No newline at end of file
+SGD |7.2 GB|3.8 GB|2.1 GB|
diff --git a/Python/train.py b/Python/train.py
index e538b88..b1fcc65 100644
--- a/Python/train.py
+++ b/Python/train.py
@@ -1,3 +1,5 @@
+import os
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
 import pdb
 import numpy as np
 import tensorflow as tf
@@ -8,7 +10,10 @@ from net.net import CNN
 
 from newton_cg import newton_cg
 
-from utilities import read_data, predict, ConfigClass, normalize_and_reshape
+from utilities import (
+    read_data, predict, ConfigClass, normalize_and_reshape,
+    Profiler)
+
 
 def parse_args():
     parser = argparse.ArgumentParser(description='Newton method on DNN')
@@ -93,6 +98,8 @@
                         default=[32, 32, 3], type=int)
     parser.add_argument('--seed', dest='seed', help='a nonnegative integer for \
                         reproducibility', type=int)
+    parser.add_argument('--profile', action='store_true',
+                        help='enable profiling')
 
     args = parser.parse_args()
     return args
@@ -149,7 +156,7 @@ def gradient_trainer(config, sess, network, full_batch, val_batch, saver, test_n
     log_file = open(config.log_file, 'w')
     print(config.args, file=log_file)
     sess.run(tf.compat.v1.global_variables_initializer())
-    
+
     print('-------------- initializing network by methods in He et al. (2015) --------------')
 
     param = tf.compat.v1.trainable_variables()
@@ -159,6 +166,8 @@
     best_acc = 0.0
 
     lr = config.lr
+    profiler = Profiler(config.args.profile)
+
     for epoch in range(0, args.epoch):
 
         loss_avg = 0.0
@@ -177,10 +186,13 @@
            batch_labels = np.ascontiguousarray(batch_labels)
            config.elapsed_time += time.time() - load_time
 
-            step, _, batch_loss= sess.run(
-                [global_step, optimizer, loss_with_reg],
-                feed_dict = {x: batch_input, y: batch_labels, learning_rate: lr}
-                )
+            with profiler:
+                step, _, batch_loss= sess.run(
+                    [global_step, optimizer, loss_with_reg],
+                    feed_dict = {x: batch_input, y: batch_labels, learning_rate: lr},
+                    options=profiler.run_options,
+                    run_metadata=profiler.run_metadata
+                )
 
            # print initial loss
            if epoch == 0 and i == 0:
@@ -251,8 +263,13 @@
        .format(val_acc*100, best_acc*100, total_running_time)
    print(output_str)
+    summary = profiler.summary()
+    if config.args.profile:
+        print(summary)
    if not config.screen_log_only:
        print(output_str, file=log_file)
+        if config.args.profile:
+            print(summary, file=log_file)
        log_file.close()
 
 
 def newton_trainer(config, sess, network, full_batch, val_batch, saver, test_network):
diff --git a/Python/utilities.py b/Python/utilities.py
index 4d1a58c..dfd5e98 100644
--- a/Python/utilities.py
+++ b/Python/utilities.py
@@ -1,9 +1,11 @@
-import numpy as np
 import math
-import scipy.io as sio
 import os
-import math
-import pdb
+
+import numpy as np
+import scipy.io as sio
+import tensorflow as tf
+from tensorflow.python import _pywrap_stat_summarizer
+
 
 class ConfigClass(object):
     def __init__(self, args, num_data, num_cls):
@@ -177,3 +179,34 @@
 
     return avg_loss, avg_acc, results
 
+
+class Profiler:
+    def __init__(self, is_enabled=False):
+        self._is_enabled = is_enabled
+        self.run_metadata = None
+        self._summarizer = _pywrap_stat_summarizer.StatSummarizer()
+
+        if self._is_enabled:
+            self.run_options = tf.compat.v1.RunOptions(
+                trace_level=tf.compat.v1.RunOptions.FULL_TRACE)
+        else:
+            self.run_options = None
+
+    def add_stat(self, run_metadata):
+        self._summarizer.ProcessStepStatsStr(
+            run_metadata.step_stats.SerializeToString())
+
+    def __enter__(self):
+        if self._is_enabled:
+            if self.run_metadata is not None:
+                raise RuntimeError('Recursively called')
+            self.run_metadata = tf.compat.v1.RunMetadata()
+        return self
+
+    def __exit__(self, *args, **kwargs):
+        if self._is_enabled:
+            self.add_stat(self.run_metadata)
+            self.run_metadata = None
+
+    def summary(self):
+        return self._summarizer.GetOutputString()
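Below the patch, for reference only: a minimal sketch of how the new `Profiler` is intended to be driven, mirroring the `train.py` changes above. It assumes it is run from the repository's `Python/` directory; the toy graph, the feed values, and passing `True` (the equivalent of `--profile`) are illustrative assumptions, not part of the patch.
```
import tensorflow as tf

from utilities import Profiler

# The patch targets TF1-style graph/session execution.
tf.compat.v1.disable_eager_execution()

# Illustrative toy graph; the real model is built in net/net.py.
x = tf.compat.v1.placeholder(tf.float32, shape=[None, 4])
loss = tf.reduce_sum(tf.square(x))

profiler = Profiler(True)  # True plays the role of the --profile flag

with tf.compat.v1.Session() as sess:
    for _ in range(3):
        # Each profiled sess.run collects FULL_TRACE step stats into the
        # summarizer; with profiling disabled, run_options and run_metadata
        # are None and the call behaves exactly as before.
        with profiler:
            sess.run(loss,
                     feed_dict={x: [[1., 2., 3., 4.]]},
                     options=profiler.run_options,
                     run_metadata=profiler.run_metadata)

# Aggregated per-op timing table from TensorFlow's StatSummarizer.
print(profiler.summary())
```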