@@ -33,8 +33,8 @@ static void fixedPointQuantization(Tensor data, Tensor res, int numCenters, floa
 }
 
 /* simulate a log-based quantization for values in data. The quantized value will be in the form of
- * S*2^q For example:
- * data = [0.9, 0.7, 0.5, 0.2 , 1.1]
+ * S*2^q For example:
+ * data = [0.9, 0.7, 0.5, 0.2 , 1.1]
  * res = [1, 0.5, 0.5, 0.25, 1 ]
  *
  * @param data contains the original data.
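As a side note for reviewers: the mapping in the comment above is "snap each value to the nearest power of two". A minimal standalone sketch of that idea (the function name `logQuantize` and the fixed scale `S = 1` are illustrative assumptions, not part of this patch):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Log-based quantization sketch: each value becomes S * 2^q with
// q = round(log2(|x| / S)). With S = 1 this reproduces the example above:
// [0.9, 0.7, 0.5, 0.2, 1.1] -> [1, 0.5, 0.5, 0.25, 1].
// Zero handling is omitted for brevity (log2(0) is -inf).
std::vector<float> logQuantize(const std::vector<float>& data, float S = 1.0f) {
  std::vector<float> res;
  for(float x : data) {
    float q = std::round(std::log2(std::fabs(x) / S));
    res.push_back(std::copysign(S * std::pow(2.0f, q), x));  // keep the sign
  }
  return res;
}

int main() {
  for(float y : logQuantize({0.9f, 0.7f, 0.5f, 0.2f, 1.1f}))
    std::printf("%g ", y);  // prints: 1 0.5 0.5 0.25 1
}
```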
@@ -86,7 +86,7 @@ void ModelQuantizer::quantize(Ptr<ExpressionGraph> graph) {
   LOG(info, "Quantizing the model to {}-bits", bits_);
 
   int numElements = (int)graph->params()->vals()->size();
-  auto allocator = New<TensorAllocator>(graph->getBackend());
+  auto allocator = New<TensorAllocator>(graph->getBackend(), "ModelQuantizer::quantize");
   allocator->reserveExact(graph->params()->vals()->memory()->size());
   allocator->allocate(errorResidual_, {1, numElements});
 
@@ -99,7 +99,7 @@ void ModelQuantizer::quantize(Ptr<ExpressionGraph> graph) {
   {
     // apply error feedback mechanism
     using namespace functional;
-    Element(_1 += _2, graph->params()->vals(), errorResidual_); // add the previous error residual to the current model
+    Element(_1 += _2, graph->params()->vals(), errorResidual_);  // add the previous error residual to the current model
     errorResidual_->copyFrom(graph->params()->vals());  // set the model as the error-residual (will be updated below)
   }
 
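The pair of calls above is the error-feedback trick: the quantization error left over from the previous call is folded back into the parameters before quantizing again, so errors do not accumulate silently. A scalar sketch on flat arrays, assuming (per the "will be updated below" comment) that the residual is later set to original minus quantized further down in `quantize()`; `quantizeWithErrorFeedback` and `quantizeOne` are hypothetical names:

```cpp
#include <cstddef>
#include <vector>

// Error feedback, sketched with plain arrays instead of Marian tensors.
// 'quantizeOne' stands in for whichever per-value quantizer is configured.
void quantizeWithErrorFeedback(std::vector<float>& params,
                               std::vector<float>& residual,
                               float (*quantizeOne)(float)) {
  for(std::size_t i = 0; i < params.size(); ++i) {
    params[i] += residual[i];           // Element(_1 += _2, params, residual)
    residual[i] = params[i];            // residual <- pre-quantization model
    params[i] = quantizeOne(params[i]); // compress the parameter
    residual[i] -= params[i];           // residual <- what quantization lost
  }
}
```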
@@ -127,7 +127,7 @@ void ModelQuantizer::quantize(Ptr<ExpressionGraph> graph) {
 void ModelQuantizer::quantizeImpl(Tensor t) {
   if(!tempVar_) {
     // init the swap tensor
-    auto allocator = New<TensorAllocator>(t->getBackend());
+    auto allocator = New<TensorAllocator>(t->getBackend(), "ModelQuantizer::quantizeImpl");
     allocator->reserveExact(sizeof(float));
     allocator->allocate(tempVar_, {1, 1});
     allocators_.push_back(allocator);
@@ -136,12 +136,12 @@ void ModelQuantizer::quantizeImpl(Tensor t) {
   // init additional tensor for scaling optimization
   if(!delta_ && optSteps_ > 0) {
     int msize = (int)errorResidual_->size();
-    auto allocator = New<TensorAllocator>(errorResidual_->getBackend());
+    auto allocator = New<TensorAllocator>(errorResidual_->getBackend(), "ModelQuantizer::quantizeImpl");
     allocator->reserveExact(msize * sizeof(float));
     allocator->allocate(delta_, {1, msize});
     allocators_.push_back(allocator);
   }
-
+
   Tensor tflat = t->subtensor(0, t->size());  // flatten t for reduce
 
   float S = 0.0f;  // scaling factor S
@@ -155,7 +155,7 @@ void ModelQuantizer::quantizeImpl(Tensor t) {
   // optimize the scaling factor S
   for(int i = 0; i < optSteps_; i++) {
     Tensor q = delta_->subtensor(0, t->size());  // to store the quantized t
-
+
     // let t be the original tensor, and q be the quantized tensor, and q = S*a where S is the
     // scaling factor. we want to optimize S to minimize MSE(S*a - t) therefore, S =
     // sum(a*t)/sum(a*a) see https://www.aclweb.org/anthology/2020.ngt-1.4.pdf for more details.
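The closed form quoted in that comment follows from setting the derivative of the squared error to zero: d/dS of sum_i (S*a_i - t_i)^2 is 2 * sum_i a_i*(S*a_i - t_i) = 0, hence S = sum(a*t) / sum(a*a). A loop-based sketch of the same reduction (the code in this file does it with tensor reductions; the helper below is only illustrative):

```cpp
#include <cstddef>
#include <vector>

// Least-squares scale for a fixed quantization pattern a:
// minimizing sum_i (S*a[i] - t[i])^2 over S yields S = sum(a*t) / sum(a*a).
float optimalScale(const std::vector<float>& a, const std::vector<float>& t) {
  float num = 0.0f, den = 0.0f;
  for(std::size_t i = 0; i < a.size(); ++i) {
    num += a[i] * t[i];
    den += a[i] * a[i];
  }
  return den > 0.0f ? num / den : 0.0f;  // guard against an all-zero pattern
}
```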