[Graph] Support float32/int32/int64 type for select fusions. (#148)

wwwljc · web-flow · commit 06964b359696 · 2022-04-06T11:41:14.000+08:00
diff --git a/tensorflow/core/graph/template_logicsum_base.h b/tensorflow/core/graph/template_logicsum_base.h
@@ -113,7 +113,8 @@ class TemplateLogicSumBase: public TemplateBase {
       LOG(WARNING) << "Input check failed";
       return false;
     }
-    LOG(INFO) << "Fusion template[" << name() << "] match op[" << nodes[first_key_].node->name() << "]";
+    LOG(INFO) << "Fusion template[" << name() << "] match op[" << nodes[first_key_].node->name() <<
+          "][new_name:" << name_prefix << "_" << name() << "]";
 
     Node* node_fused_logicsum = add_fused_logicsum_node(nodes, name_prefix, g, inputs, outputs);
     if (!node_fused_logicsum) {
diff --git a/tensorflow/core/graph/template_select_base.h b/tensorflow/core/graph/template_select_base.h
@@ -33,7 +33,14 @@ class TemplateSelectBase: public TemplateBase {
       std::string name_prefix, Graph* g,
       std::vector<const Edge*>& inputs,
       std::vector<std::vector<const Edge*>>& outputs) override {
-    LOG(INFO) << "Fusion template[" << name() << "] match op[" << nodes[first_key_].node->name() << "]";
+    DataType datatype = get_data_type(nodes[first_key_].node);
+    if (datatype != DT_FLOAT && datatype != DT_INT32 && datatype != DT_INT64) {
+      LOG(INFO) << "Drop fusion template[" << name() << "] match op[" << nodes[first_key_].node->DebugString() << "]";
+      return false;
+    } else {
+      LOG(INFO) << "Fusion template[" << name() << "] match op[" << nodes[first_key_].node->name() <<
+          "][new_name:" << name_prefix << "_" << name() << "]";
+    }
 
     Node* node_const_zero = add_zero_like_node(nodes, name_prefix, g, inputs, outputs);
     if (!node_const_zero) {
@@ -66,6 +73,14 @@ class TemplateSelectBase: public TemplateBase {
     return false;
   }
 
+  // Reads the "T" attribute of `node` and returns it as a DataType.
+  // Returns DT_INVALID when the attribute lookup does not succeed, so callers
+  // can reject the node instead of acting on an unset value.
+  DataType get_data_type(const Node* node) {
+    // Start from a defined value so the local is never read uninitialized.
+    DataType datatype = DT_INVALID;
+    // Status::ok() checks success directly; no temporary Status is built
+    // just to compare against.
+    if (!GetNodeAttr(node->def(), "T", &datatype).ok()) {
+      return DT_INVALID;
+    }
+    return datatype;
+  }
+
  protected:
   virtual Node* add_zero_like_node(
       std::map<std::string, MatchedNode>& nodes,
@@ -76,11 +91,21 @@ class TemplateSelectBase: public TemplateBase {
     NodeDef const_zero;
     const_zero.set_op("Const");
     const_zero.set_name(name_prefix + "_const_zero_" + name());
+
+    DataType datatype = get_data_type(nodes[first_key_].node);
     AttrValue attr_type;
-    attr_type.set_type(DT_FLOAT);
+    attr_type.set_type(datatype);
     const_zero.mutable_attr()->insert({"dtype", attr_type});
-    Tensor tensor_zero(DT_FLOAT, {});
-    tensor_zero.scalar<float>()() = 0.0;
+
+    Tensor tensor_zero(datatype, {});
+    if (datatype == DT_FLOAT) {
+      tensor_zero.scalar<float>()() = 0;
+    } else if (datatype == DT_INT32) {
+      tensor_zero.scalar<int32>()() = 0;
+    } else if (datatype == DT_INT64) {
+      tensor_zero.scalar<int64>()() = 0;
+    }
+
     AttrValue value_zero;
     tensor_zero.AsProtoTensorContent(value_zero.mutable_tensor());
     const_zero.mutable_attr()->insert({"value", value_zero});
diff --git a/tensorflow/python/kernel_tests/select_fusion_test.py b/tensorflow/python/kernel_tests/select_fusion_test.py
@@ -0,0 +1,305 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for the select/zeros_like graph fusion (float32 and int32 inputs)."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+import numpy as np
+import os
+import shutil
+# os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '2'
+
+from tensorflow.contrib import layers
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_impl
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import test
+from tensorflow.python.summary import summary
+from tensorflow.python.training import adagrad
+from tensorflow.python.ops import array_ops
+
+
+
+# run without auto-replacement of fused ops
+def runNonFuse():
+    """Train the reference model with op fusion disabled and return its loss.
+
+    Builds the network in its own graph, runs 50 Adagrad steps with
+    `do_op_fusion=False` on NumPy inputs generated from a fixed seed, and
+    returns the loss fetched on the last step. As a side effect the graph
+    is written to `./train.pbtxt`.
+    """
+    g1 = ops.Graph()
+    with g1.as_default():
+        # Graph-level seed, set before any op is created in this graph.
+        random_seed.set_random_seed(0)
+
+        n_num = 1024
+        q_num = 50
+        k_num = 50
+        c_num = 128  # c_num % split_num == 0
+        split_num = 8
+
+        # Float32 inputs; the leading (batch) dimension is left unspecified.
+        data_float32_q = array_ops.placeholder(
+                dtypes.float32, shape=(None, q_num, c_num))
+        data_float32_k = array_ops.placeholder(
+                dtypes.float32, shape=(None, k_num, c_num))
+
+        x_float32 = data_float32_q
+        y_float32 = data_float32_k
+        # int32 variable from random_uniform_initializer(0, 2), cast to bool
+        # and reshaped to [-1, q_num, k_num]; used below as the condition
+        # of array_ops.where.
+        m = variable_scope.get_variable(
+                "m_non_fuse", [split_num, n_num, q_num, k_num],
+                dtype=dtypes.int32,
+                initializer=init_ops.random_uniform_initializer(0, 2))
+        m_bool = math_ops.cast(m, dtype=dtypes.bool)
+        m_bool = array_ops.reshape(m_bool, [-1, q_num, k_num])
+        # NOTE(review): p_float32 is not read anywhere else in this function.
+        p_float32 = constant_op.constant(
+                0, shape=[split_num*n_num, q_num, k_num],
+                dtype=dtypes.float32)
+
+        with ops.name_scope('NonFuseForward') as scope:
+            with ops.device("/cpu:0"):
+
+                x_float32 = layers.fully_connected(
+                        x_float32, c_num,
+                        activation_fn=nn_ops.leaky_relu, scope="X")
+
+                y_float32 = layers.fully_connected(
+                        y_float32, c_num,
+                        activation_fn=nn_ops.leaky_relu, scope="Y")
+
+                # Split the last axis into split_num pieces and stack them
+                # along axis 0 before the batched matmul.
+                xs_float32 = array_ops.concat(
+                        array_ops.split(x_float32, split_num, axis=2), axis=0)
+                ys_float32 = array_ops.concat(
+                        array_ops.split(y_float32, split_num, axis=2), axis=0)
+                output_non_fuse_float32 = math_ops.matmul(
+                        xs_float32, ys_float32,
+                        transpose_a=False, transpose_b=True)
+
+                # Keep the matmul result where m_bool is true and use zeros
+                # elsewhere.
+                # NOTE(review): presumably this where/zeros_like pair is the
+                # pattern the select fusion template targets - confirm.
+                zero_tensor = array_ops.zeros_like(array_ops.identity(output_non_fuse_float32))
+                output_non_fuse_float32 = array_ops.where(
+                        m_bool, output_non_fuse_float32, zero_tensor)
+                # NOTE(review): zero_tensor2 is not read later in this
+                # function; presumably it only gives zero_tensor a second
+                # consumer - confirm before removing.
+                zero_tensor2 =array_ops.zeros_like(zero_tensor)
+
+                layer1_non_fuse_float32 = layers.fully_connected(
+                        output_non_fuse_float32, 40,
+                        activation_fn=nn_ops.leaky_relu)
+                layer2_non_fuse_float32 = layers.fully_connected(
+                        layer1_non_fuse_float32, 20,
+                        activation_fn=nn_ops.leaky_relu)
+                layer2_non_fuse_float32 = array_ops.reshape(
+                        layer2_non_fuse_float32, [n_num, -1])
+                layer3_non_fuse_float32 = layers.fully_connected(
+                        layer2_non_fuse_float32, 1,
+                        activation_fn=nn_ops.leaky_relu)
+                # Constant all-ones labels of shape [n_num, 1].
+                labels_non_fuse_float32 = constant_op.constant(
+                        1, shape=[n_num, 1], dtype=dtypes.float32)
+                loss_op_non_fuse_float32 = math_ops.reduce_mean(
+                        nn_impl.sigmoid_cross_entropy_with_logits(
+                            logits=layer3_non_fuse_float32,
+                            labels=labels_non_fuse_float32))
+
+        with ops.name_scope('NonFuseBackward') as scope:
+            with ops.device("/cpu:0"):
+                train_op_non_fuse_float32 = adagrad.AdagradOptimizer(
+                        learning_rate=0.0001,
+                        initial_accumulator_value=0.1).minimize(
+                                loss_op_non_fuse_float32)
+
+        init_global = variables.global_variables_initializer()
+        init_local = variables.local_variables_initializer()
+
+        # Op fusion is switched off for this reference run.
+        graph_options = config_pb2.GraphOptions(
+                optimizer_options=config_pb2.OptimizerOptions(
+                    do_op_fusion=False))
+        config = config_pb2.ConfigProto(
+                allow_soft_placement=False, graph_options=graph_options)
+        with session.Session(config=config) as sess:
+            # Write the session graph to ./train.pbtxt.
+            from tensorflow.python.framework import graph_io
+            graph_io.write_graph(sess.graph, './', 'train.pbtxt')
+
+            # Seed NumPy so the generated feed data is reproducible.
+            np.random.seed(0)
+            feed_data_q = np.random.rand(n_num, q_num, c_num)
+            feed_data_k = np.random.rand(n_num, k_num, c_num)
+
+            sess.run([init_global, init_local])
+            # 50 training steps on the same feed; the loss variable is
+            # overwritten on every step.
+            for step in range(50):
+                loss_val_non_fuse, train_op_val = sess.run(
+                        [loss_op_non_fuse_float32,
+                         train_op_non_fuse_float32],
+                        feed_dict={data_float32_q: feed_data_q,
+                                   data_float32_k: feed_data_k})
+
+            print("loss val non-fuse: %2.7f" % (loss_val_non_fuse))
+            return loss_val_non_fuse
+
+
+def runFuse():
+    """Train the model with op fusion enabled and return its loss.
+
+    Builds the network in its own graph, runs 50 Adagrad steps with
+    `do_op_fusion=True` on NumPy inputs generated from a fixed seed, and
+    returns the loss fetched on the last step. As a side effect the graph
+    is written to `./train2.pbtxt`.
+    """
+
+    g2 = ops.Graph()
+    with g2.as_default():
+        # Graph-level seed, set before any op is created in this graph.
+        random_seed.set_random_seed(0)
+
+        n_num = 1024
+        q_num = 50
+        k_num = 50
+        c_num = 128  # c_num % split_num == 0
+        split_num = 8
+
+        # Float32 inputs; the leading (batch) dimension is left unspecified.
+        data_float32_q = array_ops.placeholder(
+                dtypes.float32, shape=(None, q_num, c_num))
+        data_float32_k = array_ops.placeholder(
+                dtypes.float32, shape=(None, k_num, c_num))
+
+        x_float32 = data_float32_q
+        y_float32 = data_float32_k
+        # int32 variable from random_uniform_initializer(0, 2), cast to bool
+        # and reshaped to [-1, q_num, k_num]; used below as the condition
+        # of array_ops.where.
+        m = variable_scope.get_variable(
+                "m_fuse", [split_num, n_num, q_num, k_num], dtype=dtypes.int32,
+                initializer=init_ops.random_uniform_initializer(0, 2))
+        m_bool = math_ops.cast(m, dtype=dtypes.bool)
+        m_bool = array_ops.reshape(m_bool, [-1, q_num, k_num])
+        # NOTE(review): p_float32 is not read anywhere else in this function.
+        p_float32 = constant_op.constant(
+                0, shape=[split_num*n_num, q_num, k_num], dtype=dtypes.float32)
+
+        with ops.name_scope('FuseForward') as scope:
+            with ops.device("/cpu:0"):
+
+                x_float32 = layers.fully_connected(
+                        x_float32, c_num,
+                        activation_fn=nn_ops.leaky_relu, scope="X")
+
+                y_float32 = layers.fully_connected(
+                        y_float32, c_num,
+                        activation_fn=nn_ops.leaky_relu, scope="Y")
+
+                # Split the last axis into split_num pieces and stack them
+                # along axis 0 before the batched matmul.
+                xs_float32 = array_ops.concat(
+                        array_ops.split(x_float32, split_num, axis=2), axis=0)
+                ys_float32 = array_ops.concat(
+                        array_ops.split(y_float32, split_num, axis=2), axis=0)
+                output_fuse_float32 = math_ops.matmul(
+                        xs_float32, ys_float32,
+                        transpose_a=False, transpose_b=True)
+
+                # Keep the matmul result where m_bool is true and use zeros
+                # elsewhere.
+                # NOTE(review): presumably this where/zeros_like pair is the
+                # pattern the select fusion template rewrites - confirm.
+                zero_tensor = array_ops.zeros_like(array_ops.identity(output_fuse_float32))
+                output_fuse_float32 = array_ops.where(
+                        m_bool, output_fuse_float32, zero_tensor)
+                # NOTE(review): zero_tensor2 is not read later in this
+                # function; presumably it only gives zero_tensor a second
+                # consumer - confirm before removing.
+                zero_tensor2 = array_ops.zeros_like(zero_tensor)
+
+                layer1_fuse_float32 = layers.fully_connected(
+                        output_fuse_float32, 40,
+                        activation_fn=nn_ops.leaky_relu)
+                layer2_fuse_float32 = layers.fully_connected(
+                        layer1_fuse_float32, 20,
+                        activation_fn=nn_ops.leaky_relu)
+                layer2_fuse_float32 = array_ops.reshape(
+                        layer2_fuse_float32, [n_num, -1])
+                layer3_fuse_float32 = layers.fully_connected(
+                        layer2_fuse_float32, 1,
+                        activation_fn=nn_ops.leaky_relu)
+                # Constant all-ones labels of shape [n_num, 1].
+                labels_fuse_float32 = constant_op.constant(
+                        1, shape=[n_num, 1], dtype=dtypes.float32)
+                loss_op_fuse_float32 = math_ops.reduce_mean(
+                        nn_impl.sigmoid_cross_entropy_with_logits(
+                            logits=layer3_fuse_float32,
+                            labels=labels_fuse_float32))
+
+        with ops.name_scope('FuseBackward') as scope:
+            with ops.device("/cpu:0"):
+                train_op_fuse_float32 = adagrad.AdagradOptimizer(
+                        learning_rate=0.0001,
+                        initial_accumulator_value=0.1).minimize(
+                                loss_op_fuse_float32)
+
+        init_global = variables.global_variables_initializer()
+        init_local = variables.local_variables_initializer()
+
+        # Op fusion is switched on for this run.
+        graph_options = config_pb2.GraphOptions(
+                optimizer_options=config_pb2.OptimizerOptions(
+                    do_op_fusion=True))
+        config = config_pb2.ConfigProto(
+                allow_soft_placement=False, graph_options=graph_options)
+        with session.Session(config=config) as sess:
+            # Write the session graph to ./train2.pbtxt.
+            from tensorflow.python.framework import graph_io
+            graph_io.write_graph(sess.graph, './', 'train2.pbtxt')
+
+            # Seed NumPy so the generated feed data is reproducible.
+            np.random.seed(0)
+            feed_data_q = np.random.rand(n_num, q_num, c_num)
+            feed_data_k = np.random.rand(n_num, k_num, c_num)
+            sess.run([init_global, init_local])
+
+            # 50 training steps on the same feed; the loss variable is
+            # overwritten on every step.
+            for step in range(50):
+                loss_val_replaced, train_op_val = sess.run(
+                        [loss_op_fuse_float32, train_op_fuse_float32],
+                        feed_dict={data_float32_q: feed_data_q,
+                                   data_float32_k: feed_data_k})
+
+            print("loss val fuse: %2.7f" % loss_val_replaced)
+            return loss_val_replaced
+
+def runFuseForIntType():
+    """Evaluate an int32 where(cond, zeros, else) graph with op fusion enabled.
+
+    Builds a 2x2 int32 graph in the session's own graph: `t_then` is replaced
+    by a zeros_like tensor, `array_ops.where` selects between it and `t_else`,
+    and the selection is added to `t_out`. As a side effect the graph is
+    written to `./train_3.pbtxt`.
+
+    Returns:
+        The list returned by `sess.run([t_result])`, i.e. a single-element
+        list holding the evaluated 2x2 result.
+    """
+    graph_options = config_pb2.GraphOptions(
+            optimizer_options=config_pb2.OptimizerOptions(
+                do_op_fusion=True))
+    config = config_pb2.ConfigProto(
+            allow_soft_placement=False, graph_options=graph_options)
+
+    with session.Session(config=config) as sess:
+        with sess.graph.as_default():
+            t_cond = variables.Variable([[True, True], [False, False]], dtype=dtypes.bool)
+            t_then = variables.Variable([[11,12],[13,14]], dtype=dtypes.int32)
+            t_else = variables.Variable([[21,22],[23,24]], dtype=dtypes.int32)
+            t_out  = variables.Variable([[31,32],[33,34]], dtype=dtypes.int32)
+
+            # The "then" branch becomes an int32 zeros_like tensor. Its input
+            # is routed through reshape -> unique -> reshape first.
+            # NOTE(review): presumably so the zeros_like input has no static
+            # shape - confirm.
+            t_then = array_ops.zeros_like(array_ops.reshape(array_ops.unique(array_ops.reshape(t_then, [-1]))[0], [-1, 2]))
+            t_select = array_ops.where(
+                    t_cond, t_then, t_else)
+            t_result = t_out + t_select
+
+            init_global = variables.global_variables_initializer()
+            init_local = variables.local_variables_initializer()
+
+            # Write the session graph to ./train_3.pbtxt.
+            from tensorflow.python.framework import graph_io
+            graph_io.write_graph(sess.graph, './', 'train_3.pbtxt')
+
+            # No feed data is needed: every input is a variable with a fixed
+            # initial value.
+            sess.run([init_global, init_local])
+
+            result = sess.run([t_result, ])
+            print("result:", result)
+            return result
+
+class SelectZeroLikeFusionTest(test.TestCase):
+    """Checks graphs run with op fusion against their expected results."""
+
+    def testFusion(self):
+        """The float32 model must reach the same loss with and without fusion."""
+        loss_without_fusion = runNonFuse()
+        loss_with_fusion = runFuse()
+        self.assertAllCloseAccordingToType(loss_without_fusion, loss_with_fusion)
+
+    def testFusionForIntType(self):
+        """The int32 graph run with fusion must produce the known result."""
+        expected = [[[31, 32], [56, 58]]]
+        self.assertAllEqual(runFuseForIntType(), expected)
+
+# Hand control to the TensorFlow test runner when this file is run as a script.
+if __name__ == "__main__":
+    test.main()

Original file line number	Diff line number	Diff line change
`@@ -113,7 +113,8 @@ class TemplateLogicSumBase: public TemplateBase {`
`113`	`113`	`LOG(WARNING) << "Input check failed";`
`114`	`114`	`return false;`
`115`	`115`	`}`
`116`		`- LOG(INFO) << "Fusion template[" << name() << "] match op[" << nodes[first_key_].node->name() << "]";`
	`116`	`+ LOG(INFO) << "Fusion template[" << name() << "] match op[" << nodes[first_key_].node->name() <<`
	`117`	`+ "][new_name:" << name_prefix << "_" << name() << "]";`
`117`	`118`
`118`	`119`	`Node* node_fused_logicsum = add_fused_logicsum_node(nodes, name_prefix, g, inputs, outputs);`
`119`	`120`	`if (!node_fused_logicsum) {`