diff --git a/dlrover/python/common/constants.py b/dlrover/python/common/constants.py index e34c20d4f..b3b5ae78a 100644 --- a/dlrover/python/common/constants.py +++ b/dlrover/python/common/constants.py @@ -339,8 +339,11 @@ class AscendConstants(object): class ErrorMonitorConstants(object): TYPE_INFO = "info" + TYPE_WARN = "warn" TYPE_ERROR = "error" + JOB_INSTANCE = "job" + ACTION_WORKER_CREATE = "worker_create" ACTION_STATUS_UPDATE = "status_update" ACTION_EARLY_STOP = "early_stop" @@ -353,3 +356,4 @@ class ErrorMonitorConstants(object): ACTION_RDZV_TIMEOUT = "rendezvous_timeout" ACTION_TRAINING_START = "training_start" ACTION_RESTART_TRAINING = "restart_training" + ACTION_HANG_WARN = "hang_warning" diff --git a/dlrover/python/common/global_context.py b/dlrover/python/common/global_context.py index 421128281..30c4335d1 100644 --- a/dlrover/python/common/global_context.py +++ b/dlrover/python/common/global_context.py @@ -53,6 +53,7 @@ class DefaultValues(object): SEC_TO_CHANGE_PS = 3600 # 1h SEC_TO_WAIT_FAILED_PS = 600 # 10min HANG_CPU_USAGE_RATE = 0.05 + HANG_DETECTION = 1 class Context(Singleton): @@ -95,6 +96,9 @@ def __init__(self): self.is_tfv1_ps = False self.master_port = None self.relaunch_always = False + # The strategy of 'hang detection': + # 0: log only; 1: notify; 2: with fault tolerance + self.hang_detection = DefaultValues.HANG_DETECTION def set_params_from_brain(self): self.train_speed_record_num = self.get_param_value_from_brain( diff --git a/dlrover/python/diagnosis/common/constants.py b/dlrover/python/diagnosis/common/constants.py index 259748573..0152a02a4 100644 --- a/dlrover/python/diagnosis/common/constants.py +++ b/dlrover/python/diagnosis/common/constants.py @@ -24,7 +24,7 @@ class InferenceConfigKey(object): class DiagnosisConstant(object): MASTER_DIAGNOSIS_OBSERVING_INTERVAL_SECS = 180 AGENT_PERIODICALLY_DIAGNOSIS_INTERVAL_SECS = 60 - MASTER = -1 + MASTER_INSTANCE = -1 ANY_INSTANCE = -2 LOCAL_INSTANCE = -3 
ACTION_EXPIRED_TIME_PERIOD_DEFAULT = 60 * 5 diff --git a/dlrover/python/diagnosis/common/inference_chain.py b/dlrover/python/diagnosis/common/inference_chain.py index 0bc93de1b..edfa79dbe 100644 --- a/dlrover/python/diagnosis/common/inference_chain.py +++ b/dlrover/python/diagnosis/common/inference_chain.py @@ -21,6 +21,7 @@ class InferenceName: TRAINING = "training" NODE = "node" WORKER = "worker" + ACTION = "action" class InferenceAttribute: @@ -31,9 +32,11 @@ class InferenceAttribute: class InferenceDescription: + NONE = "n/a" HANG = "hang" FAILURE = "failure" METRICS = "metrics" + EVENT = "event" @dataclass diff --git a/dlrover/python/diagnosis/inferencechain/coordinator.py b/dlrover/python/diagnosis/inferencechain/coordinator.py index f092fa95a..2186ec3de 100644 --- a/dlrover/python/diagnosis/inferencechain/coordinator.py +++ b/dlrover/python/diagnosis/inferencechain/coordinator.py @@ -10,12 +10,52 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- +import json from typing import List -from dlrover.python.diagnosis.common.diagnosis_action import DiagnosisAction -from dlrover.python.diagnosis.common.inference_chain import Inference +from dlrover.python.common.global_context import Context +from dlrover.python.diagnosis.common.diagnosis_action import ( + DiagnosisAction, + EventAction, + NoAction, +) +from dlrover.python.diagnosis.common.inference_chain import ( + Inference, + InferenceAttribute, + InferenceDescription, + InferenceName, + is_same_inference, +) + +_dlrover_ctx = Context.singleton_instance() + + +def coordinate_solutions(solutions: List[Inference]) -> DiagnosisAction: + """ + Transform solutions (of Inference) to executable diagnosis action + + Args: + solutions: solutions of Inference + Return: + diagnosis action + """ + + event_solution = Inference( + name=InferenceName.ACTION, + attribution=InferenceAttribute.IS, + description=InferenceDescription.EVENT, + ) + for solution in solutions: + # deal with event + if is_same_inference(solution, event_solution): + event_payload = solution.configs + return EventAction( + event_payload["event_type"], + event_payload["event_instance"], + event_payload["event_action"], + event_payload["event_msg"], + json.loads(event_payload["event_labels"]), + ) -def coordinate_inferences(observations: List[Inference]) -> DiagnosisAction: - return DiagnosisAction() + return NoAction() diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py deleted file mode 100644 index 3bb4add98..000000000 --- a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_training_hang_operator.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright 2024 The DLRover Authors. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import List - -from dlrover.python.diagnosis.common.constants import DiagnosisDataType -from dlrover.python.diagnosis.common.diagnosis_data import DiagnosisData -from dlrover.python.diagnosis.common.inference_chain import ( - Inference, - InferenceAttribute, - InferenceDescription, - InferenceName, - InferenceOperator, -) - -HANG_METRIC_PREFIX = "XPU_TIMER_COMMON_HANG" - - -class CheckTrainingHangOperator(InferenceOperator): - """ - CheckTrainingHangOperator is the operator to check - if training is hanged - """ - - def __init__(self, data_manager): - super().__init__(data_manager) - - def is_compatible(self, inference: Inference) -> bool: - if ( - inference.name == InferenceName.TRAINING - and inference.attribution == InferenceAttribute.ISORNOT - and inference.description == InferenceDescription.HANG - ): - return True - else: - return False - - def infer(self, inferences: List[Inference]) -> List[Inference]: - if not self.data_manager: - return [ - Inference( - name=InferenceName.TRAINING, - attribution=InferenceAttribute.NOT, - description=InferenceDescription.HANG, - ) - ] - - diagnosis_data = self._data_manager.get_data( - DiagnosisDataType.XPU_TIMER_METRIC - ) - - if diagnosis_data and self.is_hang(diagnosis_data): - return [ - Inference( - name=InferenceName.TRAINING, - attribution=InferenceAttribute.IS, - description=InferenceDescription.HANG, - ) - ] - - return [ - Inference( - name=InferenceName.TRAINING, - attribution=InferenceAttribute.NOT, - description=InferenceDescription.HANG, - ) - ] - - def is_hang(self, diagnosis_data: 
List[DiagnosisData]): - hang_metric = [] - if not diagnosis_data: - return False - - for data in diagnosis_data: - each_metric = [ - line - for line in data.data_content.splitlines() - if line.startswith(HANG_METRIC_PREFIX) - ] - hang_metric.append(each_metric) - - if len(hang_metric) > 0: - return True - # TODO: implement the judgement - return False diff --git a/dlrover/python/diagnosis/inferencechain/coordinate_solutions.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/observer/__init__.py similarity index 56% rename from dlrover/python/diagnosis/inferencechain/coordinate_solutions.py rename to dlrover/python/diagnosis/inferencechain/inferenceoperator/observer/__init__.py index 4da739934..0c742f3fa 100644 --- a/dlrover/python/diagnosis/inferencechain/coordinate_solutions.py +++ b/dlrover/python/diagnosis/inferencechain/inferenceoperator/observer/__init__.py @@ -10,20 +10,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- -from typing import List - -from dlrover.python.diagnosis.common.diagnosis_action import DiagnosisAction -from dlrover.python.diagnosis.common.inference_chain import Inference - - -def coordinate_solutions(solutions: List[Inference]) -> DiagnosisAction: - """ - Transform solutions (of Inference) to executable diagnosis action - - Args: - solutions: solutions of Inference - Return: - diagnosis action - """ - return DiagnosisAction() diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/check_failure_node_operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/observer/check_failure_node_operator.py similarity index 100% rename from dlrover/python/diagnosis/inferencechain/inferenceoperator/check_failure_node_operator.py rename to dlrover/python/diagnosis/inferencechain/inferenceoperator/observer/check_failure_node_operator.py diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/observer/check_training_hang_operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/observer/check_training_hang_operator.py new file mode 100644 index 000000000..650039f8d --- /dev/null +++ b/dlrover/python/diagnosis/inferencechain/inferenceoperator/observer/check_training_hang_operator.py @@ -0,0 +1,193 @@ +# Copyright 2024 The DLRover Authors. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import re +import sys +from typing import Dict, List, Tuple + +from dlrover.python.common.log import default_logger as logger +from dlrover.python.diagnosis.common.constants import DiagnosisDataType +from dlrover.python.diagnosis.common.diagnosis_data import DiagnosisData +from dlrover.python.diagnosis.common.inference_chain import ( + Inference, + InferenceAttribute, + InferenceDescription, + InferenceName, + InferenceOperator, +) + +HANG_METRIC_PREFIX = "XPU_TIMER_COMMON_HANG" + + +class CheckTrainingHangOperator(InferenceOperator): + """ + CheckTrainingHangOperator is the operator to check + if training is hanged + """ + + def __init__(self, data_manager): + super().__init__(data_manager) + + def is_compatible(self, inference: Inference) -> bool: + if ( + inference.name == InferenceName.TRAINING + and inference.attribution == InferenceAttribute.ISORNOT + and inference.description == InferenceDescription.HANG + ): + return True + else: + return False + + def infer(self, inferences: List[Inference]) -> List[Inference]: + if not self.data_manager: + logger.info( + "Skip training-hang inference for there is " + "no diagnosis data." + ) + return [ + Inference( + name=InferenceName.TRAINING, + attribution=InferenceAttribute.NOT, + description=InferenceDescription.HANG, + ) + ] + + diagnosis_data = self._data_manager.get_data( + DiagnosisDataType.XPU_TIMER_METRIC + ) + + if diagnosis_data and self.is_hang(diagnosis_data): + logger.warning("Training might hanged.") + return [ + Inference( + name=InferenceName.TRAINING, + attribution=InferenceAttribute.IS, + description=InferenceDescription.HANG, + ) + ] + + return [ + Inference( + name=InferenceName.TRAINING, + attribution=InferenceAttribute.NOT, + description=InferenceDescription.HANG, + ) + ] + + def is_hang(self, diagnosis_data: List[DiagnosisData]): + logger.info( + "Hang detection start using diagnosis data, " + f"data number: {len(diagnosis_data)}, " + f"data size: {sys.getsizeof(diagnosis_data)}." 
+ ) + worker_hang_metric: Dict[int, List[Tuple[int, bool]]] = {} + if not diagnosis_data: + return False + + # the format of the hang metric can refer these files: + # dlrover/python/tests/data/xpu_timer/hang + for data in diagnosis_data: + # filter hang metric + each_worker_metric = [ + line + for line in data.data_content.splitlines() + if line.startswith(HANG_METRIC_PREFIX) + ] + + # if all local rank is hanged, tag worker hang + rank_hang_size = 0 + is_worker_hang = False + for each_rank_metric in each_worker_metric: + match = re.search(r"(\d+)(?!.*\d)", each_rank_metric) + if match and match.group(0) == "1": + rank_hang_size += 1 + if rank_hang_size == len(each_worker_metric): + is_worker_hang = True + + if data.node_rank not in worker_hang_metric: + worker_hang_metric[data.node_rank] = [] + worker_hang_metric[data.node_rank].append( + (data.timestamp, is_worker_hang) + ) + + # hang detection rules: + # 1. 100% worker got hang metric + # 2. last for 5+ minutes + hang_id, hang_last = self._get_hang_overlaps(worker_hang_metric) + hang_last_threshold = self._get_hang_time_last_threshold() + if hang_id != -1 and hang_last > hang_last_threshold: + logger.info( + f"Got hang worker: {hang_id}, time last: {hang_last}, " + f"threshold: {hang_last_threshold}" + ) + return True + return False + + def _get_hang_time_last_threshold(self): + # set 5 minutes for now(second) + return 5 * 60 + + def _get_hang_overlaps( + self, worker_hang_metric: Dict[int, List[Tuple[int, bool]]] + ) -> Tuple[int, int]: + """ + Require all workers hang from latest and find the hang overlaps. + + Args: + worker_hang_metric (Dict[int, List[Tuple[int, bool]]]): Input + metric in format: node_id: [(timestamp, is_hang), ...] + + Returns: + The hang overlaps' id and time last in tuple format. 
+ """ + + worker_hang_length_min = 0 + worker_hang_id = -1 + + # find the intersection from latest + for worker_id, tuple_list in worker_hang_metric.items(): + # sorted by timestamp + tuple_list.sort(key=lambda x: x[0]) + worker_hang_length = 0 + + # ort in descending order + reversed_tuple_list = reversed(tuple_list) + for tuple_item in reversed_tuple_list: + if tuple_item[1]: + worker_hang_length += 1 + else: + break + + if worker_hang_length > 0: + if worker_hang_length_min == 0: + worker_hang_length_min = worker_hang_length + worker_hang_id = worker_id + elif worker_hang_length < worker_hang_length_min: + worker_hang_length_min = worker_hang_length + worker_hang_id = worker_id + else: + # there is normal worker + return -1, -1 + + # get the intersection's time last + if worker_hang_id != -1 and worker_hang_length_min != 0: + hang_worker_metric = worker_hang_metric[worker_hang_id] + time_last = ( + hang_worker_metric[len(hang_worker_metric) - 1][0] + - hang_worker_metric[ + len(hang_worker_metric) - worker_hang_length_min + ][0] + ) + return worker_hang_id, time_last + + return -1, -1 diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/metrics_collection_operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/observer/metrics_collection_operator.py similarity index 100% rename from dlrover/python/diagnosis/inferencechain/inferenceoperator/metrics_collection_operator.py rename to dlrover/python/diagnosis/inferencechain/inferenceoperator/observer/metrics_collection_operator.py diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/operator.py index 454f380dc..1abef5fbe 100644 --- a/dlrover/python/diagnosis/inferencechain/inferenceoperator/operator.py +++ b/dlrover/python/diagnosis/inferencechain/inferenceoperator/operator.py @@ -11,15 +11,18 @@ # See the License for the specific language governing permissions and # limitations under the 
License. -from dlrover.python.diagnosis.inferencechain.inferenceoperator.check_failure_node_operator import ( # noqa: E501 +from dlrover.python.diagnosis.inferencechain.inferenceoperator.observer.check_failure_node_operator import ( # noqa: E501 CheckFailureNodeOperator, ) -from dlrover.python.diagnosis.inferencechain.inferenceoperator.check_training_hang_operator import ( # noqa: E501 +from dlrover.python.diagnosis.inferencechain.inferenceoperator.observer.check_training_hang_operator import ( # noqa: E501 CheckTrainingHangOperator, ) -from dlrover.python.diagnosis.inferencechain.inferenceoperator.metrics_collection_operator import ( # noqa: E501 +from dlrover.python.diagnosis.inferencechain.inferenceoperator.observer.metrics_collection_operator import ( # noqa: E501 MetricsCollectionOperator, ) +from dlrover.python.diagnosis.inferencechain.inferenceoperator.resolver.resolve_training_hang_operator import ( # noqa: E501 + ResolveTrainingHangOperator, +) from dlrover.python.master.diagnosis.diagnosis_data_manager import ( DiagnosisDataManager, ) @@ -37,7 +40,19 @@ def get_worker_diagnosis_operators(): return [] -def get_master_observe_operators(data_mgr: DiagnosisDataManager = None): +def get_master_observing_operators(data_mgr: DiagnosisDataManager = None): + return [ + CheckTrainingHangOperator(data_mgr), + ] + + +def get_master_observer_operators(data_mgr: DiagnosisDataManager = None): return [ CheckTrainingHangOperator(data_mgr), ] + + +def get_master_resolver_operators(data_mgr: DiagnosisDataManager = None): + return [ + ResolveTrainingHangOperator(data_mgr), + ] diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/resolver/__init__.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/resolver/__init__.py new file mode 100644 index 000000000..0c742f3fa --- /dev/null +++ b/dlrover/python/diagnosis/inferencechain/inferenceoperator/resolver/__init__.py @@ -0,0 +1,12 @@ +# Copyright 2024 The DLRover Authors. All rights reserved. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/dlrover/python/diagnosis/inferencechain/inferenceoperator/resolver/resolve_training_hang_operator.py b/dlrover/python/diagnosis/inferencechain/inferenceoperator/resolver/resolve_training_hang_operator.py new file mode 100644 index 000000000..3371f1d29 --- /dev/null +++ b/dlrover/python/diagnosis/inferencechain/inferenceoperator/resolver/resolve_training_hang_operator.py @@ -0,0 +1,78 @@ +# Copyright 2024 The DLRover Authors. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import json +from typing import List + +from dlrover.python.common.constants import ErrorMonitorConstants +from dlrover.python.common.global_context import Context +from dlrover.python.diagnosis.common.inference_chain import ( + Inference, + InferenceAttribute, + InferenceDescription, + InferenceName, + InferenceOperator, + is_same_inference, +) + +_dlrover_ctx = Context.singleton_instance() + + +class ResolveTrainingHangOperator(InferenceOperator): + """ + ResolveTrainingHangOperator is the operator to resolve + the situation when training is hanged + """ + + def __init__(self, data_manager): + super().__init__(data_manager) + + def is_compatible(self, inference: Inference) -> bool: + if inference.name == InferenceName.TRAINING: + return True + else: + return False + + def infer(self, inferences: List[Inference]) -> List[Inference]: + hang_infer = Inference( + name=InferenceName.TRAINING, + attribution=InferenceAttribute.IS, + description=InferenceDescription.HANG, + ) + if any(is_same_inference(infer, hang_infer) for infer in inferences): + if _dlrover_ctx.hang_detection == 1: + # trigger event action + return [ + Inference( + name=InferenceName.ACTION, + attribution=InferenceAttribute.IS, + description=InferenceDescription.EVENT, + configs={ + "event_type": ErrorMonitorConstants.TYPE_WARN, + "event_instance": ErrorMonitorConstants.JOB_INSTANCE, # noqa: E501 + "event_action": ErrorMonitorConstants.ACTION_HANG_WARN, # noqa: E501 + "event_msg": "", + "event_labels": json.dumps({}), + }, + ) + ] + elif _dlrover_ctx.hang_detection == 2: + # trigger relaunch action + pass + + return [ + Inference( + name=InferenceName.ACTION, + attribution=InferenceAttribute.IS, + description=InferenceDescription.NONE, + ) + ] diff --git a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py index 2268ac2d8..ab3bedad3 100644 --- a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py +++
b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py @@ -41,7 +41,7 @@ combine_inferences, is_inference_included, ) -from dlrover.python.diagnosis.inferencechain.coordinate_solutions import ( +from dlrover.python.diagnosis.inferencechain.coordinator import ( coordinate_solutions, ) from dlrover.python.diagnosis.inferencechain.inference_chain import ( diff --git a/dlrover/python/master/args.py b/dlrover/python/master/args.py index 05bf6bd35..177c91f71 100644 --- a/dlrover/python/master/args.py +++ b/dlrover/python/master/args.py @@ -86,6 +86,13 @@ def _build_master_args_parser(): type=pos_int, help="The number of nodes", ) + parser.add_argument( + "--hang_detection", + default=1, + type=pos_int, + help="The strategy of 'hang detection', " + "0: log only; 1: notify; 2: with fault tolerance", + ) add_params(parser) return parser diff --git a/dlrover/python/master/diagnosis/diagnosis.py b/dlrover/python/master/diagnosis/diagnosis.py index 17dd073ea..9142ee7c2 100644 --- a/dlrover/python/master/diagnosis/diagnosis.py +++ b/dlrover/python/master/diagnosis/diagnosis.py @@ -11,9 +11,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import sys import threading import time +from collections import deque from datetime import datetime, timedelta +from itertools import islice from typing import Dict, List from dlrover.python.common.log import default_logger as logger @@ -29,9 +32,10 @@ InferenceChain, InferenceOperator, ) -from dlrover.python.diagnosis.inferencechain.inferenceoperator.check_training_hang_operator import ( # noqa: E501 +from dlrover.python.diagnosis.inferencechain.inferenceoperator.observer.check_training_hang_operator import ( # noqa: E501 CheckTrainingHangOperator, ) +from dlrover.python.master.node.job_context import get_job_context def has_expired(timestamp: float, time_period: int) -> bool: @@ -43,7 +47,7 @@ def has_expired(timestamp: float, time_period: int) -> bool: class DiagnosisManager: def __init__(self): self._is_observing_started = False - self._data_manager: DiagnosisDataManager = DiagnosisDataManager(600) + self._data_manager: DiagnosisDataManager = DiagnosisDataManager() self._diagnostician: Diagnostician = Diagnostician(self._data_manager) def collect_diagnosis_data(self, data: DiagnosisData): @@ -72,8 +76,8 @@ def start_observing(self): try: thread = threading.Thread( - target=self._diagnose_failures(), - name="diagnose_failures", + target=self._diagnose_failures, + name="failure_diagnosis", daemon=True, ) thread.start() @@ -94,10 +98,14 @@ def _diagnose_failures(self): if not self._is_observing_started: logger.info("Stop to diagnose failures for observing.") break + logger.info( + "Current diagnosis " + f"data size: {self._data_manager.get_data_size()}." 
+ ) observed_problems = self._diagnostician.observe_training() for problem in observed_problems: - logger.info(f"observed problems: {problem}") + logger.info(f"Observe problem in diagnosing: {problem}") root_causes = self._diagnostician.diagnose_failure(problem) for root_cause in root_causes: logger.info(f"identify root cause: {root_cause}") @@ -107,36 +115,46 @@ def _diagnose_failures(self): class DiagnosisDataManager: - def __init__(self, expire_time_period): - self.diagnosis_data: Dict[str, List[DiagnosisData]] = {} + def __init__(self, expire_time_period=600): + self._diagnosis_data: Dict[str, deque[DiagnosisData]] = {} self.expire_time_period = expire_time_period + self._job_context = get_job_context() + self._lock = threading.Lock() + + @property + def data(self): + return self._diagnosis_data def store_data(self, data: DiagnosisData): data_type = data.data_type - if data_type not in self.diagnosis_data: - logger.debug(f"{data_type} is not found in the store") - self.diagnosis_data[data_type] = [] - self.diagnosis_data[data_type].append(data) - self._clean_diagnosis_data(data_type) + with self._lock: + if data_type not in self.data: + self.data[data_type] = deque(maxlen=100000) + self.data[data_type].append(data) + self._clean_diagnosis_data(data_type) def get_data(self, data_type: str) -> List[DiagnosisData]: - if data_type not in self.diagnosis_data: - return [] - return self.diagnosis_data[data_type] + with self._lock: + if data_type not in self.data: + return [] + return list(self.data[data_type]) + + def get_data_size(self): + return sys.getsizeof(self.data) def _clean_diagnosis_data(self, data_type: str): - if data_type not in self.diagnosis_data: + if data_type not in self.data: return - data = self.diagnosis_data[data_type] + each_data = self.data[data_type] n = 0 - for d in data: + for d in each_data: if has_expired(d.timestamp, self.expire_time_period): n = n + 1 else: break - - self.diagnosis_data[data_type] = data[n:] + if n > 0: + 
self.data[data_type] = deque(islice(each_data, n, len(each_data))) class Diagnostician: diff --git a/dlrover/python/master/diagnosis/diagnosis_data_manager.py b/dlrover/python/master/diagnosis/diagnosis_data_manager.py index 6f7785ca6..992261372 100644 --- a/dlrover/python/master/diagnosis/diagnosis_data_manager.py +++ b/dlrover/python/master/diagnosis/diagnosis_data_manager.py @@ -11,7 +11,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from queue import Queue +import threading +from collections import deque from typing import Dict, List from dlrover.python.diagnosis.common.diagnosis_data import DiagnosisData @@ -28,25 +29,26 @@ def __init__(self, expire_time_period): Args: expire_time_period: data expire time period in seconds """ - self.diagnosis_data: Dict[str, Queue[DiagnosisData]] = {} + self.diagnosis_data: Dict[str, deque[DiagnosisData]] = {} self.expire_time_period = expire_time_period + self._lock = threading.Lock() def store_data(self, data: DiagnosisData): data_type = data.data_type - if data_type not in self.diagnosis_data: - self.diagnosis_data[data_type] = Queue(maxsize=100) - q = self.diagnosis_data[data_type] - if q.full(): - _ = q.get() - q.put(data) + with self._lock: + if data_type not in self.diagnosis_data: + self.diagnosis_data[data_type] = deque(maxlen=10000) + q = self.diagnosis_data[data_type] + q.append(data) def get_data(self, data_type: str) -> List[DiagnosisData]: - if data_type not in self.diagnosis_data: - return [] - q = self.diagnosis_data[data_type] - datas = [] - while not q.empty(): - data = q.get() - if not has_expired(data.timestamp, self.expire_time_period): - datas.append(data) - return datas + with self._lock: + if data_type not in self.diagnosis_data: + return [] + data_by_type = self.diagnosis_data[data_type] + result = [] + while len(data_by_type) != 0: + data = data_by_type.popleft() + if not has_expired(data.timestamp, self.expire_time_period): + 
result.append(data) + return result diff --git a/dlrover/python/master/diagnosis/diagnosis_manager.py b/dlrover/python/master/diagnosis/diagnosis_manager.py index 59be1cf7e..e638ca3d7 100644 --- a/dlrover/python/master/diagnosis/diagnosis_manager.py +++ b/dlrover/python/master/diagnosis/diagnosis_manager.py @@ -25,7 +25,7 @@ InferenceName, InferenceOperator, ) -from dlrover.python.diagnosis.inferencechain.coordinate_solutions import ( +from dlrover.python.diagnosis.inferencechain.coordinator import ( coordinate_solutions, ) from dlrover.python.diagnosis.inferencechain.inference_chain import ( @@ -34,7 +34,8 @@ combine_inferences, ) from dlrover.python.diagnosis.inferencechain.inferenceoperator.operator import ( # noqa: E501 - get_master_observe_operators, + get_master_observer_operators, + get_master_resolver_operators, ) from dlrover.python.master.diagnosis.diagnosis_data_manager import ( DiagnosisDataManager, @@ -77,8 +78,12 @@ def start_observing(self): ] self._diagnostician.register_training_problems(problems) - operators = get_master_observe_operators(self._data_manager) - self._diagnostician.register_observing_operators(operators) + self._diagnostician.register_observers( + get_master_observer_operators(self._data_manager) + ) + self._diagnostician.register_resolvers( + get_master_resolver_operators(self._data_manager) + ) try: thread = threading.Thread( @@ -106,7 +111,7 @@ def _diagnose(self): break observed_problems = self._diagnostician.observe_training() - action = self._diagnostician.diagnose_problems(observed_problems) + action = self._diagnostician.resolve_problems(observed_problems) self._job_context.enqueue_action(action) time.sleep( @@ -124,8 +129,8 @@ def __init__(self, data_manager): self._data_manager = data_manager self._pre_checks: List[Inference] = [] self._training_problems: List[Inference] = [] - self._observing_operators: List[InferenceOperator] = [] - self._diagnosis_operators: List[InferenceOperator] = [] + self._observers: 
List[InferenceOperator] = [] + self._resolvers: List[InferenceOperator] = [] def register_pre_check(self, pre_checks: List[Inference]): self._pre_checks = pre_checks @@ -133,8 +138,11 @@ def register_pre_check(self, pre_checks: List[Inference]): def register_training_problems(self, problems: List[Inference]): self._training_problems = problems - def register_observing_operators(self, operators: List[InferenceOperator]): - self._observing_operators = operators + def register_observers(self, operators: List[InferenceOperator]): + self._observers = operators + + def register_resolvers(self, operators: List[InferenceOperator]): + self._resolvers = operators def observe_training(self) -> List[Inference]: """ @@ -146,30 +154,34 @@ def observe_training(self) -> List[Inference]: if len(self._training_problems) == 0: logger.warning("No training problem is registered.") return [] - observed_problems: List[Inference] = [] + combined_problems: List[Inference] = [] for problem in self._training_problems: - ic = InferenceChain([problem], self._observing_operators) - ob_problems = ic.infer() - observed_problems = combine_inferences( - observed_problems, ob_problems + logger.info(f"Observing problem: {problem}") + ic = InferenceChain([problem], self._observers) + observed_problems = ic.infer() + combined_problems = combine_inferences( + combined_problems, observed_problems ) - return observed_problems + return combined_problems - def diagnose_problems(self, problems: List[Inference]) -> DiagnosisAction: + def resolve_problems(self, problems: List[Inference]) -> DiagnosisAction: """ Generate the diagnosis action for observed problem Args: - problems: observed problems + problems: observed(combined) problems + Return: diagnosis action """ - solutions: List[Inference] = [] + combined_solutions: List[Inference] = [] for problem in problems: - logger.info(f"observed problems: {problem}") - ic = InferenceChain([problem], self._diagnosis_operators) - sols = ic.infer() - if len(sols) > 0: 
- solutions = combine_inferences(solutions, sols) - - return coordinate_solutions(solutions) + logger.info(f"Resolving problem: {problem}") + ic = InferenceChain([problem], self._resolvers) + input_solutions = ic.infer() + if len(input_solutions) > 0: + combined_solutions = combine_inferences( + combined_solutions, input_solutions + ) + + return coordinate_solutions(combined_solutions) diff --git a/dlrover/python/master/node/dist_job_manager.py b/dlrover/python/master/node/dist_job_manager.py index 93e59e636..90647730b 100644 --- a/dlrover/python/master/node/dist_job_manager.py +++ b/dlrover/python/master/node/dist_job_manager.py @@ -36,6 +36,10 @@ from dlrover.python.common.grpc import ParallelConfig from dlrover.python.common.log import default_logger as logger from dlrover.python.common.node import Node, NodeGroupResource +from dlrover.python.diagnosis.common.constants import ( + DiagnosisActionType, + DiagnosisConstant, +) from dlrover.python.diagnosis.common.diagnosis_action import ( DiagnosisAction, NoAction, @@ -267,7 +271,7 @@ def should_early_stop(self): self._report_event( ErrorMonitorConstants.TYPE_INFO, - "job", + ErrorMonitorConstants.JOB_INSTANCE, ErrorMonitorConstants.ACTION_EARLY_STOP, "All node check failed", {"nodes": json.dumps(self._worker_manager.cur_nodes)}, @@ -294,7 +298,7 @@ def should_early_stop(self): ) self._report_event( ErrorMonitorConstants.TYPE_INFO, - "job", + ErrorMonitorConstants.JOB_INSTANCE, ErrorMonitorConstants.ACTION_EARLY_STOP, "PS OOM", {}, @@ -323,7 +327,7 @@ def should_early_stop(self): self._report_event( ErrorMonitorConstants.TYPE_INFO, - "job", + ErrorMonitorConstants.JOB_INSTANCE, ErrorMonitorConstants.ACTION_EARLY_STOP, "Pending nodes", { @@ -347,7 +351,7 @@ def should_early_stop(self): ) self._report_event( ErrorMonitorConstants.TYPE_INFO, - "job", + ErrorMonitorConstants.JOB_INSTANCE, ErrorMonitorConstants.ACTION_EARLY_STOP, "Not enough nodes", {"nodes": json.dumps(self._worker_manager.cur_nodes)}, @@ -503,7 
+507,11 @@ def _diagnose_job(self): logger.warning(e) detail_trace_back = traceback.format_exc() logger.warning(detail_trace_back) - self._process_diagnosis_action(self._job_context.next_action()) + self._process_diagnosis_action( + self._job_context.next_action( + instance=DiagnosisConstant.MASTER_INSTANCE + ) + ) time.sleep(15) def _get_dead_node_event(self, window_interval=900) -> List[NodeEvent]: @@ -678,7 +686,10 @@ def _get_pod_unique_labels(self, node: Node): } def _process_diagnosis_action(self, action: DiagnosisAction): - pass + if not action or action.action_type == DiagnosisActionType.NONE: + return + + # TODO def _process_event(self, event: NodeEvent): node_type = event.node.type @@ -1263,6 +1274,12 @@ def process_reported_node_event(self, node_event: NodeEvent): self._job_context.update_job_node(target_node) + def get_node_required_info(self): + return self._nodes_required + + def get_job_strategy(self): + return self._job_args.distribution_strategy + def create_job_manager(args: JobArgs, speed_monitor) -> DistributedJobManager: critical_worker_index = get_critical_worker_index(args) diff --git a/dlrover/python/master/servicer.py b/dlrover/python/master/servicer.py index 2e2be385a..9a1db93c6 100644 --- a/dlrover/python/master/servicer.py +++ b/dlrover/python/master/servicer.py @@ -358,7 +358,7 @@ def report(self, request, _): elif isinstance(message, grpc.NodeCheckpointState): success = self._sync_checkpoint(node_type, node_id, message) elif isinstance(message, grpc.DiagnosisReportData): - success = self._report_worker_diagnosis_data(message) + success = self._report_node_diagnosis_data(message) response.success = success return response @@ -614,7 +614,7 @@ def _sync_checkpoint( rdzv_manager = self._rdzv_managers[RendezvousName.ELASTIC_TRAINING] return rdzv_manager.sync_ckpt_nodes(node_id, message.step) - def _report_worker_diagnosis_data(self, message: grpc.DiagnosisReportData): + def _report_node_diagnosis_data(self, message: 
grpc.DiagnosisReportData): if self._diagnosis_manager: data_cls: Optional[DiagnosisData] = getattr( self._diagnosis_data_module, diff --git a/dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_all b/dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_all new file mode 100644 index 000000000..e384fb2de --- /dev/null +++ b/dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_all @@ -0,0 +1,193 @@ +# HELP exposer_transferred_bytes_total Transferred bytes to metrics services +# TYPE exposer_transferred_bytes_total counter +exposer_transferred_bytes_total 12375203 +# HELP exposer_scrapes_total Number of times metrics were scraped +# TYPE exposer_scrapes_total counter +exposer_scrapes_total 6174 +# HELP exposer_request_latencies Latencies of serving scrape requests, in microseconds +# TYPE exposer_request_latencies summary +exposer_request_latencies_count 6174 +exposer_request_latencies_sum 6043374 +exposer_request_latencies{quantile="0.5"} 888 +exposer_request_latencies{quantile="0.9"} 888 +exposer_request_latencies{quantile="0.99"} 888 +# TYPE XPU_TIMER_MM_KERNEL_AVG_LATENCY gauge +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 19063 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3719 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78409 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 3673 
+XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 3682 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 7500 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23150 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 3675 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3662 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 7267 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 3678 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 7082 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 7076 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 8104 
+XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 7587 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 3690 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 7732 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3511 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 18365 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 11184 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 17503 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23387 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 6777 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23613 
+XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 79162 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 3694 +# TYPE XPU_TIMER_MM_KERNEL_MAX_LATENCY gauge +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 
+XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3990 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 
+XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72937 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4085 +# TYPE XPU_TIMER_MM_KERNEL_P99_LATENCY gauge 
+XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4025 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3989 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 
+XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 
+XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72832 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4084 +# TYPE XPU_TIMER_MM_KERNEL_MIN_LATENCY gauge +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1183 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1257 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 77810 
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 2995 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1039 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1182 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1040 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 2991 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1037 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1039 
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1058 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1049 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1057 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 2997 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1181 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1200 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1205 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1180 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1042 
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1182 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 77646 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 2991 +# TYPE XPU_TIMER_MM_KERNEL_FLOPS gauge +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 341.3345944233395 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 333.0872448878719 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 297.7779568156592 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 412.980679183614 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 412.7927898846224 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 371.6654952875318 
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 333.9413147465587 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 412.6426014946334 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 414.2250512081811 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 365.3253468575526 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 414.2250512081811 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 366.8107237284316 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 366.655433637526 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 372.6319021584654 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 376.5763187831131 
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 412.4764489189885 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 380.7275537933207 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 414.4304717174849 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 340.2784849108911 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 342.0186789595024 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 339.7178943261944 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 333.5362100380105 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 368.8886616386404 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 330.9267993677032 
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 293.8160541599364 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 410.5513810707692 +# TYPE XPU_TIMER_COMMON_HANG gauge +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1 +# TYPE XPU_TIMER_COMMON_START_DUMP gauge +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 
+XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_END_DUMP gauge +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 
+XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_POOL_QUEUE_SIZE gauge +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 23 +# TYPE XPU_TIMER_COMMON_WORK_QUEUE_SIZE gauge +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 14 
+XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 14 diff --git a/dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_some b/dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_some new file mode 100644 index 000000000..59142aa4c --- /dev/null +++ b/dlrover/python/tests/data/xpu_timer/hang/xpu_timer_metric_some @@ -0,0 +1,193 @@ +# HELP exposer_transferred_bytes_total Transferred bytes to metrics services +# TYPE exposer_transferred_bytes_total counter +exposer_transferred_bytes_total 12375203 +# HELP exposer_scrapes_total Number of times metrics were scraped +# TYPE exposer_scrapes_total counter +exposer_scrapes_total 6174 +# HELP exposer_request_latencies Latencies of serving scrape requests, in microseconds +# TYPE exposer_request_latencies summary +exposer_request_latencies_count 6174 +exposer_request_latencies_sum 6043374 +exposer_request_latencies{quantile="0.5"} 888 +exposer_request_latencies{quantile="0.9"} 888 +exposer_request_latencies{quantile="0.99"} 888 +# TYPE XPU_TIMER_MM_KERNEL_AVG_LATENCY gauge +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 19063 
+XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3719 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78409 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 3673 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 3682 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 7500 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23150 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 3675 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3662 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 7267 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 3678 
+XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 7082 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 7076 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 8104 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 7587 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 3690 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 7732 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3511 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 18365 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 11184 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 17503 
+XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23387 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 6777 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23613 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 79162 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 3694 +# TYPE XPU_TIMER_MM_KERNEL_MAX_LATENCY gauge +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 
+XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3990 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 
+XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72937 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 
+XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4085 +# TYPE XPU_TIMER_MM_KERNEL_P99_LATENCY gauge +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4025 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 
+XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3989 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 
+XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72832 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4084 +# TYPE XPU_TIMER_MM_KERNEL_MIN_LATENCY gauge 
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1183 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1257 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 77810 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 2995 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1039 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1182 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1040 
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 2991 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1037 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1039 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1058 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1049 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1057 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 2997 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1181 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1200 
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1205 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1180 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1042 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1182 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 77646 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 2991 +# TYPE XPU_TIMER_MM_KERNEL_FLOPS gauge +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 341.3345944233395 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 333.0872448878719 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 297.7779568156592 
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 412.980679183614 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 412.7927898846224 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 371.6654952875318 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 333.9413147465587 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 412.6426014946334 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 414.2250512081811 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 365.3253468575526 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 414.2250512081811 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 366.8107237284316 
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 366.655433637526 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 372.6319021584654 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 376.5763187831131 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 412.4764489189885 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 380.7275537933207 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 414.4304717174849 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 340.2784849108911 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 342.0186789595024 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 339.7178943261944 
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 333.5362100380105 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 368.8886616386404 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 330.9267993677032 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 293.8160541599364 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 410.5513810707692 +# TYPE XPU_TIMER_COMMON_HANG gauge +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1 
+XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_START_DUMP gauge +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_END_DUMP gauge +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 
+XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_POOL_QUEUE_SIZE gauge +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 23 +# 
TYPE XPU_TIMER_COMMON_WORK_QUEUE_SIZE gauge +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 14 diff --git a/dlrover/python/tests/data/xpu_timer_metrics b/dlrover/python/tests/data/xpu_timer/normal/xpu_timer_metric_0 similarity index 100% rename from dlrover/python/tests/data/xpu_timer_metrics rename to dlrover/python/tests/data/xpu_timer/normal/xpu_timer_metric_0 diff --git a/dlrover/python/tests/data/xpu_timer/xpu_timer_metric_single b/dlrover/python/tests/data/xpu_timer/xpu_timer_metric_single new file mode 100644 index 000000000..0e646c2ab --- /dev/null +++ b/dlrover/python/tests/data/xpu_timer/xpu_timer_metric_single @@ -0,0 +1,193 @@ +# HELP exposer_transferred_bytes_total Transferred bytes to metrics services +# TYPE exposer_transferred_bytes_total counter +exposer_transferred_bytes_total 12375203 +# HELP 
exposer_scrapes_total Number of times metrics were scraped +# TYPE exposer_scrapes_total counter +exposer_scrapes_total 6174 +# HELP exposer_request_latencies Latencies of serving scrape requests, in microseconds +# TYPE exposer_request_latencies summary +exposer_request_latencies_count 6174 +exposer_request_latencies_sum 6043374 +exposer_request_latencies{quantile="0.5"} 888 +exposer_request_latencies{quantile="0.9"} 888 +exposer_request_latencies{quantile="0.99"} 888 +# TYPE XPU_TIMER_MM_KERNEL_AVG_LATENCY gauge +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 19063 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3719 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78409 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 3673 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 3682 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 7500 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23150 
+XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 3675 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3662 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 7267 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 3678 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 7082 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 7076 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 8104 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 7587 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 3690 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 7732 
+XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3511 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 18365 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 11184 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 17503 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23387 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 6777 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23613 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 79162 +XPU_TIMER_MM_KERNEL_AVG_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 3694 +# TYPE XPU_TIMER_MM_KERNEL_MAX_LATENCY gauge 
+XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3990 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 
+XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 
+XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72937 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_MAX_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4085 +# TYPE XPU_TIMER_MM_KERNEL_P99_LATENCY gauge +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 73411 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 4994 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 78856 
+XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 4011 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 4025 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 63087 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 71453 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 4035 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 3989 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 63198 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 4001 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 63342 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 63214 
+XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 63184 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 64022 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 4112 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 63349 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 3971 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 71930 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 75577 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 76229 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 72832 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 63274 
+XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 72472 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 80781 +XPU_TIMER_MM_KERNEL_P99_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 4084 +# TYPE XPU_TIMER_MM_KERNEL_MIN_LATENCY gauge +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1183 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1257 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 77810 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 2995 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1039 
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1182 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 2996 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 2992 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1040 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 2991 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1037 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 1039 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1058 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 1049 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 2996 
+XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1057 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 2997 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 1181 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 1200 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 1205 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 1180 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 1042 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 1182 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 77646 +XPU_TIMER_MM_KERNEL_MIN_LATENCY{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 2991 +# TYPE XPU_TIMER_MM_KERNEL_FLOPS 
gauge +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 341.3345944233395 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 333.0872448878719 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 297.7779568156592 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 412.980679183614 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 412.7927898846224 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 371.6654952875318 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 333.9413147465587 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 412.6426014946334 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 414.2250512081811 
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 365.3253468575526 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 414.2250512081811 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 366.8107237284316 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="3",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 366.655433637526 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 372.6319021584654 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 376.5763187831131 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 412.4764489189885 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 380.7275537933207 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 414.4304717174849 
+XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="1",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 340.2784849108911 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="6",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 342.0186789595024 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 339.7178943261944 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="4",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 333.5362100380105 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="8",local_rank="2",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 368.8886616386404 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="7",local_rank="5",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 330.9267993677032 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="6",local_rank="0",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 293.8160541599364 +XPU_TIMER_MM_KERNEL_FLOPS{dtype="bf16",ip="11.36.23.0",job_name="aistudio-155220033",level="9",local_rank="7",operation="Matmul",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 410.5513810707692 +# TYPE XPU_TIMER_COMMON_HANG gauge +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 
+XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_HANG{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_START_DUMP gauge +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 
+XPU_TIMER_COMMON_START_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_END_DUMP gauge +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 0 +XPU_TIMER_COMMON_END_DUMP{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 0 +# TYPE XPU_TIMER_COMMON_POOL_QUEUE_SIZE gauge +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 23 
+XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 23 +XPU_TIMER_COMMON_POOL_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 23 +# TYPE XPU_TIMER_COMMON_WORK_QUEUE_SIZE gauge +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="4",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="4"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="0",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="0"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="2",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="2"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="6",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="6"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="1",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="1"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="5",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="5"} 14 +XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="3",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="3"} 14 
+XPU_TIMER_COMMON_WORK_QUEUE_SIZE{ip="11.36.23.0",job_name="aistudio-155220033",local_rank="7",pod_name="aistudio-z1y0p6pq-edljob-worker-0",rank="7"} 14 diff --git a/dlrover/python/tests/test_diagnosis_coordinator.py b/dlrover/python/tests/test_diagnosis_coordinator.py new file mode 100644 index 000000000..3c3cb9fde --- /dev/null +++ b/dlrover/python/tests/test_diagnosis_coordinator.py @@ -0,0 +1,71 @@ +# Copyright 2024 The DLRover Authors. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +from dlrover.python.common.constants import ErrorMonitorConstants +from dlrover.python.diagnosis.common.constants import DiagnosisActionType +from dlrover.python.diagnosis.common.inference_chain import ( + Inference, + InferenceAttribute, + InferenceDescription, + InferenceName, +) +from dlrover.python.diagnosis.inferencechain.coordinator import ( + coordinate_solutions, +) + + +class InferenceChainTest(unittest.TestCase): + def setUp(self): + pass + + def tearDown(self): + pass + + def test_event_action(self): + test_solutions = [] + self.assertEqual( + coordinate_solutions(test_solutions).action_type, + DiagnosisActionType.NONE, + ) + + test_solutions.append( + Inference( + name=InferenceName.ACTION, + attribution=InferenceAttribute.IS, + description=InferenceDescription.EVENT, + configs={ + "event_type": ErrorMonitorConstants.TYPE_WARN, + "event_instance": ErrorMonitorConstants.JOB_INSTANCE, + "event_action": ErrorMonitorConstants.ACTION_HANG_WARN, + "event_msg": "", + "event_labels": "{}", + }, + ) + ) + action = coordinate_solutions(test_solutions) + self.assertEqual(action.action_type, DiagnosisActionType.EVENT) + self.assertEqual(action.event_type, ErrorMonitorConstants.TYPE_WARN) + self.assertEqual( + action.event_instance, ErrorMonitorConstants.JOB_INSTANCE + ) + self.assertEqual( + action.event_action, ErrorMonitorConstants.ACTION_HANG_WARN + ) + self.assertEqual(action.event_msg, "") + self.assertEqual(action.event_labels, {}) + + +if __name__ == "__main__": + unittest.main() diff --git a/dlrover/python/tests/test_diagnosis_data_collector.py b/dlrover/python/tests/test_diagnosis_data_collector.py index 7a6eb10e0..a69ea2daa 100644 --- a/dlrover/python/tests/test_diagnosis_data_collector.py +++ b/dlrover/python/tests/test_diagnosis_data_collector.py @@ -71,7 +71,7 @@ def test_xpu_timer_metric_collector(self): self.assertEqual(collector.collect_data(), "") - file = "data/xpu_timer_metrics" + file = 
"data/xpu_timer/xpu_timer_metric_single" file_path = os.path.join(os.path.dirname(__file__), file) with open(file_path, "r", encoding="utf-8") as file: test_metrics = file.read() diff --git a/dlrover/python/tests/test_diagnosis_manager.py b/dlrover/python/tests/test_diagnosis_manager.py index 5602a8142..281cb63e7 100644 --- a/dlrover/python/tests/test_diagnosis_manager.py +++ b/dlrover/python/tests/test_diagnosis_manager.py @@ -14,6 +14,7 @@ import time import unittest from typing import List +from unittest import mock from dlrover.python.diagnosis.common.constants import ( DiagnosisActionType, @@ -30,7 +31,7 @@ InferenceName, is_training_hanged, ) -from dlrover.python.diagnosis.inferencechain.inferenceoperator.check_training_hang_operator import ( # noqa: E501 +from dlrover.python.diagnosis.inferencechain.inferenceoperator.observer.check_training_hang_operator import ( # noqa: E501 CheckTrainingHangOperator, ) from dlrover.python.master.diagnosis.diagnosis_data_manager import ( @@ -83,8 +84,8 @@ def test_diagnosis_manager(self): data_mgr = DiagnosisDataManager(10000) operator = CheckTrainingHangOperator(data_mgr) - mgr._diagnostician.register_observing_operators([operator]) - self.assertEqual(len(mgr._diagnostician._observing_operators), 1) + mgr._diagnostician.register_observers([operator]) + self.assertEqual(len(mgr._diagnostician._observers), 1) data = DiagnosisData( data_type=DiagnosisDataType.XPU_TIMER_METRIC, @@ -92,10 +93,15 @@ def test_diagnosis_manager(self): ) data_mgr.store_data(data) + # mock training hang + mgr._diagnostician._observers[0].is_hang = mock.MagicMock( + return_value=True + ) + # observe training problems observed_problems = mgr._diagnostician.observe_training() self.assertTrue(is_training_hanged(observed_problems[0])) # explore solutions to observed problems - action = mgr._diagnostician.diagnose_problems(observed_problems) + action = mgr._diagnostician.resolve_problems(observed_problems) self.assertEqual(action.action_type, 
DiagnosisActionType.NONE) diff --git a/dlrover/python/tests/test_inference.py b/dlrover/python/tests/test_inference.py new file mode 100644 index 000000000..43a54f997 --- /dev/null +++ b/dlrover/python/tests/test_inference.py @@ -0,0 +1,488 @@ +# Copyright 2024 The DLRover Authors. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +from typing import Dict, List, Tuple +from unittest import mock +from unittest.mock import patch + +from dlrover.python.common import env_utils +from dlrover.python.common.constants import ( + ErrorMonitorConstants, + NodeEnv, + NodeType, +) +from dlrover.python.diagnosis.common.constants import ( + DiagnosisDataType, + EnvConfigKey, + InferenceConfigKey, +) +from dlrover.python.diagnosis.common.diagnosis_data import WorkerTrainingMetric +from dlrover.python.diagnosis.common.inference_chain import ( + Inference, + InferenceAttribute, + InferenceDescription, + InferenceName, + is_same_inference, +) +from dlrover.python.diagnosis.inferencechain.inference_chain import ( + InferenceChain, +) +from dlrover.python.diagnosis.inferencechain.inferenceoperator.observer.check_failure_node_operator import ( # noqa: E501 + CheckFailureNodeOperator, +) +from dlrover.python.diagnosis.inferencechain.inferenceoperator.observer.check_training_hang_operator import ( # noqa: E501 + CheckTrainingHangOperator, +) +from dlrover.python.diagnosis.inferencechain.inferenceoperator.observer.metrics_collection_operator 
import ( # noqa: E501 + MetricsCollectionOperator, +) +from dlrover.python.diagnosis.inferencechain.inferenceoperator.resolver.resolve_training_hang_operator import ( # noqa: E501 + ResolveTrainingHangOperator, +) +from dlrover.python.elastic_agent.master_client import ( + MasterClient, + build_master_client, +) +from dlrover.python.tests.test_utils import start_local_master + + +class InferenceChainTest(unittest.TestCase): + def setUp(self): + self._master, self._addr = start_local_master() + MasterClient._instance = build_master_client(self._addr, 1) + + def tearDown(self): + os.environ.clear() + + def test_check_training_hang_operator_find_intersection(self): + test_metric: Dict[int, List[Tuple[int, bool]]] = { + 1: [(1, True), (2, False), (3, True), (4, True), (5, True)], + 2: [(1, True), (2, True), (3, True), (4, True), (5, False)], + 3: [(1, False), (2, True), (3, True), (4, True), (5, True)], + } + operator = CheckTrainingHangOperator(None) + self.assertEqual(operator._get_hang_overlaps(test_metric), (-1, -1)) + + test_metric: Dict[int, List[Tuple[int, bool]]] = { + 1: [ + (1, True), + (2, False), + (3, True), + (4, True), + (5, True), + (6, True), + (7, True), + ], + 2: [ + (1, True), + (2, True), + (3, True), + (4, True), + (5, False), + (6, True), + (7, True), + ], + 3: [ + (1, False), + (2, True), + (3, True), + (4, True), + (5, True), + (6, True), + (7, True), + ], + } + operator = CheckTrainingHangOperator(None) + self.assertEqual(operator._get_hang_overlaps(test_metric), (2, 1)) + + test_metric: Dict[int, List[Tuple[int, bool]]] = { + 1: [ + (1, True), + (2, False), + (3, True), + (4, True), + (5, True), + (6, True), + (8, True), + ], + 2: [ + (1, True), + (2, True), + (3, True), + (4, True), + (5, False), + (6, True), + (8, True), + ], + 3: [ + (1, False), + (2, True), + (3, True), + (4, True), + (5, True), + (6, True), + (8, True), + ], + } + operator = CheckTrainingHangOperator(None) + self.assertEqual(operator._get_hang_overlaps(test_metric), (2, 
2)) + + test_metric: Dict[int, List[Tuple[int, bool]]] = { + 1: [ + (1, True), + (2, False), + (3, True), + (4, True), + (5, True), + (6, True), + (8, False), + ], + 2: [ + (1, True), + (2, True), + (3, True), + (4, True), + (5, False), + (6, True), + (8, True), + ], + 3: [ + (1, False), + (2, True), + (3, True), + (4, True), + (5, True), + (6, True), + (8, True), + ], + } + operator = CheckTrainingHangOperator(None) + self.assertEqual(operator._get_hang_overlaps(test_metric), (-1, -1)) + + def test_check_training_hang_operator_is_hang(self): + operator = CheckTrainingHangOperator(None) + operator._get_hang_time_last_threshold = mock.MagicMock(return_value=0) + + # prepare test data + normal_metric, some_abnormal_metric, all_abnormal_metric = "", "", "" + file_path = os.path.join( + os.path.dirname(__file__), + "data/xpu_timer/normal/xpu_timer_metric_0", + ) + with open(file_path, "r", encoding="utf-8") as file: + normal_metric = file.read() + file_path = os.path.join( + os.path.dirname(__file__), + "data/xpu_timer/hang/xpu_timer_metric_some", + ) + with open(file_path, "r", encoding="utf-8") as file: + some_abnormal_metric = file.read() + file_path = os.path.join( + os.path.dirname(__file__), + "data/xpu_timer/hang/xpu_timer_metric_all", + ) + with open(file_path, "r", encoding="utf-8") as file: + all_abnormal_metric = file.read() + + # test data: no worker hang + w0_t1 = WorkerTrainingMetric( + timestamp=1, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=normal_metric, + node_id=0, + node_type="worker", + node_rank=0, + ) + w0_t2 = WorkerTrainingMetric( + timestamp=2, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=normal_metric, + node_id=0, + node_type="worker", + node_rank=0, + ) + w1_t1 = WorkerTrainingMetric( + timestamp=1, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=normal_metric, + node_id=1, + node_type="worker", + node_rank=1, + ) + w1_t2 = WorkerTrainingMetric( + timestamp=2, + 
data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=normal_metric, + node_id=1, + node_type="worker", + node_rank=1, + ) + test_data = [w0_t1, w1_t1, w0_t2, w1_t2] + + self.assertFalse(operator.is_hang(test_data)) + test_data.clear() + + # test data0: 1 of 2 worker hang + w0_t1 = WorkerTrainingMetric( + timestamp=1, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=some_abnormal_metric, + node_id=0, + node_type="worker", + node_rank=0, + ) + w0_t2 = WorkerTrainingMetric( + timestamp=2, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=some_abnormal_metric, + node_id=0, + node_type="worker", + node_rank=0, + ) + w1_t1 = WorkerTrainingMetric( + timestamp=1, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=some_abnormal_metric, + node_id=1, + node_type="worker", + node_rank=1, + ) + w1_t2 = WorkerTrainingMetric( + timestamp=2, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=some_abnormal_metric, + node_id=1, + node_type="worker", + node_rank=1, + ) + test_data = [w0_t1, w1_t1, w0_t2, w1_t2] + + self.assertFalse(operator.is_hang(test_data)) + test_data.clear() + + # test data: 2 of 2 worker hang + w0_t1 = WorkerTrainingMetric( + timestamp=1, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=all_abnormal_metric, + node_id=0, + node_type="worker", + node_rank=0, + ) + w0_t2 = WorkerTrainingMetric( + timestamp=2, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=all_abnormal_metric, + node_id=0, + node_type="worker", + node_rank=0, + ) + w1_t1 = WorkerTrainingMetric( + timestamp=1, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=all_abnormal_metric, + node_id=1, + node_type="worker", + node_rank=1, + ) + w1_t2 = WorkerTrainingMetric( + timestamp=2, + data_type=DiagnosisDataType.XPU_TIMER_METRIC, + data_content=all_abnormal_metric, + node_id=1, + node_type="worker", + node_rank=1, + ) + test_data = [w0_t1, w1_t1, w0_t2, w1_t2] + + 
self.assertTrue(operator.is_hang(test_data)) + test_data.clear() + + def test_check_training_hang_operator(self): + # no data + operator = CheckTrainingHangOperator(None) + inf = Inference( + name=InferenceName.TRAINING, + attribution=InferenceAttribute.ISORNOT, + description=InferenceDescription.HANG, + ) + self.assertTrue(operator.is_compatible(inf)) + + results = operator.infer([inf]) + self.assertEqual( + results[0], + Inference( + name=InferenceName.TRAINING, + attribution=InferenceAttribute.NOT, + description=InferenceDescription.HANG, + ), + ) + + def test_check_failure_node_operator(self): + file = "data/training.log" + path = os.path.dirname(__file__) + file_path = os.path.join(path, file) + + operator = CheckFailureNodeOperator() + inf = Inference( + name=InferenceName.NODE, + attribution=InferenceAttribute.ISORNOT, + description=InferenceDescription.FAILURE, + configs={ + InferenceConfigKey.LOG_FILE: file_path, + InferenceConfigKey.ERRORS: "error code is 507035", + }, + ) + self.assertTrue(operator.is_compatible(inf)) + + results = operator.infer([inf]) + failure_inf = Inference( + name=InferenceName.NODE, + attribution=InferenceAttribute.IS, + description=InferenceDescription.FAILURE, + ) + self.assertTrue(is_same_inference(results[0], failure_inf)) + + ######################################################### + inf = Inference( + name=InferenceName.NODE, + attribution=InferenceAttribute.ISORNOT, + description=InferenceDescription.FAILURE, + configs={ + InferenceConfigKey.LOG_FILE: file_path, + InferenceConfigKey.ERRORS: "error code is 123456", + }, + ) + + results = operator.infer([inf]) + not_failure_inf = Inference( + name=InferenceName.NODE, + attribution=InferenceAttribute.NOT, + description=InferenceDescription.FAILURE, + ) + self.assertTrue(is_same_inference(results[0], not_failure_inf)) + + def test_resolve_training_hang_operator(self): + operator = ResolveTrainingHangOperator(None) + input_infers = [] + result_infers = 
operator.infer(input_infers) + self.assertEqual( + result_infers, + [ + Inference( + name=InferenceName.ACTION, + attribution=InferenceAttribute.IS, + description=InferenceDescription.NONE, + ) + ], + ) + + input_infers.append( + Inference( + name=InferenceName.NODE, + attribution=InferenceAttribute.ISORNOT, + description=InferenceDescription.FAILURE, + configs={ + InferenceConfigKey.LOG_FILE: "test", + InferenceConfigKey.ERRORS: "error code is 123456", + }, + ) + ) + result_infers = operator.infer(input_infers) + self.assertEqual( + result_infers, + [ + Inference( + name=InferenceName.ACTION, + attribution=InferenceAttribute.IS, + description=InferenceDescription.NONE, + ) + ], + ) + + input_infers.append( + Inference( + name=InferenceName.TRAINING, + attribution=InferenceAttribute.IS, + description=InferenceDescription.HANG, + ) + ) + result_infers = operator.infer(input_infers) + self.assertEqual( + result_infers, + [ + Inference( + name=InferenceName.ACTION, + attribution=InferenceAttribute.IS, + description=InferenceDescription.EVENT, + configs={ + "event_type": ErrorMonitorConstants.TYPE_WARN, + "event_instance": ErrorMonitorConstants.JOB_INSTANCE, + "event_action": ErrorMonitorConstants.ACTION_HANG_WARN, + "event_msg": "", + "event_labels": "{}", + }, + ) + ], + ) + + def test_inference_chain(self): + file = "data/training.log" + path = os.path.dirname(__file__) + file_path = os.path.join(path, file) + inf = Inference( + name=InferenceName.NODE, + attribution=InferenceAttribute.ISORNOT, + description=InferenceDescription.FAILURE, + configs={ + InferenceConfigKey.LOG_FILE: file_path, + InferenceConfigKey.ERRORS: "error code is 507035", + }, + ) + + operators = [CheckFailureNodeOperator()] + ic = InferenceChain([inf], operators) + results = ic.infer() + failure_inf = Inference( + name=InferenceName.NODE, + attribution=InferenceAttribute.IS, + description=InferenceDescription.FAILURE, + ) + self.assertTrue(is_same_inference(results[0], failure_inf)) + + @patch( 
+ "dlrover.python.diagnosis.datacollector.xpu_timer_metric_collector" + ".XpuTimerMetricsCollector.collect_data" + ) + def test_collect_metrics_operator(self, mock_collector): + mock_collector.return_value = "data" + operator = MetricsCollectionOperator() + inf = Inference( + name=InferenceName.WORKER, + attribution=InferenceAttribute.COLLECT, + description=InferenceDescription.METRICS, + ) + self.assertTrue(operator.is_compatible(inf)) + + env_utils.set_env(EnvConfigKey.XPU_TIMER_PORT, 18889) + env_utils.set_env(NodeEnv.NODE_ID, 1) + env_utils.set_env(NodeEnv.NODE_TYPE, NodeType.WORKER) + env_utils.set_env(NodeEnv.NODE_RANK, 1) + infs = operator.infer([]) + self.assertEqual(len(infs), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/dlrover/python/tests/test_inference_chain.py b/dlrover/python/tests/test_inference_chain.py deleted file mode 100644 index 61a37c160..000000000 --- a/dlrover/python/tests/test_inference_chain.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright 2024 The DLRover Authors. All rights reserved. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import unittest -from unittest.mock import patch - -from dlrover.python.common import env_utils -from dlrover.python.common.constants import NodeEnv, NodeType -from dlrover.python.diagnosis.common.constants import ( - EnvConfigKey, - InferenceConfigKey, -) -from dlrover.python.diagnosis.common.inference_chain import ( - Inference, - InferenceAttribute, - InferenceDescription, - InferenceName, - is_same_inference, -) -from dlrover.python.diagnosis.inferencechain.inference_chain import ( - InferenceChain, -) -from dlrover.python.diagnosis.inferencechain.inferenceoperator.check_failure_node_operator import ( # noqa: E501 - CheckFailureNodeOperator, -) -from dlrover.python.diagnosis.inferencechain.inferenceoperator.check_training_hang_operator import ( # noqa: E501 - CheckTrainingHangOperator, -) -from dlrover.python.diagnosis.inferencechain.inferenceoperator.metrics_collection_operator import ( # noqa: E501 - MetricsCollectionOperator, -) -from dlrover.python.elastic_agent.master_client import ( - MasterClient, - build_master_client, -) -from dlrover.python.tests.test_utils import start_local_master - - -class InferenceChainTest(unittest.TestCase): - def setUp(self): - self._master, self._addr = start_local_master() - MasterClient._instance = build_master_client(self._addr, 1) - - def tearDown(self): - os.environ.clear() - - def test_check_training_hang_operator(self): - operator = CheckTrainingHangOperator(None) - inf = Inference( - name=InferenceName.TRAINING, - attribution=InferenceAttribute.ISORNOT, - description=InferenceDescription.HANG, - ) - self.assertTrue(operator.is_compatible(inf)) - - results = operator.infer([inf]) - self.assertEqual( - results[0], - Inference( - name=InferenceName.TRAINING, - attribution=InferenceAttribute.NOT, - description=InferenceDescription.HANG, - ), - ) - - def test_check_failure_node_operator(self): - file = "data/training.log" - path = os.path.dirname(__file__) - file_path = os.path.join(path, file) - - operator = 
CheckFailureNodeOperator() - inf = Inference( - name=InferenceName.NODE, - attribution=InferenceAttribute.ISORNOT, - description=InferenceDescription.FAILURE, - configs={ - InferenceConfigKey.LOG_FILE: file_path, - InferenceConfigKey.ERRORS: "error code is 507035", - }, - ) - self.assertTrue(operator.is_compatible(inf)) - - results = operator.infer([inf]) - failure_inf = Inference( - name=InferenceName.NODE, - attribution=InferenceAttribute.IS, - description=InferenceDescription.FAILURE, - ) - self.assertTrue(is_same_inference(results[0], failure_inf)) - - ######################################################### - inf = Inference( - name=InferenceName.NODE, - attribution=InferenceAttribute.ISORNOT, - description=InferenceDescription.FAILURE, - configs={ - InferenceConfigKey.LOG_FILE: file_path, - InferenceConfigKey.ERRORS: "error code is 123456", - }, - ) - - results = operator.infer([inf]) - not_failure_inf = Inference( - name=InferenceName.NODE, - attribution=InferenceAttribute.NOT, - description=InferenceDescription.FAILURE, - ) - self.assertTrue(is_same_inference(results[0], not_failure_inf)) - - def test_inference_chain(self): - file = "data/training.log" - path = os.path.dirname(__file__) - file_path = os.path.join(path, file) - inf = Inference( - name=InferenceName.NODE, - attribution=InferenceAttribute.ISORNOT, - description=InferenceDescription.FAILURE, - configs={ - InferenceConfigKey.LOG_FILE: file_path, - InferenceConfigKey.ERRORS: "error code is 507035", - }, - ) - - operators = [CheckFailureNodeOperator()] - ic = InferenceChain([inf], operators) - results = ic.infer() - failure_inf = Inference( - name=InferenceName.NODE, - attribution=InferenceAttribute.IS, - description=InferenceDescription.FAILURE, - ) - self.assertTrue(is_same_inference(results[0], failure_inf)) - - @patch( - "dlrover.python.diagnosis.datacollector.xpu_timer_metric_collector" - ".XpuTimerMetricsCollector.collect_data" - ) - def test_collect_metrics_operator(self, mock_collector): 
- mock_collector.return_value = "data" - operator = MetricsCollectionOperator() - inf = Inference( - name=InferenceName.WORKER, - attribution=InferenceAttribute.COLLECT, - description=InferenceDescription.METRICS, - ) - self.assertTrue(operator.is_compatible(inf)) - - env_utils.set_env(EnvConfigKey.XPU_TIMER_PORT, 18889) - env_utils.set_env(NodeEnv.NODE_ID, 1) - env_utils.set_env(NodeEnv.NODE_TYPE, NodeType.WORKER) - env_utils.set_env(NodeEnv.NODE_RANK, 1) - infs = operator.infer([]) - self.assertEqual(len(infs), 0) - - -if __name__ == "__main__": - unittest.main() diff --git a/dlrover/python/tests/test_servicer.py b/dlrover/python/tests/test_servicer.py index cb9a02363..bce29407b 100644 --- a/dlrover/python/tests/test_servicer.py +++ b/dlrover/python/tests/test_servicer.py @@ -424,7 +424,7 @@ def test_sync_checkpoint(self): success = self.servicer._sync_checkpoint(NodeType.WORKER, 1, message) self.assertTrue(success) - def test_report_worker_diagnosis_data(self): + def test_report_node_diagnosis_data(self): test = WorkerTrainingMetric( data_content="test123", node_id=env_utils.get_node_id(), @@ -438,7 +438,7 @@ def test_report_worker_diagnosis_data(self): test.to_json(), test.node_rank, ) - self.assertTrue(self.servicer._report_worker_diagnosis_data(request)) + self.assertTrue(self.servicer._report_node_diagnosis_data(request)) def test_deal_with_reported_node_event(self): request = grpc.NodeEvent(node=grpc.NodeMeta())