From cb6e10b944e888e9c4cfc77470cc1b2b74241974 Mon Sep 17 00:00:00 2001 From: "Isaac I.Y. Saito" <130s@2000.jukuin.keio.ac.jp> Date: Sat, 18 Sep 2021 01:19:42 -0400 Subject: [PATCH 01/18] [capability] Computer monitor pkg, copied from https://github.com/PR2/pr2_robot/tree/2361bf3015c51628e3f0445f01d51087878d79a4 --- pr2_computer_monitor/CHANGELOG.rst | 50 ++ pr2_computer_monitor/CMakeLists.txt | 39 + pr2_computer_monitor/INSTALL | 5 + pr2_computer_monitor/demo/cpu_monitor.launch | 8 + pr2_computer_monitor/demo/hd_monitor.launch | 6 + pr2_computer_monitor/demo/ntp_monitor.launch | 4 + pr2_computer_monitor/package.xml | 30 + pr2_computer_monitor/scripts/README.md | 1 + pr2_computer_monitor/scripts/cpu_monitor.py | 846 ++++++++++++++++++ pr2_computer_monitor/scripts/hd_monitor.py | 386 ++++++++ pr2_computer_monitor/scripts/ntp_monitor.py | 170 ++++ pr2_computer_monitor/scripts/nvidia_temp.py | 86 ++ pr2_computer_monitor/scripts/wifi_monitor.py | 154 ++++ pr2_computer_monitor/setup.py | 11 + pr2_computer_monitor/src/network_detector.cpp | 88 ++ .../pr2_computer_monitor/nvidia_smi_util.py | 165 ++++ pr2_computer_monitor/test/parse_test.py | 126 +++ .../sample_output/nvidia_smi_high_temp.txt | 18 + .../test/sample_output/nvidia_smi_out.txt | 18 + 19 files changed, 2211 insertions(+) create mode 100644 pr2_computer_monitor/CHANGELOG.rst create mode 100644 pr2_computer_monitor/CMakeLists.txt create mode 100644 pr2_computer_monitor/INSTALL create mode 100644 pr2_computer_monitor/demo/cpu_monitor.launch create mode 100644 pr2_computer_monitor/demo/hd_monitor.launch create mode 100644 pr2_computer_monitor/demo/ntp_monitor.launch create mode 100644 pr2_computer_monitor/package.xml create mode 100644 pr2_computer_monitor/scripts/README.md create mode 100755 pr2_computer_monitor/scripts/cpu_monitor.py create mode 100755 pr2_computer_monitor/scripts/hd_monitor.py create mode 100755 pr2_computer_monitor/scripts/ntp_monitor.py create mode 100755 
pr2_computer_monitor/scripts/nvidia_temp.py create mode 100755 pr2_computer_monitor/scripts/wifi_monitor.py create mode 100644 pr2_computer_monitor/setup.py create mode 100644 pr2_computer_monitor/src/network_detector.cpp create mode 100644 pr2_computer_monitor/src/pr2_computer_monitor/nvidia_smi_util.py create mode 100755 pr2_computer_monitor/test/parse_test.py create mode 100644 pr2_computer_monitor/test/sample_output/nvidia_smi_high_temp.txt create mode 100644 pr2_computer_monitor/test/sample_output/nvidia_smi_out.txt diff --git a/pr2_computer_monitor/CHANGELOG.rst b/pr2_computer_monitor/CHANGELOG.rst new file mode 100644 index 0000000..7248f8c --- /dev/null +++ b/pr2_computer_monitor/CHANGELOG.rst @@ -0,0 +1,50 @@ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Changelog for package pr2_computer_monitor +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1.6.32 (2021-05-26) +------------------- +* Merge pull request `#268 `_ from k-okada/fix_for_noetic +* run 2to3 -w -fexcept . +* run 2to3 -w -fprint . 
+* Contributors: Kei Okada
+
+1.6.31 (2020-04-14)
+-------------------
+* add --ignore-self arg in ntp_monitor.py (`#259 `_)
+* fixed CMake errors
+* Contributors: David Feil-Seifer, Shingo Kitagawa
+
+1.6.30 (2018-04-23)
+-------------------
+* removed more tests for jenkins build
+* Contributors: David Feil-Seifer
+
+1.6.29 (2018-04-22)
+-------------------
+
+1.6.28 (2018-04-21)
+-------------------
+* made sure tests only run if CATKIN_ENABLE_TESTING is set
+* Contributors: David Feil-Seifer
+
+1.6.27 (2018-04-20)
+-------------------
+
+1.6.26 (2018-03-19)
+-------------------
+
+1.6.25 (2018-03-19)
+-------------------
+* updated packages for new maintainer
+* updated changelogs
+* Contributors: David Feil-Seifer
+
+1.6.7 (2015-02-11)
+------------------
+* Reverted changes
+* Added dependencies in catkin
+* Added catkin_package() to pr2_robot
+* Updated mainpage.dox
+* Fix binary location of network_detector
+* Contributors: Ryohei Ueda, TheDash
diff --git a/pr2_computer_monitor/CMakeLists.txt b/pr2_computer_monitor/CMakeLists.txt
new file mode 100644
index 0000000..e1d4269
--- /dev/null
+++ b/pr2_computer_monitor/CMakeLists.txt
@@ -0,0 +1,43 @@
+# Build configuration for the pr2_computer_monitor catkin package.
+# http://ros.org/doc/groovy/api/catkin/html/user_guide/supposed.html
+cmake_minimum_required(VERSION 2.8.3)
+project(pr2_computer_monitor)
+
+# roscpp and std_msgs are the catkin packages network_detector builds against.
+find_package(catkin REQUIRED COMPONENTS roscpp std_msgs)
+
+# Registers the Python package described by setup.py. catkin requires this
+# call to come before catkin_package().
+catkin_python_setup()
+
+# roscpp and std_msgs are catkin packages, so they are CATKIN_DEPENDS rather
+# than DEPENDS. No INCLUDE_DIRS/LIBRARIES are exported: this package only
+# builds an executable (network_detector) and installs scripts.
+catkin_package(
+  CATKIN_DEPENDS roscpp std_msgs
+)
+
+include_directories(${catkin_INCLUDE_DIRS})
+
+add_executable(network_detector src/network_detector.cpp)
+target_link_libraries(network_detector ${catkin_LIBRARIES})
+add_dependencies(network_detector ${catkin_EXPORTED_TARGETS} ${${PROJECT_NAME}_EXPORTED_TARGETS})
+
+if(CATKIN_ENABLE_TESTING)
+  #catkin_add_nosetests(test/parse_test.py)
+endif()
+
+install(TARGETS network_detector
+  ARCHIVE DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
+  LIBRARY DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
+  RUNTIME DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION})
+
+install(DIRECTORY demo
+  DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION})
+
+# Install only the Python scripts; globbing scripts/* would also install
+# scripts/README.md as an executable program.
+file(GLOB PYTHON_SCRIPTS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+  "${CMAKE_CURRENT_SOURCE_DIR}/scripts/*.py")
+install(PROGRAMS ${PYTHON_SCRIPTS}
+  DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION})
diff --git a/pr2_computer_monitor/INSTALL b/pr2_computer_monitor/INSTALL
new file mode 100644
index 0000000..df31116
--- /dev/null
+++ b/pr2_computer_monitor/INSTALL
@@ -0,0 +1,5 @@
+In order for ipmitool to work on computers with a BMC, the following line
+needs to appear in /etc/sudoers:
+
+ALL ALL=NOPASSWD: /usr/bin/ipmitool sdr type Temperature
+
diff --git a/pr2_computer_monitor/demo/cpu_monitor.launch b/pr2_computer_monitor/demo/cpu_monitor.launch
new file mode 100644
index 0000000..6566691
--- /dev/null
+++ b/pr2_computer_monitor/demo/cpu_monitor.launch
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
diff --git a/pr2_computer_monitor/demo/hd_monitor.launch b/pr2_computer_monitor/demo/hd_monitor.launch
new file mode 100644
index 0000000..9a73f83
--- /dev/null
+++ b/pr2_computer_monitor/demo/hd_monitor.launch
@@ -0,0 +1,6 @@
+
+
+
+
+
diff --git a/pr2_computer_monitor/demo/ntp_monitor.launch b/pr2_computer_monitor/demo/ntp_monitor.launch
new
file mode 100644
index 0000000..5fefeae
--- /dev/null
+++ b/pr2_computer_monitor/demo/ntp_monitor.launch
@@ -0,0 +1,4 @@
+
+
+
diff --git a/pr2_computer_monitor/package.xml b/pr2_computer_monitor/package.xml
new file mode 100644
index 0000000..2146f07
--- /dev/null
+++ b/pr2_computer_monitor/package.xml
@@ -0,0 +1,30 @@
+
+ pr2_computer_monitor
+ 1.6.32
+ Monitors the computer's processor and hard drives of the PR2 and publishes data to diagnostics.
+ Dave Feil-Seifer
+
+ BSD
+
+ http://www.ros.org/wiki/pr2_computer_monitor
+
+
+ Kevin Watts (watts@willowgarage.com)
+
+ catkin
+
+ diagnostic_msgs
+ pr2_msgs
+ roscpp
+ std_msgs
+
+ diagnostic_msgs
+ rospy
+ pr2_msgs
+ roscpp
+ std_msgs
+
+
+
+
diff --git a/pr2_computer_monitor/scripts/README.md b/pr2_computer_monitor/scripts/README.md
new file mode 100644
index 0000000..b004ae2
--- /dev/null
+++ b/pr2_computer_monitor/scripts/README.md
@@ -0,0 +1 @@
+This is the indigo code, and will not work correctly on a precise machine
diff --git a/pr2_computer_monitor/scripts/cpu_monitor.py b/pr2_computer_monitor/scripts/cpu_monitor.py
new file mode 100755
index 0000000..486a4aa
--- /dev/null
+++ b/pr2_computer_monitor/scripts/cpu_monitor.py
@@ -0,0 +1,880 @@
+#!/usr/bin/env python
+#
+# Software License Agreement (BSD License)
+#
+# Copyright (c) 2009, Willow Garage, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above
+#    copyright notice, this list of conditions and the following
+#    disclaimer in the documentation and/or other materials provided
+#    with the distribution.
+#  * Neither the name of the Willow Garage nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+
+##\author Kevin Watts
+
+from __future__ import print_function, with_statement
+import roslib
+roslib.load_manifest('pr2_computer_monitor')
+
+import rospy
+
+import traceback
+import threading
+from threading import Timer
+import sys, os, time
+from time import sleep
+import subprocess
+import string
+
+import socket
+
+from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
+
+##### monkey-patch to suppress threading error message in python 2.7.3
+##### See http://stackoverflow.com/questions/13193278/understand-python-threading-bug
+if sys.version_info[:3] == (2, 7, 3):
+    import threading
+    threading._DummyThread._Thread__stop = lambda x: 42
+#####
+
+# Python 3 has no ``unicode`` builtin. Alias it to ``str`` so that the
+# unicode(x).isnumeric() checks below work on both Python 2 and Python 3.
+try:
+    unicode
+except NameError:
+    unicode = str
+
+stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
+
+##\brief Parse 'sudo ipmitool sdr' output into diagnostics
+##
+## Returns (diag_vals, diag_msgs, diag_level): KeyValue list, message list
+## and the worst DiagnosticStatus level found.
+def check_ipmi():
+    diag_vals = []
+    diag_msgs = []
+    diag_level = DiagnosticStatus.OK
+
+    try:
+        p = subprocess.Popen('sudo ipmitool sdr',
+                             stdout = subprocess.PIPE,
+                             stderr = subprocess.PIPE, shell = True,
+                             universal_newlines = True)
+        stdout, stderr = p.communicate()
+        retcode = p.returncode
+
+        if retcode != 0:
+            diag_level = DiagnosticStatus.ERROR
+            diag_msgs = [ 'ipmitool Error' ]
+            diag_vals = [ KeyValue(key = 'IPMI Error', value = stderr) ]
+            return diag_vals, diag_msgs, diag_level
+
+        lines = stdout.split('\n')
+        if len(lines) < 2:
+            diag_vals = [ KeyValue(key = 'ipmitool status', value = 'No output') ]
+
+            diag_msgs = [ 'No ipmitool response' ]
+            diag_level = DiagnosticStatus.ERROR
+
+            return diag_vals, diag_msgs, diag_level
+
+        for ln in lines:
+            if len(ln) < 3:
+                continue
+
+            words = ln.split('|')
+            if len(words) < 3:
+                continue
+
+            name = words[0].strip()
+            ipmi_val = words[1].strip()
+            stat_byte = words[2].strip()
+
+            # CPU temps
+            if words[0].startswith('CPU') and words[0].strip().endswith('Temp'):
+                if words[1].strip().endswith('degrees C'):
+                    tmp = ipmi_val.rstrip(' degrees C').lstrip()
+                    if unicode(tmp).isnumeric():
+                        temperature = float(tmp)
+                        diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
+
+                        cpu_name = name.split()[0]
+                        if temperature >= 80 and temperature < 89:
+                            diag_level = max(diag_level, DiagnosticStatus.WARN)
+                            if diag_msgs.count('CPU Hot') == 0:
+                                diag_msgs.append('CPU Warm')
+
+                        if temperature >= 89: # CPU should shut down here
+                            diag_level = max(diag_level, DiagnosticStatus.ERROR)
+                            diag_msgs.append('CPU Hot')
+                            # Don't keep CPU Warm in list if CPU is hot
+                            if diag_msgs.count('CPU Warm') > 0:
+                                idx = diag_msgs.index('CPU Warm')
+                                diag_msgs.pop(idx)
+                else:
+                    diag_vals.append(KeyValue(key = name, value = words[1]))
+
+
+            # MP, BP, FP temps
+            if name == 'MB Temp' or name == 'BP Temp' or name == 'FP Temp':
+                if ipmi_val.endswith('degrees C'):
+                    tmp = ipmi_val.rstrip(' degrees C').lstrip()
+                    diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
+                    # Give temp warning
+                    dev_name = name.split()[0]
+                    if unicode(tmp).isnumeric():
+                        temperature = float(tmp)
+
+                        if temperature >= 60 and temperature < 75:
+                            diag_level = max(diag_level, DiagnosticStatus.WARN)
+                            diag_msgs.append('%s Warm' % dev_name)
+
+                        if temperature >= 75:
+                            diag_level = max(diag_level, DiagnosticStatus.ERROR)
+                            diag_msgs.append('%s Hot' % dev_name)
+                    else:
+                        diag_level = max(diag_level, DiagnosticStatus.ERROR)
+                        diag_msgs.append('%s Error' % dev_name)
+                else:
+                    diag_vals.append(KeyValue(key = name, value = ipmi_val))
+
+            # CPU fan speeds
+            if (name.startswith('CPU') and name.endswith('Fan')) or name == 'MB Fan':
+                if ipmi_val.endswith('RPM'):
+                    rpm = ipmi_val.rstrip(' RPM').lstrip()
+                    if unicode(rpm).isnumeric():
+                        if int(rpm) == 0:
+                            diag_level = max(diag_level, DiagnosticStatus.ERROR)
+                            diag_msgs.append('CPU Fan Off')
+
+                        diag_vals.append(KeyValue(key = name + ' RPM', value = rpm))
+                    else:
+                        diag_vals.append(KeyValue(key = name, value = ipmi_val))
+
+            # If CPU is hot we get an alarm from ipmitool, report that too
+            # CPU should shut down if we get a hot alarm, so report as error
+            if name.startswith('CPU') and name.endswith('hot'):
+                if ipmi_val == '0x01':
+                    diag_vals.append(KeyValue(key = name, value = 'OK'))
+                else:
+                    diag_vals.append(KeyValue(key = name, value = 'Hot'))
+                    diag_level = max(diag_level, DiagnosticStatus.ERROR)
+                    diag_msgs.append('CPU Hot Alarm')
+
+    except Exception as e:
+        diag_vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
+        diag_level = DiagnosticStatus.ERROR
+        diag_msgs.append('Exception')
+
+    return diag_vals, diag_msgs, diag_level
+
+
+##\brief Check CPU core temps
+##
+## Use 'find /sys -name temp1_input' to find cores
+## Read from every core, divide by 1000
+## Returns (diag_vals, diag_msgs, diag_level)
+def check_core_temps(sys_temp_strings):
+    diag_vals = []
+    diag_level = 0
+    diag_msgs = []
+
+    for index, temp_str in enumerate(sys_temp_strings):
+        if len(temp_str) < 5:
+            continue
+
+        cmd = 'cat %s' % temp_str
+        p = subprocess.Popen(cmd, stdout = subprocess.PIPE,
+                             stderr = subprocess.PIPE, shell = True,
+                             universal_newlines = True)
+        stdout, stderr = p.communicate()
+        retcode = p.returncode
+
+        if retcode != 0:
+            diag_level = DiagnosticStatus.ERROR
+            diag_msgs = [ 'Core Temp Error' ]
+            diag_vals = [ KeyValue(key = 'Core Temp Error', value = stderr),
+                          KeyValue(key = 'Output', value = stdout) ]
+            return diag_vals, diag_msgs, diag_level
+
+        tmp = stdout.strip()
+        if unicode(tmp).isnumeric():
+            temp = float(tmp) / 1000
+            diag_vals.append(KeyValue(key = 'Core %d Temp' % index, value = str(temp)))
+
+            if temp >= 85 and temp < 90:
+                diag_level = max(diag_level, DiagnosticStatus.WARN)
+                diag_msgs.append('Warm')
+            if temp >= 90:
+                diag_level = max(diag_level, DiagnosticStatus.ERROR)
+                diag_msgs.append('Hot')
+        else:
+            diag_level = max(diag_level, DiagnosticStatus.ERROR) # Error if not numeric value
+            diag_vals.append(KeyValue(key = 'Core %s Temp' % index, value = tmp))
+
+    return diag_vals, diag_msgs, diag_level
+
+##\brief Check per-core clock speed as reported by /proc/cpuinfo
+##
+## The level is forced to OK when enforce_speed is False.
+## Returns (vals, msgs, lvl)
+def check_clock_speed(enforce_speed):
+    vals = []
+    msgs = []
+    lvl = DiagnosticStatus.OK
+
+    try:
+        p = subprocess.Popen('cat /proc/cpuinfo | grep MHz',
+                             stdout = subprocess.PIPE,
+                             stderr = subprocess.PIPE, shell = True,
+                             universal_newlines = True)
+        stdout, stderr = p.communicate()
+        retcode = p.returncode
+
+        if retcode != 0:
+            lvl = DiagnosticStatus.ERROR
+            msgs = [ 'Clock speed error' ]
+            vals = [ KeyValue(key = 'Clock speed error', value = stderr),
+                     KeyValue(key = 'Output', value = stdout) ]
+
+            return (vals, msgs, lvl)
+
+        for index, ln in enumerate(stdout.split('\n')):
+            words = ln.split(':')
+            if len(words) < 2:
+                continue
+
+            speed = words[1].strip().split('.')[0] # Conversion to float doesn't work with decimal
+            vals.append(KeyValue(key = 'Core %d MHz' % index, value = speed))
+            if unicode(speed).isnumeric():
+                mhz = float(speed)
+
+                if mhz < 2240 and mhz > 2150:
+                    lvl = max(lvl, DiagnosticStatus.WARN)
+                if mhz <= 2150:
+                    lvl = max(lvl, DiagnosticStatus.ERROR)
+            else:
+                # Automatically give error if speed isn't a number
+                lvl = max(lvl, DiagnosticStatus.ERROR)
+
+        if not enforce_speed:
+            lvl = DiagnosticStatus.OK
+
+        if lvl == DiagnosticStatus.WARN and enforce_speed:
+            msgs = [ 'Core slowing' ]
+        elif lvl == DiagnosticStatus.ERROR and enforce_speed:
+            msgs = [ 'Core throttled' ]
+
+    except Exception as e:
+        rospy.logerr(traceback.format_exc())
+        lvl = DiagnosticStatus.ERROR
+        msgs.append('Exception')
+        vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
+
+    return vals, msgs, lvl
+
+
+##\brief Uses 'uptime' to see load average
+##
+## Returns (level, message, vals) on every path
+def check_uptime(load1_threshold, load5_threshold):
+    level = DiagnosticStatus.OK
+    vals = []
+
+    load_dict = { 0: 'OK', 1: 'High Load', 2: 'Very High Load' }
+
+    try:
+        p = subprocess.Popen('uptime', stdout = subprocess.PIPE,
+                             stderr = subprocess.PIPE, shell = True,
+                             universal_newlines = True)
+        stdout, stderr = p.communicate()
+        retcode = p.returncode
+
+        if retcode != 0:
+            vals.append(KeyValue(key = 'uptime Failed', value = stderr))
+            return DiagnosticStatus.ERROR, 'uptime Failed', vals
+
+        upvals = stdout.split()
+        load1 = upvals[-3].rstrip(',')
+        load5 = upvals[-2].rstrip(',')
+        load15 = upvals[-1]
+        num_users = upvals[-7]
+
+        # Give warning if we go over load limit
+        if float(load1) > load1_threshold or float(load5) > load5_threshold:
+            level = DiagnosticStatus.WARN
+
+        vals.append(KeyValue(key = 'Load Average Status', value = load_dict[level]))
+        vals.append(KeyValue(key = '1 min Load Average', value = load1))
+        vals.append(KeyValue(key = '1 min Load Average Threshold', value = str(load1_threshold)))
+        vals.append(KeyValue(key = '5 min Load Average', value = load5))
+        vals.append(KeyValue(key = '5 min Load Average Threshold', value = str(load5_threshold)))
+        vals.append(KeyValue(key = '15 min Load Average', value = load15))
+        vals.append(KeyValue(key = 'Number of Users', value = num_users))
+
+    except Exception as e:
+        rospy.logerr(traceback.format_exc())
+        level = DiagnosticStatus.ERROR
+        vals.append(KeyValue(key = 'Load Average Status', value = traceback.format_exc()))
+
+    return level, load_dict[level], vals
+
+##\brief Uses 'free -m' to check free memory
+##
+## Returns (level, message, values) on every path
+def check_memory():
+    values = []
+    level = DiagnosticStatus.OK
+    msg = ''
+
+    mem_dict = { 0: 'OK', 1: 'Low Memory', 2: 'Very Low Memory' }
+
+    try:
+        p = subprocess.Popen('free -m',
+                             stdout = subprocess.PIPE,
+                             stderr = subprocess.PIPE, shell = True,
+                             universal_newlines = True)
+        stdout, stderr = p.communicate()
+        retcode = p.returncode
+
+        if retcode != 0:
+            values.append(KeyValue(key = "\"free -m\" Call Error", value = str(retcode)))
+            return DiagnosticStatus.ERROR, 'Memory Usage Check Error', values
+
+        rows = stdout.split('\n')
+        data = rows[1].split()
+        total_mem = data[1]
+        used_mem = data[2]
+        free_mem = data[3]
+
+        level = DiagnosticStatus.OK
+        if float(free_mem) < 25:
+            level = DiagnosticStatus.WARN
+        if float(free_mem) < 1:
+            level = DiagnosticStatus.ERROR
+
+        values.append(KeyValue(key = 'Memory Status', value = mem_dict[level]))
+        values.append(KeyValue(key = 'Total Memory', value = total_mem))
+        values.append(KeyValue(key = 'Used Memory', value = used_mem))
+        values.append(KeyValue(key = 'Free Memory', value = free_mem))
+
+        msg = mem_dict[level]
+    except Exception as e:
+        rospy.logerr(traceback.format_exc())
+        msg = 'Memory Usage Check Error'
+        values.append(KeyValue(key = msg, value = str(e)))
+        level = DiagnosticStatus.ERROR
+
+    return level, msg, values
+
+
+
+##\brief Use mpstat to find CPU usage
+##
+## Returns (level, message, vals). usage_old keeps the last sane usage reading.
+usage_old = 0
+has_warned_mpstat = False
+has_error_core_count = False
+def check_mpstat(core_count = -1):
+    global usage_old, has_warned_mpstat, has_error_core_count
+    vals = []
+    mp_level = DiagnosticStatus.OK
+
+    load_dict = { 0: 'OK', 1: 'High Load', 2: 'Error' }
+
+    try:
+        p = subprocess.Popen('mpstat -P ALL 1 1',
+                             stdout = subprocess.PIPE,
+                             stderr = subprocess.PIPE, shell = True,
+                             universal_newlines = True)
+        stdout, stderr = p.communicate()
+        retcode = p.returncode
+
+        if retcode != 0:
+            if not has_warned_mpstat:
+                rospy.logerr("mpstat failed to run for cpu_monitor. Return code %d.", retcode)
+                has_warned_mpstat = True
+
+            mp_level = DiagnosticStatus.ERROR
+            vals.append(KeyValue(key = '\"mpstat\" Call Error', value = str(retcode)))
+            return mp_level, 'Unable to Check CPU Usage', vals
+
+        # Check which column '%idle' is, #4539
+        # mpstat output changed between 8.06 and 8.1
+        rows = stdout.split('\n')
+        col_names = rows[2].split()
+        idle_col = -1 if (len(col_names) > 2 and col_names[-1] == '%idle') else -2
+
+        num_cores = 0
+        cores_loaded = 0
+        for index, row in enumerate(stdout.split('\n')):
+            if index < 3:
+                continue
+
+            # Skip row containing 'all' data
+            if row.find('all') > -1:
+                continue
+
+            lst = row.split()
+            if len(lst) < 8:
+                continue
+
+            ## Ignore 'Average: ...' data
+            if lst[0].startswith('Average'):
+                continue
+
+            cpu_name = '%d' % (num_cores)
+            idle = lst[idle_col].replace(',', '.')
+            user = lst[3].replace(',', '.')
+            nice = lst[4].replace(',', '.')
+            system = lst[5].replace(',', '.')
+
+            core_level = 0
+            usage = float(user) + float(nice)
+            if usage > 1000: # wrong reading, use old reading instead
+                rospy.logwarn('Read cpu usage of %f percent. Reverting to previous reading of %f percent'%(usage, usage_old))
+                usage = usage_old
+            usage_old = usage
+
+            if usage > 90.0:
+                cores_loaded += 1
+                core_level = DiagnosticStatus.WARN
+            if usage > 110.0:
+                core_level = DiagnosticStatus.ERROR
+
+            vals.append(KeyValue(key = 'CPU %s Status' % cpu_name, value = load_dict[core_level]))
+            vals.append(KeyValue(key = 'CPU %s User' % cpu_name, value = user))
+            vals.append(KeyValue(key = 'CPU %s Nice' % cpu_name, value = nice))
+            vals.append(KeyValue(key = 'CPU %s System' % cpu_name, value = system))
+            vals.append(KeyValue(key = 'CPU %s Idle' % cpu_name, value = idle))
+
+            num_cores += 1
+
+        # Warn for high load only if we have <= 2 cores that aren't loaded
+        if num_cores - cores_loaded <= 2 and num_cores > 2:
+            mp_level = DiagnosticStatus.WARN
+
+        # Check the number of cores if core_count > 0, #4850
+        if core_count > 0 and core_count != num_cores:
+            mp_level = DiagnosticStatus.ERROR
+            if not has_error_core_count:
+                rospy.logerr('Error checking number of cores. Expected %d, got %d. Computer may have not booted properly.',
+                             core_count, num_cores)
+                has_error_core_count = True
+            return DiagnosticStatus.ERROR, 'Incorrect number of CPU cores', vals
+
+    except Exception as e:
+        mp_level = DiagnosticStatus.ERROR
+        vals.append(KeyValue(key = 'mpstat Exception', value = str(e)))
+
+    return mp_level, load_dict[mp_level], vals
+
+## Returns names for core temperature files
+## Returns list of names, each name can be read like file
+def get_core_temp_names():
+    temp_vals = []
+    try:
+        p = subprocess.Popen('find /sys/devices -name temp1_input',
+                             stdout = subprocess.PIPE,
+                             stderr = subprocess.PIPE, shell = True,
+                             universal_newlines = True)
+        stdout, stderr = p.communicate()
+        retcode = p.returncode
+
+        if retcode != 0:
+            rospy.logerr('Error find core temp locations: %s' % stderr)
+            return []
+
+        for ln in stdout.split('\n'):
+            temp_vals.append(ln.strip())
+
+        return temp_vals
+    except Exception:
+        rospy.logerr('Exception finding temp vals: %s' % traceback.format_exc())
+        return []
+
+##\brief Mark a status as Lagging/Stale based on time since its last update
+##
+## Replaces the first two entries of stat.values (update status and age).
+def update_status_stale(stat, last_update_time):
+    time_since_update = rospy.get_time() - last_update_time
+
+    stale_status = 'OK'
+    if time_since_update > 20 and time_since_update <= 35:
+        stale_status = 'Lagging'
+        if stat.level == DiagnosticStatus.OK:
+            stat.message = stale_status
+        elif stat.message.find(stale_status) < 0:
+            stat.message = ', '.join([stat.message, stale_status])
+        stat.level = max(stat.level, DiagnosticStatus.WARN)
+    if time_since_update > 35:
+        stale_status = 'Stale'
+        if stat.level == DiagnosticStatus.OK:
+            stat.message = stale_status
+        elif stat.message.find(stale_status) < 0:
+            stat.message = ', '.join([stat.message, stale_status])
+        stat.level = max(stat.level, DiagnosticStatus.ERROR)
+
+
+    stat.values.pop(0)
+    stat.values.pop(0)
+    stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status))
+    stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update)))
+
+
+##\brief Collects CPU temperature, usage and (optionally) NFS diagnostics
+##
+## Each check re-arms its own threading.Timer; publish_stats() publishes the
+## latest results on /diagnostics.
+class CPUMonitor():
+    def __init__(self, hostname, diag_hostname):
+        self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10)
+
+        self._mutex = threading.Lock()
+
+        self._check_ipmi = rospy.get_param('~check_ipmi_tool', True)
+        self._enforce_speed = rospy.get_param('~enforce_clock_speed', True)
+
+        self._check_core_temps = rospy.get_param('~check_core_temps', False)
+        if self._check_core_temps:
+            rospy.logwarn('Checking CPU core temperatures is deprecated. This will be removed in D-turtle')
+        self._check_nfs = rospy.get_param('~check_nfs', False)
+        if self._check_nfs:
+            rospy.logwarn('NFS checking is deprecated for CPU monitor. This will be removed in D-turtle')
+
+        self._load1_threshold = rospy.get_param('~load1_threshold', 5.0)
+        self._load5_threshold = rospy.get_param('~load5_threshold', 3.0)
+
+        self._num_cores = rospy.get_param('~num_cores', 8.0)
+
+        self._temps_timer = None
+        self._usage_timer = None
+        self._nfs_timer = None
+
+        # Get temp_input files
+        self._temp_vals = get_core_temp_names()
+
+        # CPU stats
+        self._temp_stat = DiagnosticStatus()
+        self._temp_stat.name = '%s CPU Temperature' % diag_hostname
+        self._temp_stat.level = 1
+        self._temp_stat.hardware_id = hostname
+        self._temp_stat.message = 'No Data'
+        self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
+                                   KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
+
+        self._usage_stat = DiagnosticStatus()
+        self._usage_stat.name = '%s CPU Usage' % diag_hostname
+        self._usage_stat.level = 1
+        self._usage_stat.hardware_id = hostname
+        self._usage_stat.message = 'No Data'
+        self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
+                                    KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
+
+        self._nfs_stat = DiagnosticStatus()
+        self._nfs_stat.name = '%s NFS IO' % diag_hostname
+        self._nfs_stat.level = 1
+        self._nfs_stat.hardware_id = hostname
+        self._nfs_stat.message = 'No Data'
+        self._nfs_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
+                                  KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
+
+        self._last_temp_time = 0
+        self._last_usage_time = 0
+        self._last_nfs_time = 0
+        self._last_publish_time = 0
+
+        # Start checking everything
+        self.check_temps()
+        if self._check_nfs:
+            self.check_nfs_stat()
+        self.check_usage()
+
+    # Restart temperature checking
+    def _restart_temp_check(self):
+        rospy.logerr('Restarting temperature check thread in cpu_monitor. This should not happen')
+        try:
+            with self._mutex:
+                if self._temps_timer:
+                    self._temps_timer.cancel()
+
+            self.check_temps()
+        except Exception as e:
+            rospy.logerr('Unable to restart temp thread. Error: %s' % traceback.format_exc())
+
+
+    ## Must have the lock to cancel everything
+    def cancel_timers(self):
+        if self._temps_timer:
+            self._temps_timer.cancel()
+
+        if self._nfs_timer:
+            self._nfs_timer.cancel()
+
+        if self._usage_timer:
+            self._usage_timer.cancel()
+
+    ## Checks NFS IO with 'iostat -n' and re-arms itself every 5 sec
+    def check_nfs_stat(self):
+        if rospy.is_shutdown():
+            with self._mutex:
+                self.cancel_timers()
+            return
+
+        nfs_level = 0
+        msg = 'OK'
+        vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
+                 KeyValue(key = 'Time Since Last Update', value = str(0) )]
+
+        try:
+            p = subprocess.Popen('iostat -n',
+                                 stdout = subprocess.PIPE,
+                                 stderr = subprocess.PIPE, shell = True,
+                                 universal_newlines = True)
+            stdout, stderr = p.communicate()
+            retcode = p.returncode
+
+            if retcode != 0:
+                nfs_level = DiagnosticStatus.ERROR
+                msg = 'iostat Error'
+                vals.append(KeyValue(key = '\"iostat -n\" Call Error', value = str(retcode)))
+                stdout = ''
+
+
+            for index, row in enumerate(stdout.split('\n')):
+                if index < 3:
+                    continue
+
+                lst = row.split()
+                if len(lst) < 7:
+                    continue
+
+                file_sys = lst[0]
+                read_blk = lst[1]
+                write_blk = lst[2]
+                read_blk_dir = lst[3]
+                write_blk_dir = lst[4]
+                r_blk_srv = lst[5]
+                w_blk_srv = lst[6]
+
+                vals.append(KeyValue(
+                        key = '%s Read Blks/s' % file_sys, value=read_blk))
+                vals.append(KeyValue(
+                        key = '%s Write Blks/s' % file_sys, value=write_blk))
+                vals.append(KeyValue(
+                        key = '%s Read Blk dir/s' % file_sys, value=read_blk_dir))
+                vals.append(KeyValue(
+                        key = '%s Write Blks dir/s' % file_sys, value=write_blk_dir))
+                vals.append(KeyValue(
+                        key = '%s Read Blks srv/s' % file_sys, value=r_blk_srv))
+                vals.append(KeyValue(
+                        key = '%s Write Blks srv/s' % file_sys, value=w_blk_srv))
+
+        except Exception as e:
+            rospy.logerr(traceback.format_exc())
+            nfs_level = DiagnosticStatus.ERROR
+            msg = 'Exception'
+            vals.append(KeyValue(key = 'Exception', value = str(e)))
+
+        with self._mutex:
+            self._nfs_stat.level = nfs_level
+            self._nfs_stat.message = msg
+            self._nfs_stat.values = vals
+
+            self._last_nfs_time = rospy.get_time()
+
+            if not rospy.is_shutdown():
+                self._nfs_timer = threading.Timer(5.0, self.check_nfs_stat)
+                self._nfs_timer.start()
+            else:
+                self.cancel_timers()
+
+
+    ## Call every 10sec at minimum
+    def check_temps(self):
+        if rospy.is_shutdown():
+            with self._mutex:
+                self.cancel_timers()
+            return
+
+        diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
+                      KeyValue(key = 'Time Since Last Update', value = str(0) ) ]
+        diag_msgs = []
+        diag_level = 0
+
+        if self._check_ipmi:
+            ipmi_vals, ipmi_msgs, ipmi_level = check_ipmi()
+            diag_vals.extend(ipmi_vals)
+            diag_msgs.extend(ipmi_msgs)
+            diag_level = max(diag_level, ipmi_level)
+
+        if self._check_core_temps:
+            core_vals, core_msgs, core_level = check_core_temps(self._temp_vals)
+            diag_vals.extend(core_vals)
+            diag_msgs.extend(core_msgs)
+            diag_level = max(diag_level, core_level)
+
+        clock_vals, clock_msgs, clock_level = check_clock_speed(self._enforce_speed)
+        diag_vals.extend(clock_vals)
+        diag_msgs.extend(clock_msgs)
+        diag_level = max(diag_level, clock_level)
+
+        diag_log = set(diag_msgs)
+        if len(diag_log) > 0:
+            message = ', '.join(diag_log)
+        else:
+            message = stat_dict[diag_level]
+
+        with self._mutex:
+            self._last_temp_time = rospy.get_time()
+
+            self._temp_stat.level = diag_level
+            self._temp_stat.message = message
+            self._temp_stat.values = diag_vals
+
+            if not rospy.is_shutdown():
+                self._temps_timer = threading.Timer(5.0, self.check_temps)
+                self._temps_timer.start()
+            else:
+                self.cancel_timers()
+
+    ## Checks CPU usage, load average and memory; re-arms itself every 5 sec
+    def check_usage(self):
+        if rospy.is_shutdown():
+            with self._mutex:
+                self.cancel_timers()
+            return
+
+        diag_level = 0
+        diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
+                      KeyValue(key = 'Time Since Last Update', value = str(0) )]
+        diag_msgs = []
+
+        # Check mpstat
+        mp_level, mp_msg, mp_vals = check_mpstat(self._num_cores)
+        diag_vals.extend(mp_vals)
+        if mp_level > 0:
+            diag_msgs.append(mp_msg)
+        diag_level = max(diag_level, mp_level)
+
+        # Check uptime
+        uptime_level, up_msg, up_vals = check_uptime(self._load1_threshold, self._load5_threshold)
+        diag_vals.extend(up_vals)
+        if uptime_level > 0:
+            diag_msgs.append(up_msg)
+        diag_level = max(diag_level, uptime_level)
+
+        # Check memory
+        mem_level, mem_msg, mem_vals = check_memory()
+        diag_vals.extend(mem_vals)
+        if mem_level > 0:
+            diag_msgs.append(mem_msg)
+        diag_level = max(diag_level, mem_level)
+
+        if diag_msgs and diag_level > 0:
+            usage_msg = ', '.join(set(diag_msgs))
+        else:
+            usage_msg = stat_dict[diag_level]
+
+        # Update status
+        with self._mutex:
+            self._last_usage_time = rospy.get_time()
+            self._usage_stat.level = diag_level
+            self._usage_stat.values = diag_vals
+
+            self._usage_stat.message = usage_msg
+
+            if not rospy.is_shutdown():
+                self._usage_timer = threading.Timer(5.0, self.check_usage)
+                self._usage_timer.start()
+            else:
+                self.cancel_timers()
+
+    ## Publishes the latest statuses, at most once every 0.5 sec
+    def publish_stats(self):
+        with self._mutex:
+            # Update everything with last update times
+            update_status_stale(self._temp_stat, self._last_temp_time)
+            update_status_stale(self._usage_stat, self._last_usage_time)
+            if self._check_nfs:
+                update_status_stale(self._nfs_stat, self._last_nfs_time)
+
+            msg = DiagnosticArray()
+            msg.header.stamp = rospy.get_rostime()
+            msg.status.append(self._temp_stat)
+            msg.status.append(self._usage_stat)
+            if self._check_nfs:
+                msg.status.append(self._nfs_stat)
+
+            if rospy.get_time() - self._last_publish_time > 0.5:
+                self._diag_pub.publish(msg)
+                self._last_publish_time = rospy.get_time()
+
+
+        # Restart temperature checking if it goes stale, #4171
+        # Need to run this without mutex
+        if rospy.get_time() - self._last_temp_time > 90:
+            self._restart_temp_check()
+
+
+if __name__ == '__main__':
+    hostname = socket.gethostname()
+
+    import optparse
+    parser = optparse.OptionParser(usage="usage: cpu_monitor.py [--diag-hostname=cX]")
+    parser.add_option("--diag-hostname", dest="diag_hostname",
+                      help="Computer name in diagnostics output (ex: 'c1')",
+                      metavar="DIAG_HOSTNAME",
+                      action="store", default = hostname)
+    options, args = parser.parse_args(rospy.myargv())
+
+    try:
+        rospy.init_node('cpu_monitor_%s' % hostname)
+    except rospy.exceptions.ROSInitException:
+        print('CPU monitor is unable to initialize node. Master may not be running.', file=sys.stderr)
+        sys.exit(0)
+
+    cpu_node = CPUMonitor(hostname, options.diag_hostname)
+
+    rate = rospy.Rate(1.0)
+    try:
+        while not rospy.is_shutdown():
+            rate.sleep()
+            cpu_node.publish_stats()
+    except KeyboardInterrupt:
+        pass
+    except Exception as e:
+        traceback.print_exc()
+        rospy.logerr(traceback.format_exc())
+
+    cpu_node.cancel_timers()
+    sys.exit(0)
+
+
+
+
+
+
+
diff --git a/pr2_computer_monitor/scripts/hd_monitor.py b/pr2_computer_monitor/scripts/hd_monitor.py
new file mode 100755
index 0000000..8c582fa
--- /dev/null
+++ b/pr2_computer_monitor/scripts/hd_monitor.py
@@ -0,0 +1,386 @@
+#!/usr/bin/env python
+#
+# Software License Agreement (BSD License)
+#
+# Copyright (c) 2009, Willow Garage, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ +##\author Kevin Watts + +from __future__ import with_statement +import roslib +roslib.load_manifest('pr2_computer_monitor') + +import rospy + +import traceback +import threading +from threading import Timer +import sys, os, time +from time import sleep +import subprocess + +import socket + +from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue + +##### monkey-patch to suppress threading error message in python 2.7.3 +##### See http://stackoverflow.com/questions/13193278/understand-python-threading-bug +if sys.version_info[:3] == (2, 7, 3): + import threading + threading._DummyThread._Thread__stop = lambda x: 42 +##### + +low_hd_level = 5 +critical_hd_level = 1 + +hd_temp_warn = 55 #3580, setting to 55C to after checking manual +hd_temp_error = 70 # Above this temperature, hard drives will have serious problems + +stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' } +temp_dict = { 0: 'OK', 1: 'Hot', 2: 'Critical Hot' } +usage_dict = { 0: 'OK', 1: 'Low Disk Space', 2: 'Very Low Disk Space' } + +REMOVABLE = ['/dev/sda'] # Store removable drives so we can ignore if removed + +## Connects to hddtemp daemon to get temp, HD make. 
+def get_hddtemp_data(hostname = 'localhost', port = 7634): + try: + hd_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + hd_sock.connect((hostname, port)) + sock_data = '' + while True: + newdat = hd_sock.recv(1024) + if len(newdat) == 0: + break + sock_data = sock_data + newdat + hd_sock.close() + + sock_vals = sock_data.split('|') + + # Format of output looks like ' | DRIVE | MAKE | TEMP | ' + idx = 0 + + drives = [] + makes = [] + temps = [] + while idx + 5 < len(sock_vals): + this_drive = sock_vals[idx + 1] + this_make = sock_vals[idx + 2] + this_temp = sock_vals[idx + 3] + + # Sometimes we get duplicate makes if hard drives are mounted + # to two different points + if this_make in makes: + idx += 5 + continue + + drives.append(this_drive) + makes.append(this_make) + temps.append(this_temp) + + idx += 5 + + return True, drives, makes, temps + except: + rospy.logerr(traceback.format_exc()) + return False, [ 'Exception' ], [ traceback.format_exc() ], [ 0 ] + +def update_status_stale(stat, last_update_time): + time_since_update = rospy.get_time() - last_update_time + + stale_status = 'OK' + if time_since_update > 20 and time_since_update <= 35: + stale_status = 'Lagging' + if stat.level == DiagnosticStatus.OK: + stat.message = stale_status + elif stat.message.find(stale_status) < 0: + stat.message = ', '.join([stat.message, stale_status]) + stat.level = max(stat.level, DiagnosticStatus.WARN) + if time_since_update > 35: + stale_status = 'Stale' + if stat.level == DiagnosticStatus.OK: + stat.message = stale_status + elif stat.message.find(stale_status) < 0: + stat.message = ', '.join([stat.message, stale_status]) + stat.level = max(stat.level, DiagnosticStatus.ERROR) + + stat.values.pop(0) + stat.values.pop(0) + stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status)) + stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update))) + +class hd_monitor(): + def __init__(self, hostname, diag_hostname, home_dir 
= ''): + self._mutex = threading.Lock() + + self._hostname = hostname + self._no_temp_warn = rospy.get_param('~no_hd_temp_warn', False) + if self._no_temp_warn: + rospy.logwarn('Not warning for HD temperatures is deprecated. This will be removed in D-turtle') + self._home_dir = home_dir + + self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10) + + self._last_temp_time = 0 + self._last_usage_time = 0 + self._last_publish_time = 0 + + self._temp_timer = None + self._usage_timer = None + + self._temp_stat = DiagnosticStatus() + self._temp_stat.name = "%s HD Temperature" % diag_hostname + self._temp_stat.level = DiagnosticStatus.ERROR + self._temp_stat.hardware_id = hostname + self._temp_stat.message = 'No Data' + self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data'), + KeyValue(key = 'Time Since Last Update', value = 'N/A') ] + + if self._home_dir != '': + self._usage_stat = DiagnosticStatus() + self._usage_stat.level = DiagnosticStatus.ERROR + self._usage_stat.hardware_id = hostname + self._usage_stat.name = '%s HD Usage' % diag_hostname + self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ), + KeyValue(key = 'Time Since Last Update', value = 'N/A') ] + self.check_disk_usage() + + self.check_temps() + + ## Must have the lock to cancel everything + def cancel_timers(self): + if self._temp_timer: + self._temp_timer.cancel() + self._temp_timer = None + + if self._usage_timer: + self._usage_timer.cancel() + self._usage_timer = None + + def check_temps(self): + if rospy.is_shutdown(): + with self._mutex: + self.cancel_timers() + return + + diag_strs = [ KeyValue(key = 'Update Status', value = 'OK' ) , + KeyValue(key = 'Time Since Last Update', value = '0' ) ] + diag_level = DiagnosticStatus.OK + diag_message = 'OK' + + temp_ok, drives, makes, temps = get_hddtemp_data() + + for index in range(0, len(drives)): + temp = temps[index] + + if not unicode(temp).isnumeric() and drives[index] not in 
REMOVABLE: + temp_level = DiagnosticStatus.ERROR + temp_ok = False + elif not unicode(temp).isnumeric() and drives[index] in REMOVABLE: + temp_level = DiagnosticStatus.OK + temp = "Removed" + else: + temp_level = DiagnosticStatus.OK + if float(temp) > hd_temp_warn: + temp_level = DiagnosticStatus.WARN + if float(temp) > hd_temp_error: + temp_level = DiagnosticStatus.ERROR + + diag_level = max(diag_level, temp_level) + + diag_strs.append(KeyValue(key = 'Disk %d Temp Status' % index, value = temp_dict[temp_level])) + diag_strs.append(KeyValue(key = 'Disk %d Mount Pt.' % index, value = drives[index])) + diag_strs.append(KeyValue(key = 'Disk %d Device ID' % index, value = makes[index])) + diag_strs.append(KeyValue(key = 'Disk %d Temp' % index, value = temp)) + + if not temp_ok: + diag_level = DiagnosticStatus.ERROR + + with self._mutex: + self._last_temp_time = rospy.get_time() + self._temp_stat.values = diag_strs + self._temp_stat.level = diag_level + + # Give No Data message if we have no reading + self._temp_stat.message = temp_dict[diag_level] + if not temp_ok: + self._temp_stat.message = 'Error' + + if self._no_temp_warn and temp_ok: + self._temp_stat.level = DiagnosticStatus.OK + + if not rospy.is_shutdown(): + self._temp_timer = threading.Timer(10.0, self.check_temps) + self._temp_timer.start() + else: + self.cancel_timers() + + def check_disk_usage(self): + if rospy.is_shutdown(): + with self._mutex: + self.cancel_timers() + return + + diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ), + KeyValue(key = 'Time Since Last Update', value = '0' ) ] + diag_level = DiagnosticStatus.OK + diag_message = 'OK' + + try: + p = subprocess.Popen(["df", "-P", "--block-size=1G", self._home_dir], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = p.communicate() + retcode = p.returncode + + if (retcode == 0): + + diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'OK')) + row_count = 0 + for row in stdout.split('\n'): + if len(row.split()) 
< 2: + continue + if not unicode(row.split()[1]).isnumeric() or float(row.split()[1]) < 10: # Ignore small drives + continue + + row_count += 1 + g_available = row.split()[-3] + name = row.split()[0] + size = row.split()[1] + mount_pt = row.split()[-1] + + if (float(g_available) > low_hd_level): + level = DiagnosticStatus.OK + elif (float(g_available) > critical_hd_level): + level = DiagnosticStatus.WARN + else: + level = DiagnosticStatus.ERROR + + diag_vals.append(KeyValue( + key = 'Disk %d Name' % row_count, value = name)) + diag_vals.append(KeyValue( + key = 'Disk %d Available' % row_count, value = g_available)) + diag_vals.append(KeyValue( + key = 'Disk %d Size' % row_count, value = size)) + diag_vals.append(KeyValue( + key = 'Disk %d Status' % row_count, value = stat_dict[level])) + diag_vals.append(KeyValue( + key = 'Disk %d Mount Point' % row_count, value = mount_pt)) + + diag_level = max(diag_level, level) + diag_message = usage_dict[diag_level] + + else: + diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Failed')) + diag_level = DiagnosticStatus.ERROR + diag_message = stat_dict[diag_level] + + + except: + rospy.logerr(traceback.format_exc()) + + diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Exception')) + diag_vals.append(KeyValue(key = 'Disk Space Ex', value = traceback.format_exc())) + + diag_level = DiagnosticStatus.ERROR + diag_message = stat_dict[diag_level] + + # Update status + with self._mutex: + self._last_usage_time = rospy.get_time() + self._usage_stat.values = diag_vals + self._usage_stat.message = diag_message + self._usage_stat.level = diag_level + + if not rospy.is_shutdown(): + self._usage_timer = threading.Timer(5.0, self.check_disk_usage) + self._usage_timer.start() + else: + self.cancel_timers() + + + def publish_stats(self): + with self._mutex: + update_status_stale(self._temp_stat, self._last_temp_time) + + msg = DiagnosticArray() + msg.header.stamp = rospy.get_rostime() + 
msg.status.append(self._temp_stat) + if self._home_dir != '': + update_status_stale(self._usage_stat, self._last_usage_time) + msg.status.append(self._usage_stat) + + if rospy.get_time() - self._last_publish_time > 0.5: + self._diag_pub.publish(msg) + self._last_publish_time = rospy.get_time() + + + + +##\todo Need to check HD input/output too using iostat + +if __name__ == '__main__': + hostname = socket.gethostname() + + import optparse + parser = optparse.OptionParser(usage="usage: hd_monitor.py [--diag-hostname=cX]") + parser.add_option("--diag-hostname", dest="diag_hostname", + help="Computer name in diagnostics output (ex: 'c1')", + metavar="DIAG_HOSTNAME", + action="store", default = hostname) + options, args = parser.parse_args(rospy.myargv()) + + home_dir = '' + if len(args) > 1: + home_dir = args[1] + + try: + rospy.init_node('hd_monitor_%s' % hostname) + except rospy.exceptions.ROSInitException: + print('HD monitor is unable to initialize node. Master may not be running.') + sys.exit(0) + + hd_monitor = hd_monitor(hostname, options.diag_hostname, home_dir) + rate = rospy.Rate(1.0) + + try: + while not rospy.is_shutdown(): + rate.sleep() + hd_monitor.publish_stats() + except KeyboardInterrupt: + pass + except Exception as e: + traceback.print_exc() + + hd_monitor.cancel_timers() + sys.exit(0) + + + diff --git a/pr2_computer_monitor/scripts/ntp_monitor.py b/pr2_computer_monitor/scripts/ntp_monitor.py new file mode 100755 index 0000000..1b4b94b --- /dev/null +++ b/pr2_computer_monitor/scripts/ntp_monitor.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python +# Software License Agreement (BSD License) +# +# Copyright (c) 2008, Willow Garage, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ +import roslib +roslib.load_manifest('pr2_computer_monitor') + +from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue + +import sys +import rospy +import socket +from subprocess import Popen, PIPE + +import time + +import re + +##### monkey-patch to suppress threading error message in python 2.7.3 +##### See http://stackoverflow.com/questions/13193278/understand-python-threading-bug +if sys.version_info[:3] == (2, 7, 3): + import threading + threading._DummyThread._Thread__stop = lambda x: 42 +##### + +NAME = 'ntp_monitor' + +def ntp_monitor(ntp_hostname, offset=500, self_offset=500, diag_hostname=None, + error_offset=5000000, ignore_self=False): + pub = rospy.Publisher("/diagnostics", DiagnosticArray, queue_size=10) + rospy.init_node(NAME, anonymous=True) + + hostname = socket.gethostname() + if diag_hostname is None: + diag_hostname = hostname + + ntp_checks = [] + stat = DiagnosticStatus() + stat.level = 0 + stat.name = "NTP offset from "+ diag_hostname + " to " + ntp_hostname + stat.message = "OK" + stat.hardware_id = hostname + stat.values = [] + ntp_checks.append((stat, ntp_hostname, offset)) + + if not ignore_self: + self_stat = DiagnosticStatus() + self_stat.level = DiagnosticStatus.OK + self_stat.name = "NTP self-offset for "+ diag_hostname + self_stat.message = "OK" + self_stat.hardware_id = hostname + self_stat.values = [] + ntp_checks.append((self_stat, hostname, self_offset)) + + while not rospy.is_shutdown(): + msg = DiagnosticArray() + for st, host, off in ntp_checks: + try: + p = Popen(["ntpdate", "-q", host], stdout=PIPE, stdin=PIPE, stderr=PIPE) + res = p.wait() + (o,e) = p.communicate() + except OSError as e: + (errno, msg) = e.args + if errno == 4: + break #ctrl-c interrupt + else: + raise + if (res == 0): + measured_offset = float(re.search("offset (.*),", o).group(1))*1000000 + + st.level = DiagnosticStatus.OK + st.message = "OK" + st.values = [ KeyValue("Offset (us)", str(measured_offset)), + KeyValue("Offset tolerance 
(us)", str(off)), + KeyValue("Offset tolerance (us) for Error", str(error_offset)) ] + + if (abs(measured_offset) > off): + st.level = DiagnosticStatus.WARN + st.message = "NTP Offset Too High" + if (abs(measured_offset) > error_offset): + st.level = DiagnosticStatus.ERROR + st.message = "NTP Offset Too High" + + else: + st.level = DiagnosticStatus.ERROR + st.message = "Error Running ntpdate. Returned %d" % res + st.values = [ KeyValue("Offset (us)", "N/A"), + KeyValue("Offset tolerance (us)", str(off)), + KeyValue("Offset tolerance (us) for Error", str(error_offset)), + KeyValue("Output", o), + KeyValue("Errors", e) ] + msg.status.append(st) + + msg.header.stamp = rospy.get_rostime() + pub.publish(msg) + time.sleep(1) + +def ntp_monitor_main(argv=sys.argv): + import optparse + parser = optparse.OptionParser(usage="usage: ntp_monitor ntp-hostname []") + parser.add_option("--offset-tolerance", dest="offset_tol", + action="store", default=500, + help="Offset from NTP host", metavar="OFFSET-TOL") + parser.add_option("--error-offset-tolerance", dest="error_offset_tol", + action="store", default=5000000, + help="Offset from NTP host. Above this is error", metavar="OFFSET-TOL") + parser.add_option("--self_offset-tolerance", dest="self_offset_tol", + action="store", default=500, + help="Offset from self", metavar="SELF_OFFSET-TOL") + parser.add_option("--diag-hostname", dest="diag_hostname", + help="Computer name in diagnostics output (ex: 'c1')", + metavar="DIAG_HOSTNAME", + action="store", default=None) + parser.add_option("--ignore-self", dest="ignore_self", + help="Ignore self NTP test", action="store_true") + options, args = parser.parse_args(rospy.myargv()) + + if (len(args) != 2): + parser.error("Invalid arguments. Must have HOSTNAME [args]. 
%s" % args) + + + try: + offset = int(options.offset_tol) + self_offset = int(options.self_offset_tol) + error_offset = int(options.error_offset_tol) + ignore_self = options.ignore_self + except: + parser.error("Offsets must be numbers") + + ntp_monitor(args[1], offset, self_offset, options.diag_hostname, + error_offset, ignore_self) + + +if __name__ == "__main__": + try: + ntp_monitor_main(rospy.myargv()) + except KeyboardInterrupt: pass + except SystemExit: pass + except: + import traceback + traceback.print_exc() diff --git a/pr2_computer_monitor/scripts/nvidia_temp.py b/pr2_computer_monitor/scripts/nvidia_temp.py new file mode 100755 index 0000000..639d554 --- /dev/null +++ b/pr2_computer_monitor/scripts/nvidia_temp.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python +# +# Software License Agreement (BSD License) +# +# Copyright (c) 2010, Willow Garage, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +##\author Kevin Watts +##\brief Publishes diagnostic data on temperature and usage for a Quadro 600 GPU + +from __future__ import with_statement, division + +PKG = 'pr2_computer_monitor' +import roslib; roslib.load_manifest(PKG) + +import rospy + +from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus +from pr2_msgs.msg import GPUStatus + +import pr2_computer_monitor + +class NVidiaTempMonitor(object): + def __init__(self): + self._pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10) + self._gpu_pub = rospy.Publisher('gpu_status', GPUStatus, queue_size=10) + + def pub_status(self): + gpu_stat = GPUStatus() + stat = DiagnosticStatus() + try: + card_out = pr2_computer_monitor.get_gpu_status() + gpu_stat = pr2_computer_monitor.parse_smi_output(card_out) + stat = pr2_computer_monitor.gpu_status_to_diag(gpu_stat) + except Exception as e: + import traceback + rospy.logerr('Unable to process nVidia GPU data') + rospy.logerr(traceback.format_exc()) + + gpu_stat.header.stamp = rospy.get_rostime() + + array = DiagnosticArray() + array.header.stamp = rospy.get_rostime() + + array.status = [ stat ] + + self._pub.publish(array) + self._gpu_pub.publish(gpu_stat) + +if __name__ == '__main__': + rospy.init_node('nvidia_temp_monitor') + + monitor = NVidiaTempMonitor() + my_rate = rospy.Rate(1.0) + while not rospy.is_shutdown(): + monitor.pub_status() + my_rate.sleep() + + diff --git 
a/pr2_computer_monitor/scripts/wifi_monitor.py b/pr2_computer_monitor/scripts/wifi_monitor.py new file mode 100755 index 0000000..ce23d40 --- /dev/null +++ b/pr2_computer_monitor/scripts/wifi_monitor.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python +# +# Software License Agreement (BSD License) +# +# Copyright (c) 2010, Willow Garage, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ +##\author Kevin Watts +##\brief Republishes the data from ddwrt/accesspoint onto diagnostics + +from __future__ import with_statement + +PKG = 'pr2_computer_monitor' +import roslib +roslib.load_manifest(PKG) + +import rospy + +import threading +import sys + +from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue +from pr2_msgs.msg import AccessPoint + +DIAG_NAME = 'Wifi Status (ddwrt)' +WARN_TIME = 30 +ERROR_TIME = 60 + + +def wifi_to_diag(msg): + stat = DiagnosticStatus() + + stat.name = DIAG_NAME + stat.level = DiagnosticStatus.OK + stat.message = 'OK' + + stat.values.append(KeyValue(key='ESSID', value=msg.essid)) + stat.values.append(KeyValue(key='Mac Address', value=msg.macaddr)) + stat.values.append(KeyValue(key='Signal', value=str(msg.signal))) + stat.values.append(KeyValue(key='Noise', value=str(msg.noise))) + stat.values.append(KeyValue(key='Sig/Noise', value=str(msg.snr))) + stat.values.append(KeyValue(key='Channel', value=str(msg.channel))) + stat.values.append(KeyValue(key='Rate', value=msg.rate)) + stat.values.append(KeyValue(key='TX Power', value=msg.tx_power)) + stat.values.append(KeyValue(key='Quality', value=str(msg.quality))) + + return stat + +def mark_diag_stale(diag_stat = None, error = False): + if not diag_stat: + diag_stat = DiagnosticStatus() + diag_stat.message = 'No Updates' + diag_stat.name = DIAG_NAME + else: + diag_stat.message = 'Updates Stale' + + diag_stat.level = DiagnosticStatus.WARN + if error: + diag_stat.level = DiagnosticStatus.ERROR + + return diag_stat + +class WifiMonitor(object): + def __init__(self): + self._mutex = threading.Lock() + + self._last_msg = None + self._last_update_time = None + self._start_time = rospy.get_time() + + self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10) + + self._ddwrt_sub = rospy.Subscriber('ddwrt/accesspoint', AccessPoint, self._cb) + + def _cb(self, msg): + with self._mutex: + self._last_msg = msg + self._last_update_time = 
rospy.get_time() + + def publish_stats(self): + with self._mutex: + if self._last_msg: + ddwrt_stat = wifi_to_diag(self._last_msg) + + update_diff = rospy.get_time() - self._last_update_time + if update_diff > WARN_TIME: + ddwrt_stat = mark_diag_stale(ddwrt_stat) + if (rospy.get_time() - self._last_update_time) > ERROR_TIME: + ddwrt_stat = mark_diag_stale(ddwrt_stat, True) + + ddwrt_stat.values.append(KeyValue(key='Time Since Update', value=str(update_diff))) + else: + error_state = (rospy.get_time() - self._start_time) > ERROR_TIME + ddwrt_stat = mark_diag_stale(None, error_state) + ddwrt_stat.values.append(KeyValue(key='Time Since Update', value="N/A")) + + msg = DiagnosticArray() + msg.header.stamp = rospy.get_rostime() + msg.status.append(ddwrt_stat) + + self._diag_pub.publish(msg) + + +if __name__ == '__main__': + try: + rospy.init_node('ddwrt_diag') + except rospy.exceptions.ROSInitException: + print('Wifi monitor is unable to initialize node. Master may not be running.') + sys.exit(2) + + wifi_monitor = WifiMonitor() + rate = rospy.Rate(1.0) + + try: + while not rospy.is_shutdown(): + rate.sleep() + wifi_monitor.publish_stats() + except KeyboardInterrupt: + pass + except Exception as e: + import traceback + traceback.print_exc() + + sys.exit(0) + + + diff --git a/pr2_computer_monitor/setup.py b/pr2_computer_monitor/setup.py new file mode 100644 index 0000000..a00b195 --- /dev/null +++ b/pr2_computer_monitor/setup.py @@ -0,0 +1,11 @@ +## ! 
DO NOT MANUALLY INVOKE THIS setup.py, USE CATKIN INSTEAD + +from distutils.core import setup +from catkin_pkg.python_setup import generate_distutils_setup + +# fetch values from package.xml +setup_args = generate_distutils_setup( + packages=['pr2_computer_monitor'], + package_dir={'': 'src'}) + +setup(**setup_args) diff --git a/pr2_computer_monitor/src/network_detector.cpp b/pr2_computer_monitor/src/network_detector.cpp new file mode 100644 index 0000000..cb77e10 --- /dev/null +++ b/pr2_computer_monitor/src/network_detector.cpp @@ -0,0 +1,88 @@ +#include +#include +#include +#include + +#include + +#include "ros/ros.h" +#include "std_msgs/Bool.h" + +static int socket_fd = -1; + +bool initSocket() +{ + socket_fd = socket( AF_INET, SOCK_DGRAM, 0 ); + if( socket_fd != -1 ) + return true; + else + return false; +} + +bool interfaceIsRunning( std::string interface_name ) +{ + struct ifreq ifr; + + strcpy( ifr.ifr_name, interface_name.c_str() ); + if( ioctl( socket_fd, SIOCGIFFLAGS, &ifr ) < 0 ) + { + static std::string last_warning; + std::string warning = "Query of interface '" + interface_name + "' failed: '" + strerror( errno ) + "' Presuming down."; + if( warning != last_warning ) + { + ROS_WARN("%s", warning.c_str() ); + } + last_warning = warning; + return false; + } + bool running = (ifr.ifr_flags & IFF_RUNNING); + bool up = (ifr.ifr_flags & IFF_UP); + + return up && running; +} + +int main( int argc, char **argv ) +{ + ros::init( argc, argv, "network_detector" ); + ros::NodeHandle node; + std::string interface_name; + if( !ros::param::get( "~interface_name", interface_name )) + { + ROS_FATAL( "No parameter 'interface_name' specified. Don't know which interface to watch. Exiting." 
); + exit(1); + } + ros::Publisher running_pub = node.advertise( "network/connected", 0, true ); + int loop_count; + bool first_time = true; + bool was_running = false; + float inner_loop_hertz = 4; + ros::Rate loop_rate( inner_loop_hertz ); + if( !initSocket() ) + { + ROS_FATAL( "Failed to open socket for network ioctl: '%s'. Exiting.", + strerror( errno )); + exit(1); + } + while( ros::ok() ) + { + bool is_running = interfaceIsRunning( interface_name ); + if( is_running != was_running || first_time || loop_count > inner_loop_hertz * 5 ) + { + if( is_running != was_running ) + { + ROS_INFO( "Interface '%s' %s.", interface_name.c_str(), (is_running ? "connected" : "disconnected") ); + } + + std_msgs::Bool msg; + msg.data = is_running; + running_pub.publish( msg ); + + loop_count = 0; + first_time = false; + } + ros::spinOnce(); + loop_rate.sleep(); + loop_count++; + was_running = is_running; + } +} diff --git a/pr2_computer_monitor/src/pr2_computer_monitor/nvidia_smi_util.py b/pr2_computer_monitor/src/pr2_computer_monitor/nvidia_smi_util.py new file mode 100644 index 0000000..b2b5858 --- /dev/null +++ b/pr2_computer_monitor/src/pr2_computer_monitor/nvidia_smi_util.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python +# +# Software License Agreement (BSD License) +# +# Copyright (c) 2010, Willow Garage, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. 
+# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +##\author Kevin Watts + +from __future__ import division + +PKG = 'pr2_computer_monitor' +import roslib; roslib.load_manifest(PKG) + +import rospy + +from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue +from pr2_msgs.msg import GPUStatus + +import subprocess +import math + +MAX_FAN_RPM = 4500 + +def _rads_to_rpm(rads): + return rads / (2 * math.pi) * 60 + +def _rpm_to_rads(rpm): + return rpm * (2 * math.pi) / 60 + +def gpu_status_to_diag(gpu_stat): + stat = DiagnosticStatus() + stat.name = 'GPU Status' + stat.message = 'OK' + stat.level = DiagnosticStatus.OK + stat.hardware_id = gpu_stat.pci_device_id + + stat.values.append(KeyValue(key='Product Name', value = gpu_stat.product_name)) + stat.values.append(KeyValue(key='PCI Device/Vendor ID', value = gpu_stat.pci_device_id)) + stat.values.append(KeyValue(key='PCI Location ID', value = gpu_stat.pci_location)) + stat.values.append(KeyValue(key='Display', value = gpu_stat.display)) + stat.values.append(KeyValue(key='Driver 
Version', value = gpu_stat.driver_version)) + stat.values.append(KeyValue(key='Temperature (C)', value = '%.0f' % gpu_stat.temperature)) + stat.values.append(KeyValue(key='Fan Speed (RPM)', value = '%.0f' % _rads_to_rpm(gpu_stat.fan_speed))) + stat.values.append(KeyValue(key='Usage (%)', value = '%.0f' % gpu_stat.gpu_usage)) + stat.values.append(KeyValue(key='Memory (%)', value = '%.0f' % gpu_stat.memory_usage)) + + # Check for valid data + if not gpu_stat.product_name or not gpu_stat.pci_device_id: + stat.level = DiagnosticStatus.ERROR + stat.message = 'No Device Data' + return stat + + # Check load + if gpu_stat.gpu_usage > 98: + stat.level = max(stat.level, DiagnosticStatus.WARN) + stat.message = 'High Load' + + # Check thresholds + if gpu_stat.temperature > 90: + stat.level = max(stat.level, DiagnosticStatus.WARN) + stat.message = 'High Temperature' + if gpu_stat.temperature > 95: + stat.level = max(stat.level, DiagnosticStatus.ERROR) + stat.message = 'Temperature Alarm' + + # Check fan + if gpu_stat.fan_speed == 0: + stat.level = max(stat.level, DiagnosticStatus.ERROR) + stat.message = 'No Fan Speed' + + + + return stat + + +def _find_val(output, word): + lines = output.split('\n') + for line in lines: + tple = line.split(':') + if not len(tple) > 1: + continue + + name = tple[0].strip() + val = ':'.join(tple[1:]).strip() + + if not name.lower() == word.lower(): + continue + + return val.strip() + + return '' + +def parse_smi_output(output): + gpu_stat = GPUStatus() + + + gpu_stat.product_name = _find_val(output, 'Product Name') + gpu_stat.pci_device_id = _find_val(output, 'PCI Device/Vendor ID') + gpu_stat.pci_location = _find_val(output, 'PCI Location ID') + gpu_stat.display = _find_val(output, 'Display') + gpu_stat.driver_version = _find_val(output, 'Driver Version') + + temp_str = _find_val(output, 'Temperature') + if temp_str: + temp, units = temp_str.split() + gpu_stat.temperature = int(temp) + + fan_str = _find_val(output, 'Fan Speed') + if fan_str: + # 
Fan speed in RPM + fan_spd = float(fan_str.strip('\%').strip()) * 0.01 * MAX_FAN_RPM + # Convert fan speed to Hz + gpu_stat.fan_speed = _rpm_to_rads(fan_spd) + + usage_str = _find_val(output, 'GPU') + if usage_str: + usage = usage_str.strip('\%').strip() + gpu_stat.gpu_usage = int(usage) + + mem_str = _find_val(output, 'Memory') + if mem_str: + mem = mem_str.strip('\%').strip() + gpu_stat.memory_usage = int(mem) + + return gpu_stat + +def get_gpu_status(): + p = subprocess.Popen('sudo nvidia-smi -a', stdout = subprocess.PIPE, + stderr = subprocess.PIPE, shell = True) + (o, e) = p.communicate() + + if not p.returncode == 0: + return '' + + if not o: return '' + + return o diff --git a/pr2_computer_monitor/test/parse_test.py b/pr2_computer_monitor/test/parse_test.py new file mode 100755 index 0000000..e64e639 --- /dev/null +++ b/pr2_computer_monitor/test/parse_test.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python +# +# Software License Agreement (BSD License) +# +# Copyright (c) 2010, Willow Garage, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +##\author Kevin Watts + +from __future__ import with_statement + +PKG = 'pr2_computer_monitor' + +import roslib; roslib.load_manifest(PKG) +import unittest + +import pr2_computer_monitor + +import os, sys + +TEXT_PATH = 'test/sample_output/nvidia_smi_out.txt' +TEXT_HIGH_TEMP_PATH = 'test/sample_output/nvidia_smi_high_temp.txt' + + +##\brief Parses launch, tests.xml and configs.xml files in qualification +class TestNominalParser(unittest.TestCase): + def setUp(self): + with open(os.path.join(roslib.packages.get_pkg_dir('pr2_computer_monitor'), TEXT_PATH), 'r') as f: + self.data = f.read() + + with open(os.path.join(roslib.packages.get_pkg_dir('pr2_computer_monitor'), TEXT_HIGH_TEMP_PATH), 'r') as f: + self.high_temp_data = f.read() + + def test_parse(self): + gpu_stat = pr2_computer_monitor.parse_smi_output(self.data) + + # Check valid + self.assert_(self.data, "Unable to read sample output, no test to run") + + # Check all string fields of message + self.assert_(gpu_stat.pci_device_id, "No PCI Device ID found") + self.assert_(gpu_stat.pci_location, "No PCI Location found") + self.assert_(gpu_stat.display, "No Display found") + self.assert_(gpu_stat.driver_version, "No Driver Version found") + self.assert_(gpu_stat.product_name, "No Product Name found") + + self.assert_(gpu_stat.temperature > 40 and gpu_stat.temperature < 90, "Invalid temperature readings. 
Temperature: %d" % gpu_stat.temperature) + self.assert_(gpu_stat.fan_speed > 0 and gpu_stat.fan_speed < 471, "Invalid fan speed readings. Fan Speed %f" % gpu_stat.fan_speed) + + diag_stat = pr2_computer_monitor.gpu_status_to_diag(gpu_stat) + + self.assert_(diag_stat.level == 0, "Diagnostics reports an error for nominal input. Message: %s" % diag_stat.message) + + def test_high_temp_parse(self): + gpu_stat = pr2_computer_monitor.parse_smi_output(self.high_temp_data) + + # Check valid + self.assert_(self.high_temp_data, "Unable to read sample output, no test to run") + + # Check all string fields of message + self.assert_(gpu_stat.pci_device_id, "No PCI Device ID found") + self.assert_(gpu_stat.pci_location, "No PCI Location found") + self.assert_(gpu_stat.display, "No Display found") + self.assert_(gpu_stat.driver_version, "No Driver Version found") + self.assert_(gpu_stat.product_name, "No Product Name found") + + self.assert_(gpu_stat.temperature > 90, "Invalid temperature readings. Temperature: %d" % gpu_stat.temperature) + self.assert_(gpu_stat.fan_speed > 0 and gpu_stat.fan_speed < 471, "Invalid fan speed readings. Fan Speed %s" % gpu_stat.fan_speed) + + diag_stat = pr2_computer_monitor.gpu_status_to_diag(gpu_stat) + + self.assert_(diag_stat.level == 1, "Diagnostics didn't report warning for high temp input. Level %d, Message: %s" % (diag_stat.level, diag_stat.message)) + + + def test_empty_parse(self): + gpu_stat = pr2_computer_monitor.parse_smi_output('') + + self.assert_(gpu_stat.temperature == 0, "Invalid temperature reading. Should be 0. Reading: %d" % gpu_stat.temperature) + + diag_stat = pr2_computer_monitor.gpu_status_to_diag(gpu_stat) + + self.assert_(diag_stat.level == 2, "Diagnostics didn't reports an error for empty input. 
Level: %d, Message: %s" % (diag_stat.level, diag_stat.message)) + + + +if __name__ == '__main__': + if len(sys.argv) > 1 and sys.argv[1] == '-v': + # Use to run tests verbosly + suite = unittest.TestSuite() + suite.addTest(TestNominalParser('test_parse')) + suite.addTest(TestNominalParser('test_empty_parse')) + suite.addTest(TestNominalParser('test_high_temp_parse')) + + unittest.TextTestRunner(verbosity = 2).run(suite) + else: + import rostest + rostest.unitrun(PKG, 'parse_nominal', TestNominalParser) + + diff --git a/pr2_computer_monitor/test/sample_output/nvidia_smi_high_temp.txt b/pr2_computer_monitor/test/sample_output/nvidia_smi_high_temp.txt new file mode 100644 index 0000000..0290602 --- /dev/null +++ b/pr2_computer_monitor/test/sample_output/nvidia_smi_high_temp.txt @@ -0,0 +1,18 @@ + +==============NVSMI LOG============== + + +Timestamp : Wed Sep 29 10:45:16 2010 + +Driver Version : 260.24 + +GPU 0: + Product Name : Quadro 600 + PCI Device/Vendor ID : df810de + PCI Location ID : 0:3:0 + Display : Connected + Temperature : 92 C + Fan Speed : 90% + Utilization + GPU : 95% + Memory : 22% diff --git a/pr2_computer_monitor/test/sample_output/nvidia_smi_out.txt b/pr2_computer_monitor/test/sample_output/nvidia_smi_out.txt new file mode 100644 index 0000000..0106e1c --- /dev/null +++ b/pr2_computer_monitor/test/sample_output/nvidia_smi_out.txt @@ -0,0 +1,18 @@ + +==============NVSMI LOG============== + + +Timestamp : Wed Sep 29 10:37:16 2010 + +Driver Version : 260.24 + +GPU 0: + Product Name : Quadro 600 + PCI Device/Vendor ID : df810de + PCI Location ID : 0:3:0 + Display : Connected + Temperature : 54 C + Fan Speed : 38% + Utilization + GPU : 0% + Memory : 0% From ecb7ec7e40c1fb7f0dbcc1e905076cc987289e49 Mon Sep 17 00:00:00 2001 From: "Isaac I.Y. 
Saito" <130s@2000.jukuin.keio.ac.jp> Date: Wed, 4 Aug 2021 03:05:58 -0400 Subject: [PATCH 02/18] [capability] Rename from pr2_computer_monitor to computer_hw package --- .../CHANGELOG.rst | 4 ++ .../CMakeLists.txt | 20 +++++----- computer_hw/README.md | 17 ++++++++ .../demo/cpu_monitor.launch | 4 ++ computer_hw/demo/hd_monitor.launch | 10 +++++ computer_hw/demo/ntp_monitor.launch | 4 ++ computer_hw/package.xml | 24 ++++++++++++ .../scripts/README.md | 0 .../scripts/cpu_monitor.py | 4 ++ .../scripts/hd_monitor.py | 4 ++ .../scripts/ntp_monitor.py | 4 ++ .../scripts/nvidia_temp.py | 12 ++++++ .../scripts/wifi_monitor.py | 9 +---- .../setup.py | 4 ++ .../src/computer_monitor}/nvidia_smi_util.py | 4 ++ .../src/network_detector.cpp | 0 .../test/parse_test.py | 39 +++++++++++++++++++ .../sample_output/nvidia_smi_high_temp.txt | 0 .../test/sample_output/nvidia_smi_out.txt | 0 pr2_computer_monitor/INSTALL | 5 --- pr2_computer_monitor/demo/hd_monitor.launch | 6 --- pr2_computer_monitor/demo/ntp_monitor.launch | 4 -- pr2_computer_monitor/package.xml | 30 -------------- 23 files changed, 146 insertions(+), 62 deletions(-) rename {pr2_computer_monitor => computer_hw}/CHANGELOG.rst (85%) rename {pr2_computer_monitor => computer_hw}/CMakeLists.txt (69%) create mode 100644 computer_hw/README.md rename {pr2_computer_monitor => computer_hw}/demo/cpu_monitor.launch (54%) create mode 100644 computer_hw/demo/hd_monitor.launch create mode 100644 computer_hw/demo/ntp_monitor.launch create mode 100644 computer_hw/package.xml rename {pr2_computer_monitor => computer_hw}/scripts/README.md (100%) rename {pr2_computer_monitor => computer_hw}/scripts/cpu_monitor.py (99%) rename {pr2_computer_monitor => computer_hw}/scripts/hd_monitor.py (98%) rename {pr2_computer_monitor => computer_hw}/scripts/ntp_monitor.py (96%) rename {pr2_computer_monitor => computer_hw}/scripts/nvidia_temp.py (81%) rename {pr2_computer_monitor => computer_hw}/scripts/wifi_monitor.py (97%) rename {pr2_computer_monitor => 
computer_hw}/setup.py (60%) rename {pr2_computer_monitor/src/pr2_computer_monitor => computer_hw/src/computer_monitor}/nvidia_smi_util.py (95%) rename {pr2_computer_monitor => computer_hw}/src/network_detector.cpp (100%) rename {pr2_computer_monitor => computer_hw}/test/parse_test.py (71%) rename {pr2_computer_monitor => computer_hw}/test/sample_output/nvidia_smi_high_temp.txt (100%) rename {pr2_computer_monitor => computer_hw}/test/sample_output/nvidia_smi_out.txt (100%) delete mode 100644 pr2_computer_monitor/INSTALL delete mode 100644 pr2_computer_monitor/demo/hd_monitor.launch delete mode 100644 pr2_computer_monitor/demo/ntp_monitor.launch delete mode 100644 pr2_computer_monitor/package.xml diff --git a/pr2_computer_monitor/CHANGELOG.rst b/computer_hw/CHANGELOG.rst similarity index 85% rename from pr2_computer_monitor/CHANGELOG.rst rename to computer_hw/CHANGELOG.rst index 7248f8c..63e66ab 100644 --- a/pr2_computer_monitor/CHANGELOG.rst +++ b/computer_hw/CHANGELOG.rst @@ -1,5 +1,9 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +<<<<<<< HEAD:pr2_computer_monitor/CHANGELOG.rst Changelog for package pr2_computer_monitor +======= +Changelog for package computer_hw +>>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/CHANGELOG.rst ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 1.6.32 (2021-05-26) diff --git a/pr2_computer_monitor/CMakeLists.txt b/computer_hw/CMakeLists.txt similarity index 69% rename from pr2_computer_monitor/CMakeLists.txt rename to computer_hw/CMakeLists.txt index e1d4269..fffd495 100644 --- a/pr2_computer_monitor/CMakeLists.txt +++ b/computer_hw/CMakeLists.txt @@ -1,12 +1,12 @@ # http://ros.org/doc/groovy/api/catkin/html/user_guide/supposed.html cmake_minimum_required(VERSION 2.8.3) -project(pr2_computer_monitor) +project(computer_hw) # Load catkin and all dependencies required for this package # TODO: remove all from COMPONENTS that are not catkin packages. 
find_package(catkin REQUIRED COMPONENTS roscpp std_msgs) if(CATKIN_ENABLE_TESTING) - #catkin_add_nosetests(test/parse_test.py) + catkin_add_nosetests(test/parse_test.py) endif() include_directories(include ${catkin_INCLUDE_DIRS}) @@ -18,14 +18,9 @@ catkin_package( LIBRARIES network_detector# TODO ) - -install(DIRECTORY demo - DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}) - -file(GLOB PYTHON_SCRIPTS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" - "${CMAKE_CURRENT_SOURCE_DIR}/scripts/*") -install(PROGRAMS ${PYTHON_SCRIPTS} - DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}) +catkin_install_python(PROGRAMS + executables/cpu_monitor.py executables/hd_monitor.py executables/ntp_monitor.py executables/ntp_monitor.py executables/nvidia_temp.py executables/wifi_monitor.py + DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}) add_executable(network_detector src/network_detector.cpp) target_link_libraries(network_detector ${catkin_LIBRARIES}) @@ -36,4 +31,9 @@ install(TARGETS network_detector LIBRARY DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION} RUNTIME DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}) +foreach(dir conf) + install(DIRECTORY ${dir}/ + DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}/${dir}) +endforeach(dir) + catkin_python_setup() diff --git a/computer_hw/README.md b/computer_hw/README.md new file mode 100644 index 0000000..d0285ba --- /dev/null +++ b/computer_hw/README.md @@ -0,0 +1,17 @@ +# computer_hw + +## Development history +`computer_hw` package was originally made in [pr2_robot](https://github.com/PR2/pr2_robot) repository. See discussion [pr2_common#286](https://github.com/PR2/pr2_common/issues/286) for the migration. 
+ +## Installation tips +In order for ipmitool to work on computers with a BMC, the following line needs to appear in `/etc/sudoers`: + +``` +ALL ALL=NOPASSWD: /usr/bin/ipmitool sdr type Temperature +``` + +## Usage / Operation +TBD + +EoF + diff --git a/pr2_computer_monitor/demo/cpu_monitor.launch b/computer_hw/demo/cpu_monitor.launch similarity index 54% rename from pr2_computer_monitor/demo/cpu_monitor.launch rename to computer_hw/demo/cpu_monitor.launch index 6566691..f19b907 100644 --- a/pr2_computer_monitor/demo/cpu_monitor.launch +++ b/computer_hw/demo/cpu_monitor.launch @@ -1,5 +1,9 @@ +<<<<<<< HEAD:pr2_computer_monitor/demo/cpu_monitor.launch >>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/demo/cpu_monitor.launch args="--diag-hostname=my_machine" > diff --git a/computer_hw/demo/hd_monitor.launch b/computer_hw/demo/hd_monitor.launch new file mode 100644 index 0000000..4f2f381 --- /dev/null +++ b/computer_hw/demo/hd_monitor.launch @@ -0,0 +1,10 @@ + +<<<<<<< HEAD:pr2_computer_monitor/demo/hd_monitor.launch + >>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/demo/hd_monitor.launch + args="$(optenv HOME /home) --diag-hostname=my_machine" name="hd_monitor" > + + + diff --git a/computer_hw/demo/ntp_monitor.launch b/computer_hw/demo/ntp_monitor.launch new file mode 100644 index 0000000..60d4945 --- /dev/null +++ b/computer_hw/demo/ntp_monitor.launch @@ -0,0 +1,4 @@ + + + diff --git a/computer_hw/package.xml b/computer_hw/package.xml new file mode 100644 index 0000000..354d8b6 --- /dev/null +++ b/computer_hw/package.xml @@ -0,0 +1,24 @@ + + computer_hw + 1.6.31 + Monitors the computer's processor and hard drives of the PR2 and publishes data to diagnostics. Originally taken from pr2_robot repo. + ROS Orphaned Package Maintainers + Isaac I.Y. 
Saito + BSD + + http://www.ros.org/wiki/computer_hw + https://github.com/130s/computer_hw/issues + + Kevin Watts (watts@willowgarage.com) + Isaac Saito + + catkin + + diagnostic_msgs + computer_status_msgs + roscpp + std_msgs + libsensors_monitor + rospy + + diff --git a/pr2_computer_monitor/scripts/README.md b/computer_hw/scripts/README.md similarity index 100% rename from pr2_computer_monitor/scripts/README.md rename to computer_hw/scripts/README.md diff --git a/pr2_computer_monitor/scripts/cpu_monitor.py b/computer_hw/scripts/cpu_monitor.py similarity index 99% rename from pr2_computer_monitor/scripts/cpu_monitor.py rename to computer_hw/scripts/cpu_monitor.py index 486a4aa..b82ce26 100755 --- a/pr2_computer_monitor/scripts/cpu_monitor.py +++ b/computer_hw/scripts/cpu_monitor.py @@ -36,7 +36,11 @@ from __future__ import with_statement import roslib +<<<<<<< HEAD:pr2_computer_monitor/scripts/cpu_monitor.py roslib.load_manifest('pr2_computer_monitor') +======= +roslib.load_manifest('computer_hw') +>>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/scripts/cpu_monitor.py import rospy diff --git a/pr2_computer_monitor/scripts/hd_monitor.py b/computer_hw/scripts/hd_monitor.py similarity index 98% rename from pr2_computer_monitor/scripts/hd_monitor.py rename to computer_hw/scripts/hd_monitor.py index 8c582fa..a9fc301 100755 --- a/pr2_computer_monitor/scripts/hd_monitor.py +++ b/computer_hw/scripts/hd_monitor.py @@ -36,7 +36,11 @@ from __future__ import with_statement import roslib +<<<<<<< HEAD:pr2_computer_monitor/scripts/hd_monitor.py roslib.load_manifest('pr2_computer_monitor') +======= +roslib.load_manifest('computer_hw') +>>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/scripts/hd_monitor.py import rospy diff --git a/pr2_computer_monitor/scripts/ntp_monitor.py b/computer_hw/scripts/ntp_monitor.py similarity index 96% rename from 
pr2_computer_monitor/scripts/ntp_monitor.py rename to computer_hw/scripts/ntp_monitor.py index 1b4b94b..0ce5b95 100755 --- a/pr2_computer_monitor/scripts/ntp_monitor.py +++ b/computer_hw/scripts/ntp_monitor.py @@ -32,7 +32,11 @@ # POSSIBILITY OF SUCH DAMAGE. import roslib +<<<<<<< HEAD:pr2_computer_monitor/scripts/ntp_monitor.py roslib.load_manifest('pr2_computer_monitor') +======= +roslib.load_manifest('computer_hw') +>>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/scripts/ntp_monitor.py from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue diff --git a/pr2_computer_monitor/scripts/nvidia_temp.py b/computer_hw/scripts/nvidia_temp.py similarity index 81% rename from pr2_computer_monitor/scripts/nvidia_temp.py rename to computer_hw/scripts/nvidia_temp.py index 639d554..ac1b39f 100755 --- a/pr2_computer_monitor/scripts/nvidia_temp.py +++ b/computer_hw/scripts/nvidia_temp.py @@ -45,7 +45,11 @@ from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus from pr2_msgs.msg import GPUStatus +<<<<<<< HEAD:pr2_computer_monitor/scripts/nvidia_temp.py import pr2_computer_monitor +======= +import computer_hw +>>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/scripts/nvidia_temp.py class NVidiaTempMonitor(object): def __init__(self): @@ -56,10 +60,18 @@ def pub_status(self): gpu_stat = GPUStatus() stat = DiagnosticStatus() try: +<<<<<<< HEAD:pr2_computer_monitor/scripts/nvidia_temp.py card_out = pr2_computer_monitor.get_gpu_status() gpu_stat = pr2_computer_monitor.parse_smi_output(card_out) stat = pr2_computer_monitor.gpu_status_to_diag(gpu_stat) except Exception as e: +======= + card_out = computer_hw.get_gpu_status() + gpu_stat = computer_hw.parse_smi_output(card_out) + stat = computer_hw.gpu_status_to_diag(gpu_stat) + rospy.loginfo("card_out: {}\ngpu_stat: {}\n".format(card_out, gpu_stat)) + except Exception, e: 
+>>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/scripts/nvidia_temp.py import traceback rospy.logerr('Unable to process nVidia GPU data') rospy.logerr(traceback.format_exc()) diff --git a/pr2_computer_monitor/scripts/wifi_monitor.py b/computer_hw/scripts/wifi_monitor.py similarity index 97% rename from pr2_computer_monitor/scripts/wifi_monitor.py rename to computer_hw/scripts/wifi_monitor.py index ce23d40..4072e01 100755 --- a/pr2_computer_monitor/scripts/wifi_monitor.py +++ b/computer_hw/scripts/wifi_monitor.py @@ -37,18 +37,13 @@ from __future__ import with_statement -PKG = 'pr2_computer_monitor' -import roslib -roslib.load_manifest(PKG) - +from computer_status_msgs.msg import AccessPoint +from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue import rospy import threading import sys -from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue -from pr2_msgs.msg import AccessPoint - DIAG_NAME = 'Wifi Status (ddwrt)' WARN_TIME = 30 ERROR_TIME = 60 diff --git a/pr2_computer_monitor/setup.py b/computer_hw/setup.py similarity index 60% rename from pr2_computer_monitor/setup.py rename to computer_hw/setup.py index a00b195..f34fc97 100644 --- a/pr2_computer_monitor/setup.py +++ b/computer_hw/setup.py @@ -5,7 +5,11 @@ # fetch values from package.xml setup_args = generate_distutils_setup( +<<<<<<< HEAD:pr2_computer_monitor/setup.py packages=['pr2_computer_monitor'], +======= + packages=['computer_hw'], +>>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/setup.py package_dir={'': 'src'}) setup(**setup_args) diff --git a/pr2_computer_monitor/src/pr2_computer_monitor/nvidia_smi_util.py b/computer_hw/src/computer_monitor/nvidia_smi_util.py similarity index 95% rename from pr2_computer_monitor/src/pr2_computer_monitor/nvidia_smi_util.py rename to computer_hw/src/computer_monitor/nvidia_smi_util.py index 
b2b5858..df48927 100644 --- a/pr2_computer_monitor/src/pr2_computer_monitor/nvidia_smi_util.py +++ b/computer_hw/src/computer_monitor/nvidia_smi_util.py @@ -36,7 +36,11 @@ from __future__ import division +<<<<<<< HEAD:pr2_computer_monitor/src/pr2_computer_monitor/nvidia_smi_util.py PKG = 'pr2_computer_monitor' +======= +PKG = 'computer_hw' +>>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/src/computer_monitor/nvidia_smi_util.py import roslib; roslib.load_manifest(PKG) import rospy diff --git a/pr2_computer_monitor/src/network_detector.cpp b/computer_hw/src/network_detector.cpp similarity index 100% rename from pr2_computer_monitor/src/network_detector.cpp rename to computer_hw/src/network_detector.cpp diff --git a/pr2_computer_monitor/test/parse_test.py b/computer_hw/test/parse_test.py similarity index 71% rename from pr2_computer_monitor/test/parse_test.py rename to computer_hw/test/parse_test.py index e64e639..a4a7611 100755 --- a/pr2_computer_monitor/test/parse_test.py +++ b/computer_hw/test/parse_test.py @@ -36,12 +36,20 @@ from __future__ import with_statement +<<<<<<< HEAD:pr2_computer_monitor/test/parse_test.py PKG = 'pr2_computer_monitor' +======= +PKG = 'computer_hw' +>>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/test/parse_test.py import roslib; roslib.load_manifest(PKG) import unittest +<<<<<<< HEAD:pr2_computer_monitor/test/parse_test.py import pr2_computer_monitor +======= +import computer_hw +>>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/test/parse_test.py import os, sys @@ -52,6 +60,7 @@ ##\brief Parses launch, tests.xml and configs.xml files in qualification class TestNominalParser(unittest.TestCase): def setUp(self): +<<<<<<< HEAD:pr2_computer_monitor/test/parse_test.py with open(os.path.join(roslib.packages.get_pkg_dir('pr2_computer_monitor'), 
TEXT_PATH), 'r') as f: self.data = f.read() @@ -60,6 +69,16 @@ def setUp(self): def test_parse(self): gpu_stat = pr2_computer_monitor.parse_smi_output(self.data) +======= + with open(os.path.join(roslib.packages.get_pkg_dir('computer_hw'), TEXT_PATH), 'r') as f: + self.data = f.read() + + with open(os.path.join(roslib.packages.get_pkg_dir('computer_hw'), TEXT_HIGH_TEMP_PATH), 'r') as f: + self.high_temp_data = f.read() + + def test_parse(self): + gpu_stat = computer_hw.parse_smi_output(self.data) +>>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/test/parse_test.py # Check valid self.assert_(self.data, "Unable to read sample output, no test to run") @@ -74,12 +93,20 @@ def test_parse(self): self.assert_(gpu_stat.temperature > 40 and gpu_stat.temperature < 90, "Invalid temperature readings. Temperature: %d" % gpu_stat.temperature) self.assert_(gpu_stat.fan_speed > 0 and gpu_stat.fan_speed < 471, "Invalid fan speed readings. Fan Speed %f" % gpu_stat.fan_speed) +<<<<<<< HEAD:pr2_computer_monitor/test/parse_test.py diag_stat = pr2_computer_monitor.gpu_status_to_diag(gpu_stat) +======= + diag_stat = computer_hw.gpu_status_to_diag(gpu_stat) +>>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/test/parse_test.py self.assert_(diag_stat.level == 0, "Diagnostics reports an error for nominal input. 
Message: %s" % diag_stat.message) def test_high_temp_parse(self): +<<<<<<< HEAD:pr2_computer_monitor/test/parse_test.py gpu_stat = pr2_computer_monitor.parse_smi_output(self.high_temp_data) +======= + gpu_stat = computer_hw.parse_smi_output(self.high_temp_data) +>>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/test/parse_test.py # Check valid self.assert_(self.high_temp_data, "Unable to read sample output, no test to run") @@ -94,17 +121,29 @@ def test_high_temp_parse(self): self.assert_(gpu_stat.temperature > 90, "Invalid temperature readings. Temperature: %d" % gpu_stat.temperature) self.assert_(gpu_stat.fan_speed > 0 and gpu_stat.fan_speed < 471, "Invalid fan speed readings. Fan Speed %s" % gpu_stat.fan_speed) +<<<<<<< HEAD:pr2_computer_monitor/test/parse_test.py diag_stat = pr2_computer_monitor.gpu_status_to_diag(gpu_stat) +======= + diag_stat = computer_hw.gpu_status_to_diag(gpu_stat) +>>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/test/parse_test.py self.assert_(diag_stat.level == 1, "Diagnostics didn't report warning for high temp input. Level %d, Message: %s" % (diag_stat.level, diag_stat.message)) def test_empty_parse(self): +<<<<<<< HEAD:pr2_computer_monitor/test/parse_test.py gpu_stat = pr2_computer_monitor.parse_smi_output('') self.assert_(gpu_stat.temperature == 0, "Invalid temperature reading. Should be 0. Reading: %d" % gpu_stat.temperature) diag_stat = pr2_computer_monitor.gpu_status_to_diag(gpu_stat) +======= + gpu_stat = computer_hw.parse_smi_output('') + + self.assert_(gpu_stat.temperature == 0, "Invalid temperature reading. Should be 0. 
Reading: %d" % gpu_stat.temperature) + + diag_stat = computer_hw.gpu_status_to_diag(gpu_stat) +>>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/test/parse_test.py self.assert_(diag_stat.level == 2, "Diagnostics didn't reports an error for empty input. Level: %d, Message: %s" % (diag_stat.level, diag_stat.message)) diff --git a/pr2_computer_monitor/test/sample_output/nvidia_smi_high_temp.txt b/computer_hw/test/sample_output/nvidia_smi_high_temp.txt similarity index 100% rename from pr2_computer_monitor/test/sample_output/nvidia_smi_high_temp.txt rename to computer_hw/test/sample_output/nvidia_smi_high_temp.txt diff --git a/pr2_computer_monitor/test/sample_output/nvidia_smi_out.txt b/computer_hw/test/sample_output/nvidia_smi_out.txt similarity index 100% rename from pr2_computer_monitor/test/sample_output/nvidia_smi_out.txt rename to computer_hw/test/sample_output/nvidia_smi_out.txt diff --git a/pr2_computer_monitor/INSTALL b/pr2_computer_monitor/INSTALL deleted file mode 100644 index df31116..0000000 --- a/pr2_computer_monitor/INSTALL +++ /dev/null @@ -1,5 +0,0 @@ -In order for ipmitool to work on computers with a BMC, the following line -needs to appear in /etc/sudoers: - -ALL ALL=NOPASSWD: /usr/bin/ipmitool sdr type Temperature - diff --git a/pr2_computer_monitor/demo/hd_monitor.launch b/pr2_computer_monitor/demo/hd_monitor.launch deleted file mode 100644 index 9a73f83..0000000 --- a/pr2_computer_monitor/demo/hd_monitor.launch +++ /dev/null @@ -1,6 +0,0 @@ - - - - - diff --git a/pr2_computer_monitor/demo/ntp_monitor.launch b/pr2_computer_monitor/demo/ntp_monitor.launch deleted file mode 100644 index 5fefeae..0000000 --- a/pr2_computer_monitor/demo/ntp_monitor.launch +++ /dev/null @@ -1,4 +0,0 @@ - - - diff --git a/pr2_computer_monitor/package.xml b/pr2_computer_monitor/package.xml deleted file mode 100644 index 2146f07..0000000 --- a/pr2_computer_monitor/package.xml +++ /dev/null @@ -1,30 +0,0 @@ - 
- pr2_computer_monitor - 1.6.32 - Monitors the computer's processor and hard drives of the PR2 and publishes data to diagnostics. - Dave Feil-Seifer - - BSD - - http://www.ros.org/wiki/pr2_computer_monitor - - - Kevin Watts (watts@willowgarage.com) - - catkin - - diagnostic_msgs - pr2_msgs - roscpp - std_msgs - - diagnostic_msgs - rospy - pr2_msgs - roscpp - std_msgs - - - - - From 3331e7fa499d1608e81e3e122870c2107da72d63 Mon Sep 17 00:00:00 2001 From: "Isaac I.Y. Saito" <130s@2000.jukuin.keio.ac.jp> Date: Wed, 4 Aug 2021 03:10:16 -0400 Subject: [PATCH 03/18] [capability] Add CPU monitoring --- computer_hw/conf/cpu_monitor.launch | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 computer_hw/conf/cpu_monitor.launch diff --git a/computer_hw/conf/cpu_monitor.launch b/computer_hw/conf/cpu_monitor.launch new file mode 100644 index 0000000..ef54d84 --- /dev/null +++ b/computer_hw/conf/cpu_monitor.launch @@ -0,0 +1,3 @@ + + + \ No newline at end of file From ba38cc304a24368fa85c686f4c5c03f480b5fa78 Mon Sep 17 00:00:00 2001 From: "Isaac I.Y. Saito" <130s@2000.jukuin.keio.ac.jp> Date: Wed, 4 Aug 2021 03:20:05 -0400 Subject: [PATCH 04/18] [computer_hw][maintenance] Rename scripts to executable, for more genericity. Move all .launch files to conf folder. 
--- computer_hw/CHANGELOG.rst | 4 -- computer_hw/conf/cpu_monitor.launch | 9 ++++- computer_hw/conf/hd_monitor.launch | 6 +++ computer_hw/conf/monitor.launch | 6 +++ computer_hw/{demo => conf}/ntp_monitor.launch | 0 computer_hw/demo/cpu_monitor.launch | 4 -- computer_hw/demo/hd_monitor.launch | 10 ----- .../{scripts => executables}/README.md | 0 .../{scripts => executables}/cpu_monitor.py | 6 --- .../{scripts => executables}/hd_monitor.py | 6 --- .../{scripts => executables}/ntp_monitor.py | 7 ---- .../{scripts => executables}/nvidia_temp.py | 26 ++++-------- .../{scripts => executables}/wifi_monitor.py | 3 +- computer_hw/setup.py | 4 -- computer_hw/src/computer_hw/__init__.py | 0 .../nvidia_smi_util.py | 16 +++----- computer_hw/test/parse_test.py | 40 ------------------- 17 files changed, 32 insertions(+), 115 deletions(-) create mode 100644 computer_hw/conf/hd_monitor.launch create mode 100644 computer_hw/conf/monitor.launch rename computer_hw/{demo => conf}/ntp_monitor.launch (100%) delete mode 100644 computer_hw/demo/hd_monitor.launch rename computer_hw/{scripts => executables}/README.md (100%) rename computer_hw/{scripts => executables}/cpu_monitor.py (99%) rename computer_hw/{scripts => executables}/hd_monitor.py (98%) rename computer_hw/{scripts => executables}/ntp_monitor.py (95%) rename computer_hw/{scripts => executables}/nvidia_temp.py (75%) rename computer_hw/{scripts => executables}/wifi_monitor.py (99%) create mode 100644 computer_hw/src/computer_hw/__init__.py rename computer_hw/src/{computer_monitor => computer_hw}/nvidia_smi_util.py (93%) diff --git a/computer_hw/CHANGELOG.rst b/computer_hw/CHANGELOG.rst index 63e66ab..fd3df3d 100644 --- a/computer_hw/CHANGELOG.rst +++ b/computer_hw/CHANGELOG.rst @@ -1,9 +1,5 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -<<<<<<< HEAD:pr2_computer_monitor/CHANGELOG.rst -Changelog for package pr2_computer_monitor -======= Changelog for package computer_hw ->>>>>>> d60763b ([capability] Add computer_hw package that 
is previously called _pr2_computer_monitor):computer_hw/CHANGELOG.rst ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 1.6.32 (2021-05-26) diff --git a/computer_hw/conf/cpu_monitor.launch b/computer_hw/conf/cpu_monitor.launch index ef54d84..b661581 100644 --- a/computer_hw/conf/cpu_monitor.launch +++ b/computer_hw/conf/cpu_monitor.launch @@ -1,3 +1,8 @@ - - \ No newline at end of file + + + + + + diff --git a/computer_hw/conf/hd_monitor.launch b/computer_hw/conf/hd_monitor.launch new file mode 100644 index 0000000..680bc48 --- /dev/null +++ b/computer_hw/conf/hd_monitor.launch @@ -0,0 +1,6 @@ + + + + + diff --git a/computer_hw/conf/monitor.launch b/computer_hw/conf/monitor.launch new file mode 100644 index 0000000..32ccc87 --- /dev/null +++ b/computer_hw/conf/monitor.launch @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/computer_hw/demo/ntp_monitor.launch b/computer_hw/conf/ntp_monitor.launch similarity index 100% rename from computer_hw/demo/ntp_monitor.launch rename to computer_hw/conf/ntp_monitor.launch diff --git a/computer_hw/demo/cpu_monitor.launch b/computer_hw/demo/cpu_monitor.launch index f19b907..b661581 100644 --- a/computer_hw/demo/cpu_monitor.launch +++ b/computer_hw/demo/cpu_monitor.launch @@ -1,9 +1,5 @@ -<<<<<<< HEAD:pr2_computer_monitor/demo/cpu_monitor.launch - >>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/demo/cpu_monitor.launch args="--diag-hostname=my_machine" > diff --git a/computer_hw/demo/hd_monitor.launch b/computer_hw/demo/hd_monitor.launch deleted file mode 100644 index 4f2f381..0000000 --- a/computer_hw/demo/hd_monitor.launch +++ /dev/null @@ -1,10 +0,0 @@ - -<<<<<<< HEAD:pr2_computer_monitor/demo/hd_monitor.launch - >>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/demo/hd_monitor.launch - args="$(optenv HOME /home) --diag-hostname=my_machine" name="hd_monitor" > - - - diff --git 
a/computer_hw/scripts/README.md b/computer_hw/executables/README.md similarity index 100% rename from computer_hw/scripts/README.md rename to computer_hw/executables/README.md diff --git a/computer_hw/scripts/cpu_monitor.py b/computer_hw/executables/cpu_monitor.py similarity index 99% rename from computer_hw/scripts/cpu_monitor.py rename to computer_hw/executables/cpu_monitor.py index b82ce26..c4221ea 100755 --- a/computer_hw/scripts/cpu_monitor.py +++ b/computer_hw/executables/cpu_monitor.py @@ -35,12 +35,6 @@ ##\author Kevin Watts from __future__ import with_statement -import roslib -<<<<<<< HEAD:pr2_computer_monitor/scripts/cpu_monitor.py -roslib.load_manifest('pr2_computer_monitor') -======= -roslib.load_manifest('computer_hw') ->>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/scripts/cpu_monitor.py import rospy diff --git a/computer_hw/scripts/hd_monitor.py b/computer_hw/executables/hd_monitor.py similarity index 98% rename from computer_hw/scripts/hd_monitor.py rename to computer_hw/executables/hd_monitor.py index a9fc301..3a8854c 100755 --- a/computer_hw/scripts/hd_monitor.py +++ b/computer_hw/executables/hd_monitor.py @@ -35,12 +35,6 @@ ##\author Kevin Watts from __future__ import with_statement -import roslib -<<<<<<< HEAD:pr2_computer_monitor/scripts/hd_monitor.py -roslib.load_manifest('pr2_computer_monitor') -======= -roslib.load_manifest('computer_hw') ->>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/scripts/hd_monitor.py import rospy diff --git a/computer_hw/scripts/ntp_monitor.py b/computer_hw/executables/ntp_monitor.py similarity index 95% rename from computer_hw/scripts/ntp_monitor.py rename to computer_hw/executables/ntp_monitor.py index 0ce5b95..51bcf36 100755 --- a/computer_hw/scripts/ntp_monitor.py +++ b/computer_hw/executables/ntp_monitor.py @@ -31,13 +31,6 @@ # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 
ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. -import roslib -<<<<<<< HEAD:pr2_computer_monitor/scripts/ntp_monitor.py -roslib.load_manifest('pr2_computer_monitor') -======= -roslib.load_manifest('computer_hw') ->>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/scripts/ntp_monitor.py - from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue import sys diff --git a/computer_hw/scripts/nvidia_temp.py b/computer_hw/executables/nvidia_temp.py similarity index 75% rename from computer_hw/scripts/nvidia_temp.py rename to computer_hw/executables/nvidia_temp.py index ac1b39f..ebe3db8 100755 --- a/computer_hw/scripts/nvidia_temp.py +++ b/computer_hw/executables/nvidia_temp.py @@ -37,19 +37,14 @@ from __future__ import with_statement, division -PKG = 'pr2_computer_monitor' -import roslib; roslib.load_manifest(PKG) - +from computer_status_msgs.msg import GPUStatus +from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus import rospy -from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus -from pr2_msgs.msg import GPUStatus +from computer_hw.nvidia_smi_util import get_gpu_status, parse_smi_output, gpu_status_to_diag -<<<<<<< HEAD:pr2_computer_monitor/scripts/nvidia_temp.py -import pr2_computer_monitor -======= import computer_hw ->>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/scripts/nvidia_temp.py + class NVidiaTempMonitor(object): def __init__(self): @@ -60,18 +55,11 @@ def pub_status(self): gpu_stat = GPUStatus() stat = DiagnosticStatus() try: -<<<<<<< HEAD:pr2_computer_monitor/scripts/nvidia_temp.py - card_out = pr2_computer_monitor.get_gpu_status() - gpu_stat = pr2_computer_monitor.parse_smi_output(card_out) - stat = pr2_computer_monitor.gpu_status_to_diag(gpu_stat) - except Exception as e: -======= - card_out = computer_hw.get_gpu_status() - gpu_stat = computer_hw.parse_smi_output(card_out) - stat = 
computer_hw.gpu_status_to_diag(gpu_stat) + card_out = get_gpu_status() + gpu_stat = parse_smi_output(card_out) + stat = gpu_status_to_diag(gpu_stat) rospy.loginfo("card_out: {}\ngpu_stat: {}\n".format(card_out, gpu_stat)) except Exception, e: ->>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/scripts/nvidia_temp.py import traceback rospy.logerr('Unable to process nVidia GPU data') rospy.logerr(traceback.format_exc()) diff --git a/computer_hw/scripts/wifi_monitor.py b/computer_hw/executables/wifi_monitor.py similarity index 99% rename from computer_hw/scripts/wifi_monitor.py rename to computer_hw/executables/wifi_monitor.py index 4072e01..8fd6406 100755 --- a/computer_hw/scripts/wifi_monitor.py +++ b/computer_hw/executables/wifi_monitor.py @@ -40,9 +40,8 @@ from computer_status_msgs.msg import AccessPoint from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue import rospy - -import threading import sys +import threading DIAG_NAME = 'Wifi Status (ddwrt)' WARN_TIME = 30 diff --git a/computer_hw/setup.py b/computer_hw/setup.py index f34fc97..821027e 100644 --- a/computer_hw/setup.py +++ b/computer_hw/setup.py @@ -5,11 +5,7 @@ # fetch values from package.xml setup_args = generate_distutils_setup( -<<<<<<< HEAD:pr2_computer_monitor/setup.py - packages=['pr2_computer_monitor'], -======= packages=['computer_hw'], ->>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/setup.py package_dir={'': 'src'}) setup(**setup_args) diff --git a/computer_hw/src/computer_hw/__init__.py b/computer_hw/src/computer_hw/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/computer_hw/src/computer_monitor/nvidia_smi_util.py b/computer_hw/src/computer_hw/nvidia_smi_util.py similarity index 93% rename from computer_hw/src/computer_monitor/nvidia_smi_util.py rename to computer_hw/src/computer_hw/nvidia_smi_util.py index 
df48927..41c47c4 100644 --- a/computer_hw/src/computer_monitor/nvidia_smi_util.py +++ b/computer_hw/src/computer_hw/nvidia_smi_util.py @@ -36,20 +36,14 @@ from __future__ import division -<<<<<<< HEAD:pr2_computer_monitor/src/pr2_computer_monitor/nvidia_smi_util.py -PKG = 'pr2_computer_monitor' -======= -PKG = 'computer_hw' ->>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/src/computer_monitor/nvidia_smi_util.py -import roslib; roslib.load_manifest(PKG) - -import rospy - +from computer_status_msgs.msg import GPUStatus from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue -from pr2_msgs.msg import GPUStatus +from computer_status_msgs.msg import GPUStatus -import subprocess import math +import rospy +import subprocess + MAX_FAN_RPM = 4500 diff --git a/computer_hw/test/parse_test.py b/computer_hw/test/parse_test.py index a4a7611..287ade5 100755 --- a/computer_hw/test/parse_test.py +++ b/computer_hw/test/parse_test.py @@ -36,20 +36,11 @@ from __future__ import with_statement -<<<<<<< HEAD:pr2_computer_monitor/test/parse_test.py -PKG = 'pr2_computer_monitor' -======= PKG = 'computer_hw' ->>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/test/parse_test.py -import roslib; roslib.load_manifest(PKG) import unittest -<<<<<<< HEAD:pr2_computer_monitor/test/parse_test.py -import pr2_computer_monitor -======= import computer_hw ->>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/test/parse_test.py import os, sys @@ -60,16 +51,6 @@ ##\brief Parses launch, tests.xml and configs.xml files in qualification class TestNominalParser(unittest.TestCase): def setUp(self): -<<<<<<< HEAD:pr2_computer_monitor/test/parse_test.py - with open(os.path.join(roslib.packages.get_pkg_dir('pr2_computer_monitor'), TEXT_PATH), 'r') as f: - self.data = f.read() - - with 
open(os.path.join(roslib.packages.get_pkg_dir('pr2_computer_monitor'), TEXT_HIGH_TEMP_PATH), 'r') as f: - self.high_temp_data = f.read() - - def test_parse(self): - gpu_stat = pr2_computer_monitor.parse_smi_output(self.data) -======= with open(os.path.join(roslib.packages.get_pkg_dir('computer_hw'), TEXT_PATH), 'r') as f: self.data = f.read() @@ -78,7 +59,6 @@ def test_parse(self): def test_parse(self): gpu_stat = computer_hw.parse_smi_output(self.data) ->>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/test/parse_test.py # Check valid self.assert_(self.data, "Unable to read sample output, no test to run") @@ -93,20 +73,12 @@ def test_parse(self): self.assert_(gpu_stat.temperature > 40 and gpu_stat.temperature < 90, "Invalid temperature readings. Temperature: %d" % gpu_stat.temperature) self.assert_(gpu_stat.fan_speed > 0 and gpu_stat.fan_speed < 471, "Invalid fan speed readings. Fan Speed %f" % gpu_stat.fan_speed) -<<<<<<< HEAD:pr2_computer_monitor/test/parse_test.py - diag_stat = pr2_computer_monitor.gpu_status_to_diag(gpu_stat) -======= diag_stat = computer_hw.gpu_status_to_diag(gpu_stat) ->>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/test/parse_test.py self.assert_(diag_stat.level == 0, "Diagnostics reports an error for nominal input. 
Message: %s" % diag_stat.message) def test_high_temp_parse(self): -<<<<<<< HEAD:pr2_computer_monitor/test/parse_test.py - gpu_stat = pr2_computer_monitor.parse_smi_output(self.high_temp_data) -======= gpu_stat = computer_hw.parse_smi_output(self.high_temp_data) ->>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/test/parse_test.py # Check valid self.assert_(self.high_temp_data, "Unable to read sample output, no test to run") @@ -121,29 +93,17 @@ def test_high_temp_parse(self): self.assert_(gpu_stat.temperature > 90, "Invalid temperature readings. Temperature: %d" % gpu_stat.temperature) self.assert_(gpu_stat.fan_speed > 0 and gpu_stat.fan_speed < 471, "Invalid fan speed readings. Fan Speed %s" % gpu_stat.fan_speed) -<<<<<<< HEAD:pr2_computer_monitor/test/parse_test.py - diag_stat = pr2_computer_monitor.gpu_status_to_diag(gpu_stat) -======= diag_stat = computer_hw.gpu_status_to_diag(gpu_stat) ->>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/test/parse_test.py self.assert_(diag_stat.level == 1, "Diagnostics didn't report warning for high temp input. Level %d, Message: %s" % (diag_stat.level, diag_stat.message)) def test_empty_parse(self): -<<<<<<< HEAD:pr2_computer_monitor/test/parse_test.py - gpu_stat = pr2_computer_monitor.parse_smi_output('') - - self.assert_(gpu_stat.temperature == 0, "Invalid temperature reading. Should be 0. Reading: %d" % gpu_stat.temperature) - - diag_stat = pr2_computer_monitor.gpu_status_to_diag(gpu_stat) -======= gpu_stat = computer_hw.parse_smi_output('') self.assert_(gpu_stat.temperature == 0, "Invalid temperature reading. Should be 0. 
Reading: %d" % gpu_stat.temperature) diag_stat = computer_hw.gpu_status_to_diag(gpu_stat) ->>>>>>> d60763b ([capability] Add computer_hw package that is previously called _pr2_computer_monitor):computer_hw/test/parse_test.py self.assert_(diag_stat.level == 2, "Diagnostics didn't reports an error for empty input. Level: %d, Message: %s" % (diag_stat.level, diag_stat.message)) From 3eb71ef6795050db94a0d69e27c51796060e0e75 Mon Sep 17 00:00:00 2001 From: "Isaac I.Y. Saito" <130s@2000.jukuin.keio.ac.jp> Date: Sat, 7 Aug 2021 02:28:49 -0400 Subject: [PATCH 05/18] [computer_hw][maintenance] Refactoring: Move a class from executable to module. --- computer_hw/executables/nvidia_temp.py | 37 +---------- .../computer_hw/nvidia_temperature_monitor.py | 66 +++++++++++++++++++ 2 files changed, 67 insertions(+), 36 deletions(-) create mode 100644 computer_hw/src/computer_hw/nvidia_temperature_monitor.py diff --git a/computer_hw/executables/nvidia_temp.py b/computer_hw/executables/nvidia_temp.py index ebe3db8..c0f511b 100755 --- a/computer_hw/executables/nvidia_temp.py +++ b/computer_hw/executables/nvidia_temp.py @@ -37,42 +37,9 @@ from __future__ import with_statement, division -from computer_status_msgs.msg import GPUStatus -from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus import rospy -from computer_hw.nvidia_smi_util import get_gpu_status, parse_smi_output, gpu_status_to_diag - -import computer_hw - - -class NVidiaTempMonitor(object): - def __init__(self): - self._pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10) - self._gpu_pub = rospy.Publisher('gpu_status', GPUStatus, queue_size=10) - - def pub_status(self): - gpu_stat = GPUStatus() - stat = DiagnosticStatus() - try: - card_out = get_gpu_status() - gpu_stat = parse_smi_output(card_out) - stat = gpu_status_to_diag(gpu_stat) - rospy.loginfo("card_out: {}\ngpu_stat: {}\n".format(card_out, gpu_stat)) - except Exception, e: - import traceback - rospy.logerr('Unable to process nVidia GPU 
data') - rospy.logerr(traceback.format_exc()) - - gpu_stat.header.stamp = rospy.get_rostime() - - array = DiagnosticArray() - array.header.stamp = rospy.get_rostime() - - array.status = [ stat ] - - self._pub.publish(array) - self._gpu_pub.publish(gpu_stat) +from computer_hw.nvidia_temperature_monitor import NVidiaTempMonitor if __name__ == '__main__': rospy.init_node('nvidia_temp_monitor') @@ -82,5 +49,3 @@ def pub_status(self): while not rospy.is_shutdown(): monitor.pub_status() my_rate.sleep() - - diff --git a/computer_hw/src/computer_hw/nvidia_temperature_monitor.py b/computer_hw/src/computer_hw/nvidia_temperature_monitor.py new file mode 100644 index 0000000..90276e4 --- /dev/null +++ b/computer_hw/src/computer_hw/nvidia_temperature_monitor.py @@ -0,0 +1,66 @@ +# Software License Agreement (BSD License) +# +# Copyright (c) 2010, Willow Garage, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from computer_status_msgs.msg import GPUStatus +from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus +import rospy + +from computer_hw.nvidia_smi_util import get_gpu_status, parse_smi_output, gpu_status_to_diag + + +class NVidiaTempMonitor(object): + def __init__(self): + self._pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10) + self._gpu_pub = rospy.Publisher('gpu_status', GPUStatus, queue_size=10) + + def pub_status(self): + gpu_stat = GPUStatus() + stat = DiagnosticStatus() + try: + card_out = get_gpu_status() + gpu_stat = parse_smi_output(card_out) + stat = gpu_status_to_diag(gpu_stat) + rospy.loginfo("card_out: {}\ngpu_stat: {}\n".format(card_out, gpu_stat)) + except Exception, e: + import traceback + rospy.logerr('Unable to process nVidia GPU data') + rospy.logerr(traceback.format_exc()) + + gpu_stat.header.stamp = rospy.get_rostime() + + array = DiagnosticArray() + array.header.stamp = rospy.get_rostime() + + array.status = [ stat ] + + self._pub.publish(array) + self._gpu_pub.publish(gpu_stat) From 647228d8eb6c1b3003525e3034006007747e81f3 Mon Sep 17 00:00:00 2001 From: "Isaac I.Y. 
Saito" <130s@2000.jukuin.keio.ac.jp> Date: Sat, 7 Aug 2021 02:48:43 -0400 Subject: [PATCH 06/18] [computer_hw][fix] GPU temperature not detected (fix https://github.com/130s/computer_monitor/issues/1) --- computer_hw/src/computer_hw/nvidia_smi_util.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/computer_hw/src/computer_hw/nvidia_smi_util.py b/computer_hw/src/computer_hw/nvidia_smi_util.py index 41c47c4..cae2028 100644 --- a/computer_hw/src/computer_hw/nvidia_smi_util.py +++ b/computer_hw/src/computer_hw/nvidia_smi_util.py @@ -126,10 +126,13 @@ def parse_smi_output(output): gpu_stat.display = _find_val(output, 'Display') gpu_stat.driver_version = _find_val(output, 'Driver Version') - temp_str = _find_val(output, 'Temperature') - if temp_str: - temp, units = temp_str.split() - gpu_stat.temperature = int(temp) + TEMPERATURE_QUERIES = ["Temperature", "GPU Current Temp"] + for query in TEMPERATURE_QUERIES: + temp_str = _find_val(output, query) + if temp_str: + temp, units = temp_str.split() + gpu_stat.temperature = int(temp) + break fan_str = _find_val(output, 'Fan Speed') if fan_str: From 0091fdb466c6bc5ae4fbbcfcce118a49eed73855 Mon Sep 17 00:00:00 2001 From: "Isaac I.Y. 
Saito" <130s@2000.jukuin.keio.ac.jp> Date: Mon, 9 Aug 2021 12:33:53 -0400 Subject: [PATCH 07/18] [computer_hw][capability] '/diagnostics_agg' topic not being published so that diagnostics analyzer and other downstream utils ignore --- computer_hw/conf/monitor.launch | 4 ++++ computer_hw/package.xml | 1 + 2 files changed, 5 insertions(+) diff --git a/computer_hw/conf/monitor.launch b/computer_hw/conf/monitor.launch index 32ccc87..38e8c3d 100644 --- a/computer_hw/conf/monitor.launch +++ b/computer_hw/conf/monitor.launch @@ -3,4 +3,8 @@ + + + + \ No newline at end of file diff --git a/computer_hw/package.xml b/computer_hw/package.xml index 354d8b6..30dc378 100644 --- a/computer_hw/package.xml +++ b/computer_hw/package.xml @@ -18,6 +18,7 @@ computer_status_msgs roscpp std_msgs + diagnostic_aggregator libsensors_monitor rospy From 3d65d9d0774ac3e68075f0c97479b33d2601a333 Mon Sep 17 00:00:00 2001 From: "Isaac I.Y. Saito" <130s@2000.jukuin.keio.ac.jp> Date: Sat, 18 Sep 2021 00:05:24 -0400 Subject: [PATCH 08/18] WIP: [fix] GPU status not captured when running from Docker (or potentially any container). --- computer_hw/src/computer_hw/nvidia_smi_util.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/computer_hw/src/computer_hw/nvidia_smi_util.py b/computer_hw/src/computer_hw/nvidia_smi_util.py index cae2028..22670ef 100644 --- a/computer_hw/src/computer_hw/nvidia_smi_util.py +++ b/computer_hw/src/computer_hw/nvidia_smi_util.py @@ -152,9 +152,21 @@ def parse_smi_output(output): gpu_stat.memory_usage = int(mem) return gpu_stat - + def get_gpu_status(): - p = subprocess.Popen('sudo nvidia-smi -a', stdout = subprocess.PIPE, + """ + @summary: Relying on a command on the host 'nvidia-smi'. 
+ + Regarding 'nvidia-smi', some people believe that it at least needs to be run by 'root' + for the first invocation https://serverfault.com/questions/975859/nvidia-smi-must-be-run-by-root-before-it-can-be-used-by-regular-users, + but it seems to be working without initial invocation. + @todo: OpenQuetion-1: When this method is invoked from a container where + nvidia-smi, which is typically available on a host, is not easily + available. -> For docker, passing '--runtime=nvidia' enables the cmd + from a container. Then show warning when unavailable. + @todo: OpenQuetion-2: What if the cmd 'nvidia-smi' is not available? + """ + p = subprocess.Popen('nvidia-smi -a', stdout = subprocess.PIPE, stderr = subprocess.PIPE, shell = True) (o, e) = p.communicate() From 5d3d6fca8e242a85013bfbbce60480a4ee0aba27 Mon Sep 17 00:00:00 2001 From: "Isaac I.Y. Saito" <130s@2000.jukuin.keio.ac.jp> Date: Sat, 18 Sep 2021 00:30:13 -0400 Subject: [PATCH 09/18] [improve] Add test case and data for nvidia-smi --- computer_hw/test/parse_test.py | 2 +- .../test/sample_output/nvidia_smi_out.txt | 18 -- .../sample_output/nvidia_smi_out_2010.txt | 69 +++++++ .../sample_output/nvidia_smi_out_2021.txt | 193 ++++++++++++++++++ 4 files changed, 263 insertions(+), 19 deletions(-) delete mode 100644 computer_hw/test/sample_output/nvidia_smi_out.txt create mode 100644 computer_hw/test/sample_output/nvidia_smi_out_2010.txt create mode 100644 computer_hw/test/sample_output/nvidia_smi_out_2021.txt diff --git a/computer_hw/test/parse_test.py b/computer_hw/test/parse_test.py index 287ade5..b77d03f 100755 --- a/computer_hw/test/parse_test.py +++ b/computer_hw/test/parse_test.py @@ -44,7 +44,7 @@ import os, sys -TEXT_PATH = 'test/sample_output/nvidia_smi_out.txt' +TEXT_PATH = 'test/sample_output/nvidia_smi_out_2021.txt' TEXT_HIGH_TEMP_PATH = 'test/sample_output/nvidia_smi_high_temp.txt' diff --git a/computer_hw/test/sample_output/nvidia_smi_out.txt b/computer_hw/test/sample_output/nvidia_smi_out.txt deleted 
file mode 100644 index 0106e1c..0000000 --- a/computer_hw/test/sample_output/nvidia_smi_out.txt +++ /dev/null @@ -1,18 +0,0 @@ - -==============NVSMI LOG============== - - -Timestamp : Wed Sep 29 10:37:16 2010 - -Driver Version : 260.24 - -GPU 0: - Product Name : Quadro 600 - PCI Device/Vendor ID : df810de - PCI Location ID : 0:3:0 - Display : Connected - Temperature : 54 C - Fan Speed : 38% - Utilization - GPU : 0% - Memory : 0% diff --git a/computer_hw/test/sample_output/nvidia_smi_out_2010.txt b/computer_hw/test/sample_output/nvidia_smi_out_2010.txt new file mode 100644 index 0000000..692410e --- /dev/null +++ b/computer_hw/test/sample_output/nvidia_smi_out_2010.txt @@ -0,0 +1,69 @@ +Timestamp : Sat Sep 18 04:23:41 2021 +Driver Version : 440.64 +CUDA Version : 10.2 + +Attached GPUs : 1 +GPU 00000000:01:00.0 + Product Name : GeForce GTX 1060 6GB + Product Brand : GeForce + Display Mode : Enabled + Display Active : Enabled + Persistence Mode : Enabled + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-7f9b4a72-68fe-e2a9-8907-4590704d3431 + Minor Number : 0 + VBIOS Version : 86.06.45.00.60 + MultiGPU Board : No + Board ID : 0x100 + GPU Part Number : N/A + Inforom Version + Image Version : G001.0000.01.04 + OEM Object : 1.1 + ECC Object : N/A + Power Management Object : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x01 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x1C0310DE + Bus Id : 00000000:01:00.0 + Sub System Id : 0x61633842 + GPU Link Info + PCIe Generation + Max : 3 + Current : 1 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Fan Speed : 5 % + Performance State : P8 + 
Clocks Throttle Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active diff --git a/computer_hw/test/sample_output/nvidia_smi_out_2021.txt b/computer_hw/test/sample_output/nvidia_smi_out_2021.txt new file mode 100644 index 0000000..d34bfd8 --- /dev/null +++ b/computer_hw/test/sample_output/nvidia_smi_out_2021.txt @@ -0,0 +1,193 @@ +==============NVSMI LOG============== [115/1954] + +Timestamp : Sat Sep 18 04:23:41 2021 +Driver Version : 440.64 +CUDA Version : 10.2 + +Attached GPUs : 1 +GPU 00000000:01:00.0 + Product Name : GeForce GTX 1060 6GB + Product Brand : GeForce + Display Mode : Enabled + Display Active : Enabled + Persistence Mode : Enabled + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-7f9b4a72-68fe-e2a9-8907-4590704d3431 + Minor Number : 0 + VBIOS Version : 86.06.45.00.60 + MultiGPU Board : No + Board ID : 0x100 + GPU Part Number : N/A + Inforom Version + Image Version : G001.0000.01.04 + OEM Object : 1.1 + ECC Object : N/A + Power Management Object : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x01 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x1C0310DE + Bus Id : 00000000:01:00.0 + Sub System Id : 0x61633842 + GPU Link Info + PCIe Generation + Max : 3 + Current : 1 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Fan Speed : 5 % + Performance State : P8 + Clocks Throttle Reasons [54/1954] + Idle : Active + Applications Clocks 
Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 6077 MiB + Used : 114 MiB + Free : 5963 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 5 MiB + Free : 251 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 2 % + Encoder : 0 % + Decoder : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + Ecc Mode + Current : N/A + Pending : N/A + ECC Errors + Volatile + Single Bit + Device Memory : N/A + Register File : N/A + L1 Cache : N/A + L2 Cache : N/A + Texture Memory : N/A + Texture Shared : N/A + CBU : N/A + Total : N/A + Double Bit + Device Memory : N/A + Register File : N/A + L1 Cache : N/A + L2 Cache : N/A + Texture Memory : N/A + Texture Shared : N/A + CBU : N/A + Total : N/A + Aggregate + Single Bit + Device Memory : N/A + Register File : N/A + L1 Cache : N/A + L2 Cache : N/A + Texture Memory : N/A + Texture Shared : N/A + CBU : N/A + Total : N/A + Double Bit + Device Memory : N/A + Register File : N/A + L1 Cache : N/A + L2 Cache : N/A + Texture Memory : N/A + Texture Shared : N/A + CBU : N/A + Total : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Temperature + GPU Current Temp : 51 C + GPU Shutdown Temp : 102 C + GPU Slowdown Temp : 99 C + GPU Max Operating Temp : N/A + Memory Current Temp : N/A + Memory Max Operating Temp : N/A + Power Readings + Power Management : Supported + Power Draw : 6.00 W + Power Limit : 120.00 W + Default Power Limit : 120.00 W + Enforced Power Limit : 120.00 W + Min Power Limit : 60.00 W + Max Power Limit : 140.00 W + Clocks + Graphics : 139 MHz + SM : 139 MHz + Memory : 405 MHz + Video : 544 MHz + Applications Clocks + Graphics : N/A + 
Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Max Clocks + Graphics : 2012 MHz + SM : 2012 MHz + Memory : 4004 MHz + Video : 1708 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Processes + + +gpu_stat: header: + seq: 0 + stamp: + secs: 0 + nsecs: 0 + frame_id: '' +product_name: "GeForce GTX 1060 6GB" +pci_device_id: '' +pci_location: '' +display: '' +driver_version: "440.64" +temperature: 51 +fan_speed: 23.5619449019 +gpu_usage: 0 +memory_usage: 2 From 6f43370adf945970fdacc38d36e2348a8758ac1e Mon Sep 17 00:00:00 2001 From: "Isaac I.Y. Saito" <130s@2000.jukuin.keio.ac.jp> Date: Sun, 19 Sep 2021 08:44:48 -0400 Subject: [PATCH 10/18] [computer_hw] Py format err (part of https://github.com/ros-drivers/linux_peripheral_interfaces/pull/20#issuecomment-922467065) --- computer_hw/src/computer_hw/nvidia_temperature_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/computer_hw/src/computer_hw/nvidia_temperature_monitor.py b/computer_hw/src/computer_hw/nvidia_temperature_monitor.py index 90276e4..176d2bf 100644 --- a/computer_hw/src/computer_hw/nvidia_temperature_monitor.py +++ b/computer_hw/src/computer_hw/nvidia_temperature_monitor.py @@ -50,7 +50,7 @@ def pub_status(self): gpu_stat = parse_smi_output(card_out) stat = gpu_status_to_diag(gpu_stat) rospy.loginfo("card_out: {}\ngpu_stat: {}\n".format(card_out, gpu_stat)) - except Exception, e: + except Exception as e: import traceback rospy.logerr('Unable to process nVidia GPU data') rospy.logerr(traceback.format_exc()) From 1864e08db744760a4121af778d602df4e8c0f548 Mon Sep 17 00:00:00 2001 From: "Isaac I.Y. Saito" <130s@2000.jukuin.keio.ac.jp> Date: Sun, 23 Jan 2022 10:36:00 -0500 Subject: [PATCH 11/18] [computer_hw][maintenance] 1) Separate ROS-agnostic components. 
2) Separate vendor specific portions TODO fix unit test --- computer_hw/conf/monitor.launch | 6 +- computer_hw/executables/nvidia_temp.py | 11 +- computer_hw/package.xml | 2 +- .../src/computer_hw/gpu_stat_entity.py | 127 +++++++++++++ .../src/computer_hw/gpu_status_monitor.py | 143 ++++++++++++++ computer_hw/src/computer_hw/gpu_util.py | 147 +++++++++++++++ .../src/computer_hw/nvidia_smi_util.py | 178 ------------------ .../computer_hw/nvidia_temperature_monitor.py | 84 ++++++++- computer_hw/src/computer_hw/nvidia_util.py | 64 +++++++ 9 files changed, 573 insertions(+), 189 deletions(-) create mode 100644 computer_hw/src/computer_hw/gpu_stat_entity.py create mode 100644 computer_hw/src/computer_hw/gpu_status_monitor.py create mode 100644 computer_hw/src/computer_hw/gpu_util.py delete mode 100644 computer_hw/src/computer_hw/nvidia_smi_util.py create mode 100644 computer_hw/src/computer_hw/nvidia_util.py diff --git a/computer_hw/conf/monitor.launch b/computer_hw/conf/monitor.launch index 38e8c3d..f087fbe 100644 --- a/computer_hw/conf/monitor.launch +++ b/computer_hw/conf/monitor.launch @@ -2,9 +2,11 @@ - + + + - \ No newline at end of file + diff --git a/computer_hw/executables/nvidia_temp.py b/computer_hw/executables/nvidia_temp.py index c0f511b..ccd015f 100755 --- a/computer_hw/executables/nvidia_temp.py +++ b/computer_hw/executables/nvidia_temp.py @@ -39,13 +39,12 @@ import rospy -from computer_hw.nvidia_temperature_monitor import NVidiaTempMonitor +from computer_hw.gpu_status_monitor import GpuMonitor +from computer_hw.nvidia_util import Nvidia_GPU_Stat if __name__ == '__main__': rospy.init_node('nvidia_temp_monitor') - monitor = NVidiaTempMonitor() - my_rate = rospy.Rate(1.0) - while not rospy.is_shutdown(): - monitor.pub_status() - my_rate.sleep() + monitor = GpuMonitor(Nvidia_GPU_Stat) + monitor.run() + diff --git a/computer_hw/package.xml b/computer_hw/package.xml index 30dc378..8e08d7f 100644 --- a/computer_hw/package.xml +++ b/computer_hw/package.xml @@ 
-9,7 +9,7 @@ http://www.ros.org/wiki/computer_hw https://github.com/130s/computer_hw/issues - Kevin Watts (watts@willowgarage.com) + Kevin Watts Isaac Saito catkin diff --git a/computer_hw/src/computer_hw/gpu_stat_entity.py b/computer_hw/src/computer_hw/gpu_stat_entity.py new file mode 100644 index 0000000..d24d66f --- /dev/null +++ b/computer_hw/src/computer_hw/gpu_stat_entity.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python +# +# Software License Agreement (BSD License) +# +# Copyright (c) 2010, Willow Garage, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +# @summary: Utilities for nvidia. This file is agnostic from framework e.g. ROS. + + +class GPU_Stat(object): + """ + @summary Entity class to contain GPU status. + @note: The format of this class is NOT dependent on any framework e.g. ROS + """ + _product_name = "" + _pci_device_id = "" + _pci_location = "" + _display = "" + _driver_version = "" + # TODO + _fan_speed = 0.0 + _gpu_usage = 0 + _memory_usage = 0 + _temperature = 0 + + def __init__(self): + pass + + @property + def fan_speed(self): + return self._fan_speed + + @fan_speed.setter + def fan_speed(self, v): + self._fan_speed = v + + @property + def gpu_usage(self): + return self._gpu_usage + + @gpu_usage.setter + def gpu_usage(self, v): + self._gpu_usage = v + + @property + def memory_usage(self): + return self._memory_usage + + @memory_usage.setter + def memory_usage(self, v): + self._memory_usage = v + + @property + def temperature(self): + return self._temperature + + @temperature.setter + def temperature(self, v): + self._temperature = v + + @property + def product_name(self): + return self._product_name + + @product_name.setter + def product_name(self, v): + self._product_name = v + + @property + def pci_device_id(self): + return self._pci_device_id + + @pci_device_id.setter + def pci_device_id(self, v): + self._pci_device_id = v + + @property + def pci_location(self): + return self._pci_location + + @pci_location.setter + def pci_location(self, v): + self._pci_location 
= v + + @property + def display(self): + return self._display + + @display.setter + def display(self, v): + self._display = v + + @property + def driver_version(self): + return self._driver_version + + @driver_version.setter + def driver_version(self, v): + self._driver_version = v diff --git a/computer_hw/src/computer_hw/gpu_status_monitor.py b/computer_hw/src/computer_hw/gpu_status_monitor.py new file mode 100644 index 0000000..932e873 --- /dev/null +++ b/computer_hw/src/computer_hw/gpu_status_monitor.py @@ -0,0 +1,143 @@ +# Software License Agreement (BSD License) +# +# Copyright (c) 2010, Willow Garage, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +from computer_status_msgs.msg import GPUStatus +from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue +import rospy +import traceback + +from computer_hw.gpu_util import GPUStatusHandler + + +class GpuMonitor(object): + def __init__(self, stat_handler_class): + """ + @param stat_handler_class: Class object that is to be delgated to return + GPU status. E.g. computer_hw.nvidia_util.Nvidia_GPU_Stat + @type stat_handler_class: computer_hw.gpu_util.GPUStatusHandler + """ + # Instantiating GPU status handler. + self._gpu_status_handler = stat_handler_class() + if not isinstance(self._gpu_status_handler, GPUStatusHandler): + raise TypeError("GPU status handler passed '{}' is not compatible. 
This class needs a derived class of {}".format( + stat_handler_class, GPUStatusHandler)) + self._pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10) + self._gpu_pub = rospy.Publisher('gpu_status', GPUStatus, queue_size=10) + + def gpu_status_to_diag(self, gpu_stat): + stat = DiagnosticStatus() + stat.name = 'GPU Status' + stat.message = 'OK' + stat.level = DiagnosticStatus.OK + stat.hardware_id = gpu_stat.pci_device_id + + stat.values.append(KeyValue(key='Product Name', value = gpu_stat.product_name)) + stat.values.append(KeyValue(key='PCI Device/Vendor ID', value = gpu_stat.pci_device_id)) + stat.values.append(KeyValue(key='PCI Location ID', value = gpu_stat.pci_location)) + stat.values.append(KeyValue(key='Display', value = gpu_stat.display)) + stat.values.append(KeyValue(key='Driver Version', value = gpu_stat.driver_version)) + stat.values.append(KeyValue(key='Temperature (C)', value = '%.0f' % gpu_stat.temperature)) + stat.values.append(KeyValue(key='Fan Speed (RPM)', value = '%.0f' % GPUStatusHandler.rads_to_rpm(gpu_stat.fan_speed))) + stat.values.append(KeyValue(key='Usage (%)', value = '%.0f' % gpu_stat.gpu_usage)) + stat.values.append(KeyValue(key='Memory (%)', value = '%.0f' % gpu_stat.memory_usage)) + + # Check for valid data + if not gpu_stat.product_name or not gpu_stat.pci_device_id: + stat.level = DiagnosticStatus.ERROR + stat.message = 'No Device Data' + return stat + + # Check load + if gpu_stat.gpu_usage > 98: + stat.level = max(stat.level, DiagnosticStatus.WARN) + stat.message = 'High Load' + + # Check thresholds + if gpu_stat.temperature > 90: + stat.level = max(stat.level, DiagnosticStatus.WARN) + stat.message = 'High Temperature' + if gpu_stat.temperature > 95: + stat.level = max(stat.level, DiagnosticStatus.ERROR) + stat.message = 'Temperature Alarm' + + # Check fan + if gpu_stat.fan_speed == 0: + stat.level = max(stat.level, DiagnosticStatus.ERROR) + stat.message = 'No Fan Speed' + return stat + + def pub_status(self): + 
stat = DiagnosticStatus() + gpu_stat = GPUStatus() # default message so the stamping/publishing below still works when status retrieval fails + try: + _non_ros_gpu_stat = self._gpu_status_handler.get_gpu_status() + gpu_stat = self._convert_output(_non_ros_gpu_stat) + stat = self.gpu_status_to_diag(gpu_stat) + rospy.loginfo("gpu_stat: {}\n".format(gpu_stat)) + except AttributeError as e: + rospy.logerr('Unable to process GPU status as getting GPU status with proprietary command failed : {}'.format(str(e))) + except Exception as e: + rospy.logerr('Unable to process GPU status: {}'.format(str(e))) + rospy.logerr(traceback.format_exc()) + + gpu_stat.header.stamp = rospy.get_rostime() + + array = DiagnosticArray() + array.header.stamp = rospy.get_rostime() + + array.status = [ stat ] + + self._pub.publish(array) + self._gpu_pub.publish(gpu_stat) + + def _convert_output(self, gpu_stat_proprietary): + """ + @param gpu_stat_proprietary: + @rtype computer_status_msgs.GPUStatus + """ + gpu_stat = GPUStatus() + gpu_stat.product_name = gpu_stat_proprietary.product_name + gpu_stat.pci_device_id = gpu_stat_proprietary.pci_device_id + gpu_stat.pci_location = gpu_stat_proprietary.pci_location + gpu_stat.display = gpu_stat_proprietary.display + gpu_stat.driver_version = gpu_stat_proprietary.driver_version + gpu_stat.temperature = gpu_stat_proprietary.temperature + gpu_stat.fan_speed = gpu_stat_proprietary.fan_speed + gpu_stat.gpu_usage = gpu_stat_proprietary.gpu_usage + gpu_stat.memory_usage = gpu_stat_proprietary.memory_usage + return gpu_stat + + def run(self): + my_rate = rospy.Rate(rospy.get_param("gpu_monitor_rate", 1.0)) + while not rospy.is_shutdown(): + self.pub_status() + my_rate.sleep() diff --git a/computer_hw/src/computer_hw/gpu_util.py b/computer_hw/src/computer_hw/gpu_util.py new file mode 100644 index 0000000..d6c081a --- /dev/null +++ b/computer_hw/src/computer_hw/gpu_util.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python +# +# Software License Agreement (BSD License) +# +# Copyright (c) 2010, Willow Garage, Inc. +# All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +##\author Kevin Watts + +from __future__ import division + +from computer_status_msgs.msg import GPUStatus +import logging +import math +import subprocess + +from computer_hw.gpu_stat_entity import GPU_Stat + +class GPUStatusHandler(object): + """ + @summary: Utilities for nvidia. This file is agnostic from framework e.g. ROS. 
+ """ + _MAX_FAN_RPM = 4500 + + @property + def max_fan_rpm(self): + return self._MAX_FAN_RPM + + @max_fan_rpm.setter + def max_fan_rpm(self, v): + self._MAX_FAN_RPM = v + + @staticmethod + def rads_to_rpm(rads): + return rads / (2 * math.pi) * 60 + + @staticmethod + def rpm_to_rads(rpm): + return rpm * (2 * math.pi) / 60 + + @staticmethod + def get_raw_gpu_status(): + """Return the raw GPU status text. Must be implemented in the derived class.""" + raise NotImplementedError() + + @staticmethod + def _find_val(output, word): + lines = output.split('\n') + for line in lines: + tple = line.split(':') + if not len(tple) > 1: + continue + + name = tple[0].strip() + val = ':'.join(tple[1:]).strip() + + if not name.lower() == word.lower(): + continue + + return val.strip() + + return '' + + def convert_proprietary_out(self, proprietary_output_raw): + """ + @summary: Parse Nvidia's SMI tool output and returns in a more + programming friendly format. + @param proprietary_output_raw: str of shell command output i.e. output of + 'get_raw_gpu_status' method. + @return File: gpu_stat_entity.GPU_Stat instance + @raise AttributeError: When 'proprietary_output_raw' is not in an + expected form. + """ + if not proprietary_output_raw: + raise AttributeError("Input proprietary data is empty.
Can't convert") + + gpu_stat = GPU_Stat() + + gpu_stat.product_name = GPUStatusHandler._find_val(proprietary_output_raw, 'Product Name') + gpu_stat.pci_device_id = GPUStatusHandler._find_val(proprietary_output_raw, 'PCI Device/Vendor ID') + gpu_stat.pci_location = GPUStatusHandler._find_val(proprietary_output_raw, 'PCI Location ID') + gpu_stat.display = GPUStatusHandler._find_val(proprietary_output_raw, 'Display') + gpu_stat.driver_version = GPUStatusHandler._find_val(proprietary_output_raw, 'Driver Version') + + TEMPERATURE_QUERIES = ["Temperature", "GPU Current Temp"] + for query in TEMPERATURE_QUERIES: + temp_str = GPUStatusHandler._find_val(proprietary_output_raw, query) + if temp_str: + temp, units = temp_str.split() + gpu_stat.temperature = int(temp) + break + + fan_str = GPUStatusHandler._find_val(proprietary_output_raw, 'Fan Speed') + if fan_str: + # Fan speed in RPM + fan_spd = float(fan_str.strip('\%').strip()) * 0.01 * self.max_fan_rpm + # Convert fan speed to Hz + gpu_stat.fan_speed = GPUStatusHandler.rpm_to_rads(fan_spd) + + usage_str = GPUStatusHandler._find_val(proprietary_output_raw, 'GPU') + if usage_str: + usage = usage_str.strip('\%').strip() + gpu_stat.gpu_usage = int(usage) + + mem_str = GPUStatusHandler._find_val(proprietary_output_raw, 'Memory') + if mem_str: + mem = mem_str.strip('\%').strip() + gpu_stat.memory_usage = int(mem) + + return gpu_stat + + def get_gpu_status(self): + """ + @summary: Get GPU status and return in an instance. + @return GPU_Stat instance + @raise AttributeError: When 'proprietary_output' is not in an + expected form. 
+ """ + raw_output = self.get_raw_gpu_status() + return self.convert_proprietary_out(raw_output) diff --git a/computer_hw/src/computer_hw/nvidia_smi_util.py b/computer_hw/src/computer_hw/nvidia_smi_util.py deleted file mode 100644 index 22670ef..0000000 --- a/computer_hw/src/computer_hw/nvidia_smi_util.py +++ /dev/null @@ -1,178 +0,0 @@ -#!/usr/bin/env python -# -# Software License Agreement (BSD License) -# -# Copyright (c) 2010, Willow Garage, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided -# with the distribution. -# * Neither the name of the Willow Garage nor the names of its -# contributors may be used to endorse or promote products derived -# from this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN -# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. 
- -##\author Kevin Watts - -from __future__ import division - -from computer_status_msgs.msg import GPUStatus -from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue -from computer_status_msgs.msg import GPUStatus - -import math -import rospy -import subprocess - - -MAX_FAN_RPM = 4500 - -def _rads_to_rpm(rads): - return rads / (2 * math.pi) * 60 - -def _rpm_to_rads(rpm): - return rpm * (2 * math.pi) / 60 - -def gpu_status_to_diag(gpu_stat): - stat = DiagnosticStatus() - stat.name = 'GPU Status' - stat.message = 'OK' - stat.level = DiagnosticStatus.OK - stat.hardware_id = gpu_stat.pci_device_id - - stat.values.append(KeyValue(key='Product Name', value = gpu_stat.product_name)) - stat.values.append(KeyValue(key='PCI Device/Vendor ID', value = gpu_stat.pci_device_id)) - stat.values.append(KeyValue(key='PCI Location ID', value = gpu_stat.pci_location)) - stat.values.append(KeyValue(key='Display', value = gpu_stat.display)) - stat.values.append(KeyValue(key='Driver Version', value = gpu_stat.driver_version)) - stat.values.append(KeyValue(key='Temperature (C)', value = '%.0f' % gpu_stat.temperature)) - stat.values.append(KeyValue(key='Fan Speed (RPM)', value = '%.0f' % _rads_to_rpm(gpu_stat.fan_speed))) - stat.values.append(KeyValue(key='Usage (%)', value = '%.0f' % gpu_stat.gpu_usage)) - stat.values.append(KeyValue(key='Memory (%)', value = '%.0f' % gpu_stat.memory_usage)) - - # Check for valid data - if not gpu_stat.product_name or not gpu_stat.pci_device_id: - stat.level = DiagnosticStatus.ERROR - stat.message = 'No Device Data' - return stat - - # Check load - if gpu_stat.gpu_usage > 98: - stat.level = max(stat.level, DiagnosticStatus.WARN) - stat.message = 'High Load' - - # Check thresholds - if gpu_stat.temperature > 90: - stat.level = max(stat.level, DiagnosticStatus.WARN) - stat.message = 'High Temperature' - if gpu_stat.temperature > 95: - stat.level = max(stat.level, DiagnosticStatus.ERROR) - stat.message = 'Temperature Alarm' - - # Check 
fan - if gpu_stat.fan_speed == 0: - stat.level = max(stat.level, DiagnosticStatus.ERROR) - stat.message = 'No Fan Speed' - - - - return stat - - -def _find_val(output, word): - lines = output.split('\n') - for line in lines: - tple = line.split(':') - if not len(tple) > 1: - continue - - name = tple[0].strip() - val = ':'.join(tple[1:]).strip() - - if not name.lower() == word.lower(): - continue - - return val.strip() - - return '' - -def parse_smi_output(output): - gpu_stat = GPUStatus() - - - gpu_stat.product_name = _find_val(output, 'Product Name') - gpu_stat.pci_device_id = _find_val(output, 'PCI Device/Vendor ID') - gpu_stat.pci_location = _find_val(output, 'PCI Location ID') - gpu_stat.display = _find_val(output, 'Display') - gpu_stat.driver_version = _find_val(output, 'Driver Version') - - TEMPERATURE_QUERIES = ["Temperature", "GPU Current Temp"] - for query in TEMPERATURE_QUERIES: - temp_str = _find_val(output, query) - if temp_str: - temp, units = temp_str.split() - gpu_stat.temperature = int(temp) - break - - fan_str = _find_val(output, 'Fan Speed') - if fan_str: - # Fan speed in RPM - fan_spd = float(fan_str.strip('\%').strip()) * 0.01 * MAX_FAN_RPM - # Convert fan speed to Hz - gpu_stat.fan_speed = _rpm_to_rads(fan_spd) - - usage_str = _find_val(output, 'GPU') - if usage_str: - usage = usage_str.strip('\%').strip() - gpu_stat.gpu_usage = int(usage) - - mem_str = _find_val(output, 'Memory') - if mem_str: - mem = mem_str.strip('\%').strip() - gpu_stat.memory_usage = int(mem) - - return gpu_stat - -def get_gpu_status(): - """ - @summary: Relying on a command on the host 'nvidia-smi'. - - Regarding 'nvidia-smi', some people believe that it at least needs to be run by 'root' - for the first invocation https://serverfault.com/questions/975859/nvidia-smi-must-be-run-by-root-before-it-can-be-used-by-regular-users, - but it seems to be working without initial invocation. 
- @todo: OpenQuetion-1: When this method is invoked from a container where - nvidia-smi, which is typically available on a host, is not easily - available. -> For docker, passing '--runtime=nvidia' enables the cmd - from a container. Then show warning when unavailable. - @todo: OpenQuetion-2: What if the cmd 'nvidia-smi' is not available? - """ - p = subprocess.Popen('nvidia-smi -a', stdout = subprocess.PIPE, - stderr = subprocess.PIPE, shell = True) - (o, e) = p.communicate() - - if not p.returncode == 0: - return '' - - if not o: return '' - - return o diff --git a/computer_hw/src/computer_hw/nvidia_temperature_monitor.py b/computer_hw/src/computer_hw/nvidia_temperature_monitor.py index 176d2bf..72f1eff 100644 --- a/computer_hw/src/computer_hw/nvidia_temperature_monitor.py +++ b/computer_hw/src/computer_hw/nvidia_temperature_monitor.py @@ -31,10 +31,10 @@ # POSSIBILITY OF SUCH DAMAGE. from computer_status_msgs.msg import GPUStatus -from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus +from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue import rospy -from computer_hw.nvidia_smi_util import get_gpu_status, parse_smi_output, gpu_status_to_diag +from computer_hw.gpu_util import Nvidia_GPU_Stat class NVidiaTempMonitor(object): @@ -42,6 +42,48 @@ def __init__(self): self._pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10) self._gpu_pub = rospy.Publisher('gpu_status', GPUStatus, queue_size=10) + def gpu_status_to_diag(gpu_stat): + stat = DiagnosticStatus() + stat.name = 'GPU Status' + stat.message = 'OK' + stat.level = DiagnosticStatus.OK + stat.hardware_id = gpu_stat.pci_device_id + + stat.values.append(KeyValue(key='Product Name', value = gpu_stat.product_name)) + stat.values.append(KeyValue(key='PCI Device/Vendor ID', value = gpu_stat.pci_device_id)) + stat.values.append(KeyValue(key='PCI Location ID', value = gpu_stat.pci_location)) + stat.values.append(KeyValue(key='Display', value = gpu_stat.display)) + 
stat.values.append(KeyValue(key='Driver Version', value = gpu_stat.driver_version)) + stat.values.append(KeyValue(key='Temperature (C)', value = '%.0f' % gpu_stat.temperature)) + stat.values.append(KeyValue(key='Fan Speed (RPM)', value = '%.0f' % _rads_to_rpm(gpu_stat.fan_speed))) + stat.values.append(KeyValue(key='Usage (%)', value = '%.0f' % gpu_stat.gpu_usage)) + stat.values.append(KeyValue(key='Memory (%)', value = '%.0f' % gpu_stat.memory_usage)) + + # Check for valid data + if not gpu_stat.product_name or not gpu_stat.pci_device_id: + stat.level = DiagnosticStatus.ERROR + stat.message = 'No Device Data' + return stat + + # Check load + if gpu_stat.gpu_usage > 98: + stat.level = max(stat.level, DiagnosticStatus.WARN) + stat.message = 'High Load' + + # Check thresholds + if gpu_stat.temperature > 90: + stat.level = max(stat.level, DiagnosticStatus.WARN) + stat.message = 'High Temperature' + if gpu_stat.temperature > 95: + stat.level = max(stat.level, DiagnosticStatus.ERROR) + stat.message = 'Temperature Alarm' + + # Check fan + if gpu_stat.fan_speed == 0: + stat.level = max(stat.level, DiagnosticStatus.ERROR) + stat.message = 'No Fan Speed' + return stat + def pub_status(self): gpu_stat = GPUStatus() stat = DiagnosticStatus() @@ -64,3 +106,41 @@ def pub_status(self): self._pub.publish(array) self._gpu_pub.publish(gpu_stat) + + +def parse_smi_output(output): + gpu_stat = GPUStatus() + + gpu_stat.product_name = _find_val(output, 'Product Name') + gpu_stat.pci_device_id = _find_val(output, 'PCI Device/Vendor ID') + gpu_stat.pci_location = _find_val(output, 'PCI Location ID') + gpu_stat.display = _find_val(output, 'Display') + gpu_stat.driver_version = _find_val(output, 'Driver Version') + + TEMPERATURE_QUERIES = ["Temperature", "GPU Current Temp"] + for query in TEMPERATURE_QUERIES: + temp_str = _find_val(output, query) + if temp_str: + temp, units = temp_str.split() + gpu_stat.temperature = int(temp) + break + + fan_str = _find_val(output, 'Fan Speed') + if 
fan_str: + # Fan speed in RPM + fan_spd = float(fan_str.strip('\%').strip()) * 0.01 * MAX_FAN_RPM + # Convert fan speed to Hz + gpu_stat.fan_speed = _rpm_to_rads(fan_spd) + + usage_str = _find_val(output, 'GPU') + if usage_str: + usage = usage_str.strip('\%').strip() + gpu_stat.gpu_usage = int(usage) + + mem_str = _find_val(output, 'Memory') + if mem_str: + mem = mem_str.strip('\%').strip() + gpu_stat.memory_usage = int(mem) + + return gpu_stat + diff --git a/computer_hw/src/computer_hw/nvidia_util.py b/computer_hw/src/computer_hw/nvidia_util.py new file mode 100644 index 0000000..a9b9371 --- /dev/null +++ b/computer_hw/src/computer_hw/nvidia_util.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python +# +# Software License Agreement (BSD License) +# +# Copyright (c) 2010, Willow Garage, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +import logging +import subprocess + +from computer_hw.gpu_util import GPUStatusHandler + + +class Nvidia_GPU_Stat(GPUStatusHandler): + def get_raw_gpu_status(self): + """ + @summary: Relying on a command on the host 'nvidia-smi'. + + Regarding 'nvidia-smi', some people believe that it at least needs to be run by 'root' + for the first invocation https://serverfault.com/questions/975859/nvidia-smi-must-be-run-by-root-before-it-can-be-used-by-regular-users, + but it seems to be working without initial invocation. + @todo: OpenQuestion-1: When this method is invoked from a container where + nvidia-smi, which is typically available on a host, is not easily + available. -> For docker, passing '--runtime=nvidia' enables the cmd + from a container. Then show warning when unavailable. + @todo: OpenQuestion-2: What if the cmd 'nvidia-smi' is not available? + """ + p = subprocess.Popen('nvidia-smi -a', stdout=subprocess.PIPE, + stderr=subprocess.PIPE, shell=True, universal_newlines=True) + (o, e) = p.communicate() + + if not p.returncode == 0: + return '' + + if not o: return '' + logging.debug("card_out: {}".format(o)) + return o From 8655e1d0d95756b046e153398666c23d3596e8d1 Mon Sep 17 00:00:00 2001 From: "Isaac I. Y. Saito" Date: Mon, 21 Feb 2022 09:30:04 -0600 Subject: [PATCH 12/18] WIP: Allow more runtime config.
--- computer_hw/conf/monitor.launch | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/computer_hw/conf/monitor.launch b/computer_hw/conf/monitor.launch index f087fbe..4cc1034 100644 --- a/computer_hw/conf/monitor.launch +++ b/computer_hw/conf/monitor.launch @@ -1,9 +1,14 @@ + + + - + + + From a41dabafc75408452d69b1abe67635654b5182a2 Mon Sep 17 00:00:00 2001 From: "Isaac I. Y. Saito" Date: Mon, 21 Feb 2022 23:44:49 -0600 Subject: [PATCH 13/18] [improve] Stop printing GPU info per every msg reception. --- computer_hw/src/computer_hw/gpu_status_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/computer_hw/src/computer_hw/gpu_status_monitor.py b/computer_hw/src/computer_hw/gpu_status_monitor.py index 932e873..69bcfaa 100644 --- a/computer_hw/src/computer_hw/gpu_status_monitor.py +++ b/computer_hw/src/computer_hw/gpu_status_monitor.py @@ -102,7 +102,7 @@ def pub_status(self): _non_ros_gpu_stat = self._gpu_status_handler.get_gpu_status() gpu_stat = self._convert_output(_non_ros_gpu_stat) stat = self.gpu_status_to_diag(gpu_stat) - rospy.loginfo("gpu_stat: {}\n".format(gpu_stat)) + rospy.logdebug("gpu_stat: {}\n".format(gpu_stat)) except AttributeError as e: rospy.logerr('Unable to process GPU status as getting GPU status with proprietary command failed : {}'.format(str(e))) except Exception as e: From 84fec606e20ec75bc815e7b6733e68eb0d69f5b7 Mon Sep 17 00:00:00 2001 From: "Isaac I.Y. 
Saito" <130s@2000.jukuin.keio.ac.jp> Date: Thu, 24 Feb 2022 17:42:38 -0500 Subject: [PATCH 14/18] WIP: Python 2to3 (for now done manually) --- computer_hw/executables/cpu_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/computer_hw/executables/cpu_monitor.py b/computer_hw/executables/cpu_monitor.py index c4221ea..21aa80e 100755 --- a/computer_hw/executables/cpu_monitor.py +++ b/computer_hw/executables/cpu_monitor.py @@ -34,7 +34,7 @@ ##\author Kevin Watts -from __future__ import with_statement +from __future__ import with_statement, print_function import rospy From 627afbc17b215f025df0812fd354598387241f3e Mon Sep 17 00:00:00 2001 From: "Isaac I.Y. Saito" <130s@2000.jukuin.keio.ac.jp> Date: Wed, 22 Feb 2023 07:06:09 -0500 Subject: [PATCH 15/18] Capability: [CI] Add GitHub Action, for ROS Melodic, Noetic --- .github/workflows/ci.yml | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..6e68502 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,46 @@ +name: CI - Ubuntu Focal + +on: + # direct pushes to protected branches are not supported + pull_request: + push: + # run every day, at 6am UTC + schedule: + - cron: '0 6 * * *' + # allow manually starting this workflow + workflow_dispatch: + +jobs: + industrial_ci: + name: ROS Noetic (${{ matrix.ros_repo }}) + runs-on: ubuntu-20.04 + + strategy: + matrix: + ROS_DISTRO: [ melodic, noetic ] + ROS_REPO: [ main ] + + env: + CCACHE_DIR: "${{ github.workspace }}/.ccache" + + steps: + - name: Fetch repository + uses: actions/checkout@v3 + + - name: ccache cache + uses: actions/cache@v3 + with: + path: ${{ env.CCACHE_DIR }} + # we always want the ccache cache to be persisted, as we cannot easily + # determine whether dependencies have changed, and ccache will manage + # updating the cache for us. 
Adding 'run_id' to the key will force an + # upload at the end of the job. + key: ccache-${{ matrix.ros_distro }}-${{ matrix.ros_repo }}-${{github.run_id}} + restore-keys: | + ccache-${{ matrix.ros_distro }}-${{ matrix.ros_repo }} + + - name: Run industrial_ci + uses: ros-industrial/industrial_ci@master + env: + ROS_DISTRO: ${{ matrix.ros_distro }} + ROS_REPO: ${{ matrix.ros_repo }} From c8feea4e7dbb466847c58d4f0bc123a16a62eb5c Mon Sep 17 00:00:00 2001 From: "Isaac I.Y. Saito" <130s@2000.jukuin.keio.ac.jp> Date: Wed, 22 Feb 2023 07:06:37 -0500 Subject: [PATCH 16/18] Drop capability: [CI] Travis CI to be off. Delegate to GHA --- .travis.yml | 54 ----------------------------------------------------- 1 file changed, 54 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index ad2292d..0000000 --- a/.travis.yml +++ /dev/null @@ -1,54 +0,0 @@ - -# this is .traivs.yml written by - - -# https://github.com/ros-infrastructure/ros_buildfarm/blob/master/doc/jobs/devel_jobs.rst -# https://github.com/ros-infrastructure/ros_buildfarm/blob/master/doc/jobs/prerelease_jobs.rst -# while this doesn't require sudo we don't want to run within a Docker container -sudo: true -dist: bionic -language: python -addons: - apt: - packages: - - 2to3 -env: - global: - - JOB_PATH=/tmp/devel_job - - ABORT_ON_TEST_FAILURE=1 - - INDEX_URL=https://raw.githubusercontent.com/ros-infrastructure/ros_buildfarm_config/production/index.yaml - matrix: - - CHECK_PYTHON3_COMPILE=true - - ROS_DISTRO_NAME=kinetic OS_NAME=ubuntu OS_CODE_NAME=xenial ARCH=amd64 INDEX_URL=https://raw.githubusercontent.com/ros-infrastructure/ros_buildfarm_config/7e6385e/index.yaml - - ROS_DISTRO_NAME=melodic OS_NAME=ubuntu OS_CODE_NAME=bionic ARCH=amd64 - - ROS_DISTRO_NAME=noetic OS_NAME=ubuntu OS_CODE_NAME=focal ARCH=amd64 -# matrix: -# allow_failures: -# - env: ROS_DISTRO_NAME=indigo OS_NAME=ubuntu OS_CODE_NAME=trusty ARCH=amd64 
INDEX_URL=https://raw.githubusercontent.com/ros-infrastructure/ros_buildfarm_config/6a93d17/index.yaml -install: - # check python3 compatibility - - if [ "${CHECK_PYTHON3_COMPILE}" == "true" ]; then bash -c "ret=0; trap 'ret=1' ERR; python3 -m compileall .; 2to3 -w -f except -f execfile -f has_key -f raw_input .; git diff --exit-code . > /dev/null; echo Exitting with \$ret; exit \$ret"; exit $?; fi - # either install the latest released version of ros_buildfarm - # - pip install ros_buildfarm - # or checkout a specific branch - - git clone -b master https://github.com/ros-infrastructure/ros_buildfarm /tmp/ros_buildfarm - # force enable `rosdep update --include-eol-distros` until https://github.com/ros-infrastructure/ros_buildfarm/pull/890 released - - (cd /tmp/ros_buildfarm; git checkout f7a12d8) - - (cd /tmp; wget https://github.com/ros-infrastructure/ros_buildfarm/pull/890.diff) - - (cd /tmp/ros_buildfarm; patch -p1 < /tmp/890.diff) - - pip install /tmp/ros_buildfarm - # checkout catkin for catkin_test_results script - - git clone https://github.com/ros/catkin /tmp/catkin - # run devel job for a ROS repository with the same name as this repo - - export REPOSITORY_NAME=`basename $TRAVIS_BUILD_DIR` - # use the code already checked out by Travis - - mkdir -p $JOB_PATH/ws/src - - cp -R $TRAVIS_BUILD_DIR $JOB_PATH/ws/src/ - # generate the script to run a pre-release job for that target and repo - - generate_prerelease_script.py $INDEX_URL $ROS_DISTRO_NAME default $OS_NAME $OS_CODE_NAME $ARCH --output-dir $JOB_PATH --custom-rosdep-update-options=--include-eol-distros - # run the actual job which involves Docker - - cd $JOB_PATH; sh ./prerelease.sh -y -script: - # get summary of test results - - /tmp/catkin/bin/catkin_test_results $JOB_PATH/ws/test_results --all -notifications: - email: false From 866275755fa371716d48750724b62777e406ef32 Mon Sep 17 00:00:00 2001 From: "Isaac I.Y. 
Saito" <130s@2000.jukuin.keio.ac.jp> Date: Wed, 22 Feb 2023 07:32:38 -0500 Subject: [PATCH 17/18] Improve Python import definition --- computer_hw/test/parse_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/computer_hw/test/parse_test.py b/computer_hw/test/parse_test.py index b77d03f..5c8258a 100755 --- a/computer_hw/test/parse_test.py +++ b/computer_hw/test/parse_test.py @@ -38,12 +38,12 @@ PKG = 'computer_hw' +import os +import sys import unittest import computer_hw -import os, sys - TEXT_PATH = 'test/sample_output/nvidia_smi_out_2021.txt' TEXT_HIGH_TEMP_PATH = 'test/sample_output/nvidia_smi_high_temp.txt' From deb5cecc358667d9a383b55d938743f65ba06c53 Mon Sep 17 00:00:00 2001 From: "Isaac I.Y. Saito" <130s@2000.jukuin.keio.ac.jp> Date: Wed, 22 Feb 2023 07:13:34 -0500 Subject: [PATCH 18/18] Fix failing import in test (See CI failure https://github.com/kinu-garage/linux_peripheral_interfaces/actions/runs/4242516401/jobs/7374123303#step:4:480) Fix failing test (https://github.com/kinu-garage/linux_peripheral_interfaces/actions/runs/4261908716/jobs/7416781435) --- computer_hw/package.xml | 1 + .../computer_hw/nvidia_temperature_monitor.py | 28 +++++++++---------- computer_hw/test/parse_test.py | 15 ++++++---- 3 files changed, 24 insertions(+), 20 deletions(-) diff --git a/computer_hw/package.xml b/computer_hw/package.xml index 8e08d7f..af9340b 100644 --- a/computer_hw/package.xml +++ b/computer_hw/package.xml @@ -21,5 +21,6 @@ diagnostic_aggregator libsensors_monitor rospy + roslib diff --git a/computer_hw/src/computer_hw/nvidia_temperature_monitor.py b/computer_hw/src/computer_hw/nvidia_temperature_monitor.py index 72f1eff..626340e 100644 --- a/computer_hw/src/computer_hw/nvidia_temperature_monitor.py +++ b/computer_hw/src/computer_hw/nvidia_temperature_monitor.py @@ -34,7 +34,7 @@ from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue import rospy -from computer_hw.gpu_util import Nvidia_GPU_Stat +from 
computer_hw.gpu_util import GPUStatusHandler class NVidiaTempMonitor(object): @@ -55,7 +55,7 @@ def gpu_status_to_diag(gpu_stat): stat.values.append(KeyValue(key='Display', value = gpu_stat.display)) stat.values.append(KeyValue(key='Driver Version', value = gpu_stat.driver_version)) stat.values.append(KeyValue(key='Temperature (C)', value = '%.0f' % gpu_stat.temperature)) - stat.values.append(KeyValue(key='Fan Speed (RPM)', value = '%.0f' % _rads_to_rpm(gpu_stat.fan_speed))) + stat.values.append(KeyValue(key='Fan Speed (RPM)', value = '%.0f' % GPUStatusHandler.rads_to_rpm(gpu_stat.fan_speed))) stat.values.append(KeyValue(key='Usage (%)', value = '%.0f' % gpu_stat.gpu_usage)) stat.values.append(KeyValue(key='Memory (%)', value = '%.0f' % gpu_stat.memory_usage)) @@ -111,33 +111,33 @@ def pub_status(self): def parse_smi_output(output): gpu_stat = GPUStatus() - gpu_stat.product_name = _find_val(output, 'Product Name') - gpu_stat.pci_device_id = _find_val(output, 'PCI Device/Vendor ID') - gpu_stat.pci_location = _find_val(output, 'PCI Location ID') - gpu_stat.display = _find_val(output, 'Display') - gpu_stat.driver_version = _find_val(output, 'Driver Version') + gpu_stat.product_name = GPUStatusHandler._find_val(output, 'Product Name') + gpu_stat.pci_device_id = GPUStatusHandler._find_val(output, 'PCI Device/Vendor ID') + gpu_stat.pci_location = GPUStatusHandler._find_val(output, 'PCI Location ID') + gpu_stat.display = GPUStatusHandler._find_val(output, 'Display') + gpu_stat.driver_version = GPUStatusHandler._find_val(output, 'Driver Version') TEMPERATURE_QUERIES = ["Temperature", "GPU Current Temp"] for query in TEMPERATURE_QUERIES: - temp_str = _find_val(output, query) + temp_str = GPUStatusHandler._find_val(output, query) if temp_str: - temp, units = temp_str.split() + temp = temp_str.split()[0] gpu_stat.temperature = int(temp) break - fan_str = _find_val(output, 'Fan Speed') + fan_str = GPUStatusHandler._find_val(output, 'Fan Speed') if fan_str: # Fan speed in RPM - 
fan_spd = float(fan_str.strip('\%').strip()) * 0.01 * MAX_FAN_RPM + fan_spd = float(fan_str.strip('\%').strip()) * 0.01 * GPUStatusHandler._MAX_FAN_RPM # Convert fan speed to Hz - gpu_stat.fan_speed = _rpm_to_rads(fan_spd) + gpu_stat.fan_speed = GPUStatusHandler.rpm_to_rads(fan_spd) - usage_str = _find_val(output, 'GPU') + usage_str = GPUStatusHandler._find_val(output, 'GPU') if usage_str: usage = usage_str.strip('\%').strip() gpu_stat.gpu_usage = int(usage) - mem_str = _find_val(output, 'Memory') + mem_str = GPUStatusHandler._find_val(output, 'Memory') if mem_str: mem = mem_str.strip('\%').strip() gpu_stat.memory_usage = int(mem) diff --git a/computer_hw/test/parse_test.py b/computer_hw/test/parse_test.py index 5c8258a..a79b93a 100755 --- a/computer_hw/test/parse_test.py +++ b/computer_hw/test/parse_test.py @@ -42,7 +42,10 @@ import sys import unittest +import roslib + import computer_hw +from computer_hw.nvidia_temperature_monitor import NVidiaTempMonitor, parse_smi_output TEXT_PATH = 'test/sample_output/nvidia_smi_out_2021.txt' TEXT_HIGH_TEMP_PATH = 'test/sample_output/nvidia_smi_high_temp.txt' @@ -58,7 +61,7 @@ def setUp(self): self.high_temp_data = f.read() def test_parse(self): - gpu_stat = computer_hw.parse_smi_output(self.data) + gpu_stat = parse_smi_output(self.data) # Check valid self.assert_(self.data, "Unable to read sample output, no test to run") @@ -73,12 +76,12 @@ def test_parse(self): self.assert_(gpu_stat.temperature > 40 and gpu_stat.temperature < 90, "Invalid temperature readings. Temperature: %d" % gpu_stat.temperature) self.assert_(gpu_stat.fan_speed > 0 and gpu_stat.fan_speed < 471, "Invalid fan speed readings. Fan Speed %f" % gpu_stat.fan_speed) - diag_stat = computer_hw.gpu_status_to_diag(gpu_stat) + diag_stat = NVidiaTempMonitor.gpu_status_to_diag(gpu_stat) self.assert_(diag_stat.level == 0, "Diagnostics reports an error for nominal input. 
Message: %s" % diag_stat.message) def test_high_temp_parse(self): - gpu_stat = computer_hw.parse_smi_output(self.high_temp_data) + gpu_stat = parse_smi_output(self.high_temp_data) # Check valid self.assert_(self.high_temp_data, "Unable to read sample output, no test to run") @@ -93,17 +96,17 @@ def test_high_temp_parse(self): self.assert_(gpu_stat.temperature > 90, "Invalid temperature readings. Temperature: %d" % gpu_stat.temperature) self.assert_(gpu_stat.fan_speed > 0 and gpu_stat.fan_speed < 471, "Invalid fan speed readings. Fan Speed %s" % gpu_stat.fan_speed) - diag_stat = computer_hw.gpu_status_to_diag(gpu_stat) + diag_stat = NVidiaTempMonitor.gpu_status_to_diag(gpu_stat) self.assert_(diag_stat.level == 1, "Diagnostics didn't report warning for high temp input. Level %d, Message: %s" % (diag_stat.level, diag_stat.message)) def test_empty_parse(self): - gpu_stat = computer_hw.parse_smi_output('') + gpu_stat = parse_smi_output('') self.assert_(gpu_stat.temperature == 0, "Invalid temperature reading. Should be 0. Reading: %d" % gpu_stat.temperature) - diag_stat = computer_hw.gpu_status_to_diag(gpu_stat) + diag_stat = NVidiaTempMonitor.gpu_status_to_diag(gpu_stat) self.assert_(diag_stat.level == 2, "Diagnostics didn't reports an error for empty input. Level: %d, Message: %s" % (diag_stat.level, diag_stat.message))