diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..6e68502
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,46 @@
+name: CI - Ubuntu Focal
+
+on:
+  # direct pushes to protected branches are not supported
+  pull_request:
+  push:
+  # run every day, at 6am UTC
+  schedule:
+    - cron: '0 6 * * *'
+  # allow manually starting this workflow
+  workflow_dispatch:
+
+jobs:
+  industrial_ci:
+    name: ROS ${{ matrix.ros_distro }} (${{ matrix.ros_repo }})  # one job per matrix entry, named after its distro
+    runs-on: ubuntu-20.04
+
+    strategy:
+      matrix:
+        ROS_DISTRO: [ melodic, noetic ]
+        ROS_REPO: [ main ]
+
+    env:
+      CCACHE_DIR: "${{ github.workspace }}/.ccache"
+
+    steps:
+      - name: Fetch repository
+        uses: actions/checkout@v3
+
+      - name: ccache cache
+        uses: actions/cache@v3
+        with:
+          path: ${{ env.CCACHE_DIR }}
+          # we always want the ccache cache to be persisted, as we cannot easily
+          # determine whether dependencies have changed, and ccache will manage
+          # updating the cache for us. Adding 'run_id' to the key will force an
+          # upload at the end of the job.
+ key: ccache-${{ matrix.ros_distro }}-${{ matrix.ros_repo }}-${{github.run_id}} + restore-keys: | + ccache-${{ matrix.ros_distro }}-${{ matrix.ros_repo }} + + - name: Run industrial_ci + uses: ros-industrial/industrial_ci@master + env: + ROS_DISTRO: ${{ matrix.ros_distro }} + ROS_REPO: ${{ matrix.ros_repo }} diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index ad2292d..0000000 --- a/.travis.yml +++ /dev/null @@ -1,54 +0,0 @@ - -# this is .traivs.yml written by - - -# https://github.com/ros-infrastructure/ros_buildfarm/blob/master/doc/jobs/devel_jobs.rst -# https://github.com/ros-infrastructure/ros_buildfarm/blob/master/doc/jobs/prerelease_jobs.rst -# while this doesn't require sudo we don't want to run within a Docker container -sudo: true -dist: bionic -language: python -addons: - apt: - packages: - - 2to3 -env: - global: - - JOB_PATH=/tmp/devel_job - - ABORT_ON_TEST_FAILURE=1 - - INDEX_URL=https://raw.githubusercontent.com/ros-infrastructure/ros_buildfarm_config/production/index.yaml - matrix: - - CHECK_PYTHON3_COMPILE=true - - ROS_DISTRO_NAME=kinetic OS_NAME=ubuntu OS_CODE_NAME=xenial ARCH=amd64 INDEX_URL=https://raw.githubusercontent.com/ros-infrastructure/ros_buildfarm_config/7e6385e/index.yaml - - ROS_DISTRO_NAME=melodic OS_NAME=ubuntu OS_CODE_NAME=bionic ARCH=amd64 - - ROS_DISTRO_NAME=noetic OS_NAME=ubuntu OS_CODE_NAME=focal ARCH=amd64 -# matrix: -# allow_failures: -# - env: ROS_DISTRO_NAME=indigo OS_NAME=ubuntu OS_CODE_NAME=trusty ARCH=amd64 INDEX_URL=https://raw.githubusercontent.com/ros-infrastructure/ros_buildfarm_config/6a93d17/index.yaml -install: - # check python3 compatibility - - if [ "${CHECK_PYTHON3_COMPILE}" == "true" ]; then bash -c "ret=0; trap 'ret=1' ERR; python3 -m compileall .; 2to3 -w -f except -f execfile -f has_key -f raw_input .; git diff --exit-code . 
> /dev/null; echo Exitting with \$ret; exit \$ret"; exit $?; fi - # either install the latest released version of ros_buildfarm - # - pip install ros_buildfarm - # or checkout a specific branch - - git clone -b master https://github.com/ros-infrastructure/ros_buildfarm /tmp/ros_buildfarm - # force enable `rosdep update --include-eol-distros` until https://github.com/ros-infrastructure/ros_buildfarm/pull/890 released - - (cd /tmp/ros_buildfarm; git checkout f7a12d8) - - (cd /tmp; wget https://github.com/ros-infrastructure/ros_buildfarm/pull/890.diff) - - (cd /tmp/ros_buildfarm; patch -p1 < /tmp/890.diff) - - pip install /tmp/ros_buildfarm - # checkout catkin for catkin_test_results script - - git clone https://github.com/ros/catkin /tmp/catkin - # run devel job for a ROS repository with the same name as this repo - - export REPOSITORY_NAME=`basename $TRAVIS_BUILD_DIR` - # use the code already checked out by Travis - - mkdir -p $JOB_PATH/ws/src - - cp -R $TRAVIS_BUILD_DIR $JOB_PATH/ws/src/ - # generate the script to run a pre-release job for that target and repo - - generate_prerelease_script.py $INDEX_URL $ROS_DISTRO_NAME default $OS_NAME $OS_CODE_NAME $ARCH --output-dir $JOB_PATH --custom-rosdep-update-options=--include-eol-distros - # run the actual job which involves Docker - - cd $JOB_PATH; sh ./prerelease.sh -y -script: - # get summary of test results - - /tmp/catkin/bin/catkin_test_results $JOB_PATH/ws/test_results --all -notifications: - email: false diff --git a/computer_hw/CHANGELOG.rst b/computer_hw/CHANGELOG.rst new file mode 100644 index 0000000..fd3df3d --- /dev/null +++ b/computer_hw/CHANGELOG.rst @@ -0,0 +1,50 @@ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Changelog for package computer_hw +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1.6.32 (2021-05-26) +------------------- +* Merge pull request `#268 `_ from k-okada/fix_for_noetic +* run 2to3 -w -fexcept . +* run 2to3 -w -fprint . 
+* Contributors: Kei Okada + +1.6.31 (2020-04-14) +------------------- +* add --ignore-self arg in ntp_monitor.py (`#259 `_) +* fixed CMake errors +* Contributors: David Feil-Seifer, Shingo Kitagawa + +1.6.30 (2018-04-23) +------------------- +* removed more tests for jenkins build +* Contributors: David Feil-Seifer + +1.6.29 (2018-04-22) +------------------- + +1.6.28 (2018-04-21) +------------------- +* made sure tests only run if CATKIN_ENABLE_TESTING is set +* Contributors: David Feil-Seifer + +1.6.27 (2018-04-20) +------------------- + +1.6.26 (2018-03-19) +------------------- + +1.6.25 (2018-03-19) +------------------- +* updated packages for new maintainer +* updated changelogs +* Contributors: David Feil-Seifer + +1.6.7 (2015-02-11) +------------------ +* Reverted changes +* Added dependencies in catkin +* Added catkin_package() to pr2_robot +* Updated mainpage.dox +* Fix binary location of network_detector +* Contributors: Ryohei Ueda, TheDash diff --git a/computer_hw/CMakeLists.txt b/computer_hw/CMakeLists.txt new file mode 100644 index 0000000..fffd495 --- /dev/null +++ b/computer_hw/CMakeLists.txt @@ -0,0 +1,39 @@ +# http://ros.org/doc/groovy/api/catkin/html/user_guide/supposed.html +cmake_minimum_required(VERSION 2.8.3) +project(computer_hw) +# Load catkin and all dependencies required for this package +# TODO: remove all from COMPONENTS that are not catkin packages. 
+find_package(catkin REQUIRED COMPONENTS roscpp std_msgs)
+
+# catkin_python_setup() has to be called before catkin_package().
+catkin_python_setup()
+
+if(CATKIN_ENABLE_TESTING)
+  catkin_add_nosetests(test/parse_test.py)
+endif()
+
+include_directories(include ${catkin_INCLUDE_DIRS})
+
+# roscpp and std_msgs are catkin packages, so they belong in CATKIN_DEPENDS;
+# network_detector is an executable, so there is no library to export.
+catkin_package(
+  CATKIN_DEPENDS roscpp std_msgs
+)
+
+catkin_install_python(PROGRAMS
+  executables/cpu_monitor.py executables/hd_monitor.py executables/ntp_monitor.py executables/nvidia_temp.py executables/wifi_monitor.py
+  DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION})
+
+add_executable(network_detector src/network_detector.cpp)
+target_link_libraries(network_detector ${catkin_LIBRARIES})
+add_dependencies(network_detector ${catkin_EXPORTED_TARGETS} ${${PROJECT_NAME}_EXPORTED_TARGETS})
+
+install(TARGETS network_detector
+  ARCHIVE DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
+  LIBRARY DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION}
+  RUNTIME DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION})
+
+foreach(dir conf)
+  install(DIRECTORY ${dir}/
+    DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}/${dir})
+endforeach(dir)
diff --git a/computer_hw/README.md b/computer_hw/README.md
new file mode 100644
index 0000000..d0285ba
--- /dev/null
+++ b/computer_hw/README.md
@@ -0,0 +1,17 @@
+# computer_hw
+
+## Development history
+`computer_hw` package was originally made in [pr2_robot](https://github.com/PR2/pr2_robot) repository. See discussion [pr2_common#286](https://github.com/PR2/pr2_common/issues/286) for the migration.
+ +## Installation tips +In order for ipmitool to work on computers with a BMC, the following line needs to appear in `/etc/sudoers`: + +``` +ALL ALL=NOPASSWD: /usr/bin/ipmitool sdr type Temperature +``` + +## Usage / Operation +TBD + +EoF + diff --git a/computer_hw/conf/cpu_monitor.launch b/computer_hw/conf/cpu_monitor.launch new file mode 100644 index 0000000..b661581 --- /dev/null +++ b/computer_hw/conf/cpu_monitor.launch @@ -0,0 +1,8 @@ + + + + + + + diff --git a/computer_hw/conf/hd_monitor.launch b/computer_hw/conf/hd_monitor.launch new file mode 100644 index 0000000..680bc48 --- /dev/null +++ b/computer_hw/conf/hd_monitor.launch @@ -0,0 +1,6 @@ + + + + + diff --git a/computer_hw/conf/monitor.launch b/computer_hw/conf/monitor.launch new file mode 100644 index 0000000..4cc1034 --- /dev/null +++ b/computer_hw/conf/monitor.launch @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/computer_hw/conf/ntp_monitor.launch b/computer_hw/conf/ntp_monitor.launch new file mode 100644 index 0000000..60d4945 --- /dev/null +++ b/computer_hw/conf/ntp_monitor.launch @@ -0,0 +1,4 @@ + + + diff --git a/computer_hw/demo/cpu_monitor.launch b/computer_hw/demo/cpu_monitor.launch new file mode 100644 index 0000000..b661581 --- /dev/null +++ b/computer_hw/demo/cpu_monitor.launch @@ -0,0 +1,8 @@ + + + + + + + diff --git a/computer_hw/executables/README.md b/computer_hw/executables/README.md new file mode 100644 index 0000000..b004ae2 --- /dev/null +++ b/computer_hw/executables/README.md @@ -0,0 +1 @@ +This is the indigo code, and will not work correctly on a precise machine diff --git a/computer_hw/executables/cpu_monitor.py b/computer_hw/executables/cpu_monitor.py new file mode 100755 index 0000000..21aa80e --- /dev/null +++ b/computer_hw/executables/cpu_monitor.py @@ -0,0 +1,844 @@ +#!/usr/bin/env python +# +# Software License Agreement (BSD License) +# +# Copyright (c) 2009, Willow Garage, Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+
+##\author Kevin Watts
+
+from __future__ import with_statement, print_function
+
+import rospy
+
+import traceback
+import threading
+from threading import Timer
+import sys, os, time
+from time import sleep
+import subprocess
+import string
+
+import socket
+
+from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
+
+##### monkey-patch to suppress threading error message in python 2.7.3
+##### See http://stackoverflow.com/questions/13193278/understand-python-threading-bug
+if sys.version_info[:3] == (2, 7, 3):
+    import threading
+    threading._DummyThread._Thread__stop = lambda x: 42
+#####
+
+stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
+
+##\brief Convert subprocess output to text
+##
+## On Python 3 the pipes of subprocess.Popen yield bytes, which cannot be
+## split or compared with the str literals used in this file. On Python 2
+## the output is already str and is returned unchanged.
+def _to_text(data):
+    if isinstance(data, bytes) and not isinstance(data, str):
+        return data.decode('utf-8', 'replace')
+    return data
+
+##\brief Python 2/3 compatible replacement for unicode(text).isnumeric()
+def _is_numeric(text):
+    if isinstance(text, bytes):
+        # Python 2 str has no isnumeric(), only the unicode type does
+        text = text.decode('utf-8', 'replace')
+    return text.isnumeric()
+
+##\brief Run a command through the shell
+##
+## Returns (return code, stdout, stderr) with both streams converted to text
+def _run_shell(cmd):
+    p = subprocess.Popen(cmd, stdout = subprocess.PIPE,
+                         stderr = subprocess.PIPE, shell = True)
+    stdout, stderr = p.communicate()
+    return p.returncode, _to_text(stdout), _to_text(stderr)
+
+##\brief Output entire IPMI data set
+##
+## Parses the '|' separated rows printed by 'sudo ipmitool sdr' and reports
+## CPU / board temperatures, fan speeds and CPU hot alarms.
+## Returns (diag_vals, diag_msgs, diag_level)
+def check_ipmi():
+    diag_vals = []
+    diag_msgs = []
+    diag_level = DiagnosticStatus.OK
+
+    try:
+        retcode, stdout, stderr = _run_shell('sudo ipmitool sdr')
+
+        if retcode != 0:
+            diag_level = DiagnosticStatus.ERROR
+            diag_msgs = [ 'ipmitool Error' ]
+            diag_vals = [ KeyValue(key = 'IPMI Error', value = stderr) ]
+            return diag_vals, diag_msgs, diag_level
+
+        lines = stdout.split('\n')
+        if len(lines) < 2:
+            diag_vals = [ KeyValue(key = 'ipmitool status', value = 'No output') ]
+
+            diag_msgs = [ 'No ipmitool response' ]
+            diag_level = DiagnosticStatus.ERROR
+
+            return diag_vals, diag_msgs, diag_level
+
+        for ln in lines:
+            if len(ln) < 3:
+                continue
+
+            words = ln.split('|')
+            if len(words) < 3:
+                continue
+
+            name = words[0].strip()
+            ipmi_val = words[1].strip()
+
+            # CPU temps
+            if words[0].startswith('CPU') and words[0].strip().endswith('Temp'):
+                if words[1].strip().endswith('degrees C'):
+                    tmp = ipmi_val.rstrip(' degrees C').lstrip()
+                    if _is_numeric(tmp):
+                        temperature = float(tmp)
+                        diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
+
+                        if temperature >= 80 and temperature < 89:
+                            diag_level = max(diag_level, DiagnosticStatus.WARN)
+                            if diag_msgs.count('CPU Hot') == 0:
+                                diag_msgs.append('CPU Warm')
+
+                        if temperature >= 89: # CPU should shut down here
+                            diag_level = max(diag_level, DiagnosticStatus.ERROR)
+                            diag_msgs.append('CPU Hot')
+                            # Don't keep CPU Warm in list if CPU is hot
+                            if diag_msgs.count('CPU Warm') > 0:
+                                idx = diag_msgs.index('CPU Warm')
+                                diag_msgs.pop(idx)
+                else:
+                    diag_vals.append(KeyValue(key = name, value = words[1]))
+
+
+            # MP, BP, FP temps
+            if name == 'MB Temp' or name == 'BP Temp' or name == 'FP Temp':
+                if ipmi_val.endswith('degrees C'):
+                    tmp = ipmi_val.rstrip(' degrees C').lstrip()
+                    diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
+                    # Give temp warning
+                    dev_name = name.split()[0]
+                    if _is_numeric(tmp):
+                        temperature = float(tmp)
+
+                        if temperature >= 60 and temperature < 75:
+                            diag_level = max(diag_level, DiagnosticStatus.WARN)
+                            diag_msgs.append('%s Warm' % dev_name)
+
+                        if temperature >= 75:
+                            diag_level = max(diag_level, DiagnosticStatus.ERROR)
+                            diag_msgs.append('%s Hot' % dev_name)
+                    else:
+                        diag_level = max(diag_level, DiagnosticStatus.ERROR)
+                        diag_msgs.append('%s Error' % dev_name)
+                else:
+                    diag_vals.append(KeyValue(key = name, value = ipmi_val))
+
+            # CPU fan speeds
+            if (name.startswith('CPU') and name.endswith('Fan')) or name == 'MB Fan':
+                if ipmi_val.endswith('RPM'):
+                    rpm = ipmi_val.rstrip(' RPM').lstrip()
+                    if _is_numeric(rpm):
+                        if int(rpm) == 0:
+                            diag_level = max(diag_level, DiagnosticStatus.ERROR)
+                            diag_msgs.append('CPU Fan Off')
+
+                        diag_vals.append(KeyValue(key = name + ' RPM', value = rpm))
+                    else:
+                        diag_vals.append(KeyValue(key = name, value = ipmi_val))
+
+            # If CPU is hot we get an alarm from ipmitool, report that too
+            # CPU should shut down if we get a hot alarm, so report as error
+            if name.startswith('CPU') and name.endswith('hot'):
+                if ipmi_val == '0x01':
+                    diag_vals.append(KeyValue(key = name, value = 'OK'))
+                else:
+                    diag_vals.append(KeyValue(key = name, value = 'Hot'))
+                    diag_level = max(diag_level, DiagnosticStatus.ERROR)
+                    diag_msgs.append('CPU Hot Alarm')
+
+    except Exception:
+        diag_vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
+        diag_level = DiagnosticStatus.ERROR
+        diag_msgs.append('Exception')
+
+    return diag_vals, diag_msgs, diag_level
+
+
+##\brief Check CPU core temps
+##
+## Use 'find /sys -name temp1_input' to find cores
+## Read from every core, divide by 1000
+## Returns (diag_vals, diag_msgs, diag_level)
+def check_core_temps(sys_temp_strings):
+    diag_vals = []
+    diag_level = 0
+    diag_msgs = []
+
+    for index, temp_str in enumerate(sys_temp_strings):
+        # Skip entries too short to be a path (e.g. empty strings)
+        if len(temp_str) < 5:
+            continue
+
+        cmd = 'cat %s' % temp_str
+        retcode, stdout, stderr = _run_shell(cmd)
+
+        if retcode != 0:
+            diag_level = DiagnosticStatus.ERROR
+            diag_msgs = [ 'Core Temp Error' ]
+            diag_vals = [ KeyValue(key = 'Core Temp Error', value = stderr),
+                          KeyValue(key = 'Output', value = stdout) ]
+            return diag_vals, diag_msgs, diag_level
+
+        tmp = stdout.strip()
+        if _is_numeric(tmp):
+            temp = float(tmp) / 1000
+            diag_vals.append(KeyValue(key = 'Core %d Temp' % index, value = str(temp)))
+
+            if temp >= 85 and temp < 90:
+                diag_level = max(diag_level, DiagnosticStatus.WARN)
+                diag_msgs.append('Warm')
+            if temp >= 90:
+                diag_level = max(diag_level, DiagnosticStatus.ERROR)
+                diag_msgs.append('Hot')
+        else:
+            diag_level = max(diag_level, DiagnosticStatus.ERROR) # Error if not numeric value
+            diag_vals.append(KeyValue(key = 'Core %s Temp' % index, value = tmp))
+
+    return diag_vals, diag_msgs, diag_level
+
+## Checks clock speed from reading from CPU info
+## Levels are forced to OK when enforce_speed is false
+## Returns (vals, msgs, lvl)
+def check_clock_speed(enforce_speed):
+    vals = []
+    msgs = []
+    lvl = DiagnosticStatus.OK
+
+    try:
+        retcode, stdout, stderr = _run_shell('cat /proc/cpuinfo | grep MHz')
+
+        if retcode != 0:
+            lvl = DiagnosticStatus.ERROR
+            msgs = [ 'Clock speed error' ]
+            vals = [ KeyValue(key = 'Clock speed error', value = stderr),
+                     KeyValue(key = 'Output', value = stdout) ]
+
+            return (vals, msgs, lvl)
+
+        for index, ln in enumerate(stdout.split('\n')):
+            words = ln.split(':')
+            if len(words) < 2:
+                continue
+
+            speed = words[1].strip().split('.')[0] # Conversion to float doesn't work with decimal
+            vals.append(KeyValue(key = 'Core %d MHz' % index, value = speed))
+            if _is_numeric(speed):
+                mhz = float(speed)
+
+                if mhz < 2240 and mhz > 2150:
+                    lvl = max(lvl, DiagnosticStatus.WARN)
+                if mhz <= 2150:
+                    lvl = max(lvl, DiagnosticStatus.ERROR)
+            else:
+                # Automatically give error if speed isn't a number
+                lvl = max(lvl, DiagnosticStatus.ERROR)
+
+            if not enforce_speed:
+                lvl = DiagnosticStatus.OK
+
+        if lvl == DiagnosticStatus.WARN and enforce_speed:
+            msgs = [ 'Core slowing' ]
+        elif lvl == DiagnosticStatus.ERROR and enforce_speed:
+            msgs = [ 'Core throttled' ]
+
+    except Exception:
+        rospy.logerr(traceback.format_exc())
+        lvl = DiagnosticStatus.ERROR
+        msgs.append('Exception')
+        vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
+
+    return vals, msgs, lvl
+
+
+# Add msgs output, too
+##\brief Uses 'uptime' to see load average
+##
+## Returns (level, message, vals) on every path
+def check_uptime(load1_threshold, load5_threshold):
+    level = DiagnosticStatus.OK
+    vals = []
+
+    load_dict = { 0: 'OK', 1: 'High Load', 2: 'Very High Load' }
+
+    try:
+        retcode, stdout, stderr = _run_shell('uptime')
+
+        if retcode != 0:
+            vals.append(KeyValue(key = 'uptime Failed', value = stderr))
+            # Three values, like every other return of this function
+            return DiagnosticStatus.ERROR, 'uptime Failed', vals
+
+        upvals = stdout.split()
+        load1 = upvals[-3].rstrip(',')
+        load5 = upvals[-2].rstrip(',')
+        load15 = upvals[-1]
+        num_users = upvals[-7]
+
+        # Give warning if we go over load limit
+        if float(load1) > load1_threshold or float(load5) > load5_threshold:
+            level = DiagnosticStatus.WARN
+
+        vals.append(KeyValue(key = 'Load Average Status', value = load_dict[level]))
+        vals.append(KeyValue(key = '1 min Load Average', value = load1))
+        vals.append(KeyValue(key = '1 min Load Average Threshold', value = str(load1_threshold)))
+        vals.append(KeyValue(key = '5 min Load Average', value = load5))
+        vals.append(KeyValue(key = '5 min Load Average Threshold', value = str(load5_threshold)))
+        vals.append(KeyValue(key = '15 min Load Average', value = load15))
+        vals.append(KeyValue(key = 'Number of Users', value = num_users))
+
+    except Exception:
+        rospy.logerr(traceback.format_exc())
+        level = DiagnosticStatus.ERROR
+        vals.append(KeyValue(key = 'Load Average Status', value = traceback.format_exc()))
+
+    return level, load_dict[level], vals
+
+# Add msgs output
+##\brief Uses 'free -m' to check free memory
+##
+## Returns (level, message, values) on every path
+def check_memory():
+    values = []
+    level = DiagnosticStatus.OK
+    msg = ''
+
+    mem_dict = { 0: 'OK', 1: 'Low Memory', 2: 'Very Low Memory' }
+
+    try:
+        retcode, stdout, stderr = _run_shell('free -m')
+
+        if retcode != 0:
+            values.append(KeyValue(key = "\"free -m\" Call Error", value = str(retcode)))
+            # Three values, like every other return of this function
+            return DiagnosticStatus.ERROR, 'Memory Usage Check Error', values
+
+        rows = stdout.split('\n')
+        data = rows[1].split()
+        total_mem = data[1]
+        used_mem = data[2]
+        free_mem = data[3]
+
+        level = DiagnosticStatus.OK
+        if float(free_mem) < 25:
+            level = DiagnosticStatus.WARN
+        if float(free_mem) < 1:
+            level = DiagnosticStatus.ERROR
+
+        values.append(KeyValue(key = 'Memory Status', value = mem_dict[level]))
+        values.append(KeyValue(key = 'Total Memory', value = total_mem))
+        values.append(KeyValue(key = 'Used Memory', value = used_mem))
+        values.append(KeyValue(key = 'Free Memory', value = free_mem))
+
+        msg = mem_dict[level]
+    except Exception as e:
+        rospy.logerr(traceback.format_exc())
+        msg = 'Memory Usage Check Error'
+        values.append(KeyValue(key = msg, value = str(e)))
+        level = DiagnosticStatus.ERROR
+
+    # Return msg so a failed check is not reported as 'Very Low Memory'
+    return level, msg, values
+
+
+
+##\brief Use mpstat to find CPU usage
+##
+## Returns (level, message, vals). If core_count > 0 it is compared with the
+## number of cores mpstat reports and a mismatch is returned as an error.
+usage_old = 0
+has_warned_mpstat = False
+has_error_core_count = False
+def check_mpstat(core_count = -1):
+    # usage_old is assigned below, so it must be declared global; otherwise
+    # it is a local variable that is read before it has a value
+    global usage_old, has_warned_mpstat, has_error_core_count
+
+    vals = []
+    mp_level = DiagnosticStatus.OK
+
+    load_dict = { 0: 'OK', 1: 'High Load', 2: 'Error' }
+
+    try:
+        retcode, stdout, stderr = _run_shell('mpstat -P ALL 1 1')
+
+        if retcode != 0:
+            if not has_warned_mpstat:
+                rospy.logerr("mpstat failed to run for cpu_monitor. Return code %d.", retcode)
+                has_warned_mpstat = True
+
+            mp_level = DiagnosticStatus.ERROR
+            vals.append(KeyValue(key = '\"mpstat\" Call Error', value = str(retcode)))
+            return mp_level, 'Unable to Check CPU Usage', vals
+
+        # Check which column '%idle' is, #4539
+        # mpstat output changed between 8.06 and 8.1
+        rows = stdout.split('\n')
+        col_names = rows[2].split()
+        idle_col = -1 if (len(col_names) > 2 and col_names[-1] == '%idle') else -2
+
+        num_cores = 0
+        cores_loaded = 0
+        for index, row in enumerate(rows):
+            if index < 3:
+                continue
+
+            # Skip row containing 'all' data
+            if row.find('all') > -1:
+                continue
+
+            lst = row.split()
+            if len(lst) < 8:
+                continue
+
+            ## Ignore 'Average: ...' data
+            if lst[0].startswith('Average'):
+                continue
+
+            cpu_name = '%d' % (num_cores)
+            # Some locales print a decimal comma
+            idle = lst[idle_col].replace(',', '.')
+            user = lst[3].replace(',', '.')
+            nice = lst[4].replace(',', '.')
+            system = lst[5].replace(',', '.')
+
+            core_level = 0
+            usage = float(user) + float(nice)
+            if usage > 1000: # wrong reading, use old reading instead
+                rospy.logwarn('Read cpu usage of %f percent. Reverting to previous reading of %f percent'%(usage, usage_old))
+                usage = usage_old
+            usage_old = usage
+
+            if usage > 90.0:
+                cores_loaded += 1
+                core_level = DiagnosticStatus.WARN
+            if usage > 110.0:
+                core_level = DiagnosticStatus.ERROR
+
+            vals.append(KeyValue(key = 'CPU %s Status' % cpu_name, value = load_dict[core_level]))
+            vals.append(KeyValue(key = 'CPU %s User' % cpu_name, value = user))
+            vals.append(KeyValue(key = 'CPU %s Nice' % cpu_name, value = nice))
+            vals.append(KeyValue(key = 'CPU %s System' % cpu_name, value = system))
+            vals.append(KeyValue(key = 'CPU %s Idle' % cpu_name, value = idle))
+
+            num_cores += 1
+
+        # Warn for high load only if we have <= 2 cores that aren't loaded
+        if num_cores - cores_loaded <= 2 and num_cores > 2:
+            mp_level = DiagnosticStatus.WARN
+
+        # Check the number of cores if core_count > 0, #4850
+        if core_count > 0 and core_count != num_cores:
+            mp_level = DiagnosticStatus.ERROR
+            if not has_error_core_count:
+                rospy.logerr('Error checking number of cores. Expected %d, got %d. Computer may have not booted properly.',
+                             core_count, num_cores)
+                has_error_core_count = True
+            return DiagnosticStatus.ERROR, 'Incorrect number of CPU cores', vals
+
+    except Exception as e:
+        mp_level = DiagnosticStatus.ERROR
+        vals.append(KeyValue(key = 'mpstat Exception', value = str(e)))
+
+    return mp_level, load_dict[mp_level], vals
+
+## Returns names for core temperature files
+## Returns list of names, each name can be read like file
+## Returns an empty list if the search fails
+def get_core_temp_names():
+    temp_vals = []
+    try:
+        retcode, stdout, stderr = _run_shell('find /sys/devices -name temp1_input')
+
+        if retcode != 0:
+            rospy.logerr('Error find core temp locations: %s' % stderr)
+            return []
+
+        for ln in stdout.split('\n'):
+            temp_vals.append(ln.strip())
+
+        return temp_vals
+    except Exception:
+        rospy.logerr('Exception finding temp vals: %s' % traceback.format_exc())
+        return []
+
+##\brief Mark a status as lagging or stale based on its last update time
+##
+## More than 20s without an update is 'Lagging' (at least WARN), more than
+## 35s is 'Stale' (ERROR). The first two entries of stat.values are replaced
+## with the update status and the time since the update.
+def update_status_stale(stat, last_update_time):
+    time_since_update = rospy.get_time() - last_update_time
+
+    stale_status = 'OK'
+    if time_since_update > 20 and time_since_update <= 35:
+        stale_status = 'Lagging'
+        if stat.level == DiagnosticStatus.OK:
+            stat.message = stale_status
+        elif stat.message.find(stale_status) < 0:
+            stat.message = ', '.join([stat.message, stale_status])
+        stat.level = max(stat.level, DiagnosticStatus.WARN)
+    if time_since_update > 35:
+        stale_status = 'Stale'
+        if stat.level == DiagnosticStatus.OK:
+            stat.message = stale_status
+        elif stat.message.find(stale_status) < 0:
+            stat.message = ', '.join([stat.message, stale_status])
+        stat.level = max(stat.level, DiagnosticStatus.ERROR)
+
+
+    stat.values.pop(0)
+    stat.values.pop(0)
+    stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status))
+    stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update)))
+
+
+class CPUMonitor():
+    def __init__(self, hostname, diag_hostname):
+
self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10) + + self._mutex = threading.Lock() + + self._check_ipmi = rospy.get_param('~check_ipmi_tool', True) + self._enforce_speed = rospy.get_param('~enforce_clock_speed', True) + + self._check_core_temps = rospy.get_param('~check_core_temps', False) + if self._check_core_temps: + rospy.logwarn('Checking CPU core temperatures is deprecated. This will be removed in D-turtle') + self._check_nfs = rospy.get_param('~check_nfs', False) + if self._check_nfs: + rospy.logwarn('NFS checking is deprecated for CPU monitor. This will be removed in D-turtle') + + self._load1_threshold = rospy.get_param('~load1_threshold', 5.0) + self._load5_threshold = rospy.get_param('~load5_threshold', 3.0) + + self._num_cores = rospy.get_param('~num_cores', 8.0) + + self._temps_timer = None + self._usage_timer = None + self._nfs_timer = None + + # Get temp_input files + self._temp_vals = get_core_temp_names() + + # CPU stats + self._temp_stat = DiagnosticStatus() + self._temp_stat.name = '%s CPU Temperature' % diag_hostname + self._temp_stat.level = 1 + self._temp_stat.hardware_id = hostname + self._temp_stat.message = 'No Data' + self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ), + KeyValue(key = 'Time Since Last Update', value = 'N/A') ] + + self._usage_stat = DiagnosticStatus() + self._usage_stat.name = '%s CPU Usage' % diag_hostname + self._usage_stat.level = 1 + self._usage_stat.hardware_id = hostname + self._usage_stat.message = 'No Data' + self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ), + KeyValue(key = 'Time Since Last Update', value = 'N/A') ] + + self._nfs_stat = DiagnosticStatus() + self._nfs_stat.name = '%s NFS IO' % diag_hostname + self._nfs_stat.level = 1 + self._nfs_stat.hardware_id = hostname + self._nfs_stat.message = 'No Data' + self._nfs_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ), + KeyValue(key = 'Time Since Last 
Update', value = 'N/A') ] + + self._last_temp_time = 0 + self._last_usage_time = 0 + self._last_nfs_time = 0 + self._last_publish_time = 0 + + # Start checking everything + self.check_temps() + if self._check_nfs: + self.check_nfs_stat() + self.check_usage() + + # Restart temperature checking + def _restart_temp_check(self): + rospy.logerr('Restarting temperature check thread in cpu_monitor. This should not happen') + try: + with self._mutex: + if self._temps_timer: + self._temps_timer.cancel() + + self.check_temps() + except Exception as e: + rospy.logerr('Unable to restart temp thread. Error: %s' % traceback.format_exc()) + + + ## Must have the lock to cancel everything + def cancel_timers(self): + if self._temps_timer: + self._temps_timer.cancel() + + if self._nfs_timer: + self._nfs_timer.cancel() + + if self._usage_timer: + self._usage_timer.cancel() + + def check_nfs_stat(self): + if rospy.is_shutdown(): + with self._mutex: + self.cancel_timers() + return + + nfs_level = 0 + msg = 'OK' + vals = [ KeyValue(key = 'Update Status', value = 'OK' ), + KeyValue(key = 'Time Since Last Update', value = str(0) )] + + try: + p = subprocess.Popen('iostat -n', + stdout = subprocess.PIPE, + stderr = subprocess.PIPE, shell = True) + stdout, stderr = p.communicate() + retcode = p.returncode + + if retcode != 0: + nfs_level = DiagnosticStatus.ERROR + msg = 'iostat Error' + vals.append(KeyValue(key = '\"iostat -n\" Call Error', value = str(e))) + stdout = '' + + + for index, row in enumerate(stdout.split('\n')): + if index < 3: + continue + + lst = row.split() + if len(lst) < 7: + continue + + file_sys = lst[0] + read_blk = lst[1] + write_blk = lst[2] + read_blk_dir = lst[3] + write_blk_dir = lst[4] + r_blk_srv = lst[5] + w_blk_srv = lst[6] + + vals.append(KeyValue( + key = '%s Read Blks/s' % file_sys, value=read_blk)) + vals.append(KeyValue( + key = '%s Write Blks/s' % file_sys, value=write_blk)) + vals.append(KeyValue( + key = '%s Read Blk dir/s' % file_sys, 
value=read_blk_dir)) + vals.append(KeyValue( + key = '%s Write Blks dir/s' % file_sys, value=write_blk_dir)) + vals.append(KeyValue( + key = '%s Read Blks srv/s' % file_sys, value=r_blk_srv)) + vals.append(KeyValue( + key = '%s Write Blks srv/s' % file_sys, value=w_blk_srv)) + + except Exception as e: + rospy.logerr(traceback.format_exc()) + nfs_level = DiagnosticStatus.ERROR + msg = 'Exception' + vals.append(KeyValue(key = 'Exception', value = str(e))) + + with self._mutex: + self._nfs_stat.level = nfs_level + self._nfs_stat.message = msg + self._nfs_stat.values = vals + + self._last_nfs_time = rospy.get_time() + + if not rospy.is_shutdown(): + self._nfs_timer = threading.Timer(5.0, self.check_nfs_stat) + self._nfs_timer.start() + else: + self.cancel_timers() + + + ## Call every 10sec at minimum + def check_temps(self): + if rospy.is_shutdown(): + with self._mutex: + self.cancel_timers() + return + + diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ), + KeyValue(key = 'Time Since Last Update', value = str(0) ) ] + diag_msgs = [] + diag_level = 0 + + if self._check_ipmi: + ipmi_vals, ipmi_msgs, ipmi_level = check_ipmi() + diag_vals.extend(ipmi_vals) + diag_msgs.extend(ipmi_msgs) + diag_level = max(diag_level, ipmi_level) + + if self._check_core_temps: + core_vals, core_msgs, core_level = check_core_temps(self._temp_vals) + diag_vals.extend(core_vals) + diag_msgs.extend(core_msgs) + diag_level = max(diag_level, core_level) + + clock_vals, clock_msgs, clock_level = check_clock_speed(self._enforce_speed) + diag_vals.extend(clock_vals) + diag_msgs.extend(clock_msgs) + diag_level = max(diag_level, clock_level) + + diag_log = set(diag_msgs) + if len(diag_log) > 0: + message = ', '.join(diag_log) + else: + message = stat_dict[diag_level] + + with self._mutex: + self._last_temp_time = rospy.get_time() + + self._temp_stat.level = diag_level + self._temp_stat.message = message + self._temp_stat.values = diag_vals + + if not rospy.is_shutdown(): + self._temps_timer 
= threading.Timer(5.0, self.check_temps) + self._temps_timer.start() + else: + self.cancel_timers() + + def check_usage(self): + if rospy.is_shutdown(): + with self._mutex: + self.cancel_timers() + return + + diag_level = 0 + diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ), + KeyValue(key = 'Time Since Last Update', value = 0 )] + diag_msgs = [] + + # Check mpstat + mp_level, mp_msg, mp_vals = check_mpstat(self._num_cores) + diag_vals.extend(mp_vals) + if mp_level > 0: + diag_msgs.append(mp_msg) + diag_level = max(diag_level, mp_level) + + # Check uptime + uptime_level, up_msg, up_vals = check_uptime(self._load1_threshold, self._load5_threshold) + diag_vals.extend(up_vals) + if uptime_level > 0: + diag_msgs.append(up_msg) + diag_level = max(diag_level, uptime_level) + + # Check memory + mem_level, mem_msg, mem_vals = check_memory() + diag_vals.extend(mem_vals) + if mem_level > 0: + diag_msgs.append(mem_msg) + diag_level = max(diag_level, mem_level) + + if diag_msgs and diag_level > 0: + usage_msg = ', '.join(set(diag_msgs)) + else: + usage_msg = stat_dict[diag_level] + + # Update status + with self._mutex: + self._last_usage_time = rospy.get_time() + self._usage_stat.level = diag_level + self._usage_stat.values = diag_vals + + self._usage_stat.message = usage_msg + + if not rospy.is_shutdown(): + self._usage_timer = threading.Timer(5.0, self.check_usage) + self._usage_timer.start() + else: + self.cancel_timers() + + def publish_stats(self): + with self._mutex: + # Update everything with last update times + update_status_stale(self._temp_stat, self._last_temp_time) + update_status_stale(self._usage_stat, self._last_usage_time) + if self._check_nfs: + update_status_stale(self._nfs_stat, self._last_nfs_time) + + msg = DiagnosticArray() + msg.header.stamp = rospy.get_rostime() + msg.status.append(self._temp_stat) + msg.status.append(self._usage_stat) + if self._check_nfs: + msg.status.append(self._nfs_stat) + + if rospy.get_time() - self._last_publish_time 
> 0.5: + self._diag_pub.publish(msg) + self._last_publish_time = rospy.get_time() + + + # Restart temperature checking if it goes stale, #4171 + # Need to run this without mutex + if rospy.get_time() - self._last_temp_time > 90: + self._restart_temp_check() + + +if __name__ == '__main__': + hostname = socket.gethostname() + + import optparse + parser = optparse.OptionParser(usage="usage: cpu_monitor.py [--diag-hostname=cX]") + parser.add_option("--diag-hostname", dest="diag_hostname", + help="Computer name in diagnostics output (ex: 'c1')", + metavar="DIAG_HOSTNAME", + action="store", default = hostname) + options, args = parser.parse_args(rospy.myargv()) + + try: + rospy.init_node('cpu_monitor_%s' % hostname) + except rospy.exceptions.ROSInitException: + print('CPU monitor is unable to initialize node. Master may not be running.', file=sys.stderr) + sys.exit(0) + + cpu_node = CPUMonitor(hostname, options.diag_hostname) + + rate = rospy.Rate(1.0) + try: + while not rospy.is_shutdown(): + rate.sleep() + cpu_node.publish_stats() + except KeyboardInterrupt: + pass + except Exception as e: + traceback.print_exc() + rospy.logerr(traceback.format_exc()) + + cpu_node.cancel_timers() + sys.exit(0) + + + + + + + diff --git a/computer_hw/executables/hd_monitor.py b/computer_hw/executables/hd_monitor.py new file mode 100755 index 0000000..3a8854c --- /dev/null +++ b/computer_hw/executables/hd_monitor.py @@ -0,0 +1,384 @@ +#!/usr/bin/env python +# +# Software License Agreement (BSD License) +# +# Copyright (c) 2009, Willow Garage, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
+# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ +##\author Kevin Watts + +from __future__ import with_statement + +import rospy + +import traceback +import threading +from threading import Timer +import sys, os, time +from time import sleep +import subprocess + +import socket + +from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue + +##### monkey-patch to suppress threading error message in python 2.7.3 +##### See http://stackoverflow.com/questions/13193278/understand-python-threading-bug +if sys.version_info[:3] == (2, 7, 3): + import threading + threading._DummyThread._Thread__stop = lambda x: 42 +##### + +low_hd_level = 5 +critical_hd_level = 1 + +hd_temp_warn = 55 #3580, setting to 55C to after checking manual +hd_temp_error = 70 # Above this temperature, hard drives will have serious problems + +stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' } +temp_dict = { 0: 'OK', 1: 'Hot', 2: 'Critical Hot' } +usage_dict = { 0: 'OK', 1: 'Low Disk Space', 2: 'Very Low Disk Space' } + +REMOVABLE = ['/dev/sda'] # Store removable drives so we can ignore if removed + +## Connects to hddtemp daemon to get temp, HD make. 
def _is_numeric(value):
    """Return True when ``value`` consists solely of numeric characters.

    Python 2/3-compatible replacement for ``unicode(value).isnumeric()``:
    the ``unicode`` builtin does not exist on Python 3, and a Python 2
    byte string has no ``isnumeric`` method.
    """
    if isinstance(value, bytes):
        value = value.decode('utf-8', 'replace')
    elif not isinstance(value, type(u'')):
        value = u'%s' % (value,)
    return value.isnumeric()


def _as_text(value):
    """Return ``value`` unchanged if it is already a string, else ``str(value)``.

    KeyValue.value is a string field, so non-string readings (for example the
    integer 0 that get_hddtemp_data reports on failure) must be converted.
    """
    if isinstance(value, (str, type(u''))):
        return value
    return str(value)


## Connects to hddtemp daemon to get temp, HD make.
def get_hddtemp_data(hostname = 'localhost', port = 7634):
    """Query the hddtemp daemon over TCP and parse its reply.

    @param hostname: host the hddtemp daemon listens on
    @param port: TCP port of the hddtemp daemon
    @return: tuple ``(ok, drives, makes, temps)``. On success ``ok`` is True
             and the three lists hold one text entry per distinct drive make.
             On any failure ``ok`` is False and the lists are
             ``['Exception']``, ``[<traceback text>]`` and ``[0]``.
    """
    hd_sock = None
    try:
        hd_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        hd_sock.connect((hostname, port))

        # recv() returns bytes on Python 3, so collect the raw chunks and
        # decode once instead of concatenating them onto a text string.
        chunks = []
        while True:
            newdat = hd_sock.recv(1024)
            if len(newdat) == 0:
                break
            chunks.append(newdat)
        sock_data = b''.join(chunks).decode('utf-8', 'replace')

        sock_vals = sock_data.split('|')

        # Format of output looks like ' | DRIVE | MAKE | TEMP | '
        idx = 0

        drives = []
        makes = []
        temps = []
        while idx + 5 < len(sock_vals):
            this_drive = sock_vals[idx + 1]
            this_make = sock_vals[idx + 2]
            this_temp = sock_vals[idx + 3]

            # Sometimes we get duplicate makes if hard drives are mounted
            # to two different points
            if this_make not in makes:
                drives.append(this_drive)
                makes.append(this_make)
                temps.append(this_temp)

            idx += 5

        return True, drives, makes, temps
    except Exception:
        rospy.logerr(traceback.format_exc())
        return False, [ 'Exception' ], [ traceback.format_exc() ], [ 0 ]
    finally:
        # Release the socket on the error path too (it used to leak there).
        if hd_sock is not None:
            hd_sock.close()


def update_status_stale(stat, last_update_time):
    """Flag ``stat`` as lagging or stale based on the age of its last update.

    More than 20s since ``last_update_time`` raises the level to at least
    WARN ('Lagging'); more than 35s raises it to ERROR ('Stale'). The first
    two entries of ``stat.values`` are replaced with the refreshed
    'Update Status' and 'Time Since Update' entries.

    @param stat: DiagnosticStatus to update in place
    @param last_update_time: time (as from rospy.get_time()) of the last reading
    """
    time_since_update = rospy.get_time() - last_update_time

    if time_since_update > 35:
        stale_status, stale_level = 'Stale', DiagnosticStatus.ERROR
    elif time_since_update > 20:
        stale_status, stale_level = 'Lagging', DiagnosticStatus.WARN
    else:
        stale_status, stale_level = 'OK', None

    if stale_level is not None:
        if stat.level == DiagnosticStatus.OK:
            stat.message = stale_status
        elif stat.message.find(stale_status) < 0:
            stat.message = ', '.join([stat.message, stale_status])
        stat.level = max(stat.level, stale_level)

    # Slice assignment replaces the two leading bookkeeping entries and,
    # unlike two pop(0) calls, cannot raise on a shorter list.
    # NOTE(review): the checks create the second entry under the key
    # 'Time Since Last Update' while this writes 'Time Since Update';
    # kept as-is because consumers may rely on the published key.
    stat.values[0:2] = [
        KeyValue(key = 'Update Status', value = stale_status),
        KeyValue(key = 'Time Since Update', value = str(time_since_update)) ]


class hd_monitor():
    """Periodically checks hard drive temperature (via hddtemp) and, when a
    home directory is given, disk usage (via df), and publishes the results
    on /diagnostics.
    """

    def __init__(self, hostname, diag_hostname, home_dir = ''):
        """
        @param hostname: used as hardware_id of the published statuses
        @param diag_hostname: computer name used in the status names
        @param home_dir: path handed to ``df``; '' disables the usage check
        """
        self._mutex = threading.Lock()

        self._hostname = hostname
        self._no_temp_warn = rospy.get_param('~no_hd_temp_warn', False)
        if self._no_temp_warn:
            rospy.logwarn('Not warning for HD temperatures is deprecated. This will be removed in D-turtle')
        self._home_dir = home_dir

        self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10)

        self._last_temp_time = 0
        self._last_usage_time = 0
        self._last_publish_time = 0

        self._temp_timer = None
        self._usage_timer = None

        self._temp_stat = DiagnosticStatus()
        self._temp_stat.name = "%s HD Temperature" % diag_hostname
        self._temp_stat.level = DiagnosticStatus.ERROR
        self._temp_stat.hardware_id = hostname
        self._temp_stat.message = 'No Data'
        self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data'),
                                   KeyValue(key = 'Time Since Last Update', value = 'N/A') ]

        if self._home_dir != '':
            self._usage_stat = DiagnosticStatus()
            self._usage_stat.level = DiagnosticStatus.ERROR
            self._usage_stat.hardware_id = hostname
            self._usage_stat.name = '%s HD Usage' % diag_hostname
            self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
                                        KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
            self.check_disk_usage()

        # Each check re-arms its own threading.Timer when it finishes.
        self.check_temps()

    ## Must have the lock to cancel everything
    def cancel_timers(self):
        """Cancel and forget both periodic timers, if they exist."""
        if self._temp_timer:
            self._temp_timer.cancel()
            self._temp_timer = None

        if self._usage_timer:
            self._usage_timer.cancel()
            self._usage_timer = None

    def check_temps(self):
        """Read drive temperatures, update the temperature status and re-arm
        the 10s timer (unless ROS is shutting down)."""
        if rospy.is_shutdown():
            with self._mutex:
                self.cancel_timers()
            return

        diag_strs = [ KeyValue(key = 'Update Status', value = 'OK' ) ,
                      KeyValue(key = 'Time Since Last Update', value = '0' ) ]
        diag_level = DiagnosticStatus.OK

        temp_ok, drives, makes, temps = get_hddtemp_data()

        for index in range(0, len(drives)):
            temp = temps[index]
            numeric = _is_numeric(temp)

            if not numeric and drives[index] not in REMOVABLE:
                temp_level = DiagnosticStatus.ERROR
                temp_ok = False
            elif not numeric:
                # A removable drive without a reading is simply not plugged in.
                temp_level = DiagnosticStatus.OK
                temp = "Removed"
            else:
                temp_level = DiagnosticStatus.OK
                if float(temp) > hd_temp_warn:
                    temp_level = DiagnosticStatus.WARN
                if float(temp) > hd_temp_error:
                    temp_level = DiagnosticStatus.ERROR

            diag_level = max(diag_level, temp_level)

            diag_strs.append(KeyValue(key = 'Disk %d Temp Status' % index, value = temp_dict[temp_level]))
            diag_strs.append(KeyValue(key = 'Disk %d Mount Pt.' % index, value = drives[index]))
            diag_strs.append(KeyValue(key = 'Disk %d Device ID' % index, value = makes[index]))
            diag_strs.append(KeyValue(key = 'Disk %d Temp' % index, value = _as_text(temp)))

        if not temp_ok:
            diag_level = DiagnosticStatus.ERROR

        with self._mutex:
            self._last_temp_time = rospy.get_time()
            self._temp_stat.values = diag_strs
            self._temp_stat.level = diag_level

            # Give No Data message if we have no reading
            self._temp_stat.message = temp_dict[diag_level]
            if not temp_ok:
                self._temp_stat.message = 'Error'

            if self._no_temp_warn and temp_ok:
                self._temp_stat.level = DiagnosticStatus.OK

            if not rospy.is_shutdown():
                self._temp_timer = threading.Timer(10.0, self.check_temps)
                self._temp_timer.start()
            else:
                self.cancel_timers()

    def check_disk_usage(self):
        """Run ``df`` on the home directory, update the usage status and
        re-arm the 5s timer (unless ROS is shutting down)."""
        if rospy.is_shutdown():
            with self._mutex:
                self.cancel_timers()
            return

        diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
                      KeyValue(key = 'Time Since Last Update', value = '0' ) ]
        diag_level = DiagnosticStatus.OK
        diag_message = 'OK'

        try:
            # universal_newlines=True yields text on both Python 2 and 3, so
            # the output can be split on '\n' (bytes would raise on Python 3).
            p = subprocess.Popen(["df", "-P", "--block-size=1G", self._home_dir],
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                                 universal_newlines=True)
            stdout, stderr = p.communicate()
            retcode = p.returncode

            if (retcode == 0):

                diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'OK'))
                row_count = 0
                for row in stdout.split('\n'):
                    fields = row.split()
                    if len(fields) < 2:
                        continue
                    # Skips the header row as well as small drives
                    if not _is_numeric(fields[1]) or float(fields[1]) < 10:
                        continue

                    row_count += 1
                    g_available = fields[-3]
                    name = fields[0]
                    size = fields[1]
                    mount_pt = fields[-1]

                    if (float(g_available) > low_hd_level):
                        level = DiagnosticStatus.OK
                    elif (float(g_available) > critical_hd_level):
                        level = DiagnosticStatus.WARN
                    else:
                        level = DiagnosticStatus.ERROR

                    diag_vals.append(KeyValue(
                            key = 'Disk %d Name' % row_count, value = name))
                    diag_vals.append(KeyValue(
                            key = 'Disk %d Available' % row_count, value = g_available))
                    diag_vals.append(KeyValue(
                            key = 'Disk %d Size' % row_count, value = size))
                    diag_vals.append(KeyValue(
                            key = 'Disk %d Status' % row_count, value = stat_dict[level]))
                    diag_vals.append(KeyValue(
                            key = 'Disk %d Mount Point' % row_count, value = mount_pt))

                    diag_level = max(diag_level, level)
                    diag_message = usage_dict[diag_level]

            else:
                diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Failed'))
                diag_level = DiagnosticStatus.ERROR
                diag_message = stat_dict[diag_level]

        except Exception:
            rospy.logerr(traceback.format_exc())

            diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Exception'))
            diag_vals.append(KeyValue(key = 'Disk Space Ex', value = traceback.format_exc()))

            diag_level = DiagnosticStatus.ERROR
            diag_message = stat_dict[diag_level]

        # Update status
        with self._mutex:
            self._last_usage_time = rospy.get_time()
            self._usage_stat.values = diag_vals
            self._usage_stat.message = diag_message
            self._usage_stat.level = diag_level

            if not rospy.is_shutdown():
                self._usage_timer = threading.Timer(5.0, self.check_disk_usage)
                self._usage_timer.start()
            else:
                self.cancel_timers()

    def publish_stats(self):
        """Refresh staleness info and publish the current statuses, at most
        once every 0.5s."""
        with self._mutex:
            update_status_stale(self._temp_stat, self._last_temp_time)

            msg = DiagnosticArray()
            msg.header.stamp = rospy.get_rostime()
            msg.status.append(self._temp_stat)
            if self._home_dir != '':
                update_status_stale(self._usage_stat, self._last_usage_time)
                msg.status.append(self._usage_stat)

            if rospy.get_time() - self._last_publish_time > 0.5:
                self._diag_pub.publish(msg)
                self._last_publish_time = rospy.get_time()


##\todo Need to check HD input/output too using iostat

if __name__ == '__main__':
    hostname = socket.gethostname()

    import optparse
    parser = optparse.OptionParser(usage="usage: hd_monitor.py [--diag-hostname=cX]")
    parser.add_option("--diag-hostname", dest="diag_hostname",
                      help="Computer name in diagnostics output (ex: 'c1')",
                      metavar="DIAG_HOSTNAME",
                      action="store", default = hostname)
    options, args = parser.parse_args(rospy.myargv())

    home_dir = ''
    if len(args) > 1:
        home_dir = args[1]

    try:
        rospy.init_node('hd_monitor_%s' % hostname)
    except rospy.exceptions.ROSInitException:
        print('HD monitor is unable to initialize node. Master may not be running.')
        sys.exit(0)

    # Use a separate name so the hd_monitor class is not shadowed by its instance.
    monitor = hd_monitor(hostname, options.diag_hostname, home_dir)
    rate = rospy.Rate(1.0)

    try:
        while not rospy.is_shutdown():
            rate.sleep()
            monitor.publish_stats()
    except KeyboardInterrupt:
        pass
    except Exception as e:
        traceback.print_exc()

    monitor.cancel_timers()
    sys.exit(0)
+# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue

import errno
import sys
import rospy
import socket
from subprocess import Popen, PIPE

import time

import re

##### monkey-patch to suppress threading error message in python 2.7.3
##### See http://stackoverflow.com/questions/13193278/understand-python-threading-bug
if sys.version_info[:3] == (2, 7, 3):
    import threading
    threading._DummyThread._Thread__stop = lambda x: 42
#####

NAME = 'ntp_monitor'


def _parse_offset_us(output):
    """Extract the offset reported by ``ntpdate -q`` in microseconds.

    Returns None when the output has no parseable 'offset <value>,' field.
    """
    match = re.search("offset (.*),", output)
    if match is None:
        return None
    try:
        return float(match.group(1)) * 1000000
    except ValueError:
        return None


def ntp_monitor(ntp_hostname, offset=500, self_offset=500, diag_hostname=None,
                error_offset=5000000, ignore_self=False):
    """Publish NTP offset diagnostics about once per second until shutdown.

    @param ntp_hostname: NTP server queried with ``ntpdate -q``
    @param offset: offset (us) from ntp_hostname above which WARN is reported
    @param self_offset: same tolerance for the check against this host
    @param diag_hostname: computer name used in the status names; defaults
                          to the local hostname
    @param error_offset: offset (us) above which ERROR is reported
    @param ignore_self: skip the check against this host when True
    """
    pub = rospy.Publisher("/diagnostics", DiagnosticArray, queue_size=10)
    rospy.init_node(NAME, anonymous=True)

    hostname = socket.gethostname()
    if diag_hostname is None:
        diag_hostname = hostname

    ntp_checks = []
    stat = DiagnosticStatus()
    stat.level = 0
    stat.name = "NTP offset from "+ diag_hostname + " to " + ntp_hostname
    stat.message = "OK"
    stat.hardware_id = hostname
    stat.values = []
    ntp_checks.append((stat, ntp_hostname, offset))

    if not ignore_self:
        self_stat = DiagnosticStatus()
        self_stat.level = DiagnosticStatus.OK
        self_stat.name = "NTP self-offset for "+ diag_hostname
        self_stat.message = "OK"
        self_stat.hardware_id = hostname
        self_stat.values = []
        ntp_checks.append((self_stat, hostname, self_offset))

    while not rospy.is_shutdown():
        msg = DiagnosticArray()
        for st, host, off in ntp_checks:
            try:
                # universal_newlines=True gives text output on Python 2 and 3,
                # so the regex and the KeyValue strings below work on both.
                p = Popen(["ntpdate", "-q", host], stdout=PIPE, stdin=PIPE,
                          stderr=PIPE, universal_newlines=True)
                # communicate() drains the pipes while waiting; calling
                # wait() first can deadlock once a pipe buffer fills up.
                (out, err) = p.communicate()
                res = p.returncode
            except OSError as exc:
                # Distinct names here: the old tuple-unpack into 'msg'
                # clobbered the DiagnosticArray that is published below.
                if exc.errno == errno.EINTR:
                    break #ctrl-c interrupt
                raise

            measured_offset = _parse_offset_us(out) if res == 0 else None

            if measured_offset is not None:
                st.level = DiagnosticStatus.OK
                st.message = "OK"
                st.values = [ KeyValue("Offset (us)", str(measured_offset)),
                              KeyValue("Offset tolerance (us)", str(off)),
                              KeyValue("Offset tolerance (us) for Error", str(error_offset)) ]

                if (abs(measured_offset) > off):
                    st.level = DiagnosticStatus.WARN
                    st.message = "NTP Offset Too High"
                if (abs(measured_offset) > error_offset):
                    st.level = DiagnosticStatus.ERROR
                    st.message = "NTP Offset Too High"

            else:
                st.level = DiagnosticStatus.ERROR
                if res == 0:
                    # ntpdate succeeded but printed nothing we can parse.
                    st.message = "Error Parsing ntpdate Output"
                else:
                    st.message = "Error Running ntpdate. Returned %d" % res
                st.values = [ KeyValue("Offset (us)", "N/A"),
                              KeyValue("Offset tolerance (us)", str(off)),
                              KeyValue("Offset tolerance (us) for Error", str(error_offset)),
                              KeyValue("Output", out),
                              KeyValue("Errors", err) ]
            msg.status.append(st)

        msg.header.stamp = rospy.get_rostime()
        pub.publish(msg)
        time.sleep(1)


def ntp_monitor_main(argv=sys.argv):
    """Parse the command line and run ntp_monitor().

    @param argv: full argument vector (program name first); ROS remapping
                 arguments are stripped before option parsing
    """
    import optparse
    parser = optparse.OptionParser(usage="usage: ntp_monitor ntp-hostname []")
    parser.add_option("--offset-tolerance", dest="offset_tol",
                      action="store", default=500,
                      help="Offset from NTP host", metavar="OFFSET-TOL")
    parser.add_option("--error-offset-tolerance", dest="error_offset_tol",
                      action="store", default=5000000,
                      help="Offset from NTP host. Above this is error", metavar="OFFSET-TOL")
    # The historical spelling mixes '_' and '-'; it is kept and a consistent
    # all-dash alias is accepted as well.
    parser.add_option("--self_offset-tolerance", "--self-offset-tolerance",
                      dest="self_offset_tol",
                      action="store", default=500,
                      help="Offset from self", metavar="SELF_OFFSET-TOL")
    parser.add_option("--diag-hostname", dest="diag_hostname",
                      help="Computer name in diagnostics output (ex: 'c1')",
                      metavar="DIAG_HOSTNAME",
                      action="store", default=None)
    parser.add_option("--ignore-self", dest="ignore_self",
                      help="Ignore self NTP test", action="store_true")
    # Honour the argv handed in by the caller (it used to be ignored).
    options, args = parser.parse_args(rospy.myargv(argv=argv))

    if (len(args) != 2):
        parser.error("Invalid arguments. Must have HOSTNAME [args]. %s" % args)

    try:
        offset = int(options.offset_tol)
        self_offset = int(options.self_offset_tol)
        error_offset = int(options.error_offset_tol)
        ignore_self = options.ignore_self
    except (TypeError, ValueError):
        parser.error("Offsets must be numbers")

    ntp_monitor(args[1], offset, self_offset, options.diag_hostname,
                error_offset, ignore_self)


if __name__ == "__main__":
    try:
        ntp_monitor_main(rospy.myargv())
    except KeyboardInterrupt: pass
    except SystemExit: pass
    except Exception:
        import traceback
        traceback.print_exc()
IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +##\author Kevin Watts +##\brief Publishes diagnostic data on temperature and usage for a Quadro 600 GPU + +from __future__ import with_statement, division + +import rospy + +from computer_hw.gpu_status_monitor import GpuMonitor +from computer_hw.nvidia_util import Nvidia_GPU_Stat + +if __name__ == '__main__': + rospy.init_node('nvidia_temp_monitor') + + monitor = GpuMonitor(Nvidia_GPU_Stat) + monitor.run() + diff --git a/computer_hw/executables/wifi_monitor.py b/computer_hw/executables/wifi_monitor.py new file mode 100755 index 0000000..8fd6406 --- /dev/null +++ b/computer_hw/executables/wifi_monitor.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python +# +# Software License Agreement (BSD License) +# +# Copyright (c) 2010, Willow Garage, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. 
+# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +##\author Kevin Watts +##\brief Republishes the data from ddwrt/accesspoint onto diagnostics + +from __future__ import with_statement + +from computer_status_msgs.msg import AccessPoint +from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue +import rospy +import sys +import threading + +DIAG_NAME = 'Wifi Status (ddwrt)' +WARN_TIME = 30 +ERROR_TIME = 60 + + +def wifi_to_diag(msg): + stat = DiagnosticStatus() + + stat.name = DIAG_NAME + stat.level = DiagnosticStatus.OK + stat.message = 'OK' + + stat.values.append(KeyValue(key='ESSID', value=msg.essid)) + stat.values.append(KeyValue(key='Mac Address', value=msg.macaddr)) + stat.values.append(KeyValue(key='Signal', value=str(msg.signal))) + stat.values.append(KeyValue(key='Noise', value=str(msg.noise))) + stat.values.append(KeyValue(key='Sig/Noise', value=str(msg.snr))) + stat.values.append(KeyValue(key='Channel', value=str(msg.channel))) + stat.values.append(KeyValue(key='Rate', value=msg.rate)) + 
stat.values.append(KeyValue(key='TX Power', value=msg.tx_power)) + stat.values.append(KeyValue(key='Quality', value=str(msg.quality))) + + return stat + +def mark_diag_stale(diag_stat = None, error = False): + if not diag_stat: + diag_stat = DiagnosticStatus() + diag_stat.message = 'No Updates' + diag_stat.name = DIAG_NAME + else: + diag_stat.message = 'Updates Stale' + + diag_stat.level = DiagnosticStatus.WARN + if error: + diag_stat.level = DiagnosticStatus.ERROR + + return diag_stat + +class WifiMonitor(object): + def __init__(self): + self._mutex = threading.Lock() + + self._last_msg = None + self._last_update_time = None + self._start_time = rospy.get_time() + + self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10) + + self._ddwrt_sub = rospy.Subscriber('ddwrt/accesspoint', AccessPoint, self._cb) + + def _cb(self, msg): + with self._mutex: + self._last_msg = msg + self._last_update_time = rospy.get_time() + + def publish_stats(self): + with self._mutex: + if self._last_msg: + ddwrt_stat = wifi_to_diag(self._last_msg) + + update_diff = rospy.get_time() - self._last_update_time + if update_diff > WARN_TIME: + ddwrt_stat = mark_diag_stale(ddwrt_stat) + if (rospy.get_time() - self._last_update_time) > ERROR_TIME: + ddwrt_stat = mark_diag_stale(ddwrt_stat, True) + + ddwrt_stat.values.append(KeyValue(key='Time Since Update', value=str(update_diff))) + else: + error_state = (rospy.get_time() - self._start_time) > ERROR_TIME + ddwrt_stat = mark_diag_stale(None, error_state) + ddwrt_stat.values.append(KeyValue(key='Time Since Update', value="N/A")) + + msg = DiagnosticArray() + msg.header.stamp = rospy.get_rostime() + msg.status.append(ddwrt_stat) + + self._diag_pub.publish(msg) + + +if __name__ == '__main__': + try: + rospy.init_node('ddwrt_diag') + except rospy.exceptions.ROSInitException: + print('Wifi monitor is unable to initialize node. 
Master may not be running.') + sys.exit(2) + + wifi_monitor = WifiMonitor() + rate = rospy.Rate(1.0) + + try: + while not rospy.is_shutdown(): + rate.sleep() + wifi_monitor.publish_stats() + except KeyboardInterrupt: + pass + except Exception as e: + import traceback + traceback.print_exc() + + sys.exit(0) + + + diff --git a/computer_hw/package.xml b/computer_hw/package.xml new file mode 100644 index 0000000..af9340b --- /dev/null +++ b/computer_hw/package.xml @@ -0,0 +1,26 @@ + + computer_hw + 1.6.31 + Monitors the computer's processor and hard drives of the PR2 and publishes data to diagnostics. Originally taken from pr2_robot repo. + ROS Orphaned Package Maintainers + Isaac I.Y. Saito + BSD + + http://www.ros.org/wiki/computer_hw + https://github.com/130s/computer_hw/issues + + Kevin Watts + Isaac Saito + + catkin + + diagnostic_msgs + computer_status_msgs + roscpp + std_msgs + diagnostic_aggregator + libsensors_monitor + rospy + roslib + + diff --git a/computer_hw/setup.py b/computer_hw/setup.py new file mode 100644 index 0000000..821027e --- /dev/null +++ b/computer_hw/setup.py @@ -0,0 +1,11 @@ +## ! DO NOT MANUALLY INVOKE THIS setup.py, USE CATKIN INSTEAD + +from distutils.core import setup +from catkin_pkg.python_setup import generate_distutils_setup + +# fetch values from package.xml +setup_args = generate_distutils_setup( + packages=['computer_hw'], + package_dir={'': 'src'}) + +setup(**setup_args) diff --git a/computer_hw/src/computer_hw/__init__.py b/computer_hw/src/computer_hw/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/computer_hw/src/computer_hw/gpu_stat_entity.py b/computer_hw/src/computer_hw/gpu_stat_entity.py new file mode 100644 index 0000000..d24d66f --- /dev/null +++ b/computer_hw/src/computer_hw/gpu_stat_entity.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python +# +# Software License Agreement (BSD License) +# +# Copyright (c) 2010, Willow Garage, Inc. +# All rights reserved. 
def _stored_property(attr_name):
    """
    @summary: Build a read/write property that simply forwards to the
              attribute called 'attr_name' on the owning object.
    @param attr_name: Name of the backing attribute, e.g. '_fan_speed'.
    @return: property object exposing a getter and a setter.
    """
    def _read(self):
        return getattr(self, attr_name)

    def _write(self, v):
        setattr(self, attr_name, v)

    return property(_read, _write)


class GPU_Stat(object):
    """
    @summary: Plain value holder for the status of a single GPU.
    @note: This class intentionally does not depend on any framework
           (e.g. ROS), so it can be filled in by any status handler.
    """
    # Class-level defaults. Assigning through a property stores the value
    # on the instance, so instances never share state with each other.
    _product_name = ""
    _pci_device_id = ""
    _pci_location = ""
    _display = ""
    _driver_version = ""
    # TODO
    _fan_speed = 0.0
    _gpu_usage = 0
    _memory_usage = 0
    _temperature = 0

    def __init__(self):
        pass

    # Numeric readings.
    fan_speed = _stored_property('_fan_speed')
    gpu_usage = _stored_property('_gpu_usage')
    memory_usage = _stored_property('_memory_usage')
    temperature = _stored_property('_temperature')

    # Descriptive strings.
    product_name = _stored_property('_product_name')
    pci_device_id = _stored_property('_pci_device_id')
    pci_location = _stored_property('_pci_location')
    display = _stored_property('_display')
    driver_version = _stored_property('_driver_version')
class GpuMonitor(object):
    """
    @summary: Reads GPU status through a GPUStatusHandler and publishes it
              as a GPUStatus message on 'gpu_status' and as a
              DiagnosticArray on '/diagnostics'.
    """

    def __init__(self, stat_handler_class):
        """
        @param stat_handler_class: Class object that is to be delegated to return
            GPU status. E.g. computer_hw.nvidia_util.Nvidia_GPU_Stat
        @type stat_handler_class: computer_hw.gpu_util.GPUStatusHandler
        @raise TypeError: When the instantiated handler is not a
            GPUStatusHandler.
        """
        # Instantiating GPU status handler.
        self._gpu_status_handler = stat_handler_class()
        if not isinstance(self._gpu_status_handler, GPUStatusHandler):
            raise TypeError("GPU status handler passed '{}' is not compatible. This class needs a derived class of {}".format(
                stat_handler_class, GPUStatusHandler))
        self._pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10)
        self._gpu_pub = rospy.Publisher('gpu_status', GPUStatus, queue_size=10)

    def gpu_status_to_diag(self, gpu_stat):
        """
        @summary: Build a DiagnosticStatus from a GPU status message.
        @param gpu_stat: Object exposing product_name, pci_device_id,
            pci_location, display, driver_version, temperature, fan_speed,
            gpu_usage and memory_usage.
        @return: DiagnosticStatus whose level only ever escalates; the
            message is the one set by the last check that fired.
        """
        stat = DiagnosticStatus()
        stat.name = 'GPU Status'
        stat.message = 'OK'
        stat.level = DiagnosticStatus.OK
        stat.hardware_id = gpu_stat.pci_device_id

        stat.values.append(KeyValue(key='Product Name', value=gpu_stat.product_name))
        stat.values.append(KeyValue(key='PCI Device/Vendor ID', value=gpu_stat.pci_device_id))
        stat.values.append(KeyValue(key='PCI Location ID', value=gpu_stat.pci_location))
        stat.values.append(KeyValue(key='Display', value=gpu_stat.display))
        stat.values.append(KeyValue(key='Driver Version', value=gpu_stat.driver_version))
        stat.values.append(KeyValue(key='Temperature (C)', value='%.0f' % gpu_stat.temperature))
        stat.values.append(KeyValue(key='Fan Speed (RPM)', value='%.0f' % GPUStatusHandler.rads_to_rpm(gpu_stat.fan_speed)))
        stat.values.append(KeyValue(key='Usage (%)', value='%.0f' % gpu_stat.gpu_usage))
        stat.values.append(KeyValue(key='Memory (%)', value='%.0f' % gpu_stat.memory_usage))

        # Check for valid data. Without identification nothing else is
        # meaningful, so report immediately.
        if not gpu_stat.product_name or not gpu_stat.pci_device_id:
            stat.level = DiagnosticStatus.ERROR
            stat.message = 'No Device Data'
            return stat

        # Check load
        if gpu_stat.gpu_usage > 98:
            stat.level = max(stat.level, DiagnosticStatus.WARN)
            stat.message = 'High Load'

        # Check thresholds: above 90 warns, above 95 is an error.
        if gpu_stat.temperature > 90:
            stat.level = max(stat.level, DiagnosticStatus.WARN)
            stat.message = 'High Temperature'
        if gpu_stat.temperature > 95:
            stat.level = max(stat.level, DiagnosticStatus.ERROR)
            stat.message = 'Temperature Alarm'

        # Check fan
        if gpu_stat.fan_speed == 0:
            stat.level = max(stat.level, DiagnosticStatus.ERROR)
            stat.message = 'No Fan Speed'
        return stat

    def pub_status(self):
        """
        @summary: Query the handler once and publish both the GPUStatus
            message and the diagnostics array. Failures are logged and
            reported as an ERROR diagnostic instead of raising.
        """
        # Start from an empty message so there is always an object to stamp
        # and publish, even when querying the GPU fails below.
        gpu_stat = GPUStatus()
        # Status reported when the handler fails; replaced on success.
        stat = DiagnosticStatus()
        stat.name = 'GPU Status'
        stat.level = DiagnosticStatus.ERROR
        stat.message = 'Unable to process GPU status'
        try:
            _non_ros_gpu_stat = self._gpu_status_handler.get_gpu_status()
            gpu_stat = self._convert_output(_non_ros_gpu_stat)
            stat = self.gpu_status_to_diag(gpu_stat)
            rospy.logdebug("gpu_stat: {}\n".format(gpu_stat))
        except AttributeError as e:
            rospy.logerr('Unable to process GPU status as getting GPU status with proprietary command failed : {}'.format(str(e)))
        except Exception as e:
            rospy.logerr('Unable to process GPU status: {}'.format(str(e)))
            rospy.logerr(traceback.format_exc())

        gpu_stat.header.stamp = rospy.get_rostime()

        array = DiagnosticArray()
        array.header.stamp = rospy.get_rostime()

        array.status = [stat]

        self._pub.publish(array)
        self._gpu_pub.publish(gpu_stat)

    def _convert_output(self, gpu_stat_proprietary):
        """
        @summary: Copy the fields of a framework-agnostic status object into
            a GPUStatus message.
        @param gpu_stat_proprietary: Object returned by the handler's
            'get_gpu_status' method.
        @rtype computer_status_msgs.GPUStatus
        """
        gpu_stat = GPUStatus()
        gpu_stat.product_name = gpu_stat_proprietary.product_name
        gpu_stat.pci_device_id = gpu_stat_proprietary.pci_device_id
        gpu_stat.pci_location = gpu_stat_proprietary.pci_location
        gpu_stat.display = gpu_stat_proprietary.display
        gpu_stat.driver_version = gpu_stat_proprietary.driver_version
        gpu_stat.temperature = gpu_stat_proprietary.temperature
        gpu_stat.fan_speed = gpu_stat_proprietary.fan_speed
        gpu_stat.gpu_usage = gpu_stat_proprietary.gpu_usage
        gpu_stat.memory_usage = gpu_stat_proprietary.memory_usage
        return gpu_stat

    def run(self):
        """
        @summary: Publish status until ROS shuts down, at the rate given by
            the 'gpu_monitor_rate' parameter (default 1.0).
        """
        my_rate = rospy.Rate(rospy.get_param("gpu_monitor_rate", 1.0))
        while not rospy.is_shutdown():
            self.pub_status()
            my_rate.sleep()
b/computer_hw/src/computer_hw/gpu_util.py new file mode 100644 index 0000000..d6c081a --- /dev/null +++ b/computer_hw/src/computer_hw/gpu_util.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python +# +# Software License Agreement (BSD License) +# +# Copyright (c) 2010, Willow Garage, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
class GPUStatusHandler(object):
    """
    @summary: Base class for reading GPU status. Derived classes provide the
              raw tool output via 'get_raw_gpu_status'; this class parses it
              into a GPU_Stat. It is agnostic from framework e.g. ROS.
    """
    _MAX_FAN_RPM = 4500

    @property
    def max_fan_rpm(self):
        """Fan speed in RPM that a reading of 100% corresponds to."""
        return self._MAX_FAN_RPM

    @max_fan_rpm.setter
    def max_fan_rpm(self, v):
        # Stored on the instance; the class-level default is left untouched.
        self._MAX_FAN_RPM = v

    @staticmethod
    def rads_to_rpm(rads):
        """Convert an angular speed from rad/s to revolutions per minute."""
        return rads / (2 * math.pi) * 60

    @staticmethod
    def rpm_to_rads(rpm):
        """Convert an angular speed from revolutions per minute to rad/s."""
        return rpm * (2 * math.pi) / 60

    @staticmethod
    def get_raw_gpu_status():
        """
        @summary: Needs to be implemented in the derived class.
        @return: str of the vendor tool's output.
        @raise NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError(
            "get_raw_gpu_status must be implemented in the derived class")

    @staticmethod
    def _find_val(output, word):
        """
        @summary: Return the value of the first 'name : value' line whose
            name equals 'word' (case-insensitive, surrounding whitespace
            ignored).
        @param output: str of multi-line tool output.
        @param word: Name to look for.
        @return: Stripped value, or '' when no line matches. Lines without
            a colon are skipped; colons inside the value are preserved.
        """
        wanted = word.lower()
        for line in output.split('\n'):
            name, sep, val = line.partition(':')
            if not sep:
                continue
            if name.strip().lower() == wanted:
                return val.strip()
        return ''

    def convert_proprietary_out(self, proprietary_output_raw):
        """
        @summary: Parse Nvidia's SMI tool output and returns in a more
                  programming friendly format.
        @param proprietary_output_raw: str of shell command output i.e. output of
                  'get_raw_gpu_status' method.
        @return File: gpu_stat_entity.GPU_Stat instance
        @raise AttributeError: When 'proprietary_output_raw' is empty.
        @raise ValueError: When a temperature, fan speed or usage value is
                  present but not numeric.
        """
        if not proprietary_output_raw:
            raise AttributeError("Input proprietary data is empty. Can't convert")

        find = GPUStatusHandler._find_val
        gpu_stat = GPU_Stat()

        gpu_stat.product_name = find(proprietary_output_raw, 'Product Name')
        gpu_stat.pci_device_id = find(proprietary_output_raw, 'PCI Device/Vendor ID')
        gpu_stat.pci_location = find(proprietary_output_raw, 'PCI Location ID')
        gpu_stat.display = find(proprietary_output_raw, 'Display')
        gpu_stat.driver_version = find(proprietary_output_raw, 'Driver Version')

        TEMPERATURE_QUERIES = ["Temperature", "GPU Current Temp"]
        for query in TEMPERATURE_QUERIES:
            temp_str = find(proprietary_output_raw, query)
            if temp_str:
                # The number comes first; a unit such as 'C' may follow.
                gpu_stat.temperature = int(temp_str.split()[0])
                break

        fan_str = find(proprietary_output_raw, 'Fan Speed')
        if fan_str:
            # Fan speed is reported in percent; scale to RPM.
            fan_spd = float(fan_str.strip('%').strip()) * 0.01 * self.max_fan_rpm
            # Store fan speed in rad/s
            gpu_stat.fan_speed = GPUStatusHandler.rpm_to_rads(fan_spd)

        usage_str = find(proprietary_output_raw, 'GPU')
        if usage_str:
            gpu_stat.gpu_usage = int(usage_str.strip('%').strip())

        mem_str = find(proprietary_output_raw, 'Memory')
        if mem_str:
            gpu_stat.memory_usage = int(mem_str.strip('%').strip())

        return gpu_stat

    def get_gpu_status(self):
        """
        @summary: Get GPU status and return in an instance.
        @return GPU_Stat instance
        @raise AttributeError: When the raw output is empty.
        """
        raw_output = self.get_raw_gpu_status()
        return self.convert_proprietary_out(raw_output)
class NVidiaTempMonitor(object):
    """
    Publishes nVidia GPU status as a GPUStatus message on 'gpu_status' and
    as a DiagnosticArray on '/diagnostics'.
    """

    def __init__(self):
        self._pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10)
        self._gpu_pub = rospy.Publisher('gpu_status', GPUStatus, queue_size=10)

    @staticmethod
    def gpu_status_to_diag(gpu_stat):
        """
        Build a DiagnosticStatus from a GPU status message.

        Declared static so it can be called both on the class (as the unit
        tests do) and on an instance, under Python 2 and Python 3 alike.

        @param gpu_stat: Object exposing product_name, pci_device_id,
            pci_location, display, driver_version, temperature, fan_speed,
            gpu_usage and memory_usage.
        @return: DiagnosticStatus whose level only ever escalates; the
            message is the one set by the last check that fired.
        """
        stat = DiagnosticStatus()
        stat.name = 'GPU Status'
        stat.message = 'OK'
        stat.level = DiagnosticStatus.OK
        stat.hardware_id = gpu_stat.pci_device_id

        stat.values.append(KeyValue(key='Product Name', value=gpu_stat.product_name))
        stat.values.append(KeyValue(key='PCI Device/Vendor ID', value=gpu_stat.pci_device_id))
        stat.values.append(KeyValue(key='PCI Location ID', value=gpu_stat.pci_location))
        stat.values.append(KeyValue(key='Display', value=gpu_stat.display))
        stat.values.append(KeyValue(key='Driver Version', value=gpu_stat.driver_version))
        stat.values.append(KeyValue(key='Temperature (C)', value='%.0f' % gpu_stat.temperature))
        stat.values.append(KeyValue(key='Fan Speed (RPM)', value='%.0f' % GPUStatusHandler.rads_to_rpm(gpu_stat.fan_speed)))
        stat.values.append(KeyValue(key='Usage (%)', value='%.0f' % gpu_stat.gpu_usage))
        stat.values.append(KeyValue(key='Memory (%)', value='%.0f' % gpu_stat.memory_usage))

        # Check for valid data
        if not gpu_stat.product_name or not gpu_stat.pci_device_id:
            stat.level = DiagnosticStatus.ERROR
            stat.message = 'No Device Data'
            return stat

        # Check load
        if gpu_stat.gpu_usage > 98:
            stat.level = max(stat.level, DiagnosticStatus.WARN)
            stat.message = 'High Load'

        # Check thresholds: above 90 warns, above 95 is an error.
        if gpu_stat.temperature > 90:
            stat.level = max(stat.level, DiagnosticStatus.WARN)
            stat.message = 'High Temperature'
        if gpu_stat.temperature > 95:
            stat.level = max(stat.level, DiagnosticStatus.ERROR)
            stat.message = 'Temperature Alarm'

        # Check fan
        if gpu_stat.fan_speed == 0:
            stat.level = max(stat.level, DiagnosticStatus.ERROR)
            stat.message = 'No Fan Speed'
        return stat

    def pub_status(self):
        """
        Read the GPU once and publish both the GPUStatus message and the
        diagnostics array. Failures are logged and reported as an ERROR
        diagnostic instead of raising.
        """
        gpu_stat = GPUStatus()
        stat = DiagnosticStatus()
        try:
            # Imported here so that importing this module (e.g. for
            # 'parse_smi_output' in tests) does not pull in the reader.
            from computer_hw.nvidia_util import Nvidia_GPU_Stat
            card_out = Nvidia_GPU_Stat().get_raw_gpu_status()
            if not isinstance(card_out, str):
                # Python 3 subprocess pipes may hand back bytes.
                card_out = card_out.decode('utf-8')
            gpu_stat = parse_smi_output(card_out)
            stat = self.gpu_status_to_diag(gpu_stat)
            rospy.loginfo("card_out: {}\ngpu_stat: {}\n".format(card_out, gpu_stat))
        except Exception as e:
            import traceback
            rospy.logerr('Unable to process nVidia GPU data')
            rospy.logerr(traceback.format_exc())
            # Make the failure visible to diagnostics consumers rather than
            # publishing a blank status.
            stat = DiagnosticStatus()
            stat.name = 'GPU Status'
            stat.level = DiagnosticStatus.ERROR
            stat.message = 'Unable to process nVidia GPU data'

        gpu_stat.header.stamp = rospy.get_rostime()

        array = DiagnosticArray()
        array.header.stamp = rospy.get_rostime()

        array.status = [stat]

        self._pub.publish(array)
        self._gpu_pub.publish(gpu_stat)


def parse_smi_output(output):
    """
    Parse the text printed by 'nvidia-smi -a' into a GPUStatus message.

    Fields that are not found keep the default value of a freshly
    constructed GPUStatus.

    @param output: str of the tool output; may be empty.
    @return: computer_status_msgs.GPUStatus
    @raise ValueError: When a temperature, fan speed or usage value is
        present but not numeric.
    """
    find = GPUStatusHandler._find_val
    gpu_stat = GPUStatus()

    gpu_stat.product_name = find(output, 'Product Name')
    gpu_stat.pci_device_id = find(output, 'PCI Device/Vendor ID')
    gpu_stat.pci_location = find(output, 'PCI Location ID')
    gpu_stat.display = find(output, 'Display')
    gpu_stat.driver_version = find(output, 'Driver Version')

    TEMPERATURE_QUERIES = ["Temperature", "GPU Current Temp"]
    for query in TEMPERATURE_QUERIES:
        temp_str = find(output, query)
        if temp_str:
            # The number comes first; a unit such as 'C' may follow.
            gpu_stat.temperature = int(temp_str.split()[0])
            break

    fan_str = find(output, 'Fan Speed')
    if fan_str:
        # Fan speed is reported in percent; scale to RPM.
        fan_spd = float(fan_str.strip('%').strip()) * 0.01 * GPUStatusHandler._MAX_FAN_RPM
        # Store fan speed in rad/s
        gpu_stat.fan_speed = GPUStatusHandler.rpm_to_rads(fan_spd)

    usage_str = find(output, 'GPU')
    if usage_str:
        gpu_stat.gpu_usage = int(usage_str.strip('%').strip())

    mem_str = find(output, 'Memory')
    if mem_str:
        gpu_stat.memory_usage = int(mem_str.strip('%').strip())

    return gpu_stat
class Nvidia_GPU_Stat(GPUStatusHandler):
    def get_raw_gpu_status(self):
        """
        @summary: Relying on a command on the host 'nvidia-smi'.

            Regarding 'nvidia-smi', some people believe that it at least needs to be run by 'root'
            for the first invocation https://serverfault.com/questions/975859/nvidia-smi-must-be-run-by-root-before-it-can-be-used-by-regular-users,
            but it seems to be working without initial invocation.
        @return: str of the output of 'nvidia-smi -a'. Empty str when the
            command is not available, exits with a non-zero code, or prints
            nothing; a warning is logged in the first two cases.
        @todo: OpenQuestion-1: When this method is invoked from a container where
               nvidia-smi, which is typically available on a host, is not easily
               available. -> For docker, passing '--runtime=nvidia' enables the cmd
               from a container. Then show warning when unavailable.
        """
        try:
            # Text mode so that callers get str (not bytes) on Python 3;
            # no shell is needed for a fixed argument list.
            p = subprocess.Popen(['nvidia-smi', '-a'], stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE, universal_newlines=True)
            (o, e) = p.communicate()
        except OSError as err:
            # Raised when the executable cannot be started, e.g. not installed.
            logging.warning("Unable to run 'nvidia-smi': {}".format(err))
            return ''

        if not p.returncode == 0:
            logging.warning("'nvidia-smi -a' exited with code {}: {}".format(
                p.returncode, (e or '').strip()))
            return ''

        if not o:
            return ''
        logging.debug("card_out: {}".format(o))
        return o
warning.c_str() ); + } + last_warning = warning; + return false; + } + bool running = (ifr.ifr_flags & IFF_RUNNING); + bool up = (ifr.ifr_flags & IFF_UP); + + return up && running; +} + +int main( int argc, char **argv ) +{ + ros::init( argc, argv, "network_detector" ); + ros::NodeHandle node; + std::string interface_name; + if( !ros::param::get( "~interface_name", interface_name )) + { + ROS_FATAL( "No parameter 'interface_name' specified. Don't know which interface to watch. Exiting." ); + exit(1); + } + ros::Publisher running_pub = node.advertise( "network/connected", 0, true ); + int loop_count; + bool first_time = true; + bool was_running = false; + float inner_loop_hertz = 4; + ros::Rate loop_rate( inner_loop_hertz ); + if( !initSocket() ) + { + ROS_FATAL( "Failed to open socket for network ioctl: '%s'. Exiting.", + strerror( errno )); + exit(1); + } + while( ros::ok() ) + { + bool is_running = interfaceIsRunning( interface_name ); + if( is_running != was_running || first_time || loop_count > inner_loop_hertz * 5 ) + { + if( is_running != was_running ) + { + ROS_INFO( "Interface '%s' %s.", interface_name.c_str(), (is_running ? "connected" : "disconnected") ); + } + + std_msgs::Bool msg; + msg.data = is_running; + running_pub.publish( msg ); + + loop_count = 0; + first_time = false; + } + ros::spinOnce(); + loop_rate.sleep(); + loop_count++; + was_running = is_running; + } +} diff --git a/computer_hw/test/parse_test.py b/computer_hw/test/parse_test.py new file mode 100755 index 0000000..a79b93a --- /dev/null +++ b/computer_hw/test/parse_test.py @@ -0,0 +1,128 @@ +#!/usr/bin/env python +# +# Software License Agreement (BSD License) +# +# Copyright (c) 2010, Willow Garage, Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials provided +# with the distribution. +# * Neither the name of the Willow Garage nor the names of its +# contributors may be used to endorse or promote products derived +# from this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
##\brief Checks parsing of sample nvidia-smi output and the resulting diagnostics
class TestNominalParser(unittest.TestCase):
    def setUp(self):
        # Sample files are resolved relative to the installed/source package dir.
        pkg_dir = roslib.packages.get_pkg_dir('computer_hw')
        with open(os.path.join(pkg_dir, TEXT_PATH), 'r') as f:
            self.data = f.read()

        with open(os.path.join(pkg_dir, TEXT_HIGH_TEMP_PATH), 'r') as f:
            self.high_temp_data = f.read()

    def _check_string_fields(self, gpu_stat):
        """Assert that every string field of the parsed message is non-empty."""
        self.assertTrue(gpu_stat.pci_device_id, "No PCI Device ID found")
        self.assertTrue(gpu_stat.pci_location, "No PCI Location found")
        self.assertTrue(gpu_stat.display, "No Display found")
        self.assertTrue(gpu_stat.driver_version, "No Driver Version found")
        self.assertTrue(gpu_stat.product_name, "No Product Name found")

    def test_parse(self):
        # Check valid
        self.assertTrue(self.data, "Unable to read sample output, no test to run")

        gpu_stat = parse_smi_output(self.data)

        # Check all string fields of message
        # NOTE(review): the 2021 sample output appears to list 'Device Id' /
        # 'Bus Id' / 'Display Mode' rather than 'PCI Device/Vendor ID' /
        # 'PCI Location ID' / 'Display', so these checks look likely to fail
        # against TEXT_PATH -- confirm and adjust the parser or the sample.
        self._check_string_fields(gpu_stat)

        self.assertTrue(gpu_stat.temperature > 40 and gpu_stat.temperature < 90, "Invalid temperature readings. Temperature: %d" % gpu_stat.temperature)
        self.assertTrue(gpu_stat.fan_speed > 0 and gpu_stat.fan_speed < 471, "Invalid fan speed readings. Fan Speed %f" % gpu_stat.fan_speed)

        diag_stat = NVidiaTempMonitor.gpu_status_to_diag(gpu_stat)

        self.assertTrue(diag_stat.level == 0, "Diagnostics reports an error for nominal input. Message: %s" % diag_stat.message)

    def test_high_temp_parse(self):
        # Check valid
        self.assertTrue(self.high_temp_data, "Unable to read sample output, no test to run")

        gpu_stat = parse_smi_output(self.high_temp_data)

        # Check all string fields of message
        self._check_string_fields(gpu_stat)

        self.assertTrue(gpu_stat.temperature > 90, "Invalid temperature readings. Temperature: %d" % gpu_stat.temperature)
        self.assertTrue(gpu_stat.fan_speed > 0 and gpu_stat.fan_speed < 471, "Invalid fan speed readings. Fan Speed %s" % gpu_stat.fan_speed)

        diag_stat = NVidiaTempMonitor.gpu_status_to_diag(gpu_stat)

        self.assertTrue(diag_stat.level == 1, "Diagnostics didn't report warning for high temp input. Level %d, Message: %s" % (diag_stat.level, diag_stat.message))

    def test_empty_parse(self):
        gpu_stat = parse_smi_output('')

        self.assertTrue(gpu_stat.temperature == 0, "Invalid temperature reading. Should be 0. Reading: %d" % gpu_stat.temperature)

        diag_stat = NVidiaTempMonitor.gpu_status_to_diag(gpu_stat)

        self.assertTrue(diag_stat.level == 2, "Diagnostics didn't reports an error for empty input. Level: %d, Message: %s" % (diag_stat.level, diag_stat.message))


if __name__ == '__main__':
    if len(sys.argv) > 1 and sys.argv[1] == '-v':
        # Use to run tests verbosely
        suite = unittest.TestSuite()
        suite.addTest(TestNominalParser('test_parse'))
        suite.addTest(TestNominalParser('test_empty_parse'))
        suite.addTest(TestNominalParser('test_high_temp_parse'))

        unittest.TextTestRunner(verbosity=2).run(suite)
    else:
        import rostest
        rostest.unitrun(PKG, 'parse_nominal', TestNominalParser)
ECC Object : N/A + Power Management Object : N/A + GPU Operation Mode + Current : N/A + Pending : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x01 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x1C0310DE + Bus Id : 00000000:01:00.0 + Sub System Id : 0x61633842 + GPU Link Info + PCIe Generation + Max : 3 + Current : 1 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Fan Speed : 5 % + Performance State : P8 + Clocks Throttle Reasons + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active diff --git a/computer_hw/test/sample_output/nvidia_smi_out_2021.txt b/computer_hw/test/sample_output/nvidia_smi_out_2021.txt new file mode 100644 index 0000000..d34bfd8 --- /dev/null +++ b/computer_hw/test/sample_output/nvidia_smi_out_2021.txt @@ -0,0 +1,193 @@ +==============NVSMI LOG============== [115/1954] + +Timestamp : Sat Sep 18 04:23:41 2021 +Driver Version : 440.64 +CUDA Version : 10.2 + +Attached GPUs : 1 +GPU 00000000:01:00.0 + Product Name : GeForce GTX 1060 6GB + Product Brand : GeForce + Display Mode : Enabled + Display Active : Enabled + Persistence Mode : Enabled + Accounting Mode : Disabled + Accounting Mode Buffer Size : 4000 + Driver Model + Current : N/A + Pending : N/A + Serial Number : N/A + GPU UUID : GPU-7f9b4a72-68fe-e2a9-8907-4590704d3431 + Minor Number : 0 + VBIOS Version : 86.06.45.00.60 + MultiGPU Board : No + Board ID : 0x100 + GPU Part Number : N/A + Inforom Version + Image Version : G001.0000.01.04 + OEM Object : 1.1 + ECC Object : N/A + Power Management Object : N/A + GPU Operation Mode + 
Current : N/A + Pending : N/A + GPU Virtualization Mode + Virtualization Mode : None + Host VGPU Mode : N/A + IBMNPU + Relaxed Ordering Mode : N/A + PCI + Bus : 0x01 + Device : 0x00 + Domain : 0x0000 + Device Id : 0x1C0310DE + Bus Id : 00000000:01:00.0 + Sub System Id : 0x61633842 + GPU Link Info + PCIe Generation + Max : 3 + Current : 1 + Link Width + Max : 16x + Current : 16x + Bridge Chip + Type : N/A + Firmware : N/A + Replays Since Reset : 0 + Replay Number Rollovers : 0 + Tx Throughput : 0 KB/s + Rx Throughput : 0 KB/s + Fan Speed : 5 % + Performance State : P8 + Clocks Throttle Reasons [54/1954] + Idle : Active + Applications Clocks Setting : Not Active + SW Power Cap : Not Active + HW Slowdown : Not Active + HW Thermal Slowdown : Not Active + HW Power Brake Slowdown : Not Active + Sync Boost : Not Active + SW Thermal Slowdown : Not Active + Display Clock Setting : Not Active + FB Memory Usage + Total : 6077 MiB + Used : 114 MiB + Free : 5963 MiB + BAR1 Memory Usage + Total : 256 MiB + Used : 5 MiB + Free : 251 MiB + Compute Mode : Default + Utilization + Gpu : 0 % + Memory : 2 % + Encoder : 0 % + Decoder : 0 % + Encoder Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + FBC Stats + Active Sessions : 0 + Average FPS : 0 + Average Latency : 0 + Ecc Mode + Current : N/A + Pending : N/A + ECC Errors + Volatile + Single Bit + Device Memory : N/A + Register File : N/A + L1 Cache : N/A + L2 Cache : N/A + Texture Memory : N/A + Texture Shared : N/A + CBU : N/A + Total : N/A + Double Bit + Device Memory : N/A + Register File : N/A + L1 Cache : N/A + L2 Cache : N/A + Texture Memory : N/A + Texture Shared : N/A + CBU : N/A + Total : N/A + Aggregate + Single Bit + Device Memory : N/A + Register File : N/A + L1 Cache : N/A + L2 Cache : N/A + Texture Memory : N/A + Texture Shared : N/A + CBU : N/A + Total : N/A + Double Bit + Device Memory : N/A + Register File : N/A + L1 Cache : N/A + L2 Cache : N/A + Texture Memory : N/A + Texture Shared : N/A + CBU 
: N/A + Total : N/A + Retired Pages + Single Bit ECC : N/A + Double Bit ECC : N/A + Pending Page Blacklist : N/A + Temperature + GPU Current Temp : 51 C + GPU Shutdown Temp : 102 C + GPU Slowdown Temp : 99 C + GPU Max Operating Temp : N/A + Memory Current Temp : N/A + Memory Max Operating Temp : N/A + Power Readings + Power Management : Supported + Power Draw : 6.00 W + Power Limit : 120.00 W + Default Power Limit : 120.00 W + Enforced Power Limit : 120.00 W + Min Power Limit : 60.00 W + Max Power Limit : 140.00 W + Clocks + Graphics : 139 MHz + SM : 139 MHz + Memory : 405 MHz + Video : 544 MHz + Applications Clocks + Graphics : N/A + Memory : N/A + Default Applications Clocks + Graphics : N/A + Memory : N/A + Max Clocks + Graphics : 2012 MHz + SM : 2012 MHz + Memory : 4004 MHz + Video : 1708 MHz + Max Customer Boost Clocks + Graphics : N/A + Clock Policy + Auto Boost : N/A + Auto Boost Default : N/A + Processes + + +gpu_stat: header: + seq: 0 + stamp: + secs: 0 + nsecs: 0 + frame_id: '' +product_name: "GeForce GTX 1060 6GB" +pci_device_id: '' +pci_location: '' +display: '' +driver_version: "440.64" +temperature: 51 +fan_speed: 23.5619449019 +gpu_usage: 0 +memory_usage: 2