unknown state reduced to heartbeat monitoring timings and status

This commit is contained in:
Dirk Alders 2023-12-20 09:36:42 +01:00
parent 149cf7cdeb
commit f6ab1dc4ce

View File

@ -12,6 +12,10 @@ logger = logging.getLogger(ROOT_LOGGER_NAME).getChild(__name__)
class base(object):
MONITORING_HEARTBEAT = "heartbeat"
MONITORING_BATTERY = "battery"
MONITORING_FOLLOW_SETPOINT = "follow_setpoint"
#
FOLLOW_REQUEST_WARNING = 5 # Seconds, till warning comes up, if device does not follow the command
FOLLOW_REQUEST_ERROR = 60 # Seconds, till error comes up, if device does not follow the command
FOLLOW_KEY = None
@ -19,12 +23,14 @@ class base(object):
BATTERY_LVL_WARNING = 15
BATTERY_LVL_ERROR = 5
#
LAST_MSG_WARNING = 6 * 24 * 60 * 60
LAST_MSG_ERROR = 24 * 24 * 60 * 60
LAST_MSG_WARNING = 6 * 60 * 60
LAST_MSG_ERROR = 24 * 60 * 60
def __init__(self, mqtt_client: mqtt.mqtt_client, topic):
self.topic = topic
#
self.__unknown_tm__ = {}
#
mqtt_client.add_callback(topic, self.__rx__)
mqtt_client.add_callback(topic + '/#', self.__rx__)
#
@ -58,8 +64,8 @@ class base(object):
#
# battery level
#
if "battery" in payload and message.topic == self.topic:
self.battery = payload["battery"]
if self.MONITORING_BATTERY in payload and message.topic == self.topic:
self.battery = payload[self.MONITORING_BATTERY]
def target(self, key, value):
tm_t, value_t = self.__target_storage__.get(key, (0, None))
@ -75,51 +81,70 @@ class base(object):
#
# HEARTBEAT
#
if key == "heartbeat":
if key == self.MONITORING_HEARTBEAT:
if self.last_device_msg is None:
return {"status": nagios.Nagios.UNKNOWN, "msg": "Device exists, but no data received or unknown monitoring"}
return self.__nagios_return__(self.MONITORING_HEARTBEAT, nagios.Nagios.UNKNOWN, "Device exists, but no data received")
else:
dt = time.time() - self.last_device_msg
dt_disp = dt / 60 / 60
if dt > self.LAST_MSG_ERROR:
return {"status": nagios.Nagios.ERROR, "msg": "Last message %.1fh ago" % dt_disp}
return self.__nagios_return__(self.MONITORING_HEARTBEAT, nagios.Nagios.ERROR, "Last message %.1fh ago" % dt_disp)
elif dt > self.LAST_MSG_WARNING:
return {"status": nagios.Nagios.WARNING, "msg": "Last message %.1fh ago" % dt_disp}
return self.__nagios_return__(self.MONITORING_HEARTBEAT, nagios.Nagios.WARNING, "Last message %.1fh ago" % dt_disp)
else:
return {"status": nagios.Nagios.OK, "msg": "Last message %.1fh ago" % dt_disp}
return self.__nagios_return__(self.MONITORING_HEARTBEAT, nagios.Nagios.OK, "Last message %.1fh ago" % dt_disp)
#
# FOLLOW SETPOINT
#
elif key == 'follow_setpoint':
elif key == self.MONITORING_FOLLOW_SETPOINT:
if self.FOLLOW_KEY is None:
return {"status": nagios.Nagios.UNKNOWN, "msg": "Device exist, but does not follow any setpoint."}
return self.__nagios_return__(self.MONITORING_FOLLOW_SETPOINT, nagios.Nagios.UNKNOWN, "Device exist, but does not follow any setpoint.", force=True)
tm_s, value_s = self.__state_storage__.get(self.FOLLOW_KEY, (0, None))
try:
tm_t, value_t = self.__target_storage__[self.FOLLOW_KEY]
except KeyError:
if value_s is not None:
return {"status": nagios.Nagios.WARNING, "msg": "Current temperature setpoint %.1f°C (age=%.1fmin), but never received a setpoint" % (value_s, (time.time()-tm_s)/60)}
return {"status": nagios.Nagios.UNKNOWN, "msg": "Device exists, but no data received"}
return self.__nagios_return__(self.MONITORING_FOLLOW_SETPOINT, nagios.Nagios.WARNING, "Current temperature setpoint %.1f°C (age=%.1fmin), but never received a setpoint" % (value_s, (time.time()-tm_s)/60))
return self.__nagios_return__(self.MONITORING_FOLLOW_SETPOINT, nagios.Nagios.UNKNOWN, "Device exists, but no data received")
else:
tm = time.time()
dt = tm - tm_t
if value_t != value_s and dt > self.FOLLOW_REQUEST_ERROR:
return {"status": nagios.Nagios.ERROR, "msg": "Requested setpoint unequal valve setpoint %.1f°C since %.1fmin" % (value_s, (time.time()-tm_s)/60)}
return self.__nagios_return__(self.MONITORING_FOLLOW_SETPOINT, nagios.Nagios.ERROR, "Requested setpoint unequal valve setpoint %.1f°C since %.1fmin" % (value_s, (time.time()-tm_s)/60))
elif value_t != value_s and dt > self.FOLLOW_REQUEST_WARNING:
return {"status": nagios.Nagios.WARNING, "msg": "Requested setpoint unequal valve setpoint %.1f°C since %.1fmin" % (value_s, (time.time()-tm_s))}
return {"status": nagios.Nagios.OK, "msg": "Requested setpoint equal valve setpoint %.1f°C" % value_s}
return self.__nagios_return__(self.MONITORING_FOLLOW_SETPOINT, nagios.Nagios.WARNING, "Requested setpoint unequal valve setpoint %.1f°C since %.1fmin" % (value_s, (time.time()-tm_s)))
return self.__nagios_return__(self.MONITORING_FOLLOW_SETPOINT, nagios.Nagios.OK, "Requested setpoint equal valve setpoint %.1f°C" % value_s)
#
# BATTERY
#
elif key == "battery":
elif key == self.MONITORING_BATTERY:
if self.battery is None:
return {"status": nagios.Nagios.UNKNOWN, "msg": "Device exists, but no data received or unknown monitoring"}
return self.__nagios_return__(self.MONITORING_BATTERY, nagios.Nagios.UNKNOWN, "Device exists, but no data received or unknown monitoring")
elif self.battery <= self.BATTERY_LVL_ERROR:
return {"status": nagios.Nagios.ERROR, "msg": "Battery level critical low (%.1f%%)" % self.battery}
return self.__nagios_return__(self.MONITORING_BATTERY, nagios.Nagios.ERROR, "Battery level critical low (%.1f%%)" % self.battery)
elif self.battery <= self.BATTERY_LVL_WARNING:
return {"status": nagios.Nagios.WARNING, "msg": "Battery level low (%.1f%%)" % self.battery}
return self.__nagios_return__(self.MONITORING_BATTERY, nagios.Nagios.WARNING, "Battery level low (%.1f%%)" % self.battery)
else:
return {"status": nagios.Nagios.OK, "msg": "Battery okay (%.1f%%)" % self.battery}
return self.__nagios_return__(self.MONITORING_BATTERY, nagios.Nagios.OK, "Battery okay (%.1f%%)" % self.battery)
def __nagios_return__(self, monitoring_name, status, msg, force=False):
    """
    Build the nagios result dictionary for one monitoring.

    An UNKNOWN status is not reported immediately (unless ``force`` is set).
    Instead, the time of the first UNKNOWN result for ``monitoring_name`` is
    remembered and the reported status depends on how long that state lasts:
    OK below LAST_MSG_WARNING, WARNING from LAST_MSG_WARNING on and UNKNOWN
    from LAST_MSG_ERROR on. The elapsed time is appended to the message.
    Any other status (or a forced one) resets the remembered time.

    :param monitoring_name: Key used to track the UNKNOWN start time
    :param status: The nagios status determined by the caller
    :param msg: The message text for the result
    :param force: If True, the status is passed through unchanged
    :return: Dictionary with the keys "status" and "msg"
    """
    now = time.time()
    unknown_since = self.__unknown_tm__
    #
    # Pass through: every non-UNKNOWN (or forced) result clears the timer
    #
    if status != nagios.Nagios.UNKNOWN or force:
        unknown_since[monitoring_name] = None
        return {"status": status, "msg": msg}
    #
    # UNKNOWN: start the timer on first occurrence and rate by its age
    #
    if unknown_since.get(monitoring_name) is None:
        unknown_since[monitoring_name] = now
    elapsed = now - unknown_since[monitoring_name]
    if elapsed >= self.LAST_MSG_ERROR:
        reported = nagios.Nagios.UNKNOWN
    elif elapsed >= self.LAST_MSG_WARNING:
        reported = nagios.Nagios.WARNING
    else:
        reported = nagios.Nagios.OK
    return {"status": reported, "msg": msg + " - since %.1fh" % (elapsed / 3600)}
class group(object):