poll_errors.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. #!/usr/bin/env python
  2. #
  3. # Copyright (c) 2017-2023 Joe Clarke <jclarke@cisco.com>
  4. # All rights reserved.
  5. #
  6. # Redistribution and use in source and binary forms, with or without
  7. # modification, are permitted provided that the following conditions
  8. # are met:
  9. # 1. Redistributions of source code must retain the above copyright
  10. # notice, this list of conditions and the following disclaimer.
  11. # 2. Redistributions in binary form must reproduce the above copyright
  12. # notice, this list of conditions and the following disclaimer in the
  13. # documentation and/or other materials provided with the distribution.
  14. #
  15. # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  16. # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  18. # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  19. # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  21. # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  22. # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  23. # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  24. # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  25. # SUCH DAMAGE.
  26. from __future__ import print_function
  27. import netsnmp # type: ignore
  28. import os
  29. import json
  30. import argparse
  31. import sys
  32. from sparker import Sparker, MessageType # type: ignore
  33. import CLEUCreds # type: ignore
  34. from cleu.config import Config as C # type: ignore
  35. CACHE_FILE = "/home/jclarke/errors_cache"
  36. THRESHOLD = 1
  37. WINDOW = 12
  38. REARM = 6
  39. IF_UP = 1
  40. prev_state = {}
  41. curr_state = {}
  42. if __name__ == "__main__":
  43. spark = Sparker(token=CLEUCreds.SPARK_TOKEN)
  44. parser = argparse.ArgumentParser(prog=sys.argv[0], description="Poll errors from network devices")
  45. parser.add_argument(
  46. "--name",
  47. "-n",
  48. metavar="<NAME>",
  49. help="Name of the poller",
  50. required=True,
  51. )
  52. parser.add_argument(
  53. "--device-file",
  54. "-f",
  55. metavar="<DEVICE_FILE>",
  56. help="Path to the JSON file containing the devices to poll",
  57. required=True,
  58. )
  59. parser.add_argument("--webex-room", "-r", metavar="<ROOM_NAME>", help="Name of Webex room to send alerts to", required=True)
  60. parser.add_argument(
  61. "--ignore-interfaces-file", "-i", metavar="<IGNORE_FILE>", help="Path to JSON file that maps devices and interfaces to ignore"
  62. )
  63. parser.add_argument("--no-discards", help="Poll ifIn/OutDiscards (default: discards are polled)", action="store_true")
  64. args = parser.parse_args()
  65. devices = None
  66. try:
  67. with open(args.device_file) as fd:
  68. devices = json.load(fd)
  69. except Exception as e:
  70. print("ERROR: Failed to load device file {}: {}".format(args.device_file, getattr(e, "message", repr(e))))
  71. sys.exit(1)
  72. ignore_interfaces = {}
  73. if args.ignore_interfaces_file:
  74. try:
  75. with open(args.ignore_interfaces_file) as fd:
  76. ignore_interfaces = json.load(fd)
  77. except Exception as e:
  78. print(
  79. "ERROR: Failed to load the ignore interfaces file {}: {}".format(
  80. args.ignore_interfaces_file, getattr(e, "message", repr(e))
  81. )
  82. )
  83. sys.exit(1)
  84. cache_file = CACHE_FILE + "_" + args.name + ".dat"
  85. if os.path.exists(cache_file):
  86. with open(cache_file, "r") as fd:
  87. prev_state = json.load(fd)
  88. for device in devices:
  89. swent = {}
  90. if not args.no_discards:
  91. vars = netsnmp.VarList(
  92. netsnmp.Varbind("ifDescr"),
  93. netsnmp.Varbind("ifInErrors"),
  94. netsnmp.Varbind("ifOutErrors"),
  95. netsnmp.Varbind("ifInDiscards"),
  96. netsnmp.Varbind("ifOutDiscards"),
  97. netsnmp.Varbind("ifAlias"),
  98. netsnmp.Varbind("ifOperStatus"),
  99. )
  100. else:
  101. vars = netsnmp.VarList(
  102. netsnmp.Varbind("ifDescr"),
  103. netsnmp.Varbind("ifInErrors"),
  104. netsnmp.Varbind("ifOutErrors"),
  105. netsnmp.Varbind("ifAlias"),
  106. netsnmp.Varbind("ifOperStatus"),
  107. )
  108. netsnmp.snmpwalk(
  109. vars,
  110. Version=3,
  111. DestHost=device,
  112. SecLevel="authPriv",
  113. SecName="CLEUR",
  114. AuthProto="SHA",
  115. AuthPass=CLEUCreds.SNMP_AUTH_PASS,
  116. PrivProto="AES",
  117. PrivPass=CLEUCreds.SNMP_PRIV_PASS,
  118. )
  119. for var in vars:
  120. if var.iid not in swent:
  121. swent[var.iid] = {}
  122. swent[var.iid]["count"] = 0
  123. swent[var.iid]["suppressed"] = False
  124. swent[var.iid][var.tag] = var.val
  125. curr_state[device] = swent
  126. if not device in prev_state:
  127. continue
  128. for ins, vard in list(curr_state[device].items()):
  129. if not ins in prev_state[device]:
  130. continue
  131. if not "ifDescr" in vard:
  132. continue
  133. if "ifOperStatus" not in vard or int(vard["ifOperStatus"]) != IF_UP:
  134. continue
  135. if not "ifAlias" in vard:
  136. vard["ifAlias"] = ""
  137. if "count" in prev_state[device][ins]:
  138. curr_state[device][ins]["count"] = prev_state[device][ins]["count"]
  139. if "suppressed" in prev_state[device][ins]:
  140. curr_state[device][ins]["suppressed"] = prev_state[device][ins]["suppressed"]
  141. if_descr = vard["ifDescr"]
  142. if_alias = vard["ifAlias"]
  143. if device in ignore_interfaces and if_descr in ignore_interfaces[device]:
  144. continue
  145. found_error = False
  146. for k, v in list(vard.items()):
  147. if k == "ifDescr" or k == "ifAlias" or k == "count" or k == "suppressed":
  148. continue
  149. if k in prev_state[device][ins]:
  150. diff = int(v) - int(prev_state[device][ins][k])
  151. if diff >= THRESHOLD:
  152. found_error = True
  153. if curr_state[device][ins]["count"] < WINDOW and not curr_state[device][ins]["suppressed"]:
  154. spark.post_to_spark(
  155. C.WEBEX_TEAM,
  156. args.webex_room,
  157. "Interface **{}** ({}) on device _{}_ has seen an increase of **{}** {} since the last poll (previous: {}, current: {}).".format(
  158. if_descr, if_alias, device, diff, k, prev_state[device][ins][k], v
  159. ),
  160. MessageType.WARNING,
  161. )
  162. elif not curr_state[device][ins]["suppressed"]:
  163. curr_state[device][ins]["suppressed"] = True
  164. spark.post_to_spark(
  165. C.WEBEX_TEAM,
  166. args.webex_room,
  167. "Suppressing alarms for interface **{}** ({}) on device _{}_".format(if_descr, if_alias, device),
  168. )
  169. if not found_error:
  170. if curr_state[device][ins]["count"] > 0:
  171. curr_state[device][ins]["count"] -= 1
  172. if curr_state[device][ins]["count"] < REARM and curr_state[device][ins]["suppressed"]:
  173. spark.post_to_spark(
  174. C.WEBEX_TEAM,
  175. args.webex_room,
  176. "Interface **{}** ({}) on device _{}_ is no longer seeing an increase of errors".format(
  177. if_descr, if_alias, device
  178. ),
  179. MessageType.GOOD,
  180. )
  181. curr_state[device][ins]["suppressed"] = False
  182. else:
  183. curr_state[device][ins]["count"] += 1
  184. with open(cache_file, "w") as fd:
  185. json.dump(curr_state, fd, indent=4)