#!/usr/bin/env python2
#
# Copyright (c) 2017-2019 Joe Clarke <jclarke@cisco.com>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
import json
import os
import sys

import netsnmp

from sparker import Sparker
import CLEUCreds
  31. CACHE_FILE = '/home/jclarke/errors_cache.dat'
  32. THRESHOLD = 1
  33. WINDOW = 12
  34. REARM = 6
  35. SPARK_TEAM = 'CL19 NOC Team'
  36. SPARK_ROOM = 'Data Center Alarms'
  37. devices = ['dc1-fcsw-1', 'dc1-fcsw-2', 'dc2-fcsw-1', 'dc2-fcsw-2',
  38. 'dc1-ethsw-1', 'dc1-ethsw-2', 'dc2-ethsw-1', 'dc2-ethsw-2']
  39. ignore_interfaces = {}
  40. prev_state = {}
  41. curr_state = {}
  42. if __name__ == '__main__':
  43. spark = Sparker(token=CLEUCreds.SPARK_TOKEN)
  44. if os.path.exists(CACHE_FILE):
  45. fd = open(CACHE_FILE, 'r')
  46. prev_state = json.load(fd)
  47. fd.close()
  48. for device in devices:
  49. swent = {}
  50. vars = netsnmp.VarList(netsnmp.Varbind('ifDescr'), netsnmp.Varbind('ifInErrors'), netsnmp.Varbind(
  51. 'ifOutErrors'), netsnmp.Varbind('ifInDiscards'), netsnmp.Varbind('ifOutDiscards'), netsnmp.Varbind('ifAlias'))
  52. netsnmp.snmpwalk(vars,
  53. Version=3,
  54. DestHost=device,
  55. SecLevel='authPriv',
  56. SecName='CLEUR',
  57. AuthProto='SHA',
  58. AuthPass=CLEUCreds.SNMP_AUTH_PASS,
  59. PrivProto='DES',
  60. PrivPass=CLEUCreds.SNMP_PRIV_PASS)
  61. for var in vars:
  62. if var.iid not in swent:
  63. swent[var.iid] = {}
  64. swent[var.iid]['count'] = 0
  65. swent[var.iid]['suppressed'] = False
  66. swent[var.iid][var.tag] = var.val
  67. curr_state[device] = swent
  68. if not device in prev_state:
  69. continue
  70. for ins, vard in curr_state[device].items():
  71. if not ins in prev_state[device]:
  72. continue
  73. if not 'ifDescr' in vard:
  74. continue
  75. if not 'ifAlias' in vard:
  76. vard['ifAlias'] = ''
  77. if 'count' in prev_state[device][ins]:
  78. curr_state[device][ins]['count'] = prev_state[device][ins]['count']
  79. if 'suppressed' in prev_state[device][ins]:
  80. curr_state[device][ins]['suppressed'] = prev_state[
  81. device][ins]['suppressed']
  82. if_descr = vard['ifDescr']
  83. if_alias = vard['ifAlias']
  84. if device in ignore_interfaces and if_descr in ignore_interfaces[device]:
  85. continue
  86. found_error = False
  87. for k, v in vard.items():
  88. if k == 'ifDescr' or k == 'ifAlias' or k == 'count' or k == 'suppressed':
  89. continue
  90. if k in prev_state[device][ins]:
  91. diff = int(v) - int(prev_state[device][ins][k])
  92. if diff >= THRESHOLD:
  93. found_error = True
  94. if curr_state[device][ins]['count'] < WINDOW and not curr_state[device][ins]['suppressed']:
  95. spark.post_to_spark(
  96. SPARK_TEAM, SPARK_ROOM, '**WARNING**: Interface **{}** ({}) on device _{}_ has seen an increase of **{}** {} since the last poll (previous: {}, current: {}).'.format(if_descr, if_alias, device, diff, k, prev_state[device][ins][k], v))
  97. elif not curr_state[device][ins]['suppressed']:
  98. curr_state[device][ins]['suppressed'] = True
  99. spark.post_to_spark(
  100. SPARK_TEAM, SPARK_ROOM, 'Suppressing alarms for interface **{}** ({}) on device _{}_'.format(if_descr, if_alias, device))
  101. if not found_error:
  102. if curr_state[device][ins]['count'] > 0:
  103. curr_state[device][ins]['count'] -= 1
  104. if curr_state[device][ins]['count'] < REARM and curr_state[device][ins]['suppressed']:
  105. spark.post_to_spark(
  106. SPARK_TEAM, SPARK_ROOM, 'Interface **{}** ({}) on device _{}_ is no longer seeing an increase of errors'.format(if_descr, if_alias, device))
  107. curr_state[device][ins]['suppressed'] = False
  108. else:
  109. curr_state[device][ins]['count'] += 1
  110. fd = open(CACHE_FILE, 'w')
  111. json.dump(curr_state, fd, indent=4)
  112. fd.close()