Changeset 2195


Ignore:
Timestamp:
09/04/12 11:27:36 (12 years ago)
Author:
sanmai
Message:
  • Add monitoring plugin for AVATecH CLAM.
  • Completed the main refactoring of the plugins. There is now a generic plugin module, with a special extension for each plugin. This means less code, easier addition of new plugins, better maintainability, and wider coverage of error handling and logging. Only the AVATecH CLAM plugin and the Discojuice JSON and HTML plugins have been refactored thus far.

  • Change checks for Discojuice as requested.
Location:
monitoring/plugins/mpi
Files:
2 added
3 edited

Legend:

Unmodified
Added
Removed
  • monitoring/plugins/mpi/check_clarin_discojuice_html.py

    r2028 r2195  
    11#!/usr/bin/python
    22
    3 import sys, getopt, httplib, subprocess, os, re, pdb
     3import generic_tla_monitoring
     4import os, pdb
    45
    56
    6 DESCRIPTION     = "Discojuice HTML"
     7special_plugin_description      = "Discojuice HTML"
     8special_plugin_file_name        = os.path.basename(__file__)
    79
    8 nagios_codes    = {
    9                     'OK'        : 0,
    10                     'WARNING'   : 1,
    11                     'CRITICAL'  : 2,
    12                     'UNKNOWN'   : 3,
    13                     'DEPENDENT' : 4
    14                     }
     10def special_main_subroutine(host) :
     11#result = test_case(host);
     12    #result          = check_condition(host)
     13    #generic_tla_monitoring.nagios_return(result['code'], result['message'])
    1514
    16 def usage() :
    17     """ returns nagios status UNKNOWN with
    18         a one line usage description
    19         usage() calls nagios_return()
    20     """
    21     nagios_return('UNKNOWN',
    22                   "usage: %s -h host" % (sys.argv[0]))
    23 
    24 def nagios_return(code, response) :
    25     """ prints the response message
    26         and exits the script with one
    27         of the defined exit codes
    28         DOES NOT RETURN
    29     """
    30     print code + ": " + response
    31     sys.exit(nagios_codes[code])
    32 
    33 def check_response_data_well_formedness(data) :
    34 
    35     pattern                 = '.*<html.*>.+</html>.*'
    36     pattern_regex           = re.compile(pattern, re.MULTILINE | re.IGNORECASE | re.DOTALL)
    37     results                 = pattern_regex.search(data)
     15    UP_URLs          = ('/mw1/sds/discojuice', '/mw2/sds/discojuice',) # X- use frozenset
    3816   
    39     if results is not None :
    40         return True
    41     else :
    42         return False
    43 
     17    # Check status for all UP_URLs.
     18    results          = map(lambda UP_URL : generic_tla_monitoring.check_condition(host      = host,
     19                                                                                  UP_URL    = UP_URL,
     20                                                                                  validator = generic_tla_monitoring.check_HTML_wellformedness,
     21                                                                                  special_plugin_file_name = special_plugin_file_name),
     22                           UP_URLs)   
    4423   
    45 
    46 def check_condition(host) :
    47     UP_URL  = '/mw/sds/discojuice'
    48 
    49     conn    = httplib.HTTPConnection(host)
    50    
    51     # X- Use exception handling.
    52     conn.request("GET", UP_URL)
    53    
    54     r1      = conn.getresponse()
    55 
    56     data    = r1.read()
    57    
    58     conn.close()
    59    
    60     if r1.status == 200 :
    61         well_formed = check_response_data_well_formedness(data)
    62 
    63         if well_formed :
    64             return {
    65                     "code"      : "OK",
    66                     "message"   : 'Host %s, service %s is up and returns well-formed HTML data.' % (host, DESCRIPTION)
    67                    }
    68         else :
    69             return {
    70                     "code"      : "CRITICAL",
    71                     "message"   : 'Host %s, service %s is up but returns non-well-formed HTML data.' % (host, DESCRIPTION)
    72                     }
    73     else :
    74         return {
    75                 "code"      : "CRITICAL",
    76                 "message"   : 'Host %s, service %s has a problem.' % (host, DESCRIPTION)
    77                }
    78 
    79 def main() :
    80     """ example options processing
    81         here we're expecting 1 option "-h"
    82         with a parameter
    83     """
    84 
    85     if len(sys.argv) < 2 :
    86         usage()
    87 
    88     try:
    89         opts, args  = getopt.getopt(sys.argv[1:], "h:")
    90     except getopt.GetoptError, err :
    91         usage()
    92 
    93     for o, value in opts :
    94         if o == "-h" :
    95             host    = value
    96         else :
    97             usage()
    98    
    99     #result = test_case(host);
    100     result          = check_condition(host)
    101     nagios_return(result['code'], result['message'])
     24    generic_tla_monitoring.nagios_return_complex(results, reporter = special_plugin_description)
    10225
    10326if __name__ == "__main__" :
    104     main()
    105 
     27    generic_tla_monitoring.main(special_main_subroutine)
  • monitoring/plugins/mpi/check_clarin_discojuice_json.py

    r2081 r2195  
    11#!/usr/bin/python
    22
    3 import sys, getopt, httplib, subprocess, os, re, pdb, traceback, datetime
    4 import simplejson as json
     3import generic_tla_monitoring
     4import os, pdb
    55#import xml.etree.ElementTree
    66
    77
    8 DESCRIPTION     = "Discojuice JSON"
     8special_plugin_description      = "Discojuice JSON"
     9special_plugin_file_name        = os.path.basename(__file__)
    910
    10 nagios_codes    = {
    11                     'OK'        : 0,
    12                     'WARNING'   : 1,
    13                     'CRITICAL'  : 2,
    14                     'UNKNOWN'   : 3,
    15                     'DEPENDENT' : 4
    16                     }
    17 
    18 def usage() :
    19     """ returns nagios status UNKNOWN with
    20         a one line usage description
    21         usage() calls nagios_return()
    22     """
    23     nagios_return('UNKNOWN',
    24                   "usage: %s -h host" % (sys.argv[0]))
    25 
    26 def nagios_return_complex(results) :
    27     def deal_with_result(result) :
    28         print result['code'] + ": " + result['message']
    29         return result['code']
    30 
    31     # Scan all condition/status check results and create a list of appropriate exit codes.
    32     exit_code_keys          = map(lambda result     : deal_with_result(result), results)
    33     suggested_exit_codes    = list(map(lambda key   : nagios_codes[key], exit_code_keys))
    34 
    35     # Exit with the highest suggested exit code, because the higher the exit code the more problematic the status is and problems have priority over harmony.
    36     sys.exit(max(suggested_exit_codes))
    37 
    38 def check_response_data_well_formedness(data, descriptive_string) :
    39 
    40     timestamp = datetime.datetime.today().isoformat()
    41 
    42     try :
    43         json.loads(data)
    44     except :
    45         traceback_string = traceback.format_exc()
    46         #pdb.set_trace()
    47 
    48         err_log_file_path = os.path.normpath("/tmp/err_unparseable." + descriptive_string.replace('/' , '__') + "_" + timestamp + ".log")
    49 
    50         with open(name = err_log_file_path, mode = "wt") as debugging_output_file :
    51             debugging_output_file.write(traceback_string)
    52        
    53         return False
    54     else :
    55         return True
    56 
    57 def check_condition(host, UP_URL) :
    58 
    59     def handle_connection_failure(problem_description) :
    60         err_log_file_path = os.path.normpath("/tmp/err_connection_failure." + UP_URL.replace('/' , '__') + "_" + timestamp + ".log")
    61 
    62         with open(name = err_log_file_path, mode = "wt") as debugging_output_file :
    63            debugging_output_file.write(problem_description)
    64    
    65     timestamp                = datetime.datetime.today().isoformat()
    66    
    67     try :
    68         conn                 = httplib.HTTPConnection(host)   
    69 
    70         request              = conn.request("GET", UP_URL)
    71     except :
    72         traceback_string     = traceback.format_exc()
    73 
    74         handle_connection_failure(traceback_string + "\nThis problem originates from location 1 in 'check_clarin_discojuice_json.py'.")
    75 
    76         return {
    77                 "code"      : "CRITICAL",
    78                 "message"   : '[%s] HTTP connection to host %s failed.' % (DESCRIPTION, host)
    79                }
    80     else :
    81         try :
    82             response         = conn.getresponse()
    83 
    84             data             = response.read()
    85            
    86             conn.close()
    87         except :
    88             traceback_string = traceback.format_exc()
    89 
    90             handle_connection_failure(traceback_string + "\nThis problem originates from location 2 in 'check_clarin_discojuice_json.py'.")
    91         else :
    92             if response.status == 200 :
    93                 well_formed  = check_response_data_well_formedness(data, UP_URL)
    94 
    95                 if well_formed :
    96                     return {
    97                             "code"      : "OK",
    98                             "message"   : '[%s] Host %s is up and returns parseable JSON data at "%s".' % (DESCRIPTION, host, UP_URL)
    99                            }
    100                 else :
    101                     return {
    102                             "code"      : "CRITICAL",
    103                             "message"   : '[%s] Host %s is up but returns unparseable JSON data at "%s".' % (DESCRIPTION, host, UP_URL)
    104                             }
    105             else :
    106 
    107                 handle_connection_failure("Unreachable URL: HTTP response code" + str(response.status) + "\nThis problem originates from location 3 in 'check_clarin_discojuice_json.py'.")
    108                 return {
    109                         "code"      : "CRITICAL",
    110                         "message"   : '[%s] Host %s has a problem with the URL path component "%s".' % (DESCRIPTION, host, UP_URL)
    111                        }
    112 
    113 def main() :
    114     """ example options processing
    115         here we're expecting 1 option "-h"
    116         with a parameter
    117     """
    118 
    119     if len(sys.argv) < 2 :
    120         usage()
    121 
    122     try:
    123         opts, args  = getopt.getopt(sys.argv[1:], "h:")
    124     except getopt.GetoptError, err :
    125         usage()
    126 
    127     for o, value in opts :
    128         if o == "-h" :
    129             host    = value
    130         else :
    131             usage()
    132 
     11def special_main_subroutine(host) :
    13312    UP_URLs          = ("/discojuice/metadata_clarin1.json", "/discojuice/metadata_clarin2.json",)
    13413   
    13514    # Check status for all UP_URLs.
    136     results          = map(lambda UP_URL : check_condition(host = host, UP_URL = UP_URL),
     15    results          = map(lambda UP_URL : generic_tla_monitoring.check_condition(host                      = host,
     16                                                                                  UP_URL                    = UP_URL,
     17                                                                                  validator                 = generic_tla_monitoring.check_JSON_wellformedness,
     18                                                                                  special_plugin_file_name  = special_plugin_file_name),
    13719                           UP_URLs)
    13820   
    139     nagios_return_complex(results)
     21    generic_tla_monitoring.nagios_return_complex(results, reporter = special_plugin_description)
    14022
    14123if __name__ == "__main__" :
    142     main()
     24    generic_tla_monitoring.main(special_main_subroutine)
  • monitoring/plugins/mpi/plugins_hosts_services.tab

    r2028 r2195  
    1212check_discojuice_json.py        Discojuice JSON catalog.clarin.eu
    1313check_discojuice_html.py        Discojuice HTML catalog.clarin.eu
     14check_lat_avatech_clam.py       AVATecH CLAM    lux17.mpi.nl
Note: See TracChangeset for help on using the changeset viewer.