Tags: python, api, logging, fork, check-mk

python script breaks often while making API request


I have a Python script that runs as a Ganeti hook after an instance is added, removed, shut down, or started. When a new instance is added to Ganeti, the hook should add this instance to check_mk with an API call. Removing the instance in Ganeti triggers the deletion of the instance in check_mk. Shutting down an instance sets a downtime in check_mk, and starting the instance removes the downtime in check_mk if it was set by the hook. We have Ganeti clusters in multiple locations (datacenters).

We have distributed monitoring with check-mk-raw, with one master and multiple slaves running in each datacenter. Thus adding, removing, etc. can only be done with an API call to the master.

Ganeti redirects the hooks' stdout and stderr to files; this is hard-coded in Ganeti. Errors are written to stdout (console) only if the script fails; if it runs successfully, the output is redirected to files, and usually there isn't much of it. So print() doesn't help, which is why I am using the logging library.

The major problem is that the script breaks often, and sometimes without logging anything. I don't know whether the cause is my modest coding ability or network latency. I added exception handling everywhere today to find out what is going on, but that didn't help.

I would appreciate any help on this. Below is the complete script.

Thanks a lot.

EDIT: I removed the majority of the exceptions since they are not really relevant and fixed some typos in the script.

#!/usr/bin/env python

"""Manage host in monitoring."""

import os
import re
import sys
import json
import socket
import logging
import requests


APIURL = 'https://checkmk.host/site/check_mk/webapi.py'
WEBURL = 'https://checkmk.host/site/check_mk/view.py'


def hook_mon_token():
    """Read the monitoring secrets from ``/root/.hook_mon_token``.

    This file is written by ganeti puppet module.

    :returns: ``(ldap_secret, mon_token)`` -- the first and the second line
        of the file, exactly as read (trailing newlines are NOT stripped;
        callers are expected to ``strip()`` them).
    """
    with open('/root/.hook_mon_token', 'r') as token_file:
        first_line = token_file.readline()
        second_line = token_file.readline()
    return first_line, second_line


def get_datacenter():
    """Return the lower-cased name of the datacenter we are running in.

    The name is the first entry of the JSON list served by the catalog
    endpoint on ``localhost:8500``.  If that entry contains a dash, only the
    part between the first and the second dash is used.
    """
    response = requests.get('http://localhost:8500/v1/catalog/datacenters')
    first_entry = response.json()[0]
    if '-' not in first_entry:
        return first_entry.lower()
    return first_entry.split('-')[1].lower()


def get_tenant(datacenter):
    """Return the lower-cased tenant name of the current instance.

    The tenant is taken from the ``tenant:<name>`` entries found in the
    ``GANETI_INSTANCE_NIC0_NETWORK_TAGS`` environment variable (all matches
    are concatenated).  When no such entry exists, ``datacenter`` is
    returned unchanged as the fallback.
    """
    network_tags = os.environ['GANETI_INSTANCE_NIC0_NETWORK_TAGS']
    matches = re.findall(r'tenant:([\w-]+)', network_tags)
    tenant = ''.join(matches).lower()
    return tenant if tenant != '' else datacenter


def checkmk_api_call(action):
    """Add or delete the current Ganeti instance through the check_mk Web API.

    The instance is described by the ``GANETI_INSTANCE_NAME`` and
    ``GANETI_INSTANCE_NIC0_IP`` environment variables.

    :param action: ``'add_host'`` or ``'delete_host'``.
    :returns: ``(error, hostname)``.  ``error`` is ``False`` on success and a
        truthy value otherwise: the ``result`` text of the API, the
        ``requests`` exception that was raised, or a short description of an
        unusable answer.
    """
    ldap_secret, mon_token = hook_mon_token()
    auth = ('check_mk_user', ldap_secret.strip())
    datacenter = get_datacenter()
    instance_name = os.environ['GANETI_INSTANCE_NAME'].split('.', 1)[0]
    server_ip = os.environ['GANETI_INSTANCE_NIC0_IP']
    params = {'action': action, '_username': 'automation', '_secret': mon_token.strip()}

    if action == 'add_host':
        # Attributes shared by every host, with or without a tenant.
        attributes = {
            'ipaddress': server_ip,
            'site': datacenter,
            'tag_' + datacenter: datacenter,
            'tag_' + datacenter + '-vm': datacenter + '-vm',
            'tag_agent': 'cmk-agent',
            'tag_snmp': 'no-snmp'
        }
        if datacenter in ('dc1', 'dc2', 'dc3'):
            # These datacenters keep their hosts in per-tenant folders.
            tenant = get_tenant(datacenter)
            hostname = tenant.upper() + '.' + instance_name
            folder = datacenter + "/" + tenant + "/hosts"
            attributes['tag_' + tenant + '-vm'] = tenant + '-vm'
        else:
            hostname = datacenter.upper() + '.' + instance_name
            folder = datacenter + "/hosts"
        request = {
            'hostname': hostname,
            'folder': folder,
            'attributes': attributes,
            'create_folders': '0'
        }

    elif action == 'delete_host':
        api_answer, request, hostname = get_host('delete_host')
        # Only delete the host when check_mk knows it under the IP of this
        # instance; anything else (API error or a different IP) is a failure.
        if api_answer != server_ip:
            # Never hand back a falsy value here: callers would read it as
            # "deleted successfully".
            return api_answer or 'Host IP in check_mk does not match the instance IP', hostname

    else:
        # Without this guard ``request`` and ``hostname`` would be unbound.
        return 'Unsupported check_mk API action: %r' % (action,), instance_name

    try:
        # The timeout keeps the hook from hanging forever on a dead link.
        resp_post = requests.post(APIURL, params=params, auth=auth,
                                  data={'request': json.dumps(request)}, timeout=30)
        answer = json.loads(resp_post.content)
        if answer['result_code']:
            return answer['result'], hostname
        return False, hostname
    except requests.exceptions.RequestException as error:
        return error, hostname
    except (ValueError, KeyError, TypeError) as error:
        # The server answered, but not with the expected JSON document
        # (e.g. an HTML error page from a proxy).
        return 'Unexpected answer from check_mk API: %s: %s' % (type(error).__name__, error), hostname


def get_host(action):
    """Look up the current Ganeti instance in check_mk (``get_host`` API call).

    The return shape depends on ``action``:

    * ``"delete_host"``: ``(answer, request, hostname)`` where ``answer`` is
      the ``ipaddress`` attribute of the host on success, otherwise the error
      (API ``result`` text, ``requests`` exception or a description of an
      unusable answer).  ``request`` is the request dict that was sent.
    * anything else: ``False`` when the host exists, otherwise the error.

    :param action: selects the return shape, see above.
    """
    ldap_secret, mon_token = hook_mon_token()
    auth = ('check_mk_user', ldap_secret.strip())
    datacenter = get_datacenter()
    instance_name = os.environ['GANETI_INSTANCE_NAME'].split('.', 1)[0]
    params = {'action': 'get_host', '_username': 'automation', '_secret': mon_token.strip()}

    if datacenter in ('dc1', 'dc2', 'dc3'):
        # These datacenters prefix host names with the tenant.
        prefix = get_tenant(datacenter)
    else:
        prefix = datacenter
    hostname = prefix.upper() + '.' + instance_name
    request = {'hostname': hostname}

    try:
        # The timeout keeps the hook from hanging forever on a dead link.
        resp_post = requests.post(APIURL, params=params, auth=auth,
                                  data={'request': json.dumps(request)}, timeout=30)
        answer = json.loads(resp_post.content)
        if answer['result_code']:
            outcome = answer['result']
        elif action == "delete_host":
            outcome = answer['result']['attributes']['ipaddress']
        else:
            outcome = False
    except requests.exceptions.RequestException as error:
        outcome = error
    except (ValueError, KeyError, TypeError) as error:
        # Not JSON, or JSON without the expected keys (e.g. a host that has
        # no explicit 'ipaddress' attribute).
        outcome = 'Unexpected answer from check_mk API: %s: %s' % (type(error).__name__, error)

    if action == "delete_host":
        return outcome, request, hostname
    return outcome


def is_down():
    """Check whether the current instance has a downtime in check_mk.

    :returns: ``(host_is_down, down_comment, hostname)``.

        * ``host_is_down`` is ``True``/``False``, or ``None`` when the state
          could not be determined.
        * ``down_comment`` is the comment of the first listed downtime,
          ``''`` when there is none, or the error when ``host_is_down`` is
          ``None``.
    """
    ldap_secret, mon_token = hook_mon_token()
    auth = ('check_mk_user', ldap_secret.strip())
    datacenter = get_datacenter()
    instance_name = os.environ['GANETI_INSTANCE_NAME'].split('.', 1)[0]

    if datacenter in ('dc1', 'dc2', 'dc3'):
        tenant = get_tenant(datacenter)
        hostname = tenant.upper() + '.' + instance_name
    else:
        hostname = datacenter.upper() + '.' + instance_name

    # NOTE(review): hostname is sent unescaped as 'host_regex'; presumably it
    # is matched as a regular expression and may match other hosts as well
    # -- confirm against the check_mk view filter semantics.
    params = {
        '_username': 'automation',
        '_secret': mon_token.strip(),
        'output_format': 'JSON',
        'host_regex': hostname,
        'view_name': 'downtimes'
    }

    # Make sure the host exists at all before asking for its downtimes.
    apierror = get_host('get_host')
    if apierror:
        return None, apierror, hostname

    try:
        # The timeout keeps the hook from hanging forever on a dead link.
        resp_get = requests.get(WEBURL, params=params, auth=auth, timeout=30).text
        resp_json = json.loads(resp_get)
        if len(resp_json) == 1:
            # Only the row with the column names came back: no downtime.
            return False, '', hostname
        comment_column = resp_json[0].index('downtime_comment')
        return True, resp_json[1][comment_column], hostname
    except requests.exceptions.RequestException as error:
        return None, error, hostname
    except (ValueError, IndexError, KeyError, TypeError) as error:
        # Not JSON, an empty answer, or no 'downtime_comment' column.
        return None, 'Unexpected answer from check_mk view: %s: %s' % (type(error).__name__, error), hostname


def checkmk_web_call(action):
    """Set or remove a downtime for the current instance via the web view.

    :param action: ``'stop'`` sets a downtime with the comment
        ``'down by ganeti shutdown'``; ``'start'`` removes the downtimes
        listed for the host.
    :returns: ``False`` on success, otherwise a truthy error (the error from
        the host lookup, the ``requests`` exception, or a description of an
        unsupported action).
    """
    ldap_secret, mon_token = hook_mon_token()
    auth = ('check_mk_user', ldap_secret.strip())
    datacenter = get_datacenter()
    instance_name = os.environ['GANETI_INSTANCE_NAME'].split('.', 1)[0]

    # Refuse to act on hosts check_mk does not know.
    apierror = get_host('get_host')
    if apierror:
        return apierror

    if datacenter in ('dc1', 'dc2', 'dc3'):
        tenant = get_tenant(datacenter)
        hostname = tenant.upper() + '.' + instance_name
    else:
        hostname = datacenter.upper() + '.' + instance_name

    params = {
        '_do_confirm': 'yes',
        '_do_actions': 'yes',
        '_transid': '-1',
        '_username': 'automation',
        '_secret': mon_token.strip(),
        'output_format': 'JSON'
    }

    if action == 'stop':
        params.update({
            'view_name': 'host',
            'host': hostname,
            '_on_hosts': 'on',
            '_downrange__next_year': 'This+year',
            '_down_comment': 'down by ganeti shutdown'
        })
    elif action == 'start':
        # NOTE(review): hostname is sent unescaped as 'host_regex';
        # presumably it is matched as a regular expression, so downtimes of
        # other matching hosts might be removed too -- confirm.
        params.update({
            'view_name': 'downtimes',
            'host_regex': hostname,
            '_remove_downtimes': 'Remove'
        })
    else:
        # Without a view there is nothing meaningful to post.
        return 'Unsupported check_mk web action: %r' % (action,)

    try:
        # The timeout keeps the hook from hanging forever on a dead link.
        resp = requests.post(WEBURL, params=params, auth=auth, timeout=30)
        # Turn HTTP error statuses into an error instead of reporting success.
        resp.raise_for_status()
        return False
    except requests.exceptions.RequestException as error:
        return error


def gnt_action(action):
    """Run the check_mk operation that belongs to a Ganeti hook path.

    :param action: ``'instance-add'``, ``'instance-remove'``,
        ``'instance-start'`` or ``'instance-stop'``; any other value is
        ignored.

    On failure the error is logged, the last line of the log file is echoed
    to stdout and the process exits with status 1.
    """
    logger = logging.getLogger(__name__)

    def fail(message, *args):
        # ``message`` is a %-format string and ``args`` its values.  Passing
        # the error object as the format string and the text as its argument
        # makes the logging module fail internally, so nothing gets logged.
        logger.error(message, *args)
        os.system('tail -1 /tmp/monitoring_hook.log')
        sys.exit(1)

    if action == 'instance-add':
        apierror, hostname = checkmk_api_call('add_host')
        if apierror:
            fail('%s: Could not add "%s" to check_mk! Please add it manually!', apierror, hostname)
        logger.info('Added "%s" successfully to check_mk. Please activete changes in WATO', hostname)
    elif action == 'instance-remove':
        apierror, hostname = checkmk_api_call('delete_host')
        if apierror:
            fail('%s: Could not remove "%s" from check_mk! Please remove it manually!', apierror, hostname)
        logger.info('Removed "%s" successfully from check_mk. Please activate changes in WATO', hostname)
    elif action == 'instance-start':
        host_is_down, down_comment, hostname = is_down()
        if host_is_down is None:
            # The state is unknown; down_comment carries the error.
            fail('%s', down_comment)
        elif host_is_down and down_comment == 'down by ganeti shutdown':
            # Only remove downtimes that this hook has set itself.
            apierror = checkmk_web_call('start')
            if apierror:
                fail('%s', apierror)
            logger.info('Removed down time successfully for "%s" in check_mk', hostname)
        else:
            logger.info('Nothing to do')
    elif action == 'instance-stop':
        host_is_down, down_comment, hostname = is_down()
        if host_is_down is None:
            # The state is unknown; down_comment carries the error.
            fail('%s', down_comment)
        elif host_is_down is False:
            apierror = checkmk_web_call('stop')
            if apierror:
                fail('%s', apierror)
            logger.info('Set down time successfully for "%s" in check_mk', hostname)
        else:
            logger.info('Nothing to do. "%s" is already down', hostname)


def main():
    """Add Hook for Ganeti to add new instance to monitoring.

    Sets up logging to ``/tmp/monitoring_hook.log`` and, when this node is
    the Ganeti master (``GANETI_MASTER`` equals the local FQDN), runs the
    action named by ``GANETI_HOOKS_PATH``.  Instances whose
    ``GANETI_POST_INSTANCE_TAGS`` contain ``monitoring:no`` are skipped.

    Any unexpected exception is written to the log file with its traceback
    before the process exits with status 1.
    """
    logger = logging.getLogger(__name__)
    log_file_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(process)s - %(message)s')
    log_file_handler = logging.FileHandler('/tmp/monitoring_hook.log')
    log_file_handler.setFormatter(log_file_format)
    log_file_handler.setLevel(logging.DEBUG)
    logger.addHandler(log_file_handler)
    logger.setLevel(logging.INFO)

    try:
        if socket.getfqdn() != os.environ['GANETI_MASTER']:
            return
        action = os.environ['GANETI_HOOKS_PATH']
        # An unset variable is treated like an empty tag list.
        instance_tags = os.environ.get('GANETI_POST_INSTANCE_TAGS', '')
        if 'monitoring:no' in instance_tags:
            logger.info('VM will not be added to check_mk')
            sys.exit(0)
        gnt_action(action)
    except Exception:
        # sys.exit() raises SystemExit, which is not an Exception subclass,
        # so deliberate exits pass through untouched.
        logger.exception('Monitoring hook failed unexpectedly')
        sys.exit(1)


if __name__ == "__main__":
    # Fork so that the parent returns immediately (exit status 0) and the
    # actual work is done by the child process.
    try:
        pid = os.fork()
        if pid > 0:
            # Exit parent process
            sys.exit(0)
    except OSError as e:
        # 'except OSError as e' is valid on Python 2.6+ and Python 3; the
        # comma form is Python 2 only.
        print('fork failed: %d (%s)' % (e.errno, e.strerror))
        sys.exit(1)

    main()

Solution

  • After more debugging I found out that the script fails only in some datacenters and always succeeds in others, so it was clear that it is a network issue.

    The API requests are sent to the WAN IP of the monitoring server, so I just replaced it with the LAN IP in /etc/hosts until I can find the root cause.

    Sorry for the irrelevant post, since the script does what it should do.