I have a Python script that runs as a Ganeti hook after an instance is added, removed, shut down or started. When a new instance is added to Ganeti, the hook should add it to check_mk with an API call. Removing the instance in Ganeti triggers its deletion in check_mk. Shutting down an instance sets a downtime in check_mk, and starting the instance removes that downtime in check_mk if it was set by the hook. We have Ganeti clusters in multiple locations (datacenters).
We have distributed monitoring with check-mk-raw, with one master and multiple slaves running in each datacenter. Thus adding, removing, etc. can only be done with an API call to the master.
The stdout and stderr of Ganeti hooks are redirected to files; this is hard-coded in Ganeti. Errors are written to stdout (console) only if the script fails; if it runs successfully, the output is redirected to files and there usually isn't much of it. So print()
doesn't help. Hence I am using the logging library.
The major problem is that the script breaks often, and sometimes without logging anything. I don't know whether the cause is my modest coding ability or network latency. I added exception handling everywhere today to find out what is going on, but that didn't help.
I would appreciate any help on this. Below is the complete script.
Thanks a lot.
EDIT: I removed the majority of the exceptions since they are not really relevant and fixed some typos in the script.
#!/usr/bin/env python
"""Manage host in monitoring."""
import os
import re
import sys
import json
import socket
import logging
import requests
# check_mk Web API endpoint; the host add/delete/get requests are POSTed here.
APIURL = 'https://checkmk.host/site/check_mk/webapi.py'
# check_mk view endpoint; used to query downtimes and to set/remove them.
WEBURL = 'https://checkmk.host/site/check_mk/view.py'
def hook_mon_token(path='/root/.hook_mon_token'):
    """Read the monitoring credentials from the secrets file.

    The file is written by the ganeti puppet module. Its first line is
    returned as the LDAP secret and its second line as the monitoring token.

    Args:
        path: Location of the secrets file. Defaults to the file written by
            the puppet module.

    Returns:
        A ``(ldap_secret, mon_token)`` tuple with the first two lines of the
        file exactly as read, i.e. including a trailing newline if present
        (callers strip them). A line that does not exist in the file is
        returned as an empty string.

    Raises:
        IOError / OSError: If the file cannot be opened.
    """
    with open(path, 'r') as secrets_file:
        ldap_secret = secrets_file.readline()
        mon_token = secrets_file.readline()
    return ldap_secret, mon_token
def get_datacenter(timeout=10):
    """Return the lower-cased name of the datacenter this node runs in.

    The list of datacenters is fetched from the catalog endpoint on
    localhost:8500 (presumably the local Consul agent -- confirm) and only
    its first entry is used. If that entry contains a '-', the second
    '-'-separated field is returned, otherwise the whole entry.

    Args:
        timeout: Seconds to wait for the local HTTP request. Without a
            timeout a stuck agent would block the hook forever.

    Returns:
        The datacenter name in lower case.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
        ValueError: If the answer is not valid JSON.
        IndexError: If the returned list is empty.
    """
    response = requests.get('http://localhost:8500/v1/catalog/datacenters',
                            timeout=timeout)
    first_entry = response.json()[0]
    if '-' in first_entry:
        return first_entry.split('-')[1].lower()
    return first_entry.lower()
def get_tenant(datacenter):
    """Return the tenant name of the instance, falling back to *datacenter*.

    The tenant is taken from ``tenant:<name>`` tags found in the
    ``GANETI_INSTANCE_NIC0_NETWORK_TAGS`` environment variable and is
    lower-cased. If the variable is unset or carries no tenant tag, the
    given datacenter is returned unchanged.

    Args:
        datacenter: Value to return when no tenant tag is present.

    Returns:
        The lower-cased tenant name, or *datacenter*.
    """
    # The variable may be absent from the hook environment; treat that like
    # "no tenant tag" instead of crashing with a KeyError.
    network_tags = os.environ.get('GANETI_INSTANCE_NIC0_NETWORK_TAGS', '')
    # NOTE(review): several tenant tags are concatenated into one name,
    # exactly as before -- confirm whether only the first one should count.
    tenant = ''.join(re.findall(r'tenant:([\w-]+)', network_tags)).lower()
    return tenant or datacenter
def checkmk_api_call(action, timeout=30):
    """Add or delete the current Ganeti instance through the check_mk Web API.

    The instance is described by the ``GANETI_INSTANCE_NAME`` and
    ``GANETI_INSTANCE_NIC0_IP`` environment variables.

    For ``'add_host'`` the host is created below ``<datacenter>/hosts``, or
    below ``<datacenter>/<tenant>/hosts`` for the datacenters dc1, dc2 and
    dc3. For ``'delete_host'`` the host is first looked up with
    ``get_host``; the deletion is only sent when the IP address stored in
    check_mk equals the instance's NIC0 IP.

    Args:
        action: ``'add_host'`` or ``'delete_host'``.
        timeout: Seconds to wait for the Web API before giving up.

    Returns:
        A ``(error, hostname)`` tuple. ``error`` is ``False`` on success.
        Otherwise it is the API's error text or the raised exception. For a
        delete it may also be whatever ``get_host`` returned: its error, or
        the differing IP address.

    Raises:
        ValueError: If *action* is not supported.
        KeyError: If a required Ganeti environment variable is missing.
    """
    if action not in ('add_host', 'delete_host'):
        # Previously this surfaced as a NameError on the unset request.
        raise ValueError('unsupported check_mk API action: %r' % (action,))

    ldap_secret, mon_token = hook_mon_token()
    auth = ('check_mk_user', ldap_secret.strip())
    server_ip = os.environ['GANETI_INSTANCE_NIC0_IP']
    params = {'action': action, '_username': 'automation', '_secret': mon_token.strip()}

    if action == 'add_host':
        datacenter = get_datacenter()
        instance_name = os.environ['GANETI_INSTANCE_NAME'].split('.', 1)[0]
        attributes = {
            'ipaddress': server_ip,
            'site': datacenter,
            'tag_' + datacenter: datacenter,
            'tag_' + datacenter + '-vm': datacenter + '-vm',
            'tag_agent': 'cmk-agent',
            'tag_snmp': 'no-snmp',
        }
        if datacenter in ('dc1', 'dc2', 'dc3'):
            # These datacenters keep one folder and one VM tag per tenant.
            tenant = get_tenant(datacenter)
            hostname = tenant.upper() + '.' + instance_name
            folder = datacenter + "/" + tenant + "/hosts"
            attributes['tag_' + tenant + '-vm'] = tenant + '-vm'
        else:
            hostname = datacenter.upper() + '.' + instance_name
            folder = datacenter + "/hosts"
        request = {
            'hostname': hostname,
            'folder': folder,
            'attributes': attributes,
            'create_folders': '0',
        }
    else:
        api_answer, request, hostname = get_host('delete_host')
        # Safety net: never delete a host whose monitored IP differs from
        # the instance's IP (this also covers lookup errors).
        if api_answer != server_ip:
            return api_answer, hostname

    try:
        resp_post = requests.post(APIURL, params=params, auth=auth,
                                  data={'request': json.dumps(request)},
                                  timeout=timeout)
        answer = resp_post.json()
        if answer['result_code']:
            apierror = answer['result'] or (
                'check_mk API returned result_code %s' % answer['result_code'])
            return apierror, hostname
        return False, hostname
    except (requests.exceptions.RequestException,
            ValueError, KeyError, TypeError) as error:
        # ValueError: answer was not JSON (e.g. a proxy error page);
        # KeyError/TypeError: JSON without the expected fields.
        return error, hostname
def get_host(action, timeout=30):
    """Look up the current Ganeti instance in check_mk via the Web API.

    The check_mk host name is ``<TENANT>.<instance>`` for the datacenters
    dc1, dc2 and dc3 and ``<DATACENTER>.<instance>`` otherwise, where
    ``<instance>`` is the short name from ``GANETI_INSTANCE_NAME``.

    Args:
        action: The operation the lookup is done for. ``'delete_host'``
            selects the detailed return value; any other value selects the
            plain existence check.
        timeout: Seconds to wait for the Web API before giving up.

    Returns:
        For ``'delete_host'``: ``(answer, request, hostname)`` where
        ``answer`` is the host's ``ipaddress`` attribute on success, or the
        API's error text / the raised exception on failure.
        For any other action: ``False`` if the host exists, otherwise the
        API's error text or the raised exception.

    Raises:
        KeyError: If ``GANETI_INSTANCE_NAME`` is missing.
    """
    ldap_secret, mon_token = hook_mon_token()
    auth = ('check_mk_user', ldap_secret.strip())
    datacenter = get_datacenter()
    instance_name = os.environ['GANETI_INSTANCE_NAME'].split('.', 1)[0]
    params = {'action': 'get_host', '_username': 'automation', '_secret': mon_token.strip()}

    if datacenter in ('dc1', 'dc2', 'dc3'):
        prefix = get_tenant(datacenter)
    else:
        prefix = datacenter
    hostname = prefix.upper() + '.' + instance_name
    request = {'hostname': hostname}
    want_ip = action == "delete_host"

    try:
        resp_post = requests.post(APIURL, params=params, auth=auth,
                                  data={'request': json.dumps(request)},
                                  timeout=timeout)
        answer = resp_post.json()
        if answer['result_code']:
            outcome = answer['result'] or (
                'check_mk API returned result_code %s' % answer['result_code'])
        elif want_ip:
            try:
                outcome = answer['result']['attributes']['ipaddress']
            except (KeyError, TypeError):
                # Report a readable reason instead of crashing the hook.
                outcome = 'No ipaddress attribute found for "%s" in check_mk' % hostname
        else:
            outcome = False
    except (requests.exceptions.RequestException,
            ValueError, KeyError, TypeError) as error:
        # ValueError: answer was not JSON; KeyError/TypeError: JSON without
        # the expected fields.
        outcome = error

    if want_ip:
        return outcome, request, hostname
    return outcome
def is_down(timeout=30):
    """Report whether the current instance has a downtime in check_mk.

    The host is first looked up with ``get_host``; then the ``downtimes``
    view is queried for the host name. The view answer is a JSON list whose
    first row holds the column names; a list with only that row means no
    downtime is set.

    Args:
        timeout: Seconds to wait for the view request before giving up.

    Returns:
        A ``(host_is_down, info, hostname)`` tuple:

        * ``(False, '', hostname)`` -- no downtime is set.
        * ``(True, comment, hostname)`` -- a downtime exists; ``comment`` is
          the ``downtime_comment`` of the first listed downtime.
        * ``(None, error, hostname)`` -- the lookup failed; ``error`` is the
          API error text or the raised exception.

    Raises:
        KeyError: If ``GANETI_INSTANCE_NAME`` is missing.
    """
    ldap_secret, mon_token = hook_mon_token()
    auth = ('check_mk_user', ldap_secret.strip())
    datacenter = get_datacenter()
    instance_name = os.environ['GANETI_INSTANCE_NAME'].split('.', 1)[0]
    if datacenter in ('dc1', 'dc2', 'dc3'):
        prefix = get_tenant(datacenter)
    else:
        prefix = datacenter
    hostname = prefix.upper() + '.' + instance_name
    params = {
        '_username': 'automation',
        '_secret': mon_token.strip(),
        'output_format': 'JSON',
        'host_regex': hostname,
        'view_name': 'downtimes'
    }

    apierror = get_host('get_host')
    if apierror:
        return None, apierror, hostname

    try:
        resp_get = requests.get(WEBURL, params=params, auth=auth, timeout=timeout)
        rows = json.loads(resp_get.text)
        if len(rows) == 1:
            # Only the header row came back: no downtime for this host.
            return False, '', hostname
        comment_column = rows[0].index('downtime_comment')
        return True, rows[1][comment_column], hostname
    except (requests.exceptions.RequestException,
            ValueError, IndexError, TypeError, AttributeError) as error:
        # ValueError: non-JSON answer or missing 'downtime_comment' column;
        # the others: JSON of an unexpected shape (e.g. an empty list).
        return None, error, hostname
def checkmk_web_call(action, timeout=30):
    """Set or remove a downtime for the current instance via the view page.

    The host is first looked up with ``get_host``; if that fails, its error
    is returned and nothing is sent.

    Args:
        action: ``'stop'`` sets a downtime with the comment
            ``'down by ganeti shutdown'`` on the host; ``'start'`` removes
            the downtimes matching the host name. Any other value sends
            only the common confirmation parameters.
        timeout: Seconds to wait for the request before giving up.

    Returns:
        ``False`` on success, otherwise the lookup error or the exception
        raised by the request (including HTTP error statuses).

    Raises:
        KeyError: If ``GANETI_INSTANCE_NAME`` is missing.
    """
    ldap_secret, mon_token = hook_mon_token()
    auth = ('check_mk_user', ldap_secret.strip())
    datacenter = get_datacenter()
    instance_name = os.environ['GANETI_INSTANCE_NAME'].split('.', 1)[0]

    apierror = get_host('get_host')
    if apierror:
        return apierror

    if datacenter in ('dc1', 'dc2', 'dc3'):
        prefix = get_tenant(datacenter)
    else:
        prefix = datacenter
    hostname = prefix.upper() + '.' + instance_name

    params = {
        '_do_confirm': 'yes',
        '_do_actions': 'yes',
        '_transid': '-1',
        '_username': 'automation',
        '_secret': mon_token.strip(),
        'output_format': 'JSON'
    }
    per_action_params = {
        'stop': {
            'view_name': 'host',
            'host': hostname,
            '_on_hosts': 'on',
            '_downrange__next_year': 'This+year',
            '_down_comment': 'down by ganeti shutdown'
        },
        'start': {
            'view_name': 'downtimes',
            'host_regex': hostname,
            '_remove_downtimes': 'Remove'
        },
    }
    params.update(per_action_params.get(action, {}))

    try:
        response = requests.post(WEBURL, params=params, auth=auth, timeout=timeout)
        # An HTTP error status used to be reported as success; surface it.
        response.raise_for_status()
    except requests.exceptions.RequestException as error:
        return error
    return False
def _abort_hook(logger, message, *args):
    """Log *message* as an error, echo the last log line and exit with 1."""
    logger.error(message, *args)
    # Print the line just written so it also appears on the hook's stdout.
    os.system('tail -1 /tmp/monitoring_hook.log')
    sys.exit(1)


def gnt_action(action):
    """Apply the check_mk change that belongs to a Ganeti hook.

    Args:
        action: The Ganeti hooks path. ``'instance-add'`` adds the host,
            ``'instance-remove'`` deletes it, ``'instance-stop'`` sets a
            downtime unless one exists, and ``'instance-start'`` removes the
            downtime if it carries the comment ``'down by ganeti
            shutdown'``. Any other value does nothing.

    Returns:
        None. The outcome is written to the module logger.

    Raises:
        SystemExit: With status 1 when a check_mk call fails.
    """
    logger = logging.getLogger(__name__)

    if action == 'instance-add':
        apierror, hostname = checkmk_api_call('add_host')
        if apierror:
            # The text must be the format string and the values its
            # arguments; the previous argument order made logging fail to
            # format the record, so nothing reached the log file.
            _abort_hook(logger,
                        'Could not add "%s" to check_mk! Please add it manually! Reason: %s',
                        hostname, apierror)
        logger.info('Added "%s" successfully to check_mk. Please activate changes in WATO',
                    hostname)

    elif action == 'instance-remove':
        apierror, hostname = checkmk_api_call('delete_host')
        if apierror:
            _abort_hook(logger,
                        'Could not remove "%s" from check_mk! Please remove it manually! Reason: %s',
                        hostname, apierror)
        logger.info('Removed "%s" successfully from check_mk. Please activate changes in WATO',
                    hostname)

    elif action == 'instance-start':
        host_is_down, down_comment, hostname = is_down()
        if host_is_down is None:
            # On failure the second element carries the error.
            _abort_hook(logger, 'Could not query down time of "%s" in check_mk: %s',
                        hostname, down_comment)
        elif host_is_down and down_comment == 'down by ganeti shutdown':
            # Only remove downtimes this hook set itself.
            apierror = checkmk_web_call('start')
            if apierror:
                _abort_hook(logger, 'Could not remove down time for "%s" in check_mk: %s',
                            hostname, apierror)
            logger.info('Removed down time successfully for "%s" in check_mk', hostname)
        else:
            logger.info('Nothing to do')

    elif action == 'instance-stop':
        host_is_down, down_comment, hostname = is_down()
        if host_is_down is None:
            _abort_hook(logger, 'Could not query down time of "%s" in check_mk: %s',
                        hostname, down_comment)
        elif host_is_down is False:
            apierror = checkmk_web_call('stop')
            if apierror:
                _abort_hook(logger, 'Could not set down time for "%s" in check_mk: %s',
                            hostname, apierror)
            logger.info('Set down time successfully for "%s" in check_mk', hostname)
        else:
            logger.info('Nothing to do. "%s" is already down', hostname)
def main():
    """Run the monitoring hook for the Ganeti operation that invoked it.

    Sets up file logging to ``/tmp/monitoring_hook.log`` and, only when this
    node's FQDN equals ``GANETI_MASTER``, dispatches the hook named by
    ``GANETI_HOOKS_PATH`` to ``gnt_action``. Instances whose
    ``GANETI_POST_INSTANCE_TAGS`` contain ``monitoring:no`` are skipped.

    Raises:
        KeyError: If ``GANETI_MASTER`` or ``GANETI_HOOKS_PATH`` is missing.
        SystemExit: With status 0 when the instance opted out of
            monitoring; with status 1 when ``gnt_action`` aborts.
    """
    logger = logging.getLogger(__name__)
    log_file_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(process)s - %(message)s')
    log_file_handler = logging.FileHandler('/tmp/monitoring_hook.log')
    log_file_handler.setFormatter(log_file_format)
    log_file_handler.setLevel(logging.DEBUG)
    logger.addHandler(log_file_handler)
    logger.setLevel(logging.INFO)

    if socket.getfqdn() != os.environ['GANETI_MASTER']:
        return

    action = os.environ['GANETI_HOOKS_PATH']
    # The tags variable may be unset; treat that like "no tags" instead of
    # crashing with a KeyError.
    if 'monitoring:no' in os.environ.get('GANETI_POST_INSTANCE_TAGS', ''):
        logger.info('VM will not be added to check_mk')
        sys.exit(0)

    try:
        gnt_action(action)
    except Exception:
        # Record the traceback in the log file so an unexpected failure is
        # never silent, then let it propagate as before. SystemExit is not
        # an Exception subclass and passes through untouched.
        logger.exception('Unexpected error while handling hook "%s"', action)
        raise
if __name__ == "__main__":
    # Fork so the parent returns to Ganeti immediately with status 0 while
    # the child carries out the monitoring calls.
    try:
        pid = os.fork()
    except OSError as e:  # "except OSError, e" is a syntax error on Python 3
        print('fork failed: %d (%s)' % (e.errno, e.strerror))
        sys.exit(1)
    if pid > 0:
        # Exit parent process
        sys.exit(0)
    main()
After more debugging I found out that the script fails only in some datacenters and always succeeds in others, so it was clear that it is a network issue.
The API requests are sent to the WAN IP of the monitoring server, so I just replaced it with the LAN IP in /etc/hosts
until I can find the root cause.
Sorry for the irrelevant post, since the script does what it should do.