@awsekfozc
2016-01-09T01:20:14.000000Z
字数 7544
阅读 2428
OpenTSDB
参考:http://www.ttlsa.com/opentsdb/opentsdb-nagios-monitoring-and-alarming-realization/
在OpenTSDB源码tools目录下有一个Python工具check_tsd。该脚本查询OpenTSDB并返回兼容Nagios的输出OK/WARNING/CRITICAL状态格式。
#!/usr/bin/python## 该脚本根据给定的metric查询TSDB,并与指定的阀值进行比较。# 兼容Nagios的输出格式,因此可以作为nagios的命令。## check_tsd -m mysql.slave.seconds_behind_master -t host=foo -t schema=mydb# -d 600 -a avg -x gt -w 50 -c 100#import httplibimport operatorimport socketimport sysimport timefrom optparse import OptionParserdef main(argv):"""从TSDB提取数据,并做简单的Nagios报警。"""parser = OptionParser(description='Simple TSDB data extractor for Nagios.')parser.add_option('-H', '--host', default='localhost', metavar='HOST',help='Hostname to use to connect to the TSD.')parser.add_option('-p', '--port', type='int', default=80, metavar='PORT',help='Port to connect to the TSD instance on.')parser.add_option('-m', '--metric', metavar='METRIC',help='Metric to query.')parser.add_option('-t', '--tag', action='append', default=[],metavar='TAG', help='Tags to filter the metric on.')parser.add_option('-d', '--duration', type='int', default=600,metavar='SECONDS', help='How far back to look for data.')parser.add_option('-D', '--downsample', default='none', metavar='METHOD',help='Downsample function, e.g. one of avg, min, sum, or max.')parser.add_option('-W', '--downsample-window', type='int', default=60,metavar='SECONDS', help='Window size over which to downsample.')parser.add_option('-a', '--aggregator', default='sum', metavar='METHOD',help='Aggregation method: avg, min, sum (default), max.')parser.add_option('-x', '--method', dest='comparator', default='gt',metavar='METHOD', help='Comparison method: gt, ge, lt, le, eq, ne.')parser.add_option('-r', '--rate', default=False,action='store_true', help='Use rate value as comparison operand.')parser.add_option('-w', '--warning', type='float', metavar='THRESHOLD',help='Threshold for warning. Uses the comparison method.')parser.add_option('-c', '--critical', type='float', metavar='THRESHOLD',help='Threshold for critical. Uses the comparison method.')parser.add_option('-v', '--verbose', default=False,action='store_true', help='Be more verbose.')parser.add_option('-T', '--timeout', type='int', default=10,metavar='SECONDS',help='How long to wait for the response from TSD.')parser.add_option('-E', '--no-result-ok', default=False,action='store_true',help='Return OK when TSD query returns no result.')parser.add_option('-I', '--ignore-recent', default=0, type='int',metavar='SECONDS', help='Ignore data points that are that'' are that recent.')parser.add_option('-S', '--ssl', default=False, action='store_true',help='Make queries to OpenTSDB via SSL (https)')(options, args) = parser.parse_args(args=argv[1:])# 验证参数if options.comparator not in ('gt', 'ge', 'lt', 'le', 'eq', 'ne'):parser.error("Comparator '%s' not valid." % options.comparator)elif options.downsample not in ('none', 'avg', 'min', 'sum', 'max'):parser.error("Downsample '%s' not valid." % options.downsample)elif options.aggregator not in ('avg', 'min', 'sum', 'max'):parser.error("Aggregator '%s' not valid." % options.aggregator)elif not options.metric:parser.error('You must specify a metric (option -m).')elif options.duration <= 0:parser.error('Duration must be strictly positive.')elif options.downsample_window <= 0:parser.error('Downsample window must be strictly positive.')elif options.critical is None and options.warning is None:parser.error('You must specify at least a warning threshold (-w) or a'' critical threshold (-c).')elif options.ignore_recent < 0:parser.error('--ignore-recent must be positive.')if not options.critical:options.critical = options.warningelif not options.warning:options.warning = options.critical# 处理标签tags = ','.join(options.tag)if tags:tags = '{' + tags + '}'# 组装URL并获取if options.downsample == 'none':downsampling = ''else:downsampling = '%ds-%s:' % (options.downsample_window,options.downsample)if options.rate:rate = 'rate:'else:rate = ''url = ('/q?start=%ss-ago&m=%s:%s%s%s%s&ascii&nagios'% (options.duration, options.aggregator, downsampling, rate,options.metric, tags))tsd = '%s:%d' % (options.host, options.port)if options.ssl: # Pick the class to instantiate first.conn = httplib.HTTPSConnectionelse:conn = httplib.HTTPConnectionif sys.version_info[0] * 10 + sys.version_info[1] >= 26: # Python >2.6conn = conn(tsd, timeout=options.timeout)else: # Python 2.5 or less, using the timeout kwarg will make it croak :(conn = conn(tsd)try:conn.connect()except socket.error, e:print "ERROR: couldn't connect to %s: %s" % (tsd, e)return 2if options.verbose:peer = conn.sock.getpeername()print 'Connected to %s:%d' % (peer[0], peer[1])conn.set_debuglevel(1)now = int(time.time())try:conn.request('GET', url)res = conn.getresponse()datapoints = res.read()conn.close()except socket.error, e:print "ERROR: couldn't GET %s from %s: %s" % (url, tsd, e)return 2# URL请求失败时if res.status not in (200, 202):print ('CRITICAL: status = %d when talking to %s:%d'% (res.status, options.host, options.port))if options.verbose:print 'TSD said:'print datapointsreturn 2# URL请求成功时if options.verbose:print datapointsdatapoints = datapoints.splitlines()def no_data_point():"""从TSDB没有获取到任何数据时"""if options.no_result_ok:print 'OK: query did not return any data point (--no-result-ok)'return 0else:print 'CRITICAL: query did not return any data point'return 2if not len(datapoints):return no_data_point()comparator = operator.__dict__[options.comparator]rv = 0 # 该脚本返回值,0-OK,1-WARNING,2-CRITICALbadts = None # 超过阀值的时间戳badval = None # 超过阀值的值npoints = 0 # 查询到多少数据点?nbad = 0 # 有多少数据点超过阀值?for datapoint in datapoints:datapoint = datapoint.split()ts = int(datapoint[1])delta = now - tsif delta > options.duration or delta <= options.ignore_recent:continue # 忽略不在options.duration或options.ignore_recent的数据点npoints += 1val = datapoint[2]if '.' in val:val = float(val)else:val = int(val)bad = False # Is the current value bad?# 比较 warning/critif comparator(val, options.critical):rv = 2bad = Truenbad += 1elif rv < 2 and comparator(val, options.warning):rv = 1bad = Truenbad += 1if (bad and(badval is None # First bad value we find.or comparator(val, badval))): # Worse value.badval = valbadts = tsif options.verbose and len(datapoints) != npoints:print ('ignored %d/%d data points for being more than %ds old'% (len(datapoints) - npoints, len(datapoints), options.duration)) #忽略量/总查询量 数据点大于duration时间if not npoints:return no_data_point()if badts:if options.verbose:print 'worse data point value=%s at ts=%s' % (badval, badts) # 糟糕数据点的值以及该点值的时间戳badts = time.asctime(time.localtime(badts))# 在NRPE里,字符串'|'有特殊含义,但在tag搜索时有用到。对其替换。ttags = tags.replace("|",":")if not rv:print ('OK: %s%s: %d values OK, last=%r'% (options.metric, ttags, npoints, val)) # OK: metric{tags}: 数据点数量 values OK, 最后一次的值else:if rv == 1:level = 'WARNING'threshold = options.warningelif rv == 2:level = 'CRITICAL'threshold = options.criticalprint ('%s: %s%s %s %s: %d/%d bad values (%.1f%%) worst: %r @ %s'% (level, options.metric, ttags, options.comparator, threshold,nbad, npoints, nbad * 100.0 / npoints, badval, badts))return rvif __name__ == '__main__':sys.exit(main(sys.argv))
Options:-h, --help 打印帮助信息-H HOST, --host=HOST 指定TSDB服务器-p PORT, --port=PORT 指定TSDB端口-m METRIC, --metric=METRIC查询的metric-t TAG, --tag=TAG metric过滤的tags-d SECONDS, --duration=SECONDS查询多久的数据,默认600s-D METHOD, --downsample=METHOD缩减像素采样函数有avg, min, sum, or max.-W SECONDS, --downsample-window=SECONDS缩减像素采样窗口大小-a METHOD, --aggregator=METHOD聚合方法: avg, min, sum (default), max.-x METHOD, --method=METHOD比较法: gt, ge, lt, le, eq, ne.-r, --rate 使用速率值作为比较的操作数-w THRESHOLD, --warning=THRESHOLDwarning阀值.-c THRESHOLD, --critical=THRESHOLDcritical阀值.-v, --verbose 显示详细信息-T SECONDS, --timeout=SECONDS等待TSDB响应时间-E, --no-result-ok 当TSD查询没有结果返回时,返回OK状态-I SECONDS, --ignore-recent=SECONDS忽略最近的数据点-S, --ssl 通过SSL查询OpenTSDB (https)
将check_tsd脚本放在Nagios服务器/usr/local/nagios/libexec目录里。
编辑/usr/local/nagios/etc/objects/commands.cnf文件,添加下面的命令:
define command{command_name check_tsdcommand_line $USER1$/check_tsd -H 10.0.101.145 -p 4242 -t host=$HOSTADDRESS$ $ARG1$}###添加监控项define service {use MongoDBhost_name 10.0.0.166service_description query opcounters per secondcheck_command check_tsd!-m rate:mongo.opcounters -t type=query -d 60 -w 300 -c 500}
在此输入正文