因为prometheus支持RESTful API接口查询数据,所以核心思路是python使用requests库,请求prometheus的API接口,得到响应对象后解析处理即可。
一、获取targets数据
1. prometheus查看targets数据
如图所示,现在的prometheus共有三个targets,一个为prometheus,还有两个为Windows
2. 使用apifox模拟请求,获取数据
调用
http://<prometheus.address>/api/v1/targets
并解析
3. python使用requests库请求处理数据
废话不多说,直接上代码
class Monitor:
    """Fetch monitoring data from the Prometheus HTTP API."""

    # Seconds to wait for Prometheus before treating a request as failed.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Prometheus base address
        self.usr = PROMETHEUS_URL
        # Addresses of targets whose health is 'up'
        self.up_list = []
        # Addresses of targets whose health is anything other than 'up'
        self.down_list = []

    def target(self):
        """Refresh the up/down target lists from ``/api/v1/targets``.

        Both lists are rebuilt from scratch on every successful call, so
        calling this method repeatedly does not accumulate duplicates.

        :return: the list of 'up' target addresses, or None when the
            request fails or Prometheus answers with a non-200 status.
        """
        # Strip any trailing slash so the join works whether or not
        # PROMETHEUS_URL was configured with one.
        url = self.usr.rstrip('/') + '/api/v1/targets'
        try:
            response = requests.request(
                'GET', url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            response = None
        if response is None or response.status_code != 200:
            print('Get targets status failed!')
            return None

        up_list = []
        down_list = []
        for target in response.json()['data']['activeTargets']:
            address = target['discoveredLabels']['__address__']
            if target['health'] == 'up':
                up_list.append(address)
            else:
                down_list.append(address)
        self.up_list = up_list
        self.down_list = down_list
        return self.up_list
二、获取当前时间指标值
适用于获取服务器的CPU核心数、操作系统内核版本、内存总容量等这些当前状态下的指标值
1. prometheus dashboard查询
以查询Linux系统内核版本为例
如图所示,执行
node_uname_info{job="linux",instance="139.***.***.149:9100"}
查询语句即可。
2. 使用apifox模拟请求,获取数据
调用
http://<prometheus.address>/api/v1/query?query=<expr>
,其中expr为prometheus的查询语句。
3. python使用requests库请求处理数据
没啥好说的,直接看代码
import requests
from ops_py.settings import PROMETHEUS_URL
class Monitor:
    """Fetch monitoring data from the Prometheus HTTP API."""

    # Seconds to wait for Prometheus before treating a request as failed.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Prometheus base address
        self.usr = PROMETHEUS_URL
        # Addresses of targets that are up
        self.up_list = []
        # Addresses of targets that are down
        self.down_list = []

    def getQueryValue(self, query):
        """Run an instant query and return its first result entry.

        :param query: PromQL expression to evaluate
        :return: the first element of ``data.result`` (a dict with
            'metric' and 'value' keys), or None when the request fails,
            the status is not 200, or the query matched no series.
        """
        # Strip any trailing slash so the join works whether or not
        # PROMETHEUS_URL was configured with one.
        url = self.usr.rstrip('/') + '/api/v1/query'
        try:
            # Passing the expression through ``params`` lets requests
            # URL-encode characters such as '+', '&' and '#' that would
            # otherwise corrupt the query string.
            response = requests.request(
                'GET', url, params={'query': query},
                timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            return None
        # Debug output: the final URL that was actually requested.
        print(response.url)
        if response.status_code != 200:
            return None
        results = response.json()['data']['result']
        # An empty result list means no series matched the selector.
        return results[0] if results else None

    def get_os_release(self, address):
        """Return the kernel release reported by ``node_uname_info``.

        :param address: target instance, e.g. 'host:9100'
        :return: the 'release' label value, or None if the query
            returned nothing.
        """
        query = 'node_uname_info{job="linux",instance="' + address + '"}'
        result = self.getQueryValue(query)
        if result is None:
            return None
        return result['metric']['release']

    def get_up_time(self, address):
        """Return the time since boot as a human-readable string.

        :param address: target instance, e.g. 'host:9100'
        :return: a string of the form '<months>月 <days>天 <hours>小时'
            (a month is counted as 30 days), or None if the query
            returned nothing.
        """
        query = ('sum(time()-node_boot_time_seconds{job="linux",instance="'
                 + address + '"}) by (instance)')
        result = self.getQueryValue(query)
        if result is None:
            return None
        # The sample value arrives as a string holding a float.
        seconds = int(float(result['value'][1]))
        hours = seconds // 3600
        days, hours = divmod(hours, 24)
        months, days = divmod(days, 30)
        return str(months) + '月 ' + str(days) + '天 ' + str(hours) + '小时'

    def get_cpu_cores(self, address):
        """Return the number of CPU cores of a node.

        Counts the ``node_cpu_seconds_total`` series with mode="system",
        one per core.

        :param address: target instance, e.g. 'host:9100'
        :return: the count exactly as returned in the sample value
            (a string), or None if the query returned nothing.
        """
        query = ('count(node_cpu_seconds_total{job="linux",mode="system",'
                 'instance="' + address + '"}) by (instance)')
        result = self.getQueryValue(query)
        if result is None:
            return None
        return result['value'][1]
三、获取时间范围内指标值
grafana的折线图数据,都是这种使用场景,例如在指定时间范围内的CPU使用率、内存使用率、系统负载等
1. prometheus dashboard查询
以查询CPU使用率为例
2. 使用apifox模拟请求,获取数据
调用
http://<prometheus.address>/api/v1/query_range?query=<expr>&start=<startstamp>&end=<endstamp>&step=<step>
expr为prometheus的查询语句
startstamp为范围查询开始时间戳
endstamp为范围查询结束时间戳
step为查询时间间隔(单位为秒)
3. python使用requests库请求处理数据
import time
import requests
from ops_py.settings import PROMETHEUS_URL
class Monitor:
    """Fetch server monitoring data from the Prometheus HTTP API."""

    # Seconds to wait for Prometheus before treating a request as failed.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Prometheus base address
        self.usr = PROMETHEUS_URL
        # Addresses of targets that are up
        self.up_list = []
        # Addresses of targets that are down
        self.down_list = []

    def timeQuery(self, start_time, end_time):
        """Build the time-window part of a range-query URL.

        Both arguments are parsed with the format '%Y-%m-%d %H:%M:%S' and
        converted with ``time.mktime``, i.e. interpreted as local time.

        :param start_time: window start, 'YYYY-mm-dd HH:MM:SS'
        :param end_time: window end, 'YYYY-mm-dd HH:MM:SS'
        :return: a string of the form '&start=<ts>&end=<ts>&step=<secs>'
        """
        time_format = "%Y-%m-%d %H:%M:%S"
        start = int(time.mktime(time.strptime(start_time, time_format)))
        end = int(time.mktime(time.strptime(end_time, time_format)))
        # Split the window into 9 intervals; clamp to 1 second because a
        # window shorter than 9 seconds would otherwise yield step=0,
        # which is not a valid query_range step.
        step = max(1, (end - start) // 9)
        return '&start=' + str(start) + '&end=' + str(end) + '&step=' + str(step)

    def target(self):
        """Refresh the up/down target lists from ``/api/v1/targets``.

        Both lists are rebuilt from scratch on every successful call, so
        calling this method repeatedly does not accumulate duplicates.

        :return: the list of 'up' target addresses, or None when the
            request fails or Prometheus answers with a non-200 status.
        """
        url = self.usr.rstrip('/') + '/api/v1/targets'
        try:
            response = requests.request(
                'GET', url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            response = None
        if response is None or response.status_code != 200:
            print('Get targets status failed!')
            return None

        up_list = []
        down_list = []
        for target in response.json()['data']['activeTargets']:
            address = target['discoveredLabels']['__address__']
            if target['health'] == 'up':
                up_list.append(address)
            else:
                down_list.append(address)
        self.up_list = up_list
        self.down_list = down_list
        return self.up_list

    def getQueryValue(self, query):
        """Run an instant query and return its first result entry.

        :param query: PromQL expression to evaluate
        :return: the first element of ``data.result``, or None when the
            request fails, the status is not 200, or nothing matched.
        """
        url = self.usr.rstrip('/') + '/api/v1/query'
        try:
            # ``params`` makes requests URL-encode the expression.
            response = requests.request(
                'GET', url, params={'query': query},
                timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            return None
        # Debug output: the final URL that was actually requested.
        print(response.url)
        if response.status_code != 200:
            return None
        results = response.json()['data']['result']
        return results[0] if results else None

    def getQueryRange(self, query, time_range):
        """Run a range query and return every resulting series.

        :param query: PromQL expression to evaluate
        :param time_range: window string as produced by ``timeQuery``
            ('&start=...&end=...&step=...')
        :return: the ``data.result`` list, or None when the request
            fails or the status is not 200.
        """
        url = self.usr.rstrip('/') + '/api/v1/query_range'
        # ``time_range`` is already a formatted query-string fragment;
        # attach it to the URL and let requests append the encoded
        # expression after it.
        window = time_range.lstrip('&')
        if window:
            url = url + '?' + window
        try:
            response = requests.request(
                'GET', url, params={'query': query},
                timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            return None
        # Debug output: the final URL that was actually requested.
        print(response.url)
        if response.status_code != 200:
            return None
        return response.json()['data']['result']

    def get_cpu_use_rate(self, *params):
        """Return CPU usage (mode="user") for one node or for all nodes.

        Call forms:
          * ``(address)`` - current value for one instance, formatted
            as a percentage string such as '12.34%'.
          * ``(start_time, end_time)`` - per-instance series over the
            window, as returned by ``getQueryRange``.

        :return: see above; None when the query returns nothing or the
            number of arguments is neither 1 nor 2.
        """
        if len(params) == 1:
            address = params[0]
            query = ('avg(rate(node_cpu_seconds_total{job="linux",instance="'
                     + address + '",mode="user"}[2m])) by (instance) *100')
            result = self.getQueryValue(query)
            if result is None:
                return None
            value = round(float(result['value'][1]), 2)
            return str(value) + '%'
        if len(params) == 2:
            query = ('avg(rate(node_cpu_seconds_total{job="linux",'
                     'mode="user"}[2m])) by (instance) *100')
            time_range = self.timeQuery(params[0], params[1])
            return self.getQueryRange(query, time_range)
        print('异常参数')
        return None
四、获取告警数据
1. prometheus查询
2. 使用apifox模拟请求,获取数据
调用
http://<prometheus.address>/api/v1/rules?type=alert
并解析
3. python使用requests库请求处理数据,获取告警信息
import requests
from ops_py.settings import PROMETHEUS_URL
class Monitor:
    """Fetch server monitoring data from the Prometheus HTTP API."""

    # Seconds to wait for Prometheus before treating a request as failed.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Prometheus base address
        self.usr = PROMETHEUS_URL

    def alert(self):
        """Print every active alert reported by ``/api/v1/rules``.

        All rule groups are scanned, not just the first one. Labels and
        annotations other than 'alertname' are optional on an alert, so
        missing ones are printed as None rather than raising KeyError.

        :return: None
        """
        # Strip any trailing slash so the join works whether or not
        # PROMETHEUS_URL was configured with one.
        url = self.usr.rstrip('/') + '/api/v1/rules'
        try:
            response = requests.request(
                'GET', url, params={'type': 'alert'},
                timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException:
            response = None
        if response is None or response.status_code != 200:
            print('Get alert rules failed!')
            return None

        for group in response.json()['data']['groups']:
            for rule in group['rules']:
                # A rule with no firing or pending alerts has an empty list.
                for alert in rule.get('alerts', []):
                    labels = alert['labels']
                    annotations = alert.get('annotations', {})
                    print("告警标题:", labels['alertname'])
                    print("告警节点:", labels.get('instance'))
                    print("告警组:", labels.get('job'))
                    print("告警级别:", labels.get('severity'))
                    print("告警详情:", annotations.get('description'))
                    print("触发时间:", alert['activeAt'])
                    print("触发值:", annotations.get('value'))
        return None
# 执行结果
告警标题: 内存使用率高
告警节点: 139.***.***.204:9182
告警组: windows
告警级别: warning
告警详情: 告警节点: 139.***.***.204:9182 告警内容:内存使用率超过70%(当前值:80.23841190576253)
触发时间: 2021-04-26T14:40:48.517676466Z
触发值: 80.23841190576253