服务的质量离不开一个监控和反馈体系,如果没有一个监控反馈体系,我们并不能知道线上的服务出现了什么,正在发生什么、可能存在什么样的问题。
这时候如果有一个可视化的表盘,可以让你直观清晰地感知整个系统的运行状况,那么你就能根据这些反馈做出调整,进入一个【行动->反馈->行动..】的过程之中。渐渐地,整个系统的服务质量在这些不断的调整中就大大提高了。极为重要的一点是,我们有必要对自己开发的服务质量负责。
搜索接口性能从一开始的混乱无序,到后续的逐渐稳定。
怎么做?
基于Prometheus,我们可以按时间间隔定义采集任务,用拉的方式去采集各个服务应用上的性能指标。基于Grafana,我们可以对这些采集到的指标进行可视化展示、查询分析;此外Grafana还支持报警规则,再通过Webhook,我们能把这些报警推送到飞书或者钉钉群里,及时响应异常情况。
指标类型
Prometheus主要支持四种指标类型,分别是Counter、Gauge、Histogram、Summary
- Counter主要用来记录一个总数,比如说api请求总次数
-
Gauge主要记录可上可下的值,比如CPU使用率
-
Summary可以用来记录接口的响应时间
-
Histogram和Summary可以做差不多同样的事,就是多了一个bucket的概念,响应时间按范围分段统计,具体差异详见Prometheus官方文档的“Histograms and summaries”一节
接口准备
Prometheus主要是通过定时抓取目标{target}/metrics 接口来采集数据的,所以我们需要提供一个接口来暴露这些数据
Nodejs
以express为例,安装prom-client和response-time依赖
// Turn on prom-client's built-in default metrics collection.
const promClient = require('prom-client');

promClient.collectDefaultMetrics();
记录一些默认的指标比如说当前nodejs进程内存 cpu gc http request等各种系统指标。
const promClient = require('prom-client');
const ResponseTime = require('response-time');

const Register = promClient.register;
const Counter = promClient.Counter;
const Histogram = promClient.Histogram; // not used below; kept so the module's imports are unchanged
const Summary = promClient.Summary;

/** Path of the scrape endpoint; requests to it are excluded from the request metrics. */
const METRICS_PATH = '/metrics';

/**
 * A Prometheus counter that counts the invocations of the different HTTP verbs,
 * e.g. a GET and a POST call will be counted as 2 different calls.
 */
const numOfRequests = new Counter({
  name: 'numOfRequests',
  help: 'Number of requests made',
  labelNames: ['method'],
});

/**
 * A Prometheus counter that counts the invocations with different paths,
 * e.g. /foo and /bar will be counted as 2 different paths.
 */
const pathsTaken = new Counter({
  name: 'pathsTaken',
  help: 'Paths taken in the app',
  labelNames: ['path'],
});

/**
 * A Prometheus summary that records the response time, labelled by
 * HTTP method, path and response status code.
 */
const responses = new Summary({
  name: 'responses',
  help: 'Response time in millis',
  labelNames: ['method', 'path', 'status'],
});

/**
 * Starts the collection of prom-client's default metrics.
 * Should be called once from the main js file.
 */
module.exports.startCollection = function () {
  promClient.collectDefaultMetrics();
};

/**
 * Express middleware for the request side of an invocation: increments
 * numOfRequests and pathsTaken, skipping requests to the metrics endpoint.
 *
 * @param {object} req - Express request.
 * @param {object} res - Express response (unused).
 * @param {Function} next - Passes control to the next middleware.
 */
module.exports.requestCounters = function (req, res, next) {
  if (req.path !== METRICS_PATH) {
    numOfRequests.inc({ method: req.method });
    pathsTaken.inc({ path: req.path });
  }
  next();
};

/**
 * Express middleware for the response side of an invocation: records the
 * response time reported by response-time in the responses summary.
 *
 * The path label uses req.path rather than req.url so that the query string
 * does not create a new time series for every distinct set of parameters,
 * and so that "/metrics?..." is excluded just like "/metrics".
 */
module.exports.responseCounters = ResponseTime(function (req, res, time) {
  if (req.path !== METRICS_PATH) {
    responses.labels(req.method, req.path, res.statusCode).observe(time);
  }
});

/**
 * Registers the URL that Prometheus scrapes to get the data from this app.
 *
 * @param {object} App - Express application to register the GET route on.
 */
module.exports.injectMetricsRoute = function (App) {
  App.get(METRICS_PATH, async (req, res, next) => {
    try {
      // Register.metrics() returns a Promise in newer prom-client releases and a
      // plain string in older ones; `await` handles both.
      const body = await Register.metrics();
      res.set('Content-Type', Register.contentType);
      res.end(body);
    } catch (err) {
      // Hand failures to Express instead of leaving an unhandled rejection.
      next(err);
    }
  });
};
按method记录请求次数numOfRequests,按path记录请求次数pathsTaken,并记录响应时间responses
// Install the request-side counters first, then the response-time recorder.
app.use(Prometheus.requestCounters, Prometheus.responseCounters);

// Enable metrics endpoint.
Prometheus.injectMetricsRoute(app);

// Enable collection of default metrics.
Prometheus.startCollection();
中间件加上,注入/metrics接口
PHP
PHP需要用到APCu(php-apc),或者用Redis也可以:因为PHP进程在每次请求被web server执行完之后内存就释放了,计数统计必须依赖Redis或者APCu这样的共享存储
php7得安装php-apcu还得安装apcu_bc, php7可以用 endclothing/prometheus_client_php, 5.x可以用 jimdo/prometheus_client_php
以laravel为例
我们新加一个中间件Metrics.php
namespace App\Http\Middleware;

use Closure;
use Illuminate\Support\Facades\Auth;
use Illuminate\Http\Request;
use Prometheus\CollectorRegistry;

/**
 * Middleware that observes the duration of every routed request in a
 * Prometheus histogram labelled by route, HTTP method and status code.
 */
class Metrics
{
    /**
     * Registry the histogram is registered on.
     * Declared explicitly (public, as the former dynamic property was) because
     * creating undeclared properties is deprecated as of PHP 8.2.
     *
     * @var CollectorRegistry
     */
    public $registry;

    /**
     * Histogram of request durations in milliseconds.
     *
     * @var \Prometheus\Histogram
     */
    public $requestDurationHistogram;

    /**
     * The request currently being handled.
     *
     * @var Request|null
     */
    public $request;

    /**
     * Picks a storage adapter (APC when the apcu functions exist, otherwise
     * in-memory) and registers the request-duration histogram.
     */
    public function __construct()
    {
        if (function_exists('apcu_add')) {
            $adapter = new \Prometheus\Storage\APC();
        } else {
            // NOTE(review): in-memory storage presumably does not survive
            // between requests under a per-request PHP process — confirm.
            $adapter = new \Prometheus\Storage\InMemory();
        }

        $this->registry = new CollectorRegistry($adapter);
        $this->initRouteMetrics();
    }

    /**
     * Registers (or fetches) the "api" / "request_duration_milliseconds"
     * histogram with explicit millisecond buckets.
     */
    public function initRouteMetrics()
    {
        $namespace = "api";
        $buckets = [
            100.0,
            200.0,
            300.0,
            600.0,
            1000.0,
            1500.0,
            2500.0,
            5000.0,
            8000.0,
            15000.0,
            30000.0,
            50000.0,
            80000.0
        ];
        $labelNames = $this->getRequestCounterLabelNames();
        $name = 'request_duration_milliseconds';
        $help = 'duration of http_requests';

        $this->requestDurationHistogram = $this->registry->getOrRegisterHistogram(
            $namespace,
            $name,
            $help,
            $labelNames,
            $buckets
        );
    }

    /**
     * Label names of the histogram, in the order countRequest() supplies values.
     *
     * @return string[]
     */
    protected function getRequestCounterLabelNames()
    {
        return [
            'route',
            'method',
            'status_code',
        ];
    }

    /**
     * Handle an incoming request and record how long it took.
     *
     * @param  \Illuminate\Http\Request  $request
     * @param  \Closure  $next
     * @return mixed
     */
    public function handle(Request $request, Closure $next)
    {
        // Measured from the request start time reported by PHP, not from middleware entry.
        $start = $_SERVER['REQUEST_TIME_FLOAT'];
        $this->request = $request;

        /** @var \Illuminate\Http\Response $response */
        $response = $next($request);

        $route = $request->route();
        // Requests that did not resolve to a route are not recorded.
        if ($route) {
            $route_name = $route->uri() ?: 'unnamed';
            $method = $request->getMethod();
            $status = $response->getStatusCode();
            $duration = microtime(true) - $start;
            $duration_milliseconds = $duration * 1000.0;
            $this->countRequest($route_name, $method, $status, $duration_milliseconds);
        }

        return $response;
    }

    /**
     * Observes one request duration in the histogram.
     *
     * @param  string     $route                  Route URI used as the "route" label.
     * @param  string     $method                 HTTP method.
     * @param  int|string $statusCode             Response status code.
     * @param  float      $duration_milliseconds  Duration in milliseconds.
     * @return void
     */
    public function countRequest($route, $method, $statusCode, $duration_milliseconds)
    {
        $labelValues = [(string) $route, (string) $method, (string) $statusCode];
        $this->requestDurationHistogram->observe($duration_milliseconds, $labelValues);
    }
}
按route统计api的响应时间
Prometheus数据采集
docker-compose.yml
# Runs Prometheus with its data directory and configuration mounted from the host.
version: "2.2"
services:
  tapi:
    image: prom/prometheus
    restart: always
    logging:
      driver: "json-file"
      options:
        max-size: "50m"
    volumes:
      - /prometheus_data:/prometheus_data:rw
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./targets.json:/etc/prometheus/targets.json
    command:
      - '--storage.tsdb.path=/prometheus_data'
      - '--config.file=/etc/prometheus/prometheus.yml'
      # Keep 10 days of data. --storage.tsdb.retention.time replaces the
      # deprecated --storage.tsdb.retention flag.
      - '--storage.tsdb.retention.time=10d'
    ports:
      - "9090:9090"
挂载一个prometheus_data外部目录来对数据持久化,config设置保留10天的数据
prometheus.yml
global:
  # Scrape targets every 15 seconds (the default is every 1 minute).
  scrape_interval: 15s
  # Evaluate rules every 15 seconds (the default is every 1 minute).
  evaluation_interval: 15s

scrape_configs:
  # Targets for this job are read from a file that is re-read every minute.
  - job_name: "node"
    file_sd_configs:
      - files:
          - "/etc/prometheus/targets.json"
        refresh_interval: 1m

  - job_name: "prometheus"
    static_configs:
      - targets:
          - "localhost:9090"

  - job_name: "web"
    static_configs:
      - targets:
          - "192.168.1.22:8096"
          - "192.168.1.89:8096"
file_sd会按refresh_interval定时重新读取targets.json,可以在不重启Prometheus的情况下增加新的抓取目标(target)
targets.json
[
  {
    "targets": ["192.168.1.86:8029"],
    "labels": {
      "env": "prod",
      "job": "api"
    }
  },
  {
    "targets": ["192.168.1.86:8030"],
    "labels": {
      "env": "prod",
      "job": "api"
    }
  }
]
数据指标可视化
启动Grafana
# Run Grafana detached as container "grafana", publishing port 3000 on the host.
docker run \
  --detach \
  --name grafana \
  --publish 3000:3000 \
  grafana/grafana
配置数据源
Configuration > Add data source 选择Prometheus,URL=http://localhost:9090(如果Grafana运行在Docker容器里,localhost指向的是容器自身,需要改成Prometheus所在主机的IP或同一容器网络内的服务名)
配置Dashboard
增加一个特定接口的性能监控
5分钟内请求总时间除以总次数得到api响应时间均值
# Average response time of POST g/search over the last 5 minutes:
# per-second increase of total duration divided by per-second increase of request count.
  rate(api_request_duration_milliseconds_sum{method="POST", route="g/search"}[5m])
/
  rate(api_request_duration_milliseconds_count{method="POST", route="g/search"}[5m])
发现慢接口
# Top 20 routes by average response time over the last 2 minutes
# (status_code 500 and OPTIONS requests excluded).
# Duration and request count are each summed per route BEFORE dividing: dividing
# per series and summing the quotients would add together the averages of every
# method/status_code series of a route instead of producing one average.
topk(20,
    sum by (route) (rate(api_request_duration_milliseconds_sum{status_code!="500", method!="OPTIONS"}[2m]))
  /
    sum by (route) (rate(api_request_duration_milliseconds_count{status_code!="500", method!="OPTIONS"}[2m]))
)
按route聚合(sum by (route))响应时间,取Top20,得到响应时间慢的API
# The 10 routes with the highest request rate over the last 5 minutes
# (status_code 500 excluded).
topk(
  10,
  sum(rate(api_request_duration_milliseconds_count{status_code!="500"}[5m])) by (route)
)
稍微调整下可以拿到当前时间请求频繁的接口
报警
设置Alerting > Notification channels
由于feishu的webhook传参格式不一样,我们需要格式化一下信息格式,做个转发
所以是 grafana > webhook proxy > feishu
const express = require('express');
const bodyParser = require('body-parser');
const fetch = require('node-fetch');

const app = express();

/** Feishu bot hook that the reformatted alerts are forwarded to. */
const FEISHU_WEBHOOK_URL = 'https://open.feishu.cn/open-apis/bot/hook/xxxx';

app.use(bodyParser.json());
app.use(bodyParser.urlencoded({ extended: false }));

/**
 * Converts an incoming alert payload into the { title, text } message that is
 * posted to Feishu.
 *
 * @param {object} body - Parsed webhook payload (title, message, ruleName, state, evalMatches).
 * @returns {{title: string, text: string}} Message to forward.
 */
function formatAlert(body) {
  // evalMatches may be missing; treat that the same as an empty list instead of throwing.
  const evalMatches = Array.isArray(body.evalMatches) ? body.evalMatches : [];
  const matchs = evalMatches.length ? JSON.stringify(evalMatches, null, 2) : '';
  return {
    title: body.title,
    text: `${body.message}\n\nruleName:${body.ruleName}\nstate:${body.state}\n\n${matchs}\n`,
  };
}

/**
 * Receives an alert webhook, reformats it and forwards it to the Feishu bot.
 * Failures are passed to Express so the caller gets an error response instead
 * of a hanging request and an unhandled promise rejection.
 */
app.post('/webhook/feishu/bot1', async (req, res, next) => {
  try {
    const formatted = formatAlert(req.body || {});
    const proxyRes = await fetch(FEISHU_WEBHOOK_URL, {
      method: 'post',
      body: JSON.stringify(formatted),
      headers: { 'Content-Type': 'application/json' },
    });
    console.log(await proxyRes.json());

    if (!proxyRes.ok) {
      // Surface upstream failures to the sender rather than reporting success.
      res.status(502).send(`feishu responded with status ${proxyRes.status}`);
      return;
    }
    res.send('hello');
  } catch (err) {
    next(err);
  }
});

app.listen(3002);
规则设置
设置一个当搜索接口响应时间高于某个值的报警通知