From c67f80ca47bc1cf7ebf242c15f14e5880366ad3d Mon Sep 17 00:00:00 2001 From: Kai Moritz Date: Tue, 2 May 2023 17:43:51 +0200 Subject: [PATCH] Added Dashboard for kafka-lag-exporter --- Kafka_Lag_Exporter_Dashboard.json | 1038 +++++++++++++++++++++++++++++ prometheus.yml | 12 + 2 files changed, 1050 insertions(+) create mode 100644 Kafka_Lag_Exporter_Dashboard.json diff --git a/Kafka_Lag_Exporter_Dashboard.json b/Kafka_Lag_Exporter_Dashboard.json new file mode 100644 index 0000000..9096da2 --- /dev/null +++ b/Kafka_Lag_Exporter_Dashboard.json @@ -0,0 +1,1038 @@ +{ + "__inputs": [ + { + "name": "DS_CINNAMON_PROMETHEUS", + "label": "Cinnamon Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "5.3.4" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "5.0.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 1556312150750, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 28, + "panels": [], + "repeat": null, + "title": "All Consumer Group Lag", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_CINNAMON_PROMETHEUS}", + "description": "Max extrapolated lag in seconds for each consumer group.", + "fill": 1, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 102, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(50, kafka_consumergroup_group_max_lag_seconds{namespace=\"$namespace\",cluster_name=\"$cluster_name\",group=~\"$consumer_group\",group!=\"\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{group}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Consumer Group Max Lag Seconds", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "estimated lag in seconds", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_CINNAMON_PROMETHEUS}", + "description": "Extrapolated lag in seconds for each partition.", + "fill": 1, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 103, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(25, kafka_consumergroup_group_lag_seconds{namespace=\"$namespace\",cluster_name=\"$cluster_name\",group=~\"$consumer_group\",group!=\"\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{group}},{{topic}},{{partition}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Consumer Group Lag Partition Seconds", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "estimated lag in seconds", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_CINNAMON_PROMETHEUS}", + "description": "Max offset lag for each consumer group.", + "fill": 1, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 98, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": "h", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(50, kafka_consumergroup_group_max_lag{namespace=\"$namespace\",cluster_name=\"$cluster_name\",group=~\"$consumer_group\",group!=\"\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{group}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Consumer Group Max Lag Offsets", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "offsets", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_CINNAMON_PROMETHEUS}", + "description": "Consumer Group Lag Partition Offsets", + "fill": 1, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 82, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(25, kafka_consumergroup_group_lag{namespace=\"$namespace\",cluster_name=\"$cluster_name\",group=~\"$consumer_group\",group!=\"\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{group}},{{topic}},{{partition}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Consumer Group Lag Partition Offsets", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "offsets", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": "offsets_2", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 21 + }, + "id": 149, + "panels": [], + "title": "Consumer Group Lag In Time Per Group Over Offset Lag", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_CINNAMON_PROMETHEUS}", + "description": "", + "fill": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 22 + }, + "id": 147, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "minSpan": 8, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "consumer_group", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Consumer group lag in offsets", + "color": "#cca300", + "linewidth": 2, + "yaxis": 2 + }, + { + "alias": "Consumer group lag in time", + "color": "rgb(255, 0, 0)", + "linewidth": 2, + "yaxis": 1 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "kafka_consumergroup_group_max_lag_seconds{namespace=\"$namespace\",cluster_name=\"$cluster_name\",group=~\"$consumer_group\",group!=\"\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Consumer group lag in time", + "refId": "A" + }, + { + "expr": "kafka_consumergroup_group_max_lag{namespace=\"$namespace\",cluster_name=\"$cluster_name\",group=~\"$consumer_group\",group!=\"\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Consumer group lag in offsets", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "$consumer_group", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "estimated lag in seconds", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": "lag in offsets", + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 106, + "panels": [], + "title": "Consumer Group Lag in Time Per Group Over Summed Offsets", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_CINNAMON_PROMETHEUS}", + "description": "", + "fill": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 31 + }, + "id": 107, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "minSpan": 8, + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "consumer_group", + "repeatDirection": "h", + "seriesOverrides": [ + { + "alias": "Sum of group offsets", + "yaxis": 2 + }, + { + "alias": "Sum of latest offsets", + "yaxis": 2 + }, + { + "alias": "/Consumer group.*/", + "color": "rgb(255, 0, 0)", + "linewidth": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(kafka_consumergroup_group_lag_seconds{namespace=\"$namespace\",cluster_name=\"$cluster_name\",group=~\"$consumer_group\",group!=\"\"}) by (group)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Consumer group lag in time", + "refId": "A" + }, + { + "expr": "sum(kafka_consumergroup_group_offset{namespace=\"$namespace\",cluster_name=\"$cluster_name\",group=~\"$consumer_group\",group!=\"\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Sum of group offsets", + "refId": "B" + }, + { + "expr": "sum((kafka_consumergroup_group_offset{namespace=\"$namespace\",cluster_name=\"$cluster_name\",group=~\"$consumer_group\",group!=\"\"} * 0)\n+ on(namespace,cluster_name,topic,partition) group_left() kafka_partition_latest_offset{namespace=\"$namespace\",cluster_name=\"$cluster_name\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Sum of latest offsets", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "$consumer_group", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "estimated lag in seconds", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": "offset", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 39 + }, + "id": 101, + "panels": [], + "title": "Kafka Lag Exporter JVM Metrics", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_CINNAMON_PROMETHEUS}", + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 40 + }, + "id": 99, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(jvm_memory_bytes_used{app_kubernetes_io_instance=\"kafka-lag-exporter\"}[5m])) by (kubernetes_pod_name)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{kubernetes_pod_name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "JVM Memory Used", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_CINNAMON_PROMETHEUS}", + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 40 + }, + "id": 95, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(jvm_gc_collection_seconds_sum{app_kubernetes_io_instance=\"kafka-lag-exporter\"}[5m])) by (kubernetes_pod_name)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{kubernetes_pod_name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "JVM GC Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_CINNAMON_PROMETHEUS}", + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 40 + }, + "id": 97, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(jvm_gc_collection_seconds_count{app_kubernetes_io_instance=\"kafka-lag-exporter\"}[5m])) by (kubernetes_pod_name)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{kubernetes_pod_name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "JVM GC Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "5m", + "schemaVersion": 16, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_CINNAMON_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": "query_result(kafka_consumergroup_group_lag)", + "refresh": 1, + "regex": "/.*namespace=\"([^\"]*).*/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_CINNAMON_PROMETHEUS}", + "hide": 0, + "includeAll": false, + "label": "Cluster Name", + "multi": false, + "name": "cluster_name", + "options": [], + "query": "query_result(kafka_consumergroup_group_lag{namespace=\"$namespace\"})", + "refresh": 1, + "regex": "/.*cluster_name=\"([^\"]*).*/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": {}, + "datasource": "${DS_CINNAMON_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Consumer Group", + "multi": true, + "name": "consumer_group", + "options": [], + "query": "query_result(kafka_consumergroup_group_lag{namespace=\"$namespace\",cluster_name=\"$cluster_name\"})", + "refresh": 1, + "regex": "/.*group=\"([^\"]*).*/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Kafka Lag Exporter", + "uid": "8LW1Yd8ik", + "version": 14 +} diff --git a/prometheus.yml b/prometheus.yml index 9089bb2..dc9a755 100644 --- a/prometheus.yml +++ b/prometheus.yml @@ -52,6 +52,18 @@ scrape_configs: regex: '([^:]+)(:[0-9]+)?' replacement: '${1}' + - job_name: "kafka-lag-exporter" + static_configs: + - targets: + - "kafka-lag-exporter:8000" + labels: + env: "dev" + relabel_configs: + - source_labels: [__address__] + target_label: hostname + regex: '([^:]+)(:[0-9]+)?' + replacement: '${1}' + - job_name: "burrow-exporter" static_configs: - targets: -- 2.20.1