最近更新时间:2026-06-17 10:34:29
为满足 KCE 集群下 RDMA 网络的运维观测需求,平台提供 rdma-exporter 监控组件。该组件仅在配备 RDMA 网卡的节点部署,专项提供基于 KCE 集群环境的 RDMA 网络监控能力。rdma-exporter-agent 以 DaemonSet 形式运行,采集节点及 Pod 的 RDMA 网络数据并统一聚合,然后通过标准化的接口提供给用户查看,并支持与 Prometheus 对接,可视化展示完整 RDMA 网络观测数据,支撑运维观测工作。
注意:
若 Pod 通过 macvlan 方式使用 RDMA 网卡,本组件不支持对此类 Pod 的 RDMA 网络监控。
若 Pod 名称显示为"/",代表是 Host 网络命名空间。
访问方式:{node_ip}:8102/metrics接口。
指标描述:rdmaexporter_模块名称_具体指标名称{namespace信息,node信息,pod信息} 值。示例:
rdmaexporter_tx_prio5_pause_duration{device="mlx5_5",k8s_namespace="/",k8s_node="10.0.0.206",k8s_pod="/",type="mlx5"}
在托管Prometheus实例的页面中,新建自定义采集配置,参考下图配置,并等待配置完成。
展示模板需通过Import页面导入:
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 19,
"iteration": 1765770409925,
"links": [],
"liveNow": false,
"panels": [
{
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 40,
"title": "HW Counters",
"type": "row"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 1
},
"id": 42,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rate(rdmaexporter_mlx5_rx_read_requests{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}[1m])",
"interval": "",
"legendFormat": "rx_read_requests",
"refId": "A"
}
],
"title": "RX Read Requests",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 1
},
"id": 44,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rate(rdmaexporter_mlx5_rx_write_requests{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}[1m])",
"interval": "",
"legendFormat": "rx_write_requests",
"refId": "A"
}
],
"title": "RX Write Requests",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 9
},
"id": 54,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rate(rdmaexporter_mlx5_np_ecn_marked_roce_packets{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}[1m])",
"interval": "",
"legendFormat": "np_ecn_marked_roce_packets",
"refId": "A"
}
],
"title": "NP ECN Marked RoCE Packets",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 9
},
"id": 52,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rate(rdmaexporter_mlx5_np_cnp_sent{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}[1m])",
"interval": "",
"legendFormat": "np_cnp_sent",
"refId": "A"
}
],
"title": "NP CNP Sent",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 17
},
"id": 50,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rate(rdmaexporter_mlx5_rp_cnp_handled{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}[1m])",
"interval": "",
"legendFormat": "rp_cnp_handled",
"refId": "A"
}
],
"title": "RP CNP Handled",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 17
},
"id": 48,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rate(rdmaexporter_mlx5_rp_cnp_ignored{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}[1m])",
"interval": "",
"legendFormat": "rp_cnp_ignored",
"refId": "A"
}
],
"title": "RP CNP Ignored",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 25
},
"id": 46,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rate(rdmaexporter_mlx5_rx_atomic_requests{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}[1m])",
"interval": "",
"legendFormat": "rx_atomic_requests",
"refId": "A"
}
],
"title": "RX Atomic Requests",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 33
},
"id": 30,
"panels": [],
"title": "RDMA Resource Summary",
"type": "row"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 34
},
"id": 38,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rdmaexporter_qp{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}",
"interval": "",
"legendFormat": "qp",
"refId": "A"
}
],
"title": "RDMA QPs Count",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 34
},
"id": 36,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rdmaexporter_pd{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}",
"interval": "",
"legendFormat": "pd",
"refId": "A"
}
],
"title": "RDMA PDs Count",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 42
},
"id": 32,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rdmaexporter_cq{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}",
"interval": "",
"legendFormat": "cq",
"refId": "A"
}
],
"title": "RDMA CQs Count",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 42
},
"id": 34,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rdmaexporter_ctx{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}",
"interval": "",
"legendFormat": "ctx",
"refId": "A"
}
],
"title": "RDMA CTXs Count",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 50
},
"id": 2,
"panels": [],
"title": "Prio5 Queue Statistics",
"type": "row"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 51
},
"id": 22,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rate(rdmaexporter_tx_prio5_bytes{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}[1m])",
"interval": "",
"legendFormat": "tx_prio5_bytes",
"refId": "A"
}
],
"title": "TX Prio5 Bytes",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 51
},
"id": 6,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rate(rdmaexporter_rx_prio5_bytes{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}[1m])",
"interval": "",
"legendFormat": "rx_prio5_bytes",
"refId": "A"
}
],
"title": "RX Prio5 Bytes",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 59
},
"id": 24,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rate(rdmaexporter_tx_prio5_packets{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}[1m])",
"interval": "",
"legendFormat": "tx_prio5_packets",
"refId": "A"
}
],
"title": "TX Prio5 Packets",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 59
},
"id": 14,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rate(rdmaexporter_rx_prio5_packets{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}[1m])",
"interval": "",
"legendFormat": "rx_prio5_packets",
"refId": "A"
}
],
"title": "RX Prio5 Packets",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 67
},
"id": 26,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rate(rdmaexporter_tx_prio5_pause{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}[1m])",
"interval": "",
"legendFormat": "tx_prio5_pause",
"refId": "A"
}
],
"title": "TX Prio5 Pause",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 67
},
"id": 16,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rate(rdmaexporter_rx_prio5_pause{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}[1m])",
"interval": "",
"legendFormat": "rx_prio5_pause",
"refId": "A"
}
],
"title": "RX Prio5 Pause",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 75
},
"id": 18,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rdmaexporter_rx_prio5_pause_duration{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}",
"interval": "",
"legendFormat": "rx_prio5_pause_duration",
"refId": "A"
}
],
"title": "RX Prio5 Pause Duration",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 75
},
"id": 28,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rdmaexporter_tx_prio5_pause_duration{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}",
"interval": "",
"legendFormat": "tx_prio5_pause_duration",
"refId": "A"
}
],
"title": "TX Prio5 Pause Duration",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 83
},
"id": 10,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rate(rdmaexporter_rx_prio5_discards{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}[1m])",
"interval": "",
"legendFormat": "rx_prio5_discards",
"refId": "A"
}
],
"title": "RX Prio5 Discards",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 83
},
"id": 20,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rate(rdmaexporter_rx_prio5_pause_transition{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}[1m])",
"interval": "",
"legendFormat": "rx_prio5_pause_transition",
"refId": "A"
}
],
"title": "RX Prio5 Pause Transition",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 91
},
"id": 4,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rate(rdmaexporter_rx_prio5_buf_discard{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}[1m])",
"interval": "",
"legendFormat": "rx_prio5_buf_discard",
"refId": "A"
}
],
"title": "RX Prio5 Buf Discard",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 91
},
"id": 12,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rate(rdmaexporter_rx_prio5_marked{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}[1m])",
"interval": "",
"legendFormat": "rx_prio5_marked",
"refId": "A"
}
],
"title": "RX Prio5 Marked",
"type": "timeseries"
},
{
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 99
},
"id": 8,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "P700343E857AAC980"
},
"exemplar": true,
"expr": "rate(rdmaexporter_rx_prio5_cong_discard{k8s_node=\"$node\", device=\"$device\", k8s_pod=\"$pod\"}[1m])",
"interval": "",
"legendFormat": "rx_prio5_cong_discard",
"refId": "A"
}
],
"title": "RX Prio5 Cong Discard",
"type": "timeseries"
}
],
"schemaVersion": 33,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"current": {
"selected": false,
"text": "10.0.23.2",
"value": "10.0.23.2"
},
"definition": "label_values(rdmaexporter_rx_prio5_discards{}, k8s_node)",
"hide": 0,
"includeAll": false,
"label": "Node",
"multi": false,
"name": "node",
"options": [],
"query": {
"query": "label_values(rdmaexporter_rx_prio5_discards{}, k8s_node)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"current": {
"selected": false,
"text": "/",
"value": "/"
},
"definition": "label_values(rdmaexporter_rx_prio5_discards{k8s_node=\"$node\"}, k8s_pod)",
"hide": 0,
"includeAll": false,
"label": "Pod",
"multi": false,
"name": "pod",
"options": [],
"query": {
"query": "label_values(rdmaexporter_rx_prio5_discards{k8s_node=\"$node\"}, k8s_pod)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
},
{
"current": {
"selected": false,
"text": "mlx5_1",
"value": "mlx5_1"
},
"definition": "label_values(rdmaexporter_rx_prio5_discards{k8s_node=\"$node\", k8s_pod=\"$pod\"}, device)",
"hide": 0,
"includeAll": false,
"label": "Device",
"multi": false,
"name": "device",
"options": [],
"query": {
"query": "label_values(rdmaexporter_rx_prio5_discards{k8s_node=\"$node\", k8s_pod=\"$pod\"}, device)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
}
]
},
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "RDMA Exporter",
"uid": "wEDK-QWDz",
"version": 1,
"weekStart": ""
}
观测方向 | 指标名称 | 显示名称 | 描述 |
HW Counter | rdmaexporter_mlx5_rx_write_requests | RX Write Requests | 统计接收端处理的Write操作请求数。示例:rdmaexporter_mlx5_rx_write_requests{k8s_node="192.168.88.112"} 100 |
rdmaexporter_mlx5_rx_read_requests | RX Read Requests | 统计接收端处理的Read操作请求数 | |
rdmaexporter_mlx5_rx_atomic_requests | RX Atomic Requests | 统计接收端处理的Atomic操作请求数 | |
rdmaexporter_mlx5_rp_cnp_handled | RP CNP Handled | 被对端网卡处理的CNP报文计数 | |
rdmaexporter_mlx5_rp_cnp_ignored | RP CNP Ignored | 对端网卡接收到但是被忽略的CNP报文计数,该计数不应该增加 | |
rdmaexporter_mlx5_np_cnp_sent | NP CNP Sent | 当网卡通过RoCEv2 IP头分析到存在拥塞控制时发送的CNP报文计数 | |
rdmaexporter_mlx5_np_ecn_marked_roce_packets | NP ECN Marked RoCE Packets | 网卡接收到的RoCEv2报文中被标记了ECN bit的报文计数 | |
rdmaexporter_mlx5_out_of_buffer | Out Of Buffer times | 接收端缓冲区不足导致的丢包统计 | |
rdmaexporter_mlx5_out_of_sequence | OutOfOrder Seq Packets | 统计收到的乱序报文数 | |
rdmaexporter_mlx5_duplicate_request | Duplicate Packets | 统计收到的重复请求报文数 | |
rdmaexporter_mlx5_rnr_nak_retry_err | RNR NAK Packets | 统计收到的RNR NAK报文数 | |
rdmaexporter_mlx5_packet_seq_err | NAK Error Packets | 统计收到的NAK Seq错误报文数 | |
rdmaexporter_mlx5_implied_nak_seq_err | Implied NAKs (seq error) | 接收端检测到报文序列号异常,触发隐式NAK重传请求的次数 | |
rdmaexporter_mlx5_local_ack_timeout_err | ACK Timeout(times) | 发送端ACK超时响应次数统计。发送端QP在发送请求包后启动Local Ack Timeout定时器,若超时未收到ACK,则触发该计数增长 | |
rdmaexporter_mlx5_rx_dct_connect | RX DCT Conn Requests | DCT传输模式下,接收到的连接请求数 | |
rdmaexporter_mlx5_resp_local_length_error | Responder Local Length errors | 接收端在响应阶段检测到本地处理的数据长度与协议预期不符的错误计数 | |
rdmaexporter_mlx5_resp_cqe_error | Responder CQE errors | 接收端在生成CQE时检测到的错误计数 | |
rdmaexporter_mlx5_req_cqe_error | Requester CQE errors | 发送端在生成CQE时检测到的错误计数 | |
rdmaexporter_mlx5_req_remote_invalid_request | Remote Invalid Request errors | 发送端检测到远端无效请求的错误计数 | |
rdmaexporter_mlx5_req_remote_access_errors | Requester Remote Access errors | 发送端检测到远端访问错误的计数 | |
rdmaexporter_mlx5_resp_remote_access_errors | Responder Remote Access errors | 接收端检测到远端访问错误的计数 | |
rdmaexporter_mlx5_resp_cqe_flush_error | Responder CQEs With Flush error | 接收端生成CQE时检测到flushed错误的次数 | |
rdmaexporter_mlx5_req_cqe_flush_error | Requester CQEs With Flush error | 发送端生成CQE时检测到flushed错误的次数 | |
rdmaexporter_mlx5_roce_adp_retrans | Adaptive Retrans times | 动态重传的次数统计 | |
Prio5 Queue Statistics | rdmaexporter_rx_prio5_bytes | RX Prio5 Bytes | RDMA总的入向字节计数 |
rdmaexporter_rx_prio5_packets | RX Prio5 Packets | RDMA总的接收包计数 | |
rdmaexporter_tx_prio5_bytes | TX Prio5 Bytes | RDMA总的出向字节计数 | |
rdmaexporter_tx_prio5_packets | TX Prio5 Packets | RDMA总的发送包计数 | |
rdmaexporter_rx_prio5_pause | RX Prio5 Pause | RDMA接收的pause帧的计数,当pause帧增加,意味着网络存在拥塞,网络对端无法从网卡继续收包 | |
rdmaexporter_tx_prio5_pause | TX Prio5 Pause | RDMA发送的pause帧的计数,当这个计数增加,意味着网卡本身存在拥塞,网卡无法从网络上继续收包 | |
rdmaexporter_rx_prio5_pause_duration | RX Prio5 Pause Duration | RDMA接收到pause帧持续的时间(单位为microSec) | |
rdmaexporter_tx_prio5_pause_duration | TX Prio5 Pause Duration | RDMA发送pause帧持续的时间(单位为microSec) | |
rdmaexporter_rx_prio5_pause_transition | RX Prio5 Pause Transition | RDMA接收方向pause状态发生切换(XOFF与XON之间转换)的次数计数 | |
rdmaexporter_rx_prio5_cong_discard | RX Prio5 Cong Discard | RDMA收包过程中由于host拥塞而导致的丢包计数 | |
rdmaexporter_rx_prio5_discards | RX Prio5 Discards | RDMA接收过程中,由于接收buffer不足导致的丢包计数 | |
rdmaexporter_rx_prio5_buf_discard | RX Prio5 Buf Discard | RDMA收包过程中由于host receive buffer不够而导致的丢包计数 | |
rdmaexporter_rx_prio5_marked | RX Prio5 Marked | RDMA收包过程中被标记ECN的报文计数 | |
RDMA Resource Summary | rdmaexporter_qp | RDMA QPs Count | RDMA创建的QP队列对计数(累计值) |
rdmaexporter_mr | RDMA MRs Count | RDMA创建的内存注册计数(累计值) | |
rdmaexporter_ctx | RDMA CTXs Count | RDMA创建的Context上下文计数(累计值) | |
rdmaexporter_cq | RDMA CQs Count | RDMA创建的完成队列计数(累计值) |
此组件获取节点和容器内的RDMA网卡信息时,会消耗CPU,建议将该组件的cpu limit设置为2。过小的话,调用/metrics接口响应会很慢。
纯净模式