本机curl访问数据

[root@prome_master_01 tgzs]# curl  -s  localhost:9100/metrics |grep node_  |head -20
# HELP node_arp_entries ARP entries by device
# TYPE node_arp_entries gauge
node_arp_entries{device="eth0"} 3
# HELP node_boot_time_seconds Node boot time, in unixtime.
# TYPE node_boot_time_seconds gauge
node_boot_time_seconds 1.616987084e+09
# HELP node_context_switches_total Total number of context switches.
# TYPE node_context_switches_total counter
node_context_switches_total 2.105979e+06
# HELP node_cooling_device_cur_state Current throttle state of the cooling device
# TYPE node_cooling_device_cur_state gauge
node_cooling_device_cur_state{name="0",type="Processor"} 0
node_cooling_device_cur_state{name="1",type="Processor"} 0
node_cooling_device_cur_state{name="2",type="Processor"} 0
node_cooling_device_cur_state{name="3",type="Processor"} 0
# HELP node_cooling_device_max_state Maximum throttle state of the cooling device
# TYPE node_cooling_device_max_state gauge
node_cooling_device_max_state{name="0",type="Processor"} 0
node_cooling_device_max_state{name="1",type="Processor"} 0
node_cooling_device_max_state{name="2",type="Processor"} 0
  • node_exporter
  • 查看启动日志

    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.315Z caller=node_exporter.go:178 msg="Starting node_exporter" version="(version=1.1.2, branch=HEAD, revision=b597c1244d7bef49e6f3359c87a56dd7707f6719)"
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.315Z caller=node_exporter.go:179 msg="Build context" build_context="(go=go1.15.8, user=root@f07de8ca602a, date=20210305-09:29:10)"
    Mar 29 15:38:51 prome_master_01 node_exporter: level=warn ts=2021-03-29T07:38:51.315Z caller=node_exporter.go:181 msg="Node Exporter is running as root user. This exporter is designed to run as unpriviledged user, root is not required."
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=filesystem_common.go:74 collector=filesystem msg="Parsed flag --collector.filesystem.ignored-mount-points" flag=^/(dev|proc|sys|var/lib/docker/.+)($|/)
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=filesystem_common.go:76 collector=filesystem msg="Parsed flag --collector.filesystem.ignored-fs-types" flag=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:106 msg="Enabled collectors"
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=arp
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=bcache
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=bonding
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=btrfs
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=conntrack
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=cpu
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=cpufreq
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=diskstats
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=edac
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=entropy
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=fibrechannel
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=filefd
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=filesystem
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=hwmon
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=infiniband
    Mar 29 15:38:51 prome_master_01 node_exporter: level=info ts=2021-03-29T07:38:51.316Z caller=node_exporter.go:113 collector=ipvs
    

    默认开启的采集项目介绍

  • 黑名单: 关闭某一项默认开启的采集项
  • --no-collector.<name> flag
    # 未开启前
    [root@prome_master_01 node_exporter]# curl  -s  localhost:9100/metrics |grep node_cpu
    # HELP node_cpu_guest_seconds_total Seconds the CPUs spent in guests (VMs) for each mode.
    # TYPE node_cpu_guest_seconds_total counter
    node_cpu_guest_seconds_total{cpu="0",mode="nice"} 0
    node_cpu_guest_seconds_total{cpu="0",mode="user"} 0
    node_cpu_guest_seconds_total{cpu="1",mode="nice"} 0
    node_cpu_guest_seconds_total{cpu="1",mode="user"} 0
    node_cpu_guest_seconds_total{cpu="2",mode="nice"} 0
    node_cpu_guest_seconds_total{cpu="2",mode="user"} 0
    node_cpu_guest_seconds_total{cpu="3",mode="nice"} 0
    node_cpu_guest_seconds_total{cpu="3",mode="user"} 0
    # HELP node_cpu_seconds_total Seconds the CPUs spent in each mode.
    # TYPE node_cpu_seconds_total counter
    node_cpu_seconds_total{cpu="0",mode="idle"} 17691.27
    node_cpu_seconds_total{cpu="0",mode="iowait"} 8.9
    node_cpu_seconds_total{cpu="0",mode="irq"} 0
    node_cpu_seconds_total{cpu="0",mode="nice"} 0.32
    node_cpu_seconds_total{cpu="0",mode="softirq"} 0.28
    node_cpu_seconds_total{cpu="0",mode="steal"} 2.7
    # 关闭cpu采集
     ./node_exporter --no-collector.cpu
    curl  -s  localhost:9100/metrics |grep node_cpu
    
  • 白名单:关闭默认采集项而只开启某些采集
  •  --collector.disable-defaults --collector.<name> .
    # 只开启mem采集
     ./node_exporter --collector.disable-defaults --collector.meminfo
    # 只开启mem 和cpu 采集
    ./node_exporter --collector.disable-defaults --collector.meminfo --collector.cpu
    

    默认关闭的采集项及其关闭原因

  • 太重:High cardinality
  • 太慢:Prolonged runtime that exceeds the Prometheus scrape_interval or scrape_timeout
  • 太多资源开销: Significant resource demands on the host
  • 禁用golang sdk 指标

  • 使用 --web.disable-exporter-metrics
  • promhttp_ 代表访问/metrics 的http情况
  • [root@prome_master_01 tgzs]# curl  -s  localhost:9100/metrics |grep promhttp_
    # HELP promhttp_metric_handler_errors_total Total number of internal errors encountered by the promhttp metric handler.
    # TYPE promhttp_metric_handler_errors_total counter
    promhttp_metric_handler_errors_total{cause="encoding"} 0
    promhttp_metric_handler_errors_total{cause="gathering"} 0
    # HELP promhttp_metric_handler_requests_in_flight Current number of scrapes being served.
    # TYPE promhttp_metric_handler_requests_in_flight gauge
    promhttp_metric_handler_requests_in_flight 1
    # HELP promhttp_metric_handler_requests_total Total number of scrapes by HTTP status code.
    # TYPE promhttp_metric_handler_requests_total counter
    promhttp_metric_handler_requests_total{code="200"} 8
    promhttp_metric_handler_requests_total{code="500"} 0
    promhttp_metric_handler_requests_total{code="503"} 0
    
  • go_代表 goruntime 信息等
  • # HELP go_goroutines Number of goroutines that currently exist.
    # TYPE go_goroutines gauge
    go_goroutines 7
    # HELP go_info Information about the Go environment.
    # TYPE go_info gauge
    go_info{version="go1.15.8"} 1
    # HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.
    # TYPE go_memstats_alloc_bytes gauge
    go_memstats_alloc_bytes 2.781752e+06
    
  • process_代表 进程信息等
  • # HELP process_cpu_seconds_total Total user and system CPU time spent in seconds.
    # TYPE process_cpu_seconds_total counter
    process_cpu_seconds_total 0.54
    # HELP process_max_fds Maximum number of open file descriptors.
    # TYPE process_max_fds gauge
    process_max_fds 1024
    # HELP process_open_fds Number of open file descriptors.
    # TYPE process_open_fds gauge
    process_open_fds 9
    # HELP process_resident_memory_bytes Resident memory size in bytes.
    # TYPE process_resident_memory_bytes gauge
    process_resident_memory_bytes 1.5720448e+07
    

    节点上自打点数据上报

  • --collector.textfile.directory="" 配置本地采集目录
  • 在采集目录里创建.prom文件,格式说明
  • # 创建目录
    mkdir ./text_file_dir
    # 准备 prom文件
    cat <<EOF > ./text_file_dir/test.prom
    # HELP nyy_test_metric just test
    # TYPE nyy_test_metric gauge
    nyy_test_metric{method="post",code="200"} 1027
    EOF
    # 启动服务
    ./node_exporter --collector.textfile.directory=./text_file_dir
    # curl查看数据
    [root@prome_master_01 tgzs]# curl  -s  localhost:9100/metrics |grep nyy
    # HELP nyy_test_metric just test
    # TYPE nyy_test_metric gauge
    nyy_test_metric{code="200",method="post"} 1027
    

    http传入参数,按采集器过滤指标

  • 原理: 通过http请求参数过滤采集器
  • func (h *handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
        filters := r.URL.Query()["collect[]"]
        level.Debug(h.logger).Log("msg", "collect query:", "filters", filters)
        if len(filters) == 0 {
            // No filters, use the prepared unfiltered handler.
            h.unfilteredHandler.ServeHTTP(w, r)
            return
        }
        // To serve filtered metrics, we create a filtering handler on the fly.
        filteredHandler, err := h.innerHandler(filters...)
        if err != nil {
            level.Warn(h.logger).Log("msg", "Couldn't create filtered metrics handler:", "err", err)
            w.WriteHeader(http.StatusBadRequest)
            w.Write([]byte(fmt.Sprintf("Couldn't create filtered metrics handler: %s", err)))
            return
        }
        filteredHandler.ServeHTTP(w, r)
    }
    
  • http访问
  • # 只看cpu采集器的指标
    http://192.168.0.112:9100/metrics?collect[]=cpu
    # 只看cpu和mem采集器的指标
    http://192.168.0.112:9100/metrics?collect[]=cpu&collect[]=meminfo
    
  • prometheus配置
  •   params:
        collect[]:
          - cpu
          - meminfo
    
  • 和prometheus relabel_config的区别 : 按采集器过滤 VS 按metric_name 或label过滤
  • 导入dashboard商城中的node_exporter模板

  • 地址 grafana.com/grafana/das…
  • node_cpu_seconds_total{mode="user"}
    node_cpu_seconds_total{cpu="0", instance="172.20.70.205:9100", job="prometheus", mode="user"}
    53.43
    node_cpu_seconds_total{cpu="0", instance="172.20.70.215:9100", job="prometheus", mode="user"}
    node_cpu_seconds_total{cpu="1", instance="172.20.70.205:9100", job="prometheus", mode="user"}
    28.96
    node_cpu_seconds_total{cpu="1", instance="172.20.70.215:9100", job="prometheus", mode="user"}
    12.32
    node_cpu_seconds_total{cpu="2", instance="172.20.70.205:9100", job="prometheus", mode="user"}
    31.54
    node_cpu_seconds_total{cpu="2", instance="172.20.70.215:9100", job="prometheus", mode="user"}
    node_cpu_seconds_total{cpu="3", instance="172.20.70.205:9100", job="prometheus", mode="user"}
    53.88
    node_cpu_seconds_total{cpu="3", instance="172.20.70.215:9100", job="prometheus", mode="user"}
    

    prometheus 查询数据及数据概念

    prometheus 基本概念

    sample 数据点

    type sample struct {
        t int64
        v float64
    }
    
  • sample代表一个数据点
  • size:16byte: 包含 1个8byte int64时间戳和1个8byte float64 value
  • Label 标签

    type Label struct {
        Name, Value string
    }
    
  • 一对label 比如 cpu="0" mode="user"
  • Labels 标签组

    type Labels []Label
    
  • 就是metric 一个指标的所有tag values
  • prometheus四种查询类型

    即时向量 Instant vector : 一组时间序列,每个时间序列包含一个样本,所有样本共享相同的时间戳

    在prometheus页面上就是table查询 ,对应查询接口 /api/v1/query

    范围向量 Range vector : 一组时间序列,每个时间序列包含一段时间范围内的多个样本

    在prometheus页面上就是graph查询 ,对应查询接口 /api/v1/query_range

    Matrix 矩阵

    type Matrix []Series
    
  • Matrix是series的切片,一般的range_query返回的结果
  • counter 计数器代表一个单调递增的累积指标,其值只能增加,或在重新启动时归零。例如,您可以使用计数器来表示已服务请求,已完成任务或错误的数量。
  • http_request_total
    
  • histogram 直方图对观测值(通常是请求持续时间或响应大小之类的东西)进行采样,并将其计入可配置的桶中。它还提供所有观察值的总和。
  • # http所有接口 总的95分位值
    # sum/count 可以算平均值
    prometheus_http_request_duration_seconds_sum/ prometheus_http_request_duration_seconds_count
    # histogram_quantile(0.95, sum(rate(prometheus_http_request_duration_seconds_bucket[5m])) by (le,handler))
    histogram_quantile(0.95, sum(rate(prometheus_http_request_duration_seconds_bucket[1m])) by (le))
    # range_query接口的95分位值
    histogram_quantile(0.95, sum(rate(prometheus_http_request_duration_seconds_bucket{handler="/api/v1/query_range"}[5m])) by (le))
    
  • summary 摘要会采样观察值(通常是请求持续时间和响应大小之类的东西)。它除了提供观测值的总数和所有观测值的总和之外,还可以计算滑动时间窗口内的可配置分位数。
  • # gc耗时
    # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
    # TYPE go_gc_duration_seconds summary
    go_gc_duration_seconds{quantile="0"} 0.000135743
    go_gc_duration_seconds{quantile="0.25"} 0.000872805
    go_gc_duration_seconds{quantile="0.5"} 0.000965516
    go_gc_duration_seconds{quantile="0.75"} 0.001055636
    go_gc_duration_seconds{quantile="1"} 0.006464756
    # summary 平均值
    go_gc_duration_seconds_sum /go_gc_duration_seconds_count
    

    范围向量选择器 Range Vector Selectors

  • 范围矢量的工作方式与即时矢量一样,不同之处在于它们从当前时刻向前选择了一定范围的样本。语法上,将持续时间附加在向量选择器末尾的方括号([])中,以指定应为每个结果范围向量元素提取多远的时间值。
  • rate、irate、increase 等函数只能作用在counter
  • ms -毫秒
    s - 秒
    m - 分钟
    h - 小时
    d -天-假设一天总是24小时
    w -周-假设一周始终为7天
    y -年-假设一年始终为365天
    

    直接查询报错 node_network_receive_bytes_total{device!="lo"}[1m]

    Error executing query: invalid expression type "range vector" for range query, must be Scalar or instant Vector
    

    需要叠加一个作用于范围向量的函数 如 rate irate delta idelta sum_over_time 等

  • 计算网卡入流量 rate(node_network_receive_bytes_total{device!="lo"}[1m])
  • 时间范围 ,不能低于采集间隔

    采集30秒 ,查询10秒则无数据(10秒的窗口内最多只有一个样本,而rate至少需要两个样本才能计算)

    rate(node_network_receive_bytes_total{device!="lo"}[10s])

    分类:
    前端
    • 991
  •