Масштабируемый мониторинг: Настраиваем VictoriaMetrics в HA-конфигурации с VMAgent и Grafana / Хабр

Когда ваш стек мониторинга перерастает масштаб нескольких серверов, классический Prometheus показывает свои ограничения:

Проблемы с производительностью при миллионах метрик
Вертикальное масштабирование
Сложности с долгосрочным хранением
Ограниченные возможности репликации

Страшно, очень страшно, если бы мы знали, что это

Сегодня настроим высокодоступную и масштабируемую систему на основе VictoriaMetrics — современной time-series базы данных, совместимой с PromQL.

⠀⠀⠀⠀⠀⠀⠀⠀

Архитектура:

⠀⠀⠀⠀⠀⠀⠀⠀

Что получим:

Горизонтальное масштабирование
Автоматическое переключение при сбоях
Эффективное долгосрочное хранение
Совместимость с существующими Prometheus-конфигами

⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀

Шаг 1: Подготовка инфраструктуры

Разворачиваем на трёх нодах с меткой monitoring: true. Сначала создаём StorageClass для быстрых SSD-дисков, на которых будут храниться данные:

# k8s/storage-class.yaml
# StorageClass for fast gp3 EBS volumes with explicit IOPS and throughput.
# The gp3 type and the `throughput` parameter are handled by the AWS EBS CSI
# driver (ebs.csi.aws.com), which must be installed in the cluster; the in-tree
# kubernetes.io/aws-ebs provisioner is deprecated.
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: fast-ssd
provisioner: ebs.csi.aws.com
parameters:
  type: gp3
  iops: "3000"
  throughput: "125"
# EBS volumes are zonal: delay provisioning until a pod is scheduled so the
# volume is created in the same availability zone as the node.
volumeBindingMode: WaitForFirstConsumer
allowVolumeExpansion: true

⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀

Шаг 2: Устанавливаем VictoriaMetrics Cluster

# k8s/victoriametrics/vmstorage.yaml
# vmstorage: the stateful storage tier of the VictoriaMetrics cluster.
# Runs 3 replicas with stable pod names (vmstorage-0..2), each with its own
# 500Gi volume from the fast-ssd StorageClass.
# NOTE(review): a headless Service named `vmstorage` is expected to exist so
# that the per-pod DNS names resolve — confirm it is created elsewhere.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: vmstorage
  namespace: monitoring
spec:
  serviceName: vmstorage
  replicas: 3
  selector:
    matchLabels:
      app: vmstorage
  template:
    metadata:
      labels:
        app: vmstorage
    spec:
      containers:
      - name: storage
        image: victoriametrics/vmstorage:v1.93.4-cluster
        args:
        # A retention value without a unit suffix is interpreted as months.
        - -retentionPeriod=12
        - -storageDataPath=/storage
        # Allow flags to be overridden through VM_-prefixed environment variables.
        - -envflag.enable=true
        - -envflag.prefix=VM_
        - -loggerFormat=json
        # vmstorage listens on three separate ports: HTTP (health checks and
        # /metrics), one accepting writes from vminsert and one serving
        # vmselect queries. The values below are the defaults, made explicit.
        - -httpListenAddr=:8482
        - -vminsertAddr=:8400
        - -vmselectAddr=:8401
        ports:
        - name: http
          containerPort: 8482
        - name: vminsert
          containerPort: 8400
        - name: vmselect
          containerPort: 8401
        volumeMounts:
        - name: storage
          mountPath: /storage
        resources:
          requests:
            memory: 4Gi
            cpu: "1"
          limits:
            memory: 8Gi
            cpu: "2"
        livenessProbe:
          httpGet:
            path: /health
            port: http
          initialDelaySeconds: 30
          timeoutSeconds: 10
        readinessProbe:
          httpGet:
            path: /health
            port: http
          initialDelaySeconds: 5
          timeoutSeconds: 5
  volumeClaimTemplates:
  - metadata:
      name: storage
    spec:
      storageClassName: fast-ssd
      accessModes:
      - ReadWriteOnce
      resources:
        requests:
          storage: 500Gi

# k8s/victoriametrics/vmselect.yaml
# vmselect: the stateless query tier. Two replicas fan queries out to every
# vmstorage pod listed in -storageNode and serve the Prometheus-compatible
# query API on port 8481.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vmselect
  namespace: monitoring
spec:
  replicas: 2
  selector:
    matchLabels:
      app: vmselect
  template:
    metadata:
      labels:
        app: vmselect
    spec:
      containers:
      - name: select
        image: victoriametrics/vmselect:v1.93.4-cluster
        args:
        # Allow flags to be overridden through VM_-prefixed environment variables.
        - -envflag.enable=true
        - -envflag.prefix=VM_
        - -loggerFormat=json
        # HTTP API address. -clusternativeListenAddr is intentionally not set:
        # it is only needed for multi-level cluster setups and must not share
        # a port with the HTTP listener.
        - -httpListenAddr=:8481
        # vmselect talks to vmstorage on its dedicated vmselect port (8401),
        # not on the vmstorage HTTP port (8482).
        - -storageNode=vmstorage-0.vmstorage.monitoring.svc.cluster.local:8401
        - -storageNode=vmstorage-1.vmstorage.monitoring.svc.cluster.local:8401
        - -storageNode=vmstorage-2.vmstorage.monitoring.svc.cluster.local:8401
        ports:
        - name: http
          containerPort: 8481
        resources:
          requests:
            memory: 2Gi
            cpu: "500m"
          limits:
            memory: 4Gi
            cpu: "1"
        livenessProbe:
          httpGet:
            path: /health
            port: http
          initialDelaySeconds: 30
        readinessProbe:
          httpGet:
            path: /health
            port: http
          initialDelaySeconds: 5

⠀⠀⠀⠀⠀⠀⠀⠀

Шаг 3: Настраиваем VMAgent с продвинутым service discovery

# k8s/victoriametrics/vmagent.yaml
# vmagent: scrapes targets and forwards samples to vminsert.
# Runs as a StatefulSet (not a Deployment) because the scrape cluster needs a
# stable member number per replica: vmagent accepts a StatefulSet pod name
# (vmagent-0, vmagent-1) as -promscrape.cluster.memberNum and uses its numeric
# suffix. Deployment pod names end in a random hash and cannot be used this way.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: vmagent
  namespace: monitoring
spec:
  # Governing Service name; required for a StatefulSet.
  serviceName: vmagent
  # Keep in sync with -promscrape.cluster.membersCount below.
  replicas: 2
  selector:
    matchLabels:
      app: vmagent
  template:
    metadata:
      labels:
        app: vmagent
    spec:
      serviceAccountName: vmagent
      containers:
      - name: agent
        image: victoriametrics/vmagent:v1.93.4
        args:
        - -promscrape.config=/etc/vmagent/scrape.yaml
        - -remoteWrite.url=http://vminsert.monitoring.svc.cluster.local:8480/insert/0/prometheus
        # Size flags accept KB/MB/GB/TB (and KiB/MiB/...) suffixes.
        - -remoteWrite.maxBlockSize=4MB
        - -remoteWrite.showURL=true
        - -promscrape.suppressScrapeErrors=true
        # Targets are sharded across the two members; with replicationFactor=1
        # each target is scraped by exactly one member.
        - -promscrape.cluster.membersCount=2
        - -promscrape.cluster.memberNum=$(POD_INDEX)
        - -promscrape.cluster.replicationFactor=1
        ports:
        - name: http
          containerPort: 8429
        volumeMounts:
        - name: config
          mountPath: /etc/vmagent
        env:
        # Holds the pod name (e.g. vmagent-0), exposed via the downward API.
        - name: POD_INDEX
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
              apiVersion: v1
        resources:
          requests:
            memory: 1Gi
            cpu: "500m"
          limits:
            memory: 2Gi
            cpu: "1"
        livenessProbe:
          httpGet:
            path: /health
            port: http
      volumes:
      # Provides /etc/vmagent/scrape.yaml from the vmagent-config ConfigMap.
      - name: config
        configMap:
          name: vmagent-config
---
# Scrape configuration for vmagent, stored under the scrape.yaml key.
# Environment variables inside the file are referenced with vmagent's
# %{NAME} syntax; the Kubernetes $(NAME) form is only expanded in container
# command/args/env and would otherwise end up as a literal label value.
apiVersion: v1
kind: ConfigMap
metadata:
  name: vmagent-config
  namespace: monitoring
data:
  scrape.yaml: |
    global:
      scrape_interval: 30s
      external_labels:
        cluster: 'production'
        replica: '%{POD_INDEX}'

    scrape_configs:
    - job_name: 'kubernetes-nodes'
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)

    - job_name: 'kubernetes-pods'
      kubernetes_sd_configs:
      - role: pod
      metrics_path: /metrics
      relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name

    # Кастомный job для мониторинга бизнес-метрик
    - job_name: 'business-metrics'
      static_configs:
      - targets: ['business-api:8080']
      metrics_path: /actuator/prometheus
      scrape_interval: 15s

⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀

Шаг 4: Настройка алертинга с VMAlert

# k8s/victoriametrics/vmalert.yaml
# vmalert: evaluates the rule files mounted at /etc/vmalert/rules against
# vmselect every 30s, sends firing alerts to Alertmanager and writes
# alert state / recording rule results back through vminsert.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vmalert
  namespace: monitoring
spec:
  replicas: 2
  selector:
    matchLabels:
      app: vmalert
  template:
    metadata:
      labels:
        app: vmalert
    spec:
      containers:
      - name: alert
        image: victoriametrics/vmalert:v1.93.4
        args:
        - -datasource.url=http://vmselect.monitoring.svc.cluster.local:8481/select/0/prometheus
        - -notifier.url=http://alertmanager.monitoring.svc.cluster.local:9093
        - -remoteWrite.url=http://vminsert.monitoring.svc.cluster.local:8480/insert/0/prometheus
        - -rule=/etc/vmalert/rules/*.yaml
        - -evaluationInterval=30s
        - -external.alert.source=VictoriaMetrics
        ports:
        # vmalert's HTTP port (API and /metrics).
        - name: http
          containerPort: 8880
        volumeMounts:
        - name: rules
          mountPath: /etc/vmalert/rules
        resources:
          requests:
            memory: 1Gi
            cpu: "500m"
      volumes:
      # Backs the `rules` mount above; without it the pod spec is rejected
      # because the volumeMount references an undefined volume.
      - name: rules
        configMap:
          name: vmalert-rules
---
# Alerting rules for vmalert; each key becomes one rule file.
apiVersion: v1
kind: ConfigMap
metadata:
  name: vmalert-rules
  namespace: monitoring
data:
  infrastructure.yaml: |
    groups:
    - name: infrastructure
      rules:
      - alert: NodeDown
        expr: up{job="kubernetes-nodes"} == 0
        for: 5m
        labels:
          severity: critical
          tier: infrastructure
        annotations:
          summary: "Node {{ $labels.instance }} is down"
          description: "Node {{ $labels.instance }} has been down for more than 5 minutes"

      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is {{ $value | humanize }}%"

  # HighErrorRate aggregates both sides of the division by `service`.
  # Dividing the raw series would only match series that carry a 5xx `status`
  # label on both sides, so the ratio would always be 100%.
  business.yaml: |
    groups:
    - name: business
      rules:
      - alert: HighErrorRate
        expr: sum by (service) (rate(http_requests_total{status=~"5.."}[5m])) / sum by (service) (rate(http_requests_total[5m])) * 100 > 5
        for: 5m
        labels:
          severity: critical
          tier: application
        annotations:
          summary: "High error rate on {{ $labels.service }}"
          description: "Error rate is {{ $value | humanize }}%"

⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀

Шаг 5: Настройка Grafana для работы с VMSelect

# k8s/grafana/datasources.yaml
# ConfigMap carrying a Grafana datasource definition file.
# It declares a single default datasource of type `prometheus`, named
# VictoriaMetrics, that points at the /select/0/prometheus path of vmselect
# and is accessed in proxy mode with POST queries.
# NOTE(review): presumably mounted into Grafana's datasource provisioning
# directory — confirm in the Grafana Deployment, which is not shown here.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasources
  namespace: monitoring
data:
  victoriametrics.yaml: |
    apiVersion: 1
    datasources:
    - name: VictoriaMetrics
      type: prometheus
      url: http://vmselect.monitoring.svc.cluster.local:8481/select/0/prometheus
      access: proxy
      isDefault: true
      jsonData:
        timeInterval: 30s
        queryTimeout: 60s
        httpMethod: POST
        manageAlerts: true
        disableMetricsLookup: false

⠀⠀⠀⠀⠀⠀⠀⠀⠀⠀

Шаг 6: Мониторинг самого мониторинга

# k8s/victoriametrics/self-monitoring.yaml
# Scrape job that collects /metrics from the monitoring components themselves.
# vmstorage pods are addressed as <pod>.<governing-service> because a bare
# StatefulSet pod name (vmstorage-0) is not resolvable through cluster DNS.
# NOTE(review): the other targets assume Services named vmselect, vminsert,
# vmagent and vmalert exist in this namespace — confirm; they are not shown.
apiVersion: v1
kind: ConfigMap
metadata:
  name: vm-self-monitoring
  namespace: monitoring
data:
  self-scrape.yaml: |
    - job_name: 'victoriametrics'
      static_configs:
      - targets:
        - 'vmselect:8481'
        - 'vminsert:8480'
        - 'vmstorage-0.vmstorage:8482'
        - 'vmstorage-1.vmstorage:8482'
        - 'vmstorage-2.vmstorage:8482'
        - 'vmagent:8429'
        - 'vmalert:8880'
      metrics_path: /metrics
      scrape_interval: 30s

⠀⠀⠀⠀⠀⠀

Проверка работы:

# Check that metrics are being ingested: list the metric names known to vmselect.
# port-forward blocks the terminal, so run it in the background.
kubectl port-forward svc/vmselect 8481:8481 -n monitoring &
VMSELECT_PF_PID=$!
sleep 2  # give the tunnel a moment to come up
curl "http://localhost:8481/select/0/prometheus/api/v1/label/__name__/values"

# Look at vmagent's own metrics
curl "http://localhost:8481/select/0/prometheus/api/v1/query?query=vmagent_remotewrite_blocks_sent_total"
kill "$VMSELECT_PF_PID"

# Check the alerts
kubectl port-forward svc/vmalert 8880:8880 -n monitoring &
VMALERT_PF_PID=$!
sleep 2
curl "http://localhost:8880/api/v1/alerts"
kill "$VMALERT_PF_PID"

⠀⠀⠀⠀⠀⠀⠀⠀

Заключение:

Мы развернули высокодоступную, масштабируемую систему мониторинга, которая:

Обрабатывает миллионы метрик в реальном времени
Автоматически распределяет нагрузку между нодами
Обеспечивает отказоустойчивость на всех уровнях
Поддерживает сложные правила алертинга
Интегрируется с существующей экосистемой Prometheus

Такой стек способен выдержать нагрузку крупного production-окружения и предоставляет надежную основу для observability вашей инфраструктуры.

Есть вопросы по тонкой настройке или конкретным кейсам? Пишите в комментариях :-)