Quick Guide for Datadog Agent on Kubernetes for Metrics

  1. Add the Datadog Helm repository.

    # Register the Datadog chart repository under the local name "datadog".
    helm repo add datadog https://helm.datadoghq.com
    # Refresh the local chart index so the newly added repository is usable.
    helm repo update
  2. Configure datadog-values.yaml.

    For pod-level metrics, add Datadog annotations to enable scraping.

  3. Set the base configuration.

    # Base values: chart name overrides plus the endpoint the agent sends data to.
    fullnameOverride: "kfuse-agent"
    nameOverride: "kfuse-agent"
    datadog:
      # Placeholder value (a single space).
      # NOTE(review): confirm whether the receiving ingester validates the key.
      apiKey: " "
      #
      # Scenario default: the Kloudfuse stack and the agent are in the same VPC and the same K8s cluster.
      #
      dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
      #
      # Scenario 1: the Kloudfuse stack and the agent are in the same VPC but in different K8s clusters.
      #
      # dd_url: "http://<ingress-ip>/ingester"
      #
      # Scenario 2: the Kloudfuse stack is hosted in a different VPC (hosted at "customer.kloudfuse.io").
      #
      # dd_url: "https://customer.kloudfuse.io/ingester"
      #
    yaml
  4. Enable metrics ingestion.

    # Full values file: repeats the name overrides from the base configuration.
    fullnameOverride: "kfuse-agent"
    nameOverride: "kfuse-agent"
    datadog:
      # Log collection is switched on through both the top-level flag and the `logs` section.
      logsEnabled: true
      logs:
        enabled: true
        containerCollectAll: true
        containerCollectUsingFiles: true
        autoMultiLineDetection: true
      #
      # Scenario default: the Kloudfuse stack and the agent are in the same VPC and the same K8s cluster.
      #
      dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
      #
      # Scenario 1: the Kloudfuse stack and the agent are in the same VPC but in different K8s clusters.
      #
      # dd_url: "http://<ingress-ip>/ingester"
      #
      # Scenario 2: the Kloudfuse stack is hosted in a different VPC (hosted at "customer.kloudfuse.io").
      #
      # dd_url: "https://customer.kloudfuse.io/ingester"
      #
      # The legacy kube-state-metrics flag is off; the "core" variant is enabled instead.
      kubeStateMetricsEnabled: false
      kubeStateMetricsCore:
        enabled: true
        ignoreLegacyKSMCheck: true
      networkMonitoring:
        enabled: false
      orchestratorExplorer:
        enabled: true
      # NOTE(review): a literal key is written into the guide. Confirm it is a
      # non-secret placeholder; otherwise supply the key at install time instead.
      apiKey: "798999171d8a0e80030b20366f2994f4"
      # Extra environment variables passed to the agent containers.
      env:
        - name: DD_CHECKS_TAG_CARDINALITY
          value: "orchestrator"
        - name: DD_CONTAINER_INCLUDE
          value: "kube_namespace:.*"
        - name: DD_CONTAINER_INCLUDE_LOGS
          value: "kube_namespace:.*"
        - name: DD_CONTAINER_EXCLUDE_METRICS
          value: "name:gclog.*"
        # The two values below are JSON documents. They are single-quoted so the
        # embedded double quotes do not terminate the YAML scalar.
        - name: DD_KUBERNETES_POD_LABELS_AS_TAGS
          value: '{"*":"%%label%%"}'
        - name: DD_KUBERNETES_NODE_LABELS_AS_TAGS
          value: '{"*":"%%label%%"}'
        - name: DD_LOGS_CONFIG_AUTO_MULTI_LINE_DEFAULT_MATCH_THRESHOLD
          value: "0.01"
        # Space-separated list of timestamp regexes that mark the start of a log entry.
        # Single quotes keep the backslashes literal (\d = digit, \s = whitespace, \. = dot).
        - name: "DD_LOGS_CONFIG_AUTO_MULTI_LINE_EXTRA_PATTERNS"
          value: '^\d{4}/\d{2}/\d{2}\s\d{2}:\d{2}:\d{2}(\.\d+)? ^\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}(\.\d+)? ^\d+:\d+:\d+(\.\d+)?'
      # Trace collection is enabled and its port is exposed.
      apm:
        enabled: true
        portEnabled: true
      # NOTE(review): 8126 is normally the trace-agent port and DogStatsD usually
      # listens on 8125 - confirm this value is intentional.
      dogstatsd:
        port: 8126
        useHostPort: false
      processAgent:
        enabled: true
      # Prometheus/OpenMetrics scraping with two additional configurations that
      # differ only in their autodiscovery rule and collection interval.
      prometheusScrape:
        enabled: true
        version: 1
        additionalConfigs:
          # Config 1: 15-second interval; its annotation rule lists `app: knight`
          # and `prometheus.io/scrape: "false"` under `exclude`.
          - configurations:
            - send_monotonic_counter: false
              send_distribution_counts_as_monotonic: false
              send_distribution_sums_as_monotonic: false
              send_histograms_buckets: true
              max_returned_metrics: 999999
              min_collection_interval: 15
            autodiscovery:
              kubernetes_annotations:
                exclude:
                  app: knight
                  prometheus.io/scrape: "false"
          # Config 2: 60-second interval, selected by the container name `knight`.
          - configurations:
            - send_monotonic_counter: false
              send_distribution_counts_as_monotonic: false
              send_distribution_sums_as_monotonic: false
              send_histograms_buckets: true
              max_returned_metrics: 999999
              min_collection_interval: 60
            autodiscovery:
              kubernetes_container_names:
                - knight
      checksd:
        kf_openmetrics.py: |-
          from datadog_checks.base import AgentCheck, OpenMetricsBaseCheckV2
          import os, requests, json, time
    
          # Field names for a metric metadata record (not referenced by the check code visible here).
          col_names = ['metric_name', 'metric_type', 'description']

          # Prometheus-style -> Datadog-style conversions. For each Prometheus metric
          # type, maps a name suffix to the Datadog type of the derived metric and
          # the text prepended to the source metric's description.
          METRIC_TYPE_EXPANSIONS = {
              # Derived metrics of a histogram.
              'histogram': {
                  'count': {'type': 'count', 'desc_prefix': 'Count of'},
                  'sum': {'type': 'count', 'desc_prefix': 'Sum of'},
                  'bucket': {'type': 'count', 'desc_prefix': 'Buckets of samples of'},
                  'min': {'type': 'gauge', 'desc_prefix': 'Minimum value of'},
                  'max': {'type': 'gauge', 'desc_prefix': 'Maximum value of'},
              },
              # Derived metrics of a summary.
              'summary': {
                  'count': {'type': 'count', 'desc_prefix': 'Count of'},
                  'sum': {'type': 'count', 'desc_prefix': 'Sum of'},
                  'quantile': {'type': 'gauge', 'desc_prefix': 'Various quantile values of'},
              },
          }

          # Line prefixes of the metadata comments in the exposition text.
          HELP_TEXT = '# HELP'
          TYPE_TEXT = '# TYPE'
          # Endpoint template with unresolved %%host%% / %%port%% placeholders.
          DEFAULT_URL = 'http://%%host%%:%%port%%/metrics'
          class KfOpenMetricsCheck(AgentCheck):
            """Agent check that publishes OpenMetrics HELP/TYPE metadata as an event.

            On each run it scrapes the instance's `openmetrics_endpoint`, builds a
            {metric_name: {'desc': ..., 'type': ...}} mapping from the '# HELP' and
            '# TYPE' lines, and sends that mapping JSON-encoded in an
            "OpenMetricsMetadata" event.
            """

            def __init__(self, name, init_config, instances):
              super(KfOpenMetricsCheck, self).__init__(name, init_config, instances)

            def get_default_config(self):
              """Return the default configuration, which is empty."""
              return {}

            def scrape_endpoint(self, scrape_url, timeout=10):
              """Fetch `scrape_url` and return the response body decoded as UTF-8.

              `timeout` (seconds) bounds the HTTP request so that an unresponsive
              endpoint cannot block the check indefinitely.

              Raises Exception when the response status code is not 200.
              """
              response = requests.get(scrape_url, timeout=timeout)
              if response.status_code != 200:
                # Interpolate explicitly: Exception() does not %-format its arguments.
                raise Exception('failed to scrape endpoint: %s (error: %s)' % (scrape_url, response))
              return response.content.decode('utf-8')

            def prepare_mds(self, data):
              """Build the metadata mapping from OpenMetrics exposition text.

              Returns a dict keyed by metric name. Each value holds 'desc' (from a
              '# HELP' line) and/or 'type' (from a '# TYPE' line). For every type
              listed in METRIC_TYPE_EXPANSIONS, one extra '<name>_<suffix>' entry is
              added per expansion, carrying the converted type and the prefixed
              description. Metadata lines without a metric name are ignored.
              """
              mds = {}

              def _get_info(line, prefix_text):
                # Split "<prefix> <metric_name> <rest...>" into (metric_name, rest).
                # Returns None when nothing follows the prefix.
                items = line[len(prefix_text):].split()
                if not items:
                  return None
                return items[0], ' '.join(items[1:])

              def _add_info(metric_name, info, value):
                mds.setdefault(metric_name, {})[info] = value

              def _expand_info(metric_name):
                extensions = METRIC_TYPE_EXPANSIONS.get(mds[metric_name]['type'])
                if not extensions:
                  return
                # Use this metric's own description. It is empty when no HELP line
                # preceded the TYPE line, rather than a previous metric's text.
                desc = mds[metric_name].get('desc', '')
                for ext_name, extension in extensions.items():
                  prefix = extension.get('desc_prefix', '')
                  suffix = extension.get('desc_suffix', '')
                  mds['%s_%s' % (metric_name, ext_name)] = {
                    'type': extension['type'],
                    'desc': ('%s %s %s' % (prefix, desc, suffix)).strip(),
                  }

              for line in data.splitlines():
                if line.startswith(HELP_TEXT):
                  info = _get_info(line, HELP_TEXT)
                  if info is not None:
                    _add_info(info[0], 'desc', info[1])
                elif line.startswith(TYPE_TEXT):
                  info = _get_info(line, TYPE_TEXT)
                  if info is not None:
                    _add_info(info[0], 'type', info[1])
                    _expand_info(info[0])
              return mds

            def check(self, instance):
              """Scrape the instance's endpoint and emit its metadata as an event.

              Instances whose `openmetrics_endpoint` is missing, empty, or still
              equal to the unresolved template DEFAULT_URL are skipped.
              """
              scrape_url = ''
              try:
                scrape_url = instance.get('openmetrics_endpoint')
                if not scrape_url or scrape_url == DEFAULT_URL:
                  self.log.info("skipping kf_openmetrics check for instance {}".format(instance))
                  return
                self.log.info("running kf_openmetrics check for instance {}".format(instance))
                data = self.scrape_endpoint(scrape_url)
                mds = self.prepare_mds(data)
                self.log.debug("prepared mds len:{}".format(len(mds)))
                self.event(
                  {
                    "timestamp": time.time(),
                    "event_type": "OpenMetricsMetadata",
                    "msg_title" : "OpenMetricsMetadata",
                    "msg_text": json.dumps(mds),
                  }
                )
              except KeyError as ke:
                self.log.info("failed to scrape url={}, exc={}".format(scrape_url, ke))
              except requests.exceptions.InvalidURL as ie:
                # Log the exception bound by this handler; `ke` is not defined here.
                self.log.info("invalid url={}, instance={}, exc={}".format(scrape_url, instance, ie))
      confd:
        # go_expvar check configuration: reads the expvar endpoint below and
        # reports the listed paths under the `datadog.agent` namespace.
        go_expvar.yaml: |-
          init_config:
          instances:
            - expvar_url: http://localhost:6000/debug/vars  # if you've defined `expvar_port` in datadog.yaml, change the port here to that value
              namespace: datadog.agent
              metrics:
                # forwarder/* paths
                - path: forwarder/Transactions/RetryQueueSize
                - path: forwarder/Transactions/Success
                  type: rate
                - path: forwarder/Transactions/Errors
                  type: rate
                - path: forwarder/Transactions/ErrorsByType/DNSErrors
                  type: rate
                - path: forwarder/Transactions/ErrorsByType/TLSErrors
                  type: rate
                - path: forwarder/Transactions/ErrorsByType/ConnectionErrors
                  type: rate
                - path: forwarder/Transactions/ErrorsByType/WroteRequestErrors
                  type: rate
                - path: forwarder/Transactions/ErrorsByType/SentRequestErrors
                  type: rate
                - path: forwarder/Transactions/HTTPErrors
                  type: rate
                # dogstatsd* paths
                - path: dogstatsd-udp/Packets
                  type: rate
                - path: dogstatsd-udp/PacketReadingErrors
                  type: rate
                - path: dogstatsd-uds/Packets
                  type: rate
                - path: dogstatsd-uds/PacketReadingErrors
                  type: rate
                - path: dogstatsd-uds/OriginDetectionErrors
                  type: rate
                - path: dogstatsd/ServiceCheckParseErrors
                  type: rate
                - path: dogstatsd/ServiceCheckPackets
                  type: rate
                - path: dogstatsd/EventParseErrors
                  type: rate
                - path: dogstatsd/EventPackets
                  type: rate
                - path: dogstatsd/MetricParseErrors
                  type: rate
                - path: dogstatsd/MetricPackets
                  type: rate
                # aggregator/* paths
                - path: aggregator/Flush/ChecksMetricSampleFlushTime/LastFlush
                - path: aggregator/Flush/ServiceCheckFlushTime/LastFlush
                - path: aggregator/Flush/EventFlushTime/LastFlush
                - path: aggregator/Flush/MetricSketchFlushTime/LastFlush
                - path: aggregator/Flush/MainFlushTime/LastFlush
                - path: aggregator/FlushCount/ServiceChecks/LastFlush
                - path: aggregator/FlushCount/Series/LastFlush
                - path: aggregator/FlushCount/Events/LastFlush
                - path: aggregator/FlushCount/Sketches/LastFlush
                - path: aggregator/DogstatsdContexts
                - path: aggregator/SeriesFlushed
                  type: rate
                - path: aggregator/ServiceCheckFlushed
                  type: rate
                - path: aggregator/EventsFlushed
                  type: rate
                - path: aggregator/NumberOfFlush
                  type: rate
                - path: aggregator/DogstatsdMetricSample
                  type: rate
                - path: aggregator/ChecksMetricSample
                  type: rate
                - path: aggregator/ServiceCheck
                  type: rate
                - path: aggregator/Event
                  type: rate
                - path: aggregator/HostnameUpdate
                  type: rate
                # scheduler/* paths; the wildcard entries are reported under one alias each
                - path: scheduler/ChecksEntered
                - path: scheduler/Queues/.*/Size
                  alias: scheduler.queues.size
                - path: scheduler/Queues/.*/Interval
                  alias: scheduler.queues.interval
                - path: scheduler/QueuesCount
                # splitter/* paths
                - path: splitter/NotTooBig
                  type: rate
                - path: splitter/TooBig
                  type: rate
                - path: splitter/TotalLoops
                  type: rate
                - path: splitter/PayloadDrops
                  type: rate
                # logs-agent/* paths
                - path: logs-agent/IsRunning
                  type: gauge
                - path: logs-agent/DestinationErrors
                  type: rate
                - path: logs-agent/LogsDecoded
                  type: rate
                - path: logs-agent/LogsProcessed
                  type: rate
                - path: logs-agent/LogsSent
                  type: rate
                - path: logs-agent/BytesSent
                  type: rate
                - path: logs-agent/EncodedBytesSent
                  type: rate
                # Per-destination stats for logs_0..logs_3: every destination shares the
                # same two aliases and is distinguished by its `sender:<n>` tag.
                - path: logs-agent/HttpDestinationStats/logs_0_reliable_0/idleMs
                  type: rate
                  alias: batch_strategy.idle_time
                  tags:
                  - sender:0
                - path: logs-agent/HttpDestinationStats/logs_0_reliable_0/inUseMs
                  type: rate
                  alias: batch_strategy.in_use_time
                  tags:
                  - sender:0
                - path: logs-agent/HttpDestinationStats/logs_1_reliable_0/idleMs
                  type: rate
                  alias: batch_strategy.idle_time
                  tags:
                  - sender:1
                - path: logs-agent/HttpDestinationStats/logs_1_reliable_0/inUseMs
                  type: rate
                  alias: batch_strategy.in_use_time
                  tags:
                  - sender:1
                - path: logs-agent/HttpDestinationStats/logs_2_reliable_0/idleMs
                  type: rate
                  alias: batch_strategy.idle_time
                  tags:
                  - sender:2
                - path: logs-agent/HttpDestinationStats/logs_2_reliable_0/inUseMs
                  type: rate
                  alias: batch_strategy.in_use_time
                  tags:
                  - sender:2
                - path: logs-agent/HttpDestinationStats/logs_3_reliable_0/idleMs
                  type: rate
                  alias: batch_strategy.idle_time
                  tags:
                  - sender:3
                - path: logs-agent/HttpDestinationStats/logs_3_reliable_0/inUseMs
                  type: rate
                  alias: batch_strategy.in_use_time
                  tags:
                  - sender:3
        # Instance definition for the kf_openmetrics custom check: one endpoint
        # written with %%host%% / %%port%% placeholders, collected every 300 seconds.
        kf_openmetrics.yaml: |-
          init_config:
          instances:
            - openmetrics_endpoint: http://%%host%%:%%port%%/metrics
              min_collection_interval: 300
    # Cluster agent: a single replica whose process, events and orchestrator
    # endpoints are pointed at the Kloudfuse ingress.
    clusterAgent:
      replicaCount: 1
      enabled: true
      datadog_cluster_yaml:
        use_v2_api:
          events: true
          series: true
          service_checks: true
        process_config:
          #
          # Scenario default: the Kloudfuse stack and the agent are in the same VPC and the same K8s cluster.
          #
          process_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
          #
          # Scenario 1: the Kloudfuse stack and the agent are in the same VPC but in different K8s clusters.
          #
          # process_dd_url: "http://<ingress-ip>/ingester"
          #
          # Scenario 2: the Kloudfuse stack is hosted in a different VPC (hosted at "customer.kloudfuse.io").
          #
          # process_dd_url: "https://customer.kloudfuse.io/ingester"
          #
          #
          # Scenario default: the Kloudfuse stack and the agent are in the same VPC and the same K8s cluster.
          #
          events_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
          #
          # Scenario 1: the Kloudfuse stack and the agent are in the same VPC but in different K8s clusters.
          #
          # events_dd_url: "http://<ingress-ip>/ingester"
          #
          # Scenario 2: the Kloudfuse stack is hosted in a different VPC (hosted at "customer.kloudfuse.io").
          #
          # events_dd_url: "https://customer.kloudfuse.io/ingester"
          #
          container_collection:
            enabled: false
        orchestrator_explorer:
          # Unlike the URLs above, the orchestrator URL carries no "/ingester" path.
          #
          # Scenario default: the Kloudfuse stack and the agent are in the same VPC and the same K8s cluster.
          #
          orchestrator_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse"
          #
          # Scenario 1: the Kloudfuse stack and the agent are in the same VPC but in different K8s clusters.
          #
          # orchestrator_dd_url: "http://<ingress-ip>"
          #
          # Scenario 2: the Kloudfuse stack is hosted in a different VPC (hosted at "customer.kloudfuse.io").
          #
          # orchestrator_dd_url: "https://customer.kloudfuse.io"
          #
          manifest_collection:
            enabled: false
      admissionController:
        enabled: false
    # Node agents: uses the "jmx" image tag suffix and supplies the agent
    # configuration below through a ConfigMap (`useConfigMap: true`).
    agents:
      image:
        tagSuffix: jmx
      useConfigMap: true
      customAgentConfig:
        skip_ssl_validation: false
        enable_stream_payload_serialization: false
        use_v2_api:
          events: true
          series: true
          service_checks: true
        process_config:
          #
          # Scenario default: the Kloudfuse stack and the agent are in the same VPC and the same K8s cluster.
          #
          process_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
          #
          # Scenario 1: the Kloudfuse stack and the agent are in the same VPC but in different K8s clusters.
          #
          # process_dd_url: "http://<ingress-ip>/ingester"
          #
          # Scenario 2: the Kloudfuse stack is hosted in a different VPC (hosted at "customer.kloudfuse.io").
          #
          # process_dd_url: "https://customer.kloudfuse.io/ingester"
          #
          #
          # Scenario default: the Kloudfuse stack and the agent are in the same VPC and the same K8s cluster.
          #
          events_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
          #
          # Scenario 1: the Kloudfuse stack and the agent are in the same VPC but in different K8s clusters.
          #
          # events_dd_url: "http://<ingress-ip>/ingester"
          #
          # Scenario 2: the Kloudfuse stack is hosted in a different VPC (hosted at "customer.kloudfuse.io").
          #
          # events_dd_url: "https://customer.kloudfuse.io/ingester"
          #
          container_collection:
            enabled: false
        orchestrator_explorer:
          # Unlike the URLs above, the orchestrator URL carries no "/ingester" path.
          #
          # Scenario default: the Kloudfuse stack and the agent are in the same VPC and the same K8s cluster.
          #
          orchestrator_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse"
          #
          # Scenario 1: the Kloudfuse stack and the agent are in the same VPC but in different K8s clusters.
          #
          # orchestrator_dd_url: "http://<ingress-ip>"
          #
          # Scenario 2: the Kloudfuse stack is hosted in a different VPC (hosted at "customer.kloudfuse.io").
          #
          # orchestrator_dd_url: "https://customer.kloudfuse.io"
          #
          manifest_collection:
            enabled: false
        # Log forwarding: the destination is given as host:port without a scheme,
        # and TLS is controlled separately through `logs_no_ssl`.
        logs_config:
          #
          # Scenario default: the Kloudfuse stack and the agent are in the same VPC and the same K8s cluster.
          #
          logs_dd_url: "kfuse-ingress-nginx-controller-internal.kfuse:80"
          logs_no_ssl: true
          #
          # Scenario 1: the Kloudfuse stack and the agent are in the same VPC but in different K8s clusters.
          #
          # logs_dd_url: "<ingress-ip>:80"
          #
          # Scenario 2: the Kloudfuse stack is hosted in a different VPC (hosted at "customer.kloudfuse.io").
          # When using this scenario, comment out the default `logs_no_ssl: true` line above.
          #
          # logs_dd_url: "customer.kloudfuse.io:443"
          # logs_no_ssl: false
          #
          use_http: true
          auto_multi_line_detection: true
          use_v2_api: false
        # Trace (APM) forwarding: enabled, and accepts traffic from non-local sources.
        apm_config:
          enabled: true
          apm_non_local_traffic: true
          #
          # Scenario default: the Kloudfuse stack and the agent are in the same VPC and the same K8s cluster.
          # The value is a full URL, so it carries the "http://" scheme like the
          # alternative scenarios below.
          #
          apm_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse:80/ingester"
          #
          # Scenario 1: the Kloudfuse stack and the agent are in the same VPC but in different K8s clusters.
          #
          # apm_dd_url: "http://<ingress-ip>/ingester"
          #
          # Scenario 2: the Kloudfuse stack is hosted in a different VPC (hosted at "customer.kloudfuse.io").
          #
          # apm_dd_url: "https://customer.kloudfuse.io/ingester"
          #
        # Sets the `host` metadata provider's interval to 300.
        # NOTE(review): presumably seconds - confirm against the agent documentation.
        metadata_providers:
          - name: host
            interval: 300
    yaml