Quick Guide for Datadog Agent on Kubernetes for Metrics

Add the Datadog Helm repository

helm repo add datadog https://helm.datadoghq.com
helm repo update

Configure datadog-values.yaml

For pod-level metrics, add Datadog annotations to enable scraping.

Base configuration

# Base values: name the release resources "kfuse-agent" and point the agent at
# the Kloudfuse ingester. Keep exactly one `dd_url` active: the default below,
# or one of the commented-out scenario lines.
fullnameOverride: "kfuse-agent"
nameOverride: "kfuse-agent"
datadog:
  # Placeholder value (a single space).
  # NOTE(review): confirm whether the receiving endpoint validates the API key.
  apiKey: " "
  #
  # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
  #
  dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
  #
  # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
  #
  # dd_url: "http://<ingress-ip>/ingester"
  #
  # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
  #
  # dd_url: "https://customer.kloudfuse.io/ingester"
  #
yaml

Enable metrics ingestion

# Full values file: same resource names and `dd_url` scenarios as the base
# configuration, plus log collection and the feature toggles set below.
fullnameOverride: "kfuse-agent"
nameOverride: "kfuse-agent"
datadog:
  # Log collection is switched on through both `logsEnabled` and `logs.enabled`.
  logsEnabled: true
  logs:
    enabled: true
    containerCollectAll: true
    containerCollectUsingFiles: true
    autoMultiLineDetection: true
  #
  # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
  #
  dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
  #
  # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
  #
  # dd_url: "http://<ingress-ip>/ingester"
  #
  # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
  #
  # dd_url: "https://customer.kloudfuse.io/ingester"
  #
  # The legacy kube-state-metrics toggle is off; the "core" variant is on instead.
  kubeStateMetricsEnabled: false
  kubeStateMetricsCore:
    enabled: true
    ignoreLegacyKSMCheck: true
  networkMonitoring:
    enabled: false
  orchestratorExplorer:
    enabled: true
  # NOTE(review): hard-coded 32-hex value, while the base configuration uses a
  # blank placeholder. Confirm this is a dummy key and not a real credential.
  apiKey: "798999171d8a0e80030b20366f2994f4"
  # Extra environment variables passed to the agent containers.
  env:
    - name: DD_CHECKS_TAG_CARDINALITY
      value: "orchestrator"
    - name: DD_CONTAINER_INCLUDE
      value: "kube_namespace:.*"
    - name: DD_CONTAINER_INCLUDE_LOGS
      value: "kube_namespace:.*"
    - name: DD_CONTAINER_EXCLUDE_METRICS
      value: "name:gclog.*"
    # The two label-mapping values are JSON objects. They are single-quoted so
    # the embedded double quotes are literal; nesting them unescaped inside a
    # double-quoted scalar is a YAML syntax error.
    - name: DD_KUBERNETES_POD_LABELS_AS_TAGS
      value: '{"*":"%%label%%"}'
    - name: DD_KUBERNETES_NODE_LABELS_AS_TAGS
      value: '{"*":"%%label%%"}'
    - name: DD_LOGS_CONFIG_AUTO_MULTI_LINE_DEFAULT_MATCH_THRESHOLD
      value: "0.01"
    # Three space-separated timestamp patterns that mark the first line of a
    # multi-line log entry: "YYYY/MM/DD hh:mm:ss", "YYYY-MM-DD hh:mm:ss" and
    # "h:m:s", each with optional fractional seconds. Single quotes keep the
    # regex backslashes literal.
    - name: DD_LOGS_CONFIG_AUTO_MULTI_LINE_EXTRA_PATTERNS
      value: '^\d{4}/\d{2}/\d{2}\s\d{2}:\d{2}:\d{2}(\.\d+)? ^\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}(\.\d+)? ^\d+:\d+:\d+(\.\d+)?'
  # Trace collection is enabled, including its port.
  apm:
    enabled: true
    portEnabled: true
  # NOTE(review): 8126 is conventionally the APM trace port and DogStatsD
  # normally listens on 8125 - confirm this value is intended.
  dogstatsd:
    port: 8126
    useHostPort: false
  processAgent:
    enabled: true
  # Prometheus/OpenMetrics scraping with two configurations that share the same
  # metric-handling flags and differ only in target selection and interval.
  prometheusScrape:
    enabled: true
    version: 1
    additionalConfigs:
      # Configuration 1: 15 s interval; the `exclude` map lists the annotation
      # values (app: knight, prometheus.io/scrape: "false") that opt a pod out.
      - configurations:
        - send_monotonic_counter: false
          send_distribution_counts_as_monotonic: false
          send_distribution_sums_as_monotonic: false
          send_histograms_buckets: true
          max_returned_metrics: 999999
          min_collection_interval: 15
        autodiscovery:
          kubernetes_annotations:
            exclude:
              app: knight
              prometheus.io/scrape: "false"
      # Configuration 2: 60 s interval, selected by container name "knight"
      # (the same workload name excluded above).
      - configurations:
        - send_monotonic_counter: false
          send_distribution_counts_as_monotonic: false
          send_distribution_sums_as_monotonic: false
          send_histograms_buckets: true
          max_returned_metrics: 999999
          min_collection_interval: 60
        autodiscovery:
          kubernetes_container_names:
            - knight
  checksd:
    kf_openmetrics.py: |-
      from datadog_checks.base import AgentCheck, OpenMetricsBaseCheckV2
      import os, requests, json, time

      # Labels for the three metadata fields; not referenced elsewhere in this check.
      col_names = ['metric_name', 'metric_type', 'description']
      # For each Prometheus metric type listed here, prepare_mds() adds one extra
      # "<metric>_<suffix>" entry per key of the inner dict, using the given
      # Datadog-style 'type' and prefixing the metric description with 'desc_prefix'.
      METRIC_TYPE_EXPANSIONS = {
          'histogram' :
              # prom style metric to dd style metric conversions for histogram metrics
              {
                  'count' : {'type': 'count', 'desc_prefix' : 'Count of'},
                  'sum' : {'type': 'count', 'desc_prefix' : 'Sum of'},
                  'bucket' : {'type': 'count', 'desc_prefix' : 'Buckets of samples of'},
                  'min' : {'type': 'gauge', 'desc_prefix' : 'Minimum value of'},
                  'max' : {'type': 'gauge', 'desc_prefix' : 'Maximum value of'},
              },
          'summary' :
              # prom style metric to dd style metric conversions for summary metrics
              {
                  'count' : {'type': 'count', 'desc_prefix' : 'Count of'},
                  'sum' : {'type': 'count', 'desc_prefix' : 'Sum of'},
                  'quantile' : {'type': 'gauge', 'desc_prefix' : 'Various quantile values of'},
              }
      }

      # Line prefixes that carry the description / type of a metric in the scraped text.
      HELP_TEXT = '# HELP'
      TYPE_TEXT = '# TYPE'
      # Endpoint template with unresolved %%host%% / %%port%% placeholders;
      # check() skips any instance whose endpoint equals this literal.
      DEFAULT_URL = 'http://%%host%%:%%port%%/metrics'
      class KfOpenMetricsCheck(AgentCheck):
        """Agent check that publishes the HELP/TYPE metadata of an OpenMetrics endpoint.

        For each instance whose ``openmetrics_endpoint`` is set and is not the
        unresolved ``DEFAULT_URL`` template, the check downloads the exposition
        text, collects the description and type of every metric, and emits the
        result as one JSON-encoded ``OpenMetricsMetadata`` event.
        """

        def __init__(self, name, init_config, instances):
          super(KfOpenMetricsCheck, self).__init__(name, init_config, instances)

        def get_default_config(self):
          """Return an empty dict; this check defines no default settings."""
          return {}

        def scrape_endpoint(self, scrape_url, timeout=10):
          """Fetch ``scrape_url`` and return the response body decoded as UTF-8.

          ``timeout`` is the number of seconds passed to ``requests.get``.
          Raises ``Exception`` when the endpoint answers with a non-200 status.
          """
          # Without a timeout a stalled endpoint would block the check indefinitely.
          response = requests.get(scrape_url, timeout=timeout)
          if response.status_code != 200:
            # Interpolate here: Exception() does not %-format its arguments.
            raise Exception('failed to scrape endpoint: %s (error: %s)' % (scrape_url, response))
          return response.content.decode('utf-8')

        def prepare_mds(self, data):
          """Build the metadata dict from OpenMetrics/Prometheus exposition text.

          Returns ``{metric_name: {'desc': ..., 'type': ...}}`` taken from the
          ``# HELP`` and ``# TYPE`` lines of ``data``. A metric whose type is a key
          of ``METRIC_TYPE_EXPANSIONS`` additionally gets one ``<name>_<suffix>``
          entry per expansion, described as ``<desc_prefix> <description>``.
          """
          mds = {}

          def _get_info(line, prefix_text):
            # "<prefix> <name> <rest...>" -> (name, rest); (None, '') if the name is missing.
            items = line[len(prefix_text):].split()
            if not items:
              return None, ''
            return items[0], ' '.join(items[1:])

          def _add_info(metric_name, info, value):
            mds.setdefault(metric_name, {})[info] = value

          def _expand_info(metric_name):
            extensions = METRIC_TYPE_EXPANSIONS.get(mds[metric_name]['type'])
            if extensions is None:
              return
            # Use this metric's own description. It is empty when no HELP line
            # preceded the TYPE line, rather than reusing the previous metric's text.
            base_desc = mds[metric_name].get('desc', '')
            for ext_name, extension in extensions.items():
              prefix = extension.get('desc_prefix', '')
              suffix = extension.get('desc_suffix', '')
              mds['%s_%s' % (metric_name, ext_name)] = {
                'type': extension['type'],
                'desc': ('%s %s %s' % (prefix, base_desc, suffix)).strip(),
              }

          for line in data.splitlines():
            if line.startswith(HELP_TEXT):
              mname, mdesc = _get_info(line, HELP_TEXT)
              if mname is not None:
                _add_info(mname, 'desc', mdesc)
            elif line.startswith(TYPE_TEXT):
              mname, mtype = _get_info(line, TYPE_TEXT)
              if mname is not None:
                _add_info(mname, 'type', mtype)
                _expand_info(mname)
          return mds

        def check(self, instance):
          """Scrape the instance's endpoint and emit its metadata as a single event.

          Instances without an ``openmetrics_endpoint``, or whose endpoint is still
          the literal ``DEFAULT_URL`` template, are skipped with an info log.
          """
          scrape_url = instance.get('openmetrics_endpoint')
          if not scrape_url:
            # Nothing to scrape; passing None on would fail inside requests.
            self.log.info("skipping kf_openmetrics check, no openmetrics_endpoint in instance {}".format(instance))
            return
          if scrape_url == DEFAULT_URL:
            self.log.info("skipping kf_openmetrics check for instance {}".format(instance))
            return
          try:
            self.log.info("running kf_openmetrics check for instance {}".format(instance))
            data = self.scrape_endpoint(scrape_url)
            mds = self.prepare_mds(data)
            self.log.debug("prepared mds len:{}".format(len(mds)))
            self.event(
              {
                "timestamp": time.time(),
                "event_type": "OpenMetricsMetadata",
                "msg_title" : "OpenMetricsMetadata",
                "msg_text": json.dumps(mds),
              }
            )
          except KeyError as ke:
            self.log.info("failed to scrape url={}, exc={}".format(scrape_url, ke))
          except requests.exceptions.InvalidURL as ie:
            # Log the exception bound by this handler.
            self.log.info("invalid url={}, instance={}, exc={}".format(scrape_url, instance, ie))
    # Additional check configuration files, keyed by file name; each value is
    # the literal content of that file.
  confd:
    # go_expvar.yaml: reads http://localhost:6000/debug/vars and reports the
    # listed paths under the `datadog.agent` namespace. Entries with
    # `type: rate` are submitted as rates; `alias` renames a metric and `tags`
    # attaches static tags.
    # NOTE(review): presumably this is the agent's own expvar endpoint - confirm.
    go_expvar.yaml: |-
      init_config:
      instances:
        - expvar_url: http://localhost:6000/debug/vars  # if you've defined `expvar_port` in datadog.yaml, change the port here to that value
          namespace: datadog.agent
          metrics:
            - path: forwarder/Transactions/RetryQueueSize
            - path: forwarder/Transactions/Success
              type: rate
            - path: forwarder/Transactions/Errors
              type: rate
            - path: forwarder/Transactions/ErrorsByType/DNSErrors
              type: rate
            - path: forwarder/Transactions/ErrorsByType/TLSErrors
              type: rate
            - path: forwarder/Transactions/ErrorsByType/ConnectionErrors
              type: rate
            - path: forwarder/Transactions/ErrorsByType/WroteRequestErrors
              type: rate
            - path: forwarder/Transactions/ErrorsByType/SentRequestErrors
              type: rate
            - path: forwarder/Transactions/HTTPErrors
              type: rate
            - path: dogstatsd-udp/Packets
              type: rate
            - path: dogstatsd-udp/PacketReadingErrors
              type: rate
            - path: dogstatsd-uds/Packets
              type: rate
            - path: dogstatsd-uds/PacketReadingErrors
              type: rate
            - path: dogstatsd-uds/OriginDetectionErrors
              type: rate
            - path: dogstatsd/ServiceCheckParseErrors
              type: rate
            - path: dogstatsd/ServiceCheckPackets
              type: rate
            - path: dogstatsd/EventParseErrors
              type: rate
            - path: dogstatsd/EventPackets
              type: rate
            - path: dogstatsd/MetricParseErrors
              type: rate
            - path: dogstatsd/MetricPackets
              type: rate
            - path: aggregator/Flush/ChecksMetricSampleFlushTime/LastFlush
            - path: aggregator/Flush/ServiceCheckFlushTime/LastFlush
            - path: aggregator/Flush/EventFlushTime/LastFlush
            - path: aggregator/Flush/MetricSketchFlushTime/LastFlush
            - path: aggregator/Flush/MainFlushTime/LastFlush
            - path: aggregator/FlushCount/ServiceChecks/LastFlush
            - path: aggregator/FlushCount/Series/LastFlush
            - path: aggregator/FlushCount/Events/LastFlush
            - path: aggregator/FlushCount/Sketches/LastFlush
            - path: aggregator/DogstatsdContexts
            - path: aggregator/SeriesFlushed
              type: rate
            - path: aggregator/ServiceCheckFlushed
              type: rate
            - path: aggregator/EventsFlushed
              type: rate
            - path: aggregator/NumberOfFlush
              type: rate
            - path: aggregator/DogstatsdMetricSample
              type: rate
            - path: aggregator/ChecksMetricSample
              type: rate
            - path: aggregator/ServiceCheck
              type: rate
            - path: aggregator/Event
              type: rate
            - path: aggregator/HostnameUpdate
              type: rate
            - path: scheduler/ChecksEntered
            - path: scheduler/Queues/.*/Size
              alias: scheduler.queues.size
            - path: scheduler/Queues/.*/Interval
              alias: scheduler.queues.interval
            - path: scheduler/QueuesCount
            - path: splitter/NotTooBig
              type: rate
            - path: splitter/TooBig
              type: rate
            - path: splitter/TotalLoops
              type: rate
            - path: splitter/PayloadDrops
              type: rate
            - path: logs-agent/IsRunning
              type: gauge
            - path: logs-agent/DestinationErrors
              type: rate
            - path: logs-agent/LogsDecoded
              type: rate
            - path: logs-agent/LogsProcessed
              type: rate
            - path: logs-agent/LogsSent
              type: rate
            - path: logs-agent/BytesSent
              type: rate
            - path: logs-agent/EncodedBytesSent
              type: rate
            - path: logs-agent/HttpDestinationStats/logs_0_reliable_0/idleMs
              type: rate
              alias: batch_strategy.idle_time
              tags:
              - sender:0
            - path: logs-agent/HttpDestinationStats/logs_0_reliable_0/inUseMs
              type: rate
              alias: batch_strategy.in_use_time
              tags:
              - sender:0
            - path: logs-agent/HttpDestinationStats/logs_1_reliable_0/idleMs
              type: rate
              alias: batch_strategy.idle_time
              tags:
              - sender:1
            - path: logs-agent/HttpDestinationStats/logs_1_reliable_0/inUseMs
              type: rate
              alias: batch_strategy.in_use_time
              tags:
              - sender:1
            - path: logs-agent/HttpDestinationStats/logs_2_reliable_0/idleMs
              type: rate
              alias: batch_strategy.idle_time
              tags:
              - sender:2
            - path: logs-agent/HttpDestinationStats/logs_2_reliable_0/inUseMs
              type: rate
              alias: batch_strategy.in_use_time
              tags:
              - sender:2
            - path: logs-agent/HttpDestinationStats/logs_3_reliable_0/idleMs
              type: rate
              alias: batch_strategy.idle_time
              tags:
              - sender:3
            - path: logs-agent/HttpDestinationStats/logs_3_reliable_0/inUseMs
              type: rate
              alias: batch_strategy.in_use_time
              tags:
              - sender:3
    # kf_openmetrics.yaml: instance configuration for the kf_openmetrics custom
    # check, run every 300 s. The endpoint is the literal %%host%%/%%port%%
    # template, which is the value the check compares against DEFAULT_URL; an
    # instance whose endpoint is still this unresolved template is skipped.
    kf_openmetrics.yaml: |-
        init_config:
        instances:
          - openmetrics_endpoint: http://%%host%%:%%port%%/metrics
            min_collection_interval: 300
# Cluster agent: one replica, with its own agent configuration overrides under
# `datadog_cluster_yaml`. For every *_dd_url below keep exactly one line active:
# the default, or the commented-out line for your scenario.
clusterAgent:
  replicaCount: 1
  enabled: true
  datadog_cluster_yaml:
    use_v2_api:
      events: true
      series: true
      service_checks: true
    process_config:
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      process_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # process_dd_url: "http://<ingress-ip>/ingester"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
      #
      # process_dd_url: "https://customer.kloudfuse.io/ingester"
      #
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      events_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # events_dd_url: "http://<ingress-ip>/ingester"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
      #
      # events_dd_url: "https://customer.kloudfuse.io/ingester"
      #
      container_collection:
        enabled: false
    # Unlike the URLs above, the orchestrator URLs carry no "/ingester" path.
    orchestrator_explorer:
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      orchestrator_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # orchestrator_dd_url: "http://<ingress-ip>"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
      #
      # orchestrator_dd_url: "https://customer.kloudfuse.io"
      #
      manifest_collection:
        enabled: false
  admissionController:
    enabled: false
# Node agent settings. `customAgentConfig` repeats the process, events and
# orchestrator URLs configured for the cluster agent and adds the log intake.
# For every *_dd_url keep exactly one line active: the default, or the
# commented-out line for your scenario.
agents:
  image:
    # NOTE(review): presumably selects the JMX-enabled agent image - confirm.
    tagSuffix: jmx
  useConfigMap: true
  customAgentConfig:
    skip_ssl_validation: false
    enable_stream_payload_serialization: false
    use_v2_api:
      events: true
      series: true
      service_checks: true
    process_config:
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      process_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # process_dd_url: "http://<ingress-ip>/ingester"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
      #
      # process_dd_url: "https://customer.kloudfuse.io/ingester"
      #
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      events_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # events_dd_url: "http://<ingress-ip>/ingester"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
      #
      # events_dd_url: "https://customer.kloudfuse.io/ingester"
      #
      container_collection:
        enabled: false
    # Unlike the URLs above, the orchestrator URLs carry no "/ingester" path.
    orchestrator_explorer:
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      orchestrator_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # orchestrator_dd_url: "http://<ingress-ip>"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
      #
      # orchestrator_dd_url: "https://customer.kloudfuse.io"
      #
      manifest_collection:
        enabled: false
    # The log intake is given as host:port without a scheme; `logs_no_ssl`
    # carries the TLS choice (true for the in-cluster default on port 80).
    logs_config:
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      logs_dd_url: "kfuse-ingress-nginx-controller-internal.kfuse:80"
      logs_no_ssl: true
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # logs_dd_url: "<ingress-ip>:80"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io"). Make sure
      # to comment out the logs_no_ssl: true default above.
      #
      # logs_dd_url: "customer.kloudfuse.io:443"
      # logs_no_ssl: false
      #
      use_http: true
      auto_multi_line_detection: true
      use_v2_api: false
    # Trace intake. `apm_dd_url` is a full URL, so the scheme is required,
    # as in the Scenario 1 and Scenario 2 values. Keep exactly one
    # `apm_dd_url` line active.
    apm_config:
      enabled: true
      apm_non_local_traffic: true
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      apm_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse:80/ingester"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # apm_dd_url: "http://<ingress-ip>/ingester"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
      #
      # apm_dd_url: "https://customer.kloudfuse.io/ingester"
      #
    # A single metadata provider, "host", with interval 300.
    # NOTE(review): the unit is presumably seconds - confirm.
    metadata_providers:
      - name: host
        interval: 300
yaml

Deploy the agent with the configuration file

helm install datadog-agent -f datadog-values.yaml datadog/datadog