Quick Guide for Datadog Agent on Kubernetes for Events

This guide walks you through deploying the Datadog Agent on a Kubernetes cluster using Helm. It covers the essential steps: adding the Datadog Helm repository, configuring the agent through a values file, and installing the chart. Note that the values file below points the agent's endpoints at the Kloudfuse ingester, so your cluster's events, metrics, logs, and traces are sent to Kloudfuse for monitoring.

Add the Datadog Helm repository

helm repo add datadog https://helm.datadoghq.com
helm repo update

Configure datadog-values.yaml

fullnameOverride: "kfuse-agent"
nameOverride: "kfuse-agent"
datadog:
  logsEnabled: true
  logs:
    enabled: true
    containerCollectAll: true
    containerCollectUsingFiles: true
    autoMultiLineDetection: true
  #
  # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
  #
  dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
  #
  # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
  #
  # dd_url: "http://<ingress-ip>/ingester"
  #
  # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at “customer.kloudfuse.io”)
  #
  # dd_url: "https://customer.kloudfuse.io/ingester"
  #
  kubeStateMetricsEnabled: false
  kubeStateMetricsCore:
    enabled: true
    ignoreLegacyKSMCheck: true
  networkMonitoring:
    enabled: false
  orchestratorExplorer:
    enabled: true
  # NOTE(review): this is a literal API key stored in plain text. If it is a
  # real key, rotate it and do not commit it; the Datadog Helm chart can read
  # the key from an existing Kubernetes Secret via `apiKeyExistingSecret`
  # instead. Presumably a placeholder here -- confirm before publishing.
  apiKey: "798999171d8a0e80030b20366f2994f4"
  # Extra environment variables set on the Agent containers.
  env:
    - name: DD_CHECKS_TAG_CARDINALITY
      value: "orchestrator"
    - name: DD_CONTAINER_INCLUDE
      value: "kube_namespace:.*"
    - name: DD_CONTAINER_INCLUDE_LOGS
      value: "kube_namespace:.*"
    - name: DD_CONTAINER_EXCLUDE_METRICS
      value: "name:gclog.*"
    # The two values below are JSON maps. They are single-quoted so the inner
    # double quotes stay part of the string; nesting them inside a
    # double-quoted scalar terminates the scalar early and is a YAML error.
    - name: DD_KUBERNETES_POD_LABELS_AS_TAGS
      value: '{"*":"%%label%%"}'
    - name: DD_KUBERNETES_NODE_LABELS_AS_TAGS
      value: '{"*":"%%label%%"}'
    - name: DD_LOGS_CONFIG_AUTO_MULTI_LINE_DEFAULT_MATCH_THRESHOLD
      value: "0.01"
    # Space-separated regular expressions for timestamps that start a new log
    # entry: `YYYY/MM/DD hh:mm:ss[.fff]`, `YYYY-MM-DD hh:mm:ss[.fff]` and
    # `h:m:s[.fff]`. Single quotes keep the backslashes literal.
    - name: "DD_LOGS_CONFIG_AUTO_MULTI_LINE_EXTRA_PATTERNS"
      value: '^\d{4}/\d{2}/\d{2}\s\d{2}:\d{2}:\d{2}(\.\d+)? ^\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}(\.\d+)? ^\d+:\d+:\d+(\.\d+)?'
  apm:
    enabled: true
    portEnabled: true
  # NOTE(review): 8126 is Datadog's default APM trace port, while the default
  # DogStatsD port is 8125. Clients that send StatsD traffic to the default
  # port will not reach this listener -- confirm that 8126 is intentional.
  dogstatsd:
    port: 8126
    useHostPort: false
  processAgent:
    enabled: true
  # Prometheus/OpenMetrics autodiscovery. Two scrape configurations are
  # defined; they share the same counter/histogram handling and differ only
  # in their collection interval and in which workloads they select.
  prometheusScrape:
    enabled: true
    version: 1
    additionalConfigs:
      # Configuration 1: scrape every 15 seconds. The `exclude` annotations
      # below (`app: knight`, `prometheus.io/scrape: "false"`) are meant to
      # keep matching pods out of this configuration -- verify against the
      # Datadog Prometheus autodiscovery documentation.
      - configurations:
        - send_monotonic_counter: false
          send_distribution_counts_as_monotonic: false
          send_distribution_sums_as_monotonic: false
          send_histograms_buckets: true
          max_returned_metrics: 999999
          min_collection_interval: 15
        autodiscovery:
          kubernetes_annotations:
            exclude:
              app: knight
              prometheus.io/scrape: "false"
      # Configuration 2: scrape every 60 seconds, selected by container name
      # (`knight`) instead of by annotation.
      - configurations:
        - send_monotonic_counter: false
          send_distribution_counts_as_monotonic: false
          send_distribution_sums_as_monotonic: false
          send_histograms_buckets: true
          max_returned_metrics: 999999
          min_collection_interval: 60
        autodiscovery:
          kubernetes_container_names:
            - knight
  checksd:
    kf_openmetrics.py: |-
      from datadog_checks.base import AgentCheck, OpenMetricsBaseCheckV2
      import os, requests, json, time

      # Field names for a metadata record (name, type, description).
      col_names = ['metric_name', 'metric_type', 'description']

      # For each composite Prometheus metric type, the suffixes that are
      # appended to the base metric name, together with the type reported for
      # the derived series and the text prepended to its description.
      METRIC_TYPE_EXPANSIONS = {
          # histogram -> <name>_count, _sum, _bucket, _min, _max
          'histogram': {
              'count': {'type': 'count', 'desc_prefix': 'Count of'},
              'sum': {'type': 'count', 'desc_prefix': 'Sum of'},
              'bucket': {'type': 'count', 'desc_prefix': 'Buckets of samples of'},
              'min': {'type': 'gauge', 'desc_prefix': 'Minimum value of'},
              'max': {'type': 'gauge', 'desc_prefix': 'Maximum value of'},
          },
          # summary -> <name>_count, _sum, _quantile
          'summary': {
              'count': {'type': 'count', 'desc_prefix': 'Count of'},
              'sum': {'type': 'count', 'desc_prefix': 'Sum of'},
              'quantile': {'type': 'gauge', 'desc_prefix': 'Various quantile values of'},
          },
      }

      # Line prefixes of the metadata comments in the scraped payload.
      HELP_TEXT = '# HELP'
      TYPE_TEXT = '# TYPE'
      # Endpoint template as written in the check config; the check compares
      # the instance endpoint against this exact string.
      DEFAULT_URL = 'http://%%host%%:%%port%%/metrics'
      class KfOpenMetricsCheck(AgentCheck):
        """Agent check that scrapes an OpenMetrics endpoint for metric metadata.

        For each instance it fetches `openmetrics_endpoint`, collects the
        `# HELP` / `# TYPE` comments into a dict keyed by metric name, and
        submits that dict (JSON-encoded) as an `OpenMetricsMetadata` event.
        """

        def __init__(self, name, init_config, instances):
          super(KfOpenMetricsCheck, self).__init__(name, init_config, instances)

        def get_default_config(self):
          """Return the default configuration (none)."""
          return {}

        def scrape_endpoint(self, scrape_url, timeout=10):
          """Fetch `scrape_url` and return the response body decoded as UTF-8.

          `timeout` (seconds) bounds the request so an unresponsive endpoint
          cannot block the check indefinitely.

          Raises Exception when the response status is not 200; errors raised
          by `requests` itself (connection failure, timeout) propagate.
          """
          response = requests.get(scrape_url, timeout=timeout)
          if response.status_code != 200:
            # Interpolate the message; passing the values as extra Exception
            # arguments would leave the '%s' placeholders unformatted.
            raise Exception('failed to scrape endpoint: %s (error: %s)' % (scrape_url, response))
          return response.content.decode('utf-8')

        def prepare_mds(self, data):
          """Build the metadata dict from the text of a scraped payload.

          Returns a dict mapping metric name to {'desc': ..., 'type': ...}.
          For types listed in METRIC_TYPE_EXPANSIONS, one extra entry per
          suffix (`<name>_<suffix>`) is added when the TYPE line is seen.
          """
          mds = {}

          def _get_info(line, prefix_text):
            # Split "<prefix> <name> <rest...>" into (name, rest); returns
            # (None, '') for a line that carries no metric name.
            items = line[len(prefix_text):].split()
            if not items:
              return None, ''
            return items[0], ' '.join(items[1:])

          def _add_info(metric_name, info, value):
            mds.setdefault(metric_name, {})[info] = value

          def _expand_info(metric_name):
            extensions = METRIC_TYPE_EXPANSIONS.get(mds[metric_name]['type'])
            if not extensions:
              return
            # Use this metric's own description (empty when no HELP line was
            # seen for it) rather than whatever the enclosing loop read last.
            base_desc = mds[metric_name].get('desc', '')
            for ext_name, extension in extensions.items():
              prefix = extension.get('desc_prefix', '')
              suffix = extension.get('desc_suffix', '')
              mds['%s_%s' % (metric_name, ext_name)] = {
                'type': extension['type'],
                'desc': ('%s %s %s' % (prefix, base_desc, suffix)).strip(),
              }

          for line in data.splitlines():
            if line.startswith(HELP_TEXT):
              mname, mdesc = _get_info(line, HELP_TEXT)
              if mname is not None:
                _add_info(mname, 'desc', mdesc)
            elif line.startswith(TYPE_TEXT):
              mname, mtype = _get_info(line, TYPE_TEXT)
              if mname is not None:
                _add_info(mname, 'type', mtype)
                _expand_info(mname)
          return mds

        def check(self, instance):
          """Scrape the instance's endpoint and emit its metadata as an event.

          Instances without an endpoint, or whose endpoint still equals
          DEFAULT_URL, are skipped. An optional `timeout` instance key
          (seconds, default 10) bounds the HTTP request.
          """
          scrape_url = instance.get('openmetrics_endpoint')
          if not scrape_url or scrape_url == DEFAULT_URL:
            self.log.info("skipping kf_openmetrics check for instance {}".format(instance))
            return
          try:
            self.log.info("running kf_openmetrics check for instance {}".format(instance))
            data = self.scrape_endpoint(scrape_url, timeout=instance.get('timeout', 10))
            mds = self.prepare_mds(data)
            self.log.debug("prepared mds len:{}".format(len(mds)))
            self.event(
              {
                "timestamp": time.time(),
                "event_type": "OpenMetricsMetadata",
                "msg_title" : "OpenMetricsMetadata",
                "msg_text": json.dumps(mds),
              }
            )
          except KeyError as ke:
            self.log.info("failed to scrape url={}, exc={}".format(scrape_url, ke))
          except requests.exceptions.InvalidURL as ie:
            self.log.info("invalid url={}, instance={}, exc={}".format(scrape_url, instance, ie))
  confd:
    go_expvar.yaml: |-
      init_config:
      instances:
        - expvar_url: http://localhost:6000/debug/vars  # if you've defined `expvar_port` in datadog.yaml, change the port here to that value
          namespace: datadog.agent
          metrics:
            - path: forwarder/Transactions/RetryQueueSize
            - path: forwarder/Transactions/Success
              type: rate
            - path: forwarder/Transactions/Errors
              type: rate
            - path: forwarder/Transactions/ErrorsByType/DNSErrors
              type: rate
            - path: forwarder/Transactions/ErrorsByType/TLSErrors
              type: rate
            - path: forwarder/Transactions/ErrorsByType/ConnectionErrors
              type: rate
            - path: forwarder/Transactions/ErrorsByType/WroteRequestErrors
              type: rate
            - path: forwarder/Transactions/ErrorsByType/SentRequestErrors
              type: rate
            - path: forwarder/Transactions/HTTPErrors
              type: rate
            - path: dogstatsd-udp/Packets
              type: rate
            - path: dogstatsd-udp/PacketReadingErrors
              type: rate
            - path: dogstatsd-uds/Packets
              type: rate
            - path: dogstatsd-uds/PacketReadingErrors
              type: rate
            - path: dogstatsd-uds/OriginDetectionErrors
              type: rate
            - path: dogstatsd/ServiceCheckParseErrors
              type: rate
            - path: dogstatsd/ServiceCheckPackets
              type: rate
            - path: dogstatsd/EventParseErrors
              type: rate
            - path: dogstatsd/EventPackets
              type: rate
            - path: dogstatsd/MetricParseErrors
              type: rate
            - path: dogstatsd/MetricPackets
              type: rate
            - path: aggregator/Flush/ChecksMetricSampleFlushTime/LastFlush
            - path: aggregator/Flush/ServiceCheckFlushTime/LastFlush
            - path: aggregator/Flush/EventFlushTime/LastFlush
            - path: aggregator/Flush/MetricSketchFlushTime/LastFlush
            - path: aggregator/Flush/MainFlushTime/LastFlush
            - path: aggregator/FlushCount/ServiceChecks/LastFlush
            - path: aggregator/FlushCount/Series/LastFlush
            - path: aggregator/FlushCount/Events/LastFlush
            - path: aggregator/FlushCount/Sketches/LastFlush
            - path: aggregator/DogstatsdContexts
            - path: aggregator/SeriesFlushed
              type: rate
            - path: aggregator/ServiceCheckFlushed
              type: rate
            - path: aggregator/EventsFlushed
              type: rate
            - path: aggregator/NumberOfFlush
              type: rate
            - path: aggregator/DogstatsdMetricSample
              type: rate
            - path: aggregator/ChecksMetricSample
              type: rate
            - path: aggregator/ServiceCheck
              type: rate
            - path: aggregator/Event
              type: rate
            - path: aggregator/HostnameUpdate
              type: rate
            - path: scheduler/ChecksEntered
            - path: scheduler/Queues/.*/Size
              alias: scheduler.queues.size
            - path: scheduler/Queues/.*/Interval
              alias: scheduler.queues.interval
            - path: scheduler/QueuesCount
            - path: splitter/NotTooBig
              type: rate
            - path: splitter/TooBig
              type: rate
            - path: splitter/TotalLoops
              type: rate
            - path: splitter/PayloadDrops
              type: rate
            - path: logs-agent/IsRunning
              type: gauge
            - path: logs-agent/DestinationErrors
              type: rate
            - path: logs-agent/LogsDecoded
              type: rate
            - path: logs-agent/LogsProcessed
              type: rate
            - path: logs-agent/LogsSent
              type: rate
            - path: logs-agent/BytesSent
              type: rate
            - path: logs-agent/EncodedBytesSent
              type: rate
            - path: logs-agent/HttpDestinationStats/logs_0_reliable_0/idleMs
              type: rate
              alias: batch_strategy.idle_time
              tags:
              - sender:0
            - path: logs-agent/HttpDestinationStats/logs_0_reliable_0/inUseMs
              type: rate
              alias: batch_strategy.in_use_time
              tags:
              - sender:0
            - path: logs-agent/HttpDestinationStats/logs_1_reliable_0/idleMs
              type: rate
              alias: batch_strategy.idle_time
              tags:
              - sender:1
            - path: logs-agent/HttpDestinationStats/logs_1_reliable_0/inUseMs
              type: rate
              alias: batch_strategy.in_use_time
              tags:
              - sender:1
            - path: logs-agent/HttpDestinationStats/logs_2_reliable_0/idleMs
              type: rate
              alias: batch_strategy.idle_time
              tags:
              - sender:2
            - path: logs-agent/HttpDestinationStats/logs_2_reliable_0/inUseMs
              type: rate
              alias: batch_strategy.in_use_time
              tags:
              - sender:2
            - path: logs-agent/HttpDestinationStats/logs_3_reliable_0/idleMs
              type: rate
              alias: batch_strategy.idle_time
              tags:
              - sender:3
            - path: logs-agent/HttpDestinationStats/logs_3_reliable_0/inUseMs
              type: rate
              alias: batch_strategy.in_use_time
              tags:
              - sender:3
    kf_openmetrics.yaml: |-
        init_config:
        instances:
          - openmetrics_endpoint: http://%%host%%:%%port%%/metrics
            min_collection_interval: 300
clusterAgent:
  replicaCount: 1
  enabled: true
  datadog_cluster_yaml:
    use_v2_api:
      events: true
      series: true
      service_checks: true
    process_config:
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      process_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # process_dd_url: "http://<ingress-ip>/ingester"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at “customer.kloudfuse.io”)
      #
      # process_dd_url: "https://customer.kloudfuse.io/ingester"
      #
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      events_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # events_dd_url: "http://<ingress-ip>/ingester"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at “customer.kloudfuse.io”)
      #
      # events_dd_url: "https://customer.kloudfuse.io/ingester"
      #
      container_collection:
        enabled: false
    orchestrator_explorer:
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      orchestrator_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # orchestrator_dd_url: "http://<ingress-ip>"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at “customer.kloudfuse.io”)
      #
      # orchestrator_dd_url: "https://customer.kloudfuse.io"
      #
      manifest_collection:
        enabled: false
  admissionController:
    enabled: false
agents:
  image:
    tagSuffix: jmx
  useConfigMap: true
  customAgentConfig:
    skip_ssl_validation: false
    enable_stream_payload_serialization: false
    use_v2_api:
      events: true
      series: true
      service_checks: true
    process_config:
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      process_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # process_dd_url: "http://<ingress-ip>/ingester"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at “customer.kloudfuse.io”)
      #
      # process_dd_url: "https://customer.kloudfuse.io/ingester"
      #
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      events_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # events_dd_url: "http://<ingress-ip>/ingester"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at “customer.kloudfuse.io”)
      #
      # events_dd_url: "https://customer.kloudfuse.io/ingester"
      #
      container_collection:
        enabled: false
    orchestrator_explorer:
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      orchestrator_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # orchestrator_dd_url: "http://<ingress-ip>"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at “customer.kloudfuse.io”)
      #
      # orchestrator_dd_url: "https://customer.kloudfuse.io"
      #
      manifest_collection:
        enabled: false
    logs_config:
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      logs_dd_url: "kfuse-ingress-nginx-controller-internal.kfuse:80"
      logs_no_ssl: true
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # logs_dd_url: "<ingress-ip>:80"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at “customer.kloudfuse.io”). Make sure
      # to comment out the logs_no_ssl: true default above.
      #
      # logs_dd_url: "customer.kloudfuse.io:443"
      # logs_no_ssl: false
      #
      use_http: true
      auto_multi_line_detection: true
      use_v2_api: false
    apm_config:
      enabled: true
      apm_non_local_traffic: true
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      # The URL must carry an explicit scheme. Without "http://", a value of
      # the form "host:80/path" is parsed with the hostname as the URL scheme.
      #
      apm_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse:80/ingester"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # apm_dd_url: "http://<ingress-ip>/ingester"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at “customer.kloudfuse.io”)
      #
      # apm_dd_url: "https://customer.kloudfuse.io/ingester"
      #
    metadata_providers:
      - name: host
        interval: 300
yaml

Deploy the agent with the configuration file

helm install datadog-agent -f datadog-values.yaml datadog/datadog