Quick Guide for Datadog Agent on Kubernetes for Metrics
This guide describes how to deploy the Datadog Agent on a Kubernetes cluster to collect metrics from your infrastructure and applications. You install the Agent with Helm, and the Agent relies on pod annotations to identify which pods to scrape for metrics. You configure the Agent to ingest metrics in a values file, then deploy it to your cluster using that file.
Configure datadog-values.yaml
For pod-level metrics, add Datadog annotations to enable scraping.
Base configuration
# Base values: release naming overrides plus the minimal `datadog` section
# (API key and the intake URL the agent sends to).
fullnameOverride: "kfuse-agent"
nameOverride: "kfuse-agent"
datadog:
  # NOTE(review): blank placeholder value — presumably to be replaced; confirm.
  apiKey: " "
  #
  # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
  #
  dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
  #
  # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
  #
  # dd_url: "http://<ingress-ip>/ingester"
  #
  # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
  #
  # dd_url: "https://customer.kloudfuse.io/ingester"
  #
yaml
Enable metrics ingestion
# Agent values (datadog-values.yaml): release naming overrides plus the full
# `datadog` section — intake URL, feature toggles, agent environment,
# Prometheus scraping, one custom check (checksd) and two check configs (confd).
fullnameOverride: "kfuse-agent"
nameOverride: "kfuse-agent"
datadog:
  logsEnabled: true
  logs:
    enabled: true
    containerCollectAll: true
    containerCollectUsingFiles: true
    autoMultiLineDetection: true
  #
  # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
  #
  dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
  #
  # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
  #
  # dd_url: "http://<ingress-ip>/ingester"
  #
  # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
  #
  # dd_url: "https://customer.kloudfuse.io/ingester"
  #
  kubeStateMetricsEnabled: false
  kubeStateMetricsCore:
    enabled: true
    ignoreLegacyKSMCheck: true
  networkMonitoring:
    enabled: false
  orchestratorExplorer:
    enabled: true
  # NOTE(review): a literal key is committed here — confirm it is a non-secret placeholder.
  apiKey: "798999171d8a0e80030b20366f2994f4"
  env:
    - name: DD_CHECKS_TAG_CARDINALITY
      value: "orchestrator"
    - name: DD_CONTAINER_INCLUDE
      value: "kube_namespace:.*"
    - name: DD_CONTAINER_INCLUDE_LOGS
      value: "kube_namespace:.*"
    - name: DD_CONTAINER_EXCLUDE_METRICS
      value: "name:gclog.*"
    # Single-quoted so the double quotes of the embedded JSON stay literal.
    - name: DD_KUBERNETES_POD_LABELS_AS_TAGS
      value: '{"*":"%%label%%"}'
    - name: DD_KUBERNETES_NODE_LABELS_AS_TAGS
      value: '{"*":"%%label%%"}'
    - name: DD_LOGS_CONFIG_AUTO_MULTI_LINE_DEFAULT_MATCH_THRESHOLD
      value: "0.01"
    # Space-separated timestamp patterns. Single-quoted so the backslashes are
    # literal; `.` before the fractional part is left unescaped (matches any character).
    - name: "DD_LOGS_CONFIG_AUTO_MULTI_LINE_EXTRA_PATTERNS"
      value: '^\d{4}/\d{2}/\d{2}\s\d{2}:\d{2}:\d{2}(.\d+)? ^\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}(.\d+)? ^\d+:\d+:\d+(.\d+)?'
  apm:
    enabled: true
    portEnabled: true
  dogstatsd:
    # NOTE(review): 8126 is conventionally the APM trace port and 8125 the
    # DogStatsD default — confirm this value is intentional.
    port: 8126
    useHostPort: false
  processAgent:
    enabled: true
  prometheusScrape:
    enabled: true
    version: 1
    additionalConfigs:
      # Every annotated pod except those labelled for the second config: 15s interval.
      - configurations:
          - send_monotonic_counter: false
            send_distribution_counts_as_monotonic: false
            send_distribution_sums_as_monotonic: false
            send_histograms_buckets: true
            max_returned_metrics: 999999
            min_collection_interval: 15
        autodiscovery:
          kubernetes_annotations:
            exclude:
              app: knight
              prometheus.io/scrape: "false"
      # Containers named `knight`: same settings, 60s interval.
      - configurations:
          - send_monotonic_counter: false
            send_distribution_counts_as_monotonic: false
            send_distribution_sums_as_monotonic: false
            send_histograms_buckets: true
            max_returned_metrics: 999999
            min_collection_interval: 60
        autodiscovery:
          kubernetes_container_names:
            - knight
  checksd:
    kf_openmetrics.py: |-
      from datadog_checks.base import AgentCheck, OpenMetricsBaseCheckV2
      import os, requests, json, time

      col_names = ['metric_name', 'metric_type', 'description']

      # For each Prometheus metric type that expands into several series:
      # series suffix -> {type of the expanded series, description prefix}.
      METRIC_TYPE_EXPANSIONS = {
          'histogram' :
              # prom style metric to dd style metric conversions for histogram metrics
              {
                  'count' : {'type': 'count', 'desc_prefix' : 'Count of'},
                  'sum' : {'type': 'count', 'desc_prefix' : 'Sum of'},
                  'bucket' : {'type': 'count', 'desc_prefix' : 'Buckets of samples of'},
                  'min' : {'type': 'gauge', 'desc_prefix' : 'Minimum value of'},
                  'max' : {'type': 'gauge', 'desc_prefix' : 'Maximum value of'},
              },
          'summary' :
              # prom style metric to dd style metric conversions for summary metrics
              {
                  'count' : {'type': 'count', 'desc_prefix' : 'Count of'},
                  'sum' : {'type': 'count', 'desc_prefix' : 'Sum of'},
                  'quantile' : {'type': 'gauge', 'desc_prefix' : 'Various quantile values of'},
              }
      }

      HELP_TEXT = '# HELP'
      TYPE_TEXT = '# TYPE'
      DEFAULT_URL = 'http://%%host%%:%%port%%/metrics'
      # Upper bound, in seconds, on one scrape so a hung endpoint cannot block the check.
      SCRAPE_TIMEOUT_SECONDS = 10


      class KfOpenMetricsCheck(AgentCheck):
          """Scrape an OpenMetrics endpoint and emit its HELP/TYPE metadata as one event."""

          def __init__(self, name, init_config, instances):
              super(KfOpenMetricsCheck, self).__init__(name, init_config, instances)

          def get_default_config(self):
              return {}

          def scrape_endpoint(self, scrape_url):
              """Return the body of scrape_url decoded as UTF-8.

              Raises Exception on any non-200 status; request errors raised by
              `requests` (including timeouts) propagate to the caller.
              """
              response = requests.get(scrape_url, timeout=SCRAPE_TIMEOUT_SECONDS)
              if response.status_code != 200:
                  raise Exception('failed to scrape endpoint: %s (error: %s)' % (scrape_url, response))
              return response.content.decode('utf-8')

          def prepare_mds(self, data):
              """Build {metric_name: {'desc': ..., 'type': ...}} from the HELP/TYPE lines of data.

              Metrics whose type is listed in METRIC_TYPE_EXPANSIONS additionally
              get one '<name>_<suffix>' entry per expansion.
              """
              mds = {}

              def _get_info(line, prefix_text):
                  # Returns (name, rest of line); (None, None) when nothing follows the prefix.
                  items = line[len(prefix_text):].split()
                  if not items:
                      return None, None
                  return items[0], ' '.join(items[1:])

              def _add_info(metric_name, info, value):
                  mds.setdefault(metric_name, {})[info] = value

              def _expand_info(metric_name):
                  extensions = METRIC_TYPE_EXPANSIONS.get(mds[metric_name]['type'])
                  if extensions is None:
                      return
                  # Use this metric's own description; it is absent when no HELP
                  # line for the metric came before its TYPE line.
                  desc = mds[metric_name].get('desc', '')
                  for ext_name, extension in extensions.items():
                      prefix = extension.get('desc_prefix', '')
                      suffix = extension.get('desc_suffix', '')
                      mds['%s_%s' % (metric_name, ext_name)] = {
                          'type': extension['type'],
                          'desc': ('%s %s %s' % (prefix, desc, suffix)).strip(),
                      }

              for line in data.splitlines():
                  if line.startswith(HELP_TEXT):
                      mname, mdesc = _get_info(line, HELP_TEXT)
                      if mname is not None:
                          _add_info(mname, 'desc', mdesc)
                  elif line.startswith(TYPE_TEXT):
                      mname, mtype = _get_info(line, TYPE_TEXT)
                      if mname is not None:
                          _add_info(mname, 'type', mtype)
                          _expand_info(mname)
              return mds

          def check(self, instance):
              """Scrape the instance's 'openmetrics_endpoint' and publish the metadata event."""
              scrape_url = ''
              try:
                  scrape_url = instance.get('openmetrics_endpoint')
                  # Nothing to scrape when the endpoint is unset or is still the
                  # literal default template URL.
                  if not scrape_url or scrape_url == DEFAULT_URL:
                      self.log.info("skipping kf_openmetrics check for instance {}".format(instance))
                      return
                  self.log.info("running kf_openmetrics check for instance {}".format(instance))
                  data = self.scrape_endpoint(scrape_url)
                  mds = self.prepare_mds(data)
                  self.log.debug("prepared mds len:{}".format(len(mds)))
                  self.event(
                      {
                          "timestamp": time.time(),
                          "event_type": "OpenMetricsMetadata",
                          "msg_title" : "OpenMetricsMetadata",
                          "msg_text": json.dumps(mds),
                      }
                  )
              except KeyError as ke:
                  self.log.info("failed to scrape url={}, exc={}".format(scrape_url, ke))
              except requests.exceptions.InvalidURL as ie:
                  self.log.info("invalid url={}, instance={}, exc={}".format(scrape_url, instance, ie))
  confd:
    go_expvar.yaml: |-
      init_config:
      instances:
        - expvar_url: http://localhost:6000/debug/vars # if you've defined `expvar_port` in datadog.yaml, change the port here to that value
          namespace: datadog.agent
          metrics:
            - path: forwarder/Transactions/RetryQueueSize
            - path: forwarder/Transactions/Success
              type: rate
            - path: forwarder/Transactions/Errors
              type: rate
            - path: forwarder/Transactions/ErrorsByType/DNSErrors
              type: rate
            - path: forwarder/Transactions/ErrorsByType/TLSErrors
              type: rate
            - path: forwarder/Transactions/ErrorsByType/ConnectionErrors
              type: rate
            - path: forwarder/Transactions/ErrorsByType/WroteRequestErrors
              type: rate
            - path: forwarder/Transactions/ErrorsByType/SentRequestErrors
              type: rate
            - path: forwarder/Transactions/HTTPErrors
              type: rate
            - path: dogstatsd-udp/Packets
              type: rate
            - path: dogstatsd-udp/PacketReadingErrors
              type: rate
            - path: dogstatsd-uds/Packets
              type: rate
            - path: dogstatsd-uds/PacketReadingErrors
              type: rate
            - path: dogstatsd-uds/OriginDetectionErrors
              type: rate
            - path: dogstatsd/ServiceCheckParseErrors
              type: rate
            - path: dogstatsd/ServiceCheckPackets
              type: rate
            - path: dogstatsd/EventParseErrors
              type: rate
            - path: dogstatsd/EventPackets
              type: rate
            - path: dogstatsd/MetricParseErrors
              type: rate
            - path: dogstatsd/MetricPackets
              type: rate
            - path: aggregator/Flush/ChecksMetricSampleFlushTime/LastFlush
            - path: aggregator/Flush/ServiceCheckFlushTime/LastFlush
            - path: aggregator/Flush/EventFlushTime/LastFlush
            - path: aggregator/Flush/MetricSketchFlushTime/LastFlush
            - path: aggregator/Flush/MainFlushTime/LastFlush
            - path: aggregator/FlushCount/ServiceChecks/LastFlush
            - path: aggregator/FlushCount/Series/LastFlush
            - path: aggregator/FlushCount/Events/LastFlush
            - path: aggregator/FlushCount/Sketches/LastFlush
            - path: aggregator/DogstatsdContexts
            - path: aggregator/SeriesFlushed
              type: rate
            - path: aggregator/ServiceCheckFlushed
              type: rate
            - path: aggregator/EventsFlushed
              type: rate
            - path: aggregator/NumberOfFlush
              type: rate
            - path: aggregator/DogstatsdMetricSample
              type: rate
            - path: aggregator/ChecksMetricSample
              type: rate
            - path: aggregator/ServiceCheck
              type: rate
            - path: aggregator/Event
              type: rate
            - path: aggregator/HostnameUpdate
              type: rate
            - path: scheduler/ChecksEntered
            - path: scheduler/Queues/.*/Size
              alias: scheduler.queues.size
            - path: scheduler/Queues/.*/Interval
              alias: scheduler.queues.interval
            - path: scheduler/QueuesCount
            - path: splitter/NotTooBig
              type: rate
            - path: splitter/TooBig
              type: rate
            - path: splitter/TotalLoops
              type: rate
            - path: splitter/PayloadDrops
              type: rate
            - path: logs-agent/IsRunning
              type: gauge
            - path: logs-agent/DestinationErrors
              type: rate
            - path: logs-agent/LogsDecoded
              type: rate
            - path: logs-agent/LogsProcessed
              type: rate
            - path: logs-agent/LogsSent
              type: rate
            - path: logs-agent/BytesSent
              type: rate
            - path: logs-agent/EncodedBytesSent
              type: rate
            - path: logs-agent/HttpDestinationStats/logs_0_reliable_0/idleMs
              type: rate
              alias: batch_strategy.idle_time
              tags:
                - sender:0
            - path: logs-agent/HttpDestinationStats/logs_0_reliable_0/inUseMs
              type: rate
              alias: batch_strategy.in_use_time
              tags:
                - sender:0
            - path: logs-agent/HttpDestinationStats/logs_1_reliable_0/idleMs
              type: rate
              alias: batch_strategy.idle_time
              tags:
                - sender:1
            - path: logs-agent/HttpDestinationStats/logs_1_reliable_0/inUseMs
              type: rate
              alias: batch_strategy.in_use_time
              tags:
                - sender:1
            - path: logs-agent/HttpDestinationStats/logs_2_reliable_0/idleMs
              type: rate
              alias: batch_strategy.idle_time
              tags:
                - sender:2
            - path: logs-agent/HttpDestinationStats/logs_2_reliable_0/inUseMs
              type: rate
              alias: batch_strategy.in_use_time
              tags:
                - sender:2
            - path: logs-agent/HttpDestinationStats/logs_3_reliable_0/idleMs
              type: rate
              alias: batch_strategy.idle_time
              tags:
                - sender:3
            - path: logs-agent/HttpDestinationStats/logs_3_reliable_0/inUseMs
              type: rate
              alias: batch_strategy.in_use_time
              tags:
                - sender:3
    kf_openmetrics.yaml: |-
      init_config:
      instances:
        - openmetrics_endpoint: http://%%host%%:%%port%%/metrics
          min_collection_interval: 300
# Cluster Agent values: a single replica whose process, events and orchestrator
# endpoints are set under `datadog_cluster_yaml`; admission controller disabled.
clusterAgent:
  replicaCount: 1
  enabled: true
  datadog_cluster_yaml:
    use_v2_api:
      events: true
      series: true
      service_checks: true
    process_config:
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      process_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # process_dd_url: "http://<ingress-ip>/ingester"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
      #
      # process_dd_url: "https://customer.kloudfuse.io/ingester"
      #
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      events_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # events_dd_url: "http://<ingress-ip>/ingester"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
      #
      # events_dd_url: "https://customer.kloudfuse.io/ingester"
      #
      container_collection:
        enabled: false
    orchestrator_explorer:
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      orchestrator_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # orchestrator_dd_url: "http://<ingress-ip>"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
      #
      # orchestrator_dd_url: "https://customer.kloudfuse.io"
      #
      manifest_collection:
        enabled: false
  admissionController:
    enabled: false
# Node agent values: the `jmx` image tag suffix plus a custom agent config that
# sets the process, events, orchestrator, logs and APM endpoints.
agents:
  image:
    tagSuffix: jmx
  useConfigMap: true
  customAgentConfig:
    skip_ssl_validation: false
    enable_stream_payload_serialization: false
    use_v2_api:
      events: true
      series: true
      service_checks: true
    process_config:
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      process_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # process_dd_url: "http://<ingress-ip>/ingester"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
      #
      # process_dd_url: "https://customer.kloudfuse.io/ingester"
      #
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      events_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # events_dd_url: "http://<ingress-ip>/ingester"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
      #
      # events_dd_url: "https://customer.kloudfuse.io/ingester"
      #
      container_collection:
        enabled: false
    orchestrator_explorer:
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      orchestrator_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # orchestrator_dd_url: "http://<ingress-ip>"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
      #
      # orchestrator_dd_url: "https://customer.kloudfuse.io"
      #
      manifest_collection:
        enabled: false
    logs_config:
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      logs_dd_url: "kfuse-ingress-nginx-controller-internal.kfuse:80"
      logs_no_ssl: true
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # logs_dd_url: "<ingress-ip>:80"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io"). Make sure
      # to comment out the logs_no_ssl: true default above.
      #
      # logs_dd_url: "customer.kloudfuse.io:443"
      # logs_no_ssl: false
      #
      use_http: true
      auto_multi_line_detection: true
      use_v2_api: false
    apm_config:
      enabled: true
      apm_non_local_traffic: true
      #
      # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
      #
      # Written as a full URL with scheme, like the scenario alternatives below.
      apm_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse:80/ingester"
      #
      # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
      #
      # apm_dd_url: "http://<ingress-ip>/ingester"
      #
      # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
      #
      # apm_dd_url: "https://customer.kloudfuse.io/ingester"
      #
    metadata_providers:
      - name: host
        interval: 300
yaml