Quick Guide for Datadog Agent on Kubernetes for Metrics
Configure datadog-values.yaml
For pod-level metrics, add Datadog annotations to enable scraping.
Base configuration
fullnameOverride: "kfuse-agent"
nameOverride: "kfuse-agent"
datadog:
# NOTE(review): blank placeholder API key -- set one appropriate for your
# deployment; confirm whether your ingest path actually validates it.
apiKey: " "
#
# Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
#
dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
#
# Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
#
# dd_url: "http://<ingress-ip>/ingester"
#
# Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
#
# dd_url: "https://customer.kloudfuse.io/ingester"
#
Full configuration (datadog-values.yaml): enable logs, metrics, and APM ingestion
fullnameOverride: "kfuse-agent"
nameOverride: "kfuse-agent"
datadog:
# Collect logs from all containers, tailing their log files, with
# automatic multi-line log detection enabled.
logsEnabled: true
logs:
enabled: true
containerCollectAll: true
containerCollectUsingFiles: true
autoMultiLineDetection: true
#
# Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
#
dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
#
# Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
#
# dd_url: "http://<ingress-ip>/ingester"
#
# Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
#
# dd_url: "https://customer.kloudfuse.io/ingester"
#
# Use the kube-state-metrics core (v2) check instead of the legacy
# kube-state-metrics deployment.
kubeStateMetricsEnabled: false
kubeStateMetricsCore:
enabled: true
ignoreLegacyKSMCheck: true
networkMonitoring:
enabled: false
orchestratorExplorer:
enabled: true
# NOTE(review): hardcoded API key committed to this guide -- confirm this is
# a throwaway placeholder and not a live key; prefer the blank placeholder
# used in the base configuration, or inject the key via a secret.
apiKey: "798999171d8a0e80030b20366f2994f4"
env:
  # Tag cardinality required by the orchestrator explorer.
  - name: DD_CHECKS_TAG_CARDINALITY
    value: "orchestrator"
  # Collect metrics and logs from containers in every namespace.
  - name: DD_CONTAINER_INCLUDE
    value: "kube_namespace:.*"
  - name: DD_CONTAINER_INCLUDE_LOGS
    value: "kube_namespace:.*"
  # Skip metric collection for GC-log sidecar containers.
  - name: DD_CONTAINER_EXCLUDE_METRICS
    value: "name:gclog.*"
  # Map every pod/node label to a tag of the same name. The JSON value is
  # single-quoted so its embedded double quotes survive YAML parsing (the
  # previous "{"*":"%%label%%"}" form was malformed YAML).
  - name: DD_KUBERNETES_POD_LABELS_AS_TAGS
    value: '{"*":"%%label%%"}'
  - name: DD_KUBERNETES_NODE_LABELS_AS_TAGS
    value: '{"*":"%%label%%"}'
  - name: DD_LOGS_CONFIG_AUTO_MULTI_LINE_DEFAULT_MATCH_THRESHOLD
    value: "0.01"
  # Extra multi-line start patterns (leading timestamps). Regex escapes
  # (\d, \s, \.) restored -- they had been stripped from the original.
  - name: "DD_LOGS_CONFIG_AUTO_MULTI_LINE_EXTRA_PATTERNS"
    value: '^\d{4}/\d{2}/\d{2}\s\d{2}:\d{2}:\d{2}(\.\d+)? ^\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}(\.\d+)? ^\d+:\d+:\d+(\.\d+)?'
apm:
enabled: true
portEnabled: true
dogstatsd:
# NOTE(review): 8126 is conventionally the trace-agent (APM) port; the
# DogStatsD default is 8125 -- confirm this value is intentional.
port: 8126
useHostPort: false
processAgent:
enabled: true
prometheusScrape:
enabled: true
version: 1
additionalConfigs:
# Default scrape profile: 15s interval; excludes pods annotated
# prometheus.io/scrape: "false" and the "knight" app (scraped by the
# dedicated 60s profile below).
- configurations:
- send_monotonic_counter: false
send_distribution_counts_as_monotonic: false
send_distribution_sums_as_monotonic: false
send_histograms_buckets: true
max_returned_metrics: 999999
min_collection_interval: 15
autodiscovery:
kubernetes_annotations:
exclude:
app: knight
prometheus.io/scrape: "false"
# Slower profile for "knight" containers: 60s interval.
- configurations:
- send_monotonic_counter: false
send_distribution_counts_as_monotonic: false
send_distribution_sums_as_monotonic: false
send_histograms_buckets: true
max_returned_metrics: 999999
min_collection_interval: 60
autodiscovery:
kubernetes_container_names:
- knight
# Custom check shipping OpenMetrics metric metadata as events; its instance
# configuration lives in confd below.
checksd:
kf_openmetrics.py: |-
# Custom agent check: scrapes a Prometheus/OpenMetrics endpoint and forwards
# the metric metadata (name, type, description) as a Datadog event, expanding
# histogram/summary metrics into their Datadog-style sub-metrics
# (_count, _sum, _bucket, _quantile, ...).
from datadog_checks.base import AgentCheck, OpenMetricsBaseCheckV2
import os, requests, json, time

col_names = ['metric_name', 'metric_type', 'description']

# Prom-style -> Datadog-style sub-metric expansions, keyed by metric type.
METRIC_TYPE_EXPANSIONS = {
    'histogram':
        # prom style metric to dd style metric conversions for histogram metrics
        {
            'count': {'type': 'count', 'desc_prefix': 'Count of'},
            'sum': {'type': 'count', 'desc_prefix': 'Sum of'},
            'bucket': {'type': 'count', 'desc_prefix': 'Buckets of samples of'},
            'min': {'type': 'gauge', 'desc_prefix': 'Minimum value of'},
            'max': {'type': 'gauge', 'desc_prefix': 'Maximum value of'},
        },
    'summary':
        # prom style metric to dd style metric conversions for summary metrics
        {
            'count': {'type': 'count', 'desc_prefix': 'Count of'},
            'sum': {'type': 'count', 'desc_prefix': 'Sum of'},
            'quantile': {'type': 'gauge', 'desc_prefix': 'Various quantile values of'},
        }
}
HELP_TEXT = '# HELP'
TYPE_TEXT = '# TYPE'
# The unresolved autodiscovery template; instances still carrying this URL
# were never templated by the agent and are skipped in check().
DEFAULT_URL = 'http://%%host%%:%%port%%/metrics'


class KfOpenMetricsCheck(AgentCheck):
    def __init__(self, name, init_config, instances):
        super(KfOpenMetricsCheck, self).__init__(name, init_config, instances)

    def get_default_config(self):
        return {}

    def scrape_endpoint(self, scrape_url):
        """Fetch the metrics payload; raises on any non-200 response."""
        response = requests.get(scrape_url)
        if response.status_code != 200:
            # BUG FIX: interpolate explicitly -- Exception() does not apply
            # printf-style formatting to extra positional arguments.
            raise Exception('failed to scrape endpoint: %s (error: %s)' % (scrape_url, response))
        return response.content.decode('utf-8')

    def prepare_mds(self, data):
        """Parse '# HELP' / '# TYPE' lines of a Prometheus exposition payload
        into {metric_name: {'desc': ..., 'type': ...}}, expanding histogram
        and summary metrics into their sub-metric entries."""
        mds = {}
        content = data.splitlines()

        def _get_info(line, prefix_text):
            # Returns (metric_name, remainder-of-line).
            items = line[len(prefix_text):].split()
            return items[0], ' '.join(items[1:])

        def _add_info(metric_name, info, value):
            if metric_name not in mds:
                mds.update({metric_name: {}})
            mds[metric_name][info] = value

        def _expand_info(metric_name):
            if mds[metric_name]['type'] not in METRIC_TYPE_EXPANSIONS:
                return
            extensions = METRIC_TYPE_EXPANSIONS[mds[metric_name]['type']]
            # BUG FIX: use the metric_name argument and the stored description
            # instead of the enclosing loop's mname/mdesc -- mdesc was unbound
            # (NameError) whenever a '# TYPE' line had no preceding '# HELP'.
            desc = mds[metric_name].get('desc', '')
            for ext_name, extension in extensions.items():
                prefix = extension.get('desc_prefix', '')
                suffix = extension.get('desc_suffix', '')
                mds.update({'%s_%s' % (metric_name, ext_name):
                            {'type': extension['type'],
                             'desc': ('%s %s %s' % (prefix, desc, suffix)).strip()}})

        for l in content:
            if l.startswith(HELP_TEXT):
                mname, mdesc = _get_info(l, HELP_TEXT)
                _add_info(mname, 'desc', mdesc)
            if l.startswith(TYPE_TEXT):
                mname, mtype = _get_info(l, TYPE_TEXT)
                _add_info(mname, 'type', mtype)
                _expand_info(mname)
        return mds

    def check(self, instance):
        scrape_url = ''
        try:
            scrape_url = instance.get('openmetrics_endpoint')
            if scrape_url != DEFAULT_URL:
                self.log.info("running kf_openmetrics check for instance {}".format(instance))
                data = self.scrape_endpoint(scrape_url)
                mds = self.prepare_mds(data)
                self.log.debug("prepared mds len:{}".format(len(mds)))
                self.event(
                    {
                        "timestamp": time.time(),
                        "event_type": "OpenMetricsMetadata",
                        "msg_title": "OpenMetricsMetadata",
                        "msg_text": json.dumps(mds),
                    }
                )
            else:
                self.log.info("skipping kf_openmetrics check for instance {}".format(instance))
        except KeyError as ke:
            self.log.info("failed to scrape url={}, exc={}".format(scrape_url, ke))
        except requests.exceptions.InvalidURL as ie:
            # BUG FIX: log the bound exception variable 'ie' -- the original
            # referenced 'ke', which is unbound here and raised a NameError.
            self.log.info("invalid url={}, instance={}, exc={}".format(scrape_url, instance, ie))
# Inline check configurations mounted into the agent's conf.d.
confd:
# go_expvar: self-monitoring of the agent's own forwarder / dogstatsd /
# aggregator / logs-agent internals via the expvar endpoint.
go_expvar.yaml: |-
init_config:
instances:
- expvar_url: http://localhost:6000/debug/vars # if you've defined `expvar_port` in datadog.yaml, change the port here to that value
namespace: datadog.agent
metrics:
- path: forwarder/Transactions/RetryQueueSize
- path: forwarder/Transactions/Success
type: rate
- path: forwarder/Transactions/Errors
type: rate
- path: forwarder/Transactions/ErrorsByType/DNSErrors
type: rate
- path: forwarder/Transactions/ErrorsByType/TLSErrors
type: rate
- path: forwarder/Transactions/ErrorsByType/ConnectionErrors
type: rate
- path: forwarder/Transactions/ErrorsByType/WroteRequestErrors
type: rate
- path: forwarder/Transactions/ErrorsByType/SentRequestErrors
type: rate
- path: forwarder/Transactions/HTTPErrors
type: rate
- path: dogstatsd-udp/Packets
type: rate
- path: dogstatsd-udp/PacketReadingErrors
type: rate
- path: dogstatsd-uds/Packets
type: rate
- path: dogstatsd-uds/PacketReadingErrors
type: rate
- path: dogstatsd-uds/OriginDetectionErrors
type: rate
- path: dogstatsd/ServiceCheckParseErrors
type: rate
- path: dogstatsd/ServiceCheckPackets
type: rate
- path: dogstatsd/EventParseErrors
type: rate
- path: dogstatsd/EventPackets
type: rate
- path: dogstatsd/MetricParseErrors
type: rate
- path: dogstatsd/MetricPackets
type: rate
- path: aggregator/Flush/ChecksMetricSampleFlushTime/LastFlush
- path: aggregator/Flush/ServiceCheckFlushTime/LastFlush
- path: aggregator/Flush/EventFlushTime/LastFlush
- path: aggregator/Flush/MetricSketchFlushTime/LastFlush
- path: aggregator/Flush/MainFlushTime/LastFlush
- path: aggregator/FlushCount/ServiceChecks/LastFlush
- path: aggregator/FlushCount/Series/LastFlush
- path: aggregator/FlushCount/Events/LastFlush
- path: aggregator/FlushCount/Sketches/LastFlush
- path: aggregator/DogstatsdContexts
- path: aggregator/SeriesFlushed
type: rate
- path: aggregator/ServiceCheckFlushed
type: rate
- path: aggregator/EventsFlushed
type: rate
- path: aggregator/NumberOfFlush
type: rate
- path: aggregator/DogstatsdMetricSample
type: rate
- path: aggregator/ChecksMetricSample
type: rate
- path: aggregator/ServiceCheck
type: rate
- path: aggregator/Event
type: rate
- path: aggregator/HostnameUpdate
type: rate
- path: scheduler/ChecksEntered
- path: scheduler/Queues/.*/Size
alias: scheduler.queues.size
- path: scheduler/Queues/.*/Interval
alias: scheduler.queues.interval
- path: scheduler/QueuesCount
- path: splitter/NotTooBig
type: rate
- path: splitter/TooBig
type: rate
- path: splitter/TotalLoops
type: rate
- path: splitter/PayloadDrops
type: rate
- path: logs-agent/IsRunning
type: gauge
- path: logs-agent/DestinationErrors
type: rate
- path: logs-agent/LogsDecoded
type: rate
- path: logs-agent/LogsProcessed
type: rate
- path: logs-agent/LogsSent
type: rate
- path: logs-agent/BytesSent
type: rate
- path: logs-agent/EncodedBytesSent
type: rate
- path: logs-agent/HttpDestinationStats/logs_0_reliable_0/idleMs
type: rate
alias: batch_strategy.idle_time
tags:
- sender:0
- path: logs-agent/HttpDestinationStats/logs_0_reliable_0/inUseMs
type: rate
alias: batch_strategy.in_use_time
tags:
- sender:0
- path: logs-agent/HttpDestinationStats/logs_1_reliable_0/idleMs
type: rate
alias: batch_strategy.idle_time
tags:
- sender:1
- path: logs-agent/HttpDestinationStats/logs_1_reliable_0/inUseMs
type: rate
alias: batch_strategy.in_use_time
tags:
- sender:1
- path: logs-agent/HttpDestinationStats/logs_2_reliable_0/idleMs
type: rate
alias: batch_strategy.idle_time
tags:
- sender:2
- path: logs-agent/HttpDestinationStats/logs_2_reliable_0/inUseMs
type: rate
alias: batch_strategy.in_use_time
tags:
- sender:2
- path: logs-agent/HttpDestinationStats/logs_3_reliable_0/idleMs
type: rate
alias: batch_strategy.idle_time
tags:
- sender:3
- path: logs-agent/HttpDestinationStats/logs_3_reliable_0/inUseMs
type: rate
alias: batch_strategy.in_use_time
tags:
- sender:3
# Instance configuration for the custom kf_openmetrics check defined in
# checksd above; runs every 5 minutes.
kf_openmetrics.yaml: |-
init_config:
instances:
- openmetrics_endpoint: http://%%host%%:%%port%%/metrics
min_collection_interval: 300
clusterAgent:
replicaCount: 1
enabled: true
datadog_cluster_yaml:
# Use the v2 intake API for events, series, and service checks.
use_v2_api:
events: true
series: true
service_checks: true
process_config:
#
# Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
#
process_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
#
# Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
#
# process_dd_url: "http://<ingress-ip>/ingester"
#
# Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
#
# process_dd_url: "https://customer.kloudfuse.io/ingester"
#
#
# Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
#
events_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
#
# Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
#
# events_dd_url: "http://<ingress-ip>/ingester"
#
# Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
#
# events_dd_url: "https://customer.kloudfuse.io/ingester"
#
container_collection:
enabled: false
orchestrator_explorer:
#
# Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
#
orchestrator_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse"
#
# Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
#
# orchestrator_dd_url: "http://<ingress-ip>"
#
# Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
#
# orchestrator_dd_url: "https://customer.kloudfuse.io"
#
manifest_collection:
enabled: false
admissionController:
enabled: false
agents:
image:
# Use the JMX-enabled agent image variant.
tagSuffix: jmx
useConfigMap: true
customAgentConfig:
skip_ssl_validation: false
enable_stream_payload_serialization: false
# Use the v2 intake API for events, series, and service checks.
use_v2_api:
events: true
series: true
service_checks: true
process_config:
#
# Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
#
process_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
#
# Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
#
# process_dd_url: "http://<ingress-ip>/ingester"
#
# Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
#
# process_dd_url: "https://customer.kloudfuse.io/ingester"
#
#
# Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
#
events_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
#
# Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
#
# events_dd_url: "http://<ingress-ip>/ingester"
#
# Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
#
# events_dd_url: "https://customer.kloudfuse.io/ingester"
#
container_collection:
enabled: false
orchestrator_explorer:
#
# Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
#
orchestrator_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse"
#
# Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
#
# orchestrator_dd_url: "http://<ingress-ip>"
#
# Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
#
# orchestrator_dd_url: "https://customer.kloudfuse.io"
#
manifest_collection:
enabled: false
logs_config:
#
# Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
#
logs_dd_url: "kfuse-ingress-nginx-controller-internal.kfuse:80"
logs_no_ssl: true
#
# Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
#
# logs_dd_url: "<ingress-ip>:80"
#
# Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io"). Make sure
# to comment out the logs_no_ssl: true default above.
#
# logs_dd_url: "customer.kloudfuse.io:443"
# logs_no_ssl: false
#
use_http: true
auto_multi_line_detection: true
use_v2_api: false
apm_config:
enabled: true
apm_non_local_traffic: true
#
# Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
#
# NOTE(review): default apm_dd_url below has no scheme while the commented
# scenarios use http(s):// -- confirm which form the agent expects here.
apm_dd_url: "kfuse-ingress-nginx-controller-internal.kfuse:80/ingester"
#
# Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
#
# apm_dd_url: "http://<ingress-ip>/ingester"
#
# Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
#
# apm_dd_url: "https://customer.kloudfuse.io/ingester"
#
# Report host metadata every 5 minutes.
metadata_providers:
- name: host
interval: 300
End of datadog-values.yaml example.