Quick Guide for Datadog Agent on Kubernetes for Metrics
Add the Datadog Helm repository:

helm repo add datadog https://helm.datadoghq.com
helm repo update
Configure datadog-values.yaml. For pod-level metrics, add Datadog annotations to enable scraping.

Base configuration:
fullnameOverride: "kfuse-agent"
nameOverride: "kfuse-agent"
datadog:
  apiKey: " "
  # # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
  # dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
  # # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
  # # dd_url: "http://<ingress-ip>/ingester"
  # # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
  # # dd_url: "https://customer.kloudfuse.io/ingester"
Enable metrics ingestion.
fullnameOverride: "kfuse-agent"
nameOverride: "kfuse-agent"
datadog:
  logsEnabled: true
  logs:
    enabled: true
    containerCollectAll: true
    containerCollectUsingFiles: true
    autoMultiLineDetection: true
  # # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster
  # dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester"
  # # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster
  # # dd_url: "http://<ingress-ip>/ingester"
  # # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io")
  # # dd_url: "https://customer.kloudfuse.io/ingester"
  #
  kubeStateMetricsEnabled: false
  kubeStateMetricsCore:
    enabled: true
    ignoreLegacyKSMCheck: true
  networkMonitoring:
    enabled: false
  orchestratorExplorer:
    enabled: true
  # NOTE(review): hardcoded API key committed to docs — replace with your own key and
  # do not commit real keys to version control.
  apiKey: "798999171d8a0e80030b20366f2994f4"
  env:
    - name: DD_CHECKS_TAG_CARDINALITY
      value: "orchestrator"
    - name: DD_CONTAINER_INCLUDE
      value: "kube_namespace:.*"
    - name: DD_CONTAINER_INCLUDE_LOGS
      value: "kube_namespace:.*"
    - name: DD_CONTAINER_EXCLUDE_METRICS
      value: "name:gclog.*"
    # JSON map values must be single-quoted so the embedded double quotes survive YAML parsing.
    - name: DD_KUBERNETES_POD_LABELS_AS_TAGS
      value: '{"*":"%%label%%"}'
    - name: DD_KUBERNETES_NODE_LABELS_AS_TAGS
      value: '{"*":"%%label%%"}'
    - name: DD_LOGS_CONFIG_AUTO_MULTI_LINE_DEFAULT_MATCH_THRESHOLD
      value: "0.01"
    # Regex patterns: backslashes restored (\d, \s, \.) — they were stripped during extraction.
    - name: "DD_LOGS_CONFIG_AUTO_MULTI_LINE_EXTRA_PATTERNS"
      value: '^\d{4}/\d{2}/\d{2}\s\d{2}:\d{2}:\d{2}(\.\d+)? ^\d{4}-\d{2}-\d{2}\s\d{2}:\d{2}:\d{2}(\.\d+)? ^\d+:\d+:\d+(\.\d+)?'
apm: enabled: true portEnabled: true dogstatsd: port: 8126 useHostPort: false processAgent: enabled: true prometheusScrape: enabled: true version: 1 additionalConfigs: - configurations: - send_monotonic_counter: false send_distribution_counts_as_monotonic: false send_distribution_sums_as_monotonic: false send_histograms_buckets: true max_returned_metrics: 999999 min_collection_interval: 15 autodiscovery: kubernetes_annotations: exclude: app: knight prometheus.io/scrape: "false" - configurations: - send_monotonic_counter: false send_distribution_counts_as_monotonic: false send_distribution_sums_as_monotonic: false send_histograms_buckets: true max_returned_metrics: 999999 min_collection_interval: 60 autodiscovery: kubernetes_container_names: - knight checksd: kf_openmetrics.py: |- from datadog_checks.base import AgentCheck, OpenMetricsBaseCheckV2 import os, requests, json, time col_names = ['metric_name', 'metric_type', 'description'] METRIC_TYPE_EXPANSIONS = { 'histogram' : # prom style metric to dd style metric conversions for histogram metrics { 'count' : {'type': 'count', 'desc_prefix' : 'Count of'}, 'sum' : {'type': 'count', 'desc_prefix' : 'Sum of'}, 'bucket' : {'type': 'count', 'desc_prefix' : 'Buckets of samples of'}, 'min' : {'type': 'gauge', 'desc_prefix' : 'Minimum value of'}, 'max' : {'type': 'gauge', 'desc_prefix' : 'Maximum value of'}, }, 'summary' : # prom style metric to dd style metric conversions for summary metrics { 'count' : {'type': 'count', 'desc_prefix' : 'Count of'}, 'sum' : {'type': 'count', 'desc_prefix' : 'Sum of'}, 'quantile' : {'type': 'gauge', 'desc_prefix' : 'Various quantile values of'}, } } HELP_TEXT = '# HELP' TYPE_TEXT = '# TYPE' DEFAULT_URL = 'http://%%host%%:%%port%%/metrics' class KfOpenMetricsCheck(AgentCheck): def __init__(self, name, init_config, instances): super(KfOpenMetricsCheck, self).__init__(name, init_config, instances) def get_default_config(self): return {} def scrape_endpoint(self, scrape_url): response = 
requests.get(scrape_url) if response.status_code != 200: raise Exception('failed to scrape endpoint: %s (error: %s)', scrape_url, response) return response.content.decode('utf-8') def prepare_mds(self, data): mds = {} content = data.splitlines() def _get_info(line, prefix_text): items = line[len(prefix_text):].split() return items[0], ' '.join(items[1:]) def _add_info(metric_name, info, value): if metric_name not in mds: mds.update({metric_name:{}}) mds[metric_name][info] = value def _expand_info(metric_name): if mds[metric_name]['type'] not in METRIC_TYPE_EXPANSIONS: return extensions = METRIC_TYPE_EXPANSIONS[mds[metric_name]['type']] for ext_name, extension in extensions.items(): prefix = "" suffix = "" try: prefix = extension['desc_prefix'] except KeyError: pass try: suffix = extension['desc_suffix'] except KeyError: pass mds.update({'%s_%s'% (mname, ext_name): {'type' : extension['type'], 'desc' : ('%s %s %s' % (prefix, mdesc, suffix)).strip()}}) for l in content: if l.startswith(HELP_TEXT): mname, mdesc = _get_info(l, HELP_TEXT) _add_info(mname, 'desc', mdesc) if l.startswith(TYPE_TEXT): mname, mtype = _get_info(l, TYPE_TEXT) _add_info(mname, 'type', mtype) _expand_info(mname) return mds def check(self, instance): scrape_url = '' try: scrape_url = instance.get('openmetrics_endpoint') if scrape_url != DEFAULT_URL: self.log.info("running kf_openmetrics check for instance {}".format(instance)) data = self.scrape_endpoint(scrape_url) mds = self.prepare_mds(data) self.log.debug("prepared mds len:{}".format(len(mds))) self.event( { "timestamp": time.time(), "event_type": "OpenMetricsMetadata", "msg_title" : "OpenMetricsMetadata", "msg_text": json.dumps(mds), } ) else: self.log.info("skipping kf_openmetrics check for instance {}".format(instance)) except KeyError as ke: self.log.info("failed to scrape url={}, exc={}".format(scrape_url, ke)) except requests.exceptions.InvalidURL as ie: self.log.info("invalid url={}, instance={}, exc={}".format(scrape_url, instance, 
ie)) confd: go_expvar.yaml: |- init_config: instances: - expvar_url: http://localhost:6000/debug/vars # if you've defined `expvar_port` in datadog.yaml, change the port here to that value namespace: datadog.agent metrics: - path: forwarder/Transactions/RetryQueueSize - path: forwarder/Transactions/Success type: rate - path: forwarder/Transactions/Errors type: rate - path: forwarder/Transactions/ErrorsByType/DNSErrors type: rate - path: forwarder/Transactions/ErrorsByType/TLSErrors type: rate - path: forwarder/Transactions/ErrorsByType/ConnectionErrors type: rate - path: forwarder/Transactions/ErrorsByType/WroteRequestErrors type: rate - path: forwarder/Transactions/ErrorsByType/SentRequestErrors type: rate - path: forwarder/Transactions/HTTPErrors type: rate - path: dogstatsd-udp/Packets type: rate - path: dogstatsd-udp/PacketReadingErrors type: rate - path: dogstatsd-uds/Packets type: rate - path: dogstatsd-uds/PacketReadingErrors type: rate - path: dogstatsd-uds/OriginDetectionErrors type: rate - path: dogstatsd/ServiceCheckParseErrors type: rate - path: dogstatsd/ServiceCheckPackets type: rate - path: dogstatsd/EventParseErrors type: rate - path: dogstatsd/EventPackets type: rate - path: dogstatsd/MetricParseErrors type: rate - path: dogstatsd/MetricPackets type: rate - path: aggregator/Flush/ChecksMetricSampleFlushTime/LastFlush - path: aggregator/Flush/ServiceCheckFlushTime/LastFlush - path: aggregator/Flush/EventFlushTime/LastFlush - path: aggregator/Flush/MetricSketchFlushTime/LastFlush - path: aggregator/Flush/MainFlushTime/LastFlush - path: aggregator/FlushCount/ServiceChecks/LastFlush - path: aggregator/FlushCount/Series/LastFlush - path: aggregator/FlushCount/Events/LastFlush - path: aggregator/FlushCount/Sketches/LastFlush - path: aggregator/DogstatsdContexts - path: aggregator/SeriesFlushed type: rate - path: aggregator/ServiceCheckFlushed type: rate - path: aggregator/EventsFlushed type: rate - path: aggregator/NumberOfFlush type: rate - path: 
aggregator/DogstatsdMetricSample type: rate - path: aggregator/ChecksMetricSample type: rate - path: aggregator/ServiceCheck type: rate - path: aggregator/Event type: rate - path: aggregator/HostnameUpdate type: rate - path: scheduler/ChecksEntered - path: scheduler/Queues/.*/Size alias: scheduler.queues.size - path: scheduler/Queues/.*/Interval alias: scheduler.queues.interval - path: scheduler/QueuesCount - path: splitter/NotTooBig type: rate - path: splitter/TooBig type: rate - path: splitter/TotalLoops type: rate - path: splitter/PayloadDrops type: rate - path: logs-agent/IsRunning type: gauge - path: logs-agent/DestinationErrors type: rate - path: logs-agent/LogsDecoded type: rate - path: logs-agent/LogsProcessed type: rate - path: logs-agent/LogsSent type: rate - path: logs-agent/BytesSent type: rate - path: logs-agent/EncodedBytesSent type: rate - path: logs-agent/HttpDestinationStats/logs_0_reliable_0/idleMs type: rate alias: batch_strategy.idle_time tags: - sender:0 - path: logs-agent/HttpDestinationStats/logs_0_reliable_0/inUseMs type: rate alias: batch_strategy.in_use_time tags: - sender:0 - path: logs-agent/HttpDestinationStats/logs_1_reliable_0/idleMs type: rate alias: batch_strategy.idle_time tags: - sender:1 - path: logs-agent/HttpDestinationStats/logs_1_reliable_0/inUseMs type: rate alias: batch_strategy.in_use_time tags: - sender:1 - path: logs-agent/HttpDestinationStats/logs_2_reliable_0/idleMs type: rate alias: batch_strategy.idle_time tags: - sender:2 - path: logs-agent/HttpDestinationStats/logs_2_reliable_0/inUseMs type: rate alias: batch_strategy.in_use_time tags: - sender:2 - path: logs-agent/HttpDestinationStats/logs_3_reliable_0/idleMs type: rate alias: batch_strategy.idle_time tags: - sender:3 - path: logs-agent/HttpDestinationStats/logs_3_reliable_0/inUseMs type: rate alias: batch_strategy.in_use_time tags: - sender:3 kf_openmetrics.yaml: |- init_config: instances: - openmetrics_endpoint: http://%%host%%:%%port%%/metrics 
min_collection_interval: 300 clusterAgent: replicaCount: 1 enabled: true datadog_cluster_yaml: use_v2_api: events: true series: true service_checks: true process_config: # # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster # process_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester" # # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster # # process_dd_url: "http://<ingress-ip>/ingester" # # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at “customer.kloudfuse.io”) # # process_dd_url: "https://customer.kloudfuse.io/ingester" # # # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster # events_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester" # # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster # # events_dd_url: "http://<ingress-ip>/ingester" # # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at “customer.kloudfuse.io”) # # events_dd_url: "https://customer.kloudfuse.io/ingester" # container_collection: enabled: false orchestrator_explorer: # # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster # orchestrator_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse" # # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster # # orchestrator_dd_url: "http://<ingress-ip>" # # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at “customer.kloudfuse.io”) # # orchestrator_dd_url: "https://customer.kloudfuse.io" # manifest_collection: enabled: false admissionController: enabled: false agents: image: tagSuffix: jmx useConfigMap: true customAgentConfig: skip_ssl_validation: false enable_stream_payload_serialization: false use_v2_api: events: true series: true service_checks: true process_config: # # Scenario default: Assumes that Kloudfuse stack & agent both in 
same VPC and in same K8S cluster # process_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester" # # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster # # process_dd_url: "http://<ingress-ip>/ingester" # # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at “customer.kloudfuse.io”) # # process_dd_url: "https://customer.kloudfuse.io/ingester" # # # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster # events_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse/ingester" # # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster # # events_dd_url: "http://<ingress-ip>/ingester" # # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at “customer.kloudfuse.io”) # # events_dd_url: "https://customer.kloudfuse.io/ingester" # container_collection: enabled: false orchestrator_explorer: # # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster # orchestrator_dd_url: "http://kfuse-ingress-nginx-controller-internal.kfuse" # # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster # # orchestrator_dd_url: "http://<ingress-ip>" # # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at “customer.kloudfuse.io”) # # orchestrator_dd_url: "https://customer.kloudfuse.io" # manifest_collection: enabled: false logs_config: # # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster # logs_dd_url: "kfuse-ingress-nginx-controller-internal.kfuse:80" logs_no_ssl: true # # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster # # logs_dd_url: "<ingress-ip>:80" # # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at “customer.kloudfuse.io”). Make sure # to comment out the logs_no_ssl: true default above. 
# # logs_dd_url: "customer.kloudfuse.io:443" # logs_no_ssl: false # use_http: true auto_multi_line_detection: true use_v2_api: false apm_config: enabled: true apm_non_local_traffic: true # # Scenario default: Assumes that Kloudfuse stack & agent both in same VPC and in same K8S cluster # apm_dd_url: "kfuse-ingress-nginx-controller-internal.kfuse:80/ingester" # # Scenario 1: Kloudfuse stack & agent both in same VPC, but in different K8S cluster # # apm_dd_url: "http://<ingress-ip>/ingester" # # Scenario 2: Kloudfuse stack hosted in a different VPC (hosted at "customer.kloudfuse.io") # # apm_dd_url: "https://customer.kloudfuse.io/ingester" # metadata_providers: - name: host interval: 300