health.conf

The following are the spec and example files for health.conf.

health.conf.spec

#   Version 9.4.0
#
# This file sets the default thresholds for Splunk Enterprise's built
# in Health Report.
#
# Feature stanzas contain indicators, and each indicator has two thresholds:
# * Yellow: Indicates something is wrong and should be investigated.
# * Red: Means that the indicator is effectively not working.
#
# There is a health.conf in the $SPLUNK_HOME/etc/system/default/ directory.
# Never change or copy the configuration files in the default directory.
# The files in the default directory must remain intact and in their original
# location.
#
# To set custom configurations, create a new file with the name health.conf in
# the $SPLUNK_HOME/etc/system/local/ directory. Then add the specific settings
# that you want to customize to the local configuration file.
#
# To learn more about configuration files (including precedence), see the
# documentation located at
# http://docs.splunk.com/Documentation/Splunk/latest/Admin/Aboutconfigurationfiles

[distributed_health_reporter]

disabled = <boolean>
* Whether or not this Splunk platform instance calls connected search peers to
  retrieve health report information.
* A value of 1 disables the distributed health report on this Splunk platform
  instance. When disabled, the instance does not call connected search peers
  to retrieve health report information.
* Default: 0 (enabled)

[health_reporter]

full_health_log_interval = <number>
* The amount of time, in seconds, that elapses between each 'PeriodicHealthReporter=INFO' log entry.
* Default: 30.

suppress_status_update_ms = <number>
* The minimum amount of time, in milliseconds, that must elapse between an
  indicator's health status changes.
* Changes that occur earlier are suppressed.
* Default: 300.

suppress_status_reason_update_s = <number>
* The minimum amount of time, in seconds, that must elapse between
  changes to the reason for the indicator.
* Changes that occur earlier are suppressed.
* Default: 10.

latency_tracker_log_interval = <number>
* The amount of time, in seconds, that elapses between each latency tracker log entry.
* Default: 30.

aggregate_ingestion_latency_health = [0|1]
* A value of 0 disables the aggregation feature for ingestion latency health reporter.
* Default: 1 (enabled).

ingestion_latency_send_interval = <integer>
* The amount of time, in seconds, that splunkd waits before it sends ingestion 
  latency data as part of a heartbeat message.
* splunkd determines the actual interval at which it sends this data by factoring
  the value for 'ingestion_latency_send_interval' with the value for 'heartbeatFrequency' in 
  the [tcpout] stanza of the outputs.conf file. This is because splunkd uses the
  tcpout heartbeat to send ingestion latency data, and it won't send ingestion latency
  data more often than once every outputs.conf:[tcpout].'heartbeatFrequency' seconds.
  * If you set 'ingestion_latency_send_interval' to a value that is higher than
    'heartbeatFrequency', splunkd sends that data
    only when the number of 'heartbeatFrequency' seconds exceeds the number of
    'ingestion_latency_send_interval' seconds at each  
    'ingestion_latency_send_interval'.
  * For example: if 'ingestion_latency_send_interval' has a value of 75 and
    'heartbeatFrequency' has a value of 60, splunkd sends the data every
    120 seconds, because it takes two periods of 'heartbeatFrequency' 
    seconds before the 'heartbeatFrequency' is greater than the 
    'ingestion_latency_send_interval'.
  * Conversely, if you set 'ingestion_latency_send_interval' to a value that is lower than
    'heartbeatFrequency', splunkd sends that data only when the number of
    'ingestion_latency_send_interval' seconds has elapsed.
  * If, for example, 'ingestion_latency_send_interval' has a value of 30 and
    'heartbeatFrequency' has a value of 90, splunkd sends the data every
    90 seconds because of the value of 'heartbeatFrequency', even though you set a
    'ingestion_latency_send_interval' of 30.
* Default: 30

ingestion_latency_send_interval_max = <number>
* The maximum amount of time, in seconds, that can elapse between sends of
  ingestion latency data as part of a heartbeat message.
* The value must be in the range 0-86400.
* Default: 86400.

alert.disabled = [0|1]
* A value of 1 disables the alerting feature for health reporter.
* If the value is set to 1, alerting for all features is disabled.
* Default: 0 (enabled)

alert.actions = <string>
* The alert actions that will run when an alert is fired.

alert.min_duration_sec = <integer>
* The minimum amount of time, in seconds, that the health status color must
  persist within threshold_color before triggering an alert.
* Default: 60.

alert.threshold_color = [yellow|red]
* The health status color that will trigger an alert.
* Default: red.

alert.suppress_period = <integer>[m|s|h|d]
* The minimum amount of time, in [minutes|seconds|hours|days], that must
  elapse between each fired alert.
* Alerts that occur earlier will be sent as a batch after this time period
  elapses.
* Default: 10m

[clustering]

health_report_period = <number>
* The amount of time, in seconds, that elapses between each Clustering
  health report run.
* Default: 20.

disabled = <boolean>
* Whether or not the clustering feature health check is disabled.
* A value of 1 disables the clustering feature health check.
* Default: 0 (enabled)

[tree_view:health_subset]

* Defines a tree view for health features.
* Users with 'list_health_subset' capability can view features belonging
  to this tree view.
* Users with 'edit_health_subset' capability can edit thresholds for features
  belonging to this tree view.

[feature:*]

suppress_status_update_ms = <number>
* The minimum amount of time, in milliseconds, that must elapse between an indicator's
  health status changes.
* Changes that occur earlier are suppressed.
* Default: 300.

suppress_status_reason_update_s = <number>
* The minimum amount of time, in seconds, that must elapse between
  changes to the reason for the indicator.
* Changes that occur earlier are suppressed.
* Default: 3.

display_name = <string>
* A human readable name for the feature.

distributed_disabled = <boolean>
* Whether or not the distributed health report (DHR) tree view includes 
  information about this feature.
* A value of "true" means that the DHR does not include this feature in
  its tree view, which means you won't see it when you open the Health Report
  in Splunk Web.
  * This value doesn't apply to the ability of the feature to
    generate alerts, as appropriate.
* A value of "false" means that the DHR includes this feature in
  its tree view.
* Default: 0 (false)

snooze_end_time = <number>
* Determines the snooze end time, in seconds since the epoch (Unix time), for this feature.
  Specifying a value for this setting enables a snooze period that suppresses color changes 
  for a feature until the <snooze_end_time>.
* A value of 0 disables snoozing for this feature.
* Default: 0

alert.disabled = <boolean>
* Whether or not alerting is disabled for this feature.
* A value of 1 disables alerting for this feature.
* If alerting is disabled in the [health_reporter] stanza, alerting for this feature is disabled,
  regardless of the value set here.
* Otherwise, if the value is set to 1, alerting for all indicators is disabled.
* Default: 0 (enabled)

alert.min_duration_sec = <integer>
* The minimum amount of time, in seconds, that the health status color must
  persist within threshold_color before triggering an alert.

alert.threshold_color = [yellow|red]
* The health status color to trigger an alert.
* Default: red.

friendly_description = <string>
* A general description to help the user determine what functionality is monitored
  by the health report indicator.

indicator:<indicator name>:friendly_description = <string>
* A general description of the technical behavior monitored by the indicator. 
  Use common terminology that a user can search on to find documentation, 
  details, or troubleshooting guidance.

indicator:<indicator name>:description = <string>
* Description of this indicator to help users to make basic decisions such as:
  Turning indicators on or off
  Adjusting the threshold of an indicator
  Turning on alerting for an indicator

indicator:<indicator name>:<indicator color> = <number>
* There are various indicator names. See your health.conf for the complete list.
* There are two valid colors: yellow and red.
* These settings should not be adjusted lightly. If the numbers are set too
  high, you might inadvertently mask serious errors that the Health Report is
  trying to bring to your attention.

alert:<indicator name>.disabled = [0|1]
* A value of 1 disables alerting for this indicator.
* Default: 0 (enabled)

alert:<indicator name>.min_duration_sec = <integer>
* The minimum amount of time, in seconds, that the health status color must
  persist within threshold_color before triggering an alert.

alert:<indicator name>.threshold_color = [yellow|red]
* The health status color to trigger an alert.

tree_view:health_subset = [enabled | disabled]
* Indicates that this feature belongs to the 'health_subset' tree view.

[alert_action:*]

disabled = [0|1]
* A value of 1 disables this alert action.
* Default: 0 (enabled)

action.<action parameter> = <string>
* There are various parameters for different alert actions.
* Each value defines one parameter for the alert action.

* NOTE: The [feature:master_connectivity], [feature:slave_state], and
*       [feature:slave_version] stanzas are now DEPRECATED.

health.conf.example

# Version 9.4.0
#
# This file contains an example health.conf. Use this file to configure thresholds
# for Splunk Enterprise's built in Health Report.
#
# To use one or more of these configurations, copy the configuration block
# into health.conf in $SPLUNK_HOME/etc/system/local/. You must restart
# Splunk to enable configurations.

[health_reporter]
# Every 30 seconds a new 'PeriodicHealthReporter=INFO' log entry will be created.
full_health_log_interval = 30
# If an indicator's health status changes before 600 milliseconds elapses,
# the status change is suppressed.
suppress_status_update_ms = 600
# If the reason for the indicator changes before 3 seconds elapses,
# the reason change is suppressed.
suppress_status_reason_update_s = 3
# Alerting for all features is enabled.
# You can disable alerting for each feature by setting 'alert.disabled' to 1.
alert.disabled = 0

# If you don't want to send alerts too frequently, you can define a minimum
# time period that must elapse before another alert is fired. Alerts triggered
# during the suppression period are sent after the period expires as a batch.
# The suppress_period value can be in seconds, minutes, hours, and days, and
# uses the format: 60s, 60m, 60h and 60d.
# Default is 10 minutes.
alert.suppress_period = 30m

[alert_action:email]
# Enable email alerts for the health report.
# Before you can send an email alert, you must configure the email notification
# settings on the email settings page.
# In the 'Search and Reporting' app home page, click Settings > Server settings
# > Email settings, and specify values for the settings.
# After you configure email settings, click Settings > Alert actions.
# Make sure that the 'Send email' option is enabled.
disabled = 0

# Define recipients when an email alert is triggered.
# You can define 'to', 'cc', and 'bcc' recipients.
# For multiple recipients in a list, separate email addresses with commas.
# If there is no recipient for a certain recipient type (e.g. bcc), leave the value blank.
action.to = admin_1@testcorp.example, admin_2@testcorp.example
action.cc = admin_3@testcorp.example, admin_4@testcorp.example
action.bcc =

[alert_action:pagerduty]
# Enable PagerDuty alerts for the health report.
# Before you can send an alert to PagerDuty, you must configure some settings
# on both the PagerDuty side and the Splunk Enterprise side.
# In PagerDuty, you must add a service to save your new integration.
# From the Integrations tab of the created service, copy the Integration Key
# string to the 'action.integration_url_override' below.
# On the Splunk side, you must install the PagerDuty Incidents app from
# Splunkbase.
# After you install the app, in Splunk Web, click Settings > Alert actions.
# Make sure that the PagerDuty app is enabled.
disabled = 0
action.integration_url_override = 123456789012345678901234567890ab

[alert_action:mobile]
# Enable Splunk Mobile alerts for the health report.
# You need to configure the 'alert_recipients' under this stanza in order to
# send health report alerts to the Splunk Mobile app on your phone.
#
# Steps to set up the health report mobile alert:
# * Download the Splunk Mobile App on your phone and open the app.
# * Download the Cloud Gateway App from Splunkbase to your splunk instance.
# * In Splunk Web, click Settings > Alert actions and make sure the Cloud
# Gateway App is enabled.
# * In Splunk Web, click Cloud Gateway App > Configure and enable Splunk
# Mobile.
# * In Splunk Web, click Cloud Gateway App > Register and copy the activation
# code displayed in the Splunk Mobile App to register your device(phone).
# * In health.conf configure 'alert_recipients' under the [alert_action:mobile]
# stanza, e.g. action.alert_recipients = admin
#
# For details on how to install and use the Cloud Gateway App, refer to
# https://docs.splunk.com/Documentation/Gateway
disabled = 0
action.alert_recipients = admin

[alert_action:victorops]
# Enable VictorOps alerts for the health report.
# Before you can send an alert to VictorOps, you must configure some settings
# on both the VictorOps side and the Splunk Enterprise side.
# In VictorOps, you must create an API key and can optionally create a routing key.
# On the Splunk side, you must install the VictorOps App from Splunkbase.
# After you install the app, in Splunk Web, click Settings > Alert actions.
# Make sure that the VictorOps app is enabled and the API key is properly configured.
disabled = 0
# alert message type in VictorOps.
# Valid alert message types in VictorOps:
# * CRITICAL - Triggers an incident.
# * WARNING - May trigger an incident, depending on your settings in VictorOps.
# * ACKNOWLEDGEMENT - Acknowledges an incident. This value is unlikely to be useful.
# * INFO - Creates a timeline event, but does not trigger an incident.
# * RECOVERY - Resolves an incident. This value is unlikely to be useful.
action.message_type = CRITICAL
# ID of the incident in VictorOps.
# Optional.
action.entity_id =
# Use this field to choose one of the API keys configured in passwords.conf
# under victorops_app.
# Leave this field empty if you want to use the default API key.
# Optional.
action.record_id =
# Use this field to overwrite the default routing key.
# Optional.
action.routing_key_override =

[clustering]
# The clustering health report runs every 20 seconds.
health_report_period = 20
# Enable the clustering feature health check.
disabled = 0

[feature:s2s_autolb]
# If more than 20% of forwarding destinations have failed, health status changes to yellow.
indicator:s2s_connections:yellow = 20
# If more than 70% of forwarding destinations have failed, health status changes to red.
indicator:s2s_connections:red = 70
# Alerting for all indicators is disabled.
alert.disabled = 1

[feature:batchreader]
# Enable alerts for feature:batchreader. If there is no 'alert.disabled' value
# specified in a feature stanza, then the alert is enabled for the feature by
# default.
# You can also enable/disable alerts at the indicator level, using the setting:
# 'alert:<indicator name>.disabled'.
alert.disabled = 0

# You can define which color triggers an alert.
# If the value is yellow, both yellow and red trigger an alert.
# If the value is red, only red triggers an alert.
# Default value is red.
# You can also define the threshold_color for each indicator using the setting:
# 'alert:<indicator name>.threshold_color'.
# Indicator level setting overrides the feature level threshold_color setting.
alert.threshold_color = red

# You can define the duration that an unhealthy status persists before the alert fires.
# Default value is 60 seconds.
# You can also define the min_duration_sec for each indicator using the setting:
# 'alert:<indicator name>.min_duration_sec'.
# Indicator level setting overrides feature level min_duration_sec setting.
alert.min_duration_sec = 30

# Suppresses color changes for this feature until March 25, 2021 8:00:00 PM GMT.
snooze_end_time = 1616702400

Related answers from Splunk Community

health.conf

health.conf.spec

[distributed_health_reporter]

[health_reporter]

[clustering]

[tree_view:health_subset]

[feature:*]

[alert_action:*]

health.conf.example

Comments

health.conf

Was this topic useful?