diff --git a/docs/en/stack/ml/anomaly-detection/functions/ml-count-functions.asciidoc b/docs/en/stack/ml/anomaly-detection/functions/ml-count-functions.asciidoc
deleted file mode 100644
index 54298e80b..000000000
--- a/docs/en/stack/ml/anomaly-detection/functions/ml-count-functions.asciidoc
+++ /dev/null
@@ -1,285 +0,0 @@
-[[ml-count-functions]]
-= Count functions
-
-Count functions detect anomalies when the number of events in a bucket is
-anomalous.
-
-Use `non_zero_count` functions if your data is sparse and you want to ignore
-cases where the bucket count is zero.
-
-Use `distinct_count` functions to determine when the number of distinct values
-in one field is unusual, as opposed to the total count.
-
-Use high-sided functions if you want to monitor unusually high event rates.
-Use low-sided functions if you want to look at drops in event rate.
-
-The {ml-features} include the following count functions:
-
-* xref:ml-count[`count`, `high_count`, `low_count`]
-* xref:ml-nonzero-count[`non_zero_count`, `high_non_zero_count`, `low_non_zero_count`]
-* xref:ml-distinct-count[`distinct_count`, `high_distinct_count`, `low_distinct_count`]
-
-[discrete]
-[[ml-count]]
-== Count, high_count, low_count
-
-The `count` function detects anomalies when the number of events in a bucket is
-anomalous.
-
-The `high_count` function detects anomalies when the count of events in a bucket
-is unusually high.
-
-The `low_count` function detects anomalies when the count of events in a bucket
-is unusually low.
-
-These functions support the following properties:
-
-* `by_field_name` (optional)
-* `over_field_name` (optional)
-* `partition_field_name` (optional)
-
-For more information about those properties, see the
-{ref}/ml-put-job.html#ml-put-job-request-body[create {anomaly-jobs} API].
-
-.Example 1: Analyzing events with the count function
-[source,console]
---------------------------------------------------
-PUT _ml/anomaly_detectors/example1
-{
-  "analysis_config": {
-    "detectors": [{
-      "function" : "count"
-    }]
-  },
-  "data_description": {
-    "time_field":"timestamp",
-    "time_format": "epoch_ms"
-  }
-}
---------------------------------------------------
-// TEST[skip:needs-licence]
-
-This example is probably the simplest possible analysis. It identifies
-time buckets during which the overall count of events is higher or lower than
-usual.
-
-When you use this function in a detector in your {anomaly-job}, it models the
-event rate and detects when the event rate is unusual compared to its past
-behavior.
-
-.Example 2: Analyzing errors with the high_count function
-[source,console]
---------------------------------------------------
-PUT _ml/anomaly_detectors/example2
-{
-  "analysis_config": {
-    "detectors": [{
-      "function" : "high_count",
-      "by_field_name" : "error_code",
-      "over_field_name": "user"
-    }]
-  },
-  "data_description": {
-    "time_field":"timestamp",
-    "time_format": "epoch_ms"
-  }
-}
---------------------------------------------------
-// TEST[skip:needs-licence]
-
-If you use this `high_count` function in a detector in your {anomaly-job}, it
-models the event rate for each error code. It detects users that generate an
-unusually high count of error codes compared to other users.
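-
-You can also declare `user` as an influencer in the same job so that results
-can be attributed to the users that contributed most to the anomalies. The
-following is a minimal sketch of that `analysis_config`, not a complete
-request; choosing `user` as the influencer is an illustrative assumption:
-
-[source,js]
---------------------------------------------------
-{
-  "analysis_config": {
-    "detectors": [{
-      "function" : "high_count",
-      "by_field_name" : "error_code",
-      "over_field_name": "user"
-    }],
-    "influencers": [ "user" ]
-  }
-}
---------------------------------------------------
-// NOTCONSOLE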
-
-
-.Example 3: Analyzing status codes with the low_count function
-[source,console]
---------------------------------------------------
-PUT _ml/anomaly_detectors/example3
-{
-  "analysis_config": {
-    "detectors": [{
-      "function" : "low_count",
-      "by_field_name" : "status_code"
-    }]
-  },
-  "data_description": {
-    "time_field":"timestamp",
-    "time_format": "epoch_ms"
-  }
-}
---------------------------------------------------
-// TEST[skip:needs-licence]
-
-In this example, the function detects when the count of events for a status code
-is lower than usual.
-
-When you use this function in a detector in your {anomaly-job}, it models the
-event rate for each status code and detects when a status code has an unusually
-low count compared to its past behavior.
-
-.Example 4: Analyzing aggregated data with the count function
-[source,console]
---------------------------------------------------
-PUT _ml/anomaly_detectors/example4
-{
-  "analysis_config": {
-    "summary_count_field_name" : "events_per_min",
-    "detectors": [{
-      "function" : "count"
-    }]
-  },
-  "data_description": {
-    "time_field":"timestamp",
-    "time_format": "epoch_ms"
-  }
-}
---------------------------------------------------
-// TEST[skip:needs-licence]
-
-If you are analyzing an aggregated `events_per_min` field, do not use a sum
-function (for example, `sum(events_per_min)`). Instead, use the count function
-and the `summary_count_field_name` property. For more information, see
-<>.
-
-[discrete]
-[[ml-nonzero-count]]
-== Non_zero_count, high_non_zero_count, low_non_zero_count
-
-The `non_zero_count` function detects anomalies when the number of events in a
-bucket is anomalous, but it ignores cases where the bucket count is zero. Use
-this function if you know your data is sparse or has gaps and the gaps are not
-important.
-
-The `high_non_zero_count` function detects anomalies when the number of events
-in a bucket is unusually high and it ignores cases where the bucket count is
-zero.
-
-The `low_non_zero_count` function detects anomalies when the number of events in
-a bucket is unusually low and it ignores cases where the bucket count is zero.
-
-These functions support the following properties:
-
-* `by_field_name` (optional)
-* `partition_field_name` (optional)
-
-For more information about those properties, see the
-{ref}/ml-put-job.html#ml-put-job-request-body[create {anomaly-jobs} API].
-
-For example, if you have the following number of events per bucket:
-
-====
-
-1,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,43,31,0,0,0,0,0,0,0,0,0,0,0,0,2,1
-
-====
-
-The `non_zero_count` function models only the following data:
-
-====
-
-1,22,2,43,31,2,1
-
-====
-
-.Example 5: Analyzing signatures with the high_non_zero_count function
-[source,console]
---------------------------------------------------
-PUT _ml/anomaly_detectors/example5
-{
-  "analysis_config": {
-    "detectors": [{
-      "function" : "high_non_zero_count",
-      "by_field_name" : "signaturename"
-    }]
-  },
-  "data_description": {
-    "time_field":"timestamp",
-    "time_format": "epoch_ms"
-  }
-}
---------------------------------------------------
-// TEST[skip:needs-licence]
-
-If you use this `high_non_zero_count` function in a detector in your
-{anomaly-job}, it models the count of events for the `signaturename` field. It
-ignores any buckets where the count is zero and detects when a `signaturename`
-value has an unusually high count of events compared to its past behavior.
-
-NOTE: Population analysis (using an `over_field_name` property value) is not
-supported for the `non_zero_count`, `high_non_zero_count`, and
-`low_non_zero_count` functions. If you want to do population analysis and your
-data is sparse, use the `count` functions, which are optimized for that scenario.
-
-
-[discrete]
-[[ml-distinct-count]]
-== Distinct_count, high_distinct_count, low_distinct_count
-
-The `distinct_count` function detects anomalies where the number of distinct
-values in one field is unusual.
-
-The `high_distinct_count` function detects unusually high numbers of distinct
-values in one field.
-
-The `low_distinct_count` function detects unusually low numbers of distinct
-values in one field.
-
-These functions support the following properties:
-
-* `field_name` (required)
-* `by_field_name` (optional)
-* `over_field_name` (optional)
-* `partition_field_name` (optional)
-
-For more information about those properties, see the
-{ref}/ml-put-job.html#ml-put-job-request-body[create {anomaly-jobs} API].
-
-.Example 6: Analyzing users with the distinct_count function
-[source,console]
---------------------------------------------------
-PUT _ml/anomaly_detectors/example6
-{
-  "analysis_config": {
-    "detectors": [{
-      "function" : "distinct_count",
-      "field_name" : "user"
-    }]
-  },
-  "data_description": {
-    "time_field":"timestamp",
-    "time_format": "epoch_ms"
-  }
-}
---------------------------------------------------
-// TEST[skip:needs-licence]
-
-This `distinct_count` function detects when a system has an unusual number
-of logged-in users. When you use this function in a detector in your
-{anomaly-job}, it models the distinct count of users. It also detects when the
-distinct number of users is unusual compared to the past.
-
-.Example 7: Analyzing ports with the high_distinct_count function
-[source,console]
---------------------------------------------------
-PUT _ml/anomaly_detectors/example7
-{
-  "analysis_config": {
-    "detectors": [{
-      "function" : "high_distinct_count",
-      "field_name" : "dst_port",
-      "over_field_name": "src_ip"
-    }]
-  },
-  "data_description": {
-    "time_field":"timestamp",
-    "time_format": "epoch_ms"
-  }
-}
---------------------------------------------------
-// TEST[skip:needs-licence]
-
-This example detects instances of port scanning. When you use this function in a
-detector in your {anomaly-job}, it models the distinct count of ports. It also
-detects the `src_ip` values that connect to an unusually high number of
-different `dst_port` values compared to other `src_ip` values.
diff --git a/docs/en/stack/ml/anomaly-detection/functions/ml-functions.asciidoc b/docs/en/stack/ml/anomaly-detection/functions/ml-functions.asciidoc
deleted file mode 100644
index a44e3ceaa..000000000
--- a/docs/en/stack/ml/anomaly-detection/functions/ml-functions.asciidoc
+++ /dev/null
@@ -1,43 +0,0 @@
-[[ml-functions]]
-= Function reference
-
-The {ml-features} include analysis functions that provide a wide variety of
-flexible ways to analyze data for anomalies.
-
-When you create {anomaly-jobs}, you specify one or more detectors, which define
-the type of analysis that needs to be done. If you are creating your job by
-using {ml} APIs, you specify the functions in detector configuration objects,
-as shown in the sketch below.
-If you are creating your job in {kib}, you specify the functions differently
-depending on whether you are creating single metric, multi-metric, or advanced
-jobs.
-//For a demonstration of creating jobs in {kib}, see <>.
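-
-For example, a minimal sketch of a detector configuration object looks like
-the following; the `mean` function and the `responsetime` and `application`
-field names are illustrative assumptions, not requirements:
-
-[source,js]
---------------------------------------------------
-{
-  "function" : "mean",
-  "field_name" : "responsetime",
-  "by_field_name" : "application"
-}
---------------------------------------------------
-// NOTCONSOLE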
-
-Most functions detect anomalies in both low and high values. In statistical
-terminology, they apply a two-sided test. Some functions offer low and high
-variations (for example, `count`, `low_count`, and `high_count`). These variations
-apply one-sided tests, detecting anomalies only when the values are low or
-high, depending on which alternative is used.
-
-You can specify a `summary_count_field_name` with any function except `metric`.
-When you use `summary_count_field_name`, the {ml} features expect the input
-data to be pre-aggregated. The value of the `summary_count_field_name` field
-must contain the count of raw events that were summarized. In {kib}, use the
-**summary_count_field_name** in advanced {anomaly-jobs}. Analyzing aggregated
-input data provides a significant boost in performance. For more information, see
-<>.
-
-If your data is sparse, there may be gaps in the data, which means you might have
-empty buckets. You might want to treat these as anomalies or you might want these
-gaps to be ignored. Your decision depends on your use case and what is important
-to you. It also depends on which functions you use. The `sum` and `count`
-functions are strongly affected by empty buckets. For this reason, there are
-`non_null_sum` and `non_zero_count` functions, which are tolerant of sparse data.
-These functions effectively ignore empty buckets.
-
-* <>
-* <>
-* <>
-* <>
-* <>
-* <>
-* <>
diff --git a/docs/en/stack/ml/anomaly-detection/functions/ml-geo-functions.asciidoc b/docs/en/stack/ml/anomaly-detection/functions/ml-geo-functions.asciidoc
deleted file mode 100644
index 6c5f075ab..000000000
--- a/docs/en/stack/ml/anomaly-detection/functions/ml-geo-functions.asciidoc
+++ /dev/null
@@ -1,85 +0,0 @@
-[[ml-geo-functions]]
-= Geographic functions
-
-The geographic functions detect anomalies in the geographic location of the
-input data.
-
-The {ml-features} include the following geographic function: `lat_long`.
-
-NOTE: You cannot create forecasts for {anomaly-jobs} that contain geographic
-functions. You also cannot add rules with conditions to detectors that use
-geographic functions.
-
-[discrete]
-[[ml-lat-long]]
-== Lat_long
-
-The `lat_long` function detects anomalies in the geographic location of the
-input data.
-
-This function supports the following properties:
-
-* `field_name` (required)
-* `by_field_name` (optional)
-* `over_field_name` (optional)
-* `partition_field_name` (optional)
-
-For more information about those properties, see the
-{ref}/ml-put-job.html#ml-put-job-request-body[create {anomaly-jobs} API].
-
-.Example 1: Analyzing transactions with the lat_long function
-[source,console]
---------------------------------------------------
-PUT _ml/anomaly_detectors/example1
-{
-  "analysis_config": {
-    "detectors": [{
-      "function" : "lat_long",
-      "field_name" : "transaction_coordinates",
-      "by_field_name" : "credit_card_number"
-    }]
-  },
-  "data_description": {
-    "time_field":"timestamp",
-    "time_format": "epoch_ms"
-  }
-}
---------------------------------------------------
-// TEST[skip:needs-licence]
-
-If you use this `lat_long` function in a detector in your {anomaly-job}, it
-detects anomalies where the geographic location of a credit card transaction is
-unusual for a particular customer’s credit card. An anomaly might indicate
-fraud.
-
-A "typical" value indicates a centroid of a cluster of previously observed
-locations that is closest to the "actual" location at that time. For example,
-there may be one centroid near the person's home that is associated with the
-cluster of local grocery stores and restaurants, and another centroid near the
-person's work associated with the cluster of lunch and coffee places.
-
-IMPORTANT: The `field_name` that you supply must be a single string that
-contains two comma-separated numbers of the form `latitude,longitude`, a
-`geo_point` field, a `geo_shape` field that contains point values, or a
-`geo_centroid` aggregation. The `latitude` and `longitude` must be in the range
--180 to 180 and represent a point on the surface of the Earth.
-
-For example, JSON data might contain the following transaction coordinates:
-
-[source,js]
---------------------------------------------------
-{
-  "time": 1460464275,
-  "transaction_coordinates": "40.7,-74.0",
-  "credit_card_number": "1234123412341234"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-In {es}, location data is likely to be stored in `geo_point` fields. For more
-information, see {ref}/geo-point.html[`geo_point` data type]. This data type is
-supported natively in {ml-features}. Specifically, when pulling data from a
-`geo_point` field, a {dfeed} transforms the data into the appropriate
-`lat,lon` string format before sending it to the {anomaly-job}.
-
-For more information, see <>.
diff --git a/docs/en/stack/ml/anomaly-detection/functions/ml-info-functions.asciidoc b/docs/en/stack/ml/anomaly-detection/functions/ml-info-functions.asciidoc
deleted file mode 100644
index d1cbf39cd..000000000
--- a/docs/en/stack/ml/anomaly-detection/functions/ml-info-functions.asciidoc
+++ /dev/null
@@ -1,90 +0,0 @@
-[[ml-info-functions]]
-= Information content functions
-
-The information content functions detect anomalies in the amount of information
-that is contained in strings within a bucket. These functions can be used as
-a more sophisticated method to identify incidences of data exfiltration or
-command-and-control (C2) activity, when analyzing the size in bytes of the data
-might not be sufficient.
-
-The {ml-features} include the following information content functions:
-
-* `info_content`, `high_info_content`, `low_info_content`
-
-[discrete]
-[[ml-info-content]]
-== Info_content, high_info_content, low_info_content
-
-The `info_content` function detects anomalies in the amount of information that
-is contained in strings in a bucket.
-
-If you want to monitor for unusually high amounts of information,
-use `high_info_content`.
-If you want to look at drops in information content, use `low_info_content`.
-
-These functions support the following properties:
-
-* `field_name` (required)
-* `by_field_name` (optional)
-* `over_field_name` (optional)
-* `partition_field_name` (optional)
-
-For more information about those properties, see the
-{ref}/ml-put-job.html#ml-put-job-request-body[create {anomaly-jobs} API].
-
-.Example 1: Analyzing subdomain strings with the info_content function
-[source,js]
---------------------------------------------------
-{
-  "function" : "info_content",
-  "field_name" : "subdomain",
-  "over_field_name" : "highest_registered_domain"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `info_content` function in a detector in your {anomaly-job}, it
-models information that is present in the `subdomain` string. It detects
-anomalies where the information content is unusual compared to the other
-`highest_registered_domain` values. An anomaly could indicate an abuse of the
-DNS protocol, such as malicious command and control activity.
-
-NOTE: In this example, both high and low values are considered anomalous.
-In many use cases, the `high_info_content` function is often a more appropriate
-choice.
-
-.Example 2: Analyzing query strings with the high_info_content function
-[source,js]
---------------------------------------------------
-{
-  "function" : "high_info_content",
-  "field_name" : "query",
-  "over_field_name" : "src_ip"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `high_info_content` function in a detector in your {anomaly-job},
-it models information content that is held in the DNS query string. It detects
-`src_ip` values where the information content is unusually high compared to
-other `src_ip` values. This example is similar to the example for the
-`info_content` function, but it reports anomalies only where the amount of
-information content is higher than expected.
-
-.Example 3: Analyzing message strings with the low_info_content function
-[source,js]
---------------------------------------------------
-{
-  "function" : "low_info_content",
-  "field_name" : "message",
-  "by_field_name" : "logfilename"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `low_info_content` function in a detector in your {anomaly-job},
-it models information content that is present in the message string for each
-`logfilename`. It detects anomalies where the information content is low
-compared to its past behavior. For example, this function detects unusually low
-amounts of information in a collection of rolling log files. Low information
-might indicate that a process has entered an infinite loop or that logging
-features have been disabled.
diff --git a/docs/en/stack/ml/anomaly-detection/functions/ml-metric-functions.asciidoc b/docs/en/stack/ml/anomaly-detection/functions/ml-metric-functions.asciidoc
deleted file mode 100644
index bbd9dfc8f..000000000
--- a/docs/en/stack/ml/anomaly-detection/functions/ml-metric-functions.asciidoc
+++ /dev/null
@@ -1,325 +0,0 @@
-[[ml-metric-functions]]
-= Metric functions
-
-The metric functions include functions such as mean, min, and max. These values
-are calculated for each bucket. Field values that cannot be converted to
-double precision floating point numbers are ignored.
-
-The {ml-features} include the following metric functions:
-
-* <>
-* <>
-* xref:ml-metric-median[`median`, `high_median`, `low_median`]
-* xref:ml-metric-mean[`mean`, `high_mean`, `low_mean`]
-* <>
-* xref:ml-metric-varp[`varp`, `high_varp`, `low_varp`]
-
-NOTE: You cannot add rules with conditions to detectors that use the `metric`
-function.
-
-[discrete]
-[[ml-metric-min]]
-== Min
-
-The `min` function detects anomalies in the arithmetic minimum of a value.
-The minimum value is calculated for each bucket.
-
-High- and low-sided functions are not applicable.
-
-This function supports the following properties:
-
-* `field_name` (required)
-* `by_field_name` (optional)
-* `over_field_name` (optional)
-* `partition_field_name` (optional)
-
-For more information about those properties, see the
-{ref}/ml-put-job.html#ml-put-job-request-body[create {anomaly-jobs} API].
-
-.Example 1: Analyzing minimum transactions with the min function
-[source,js]
---------------------------------------------------
-{
-  "function" : "min",
-  "field_name" : "amt",
-  "by_field_name" : "product"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `min` function in a detector in your {anomaly-job}, it detects
-where the smallest transaction is lower than previously observed. You can use
-this function to detect items for sale at unintentionally low prices due to data
-entry mistakes. It models the minimum amount for each product over time.
-
-[discrete]
-[[ml-metric-max]]
-== Max
-
-The `max` function detects anomalies in the arithmetic maximum of a value.
-The maximum value is calculated for each bucket.
-
-High- and low-sided functions are not applicable.
-
-This function supports the following properties:
-
-* `field_name` (required)
-* `by_field_name` (optional)
-* `over_field_name` (optional)
-* `partition_field_name` (optional)
-
-For more information about those properties, see the
-{ref}/ml-put-job.html#ml-put-job-request-body[create {anomaly-jobs} API].
-
-.Example 2: Analyzing maximum response times with the max function
-[source,js]
---------------------------------------------------
-{
-  "function" : "max",
-  "field_name" : "responsetime",
-  "by_field_name" : "application"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `max` function in a detector in your {anomaly-job}, it detects
-where the longest `responsetime` is longer than previously observed. You can use
-this function to detect applications that have `responsetime` values that are
-unusually lengthy. It models the maximum `responsetime` for each application
-over time and detects when the longest `responsetime` is unusually long compared
-to that application's previous behavior.
-
-.Example 3: Two detectors with max and high_mean functions
-[source,js]
---------------------------------------------------
-{
-  "function" : "max",
-  "field_name" : "responsetime",
-  "by_field_name" : "application"
-},
-{
-  "function" : "high_mean",
-  "field_name" : "responsetime",
-  "by_field_name" : "application"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-The analysis in the previous example can be performed alongside `high_mean`
-functions by application. By combining detectors and using the same influencer,
-this job can detect both unusually long individual response times and unusually
-high average response times for each bucket.
-
-[discrete]
-[[ml-metric-median]]
-== Median, high_median, low_median
-
-The `median` function detects anomalies in the statistical median of a value.
-The median value is calculated for each bucket.
-
-If you want to monitor unusually high median values, use the `high_median`
-function.
-
-If you are just interested in unusually low median values, use the `low_median`
-function.
-
-These functions support the following properties:
-
-* `field_name` (required)
-* `by_field_name` (optional)
-* `over_field_name` (optional)
-* `partition_field_name` (optional)
-
-For more information about those properties, see the
-{ref}/ml-put-job.html#ml-put-job-request-body[create {anomaly-jobs} API].
-
-.Example 4: Analyzing response times with the median function
-[source,js]
---------------------------------------------------
-{
-  "function" : "median",
-  "field_name" : "responsetime",
-  "by_field_name" : "application"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `median` function in a detector in your {anomaly-job}, it models
-the median `responsetime` for each application over time. It detects when the
-median `responsetime` is unusual compared to previous `responsetime` values.
-
-[discrete]
-[[ml-metric-mean]]
-== Mean, high_mean, low_mean
-
-The `mean` function detects anomalies in the arithmetic mean of a value.
-The mean value is calculated for each bucket.
-
-If you want to monitor unusually high average values, use the `high_mean`
-function.
-
-If you are just interested in unusually low average values, use the `low_mean`
-function.
-
-These functions support the following properties:
-
-* `field_name` (required)
-* `by_field_name` (optional)
-* `over_field_name` (optional)
-* `partition_field_name` (optional)
-
-For more information about those properties, see the
-{ref}/ml-put-job.html#ml-put-job-request-body[create {anomaly-jobs} API].
-
-.Example 5: Analyzing response times with the mean function
-[source,js]
---------------------------------------------------
-{
-  "function" : "mean",
-  "field_name" : "responsetime",
-  "by_field_name" : "application"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `mean` function in a detector in your {anomaly-job}, it models
-the mean `responsetime` for each application over time. It detects when the mean
-`responsetime` is unusual compared to previous `responsetime` values.
-
-.Example 6: Analyzing response times with the high_mean function
-[source,js]
---------------------------------------------------
-{
-  "function" : "high_mean",
-  "field_name" : "responsetime",
-  "by_field_name" : "application"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `high_mean` function in a detector in your {anomaly-job}, it
-models the mean `responsetime` for each application over time. It detects when
-the mean `responsetime` is unusually high compared to previous `responsetime`
-values.
-
-.Example 7: Analyzing response times with the low_mean function
-[source,js]
---------------------------------------------------
-{
-  "function" : "low_mean",
-  "field_name" : "responsetime",
-  "by_field_name" : "application"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `low_mean` function in a detector in your {anomaly-job}, it
-models the mean `responsetime` for each application over time. It detects when
-the mean `responsetime` is unusually low compared to previous `responsetime`
-values.
-
-[discrete]
-[[ml-metric-metric]]
-== Metric
-
-The `metric` function combines the `min`, `max`, and `mean` functions. You can use
-it as a shorthand for a combined analysis. If you do not specify a function in
-a detector, this is the default function.
-
-High- and low-sided functions are not applicable. You cannot use this function
-when a `summary_count_field_name` is specified.
-
-This function supports the following properties:
-
-* `field_name` (required)
-* `by_field_name` (optional)
-* `over_field_name` (optional)
-* `partition_field_name` (optional)
-
-For more information about those properties, see the
-{ref}/ml-put-job.html#ml-put-job-request-body[create {anomaly-jobs} API].
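-
-Loosely speaking, the single `metric` detector shown in Example 8 below acts as
-a shorthand for configuring three separate detectors on the same field. This
-sketch illustrates the combined analysis it stands in for; the field and
-by-field names are illustrative assumptions:
-
-[source,js]
---------------------------------------------------
-{
-  "function" : "min",
-  "field_name" : "responsetime",
-  "by_field_name" : "application"
-},
-{
-  "function" : "max",
-  "field_name" : "responsetime",
-  "by_field_name" : "application"
-},
-{
-  "function" : "mean",
-  "field_name" : "responsetime",
-  "by_field_name" : "application"
-}
---------------------------------------------------
-// NOTCONSOLE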
-
-.Example 8: Analyzing response times with the metric function
-[source,js]
---------------------------------------------------
-{
-  "function" : "metric",
-  "field_name" : "responsetime",
-  "by_field_name" : "application"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `metric` function in a detector in your {anomaly-job}, it models
-the mean, min, and max `responsetime` for each application over time. It detects
-when the mean, min, or max `responsetime` is unusual compared to previous
-`responsetime` values.
-
-[discrete]
-[[ml-metric-varp]]
-== Varp, high_varp, low_varp
-
-The `varp` function detects anomalies in the variance of a value, which is a
-measure of the variability and spread in the data.
-
-If you want to monitor unusually high variance, use the `high_varp` function.
-
-If you are just interested in unusually low variance, use the `low_varp` function.
-
-These functions support the following properties:
-
-* `field_name` (required)
-* `by_field_name` (optional)
-* `over_field_name` (optional)
-* `partition_field_name` (optional)
-
-For more information about those properties, see the
-{ref}/ml-put-job.html#ml-put-job-request-body[create {anomaly-jobs} API].
-
-.Example 9: Analyzing response times with the varp function
-[source,js]
---------------------------------------------------
-{
-  "function" : "varp",
-  "field_name" : "responsetime",
-  "by_field_name" : "application"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `varp` function in a detector in your {anomaly-job}, it models
-the variance in values of `responsetime` for each application over time. It
-detects when the variance in `responsetime` is unusual compared to past
-application behavior.
-
-.Example 10: Analyzing response times with the high_varp function
-[source,js]
---------------------------------------------------
-{
-  "function" : "high_varp",
-  "field_name" : "responsetime",
-  "by_field_name" : "application"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `high_varp` function in a detector in your {anomaly-job}, it
-models the variance in values of `responsetime` for each application over time.
-It detects when the variance in `responsetime` is unusually high compared to
-past application behavior.
-
-.Example 11: Analyzing response times with the low_varp function
-[source,js]
---------------------------------------------------
-{
-  "function" : "low_varp",
-  "field_name" : "responsetime",
-  "by_field_name" : "application"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `low_varp` function in a detector in your {anomaly-job}, it
-models the variance in values of `responsetime` for each application over time.
-It detects when the variance in `responsetime` is unusually low compared to
-past application behavior.
diff --git a/docs/en/stack/ml/anomaly-detection/functions/ml-rare-functions.asciidoc b/docs/en/stack/ml/anomaly-detection/functions/ml-rare-functions.asciidoc
deleted file mode 100644
index 69378d64f..000000000
--- a/docs/en/stack/ml/anomaly-detection/functions/ml-rare-functions.asciidoc
+++ /dev/null
@@ -1,136 +0,0 @@
-[[ml-rare-functions]]
-= Rare functions
-
-The rare functions detect values that occur rarely in time or rarely for a
-population.
-
-The `rare` analysis detects anomalies according to the number of distinct rare
-values. This differs from `freq_rare`, which detects anomalies according to the
-number of times (frequency) rare values occur.
-
-[NOTE]
-====
-* The `rare` and `freq_rare` functions should not be used in conjunction with
-`exclude_frequent`.
-* You cannot create forecasts for {anomaly-jobs} that contain `rare` or
-`freq_rare` functions.
-* You cannot add rules with conditions to detectors that use `rare` or
-`freq_rare` functions.
-* Shorter bucket spans (less than 1 hour, for example) are recommended when
-looking for rare events. The functions model whether something happens in a
-bucket at least once. With longer bucket spans, it is more likely that
-entities will be seen in a bucket and therefore they appear less rare.
-Picking the ideal bucket span depends on the characteristics of the data,
-with shorter bucket spans typically being measured in minutes, not hours.
-* To model rare data, a learning period of at least 20 buckets is required
-for typical data.
-====
-
-The {ml-features} include the following rare functions:
-
-* <>
-* <>
-
-
-[discrete]
-[[ml-rare]]
-== Rare
-
-The `rare` function detects values that occur rarely in time or rarely for a
-population. It detects anomalies according to the number of distinct rare values.
-
-This function supports the following properties:
-
-* `by_field_name` (required)
-* `over_field_name` (optional)
-* `partition_field_name` (optional)
-
-For more information about those properties, see the
-{ref}/ml-put-job.html#ml-put-job-request-body[create {anomaly-jobs} API].
-
-.Example 1: Analyzing status codes with the rare function
-[source,js]
---------------------------------------------------
-{
-  "function" : "rare",
-  "by_field_name" : "status"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `rare` function in a detector in your {anomaly-job}, it detects
-values that are rare in time. It models status codes that occur over time and
-detects when rare status codes occur compared to the past. For example, you can
-detect status codes in a web access log that have never (or rarely) occurred
-before.
-
-.Example 2: Analyzing status codes in a population with the rare function
-[source,js]
---------------------------------------------------
-{
-  "function" : "rare",
-  "by_field_name" : "status",
-  "over_field_name" : "clientip"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `rare` function in a detector in your {anomaly-job}, it detects
-values that are rare in a population. It models status code and client IP
-interactions that occur. It defines a rare status code as one that occurs for
-few client IP values compared to the population. It detects client IP values
-that experience one or more distinct rare status codes compared to the
-population. For example, in a web access log, a `clientip` that experiences the
-highest number of different rare status codes compared to the population is
-regarded as highly anomalous. This analysis is based on the number of different
-status code values, not the count of occurrences.
-
-NOTE: To define a status code as rare, the {ml-features} look at the number
-of distinct status codes that occur, not the number of times the status code
-occurs. If a single client IP experiences a single unique status code, this
-is rare, even if it occurs for that client IP in every bucket.
-
-[discrete]
-[[ml-freq-rare]]
-== Freq_rare
-
-The `freq_rare` function detects values that occur rarely for a population.
-It detects anomalies according to the number of times (frequency) that rare
-values occur.
-
-This function supports the following properties:
-
-* `by_field_name` (required)
-* `over_field_name` (required)
-* `partition_field_name` (optional)
-
-For more information about those properties, see the
-{ref}/ml-put-job.html#ml-put-job-request-body[create {anomaly-jobs} API].
-
-.Example 3: Analyzing URI values in a population with the freq_rare function
-[source,js]
---------------------------------------------------
-{
-  "function" : "freq_rare",
-  "by_field_name" : "uri",
-  "over_field_name" : "clientip"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `freq_rare` function in a detector in your {anomaly-job}, it
-detects values that are frequently rare in a population. It models URI paths and
-client IP interactions that occur. It defines a rare URI path as one that is
-visited by few client IP values compared to the population. It detects the
-client IP values that experience many interactions with rare URI paths compared
-to the population. For example, in a web access log, a client IP that visits
-one or more rare URI paths many times compared to the population is regarded as
-highly anomalous. This analysis is based on the count of interactions with rare
-URI paths, not the number of different URI path values.
-
-
-NOTE: A URI path is defined as rare in the same way as the status codes above:
-the analysis considers the number of distinct values that occur, not the number
-of times the URI path occurs. If a single client IP visits a single unique URI
-path, this is rare, even if it occurs for that client IP in every bucket.
diff --git a/docs/en/stack/ml/anomaly-detection/functions/ml-sum-functions.asciidoc b/docs/en/stack/ml/anomaly-detection/functions/ml-sum-functions.asciidoc
deleted file mode 100644
index f0b8b8389..000000000
--- a/docs/en/stack/ml/anomaly-detection/functions/ml-sum-functions.asciidoc
+++ /dev/null
@@ -1,113 +0,0 @@
-[[ml-sum-functions]]
-= Sum functions
-
-The sum functions detect anomalies when the sum of a field in a bucket is
-anomalous.
-
-If you want to monitor unusually high totals, use high-sided functions.
-
-If you want to look at drops in totals, use low-sided functions.
-
-If your data is sparse, use `non_null_sum` functions. Buckets without values are
-ignored; buckets with a zero value are analyzed.
-
-The {ml-features} include the following sum functions:
-
-* xref:ml-sum[`sum`, `high_sum`, `low_sum`]
-* xref:ml-nonnull-sum[`non_null_sum`, `high_non_null_sum`, `low_non_null_sum`]
-
-[discrete]
-[[ml-sum]]
-== Sum, high_sum, low_sum
-
-The `sum` function detects anomalies where the sum of a field in a bucket is
-anomalous.
-
-If you want to monitor unusually high sum values, use the `high_sum` function.
-
-If you want to monitor unusually low sum values, use the `low_sum` function.
-
-These functions support the following properties:
-
-* `field_name` (required)
-* `by_field_name` (optional)
-* `over_field_name` (optional)
-* `partition_field_name` (optional)
-
-For more information about those properties, see the
-{ref}/ml-put-job.html#ml-put-job-request-body[create {anomaly-jobs} API].
-
-.Example 1: Analyzing total expenses with the sum function
-[source,js]
---------------------------------------------------
-{
-  "function" : "sum",
-  "field_name" : "expenses",
-  "by_field_name" : "costcenter",
-  "over_field_name" : "employee"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `sum` function in a detector in your {anomaly-job}, it
-models total expenses per employee for each cost center. For each time bucket,
-it detects when an employee’s expenses are unusual for a cost center compared
-to other employees.
-
-.Example 2: Analyzing total bytes with the high_sum function
-[source,js]
---------------------------------------------------
-{
-  "function" : "high_sum",
-  "field_name" : "cs_bytes",
-  "over_field_name" : "cs_host"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `high_sum` function in a detector in your {anomaly-job}, it
-models total `cs_bytes`. It detects `cs_host` values that transfer unusually
-high volumes compared to other `cs_host` values. This example looks for volumes
-of data transferred from a client to a server on the internet that are unusual
-compared to other clients. This scenario could be useful to detect data
-exfiltration or to find users that are abusing internet privileges.
-
-[discrete]
-[[ml-nonnull-sum]]
-== Non_null_sum, high_non_null_sum, low_non_null_sum
-
-The `non_null_sum` function is useful if your data is sparse. Buckets without
-values are ignored and buckets with a zero value are analyzed.
-
-If you want to monitor unusually high totals, use the `high_non_null_sum`
-function.
-
-If you want to look at drops in totals, use the `low_non_null_sum` function.
-
-These functions support the following properties:
-
-* `field_name` (required)
-* `by_field_name` (optional)
-* `partition_field_name` (optional)
-
-For more information about those properties, see the
-{ref}/ml-put-job.html#ml-put-job-request-body[create {anomaly-jobs} API].
-
-NOTE: Population analysis (that is to say, use of the `over_field_name` property)
-is not applicable for this function.
-
-.Example 3: Analyzing employee approvals with the high_non_null_sum function
-[source,js]
---------------------------------------------------
-{
-  "function" : "high_non_null_sum",
-  "field_name" : "amount_approved",
-  "by_field_name" : "employee"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `high_non_null_sum` function in a detector in your {anomaly-job},
-it models the total `amount_approved` for each employee. It ignores any buckets
-where the amount is null. It detects employees who approve unusually high
-amounts compared to their past behavior.
diff --git a/docs/en/stack/ml/anomaly-detection/functions/ml-time-functions.asciidoc b/docs/en/stack/ml/anomaly-detection/functions/ml-time-functions.asciidoc
deleted file mode 100644
index 7e2301dca..000000000
--- a/docs/en/stack/ml/anomaly-detection/functions/ml-time-functions.asciidoc
+++ /dev/null
@@ -1,112 +0,0 @@
-[[ml-time-functions]]
-= Time functions
-
-The time functions detect events that happen at unusual times, either of the day
-or of the week. These functions can be used to find unusual patterns of
-behavior, typically associated with suspicious user activity.
-
-The {ml-features} include the following time functions:
-
-* <>
-* <>
-
-
-[NOTE]
-====
-* You cannot create forecasts for {anomaly-jobs} that contain time
-functions.
-* The `time_of_day` function is not aware of the difference between days, for
-instance work days and weekends. When modeling different days, use the
-`time_of_week` function. In general, the `time_of_week` function is more suited
-to modeling the behavior of people rather than machines, as people vary their
-behavior according to the day of the week.
-* Shorter bucket spans (for example, 10 minutes) are recommended when performing
-a `time_of_day` or `time_of_week` analysis. The times of the events being modeled
-are not affected by the bucket span, but a shorter bucket span enables quicker
-alerting on unusual events.
-* Unusual events are flagged based on the previous pattern of the data, not on
-what we might think of as unusual based on human experience. So, if events
-typically occur between 3 a.m. and 5 a.m., an event occurring at 3 p.m. is
-flagged as unusual.
-* When Daylight Saving Time starts or stops, regular events can be flagged as
-anomalous. This situation occurs because the actual time of the event (as
-measured against a UTC baseline) has changed. This situation is treated as a
-step change in behavior and the new times will be learned quickly.
-====
-
-[discrete]
-[[ml-time-of-day]]
-== Time_of_day
-
-The `time_of_day` function detects when events occur that are outside normal
-usage patterns. For example, it detects unusual activity in the middle of the
-night.
-
-The function expects daily behavior to be similar. If you expect the behavior of
-your data to differ on Saturdays compared to Wednesdays, the `time_of_week`
-function is more appropriate.
-
-This function supports the following properties:
-
-* `by_field_name` (optional)
-* `over_field_name` (optional)
-* `partition_field_name` (optional)
-
-For more information about those properties, see the
-{ref}/ml-put-job.html#ml-put-job-request-body[create {anomaly-jobs} API].
-
-.Example 1: Analyzing events with the time_of_day function
-[source,js]
---------------------------------------------------
-{
-  "function" : "time_of_day",
-  "by_field_name" : "process"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `time_of_day` function in a detector in your {anomaly-job}, it
-models when events occur throughout a day for each process. It detects when an
-event occurs for a process at an unusual time of day compared to
-its past behavior.
-
-[discrete]
-[[ml-time-of-week]]
-== Time_of_week
-
-The `time_of_week` function detects when events occur that are outside normal
-usage patterns. For example, it detects login events on the weekend.
-
-IMPORTANT: The `time_of_week` function models time in epoch seconds modulo the
-duration of a week in seconds. This means that the `typical` and `actual` values
-are seconds after a whole number of weeks since 1/1/1970 in UTC, which was a
-Thursday. For example, a value of `475` is 475 seconds after midnight on
-Thursday in UTC. A worked sketch of this arithmetic follows the property list
-below.
-
-This function supports the following properties:
-
-* `by_field_name` (optional)
-* `over_field_name` (optional)
-* `partition_field_name` (optional)
-
-For more information about those properties, see the
-{ref}/ml-put-job.html#ml-put-job-request-body[create {anomaly-jobs} API].
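-
-As a worked sketch of the modulo arithmetic described above (the values are
-illustrative, not real output), a `typical` value of `90000` decodes to Friday
-01:00 UTC, because 90000 = 86400 + 3600, that is, one day plus one hour after
-the start of the modeled week on Thursday 00:00 UTC. An anomaly record might
-therefore contain values like these:
-
-[source,js]
---------------------------------------------------
-{
-  "function" : "time_of_week",
-  "typical" : [ 90000 ],
-  "actual" : [ 90475 ]
-}
---------------------------------------------------
-// NOTCONSOLE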
-
-.Example 2: Analyzing events with the time_of_week function
-[source,js]
---------------------------------------------------
-{
-  "function" : "time_of_week",
-  "by_field_name" : "eventcode",
-  "over_field_name" : "workstation"
-}
---------------------------------------------------
-// NOTCONSOLE
-
-If you use this `time_of_week` function in a detector in your {anomaly-job}, it
-models when events occur throughout the week for each `eventcode`. It detects
-when a workstation event occurs at an unusual time during the week for that
-`eventcode` compared to other workstations. It detects events for a
-particular workstation that are outside the normal usage pattern.
-
-
diff --git a/docs/en/stack/ml/anomaly-detection/index.asciidoc b/docs/en/stack/ml/anomaly-detection/index.asciidoc
index eb1fa9831..1db6e4475 100644
--- a/docs/en/stack/ml/anomaly-detection/index.asciidoc
+++ b/docs/en/stack/ml/anomaly-detection/index.asciidoc
@@ -58,27 +58,22 @@ include::ml-limitations.asciidoc[leveloffset=+2]
 
 include::ml-ad-troubleshooting.asciidoc[leveloffset=+2]
 
-// include::{es-repo-dir}/ml/anomaly-detection/functions/ml-functions.asciidoc[leveloffset=+2]
-include::functions/ml-functions.asciidoc[leveloffset=+1]
-// include::{es-repo-dir}/ml/anomaly-detection/functions/ml-count-functions.asciidoc[]
-include::functions/ml-count-functions.asciidoc[leveloffset=+2]
+include::{es-repo-dir}/ml/anomaly-detection/functions/ml-functions.asciidoc[leveloffset=+1]
 
-// include::{es-repo-dir}/ml/anomaly-detection/functions/ml-geo-functions.asciidoc[]
-include::functions/ml-geo-functions.asciidoc[leveloffset=+2]
+include::{es-repo-dir}/ml/anomaly-detection/functions/ml-count-functions.asciidoc[leveloffset=+2]
 
-// include::{es-repo-dir}/ml/anomaly-detection/functions/ml-info-functions.asciidoc[]
-include::functions/ml-info-functions.asciidoc[leveloffset=+2]
+include::{es-repo-dir}/ml/anomaly-detection/functions/ml-geo-functions.asciidoc[leveloffset=+2]
 
-// include::{es-repo-dir}/ml/anomaly-detection/functions/ml-metric-functions.asciidoc[]
-include::functions/ml-metric-functions.asciidoc[leveloffset=+2]
+include::{es-repo-dir}/ml/anomaly-detection/functions/ml-info-functions.asciidoc[leveloffset=+2]
 
-// include::{es-repo-dir}/ml/anomaly-detection/functions/ml-rare-functions.asciidoc[]
-include::functions/ml-rare-functions.asciidoc[leveloffset=+2]
+include::{es-repo-dir}/ml/anomaly-detection/functions/ml-metric-functions.asciidoc[leveloffset=+2]
+
+include::{es-repo-dir}/ml/anomaly-detection/functions/ml-rare-functions.asciidoc[leveloffset=+2]
+
+include::{es-repo-dir}/ml/anomaly-detection/functions/ml-sum-functions.asciidoc[leveloffset=+2]
+
+include::{es-repo-dir}/ml/anomaly-detection/functions/ml-time-functions.asciidoc[leveloffset=+2]
 
-// include::{es-repo-dir}/ml/anomaly-detection/functions/ml-sum-functions.asciidoc[]
-include::functions/ml-sum-functions.asciidoc[leveloffset=+2]
-// include::{es-repo-dir}/ml/anomaly-detection/functions/ml-time-functions.asciidoc[]
-include::functions/ml-time-functions.asciidoc[leveloffset=+2]
 
 
 include::ootb-ml-jobs.asciidoc[leveloffset=+1]