diff --git a/modules/n1ql/pages/n1ql-intro/sysinfo.adoc b/modules/n1ql/pages/n1ql-intro/sysinfo.adoc
index 55d137636..36ad47e26 100644
--- a/modules/n1ql/pages/n1ql-intro/sysinfo.adoc
+++ b/modules/n1ql/pages/n1ql-intro/sysinfo.adoc
@@ -57,6 +57,7 @@ xref:n1ql:n1ql-manage/monitoring-n1ql-query.adoc#sys-active-req[system:active_re
 xref:n1ql:n1ql-manage/monitoring-n1ql-query.adoc#sys-prepared[system:prepareds]
 xref:n1ql:n1ql-manage/monitoring-n1ql-query.adoc#sys-completed-req[system:completed_requests]
 xref:n1ql:n1ql-manage/monitoring-n1ql-query.adoc#sys-history[system:completed_requests_history]
+xref:n1ql:n1ql-manage/query-awr.adoc#system-awr[system:awr]
 a|
 [%hardbreaks]
 <>
diff --git a/modules/n1ql/pages/n1ql-manage/index.adoc b/modules/n1ql/pages/n1ql-manage/index.adoc
index b1cd9b63a..9b1a9d32b 100644
--- a/modules/n1ql/pages/n1ql-manage/index.adoc
+++ b/modules/n1ql/pages/n1ql-manage/index.adoc
@@ -44,3 +44,9 @@ You can monitor and manage primary and secondary indexes using the Couchbase Web
 You can configure the Query service using cluster-level query settings, node-level query settings, and request-level query parameters.
 
 * xref:n1ql:n1ql-manage/query-settings.adoc[]
+
+== Automatic Workload Repository (AWR)
+
+You can capture detailed performance statistics of queries and analyze their performance using AWR.
+
+* xref:n1ql:n1ql-manage/query-awr.adoc[]
\ No newline at end of file
diff --git a/modules/n1ql/pages/n1ql-manage/query-awr.adoc b/modules/n1ql/pages/n1ql-manage/query-awr.adoc
new file mode 100644
index 000000000..20af014f0
--- /dev/null
+++ b/modules/n1ql/pages/n1ql-manage/query-awr.adoc
@@ -0,0 +1,477 @@
= Automatic Workload Repository
:page-status: Couchbase Server 8.0
:description: Monitor and optimize query performance and workload using Automatic Workload Repository (AWR).
:page-toclevels: 2

[abstract]
{description}

== Overview

Automatic Workload Repository (AWR) is a feature that captures and maintains performance statistics for queries executed on your Couchbase cluster.
It acts as a centralized repository for query performance data, enabling you to monitor and analyze query activity and workload over time.

By providing a historical view of query behavior, AWR makes it easier to identify trends and performance bottlenecks.
For example, some queries may run efficiently with minimal overhead, while others may consume more resources or take longer to complete.
With AWR, you can understand these differences and optimize your queries accordingly.

When enabled, AWR automatically gathers detailed metrics from the Query Service for every query that you run on your cluster.
This includes metrics such as execution time, CPU usage, memory consumption, number of executions, and more.
It then aggregates this data into <<snapshots,snapshots>> and stores them in a <<workload-repository,workload repository>>.

You can access the collected data by directly querying the <<workload-repository,workload repository>> or by using Couchbase's report generation tool to generate reports and compare query performance across different time periods.
For more information, see <<view-awr-data-and-reports,View AWR Data and Reports>>.

== Use Cases

The following scenarios show how you can use AWR to analyze your workload and improve query performance:

* **Troubleshooting Real-Time Issues**:
You can quickly identify slow-running queries or instances of high resource usage.
You can then extract the SQL ID of the problematic query from the AWR report and use it to trace the query in xref:n1ql:n1ql-manage/monitoring-n1ql-query.adoc#sys-completed-req[completed_requests].
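+
For example, if the AWR report surfaces a statement with SQL ID `fcff011269f93c3b7903d746c2914dab`, a query along the following lines retrieves its most recent executions.
This sketch assumes that the hash is exposed as a `sqlID` field in `system:completed_requests`, as noted in <<query-awr-data-directly,Querying AWR Data Directly>>.
+
[source,sqlpp]
----
/* List the latest captured executions of the statement identified by its SQL ID */
SELECT requestTime, elapsedTime, serviceTime, statement
FROM system:completed_requests
WHERE sqlID = "fcff011269f93c3b7903d746c2914dab"
ORDER BY requestTime DESC
LIMIT 10;
----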

* **Analyzing Performance**:
When rolling out changes, such as introducing new microservices, AWR lets you compare query performance before and after the update.
This helps you identify affected queries and optimize their performance accordingly.

* **Analyzing Upgrade Impacts**:
You can assess query performance before and after a cluster upgrade to identify queries impacted by the new version.

[#workload-repository]
== Workload Repository

The workload repository is a centralized storage location where all snapshots are collected and maintained.
It is a user-defined location that can be a bucket or a collection, but not a scope.

Before AWR can start collecting data, you must configure the repository location in the <<system-awr,system:awr>> catalog.
Until this specified location is available, AWR remains in a quiescent (inactive) state.
Once the location becomes accessible, AWR transitions to an active state and begins collecting data.
If the location becomes unavailable at any point, AWR returns to the quiescent state and resumes activity only when the location is accessible again.

For more information about setting up the repository location, see <<enable-configure-awr,Enable and Configure AWR>>.

[#snapshots]
== Snapshots

AWR stores query performance data in the form of snapshots.
For each unique statement executed within a specified reporting interval, AWR generates a snapshot.
This snapshot contains aggregate metrics for all executions of that statement during the interval.
These metrics include execution time, CPU usage, memory consumption, and other performance indicators.

Snapshots are stored as individual documents in the workload repository.
Each document is uniquely identified by its document key (ID), which includes the start time of the reporting interval, making it easier to filter and analyze data.

=== Snapshot Retention Management

To facilitate long-term analysis and performance comparisons across different periods, AWR retains snapshot documents in the repository.
By default, AWR does not automatically enforce retention policies on these documents.
Instead, you need to configure a Time-To-Live (TTL) or expiration for the AWR location.
The TTL specifies how long the documents remain in that location before the system automatically purges them.

For example, setting a TTL of 180 days on a given bucket ensures that all snapshot documents older than 180 days within the bucket are automatically deleted.
This mechanism allows you to manage storage usage effectively while retaining relevant history of query performance data.
For more information about configuring the TTL, see xref:server:learn:data/expiration.adoc[Expiration].

[#enable-configure-awr]
== Enable and Configure AWR

AWR is an opt-in feature that you must explicitly enable and configure.
Once enabled, AWR starts collecting data as soon as the repository location is set and available.

You can manage these settings through the <<system-awr,system:awr>> catalog.

[#system-awr]
=== system:awr

This catalog determines how AWR functions, including where it stores snapshots, how often it collects statistics, and which queries to include in the report.
You can adjust the AWR settings at any time to align with your monitoring needs.

NOTE: Only admins or users with the `query_manage_system_catalog` role can modify settings in `system:awr`.
For more information, see xref:n1ql:n1ql-intro/sysinfo.adoc#authentication-and-client-privileges[Authentication and Client Privileges].
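
For example, the following sketch shows how an administrator could assign this role with a SQL++ GRANT statement; the user name `jdoe` is only an illustration, and you can equally assign the role through the Couchbase Web Console or the REST API.

[source,sqlpp]
----
/* Allow the user jdoe to modify settings in system:awr */
GRANT query_manage_system_catalog TO jdoe;
----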

The catalog consists of the following attributes:

[cols="1a,4a,1a"]
|===
| Name | Description | Schema

|**enabled**

| Indicates whether AWR is enabled or disabled.

*Default*: `FALSE`

| Boolean

| **location**

| The target keyspace (repository) where the snapshots are stored.
This can only be a path to a bucket or collection; it cannot be a scope.
For more information about the repository, see <<workload-repository,Workload Repository>>.

AWR checks the availability of the location only once per interval.

*Example*: `"bucket1.scope1.collection1"`

| String

|**interval**

|The duration of the reporting interval.
That is, the time between each snapshot collection.
If the interval is set to 10 minutes, AWR captures snapshots every 10 minutes.

The interval must be at least 1 minute.

*Default*: `"10m0s"`

**Example**: `"1m30s"`

|String(duration)

|**threshold**

|The minimum time a statement must take to complete for it to be captured and included in the snapshot.

The threshold must be at least 0 seconds.

*Default*: `"0s"`, so that by default, all statements are captured by AWR regardless of their execution time.

**Example**: `"1m30s"`
|String(duration)

| **num_statements**

| The maximum number of unique statements for which aggregate data is collected during each interval.

Once the specified limit is reached during a reporting interval, AWR does not generate snapshots for any additional unique statements within that same interval.

*Default*: `10000`

*Max*: `100000`

| Positive integer

| **queue_len**

| Length of the processing queue.
It is recommended not to change this value.

The default value and maximum allowable value for `queue_len` are internally calculated based on system resources.

| Positive integer
|===

=== Examples

.Enable AWR and set the repository location
====
The following query enables AWR and sets the repository location to `default.s1.awr`.
It also sets the reporting interval to 1 minute and the threshold to 0 seconds.

[source,sqlpp]
----
UPDATE system:awr SET enabled = true, location = "default.s1.awr", interval = "1m", threshold = "0s";
----
====

.Retrieve current AWR settings
====
The following query retrieves the current AWR configuration settings.

[source,sqlpp]
----
SELECT * FROM system:awr;
----
====
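
.Disable AWR
====
The following query pauses AWR data collection.
The other settings, such as the location and interval, are not modified by this statement.

[source,sqlpp]
----
UPDATE system:awr SET enabled = false;
----
====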

[#monitor-awr]
== Monitor AWR

The current status of AWR is recorded in the `query.log`.
You can view this information in the xref:n1ql:n1ql-manage/monitoring-n1ql-query.adoc#vitals[system:vitals] output by using the following query:

[source,sqlpp]
----
SELECT awr FROM system:vitals;
----

[#view-awr-data-and-reports]
== View AWR Data and Reports

You can access the AWR data in the following ways:

* <<report-generation-tool,Report Generation Tool>>
* <<query-awr-data-directly,Querying AWR Data Directly>>

[#report-generation-tool]
=== Report Generation Tool

You can generate AWR reports using Couchbase's `cbqueryreportgen` tool.
It provides comprehensive and user-friendly reports by executing SQL++ queries against the collected AWR data.

For optimal query performance with this tool, it is recommended to create an index on the document key (`META().id`) in your configured AWR location.
If this index is not present, the tool will use sequential scans, which can impact performance.

For example, if the snapshots are stored in the `default:bucket1.scope1.awr` keyspace, you can create the recommended index as follows:

[source,sqlpp]
----
CREATE INDEX idx_awr ON default:bucket1.scope1.awr(META().id);
----

This index enables the `cbqueryreportgen` tool to efficiently query and retrieve AWR data for generating reports.
//For more information about the tool and its usage, see cbqueryreportgen.
// TODO: Add link to the CLI Reference section.

[#query-awr-data-directly]
=== Querying AWR Data Directly

You can also query AWR data directly from the workload repository using SQL++ queries.
When doing so, it is important to understand the data format, which is optimized to minimize the storage size.

To query AWR data, access the specific target keyspace where the snapshots are stored.
The document keys (IDs) of the snapshot documents include the timestamp of the reporting interval's start time.
This allows you to filter documents based on time ranges without requiring additional indexes, as sequential scans support range-based key patterns.
However, if needed, you can define and add indexes to further optimize your queries.
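
For example, before writing time-range filters, you can inspect a few document keys to see exactly how the interval start time is encoded.
The following sketch assumes the `default:bucket1.scope1.awr` repository location used elsewhere on this page; substitute your own AWR location.

[source,sqlpp]
----
/* Inspect a few snapshot document keys to see how the interval start time is encoded */
SELECT META(a).id
FROM default:bucket1.scope1.awr AS a
LIMIT 5;
----

Once you know the key format, you can add a predicate such as `WHERE META(a).id BETWEEN <start> AND <end>` to restrict a query to a particular reporting period.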

Each document contains the following fields:

[cols="1a,4a,1a"]
|===
| Name | Description | Schema

| **cnt**

| The number of times the statement was executed.
| Number

| **from**

| The start time of the interval, represented as an Epoch timestamp in milliseconds.
| Number

| **to**

| The end time of the interval, represented as an Epoch timestamp in milliseconds.
| Number

| **pln**

| An array containing the encoded, compressed outlines of the execution plan for both the minimum and maximum execution times of the statement.

You can use xref:n1ql:n1ql-language-reference/stringfun.adoc#fn-str-uncompress[UNCOMPRESS()] to decompress the execution plan strings, and then pass them to xref:n1ql:n1ql-language-reference/jsonfun.adoc[DECODE_JSON()] for formatting, if needed.

**Note**: This is just the outline of the plan, listing operators and significant objects used.
For full execution details, configure the xref:n1ql:n1ql-manage/monitoring-n1ql-query.adoc#sys-completed-config[completed_requests] system keyspace to capture the executions of the statement.

| Array of strings

| **qc**

| The query context value.
| String

| **sqlID**

| The unique hash identifier of the statement.

This can be used to aggregate information across different reporting periods for the same statement.
It is also included in the xref:n1ql:n1ql-manage/monitoring-n1ql-query.adoc#sys-completed-req[completed_requests] entries (collected independently of AWR).
| String

| **sts**

| An ordered array of 51 entries representing the total, min, and max values of 17 statistics.
That is, each statistic is represented by three consecutive entries in the array: the total value, the minimum value, and the maximum value.
These values have fixed array positions and appear in the sequence listed in <<Stats,Statistics>>.

For example, the second statistic in the list is the CPU time. Therefore:

--
* `sts[3]` represents the total CPU time.
* `sts[4]` represents the minimum CPU time.
* `sts[5]` represents the maximum CPU time.
--
| <<Stats,Statistics>> array

|**txt**

| The statement text, possibly in a compressed format.

Typically, this field is accessed using the xref:n1ql:n1ql-language-reference/stringfun.adoc#fn-str-uncompress[UNCOMPRESS()] function, and the function returns the raw text if it isn't compressed.

| String

|**ver**

| The version of the data record.

For this release, the value is always 1.

| Number

|===

[[Stats]]
==== Statistics
[cols="1a,4a,1a"]
|===
| Name | Description | Schema

| **total time**

| The total time taken for the request, that is, the time from when the request was received until the results were returned.

It includes time spent in the queue and is analogous to `elapsedTime` in the xref:n1ql-rest-query:index.adoc#Metrics[Query REST API] response.

| Number

| **cpu time**

| The amount of time the operators in the execution plan spent executing operator code.

It is analogous to `cpuTime` in the xref:n1ql-rest-query:index.adoc#Metrics[Query Service API] response when xref:n1ql-rest-query:index.adoc#Profile[profiling] is enabled.

| Number

| **memory usage (quota)**

| The amount of document memory used to execute the request.
A request will return its document memory usage only if `memory-quota` is set for the query, or if both `node-quota` and `node-quota-val-percent` are set.
For more information about these settings, see xref:n1ql:n1ql-manage/query-settings.adoc[].

It is analogous to `usedMemory` in the xref:n1ql-rest-query:index.adoc#Metrics[Query Service API] response.

| Number

| **result count**

| The total number of objects in the results.

It is analogous to `resultCount` in the xref:n1ql-rest-query:index.adoc#Metrics[Query Service API] response.
| Number

| **result size**

| The total number of bytes in the results.

It is analogous to `resultSize` in the xref:n1ql-rest-query:index.adoc#Metrics[Query Service API] response.
| Number

| **error count**

| The number of errors that occurred during the request.

It is analogous to `errorCount` in the xref:n1ql-rest-query:index.adoc#Metrics[Query Service API] response.
| Number

| **run time**

| The total amount of time taken to execute the query.
It does not include time spent in the queue.
| Number

| **fetch time**

| The total amount of time spent fetching data from the Data service.

It includes the time spent executing `Fetch` operator code and waiting for data from the Data service.
| Number

| **primary scan time**

| The total amount of time spent by primary scan operations.

It includes the time spent executing the `PrimaryScan` operator code and waiting for data from the Index service.
| Number

| **sequential scan time**

| The amount of time spent by sequential scan operations.

It includes the time spent executing the `PrimaryScan` operator code and waiting for data from the Data service.
| Number

| **primary scan count**

| The total number of index keys returned by primary index scans and processed by the Query engine.
| Number

| **sequential scan count**

| The total number of document keys returned by sequential scans and processed by the Query engine.
| Number

| **index scan count**

| The total number of items returned by index scans and processed by the Query engine.
| Number

| **fetch count**

| The total number of documents fetched from the Data service and processed by the Query engine.
| Number

| **order count**

| The number of items that were sorted.
| Number

| **primary scan ops**

| The number of primary scan operators in the execution plan.
| Number

| **sequential scan ops**

| The number of sequential scan operators in the execution plan.
| Number

|===
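
Because each statistic occupies three consecutive positions in `sts` (total, minimum, and maximum), you can derive the array indexes for any statistic from its position in the table above.
For example, fetch time is the eighth statistic, so its values are held in `sts[21]`, `sts[22]`, and `sts[23]`.
The following sketch uses that layout to report the average and maximum fetch time per statement; it assumes the `default:bucket1.scope1.awr` repository location used elsewhere on this page.

[source,sqlpp]
----
/* Average and maximum fetch time per statement, derived from the positional sts layout */
SELECT sqlID,
       duration_to_str(SUM(sts[21]) / SUM(cnt)) AS avg_fetch_time,
       duration_to_str(MAX(sts[23])) AS max_fetch_time
FROM default:bucket1.scope1.awr
GROUP BY sqlID;
----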

=== Example
====
The following example fetches AWR data for a specific SQL ID, including the statement text, the execution plan with the maximum execution time, the number of executions, the average total time, and the maximum CPU time.

.Query
[source,sqlpp]
----
SELECT
    text,
    max_plan,
    the_count,
    avg_total_time,
    max_cpu
FROM
    default.s1.awr
LET
    text = uncompress(txt)
WHERE
    sqlID = 'fcff011269f93c3b7903d746c2914dab'
GROUP BY
    sqlID, text
LETTING
    the_count = SUM(cnt),
    max_plan = json_decode(uncompress(MAX(pln[1]))),
    avg_total_time = duration_to_str(SUM(sts[0])/SUM(cnt)),
    max_cpu = duration_to_str(MAX(sts[5]));
----

.Result
[source,json]
----
[
  {
    "text": "select awr from system:vitals;",
    "max_plan": {
      "#operator": "Sequence",
      "~children": [
        {
          "#operator": "PrimaryScan",
          "index_id": "#primary",
          "keyspace": "vitals"
        },
        {
          "#operator": "Fetch",
          "keyspace": "vitals"
        },
        {
          "#operator": "InitialProject"
        },
        {
          "#operator": "Stream"
        }
      ]
    },
    "the_count": 2,
    "avg_total_time": "38.844257ms",
    "max_cpu": "193.409µs"
  }
]
----
====

== Limitations

When working with Couchbase transactions, AWR collects performance statistics for all individual statements, and you may notice that the COMMIT statement often shows the highest elapsed time.
However, the AWR report alone does not give you insight into why the COMMIT statement took so long to execute.
\ No newline at end of file
diff --git a/modules/n1ql/partials/nav.adoc b/modules/n1ql/partials/nav.adoc
index adc7e548e..08f347b77 100644
--- a/modules/n1ql/partials/nav.adoc
+++ b/modules/n1ql/partials/nav.adoc
@@ -39,6 +39,7 @@
 *** xref:manage:monitor/monitoring-indexes.adoc[]
 *** xref:manage:manage-indexes/manage-indexes.adoc[]
 *** xref:n1ql:n1ql-manage/query-settings.adoc[]
+ *** xref:n1ql:n1ql-manage/query-awr.adoc[]
 ** xref:n1ql:n1ql-language-reference/index.adoc[]
 *** xref:n1ql:n1ql-language-reference/conventions.adoc[]
 *** xref:n1ql:n1ql-language-reference/reservedwords.adoc[]
diff --git a/preview/DOC-12664-add-awr.yml b/preview/DOC-12664-add-awr.yml
new file mode 100644
index 000000000..11805b60e
--- /dev/null
+++ b/preview/DOC-12664-add-awr.yml
@@ -0,0 +1,9 @@
+sources:
+  docs-server:
+    branches: [release/8.0]
+  cb-swagger:
+    branches: [release/8.0]
+    start_path: docs
+override:
+  startPage: server:introduction:intro.adoc