diff --git a/docs/ppl-lang/README.md b/docs/ppl-lang/README.md index 9df9f5986..19e1a6ee0 100644 --- a/docs/ppl-lang/README.md +++ b/docs/ppl-lang/README.md @@ -94,7 +94,7 @@ For additional examples see the next [documentation](PPL-Example-Commands.md). - [`IP Address Functions`](functions/ppl-ip.md) - - [`Lambda Functions`](functions/ppl-lambda.md) + - [`Collection Functions`](functions/ppl-collection.md) --- ### PPL On Spark diff --git a/docs/ppl-lang/functions/ppl-lambda.md b/docs/ppl-lang/functions/ppl-collection.md similarity index 57% rename from docs/ppl-lang/functions/ppl-lambda.md rename to docs/ppl-lang/functions/ppl-collection.md index cdb6f9e8f..b98f5f5ca 100644 --- a/docs/ppl-lang/functions/ppl-lambda.md +++ b/docs/ppl-lang/functions/ppl-collection.md @@ -1,4 +1,56 @@ -## Lambda Functions +## PPL Collection Functions + +### `ARRAY` + +**Description** + +`array(...)` Returns an array with the given elements. + +**Argument type:** +- A `<value>` can be any kind of value such as string, number, or boolean. + +**Return type:** ARRAY + +Example: + + os> source=people | eval `array` = array(1, 2, 0, -1, 1.1, -0.11) + fetched rows / total rows = 1/1 + +------------------------------+ + | array | + +------------------------------+ + | [1.0,2.0,0.0,-1.0,1.1,-0.11] | + +------------------------------+ + os> source=people | eval `array` = array(true, false, true, true) + fetched rows / total rows = 1/1 + +------------------------------+ + | array | + +------------------------------+ + | [true, false, true, true] | + +------------------------------+ + + +### `ARRAY_LENGTH` + +**Description** + +`array_length(array)` Returns the number of elements in the outermost array. + +**Argument type:** ARRAY + +ARRAY or JSON_ARRAY object.
+ +**Return type:** INTEGER + +Example: + + os> source=people | eval `array` = array_length(array(1,2,3,4)), `empty_array` = array_length(array()) + fetched rows / total rows = 1/1 + +---------+---------------+ + | array | empty_array | + +---------+---------------+ + | 4 | 0 | + +---------+---------------+ + ### `FORALL` @@ -14,7 +66,7 @@ Returns `TRUE` if all elements in the array satisfy the lambda predicate, otherw Example: - os> source=people | eval array = json_array(1, -1, 2), result = forall(array, x -> x > 0) | fields result + os> source=people | eval array = array(1, -1, 2), result = forall(array, x -> x > 0) | fields result fetched rows / total rows = 1/1 +-----------+ | result | @@ -22,7 +74,7 @@ Example: | false | +-----------+ - os> source=people | eval array = json_array(1, 3, 2), result = forall(array, x -> x > 0) | fields result + os> source=people | eval array = array(1, 3, 2), result = forall(array, x -> x > 0) | fields result fetched rows / total rows = 1/1 +-----------+ | result | @@ -41,7 +93,7 @@ Consider constructing the following array: and perform lambda functions against the nested fields `a` or `b`. See the examples: - os> source=people | eval array = json_array(json_object("a", 1, "b", 1), json_object("a" , -1, "b", 2)), result = forall(array, x -> x.a > 0) | fields result + os> source=people | eval array = array(json_object("a", 1, "b", 1), json_object("a" , -1, "b", 2)), result = forall(array, x -> x.a > 0) | fields result fetched rows / total rows = 1/1 +-----------+ | result | @@ -49,7 +101,7 @@ and perform lambda functions against the nested fields `a` or `b`. 
See the examp | false | +-----------+ - os> source=people | eval array = json_array(json_object("a", 1, "b", 1), json_object("a" , -1, "b", 2)), result = forall(array, x -> x.b > 0) | fields result + os> source=people | eval array = array(json_object("a", 1, "b", 1), json_object("a" , -1, "b", 2)), result = forall(array, x -> x.b > 0) | fields result fetched rows / total rows = 1/1 +-----------+ | result | @@ -71,7 +123,7 @@ Returns `TRUE` if at least one element in the array satisfies the lambda predica Example: - os> source=people | eval array = json_array(1, -1, 2), result = exists(array, x -> x > 0) | fields result + os> source=people | eval array = array(1, -1, 2), result = exists(array, x -> x > 0) | fields result fetched rows / total rows = 1/1 +-----------+ | result | @@ -79,7 +131,7 @@ Example: | true | +-----------+ - os> source=people | eval array = json_array(-1, -3, -2), result = exists(array, x -> x > 0) | fields result + os> source=people | eval array = array(-1, -3, -2), result = exists(array, x -> x > 0) | fields result fetched rows / total rows = 1/1 +-----------+ | result | @@ -102,7 +154,7 @@ An ARRAY that contains all elements in the input array that satisfy the lambda p Example: - os> source=people | eval array = json_array(1, -1, 2), result = filter(array, x -> x > 0) | fields result + os> source=people | eval array = array(1, -1, 2), result = filter(array, x -> x > 0) | fields result fetched rows / total rows = 1/1 +-----------+ | result | @@ -110,7 +162,7 @@ Example: | [1, 2] | +-----------+ - os> source=people | eval array = json_array(-1, -3, -2), result = filter(array, x -> x > 0) | fields result + os> source=people | eval array = array(-1, -3, -2), result = filter(array, x -> x > 0) | fields result fetched rows / total rows = 1/1 +-----------+ | result | @@ -132,7 +184,7 @@ An ARRAY that contains the result of applying the lambda transform function to e Example: - os> source=people | eval array = json_array(1, 2, 3), result = 
transform(array, x -> x + 1) | fields result + os> source=people | eval array = array(1, 2, 3), result = transform(array, x -> x + 1) | fields result fetched rows / total rows = 1/1 +--------------+ | result | @@ -140,7 +192,7 @@ Example: | [2, 3, 4] | +--------------+ - os> source=people | eval array = json_array(1, 2, 3), result = transform(array, (x, i) -> x + i) | fields result + os> source=people | eval array = array(1, 2, 3), result = transform(array, (x, i) -> x + i) | fields result fetched rows / total rows = 1/1 +--------------+ | result | @@ -162,7 +214,7 @@ The final result of applying the lambda functions to the start value and the inp Example: - os> source=people | eval array = json_array(1, 2, 3), result = reduce(array, 0, (acc, x) -> acc + x) | fields result + os> source=people | eval array = array(1, 2, 3), result = reduce(array, 0, (acc, x) -> acc + x) | fields result fetched rows / total rows = 1/1 +-----------+ | result | @@ -170,7 +222,7 @@ Example: | 6 | +-----------+ - os> source=people | eval array = json_array(1, 2, 3), result = reduce(array, 10, (acc, x) -> acc + x) | fields result + os> source=people | eval array = array(1, 2, 3), result = reduce(array, 10, (acc, x) -> acc + x) | fields result fetched rows / total rows = 1/1 +-----------+ | result | @@ -178,7 +230,7 @@ Example: | 16 | +-----------+ - os> source=people | eval array = json_array(1, 2, 3), result = reduce(array, 0, (acc, x) -> acc + x, acc -> acc * 10) | fields result + os> source=people | eval array = array(1, 2, 3), result = reduce(array, 0, (acc, x) -> acc + x, acc -> acc * 10) | fields result fetched rows / total rows = 1/1 +-----------+ | result | diff --git a/docs/ppl-lang/functions/ppl-json.md b/docs/ppl-lang/functions/ppl-json.md index 5b26ee427..2c0c0ca67 100644 --- a/docs/ppl-lang/functions/ppl-json.md +++ b/docs/ppl-lang/functions/ppl-json.md @@ -95,6 +95,11 @@ Example: | {"array":[1.0,2.0,0.0,-1.0,1.1,-0.11]} | +----------------------------------------+ 
+**Limitation** + +The list of parameters of `json_array` should all be the same type. +`json_array('this', 'is', 1.1, -0.11, true, false)` throws exception. + ### `TO_JSON_STRING` **Description** @@ -149,29 +154,6 @@ Example: +-----------+-----------+-------------+ -### `ARRAY_LENGTH` - -**Description** - -`array_length(jsonArray)` Returns the number of elements in the outermost array. - -**Argument type:** ARRAY - -ARRAY or JSON_ARRAY object. - -**Return type:** INTEGER - -Example: - - os> source=people | eval `json_array` = json_array_length(json_array(1,2,3,4)), `empty_array` = json_array_length(json_array()) - fetched rows / total rows = 1/1 - +--------------+---------------+ - | json_array | empty_array | - +--------------+---------------+ - | 4 | 0 | - +--------------+---------------+ - - ### `JSON_EXTRACT` **Description** @@ -280,3 +262,189 @@ Example: |------------------+---------| | 13 | null | +------------------+---------+ + +### `FORALL` + +**Description** + +`forall(json_array, lambda)` Evaluates whether a lambda predicate holds for all elements in the json_array. + +**Argument type:** ARRAY, LAMBDA + +**Return type:** BOOLEAN + +Returns `TRUE` if all elements in the array satisfy the lambda predicate, otherwise `FALSE`. + +Example: + + os> source=people | eval array = json_array(1, -1, 2), result = forall(array, x -> x > 0) | fields result + fetched rows / total rows = 1/1 + +-----------+ + | result | + +-----------+ + | false | + +-----------+ + + os> source=people | eval array = json_array(1, 3, 2), result = forall(array, x -> x > 0) | fields result + fetched rows / total rows = 1/1 + +-----------+ + | result | + +-----------+ + | true | + +-----------+ + +**Note:** The lambda expression can access the nested fields of the array elements. This applies to all lambda functions introduced in this document. 
+ +Consider constructing the following array: + + array = [ + {"a":1, "b":1}, + {"a":-1, "b":2} + ] + +and perform lambda functions against the nested fields `a` or `b`. See the examples: + + os> source=people | eval array = json_array(json_object("a", 1, "b", 1), json_object("a" , -1, "b", 2)), result = forall(array, x -> x.a > 0) | fields result + fetched rows / total rows = 1/1 + +-----------+ + | result | + +-----------+ + | false | + +-----------+ + + os> source=people | eval array = json_array(json_object("a", 1, "b", 1), json_object("a" , -1, "b", 2)), result = forall(array, x -> x.b > 0) | fields result + fetched rows / total rows = 1/1 + +-----------+ + | result | + +-----------+ + | true | + +-----------+ + +### `EXISTS` + +**Description** + +`exists(json_array, lambda)` Evaluates whether a lambda predicate holds for one or more elements in the json_array. + +**Argument type:** ARRAY, LAMBDA + +**Return type:** BOOLEAN + +Returns `TRUE` if at least one element in the array satisfies the lambda predicate, otherwise `FALSE`. + +Example: + + os> source=people | eval array = json_array(1, -1, 2), result = exists(array, x -> x > 0) | fields result + fetched rows / total rows = 1/1 + +-----------+ + | result | + +-----------+ + | true | + +-----------+ + + os> source=people | eval array = json_array(-1, -3, -2), result = exists(array, x -> x > 0) | fields result + fetched rows / total rows = 1/1 + +-----------+ + | result | + +-----------+ + | false | + +-----------+ + + +### `FILTER` + +**Description** + +`filter(json_array, lambda)` Filters the input json_array using the given lambda function. + +**Argument type:** ARRAY, LAMBDA + +**Return type:** ARRAY + +An ARRAY that contains all elements in the input json_array that satisfy the lambda predicate. 
+ +Example: + + os> source=people | eval array = json_array(1, -1, 2), result = filter(array, x -> x > 0) | fields result + fetched rows / total rows = 1/1 + +-----------+ + | result | + +-----------+ + | [1, 2] | + +-----------+ + + os> source=people | eval array = json_array(-1, -3, -2), result = filter(array, x -> x > 0) | fields result + fetched rows / total rows = 1/1 + +-----------+ + | result | + +-----------+ + | [] | + +-----------+ + +### `TRANSFORM` + +**Description** + +`transform(json_array, lambda)` Transform elements in a json_array using the lambda transform function. The second argument implies the index of the element if using binary lambda function. This is similar to a `map` in functional programming. + +**Argument type:** ARRAY, LAMBDA + +**Return type:** ARRAY + +An ARRAY that contains the result of applying the lambda transform function to each element in the input array. + +Example: + + os> source=people | eval array = json_array(1, 2, 3), result = transform(array, x -> x + 1) | fields result + fetched rows / total rows = 1/1 + +--------------+ + | result | + +--------------+ + | [2, 3, 4] | + +--------------+ + + os> source=people | eval array = json_array(1, 2, 3), result = transform(array, (x, i) -> x + i) | fields result + fetched rows / total rows = 1/1 + +--------------+ + | result | + +--------------+ + | [1, 3, 5] | + +--------------+ + +### `REDUCE` + +**Description** + +`reduce(json_array, start, merge_lambda, finish_lambda)` Applies a binary merge lambda function to a start value and all elements in the json_array, and reduces this to a single state. The final state is converted into the final result by applying a finish lambda function. + +**Argument type:** ARRAY, ANY, LAMBDA, LAMBDA + +**Return type:** ANY + +The final result of applying the lambda functions to the start value and the input json_array. 
+ +Example: + + os> source=people | eval array = json_array(1, 2, 3), result = reduce(array, 0, (acc, x) -> acc + x) | fields result + fetched rows / total rows = 1/1 + +-----------+ + | result | + +-----------+ + | 6 | + +-----------+ + + os> source=people | eval array = json_array(1, 2, 3), result = reduce(array, 10, (acc, x) -> acc + x) | fields result + fetched rows / total rows = 1/1 + +-----------+ + | result | + +-----------+ + | 16 | + +-----------+ + + os> source=people | eval array = json_array(1, 2, 3), result = reduce(array, 0, (acc, x) -> acc + x, acc -> acc * 10) | fields result + fetched rows / total rows = 1/1 + +-----------+ + | result | + +-----------+ + | 60 | + +-----------+ diff --git a/docs/ppl-lang/ppl-dedup-command.md b/docs/ppl-lang/ppl-dedup-command.md index 28fe7f4a4..831c4926f 100644 --- a/docs/ppl-lang/ppl-dedup-command.md +++ b/docs/ppl-lang/ppl-dedup-command.md @@ -1,6 +1,6 @@ -# PPL dedup command +## PPL dedup command -## Table of contents +### Table of contents - [Description](#description) - [Syntax](#syntax) @@ -11,11 +11,11 @@ - [Example 4: Dedup in consecutive document](#example-4-dedup-in-consecutive-document) - [Limitation](#limitation) -## Description +### Description Using `dedup` command to remove identical document defined by field from the search result. -## Syntax +### Syntax ```sql dedup [int] [keepempty=] [consecutive=] diff --git a/docs/ppl-lang/ppl-eval-command.md b/docs/ppl-lang/ppl-eval-command.md index 1908c087c..e98d4d4f2 100644 --- a/docs/ppl-lang/ppl-eval-command.md +++ b/docs/ppl-lang/ppl-eval-command.md @@ -1,10 +1,10 @@ -# PPL `eval` command +## PPL `eval` command -## Description +### Description The ``eval`` command evaluate the expression and append the result to the search result. -## Syntax +### Syntax ```sql eval = ["," = ]... 
``` diff --git a/docs/ppl-lang/ppl-fields-command.md b/docs/ppl-lang/ppl-fields-command.md index e37fc644f..4ef041ee2 100644 --- a/docs/ppl-lang/ppl-fields-command.md +++ b/docs/ppl-lang/ppl-fields-command.md @@ -1,12 +1,12 @@ ## PPL `fields` command -**Description** +### Description Using ``field`` command to keep or remove fields from the search result. -**Syntax** +### Syntax -field [+|-] +`field [+|-] ` * index: optional. if the plus (+) is used, only the fields specified in the field list will be keep. if the minus (-) is used, all the fields specified in the field list will be removed. **Default** + * field list: mandatory. comma-delimited keep or remove fields. diff --git a/docs/ppl-lang/ppl-fieldsummary-command.md b/docs/ppl-lang/ppl-fieldsummary-command.md index 468c2046b..2015cf815 100644 --- a/docs/ppl-lang/ppl-fieldsummary-command.md +++ b/docs/ppl-lang/ppl-fieldsummary-command.md @@ -1,11 +1,11 @@ ## PPL `fieldsummary` command -**Description** +### Description Using `fieldsummary` command to : - Calculate basic statistics for each field (count, distinct count, min, max, avg, stddev, mean ) - Determine the data type of each field -**Syntax** +### Syntax `... 
| fieldsummary (nulls=true/false)` diff --git a/docs/ppl-lang/ppl-grok-command.md b/docs/ppl-lang/ppl-grok-command.md index 06028109b..a9b5645c5 100644 --- a/docs/ppl-lang/ppl-grok-command.md +++ b/docs/ppl-lang/ppl-grok-command.md @@ -1,4 +1,4 @@ -## PPL Correlation Command +## PPL Grok Command ### Description diff --git a/docs/ppl-lang/ppl-join-command.md b/docs/ppl-lang/ppl-join-command.md index b374bce5f..95b375e0a 100644 --- a/docs/ppl-lang/ppl-join-command.md +++ b/docs/ppl-lang/ppl-join-command.md @@ -1,10 +1,115 @@ ## PPL Join Command -## Overview +### Description -[Trace analytics](https://opensearch.org/docs/latest/observability-plugin/trace/ta-dashboards/) considered using SQL/PPL for its queries, but some graphs rely on joining two indices (span index and service map index) together which is not supported by SQL/PPL. Trace analytics was implemented with DSL + javascript, would be good if `join` being added to SQL could support this use case. +`JOIN` command combines two datasets together. The left side could be an index or results from a piped commands, the right side could be either an index or a subquery. -### Schema +### Syntax + +`[joinType] join [leftAlias] [rightAlias] [joinHints] on ` + +**joinType** +- Syntax: `[INNER] | LEFT [OUTER] | RIGHT [OUTER] | FULL [OUTER] | CROSS | [LEFT] SEMI | [LEFT] ANTI` +- Optional +- Description: The type of join to perform. The default is `INNER` if not specified. + +**leftAlias** +- Syntax: `left = ` +- Optional +- Description: The subquery alias to use with the left join side, to avoid ambiguous naming. + +**rightAlias** +- Syntax: `right = ` +- Optional +- Description: The subquery alias to use with the right join side, to avoid ambiguous naming. + +**joinHints** +- Syntax: `[hint.left.key1 = value1 hint.right.key2 = value2]` +- Optional +- Description: Zero or more space-separated join hints in the form of `Key` = `Value`. 
The key must start with `hint.left.` or `hint.right.` + +**joinCriteria** +- Syntax: `` +- Required +- Description: The syntax starts with `ON`. It could be any comparison expression. Generally, the join criteria looks like `.=.`. For example: `l.id = r.id`. If the join criteria contains multiple conditions, you can specify `AND` and `OR` operator between each comparison expression. For example, `l.id = r.id AND l.email = r.email AND (r.age > 65 OR r.age < 18)`. + +**right-dataset** +- Required +- Description: Right dataset could be either an index or a subquery with/without alias. + +### Example 1: two indices join + +PPL query: + + os> source=customer | join ON c_custkey = o_custkey orders + | fields c_custkey, c_nationkey, c_mktsegment, o_orderkey, o_orderstatus, o_totalprice | head 10 + fetched rows / total rows = 10/10 + +----------+-------------+-------------+------------+---------------+-------------+ + | c_custkey| c_nationkey | c_mktsegment| o_orderkey | o_orderstatus | o_totalprice| + +----------+-------------+-------------+------------+---------------+-------------+ + | 36901 | 13 | AUTOMOBILE | 1 | O | 173665.47 | + | 78002 | 10 | AUTOMOBILE | 2 | O | 46929.18 | + | 123314 | 15 | MACHINERY | 3 | F | 193846.25 | + | 136777 | 10 | HOUSEHOLD | 4 | O | 32151.78 | + | 44485 | 20 | FURNITURE | 5 | F | 144659.2 | + | 55624 | 7 | AUTOMOBILE | 6 | F | 58749.59 | + | 39136 | 5 | FURNITURE | 7 | O | 252004.18 | + | 130057 | 9 | FURNITURE | 32 | O | 208660.75 | + | 66958 | 18 | MACHINERY | 33 | F | 163243.98 | + | 61001 | 3 | FURNITURE | 34 | O | 58949.67 | + +----------+-------------+-------------+------------+---------------+-------------+ + +### Example 2: three indices join + +PPL query: + + os> source=customer | join ON c_custkey = o_custkey orders | join ON c_nationkey = n_nationkey nation + | fields c_custkey, c_mktsegment, o_orderkey, o_orderstatus, o_totalprice, n_name | head 10 + fetched rows / total rows = 10/10 + 
+----------+-------------+------------+---------------+-------------+--------------+ + | c_custkey| c_mktsegment| o_orderkey | o_orderstatus | o_totalprice| n_name | + +----------+-------------+------------+---------------+-------------+--------------+ + | 36901 | AUTOMOBILE | 1 | O | 173665.47 | JORDAN | + | 78002 | AUTOMOBILE | 2 | O | 46929.18 | IRAN | + | 123314 | MACHINERY | 3 | F | 193846.25 | MOROCCO | + | 136777 | HOUSEHOLD | 4 | O | 32151.78 | IRAN | + | 44485 | FURNITURE | 5 | F | 144659.2 | SAUDI ARABIA | + | 55624 | AUTOMOBILE | 6 | F | 58749.59 | GERMANY | + | 39136 | FURNITURE | 7 | O | 252004.18 | ETHIOPIA | + | 130057 | FURNITURE | 32 | O | 208660.75 | INDONESIA | + | 66958 | MACHINERY | 33 | F | 163243.98 | CHINA | + | 61001 | FURNITURE | 34 | O | 58949.67 | CANADA | + +----------+-------------+------------+---------------+-------------+--------------+ + +### Example 3: join a subquery in right side + +PPL query: + + os>source=supplier| join right = revenue0 ON s_suppkey = supplier_no + [ + source=lineitem | where l_shipdate >= date('1996-01-01') AND l_shipdate < date_add(date('1996-01-01'), interval 3 month) + | eval supplier_no = l_suppkey | stats sum(l_extendedprice * (1 - l_discount)) as total_revenue by supplier_no + ] + | fields s_name, s_phone, total_revenue, supplier_no | head 10 + fetched rows / total rows = 10/10 + +---------------------+----------------+-------------------+-------------+ + | s_name | s_phone | total_revenue | supplier_no | + +---------------------+----------------+-------------------+-------------+ + | Supplier#000007747 | 24-911-546-3505| 636204.0279 | 7747 | + | Supplier#000007748 | 29-535-184-2277| 538311.8099 | 7748 | + | Supplier#000007749 | 18-225-478-7489| 743462.4473000001 | 7749 | + | Supplier#000007750 | 28-680-484-7044| 616828.2220999999 | 7750 | + | Supplier#000007751 | 20-990-606-7343| 1092975.1925 | 7751 | + | Supplier#000007752 | 12-936-258-6650| 1090399.9666 | 7752 | + | Supplier#000007753 | 
22-394-329-1153| 777130.7457000001 | 7753 | + | Supplier#000007754 | 26-941-591-5320| 866600.0501 | 7754 | + | Supplier#000007755 | 32-138-467-4225| 702256.7030000001 | 7755 | + | Supplier#000007756 | 29-860-205-8019| 1304979.0511999999| 7756 | + +---------------------+----------------+-------------------+-------------+ + +### Example 4: complex example in OTEL + +**Schema** There will be at least 2 indices, `otel-v1-apm-span-*` (large) and `otel-v1-apm-service-map` (small). @@ -30,154 +135,47 @@ Relevant fields from indices: Full schemas are defined in data-prepper repo: [`otel-v1-apm-span-*`](https://github.com/opensearch-project/data-prepper/blob/04dd7bd18977294800cf4b77d7f01914def75f23/docs/schemas/trace-analytics/otel-v1-apm-span-index-template.md), [`otel-v1-apm-service-map`](https://github.com/opensearch-project/data-prepper/blob/4e5f83814c4a0eed2a1ca9bab0693b9e32240c97/docs/schemas/trace-analytics/otel-v1-apm-service-map-index-template.md) -### Requirement - -Support `join` to calculate the following: +**Requirement** For each service, join span index on service map index to calculate metrics under different type of filters. ![image](https://user-images.githubusercontent.com/28062824/194170062-f0dd1d57-c5eb-44db-95e0-6b3b4e52f25a.png) -This sample query calculates latency when filtered by trace group `client_cancel_order` for the `order` service. I only have a subquery example, don't have the join version of the query.. 
- -```sql -SELECT avg(durationInNanos) -FROM `otel-v1-apm-span-000001` t1 -WHERE t1.serviceName = `order` - AND ((t1.name in - (SELECT target.resource - FROM `otel-v1-apm-service-map` - WHERE serviceName = `order` - AND traceGroupName = `client_cancel_order`) - AND t1.parentSpanId != NULL) - OR (t1.parentSpanId = NULL - AND t1.name = `client_cancel_order`)) - AND t1.traceId in - (SELECT traceId - FROM `otel-v1-apm-span-000001` - WHERE serviceName = `order`) -``` -## Migrate to PPL - -### Syntax of Join Command - -```sql -SEARCH source= -| -| [joinType] JOIN - [leftAlias] - [rightAlias] - [joinHints] - ON joinCriteria - -| -``` -**joinType** -- Syntax: `[INNER] | LEFT [OUTER] | RIGHT [OUTER] | FULL [OUTER] | CROSS | [LEFT] SEMI | [LEFT] ANTI` -- Optional -- Description: The type of join to perform. The default is `INNER` if not specified. +This sample query calculates latency when filtered by trace group `client_cancel_order` for the `order` service. I only have a subquery example, don't have the join version of the query. -**leftAlias** -- Syntax: `left = ` -- Optional -- Description: The subquery alias to use with the left join side, to avoid ambiguous naming. - -**rightAlias** -- Syntax: `right = ` -- Optional -- Description: The subquery alias to use with the right join side, to avoid ambiguous naming. - -**joinHints** -- Syntax: `[hint.left.key1 = value1 hint.right.key2 = value2]` -- Optional -- Description: Zero or more space-separated join hints in the form of `Key` = `Value`. The key must start with `hint.left.` or `hint.right.` - -**joinCriteria** -- Syntax: `` -- Required -- Description: The syntax starts with `ON`. It could be any comparison expression. Generally, the join criteria looks like `.=.`. For example: `l.id = r.id`. If the join criteria contains multiple conditions, you can specify `AND` and `OR` operator between each comparison expression. For example, `l.id = r.id AND l.email = r.email AND (r.age > 65 OR r.age < 18)`. 
- -**right-table** -- Required -- Description: The index or table name of join right-side. Sub-search is unsupported in join right side for now. - -### Rewriting -```sql -SEARCH source=otel-v1-apm-span-000001 +PPL query: +``` +source=otel-v1-apm-span-000001 | WHERE serviceName = 'order' | JOIN left=t1 right=t2 ON t1.traceId = t2.traceId AND t2.serviceName = 'order' - otel-v1-apm-span-000001 -- self inner join -| EVAL s_name = t1.name -- rename to avoid ambiguous -| EVAL s_parentSpanId = t1.parentSpanId -- RENAME command would be better when it is supported -| EVAL s_durationInNanos = t1.durationInNanos -| FIELDS s_name, s_parentSpanId, s_durationInNanos -- reduce colunms in join + otel-v1-apm-span-000001 // self inner join +| RENAME s_name as t1.name +| RENAME s_parentSpanId as t1.parentSpanId +| RENAME s_durationInNanos as t1.durationInNanos +| FIELDS s_name, s_parentSpanId, s_durationInNanos // reduce colunms in join | LEFT JOIN left=s1 right=t3 ON s_name = t3.target.resource AND t3.serviceName = 'order' AND t3.traceGroupName = 'client_cancel_order' otel-v1-apm-service-map | WHERE (s_parentSpanId IS NOT NULL OR (s_parentSpanId IS NULL AND s_name = 'client_cancel_order')) -| STATS avg(s_durationInNanos) -- no need to add alias if there is no ambiguous -``` - - -### More examples - -Migration from SQL query (TPC-H Q13): -```sql -SELECT c_count, COUNT(*) AS custdist -FROM - ( SELECT c_custkey, COUNT(o_orderkey) c_count - FROM customer LEFT OUTER JOIN orders ON c_custkey = o_custkey - AND o_comment NOT LIKE '%unusual%packages%' - GROUP BY c_custkey - ) AS c_orders -GROUP BY c_count -ORDER BY custdist DESC, c_count DESC; -``` -Rewritten by PPL Join query: -```sql -SEARCH source=customer -| FIELDS c_custkey -| LEFT OUTER JOIN - ON c_custkey = o_custkey AND o_comment NOT LIKE '%unusual%packages%' - orders -| STATS count(o_orderkey) AS c_count BY c_custkey -| STATS count() AS custdist BY c_count -| SORT - custdist, - c_count -``` -_- **Limitation: sub-searches is 
unsupported in join right side**_ - -If sub-searches is supported, above ppl query could be rewritten as: -```sql -SEARCH source=customer -| FIELDS c_custkey -| LEFT OUTER JOIN - ON c_custkey = o_custkey - [ - SEARCH source=orders - | WHERE o_comment NOT LIKE '%unusual%packages%' - | FIELDS o_orderkey, o_custkey - ] -| STATS count(o_orderkey) AS c_count BY c_custkey -| STATS count() AS custdist BY c_count -| SORT - custdist, - c_count +| STATS avg(s_durationInNanos) ``` ### Comparison with [Correlation](ppl-correlation-command) A primary difference between `correlate` and `join` is that both sides of `correlate` are tables, but both sides of `join` are subqueries. For example: -```sql +``` source = testTable1 - | where country = 'Canada' OR country = 'England' - | eval cname = lower(name) - | fields cname, country, year, month - | inner join left=l, right=r - ON l.cname = r.name AND l.country = r.country AND l.year = 2023 AND r.month = 4 - testTable2s +| where country = 'Canada' OR country = 'England' +| eval cname = lower(name) +| fields cname, country, year, month +| inner join left=l right=r + ON l.cname = r.name AND l.country = r.country AND l.year = 2023 AND r.month = 4 + testTable2s ``` The subquery alias `l` does not represent the `testTable1` table itself. Instead, it represents the subquery: -```sql +``` source = testTable1 | where country = 'Canada' OR country = 'England' | eval cname = lower(name) diff --git a/docs/ppl-lang/ppl-lookup-command.md b/docs/ppl-lang/ppl-lookup-command.md index 1b8350533..6768cdcaf 100644 --- a/docs/ppl-lang/ppl-lookup-command.md +++ b/docs/ppl-lang/ppl-lookup-command.md @@ -1,20 +1,18 @@ ## PPL Lookup Command -## Overview +### Description Lookup command enriches your search data by adding or replacing data from a lookup index (dimension table). You can extend fields of an index with values from a dimension table, append or replace values when lookup condition is matched. 
As an alternative of [Join command](ppl-join-command), lookup command is more suitable for enriching the source data with a static dataset. -### Syntax of Lookup Command +### Syntax -```sql -SEARCH source= -| -| LOOKUP ( [AS ])... - [(REPLACE | APPEND) ( [AS ])...] -| ``` +LOOKUP ( [AS ])... + [(REPLACE | APPEND) ( [AS ])...] +``` + **lookupIndex** - Required - Description: the name of lookup index (dimension table) @@ -44,26 +42,49 @@ SEARCH source= - Description: If you specify REPLACE, matched values in \ field overwrite the values in result. If you specify APPEND, matched values in \ field only append to the missing values in result. ### Usage -> LOOKUP id AS cid REPLACE mail AS email
-> LOOKUP name REPLACE mail AS email
-> LOOKUP id AS cid, name APPEND address, mail AS email
-> LOOKUP id
- -### Example -```sql -SEARCH source= -| WHERE orderType = 'Cancelled' -| LOOKUP account_list, mkt_id AS mkt_code REPLACE amount, account_name AS name -| STATS count(mkt_code), avg(amount) BY name -``` -```sql -SEARCH source= -| DEDUP market_id -| EVAL category=replace(category, "-", ".") -| EVAL category=ltrim(category, "dvp.") -| LOOKUP bounce_category category AS category APPEND classification -``` -```sql -SEARCH source= -| LOOKUP bounce_category category -``` +- `LOOKUP id AS cid REPLACE mail AS email` +- `LOOKUP name REPLACE mail AS email` +- `LOOKUP id AS cid, name APPEND address, mail AS email` +- `LOOKUP id` + +### Examples 1: replace + +PPL query: + + os>source=people | LOOKUP work_info uid AS id REPLACE department | head 10 + fetched rows / total rows = 10/10 + +------+-----------+-------------+-----------+--------+------------------+ + | id | name | occupation | country | salary | department | + +------+-----------+-------------+-----------+--------+------------------+ + | 1000 | Daniel | Teacher | Canada | 56486 | CUSTOMER_SERVICE | + | 1001 | Joseph | Lawyer | Denmark | 135943 | FINANCE | + | 1002 | David | Artist | Finland | 60391 | DATA | + | 1003 | Charlotte | Lawyer | Denmark | 42173 | LEGAL | + | 1004 | Isabella | Veterinarian| Australia | 117699 | MARKETING | + | 1005 | Lily | Engineer | Italy | 37526 | IT | + | 1006 | Emily | Dentist | Denmark | 125340 | MARKETING | + | 1007 | James | Lawyer | Germany | 56532 | LEGAL | + | 1008 | Lucas | Lawyer | Japan | 87782 | DATA | + | 1009 | Sophia | Architect | Sweden | 37597 | MARKETING | + +------+-----------+-------------+-----------+--------+------------------+ + +### Examples 2: append + +PPL query: + + os>source=people| LOOKUP work_info uid AS ID, name APPEND department | where isnotnull(department) | head 10 + fetched rows / total rows = 10/10 + +------+---------+-------------+-------------+--------+------------+ + | id | name | occupation | country | salary | department | + 
+------+---------+-------------+-------------+--------+------------+ + | 1018 | Emma | Architect | USA | 72400 | IT | + | 1032 | James | Pilot | Netherlands | 71698 | SALES | + | 1043 | Jane | Nurse | Brazil | 45016 | FINANCE | + | 1046 | Joseph | Pharmacist | Mexico | 109152 | OPERATIONS | + | 1064 | Joseph | Electrician | New Zealand | 50253 | LEGAL | + | 1090 | Matthew | Psychologist| Germany | 73396 | DATA | + | 1103 | Emily | Electrician | Switzerland | 98391 | DATA | + | 1114 | Jake | Nurse | Denmark | 53418 | SALES | + | 1115 | Sofia | Engineer | Mexico | 64829 | OPERATIONS | + | 1122 | Oliver | Scientist | Netherlands | 31146 | DATA | + +------+---------+-------------+-------------+--------+------------+ diff --git a/docs/ppl-lang/ppl-rare-command.md b/docs/ppl-lang/ppl-rare-command.md index e3ad21f4e..8a2ca640f 100644 --- a/docs/ppl-lang/ppl-rare-command.md +++ b/docs/ppl-lang/ppl-rare-command.md @@ -1,11 +1,11 @@ ## PPL rare Command -**Description** +### Description Using ``rare`` command to find the least common tuple of values of all fields in the field list. **Note**: A maximum of 10 results is returned for each distinct tuple of values of the group-by fields. -**Syntax** +### Syntax `rare [N] [by-clause]` `rare_approx [N] [by-clause]` diff --git a/docs/ppl-lang/ppl-subquery-command.md b/docs/ppl-lang/ppl-subquery-command.md index c4a0c337c..b36eb1c80 100644 --- a/docs/ppl-lang/ppl-subquery-command.md +++ b/docs/ppl-lang/ppl-subquery-command.md @@ -1,27 +1,27 @@ -## PPL SubQuery Commands: +## PPL SubQuery Commands -### Syntax -The subquery command should be implemented using a clean, logical syntax that integrates with existing PPL structure. +### Description +The subquery command has 4 types: `InSubquery`, `ExistsSubquery`, `ScalarSubquery` and `RelationSubquery`. +`InSubquery`, `ExistsSubquery` and `ScalarSubquery` are subquery expressions, their common usage is in Where clause(`where `) and Search filter(`search source=* `). 
-```sql
-source=logs | where field in [ subquery source=events | where condition | fields field ]
+For example, a subquery expression could be used in a boolean expression:
 ```
-
-In this example, the primary search (`source=logs`) is filtered by results from the subquery (`source=events`).
-
-The subquery command should allow nested queries to be as complex as necessary, supporting multiple levels of nesting.
-
-Example:
-
-```sql
- source=logs | where id in [ subquery source=users | where user in [ subquery source=actions | where action="login" | fields user] | fields uid ]
+| where orders.order_id in [ source=returns | where return_reason="damaged" | fields order_id ]
 ```
+The `orders.order_id in [ source=... ]` is a `<boolean-expression>`.
-
-For additional info See [Issue](https://github.com/opensearch-project/opensearch-spark/issues/661)
-
----
+But `RelationSubquery` is not a subquery expression, it is a subquery plan.
+[Recall the join command doc](ppl-join-command.md), the example is a subquery/subsearch **plan**, rather than an **expression**.
-
-### InSubquery usage
+### Syntax
+- `where <field> [not] in [ source=... | ... | ... ]` (InSubquery)
+- `where [not] exists [ source=... | ... | ... ]` (ExistsSubquery)
+- `where <field> = [ source=... | ... | ...
]` (ScalarSubquery) +- `source=[ source= ...]` (RelationSubquery) +- `| join ON condition [ source= ]` (RelationSubquery in join right side) + +### Usage +InSubquery: - `source = outer | where a in [ source = inner | fields b ]` - `source = outer | where (a) in [ source = inner | fields b ]` - `source = outer | where (a,b,c) in [ source = inner | fields d,e,f ]` @@ -33,92 +33,9 @@ For additional info See [Issue](https://github.com/opensearch-project/opensearch - `source = outer | where a in [ source = inner1 | where b not in [ source = inner2 | fields c ] | fields b ]` (nested) - `source = table1 | inner join left = l right = r on l.a = r.a AND r.a in [ source = inner | fields d ] | fields l.a, r.a, b, c` (as join filter) -**_SQL Migration examples with IN-Subquery PPL:_** -1. tpch q4 (in-subquery with aggregation) -```sql -select - o_orderpriority, - count(*) as order_count -from - orders -where - o_orderdate >= date '1993-07-01' - and o_orderdate < date '1993-07-01' + interval '3' month - and o_orderkey in ( - select - l_orderkey - from - lineitem - where l_commitdate < l_receiptdate - ) -group by - o_orderpriority -order by - o_orderpriority -``` -Rewritten by PPL InSubquery query: -```sql -source = orders -| where o_orderdate >= "1993-07-01" and o_orderdate < "1993-10-01" and o_orderkey IN - [ source = lineitem - | where l_commitdate < l_receiptdate - | fields l_orderkey - ] -| stats count(1) as order_count by o_orderpriority -| sort o_orderpriority -| fields o_orderpriority, order_count -``` -2.tpch q20 (nested in-subquery) -```sql -select - s_name, - s_address -from - supplier, - nation -where - s_suppkey in ( - select - ps_suppkey - from - partsupp - where - ps_partkey in ( - select - p_partkey - from - part - where - p_name like 'forest%' - ) - ) - and s_nationkey = n_nationkey - and n_name = 'CANADA' -order by - s_name -``` -Rewritten by PPL InSubquery query: -```sql -source = supplier -| where s_suppkey IN [ - source = partsupp - | where ps_partkey IN [ - 
source = part - | where like(p_name, "forest%") - | fields p_partkey - ] - | fields ps_suppkey - ] -| inner join left=l right=r on s_nationkey = n_nationkey and n_name = 'CANADA' - nation -| sort s_name -``` ---- - -### ExistsSubquery usage - -Assumptions: `a`, `b` are fields of table outer, `c`, `d` are fields of table inner, `e`, `f` are fields of table inner2 +ExistsSubquery: +(Assumptions: `a`, `b` are fields of table outer, `c`, `d` are fields of table inner, `e`, `f` are fields of table inner2) - `source = outer | where exists [ source = inner | where a = c ]` - `source = outer | where not exists [ source = inner | where a = c ]` - `source = outer | where exists [ source = inner | where a = c and b = d ]` @@ -132,48 +49,9 @@ Assumptions: `a`, `b` are fields of table outer, `c`, `d` are fields of table in - `source = outer | where not exists [ source = inner | where c > 10 ]` (uncorrelated exists) - `source = outer | where exists [ source = inner ] | eval l = "nonEmpty" | fields l` (special uncorrelated exists) -**_SQL Migration examples with Exists-Subquery PPL:_** - -tpch q4 (exists subquery with aggregation) -```sql -select - o_orderpriority, - count(*) as order_count -from - orders -where - o_orderdate >= date '1993-07-01' - and o_orderdate < date '1993-07-01' + interval '3' month - and exists ( - select - l_orderkey - from - lineitem - where l_orderkey = o_orderkey - and l_commitdate < l_receiptdate - ) -group by - o_orderpriority -order by - o_orderpriority -``` -Rewritten by PPL ExistsSubquery query: -```sql -source = orders -| where o_orderdate >= "1993-07-01" and o_orderdate < "1993-10-01" - and exists [ - source = lineitem - | where l_orderkey = o_orderkey and l_commitdate < l_receiptdate - ] -| stats count(1) as order_count by o_orderpriority -| sort o_orderpriority -| fields o_orderpriority, order_count -``` ---- - -### ScalarSubquery usage +ScalarSubquery: -Assumptions: `a`, `b` are fields of table outer, `c`, `d` are fields of table inner, `e`, 
`f` are fields of table nested +(Assumptions: `a`, `b` are fields of table outer, `c`, `d` are fields of table inner, `e`, `f` are fields of table nested) **Uncorrelated scalar subquery in Select** - `source = outer | eval m = [ source = inner | stats max(c) ] | fields m, a` @@ -203,146 +81,98 @@ Assumptions: `a`, `b` are fields of table outer, `c`, `d` are fields of table in - `source = outer | where a = [ source = inner | stats max(c) | sort c ] OR b = [ source = inner | where c = 1 | stats min(d) | sort d ]` - `source = outer | where a = [ source = inner | where c = [ source = nested | stats max(e) by f | sort f ] | stats max(d) by c | sort c | head 1 ]` -_SQL Migration examples with Scalar-Subquery PPL:_ -Example 1 -```sql -SELECT * -FROM outer -WHERE a = (SELECT max(c) - FROM inner1 - WHERE c = (SELECT max(e) - FROM inner2 - GROUP BY f - ORDER BY f - ) - GROUP BY c - ORDER BY c - LIMIT 1) -``` -Rewritten by PPL ScalarSubquery query: -```sql -source = spark_catalog.default.outer -| where a = [ - source = spark_catalog.default.inner1 - | where c = [ - source = spark_catalog.default.inner2 - | stats max(e) by f - | sort f - ] - | stats max(d) by c - | sort c - | head 1 - ] -``` -Example 2 -```sql -SELECT * FROM outer -WHERE a = (SELECT max(c) - FROM inner - ORDER BY c) -OR b = (SELECT min(d) - FROM inner - WHERE c = 1 - ORDER BY d) -``` -Rewritten by PPL ScalarSubquery query: -```sql -source = spark_catalog.default.outer -| where a = [ - source = spark_catalog.default.inner | stats max(c) | sort c - ] OR b = [ - source = spark_catalog.default.inner | where c = 1 | stats min(d) | sort d - ] -``` ---- - -### (Relation) Subquery -`InSubquery`, `ExistsSubquery` and `ScalarSubquery` are all subquery expressions. But `RelationSubquery` is not a subquery expression, it is a subquery plan which is common used in Join or From clause. 
- -- `source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ]` (subquery in join right side) +RelationSubquery: +- `source = table1 | join left = l right = r on condition [ source = table2 | where d > 10 | head 5 ]` (subquery in join right side) - `source = [ source = table1 | join left = l right = r [ source = table2 | where d > 10 | head 5 ] | stats count(a) by b ] as outer | head 1` -**_SQL Migration examples with Subquery PPL:_** - -tpch q13 -```sql -select - c_count, - count(*) as custdist -from - ( - select - c_custkey, - count(o_orderkey) as c_count - from - customer left outer join orders on - c_custkey = o_custkey - and o_comment not like '%special%requests%' - group by - c_custkey - ) as c_orders -group by - c_count -order by - custdist desc, - c_count desc -``` -Rewritten by PPL (Relation) Subquery: -```sql -SEARCH source = [ - SEARCH source = customer - | LEFT OUTER JOIN left = c right = o ON c_custkey = o_custkey - [ - SEARCH source = orders - | WHERE not like(o_comment, '%special%requests%') - ] - | STATS COUNT(o_orderkey) AS c_count BY c_custkey -] AS c_orders -| STATS COUNT(o_orderkey) AS c_count BY c_custkey -| STATS COUNT(1) AS custdist BY c_count -| SORT - custdist, - c_count -``` ---- +### Examples 1: TPC-H q20 + +PPL query: + + os> source=supplier + | join ON s_nationkey = n_nationkey nation + | where n_name = 'CANADA' + and s_suppkey in [ // InSubquery + source = partsupp + | where ps_partkey in [ // InSubquery + source = part + | where like(p_name, 'forest%') + | fields p_partkey + ] + and ps_availqty > [ // ScalarSubquery + source = lineitem + | where l_partkey = ps_partkey + and l_suppkey = ps_suppkey + and l_shipdate >= date('1994-01-01') + and l_shipdate < date_add(date('1994-01-01'), interval 1 year) + | stats sum(l_quantity) as sum_l_quantity + | eval half_sum_l_quantity = 0.5 * sum_l_quantity // Stats and Eval commands can combine when issues/819 resolved + | fields half_sum_l_quantity + ] + | fields 
ps_suppkey + ] + | fields s_suppkey, s_name, s_phone, s_acctbal, n_name | head 10 + fetched rows / total rows = 10/10 + +-----------+---------------------+----------------+----------+---------+ + | s_suppkey | s_name | s_phone | s_acctbal| n_name | + +-----------+---------------------+----------------+----------+---------+ + | 8243 | Supplier#000008243 | 13-707-547-1386| 9067.07 | CANADA | + | 736 | Supplier#000000736 | 13-681-806-8650| 5700.83 | CANADA | + | 9032 | Supplier#000009032 | 13-441-662-5539| 3982.32 | CANADA | + | 3201 | Supplier#000003201 | 13-600-413-7165| 3799.41 | CANADA | + | 3849 | Supplier#000003849 | 13-582-965-9117| 52.33 | CANADA | + | 5505 | Supplier#000005505 | 13-531-190-6523| 2023.4 | CANADA | + | 5195 | Supplier#000005195 | 13-622-661-2956| 3717.34 | CANADA | + | 9753 | Supplier#000009753 | 13-724-256-7877| 4406.93 | CANADA | + | 7135 | Supplier#000007135 | 13-367-994-6705| 4950.29 | CANADA | + | 5256 | Supplier#000005256 | 13-180-538-8836| 5624.79 | CANADA | + +-----------+---------------------+----------------+----------+---------+ + + +### Examples 2: TPC-H q22 + +PPL query: + + os> source = [ + source = customer + | where substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17') + and c_acctbal > [ + source = customer + | where c_acctbal > 0.00 + and substring(c_phone, 1, 2) in ('13', '31', '23', '29', '30', '18', '17') + | stats avg(c_acctbal) + ] + and not exists [ + source = orders + | where o_custkey = c_custkey + ] + | eval cntrycode = substring(c_phone, 1, 2) + | fields cntrycode, c_acctbal + ] as custsale + | stats count() as numcust, sum(c_acctbal) as totacctbal by cntrycode + | sort cntrycode + fetched rows / total rows = 10/10 + +---------+--------------------+------------+ + | numcust | totacctbal | cntrycode | + +---------+--------------------+------------+ + | 888 | 6737713.989999999 | 13 | + | 861 | 6460573.72 | 17 | + | 964 | 7236687.4 | 18 | + | 892 | 6701457.950000001 | 23 | + | 948 | 7158866.630000001 | 
29 | + | 909 | 6808436.129999999 | 30 | + | 922 | 6806670.179999999 | 31 | + +---------+--------------------+------------+ ### Additional Context -`InSubquery`, `ExistsSubquery` and `ScalarSubquery` as subquery expressions, their common usage is in `where` clause and `search filter`. - -Where command: -``` -| where | ... -``` -Search filter: -``` -search source=* | ... -``` -A subquery expression could be used in boolean expression, for example - -```sql -| where orders.order_id in [ source=returns | where return_reason="damaged" | field order_id ] -``` - -The `orders.order_id in [ source=... ]` is a ``. - -In general, we name this kind of subquery clause the `InSubquery` expression, it is a ``. - -**Subquery with Different Join Types** +#### RelationSubquery -In issue description is a `ScalarSubquery`: - -```sql -source=employees -| join source=sales on employees.employee_id = sales.employee_id -| where sales.sale_amount > [ source=targets | where target_met="true" | fields target_value ] +RelationSubquery is plan instead of expression, for example ``` - -But `RelationSubquery` is not a subquery expression, it is a subquery plan. -[Recall the join command doc](ppl-join-command.md), the example is a subquery/subsearch **plan**, rather than a **expression**. - -```sql -SEARCH source=customer +source=customer | FIELDS c_custkey -| LEFT OUTER JOIN left = c, right = o ON c.c_custkey = o.o_custkey +| LEFT OUTER JOIN left = c right = o ON c.c_custkey = o.o_custkey [ SEARCH source=orders | WHERE o_comment NOT LIKE '%unusual%packages%' @@ -351,7 +181,7 @@ SEARCH source=customer | STATS ... ``` simply into -```sql +``` SEARCH | LEFT OUTER JOIN ON [ @@ -359,21 +189,14 @@ SEARCH ] | STATS ... ``` -Apply the syntax here and simply into - -```sql -search | left join on [ search ... 
] -``` - -The `[ search ...]` is not a `expression`, it's `plan`, similar to the `relation` plan -**Uncorrelated Subquery** +#### Uncorrelated Subquery An uncorrelated subquery is independent of the outer query. It is executed once, and the result is used by the outer query. It's **less common** when using `ExistsSubquery` because `ExistsSubquery` typically checks for the presence of rows that are dependent on the outer query’s row. There is a very special exists subquery which highlight by `(special uncorrelated exists)`: -```sql +``` SELECT 'nonEmpty' FROM outer WHERE EXISTS ( @@ -382,7 +205,7 @@ FROM outer ); ``` Rewritten by PPL ExistsSubquery query: -```sql +``` source = outer | where exists [ source = inner @@ -392,11 +215,11 @@ source = outer ``` This query just print "nonEmpty" if the inner table is not empty. -**Table alias in subquery** +#### Table alias in subquery Table alias is useful in query which contains a subquery, for example -```sql +``` select a, ( select sum(b) from catalog.schema.table1 as t1 diff --git a/docs/ppl-lang/ppl-top-command.md b/docs/ppl-lang/ppl-top-command.md index 93d3a7148..012457fe2 100644 --- a/docs/ppl-lang/ppl-top-command.md +++ b/docs/ppl-lang/ppl-top-command.md @@ -1,6 +1,6 @@ ## PPL top Command -**Description** +### Description Using ``top`` command to find the most common tuple of values of all fields in the field list. diff --git a/docs/ppl-lang/ppl-trendline-command.md b/docs/ppl-lang/ppl-trendline-command.md index b466e2e8f..44b8c999f 100644 --- a/docs/ppl-lang/ppl-trendline-command.md +++ b/docs/ppl-lang/ppl-trendline-command.md @@ -1,6 +1,6 @@ ## PPL trendline Command -**Description** +### Description Using ``trendline`` command to calculate moving averages of fields. 
### Syntax - SMA (Simple Moving Average) diff --git a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.tokens b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.tokens new file mode 100644 index 000000000..5f976453e --- /dev/null +++ b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.tokens @@ -0,0 +1,798 @@ +SEARCH=1 +DESCRIBE=2 +SHOW=3 +FROM=4 +WHERE=5 +FIELDS=6 +RENAME=7 +STATS=8 +EVENTSTATS=9 +DEDUP=10 +SORT=11 +EVAL=12 +HEAD=13 +TOP_APPROX=14 +TOP=15 +RARE_APPROX=16 +RARE=17 +PARSE=18 +METHOD=19 +REGEX=20 +PUNCT=21 +GROK=22 +PATTERN=23 +PATTERNS=24 +NEW_FIELD=25 +KMEANS=26 +AD=27 +ML=28 +FILLNULL=29 +EXPAND=30 +FLATTEN=31 +TRENDLINE=32 +JOIN=33 +ON=34 +INNER=35 +OUTER=36 +FULL=37 +SEMI=38 +ANTI=39 +CROSS=40 +LEFT_HINT=41 +RIGHT_HINT=42 +CORRELATE=43 +SELF=44 +EXACT=45 +APPROXIMATE=46 +SCOPE=47 +MAPPING=48 +EXPLAIN=49 +FORMATTED=50 +COST=51 +CODEGEN=52 +EXTENDED=53 +SIMPLE=54 +AS=55 +BY=56 +SOURCE=57 +INDEX=58 +D=59 +DESC=60 +DATASOURCES=61 +USING=62 +WITH=63 +AUTO=64 +STR=65 +IP=66 +NUM=67 +FIELDSUMMARY=68 +INCLUDEFIELDS=69 +NULLS=70 +SMA=71 +WMA=72 +KEEPEMPTY=73 +CONSECUTIVE=74 +DEDUP_SPLITVALUES=75 +PARTITIONS=76 +ALLNUM=77 +DELIM=78 +CENTROIDS=79 +ITERATIONS=80 +DISTANCE_TYPE=81 +NUMBER_OF_TREES=82 +SHINGLE_SIZE=83 +SAMPLE_SIZE=84 +OUTPUT_AFTER=85 +TIME_DECAY=86 +ANOMALY_RATE=87 +CATEGORY_FIELD=88 +TIME_FIELD=89 +TIME_ZONE=90 +TRAINING_DATA_SIZE=91 +ANOMALY_SCORE_THRESHOLD=92 +APPEND=93 +CASE=94 +ELSE=95 +IN=96 +EXISTS=97 +NOT=98 +OR=99 +AND=100 +XOR=101 +TRUE=102 +FALSE=103 +REGEXP=104 +CONVERT_TZ=105 +DATETIME=106 +DAY=107 +DAY_HOUR=108 +DAY_MICROSECOND=109 +DAY_MINUTE=110 +DAY_OF_YEAR=111 +DAY_SECOND=112 +HOUR=113 +HOUR_MICROSECOND=114 +HOUR_MINUTE=115 +HOUR_OF_DAY=116 +HOUR_SECOND=117 +INTERVAL=118 +MICROSECOND=119 +MILLISECOND=120 +MINUTE=121 +MINUTE_MICROSECOND=122 +MINUTE_OF_DAY=123 +MINUTE_OF_HOUR=124 +MINUTE_SECOND=125 +MONTH=126 +MONTH_OF_YEAR=127 +QUARTER=128 +SECOND=129 +SECOND_MICROSECOND=130 +SECOND_OF_MINUTE=131 
+WEEK=132 +WEEK_OF_YEAR=133 +YEAR=134 +YEAR_MONTH=135 +DATAMODEL=136 +LOOKUP=137 +SAVEDSEARCH=138 +INT=139 +INTEGER=140 +DOUBLE=141 +LONG=142 +FLOAT=143 +STRING=144 +BOOLEAN=145 +PIPE=146 +COMMA=147 +DOT=148 +EQUAL=149 +GREATER=150 +LESS=151 +NOT_GREATER=152 +NOT_LESS=153 +NOT_EQUAL=154 +PLUS=155 +MINUS=156 +STAR=157 +DIVIDE=158 +MODULE=159 +EXCLAMATION_SYMBOL=160 +COLON=161 +LT_PRTHS=162 +RT_PRTHS=163 +LT_SQR_PRTHS=164 +RT_SQR_PRTHS=165 +SINGLE_QUOTE=166 +DOUBLE_QUOTE=167 +BACKTICK=168 +ARROW=169 +BIT_NOT_OP=170 +BIT_AND_OP=171 +BIT_XOR_OP=172 +AVG=173 +COUNT=174 +DISTINCT_COUNT=175 +DISTINCT_COUNT_APPROX=176 +ESTDC=177 +ESTDC_ERROR=178 +MAX=179 +MEAN=180 +MEDIAN=181 +MIN=182 +MODE=183 +RANGE=184 +STDEV=185 +STDEVP=186 +SUM=187 +SUMSQ=188 +VAR_SAMP=189 +VAR_POP=190 +STDDEV_SAMP=191 +STDDEV_POP=192 +PERCENTILE=193 +PERCENTILE_APPROX=194 +TAKE=195 +FIRST=196 +LAST=197 +LIST=198 +VALUES=199 +EARLIEST=200 +EARLIEST_TIME=201 +LATEST=202 +LATEST_TIME=203 +PER_DAY=204 +PER_HOUR=205 +PER_MINUTE=206 +PER_SECOND=207 +RATE=208 +SPARKLINE=209 +C=210 +DC=211 +ABS=212 +CBRT=213 +CEIL=214 +CEILING=215 +CONV=216 +CRC32=217 +E=218 +EXP=219 +FLOOR=220 +LN=221 +LOG=222 +LOG10=223 +LOG2=224 +MOD=225 +PI=226 +POSITION=227 +POW=228 +POWER=229 +RAND=230 +ROUND=231 +SIGN=232 +SIGNUM=233 +SQRT=234 +TRUNCATE=235 +ACOS=236 +ASIN=237 +ATAN=238 +ATAN2=239 +COS=240 +COT=241 +DEGREES=242 +RADIANS=243 +SIN=244 +TAN=245 +MD5=246 +SHA1=247 +SHA2=248 +ADDDATE=249 +ADDTIME=250 +CURDATE=251 +CURRENT_DATE=252 +CURRENT_TIME=253 +CURRENT_TIMESTAMP=254 +CURRENT_TIMEZONE=255 +CURTIME=256 +DATE=257 +DATEDIFF=258 +DATE_ADD=259 +DATE_FORMAT=260 +DATE_SUB=261 +DAYNAME=262 +DAYOFMONTH=263 +DAYOFWEEK=264 +DAYOFYEAR=265 +DAY_OF_MONTH=266 +DAY_OF_WEEK=267 +DURATION=268 +EXTRACT=269 +FROM_DAYS=270 +FROM_UNIXTIME=271 +GET_FORMAT=272 +LAST_DAY=273 +LOCALTIME=274 +LOCALTIMESTAMP=275 +MAKEDATE=276 +MAKE_DATE=277 +MAKETIME=278 +MONTHNAME=279 +NOW=280 +PERIOD_ADD=281 +PERIOD_DIFF=282 +SEC_TO_TIME=283 +STR_TO_DATE=284 
+SUBDATE=285 +SUBTIME=286 +SYSDATE=287 +TIME=288 +TIMEDIFF=289 +TIMESTAMP=290 +TIMESTAMPADD=291 +TIMESTAMPDIFF=292 +TIME_FORMAT=293 +TIME_TO_SEC=294 +TO_DAYS=295 +TO_SECONDS=296 +UNIX_TIMESTAMP=297 +UTC_DATE=298 +UTC_TIME=299 +UTC_TIMESTAMP=300 +WEEKDAY=301 +YEARWEEK=302 +SUBSTR=303 +SUBSTRING=304 +LTRIM=305 +RTRIM=306 +TRIM=307 +TO=308 +LOWER=309 +UPPER=310 +CONCAT=311 +CONCAT_WS=312 +LENGTH=313 +STRCMP=314 +RIGHT=315 +LEFT=316 +ASCII=317 +LOCATE=318 +REPLACE=319 +REVERSE=320 +CAST=321 +ISEMPTY=322 +ISBLANK=323 +JSON=324 +JSON_OBJECT=325 +JSON_ARRAY=326 +JSON_ARRAY_LENGTH=327 +TO_JSON_STRING=328 +JSON_EXTRACT=329 +JSON_KEYS=330 +JSON_VALID=331 +ARRAY=332 +ARRAY_LENGTH=333 +FORALL=334 +FILTER=335 +TRANSFORM=336 +REDUCE=337 +LIKE=338 +ISNULL=339 +ISNOTNULL=340 +ISPRESENT=341 +BETWEEN=342 +CIDRMATCH=343 +GEOIP=344 +IFNULL=345 +NULLIF=346 +IF=347 +TYPEOF=348 +COALESCE=349 +MATCH=350 +MATCH_PHRASE=351 +MATCH_PHRASE_PREFIX=352 +MATCH_BOOL_PREFIX=353 +SIMPLE_QUERY_STRING=354 +MULTI_MATCH=355 +QUERY_STRING=356 +ALLOW_LEADING_WILDCARD=357 +ANALYZE_WILDCARD=358 +ANALYZER=359 +AUTO_GENERATE_SYNONYMS_PHRASE_QUERY=360 +BOOST=361 +CUTOFF_FREQUENCY=362 +DEFAULT_FIELD=363 +DEFAULT_OPERATOR=364 +ENABLE_POSITION_INCREMENTS=365 +ESCAPE=366 +FLAGS=367 +FUZZY_MAX_EXPANSIONS=368 +FUZZY_PREFIX_LENGTH=369 +FUZZY_TRANSPOSITIONS=370 +FUZZY_REWRITE=371 +FUZZINESS=372 +LENIENT=373 +LOW_FREQ_OPERATOR=374 +MAX_DETERMINIZED_STATES=375 +MAX_EXPANSIONS=376 +MINIMUM_SHOULD_MATCH=377 +OPERATOR=378 +PHRASE_SLOP=379 +PREFIX_LENGTH=380 +QUOTE_ANALYZER=381 +QUOTE_FIELD_SUFFIX=382 +REWRITE=383 +SLOP=384 +TIE_BREAKER=385 +TYPE=386 +ZERO_TERMS_QUERY=387 +SPAN=388 +MS=389 +S=390 +M=391 +H=392 +W=393 +Q=394 +Y=395 +ID=396 +CLUSTER=397 +INTEGER_LITERAL=398 +DECIMAL_LITERAL=399 +ID_DATE_SUFFIX=400 +DQUOTA_STRING=401 +SQUOTA_STRING=402 +BQUOTA_STRING=403 +LINE_COMMENT=404 +BLOCK_COMMENT=405 +ERROR_RECOGNITION=406 +'SEARCH'=1 +'DESCRIBE'=2 +'SHOW'=3 +'FROM'=4 +'WHERE'=5 +'FIELDS'=6 +'RENAME'=7 +'STATS'=8 
+'EVENTSTATS'=9 +'DEDUP'=10 +'SORT'=11 +'EVAL'=12 +'HEAD'=13 +'TOP_APPROX'=14 +'TOP'=15 +'RARE_APPROX'=16 +'RARE'=17 +'PARSE'=18 +'METHOD'=19 +'REGEX'=20 +'PUNCT'=21 +'GROK'=22 +'PATTERN'=23 +'PATTERNS'=24 +'NEW_FIELD'=25 +'KMEANS'=26 +'AD'=27 +'ML'=28 +'FILLNULL'=29 +'EXPAND'=30 +'FLATTEN'=31 +'TRENDLINE'=32 +'JOIN'=33 +'ON'=34 +'INNER'=35 +'OUTER'=36 +'FULL'=37 +'SEMI'=38 +'ANTI'=39 +'CROSS'=40 +'HINT.LEFT'=41 +'HINT.RIGHT'=42 +'CORRELATE'=43 +'SELF'=44 +'EXACT'=45 +'APPROXIMATE'=46 +'SCOPE'=47 +'MAPPING'=48 +'EXPLAIN'=49 +'FORMATTED'=50 +'COST'=51 +'CODEGEN'=52 +'EXTENDED'=53 +'SIMPLE'=54 +'AS'=55 +'BY'=56 +'SOURCE'=57 +'INDEX'=58 +'D'=59 +'DESC'=60 +'DATASOURCES'=61 +'USING'=62 +'WITH'=63 +'AUTO'=64 +'STR'=65 +'IP'=66 +'NUM'=67 +'FIELDSUMMARY'=68 +'INCLUDEFIELDS'=69 +'NULLS'=70 +'SMA'=71 +'WMA'=72 +'KEEPEMPTY'=73 +'CONSECUTIVE'=74 +'DEDUP_SPLITVALUES'=75 +'PARTITIONS'=76 +'ALLNUM'=77 +'DELIM'=78 +'CENTROIDS'=79 +'ITERATIONS'=80 +'DISTANCE_TYPE'=81 +'NUMBER_OF_TREES'=82 +'SHINGLE_SIZE'=83 +'SAMPLE_SIZE'=84 +'OUTPUT_AFTER'=85 +'TIME_DECAY'=86 +'ANOMALY_RATE'=87 +'CATEGORY_FIELD'=88 +'TIME_FIELD'=89 +'TIME_ZONE'=90 +'TRAINING_DATA_SIZE'=91 +'ANOMALY_SCORE_THRESHOLD'=92 +'APPEND'=93 +'CASE'=94 +'ELSE'=95 +'IN'=96 +'EXISTS'=97 +'NOT'=98 +'OR'=99 +'AND'=100 +'XOR'=101 +'TRUE'=102 +'FALSE'=103 +'REGEXP'=104 +'CONVERT_TZ'=105 +'DATETIME'=106 +'DAY'=107 +'DAY_HOUR'=108 +'DAY_MICROSECOND'=109 +'DAY_MINUTE'=110 +'DAY_OF_YEAR'=111 +'DAY_SECOND'=112 +'HOUR'=113 +'HOUR_MICROSECOND'=114 +'HOUR_MINUTE'=115 +'HOUR_OF_DAY'=116 +'HOUR_SECOND'=117 +'INTERVAL'=118 +'MICROSECOND'=119 +'MILLISECOND'=120 +'MINUTE'=121 +'MINUTE_MICROSECOND'=122 +'MINUTE_OF_DAY'=123 +'MINUTE_OF_HOUR'=124 +'MINUTE_SECOND'=125 +'MONTH'=126 +'MONTH_OF_YEAR'=127 +'QUARTER'=128 +'SECOND'=129 +'SECOND_MICROSECOND'=130 +'SECOND_OF_MINUTE'=131 +'WEEK'=132 +'WEEK_OF_YEAR'=133 +'YEAR'=134 +'YEAR_MONTH'=135 +'DATAMODEL'=136 +'LOOKUP'=137 +'SAVEDSEARCH'=138 +'INT'=139 +'INTEGER'=140 +'DOUBLE'=141 +'LONG'=142 
+'FLOAT'=143 +'STRING'=144 +'BOOLEAN'=145 +'|'=146 +','=147 +'.'=148 +'='=149 +'>'=150 +'<'=151 +'+'=155 +'-'=156 +'*'=157 +'/'=158 +'%'=159 +'!'=160 +':'=161 +'('=162 +')'=163 +'['=164 +']'=165 +'\''=166 +'"'=167 +'`'=168 +'->'=169 +'~'=170 +'&'=171 +'^'=172 +'AVG'=173 +'COUNT'=174 +'DISTINCT_COUNT'=175 +'DISTINCT_COUNT_APPROX'=176 +'ESTDC'=177 +'ESTDC_ERROR'=178 +'MAX'=179 +'MEAN'=180 +'MEDIAN'=181 +'MIN'=182 +'MODE'=183 +'RANGE'=184 +'STDEV'=185 +'STDEVP'=186 +'SUM'=187 +'SUMSQ'=188 +'VAR_SAMP'=189 +'VAR_POP'=190 +'STDDEV_SAMP'=191 +'STDDEV_POP'=192 +'PERCENTILE'=193 +'PERCENTILE_APPROX'=194 +'TAKE'=195 +'FIRST'=196 +'LAST'=197 +'LIST'=198 +'VALUES'=199 +'EARLIEST'=200 +'EARLIEST_TIME'=201 +'LATEST'=202 +'LATEST_TIME'=203 +'PER_DAY'=204 +'PER_HOUR'=205 +'PER_MINUTE'=206 +'PER_SECOND'=207 +'RATE'=208 +'SPARKLINE'=209 +'C'=210 +'DC'=211 +'ABS'=212 +'CBRT'=213 +'CEIL'=214 +'CEILING'=215 +'CONV'=216 +'CRC32'=217 +'E'=218 +'EXP'=219 +'FLOOR'=220 +'LN'=221 +'LOG'=222 +'LOG10'=223 +'LOG2'=224 +'MOD'=225 +'PI'=226 +'POSITION'=227 +'POW'=228 +'POWER'=229 +'RAND'=230 +'ROUND'=231 +'SIGN'=232 +'SIGNUM'=233 +'SQRT'=234 +'TRUNCATE'=235 +'ACOS'=236 +'ASIN'=237 +'ATAN'=238 +'ATAN2'=239 +'COS'=240 +'COT'=241 +'DEGREES'=242 +'RADIANS'=243 +'SIN'=244 +'TAN'=245 +'MD5'=246 +'SHA1'=247 +'SHA2'=248 +'ADDDATE'=249 +'ADDTIME'=250 +'CURDATE'=251 +'CURRENT_DATE'=252 +'CURRENT_TIME'=253 +'CURRENT_TIMESTAMP'=254 +'CURRENT_TIMEZONE'=255 +'CURTIME'=256 +'DATE'=257 +'DATEDIFF'=258 +'DATE_ADD'=259 +'DATE_FORMAT'=260 +'DATE_SUB'=261 +'DAYNAME'=262 +'DAYOFMONTH'=263 +'DAYOFWEEK'=264 +'DAYOFYEAR'=265 +'DAY_OF_MONTH'=266 +'DAY_OF_WEEK'=267 +'DURATION'=268 +'EXTRACT'=269 +'FROM_DAYS'=270 +'FROM_UNIXTIME'=271 +'GET_FORMAT'=272 +'LAST_DAY'=273 +'LOCALTIME'=274 +'LOCALTIMESTAMP'=275 +'MAKEDATE'=276 +'MAKE_DATE'=277 +'MAKETIME'=278 +'MONTHNAME'=279 +'NOW'=280 +'PERIOD_ADD'=281 +'PERIOD_DIFF'=282 +'SEC_TO_TIME'=283 +'STR_TO_DATE'=284 +'SUBDATE'=285 +'SUBTIME'=286 +'SYSDATE'=287 +'TIME'=288 
+'TIMEDIFF'=289 +'TIMESTAMP'=290 +'TIMESTAMPADD'=291 +'TIMESTAMPDIFF'=292 +'TIME_FORMAT'=293 +'TIME_TO_SEC'=294 +'TO_DAYS'=295 +'TO_SECONDS'=296 +'UNIX_TIMESTAMP'=297 +'UTC_DATE'=298 +'UTC_TIME'=299 +'UTC_TIMESTAMP'=300 +'WEEKDAY'=301 +'YEARWEEK'=302 +'SUBSTR'=303 +'SUBSTRING'=304 +'LTRIM'=305 +'RTRIM'=306 +'TRIM'=307 +'TO'=308 +'LOWER'=309 +'UPPER'=310 +'CONCAT'=311 +'CONCAT_WS'=312 +'LENGTH'=313 +'STRCMP'=314 +'RIGHT'=315 +'LEFT'=316 +'ASCII'=317 +'LOCATE'=318 +'REPLACE'=319 +'REVERSE'=320 +'CAST'=321 +'ISEMPTY'=322 +'ISBLANK'=323 +'JSON'=324 +'JSON_OBJECT'=325 +'JSON_ARRAY'=326 +'JSON_ARRAY_LENGTH'=327 +'TO_JSON_STRING'=328 +'JSON_EXTRACT'=329 +'JSON_KEYS'=330 +'JSON_VALID'=331 +'ARRAY'=332 +'ARRAY_LENGTH'=333 +'FORALL'=334 +'FILTER'=335 +'TRANSFORM'=336 +'REDUCE'=337 +'LIKE'=338 +'ISNULL'=339 +'ISNOTNULL'=340 +'ISPRESENT'=341 +'BETWEEN'=342 +'CIDRMATCH'=343 +'GEOIP'=344 +'IFNULL'=345 +'NULLIF'=346 +'IF'=347 +'TYPEOF'=348 +'COALESCE'=349 +'MATCH'=350 +'MATCH_PHRASE'=351 +'MATCH_PHRASE_PREFIX'=352 +'MATCH_BOOL_PREFIX'=353 +'SIMPLE_QUERY_STRING'=354 +'MULTI_MATCH'=355 +'QUERY_STRING'=356 +'ALLOW_LEADING_WILDCARD'=357 +'ANALYZE_WILDCARD'=358 +'ANALYZER'=359 +'AUTO_GENERATE_SYNONYMS_PHRASE_QUERY'=360 +'BOOST'=361 +'CUTOFF_FREQUENCY'=362 +'DEFAULT_FIELD'=363 +'DEFAULT_OPERATOR'=364 +'ENABLE_POSITION_INCREMENTS'=365 +'ESCAPE'=366 +'FLAGS'=367 +'FUZZY_MAX_EXPANSIONS'=368 +'FUZZY_PREFIX_LENGTH'=369 +'FUZZY_TRANSPOSITIONS'=370 +'FUZZY_REWRITE'=371 +'FUZZINESS'=372 +'LENIENT'=373 +'LOW_FREQ_OPERATOR'=374 +'MAX_DETERMINIZED_STATES'=375 +'MAX_EXPANSIONS'=376 +'MINIMUM_SHOULD_MATCH'=377 +'OPERATOR'=378 +'PHRASE_SLOP'=379 +'PREFIX_LENGTH'=380 +'QUOTE_ANALYZER'=381 +'QUOTE_FIELD_SUFFIX'=382 +'REWRITE'=383 +'SLOP'=384 +'TIE_BREAKER'=385 +'TYPE'=386 +'ZERO_TERMS_QUERY'=387 +'SPAN'=388 +'MS'=389 +'S'=390 +'M'=391 +'H'=392 +'W'=393 +'Q'=394 +'Y'=395