diff --git a/docker-compose-local.yaml b/docker-compose-local.yaml
index 6bc337e7..7a0e5807 100644
--- a/docker-compose-local.yaml
+++ b/docker-compose-local.yaml
@@ -72,7 +72,7 @@ services:
- KAFKA_ENABLE_KRAFT=false
coordinator:
- image: imply/druid:${DRUID_VERSION:-28.0.1}
+ image: imply/druid:${DRUID_VERSION:-29.0.0}
container_name: coordinator
profiles: ["druid-jupyter", "all-services"]
volumes:
@@ -89,11 +89,12 @@ services:
- environment
broker:
- image: imply/druid:${DRUID_VERSION:-28.0.1}
+ image: imply/druid:${DRUID_VERSION:-29.0.0}
container_name: broker
profiles: ["druid-jupyter", "all-services"]
volumes:
- broker_var:/opt/druid/var
+ - druid_shared:/opt/shared
depends_on:
- zookeeper
- postgres
@@ -106,7 +107,7 @@ services:
- environment
historical:
- image: imply/druid:${DRUID_VERSION:-28.0.1}
+ image: imply/druid:${DRUID_VERSION:-29.0.0}
container_name: historical
profiles: ["druid-jupyter", "all-services"]
volumes:
@@ -124,7 +125,7 @@ services:
- environment
middlemanager:
- image: imply/druid:${DRUID_VERSION:-28.0.1}
+ image: imply/druid:${DRUID_VERSION:-29.0.0}
container_name: middlemanager
profiles: ["druid-jupyter", "all-services"]
volumes:
@@ -143,7 +144,7 @@ services:
- environment
router:
- image: imply/druid:${DRUID_VERSION:-28.0.1}
+ image: imply/druid:${DRUID_VERSION:-29.0.0}
container_name: router
profiles: ["druid-jupyter", "all-services"]
volumes:
diff --git a/docker-compose.yaml b/docker-compose.yaml
index ea3fccf4..a1a1049e 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -72,7 +72,7 @@ services:
- KAFKA_ENABLE_KRAFT=false
coordinator:
- image: imply/druid:${DRUID_VERSION:-28.0.1}
+ image: imply/druid:${DRUID_VERSION:-29.0.0}
container_name: coordinator
profiles: ["druid-jupyter", "all-services"]
volumes:
@@ -89,11 +89,12 @@ services:
- environment
broker:
- image: imply/druid:${DRUID_VERSION:-28.0.1}
+ image: imply/druid:${DRUID_VERSION:-29.0.0}
container_name: broker
profiles: ["druid-jupyter", "all-services"]
volumes:
- broker_var:/opt/druid/var
+ - druid_shared:/opt/shared
depends_on:
- zookeeper
- postgres
@@ -106,7 +107,7 @@ services:
- environment
historical:
- image: imply/druid:${DRUID_VERSION:-28.0.1}
+ image: imply/druid:${DRUID_VERSION:-29.0.0}
container_name: historical
profiles: ["druid-jupyter", "all-services"]
volumes:
@@ -124,7 +125,7 @@ services:
- environment
middlemanager:
- image: imply/druid:${DRUID_VERSION:-28.0.1}
+ image: imply/druid:${DRUID_VERSION:-29.0.0}
container_name: middlemanager
profiles: ["druid-jupyter", "all-services"]
volumes:
@@ -143,7 +144,7 @@ services:
- environment
router:
- image: imply/druid:${DRUID_VERSION:-28.0.1}
+ image: imply/druid:${DRUID_VERSION:-29.0.0}
container_name: router
profiles: ["druid-jupyter", "all-services"]
volumes:
diff --git a/environment b/environment
index dda21228..c98f191b 100644
--- a/environment
+++ b/environment
@@ -55,5 +55,19 @@ druid_indexer_logs_directory=/opt/shared/indexing-logs
druid_processing_numThreads=2
druid_processing_numMergeBuffers=2
+
+# Set up durable storage for asynchronous (sql/statements) query processing
+druid_msq_intermediate_storage_enable=true
+druid_msq_intermediate_storage_type=local
+druid_msq_intermediate_storage_tempDir=/opt/shared/tmp
+druid_msq_intermediate_storage_basePath=/opt/shared/intermediate
+druid_msq_intermediate_storage_cleaner_enabled=true
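+# intermediate results are written under /opt/shared, which the broker now mounts via the druid_shared volume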
+
DRUID_LOG4J=
diff --git a/jupyter-img/druidapi/druidapi/sql.py b/jupyter-img/druidapi/druidapi/sql.py
index b688fdbb..823488d8 100644
--- a/jupyter-img/druidapi/druidapi/sql.py
+++ b/jupyter-img/druidapi/druidapi/sql.py
@@ -809,6 +809,19 @@ def rows(self):
page+=1
return self._rows
+ def paged_rows(self, pageNum):
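+        '''
+        Returns a single page of rows for this asynchronous sql/statements query.
+
+        :param pageNum: the zero-based page of rows to retrieve; pages beyond
+            the available results return an empty list
+        :return: json array with the rows for the requested page
+        '''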
+        import json
+        if self.succeeded:
+            self._rows = []
+            if pageNum < len(self._pages):
+                results = self._query_client().statement_results(self._id, pageNum)
+                try:
+                    self._rows = json.loads(results.text)
+                except Exception as ex:
+                    raise ClientError(f"Could not parse JSON from result [{results}]") from ex
+        return self._rows
+
+
def _display(self, display):
return self._druid().display if not display else display
@@ -838,7 +851,7 @@ def __init__(self, druid, rest_client=None):
def rest_client(self):
return self._rest_client
- def _prepare_query(self, request, asynch=False):
+ def _prepare_query(self, request, asynch=False, rowsPerPage= None ):
if not request:
raise ClientError('No query provided.')
# If the request is a dictionary, assume it is already in SqlQuery form.
@@ -854,6 +867,8 @@ def _prepare_query(self, request, asynch=False):
print(request.sql)
if asynch:
request.add_context( 'executionMode', 'ASYNC')
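+        # optional context parameter that controls the page size of sql/statements results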
+ if rowsPerPage:
+ request.add_context( 'rowsPerPage', rowsPerPage)
if not query_obj:
query_obj = request.to_request()
return (request, query_obj)
@@ -946,7 +961,7 @@ def task(self, query) -> QueryTaskResult:
r = self.rest_client.post_only_json(REQ_SQL_TASK, query_obj, headers=request.headers)
return QueryTaskResult(request, r)
- def statement(self, query) -> AsynchQueryResult:
+ def statement(self, query, rowsPerPage=None) -> AsynchQueryResult:
'''
Submits an MSQ asynch query. Returns a AsynchQueryResult to track the task.
@@ -955,7 +970,7 @@ def statement(self, query) -> AsynchQueryResult:
query
The query as either a string or a SqlRequest object.
'''
- request, query_obj = self._prepare_query(query, asynch=True)
+ request, query_obj = self._prepare_query(query, asynch=True, rowsPerPage=rowsPerPage)
r = self.rest_client.post_only_json(REQ_SQL_ASYNC, query_obj, headers=request.headers)
return AsynchQueryResult(request, r)
@@ -979,7 +994,7 @@ def statement_results(self, id, page=0 ):
:param page: the page of rows to retrieve
:return: json array with rows for the page of results
'''
- response = self.rest_client.get(REQ_SQL_ASYNC+f'/{id}/results', f'{{"page"={page}}}' )
+ response = self.rest_client.get(REQ_SQL_ASYNC+f'/{id}/results', params={"page":page} )
return response
def run_task(self, query):
@@ -996,16 +1011,15 @@ def run_task(self, query):
raise ClientError(resp.error_message)
resp.wait_until_done()
- def async_sql(self, query):
+ def async_sql(self, query, rowsPerPage=None):
'''
Submits an MSQ asynchronous query request using the sql/statements API Returns a
:param query: The SQL query statement
:return: rows
'''
- resp = self.statement(query)
+ resp = self.statement(query, rowsPerPage=rowsPerPage)
if not resp.ok:
raise ClientError(resp.error_message)
- resp.wait_until_done()
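+        # async_sql no longer blocks; the caller calls wait_until_done() on the result and then rows() or paged_rows()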
return resp
def _tables_query(self, schema):
diff --git a/notebooks/00-releases/README.md b/notebooks/00-releases/README.md
deleted file mode 100644
index 35c356f4..00000000
--- a/notebooks/00-releases/README.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# Apache Druid releases
-
-This section contains indexes of new and updated notebooks relating to new releases of Apache Druid.
-
-* [Apache Druid 28.0.0](druid-28.md)
-
diff --git a/notebooks/00-releases/druid-28.md b/notebooks/00-releases/druid-28.md
deleted file mode 100644
index c8cbb9d8..00000000
--- a/notebooks/00-releases/druid-28.md
+++ /dev/null
@@ -1,15 +0,0 @@
-# Apache Druid 28.0.0
-
-Released in November 2023, Druid 28 includes a number standards compatibiility improvements in the [SQL dialect](https://druid.apache.org/docs/latest/querying/sql), to [Apache Kafka ingestion](https://druid.apache.org/docs/latest/development/extensions-core/kafka-ingestion), and opens the door to using data direct from [Deep Storage](https://druid.apache.org/docs/latest/design/deep-storage) through the MSQ engine.
-
-Check out these notebooks, which were introduced or improved following the Druid 28 release.
-
-* [SQL-compatible NULL](../02-ingestion/09-generating-and-working-with-nulls.ipynb)
-* [Multi-topic Kafka ingestion](../02-ingestion/11-stream-from-multiple-topics.ipynb)
-* [ARRAYS and UNNEST](../02-ingestion/08-table-datatypes-arrays.ipynb)
-* [Query from Deep Storage](../03-query/14-full-timeline-queries.ipynb)
-* [LOOKUP overload](../03-query/06-lookup-tables.ipynb)
-
-These notebooks were added in support of features that are experimental. Remember to drop [into the community](https://druid.apache.org/community) to give your thoughts, findings, and feedback.
-
-* [Window Functions (Experimental)](../03-query/13-query-functions-window.ipynb)
\ No newline at end of file
diff --git a/notebooks/02-ingestion/02-batch-ingestion.ipynb b/notebooks/02-ingestion/02-batch-ingestion.ipynb
index f6e53ce8..26a8776c 100644
--- a/notebooks/02-ingestion/02-batch-ingestion.ipynb
+++ b/notebooks/02-ingestion/02-batch-ingestion.ipynb
@@ -46,7 +46,7 @@
"source": [
"## Prerequisites\n",
"\n",
- "This tutorial works with Druid 26.0.0 or later.\n",
+ "This tutorial works with Druid 29.0.0 or later.\n",
"\n",
"Launch this tutorial and all prerequisites using the `druid-jupyter` profile of the Docker Compose file for Jupyter-based Druid tutorials. For more information, see [Learn Druid Project page](https://github.com/implydata/learn-druid/#readme).\n"
]
@@ -89,7 +89,7 @@
"source": [
"## SQL based ingestion\n",
"\n",
- "Run the following cell to load data from an external file into the \"wikipedia_events\" table."
+ "Run the following cell to load data from an external file into the \"example-wikipedia-batch\" table."
]
},
{
@@ -100,7 +100,7 @@
"outputs": [],
"source": [
"sql = '''\n",
- "REPLACE INTO \"wikipedia-events-batch\" OVERWRITE ALL\n",
+ "REPLACE INTO \"example-wikipedia-batch\" OVERWRITE ALL\n",
"WITH \"ext\" AS \n",
"(\n",
" SELECT *\n",
@@ -117,7 +117,7 @@
"PARTITIONED BY DAY\n",
"'''\n",
"druid.display.run_task(sql)\n",
- "druid.sql.wait_until_ready('wikipedia-events-batch')"
+ "druid.sql.wait_until_ready('example-wikipedia-batch')"
]
},
{
@@ -125,13 +125,13 @@
"id": "1776cd84-9f38-40a3-9e47-5e821d15c901",
"metadata": {},
"source": [
- "`REPLACE` or `INSERT` at the beginning of the statement tells Druid to execute an ingestion task. `INSERT` is used when appending data, `REPLACE` when replacing data. Both methods work when adding data to a new or empty Druid datasource. The `OVERWRITE ALL` clause means that the whole datasource will be replaced with the result of this ingestion. \n",
+ "REPLACE or INSERT at the beginning of the statement tells Druid to execute an ingestion task. INSERT is used when appending data, REPLACE when replacing data. Both methods work when adding data to a new or empty Druid datasource. The OVERWRITE ALL clause means that the whole datasource will be replaced with the result of this ingestion. \n",
"\n",
"```\n",
- "REPLACE INTO \"wikipedia-events-batch\" OVERWRITE ALL\n",
+ "REPLACE INTO \"example-wikipedia-batch\" OVERWRITE ALL\n",
"```\n",
"\n",
- "The `WITH` clause is used to declare one or more input sources, this could also be placed directly in the `FROM` clause of the final `SELECT`, but this is easier to read:\n",
+    "The WITH clause is used to declare one or more input sources. This could also be placed directly in the FROM clause of the final SELECT, but this form is easier to read:\n",
"\n",
"```\n",
"WITH \"ext\" AS \n",
@@ -141,7 +141,7 @@
") EXTEND (...)\n",
"```\n",
"\n",
- "`EXTERN` supports many batch [input sources](https://druid.apache.org/docs/latest/ingestion/native-batch-input-sources.html) and [formats](https://druid.apache.org/docs/latest/ingestion/data-formats.html). In this case the SQL statement uses input source type `http` to access a set or `uris` that each contain a data file in the `json` data format. Note that compressed files are allowed and will automatically be decompressed.\n",
+    "EXTERN supports many batch [input sources](https://druid.apache.org/docs/latest/ingestion/native-batch-input-sources.html) and [formats](https://druid.apache.org/docs/latest/ingestion/data-formats.html). In this case the SQL statement uses input source type \"http\" to access a set of \"uris\" that each contain a data file in the \"json\" data format. Note that compressed files are allowed and will automatically be decompressed.\n",
"```\n",
"FROM TABLE(\n",
" EXTERN(\n",
@@ -149,12 +149,12 @@
" '{\"type\":\"json\"}'\n",
" )\n",
"```\n",
- "The [`EXTEND` clause describes the input schema](https://druid.apache.org/docs/latest/multi-stage-query/reference#extern-function) using SQL data types:\n",
+ "The [EXTEND clause describes the input schema](https://druid.apache.org/docs/latest/multi-stage-query/reference#extern-function) using SQL data types:\n",
"```\n",
"EXTEND (\"isRobot\" VARCHAR, \"channel\" VARCHAR, \"timestamp\" VARCHAR, ...)\n",
"```\n",
"\n",
- "The final SELECT statement defines the transformations and schema of the resulting Druid table. A `__time` column is usually parsed from the source, this expression will be mapped to Druid's primary time partitioning of segments. In this case we specified the `__time` column and ingested the rest of the columns \"AS IS\" using `*`.\n",
+    "The final SELECT statement defines the transformations and schema of the resulting Druid table. A \"__time\" column is usually parsed from the source; this expression is mapped to Druid's primary time partitioning of segments. In this case we specified the \"__time\" column and ingested the rest of the columns as defined in the EXTEND clause using \"*\".\n",
"\n",
"```\n",
"SELECT\n",
@@ -163,7 +163,7 @@
"FROM \"ext\"\n",
"```\n",
"\n",
- "The final portion of this ingestion is the `PARTITIONED BY DAY` clause which tells Driud to create a separate set of segments for each day. A `PARTITION BY` clause must be included in all `INSERT`/`REPLACE` statements.\n",
+    "The final portion of this ingestion is the PARTITIONED BY DAY clause, which tells Druid to create a separate set of segments for each day. A PARTITIONED BY clause must be included in all INSERT/REPLACE statements.\n",
"\n",
"##### Wait for Segment Availibility:\n",
"The `sql_wait_until_ready` function is used to pause until all the ingested data is available in the Historical cacheing layer before executing any queries."
@@ -187,7 +187,7 @@
"source": [
"sql = \"\"\"\n",
"SELECT channel, count(*) num_events\n",
- "FROM \"wikipedia-events-batch\" \n",
+ "FROM \"example-wikipedia-batch\" \n",
"WHERE __time BETWEEN '2016-06-27' AND '2016-06-28'\n",
"GROUP BY 1 \n",
"ORDER BY 2 DESC \n",
@@ -222,7 +222,7 @@
"outputs": [],
"source": [
"sql = '''\n",
- "REPLACE INTO \"wikipedia-events-3-batch\" OVERWRITE ALL\n",
+ "REPLACE INTO \"example-wikipedia-3-batch\" OVERWRITE ALL\n",
"WITH \"ext\" AS \n",
"(\n",
" SELECT *\n",
@@ -244,7 +244,7 @@
"PARTITIONED BY DAY\n",
"'''\n",
"druid.display.run_task(sql)\n",
- "druid.sql.wait_until_ready('wikipedia-events-3-batch')\n"
+ "druid.sql.wait_until_ready('example-wikipedia-3-batch')\n"
]
},
{
@@ -264,7 +264,7 @@
"source": [
"druid.display.sql(\"\"\"\n",
"SELECT channel, count(*) num_events\n",
- "FROM \"wikipedia-events-3-batch\" \n",
+ "FROM \"example-wikipedia-3-batch\" \n",
"WHERE __time BETWEEN '2016-06-27' AND '2016-06-28'\n",
"GROUP BY 1 \n",
"ORDER BY 2 DESC \n",
@@ -287,7 +287,7 @@
"\n",
"After the initial input stage, the level of parallelism of the job will remain consistent and is controlled by the [context parameter](https://druid.apache.org/docs/latest/multi-stage-query/reference.html#context-parameters) `maxNumTasks`\n",
"\n",
- "If you are running Druid on your laptop, the default configuration only provides 4 worker slots on the Middle Manager, so you can only run with `maxNumTasks=4` resulting in one controller and one worker. If you are using this notebook against a larger Druid cluster, feel free to experiment with higher values. Note that if `maxNumTasks` exceeds the available worker slots, the job will fail with a time out error because it waits for all the worker tasks to be active.\n",
+ "If you are running Druid on your laptop, the default configuration only provides 4 worker slots on the Middle Manager, so you can only run with `maxNumTasks=4` resulting in one controller and one worker. If you are using this notebook against a larger Druid cluster, feel free to experiment with higher values. Note that, if `maxNumTasks` exceeds the available worker slots, the job will fail with a timeout error because it waits for all the worker tasks to be active.\n",
"\n",
"##### rowsPerSegment\n",
"`rowsPerSegment` defaults to 3,000,000. You can adjust it to produce larger or smaller segments. \n",
@@ -303,7 +303,7 @@
"outputs": [],
"source": [
"sql = '''\n",
- "REPLACE INTO \"wikipedia-events-4-batch\" OVERWRITE ALL\n",
+ "REPLACE INTO \"example-wikipedia-4-batch\" OVERWRITE ALL\n",
"WITH \"ext\" AS \n",
"(\n",
" SELECT *\n",
@@ -329,7 +329,7 @@
"request.add_context( 'maxNumTasks', 4) # can't go any higher in learning environment\n",
"\n",
"druid.display.run_task(request)\n",
- "druid.sql.wait_until_ready('wikipedia-events-4-batch')"
+ "druid.sql.wait_until_ready('example-wikipedia-4-batch')"
]
},
{
@@ -337,7 +337,7 @@
"id": "ee87e942-000a-49d3-b915-4319519c4ed1",
"metadata": {},
"source": [
- "With a `rowsPerSegment` of only 20,000, the same ingestion as before produces more segments. Open the [Druid console in the Data Sources view](http://localhost:8888/unified-console.html#datasources) to see the difference in segments between `wikipedia-events-3-batch` and `wikipedia-events-4-batch`.\n",
+ "With a `rowsPerSegment` of only 20,000, the same ingestion as before produces more segments. Open the [Druid console in the Data Sources view](http://localhost:8888/unified-console.html#datasources) to see the difference in segments between \"example-wikipedia-3-batch\" and \"example-wikipedia-4-batch\".\n",
"\n",
"Note that 20,000 is a very low value used to illustrate setting parameters. Normally this value is in the millions."
]
@@ -349,7 +349,7 @@
"source": [
"## Filter data during ingestion\n",
"\n",
- "In situations where you need data cleansing or your only interested in a subset of the data, the ingestion job can filter the data by simply adding a `WHERE` clause.\n",
+    "In situations where you need data cleansing or you're only interested in a subset of the data, the ingestion job can filter the data by simply adding a WHERE clause.\n",
"\n",
"The example excludes all robotic wikipedia updates:"
]
@@ -362,7 +362,7 @@
"outputs": [],
"source": [
"sql = '''\n",
- "REPLACE INTO \"wikipedia-events-only-human\" OVERWRITE ALL\n",
+ "REPLACE INTO \"example-wikipedia-only-human\" OVERWRITE ALL\n",
"WITH \"ext\" AS \n",
"(\n",
" SELECT *\n",
@@ -385,7 +385,7 @@
"'''\n",
"\n",
"druid.display.run_task(sql)\n",
- "druid.sql.wait_until_ready('wikipedia-events-only-human')"
+ "druid.sql.wait_until_ready('example-wikipedia-only-human')"
]
},
{
@@ -397,7 +397,7 @@
"source": [
"druid.display.sql(\"\"\"\n",
"SELECT isRobot, channel, count(*) num_events\n",
- "FROM \"wikipedia-events-only-human\" \n",
+ "FROM \"example-wikipedia-only-human\" \n",
"GROUP BY 1,2 \n",
"ORDER BY 3 DESC \n",
"LIMIT 10\n",
@@ -470,7 +470,7 @@
"outputs": [],
"source": [
"sql = '''\n",
- "REPLACE INTO \"kttm-transformation-batch\" OVERWRITE ALL\n",
+ "REPLACE INTO \"example-kttm-transform-batch\" OVERWRITE ALL\n",
"WITH \"ext\" AS \n",
"(\n",
" SELECT *\n",
@@ -505,7 +505,7 @@
"'''\n",
"\n",
"druid.display.run_task(sql)\n",
- "druid.sql.wait_until_ready('kttm-transformation-batch')"
+ "druid.sql.wait_until_ready('example-kttm-transform-batch')"
]
},
{
@@ -518,8 +518,8 @@
"# let's see what time of day shows the highest user activity\n",
"druid.display.sql(\"\"\"\n",
"SELECT EXTRACT( HOUR FROM \"__time\") time_hour, city, count(distinct \"session\") session_count\n",
- "FROM \"kttm-transformation-batch\" \n",
- "WHERE \"city\" IS NOT NULL\n",
+ "FROM \"example-kttm-transform-batch\" \n",
+ "WHERE \"city\" IS NOT NULL AND \"city\" <> ''\n",
"GROUP BY 1,2 \n",
"ORDER BY 3 DESC \n",
"LIMIT 10\n",
@@ -546,7 +546,7 @@
"outputs": [],
"source": [
"sql = '''\n",
- "REPLACE INTO \"kttm-nested-batch\" OVERWRITE ALL\n",
+ "REPLACE INTO \"example-kttm-nested-batch\" OVERWRITE ALL\n",
"WITH \"ext\" AS \n",
"(\n",
" SELECT *\n",
@@ -573,7 +573,7 @@
"PARTITIONED BY DAY\n",
"'''\n",
"druid.display.run_task(sql)\n",
- "druid.sql.wait_until_ready('kttm-nested-batch')"
+ "druid.sql.wait_until_ready('example-kttm-nested-batch')"
]
},
{
@@ -581,7 +581,7 @@
"id": "c57016cb-ff17-420d-882b-174ec6fd5ab9",
"metadata": {},
"source": [
- "As you can see, ingesting nested columns is straight forward. All you need to do is declare them as `TYPE('COMPLEX')`, include the input field in the main SELECT clause ( `*` = all columns ) and you're done!\n",
+    "As you can see, ingesting nested columns is straightforward. All you need to do is declare them as \"TYPE('COMPLEX')\", include the input field in the main SELECT clause ( * = all columns ), and you're done!\n",
"Take a look at the query example below where we access these nested fields as dimensions we can group by, metrics we can aggregate and filters we can apply:"
]
},
@@ -598,7 +598,7 @@
" SUM( JSON_VALUE(\"event\", '$.layer' RETURNING BIGINT) ) AS \"sum_layers\",\n",
" COUNT( DISTINCT JSON_VALUE(\"geo_ip\", '$.city') ) AS \"unique_cities\"\n",
"\n",
- "FROM \"kttm-nested-batch\"\n",
+ "FROM \"example-kttm-nested-batch\"\n",
"\n",
"WHERE JSON_VALUE(\"geo_ip\", '$.continent') = 'South America'\n",
"\n",
@@ -612,7 +612,7 @@
"id": "a2cb243a-276b-4227-a485-f398ba078f09",
"metadata": {},
"source": [
- "Since nested columns could have different fields from row to row or as their schema changes over time, you can inspect the fields that have been discovered during ingestion using the `JSON_PATHS` function on nested columns:"
+    "Since nested columns can have different fields from row to row, or their schema can change over time, you can inspect the fields that have been discovered during ingestion using the JSON_PATHS function on nested columns:"
]
},
{
@@ -623,14 +623,123 @@
"outputs": [],
"source": [
"druid.display.sql(\"\"\"\n",
- "SELECT 'agent' as nested_column, STRING_AGG( DISTINCT JSON_PATHS(\"agent\"), ', ') paths FROM \"kttm-nested-batch\"\n",
+ "SELECT 'agent' as nested_column, STRING_AGG( DISTINCT JSON_PATHS(\"agent\"), ', ') paths FROM \"example-kttm-nested-batch\"\n",
"UNION ALL\n",
- "SELECT 'event', STRING_AGG( DISTINCT JSON_PATHS(\"event\"), ', ') paths FROM \"kttm-nested-batch\"\n",
+ "SELECT 'event', STRING_AGG( DISTINCT JSON_PATHS(\"event\"), ', ') paths FROM \"example-kttm-nested-batch\"\n",
"UNION ALL\n",
- "SELECT 'geo_ip', STRING_AGG( DISTINCT JSON_PATHS(\"geo_ip\"), ', ') paths FROM \"kttm-nested-batch\"\n",
+ "SELECT 'geo_ip', STRING_AGG( DISTINCT JSON_PATHS(\"geo_ip\"), ', ') paths FROM \"example-kttm-nested-batch\"\n",
"\"\"\")"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "3e330808-696b-4278-be8e-269924cc8089",
+ "metadata": {},
+ "source": [
+ "\n",
+    "## System fields for batch ingestion\n",
+    "\n",
+    "When ingesting multiple files, it is generally helpful to know the specific source of the data. System fields let you do just that: they identify the input source and can be added to the ingestion job.\n",
+ "\n",
+    "Each input source has slightly different system fields. The example below uses HTTP; [check the docs to see the fields that are available](https://druid.apache.org/docs/latest/ingestion/input-sources#http-input-source). \n",
+ "\n",
+    "To enable this functionality, add the new \"systemFields\" property to the input source definition in the EXTERN clause:\n",
+ "```\n",
+ "FROM TABLE(\n",
+ " EXTERN(\n",
+ " '{\n",
+ " \"type\":\"http\",\n",
+ " \"systemFields\":[\"__file_uri\",\"__file_path\"], <<<<<< list of system fields to capture\n",
+ " \"uris\":[ 2 -- less rows to process for this example\n",
+ "GROUP BY 1,2,3\n",
+ "PARTITIONED BY ALL\n",
+ "'''\n",
+ "request = druid.sql.sql_request( sql) # init request object\n",
+ "request.add_context( 'maxNumTasks', 4) # can't go any higher in learning environment\n",
+ "\n",
+ "druid.display.run_task(request)\n",
+ "druid.sql.wait_until_ready('example-taxi-trips-rollup')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "271791d0-0a27-47d3-b9ed-6f445c302412",
+ "metadata": {},
+ "source": [
+    "Query the system fields to see how many rows were ingested from each file:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a1488573-b82e-4b04-ae69-91d9c169a86c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "SELECT \"__file_uri\", \"__file_path\", \n",
+ " SUM(\"row_count\") \"total_file_rows\" \n",
+ "FROM \"example-taxi-trips-rollup\"\n",
+ "GROUP BY 1,2\n",
+ "'''\n",
+ "\n",
+ "druid.display.sql(sql)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0f06e516-55ea-456f-8665-75884f6d3f27",
+ "metadata": {},
+ "source": [
+    "While the above examples are rather simple, this is a powerful tool for enhancing data when files are organized in folder structures where the path contains information about the data. It is common to see this kind of file system organization in cloud storage, where the data has already been partitioned by time or other dimensions. Take this list of files as an example:\n",
+ "```\n",
+ "/data/activity_log/customer=501/hour=2024-01-01T10:00:00/datafile1.csv\n",
+ "/data/activity_log/customer=501/hour=2024-01-01T10:00:00/datafile2.csv\n",
+ "/data/activity_log/customer=376/hour=2024-01-01T11:00:00/datafile1.csv\n",
+ "...\n",
+ "```\n",
+    "With a layout like this, the __file_uri or __file_path columns can be parsed at ingestion time using functions such as REGEXP_EXTRACT to derive fields like `customer` and `hour`, as sketched in the next cell."
+ ]
+ },
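+  {
+   "cell_type": "markdown",
+   "id": "f3b9a6c2-1d47-4e85-9b02-6a7c0d5e8f14",
+   "metadata": {},
+   "source": [
+    "As a rough sketch of that idea, assuming the hypothetical \"activity_log\" layout above (not the taxi-trips files ingested earlier), the ingestion SELECT could derive those fields from the system columns:\n",
+    "```\n",
+    "SELECT\n",
+    "  REGEXP_EXTRACT(\"__file_path\", 'customer=([0-9]+)', 1) AS \"customer\",\n",
+    "  REGEXP_EXTRACT(\"__file_path\", 'hour=([^/]+)', 1) AS \"hour_partition\",\n",
+    "  ...\n",
+    "FROM \"ext\"\n",
+    "```"
+   ]
+  },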
{
"cell_type": "markdown",
"id": "a7a76a4b-d68a-43b3-87eb-239d9d72c042",
@@ -664,7 +773,7 @@
"outputs": [],
"source": [
"sql = '''\n",
- "REPLACE INTO \"kttm-enhanced-batch\" OVERWRITE ALL\n",
+ "REPLACE INTO \"example-kttm-enhanced-batch\" OVERWRITE ALL\n",
"WITH\n",
"kttm_data AS (\n",
" SELECT * \n",
@@ -706,7 +815,7 @@
"PARTITIONED BY DAY\n",
"'''\n",
"druid.display.run_task(sql)\n",
- "druid.sql.wait_until_ready('kttm-enhanced-batch')"
+ "druid.sql.wait_until_ready('example-kttm-enhanced-batch')"
]
},
{
@@ -714,7 +823,7 @@
"id": "f0a6e5b2-f3ce-41a5-85cb-242a15edac7e",
"metadata": {},
"source": [
- "Data for both sources `kttm_data` and `country_lookup` are obtained from external sources:\n",
+ "Data for both sources \"kttm_data\" and \"country_lookup\" are obtained from external sources:\n",
"```\n",
"WITH\n",
"kttm_data AS\n",
@@ -739,7 +848,7 @@
")\n",
"```\n",
"\n",
- "Columns from both tables can be used in the SELECT expressions using the alias `country_lookup` to reference any joined column:\n",
+ "Columns from both tables can be used in the SELECT expressions using the alias \"country_lookup\" to reference any joined column:\n",
"```\n",
" kttm_data.\"country\",\n",
" country_lookup.\"Capital\" AS \"capital\",\n",
@@ -751,7 +860,7 @@
"FROM kttm_data\n",
"LEFT JOIN country_lookup ON country_lookup.Country = kttm_data.country\n",
"```\n",
- "LEFT JOIN insured that all the rows from `kttm_data` source are ingested. An INNER JOIN would exclude rows from `kttm_data` if the value for `kttm_data.country` is not present in `country_lookup.Country`. \n",
+    "LEFT JOIN ensures that all the rows from the \"kttm_data\" source are ingested. An INNER JOIN would exclude rows from \"kttm_data\" if the value for \"kttm_data.country\" is not present in \"country_lookup.Country\". \n",
"Since no context parameters were set, the join is processed as a broadcast join. The first table in the FROM clause is the distributed table and all other joined tables will be shipped to the workers to execute the join.\n",
"\n",
"Take a look at the data:"
@@ -771,7 +880,7 @@
" count( DISTINCT \"ip_address\" ) distinct_users, \n",
" MIN(\"session_length\")/1000 fastest_session_ms,\n",
" MAX(\"session_length\")/1000 slowest_session_ms\n",
- "FROM \"kttm-enhanced-batch\"\n",
+ "FROM \"example-kttm-enhanced-batch\"\n",
"WHERE \"event_type\"='LayerClear'\n",
"GROUP BY 1,2\n",
"ORDER BY 3 DESC\n",
@@ -798,7 +907,7 @@
"}\n",
"```\n",
"\n",
- "Given that this example is meant to run on the local docker compose deployment, two very large tables is not possible, try it out with small sources and just pretend they are big. We'll use the `wikipedia` sample data and join it with `wiki-users-batch` profile data. But first, create the users table because at the time of this writing there wasn't a matching `user` source handy:"
+    "Given that this example is meant to run on the local Docker Compose deployment, two very large tables are not practical; try it out with small sources and just pretend they are big. We'll use the \"wikipedia\" sample data and join it with \"example-wiki-users-batch\" profile data. But first, create the users table because, at the time of this writing, there wasn't a matching \"user\" source handy:"
]
},
{
@@ -808,7 +917,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# since we don't have a source for wiki-users-batch let's\n",
+ "# since we don't have a source for example-wiki-users-batch let's\n",
"# create one using in-database transformation that\n",
"# generates user profiles from the wikipedia data by\n",
"# - grouping on \"user\"\n",
@@ -818,7 +927,7 @@
"# - determine the preferred language of the user based on their earliest channel edit\n",
"\n",
"sql = '''\n",
- "REPLACE INTO \"wiki-users-batch\" OVERWRITE ALL\n",
+ "REPLACE INTO \"example-wiki-users-batch\" OVERWRITE ALL\n",
"SELECT \n",
" \"user\", \n",
" EARLIEST(\n",
@@ -833,7 +942,7 @@
" count(*) \"edits\",\n",
" TIME_SHIFT(MIN(__time), 'P1Y', -1 * MOD(MIN(EXTRACT (MICROSECOND FROM __time)),20) ) AS \"registered_at_ms\",\n",
" EARLIEST(SUBSTRING(\"channel\", 2, 2), 1024) AS \"language\"\n",
- "FROM \"wikipedia-events-batch\"\n",
+ "FROM \"example-wikipedia-batch\"\n",
"GROUP BY 1\n",
"PARTITIONED BY ALL\n",
"'''\n",
@@ -842,7 +951,7 @@
"request.add_context( 'maxNumTasks', 2) # can't go any higher in test env\n",
"\n",
"druid.display.run_task(request)\n",
- "druid.sql.wait_until_ready('wiki-users-batch')"
+ "druid.sql.wait_until_ready('example-wiki-users-batch')"
]
},
{
@@ -861,7 +970,7 @@
"outputs": [],
"source": [
"sql = '''\n",
- "REPLACE INTO \"wiki-merge-batch\" OVERWRITE ALL\n",
+ "REPLACE INTO \"example-wiki-merge-batch\" OVERWRITE ALL\n",
"WITH \"wikidata\" AS \n",
"(\n",
" SELECT *\n",
@@ -904,7 +1013,7 @@
" u.\"registered_at_ms\" AS \"user_registration_epoch\",\n",
" u.\"language\" AS \"user_language\"\n",
"FROM \"wikidata\" AS d \n",
- " LEFT JOIN \"wiki-users-batch\" AS u ON u.\"user\"=d.\"user\"\n",
+ " LEFT JOIN \"example-wiki-users-batch\" AS u ON u.\"user\"=d.\"user\"\n",
"PARTITIONED BY DAY\n",
"'''\n",
"request = druid.sql.sql_request( sql) # init request object\n",
@@ -912,7 +1021,7 @@
"request.add_context( 'maxNumTasks', 2) # use 2 tasks to run the ingestion\n",
"\n",
"druid.display.run_task(request)\n",
- "druid.sql.wait_until_ready('wiki-merge-batch')\n"
+ "druid.sql.wait_until_ready('example-wiki-merge-batch')\n"
]
},
{
@@ -934,7 +1043,7 @@
"SELECT \"user_group\",\n",
" count( DISTINCT \"user\") \"distinct_users\",\n",
" sum(\"user_edits\") \"total_activity\"\n",
- "FROM \"wiki-merge-batch\"\n",
+ "FROM \"example-wiki-merge-batch\"\n",
"GROUP BY 1\n",
"ORDER BY 1, 3 DESC\n",
"\"\"\")"
@@ -947,7 +1056,7 @@
"source": [
"## Conclusion\n",
"\n",
- "Druid's [SQL Based ingestion](https://druid.apache.org/docs/latest/multi-stage-query/index.html) enables scalable batch ingestion from a large variety of [data sources](https://druid.apache.org/docs/latest/ingestion/native-batch-input-sources.html) and [formats](https://druid.apache.org/docs/latest/ingestion/data-formats.html). The familiarity and expressivity of SQL enables users to quickly transform, filter and generally enhance data directly in the cluster.\n",
+ "Druid's [SQL Based ingestion](https://druid.apache.org/docs/latest/multi-stage-query/index.html) enables scalable batch ingestion from a large variety of [data sources](https://druid.apache.org/docs/latest/ingestion/native-batch-input-sources.html) and [formats](https://druid.apache.org/docs/latest/ingestion/data-formats.html). The familiarity and expressiveness of SQL enables users to quickly transform, filter and generally enhance data directly in the cluster.\n",
"\n"
]
},
@@ -968,15 +1077,16 @@
"metadata": {},
"outputs": [],
"source": [
- "druid.datasources.drop('wikipedia-events-batch', True)\n",
- "druid.datasources.drop('wikipedia-events-3-batch', True)\n",
- "druid.datasources.drop('wikipedia-events-4-batch', True)\n",
- "druid.datasources.drop('wikipedia-events-only-human', True)\n",
- "druid.datasources.drop('kttm-transformation-batch', True)\n",
- "druid.datasources.drop('kttm-nested-batch', True)\n",
- "druid.datasources.drop('kttm-enhanced-batch', True)\n",
- "druid.datasources.drop('wiki-users-batch', True)\n",
- "druid.datasources.drop('wiki-merge-batch', True)\n"
+ "druid.datasources.drop('example-wikipedia-batch', True)\n",
+ "druid.datasources.drop('example-wikipedia-3-batch', True)\n",
+ "druid.datasources.drop('example-wikipedia-4-batch', True)\n",
+ "druid.datasources.drop('example-wikipedia-only-human', True)\n",
+ "druid.datasources.drop('example-kttm-transform-batch', True)\n",
+ "druid.datasources.drop('example-kttm-nested-batch', True)\n",
+ "druid.datasources.drop('example-kttm-enhanced-batch', True)\n",
+ "druid.datasources.drop('example-wiki-users-batch', True)\n",
+ "druid.datasources.drop('example-wiki-merge-batch', True)\n",
+ "druid.datasources.drop('example-taxi-trips-rollup', True)\n"
]
}
],
diff --git a/notebooks/02-ingestion/05-working-with-nested-columns.ipynb b/notebooks/02-ingestion/05-working-with-nested-columns.ipynb
index 197c3702..e42c8d42 100644
--- a/notebooks/02-ingestion/05-working-with-nested-columns.ipynb
+++ b/notebooks/02-ingestion/05-working-with-nested-columns.ipynb
@@ -43,7 +43,7 @@
"source": [
"## Prerequisites\n",
"\n",
- "This tutorial works with Druid 25.0.0 or later.\n",
+ "This tutorial works with Druid 29.0.0 or later.\n",
"\n",
"### Run with Docker\n",
"\n",
@@ -111,7 +111,7 @@
"outputs": [],
"source": [
"sql = '''\n",
- "INSERT INTO \"example-koalas-nesteddata\"\n",
+ "REPLACE INTO \"example-koalas-nesteddata\" OVERWRITE ALL\n",
" WITH \"source\" AS\n",
" (SELECT * FROM TABLE(EXTERN('{\"type\":\"http\",\"uris\":[\"https://static.imply.io/example-data/kttm-nested-v2/kttm-nested-v2-2019-08-25.json.gz\"]}',\n",
" '{\"type\":\"json\"}','[{\"name\":\"timestamp\",\"type\":\"string\"},{\"name\":\"client_ip\",\"type\":\"string\"},\n",
@@ -128,7 +128,7 @@
" PARTITIONED BY DAY\n",
"'''\n",
"\n",
- "sql_client.run_task(sql)\n",
+ "display.run_task(sql)\n",
"sql_client.wait_until_ready(\"example-koalas-nesteddata\")\n",
"display.table(\"example-koalas-nesteddata\")"
]
@@ -192,7 +192,7 @@
" PARTITIONED BY DAY\n",
"'''\n",
"\n",
- "sql_client.run_task(sql)\n",
+ "display.run_task(sql)\n",
"sql_client.wait_until_ready(\"example-koalas-nesteddata-transform\")\n",
"display.table(\"example-koalas-nesteddata-transform\")"
]
@@ -326,6 +326,47 @@
"resp.show()"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+    "### Use an expression for `path` in JSON_VALUE/JSON_QUERY\n",
+ "\n",
+    "The JSON_VALUE and JSON_QUERY functions can build their `path` parameter from an expression instead of a literal.\n",
+    "\n",
+    "While this can be useful, it comes at some cost. Druid can only use the nested column indexes when you supply a literal value for the `path`. When you use an expression, Druid reads the raw JSON to find the nested fields, which is much slower than using a literal. \n",
+ "\n",
+ "This example uses an expression that depends on the value of the `browser` field in the nested column to select either `country` or `city` values. \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql = '''\n",
+ "SELECT JSON_VALUE(agent, '$.browser') as browser,\n",
+ "JSON_VALUE(geo_ip, \n",
+ " -- expression to select '$.country' as the path for some browsers\n",
+ " -- or '$.city' in all other cases\n",
+ " CASE WHEN \n",
+ " JSON_VALUE(agent, '$.browser') in ('Android browser','Apple Mail', 'Mobile Safari') THEN '$.country' \n",
+ " ELSE \n",
+ " '$.city'\n",
+ " END\n",
+ " ) as city_or_country,\n",
+ " count(*) row_count\n",
+ "FROM \"example-koalas-nesteddata\"\n",
+ "GROUP BY 1,2\n",
+ "ORDER BY 3 DESC\n",
+ "LIMIT 20\n",
+ "'''\n",
+ "resp = sql_client.sql_query(sql)\n",
+ "resp.show()"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -391,13 +432,6 @@
"- [SQL JSON functions](https://druid.apache.org/docs/latest/querying/sql-json-functions.html) for details on all of the functions you used in this tutorial.\n",
"- [SQL-based ingestion](https://druid.apache.org/docs/latest/multi-stage-query/index.html) for information on how to use Druid SQL-based ingestion."
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
@@ -416,7 +450,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.5"
+ "version": "3.11.6"
},
"vscode": {
"interpreter": {
diff --git a/notebooks/02-ingestion/08-table-datatypes-arrays.ipynb b/notebooks/02-ingestion/08-table-datatypes-arrays.ipynb
index 47298194..343c66d3 100644
--- a/notebooks/02-ingestion/08-table-datatypes-arrays.ipynb
+++ b/notebooks/02-ingestion/08-table-datatypes-arrays.ipynb
@@ -35,7 +35,7 @@
"source": [
"## Prerequisites\n",
"\n",
- "This tutorial works with Druid 28.0.0 or later.\n",
+ "This tutorial works with Druid 29.0.0 or later.\n",
"\n",
"#### Run with Docker\n",
"\n",
@@ -1155,6 +1155,385 @@
"display.sql(sql)"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "1f78b53f-b6db-4e53-82da-0b4bf44ad992",
+ "metadata": {},
+ "source": [
+ "\n",
+ "## Ingest arrays directly from source data\n",
+    "Arrays of primitive values in the source data can be loaded directly into Druid using SQL-based or native ingestion. Both whole array values and the individual elements are indexed: filtering on a whole array value uses the array-valued index, while functions like ARRAY_CONTAINS use the element-valued index.\n",
+ "\n",
+ "The following examples ingest some array data of different primitive types and demonstrate the use of these filtering mechanisms.\n",
+ "The data contains two string arrays `\"pets\"` and `\"breeds\"` and an array of numeric values `\"weights\"`:\n",
+ "```\n",
+ "{\"time\":\"2024-01-10 05:00:00\", \"owner\":\"Alex\", \"pets\":[\"Max\",\"Yoli\"],\"breeds\":[\"boxer\",\"lab\"],\"weights\":[48,65]}\n",
+ "{\"time\":\"2024-01-10 05:30:00\", \"owner\":\"Jill\", \"pets\":[\"Mowgli\",\"Pelusa\"],\"breeds\":[\"boxer\",\"mix\"],\"weights\":[56,27]}\n",
+ "{\"time\":\"2024-01-10 06:00:00\", \"owner\":\"Devraj\", \"pets\":[\"Linda\",\"Frida\"],\"breeds\":[\"beagle\",\"basenji\"],\"weights\":[40,45]}\n",
+ "{\"time\":\"2024-01-10 06:30:00\", \"owner\":\"Kyle\", \"pets\":[\"Kala\",\"Boots\"],\"breeds\":[\"pitbull\",\"siamese\"],\"weights\":[58,10]}\n",
+ "```\n",
+ "\n",
+ "Druid supports ARRAY datatypes in the EXTEND clause of the EXTERN table function defining the schema of the external data:\n",
+ "```\n",
+ " FROM TABLE(\n",
+ " EXTERN(\n",
+ " '{\"type\":\"inline\",\"data\":...',\n",
+ " '{\"type\":\"json\"}'\n",
+ " )\n",
+ " ) EXTEND (\"time\" VARCHAR, \"owner\" VARCHAR, \"pets\" VARCHAR ARRAY, \"breeds\" VARCHAR ARRAY, \"weights\" DOUBLE ARRAY)\n",
+ "```\n",
+    "- declare array columns with the ARRAY keyword, for example VARCHAR ARRAY or DOUBLE ARRAY\n",
+    "- in this example \"pets\" and \"breeds\" are loaded as VARCHAR ARRAY\n",
+    "- and \"weights\" is loaded as DOUBLE ARRAY \n",
+ "\n",
+ "Run the following cell to ingest the data:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e10e2d6a-4e0f-46bf-a917-0398b0409833",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "REPLACE INTO \"example-arrays-in-source\" OVERWRITE ALL\n",
+ "WITH \"ext\" AS (\n",
+ " SELECT *\n",
+ " FROM TABLE(\n",
+ " EXTERN(\n",
+ " '{\"type\":\"inline\",\"data\":\"{\\\\\"time\\\\\":\\\\\"2024-01-10 05:00:00\\\\\", \\\\\"owner\\\\\":\\\\\"Alex\\\\\", \\\\\"pets\\\\\":[\\\\\"Max\\\\\",\\\\\"Yoli\\\\\"],\\\\\"breeds\\\\\":[\\\\\"boxer\\\\\",\\\\\"lab\\\\\"],\\\\\"weights\\\\\":[48,65]}\\\\n{\\\\\"time\\\\\":\\\\\"2024-01-10 05:30:00\\\\\", \\\\\"owner\\\\\":\\\\\"Jill\\\\\", \\\\\"pets\\\\\":[\\\\\"Mowgli\\\\\",\\\\\"Pelusa\\\\\"],\\\\\"breeds\\\\\":[\\\\\"boxer\\\\\",\\\\\"mix\\\\\"],\\\\\"weights\\\\\":[56,27]}\\\\n{\\\\\"time\\\\\":\\\\\"2024-01-10 06:00:00\\\\\", \\\\\"owner\\\\\":\\\\\"Devraj\\\\\", \\\\\"pets\\\\\":[\\\\\"Linda\\\\\",\\\\\"Frida\\\\\"],\\\\\"breeds\\\\\":[\\\\\"beagle\\\\\",\\\\\"basenji\\\\\"],\\\\\"weights\\\\\":[40,45]}\\\\n{\\\\\"time\\\\\":\\\\\"2024-01-10 06:30:00\\\\\", \\\\\"owner\\\\\":\\\\\"Kyle\\\\\", \\\\\"pets\\\\\":[\\\\\"Kala\\\\\",\\\\\"Boots\\\\\"],\\\\\"breeds\\\\\":[\\\\\"pitbull\\\\\",\\\\\"siamese\\\\\"],\\\\\"weights\\\\\":[58,10]}\"}',\n",
+ " '{\"type\":\"json\"}'\n",
+ " )\n",
+ " ) EXTEND (\"time\" VARCHAR, \"owner\" VARCHAR, \"pets\" VARCHAR ARRAY, \"breeds\" VARCHAR ARRAY, \"weights\" DOUBLE ARRAY)\n",
+ ")\n",
+ "SELECT\n",
+ " TIME_PARSE(\"time\") as __time,\n",
+ " \"owner\",\n",
+ " \"pets\",\n",
+ " \"breeds\",\n",
+ " ARRAY_TO_MV(\"breeds\") as \"breeds_mv\",\n",
+ " \"weights\"\n",
+ "FROM \"ext\"\n",
+ "PARTITIONED BY ALL\n",
+ "'''\n",
+ "req = sql_client.sql_request(sql)\n",
+ "req.add_context(\"arrayIngestMode\", \"array\")\n",
+ "\n",
+ "display.run_task(req)\n",
+ "sql_client.wait_until_ready('example-arrays-in-source')\n",
+ "display.table('example-arrays-in-source')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2ddf76ef-d731-4de9-8a3a-1e9a40ff9fad",
+ "metadata": {},
+ "source": [
+    "Notice that the ingestion loads the \"breeds\" column as both an array and a multi-value string column. \n",
+    "While this is not usual, it helps to illustrate the difference between the two.\n",
+ "\n",
+ "Take a look at the data:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a4a1ea7e-36ef-4a82-82be-ac9b56adfbdc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "SELECT * \n",
+ "FROM \"example-arrays-in-source\" x\n",
+ "'''\n",
+ "\n",
+ "display.sql(sql)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ad81872f-5f50-475f-a3b6-dd4a5334d2ea",
+ "metadata": {},
+ "source": [
+    "The result shows that \"breeds\" and \"breeds_mv\" look identical, except that \"breeds_mv\" is sorted.\n",
+    "However, GROUP BY behaves quite differently on the two columns.\n",
+    "The SQL example below shows how grouping on an ARRAY column groups on the whole array value:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "12051888-bb4c-4803-a8d1-121798786a7a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "SELECT \"breeds\", count(*) \n",
+ "FROM \"example-arrays-in-source\" x\n",
+ "GROUP BY 1\n",
+ "'''\n",
+ "\n",
+ "display.sql(sql)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "07f01b60-4678-4fe3-a189-afe28b8ffe6e",
+ "metadata": {},
+ "source": [
+    "The multi-value column behaves differently: grouping naturally expands (unnests) the multiple values, resulting in:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8e1b8326-0bb7-4d21-93f8-5bde83a4c10d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "SELECT \"breeds_mv\", count(*) \n",
+ "FROM \"example-arrays-in-source\" x\n",
+ "GROUP BY 1\n",
+ "'''\n",
+ "\n",
+ "display.sql(sql)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a3e6cbc1-87eb-46d4-ad49-ab040092b276",
+ "metadata": {},
+ "source": [
+    "The same aggregation behavior as the multi-value column can be achieved by explicitly unnesting the \"breeds\" array column:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d3c90084-7fc5-46d9-a50b-bbc92b1dd757",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "SELECT b.\"breed\", count(*) \n",
+ "FROM \"example-arrays-in-source\" x, UNNEST(\"breeds\") AS b(\"breed\")\n",
+ "GROUP BY 1\n",
+ "'''\n",
+ "\n",
+ "display.sql(sql)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4d888de3-354c-4719-b8c2-87b374b4c5aa",
+ "metadata": {},
+ "source": [
+    "You can filter for a literal array value; a match occurs only if the contents are identical, including the order of the values:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cf87b571-3bd2-4cf8-bc87-ad7703d497f7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "SELECT * \n",
+ "FROM \"example-arrays-in-source\" x\n",
+ "WHERE \"pets\"=ARRAY['Kala','Boots']\n",
+ "'''\n",
+ "\n",
+ "display.sql(sql)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1e7aa25f-1f7d-4ccc-9954-b8986bf1cb08",
+ "metadata": {},
+ "source": [
+    "Use the ARRAY_CONTAINS function to find rows where a specific element appears in the array column. \n",
+ "The following SQL finds customers who have a pet boxer:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b4c70480-4230-429d-bcb0-63e411bfc7dd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "SELECT * \n",
+ "FROM \"example-arrays-in-source\" x\n",
+ "WHERE ARRAY_CONTAINS(\"breeds\",'boxer')\n",
+ "'''\n",
+ "\n",
+ "display.sql(sql)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "35046780-4870-4f71-834f-7877b2c2781b",
+ "metadata": {},
+ "source": [
+ "Find customers with pets that weigh more than 50 pounds by UNNESTing the \"weights\" array and comparing the individual items:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b7163382-6b8c-49df-9048-6a473e04c11e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "SELECT * \n",
+ "FROM \"example-arrays-in-source\" x, UNNEST(x.\"weights\") AS y(\"weight\")\n",
+ "WHERE y.\"weight\">50\n",
+ "'''\n",
+ "\n",
+ "display.sql(sql)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ea406773-3140-4535-a4da-cd081cb045bb",
+ "metadata": {},
+ "source": [
+ "\n",
+ "## Working with nested arrays of objects\n",
+    "Druid can ingest arrays of objects, and they can be UNNESTed, filtered, and aggregated.\n",
+    "This is very useful when data contains lists of related objects associated with an event. Here's an IoT example that contains an array of metrics issued by the sensors of a device:\n",
+ "\n",
+ "```\n",
+ "{\n",
+ " \"time\":\"2024-01-01 10:00:00\",\n",
+ " \"device\":\"ABF001\",\n",
+ " \"loop\":\"NH3-100-01\",\n",
+ " \"loop-seq\":1,\n",
+ " \"process\":\n",
+ " {\n",
+ " \"name\":\"NH3-100\",\n",
+ " \"session\":\"BATCH-000001\",\n",
+ " \"metrics\":[\n",
+ " {\"name\":\"temperature\",\"value\":30},\n",
+ " {\"name\":\"pressure\",\"value\":56},\n",
+ " {\"name\":\"flow\",\"value\":10}\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "```\n",
+ "\n",
+ "Given that different devices may have different sets of sensors, another example in the same set might look like:\n",
+ "```\n",
+ "{\n",
+ " \"time\":\"2024-01-01 10:00:00\",\n",
+ " \"device\":\"HEAT001\",\n",
+ " \"loop\":\"NH3-100-01\",\n",
+ " \"loop-seq\":2,\n",
+ " \"process\":\n",
+ " {\n",
+ " \"name\":\"NH3-100\",\n",
+ " \"session\":\"BATCH-000001\",\n",
+ " \"metrics\":[\n",
+ " {\"name\":\"temperature\",\"value\":455},\n",
+ " {\"name\":\"pressure\",\"value\":100},\n",
+ " {\"name\":\"fuel-input\", \"value\":10}\n",
+ " ]\n",
+ " }\n",
+ "}\n",
+ "```\n",
+ "\n",
+ "Use the following cell to load these two examples:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ab7528d4-ef32-4631-8293-a7993835b2a0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "REPLACE INTO \"example-array-json-objects\" OVERWRITE ALL\n",
+ "WITH \"ext\" AS (\n",
+ " SELECT *\n",
+ " FROM TABLE(\n",
+ " EXTERN(\n",
+ " '{\"type\":\"inline\",\"data\":\"{ \\\\\"time\\\\\":\\\\\"2024-01-01 10:00:00\\\\\", \\\\\"device\\\\\":\\\\\"ABF001\\\\\", \\\\\"loop\\\\\":\\\\\"NH3-100-01\\\\\", \\\\\"loop-seq\\\\\":1,\\\\\"process\\\\\": {\\\\\"name\\\\\":\\\\\"NH3-100\\\\\",\\\\\"session\\\\\":\\\\\"BATCH-000001\\\\\",\\\\\"metrics\\\\\":[ {\\\\\"name\\\\\":\\\\\"temperature\\\\\",\\\\\"value\\\\\":30}, {\\\\\"name\\\\\":\\\\\"pressure\\\\\",\\\\\"value\\\\\":56}, {\\\\\"name\\\\\":\\\\\"flow\\\\\",\\\\\"value\\\\\":10}]}}\\\\n{\\\\\"time\\\\\":\\\\\"2024-01-01 10:00:00\\\\\",\\\\\"device\\\\\":\\\\\"HEAT001\\\\\",\\\\\"loop\\\\\":\\\\\"NH3-100-01\\\\\",\\\\\"loop-seq\\\\\":2,\\\\\"process\\\\\":{ \\\\\"name\\\\\":\\\\\"NH3-100\\\\\", \\\\\"session\\\\\":\\\\\"BATCH-000001\\\\\",\\\\\"metrics\\\\\":[{\\\\\"name\\\\\":\\\\\"temperature\\\\\",\\\\\"value\\\\\":455},{\\\\\"name\\\\\":\\\\\"pressure\\\\\",\\\\\"value\\\\\":100},{\\\\\"name\\\\\":\\\\\"fuel-input\\\\\", \\\\\"value\\\\\":10}]}}\\\\n\"}', '{\"type\":\"json\"}'\n",
+ " )\n",
+ " ) EXTEND (\"time\" VARCHAR, \"device\" VARCHAR, \"loop\" VARCHAR, \"loop-seq\" BIGINT, \"process\" TYPE('COMPLEX'))\n",
+ ")\n",
+ "SELECT\n",
+ " TIME_PARSE(TRIM(\"time\")) AS \"__time\",\n",
+ " \"device\",\n",
+ " \"loop\",\n",
+ " \"loop-seq\",\n",
+ " \"process\"\n",
+ "FROM \"ext\"\n",
+ "PARTITIONED BY DAY\n",
+ "'''\n",
+ "req = sql_client.sql_request(sql)\n",
+ "req.add_context(\"arrayIngestMode\", \"array\")\n",
+ "\n",
+ "display.run_task(req)\n",
+ "sql_client.wait_until_ready('example-array-json-objects')\n",
+ "display.table('example-array-json-objects')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4b866323-7538-434e-a890-c99f1ea5bbd9",
+ "metadata": {},
+ "source": [
+ "Use the JSON_QUERY_ARRAY function to access nested arrays of objects:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d2dea605-2cd4-4bb1-a01a-cb5c328188b1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "SELECT \"loop\", JSON_QUERY_ARRAY( \"process\", '$.metrics') \"metric_array\" FROM \"example-array-json-objects\"\n",
+ "'''\n",
+ "display.sql(sql)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2c780164-ea51-4662-bd97-45db00d5d7af",
+ "metadata": {},
+ "source": [
+ "Combine JSON_QUERY_ARRAY with UNNEST to access array elements, and use JSON_VALUE functions to access array object fields to do filtering and aggregation:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d5191c58-d948-4b3e-bd34-53df6f30ffc7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "SELECT \"loop\", \n",
+ " JSON_VALUE( m.\"metric\", '$.name') metric_name, \n",
+ " MIN( JSON_VALUE( m.\"metric\", '$.value' RETURNING DOUBLE) ) min_metric_value,\n",
+ " MAX( JSON_VALUE( m.\"metric\", '$.value' RETURNING DOUBLE) ) max_metric_value,\n",
+ " AVG( JSON_VALUE( m.\"metric\", '$.value' RETURNING DOUBLE) ) avg_metric_value\n",
+ "FROM \"example-array-json-objects\", \n",
+ " UNNEST( JSON_QUERY_ARRAY( \"process\", '$.metrics')) AS m(\"metric\")\n",
+ "WHERE JSON_VALUE( m.\"metric\", '$.name') IN ('temperature', 'pressure')\n",
+ "GROUP BY 1,2\n",
+ "'''\n",
+ "display.sql(sql)"
+ ]
+ },
{
"cell_type": "markdown",
"id": "44738d6d-cec2-40ad-aaba-998c758c63f4",
@@ -1175,7 +1554,9 @@
"druid.datasources.drop(\"example-koalas-arrays-1\")\n",
"druid.datasources.drop(\"example-koalas-arrays-2\")\n",
"druid.datasources.drop(\"example-koalas-arrays-3\")\n",
- "druid.datasources.drop(\"example-koalas-arrays-4\")"
+ "druid.datasources.drop(\"example-koalas-arrays-4\")\n",
+ "druid.datasources.drop(\"example-array-json-objects\")\n",
+ "druid.datasources.drop(\"example-arrays-in-source\")"
]
},
{
@@ -1189,6 +1570,7 @@
"* With the right context parameters, arrays can be constructed from source data and created using aggregators.\n",
"* Scalar array functions allow for items to be found and added.\n",
"* The UNNEST function, together with a JOIN, allows for arrays to be expanded into individual rows.\n",
+ "* JSON_QUERY_ARRAY combined with UNNEST enables the use of arrays of objects that can be expanded into rows and columns.\n",
"\n",
"## Learn more\n",
"\n",
diff --git a/notebooks/02-ingestion/12-spatial-dimensions.ipynb b/notebooks/02-ingestion/12-spatial-dimensions.ipynb
new file mode 100644
index 00000000..5af7c91b
--- /dev/null
+++ b/notebooks/02-ingestion/12-spatial-dimensions.ipynb
@@ -0,0 +1,692 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "0cb3b009-ebde-4d56-9d59-a028d66d8309",
+ "metadata": {},
+ "source": [
+ "# Ingestion and query of spatial dimensions\n",
+ "\n",
+ "\n",
+ "This notebook demonstrates the spatial dimension capabilities of Apache Druid.\n",
+ "In this notebook, you perform the following tasks: \n",
+ "- Ingest spatial data\n",
+ "- Query spatial dimensions\n",
+ "- Use spatial filters to efficiently find events within a rectangle, a radius, or a polygon shape \n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bbdbf6ad-ca7b-40f5-8ca3-1070f4a3ee42",
+ "metadata": {},
+ "source": [
+ "## Prerequisites\n",
+ "\n",
+ "This tutorial works with Druid 29.0.0 or later.\n",
+ "\n",
+ "#### Run with Docker\n",
+ "\n",
+ "Launch this tutorial and all prerequisites using the `druid-jupyter` profile of the Docker Compose file for Jupyter-based Druid tutorials. For more information, see the Learn Druid repository [readme](https://github.com/implydata/learn-druid).\n",
+ " \n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5007a243-b81a-4601-8f57-5b14940abbff",
+ "metadata": {},
+ "source": [
+ "## Initialization\n",
+ "\n",
+ "The following cells set up the notebook and learning environment ready for use.\n",
+ "\n",
+ "### Set up and connect to the learning environment\n",
+ "\n",
+ "Run the next cell to set up the Druid Python client's connection to Apache Druid.\n",
+ "\n",
+ "If successful, the Druid version number will be shown in the output."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c1ec783b-df3f-4168-9be2-cdc6ad3e33c2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "import druidapi\n",
+    "import json\n",
+    "import os\n",
+ "\n",
+ "if 'DRUID_HOST' not in os.environ.keys():\n",
+ " druid_host=f\"http://localhost:8888\"\n",
+ "else:\n",
+ " druid_host=f\"http://{os.environ['DRUID_HOST']}:8888\"\n",
+ "\n",
+ "print(f\"Opening a connection to {druid_host}.\")\n",
+ "druid = druidapi.jupyter_client(druid_host)\n",
+ "\n",
+ "display_client = druid.display\n",
+ "sql_client = druid.sql\n",
+ "status_client = druid.status\n",
+ "rest_client = druid.rest\n",
+ "\n",
+ "# client for Data Generator API\n",
+ "datagen = druidapi.rest.DruidRestClient(\"http://datagen:9999\")\n",
+ "\n",
+ "status_client.version"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "70aabd26-6615-48ce-8011-adcc83e7d621",
+ "metadata": {},
+ "source": [
+ "### Set up helper functions\n",
+ "\n",
+ "Run the next two cells to set up the following helper functions:\n",
+ "- `wait_for_datagen`: probes the data generator for the status of a job until it completes\n",
+ "- `wait_for_task`: probes indexer task status until it completes or fails"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c45d472d-c414-42c3-bf6f-53a4431f8bdd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def wait_for_datagen( job_name:str):\n",
+ " import time\n",
+ " from IPython.display import clear_output\n",
+    "    # poll the data generator until the job status is COMPLETE\n",
+ " done = False\n",
+ " while not done:\n",
+ " result = datagen.get_json(f\"/status/{job_name}\",'')\n",
+ " clear_output(wait=True)\n",
+ " print(json.dumps(result, indent=2))\n",
+ " if result[\"status\"] == 'COMPLETE':\n",
+ " done = True\n",
+ " else:\n",
+ " time.sleep(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3cac21e2-2e38-42b4-8e89-5dbd9fcd1af8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def wait_for_task( task_id):\n",
+ " import time\n",
+ " from IPython.display import clear_output\n",
+    "    # poll the ingestion task status until it is no longer RUNNING\n",
+ " done = False\n",
+ " probe_count=0\n",
+ " while not done:\n",
+ " result = rest_client.get_json(f\"/druid/indexer/v1/task/{task_id}/status\",'')\n",
+ " clear_output(wait=True)\n",
+ " print(json.dumps(result, indent=2))\n",
+ " if result[\"status\"][\"status\"] != 'RUNNING':\n",
+ " done = True\n",
+ " else:\n",
+ " probe_count+=1\n",
+ " print(f'Sleeping... probe # {probe_count}')\n",
+ " time.sleep(1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9464c204-bbf1-4a94-ae02-2356b606049f",
+ "metadata": {},
+ "source": [
+ "## Generate sample data\n",
+ "Run the following cell to create 10k rows of user data that include latitude and longitude coordinates for each user."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e9f69c6e-ccf2-46de-b7c6-2cfebd590017",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datetime import datetime, timedelta\n",
+ "# simulate clicks for last 2 hours\n",
+ "gen_hours=2\n",
+ "gen_now = datetime.now() - timedelta(hours=gen_hours)\n",
+ "gen_start_time = gen_now.strftime(\"%Y-%m-%d %H:%M:%S\")\n",
+ "\n",
+ "headers = {\n",
+ " 'Content-Type': 'application/json'\n",
+ "}\n",
+ "\n",
+ "datagen_request = {\n",
+ " \"name\": \"users\",\n",
+ " \"target\": { \"type\": \"file\", \"path\":\"users.json\" },\n",
+ " \"config_file\": \"clickstream/users_init.json\", \n",
+ " \"concurrency\":100,\n",
+ " \"total_events\":10000 \n",
+ "}\n",
+ "datagen.post(\"/start\", json.dumps(datagen_request), headers=headers)\n",
+ "wait_for_datagen('users')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1e312f34-313f-4eb4-9d94-98682f09eb05",
+ "metadata": {},
+ "source": [
+ "To ingest the latitude and longitude as a spatially indexed dimension, you must use a native ingestion spec. SQL-based ingestion does not support spatial dimensions.\n",
+ "\n",
+    "The following native `index_parallel` spec includes a `\"spatialDimensions\"` property within the `\"dimensionsSpec\"` that describes the spatial index creation:\n",
+    "```\n",
+    " \"dimensionsSpec\": {\n",
+    "   \"dimensions\": [\n",
+    "     ...\n",
+    "     {\n",
+    "       \"name\": \"address_lat\",\n",
+    "       \"type\": \"double\"\n",
+    "     },\n",
+    "     {\n",
+    "       \"name\": \"address_long\",\n",
+    "       \"type\": \"double\"\n",
+    "     },\n",
+    "     ...\n",
+    "   ],\n",
+    "   \"spatialDimensions\": [\n",
+    "     {\n",
+    "       \"dimName\": \"address_spatial\",\n",
+    "       \"dims\": [\n",
+    "         \"address_lat\",\n",
+    "         \"address_long\"\n",
+    "       ]\n",
+    "     }\n",
+    "   ]\n",
+    " },\n",
+    " ...\n",
+    "```\n",
+ "\n",
+    "The `\"spatialDimensions\"` array contains the list of spatial dimensions. Each spatial dimension has a name (`\"dimName\"`) and a list of dimensions (`\"dims\"`) whose values are the coordinates for a point in space. The dimensions that form the point come from the dimensions defined in the ingestion spec.\n",
+ "\n",
+    "The ingestion spec is submitted using a REST API call, which returns a `task_id`. The `wait_for_task` helper function then uses the `task_id` to monitor the job through the task status REST API.\n",
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9f9e6d24-7809-4c3d-bc43-66e9a625abb7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "\n",
+ "spatial_index_spec = {\n",
+ " \"type\": \"index_parallel\",\n",
+ " \"spec\": {\n",
+ " \"ioConfig\": {\n",
+ " \"type\": \"index_parallel\",\n",
+ " \"inputSource\": {\n",
+ " \"type\": \"http\",\n",
+ " \"uris\": [\n",
+ " \"http://datagen:9999/file/users.json\"\n",
+ " ]\n",
+ " }, \n",
+ " \"inputFormat\": {\n",
+ " \"type\": \"json\"\n",
+ " },\n",
+ " \"appendToExisting\": False\n",
+ " },\n",
+ " \"dataSchema\": {\n",
+ " \"granularitySpec\": {\n",
+ " \"segmentGranularity\": \"day\",\n",
+ " \"queryGranularity\": \"none\",\n",
+ " \"rollup\": False\n",
+ " },\n",
+ " \"dataSource\": \"example-spatial-index\",\n",
+ " \"timestampSpec\": {\n",
+ " \"column\": \"time\",\n",
+ " \"format\": \"iso\"\n",
+ " },\n",
+ " \"dimensionsSpec\": {\n",
+ " \"dimensions\": [\n",
+ " \"user_id\",\n",
+ " \"first_name\",\n",
+ " \"last_name\",\n",
+ " \"dob\",\n",
+ " {\n",
+ " \"name\": \"address_lat\",\n",
+ " \"type\": \"double\"\n",
+ " },\n",
+ " {\n",
+ " \"name\": \"address_long\",\n",
+ " \"type\": \"double\"\n",
+ " },\n",
+ " \"marital_status\",\n",
+ " {\n",
+ " \"name\": \"income\",\n",
+ " \"type\": \"double\"\n",
+ " },\n",
+ " \"signup_ts\"\n",
+ " ],\n",
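+            # spatial dimension: combines the lat/long columns above into a single indexed point
+            "          # spatial dimension: combines the lat/long columns above into a single indexed point\n",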
+ " \"spatialDimensions\": [\n",
+ " {\n",
+ " \"dimName\": \"address_spatial\",\n",
+ " \"dims\": [\n",
+ " \"address_lat\",\n",
+ " \"address_long\"\n",
+ " ]\n",
+ " }\n",
+ " ]\n",
+ " }\n",
+ " },\n",
+ " \"tuningConfig\": {\n",
+ " \"type\": \"index_parallel\",\n",
+ " \"partitionsSpec\": {\n",
+ " \"type\": \"dynamic\"\n",
+ " }\n",
+ " }\n",
+ " }\n",
+ "}\n",
+ "headers = {\n",
+ " 'Content-Type': 'application/json'\n",
+ "}\n",
+ "\n",
+ "task = rest_client.post(\"/druid/indexer/v1/task\", json.dumps(spatial_index_spec), headers=headers)\n",
+ "task_id = json.loads(task.text)['task']\n",
+ "wait_for_task(task_id)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6bd176f7-c443-41ec-af7a-1e50d69fae01",
+ "metadata": {},
+ "source": [
+ "## Filter spatial data \n",
+    "To query spatial data with indexed filtering, you must use the native query API.\n",
+ "You can specify spatial filters as follows:\n",
+ "\n",
+ "```\n",
+ "\"filter\": {\n",
+ " \"type\": \"spatial\",\n",
+    "  \"dimension\": <name of the spatial dimension>,\n",
+    "  \"bound\": <bound definition>\n",
+ "}\n",
+ "```\n",
+ "\n",
+ "The following are the types of bounds for spatial filters:\n",
+ "- rectangular: matches any point that falls within the specified rectangle \n",
+ "- radius: matches any point that falls within the circle defined by center coordinates and a radius \n",
+ "- polygon: matches any point within the area of the polygon as defined by a set of points\n",
+ "\n",
+ "\n",
+ "### Query with spatial data\n",
+    "Use the following TopN query to aggregate the minimum and maximum income among the users found in the filtered geographical area, and to count them.\n",
+    "The query is the same across all three examples; each example uses a different spatial filter bound type: `\"rectangular\"`, `\"radius\"`, or `\"polygon\"`.\n",
+ "\n",
+ "### Filter using a rectangular area\n",
+ "The next cell is an example of a rectangular bound.\n",
+ "The two corner points of the rectangle (`\"minCoords\"`, `\"maxCoords\"`) define the filter. Anything that falls inside the rectangle is included in the results:\n",
+ "```\n",
+ "\"filter\": {\n",
+ " \"type\": \"spatial\",\n",
+ " \"dimension\": \"address_spatial\",\n",
+ " \"bound\": {\n",
+ " \"type\": \"rectangular\",\n",
+ " \"minCoords\": [10.0, 20.0],\n",
+ " \"maxCoords\": [30.0, 40.0]\n",
+ " }\n",
+ " }\n",
+ "```\n",
+ "\n",
+ "Run the following cell to test a rectangular filter:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "da9f93f6-f4a5-4673-a9dd-6d305bce2251",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "rectangular_filter_query = {\n",
+ " \"queryType\": \"topN\",\n",
+ " \"dataSource\": {\n",
+ " \"type\": \"table\",\n",
+ " \"name\": \"example-spatial-index\"\n",
+ " },\n",
+ " \"dimension\": {\n",
+ " \"type\": \"default\",\n",
+ " \"dimension\": \"marital_status\",\n",
+ " \"outputName\": \"marital_status\",\n",
+ " \"outputType\": \"STRING\"\n",
+ " },\n",
+ " \"metric\": {\n",
+ " \"type\": \"dimension\",\n",
+ " \"ordering\": {\n",
+ " \"type\": \"lexicographic\"\n",
+ " }\n",
+ " },\n",
+ " \"threshold\": 1001,\n",
+ " \"intervals\": {\n",
+ " \"type\": \"intervals\",\n",
+ " \"intervals\": [\n",
+ " \"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z\"\n",
+ " ]\n",
+ " },\n",
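+    "    # spatial filter with a rectangular bound defined by two corner points\n",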
+ " \"filter\": {\n",
+ " \"type\": \"spatial\",\n",
+ " \"dimension\": \"address_spatial\",\n",
+ " \"bound\": {\n",
+ " \"type\": \"rectangular\",\n",
+ " \"minCoords\": [10.0, 20.0],\n",
+ " \"maxCoords\": [30.0, 40.0]\n",
+ " }\n",
+ " },\n",
+ " \"granularity\": {\n",
+ " \"type\": \"all\"\n",
+ " },\n",
+ " \"aggregations\": [\n",
+ " {\n",
+ " \"type\": \"doubleMin\",\n",
+ " \"name\": \"min_income\",\n",
+ " \"fieldName\": \"income\"\n",
+ " },\n",
+ " {\n",
+ " \"type\": \"doubleMax\",\n",
+ " \"name\": \"max_income\",\n",
+ " \"fieldName\": \"income\"\n",
+ " },\n",
+ " {\n",
+ " \"type\": \"count\",\n",
+ " \"name\": \"user_count\"\n",
+ " }\n",
+ " ],\n",
+ " \"context\": {\n",
+ " \"sqlOuterLimit\": 1001,\n",
+ " \"useNativeQueryExplain\": False\n",
+ " }\n",
+ "}\n",
+ "\n",
+ "headers = {\n",
+ " 'Content-Type': 'application/json'\n",
+ "}\n",
+ "\n",
+ "result = rest_client.post(\"/druid/v2\", json.dumps(rectangular_filter_query), headers=headers)\n",
+ "\n",
+ "json.loads(result.text)[0]['result']\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2a4a8bfc-4d99-45a2-8fcc-5b28c68335c6",
+ "metadata": {},
+ "source": [
+ "### Filter using a radius area\n",
+ "The following example shows a radius bound filter.\n",
+ "A `\"radius\"` bound filter is defined by a center point `\"coords\"` and a `\"radius\"` of the circle. Anything within the circle will be included in the result:\n",
+ "```\n",
+ "\"filter\": {\n",
+ " \"type\": \"spatial\",\n",
+ " \"dimension\": \"address_spatial\",\n",
+ " \"bound\": {\n",
+ " \"type\": \"radius\",\n",
+    "      \"coords\": [-10.0, 20.0],\n",
+    "      \"radius\": 10.0\n",
+ " }\n",
+ " }\n",
+ "```\n",
+ "Run the following cell to test a radius filter:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c6323207-d6c7-4ff5-8b2f-24ed89fe2b86",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "radius_filter_query = {\n",
+ " \"queryType\": \"topN\",\n",
+ " \"dataSource\": {\n",
+ " \"type\": \"table\",\n",
+ " \"name\": \"example-spatial-index\"\n",
+ " },\n",
+ " \"dimension\": {\n",
+ " \"type\": \"default\",\n",
+ " \"dimension\": \"marital_status\",\n",
+ " \"outputName\": \"marital_status\",\n",
+ " \"outputType\": \"STRING\"\n",
+ " },\n",
+ " \"metric\": {\n",
+ " \"type\": \"dimension\",\n",
+ " \"ordering\": {\n",
+ " \"type\": \"lexicographic\"\n",
+ " }\n",
+ " },\n",
+ " \"threshold\": 1001,\n",
+ " \"intervals\": {\n",
+ " \"type\": \"intervals\",\n",
+ " \"intervals\": [\n",
+ " \"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z\"\n",
+ " ]\n",
+ " },\n",
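+    "    # spatial filter with a radius bound: a center point and a radius\n",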
+ " \"filter\": {\n",
+ " \"type\": \"spatial\",\n",
+ " \"dimension\": \"address_spatial\",\n",
+ " \"bound\": {\n",
+ " \"type\": \"radius\",\n",
+ " \"coords\": [-10.0, 20.0],\n",
+ " \"radius\": 10.0\n",
+ " }\n",
+ " },\n",
+ " \"granularity\": {\n",
+ " \"type\": \"all\"\n",
+ " },\n",
+ " \"aggregations\": [\n",
+ " {\n",
+ " \"type\": \"doubleMin\",\n",
+ " \"name\": \"min_income\",\n",
+ " \"fieldName\": \"income\"\n",
+ " },\n",
+ " {\n",
+ " \"type\": \"doubleMax\",\n",
+ " \"name\": \"max_income\",\n",
+ " \"fieldName\": \"income\"\n",
+ " },\n",
+ " {\n",
+ " \"type\": \"count\",\n",
+ " \"name\": \"user_count\"\n",
+ " }\n",
+ " ],\n",
+ " \"context\": {\n",
+ " \"sqlOuterLimit\": 1001,\n",
+ " \"useNativeQueryExplain\": False\n",
+ " }\n",
+ "}\n",
+ "\n",
+ "headers = {\n",
+ " 'Content-Type': 'application/json'\n",
+ "}\n",
+ "\n",
+ "result = rest_client.post(\"/druid/v2\", json.dumps(radius_filter_query), headers=headers)\n",
+ "\n",
+ "json.loads(result.text)[0]['result']\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "88ca6f49-1f53-41ce-8ae1-cb89c8f70cf8",
+ "metadata": {},
+ "source": [
+ "### Filter using a polygon area\n",
+ "The following is an example of a polygon bound.\n",
+    "A polygon bound describes a closed, irregular shape; any point that falls inside the shape is included in the results. The shape is defined by two arrays of coordinates:\n",
+ "- `\"abscissa\"`: horizontal coordinates\n",
+ "- `\"ordinate\"`: vertical coordinates \n",
+ "\n",
+ "This example uses a rough outline of Africa:\n",
+ "\n",
+ "```\n",
+ "\"filter\": {\n",
+ " \"type\": \"spatial\",\n",
+ " \"dimension\": \"address_spatial\",\n",
+ " \"bound\": {\n",
+ " \"type\": \"polygon\",\n",
+ " \"abscissa\": [30.942493, 36.917158, 22.799200, 3.914715, -0.286389, -35.494284, 11.654848, 30.942493 ],\n",
+ " \"ordinate\": [32.434730, 9.758948 ,-16.959803,-8.524121, 7.766978, 19.844799, 51.309644, 32.434730 ]\n",
+ " }\n",
+ " }\n",
+ "```\n",
+ "\n",
+ "Run the following cell to test a polygon filter:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "618bbf5e-0cf3-4ad9-8a04-0452098ab042",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# this polygon filter is roughly the outline of Africa\n",
+ "polygon_filter_query = {\n",
+ " \"queryType\": \"topN\",\n",
+ " \"dataSource\": {\n",
+ " \"type\": \"table\",\n",
+ " \"name\": \"example-spatial-index\"\n",
+ " },\n",
+ " \"dimension\": {\n",
+ " \"type\": \"default\",\n",
+ " \"dimension\": \"marital_status\",\n",
+ " \"outputName\": \"marital_status\",\n",
+ " \"outputType\": \"STRING\"\n",
+ " },\n",
+ " \"metric\": {\n",
+ " \"type\": \"dimension\",\n",
+ " \"ordering\": {\n",
+ " \"type\": \"lexicographic\"\n",
+ " }\n",
+ " },\n",
+ " \"threshold\": 1001,\n",
+ " \"intervals\": {\n",
+ " \"type\": \"intervals\",\n",
+ " \"intervals\": [\n",
+ " \"-146136543-09-08T08:23:32.096Z/146140482-04-24T15:36:27.903Z\"\n",
+ " ]\n",
+ " },\n",
+ " \"filter\": {\n",
+ " \"type\": \"spatial\",\n",
+ " \"dimension\": \"address_spatial\",\n",
+ " \"bound\": {\n",
+ " \"type\": \"polygon\",\n",
+ " \"abscissa\": [30.942493, 36.917158, 22.799200, 3.914715, -0.286389, -35.494284, 11.654848, 30.942493 ],\n",
+ " \"ordinate\": [32.434730, 9.758948 ,-16.959803,-8.524121, 7.766978, 19.844799, 51.309644, 32.434730 ]\n",
+ " }\n",
+ " },\n",
+ " \"granularity\": {\n",
+ " \"type\": \"all\"\n",
+ " },\n",
+ " \"aggregations\": [\n",
+ " {\n",
+ " \"type\": \"doubleMin\",\n",
+ " \"name\": \"min_income\",\n",
+ " \"fieldName\": \"income\"\n",
+ " },\n",
+ " {\n",
+ " \"type\": \"doubleMax\",\n",
+ " \"name\": \"max_income\",\n",
+ " \"fieldName\": \"income\"\n",
+ " },\n",
+ " {\n",
+ " \"type\": \"count\",\n",
+ " \"name\": \"user_count\"\n",
+ " }\n",
+ " ],\n",
+ " \"context\": {\n",
+ " \"sqlOuterLimit\": 1001,\n",
+ " \"useNativeQueryExplain\": False\n",
+ " }\n",
+ "}\n",
+ "\n",
+ "headers = {\n",
+ " 'Content-Type': 'application/json'\n",
+ "}\n",
+ "\n",
+ "result = rest_client.post(\"/druid/v2\", json.dumps(polygon_filter_query), headers=headers)\n",
+ "\n",
+ "json.loads(result.text)[0]['result']\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "98846a26-f46d-47ef-ad4c-e09a096367c6",
+ "metadata": {},
+ "source": [
+ "## Cleanup \n",
+ "The following cell removes the example datasource from Druid."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "08dec76f-c5f8-4787-ad20-c25e069b4a83",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(f\"Drop datasource: [{druid.datasources.drop('example-spatial-index')}]\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6c96687b-6b35-40ed-baec-ef739bdb80b5",
+ "metadata": {},
+ "source": [
+ "## Learn more\n",
+ "\n",
+ "See [Spatial filters](https://druid.apache.org/docs/latest/querying/geo/) for more information.\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "execution": {
+ "allow_errors": true,
+ "timeout": 300
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/03-query/01-groupby.ipynb b/notebooks/03-query/01-groupby.ipynb
index 092f72b0..5de89516 100644
--- a/notebooks/03-query/01-groupby.ipynb
+++ b/notebooks/03-query/01-groupby.ipynb
@@ -89,12 +89,31 @@
"id": "472589e4-1026-4b3b-bb79-eedabb2b44c4",
"metadata": {},
"source": [
+ "\n",
"### Load example data\n",
"\n",
"Once your Druid environment is up and running, ingest the sample data for this tutorial.\n",
"\n",
"Run the following cell to create a table called `example-koalas-groupby`. Notice only columns required for this notebook are ingested from the overall sample dataset.\n",
"\n",
+    "This ingestion is also an example of rollup ingestion.\n",
+    "- It aggregates rows into 15-minute blocks:\n",
+    "  ```\n",
+    "  TIME_FLOOR(TIME_PARSE(\"timestamp\"), 'PT15M') AS \"__time\",\n",
+    "  ```\n",
+    "- It calculates an aggregate column \"latest_session_length\", which stores the most recent value of the \"session_length\" column within the 15-minute block for each group in the results:\n",
+    "  ```\n",
+    "  LATEST_BY(\"session_length\",TIME_PARSE(\"timestamp\")) AS \"latest_session_length\"\n",
+    "  ```\n",
+    "LATEST and EARLIEST aggregations store a data structure in the database that holds both the time of the latest or earliest value and the value itself. This allows the value to be further aggregated across any combination of dimensions at query time.\n",
+ "\n",
+    "Aggregate columns created at ingestion time cannot later be used with the EARLIEST_BY or LATEST_BY functions on an alternate time column, because the column's internal values were built against a different timeline and are not comparable to it.\n",
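+    "\n",
+    "As a sketch of the difference (using the table created below): re-aggregating the ingested column against the table's own timeline works, for example:\n",
+    "```\n",
+    "LATEST(\"latest_session_length\")\n",
+    "```\n",
+    "but a hypothetical call such as `LATEST_BY(\"latest_session_length\", <another timestamp column>)` cannot be used, because the column's stored structure was built against a different timeline.\n",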
+ "\n",
"When completed, you'll see table details."
]
},
@@ -115,15 +134,16 @@
" )\n",
") EXTEND (\"timestamp\" VARCHAR, \"agent_category\" VARCHAR, \"agent_type\" VARCHAR, \"browser\" VARCHAR, \"browser_version\" VARCHAR, \"city\" VARCHAR, \"continent\" VARCHAR, \"country\" VARCHAR, \"version\" VARCHAR, \"event_type\" VARCHAR, \"event_subtype\" VARCHAR, \"loaded_image\" VARCHAR, \"adblock_list\" VARCHAR, \"forwarded_for\" VARCHAR, \"language\" VARCHAR, \"number\" VARCHAR, \"os\" VARCHAR, \"path\" VARCHAR, \"platform\" VARCHAR, \"referrer\" VARCHAR, \"referrer_host\" VARCHAR, \"region\" VARCHAR, \"remote_address\" VARCHAR, \"screen\" VARCHAR, \"session\" VARCHAR, \"session_length\" BIGINT, \"timezone\" VARCHAR, \"timezone_offset\" VARCHAR, \"window\" VARCHAR))\n",
"SELECT\n",
- " TIME_PARSE(\"timestamp\") AS \"__time\",\n",
+ " TIME_FLOOR(TIME_PARSE(\"timestamp\"), 'PT15M') AS \"__time\",\n",
" \"browser\",\n",
" \"city\",\n",
" \"continent\",\n",
" \"country\",\n",
" \"loaded_image\",\n",
" \"os\",\n",
- " \"session_length\"\n",
+ " LATEST_BY(\"session_length\",TIME_PARSE(\"timestamp\")) \"latest_session_length\"\n",
"FROM \"ext\"\n",
+ "GROUP BY 1,2,3,4,5,6,7\n",
"PARTITIONED BY DAY\n",
"'''\n",
"\n",
@@ -187,11 +207,11 @@
"sql='''\n",
"SELECT\n",
" \"loaded_image\",\n",
- " avg(session_length) AS \"timetakenms_average\",\n",
- " max(session_length) AS \"timetakenms_max\",\n",
- " min(session_length) AS \"timetakenms_min\",\n",
- " count(*) AS \"count\",\n",
- " count(DISTINCT \"country\") AS \"countries\"\n",
+ " ROUND(AVG(\"latest_session_length\"),0) AS \"timetakenms_average\",\n",
+ " MAX(\"latest_session_length\") AS \"timetakenms_max\",\n",
+ " MIN(\"latest_session_length\") AS \"timetakenms_min\",\n",
+ " COUNT(*) AS \"count\",\n",
+ " COUNT(DISTINCT \"country\") AS \"countries\"\n",
"FROM \"example-koalas-groupby\"\n",
"WHERE TIME_IN_INTERVAL(\"__time\", '2019-08-25T10:00:00/2019-08-25T18:00:00')\n",
"GROUP BY 1\n",
@@ -279,6 +299,26 @@
"display.sql(sql)"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e68a6db9-d83b-4676-911a-b1716059029c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "SELECT\n",
+ " \"continent\", \n",
+ " EARLIEST(\"latest_session_length\" ) AS \"earliest_session_length\",\n",
+ " LATEST(\"latest_session_length\" ) AS \"latest_session_length\"\n",
+ "FROM \"example-koalas-groupby\"\n",
+ "WHERE TIME_IN_INTERVAL(\"__time\", '2019-08-25T10:00:00/2019-08-25T18:00:00')\n",
+ "GROUP BY 1\n",
+ "'''\n",
+ "\n",
+ "display.sql(sql)"
+ ]
+ },
{
"cell_type": "markdown",
"id": "2dd3c695-b1a4-4c6c-99f7-9ae1b3c6f574",
@@ -385,7 +425,7 @@
"id": "66b08db9-b63f-45cb-9ac2-18d4cf6eedad",
"metadata": {},
"source": [
- "For data vizualisations where time is on the x axis, the `TIME_FLOOR` function is particularly useful.\n",
+ "For data visualizations where time is on the x axis, the `TIME_FLOOR` function is particularly useful.\n",
"\n",
"Run the next cell, which stores the results of a query in a dataframe and then plots them into a line chart for the period."
]
@@ -953,7 +993,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.5"
+ "version": "3.11.6"
}
},
"nbformat": 4,
diff --git a/notebooks/03-query/10-functions-ip.ipynb b/notebooks/03-query/10-functions-ip.ipynb
index d045dfe4..58a57807 100644
--- a/notebooks/03-query/10-functions-ip.ipynb
+++ b/notebooks/03-query/10-functions-ip.ipynb
@@ -35,7 +35,7 @@
"source": [
"## Prerequisites\n",
"\n",
- "This tutorial works with Druid 27.0.0 or later.\n",
+ "This tutorial works with Druid 29.0.0 or later.\n",
"\n",
"#### Run with Docker\n",
"\n",
@@ -132,6 +132,8 @@
"source": [
"## Filtering query results against subnets\n",
"\n",
+ "### IPV4_MATCH\n",
+ "\n",
"The `IPV4_MATCH` function allows for filtering of datasets and of aggregations according to the subnet of an IP address.\n",
"\n",
"Run the following query to see a count of the number of sessions in the data for a specific CIDR."
@@ -147,7 +149,7 @@
"sql='''\n",
"SELECT COUNT(DISTINCT \"session\") AS \"sessions\"\n",
"FROM \"example-koalas-ip\"\n",
- "WHERE IPV4_MATCH(\"forwarded_for\",'68.0.0.0/8')\n",
+ "WHERE IPV4_MATCH(\"forwarded_for\",'68.0.0.0/8') \n",
"AND TIME_IN_INTERVAL(\"__time\",'2019-08-25/PT1H')\n",
"'''\n",
"\n",
@@ -185,6 +187,82 @@
"display.sql(sql)"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "d23b9a99-cb5e-4b38-b4f9-07aec117ba5e",
+ "metadata": {},
+ "source": [
+ "\n",
+    "### IPV6_MATCH\n",
+    "IPv6 utilization is on the rise. Druid 29.0.0 introduces this function, which matches an IPv6 address against a subnet.\n",
+    "\n",
+    "The following example loads a few IPv6 addresses and demonstrates the new matching functionality.\n",
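+    "\n",
+    "As a quick sketch of the call shape (the same pattern appears in the query cells below), the function takes an address expression and a subnet in CIDR notation:\n",
+    "```\n",
+    "IPV6_MATCH(\"ipv6_address\", '3ffe:1900:4545:0003:0200:f8ff::/64')\n",
+    "```"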
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5b393b4d-0784-4250-bdad-09508b13fb5f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "REPLACE INTO \"example-ipv6\" OVERWRITE ALL\n",
+ "WITH \"ext\" AS (\n",
+ " SELECT *\n",
+ " FROM TABLE(\n",
+ " EXTERN(\n",
+ " '{\"type\":\"inline\",\"data\":\"{ \\\\\"time\\\\\":\\\\\"2024-01-01T10:03:00\\\\\", \\\\\"ipv6_address\\\\\":\\\\\"3ffe:1900:4545:0003:0200:f8ff:fa10:67af\\\\\"}\\\\n{ \\\\\"time\\\\\":\\\\\"2024-01-01T10:03:01\\\\\", \\\\\"ipv6_address\\\\\":\\\\\"3ae7:ab97:4400:ab07:7f34:6702:6502:0001\\\\\"}\\\\n{ \\\\\"time\\\\\":\\\\\"2024-01-01T10:03:01\\\\\", \\\\\"ipv6_address\\\\\":\\\\\"3ae7:ab97:4400:ab07:7f34:6702:abcd:1234\\\\\"}\\\\n{ \\\\\"time\\\\\":\\\\\"2024-01-01T10:03:00\\\\\", \\\\\"ipv6_address\\\\\":\\\\\"3ffe:1900:4545:0003:0200:f8ff:af77:fe5a\\\\\"}\"}',\n",
+ " '{\"type\":\"json\"}'\n",
+ " )\n",
+ " ) EXTEND (\"time\" VARCHAR, \"ipv6_address\" VARCHAR)\n",
+ ")\n",
+ "SELECT\n",
+ " TIME_PARSE(\"time\") AS \"__time\",\n",
+ " \"ipv6_address\"\n",
+ "FROM \"ext\"\n",
+ "PARTITIONED BY DAY\n",
+ "'''\n",
+ "\n",
+ "display.run_task(sql)\n",
+ "sql_client.wait_until_ready('example-ipv6')\n",
+ "display.table('example-ipv6')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e3248394-98b1-4554-ab05-799e4a16ecc6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# filter for subnet 3ffe:1900:4545:0003:0200:f8ff::/64\n",
+ "sql='''\n",
+ "SELECT *\n",
+ "FROM \"example-ipv6\"\n",
+ "WHERE IPV6_MATCH(\"ipv6_address\",'3ffe:1900:4545:0003:0200:f8ff::/64')\n",
+ "'''\n",
+ "\n",
+ "display.sql(sql)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fcb88c1a-cbec-4a8e-875f-d5bd42239560",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# filter for subnet 3ae7:ab97:4400:ab07:7f34:6702::/64\n",
+ "sql='''\n",
+ "SELECT *\n",
+ "FROM \"example-ipv6\"\n",
+ "WHERE IPV6_MATCH(\"ipv6_address\",'3ae7:ab97:4400:ab07:7f34:6702::/64')\n",
+ "'''\n",
+ "\n",
+ "display.sql(sql)"
+ ]
+ },
{
"cell_type": "markdown",
"id": "681d174f-ea3e-4a92-8cb4-ceca45d95b17",
@@ -223,7 +301,7 @@
"id": "41f97b81-ff82-4941-bc15-90e2a1b17c13",
"metadata": {},
"source": [
- "Use these functions, or the native (JSON) equivallents, at ingestion time to enrich data in the table ahead of time.\n",
+ "Use these functions, or the native (JSON) equivalents, at ingestion time to enrich data in the table ahead of time.\n",
"\n",
"An important technique, especially at scale, is to use Apache Datasketches for high-performance, approximate operations on network data.\n",
"\n",
@@ -319,7 +397,8 @@
"outputs": [],
"source": [
"druid.datasources.drop(\"example-koalas-ip\")\n",
- "druid.datasources.drop(\"example-koalas-ip-rollup\")"
+ "druid.datasources.drop(\"example-koalas-ip-rollup\")\n",
+ "druid.datasources.drop(\"example-ipv6\")"
]
},
{
@@ -360,7 +439,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.5"
+ "version": "3.11.6"
}
},
"nbformat": 4,
diff --git a/notebooks/03-query/11-joins.ipynb b/notebooks/03-query/11-joins.ipynb
index 5c7af62a..f5298379 100644
--- a/notebooks/03-query/11-joins.ipynb
+++ b/notebooks/03-query/11-joins.ipynb
@@ -41,7 +41,7 @@
"source": [
"## Prerequisites\n",
"\n",
- "This tutorial works with Druid 27.0.0 or later.\n",
+ "This tutorial works with Druid 29.0.0 or later.\n",
"\n",
"#### Run with Docker\n",
"\n",
@@ -207,7 +207,7 @@
" \"name\": \"users\",\n",
" \"target\": { \"type\": \"file\", \"path\":\"users.json\" },\n",
" \"config_file\": \"clickstream/users_init.json\", \n",
- " \"concurrency\":4000,\n",
+ " \"concurrency\":100,\n",
" \"total_events\":4000 \n",
"}\n",
"datagen.post(\"/start\", json.dumps(datagen_request), headers=headers)"
@@ -881,6 +881,83 @@
"If the data being joined changes over time and you wish to query with the latest values, consider a lookup. If the data does not change or the change is not desirable, then pre-join the data. The example of `age_group` as the joined property works well in the pre-join strategy because the user's age_group is important at the time they clicked, not when looking at old click rows based on their current age_group."
]
},
+ {
+ "cell_type": "markdown",
+ "id": "92e6116f-3632-4b1f-817f-d656f598758e",
+ "metadata": {},
+ "source": [
+ "\n",
+ "### INNER JOIN with arbitrary conditions\n",
+ "\n",
+    "Before the 29.0.0 release, all join conditions had to be equalities. With this new functionality, inequalities can also be used. The query optimization rules continue to apply: the equality portion of the join is processed as described above, and the inequalities are applied after the join completes.\n",
+ "\n",
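+    "A minimal sketch of the shape (hypothetical table and column names):\n",
+    "```\n",
+    "SELECT ...\n",
+    "FROM t1\n",
+    "INNER JOIN t2\n",
+    "  ON t1.key = t2.key            -- equality: used by the join strategy\n",
+    "  AND t1.value > t2.threshold   -- inequality: applied after the join\n",
+    "```\n",
+    "\n",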
+    "In this first example, the query finds activity within a limited geographical area by filtering on the range of latitudes where the user is located:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f29569b0-b9a3-4672-866a-af75ae47edc9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql = ''' \n",
+ "SELECT TIME_FLOOR(c.__time, 'P1D') as _date, \n",
+ " round( TIMESTAMPDIFF(YEAR, TIME_PARSE(u.dob), CURRENT_TIMESTAMP),-1) as age_group, \n",
+ " count(distinct c.client_ip) as distinct_ips\n",
+ "FROM clicks c\n",
+ " INNER JOIN users u\n",
+ " ON c.user_id=u.user_id AND u.address_lat > -1 AND u.address_lat < 1\n",
+ "GROUP BY 1, round( TIMESTAMPDIFF(YEAR, TIME_PARSE(u.dob), CURRENT_TIMESTAMP),-1) \n",
+ "ORDER BY 3 DESC\n",
+ "LIMIT 10\n",
+ "'''\n",
+ "\n",
+ "display.sql(sql)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1352b491-1795-44dd-91a3-ff75de61f24f",
+ "metadata": {},
+ "source": [
+    "This second example looks at clicks from the same user during the same hour but from different countries, to find users with potentially shared or breached accounts.\n",
+    "This requires a more complex join condition when self-joining the clicks data:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6d77125a-2b77-4bb9-9ec4-684ce29f8e2e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql = ''' \n",
+ "WITH clicks_by_country AS\n",
+ "( \n",
+ "SELECT TIME_FLOOR(__time, 'PT1H') \"hour\", \"user_id\", \"client_country\", count(*) as \"click_count\"\n",
+ " FROM clicks_enhanced \n",
+ " WHERE __time > CURRENT_TIMESTAMP - INTERVAL '1' DAY \n",
+ " GROUP BY 1,2,3\n",
+ ")\n",
+ "SELECT c1.\"hour\", c1.\"user_id\", \n",
+ " c1.\"client_country\" as \"first_country\", \n",
+ " c2.\"client_country\" as \"second_country\",\n",
+ " c1.\"click_count\" as \"first_country_clicks\",\n",
+ " c2.\"click_count\" as \"second_country_clicks\"\n",
+ "FROM \n",
+ " clicks_by_country c1\n",
+ "INNER JOIN\n",
+ " clicks_by_country c2\n",
+ " ON c1.\"hour\" = c2.\"hour\" AND c1.\"user_id\"=c2.\"user_id\" \n",
+ " AND c1.\"client_country\" <> c2.\"client_country\"\n",
+ " AND c1.\"click_count\" > c2.\"click_count\"\n",
+ "LIMIT 10\n",
+ "'''\n",
+ "\n",
+ "display.sql(sql)\n"
+ ]
+ },
{
"cell_type": "markdown",
"id": "46e2e6f6-86e1-48dd-a125-6c1920173d74",
diff --git a/notebooks/03-query/14-full-timeline-queries.ipynb b/notebooks/03-query/14-sync-async-queries.ipynb
similarity index 92%
rename from notebooks/03-query/14-full-timeline-queries.ipynb
rename to notebooks/03-query/14-sync-async-queries.ipynb
index 3f0cbffb..93289eaa 100644
--- a/notebooks/03-query/14-full-timeline-queries.ipynb
+++ b/notebooks/03-query/14-sync-async-queries.ipynb
@@ -47,7 +47,7 @@
"source": [
"## Prerequisites\n",
"\n",
- "This tutorial works with Druid 28.0.0 or later.\n",
+ "This tutorial works with Druid 29.0.0 or later.\n",
"\n",
"#### Run with Docker\n",
"\n",
@@ -376,7 +376,7 @@
"### Nothing new so far\n",
"\n",
"It might take it a minute or two to start making the real-time data available. \n",
- "There are 3 months of data in the table and the next query shows when it has caught up to almost now. Run it a few times until you see the full 90+ days in the data and the most recent event within a minute. Events are being generated irregularly, but there should be at a new one every minute. "
+ "There are 3 months of data in the table and the next query shows when it has caught up to almost now. Run it a few times until you see the full 90+ days in the data and the most recent event within a minute. Events are being generated irregularly, but there should be at least one every minute. "
]
},
{
@@ -555,9 +555,9 @@
" TIMESTAMPDIFF( SECOND, max(__time), CURRENT_TIMESTAMP ) \"recent_event_seconds_ago\" \n",
" FROM \"example-clicks-full-timeline\"\n",
"'''\n",
- "response = sql_client.async_sql(sql)\n",
- "response.wait_until_done()\n",
- "display(response.rows)\n"
+ "result = sql_client.async_sql(sql)\n",
+ "result.wait_until_done() # wait until the query has completed\n",
+ "display(result.rows)\n"
]
},
{
@@ -587,9 +587,9 @@
"'''\n",
"req = sql_client.sql_request(sql)\n",
"req.add_context(\"includeSegmentSource\", \"realtime\")\n",
- "response = sql_client.async_sql(req)\n",
- "response.wait_until_done()\n",
- "display(response.rows)\n"
+ "result = sql_client.async_sql(req)\n",
+ "result.wait_until_done() # wait until the query has completed\n",
+ "display(result.rows)\n"
]
},
{
@@ -600,6 +600,49 @@
"The result from the query covers the full timeline of data available, the ingested data available in Deep Storage plus the latest data being streamed into the table continuously."
]
},
+ {
+ "cell_type": "markdown",
+ "id": "21a32041-e9cb-4b14-a149-b63ccd4c0386",
+ "metadata": {},
+ "source": [
+ "\n",
+ "## Retrieving results page by page \n",
+ "\n",
+ "Use the `rowsPerPage` query context parameter to control the size of the results page.\n",
+ "In the cell below, the page size is set by providing a parameter in the call to the Python API."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "37f5a0a6-db48-45c5-9aa6-476ea9ab0d5b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "SELECT * \n",
+ " FROM \"example-clicks-full-timeline\"\n",
+ " WHERE __time > CURRENT_TIMESTAMP - INTERVAL '2' HOUR\n",
+ " ORDER BY __time DESC\n",
+ "'''\n",
+ "req = sql_client.sql_request(sql)\n",
+ "req.add_context(\"includeSegmentSource\", \"realtime\")\n",
+ "req.add_context(\"selectDestination\", \"durableStorage\") # needed to enable paging results\n",
+ "\n",
+ "# use rowsPerPage parameter to define page size, defaults to 100000\n",
+ "result = sql_client.async_sql(req, rowsPerPage=10)\n",
+ "\n",
+ "# wait for query to be processed\n",
+ "result.wait_until_done()\n",
+ "\n",
+ "# retrieve results one page at a time\n",
+ "print(\"\\nPAGE #0\\n\")\n",
+ "display(result.paged_rows(pageNum=0))\n",
+ "\n",
+ "print(\"\\nPAGE #1\\n\")\n",
+ "display(result.paged_rows(pageNum=1))"
+ ]
+ },
{
"cell_type": "markdown",
"id": "44738d6d-cec2-40ad-aaba-998c758c63f4",
@@ -636,8 +679,17 @@
"You learned about setting up retention rules for different periods to:\n",
"* cache recent segments in historical tier\n",
"* keep older segments available for async queries from deep storage\n",
- "* use async queries that also retrieve real-time data"
+ "* use async queries that also retrieve real-time data\n",
+    "* retrieve results a page at a time"
]
}
],
"metadata": {
diff --git a/notebooks/03-query/15-pivot-unpivot.ipynb b/notebooks/03-query/15-pivot-unpivot.ipynb
new file mode 100644
index 00000000..ac213382
--- /dev/null
+++ b/notebooks/03-query/15-pivot-unpivot.ipynb
@@ -0,0 +1,441 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "0cb3b009-ebde-4d56-9d59-a028d66d8309",
+ "metadata": {},
+ "source": [
+ "# PIVOT and UNPIVOT\n",
+ "\n",
+ "\n",
+ "\n",
+ "This tutorial demonstrates how to work with [PIVOT](https://druid.apache.org/docs/latest/querying/sql#pivot) and [UNPIVOT](https://druid.apache.org/docs/latest/querying/sql#unpivot) SQL operators. \n",
+ "\n",
+ "Note: PIVOT and UNPIVOT are [experimental features](https://druid.apache.org/docs/latest/development/experimental)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bbdbf6ad-ca7b-40f5-8ca3-1070f4a3ee42",
+ "metadata": {},
+ "source": [
+ "## Prerequisites\n",
+ "\n",
+ "This tutorial works with Druid 29.0.0 or later.\n",
+ "\n",
+ "#### Run with Docker\n",
+ "\n",
+ "Launch this tutorial and all prerequisites using the `druid-jupyter` profile of the Docker Compose file for Jupyter-based Druid tutorials. For more information, see the Learn Druid repository [readme](https://github.com/implydata/learn-druid).\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5007a243-b81a-4601-8f57-5b14940abbff",
+ "metadata": {},
+ "source": [
+ "## Initialization\n",
+ "\n",
+ "Run the next cell to set up the Druid Python client's connection to Apache Druid.\n",
+ "\n",
+ "If successful, the Druid version number will be shown in the output."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c1ec783b-df3f-4168-9be2-cdc6ad3e33c2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import druidapi\n",
+ "import os\n",
+ "\n",
+ "if 'DRUID_HOST' not in os.environ.keys():\n",
+ " druid_host=f\"http://localhost:8888\"\n",
+ "else:\n",
+ " druid_host=f\"http://{os.environ['DRUID_HOST']}:8888\"\n",
+ "\n",
+ "print(f\"Opening a connection to {druid_host}.\")\n",
+ "druid = druidapi.jupyter_client(druid_host)\n",
+ "\n",
+ "display_client = druid.display\n",
+ "sql_client = druid.sql\n",
+ "status_client = druid.status\n",
+ "\n",
+ "status_client.version"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "645f0209-436a-47e2-a1f4-be6c43de0eac",
+ "metadata": {},
+ "source": [
+ "## Load example data\n",
+ "\n",
+ "Once your Druid environment is up and running, ingest the sample data for this tutorial.\n",
+ "\n",
+ "Run the following cell to create a table called \"example-wiki-pivot-unpivot\". When completed, you'll see a description of the final table."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "032ee776-1eb9-46dc-989a-933bd7c7d1da",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "REPLACE INTO \"example-wiki-pivot-unpivot\" OVERWRITE ALL\n",
+ "WITH \"ext\" AS (\n",
+ " SELECT *\n",
+ " FROM TABLE(\n",
+ " EXTERN(\n",
+ " '{\"type\":\"http\",\"uris\":[\"https://druid.apache.org/data/wikipedia.json.gz\"]}',\n",
+ " '{\"type\":\"json\"}'\n",
+ " )\n",
+ " ) EXTEND (\"isRobot\" VARCHAR, \"channel\" VARCHAR, \"timestamp\" VARCHAR, \"flags\" VARCHAR, \"isUnpatrolled\" VARCHAR, \"page\" VARCHAR, \"diffUrl\" VARCHAR, \"added\" BIGINT, \"comment\" VARCHAR, \"commentLength\" BIGINT, \"isNew\" VARCHAR, \"isMinor\" VARCHAR, \"delta\" BIGINT, \"isAnonymous\" VARCHAR, \"user\" VARCHAR, \"deltaBucket\" BIGINT, \"deleted\" BIGINT, \"namespace\" VARCHAR, \"cityName\" VARCHAR, \"countryName\" VARCHAR, \"regionIsoCode\" VARCHAR, \"metroCode\" BIGINT, \"countryIsoCode\" VARCHAR, \"regionName\" VARCHAR)\n",
+ ")\n",
+ "SELECT\n",
+ " TIME_PARSE(\"timestamp\") AS \"__time\",\n",
+ " \"isRobot\",\n",
+ " \"channel\",\n",
+ " \"added\",\n",
+ " \"user\",\n",
+ " \"deleted\"\n",
+ "FROM \"ext\"\n",
+ "PARTITIONED BY DAY\n",
+ "'''\n",
+ "display_client.run_task(sql)\n",
+ "sql_client.wait_until_ready('example-wiki-pivot-unpivot')\n",
+ "display_client.table('example-wiki-pivot-unpivot')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "36ad757a-957e-465a-a284-855c14e3e8fd",
+ "metadata": {},
+ "source": [
+ "## PIVOT\n",
+ "\n",
+ "PIVOT is a SQL operator that reduces rows in a result set by converting rows into columns.\n",
+    "```\n",
+    "SELECT ...\n",
+    "FROM <table>\n",
+    "\n",
+    "PIVOT (\n",
+    "    <aggregation expressions>\n",
+    "\n",
+    "    FOR <pivot column> IN (<list of values>)\n",
+    ")\n",
+    "```\n",
+    "- `<aggregation expressions>` - a list of aggregate expressions; for example: SUM(added) as added, SUM(deleted) as deleted\n",
+    "- `<pivot column>` - the column whose values are being turned into new columns\n",
+    "- `<list of values>` - the values to use, in the form `<value> as <alias>, ...`; for example: 'true' as robot, 'false' as human\n",
+    "\n",
+    "The operation produces columns named `<value alias>_<aggregation alias>` for each combination of aggregation and value that is specified; for example, robot_added.\n",
+ "\n",
+ "Try this out. \n",
+ "The next cell runs without PIVOT: "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0b36cb33-1e8a-4284-966c-a3937c8a33a4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ " SELECT\n",
+ " \"channel\", \"isRobot\", \"added\", \"deleted\"\n",
+ " FROM \"example-wiki-pivot-unpivot\" \n",
+ " WHERE TIME_IN_INTERVAL(__time, '2016-06-27/P1D')\n",
+ " LIMIT 10\n",
+ "'''\n",
+ "display_client.sql(sql)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "df1ac2ec-93d8-4cb4-bce1-5dabd72c7dbc",
+ "metadata": {},
+ "source": [
+ "The next cell uses PIVOT with the same information as above to demonstrate how PIVOT reorganizes it:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e4a63419-a824-405c-90c2-524611c4024b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ " SELECT\n",
+ " \"channel\", \n",
+ " \"robot_added\", \n",
+ " \"human_added\", \n",
+ " \"robot_deleted\", \n",
+ " \"human_deleted\"\n",
+ " FROM \"example-wiki-pivot-unpivot\" \n",
+ " \n",
+ " PIVOT ( SUM(added) as added, \n",
+ " SUM(deleted) as deleted \n",
+ " \n",
+ " FOR \"isRobot\" IN ('true' as robot, 'false' as human)\n",
+ " )\n",
+ " WHERE TIME_IN_INTERVAL(__time, '2016-06-27/P1D')\n",
+ " LIMIT 10\n",
+ "'''\n",
+ "display_client.sql(sql)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0fc3a7ed-255e-49a5-ab0c-964e4fd38885",
+ "metadata": {},
+ "source": [
+ "Notice that the PIVOT operation:\n",
+ "- moved the values of \"isRobot\"='true' into two columns called \"robot_added\" and \"robot_deleted\"\n",
+ "- moved the values of \"isRobot\"='false' into two columns called \"human_added\" and \"human_deleted\"\n",
+ "\n",
+    "Also, notice the NULL values in rows where the pivot column does not hold the value assigned to that column name. For example, \"robot_added\" only has values in rows where \"isRobot\" is 'true'."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "841a8fc2-2efa-4327-b35c-e64a8fa380d3",
+ "metadata": {},
+ "source": [
+ "## Robotic updates to wikipedia by channel using normal aggregation\n",
+ "\n",
+    "The following example demonstrates PIVOT with aggregation.\n",
+ "\n",
+ "To illustrate, first determine which channels receive the most robotic changes by running a normal aggregation query:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "79bac420-4ee7-4f77-abdf-be651e5752ef",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "SELECT\n",
+ " \"channel\",\n",
+ " \"isRobot\",\n",
+ " SUM(added) total_added,\n",
+ " SUM(deleted) total_deleted\n",
+ "FROM \"example-wiki-pivot-unpivot\" \n",
+ "WHERE TIME_IN_INTERVAL(__time, '2016-06-27/P1D')\n",
+ "GROUP BY 1, 2\n",
+ "'''\n",
+ "display_client.sql(sql)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a8642f39-ad04-45b3-9120-b622a0f7c8a7",
+ "metadata": {},
+ "source": [
+ "## Transform rows with PIVOT \n",
+ "The query above produces a lengthy output, making it hard to discern which channels have the most updates while still seeing the distinction between additions and deletions to the wikipedia pages.\n",
+    "You can use the PIVOT operator to transform rows into columns for the distinct values of the pivot column.\n",
+    "PIVOT reorganizes the result into fewer rows and more columns while keeping all the detailed values.\n",
+ "\n",
+ "The following query uses aggregation on top of the pivoted columns in order to merge results into one row per channel.\n",
+ "\n",
+ "Run the following cell to calculate totals of human and robotic additions and deletions by channel with a ratio of human to total changes. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "65c489a8-a7f2-4cc0-b30c-f30d8ed32be2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "SELECT \"channel\", \n",
+ " SUM(\"robot_added\") AS \"added_robot\", \n",
+ " SUM(\"robot_deleted\") AS \"deleted_robot\", \n",
+ " SUM(\"human_added\") AS \"added_human\",\n",
+ " SUM(\"human_deleted\") AS \"deleted_human\", \n",
+ " \n",
+ " SAFE_DIVIDE( SUM(\"human_deleted\" + \"human_added\") * 1.0 , \n",
+ " SUM( \"robot_deleted\" + \"robot_added\" + \"human_deleted\" + \"human_added\" )\n",
+ " \n",
+ " ) AS \"human_ratio\"\n",
+ " \n",
+ "FROM\n",
+ "(\n",
+ " SELECT\n",
+ " \"channel\", \n",
+ " COALESCE(\"robot_added\",0) AS \"robot_added\", \n",
+ " COALESCE(\"human_added\",0) AS \"human_added\", \n",
+ " COALESCE(\"robot_deleted\",0) AS \"robot_deleted\", \n",
+ " COALESCE(\"human_deleted\",0) AS \"human_deleted\"\n",
+ " FROM \"example-wiki-pivot-unpivot\" \n",
+ " \n",
+ " PIVOT ( SUM(added) as added, \n",
+ " SUM(deleted) as deleted \n",
+ " \n",
+ " FOR \"isRobot\" IN ('true' as robot, 'false' as human)\n",
+ " )\n",
+ " WHERE TIME_IN_INTERVAL(__time, '2016-06-27/P1D')\n",
+ ")x\n",
+ "GROUP BY 1\n",
+ "ORDER BY 6\n",
+ "LIMIT 20\n",
+ "'''\n",
+ "display_client.sql(sql)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d198f50b-f10b-4304-8358-ed6bcc02627a",
+ "metadata": {},
+ "source": [
+    "The result shows the 20 channels with the lowest \"human_ratio\"; they are the ones with the highest proportion of robot updates.\n",
+    "\n",
+    "As shown a few cells above, the values produced by PIVOT can be NULL. The query calculates a ratio of human to total changes for each channel and sorts on this ratio, so the channels with the highest proportion of robot updates are listed first. To calculate this metric even in the presence of NULLs, the query applies COALESCE to all the pivoted metrics.\n",
+ "\n",
+ "The result is much cleaner than the prior query, and it is easy to determine which channels have the most robotic activity.\n",
+ "\n",
+ "Notice the use of SAFE_DIVIDE in the \"human_ratio\" calculation, which guards for division by zero for cases with no updates. See [SAFE_DIVIDE](https://druid.apache.org/docs/latest/querying/sql-functions#safe_divide) for more information.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3f0af833-9508-4f26-a484-2778ca857a2a",
+ "metadata": {},
+ "source": [
+ "## Transform columns with UNPIVOT \n",
+ "UNPIVOT does the opposite of PIVOT. UNPIVOT turns columns into rows by using the names of the columns being removed as values for a new column called the \"names column\".\n",
+ "The values of the removed columns are combined into a single column called the \"values column\".\n",
+ "\n",
+    "Given the prior query, you can investigate the sources of the updates in the most robotically updated channels.\n",
+    "The following cell uses the UNPIVOT operation to do just that.\n",
+    "\n",
+    "Run the following cell to find the most active users, as measured by total adding or deleting activity, within the channel `'#it.wikipedia'`, which has a human ratio of about 0.50 (roughly half robot activity). The SQL statement uses the following `UNPIVOT` operator:\n",
+ "```\n",
+ " UNPIVOT ( \"changes\" FOR \"action\" IN (\"added\", \"deleted\") )\n",
+ "```\n",
+ "- takes the values of multiple columns: `\"added\",\"deleted\"`\n",
+ "- incorporates them into a single column called the \"values column\": `\"changes\"`\n",
+    "- records the names of the removed columns as values in the \"names column\": `\"action\"`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f0159df8-6ca0-45af-9c68-6acd92defc4f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sql='''\n",
+ "SELECT \"user\",\n",
+ " \"action\",\n",
+ " SUM(\"changes\") \"total_changes\"\n",
+ "FROM \"example-wiki-pivot-unpivot\"\n",
+ "\n",
+ "UNPIVOT ( \"changes\" FOR \"action\" IN (\"added\", \"deleted\") )\n",
+ "\n",
+ "WHERE TIME_IN_INTERVAL(__time, '2016-06-27/P1D')\n",
+ " AND \"channel\"='#it.wikipedia' \n",
+ " AND \"isRobot\"='true'\n",
+ "GROUP BY 1,2\n",
+ "ORDER BY 3 DESC \n",
+ "LIMIT 10\n",
+ "'''\n",
+ "display_client.sql(sql)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3e7f3f5d-bd0c-4957-8d62-94fb01def699",
+ "metadata": {},
+ "source": [
+    "The result provides the list of users that did the most additions or deletions in the channel `'#it.wikipedia'` and are identified as `isRobot='true'`.\n",
+    "\n",
+    "It has merged the values of the \"added\" and \"deleted\" columns into the \"changes\" column, which is summed into \"total_changes\".\n",
+    "The names of the original columns \"added\" and \"deleted\" are now values in the \"action\" column, so you can still see the detail.\n",
+    "By using UNPIVOT in this fashion, you can sort on a user's largest total change, whether additions or deletions, and easily find the robot users that most affect the channel.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "44738d6d-cec2-40ad-aaba-998c758c63f4",
+ "metadata": {},
+ "source": [
+ "## Clean up\n",
+ "\n",
+ "Run the following cell to remove the table created for this notebook from the database."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8082b545-ba7f-4ede-bb6e-2a6dd62ba0d8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "druid.datasources.drop(\"example-wiki-pivot-unpivot\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "54b8d5fe-ba85-4b5b-9669-0dd47dfbccd1",
+ "metadata": {},
+ "source": [
+ "## Summary\n",
+ "\n",
+ "PIVOT converts row values into columns with aggregate results.\n",
+ "\n",
+ "UNPIVOT converts columns into rows by merging the values from multiple columns into a single column.\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "execution": {
+ "allow_errors": true,
+ "timeout": 300
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.6"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/README.md b/notebooks/README.md
index 59acfa30..1fa82fad 100644
--- a/notebooks/README.md
+++ b/notebooks/README.md
@@ -8,9 +8,7 @@ If you have suggestions or comments on any notebooks, remember to call into the
### Releases
-You can get quick access to new and updated notebooks following different releases of Apache Druid in the `00-releases` folder.
-
-* [Releases index](00-releases)
+You can get quick access to new and updated notebooks following different releases of Apache Druid in the [New Features by Release](#features_by_release) index below.
### Introduction
@@ -40,7 +38,7 @@ Visit the `02-ingestion` folder for notebooks focused on using JSON- and SQL-bas
|[Multi-topic Kafka ingestion](./02-ingestion/11-stream-from-multiple-topics.ipynb)|A walk through of automatic topic detection for streaming ingestion.|`all-services`|
|[SQL-compatible NULL](./02-ingestion/09-generating-and-working-with-nulls.ipynb)|Apache Druid now implements standard SQL NULL handling. This notebook walks through working with NULLs during ingestion and query.|`druid-jupyter`|
|[ARRAYS and UNNEST](./02-ingestion/08-table-datatypes-arrays.ipynb)|Ingesting, creating, and manipulating ARRAYs and the UNNEST operator.|`druid-jupyter`|
-
+|[Ingest and query spatial dimensions](./02-ingestion/12-spatial-dimensions.ipynb)|Ingest spatial dimensions and use rectangular, circular, and polygon filters to query.|`druid-jupyter`|
### Querying data
@@ -63,3 +61,19 @@ For tutorials focused on effective use of all manner of `SELECT` statements in A
|[Using joins effectively in Druid](./03-query/11-joins.ipynb)|A full review of all join strategies available in Druid with examples and performance comparisons.|`druid-jupyter`|
|[Window functions (Experimental)](./03-query/13-query-functions-window.ipynb)|An introduction to Window functions which are a new experimental feature in Druid SQL.|`druid-jupyter`|
-|[Query from Deep Storage](./03-query/14-full-timeline-queries.ipynb)|Query from Deep Storage has been enhanced to also view real-time segments making it capable of spanning the whole timeline.|`all-services`|
+|[Query from Deep Storage](./03-query/14-sync-async-queries.ipynb)|Query from Deep Storage has been enhanced to also view real-time segments making it capable of spanning the whole timeline.|`all-services`|
+|[PIVOT and UNPIVOT functions](./03-query/15-pivot-unpivot.ipynb)|Use PIVOT to convert row values into columns. Use UNPIVOT to convert column values into rows.|`druid-jupyter`|
+
+
+
+### New Features by Release
+
+#### Druid 29.0.0
+* [Ingestion using System Fields](./02-ingestion/02-batch-ingestion.ipynb#system_fields)
+* [IPv6 support for filtering subnets](./03-query/10-functions-ip.ipynb#ipv6_match)
+* [Control rows per page when retrieving async results](./03-query/14-sync-async-queries.ipynb#async_rows_per_page)
+* [INNER JOIN with inequalities](./03-query/11-joins.ipynb#join_with_inequality)
+* [Expressions for path parameter in JSON functions](./02-ingestion/05-working-with-nested-columns.ipynb#expression_for_path)
+* [PIVOT and UNPIVOT functions](./03-query/15-pivot-unpivot.ipynb)
+* [UNNESTing arrays of objects](./02-ingestion/08-table-datatypes-arrays.ipynb#json_array_of_objects)
+* [Ingest primitive arrays from input source](./02-ingestion/08-table-datatypes-arrays.ipynb#ingest_array)
+* [LATEST/EARLIEST rollup in MSQ](./03-query/01-groupby.ipynb#groupby)
diff --git a/tests/test-notebooks-local-jupyter-build.sh b/tests/test-notebooks-local-jupyter-build.sh
index 0115c113..387c7323 100755
--- a/tests/test-notebooks-local-jupyter-build.sh
+++ b/tests/test-notebooks-local-jupyter-build.sh
@@ -58,7 +58,7 @@ retry 'curl http://localhost:8888/status' 50 2
#echo "Waiting for Data Generator readiness..."
retry 'curl http://localhost:9999/jobs' 50 2
-docker exec -it jupyter pytest --nbmake $TEST_PATH
+docker exec -it jupyter pytest --nbmake $TEST_PATH --nbmake-timeout=1200
# run it a second time to test re-runnability of the notebooks
-docker exec -it jupyter pytest --nbmake $TEST_PATH
+docker exec -it jupyter pytest --nbmake $TEST_PATH --nbmake-timeout=1200
docker compose -f ../docker-compose-local.yaml --profile all-services down -v
diff --git a/tests/test-notebooks.sh b/tests/test-notebooks.sh
index bc0c22ec..8f98151c 100755
--- a/tests/test-notebooks.sh
+++ b/tests/test-notebooks.sh
@@ -58,7 +58,7 @@ retry 'curl http://localhost:8888/status' 50 2
#echo "Waiting for Data Generator readiness..."
retry 'curl http://localhost:9999/jobs' 50 2
-docker exec -it jupyter pytest --nbmake $TEST_PATH
+docker exec -it jupyter pytest --nbmake $TEST_PATH --nbmake-timeout=1200
# run it a second time to test re-runnability of the notebooks
-docker exec -it jupyter pytest --nbmake $TEST_PATH
-docker compose --profile all-services down -v
\ No newline at end of file
+docker exec -it jupyter pytest --nbmake $TEST_PATH --nbmake-timeout=1200
+docker compose --profile all-services down -v