diff --git a/docs/integrations/data-ingestion/aws-glue/index.md b/docs/integrations/data-ingestion/aws-glue/index.md
index f3774440d4d..25da911ef3c 100644
--- a/docs/integrations/data-ingestion/aws-glue/index.md
+++ b/docs/integrations/data-ingestion/aws-glue/index.md
@@ -3,58 +3,125 @@ sidebar_label: 'Amazon Glue'
sidebar_position: 1
slug: /integrations/glue
description: 'Integrate ClickHouse and Amazon Glue'
-keywords: ['clickhouse', 'amazon', 'aws', 'glue', 'migrating', 'data']
-title: 'Integrating Amazon Glue with ClickHouse'
+keywords: ['clickhouse', 'amazon', 'aws', 'glue', 'migrating', 'data', 'spark']
+title: 'Integrating Amazon Glue with ClickHouse and Spark'
---
+import Image from '@theme/IdealImage';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
+import notebook_connections_config from '@site/static/images/integrations/data-ingestion/aws-glue/notebook-connections-config.png';
+import dependent_jars_path_option from '@site/static/images/integrations/data-ingestion/aws-glue/dependent_jars_path_option.png';
-# Integrating Amazon Glue with ClickHouse
+# Integrating Amazon Glue with ClickHouse and Spark
[Amazon Glue](https://aws.amazon.com/glue/) is a fully managed, serverless data integration service provided by Amazon Web Services (AWS). It simplifies the process of discovering, preparing, and transforming data for analytics, machine learning, and application development.
-Although there is no Glue ClickHouse connector available yet, the official JDBC connector can be leveraged to connect and integrate with ClickHouse:
+## Installation {#installation}
+
+To integrate your Glue code with ClickHouse, you can use our official Spark connector in Glue in one of the following ways:
+- Installing the ClickHouse Glue connector from the AWS Marketplace (recommended).
+- Manually adding the Spark connector's JARs to your Glue job.
-
+<Tabs>
+<TabItem value="marketplace" label="AWS Marketplace" default>
+1. **Subscribe to the connector**
+To access the connector in your account, subscribe to the ClickHouse AWS Glue Connector in the AWS Marketplace.
+
+2. **Grant the required permissions**
+Ensure your Glue job's IAM role has the necessary permissions, as described in the minimum privileges [guide](https://docs.aws.amazon.com/glue/latest/dg/getting-started-min-privs-job.html#getting-started-min-privs-connectors).
+
+3. **Activate the connector and create a connection**
+You can activate the connector and create a connection directly by clicking [this link](https://console.aws.amazon.com/gluestudio/home#/connector/add-connection?connectorName="ClickHouse%20AWS%20Glue%20Connector"&connectorType="Spark"&connectorUrl=https://709825985650.dkr.ecr.us-east-1.amazonaws.com/clickhouse/clickhouse-glue:0.1&connectorClassName="com.clickhouse.spark.ClickHouseCatalog"), which opens the Glue connection creation page with key fields pre-filled. Give the connection a name and press create (there is no need to provide ClickHouse connection details at this stage).
+
+4. **Use the connection in your Glue job**
+In your Glue job, select the `Job details` tab and expand the `Advanced properties` section. Under `Connections`, select the connection you just created. The connector automatically injects the required JARs into the job runtime.
+
+<Image img={notebook_connections_config} size='md' alt='Glue job connections configuration' />
+
+:::note
+The JARs used in the Glue connector are built for `Spark 3.2`, `Scala 2`, and `Python 3`. Make sure to select these versions when configuring your Glue job.
+:::
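+
+If you prefer to define jobs programmatically, the connection can also be attached at job-creation time. Below is a minimal `boto3` sketch, not an official recipe: the job name, IAM role, script location, and connection name are placeholder values, and the Glue version should be chosen to match the Spark and Scala versions noted above.
+
+```python
+import boto3
+
+glue = boto3.client("glue")
+
+# Hypothetical values: replace the name, role, script location, and
+# connection name with your own.
+glue.create_job(
+    Name="clickhouse-glue-job",
+    Role="arn:aws:iam::123456789012:role/GlueJobRole",
+    Command={
+        "Name": "glueetl",
+        "ScriptLocation": "s3://my-bucket/scripts/clickhouse_job.py",
+        "PythonVersion": "3",
+    },
+    # Attach the Marketplace connection created in step 3 so its JARs are
+    # injected into the job runtime.
+    Connections={"Connections": ["my-clickhouse-connection"]},
+    # Pick the Glue version that ships the Spark/Scala versions required by
+    # the connector (see the note above).
+    GlueVersion="4.0",
+)
+```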
+
+</TabItem>
+<TabItem value="manual" label="Manual installation">
+To add the required JARs manually, follow these steps:
+1. Upload the following JARs to an S3 bucket: `clickhouse-jdbc-0.6.X-all.jar` and `clickhouse-spark-runtime-3.X_2.X-0.8.X.jar`.
+2. Make sure the Glue job has access to this bucket.
+3. Under the `Job details` tab, scroll down and expand the `Advanced properties` drop-down, then enter the path to the JARs in `Dependent JARs path`:
+
+<Image img={dependent_jars_path_option} size='md' alt='Dependent JARs path option in the Glue job details' />
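+
+The `Dependent JARs path` field corresponds to the `--extra-jars` job parameter, so the same setup can be done programmatically. Below is a hedged `boto3` sketch; the job name, role, script location, bucket, and JAR file names are placeholders for the files you uploaded in step 1.
+
+```python
+import boto3
+
+glue = boto3.client("glue")
+
+# Hypothetical: point --extra-jars at the JARs uploaded in step 1
+# (comma-separated S3 paths).
+glue.update_job(
+    JobName="clickhouse-glue-job",
+    JobUpdate={
+        "Role": "arn:aws:iam::123456789012:role/GlueJobRole",
+        "Command": {
+            "Name": "glueetl",
+            "ScriptLocation": "s3://my-bucket/scripts/clickhouse_job.py",
+            "PythonVersion": "3",
+        },
+        "DefaultArguments": {
+            "--extra-jars": "s3://my-bucket/jars/clickhouse-jdbc-all.jar,s3://my-bucket/jars/clickhouse-spark-runtime.jar"
+        },
+    },
+)
+```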
+
+</TabItem>
+</Tabs>
+## Examples {#example}
+
+
```java
-import com.amazonaws.services.glue.util.Job
-import com.amazonaws.services.glue.util.GlueArgParser
import com.amazonaws.services.glue.GlueContext
-import org.apache.spark.SparkContext
+import com.amazonaws.services.glue.util.GlueArgParser
+import com.amazonaws.services.glue.util.Job
import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.DataFrame
+
import scala.collection.JavaConverters._
-import com.amazonaws.services.glue.log.GlueLogger
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.functions._
-// Initialize Glue job
-object GlueJob {
+object ClickHouseGlueExample {
def main(sysArgs: Array[String]) {
- val sc: SparkContext = new SparkContext()
- val glueContext: GlueContext = new GlueContext(sc)
- val spark: SparkSession = glueContext.getSparkSession
- val logger = new GlueLogger
- import spark.implicits._
- // @params: [JOB_NAME]
val args = GlueArgParser.getResolvedOptions(sysArgs, Seq("JOB_NAME").toArray)
- Job.init(args("JOB_NAME"), glueContext, args.asJava)
- // JDBC connection details
- val jdbcUrl = "jdbc:ch://{host}:{port}/{schema}"
- val jdbcProperties = new java.util.Properties()
- jdbcProperties.put("user", "default")
- jdbcProperties.put("password", "*******")
- jdbcProperties.put("driver", "com.clickhouse.jdbc.ClickHouseDriver")
-
- // Load the table from ClickHouse
- val df: DataFrame = spark.read.jdbc(jdbcUrl, "my_table", jdbcProperties)
-
- // Show the Spark df, or use it for whatever you like
- df.show()
-
- // Commit the job
+ val sparkSession: SparkSession = SparkSession.builder
+ .config("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog")
+ .config("spark.sql.catalog.clickhouse.host", "")
+ .config("spark.sql.catalog.clickhouse.protocol", "https")
+ .config("spark.sql.catalog.clickhouse.http_port", "")
+ .config("spark.sql.catalog.clickhouse.user", "default")
+ .config("spark.sql.catalog.clickhouse.password", "")
+ .config("spark.sql.catalog.clickhouse.database", "default")
+ // for ClickHouse cloud
+ .config("spark.sql.catalog.clickhouse.option.ssl", "true")
+ .config("spark.sql.catalog.clickhouse.option.ssl_mode", "NONE")
+ .getOrCreate
+
+ val glueContext = new GlueContext(sparkSession.sparkContext)
+ Job.init(args("JOB_NAME"), glueContext, args.asJava)
+ import sparkSession.implicits._
+
+ val url = "s3://{path_to_cell_tower_data}/cell_towers.csv.gz"
+
+ val schema = StructType(Seq(
+ StructField("radio", StringType, nullable = false),
+ StructField("mcc", IntegerType, nullable = false),
+ StructField("net", IntegerType, nullable = false),
+ StructField("area", IntegerType, nullable = false),
+ StructField("cell", LongType, nullable = false),
+ StructField("unit", IntegerType, nullable = false),
+ StructField("lon", DoubleType, nullable = false),
+ StructField("lat", DoubleType, nullable = false),
+ StructField("range", IntegerType, nullable = false),
+ StructField("samples", IntegerType, nullable = false),
+ StructField("changeable", IntegerType, nullable = false),
+ StructField("created", TimestampType, nullable = false),
+ StructField("updated", TimestampType, nullable = false),
+ StructField("averageSignal", IntegerType, nullable = false)
+ ))
+
+ val df = sparkSession.read
+ .option("header", "true")
+ .schema(schema)
+ .csv(url)
+
+ // Write to ClickHouse
+ df.writeTo("clickhouse.default.cell_towers").append()
+
+ // Read from ClickHouse
+    val dfRead = sparkSession.sql("select * from clickhouse.default.cell_towers")
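+
+    // Hypothetical follow-up (not part of the original example): dfRead is a
+    // regular Spark DataFrame, so standard operations work on the
+    // ClickHouse-backed table.
+    dfRead.select("radio", "mcc").show(10)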
Job.commit()
}
}
@@ -70,6 +137,8 @@ from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
+from pyspark.sql import Row
+
## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
@@ -80,20 +149,29 @@ logger = glueContext.get_logger()
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
-jdbc_url = "jdbc:ch://{host}:{port}/{schema}"
-query = "select * from my_table"
-# For cloud usage, please add ssl options
-df = (spark.read.format("jdbc")
- .option("driver", 'com.clickhouse.jdbc.ClickHouseDriver')
- .option("url", jdbc_url)
- .option("user", 'default')
- .option("password", '*******')
- .option("query", query)
- .load())
-
-logger.info("num of rows:")
-logger.info(str(df.count()))
-logger.info("Data sample:")
+
+spark.conf.set("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog")
+spark.conf.set("spark.sql.catalog.clickhouse.host", "")
+spark.conf.set("spark.sql.catalog.clickhouse.protocol", "https")
+spark.conf.set("spark.sql.catalog.clickhouse.http_port", "")
+spark.conf.set("spark.sql.catalog.clickhouse.user", "default")
+spark.conf.set("spark.sql.catalog.clickhouse.password", "")
+spark.conf.set("spark.sql.catalog.clickhouse.database", "default")
+spark.conf.set("spark.clickhouse.write.format", "json")
+spark.conf.set("spark.clickhouse.read.format", "arrow")
+# for ClickHouse cloud
+spark.conf.set("spark.sql.catalog.clickhouse.option.ssl", "true")
+spark.conf.set("spark.sql.catalog.clickhouse.option.ssl_mode", "NONE")
+
+# Create DataFrame
+data = [Row(id=11, name="John"), Row(id=12, name="Doe")]
+df = spark.createDataFrame(data)
+
+# Write DataFrame to ClickHouse
+df.writeTo("clickhouse.default.example_table").append()
+
+# Read DataFrame from ClickHouse
+df_read = spark.sql("select * from clickhouse.default.example_table")
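+
+# Hypothetical follow-up (not part of the original example): df_read is a
+# regular Spark DataFrame, so standard operations work on the
+# ClickHouse-backed table.
+logger.info("num of rows read back: " + str(df_read.count()))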
-logger.info(str(df.take(10)))
+logger.info(str(df_read.take(10)))
job.commit()
@@ -102,4 +180,4 @@ job.commit()
-For more details, please visit our [Spark & JDBC documentation](/integrations/apache-spark/spark-jdbc#read-data).
+For more details, please visit our [Spark documentation](/integrations/apache-spark).
diff --git a/docs/integrations/index.mdx b/docs/integrations/index.mdx
index feb668f16ea..299c2b25fcb 100644
--- a/docs/integrations/index.mdx
+++ b/docs/integrations/index.mdx
@@ -205,7 +205,7 @@ We are actively compiling this list of ClickHouse integrations below, so it's no
|Amazon Kinesis| |Data ingestion|Integration with Amazon Kinesis.|[Documentation](/integrations/clickpipes/kinesis/)|
|Amazon MSK| |Data ingestion|Integration with Amazon Managed Streaming for Apache Kafka (MSK).|[Documentation](/integrations/kafka/cloud/amazon-msk/)|
|Amazon S3||Data ingestion|Import from, export to, and transform S3 data in flight with ClickHouse built-in S3 functions.|[Documentation](/integrations/data-ingestion/s3/index.md)|
-|Amazon Glue||Data ingestion|Query ClickHouse over JDBC|[Documentation](/integrations/glue)|
+|Amazon Glue||Data ingestion|Query ClickHouse over Spark using our official Glue connector|[Documentation](/integrations/glue)|
|Apache Spark||Data ingestion|Spark ClickHouse Connector is a high performance connector built on top of Spark DataSource V2.|[GitHub](https://github.com/housepower/spark-clickhouse-connector),
[Documentation](/integrations/data-ingestion/apache-spark/index.md)|
|Azure Event Hubs||Data ingestion|A data streaming platform that supports Apache Kafka's native protocol|[Website](https://azure.microsoft.com/en-gb/products/event-hubs)|
|Azure Synapse||Data ingestion|A cloud-based analytics service for big data and data warehousing.|[Documentation](/integrations/azure-synapse)|
diff --git a/scripts/aspell-ignore/en/aspell-dict.txt b/scripts/aspell-ignore/en/aspell-dict.txt
index f2bd4e0555f..e7a49dec671 100644
--- a/scripts/aspell-ignore/en/aspell-dict.txt
+++ b/scripts/aspell-ignore/en/aspell-dict.txt
@@ -3572,4 +3572,56 @@ zlib
znode
znodes
zookeeperSessionUptime
-zstd
\ No newline at end of file
+zstd
+Okta
+specificities
+reproducibility
+CertManager
+Istio
+LogHouse
+Tailscale
+Thanos
+ReplacingReplicatedMergeTree
+ReplacingSharedMergeTree
+SharedMergeTree
+VersionedCollapsing
+subpath
+AICPA
+restartable
+sumArray
+sumForEach
+argMaxIf
+groupArrayResample
+downsampled
+uniqArrayIf
+minSimpleState
+avgMerge
+avgMergeState
+timeslot
+timeslots
+groupArrayDistinct
+avgMap
+avgState
+avgIf
+quantilesTiming
+quantilesTimingIf
+quantilesTimingArrayIf
+downvotes
+sumSimpleState
+upvotes
+uniqArray
+avgResample
+countResample
+argMinIf
+maxSimpleState
+TimescaleDB
+columnstore
+TiDB
+resync
+resynchronization
+Sackmann's
+JARs
\ No newline at end of file
diff --git a/static/images/integrations/data-ingestion/aws-glue/dependent_jars_path_option.png b/static/images/integrations/data-ingestion/aws-glue/dependent_jars_path_option.png
new file mode 100644
index 00000000000..65935cf70ca
Binary files /dev/null and b/static/images/integrations/data-ingestion/aws-glue/dependent_jars_path_option.png differ
diff --git a/static/images/integrations/data-ingestion/aws-glue/notebook-connections-config.png b/static/images/integrations/data-ingestion/aws-glue/notebook-connections-config.png
new file mode 100644
index 00000000000..a84717afc9f
Binary files /dev/null and b/static/images/integrations/data-ingestion/aws-glue/notebook-connections-config.png differ