diff --git a/docs/integrations/data-ingestion/aws-glue/index.md b/docs/integrations/data-ingestion/aws-glue/index.md
index f3774440d4d..25da911ef3c 100644
--- a/docs/integrations/data-ingestion/aws-glue/index.md
+++ b/docs/integrations/data-ingestion/aws-glue/index.md
@@ -3,58 +3,125 @@
 sidebar_label: 'Amazon Glue'
 sidebar_position: 1
 slug: /integrations/glue
 description: 'Integrate ClickHouse and Amazon Glue'
-keywords: ['clickhouse', 'amazon', 'aws', 'glue', 'migrating', 'data']
-title: 'Integrating Amazon Glue with ClickHouse'
+keywords: ['clickhouse', 'amazon', 'aws', 'glue', 'migrating', 'data', 'spark']
+title: 'Integrating Amazon Glue with ClickHouse and Spark'
 ---
 
+import Image from '@theme/IdealImage';
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
+import notebook_connections_config from '@site/static/images/integrations/data-ingestion/aws-glue/notebook-connections-config.png';
+import dependent_jars_path_option from '@site/static/images/integrations/data-ingestion/aws-glue/dependent_jars_path_option.png';
 
-# Integrating Amazon Glue with ClickHouse
+# Integrating Amazon Glue with ClickHouse and Spark
 
 [Amazon Glue](https://aws.amazon.com/glue/) is a fully managed, serverless data integration service provided by Amazon Web Services (AWS). It simplifies the process of discovering, preparing, and transforming data for analytics, machine learning, and application development.
 
-Although there is no Glue ClickHouse connector available yet, the official JDBC connector can be leveraged to connect and integrate with ClickHouse:
+## Installation {#installation}
+
+To integrate your Glue code with ClickHouse, you can use our official Spark connector in Glue in one of the following ways:
+- Installing the ClickHouse Glue connector from the AWS Marketplace (recommended).
+- Manually adding the Spark connector's JARs to your Glue job.
 
 <Tabs>
-<TabItem value="Java" label="Java" default>
+<TabItem value="AWS Marketplace" label="AWS Marketplace" default>
+
+1. <h5>Subscribe to the Connector</h5>
+To access the connector in your account, subscribe to the ClickHouse AWS Glue Connector from AWS Marketplace.
+
+2. <h5>Grant Required Permissions</h5>
+Ensure your Glue job's IAM role has the necessary permissions, as described in the minimum privileges [guide](https://docs.aws.amazon.com/glue/latest/dg/getting-started-min-privs-job.html#getting-started-min-privs-connectors).
+
+3. <h5>Activate the Connector & Create a Connection</h5>
+You can activate the connector and create a connection directly by clicking [this link](https://console.aws.amazon.com/gluestudio/home#/connector/add-connection?connectorName="ClickHouse%20AWS%20Glue%20Connector"&connectorType="Spark"&connectorUrl=https://709825985650.dkr.ecr.us-east-1.amazonaws.com/clickhouse/clickhouse-glue:0.1&connectorClassName="com.clickhouse.spark.ClickHouseCatalog"), which opens the Glue connection creation page with key fields pre-filled. Give the connection a name, and press `Create` (there is no need to provide the ClickHouse connection details at this stage).
+
+4. <h5>Use in Glue Job</h5>
+In your Glue job, select the `Job details` tab, and expand the `Advanced properties` section. Under the `Connections` section, select the connection you just created. The connector automatically injects the required JARs into the job runtime.
+
+<Image img={notebook_connections_config} size='lg' alt='Glue Notebook connections config' />
+
+:::note
+The JARs used in the Glue connector are built for `Spark 3.2`, `Scala 2`, and `Python 3`. Make sure to select these versions when configuring your Glue job.
+:::
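+
+If you prefer to script this step rather than use the console, the connection can also be attached when the job is created programmatically. The following is a minimal sketch using `boto3`; the job name, role ARN, script location, and connection name are hypothetical placeholders, not values defined by this guide:
+
+```python
+import boto3
+
+glue = boto3.client("glue", region_name="us-east-1")
+
+# Hypothetical names - replace with your own job, role, script, and connection.
+glue.create_job(
+    Name="clickhouse-example-job",
+    Role="arn:aws:iam::123456789012:role/GlueJobRole",
+    Command={
+        "Name": "glueetl",
+        "ScriptLocation": "s3://my-bucket/scripts/clickhouse_job.py",
+        "PythonVersion": "3",
+    },
+    # Attaching the connection is what injects the connector JARs at runtime.
+    Connections={"Connections": ["my-clickhouse-connection"]},
+)
+```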
+
+</TabItem>
+<TabItem value="Manual Installation" label="Manual Installation">
+
+To add the required JARs manually, follow these steps:
+1. Upload the following JARs to an S3 bucket: `clickhouse-jdbc-0.6.X-all.jar` and `clickhouse-spark-runtime-3.X_2.X-0.8.X.jar` (a scripted version of this step is sketched below).
+2. Make sure the Glue job has access to this bucket.
+3. Under the `Job details` tab, scroll down, expand the `Advanced properties` drop-down, and fill in the JARs' S3 paths under `Dependent JARs path`:
+
+<Image img={dependent_jars_path_option} size='lg' alt='Glue Notebook JAR path options' />
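+
+As a convenience, step 1 can be scripted. The following is a minimal sketch using `boto3`; the bucket name and key prefix are hypothetical, and the `X` placeholders in the JAR names must be replaced with the concrete versions you downloaded:
+
+```python
+import boto3
+
+s3 = boto3.client("s3")
+
+# Hypothetical bucket and prefix - replace with your own.
+bucket = "my-glue-artifacts"
+for jar in ["clickhouse-jdbc-0.6.X-all.jar", "clickhouse-spark-runtime-3.X_2.X-0.8.X.jar"]:
+    s3.upload_file(jar, bucket, f"jars/{jar}")
+```
+
+`Dependent JARs path` then takes the resulting S3 URIs as a comma-separated list, e.g. `s3://my-glue-artifacts/jars/<first jar>,s3://my-glue-artifacts/jars/<second jar>`.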
+
+</TabItem>
+</Tabs>
+
+## Examples {#example}
+
+<Tabs>
+<TabItem value="Scala" label="Scala" default>
 
 ```java
-import com.amazonaws.services.glue.util.Job
-import com.amazonaws.services.glue.util.GlueArgParser
 import com.amazonaws.services.glue.GlueContext
-import org.apache.spark.SparkContext
+import com.amazonaws.services.glue.util.GlueArgParser
+import com.amazonaws.services.glue.util.Job
 import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.DataFrame
+
 import scala.collection.JavaConverters._
-import com.amazonaws.services.glue.log.GlueLogger
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.functions._
 
-// Initialize Glue job
-object GlueJob {
+object ClickHouseGlueExample {
   def main(sysArgs: Array[String]) {
-    val sc: SparkContext = new SparkContext()
-    val glueContext: GlueContext = new GlueContext(sc)
-    val spark: SparkSession = glueContext.getSparkSession
-    val logger = new GlueLogger
-    import spark.implicits._
-
-    // @params: [JOB_NAME]
     val args = GlueArgParser.getResolvedOptions(sysArgs, Seq("JOB_NAME").toArray)
-    Job.init(args("JOB_NAME"), glueContext, args.asJava)
-
-    // JDBC connection details
-    val jdbcUrl = "jdbc:ch://{host}:{port}/{schema}"
-    val jdbcProperties = new java.util.Properties()
-    jdbcProperties.put("user", "default")
-    jdbcProperties.put("password", "*******")
-    jdbcProperties.put("driver", "com.clickhouse.jdbc.ClickHouseDriver")
-
-    // Load the table from ClickHouse
-    val df: DataFrame = spark.read.jdbc(jdbcUrl, "my_table", jdbcProperties)
-
-    // Show the Spark df, or use it for whatever you like
-    df.show()
-
-    // Commit the job
+    val sparkSession: SparkSession = SparkSession.builder
+      .config("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog")
+      .config("spark.sql.catalog.clickhouse.host", "<your-host>")
+      .config("spark.sql.catalog.clickhouse.protocol", "https")
+      .config("spark.sql.catalog.clickhouse.http_port", "<your-port>")
+      .config("spark.sql.catalog.clickhouse.user", "default")
+      .config("spark.sql.catalog.clickhouse.password", "<your-password>")
+      .config("spark.sql.catalog.clickhouse.database", "default")
+      // for ClickHouse Cloud
+      .config("spark.sql.catalog.clickhouse.option.ssl", "true")
+      .config("spark.sql.catalog.clickhouse.option.ssl_mode", "NONE")
+      .getOrCreate
+
+    val glueContext = new GlueContext(sparkSession.sparkContext)
+    Job.init(args("JOB_NAME"), glueContext, args.asJava)
+    import sparkSession.implicits._
+
+    val url = "s3://{path_to_cell_tower_data}/cell_towers.csv.gz"
+
+    val schema = StructType(Seq(
+      StructField("radio", StringType, nullable = false),
+      StructField("mcc", IntegerType, nullable = false),
+      StructField("net", IntegerType, nullable = false),
+      StructField("area", IntegerType, nullable = false),
+      StructField("cell", LongType, nullable = false),
+      StructField("unit", IntegerType, nullable = false),
+      StructField("lon", DoubleType, nullable = false),
+      StructField("lat", DoubleType, nullable = false),
+      StructField("range", IntegerType, nullable = false),
+      StructField("samples", IntegerType, nullable = false),
+      StructField("changeable", IntegerType, nullable = false),
+      StructField("created", TimestampType, nullable = false),
+      StructField("updated", TimestampType, nullable = false),
+      StructField("averageSignal", IntegerType, nullable = false)
+    ))
+
+    val df = sparkSession.read
+      .option("header", "true")
+      .schema(schema)
+      .csv(url)
+
+    // Write to ClickHouse
+    df.writeTo("clickhouse.default.cell_towers").append()
+
+    // Read from ClickHouse
+    val dfRead = sparkSession.sql("select * from clickhouse.default.cell_towers")
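+    // e.g. print a few rows to verify the round trip (illustrative, not required)
+    dfRead.show(10)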
 
     Job.commit()
   }
 }
@@ -70,6 +137,8 @@ from awsglue.utils import getResolvedOptions
 from pyspark.context import SparkContext
 from awsglue.context import GlueContext
 from awsglue.job import Job
+from pyspark.sql import Row
+
 
 ## @params: [JOB_NAME]
 args = getResolvedOptions(sys.argv, ['JOB_NAME'])
@@ -80,20 +149,29 @@ logger = glueContext.get_logger()
 spark = glueContext.spark_session
 job = Job(glueContext)
 job.init(args['JOB_NAME'], args)
-jdbc_url = "jdbc:ch://{host}:{port}/{schema}"
-query = "select * from my_table"
-# For cloud usage, please add ssl options
-df = (spark.read.format("jdbc")
-    .option("driver", 'com.clickhouse.jdbc.ClickHouseDriver')
-    .option("url", jdbc_url)
-    .option("user", 'default')
-    .option("password", '*******')
-    .option("query", query)
-    .load())
-
-logger.info("num of rows:")
-logger.info(str(df.count()))
-logger.info("Data sample:")
-logger.info(str(df.take(10)))
+
+spark.conf.set("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog")
+spark.conf.set("spark.sql.catalog.clickhouse.host", "<your-host>")
+spark.conf.set("spark.sql.catalog.clickhouse.protocol", "https")
+spark.conf.set("spark.sql.catalog.clickhouse.http_port", "<your-port>")
+spark.conf.set("spark.sql.catalog.clickhouse.user", "default")
+spark.conf.set("spark.sql.catalog.clickhouse.password", "<your-password>")
+spark.conf.set("spark.sql.catalog.clickhouse.database", "default")
+spark.conf.set("spark.clickhouse.write.format", "json")
+spark.conf.set("spark.clickhouse.read.format", "arrow")
+# for ClickHouse Cloud
+spark.conf.set("spark.sql.catalog.clickhouse.option.ssl", "true")
+spark.conf.set("spark.sql.catalog.clickhouse.option.ssl_mode", "NONE")
+
+# Create DataFrame
+data = [Row(id=11, name="John"), Row(id=12, name="Doe")]
+df = spark.createDataFrame(data)
+
+# Write DataFrame to ClickHouse
+df.writeTo("clickhouse.default.example_table").append()
+
+# Read DataFrame back from ClickHouse
+df_read = spark.sql("select * from clickhouse.default.example_table")
+logger.info(str(df_read.take(10)))
 
 job.commit()
@@ -102,4 +180,4 @@ job.commit()
 </TabItem>
 </Tabs>
 
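+Catalog tables can also be queried with plain Spark SQL. Continuing the Python example above (illustrative only; the table and columns are the ones created there):
+
+```python
+# Aggregate the rows previously written to clickhouse.default.example_table
+counts = spark.sql("""
+    select name, count(*) as cnt
+    from clickhouse.default.example_table
+    group by name
+    order by cnt desc
+""")
+counts.show()
+```
+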
-For more details, please visit our [Spark & JDBC documentation](/integrations/apache-spark/spark-jdbc#read-data).
+For more details, please visit our [Spark documentation](/integrations/apache-spark).
diff --git a/docs/integrations/index.mdx b/docs/integrations/index.mdx
index feb668f16ea..299c2b25fcb 100644
--- a/docs/integrations/index.mdx
+++ b/docs/integrations/index.mdx
@@ -205,7 +205,7 @@ We are actively compiling this list of ClickHouse integrations below, so it's no
 |Amazon Kinesis| |Data ingestion|Integration with Amazon Kinesis.|[Documentation](/integrations/clickpipes/kinesis/)|
 |Amazon MSK| |Data ingestion|Integration with Amazon Managed Streaming for Apache Kafka (MSK).|[Documentation](/integrations/kafka/cloud/amazon-msk/)|
 |Amazon S3||Data ingestion|Import from, export to, and transform S3 data in flight with ClickHouse built-in S3 functions.|[Documentation](/integrations/data-ingestion/s3/index.md)|
-|Amazon Glue|Amazon Glue logo|Data ingestion|Query ClickHouse over JDBC|[Documentation](/integrations/glue)|
+|Amazon Glue|Amazon Glue logo|Data ingestion|Query ClickHouse over Spark using our official Glue connector|[Documentation](/integrations/glue)|
 |Apache Spark||Data ingestion|Spark ClickHouse Connector is a high performance connector built on top of Spark DataSource V2.|[GitHub](https://github.com/housepower/spark-clickhouse-connector), [Documentation](/integrations/data-ingestion/apache-spark/index.md)|
 |Azure Event Hubs||Data ingestion|A data streaming platform that supports Apache Kafka's native protocol|[Website](https://azure.microsoft.com/en-gb/products/event-hubs)|
 |Azure Synapse|Azure Synapse logo|Data ingestion|A cloud-based analytics service for big data and data warehousing.|[Documentation](/integrations/azure-synapse)|
diff --git a/scripts/aspell-ignore/en/aspell-dict.txt b/scripts/aspell-ignore/en/aspell-dict.txt
index f2bd4e0555f..e7a49dec671 100644
--- a/scripts/aspell-ignore/en/aspell-dict.txt
+++ b/scripts/aspell-ignore/en/aspell-dict.txt
@@ -3572,4 +3572,56 @@ zlib
 znode
 znodes
 zookeeperSessionUptime
-zstd
\ No newline at end of file
+zstd
+Okta
+specificities
+reproducibility
+CertManager
+Istio
+LogHouse
+Tailscale
+Thanos
+ReplacingReplicatedMergeTree
+ReplacingSharedMergeTree
+SharedMergeTree
+VersionedCollapsing
+subpath
+AICPA
+restartable
+sumArray
+sumForEach
+argMaxIf
+groupArrayResample
+downsampled
+uniqArrayIf
+minSimpleState
+avgMerge
+avgMergeState
+timeslot
+timeslots
+groupArrayDistinct
+avgMap
+avgState
+avgIf
+quantilesTiming
+quantilesTimingIf
+quantilesTimingArrayIf
+downvotes
+sumSimpleState
+upvotes
+uniqArray
+avgResample
+countResample
+argMinIf
+maxSimpleState
+TimescaleDB
+columnstore
+TiDB
+resync
+resynchronization
+Sackmann's
+JARs
\ No newline at end of file
diff --git a/static/images/integrations/data-ingestion/aws-glue/dependent_jars_path_option.png b/static/images/integrations/data-ingestion/aws-glue/dependent_jars_path_option.png
new file mode 100644
index 00000000000..65935cf70ca
Binary files /dev/null and b/static/images/integrations/data-ingestion/aws-glue/dependent_jars_path_option.png differ
diff --git a/static/images/integrations/data-ingestion/aws-glue/notebook-connections-config.png b/static/images/integrations/data-ingestion/aws-glue/notebook-connections-config.png
new file mode 100644
index 00000000000..a84717afc9f
Binary files /dev/null and b/static/images/integrations/data-ingestion/aws-glue/notebook-connections-config.png differ