diff --git a/.gitignore b/.gitignore index c6a18e3..3127cb1 100644 --- a/.gitignore +++ b/.gitignore @@ -67,4 +67,4 @@ target/checksum.txt repo .cpcache .lsp -.clj-kondo +.clj-kondo \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 0841990..496d44a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ #Copied from https://github.com/dacort/metabase-athena-driver/blob/d7572cd99551ea998a011f8f00a1e39c1eaa59b8/Dockerfile -ARG METABASE_VERSION=v0.46.6.2 +ARG METABASE_VERSION=v0.50.26 FROM clojure:openjdk-11-tools-deps-slim-buster AS stg_base @@ -36,14 +36,14 @@ WORKDIR /build/metabase # Now build the driver FROM stg_base as stg_build RUN clojure \ - -Sdeps "{:aliases {:sparksql-databricks {:extra-deps {com.metabase/sparksql-databricks {:local/root \"/build/driver\"}}}}}" \ - -X:build:sparksql-databricks \ + -Sdeps "{:aliases {:sparksql-databricks-v2 {:extra-deps {com.metabase/sparksql-databricks {:local/root \"/build/driver\"}}}}}" \ + -X:build:sparksql-databricks-v2 \ build-drivers.build-driver/build-driver! 
\ - "{:driver :sparksql-databricks, :project-dir \"/build/driver\", :target-dir \"/build/driver/target\"}" + "{:driver :sparksql-databricks-v2, :project-dir \"/build/driver\", :target-dir \"/build/driver/target\"}" # We create an export stage to make it easy to export the driver FROM scratch as stg_export -COPY --from=stg_build /build/driver/target/sparksql-databricks.metabase-driver.jar / +COPY --from=stg_build /build/driver/target/sparksql-databricks-v2.metabase-driver.jar / # Now we can run Metabase with our built driver FROM metabase/metabase:${METABASE_VERSION} AS stg_runner # A metabase user/group is manually added in https://github.com/metabase/metabase/blob/master/bin/docker/run_metabase.sh # Make the UID and GID match COPY --chown=2000:2000 --from=stg_build \ - /build/driver/target/sparksql-databricks.metabase-driver.jar \ + /build/driver/target/sparksql-databricks-v2.metabase-driver.jar \ /plugins/sparksql-databricks.metabase-driver.jar + +RUN wget https://github.com/relferreira/metabase-sparksql-databricks-driver/releases/download/1.6.0/sparksql-databricks.metabase-driver.jar -O /plugins/sparksql-databricks.metabase-driver-old.jar diff --git a/README.md b/README.md index 4412a40..25ab5bf 100644 --- a/README.md +++ b/README.md @@ -1,97 +1,16 @@ # Metabase Driver: Spark Databricks +So the credits are a bit complicated, but originally, this driver was developed by Fernando Goncalves and Rajesh Kumar Ravi. Their original +repository is no longer around. However, GitHub user [relferreira](https://github.com/relferreira) kindly updates it [here](https://github.com/relferreira/metabase-sparksql-databricks-driver/tree/master). However, his solution +does not allow for OAuth Secrets, which was something solved by [shrodingers](https://github.com/shrodingers) at [Brigad](https://github.com/Brigad/metabase-sparksql-databricks-driver). 
-**Credits**: This repository is only an updated version of the work of Fernando Goncalves and Rajesh Kumar Ravi. +Thus, this work is a combination of two somewhat actively maintained repositories. All that I do is to merge the two solutions and update the driver to work with Metabase 0.50.26. ## Installation To build a dockerized Metabase including the Databricks driver from this repository, simply run: ``` -docker build -t metabase:0.46.6.2-db -f Dockerfile . +docker build -t metabase:0.50.26-databricks -f Dockerfile . ``` The Metabase Databricks driver gets build and included in a final Metabase docker image. - -### To be fixed for >= v0.46: - -To run the tests for this driver, run the following: - -``` -docker build -t metabase/databricks-test --target stg_test . -docker run --rm --name mb-test metabase/databricks-test -``` - -or, if you have Clojure on your local machine, just: - -``` -clojure -X:test -``` - -# Connecting - -## Parameters - -![Connection Parameters](docs/parameters.png) - -- Display Name: a identification name for your database in Metabase - -- Host: your Databricks URL (adb-XXXXXXXXX.azuredatabricks.net) - -- Port: usually 443 - -- Database Name: usually `default` - -- Username: usually `token` - -- Password: personal access token created in Databrick's dashboard - -- Additional JDBC connection string options: - - SQL Warehouse (Endpoint): you can find it at `/sql/warehouses/` at the `Connection details` tab. It should have the following pattern: `;transportMode=http;ssl=1;AuthMech=3;httpPath=/sql/1.0/endpoints/;UID=token;PWD=` - - Cluster Endpoint: you will find it at your cluster's details page. It should have the following pattern: `;transportMode=http;ssl=1;httpPath=sql/protocolv1/o//;AuthMech=3;UID=token;PWD=` - -## Building the driver (the fast way) - -Use the `Dockerfile` on this repo: - -```bash -docker build -t metabase:metabase-head-databricks-1.3.0 . -``` - -And you can deploy to some docker registry of your own and use the image! 
- -Example of running: - -```bash -docker run -d -p 3000:3000 --name metabase metabase:metabase-head-databricks-1.6.0 -``` - -And access `http://localhost:3000`. - -## Building the driver (advanced way) - -### Prereq: Install Metabase as a local maven dependency, compiled for building drivers - -Clone the [Metabase repo](https://github.com/metabase/metabase) first if you haven't already done so. - -```bash -cd /path/to/metabase/ -./bin/build -``` - -### Build the Spark Databricks driver - -```bash -# (In the sparksql-databricks driver directory) -clojure -X:build :project-dir "\"$(pwd)\"" -``` - -### Copy it to your plugins dir and restart Metabase - -```bash -mkdir -p /path/to/metabase/plugins/ -cp target/sparksql-databricks.metabase-driver.jar /path/to/metabase/plugins/ -jar -jar /path/to/metabase/metabase.jar -``` - -_or:_ - -```bash -mkdir -p /path/to/metabase/plugins -cp target/sparksql-databricks.metabase-driver.jar /path/to/metabase/plugins/ -cd /path/to/metabase_source -lein run -``` diff --git a/docs/parameters.png b/docs/parameters.png index ac36c54..ad2d74c 100644 Binary files a/docs/parameters.png and b/docs/parameters.png differ diff --git a/resources/metabase-plugin.yaml b/resources/metabase-plugin.yaml index 7e62af8..bc2f289 100644 --- a/resources/metabase-plugin.yaml +++ b/resources/metabase-plugin.yaml @@ -1,5 +1,5 @@ info: - name: Metabase Databricks Spark SQL Driver + name: Metabase Databricks Spark SQL Driver (v2) version: 1.0.0-SNAPSHOT description: Allows Metabase to connect to Databricks Spark SQL databases. 
driver: @@ -7,35 +7,52 @@ driver: lazy-load: true abstract: true parent: sql-jdbc - - name: sparksql-databricks - display-name: Spark SQL (Databricks) + - name: sparksql-databricks-v2 + display-name: Databricks SQL (v2) lazy-load: true parent: hive-like connection-properties: - merge: - - host - - placeholder: ".cloud.databricks.com" - - merge: - - port - - default: 443 + - host + - placeholder: ".cloud.databricks.com" + helper-text: "The hostname of your Databricks account" + - name: app-id + display-name: Databricks client id + placeholder: "9af18267-60e7-4061-b2d5-e2414af88b0b" + required: true + helper-text: "The id of the service principal you generated an Oauth token for (see : https://docs.databricks.com/en/dev-tools/authentication-oauth.html)" + - name: app-secret + display-name: Databricks OAuth secret + placeholder: "doseXXXXXXXXXXXX" + required: true + helper-text: "The secret of the service principal you generated an Oauth token for (see : https://docs.databricks.com/en/dev-tools/authentication-oauth.html)" + - name: http-path + display-name: HTTP Path + placeholder: "/sql/1.0/warehouses/" + helper-text: "The path to the Databricks SQL endpoint (see : https://docs.databricks.com/en/integrations/compute-details.html)" + required: true + - name: catalog + display-name: Catalog + placeholder: "" + required: true - merge: - dbname - - placeholder: default - - merge: - - user - - default: token - - merge: - - password - - placeholder: "" + - required: false + display-name: Schema / Database (Optional) - advanced-options-start - merge: - additional-options - name: jdbc-flags - placeholder: ";transportMode=http;ssl=1;httpPath=;AuthMech=3;UID=token;PWD=" + placeholder: ";transportMode=http;ssl=1;" + - merge: + - additional-options + - name: port + display-name: HTTP Port + placeholder: "443" + default: 443 - default-advanced-options - connection-properties-include-tunnel-config: false init: - step: load-namespace - namespace: metabase.driver.sparksql-databricks 
+ namespace: metabase.driver.sparksql-databricks-v2 - step: register-jdbc-driver class: metabase.driver.FixedSparkDriver diff --git a/scripts/extract_plugin.sh b/scripts/extract_plugin.sh new file mode 100755 index 0000000..036d255 --- /dev/null +++ b/scripts/extract_plugin.sh @@ -0,0 +1,6 @@ +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + +docker buildx build --build-arg METABASE_VERSION=v0.49.7 --target stg_export --platform "linux/amd64" -t metabase:databricks-plugin "$SCRIPT_DIR/.." +container_id=$(docker create "metabase:databricks-plugin" /bin/bash) +docker cp "$container_id:/sparksql-databricks-v2.metabase-driver.jar" "$SCRIPT_DIR/../dist/databricks-sql.metabase-driver.jar" +docker rm "$container_id" \ No newline at end of file diff --git a/src/metabase/driver/connection.clj b/src/metabase/driver/connection.clj index 5e94a37..74553f8 100644 --- a/src/metabase/driver/connection.clj +++ b/src/metabase/driver/connection.clj @@ -37,15 +37,16 @@ (defn decorate-and-fix [impl] - (decorator - java.sql.Connection - impl - (getHoldability + (when impl + (decorator + java.sql.Connection + impl + (getHoldability [] ResultSet/CLOSE_CURSORS_AT_COMMIT) - (setReadOnly + (setReadOnly [read-only?] (when (.isClosed this) (throw (SQLException. "Connection is closed"))) (when read-only? - (throw (SQLException. "Enabling read-only mode is not supported")))))) + (throw (SQLException. 
"Enabling read-only mode is not supported"))))))) diff --git a/src/metabase/driver/hive_like.clj b/src/metabase/driver/hive_like.clj index 3f6b3a7..2ceeda7 100644 --- a/src/metabase/driver/hive_like.clj +++ b/src/metabase/driver/hive_like.clj @@ -3,7 +3,7 @@ [buddy.core.codecs :as codecs] [clojure.string :as str] [honey.sql :as sql] - [java-time :as t] + [java-time.api :as t] [metabase.driver :as driver] [metabase.driver.sql-jdbc.connection :as sql-jdbc.conn] [metabase.driver.sql-jdbc.execute :as sql-jdbc.execute] @@ -75,10 +75,6 @@ #"map" :type/Dictionary #".*" :type/*)) -(defmethod sql.qp/honey-sql-version :hive-like - [_driver] - 2) - (defmethod sql.qp/current-datetime-honeysql-form :hive-like [_] (h2x/with-database-type-info :%now "timestamp")) @@ -96,7 +92,7 @@ (defn- trunc-with-format [format-str expr] (str-to-date format-str (date-format format-str expr))) -(defmethod sql.qp/date [:hive-like :default] [_ _ expr] (h2x/->timestamp expr)) +(defmethod sql.qp/date [:hive-like :default] [_ _ expr] expr) (defmethod sql.qp/date [:hive-like :minute] [_ _ expr] (trunc-with-format "yyyy-MM-dd HH:mm" (h2x/->timestamp expr))) (defmethod sql.qp/date [:hive-like :minute-of-hour] [_ _ expr] [:minute (h2x/->timestamp expr)]) (defmethod sql.qp/date [:hive-like :hour] [_ _ expr] (trunc-with-format "yyyy-MM-dd HH" (h2x/->timestamp expr))) @@ -264,6 +260,9 @@ (sql-jdbc.execute/set-parameter driver ps i (t/local-date-time t (t/local-time 0)))) ;; TIMEZONE FIXME — not sure what timezone the results actually come back as +;; +;; Also, pretty sure Spark SQL doesn't have a TIME type anyway. 
+;; https://spark.apache.org/docs/latest/sql-ref-datatypes.html (defmethod sql-jdbc.execute/read-column-thunk [:hive-like Types/TIME] [_ ^ResultSet rs _rsmeta ^Integer i] (fn [] @@ -273,11 +272,11 @@ (defmethod sql-jdbc.execute/read-column-thunk [:hive-like Types/DATE] [_ ^ResultSet rs _rsmeta ^Integer i] (fn [] - (when-let [t (.getDate rs i)] - (t/zoned-date-time (t/local-date t) (t/local-time 0) (t/zone-id "UTC"))))) + (when-let [s (.getString rs i)] + (u.date/parse s)))) (defmethod sql-jdbc.execute/read-column-thunk [:hive-like Types/TIMESTAMP] [_ ^ResultSet rs _rsmeta ^Integer i] (fn [] (when-let [t (.getTimestamp rs i)] - (t/zoned-date-time (t/local-date-time t) (t/zone-id "UTC"))))) \ No newline at end of file + (t/zoned-date-time (t/local-date-time t) (t/zone-id "UTC"))))) diff --git a/src/metabase/driver/sparksql_databricks.clj b/src/metabase/driver/sparksql_databricks_v2.clj similarity index 76% rename from src/metabase/driver/sparksql_databricks.clj rename to src/metabase/driver/sparksql_databricks_v2.clj index fbfe928..110a48d 100644 --- a/src/metabase/driver/sparksql_databricks.clj +++ b/src/metabase/driver/sparksql_databricks_v2.clj @@ -1,4 +1,4 @@ -(ns metabase.driver.sparksql-databricks +(ns metabase.driver.sparksql-databricks-v2 (:require [clojure.java.jdbc :as jdbc] [clojure.string :as str] [clojure @@ -20,15 +20,15 @@ [metabase.driver.sql.query-processor :as sql.qp] [metabase.driver.sql.util :as sql.u] [metabase.driver.sql.util.unprepare :as unprepare] - [metabase.mbql.util :as mbql.u] + [metabase.legacy-mbql.util :as mbql.u] + [metabase.lib.metadata :as lib.metadata] [metabase.query-processor.store :as qp.store] [metabase.query-processor.util :as qputil] [metabase.query-processor.util.add-alias-info :as add] - [metabase.driver.sql.query-processor :as sql.qp] [metabase.util.honey-sql-2 :as h2x]) (:import [java.sql Connection ResultSet])) -(driver/register! :sparksql-databricks, :parent :hive-like) +(driver/register! 
:sparksql-databricks-v2, :parent :hive-like) ;;; ------------------------------------------ Custom HoneySQL Clause Impls ------------------------------------------ @@ -44,7 +44,7 @@ ;; ((get-method sql.qp/->honeysql [:hive-like :field]) driver field))) -(defmethod sql.qp/->honeysql [:sparksql-databricks :field] +(defmethod sql.qp/->honeysql [:sparksql-databricks-v2 :field] [driver [_ _ {::params.substitution/keys [compiling-field-filter?]} :as field-clause]] ;; use [[source-table-alias]] instead of the usual `schema.table` to qualify fields e.g. `t1.field` instead of the ;; normal `schema.table.field` @@ -65,7 +65,7 @@ :else source-table)))] (parent-method driver field-clause))) -(defmethod sql.qp/apply-top-level-clause [:sparksql-databricks :page] +(defmethod sql.qp/apply-top-level-clause [:sparksql-databricks-v2 :page] [_driver _clause honeysql-form {{:keys [items page]} :page}] (let [offset (* (dec page) items)] (if (zero? offset) @@ -78,9 +78,9 @@ (sql.helpers/where [:> :__rownum__ [:inline offset]]) (sql.helpers/limit [:inline items])))))) -(defmethod sql.qp/apply-top-level-clause [:sparksql-databricks :source-table] +(defmethod sql.qp/apply-top-level-clause [:sparksql-databricks-v2 :source-table] [driver _ honeysql-form {source-table-id :source-table}] - (let [{table-name :name, schema :schema} (qp.store/table source-table-id)] + (let [{table-name :name, schema :schema} (lib.metadata/table (qp.store/metadata-provider) source-table-id)] (sql.helpers/from honeysql-form [(sql.qp/->honeysql driver (h2x/identifier :table schema table-name)) [(sql.qp/->honeysql driver (h2x/identifier :table-alias source-table-alias))]]))) @@ -95,18 +95,26 @@ ;; :ssl true} ;; (dissoc opts :host :port :db :jdbc-flags))) -(defn- sparksql-databricks +(defn- sparksql-databricks-v2 "Create a database specification for a Spark SQL database." 
- [{:keys [host port db jdbc-flags] + [{:keys [host port http-path jdbc-flags app-id app-secret catalog db] :or {host "localhost", port 10000, db "", jdbc-flags ""} :as opts}] (merge {:classname "metabase.driver.FixedSparkDriver" :subprotocol "databricks" - :subname (str "//" host ":" port "/" db jdbc-flags)} - (dissoc opts :host :port :db :jdbc-flags))) - -(defmethod sql-jdbc.conn/connection-details->spec :sparksql-databricks + :subname (str "//" host ":" port jdbc-flags) + :ssl 1 + :httpPath http-path + :ConnSchema db + :ConnCatalog catalog + :AuthMech 11 + :Auth_Flow 1 + :OAuth2ClientId app-id + :OAuth2Secret app-secret} + (dissoc opts :host :port :db :jdbc-flags :http-path :app-id :app-secret :catalog))) + +(defmethod sql-jdbc.conn/connection-details->spec :sparksql-databricks-v2 [_ details] (-> details (update :port (fn [port] @@ -114,8 +122,8 @@ (Integer/parseInt port) port))) (set/rename-keys {:dbname :db}) - (select-keys [:host :port :db :jdbc-flags :dbname]) - sparksql-databricks + (select-keys [:host :port :db :jdbc-flags :dbname :http-path :app-id :app-secret :catalog]) + sparksql-databricks-v2 (sql-jdbc.common/handle-additional-options details))) (defn- dash-to-underscore [s] @@ -123,14 +131,15 @@ (str/replace s #"-" "_"))) ;; workaround for SPARK-9686 Spark Thrift server doesn't return correct JDBC metadata -(defmethod driver/describe-database :sparksql-databricks +(defmethod driver/describe-database :sparksql-databricks-v2 [_ database] {:tables (with-open [conn (jdbc/get-connection (sql-jdbc.conn/db->pooled-connection-spec database))] (set - (for [{:keys [database tablename tab_name], table-namespace :namespace} (jdbc/query {:connection conn} ["show tables"])] - {:name (or tablename tab_name) ; column name differs depending on server (SparkSQL, hive, Impala) + (for [{:keys [database tablename tab_name table_name table_schema], table-namespace :namespace} (jdbc/query {:connection conn} ["select * from information_schema.tables"])] + {:name (or 
tablename tab_name table_name table_schema) ; column name differs depending on server (SparkSQL, hive, Impala) :schema (or (not-empty database) + (not-empty table_schema) (not-empty table-namespace))})))}) ;; Hive describe table result has commented rows to distinguish partitions @@ -140,7 +149,7 @@ [col_name data_type])) ;; workaround for SPARK-9686 Spark Thrift server doesn't return correct JDBC metadata -(defmethod driver/describe-table :sparksql-databricks +(defmethod driver/describe-table :sparksql-databricks-v2 [driver database {table-name :name, schema :schema}] {:name table-name :schema schema @@ -160,10 +169,10 @@ :database-position idx}))))}) ;; bound variables are not supported in Spark SQL (maybe not Hive either, haven't checked) -(defmethod driver/execute-reducible-query :sparksql-databricks +(defmethod driver/execute-reducible-query :sparksql-databricks-v2 [driver {{sql :query, :keys [params], :as inner-query} :native, :as outer-query} context respond] (let [inner-query (-> (assoc inner-query - :remark (qputil/query->remark :sparksql-databricks outer-query) + :remark (qputil/query->remark :sparksql-databricks-v2 outer-query) :query (if (seq params) (binding [hive-like/*param-splice-style* :paranoid] (unprepare/unprepare driver (cons sql params))) @@ -177,7 +186,7 @@ ;; 2. SparkSQL doesn't support session timezones (at least our driver doesn't support it) ;; 3. SparkSQL doesn't support making connections read-only ;; 4. SparkSQL doesn't support setting the default result set holdability -(defmethod sql-jdbc.execute/do-with-connection-with-options :sparksql-databricks +(defmethod sql-jdbc.execute/do-with-connection-with-options :sparksql-databricks-v2 [driver db-or-id-or-spec options f] (sql-jdbc.execute/do-with-resolved-connection driver @@ -189,7 +198,7 @@ (f conn)))) ;; 1. 
SparkSQL doesn't support setting holdability type to `CLOSE_CURSORS_AT_COMMIT` -(defmethod sql-jdbc.execute/prepared-statement :sparksql-databricks +(defmethod sql-jdbc.execute/prepared-statement :sparksql-databricks-v2 [driver ^Connection conn ^String sql params] (let [stmt (.prepareStatement conn sql ResultSet/TYPE_FORWARD_ONLY @@ -203,20 +212,26 @@ (throw e))))) ;; the current HiveConnection doesn't support .createStatement -(defmethod sql-jdbc.execute/statement-supported? :sparksql-databricks [_] false) - -(doseq [feature [:basic-aggregations - :binning - :expression-aggregations - :expressions - :native-parameters - :nested-queries - :standard-deviation-aggregations]] - (defmethod driver/supports? [:sparksql-databricks feature] [_ _] true)) +(defmethod sql-jdbc.execute/statement-supported? :sparksql-databricks-v2 [_] false) + +(doseq [[feature supported?] {:basic-aggregations true + :binning true + :expression-aggregations true + :expressions true + :native-parameters true + :nested-queries true + :standard-deviation-aggregations true + :foreign-keys true + :full-join true + :right-join true + :left-join true + :inner-join true + :window-functions/offset true}] + (defmethod driver/database-supports? [:sparksql-databricks-v2 feature] [_driver _feature _db] supported?)) ;; only define an implementation for `:foreign-keys` if none exists already. In test extensions we define an alternate ;; implementation, and we don't want to stomp over that if it was loaded already -(when-not (get (methods driver/supports?) [:sparksql-databricks :foreign-keys]) - (defmethod driver/supports? [:sparksql-databricks :foreign-keys] [_ _] true)) +(when-not (get (methods driver/database-supports?) [:sparksql-databricks-v2 :foreign-keys]) + (defmethod driver/database-supports? 
[:sparksql-databricks-v2 :foreign-keys] [_driver _feature _db] true)) -(defmethod sql.qp/quote-style :sparksql-databricks [_] :mysql) +(defmethod sql.qp/quote-style :sparksql-databricks-v2 [_] :mysql) diff --git a/test/metabase/driver/connection_test.clj b/test/metabase/driver/connection_test.clj new file mode 100644 index 0000000..997f0c6 --- /dev/null +++ b/test/metabase/driver/connection_test.clj @@ -0,0 +1,11 @@ +(ns metabase.driver.connection-test + (:require [expectations :refer [expect]] + [metabase.driver.connection :as connection])) + +(expect + nil + (connection/decorate-and-fix nil)) + +(expect + some? + (connection/decorate-and-fix {}))