From 840499fbd3c25bf82a6ee0641ef6888079e096da Mon Sep 17 00:00:00 2001 From: Kieran Colford Date: Sun, 4 Feb 2024 11:02:38 -0500 Subject: [PATCH] Add http(s) support to the command line (#8753) * Add http(s) support to the command line * fmt * Add documentation * Add a test * fmt --------- Co-authored-by: Andrew Lamb --- datafusion-cli/Cargo.lock | 20 ++++++++++---------- datafusion-cli/Cargo.toml | 2 +- datafusion-cli/src/exec.rs | 18 ++++++++++++++++++ docs/source/user-guide/cli.md | 21 +++++++++++++++++++++ 4 files changed, 50 insertions(+), 11 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 94c57bd770ec..072898cda46d 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1852,9 +1852,9 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.59" +version = "0.1.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6a67363e2aa4443928ce15e57ebae94fd8949958fd1223c4cfc0cd473ad7539" +checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -2159,9 +2159,9 @@ checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" [[package]] name = "miniz_oxide" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" dependencies = [ "adler", ] @@ -2917,9 +2917,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e9d979b3ce68192e42760c7810125eb6cf2ea10efae545a156063e61f314e2a" +checksum = "0a716eb65e3158e90e17cd93d855216e27bde02745ab842f2cab4a39dba1bacf" [[package]] name = "rustls-webpki" @@ -3371,9 +3371,9 @@ dependencies = [ [[package]] name = 
"time" -version = "0.3.32" +version = "0.3.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe80ced77cbfb4cb91a94bf72b378b4b6791a0d9b7f09d0be747d1bdff4e68bd" +checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749" dependencies = [ "deranged", "num-conv", @@ -3425,9 +3425,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.35.1" +version = "1.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c89b4efa943be685f629b149f53829423f8f5531ea21249408e8e2f8671ec104" +checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" dependencies = [ "backtrace", "bytes", diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 79a1f0162e6a..e40aa6107c7d 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -41,7 +41,7 @@ dirs = "4.0.0" env_logger = "0.9" futures = "0.3" mimalloc = { version = "0.1", default-features = false } -object_store = { version = "0.9.0", features = ["aws", "gcp"] } +object_store = { version = "0.9.0", features = ["aws", "gcp", "http"] } parking_lot = { version = "0.12" } parquet = { version = "50.0.0", default-features = false } regex = "1.8" diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs index a175f99a90d8..5273ea8ee8e4 100644 --- a/datafusion-cli/src/exec.rs +++ b/datafusion-cli/src/exec.rs @@ -42,6 +42,7 @@ use datafusion::physical_plan::{collect, execute_stream}; use datafusion::prelude::SessionContext; use datafusion::sql::{parser::DFParser, sqlparser::dialect::dialect_from_str}; +use object_store::http::HttpBuilder; use object_store::ObjectStore; use rustyline::error::ReadlineError; use rustyline::Editor; @@ -281,6 +282,11 @@ async fn create_external_table( let builder = get_gcs_object_store_builder(url, cmd)?; Arc::new(builder.build()?) 
as Arc<dyn ObjectStore> } + "http" | "https" => Arc::new( + HttpBuilder::new() + .with_url(url.origin().ascii_serialization()) + .build()?, + ) as Arc<dyn ObjectStore>, _ => { // for other types, try to get from the object_store_registry ctx.runtime_env() @@ -329,12 +335,24 @@ mod tests { return plan_err!("LogicalPlan is not a CreateExternalTable"); } + // Ensure the URL is supported by the object store ctx.runtime_env() .object_store(ListingTableUrl::parse(location)?)?; Ok(()) } + #[tokio::test] + async fn create_object_store_table_http() -> Result<()> { + // Should be OK + let location = "http://example.com/file.parquet"; + let sql = + format!("CREATE EXTERNAL TABLE test STORED AS PARQUET LOCATION '{location}'"); + create_external_table_test(location, &sql).await?; + + Ok(()) + } + #[tokio::test] async fn create_object_store_table_s3() -> Result<()> { let access_key_id = "fake_access_key_id"; diff --git a/docs/source/user-guide/cli.md b/docs/source/user-guide/cli.md index 30ab7d1495a5..4b909bbb1e90 100644 --- a/docs/source/user-guide/cli.md +++ b/docs/source/user-guide/cli.md @@ -260,6 +260,27 @@ STORED AS CSV LOCATION '/path/to/aggregate_test_100.csv'; ``` +## Registering Remote Data Sources + +`datafusion-cli` can read from remote locations using a variety of protocols. +For example to read from a remote parquet file via HTTP(S) you can use the following: + +```sql +CREATE EXTERNAL TABLE hits +STORED AS PARQUET +LOCATION 'https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_1.parquet'; +``` + +```sql +❯ select count(*) from hits; ++----------+ +| COUNT(*) | ++----------+ +| 1000000 | ++----------+ +1 row in set. Query took 0.344 seconds. +``` + ## Registering S3 Data Sources [AWS S3](https://aws.amazon.com/s3/) data sources can be registered by executing a `CREATE EXTERNAL TABLE` SQL statement.