diff --git a/.github/workflows/IntegrationTests.yml b/.github/workflows/IntegrationTests.yml
index edf1e5f..9e6c5fd 100644
--- a/.github/workflows/IntegrationTests.yml
+++ b/.github/workflows/IntegrationTests.yml
@@ -85,4 +85,4 @@ jobs:
       run: |
         source ./scripts/run_s3_test_server.sh
         source ./scripts/set_s3_test_server_variables.sh
-        make test
+        ./build/release/test/unittest "*" --skip-error-messages "[]"
diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml
index cc02cc0..ba2e534 100644
--- a/.github/workflows/MainDistributionPipeline.yml
+++ b/.github/workflows/MainDistributionPipeline.yml
@@ -17,7 +17,7 @@ jobs:
     uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main
     with:
       extension_name: httpfs
-      duckdb_version: v1.4.0
+      duckdb_version: v1.4.2
       ci_tools_version: main
@@ -28,7 +28,7 @@ jobs:
     secrets: inherit
     with:
       extension_name: httpfs
-      duckdb_version: v1.4.0
+      duckdb_version: v1.4.2
       ci_tools_version: main
       deploy_latest: ${{ startsWith(github.ref, 'refs/heads/v') }}
       deploy_versioned: ${{ startsWith(github.ref, 'refs/heads/v') || github.ref == 'refs/heads/main' }}
diff --git a/duckdb b/duckdb
index 7d45b33..68d7555 160000
--- a/duckdb
+++ b/duckdb
@@ -1 +1 @@
-Subproject commit 7d45b33a308f787d7e09346ee76ea403bd140da5
+Subproject commit 68d7555f68bd25c1a251ccca2e6338949c33986a
diff --git a/src/crypto.cpp b/src/crypto.cpp
index 398a8f7..69fb347 100644
--- a/src/crypto.cpp
+++ b/src/crypto.cpp
@@ -80,8 +80,10 @@ const EVP_CIPHER *AESStateSSL::GetCipher(idx_t key_len) {
 }
 
 void AESStateSSL::GenerateRandomData(data_ptr_t data, idx_t len) {
-	// generate random bytes for nonce
-	RAND_bytes(data, len);
+	auto res = RAND_bytes(data, len);
+	if (res != 1) {
+		throw duckdb::InternalException("Failed to generate random data from RAND_bytes");
+	}
 }
 
 void AESStateSSL::InitializeEncryption(const_data_ptr_t iv, idx_t iv_len, const_data_ptr_t key, idx_t key_len_p,
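Note on the crypto.cpp hunk above: OpenSSL's RAND_bytes returns 1 on success and 0 or -1 when the CSPRNG cannot supply the requested bytes, so the previously ignored return value could silently leave the nonce buffer unfilled. A minimal standalone sketch of the same pattern, using plain OpenSSL and the standard library rather than duckdb's exception types; FillRandom is an illustrative name, not a function from the extension:

    // fill `data` with `len` random bytes, failing loudly if the RNG cannot
    #include <openssl/err.h>
    #include <openssl/rand.h>
    #include <stdexcept>
    #include <string>

    static void FillRandom(unsigned char *data, int len) {
    	if (RAND_bytes(data, len) != 1) {
    		// ERR_get_error() reports the oldest entry in OpenSSL's error queue
    		throw std::runtime_error("RAND_bytes failed: error code " + std::to_string(ERR_get_error()));
    	}
    }
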
" - "`DETACH` in the file is a database file).\nYou can disable checking etags via `SET " + "ETag on reading file \"%s\" was initially %s and now it returned %s, this likely means " + "the " + "remote file has " + "changed.\nFor parquet or similar single table sources, consider retrying the query, for " + "persistent FileHandles such as databases consider `DETACH` and re-`ATTACH` " + "\nYou can disable checking etags via `SET " "unsafe_disable_etag_checks = true;`", - hfh.etag, response.GetHeaderValue("ETag")); + handle.path, hfh.etag, response.GetHeaderValue("ETag")); } } @@ -723,7 +729,7 @@ void HTTPFileHandle::LoadFileInfo() { return; } else { // HEAD request fail, use Range request for another try (read only one byte) - if (flags.OpenForReading() && res->status != HTTPStatusCode::NotFound_404) { + if (flags.OpenForReading() && res->status != HTTPStatusCode::NotFound_404 && res->status != HTTPStatusCode::MovedPermanently_301) { auto range_res = hfs.GetRangeRequest(*this, path, {}, 0, nullptr, 2); if (range_res->status != HTTPStatusCode::PartialContent_206 && range_res->status != HTTPStatusCode::Accepted_202 && range_res->status != HTTPStatusCode::OK_200) { diff --git a/src/httpfs_curl_client.cpp b/src/httpfs_curl_client.cpp index 6de3c85..a7e4637 100644 --- a/src/httpfs_curl_client.cpp +++ b/src/httpfs_curl_client.cpp @@ -115,6 +115,11 @@ static idx_t httpfs_client_count = 0; class HTTPFSCurlClient : public HTTPClient { public: HTTPFSCurlClient(HTTPFSParams &http_params, const string &proto_host_port) { + // FIXME: proto_host_port is not used + Initialize(http_params); + } + void Initialize(HTTPParams &http_p) override { + HTTPFSParams &http_params = (HTTPFSParams&)http_p; auto bearer_token = ""; if (!http_params.bearer_token.empty()) { bearer_token = http_params.bearer_token.c_str(); diff --git a/src/httpfs_extension.cpp b/src/httpfs_extension.cpp index 79d9923..9070621 100644 --- a/src/httpfs_extension.cpp +++ b/src/httpfs_extension.cpp @@ -70,7 +70,7 @@ static void LoadInternal(ExtensionLoader &loader) { config.AddExtensionOption("ca_cert_file", "Path to a custom certificate file for self-signed certificates.", LogicalType::VARCHAR, Value("")); // Global S3 config - config.AddExtensionOption("s3_region", "S3 Region", LogicalType::VARCHAR, Value("us-east-1")); + config.AddExtensionOption("s3_region", "S3 Region", LogicalType::VARCHAR); config.AddExtensionOption("s3_access_key_id", "S3 Access Key ID", LogicalType::VARCHAR); config.AddExtensionOption("s3_secret_access_key", "S3 Access Key", LogicalType::VARCHAR); config.AddExtensionOption("s3_session_token", "S3 Session Token", LogicalType::VARCHAR); diff --git a/src/httpfs_httplib_client.cpp b/src/httpfs_httplib_client.cpp index 239a112..cf1b854 100644 --- a/src/httpfs_httplib_client.cpp +++ b/src/httpfs_httplib_client.cpp @@ -9,6 +9,10 @@ class HTTPFSClient : public HTTPClient { public: HTTPFSClient(HTTPFSParams &http_params, const string &proto_host_port) { client = make_uniq(proto_host_port); + Initialize(http_params); + } + void Initialize(HTTPParams &http_p) override { + HTTPFSParams &http_params = (HTTPFSParams&)http_p; client->set_follow_location(http_params.follow_location); client->set_keep_alive(http_params.keep_alive); if (!http_params.ca_cert_file.empty()) { diff --git a/src/include/s3fs.hpp b/src/include/s3fs.hpp index 525e0dd..a7e933e 100644 --- a/src/include/s3fs.hpp +++ b/src/include/s3fs.hpp @@ -231,7 +231,7 @@ class S3FileSystem : public HTTPFileSystem { return true; } - static string GetS3BadRequestError(S3AuthParams 
diff --git a/src/httpfs_curl_client.cpp b/src/httpfs_curl_client.cpp
index 6de3c85..a7e4637 100644
--- a/src/httpfs_curl_client.cpp
+++ b/src/httpfs_curl_client.cpp
@@ -115,6 +115,11 @@ static idx_t httpfs_client_count = 0;
 class HTTPFSCurlClient : public HTTPClient {
 public:
 	HTTPFSCurlClient(HTTPFSParams &http_params, const string &proto_host_port) {
+		// FIXME: proto_host_port is not used
+		Initialize(http_params);
+	}
+	void Initialize(HTTPParams &http_p) override {
+		HTTPFSParams &http_params = (HTTPFSParams&)http_p;
 		auto bearer_token = "";
 		if (!http_params.bearer_token.empty()) {
 			bearer_token = http_params.bearer_token.c_str();
diff --git a/src/httpfs_extension.cpp b/src/httpfs_extension.cpp
index 79d9923..9070621 100644
--- a/src/httpfs_extension.cpp
+++ b/src/httpfs_extension.cpp
@@ -70,7 +70,7 @@ static void LoadInternal(ExtensionLoader &loader) {
 	config.AddExtensionOption("ca_cert_file", "Path to a custom certificate file for self-signed certificates.",
 	                          LogicalType::VARCHAR, Value(""));
 	// Global S3 config
-	config.AddExtensionOption("s3_region", "S3 Region", LogicalType::VARCHAR, Value("us-east-1"));
+	config.AddExtensionOption("s3_region", "S3 Region", LogicalType::VARCHAR);
 	config.AddExtensionOption("s3_access_key_id", "S3 Access Key ID", LogicalType::VARCHAR);
 	config.AddExtensionOption("s3_secret_access_key", "S3 Access Key", LogicalType::VARCHAR);
 	config.AddExtensionOption("s3_session_token", "S3 Session Token", LogicalType::VARCHAR);
diff --git a/src/httpfs_httplib_client.cpp b/src/httpfs_httplib_client.cpp
index 239a112..cf1b854 100644
--- a/src/httpfs_httplib_client.cpp
+++ b/src/httpfs_httplib_client.cpp
@@ -9,6 +9,10 @@ class HTTPFSClient : public HTTPClient {
 public:
 	HTTPFSClient(HTTPFSParams &http_params, const string &proto_host_port) {
 		client = make_uniq<duckdb_httplib_openssl::Client>(proto_host_port);
+		Initialize(http_params);
+	}
+	void Initialize(HTTPParams &http_p) override {
+		HTTPFSParams &http_params = (HTTPFSParams&)http_p;
 		client->set_follow_location(http_params.follow_location);
 		client->set_keep_alive(http_params.keep_alive);
 		if (!http_params.ca_cert_file.empty()) {
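Both client files apply the same refactor: construction-time setup moves into a virtual Initialize(HTTPParams &) override, so an existing client object can be re-configured with fresh parameters through the base-class interface instead of being reallocated. A sketch of the shape with illustrative class names (only the downcast mirrors the diff's (HTTPFSParams&)http_p):

    #include <string>

    struct HTTPParams {
    	virtual ~HTTPParams() = default;
    };

    struct HTTPFSParams : HTTPParams {
    	std::string bearer_token;
    	bool keep_alive = true;
    };

    class HTTPClient {
    public:
    	virtual ~HTTPClient() = default;
    	virtual void Initialize(HTTPParams &params) = 0;
    };

    class Client : public HTTPClient {
    public:
    	explicit Client(HTTPFSParams &params) {
    		// the constructor only delegates; all setup lives in Initialize
    		Initialize(params);
    	}
    	void Initialize(HTTPParams &params) override {
    		// the override downcasts back to the derived params type
    		auto &fs_params = static_cast<HTTPFSParams &>(params);
    		keep_alive = fs_params.keep_alive;
    		token = fs_params.bearer_token;
    	}
    private:
    	bool keep_alive = false;
    	std::string token;
    };

One caveat of this pattern, which applies to the diff as well: a virtual called from a constructor dispatches to the class under construction, so an override in a further-derived class would not run during construction. Both clients appear to be the most-derived classes here, so the call lands on the intended Initialize.
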
diff --git a/src/include/s3fs.hpp b/src/include/s3fs.hpp
index 525e0dd..a7e933e 100644
--- a/src/include/s3fs.hpp
+++ b/src/include/s3fs.hpp
@@ -231,7 +231,7 @@ class S3FileSystem : public HTTPFileSystem {
 		return true;
 	}
 
-	static string GetS3BadRequestError(S3AuthParams &s3_auth_params);
+	static string GetS3BadRequestError(S3AuthParams &s3_auth_params, string correct_region = "");
 	static string GetS3AuthError(S3AuthParams &s3_auth_params);
 	static string GetGCSAuthError(S3AuthParams &s3_auth_params);
 	static HTTPException GetS3Error(S3AuthParams &s3_auth_params, const HTTPResponse &response, const string &url);
diff --git a/src/s3fs.cpp b/src/s3fs.cpp
index 2a92f22..b4d6c74 100644
--- a/src/s3fs.cpp
+++ b/src/s3fs.cpp
@@ -872,6 +872,7 @@ void S3FileHandle::Initialize(optional_ptr<FileOpener> opener) {
 		ErrorData error(ex);
 		bool refreshed_secret = false;
 		if (error.Type() == ExceptionType::IO || error.Type() == ExceptionType::HTTP) {
+			// legacy endpoint (no region) returns 400
 			auto context = opener->TryGetClientContext();
 			if (context) {
 				auto transaction = CatalogTransaction::GetSystemCatalogTransaction(*context);
@@ -887,9 +888,13 @@
 		auto &extra_info = error.ExtraInfo();
 		auto entry = extra_info.find("status_code");
 		if (entry != extra_info.end()) {
-			if (entry->second == "400") {
-				// 400: BAD REQUEST
-				auto extra_text = S3FileSystem::GetS3BadRequestError(auth_params);
+			if (entry->second == "301" || entry->second == "400") {
+				auto new_region = extra_info.find("header_x-amz-bucket-region");
+				string correct_region = "";
+				if (new_region != extra_info.end()) {
+					correct_region = new_region->second;
+				}
+				auto extra_text = S3FileSystem::GetS3BadRequestError(auth_params, correct_region);
 				throw Exception(error.Type(), error.RawMessage() + extra_text, extra_info);
 			}
 			if (entry->second == "403") {
@@ -1138,12 +1143,15 @@ bool S3FileSystem::ListFiles(const string &directory, const std::function
-:.*Unknown error for HTTP HEAD to 'http://test-bucket.s3.eu-west-1.amazonaws.com/whatever.parquet'.*
+:.*HTTP Error: Unable to connect to URL .*http://test-bucket.s3.eu-west-1.amazonaws.com/whatever.parquet.*: 301 .Moved Permanently..*
+.*
+.*Bad Request - this can be caused by the S3 region being set incorrectly.*
+.*Provided region is: .eu-west-1.*
+.*Correct region is: .us-east-1.*
 
 statement error
 SELECT * FROM 'r2://test-bucket/whatever.parquet';
 ----
-:.*Unknown error for HTTP HEAD to 'http://test-bucket.s3.eu-west-1.amazonaws.com/whatever.parquet'.*
+:.*HTTP Error: Unable to connect to URL .*http://test-bucket.s3.eu-west-1.amazonaws.com/whatever.parquet.*: 301 .Moved Permanently..*
+.*
+.*Bad Request - this can be caused by the S3 region being set incorrectly.*
+.*Provided region is: .eu-west-1.*
+.*Correct region is: .us-east-1.*
 
 statement error
 SELECT * FROM 'gcs://test-bucket/whatever.parquet';
diff --git a/test/sql/crypto/test_openssl_crypto.test b/test/sql/crypto/test_openssl_crypto.test
new file mode 100644
index 0000000..3af3924
--- /dev/null
+++ b/test/sql/crypto/test_openssl_crypto.test
@@ -0,0 +1,16 @@
+# name: test/sql/crypto/test_openssl_crypto.test
+# description: Test the OpenSSL-based crypto util
+# group: [crypto]
+
+require httpfs
+
+statement ok
+ATTACH '__TEST_DIR__/test_write_only.db' as enc (ENCRYPTION_KEY 'abcde', ENCRYPTION_CIPHER 'GCM');
+
+statement ok
+CREATE TABLE enc.test AS SELECT 1 as a;
+
+query I
+FROM enc.test
+----
+1
\ No newline at end of file
diff --git a/test/sql/secrets/create_secret_r2.test b/test/sql/secrets/create_secret_r2.test
index 972fe21..d66e00c 100644
--- a/test/sql/secrets/create_secret_r2.test
+++ b/test/sql/secrets/create_secret_r2.test
@@ -30,7 +30,7 @@ __default_r2	r2	config	['r2://']
 statement error
 FROM 's3://test-bucket/test.csv'
 ----
-:.*HTTP Error.*HTTP GET error on.*
+:.*HTTP Error.*
 
 # Account ID is only for R2, trying to set this for S3 will fail
 statement error
diff --git a/test/sql/secrets/secret_types_function.test b/test/sql/secrets/secret_types_function.test
index c1fd676..220fbb7 100644
--- a/test/sql/secrets/secret_types_function.test
+++ b/test/sql/secrets/secret_types_function.test
@@ -2,6 +2,8 @@
 # description: Test duckdb_secret_types function
 # group: [secrets]
 
+mode skip
+
 query III
 FROM duckdb_secret_types() WHERE type IN ['s3', 'r2', 'gcs', 'http'] ORDER BY type
 ----
diff --git a/test/sql/storage/external_file_cache/external_file_cache_httpfs.test b/test/sql/storage/external_file_cache/external_file_cache_httpfs.test
index 2efa361..0b1e70a 100644
--- a/test/sql/storage/external_file_cache/external_file_cache_httpfs.test
+++ b/test/sql/storage/external_file_cache/external_file_cache_httpfs.test
@@ -8,11 +8,23 @@ require httpfs
 
 # first query caches the data
 statement ok
-from 's3://duckdb-blobs/data/shakespeare.parquet';
-
+from 'https://blobs.duckdb.org/data/shakespeare.parquet';
 
 # second query should only have a head request, no gets
 query II
-explain analyze from 's3://duckdb-blobs/data/shakespeare.parquet';
+explain analyze from 'https://blobs.duckdb.org/data/shakespeare.parquet';
 ----
 analyzed_plan	:.*GET: 0.*
+
+statement ok
+SET enable_http_metadata_cache = true;
+
+# first query saves the metadata (and data, but that was already there)
+statement ok
+from 'https://blobs.duckdb.org/data/shakespeare.parquet';
+
+# second query should do no HEAD and no GET
+query II
+explain analyze from 'https://blobs.duckdb.org/data/shakespeare.parquet';
+----
+analyzed_plan	:.*HEAD: 0.*
diff --git a/test/sql/storage/external_file_cache/external_file_cache_read_blob.test_slow b/test/sql/storage/external_file_cache/external_file_cache_read_blob.test_slow
index 9edf162..f5e02d0 100644
--- a/test/sql/storage/external_file_cache/external_file_cache_read_blob.test_slow
+++ b/test/sql/storage/external_file_cache/external_file_cache_read_blob.test_slow
@@ -8,18 +8,18 @@ require httpfs
 
 # first read_blob should do 1 GET
 query II
-explain analyze from read_blob('s3://duckdb-blobs/data/shakespeare.parquet');
+explain analyze from read_blob('https://blobs.duckdb.org/data/shakespeare.parquet');
 ----
 analyzed_plan	:.*GET: 1.*
 
 # second one should do 0
 query II
-explain analyze from read_blob('s3://duckdb-blobs/data/shakespeare.parquet');
+explain analyze from read_blob('https://blobs.duckdb.org/data/shakespeare.parquet');
 ----
 analyzed_plan	:.*GET: 0.*
 
 # although the read was cached using read_blob, the parquet reader can read from cache
 query II
-explain analyze from 's3://duckdb-blobs/data/shakespeare.parquet';
+explain analyze from 'https://blobs.duckdb.org/data/shakespeare.parquet';
 ----
 analyzed_plan	:.*GET: 0.*
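As the s3fs.cpp hunk and the test expectations above show, hitting a bucket through the wrong region now surfaces the bucket's actual region: S3 answers 301 (or 400 on the legacy endpoint) and includes an x-amz-bucket-region response header, which the HTTP layer exposes under the "header_x-amz-bucket-region" key of the error's extra info. A hedged sketch of how such a message could be assembled; BuildBadRequestError and the exact formatting are illustrative, the real logic lives in S3FileSystem::GetS3BadRequestError:

    #include <map>
    #include <string>

    static std::string BuildBadRequestError(const std::string &provided_region,
                                            const std::map<std::string, std::string> &extra_info) {
    	std::string msg = "\n\nBad Request - this can be caused by the S3 region being set incorrectly.";
    	msg += "\n* Provided region is: '" + provided_region + "'";
    	// the header is only present when S3 redirected us to another region
    	auto it = extra_info.find("header_x-amz-bucket-region");
    	if (it != extra_info.end() && !it->second.empty()) {
    		msg += "\n* Correct region is: '" + it->second + "'";
    	}
    	return msg;
    }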