23 commits
13d3a62  Clean also global_metadata_cache on discovering a URL is now changed (carlopi, Sep 25, 2025)
140e19a  Run also slow tests, and avoid skipping HTTP-connected errors (carlopi, Sep 25, 2025)
b85a1c7  Bump duckdb back to v1.4.0 (carlopi, Sep 25, 2025)
08d99ff  Merge pull request #131 from carlopi/cleanup_global_cache (carlopi, Sep 25, 2025)
6ed3c2f  Fixup from s3 to https for shakespeare.parquet (carlopi, Sep 25, 2025)
0b20ee9  test/sql/secrets/create_secret_r2.test: move to generic error failure (carlopi, Sep 25, 2025)
078ef4b  More shakespeare to blobs (carlopi, Sep 25, 2025)
301f6a8  Add TODOs (carlopi, Sep 25, 2025)
3120aaf  Merge pull request #137 from carlopi/duckdb_to_140 (carlopi, Sep 26, 2025)
0518838  Merge pull request #136 from carlopi/avoid_skipping_on_http_errors (carlopi, Sep 26, 2025)
9a85bb2  apply patches from duckdb v1.4-andium (samansmink, Oct 2, 2025)
ccbc257  bump submodule (samansmink, Oct 3, 2025)
8356a90  Merge pull request #144 from samansmink/apply-patches-92f6c8d958394d7… (samansmink, Oct 3, 2025)
bd7e030  check return value of openssl rand_bytes (samansmink, Nov 3, 2025)
b80c680  Merge pull request #155 from samansmink/check_random_result (samansmink, Nov 4, 2025)
4ea1541  Merge pull request #156 from duckdb/main (samansmink, Nov 7, 2025)
1add891  Fixup check on RAND_bytes (carlopi, Nov 10, 2025)
041a782  Merge pull request #159 from carlopi/fix_rand_bytes_check (samansmink, Nov 10, 2025)
b3e1c54  Apply patches around Initialize, bump duckdb (carlopi, Nov 12, 2025)
19e5adf  Apply disabling_mbedtls_encrypt.patch (carlopi, Nov 12, 2025)
2a8dea7  Apply disable_failing_tests.patch (carlopi, Nov 12, 2025)
6b0e375  Bump CI to v1.4.2 (carlopi, Nov 12, 2025)
6c187d8  Merge pull request #164 from carlopi/absorb_patches (carlopi, Nov 13, 2025)
2 changes: 1 addition & 1 deletion .github/workflows/IntegrationTests.yml
@@ -85,4 +85,4 @@ jobs:
run: |
source ./scripts/run_s3_test_server.sh
source ./scripts/set_s3_test_server_variables.sh
make test
./build/release/test/unittest "*" --skip-error-messages "[]"
4 changes: 2 additions & 2 deletions .github/workflows/MainDistributionPipeline.yml
@@ -17,7 +17,7 @@ jobs:
uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main
with:
extension_name: httpfs
duckdb_version: v1.4.0
duckdb_version: v1.4.2
ci_tools_version: main


@@ -28,7 +28,7 @@ jobs:
secrets: inherit
with:
extension_name: httpfs
duckdb_version: v1.4.0
duckdb_version: v1.4.2
ci_tools_version: main
deploy_latest: ${{ startsWith(github.ref, 'refs/heads/v') }}
deploy_versioned: ${{ startsWith(github.ref, 'refs/heads/v') || github.ref == 'refs/heads/main' }}
2 changes: 1 addition & 1 deletion duckdb
Submodule duckdb updated 940 files
6 changes: 4 additions & 2 deletions src/crypto.cpp
@@ -80,8 +80,10 @@ const EVP_CIPHER *AESStateSSL::GetCipher(idx_t key_len) {
}

void AESStateSSL::GenerateRandomData(data_ptr_t data, idx_t len) {
// generate random bytes for nonce
RAND_bytes(data, len);
auto res = RAND_bytes(data, len);
if (res != 1) {
throw duckdb::InternalException("Failed to generate random data from RAND_bytes");
}
}

void AESStateSSL::InitializeEncryption(const_data_ptr_t iv, idx_t iv_len, const_data_ptr_t key, idx_t key_len_p,
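For background on the hunk above: OpenSSL's RAND_bytes reports failure through its return value (1 on success, 0 or -1 otherwise) rather than by raising an error, so an unchecked call can silently leave the nonce buffer unfilled. A minimal standalone sketch of the same checking pattern, with an illustrative function name rather than the extension's:

```cpp
#include <openssl/rand.h>

#include <cstddef>
#include <stdexcept>

// Fill `len` bytes of `data` with cryptographically secure random bytes,
// failing loudly instead of continuing with an unfilled buffer.
void FillRandom(unsigned char *data, std::size_t len) {
	if (RAND_bytes(data, static_cast<int>(len)) != 1) {
		throw std::runtime_error("RAND_bytes failed to generate random data");
	}
}
```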
16 changes: 11 additions & 5 deletions src/httpfs.cpp
@@ -266,13 +266,19 @@ unique_ptr<HTTPResponse> HTTPFileSystem::GetRangeRequest(FileHandle &handle, str
string responseEtag = response.GetHeaderValue("ETag");

if (!responseEtag.empty() && responseEtag != hfh.etag) {
if (global_metadata_cache) {
global_metadata_cache->Erase(handle.path);
}
throw HTTPException(
response,
"ETag was initially %s and now it returned %s, this likely means the remote file has "
"changed.\nTry to restart the read or close the file-handle and read the file again (e.g. "
"`DETACH` in the file is a database file).\nYou can disable checking etags via `SET "
"ETag on reading file \"%s\" was initially %s and now it returned %s, this likely means "
"the "
"remote file has "
"changed.\nFor parquet or similar single table sources, consider retrying the query, for "
"persistent FileHandles such as databases consider `DETACH` and re-`ATTACH` "
"\nYou can disable checking etags via `SET "
"unsafe_disable_etag_checks = true;`",
hfh.etag, response.GetHeaderValue("ETag"));
handle.path, hfh.etag, response.GetHeaderValue("ETag"));
}
}

@@ -723,7 +729,7 @@ void HTTPFileHandle::LoadFileInfo() {
return;
} else {
// HEAD request fail, use Range request for another try (read only one byte)
if (flags.OpenForReading() && res->status != HTTPStatusCode::NotFound_404) {
if (flags.OpenForReading() && res->status != HTTPStatusCode::NotFound_404 && res->status != HTTPStatusCode::MovedPermanently_301) {
auto range_res = hfs.GetRangeRequest(*this, path, {}, 0, nullptr, 2);
if (range_res->status != HTTPStatusCode::PartialContent_206 &&
range_res->status != HTTPStatusCode::Accepted_202 && range_res->status != HTTPStatusCode::OK_200) {
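Taken together, the two hunks above make a mid-read change of the remote file both detectable and recoverable: an ETag mismatch now also evicts the stale entry from the global metadata cache, so a later open re-fetches fresh metadata, and the HEAD-failure fallback no longer retries with a ranged GET on a 301, where repeating the same URL cannot help. (Per the message text, the check itself can still be turned off with SET unsafe_disable_etag_checks = true;.) A simplified, self-contained sketch of the evict-then-error pattern, with stand-in types and names since the surrounding classes are not shown in the diff:

```cpp
#include <stdexcept>
#include <string>
#include <unordered_set>

// Stand-in for the global metadata cache (illustrative only).
struct MetadataCache {
	std::unordered_set<std::string> entries;
	void Erase(const std::string &path) { entries.erase(path); }
};

// Compare the ETag recorded at open time with the one just returned; on a
// mismatch, drop the cached metadata for this path and raise an error.
void ValidateETag(MetadataCache *cache, const std::string &path,
                  const std::string &handle_etag, const std::string &response_etag) {
	if (!response_etag.empty() && response_etag != handle_etag) {
		if (cache) {
			cache->Erase(path); // evict the stale entry so the next open starts clean
		}
		throw std::runtime_error("remote file \"" + path + "\" changed while reading (ETag " +
		                         handle_etag + " -> " + response_etag + ")");
	}
}
```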
5 changes: 5 additions & 0 deletions src/httpfs_curl_client.cpp
@@ -115,6 +115,11 @@ static idx_t httpfs_client_count = 0;
class HTTPFSCurlClient : public HTTPClient {
public:
HTTPFSCurlClient(HTTPFSParams &http_params, const string &proto_host_port) {
// FIXME: proto_host_port is not used
Initialize(http_params);
}
void Initialize(HTTPParams &http_p) override {
HTTPFSParams &http_params = (HTTPFSParams&)http_p;
auto bearer_token = "";
if (!http_params.bearer_token.empty()) {
bearer_token = http_params.bearer_token.c_str();
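Both HTTP clients (this curl-based one and the httplib-based one further down) receive the same refactor: the constructor now routes all parameter handling through a virtual Initialize(HTTPParams &) hook, so an existing client object can later be re-configured with fresh parameters instead of being reconstructed. A minimal sketch of that shape under hypothetical names, not the extension's actual classes:

```cpp
#include <string>

// Hypothetical stand-ins for HTTPParams and the client hierarchy.
struct Params {
	bool follow_location = true;
	std::string ca_cert_file;
};

class Client {
public:
	virtual ~Client() = default;
	virtual void Initialize(Params &p) = 0; // can be called again later to re-configure
};

class ConcreteClient : public Client {
public:
	explicit ConcreteClient(Params &p) {
		// Delegating from the constructor is safe here because the override lives in
		// this same class; virtual dispatch never reaches classes derived further down.
		Initialize(p);
	}
	void Initialize(Params &p) override {
		follow_location = p.follow_location; // apply the settings carried by the params object
		ca_cert_file = p.ca_cert_file;
	}

private:
	bool follow_location = false;
	std::string ca_cert_file;
};
```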
2 changes: 1 addition & 1 deletion src/httpfs_extension.cpp
@@ -70,7 +70,7 @@ static void LoadInternal(ExtensionLoader &loader) {
config.AddExtensionOption("ca_cert_file", "Path to a custom certificate file for self-signed certificates.",
LogicalType::VARCHAR, Value(""));
// Global S3 config
config.AddExtensionOption("s3_region", "S3 Region", LogicalType::VARCHAR, Value("us-east-1"));
config.AddExtensionOption("s3_region", "S3 Region", LogicalType::VARCHAR);
config.AddExtensionOption("s3_access_key_id", "S3 Access Key ID", LogicalType::VARCHAR);
config.AddExtensionOption("s3_secret_access_key", "S3 Access Key", LogicalType::VARCHAR);
config.AddExtensionOption("s3_session_token", "S3 Session Token", LogicalType::VARCHAR);
4 changes: 4 additions & 0 deletions src/httpfs_httplib_client.cpp
@@ -9,6 +9,10 @@ class HTTPFSClient : public HTTPClient {
public:
HTTPFSClient(HTTPFSParams &http_params, const string &proto_host_port) {
client = make_uniq<duckdb_httplib_openssl::Client>(proto_host_port);
Initialize(http_params);
}
void Initialize(HTTPParams &http_p) override {
HTTPFSParams &http_params = (HTTPFSParams&)http_p;
client->set_follow_location(http_params.follow_location);
client->set_keep_alive(http_params.keep_alive);
if (!http_params.ca_cert_file.empty()) {
2 changes: 1 addition & 1 deletion src/include/s3fs.hpp
@@ -231,7 +231,7 @@ class S3FileSystem : public HTTPFileSystem {
return true;
}

static string GetS3BadRequestError(S3AuthParams &s3_auth_params);
static string GetS3BadRequestError(S3AuthParams &s3_auth_params, string correct_region = "");
static string GetS3AuthError(S3AuthParams &s3_auth_params);
static string GetGCSAuthError(S3AuthParams &s3_auth_params);
static HTTPException GetS3Error(S3AuthParams &s3_auth_params, const HTTPResponse &response, const string &url);
18 changes: 13 additions & 5 deletions src/s3fs.cpp
@@ -872,6 +872,7 @@ void S3FileHandle::Initialize(optional_ptr<FileOpener> opener) {
ErrorData error(ex);
bool refreshed_secret = false;
if (error.Type() == ExceptionType::IO || error.Type() == ExceptionType::HTTP) {
// legacy endpoint (no region) returns 400
auto context = opener->TryGetClientContext();
if (context) {
auto transaction = CatalogTransaction::GetSystemCatalogTransaction(*context);
@@ -887,9 +888,13 @@ void S3FileHandle::Initialize(optional_ptr<FileOpener> opener) {
auto &extra_info = error.ExtraInfo();
auto entry = extra_info.find("status_code");
if (entry != extra_info.end()) {
if (entry->second == "400") {
// 400: BAD REQUEST
auto extra_text = S3FileSystem::GetS3BadRequestError(auth_params);
if (entry->second == "301" || entry->second == "400") {
auto new_region = extra_info.find("header_x-amz-bucket-region");
string correct_region = "";
if (new_region != extra_info.end()) {
correct_region = new_region->second;
}
auto extra_text = S3FileSystem::GetS3BadRequestError(auth_params, correct_region);
throw Exception(error.Type(), error.RawMessage() + extra_text, extra_info);
}
if (entry->second == "403") {
@@ -1138,12 +1143,15 @@ bool S3FileSystem::ListFiles(const string &directory, const std::function<void(c
return true;
}

string S3FileSystem::GetS3BadRequestError(S3AuthParams &s3_auth_params) {
string S3FileSystem::GetS3BadRequestError(S3AuthParams &s3_auth_params, string correct_region) {
string extra_text = "\n\nBad Request - this can be caused by the S3 region being set incorrectly.";
if (s3_auth_params.region.empty()) {
extra_text += "\n* No region is provided.";
} else {
extra_text += "\n* Provided region is \"" + s3_auth_params.region + "\"";
extra_text += "\n* Provided region is: \"" + s3_auth_params.region + "\"";
}
if (!correct_region.empty()) {
extra_text += "\n* Correct region is: \"" + correct_region + "\"";
}
return extra_text;
}
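Background for the hunks above: when a request reaches the wrong regional endpoint, S3 answers with a 301 (and, per the comment added in the first hunk, legacy no-region endpoints answer 400) and includes the bucket's actual region in the x-amz-bucket-region response header. The PR threads that header value through the error's extra info into GetS3BadRequestError, so the message can name the correct region rather than only suggesting that the region may be misconfigured. A small self-contained sketch of assembling such a hint, using a hypothetical helper rather than the extension's code:

```cpp
#include <map>
#include <string>

// Build the "bad request" hint text from the configured region and, when the
// response carried an x-amz-bucket-region header, the region it reported.
std::string BuildRegionHint(const std::string &configured_region,
                            const std::map<std::string, std::string> &extra_info) {
	std::string text = "\n\nBad Request - this can be caused by the S3 region being set incorrectly.";
	if (configured_region.empty()) {
		text += "\n* No region is provided.";
	} else {
		text += "\n* Provided region is: \"" + configured_region + "\"";
	}
	auto it = extra_info.find("header_x-amz-bucket-region");
	if (it != extra_info.end() && !it->second.empty()) {
		text += "\n* Correct region is: \"" + it->second + "\"";
	}
	return text;
}
```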
3 changes: 3 additions & 0 deletions test/extension/duckdb_extension_settings.test
@@ -2,6 +2,9 @@
# description: settings for extensions
# group: [extension]

# TODO: move back to duckdb/duckdb
mode skip

require httpfs

statement ok
3 changes: 3 additions & 0 deletions test/sql/copy/csv/parallel/test_parallel_csv.test
@@ -2,6 +2,9 @@
# description: Test parallel read CSV function on ghub bugs
# group: [parallel]

# TODO: figure out where that bucket went
mode skip

require httpfs

query II
22 changes: 13 additions & 9 deletions test/sql/copy/csv/test_csv_remote.test
@@ -7,15 +7,26 @@ require httpfs
statement ok
PRAGMA enable_verification

# Test load from url with query string
query IIIIIIIIIIII
FROM sniff_csv('https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv?v=1')
----
, " (empty) \n (empty) 0 0 [{'name': column00, 'type': BIGINT}, {'name': column01, 'type': VARCHAR}, {'name': column02, 'type': BIGINT}, {'name': column03, 'type': BIGINT}, {'name': column04, 'type': BIGINT}, {'name': column05, 'type': BIGINT}, {'name': column06, 'type': BIGINT}, {'name': column07, 'type': VARCHAR}, {'name': column08, 'type': VARCHAR}, {'name': column09, 'type': VARCHAR}, {'name': column10, 'type': VARCHAR}, {'name': column11, 'type': BIGINT}, {'name': column12, 'type': BIGINT}, {'name': column13, 'type': BIGINT}, {'name': column14, 'type': VARCHAR}, {'name': column15, 'type': VARCHAR}, {'name': column16, 'type': VARCHAR}, {'name': column17, 'type': BIGINT}] NULL NULL NULL FROM read_csv('https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv?v=1', auto_detect=false, delim=',', quote='"', escape='', new_line='\n', skip=0, comment='', header=false, columns={'column00': 'BIGINT', 'column01': 'VARCHAR', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'BIGINT', 'column06': 'BIGINT', 'column07': 'VARCHAR', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'VARCHAR', 'column11': 'BIGINT', 'column12': 'BIGINT', 'column13': 'BIGINT', 'column14': 'VARCHAR', 'column15': 'VARCHAR', 'column16': 'VARCHAR', 'column17': 'BIGINT'});


# This test abuses the LOCAL_EXTENSION_REPO env to make sure tests are only run when running extension tests
# in duckdb/duckdb. Otherwise you need to pass a data dir when executing the tests.

require-env LOCAL_EXTENSION_REPO

# regular csv file
query ITTTIITITTIIII nosort webpagecsv
SELECT * FROM read_csv_auto('duckdb/data/csv/real/web_page.csv') ORDER BY 1;
SELECT * FROM read_csv_auto('data/csv/real/web_page.csv') ORDER BY 1;
----

# file with gzip
query IIIIIIIIIIIIIII nosort lineitemcsv
SELECT * FROM read_csv_auto('duckdb/data/csv/lineitem1k.tbl.gz') ORDER BY ALL;
SELECT * FROM read_csv_auto('data/csv/lineitem1k.tbl.gz') ORDER BY ALL;
----

query ITTTIITITTIIII nosort webpagecsv
@@ -25,10 +36,3 @@ SELECT * FROM read_csv_auto('https://raw.githubusercontent.com/duckdb/duckdb/mai
query IIIIIIIIIIIIIII nosort lineitemcsv
select * from read_csv_auto('https://raw.githubusercontent.com/duckdb/duckdb/main/data/csv/lineitem1k.tbl.gz') ORDER BY ALL;
----


# Test load from url with query string
query IIIIIIIIIIII
FROM sniff_csv('https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv?v=1')
----
, " (empty) \n (empty) 0 0 [{'name': column00, 'type': BIGINT}, {'name': column01, 'type': VARCHAR}, {'name': column02, 'type': BIGINT}, {'name': column03, 'type': BIGINT}, {'name': column04, 'type': BIGINT}, {'name': column05, 'type': BIGINT}, {'name': column06, 'type': BIGINT}, {'name': column07, 'type': VARCHAR}, {'name': column08, 'type': VARCHAR}, {'name': column09, 'type': VARCHAR}, {'name': column10, 'type': VARCHAR}, {'name': column11, 'type': BIGINT}, {'name': column12, 'type': BIGINT}, {'name': column13, 'type': BIGINT}, {'name': column14, 'type': VARCHAR}, {'name': column15, 'type': VARCHAR}, {'name': column16, 'type': VARCHAR}, {'name': column17, 'type': BIGINT}] NULL NULL NULL FROM read_csv('https://github.com/duckdb/duckdb/raw/main/data/csv/customer.csv?v=1', auto_detect=false, delim=',', quote='"', escape='', new_line='\n', skip=0, comment='', header=false, columns={'column00': 'BIGINT', 'column01': 'VARCHAR', 'column02': 'BIGINT', 'column03': 'BIGINT', 'column04': 'BIGINT', 'column05': 'BIGINT', 'column06': 'BIGINT', 'column07': 'VARCHAR', 'column08': 'VARCHAR', 'column09': 'VARCHAR', 'column10': 'VARCHAR', 'column11': 'BIGINT', 'column12': 'BIGINT', 'column13': 'BIGINT', 'column14': 'VARCHAR', 'column15': 'VARCHAR', 'column16': 'VARCHAR', 'column17': 'BIGINT'});
10 changes: 0 additions & 10 deletions test/sql/copy/parquet/parquet_encryption_mbedtls_openssl.test
@@ -39,14 +39,4 @@ SELECT * FROM read_parquet('__TEST_DIR__/encrypted${key_len}_openssl.parquet', e
----
42

# write files with default mbedtls
statement ok
COPY (SELECT 42 i) to '__TEST_DIR__/encrypted${key_len}_mbedtls.parquet' (ENCRYPTION_CONFIG {footer_key: 'key${key_len}'}, DEBUG_USE_OPENSSL false)

# read mbedtls encrypted files using OpenSSL
query I
SELECT * FROM read_parquet('__TEST_DIR__/encrypted${key_len}_mbedtls.parquet', encryption_config={footer_key: 'key${key_len}'}, debug_use_openssl=true)
----
42

endloop
12 changes: 10 additions & 2 deletions test/sql/copy/s3/url_encode.test
@@ -132,12 +132,20 @@ set s3_endpoint='';
statement error
SELECT * FROM 's3://test-bucket/whatever.parquet';
----
<REGEX>:.*Unknown error for HTTP HEAD to 'http://test-bucket.s3.eu-west-1.amazonaws.com/whatever.parquet'.*
<REGEX>:.*HTTP Error: Unable to connect to URL .*http://test-bucket.s3.eu-west-1.amazonaws.com/whatever.parquet.*: 301 .Moved Permanently..*
.*
.*Bad Request - this can be caused by the S3 region being set incorrectly.*
.*Provided region is: .eu-west-1.*
.*Correct region is: .us-east-1.*

statement error
SELECT * FROM 'r2://test-bucket/whatever.parquet';
----
<REGEX>:.*Unknown error for HTTP HEAD to 'http://test-bucket.s3.eu-west-1.amazonaws.com/whatever.parquet'.*
<REGEX>:.*HTTP Error: Unable to connect to URL .*http://test-bucket.s3.eu-west-1.amazonaws.com/whatever.parquet.*: 301 .Moved Permanently..*
.*
.*Bad Request - this can be caused by the S3 region being set incorrectly.*
.*Provided region is: .eu-west-1.*
.*Correct region is: .us-east-1.*

statement error
SELECT * FROM 'gcs://test-bucket/whatever.parquet';
16 changes: 16 additions & 0 deletions test/sql/crypto/test_openssl_crypto.test
@@ -0,0 +1,16 @@
# name: test/sql/crypto/test_openssl_crypto.test
# description: Test the openssl based crypto util
# group: [crypto]

require httpfs

statement ok
ATTACH '__TEST_DIR__/test_write_only.db' as enc (ENCRYPTION_KEY 'abcde', ENCRYPTION_CIPHER 'GCM');

statement ok
CREATE TABLE enc.test AS SELECT 1 as a;

query I
FROM enc.test
----
1
2 changes: 1 addition & 1 deletion test/sql/secrets/create_secret_r2.test
@@ -30,7 +30,7 @@ __default_r2 r2 config ['r2://']
statement error
FROM 's3://test-bucket/test.csv'
----
<REGEX>:.*HTTP Error.*HTTP GET error on.*
<REGEX>:.*HTTP Error.*

# Account ID is only for R2, trying to set this for S3 will fail
statement error
2 changes: 2 additions & 0 deletions test/sql/secrets/secret_types_function.test
@@ -2,6 +2,8 @@
# description: Test duckdb_secret_types function
# group: [secrets]

mode skip

query III
FROM duckdb_secret_types() WHERE type IN ['s3', 'r2', 'gcs', 'http'] ORDER BY type
----
@@ -8,11 +8,23 @@ require httpfs

# first query caches the data
statement ok
from 's3://duckdb-blobs/data/shakespeare.parquet';

from 'https://blobs.duckdb.org/data/shakespeare.parquet';

# second query should only have a head request, no gets
query II
explain analyze from 's3://duckdb-blobs/data/shakespeare.parquet';
explain analyze from 'https://blobs.duckdb.org/data/shakespeare.parquet';
----
analyzed_plan <REGEX>:.*GET: 0.*

statement ok
SET enable_http_metadata_cache = true;

# first query saves the metadata (and data, but that was already there)
statement ok
from 'https://blobs.duckdb.org/data/shakespeare.parquet';

# second query should do no HEAD and no GET
query II
explain analyze from 'https://blobs.duckdb.org/data/shakespeare.parquet';
----
analyzed_plan <REGEX>:.*HEAD: 0.*
@@ -8,18 +8,18 @@ require httpfs

# first read_blob should do 1 GET
query II
explain analyze from read_blob('s3://duckdb-blobs/data/shakespeare.parquet');
explain analyze from read_blob('https://blobs.duckdb.org/data/shakespeare.parquet');
----
analyzed_plan <REGEX>:.*GET: 1.*

# second one should do 0
query II
explain analyze from read_blob('s3://duckdb-blobs/data/shakespeare.parquet');
explain analyze from read_blob('https://blobs.duckdb.org/data/shakespeare.parquet');
----
analyzed_plan <REGEX>:.*GET: 0.*

# although the read was cached using read_blob, the parquet reader can read from cache
query II
explain analyze from 's3://duckdb-blobs/data/shakespeare.parquet';
explain analyze from 'https://blobs.duckdb.org/data/shakespeare.parquet';
----
analyzed_plan <REGEX>:.*GET: 0.*