From 29052b3ec54ad520c8f466674044bbc3141dfc13 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 14 May 2025 14:51:24 +0200 Subject: [PATCH 01/44] include open ssl --- extension/httpfs/crypto.cpp | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/extension/httpfs/crypto.cpp b/extension/httpfs/crypto.cpp index 04bd795..72b3835 100644 --- a/extension/httpfs/crypto.cpp +++ b/extension/httpfs/crypto.cpp @@ -5,7 +5,16 @@ #include #define CPPHTTPLIB_OPENSSL_SUPPORT -#include "httplib.hpp" + +#include +#include +#include +#include +#include + +#if defined(_WIN32) && defined(OPENSSL_USE_APPLINK) +#include +#endif namespace duckdb { From 180bcb69e6eae0c9d5ac8a5e08fa7372de693310 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 15 May 2025 14:29:27 +0200 Subject: [PATCH 02/44] attempt to overwrite get --- extension/httpfs/httpfs_client.cpp | 169 ++++++++++++++++++++++++++--- 1 file changed, 151 insertions(+), 18 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 7a779ef..735e391 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -2,10 +2,103 @@ #include "http_state.hpp" #define CPPHTTPLIB_OPENSSL_SUPPORT +#include +#include #include "httplib.hpp" namespace duckdb { +// we statically compile in libcurl, which means the cert file location of the build machine is the +// place curl will look. But not every distro has this file in the same location, so we search a +// number of common locations and use the first one we find. +static std::string certFileLocations[] = { + // Arch, Debian-based, Gentoo + "/etc/ssl/certs/ca-certificates.crt", + // RedHat 7 based + "/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem", + // Redhat 6 based + "/etc/pki/tls/certs/ca-bundle.crt", + // OpenSUSE + "/etc/ssl/ca-bundle.pem", + // Alpine + "/etc/ssl/cert.pem"}; + +//! Grab the first path that exists, from a list of well-known locations +static std::string SelectCURLCertPath() { + for (std::string &caFile : certFileLocations) { + struct stat buf; + if (stat(caFile.c_str(), &buf) == 0) { + return caFile; + } + } + return std::string(); +} + +static std::string cert_path = SelectCURLCertPath(); + +static size_t RequestWriteCallback(void *contents, size_t size, size_t nmemb, void *userp) { + ((std::string *)userp)->append((char *)contents, size * nmemb); + return size * nmemb; +} + +class CURLRequestHeaders { +public: + CURLRequestHeaders(const vector &input) { + for (auto &header : input) { + Add(header); + } + } + ~CURLRequestHeaders() { + if (headers) { + curl_slist_free_all(headers); + } + headers = NULL; + } + operator bool() const { + return headers != NULL; + } + +public: + void Add(const string &header) { + headers = curl_slist_append(headers, header.c_str()); + } + +public: + curl_slist *headers = NULL; +}; + + +class CURLHandle { +public: + CURLHandle(const string &token, const string &cert_path) { + curl = curl_easy_init(); + if (!curl) { + throw InternalException("Failed to initialize curl"); + } + if (!token.empty()) { + curl_easy_setopt(curl, CURLOPT_XOAUTH2_BEARER, token.c_str()); + curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BEARER); + } + if (!cert_path.empty()) { + curl_easy_setopt(curl, CURLOPT_CAINFO, cert_path.c_str()); + } + } + ~CURLHandle() { + curl_easy_cleanup(curl); + } + +public: + operator CURL *() { + return curl; + } + CURLcode Execute() { + return curl_easy_perform(curl); + } + +private: + CURL *curl = NULL; +}; + class HTTPFSClient : public HTTPClient { public: HTTPFSClient(HTTPFSParams &http_params, const string &proto_host_port) { @@ -32,30 +125,56 @@ class HTTPFSClient : public HTTPClient { } } state = http_params.state; + auto bearer_token = ""; + if (!http_params.bearer_token.empty()) { + bearer_token = http_params.bearer_token.c_str(); + } + curl = make_uniq(bearer_token, SelectCURLCertPath()); + state = http_params.state; + } + + void SetLogger(HTTPLogger &logger) { + client->set_logger(logger.GetLogger()); } unique_ptr Get(GetRequestInfo &info) override { - if (state) { - state->get_count++; + auto headers = TransformHeadersForCurl(info.headers, info.params); + CURLRequestHeaders curl_headers(headers); + + CURLcode res; + string result; + { + curl_easy_setopt(*curl_handle, CURLOPT_URL, info.url.c_str()); + curl_easy_setopt(*curl_handle, CURLOPT_WRITEFUNCTION, RequestWriteCallback); + curl_easy_setopt(*curl_handle, CURLOPT_WRITEDATA, &result); + + if (curl_headers) { + curl_easy_setopt(*curl_handle, CURLOPT_HTTPHEADER, curl_headers.headers); + } + res = curl_handle->Execute(); } - auto headers = TransformHeaders(info.headers, info.params); - if (!info.response_handler && !info.content_handler) { - return TransformResult(client->Get(info.path, headers)); - } else { - return TransformResult(client->Get( - info.path.c_str(), headers, - [&](const duckdb_httplib_openssl::Response &response) { - auto http_response = TransformResponse(response); - return info.response_handler(*http_response); - }, - [&](const char *data, size_t data_length) { - if (state) { - state->total_bytes_received += data_length; - } - return info.content_handler(const_data_ptr_cast(data), data_length); - })); + + // DUCKDB_LOG_DEBUG(context, "iceberg.Catalog.Curl.HTTPRequest", "GET %s (curl code '%s')", url, + // curl_easy_strerror(res)); + if (res != CURLcode::CURLE_OK) { + string error = curl_easy_strerror(res); + throw HTTPException(StringUtil::Format("Curl GET Request to '%s' failed with error: '%s'", url, error)); + } + uint16_t response_code = 0; + curl_easy_getinfo(*curl_handle, CURLINFO_RESPONSE_CODE, response_code); + + // TODO: replace this with better bytes received provided by curl. + if (state) { + state->total_bytes_received += sizeof(result); } + + // get the response code + auto status_code = HTTPStatusCode(response_code); + auto return_result = make_uniq(status_code); + return_result->body = result; + return return_result; } + unique_ptr Put(PutRequestInfo &info) override { if (state) { state->put_count++; @@ -117,6 +236,19 @@ class HTTPFSClient : public HTTPClient { return headers; } + duckdb_httplib_openssl::Headers TransformHeadersForCurl(const HTTPHeaders &header_map, const HTTPParams ¶ms) { + headers; + for (auto &entry : header_map) { + const std::string new_header = entry.first + "=" + entry.second; + headers.insert(new_header); + } + for (auto &entry : params.extra_headers) { + const std::string new_header = entry.first + "=" + entry.second; + headers.insert(new_header); + } + return headers; + } + unique_ptr TransformResponse(const duckdb_httplib_openssl::Response &response) { auto status_code = HTTPUtil::ToStatusCode(response.status); auto result = make_uniq(status_code); @@ -141,6 +273,7 @@ class HTTPFSClient : public HTTPClient { private: unique_ptr client; + unique_ptr curl; optional_ptr state; }; From 303c9a7b7b1cf4b842151ecea300abcea7d905d2 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 15 May 2025 15:50:26 +0200 Subject: [PATCH 03/44] compiles with curl now --- CMakeLists.txt | 10 ++++++++ extension/httpfs/httpfs_client.cpp | 37 +++++++++++++++++++----------- vcpkg.json | 3 ++- 3 files changed, 35 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 92d4547..878f6f6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,8 @@ add_extension_definitions() include_directories(extension/httpfs/include ${DUCKDB_MODULE_BASE_DIR}/third_party/httplib) + + build_static_extension( httpfs extension/httpfs/hffs.cpp @@ -38,7 +40,9 @@ if(MINGW) endif() find_package(OpenSSL REQUIRED) +find_package(CURL REQUIRED) include_directories(${OPENSSL_INCLUDE_DIR}) +include_directories(${CURL_INCLUDE_DIRS}) if(EMSCRIPTEN) else() @@ -46,6 +50,11 @@ else() ${OPENSSL_LIBRARIES}) target_link_libraries(httpfs_extension duckdb_mbedtls ${OPENSSL_LIBRARIES}) + # Link dependencies into extension + target_link_libraries(httpfs_loadable_extension ${CURL_LIBRARIES}) + target_link_libraries(httpfs_extension ${CURL_LIBRARIES}) + + if(MINGW) find_package(ZLIB) target_link_libraries(httpfs_loadable_extension ZLIB::ZLIB -lcrypt32) @@ -53,6 +62,7 @@ else() endif() endif() + install( TARGETS httpfs_extension EXPORT "${DUCKDB_EXPORT_SET}" diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 735e391..9580263 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -2,9 +2,11 @@ #include "http_state.hpp" #define CPPHTTPLIB_OPENSSL_SUPPORT + #include -#include +// #include #include "httplib.hpp" +#include "duckdb/common/exception/http_exception.hpp" namespace duckdb { @@ -23,6 +25,7 @@ static std::string certFileLocations[] = { // Alpine "/etc/ssl/cert.pem"}; + //! Grab the first path that exists, from a list of well-known locations static std::string SelectCURLCertPath() { for (std::string &caFile : certFileLocations) { @@ -43,11 +46,12 @@ static size_t RequestWriteCallback(void *contents, size_t size, size_t nmemb, vo class CURLRequestHeaders { public: - CURLRequestHeaders(const vector &input) { + CURLRequestHeaders(vector &input) { for (auto &header : input) { Add(header); } } + CURLRequestHeaders() {} ~CURLRequestHeaders() { if (headers) { curl_slist_free_all(headers); @@ -144,24 +148,24 @@ class HTTPFSClient : public HTTPClient { CURLcode res; string result; { - curl_easy_setopt(*curl_handle, CURLOPT_URL, info.url.c_str()); - curl_easy_setopt(*curl_handle, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl_handle, CURLOPT_WRITEDATA, &result); + curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); if (curl_headers) { - curl_easy_setopt(*curl_handle, CURLOPT_HTTPHEADER, curl_headers.headers); + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); } - res = curl_handle->Execute(); + res = curl->Execute(); } // DUCKDB_LOG_DEBUG(context, "iceberg.Catalog.Curl.HTTPRequest", "GET %s (curl code '%s')", url, // curl_easy_strerror(res)); if (res != CURLcode::CURLE_OK) { string error = curl_easy_strerror(res); - throw HTTPException(StringUtil::Format("Curl GET Request to '%s' failed with error: '%s'", url, error)); + throw HTTPException(StringUtil::Format("Curl GET Request to '%s' failed with error: '%s'", info.url, error)); } uint16_t response_code = 0; - curl_easy_getinfo(*curl_handle, CURLINFO_RESPONSE_CODE, response_code); + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, response_code); // TODO: replace this with better bytes received provided by curl. if (state) { @@ -236,17 +240,22 @@ class HTTPFSClient : public HTTPClient { return headers; } - duckdb_httplib_openssl::Headers TransformHeadersForCurl(const HTTPHeaders &header_map, const HTTPParams ¶ms) { - headers; + CURLRequestHeaders TransformHeadersForCurl(const HTTPHeaders &header_map, const HTTPParams ¶ms) { + std::vector headers; for (auto &entry : header_map) { const std::string new_header = entry.first + "=" + entry.second; - headers.insert(new_header); + headers.push_back(new_header); } for (auto &entry : params.extra_headers) { const std::string new_header = entry.first + "=" + entry.second; - headers.insert(new_header); + headers.push_back(new_header); } - return headers; + CURLRequestHeaders curl_headers; + for (auto &header : headers) { + curl_headers.Add(header); + } + return curl_headers; + // return CURLRequestHeaders(headers); } unique_ptr TransformResponse(const duckdb_httplib_openssl::Response &response) { diff --git a/vcpkg.json b/vcpkg.json index 3ed9a36..809e67b 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -1,5 +1,6 @@ { "dependencies": [ - "openssl" + "openssl", + "curl" ] } From b8272b19ce7ecc044d6af1df0641b5eb8d008dae Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 12:58:43 +0200 Subject: [PATCH 04/44] put head delete post get replaced --- extension/httpfs/httpfs_client.cpp | 413 ++++++++++++++++----- extension/httpfs/include/httpfs_client.hpp | 51 +++ test/sql/copy/csv/test_csv_httpfs.test | 5 + 3 files changed, 383 insertions(+), 86 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 9580263..6e6ca23 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -40,68 +40,67 @@ static std::string SelectCURLCertPath() { static std::string cert_path = SelectCURLCertPath(); static size_t RequestWriteCallback(void *contents, size_t size, size_t nmemb, void *userp) { - ((std::string *)userp)->append((char *)contents, size * nmemb); - return size * nmemb; + size_t totalSize = size * nmemb; + std::string* str = static_cast(userp); + str->append(static_cast(contents), totalSize); + return totalSize; } -class CURLRequestHeaders { -public: - CURLRequestHeaders(vector &input) { - for (auto &header : input) { - Add(header); - } - } - CURLRequestHeaders() {} - ~CURLRequestHeaders() { - if (headers) { - curl_slist_free_all(headers); +static size_t RequestHeaderCallback(void *contents, size_t size, size_t nmemb, void *userp) { + size_t totalSize = size * nmemb; + std::string header(static_cast(contents), totalSize); + HeaderCollector* header_collection = static_cast(userp); + + // Trim trailing \r\n + if (!header.empty() && header.back() == '\n') { + header.pop_back(); + if (!header.empty() && header.back() == '\r') { + header.pop_back(); } - headers = NULL; - } - operator bool() const { - return headers != NULL; } -public: - void Add(const string &header) { - headers = curl_slist_append(headers, header.c_str()); + // If header starts with HTTP/... curl has followed a redirect and we have a new Header, + // so we clear all of the current header_collection + if (header.rfind("HTTP/", 0) == 0) { + header_collection->header_collection.push_back(HTTPHeaders()); } -public: - curl_slist *headers = NULL; -}; + size_t colonPos = header.find(':'); - -class CURLHandle { -public: - CURLHandle(const string &token, const string &cert_path) { - curl = curl_easy_init(); - if (!curl) { - throw InternalException("Failed to initialize curl"); - } - if (!token.empty()) { - curl_easy_setopt(curl, CURLOPT_XOAUTH2_BEARER, token.c_str()); - curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BEARER); - } - if (!cert_path.empty()) { - curl_easy_setopt(curl, CURLOPT_CAINFO, cert_path.c_str()); + if (colonPos != std::string::npos) { + // Split the string into two parts + std::string part1 = header.substr(0, colonPos); + std::string part2 = header.substr(colonPos + 1); + if (part2.at(0) == ' ') { + part2.erase(0, 1); } + + header_collection->header_collection.back().Insert(part1, part2); } - ~CURLHandle() { - curl_easy_cleanup(curl); - } + // TODO: some headers may not follow standard response header formats. + // what to do in this case? Invalid does not mean we should abort. -public: - operator CURL *() { - return curl; + return totalSize; +} + + CURLHandle::CURLHandle(const string &token, const string &cert_path) { + curl = curl_easy_init(); + if (!curl) { + throw InternalException("Failed to initialize curl"); } - CURLcode Execute() { - return curl_easy_perform(curl); + if (!token.empty()) { + curl_easy_setopt(curl, CURLOPT_XOAUTH2_BEARER, token.c_str()); + curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BEARER); } + if (!cert_path.empty()) { + curl_easy_setopt(curl, CURLOPT_CAINFO, cert_path.c_str()); + } +} + +CURLHandle::~CURLHandle() { + curl_easy_cleanup(curl); +} -private: - CURL *curl = NULL; -}; class HTTPFSClient : public HTTPClient { public: @@ -129,12 +128,14 @@ class HTTPFSClient : public HTTPClient { } } state = http_params.state; + + // initializing curl auto bearer_token = ""; if (!http_params.bearer_token.empty()) { bearer_token = http_params.bearer_token.c_str(); } curl = make_uniq(bearer_token, SelectCURLCertPath()); - state = http_params.state; + } void SetLogger(HTTPLogger &logger) { @@ -142,15 +143,46 @@ class HTTPFSClient : public HTTPClient { } unique_ptr Get(GetRequestInfo &info) override { - auto headers = TransformHeadersForCurl(info.headers, info.params); - CURLRequestHeaders curl_headers(headers); + if (state) { + state->get_count++; + } + + // auto headers = TransformHeaders(info.headers, info.params); + // if (!info.response_handler && !info.content_handler) { + // return TransformResult(client->Get(info.path, headers)); + // } else { + // return TransformResult(client->Get( + // info.path.c_str(), headers, + // [&](const duckdb_httplib_openssl::Response &response) { + // auto http_response = TransformResponse(response); + // return info.response_handler(*http_response); + // }, + // [&](const char *data, size_t data_length) { + // if (state) { + // state->total_bytes_received += data_length; + // } + // return info.content_handler(const_data_ptr_cast(data), data_length); + // })); + // } + + auto curl_headers = TransformHeadersForCurl(info.headers, info.params); CURLcode res; - string result; + std::string result; + HeaderCollector response_header_collection; { + // If the same handle served a HEAD request, we must set NOBODY back to 0L to request content again + curl_easy_setopt(*curl, CURLOPT_NOBODY, 0L); + + // follow redirects + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + // write response data curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + // write response headers (different header collection for each redirect) + curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); if (curl_headers) { curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); @@ -165,17 +197,28 @@ class HTTPFSClient : public HTTPClient { throw HTTPException(StringUtil::Format("Curl GET Request to '%s' failed with error: '%s'", info.url, error)); } uint16_t response_code = 0; - curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, response_code); + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - // TODO: replace this with better bytes received provided by curl. + idx_t bytes_received = 0; + if (response_header_collection.header_collection.back().HasHeader("content-length")) { + bytes_received = std::stoi(response_header_collection.header_collection.back().GetHeaderValue("content-length")); + D_ASSERT(bytes_received == result.size()); + } else { + bytes_received = result.size(); + } if (state) { - state->total_bytes_received += sizeof(result); + state->total_bytes_received += bytes_received; } - // get the response code + const char* data = result.c_str(); + info.content_handler(const_data_ptr_cast(data), bytes_received); + + // Construct HTTPResponse auto status_code = HTTPStatusCode(response_code); auto return_result = make_uniq(status_code); return_result->body = result; + return_result->headers = response_header_collection.header_collection.back(); + return_result->url = info.url; return return_result; } @@ -184,25 +227,170 @@ class HTTPFSClient : public HTTPClient { state->put_count++; state->total_bytes_sent += info.buffer_in_len; } - auto headers = TransformHeaders(info.headers, info.params); - return TransformResult(client->Put(info.path, headers, const_char_ptr_cast(info.buffer_in), info.buffer_in_len, - info.content_type)); + // auto headers = TransformHeaders(info.headers, info.params); + // return TransformResult(client->Put(info.path, headers, const_char_ptr_cast(info.buffer_in), info.buffer_in_len, + // info.content_type)); + + auto curl_headers = TransformHeadersForCurl(info.headers, info.params); + // Optionally add headers directly + curl_headers.headers = curl_slist_append(curl_headers.headers, "Content-Type: application/json"); + + CURLcode res; + std::string result; + HeaderCollector response_header_collection; + + { + curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + + // Perform PUT + curl_easy_setopt(*curl, CURLOPT_CUSTOMREQUEST, "PUT"); + + // Include PUT body + curl_easy_setopt(*curl, CURLOPT_POSTFIELDS, const_char_ptr_cast(info.buffer_in)); + curl_easy_setopt(*curl, CURLOPT_POSTFIELDSIZE, info.buffer_in_len); + + // Follow redirects if needed + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + + // Capture response body + curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + + // Capture response headers + curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + + // Apply headers + if (curl_headers) { + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); + } + + // Execute the request + res = curl->Execute(); + } + + // Check response + if (res != CURLcode::CURLE_OK) { + std::string error = curl_easy_strerror(res); + throw HTTPException(StringUtil::Format("Curl PUT Request to '%s' failed with error: '%s'", info.url, error)); + } + + uint16_t response_code = 0; + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); + + // Construct HTTPResponse + auto status_code = HTTPStatusCode(response_code); + auto return_result = make_uniq(status_code); + return_result->body = ""; + return_result->headers = response_header_collection.header_collection.back(); + return_result->url = info.url; + return return_result; } unique_ptr Head(HeadRequestInfo &info) override { - if (state) { - state->head_count++; - } - auto headers = TransformHeaders(info.headers, info.params); - return TransformResult(client->Head(info.path, headers)); + + auto curl_headers = TransformHeadersForCurl(info.headers, info.params); + + CURLcode res; + std::string result; + HeaderCollector response_header_collection; + + { + // Set URL + curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + // curl_easy_setopt(*curl, CURLOPT_VERBOSE, 1L); + + // Perform HEAD request instead of GET + curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); + curl_easy_setopt(*curl, CURLOPT_HTTPGET, 0L); + + // Follow redirects + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + + // set write function to collect body — no body expected, so safe to omit + curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + + // Collect response headers (multiple header blocks for redirects) + curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + + // Add headers if any + if (curl_headers) { + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); + } + + // Execute HEAD request + res = curl->Execute(); + } + + // Handle result + if (res != CURLcode::CURLE_OK) { + string error = curl_easy_strerror(res); + throw HTTPException(StringUtil::Format("Curl HEAD Request to '%s' failed with error: '%s'", info.url, error)); + } + uint16_t response_code = 0; + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); + // Construct HTTPResponse + auto status_code = HTTPStatusCode(response_code); + auto return_result = make_uniq(status_code); + return_result->body = ""; + return_result->headers = response_header_collection.header_collection.back(); + return_result->url = info.url; + return return_result; } unique_ptr Delete(DeleteRequestInfo &info) override { - if (state) { - state->delete_count++; + auto curl_headers = TransformHeadersForCurl(info.headers, info.params); + + CURLcode res; + std::string result; + HeaderCollector response_header_collection; + + // TODO: some delete requests require a BODY + { + // Set URL + curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + + // Set DELETE request method + curl_easy_setopt(*curl, CURLOPT_CUSTOMREQUEST, "DELETE"); + + // Follow redirects + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + + // Set write function to collect response body + curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + + // Collect response headers (multiple header blocks for redirects) + curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + + // Add headers if any + if (curl_headers) { + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); + } + + // Execute DELETE request + res = curl->Execute(); } - auto headers = TransformHeaders(info.headers, info.params); - return TransformResult(client->Delete(info.path, headers)); + + // Handle result + if (res != CURLcode::CURLE_OK) { + std::string error = curl_easy_strerror(res); + throw HTTPException(StringUtil::Format("Curl DELETE Request to '%s' failed with error: '%s'", info.url, error)); + } + + // Get HTTP response status code + uint16_t response_code = 0; + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); + // Construct HTTPResponse + auto status_code = HTTPStatusCode(response_code); + auto return_result = make_uniq(status_code); + return_result->headers = response_header_collection.header_collection.back(); + return_result->url = info.url; + return return_result; + } unique_ptr Post(PostRequestInfo &info) override { @@ -210,22 +398,75 @@ class HTTPFSClient : public HTTPClient { state->post_count++; state->total_bytes_sent += info.buffer_in_len; } - // We use a custom Request method here, because there is no Post call with a contentreceiver in httplib - duckdb_httplib_openssl::Request req; - req.method = "POST"; - req.path = info.path; - req.headers = TransformHeaders(info.headers, info.params); - req.headers.emplace("Content-Type", "application/octet-stream"); - req.content_receiver = [&](const char *data, size_t data_length, uint64_t /*offset*/, - uint64_t /*total_length*/) { - if (state) { - state->total_bytes_received += data_length; + + // // We use a custom Request method here, because there is no Post call with a contentreceiver in httplib + // duckdb_httplib_openssl::Request req; + // req.method = "POST"; + // req.path = info.path; + // req.headers = TransformHeaders(info.headers, info.params); + // req.headers.emplace("Content-Type", "application/octet-stream"); + // req.content_receiver = [&](const char *data, size_t data_length, uint64_t /*offset*/, + // uint64_t /*total_length*/) { + // if (state) { + // state->total_bytes_received += data_length; + // } + // info.buffer_out += string(data, data_length); + // return true; + // }; + // req.body.assign(const_char_ptr_cast(info.buffer_in), info.buffer_in_len); + // return TransformResult(client->send(req)); + + auto curl_headers = TransformHeadersForCurl(info.headers, info.params); + curl_headers.Add("Content-Type: application/octet-stream"); + + CURLcode res; + std::string result; + HeaderCollector response_header_collection; + + { + curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + curl_easy_setopt(*curl, CURLOPT_POST, 1L); + + // Set POST body + curl_easy_setopt(*curl, CURLOPT_POSTFIELDS, const_char_ptr_cast(info.buffer_in)); + curl_easy_setopt(*curl, CURLOPT_POSTFIELDSIZE, info.buffer_in_len); + + // Follow redirects + // TODO: follow redirects for POST? + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + + // Set write function to collect response body + curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + + // Collect response headers (multiple header blocks for redirects) + curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + + // Add headers if any + if (curl_headers) { + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); } - info.buffer_out += string(data, data_length); - return true; - }; - req.body.assign(const_char_ptr_cast(info.buffer_in), info.buffer_in_len); - return TransformResult(client->send(req)); + + // Execute POST request + res = curl->Execute(); + } + + // Handle result + if (res != CURLcode::CURLE_OK) { + string error = curl_easy_strerror(res); + throw HTTPException(StringUtil::Format("Curl POST Request to '%s' failed with error: '%s'", info.url, error)); + } + uint16_t response_code = 0; + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); + + info.buffer_out = result; + // Construct HTTPResponse + auto status_code = HTTPStatusCode(response_code); + auto return_result = make_uniq(status_code); + return_result->headers = response_header_collection.header_collection.back(); + return_result->url = info.url; + return return_result; } private: @@ -243,11 +484,11 @@ class HTTPFSClient : public HTTPClient { CURLRequestHeaders TransformHeadersForCurl(const HTTPHeaders &header_map, const HTTPParams ¶ms) { std::vector headers; for (auto &entry : header_map) { - const std::string new_header = entry.first + "=" + entry.second; + const std::string new_header = entry.first + ": " + entry.second; headers.push_back(new_header); } for (auto &entry : params.extra_headers) { - const std::string new_header = entry.first + "=" + entry.second; + const std::string new_header = entry.first + ": " + entry.second; headers.push_back(new_header); } CURLRequestHeaders curl_headers; @@ -255,7 +496,6 @@ class HTTPFSClient : public HTTPClient { curl_headers.Add(header); } return curl_headers; - // return CURLRequestHeaders(headers); } unique_ptr TransformResponse(const duckdb_httplib_openssl::Response &response) { @@ -283,6 +523,7 @@ class HTTPFSClient : public HTTPClient { private: unique_ptr client; unique_ptr curl; + CURLRequestHeaders request_headers; optional_ptr state; }; diff --git a/extension/httpfs/include/httpfs_client.hpp b/extension/httpfs/include/httpfs_client.hpp index 1d7620c..11a48a4 100644 --- a/extension/httpfs/include/httpfs_client.hpp +++ b/extension/httpfs/include/httpfs_client.hpp @@ -1,6 +1,7 @@ #pragma once #include "duckdb/common/http_util.hpp" +#include namespace duckdb { class HTTPLogger; @@ -36,4 +37,54 @@ class HTTPFSUtil : public HTTPUtil { string GetName() const override; }; +class CURLHandle { +public: + CURLHandle(const string &token, const string &cert_path); + ~CURLHandle(); + +public: + operator CURL *() { + return curl; + } + CURLcode Execute() { + return curl_easy_perform(curl); + } + +private: + CURL *curl = NULL; +}; + +class CURLRequestHeaders { +public: + CURLRequestHeaders(vector &input) { + for (auto &header : input) { + Add(header); + } + } + CURLRequestHeaders() {} + + ~CURLRequestHeaders() { + if (headers) { + curl_slist_free_all(headers); + } + headers = NULL; + } + operator bool() const { + return headers != NULL; + } + +public: + void Add(const string &header) { + headers = curl_slist_append(headers, header.c_str()); + } + +public: + curl_slist *headers = NULL; +}; + +struct HeaderCollector { + std::vector header_collection; +}; + + } // namespace duckdb diff --git a/test/sql/copy/csv/test_csv_httpfs.test b/test/sql/copy/csv/test_csv_httpfs.test index 869a0b6..cbeed96 100644 --- a/test/sql/copy/csv/test_csv_httpfs.test +++ b/test/sql/copy/csv/test_csv_httpfs.test @@ -4,9 +4,14 @@ require httpfs +require parquet + statement ok PRAGMA enable_verification +statement ok +select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_role_type.parquet'; + #FIXME this test fails: file is nonexistent mode skip From e6c27809be6bc258266e8231f2cb887706b43c14 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 13:10:16 +0200 Subject: [PATCH 05/44] bump duckdb --- duckdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/duckdb b/duckdb index 44d0856..4e5ff4e 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit 44d0856021c69743661adaea0006725fdbf7db29 +Subproject commit 4e5ff4e5e8508791f2fedbd3c9d19e9e5a5e4d93 From 2afdc880ca24020869aa77ce858be465a85f81e4 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 13:10:25 +0200 Subject: [PATCH 06/44] remove extension stuff --- extension/httpfs/httpfs_client.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 6e6ca23..0aa0057 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -138,9 +138,6 @@ class HTTPFSClient : public HTTPClient { } - void SetLogger(HTTPLogger &logger) { - client->set_logger(logger.GetLogger()); - } unique_ptr Get(GetRequestInfo &info) override { if (state) { From be81a80c54f32a8bfaf5e5397625091f782f8987 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 13:18:23 +0200 Subject: [PATCH 07/44] install curl during build --- .github/workflows/MinioTests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/MinioTests.yml b/.github/workflows/MinioTests.yml index dca4c52..a88c84f 100644 --- a/.github/workflows/MinioTests.yml +++ b/.github/workflows/MinioTests.yml @@ -33,7 +33,10 @@ jobs: - name: Install Ninja shell: bash - run: sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build + run: | + sudo apt-get update -y -qq + sudo apt-get install -y -qq software-properties-common + sudo apt-get install -y -qq libcurl4-gnutls-dev libcurl4-openssl-dev ninja-build - name: Setup Ccache uses: hendrikmuhs/ccache-action@main From fdac2778b9dc93df3fb2704c885f5f844275d6ae Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 13:21:34 +0200 Subject: [PATCH 08/44] run on ubuntu-latest --- .github/workflows/MinioTests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/MinioTests.yml b/.github/workflows/MinioTests.yml index a88c84f..4d721ba 100644 --- a/.github/workflows/MinioTests.yml +++ b/.github/workflows/MinioTests.yml @@ -10,7 +10,7 @@ defaults: jobs: minio-tests: name: Minio Tests - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest env: S3_TEST_SERVER_AVAILABLE: 1 AWS_DEFAULT_REGION: eu-west-1 From ef581dee06e1edf821fe32fc548cc4084388839c Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 13:26:29 +0200 Subject: [PATCH 09/44] init vcpackage toolchain path --- .github/workflows/MinioTests.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/MinioTests.yml b/.github/workflows/MinioTests.yml index 4d721ba..b6d9dc0 100644 --- a/.github/workflows/MinioTests.yml +++ b/.github/workflows/MinioTests.yml @@ -19,6 +19,7 @@ jobs: DUCKDB_S3_ENDPOINT: duckdb-minio.com:9000 DUCKDB_S3_USE_SSL: false GEN: ninja + VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake VCPKG_TARGET_TRIPLET: x64-linux steps: @@ -31,12 +32,9 @@ jobs: with: python-version: '3.10' - - name: Install Ninja + - name: Install libraries shell: bash - run: | - sudo apt-get update -y -qq - sudo apt-get install -y -qq software-properties-common - sudo apt-get install -y -qq libcurl4-gnutls-dev libcurl4-openssl-dev ninja-build + run: sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build - name: Setup Ccache uses: hendrikmuhs/ccache-action@main @@ -49,9 +47,11 @@ jobs: with: vcpkgGitCommitId: 5e5d0e1cd7785623065e77eff011afdeec1a3574 - - name: Build - shell: bash - run: make + - name: Build extension + env: + GEN: ninja + run: | + make release - name: Start S3/HTTP test server shell: bash From e7bc7e0790f1253e1c79817c845edabd3353ca90 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 13:53:23 +0200 Subject: [PATCH 10/44] actually add params to end of url for get requests --- extension/httpfs/httpfs_client.cpp | 39 +++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 0aa0057..0b28b91 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -162,7 +162,12 @@ class HTTPFSClient : public HTTPClient { // })); // } - auto curl_headers = TransformHeadersForCurl(info.headers, info.params); + auto curl_headers = TransformHeadersForCurl(info.headers); + auto url = info.url; + if (!info.params.extra_headers.empty()) { + auto curl_params = TransformParamsCurl(info.params); + url += "?" + curl_params; + } CURLcode res; std::string result; @@ -173,7 +178,7 @@ class HTTPFSClient : public HTTPClient { // follow redirects curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); - curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + curl_easy_setopt(*curl, CURLOPT_URL, url.c_str()); // write response data curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); @@ -228,7 +233,7 @@ class HTTPFSClient : public HTTPClient { // return TransformResult(client->Put(info.path, headers, const_char_ptr_cast(info.buffer_in), info.buffer_in_len, // info.content_type)); - auto curl_headers = TransformHeadersForCurl(info.headers, info.params); + auto curl_headers = TransformHeadersForCurl(info.headers); // Optionally add headers directly curl_headers.headers = curl_slist_append(curl_headers.headers, "Content-Type: application/json"); @@ -286,7 +291,7 @@ class HTTPFSClient : public HTTPClient { unique_ptr Head(HeadRequestInfo &info) override { - auto curl_headers = TransformHeadersForCurl(info.headers, info.params); + auto curl_headers = TransformHeadersForCurl(info.headers); CURLcode res; std::string result; @@ -338,7 +343,7 @@ class HTTPFSClient : public HTTPClient { } unique_ptr Delete(DeleteRequestInfo &info) override { - auto curl_headers = TransformHeadersForCurl(info.headers, info.params); + auto curl_headers = TransformHeadersForCurl(info.headers); CURLcode res; std::string result; @@ -413,7 +418,7 @@ class HTTPFSClient : public HTTPClient { // req.body.assign(const_char_ptr_cast(info.buffer_in), info.buffer_in_len); // return TransformResult(client->send(req)); - auto curl_headers = TransformHeadersForCurl(info.headers, info.params); + auto curl_headers = TransformHeadersForCurl(info.headers); curl_headers.Add("Content-Type: application/octet-stream"); CURLcode res; @@ -478,16 +483,12 @@ class HTTPFSClient : public HTTPClient { return headers; } - CURLRequestHeaders TransformHeadersForCurl(const HTTPHeaders &header_map, const HTTPParams ¶ms) { + CURLRequestHeaders TransformHeadersForCurl(const HTTPHeaders &header_map) { std::vector headers; for (auto &entry : header_map) { const std::string new_header = entry.first + ": " + entry.second; headers.push_back(new_header); } - for (auto &entry : params.extra_headers) { - const std::string new_header = entry.first + ": " + entry.second; - headers.push_back(new_header); - } CURLRequestHeaders curl_headers; for (auto &header : headers) { curl_headers.Add(header); @@ -495,6 +496,22 @@ class HTTPFSClient : public HTTPClient { return curl_headers; } + string TransformParamsCurl(const HTTPParams ¶ms) { + string result = ""; + unordered_map escaped_params; + bool first_param = true; + for (auto &entry : params.extra_headers) { + const string key = entry.first; + const string value = curl_easy_escape(*curl, entry.second.c_str(), 0); + if (!first_param) { + result += "&"; + } + result += key + "=" + value; + first_param = false; + } + return result; + } + unique_ptr TransformResponse(const duckdb_httplib_openssl::Response &response) { auto status_code = HTTPUtil::ToStatusCode(response.status); auto result = make_uniq(status_code); From b74abc2cd030ee77c60e97d3dc1ce597bead152c Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 14:58:23 +0200 Subject: [PATCH 11/44] get the response error message as well from the http response code header --- extension/httpfs/httpfs_client.cpp | 212 +++++++++++++--------------- test/sql/secret/secret_refresh.test | 4 +- 2 files changed, 99 insertions(+), 117 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 0b28b91..769d877 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -63,6 +63,7 @@ static size_t RequestHeaderCallback(void *contents, size_t size, size_t nmemb, v // so we clear all of the current header_collection if (header.rfind("HTTP/", 0) == 0) { header_collection->header_collection.push_back(HTTPHeaders()); + header_collection->header_collection.back().Insert("__RESPONSE_STATUS__", header); } size_t colonPos = header.find(':'); @@ -135,7 +136,6 @@ class HTTPFSClient : public HTTPClient { bearer_token = http_params.bearer_token.c_str(); } curl = make_uniq(bearer_token, SelectCURLCertPath()); - } @@ -144,24 +144,6 @@ class HTTPFSClient : public HTTPClient { state->get_count++; } - // auto headers = TransformHeaders(info.headers, info.params); - // if (!info.response_handler && !info.content_handler) { - // return TransformResult(client->Get(info.path, headers)); - // } else { - // return TransformResult(client->Get( - // info.path.c_str(), headers, - // [&](const duckdb_httplib_openssl::Response &response) { - // auto http_response = TransformResponse(response); - // return info.response_handler(*http_response); - // }, - // [&](const char *data, size_t data_length) { - // if (state) { - // state->total_bytes_received += data_length; - // } - // return info.content_handler(const_data_ptr_cast(data), data_length); - // })); - // } - auto curl_headers = TransformHeadersForCurl(info.headers); auto url = info.url; if (!info.params.extra_headers.empty()) { @@ -214,14 +196,7 @@ class HTTPFSClient : public HTTPClient { const char* data = result.c_str(); info.content_handler(const_data_ptr_cast(data), bytes_received); - - // Construct HTTPResponse - auto status_code = HTTPStatusCode(response_code); - auto return_result = make_uniq(status_code); - return_result->body = result; - return_result->headers = response_header_collection.header_collection.back(); - return_result->url = info.url; - return return_result; + return TransformResponseCurl(response_code, response_header_collection, result, res, url); } unique_ptr Put(PutRequestInfo &info) override { @@ -229,13 +204,16 @@ class HTTPFSClient : public HTTPClient { state->put_count++; state->total_bytes_sent += info.buffer_in_len; } - // auto headers = TransformHeaders(info.headers, info.params); - // return TransformResult(client->Put(info.path, headers, const_char_ptr_cast(info.buffer_in), info.buffer_in_len, - // info.content_type)); auto curl_headers = TransformHeadersForCurl(info.headers); - // Optionally add headers directly - curl_headers.headers = curl_slist_append(curl_headers.headers, "Content-Type: application/json"); + // Add content type header from info + curl_headers.Add("Content-Type: " + info.content_type); + // transform parameters + auto url = info.url; + if (!info.params.extra_headers.empty()) { + auto curl_params = TransformParamsCurl(info.params); + url += "?" + curl_params; + } CURLcode res; std::string result; @@ -280,70 +258,78 @@ class HTTPFSClient : public HTTPClient { uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); + return TransformResponseCurl(response_code, response_header_collection, result, res, url); + // Construct HTTPResponse - auto status_code = HTTPStatusCode(response_code); - auto return_result = make_uniq(status_code); - return_result->body = ""; - return_result->headers = response_header_collection.header_collection.back(); - return_result->url = info.url; - return return_result; + // auto status_code = HTTPStatusCode(response_code); + // auto return_result = make_uniq(status_code); + // return_result->body = ""; + // return_result->headers = response_header_collection.header_collection.back(); + // return_result->url = info.url; + // return return_result; } unique_ptr Head(HeadRequestInfo &info) override { - auto curl_headers = TransformHeadersForCurl(info.headers); + auto curl_headers = TransformHeadersForCurl(info.headers); + // transform parameters + auto url = info.url; + if (!info.params.extra_headers.empty()) { + auto curl_params = TransformParamsCurl(info.params); + url += "?" + curl_params; + } - CURLcode res; - std::string result; - HeaderCollector response_header_collection; + CURLcode res; + std::string result; + HeaderCollector response_header_collection; - { - // Set URL - curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + { + // Set URL + curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); // curl_easy_setopt(*curl, CURLOPT_VERBOSE, 1L); - // Perform HEAD request instead of GET - curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); - curl_easy_setopt(*curl, CURLOPT_HTTPGET, 0L); - - // Follow redirects - curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); - - // set write function to collect body — no body expected, so safe to omit - curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); - - // Collect response headers (multiple header blocks for redirects) - curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); - - // Add headers if any - if (curl_headers) { - curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); - } - - // Execute HEAD request - res = curl->Execute(); - } - - // Handle result - if (res != CURLcode::CURLE_OK) { - string error = curl_easy_strerror(res); - throw HTTPException(StringUtil::Format("Curl HEAD Request to '%s' failed with error: '%s'", info.url, error)); - } - uint16_t response_code = 0; - curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - // Construct HTTPResponse - auto status_code = HTTPStatusCode(response_code); - auto return_result = make_uniq(status_code); - return_result->body = ""; - return_result->headers = response_header_collection.header_collection.back(); - return_result->url = info.url; - return return_result; + // Perform HEAD request instead of GET + curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); + curl_easy_setopt(*curl, CURLOPT_HTTPGET, 0L); + + // Follow redirects + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + + // set write function to collect body — no body expected, so safe to omit + curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + + // Collect response headers (multiple header blocks for redirects) + curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + + // Add headers if any + if (curl_headers) { + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); + } + + // Execute HEAD request + res = curl->Execute(); + } + + // Handle result + if (res != CURLcode::CURLE_OK) { + string error = curl_easy_strerror(res); + throw HTTPException(StringUtil::Format("Curl HEAD Request to '%s' failed with error: '%s'", info.url, error)); + } + uint16_t response_code = 0; + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); + return TransformResponseCurl(response_code, response_header_collection, result, res, url); } unique_ptr Delete(DeleteRequestInfo &info) override { auto curl_headers = TransformHeadersForCurl(info.headers); + // transform parameters + auto url = info.url; + if (!info.params.extra_headers.empty()) { + auto curl_params = TransformParamsCurl(info.params); + url += "?" + curl_params; + } CURLcode res; std::string result; @@ -386,13 +372,7 @@ class HTTPFSClient : public HTTPClient { // Get HTTP response status code uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - // Construct HTTPResponse - auto status_code = HTTPStatusCode(response_code); - auto return_result = make_uniq(status_code); - return_result->headers = response_header_collection.header_collection.back(); - return_result->url = info.url; - return return_result; - + return TransformResponseCurl(response_code, response_header_collection, result, res, url); } unique_ptr Post(PostRequestInfo &info) override { @@ -401,25 +381,15 @@ class HTTPFSClient : public HTTPClient { state->total_bytes_sent += info.buffer_in_len; } - // // We use a custom Request method here, because there is no Post call with a contentreceiver in httplib - // duckdb_httplib_openssl::Request req; - // req.method = "POST"; - // req.path = info.path; - // req.headers = TransformHeaders(info.headers, info.params); - // req.headers.emplace("Content-Type", "application/octet-stream"); - // req.content_receiver = [&](const char *data, size_t data_length, uint64_t /*offset*/, - // uint64_t /*total_length*/) { - // if (state) { - // state->total_bytes_received += data_length; - // } - // info.buffer_out += string(data, data_length); - // return true; - // }; - // req.body.assign(const_char_ptr_cast(info.buffer_in), info.buffer_in_len); - // return TransformResult(client->send(req)); - auto curl_headers = TransformHeadersForCurl(info.headers); - curl_headers.Add("Content-Type: application/octet-stream"); + const string content_type = "Content-Type: application/octet-stream"; + curl_headers.Add(content_type.c_str()); + // transform parameters + auto url = info.url; + if (!info.params.extra_headers.empty()) { + auto curl_params = TransformParamsCurl(info.params); + url += "?" + curl_params; + } CURLcode res; std::string result; @@ -434,7 +404,7 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_POSTFIELDSIZE, info.buffer_in_len); // Follow redirects - // TODO: follow redirects for POST? + // TODO: should we follow redirects for POST? curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); // Set write function to collect response body @@ -461,14 +431,9 @@ class HTTPFSClient : public HTTPClient { } uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - info.buffer_out = result; // Construct HTTPResponse - auto status_code = HTTPStatusCode(response_code); - auto return_result = make_uniq(status_code); - return_result->headers = response_header_collection.header_collection.back(); - return_result->url = info.url; - return return_result; + return TransformResponseCurl(response_code, response_header_collection, result, res, url); } private: @@ -523,6 +488,23 @@ class HTTPFSClient : public HTTPClient { return result; } + unique_ptr TransformResponseCurl(uint16_t response_code, HeaderCollector &header_collection, string &body, CURLcode res, string &url) { + auto status_code = HTTPStatusCode(response_code); + auto response = make_uniq(status_code); + if (response_code >= 400) { + if (header_collection.header_collection.back().HasHeader("__RESPONSE_STATUS__")) { + response->request_error =header_collection.header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); + } else { + response->request_error = curl_easy_strerror(res); + } + return response; + } + response->body = body; + response->url = url; + response->headers = header_collection.header_collection.back(); + return response; + } + unique_ptr TransformResult(duckdb_httplib_openssl::Result &&res) { if (res.error() == duckdb_httplib_openssl::Error::Success) { auto &response = res.value(); diff --git a/test/sql/secret/secret_refresh.test b/test/sql/secret/secret_refresh.test index 85c8738..35e2735 100644 --- a/test/sql/secret/secret_refresh.test +++ b/test/sql/secret/secret_refresh.test @@ -81,7 +81,7 @@ CREATE SECRET s1 ( statement error FROM "s3://test-bucket/test-file.parquet" ---- -HTTP 403 +IO Error: HTTP/1.1 403 Forbidden query I SELECT message[0:46] FROM duckdb_logs WHERE message like '%Successfully refreshed secret%' @@ -121,7 +121,7 @@ set s3_access_key_id='bogus' statement error FROM "s3://test-bucket/test-file.parquet" ---- -HTTP 403 +IO Error: HTTP/1.1 403 Forbidden # -> log empty query II From 88f96fc9571e3a51b057e62c3d1b2a2793367345 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 15:18:08 +0200 Subject: [PATCH 12/44] remove all running httplib.hpp code --- extension/httpfs/httpfs_client.cpp | 85 +++++++----------------------- 1 file changed, 20 insertions(+), 65 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 769d877..8ea99c2 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -4,8 +4,7 @@ #define CPPHTTPLIB_OPENSSL_SUPPORT #include -// #include -#include "httplib.hpp" +#include #include "duckdb/common/exception/http_exception.hpp" namespace duckdb { @@ -106,30 +105,6 @@ CURLHandle::~CURLHandle() { class HTTPFSClient : public HTTPClient { public: HTTPFSClient(HTTPFSParams &http_params, const string &proto_host_port) { - client = make_uniq(proto_host_port); - client->set_follow_location(true); - client->set_keep_alive(http_params.keep_alive); - if (!http_params.ca_cert_file.empty()) { - client->set_ca_cert_path(http_params.ca_cert_file.c_str()); - } - client->enable_server_certificate_verification(http_params.enable_server_cert_verification); - client->set_write_timeout(http_params.timeout, http_params.timeout_usec); - client->set_read_timeout(http_params.timeout, http_params.timeout_usec); - client->set_connection_timeout(http_params.timeout, http_params.timeout_usec); - client->set_decompress(false); - if (!http_params.bearer_token.empty()) { - client->set_bearer_token_auth(http_params.bearer_token.c_str()); - } - - if (!http_params.http_proxy.empty()) { - client->set_proxy(http_params.http_proxy, http_params.http_proxy_port); - - if (!http_params.http_proxy_username.empty()) { - client->set_proxy_basic_auth(http_params.http_proxy_username, http_params.http_proxy_password); - } - } - state = http_params.state; - // initializing curl auto bearer_token = ""; if (!http_params.bearer_token.empty()) { @@ -437,16 +412,6 @@ class HTTPFSClient : public HTTPClient { } private: - duckdb_httplib_openssl::Headers TransformHeaders(const HTTPHeaders &header_map, const HTTPParams ¶ms) { - duckdb_httplib_openssl::Headers headers; - for (auto &entry : header_map) { - headers.insert(entry); - } - for (auto &entry : params.extra_headers) { - headers.insert(entry); - } - return headers; - } CURLRequestHeaders TransformHeadersForCurl(const HTTPHeaders &header_map) { std::vector headers; @@ -477,17 +442,6 @@ class HTTPFSClient : public HTTPClient { return result; } - unique_ptr TransformResponse(const duckdb_httplib_openssl::Response &response) { - auto status_code = HTTPUtil::ToStatusCode(response.status); - auto result = make_uniq(status_code); - result->body = response.body; - result->reason = response.reason; - for (auto &entry : response.headers) { - result->headers.Insert(entry.first, entry.second); - } - return result; - } - unique_ptr TransformResponseCurl(uint16_t response_code, HeaderCollector &header_collection, string &body, CURLcode res, string &url) { auto status_code = HTTPStatusCode(response_code); auto response = make_uniq(status_code); @@ -505,19 +459,7 @@ class HTTPFSClient : public HTTPClient { return response; } - unique_ptr TransformResult(duckdb_httplib_openssl::Result &&res) { - if (res.error() == duckdb_httplib_openssl::Error::Success) { - auto &response = res.value(); - return TransformResponse(response); - } else { - auto result = make_uniq(HTTPStatusCode::INVALID); - result->request_error = to_string(res.error()); - return result; - } - } - private: - unique_ptr client; unique_ptr curl; CURLRequestHeaders request_headers; optional_ptr state; @@ -529,14 +471,27 @@ unique_ptr HTTPFSUtil::InitializeClient(HTTPParams &http_params, con } unordered_map HTTPFSUtil::ParseGetParameters(const string &text) { - duckdb_httplib_openssl::Params query_params; - duckdb_httplib_openssl::detail::parse_query_text(text, query_params); + unordered_map params; + + auto pos = text.find('?'); + if (pos == std::string::npos) return params; + + std::string query = text.substr(pos + 1); + std::stringstream ss(query); + std::string item; - unordered_map result; - for (auto &entry : query_params) { - result.emplace(std::move(entry.first), std::move(entry.second)); + while (std::getline(ss, item, '&')) { + auto eq_pos = item.find('='); + if (eq_pos != std::string::npos) { + std::string key = item.substr(0, eq_pos); + std::string value = StringUtil::URLDecode(item.substr(eq_pos + 1)); + params[key] = value; + } else { + params[item] = ""; // key with no value + } } - return result; + + return params; } string HTTPFSUtil::GetName() const { From 04cb2a8fe5fda42627c4fff7b3f1547a82109c64 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 15:49:21 +0200 Subject: [PATCH 13/44] add back in state initialization --- extension/httpfs/httpfs_client.cpp | 16 ++++++++-------- test/sql/metadata_stats.test | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 8 deletions(-) create mode 100644 test/sql/metadata_stats.test diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 8ea99c2..8126dcf 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -110,6 +110,7 @@ class HTTPFSClient : public HTTPClient { if (!http_params.bearer_token.empty()) { bearer_token = http_params.bearer_token.c_str(); } + state = http_params.state; curl = make_uniq(bearer_token, SelectCURLCertPath()); } @@ -234,17 +235,12 @@ class HTTPFSClient : public HTTPClient { curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); return TransformResponseCurl(response_code, response_header_collection, result, res, url); - - // Construct HTTPResponse - // auto status_code = HTTPStatusCode(response_code); - // auto return_result = make_uniq(status_code); - // return_result->body = ""; - // return_result->headers = response_header_collection.header_collection.back(); - // return_result->url = info.url; - // return return_result; } unique_ptr Head(HeadRequestInfo &info) override { + if (state) { + state->head_count++; + } auto curl_headers = TransformHeadersForCurl(info.headers); // transform parameters @@ -298,6 +294,10 @@ class HTTPFSClient : public HTTPClient { } unique_ptr Delete(DeleteRequestInfo &info) override { + if (state) { + state->delete_count++; + } + auto curl_headers = TransformHeadersForCurl(info.headers); // transform parameters auto url = info.url; diff --git a/test/sql/metadata_stats.test b/test/sql/metadata_stats.test new file mode 100644 index 0000000..ba5ddfd --- /dev/null +++ b/test/sql/metadata_stats.test @@ -0,0 +1,17 @@ +# name: test/sql/metadata_stats.test +# description: Test getting metadata stats +# group: [] + +require parquet + +require httpfs + +statement ok +SET force_download=false; + +mode output_result + +query II +explain analyze SELECT id, first_name, last_name, email FROM PARQUET_SCAN('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/userdata1.parquet') +---- +analyzed_plan :.*GET: 2.* From 02e8ac1a14ecf16837fdb8801986db0d86d719f5 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 16:02:00 +0200 Subject: [PATCH 14/44] remove output result --- test/sql/metadata_stats.test | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/sql/metadata_stats.test b/test/sql/metadata_stats.test index ba5ddfd..f15955b 100644 --- a/test/sql/metadata_stats.test +++ b/test/sql/metadata_stats.test @@ -9,8 +9,6 @@ require httpfs statement ok SET force_download=false; -mode output_result - query II explain analyze SELECT id, first_name, last_name, email FROM PARQUET_SCAN('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/userdata1.parquet') ---- From c92998cdd92425be70d79230d1c7911c07679f99 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 17:35:46 +0200 Subject: [PATCH 15/44] add json read test --- test/sql/metadata_stats.test | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/sql/metadata_stats.test b/test/sql/metadata_stats.test index f15955b..4bc6c07 100644 --- a/test/sql/metadata_stats.test +++ b/test/sql/metadata_stats.test @@ -6,6 +6,12 @@ require parquet require httpfs +require json + +# Test Force download with server that doesn't want to give us the head +statement ok +FROM read_json('https://api.spring.io/projects/spring-boot/generations') + statement ok SET force_download=false; From fa93e103159255f12ed8f4ff95ec2b753d6d751b Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 17:36:21 +0200 Subject: [PATCH 16/44] add core extensions var --- .github/workflows/MinioTests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/MinioTests.yml b/.github/workflows/MinioTests.yml index b6d9dc0..6f431e1 100644 --- a/.github/workflows/MinioTests.yml +++ b/.github/workflows/MinioTests.yml @@ -18,6 +18,7 @@ jobs: AWS_SECRET_ACCESS_KEY: minio_duckdb_user_password DUCKDB_S3_ENDPOINT: duckdb-minio.com:9000 DUCKDB_S3_USE_SSL: false + CORE_EXTENSIONS: 'parquet;json' GEN: ninja VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake VCPKG_TARGET_TRIPLET: x64-linux From 499de9054e8cacad7e11c4596613462f510b93bf Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 17:36:51 +0200 Subject: [PATCH 17/44] always add headers, and set many more curl options on initialize --- extension/httpfs/httpfs_client.cpp | 107 ++++++++++++++--------------- 1 file changed, 50 insertions(+), 57 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 8126dcf..b970e40 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -105,22 +105,56 @@ CURLHandle::~CURLHandle() { class HTTPFSClient : public HTTPClient { public: HTTPFSClient(HTTPFSParams &http_params, const string &proto_host_port) { - // initializing curl auto bearer_token = ""; if (!http_params.bearer_token.empty()) { bearer_token = http_params.bearer_token.c_str(); } state = http_params.state; curl = make_uniq(bearer_token, SelectCURLCertPath()); - } + // set curl options + // follow redirects + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + + // Curl re-uses connections by default + if (!http_params.keep_alive) { + curl_easy_setopt(*curl, CURLOPT_FORBID_REUSE, 1L); + } + + // client->enable_server_certificate_verification(http_params.enable_server_cert_verification); + if (http_params.enable_server_cert_verification) { + curl_easy_setopt(*curl, CURLOPT_SSL_VERIFYPEER, 1L); // Verify the cert + curl_easy_setopt(*curl, CURLOPT_SSL_VERIFYHOST, 2L); // Verify that the cert matches the hostname + } + + // TODO: no global write timeout option, but you could put customize a timeout in the write functions + // or handle use CURLOPT_XFERINFOFUNCTION (progress callback) with CURLOPT_TIMEOUT_MS + // we could also set CURLOPT_LOW_SPEED_LIMIT and timeout if the speed is too low for + // too long. + + // set read timeout + curl_easy_setopt(*curl, CURLOPT_TIMEOUT, http_params.timeout); + // set connection timeout + curl_easy_setopt(*curl, CURLOPT_CONNECTTIMEOUT, http_params.timeout); + // accept content as-is (i.e no decompressing) + curl_easy_setopt(*curl, CURLOPT_ACCEPT_ENCODING, "identity"); + + if (!http_params.http_proxy.empty()) { + curl_easy_setopt(*curl, CURLOPT_PROXY, StringUtil::Format("%s:%s", http_params.http_proxy, http_params.http_proxy_port).c_str()); + + if (!http_params.http_proxy_username.empty()) { + curl_easy_setopt(*curl, CURLOPT_PROXYUSERNAME, http_params.http_proxy_username.c_str()); + curl_easy_setopt(*curl, CURLOPT_PROXYPASSWORD, http_params.http_proxy_password.c_str()); + } + } + } unique_ptr Get(GetRequestInfo &info) override { if (state) { state->get_count++; } - auto curl_headers = TransformHeadersForCurl(info.headers); + auto curl_headers = TransformHeadersCurl(info.headers); auto url = info.url; if (!info.params.extra_headers.empty()) { auto curl_params = TransformParamsCurl(info.params); @@ -134,8 +168,7 @@ class HTTPFSClient : public HTTPClient { // If the same handle served a HEAD request, we must set NOBODY back to 0L to request content again curl_easy_setopt(*curl, CURLOPT_NOBODY, 0L); - // follow redirects - curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(*curl, CURLOPT_URL, url.c_str()); // write response data curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); @@ -144,18 +177,10 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); - if (curl_headers) { - curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); - } + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); res = curl->Execute(); } - // DUCKDB_LOG_DEBUG(context, "iceberg.Catalog.Curl.HTTPRequest", "GET %s (curl code '%s')", url, - // curl_easy_strerror(res)); - if (res != CURLcode::CURLE_OK) { - string error = curl_easy_strerror(res); - throw HTTPException(StringUtil::Format("Curl GET Request to '%s' failed with error: '%s'", info.url, error)); - } uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); @@ -181,7 +206,7 @@ class HTTPFSClient : public HTTPClient { state->total_bytes_sent += info.buffer_in_len; } - auto curl_headers = TransformHeadersForCurl(info.headers); + auto curl_headers = TransformHeadersCurl(info.headers); // Add content type header from info curl_headers.Add("Content-Type: " + info.content_type); // transform parameters @@ -217,20 +242,12 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); // Apply headers - if (curl_headers) { - curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); - } + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); // Execute the request res = curl->Execute(); } - // Check response - if (res != CURLcode::CURLE_OK) { - std::string error = curl_easy_strerror(res); - throw HTTPException(StringUtil::Format("Curl PUT Request to '%s' failed with error: '%s'", info.url, error)); - } - uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); @@ -242,7 +259,7 @@ class HTTPFSClient : public HTTPClient { state->head_count++; } - auto curl_headers = TransformHeadersForCurl(info.headers); + auto curl_headers = TransformHeadersCurl(info.headers); // transform parameters auto url = info.url; if (!info.params.extra_headers.empty()) { @@ -257,7 +274,6 @@ class HTTPFSClient : public HTTPClient { { // Set URL curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); - // curl_easy_setopt(*curl, CURLOPT_VERBOSE, 1L); // Perform HEAD request instead of GET curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); @@ -275,19 +291,13 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); // Add headers if any - if (curl_headers) { - curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); - } + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); // Execute HEAD request res = curl->Execute(); } - // Handle result - if (res != CURLcode::CURLE_OK) { - string error = curl_easy_strerror(res); - throw HTTPException(StringUtil::Format("Curl HEAD Request to '%s' failed with error: '%s'", info.url, error)); - } + uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); return TransformResponseCurl(response_code, response_header_collection, result, res, url); @@ -298,7 +308,7 @@ class HTTPFSClient : public HTTPClient { state->delete_count++; } - auto curl_headers = TransformHeadersForCurl(info.headers); + auto curl_headers = TransformHeadersCurl(info.headers); // transform parameters auto url = info.url; if (!info.params.extra_headers.empty()) { @@ -310,7 +320,6 @@ class HTTPFSClient : public HTTPClient { std::string result; HeaderCollector response_header_collection; - // TODO: some delete requests require a BODY { // Set URL curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); @@ -330,20 +339,12 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); // Add headers if any - if (curl_headers) { - curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); - } + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); // Execute DELETE request res = curl->Execute(); } - // Handle result - if (res != CURLcode::CURLE_OK) { - std::string error = curl_easy_strerror(res); - throw HTTPException(StringUtil::Format("Curl DELETE Request to '%s' failed with error: '%s'", info.url, error)); - } - // Get HTTP response status code uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); @@ -356,7 +357,7 @@ class HTTPFSClient : public HTTPClient { state->total_bytes_sent += info.buffer_in_len; } - auto curl_headers = TransformHeadersForCurl(info.headers); + auto curl_headers = TransformHeadersCurl(info.headers); const string content_type = "Content-Type: application/octet-stream"; curl_headers.Add(content_type.c_str()); // transform parameters @@ -391,19 +392,12 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); // Add headers if any - if (curl_headers) { - curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers.headers); - } + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); // Execute POST request res = curl->Execute(); } - // Handle result - if (res != CURLcode::CURLE_OK) { - string error = curl_easy_strerror(res); - throw HTTPException(StringUtil::Format("Curl POST Request to '%s' failed with error: '%s'", info.url, error)); - } uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); info.buffer_out = result; @@ -413,7 +407,7 @@ class HTTPFSClient : public HTTPClient { private: - CURLRequestHeaders TransformHeadersForCurl(const HTTPHeaders &header_map) { + CURLRequestHeaders TransformHeadersCurl(const HTTPHeaders &header_map) { std::vector headers; for (auto &entry : header_map) { const std::string new_header = entry.first + ": " + entry.second; @@ -445,7 +439,7 @@ class HTTPFSClient : public HTTPClient { unique_ptr TransformResponseCurl(uint16_t response_code, HeaderCollector &header_collection, string &body, CURLcode res, string &url) { auto status_code = HTTPStatusCode(response_code); auto response = make_uniq(status_code); - if (response_code >= 400) { + if (res != CURLcode::CURLE_OK) { if (header_collection.header_collection.back().HasHeader("__RESPONSE_STATUS__")) { response->request_error =header_collection.header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); } else { @@ -461,7 +455,6 @@ class HTTPFSClient : public HTTPClient { private: unique_ptr curl; - CURLRequestHeaders request_headers; optional_ptr state; }; From 9d8f71e241b60f4bf320781132be07b01da3379a Mon Sep 17 00:00:00 2001 From: Tmonster Date: Fri, 16 May 2025 17:37:24 +0200 Subject: [PATCH 18/44] update extension ci tools --- extension-ci-tools | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension-ci-tools b/extension-ci-tools index cca140d..3695927 160000 --- a/extension-ci-tools +++ b/extension-ci-tools @@ -1 +1 @@ -Subproject commit cca140d4cc47f3f3e40f29b49c305bd92845771f +Subproject commit 36959273c1f64ebfe01e644b779ca77fd52aaf0f From e0cce43741acfa0ff3d06ece6c9b416d2526b129 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Mon, 19 May 2025 10:09:56 +0200 Subject: [PATCH 19/44] fix test add comments --- test/sql/secret/secret_refresh.test | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/sql/secret/secret_refresh.test b/test/sql/secret/secret_refresh.test index 35e2735..696279e 100644 --- a/test/sql/secret/secret_refresh.test +++ b/test/sql/secret/secret_refresh.test @@ -78,10 +78,11 @@ CREATE SECRET s1 ( REFRESH 1 ) +# TODO: add FORBIDDEN back in once enum util for http status codes is merged into httpfs statement error FROM "s3://test-bucket/test-file.parquet" ---- -IO Error: HTTP/1.1 403 Forbidden +HTTP Error: Unable to connect to URL "s3://test-bucket/test-file.parquet": 403 () query I SELECT message[0:46] FROM duckdb_logs WHERE message like '%Successfully refreshed secret%' @@ -118,10 +119,11 @@ statement ok set s3_access_key_id='bogus' # Without secret this query will fail, but since there are no suitable secrets, no refresh attempt will be made +# TODO: add FORBIDDEN in once enum util for http status codes is merged into httpfs statement error FROM "s3://test-bucket/test-file.parquet" ---- -IO Error: HTTP/1.1 403 Forbidden +HTTP Error: Unable to connect to URL "s3://test-bucket/test-file.parquet": 403 () # -> log empty query II From 9b9058789220c458cfca5115bc2e4edd40076dcc Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 20 May 2025 10:39:39 +0200 Subject: [PATCH 20/44] declare header object on heap not on stack --- extension/httpfs/httpfs_client.cpp | 57 ++++++++++++++++++------------ 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index b970e40..d090309 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -110,6 +110,8 @@ class HTTPFSClient : public HTTPClient { bearer_token = http_params.bearer_token.c_str(); } state = http_params.state; + // init curl globally, + // curl_global_init(CURL_GLOBAL_DEFAULT); curl = make_uniq(bearer_token, SelectCURLCertPath()); // set curl options @@ -149,6 +151,11 @@ class HTTPFSClient : public HTTPClient { } } + ~HTTPFSClient() { + // init curl globally, + // curl_global_cleanup(); + } + unique_ptr Get(GetRequestInfo &info) override { if (state) { state->get_count++; @@ -163,7 +170,7 @@ class HTTPFSClient : public HTTPClient { CURLcode res; std::string result; - HeaderCollector response_header_collection; + auto response_header_collection = make_uniq(); { // If the same handle served a HEAD request, we must set NOBODY back to 0L to request content again curl_easy_setopt(*curl, CURLOPT_NOBODY, 0L); @@ -175,7 +182,7 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); // write response headers (different header collection for each redirect) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); res = curl->Execute(); @@ -185,8 +192,8 @@ class HTTPFSClient : public HTTPClient { curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); idx_t bytes_received = 0; - if (response_header_collection.header_collection.back().HasHeader("content-length")) { - bytes_received = std::stoi(response_header_collection.header_collection.back().GetHeaderValue("content-length")); + if (response_header_collection && response_header_collection->header_collection.back().HasHeader("content-length")) { + bytes_received = std::stoi(response_header_collection->header_collection.back().GetHeaderValue("content-length")); D_ASSERT(bytes_received == result.size()); } else { bytes_received = result.size(); @@ -197,7 +204,7 @@ class HTTPFSClient : public HTTPClient { const char* data = result.c_str(); info.content_handler(const_data_ptr_cast(data), bytes_received); - return TransformResponseCurl(response_code, response_header_collection, result, res, url); + return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); } unique_ptr Put(PutRequestInfo &info) override { @@ -218,7 +225,7 @@ class HTTPFSClient : public HTTPClient { CURLcode res; std::string result; - HeaderCollector response_header_collection; + auto response_header_collection = make_uniq(); { curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); @@ -239,7 +246,7 @@ class HTTPFSClient : public HTTPClient { // Capture response headers curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); // Apply headers curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -251,7 +258,7 @@ class HTTPFSClient : public HTTPClient { uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - return TransformResponseCurl(response_code, response_header_collection, result, res, url); + return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); } unique_ptr Head(HeadRequestInfo &info) override { @@ -269,7 +276,7 @@ class HTTPFSClient : public HTTPClient { CURLcode res; std::string result; - HeaderCollector response_header_collection; + auto response_header_collection = make_uniq(); { // Set URL @@ -288,7 +295,7 @@ class HTTPFSClient : public HTTPClient { // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -300,7 +307,7 @@ class HTTPFSClient : public HTTPClient { uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - return TransformResponseCurl(response_code, response_header_collection, result, res, url); + return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); } unique_ptr Delete(DeleteRequestInfo &info) override { @@ -318,7 +325,7 @@ class HTTPFSClient : public HTTPClient { CURLcode res; std::string result; - HeaderCollector response_header_collection; + auto response_header_collection = make_uniq(); { // Set URL @@ -336,7 +343,7 @@ class HTTPFSClient : public HTTPClient { // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -348,7 +355,7 @@ class HTTPFSClient : public HTTPClient { // Get HTTP response status code uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - return TransformResponseCurl(response_code, response_header_collection, result, res, url); + return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); } unique_ptr Post(PostRequestInfo &info) override { @@ -369,7 +376,7 @@ class HTTPFSClient : public HTTPClient { CURLcode res; std::string result; - HeaderCollector response_header_collection; + auto response_header_collection = make_uniq(); { curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); @@ -389,7 +396,7 @@ class HTTPFSClient : public HTTPClient { // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &response_header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -402,7 +409,7 @@ class HTTPFSClient : public HTTPClient { curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); info.buffer_out = result; // Construct HTTPResponse - return TransformResponseCurl(response_code, response_header_collection, result, res, url); + return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); } private: @@ -436,20 +443,24 @@ class HTTPFSClient : public HTTPClient { return result; } - unique_ptr TransformResponseCurl(uint16_t response_code, HeaderCollector &header_collection, string &body, CURLcode res, string &url) { + unique_ptr TransformResponseCurl(uint16_t response_code, unique_ptr header_collection, string &body, CURLcode res, string &url) { auto status_code = HTTPStatusCode(response_code); auto response = make_uniq(status_code); - if (res != CURLcode::CURLE_OK) { - if (header_collection.header_collection.back().HasHeader("__RESPONSE_STATUS__")) { - response->request_error =header_collection.header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); + if (header_collection && res != CURLcode::CURLE_OK) { + if (!header_collection->header_collection.empty() && header_collection->header_collection.back().HasHeader("__RESPONSE_STATUS__")) { + response->request_error =header_collection->header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); } else { response->request_error = curl_easy_strerror(res); } return response; } response->body = body; - response->url = url; - response->headers = header_collection.header_collection.back(); + response->url= url; + if (header_collection && !header_collection->header_collection.empty()) { + for (auto &header : header_collection->header_collection.back()) { + response->headers.Insert(header.first, header.second); + } + } return response; } From 5b3d9884a756ca2727179a32f5fd161794638bd2 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 20 May 2025 11:24:41 +0200 Subject: [PATCH 21/44] also uniq string --- extension/httpfs/httpfs_client.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index d090309..804af0b 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -268,10 +268,10 @@ class HTTPFSClient : public HTTPClient { auto curl_headers = TransformHeadersCurl(info.headers); // transform parameters - auto url = info.url; + auto url = make_uniq(info.url); if (!info.params.extra_headers.empty()) { auto curl_params = TransformParamsCurl(info.params); - url += "?" + curl_params; + *url += "?" + curl_params; } CURLcode res; @@ -280,7 +280,7 @@ class HTTPFSClient : public HTTPClient { { // Set URL - curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + curl_easy_setopt(*curl, CURLOPT_URL, url->c_str()); // Perform HEAD request instead of GET curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); @@ -303,11 +303,14 @@ class HTTPFSClient : public HTTPClient { // Execute HEAD request res = curl->Execute(); } - + Printer::Print("executed HEad"); + Printer::Print("url is " + *url); + Printer::Print("body is " + result); uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); + Printer::Print("start transforming"); + return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, *url); } unique_ptr Delete(DeleteRequestInfo &info) override { @@ -454,7 +457,9 @@ class HTTPFSClient : public HTTPClient { } return response; } + Printer::Print("assigning body"); response->body = body; + Printer::Print("assigning url"); response->url= url; if (header_collection && !header_collection->header_collection.empty()) { for (auto &header : header_collection->header_collection.back()) { From ccbf00a26aeae737fde04a09525c7e12fc7f4183 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 20 May 2025 11:34:11 +0200 Subject: [PATCH 22/44] some more debugging statements --- extension/httpfs/httpfs_client.cpp | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 804af0b..d2ff8c4 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -102,6 +102,12 @@ CURLHandle::~CURLHandle() { } +struct RequestInfo { + string url; + string body; +}; + + class HTTPFSClient : public HTTPClient { public: HTTPFSClient(HTTPFSParams &http_params, const string &proto_host_port) { @@ -267,20 +273,20 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); + auto request_info = make_uniq(); + request_info->url = info.url; // transform parameters - auto url = make_uniq(info.url); if (!info.params.extra_headers.empty()) { auto curl_params = TransformParamsCurl(info.params); - *url += "?" + curl_params; + request_info->url += "?" + curl_params; } CURLcode res; - std::string result; auto response_header_collection = make_uniq(); { // Set URL - curl_easy_setopt(*curl, CURLOPT_URL, url->c_str()); + curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); // Perform HEAD request instead of GET curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); @@ -291,7 +297,7 @@ class HTTPFSClient : public HTTPClient { // set write function to collect body — no body expected, so safe to omit curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); @@ -304,13 +310,13 @@ class HTTPFSClient : public HTTPClient { res = curl->Execute(); } Printer::Print("executed HEad"); - Printer::Print("url is " + *url); - Printer::Print("body is " + result); + Printer::Print("url is " + request_info->url); + Printer::Print("body is " + request_info->body); uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); Printer::Print("start transforming"); - return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, *url); + return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, request_info->body, res, request_info->url); } unique_ptr Delete(DeleteRequestInfo &info) override { From de6d6065bccaddb086d5cd9a09d67edc258e4da8 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 20 May 2025 11:44:17 +0200 Subject: [PATCH 23/44] move a lot of the request response info to the heap so it does not get overwritten on the stack --- extension/httpfs/httpfs_client.cpp | 80 +++++++++++++++--------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index d2ff8c4..eb8a363 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -105,6 +105,7 @@ CURLHandle::~CURLHandle() { struct RequestInfo { string url; string body; + HeaderCollector header_collection; }; @@ -168,27 +169,27 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); - auto url = info.url; + auto request_info = make_uniq(); + request_info->url = info.url; if (!info.params.extra_headers.empty()) { auto curl_params = TransformParamsCurl(info.params); - url += "?" + curl_params; + request_info->url += "?" + curl_params; } CURLcode res; - std::string result; auto response_header_collection = make_uniq(); { // If the same handle served a HEAD request, we must set NOBODY back to 0L to request content again curl_easy_setopt(*curl, CURLOPT_NOBODY, 0L); - curl_easy_setopt(*curl, CURLOPT_URL, url.c_str()); + curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); // write response data curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); // write response headers (different header collection for each redirect) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); res = curl->Execute(); @@ -202,15 +203,15 @@ class HTTPFSClient : public HTTPClient { bytes_received = std::stoi(response_header_collection->header_collection.back().GetHeaderValue("content-length")); D_ASSERT(bytes_received == result.size()); } else { - bytes_received = result.size(); + bytes_received = request_info->body.size(); } if (state) { state->total_bytes_received += bytes_received; } - const char* data = result.c_str(); + const char* data = request_info->body.c_str(); info.content_handler(const_data_ptr_cast(data), bytes_received); - return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); + return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); } unique_ptr Put(PutRequestInfo &info) override { @@ -220,21 +221,21 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); + auto request_info = make_uniq(); // Add content type header from info curl_headers.Add("Content-Type: " + info.content_type); // transform parameters - auto url = info.url; + request_info->url = info.url; if (!info.params.extra_headers.empty()) { auto curl_params = TransformParamsCurl(info.params); - url += "?" + curl_params; + request_info->url += "?" + curl_params; } CURLcode res; - std::string result; auto response_header_collection = make_uniq(); { - curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); // Perform PUT curl_easy_setopt(*curl, CURLOPT_CUSTOMREQUEST, "PUT"); @@ -248,11 +249,11 @@ class HTTPFSClient : public HTTPClient { // Capture response body curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); // Capture response headers curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); // Apply headers curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -264,7 +265,7 @@ class HTTPFSClient : public HTTPClient { uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); + return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); } unique_ptr Head(HeadRequestInfo &info) override { @@ -282,7 +283,6 @@ class HTTPFSClient : public HTTPClient { } CURLcode res; - auto response_header_collection = make_uniq(); { // Set URL @@ -301,7 +301,7 @@ class HTTPFSClient : public HTTPClient { // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -316,7 +316,7 @@ class HTTPFSClient : public HTTPClient { uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); Printer::Print("start transforming"); - return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, request_info->body, res, request_info->url); + return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); } unique_ptr Delete(DeleteRequestInfo &info) override { @@ -325,20 +325,20 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); + auto request_info = make_uniq(); // transform parameters - auto url = info.url; + request_info->url = info.url; if (!info.params.extra_headers.empty()) { auto curl_params = TransformParamsCurl(info.params); - url += "?" + curl_params; + request_info->url += "?" + curl_params; } CURLcode res; - std::string result; auto response_header_collection = make_uniq(); { // Set URL - curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); // Set DELETE request method curl_easy_setopt(*curl, CURLOPT_CUSTOMREQUEST, "DELETE"); @@ -348,11 +348,11 @@ class HTTPFSClient : public HTTPClient { // Set write function to collect response body curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -364,7 +364,7 @@ class HTTPFSClient : public HTTPClient { // Get HTTP response status code uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); + return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); } unique_ptr Post(PostRequestInfo &info) override { @@ -374,21 +374,21 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); + auto request_info = make_uniq(); const string content_type = "Content-Type: application/octet-stream"; curl_headers.Add(content_type.c_str()); // transform parameters - auto url = info.url; + request_info->url = info.url; if (!info.params.extra_headers.empty()) { auto curl_params = TransformParamsCurl(info.params); - url += "?" + curl_params; + request_info->url += "?" + curl_params; } CURLcode res; - std::string result; auto response_header_collection = make_uniq(); { - curl_easy_setopt(*curl, CURLOPT_URL, info.url.c_str()); + curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); curl_easy_setopt(*curl, CURLOPT_POST, 1L); // Set POST body @@ -401,11 +401,11 @@ class HTTPFSClient : public HTTPClient { // Set write function to collect response body curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &result); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, response_header_collection.get()); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -416,9 +416,9 @@ class HTTPFSClient : public HTTPClient { uint16_t response_code = 0; curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - info.buffer_out = result; + info.buffer_out = request_info->body; // Construct HTTPResponse - return TransformResponseCurl(response_code, response_header_collection ? std::move(response_header_collection) : nullptr, result, res, url); + return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); } private: @@ -452,12 +452,12 @@ class HTTPFSClient : public HTTPClient { return result; } - unique_ptr TransformResponseCurl(uint16_t response_code, unique_ptr header_collection, string &body, CURLcode res, string &url) { + unique_ptr TransformResponseCurl(uint16_t response_code, HeaderCollector header_collection, string &body, CURLcode res, string &url) { auto status_code = HTTPStatusCode(response_code); auto response = make_uniq(status_code); - if (header_collection && res != CURLcode::CURLE_OK) { - if (!header_collection->header_collection.empty() && header_collection->header_collection.back().HasHeader("__RESPONSE_STATUS__")) { - response->request_error =header_collection->header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); + if (res != CURLcode::CURLE_OK) { + if (!header_collection.header_collection.empty() && header_collection.header_collection.back().HasHeader("__RESPONSE_STATUS__")) { + response->request_error =header_collection.header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); } else { response->request_error = curl_easy_strerror(res); } @@ -467,8 +467,8 @@ class HTTPFSClient : public HTTPClient { response->body = body; Printer::Print("assigning url"); response->url= url; - if (header_collection && !header_collection->header_collection.empty()) { - for (auto &header : header_collection->header_collection.back()) { + if (!header_collection.header_collection.empty()) { + for (auto &header : header_collection.header_collection.back()) { response->headers.Insert(header.first, header.second); } } From 0c167fd27393a6fe788f703aa8888bcb38a83827 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 20 May 2025 13:53:29 +0200 Subject: [PATCH 24/44] this should work on the build now --- extension/httpfs/httpfs_client.cpp | 95 +++++++++++++----------------- 1 file changed, 40 insertions(+), 55 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index eb8a363..8a93bd3 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -103,9 +103,9 @@ CURLHandle::~CURLHandle() { struct RequestInfo { - string url; - string body; - HeaderCollector header_collection; + string url = ""; + string body = ""; + uint16_t response_code = 0; }; @@ -176,8 +176,8 @@ class HTTPFSClient : public HTTPClient { request_info->url += "?" + curl_params; } + auto header_collector = make_uniq(); CURLcode res; - auto response_header_collection = make_uniq(); { // If the same handle served a HEAD request, we must set NOBODY back to 0L to request content again curl_easy_setopt(*curl, CURLOPT_NOBODY, 0L); @@ -186,22 +186,21 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); // write response data curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); // write response headers (different header collection for each redirect) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); res = curl->Execute(); } - uint16_t response_code = 0; - curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); idx_t bytes_received = 0; - if (response_header_collection && response_header_collection->header_collection.back().HasHeader("content-length")) { - bytes_received = std::stoi(response_header_collection->header_collection.back().GetHeaderValue("content-length")); - D_ASSERT(bytes_received == result.size()); + if (header_collector && header_collector->header_collection.empty() && header_collector->header_collection.back().HasHeader("content-length")) { + bytes_received = std::stoi(header_collector->header_collection.back().GetHeaderValue("content-length")); + D_ASSERT(bytes_received == request_info->body.size()); } else { bytes_received = request_info->body.size(); } @@ -211,7 +210,7 @@ class HTTPFSClient : public HTTPClient { const char* data = request_info->body.c_str(); info.content_handler(const_data_ptr_cast(data), bytes_received); - return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); + return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); } unique_ptr Put(PutRequestInfo &info) override { @@ -231,9 +230,8 @@ class HTTPFSClient : public HTTPClient { request_info->url += "?" + curl_params; } + auto header_collector = make_uniq(); CURLcode res; - auto response_header_collection = make_uniq(); - { curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); @@ -249,11 +247,11 @@ class HTTPFSClient : public HTTPClient { // Capture response body curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); // Capture response headers curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); // Apply headers curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -262,10 +260,9 @@ class HTTPFSClient : public HTTPClient { res = curl->Execute(); } - uint16_t response_code = 0; - curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); - return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); + return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); } unique_ptr Head(HeadRequestInfo &info) override { @@ -283,7 +280,7 @@ class HTTPFSClient : public HTTPClient { } CURLcode res; - + auto header_collector = make_uniq(); { // Set URL curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); @@ -297,11 +294,11 @@ class HTTPFSClient : public HTTPClient { // set write function to collect body — no body expected, so safe to omit curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -309,14 +306,9 @@ class HTTPFSClient : public HTTPClient { // Execute HEAD request res = curl->Execute(); } - Printer::Print("executed HEad"); - Printer::Print("url is " + request_info->url); - Printer::Print("body is " + request_info->body); - - uint16_t response_code = 0; - curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - Printer::Print("start transforming"); - return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); + + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); + return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); } unique_ptr Delete(DeleteRequestInfo &info) override { @@ -334,8 +326,7 @@ class HTTPFSClient : public HTTPClient { } CURLcode res; - auto response_header_collection = make_uniq(); - + auto header_collector = make_uniq(); { // Set URL curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); @@ -348,11 +339,11 @@ class HTTPFSClient : public HTTPClient { // Set write function to collect response body curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -362,9 +353,8 @@ class HTTPFSClient : public HTTPClient { } // Get HTTP response status code - uint16_t response_code = 0; - curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); - return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); + return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); } unique_ptr Post(PostRequestInfo &info) override { @@ -385,8 +375,7 @@ class HTTPFSClient : public HTTPClient { } CURLcode res; - auto response_header_collection = make_uniq(); - + auto header_collector = make_uniq(); { curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); curl_easy_setopt(*curl, CURLOPT_POST, 1L); @@ -401,11 +390,11 @@ class HTTPFSClient : public HTTPClient { // Set write function to collect response body curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, request_info->body.c_str()); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); // Collect response headers (multiple header blocks for redirects) curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -414,15 +403,13 @@ class HTTPFSClient : public HTTPClient { res = curl->Execute(); } - uint16_t response_code = 0; - curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &response_code); + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); info.buffer_out = request_info->body; // Construct HTTPResponse - return TransformResponseCurl(response_code, request_info->header_collection, request_info->body, res, request_info->url); + return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); } private: - CURLRequestHeaders TransformHeadersCurl(const HTTPHeaders &header_map) { std::vector headers; for (auto &entry : header_map) { @@ -452,23 +439,21 @@ class HTTPFSClient : public HTTPClient { return result; } - unique_ptr TransformResponseCurl(uint16_t response_code, HeaderCollector header_collection, string &body, CURLcode res, string &url) { - auto status_code = HTTPStatusCode(response_code); + unique_ptr TransformResponseCurl(unique_ptr request_info, unique_ptr header_collection, CURLcode res) { + auto status_code = HTTPStatusCode(request_info->response_code); auto response = make_uniq(status_code); if (res != CURLcode::CURLE_OK) { - if (!header_collection.header_collection.empty() && header_collection.header_collection.back().HasHeader("__RESPONSE_STATUS__")) { - response->request_error =header_collection.header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); + if (header_collection && !header_collection->header_collection.empty() && header_collection->header_collection.back().HasHeader("__RESPONSE_STATUS__")) { + response->request_error = header_collection->header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); } else { response->request_error = curl_easy_strerror(res); } return response; } - Printer::Print("assigning body"); - response->body = body; - Printer::Print("assigning url"); - response->url= url; - if (!header_collection.header_collection.empty()) { - for (auto &header : header_collection.header_collection.back()) { + response->body = request_info->body; + response->url= request_info->url; + if (header_collection && !header_collection->header_collection.empty()) { + for (auto &header : header_collection->header_collection.back()) { response->headers.Insert(header.first, header.second); } } From 5d8d79fe1cad70530961300ee23508251d272737 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 20 May 2025 14:34:24 +0200 Subject: [PATCH 25/44] more clean up --- .github/workflows/MinioTests.yml | 10 +- CMakeLists.txt | 2 - extension/httpfs/httpfs_client.cpp | 5 +- test/sql/copy/csv/test_csv_httpfs.test | 207 ++++++++++++++----------- 4 files changed, 123 insertions(+), 101 deletions(-) diff --git a/.github/workflows/MinioTests.yml b/.github/workflows/MinioTests.yml index 6f431e1..467b5b3 100644 --- a/.github/workflows/MinioTests.yml +++ b/.github/workflows/MinioTests.yml @@ -33,7 +33,7 @@ jobs: with: python-version: '3.10' - - name: Install libraries + - name: Install Ninja shell: bash run: sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build @@ -48,11 +48,9 @@ jobs: with: vcpkgGitCommitId: 5e5d0e1cd7785623065e77eff011afdeec1a3574 - - name: Build extension - env: - GEN: ninja - run: | - make release + - name: Build + shell: bash + run: make - name: Start S3/HTTP test server shell: bash diff --git a/CMakeLists.txt b/CMakeLists.txt index 878f6f6..865b901 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,8 +9,6 @@ add_extension_definitions() include_directories(extension/httpfs/include ${DUCKDB_MODULE_BASE_DIR}/third_party/httplib) - - build_static_extension( httpfs extension/httpfs/hffs.cpp diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 8a93bd3..252ad2b 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -118,7 +118,7 @@ class HTTPFSClient : public HTTPClient { } state = http_params.state; // init curl globally, - // curl_global_init(CURL_GLOBAL_DEFAULT); + curl_global_init(CURL_GLOBAL_DEFAULT); curl = make_uniq(bearer_token, SelectCURLCertPath()); // set curl options @@ -159,8 +159,7 @@ class HTTPFSClient : public HTTPClient { } ~HTTPFSClient() { - // init curl globally, - // curl_global_cleanup(); + curl_global_cleanup(); } unique_ptr Get(GetRequestInfo &info) override { diff --git a/test/sql/copy/csv/test_csv_httpfs.test b/test/sql/copy/csv/test_csv_httpfs.test index cbeed96..e1de167 100644 --- a/test/sql/copy/csv/test_csv_httpfs.test +++ b/test/sql/copy/csv/test_csv_httpfs.test @@ -9,99 +9,35 @@ require parquet statement ok PRAGMA enable_verification -statement ok -select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_role_type.parquet'; - - -#FIXME this test fails: file is nonexistent -mode skip - -query IIIIII rowsort -SELECT * from read_csv_auto('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c'); +query II +select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_role_type.parquet' order by all; ---- -2020 Allemagne Germany 26.1 53196.069 200601.2 -2020 Autriche Austria 18.0 4723.5 26215.8 -2020 Belgique Belgium 28.999999999999996 9436.1 32553.0 -2020 Bulgarie Bulgaria 11.600000000000001 1124.1 9698.7 -2020 Chypre Cyprus 0.0 0.0 1627.6 -2020 Croatie Croatia 16.3 1094.8 6726.3 -2020 Danemark Denmark 11.600000000000001 1579.0 13601.4 -2020 Espagne Spain 17.4 14211.7 81512.9 -2020 Estonie Estonia 8.5 241.1 2827.3 -2020 Finlande Finland 2.8000000000000003 692.3 24674.4 -2020 France France 20.3 28278.9 139375.8 -2020 Grèce Greece 5.800000000000001 896.5 15401.9 -2020 Hongrie Hungary 30.5 5486.7 17872.4 -2020 Irlande Ireland 17.4 1968.477 11296.601 -2020 Italie Italy 29.2 33042.585 113119.475 -2020 Lettonie Latvia 8.200000000000001 323.605 3926.131 -2020 Lituanie Lithuania 10.7 584.104 5457.728 -2020 Luxembourg Luxembourg 16.5 623.165 3786.785 -2020 Malte Malta 0.0 0.0 547.5 -2020 Pays-Bas Netherlands 37.1 16588.314 44682.656 -2020 Pologne Poland 13.5 9323.205 69135.018 -2020 Portugal Portugal 11.1 1814.878 16354.725 -2020 Roumanie Romania 23.7 5626.161 23712.653 -2020 Royaume-Uni United Kingdom 32.4 39311.416 121414.483 -2020 République tchèque Czech Republic 21.4 5187.282 24263.896 -2020 Slovaquie Slovakia 25.0 2564.876 10248.401 -2020 Slovénie Slovenia 12.1 590.243 4861.315 -2020 Suède Sweden 1.5 475.195 31311.413 -2020 UE 28 Europe 28 22.5 238152.4 1056907.5 -2021 Allemagne Germany 26.760345686044435 51812.567 193616.957 -2021 Autriche Austria 18.720006775926056 4645.795 24817.272 -2021 Belgique Belgium 29.279402721103864 9088.083 31039.168 -2021 Bulgarie Bulgaria 12.368015142641884 1176.537 9512.739 -2021 Chypre Cyprus 0.0 0.0 1528.558 -2021 Croatie Croatia 17.10389029082304 1100.12 6431.987 -2021 Danemark Denmark 11.485631727184947 1508.152 13130.771 -2021 Espagne Spain 19.10173955663722 13815.0 72323.256 -2021 Estonie Estonia 8.988278645659518 245.094 2726.818 -2021 Finlande Finland 2.9937725178230212 694.288 23191.074 -2021 France France 20.649030024470434 26465.646 128168.955 -2021 Grèce Greece 7.580480506088059 1097.87 14482.855 -2021 Hongrie Hungary 32.344729318831554 5693.164 17601.52 -2021 Irlande Ireland 18.020604987495144 1953.468 10840.191 -2021 Italie Italy 30.86368769746751 31807.236 103057.147 -2021 Lettonie Latvia 8.502139837843602 322.927 3798.185 -2021 Lituanie Lithuania 11.029023816606903 582.797 5284.212 -2021 Luxembourg Luxembourg 17.282784281000467 564.365 3265.475 -2021 Malte Malta 0.0 0.0 499.875 -2021 Pays-Bas Netherlands 37.61392206122467 15896.316 42261.788 -2021 Pologne Poland 13.146720200313602 9235.656 70250.647 -2021 Portugal Portugal 11.437926753365227 1740.3 15215.17 -2021 Roumanie Romania 24.909638477223016 5846.885 23472.38 -2021 République tchèque Czech Republic 21.716683280446812 5158.445 23753.374 -2021 Slovaquie Slovakia 25.253930010417324 2427.134 9610.916 -2021 Slovénie Slovenia 13.141683407321874 582.024 4428.839 -2021 Suède Sweden 1.497679952802663 471.085 31454.317 -2021 UE 27 UE 27 21.894190365821018 193930.95399999994 885764.4460000001 - -query IIIIII rowsort res -SELECT * from read_csv('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c',DELIM=';',Columns={'annee_de_reference':'VARCHAR','pays':'VARCHAR','label_en':'VARCHAR','part_du_gaz_naturel_dans_la_consommation_finale_d_energie0':'VARCHAR','consommation_finale_de_gaz_naturel_mtep':'VARCHAR','consommation_finale_d_energie_totale_mtep':'VARCHAR'}); +1 actor +2 actress +3 producer +4 writer +5 cinematographer +6 composer +7 costume designer +8 director +9 editor +10 miscellaneous crew +11 production designer +12 guest - -query IIIIII rowsort res -SELECT * from read_csv('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c',DELIM=';',Columns={'annee_de_reference':'VARCHAR','pays':'VARCHAR','label_en':'VARCHAR','part_du_gaz_naturel_dans_la_consommation_finale_d_energie0':'VARCHAR','consommation_finale_de_gaz_naturel_mtep':'VARCHAR','consommation_finale_d_energie_totale_mtep':'VARCHAR'}); - - -# Give it a try to a request that returns length 0 -query I -SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') ----- -1265 - -# Give it a try to a request that returns length 0 -query I -SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') +query IIIIIIIIIIIIIIIIII +select * from 'https://github.com/duckdb/duckdb/raw/9cf66f950dde0173e1a863a7659b3ecf11bf3978/data/csv/customer.csv'; ---- -1265 - -# Give it a try to a request that returns length 0 -query I -SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') ----- -1265 +1 AAAAAAAABAAAAAAA 980124 7135 32946 2452238 2452208 Mr. Javier Lewis Y 9 12 1936 CHILE NULL Javier.Lewis@VFAxlnZEvOx.org 2452508 +2 AAAAAAAACAAAAAAA 819667 1461 31655 2452318 2452288 Dr. Amy Moses Y 9 4 1966 TOGO NULL Amy.Moses@Ovk9KjHH.com 2452318 +3 AAAAAAAADAAAAAAA 1473522 6247 48572 2449130 2449100 Miss Latisha Hamilton Y 18 9 1979 NIUE NULL Latisha.Hamilton@V.com 2452313 +4 AAAAAAAAEAAAAAAA 1703214 3986 39558 2450030 2450000 Dr. Michael White Y 7 6 1983 MEXICO NULL Michael.White@i.org 2452361 +5 AAAAAAAAFAAAAAAA 953372 4470 36368 2449438 2449408 Sir Robert Moran N 8 5 1956 FIJI NULL Robert.Moran@Hh.edu 2452469 +6 AAAAAAAAGAAAAAAA 213219 6374 27082 2451883 2451853 Ms. Brunilda Sharp Y 4 12 1925 SURINAME NULL Brunilda.Sharp@T3pylZEUQjm.org 2452430 +7 AAAAAAAAHAAAAAAA 68377 3219 44814 2451438 2451408 Ms. Fonda Wiles N 24 4 1985 GAMBIA NULL Fonda.Wiles@S9KnyEtz9hv.org 2452360 +8 AAAAAAAAIAAAAAAA 1215897 2471 16598 2449406 2449376 Sir Ollie Shipman N 26 12 1938 KOREA, REPUBLIC OF NULL Ollie.Shipman@be.org 2452334 +9 AAAAAAAAJAAAAAAA 1168667 1404 49388 2452275 2452245 Sir Karl Gilbert N 26 10 1966 MONTSERRAT NULL Karl.Gilbert@Crg5KyP2IxX9C4d6.edu 2452454 +10 AAAAAAAAKAAAAAAA 1207553 5143 19580 2451353 2451323 Ms. Albert Brunson N 15 10 1973 JORDAN NULL Albert.Brunson@62.com 2452641 #Add test for 5924 query IIIIII @@ -358,3 +294,94 @@ select * from read_csv_auto('https://csvbase.com/meripaterson/stock-exchanges'); 249 North America United States of America Members' Exchange NULL 2020-09-24 250 Africa Zimbabwe Victoria Falls Stock Exchange NULL 2020-11-01 251 Asia China Beijing Stock Exchange NULL 2021-12-27 + + +#FIXME this test fails: file is nonexistent +mode skip + +query IIIIII rowsort +SELECT * from read_csv_auto('https://github.com/duckdb/duckdb/raw/9cf66f950dde0173e1a863a7659b3ecf11bf3978/data/csv/customer.csv'); +---- +2020 Allemagne Germany 26.1 53196.069 200601.2 +2020 Autriche Austria 18.0 4723.5 26215.8 +2020 Belgique Belgium 28.999999999999996 9436.1 32553.0 +2020 Bulgarie Bulgaria 11.600000000000001 1124.1 9698.7 +2020 Chypre Cyprus 0.0 0.0 1627.6 +2020 Croatie Croatia 16.3 1094.8 6726.3 +2020 Danemark Denmark 11.600000000000001 1579.0 13601.4 +2020 Espagne Spain 17.4 14211.7 81512.9 +2020 Estonie Estonia 8.5 241.1 2827.3 +2020 Finlande Finland 2.8000000000000003 692.3 24674.4 +2020 France France 20.3 28278.9 139375.8 +2020 Grèce Greece 5.800000000000001 896.5 15401.9 +2020 Hongrie Hungary 30.5 5486.7 17872.4 +2020 Irlande Ireland 17.4 1968.477 11296.601 +2020 Italie Italy 29.2 33042.585 113119.475 +2020 Lettonie Latvia 8.200000000000001 323.605 3926.131 +2020 Lituanie Lithuania 10.7 584.104 5457.728 +2020 Luxembourg Luxembourg 16.5 623.165 3786.785 +2020 Malte Malta 0.0 0.0 547.5 +2020 Pays-Bas Netherlands 37.1 16588.314 44682.656 +2020 Pologne Poland 13.5 9323.205 69135.018 +2020 Portugal Portugal 11.1 1814.878 16354.725 +2020 Roumanie Romania 23.7 5626.161 23712.653 +2020 Royaume-Uni United Kingdom 32.4 39311.416 121414.483 +2020 République tchèque Czech Republic 21.4 5187.282 24263.896 +2020 Slovaquie Slovakia 25.0 2564.876 10248.401 +2020 Slovénie Slovenia 12.1 590.243 4861.315 +2020 Suède Sweden 1.5 475.195 31311.413 +2020 UE 28 Europe 28 22.5 238152.4 1056907.5 +2021 Allemagne Germany 26.760345686044435 51812.567 193616.957 +2021 Autriche Austria 18.720006775926056 4645.795 24817.272 +2021 Belgique Belgium 29.279402721103864 9088.083 31039.168 +2021 Bulgarie Bulgaria 12.368015142641884 1176.537 9512.739 +2021 Chypre Cyprus 0.0 0.0 1528.558 +2021 Croatie Croatia 17.10389029082304 1100.12 6431.987 +2021 Danemark Denmark 11.485631727184947 1508.152 13130.771 +2021 Espagne Spain 19.10173955663722 13815.0 72323.256 +2021 Estonie Estonia 8.988278645659518 245.094 2726.818 +2021 Finlande Finland 2.9937725178230212 694.288 23191.074 +2021 France France 20.649030024470434 26465.646 128168.955 +2021 Grèce Greece 7.580480506088059 1097.87 14482.855 +2021 Hongrie Hungary 32.344729318831554 5693.164 17601.52 +2021 Irlande Ireland 18.020604987495144 1953.468 10840.191 +2021 Italie Italy 30.86368769746751 31807.236 103057.147 +2021 Lettonie Latvia 8.502139837843602 322.927 3798.185 +2021 Lituanie Lithuania 11.029023816606903 582.797 5284.212 +2021 Luxembourg Luxembourg 17.282784281000467 564.365 3265.475 +2021 Malte Malta 0.0 0.0 499.875 +2021 Pays-Bas Netherlands 37.61392206122467 15896.316 42261.788 +2021 Pologne Poland 13.146720200313602 9235.656 70250.647 +2021 Portugal Portugal 11.437926753365227 1740.3 15215.17 +2021 Roumanie Romania 24.909638477223016 5846.885 23472.38 +2021 République tchèque Czech Republic 21.716683280446812 5158.445 23753.374 +2021 Slovaquie Slovakia 25.253930010417324 2427.134 9610.916 +2021 Slovénie Slovenia 13.141683407321874 582.024 4428.839 +2021 Suède Sweden 1.497679952802663 471.085 31454.317 +2021 UE 27 UE 27 21.894190365821018 193930.95399999994 885764.4460000001 + +query IIIIII rowsort res +SELECT * from read_csv('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c',DELIM=';',Columns={'annee_de_reference':'VARCHAR','pays':'VARCHAR','label_en':'VARCHAR','part_du_gaz_naturel_dans_la_consommation_finale_d_energie0':'VARCHAR','consommation_finale_de_gaz_naturel_mtep':'VARCHAR','consommation_finale_d_energie_totale_mtep':'VARCHAR'}); + + +query IIIIII rowsort res +SELECT * from read_csv('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c',DELIM=';',Columns={'annee_de_reference':'VARCHAR','pays':'VARCHAR','label_en':'VARCHAR','part_du_gaz_naturel_dans_la_consommation_finale_d_energie0':'VARCHAR','consommation_finale_de_gaz_naturel_mtep':'VARCHAR','consommation_finale_d_energie_totale_mtep':'VARCHAR'}); + + +# Give it a try to a request that returns length 0 +query I +SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') +---- +1265 + +# Give it a try to a request that returns length 0 +query I +SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') +---- +1265 + +# Give it a try to a request that returns length 0 +query I +SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') +---- +1265 \ No newline at end of file From 0c420152cdf1eb202ad1ab58c2f2cb7a024ab878 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 20 May 2025 15:21:54 +0200 Subject: [PATCH 26/44] call curl_global_init, unsure when to call cleanup --- extension/httpfs/httpfs_client.cpp | 40 +++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index e2e3762..13ebd6d 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -94,6 +94,8 @@ static size_t RequestHeaderCallback(void *contents, size_t size, size_t nmemb, v } if (!cert_path.empty()) { curl_easy_setopt(curl, CURLOPT_CAINFO, cert_path.c_str()); + } else { + throw InternalException("Could not set certificate authority"); } } @@ -109,6 +111,8 @@ struct RequestInfo { }; +static idx_t httpfs_client_count = 0; + class HTTPFSClient : public HTTPClient { public: HTTPFSClient(HTTPFSParams &http_params, const string &proto_host_port) { @@ -118,8 +122,9 @@ class HTTPFSClient : public HTTPClient { } state = http_params.state; - // init curl globally, - curl_global_init(CURL_GLOBAL_DEFAULT); + // call curl_global_init if not already done by another HTTPFS Client + InitCurlGlobal(); + curl = make_uniq(bearer_token, SelectCURLCertPath()); // set curl options @@ -160,7 +165,7 @@ class HTTPFSClient : public HTTPClient { } ~HTTPFSClient() { - curl_global_cleanup(); + DestroyCurlGlobal(); } unique_ptr Get(GetRequestInfo &info) override { @@ -289,6 +294,8 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); curl_easy_setopt(*curl, CURLOPT_HTTPGET, 0L); + curl_easy_setopt(*curl, CURLOPT_VERBOSE, 1L); + // Follow redirects curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); @@ -443,6 +450,7 @@ class HTTPFSClient : public HTTPClient { auto status_code = HTTPStatusCode(request_info->response_code); auto response = make_uniq(status_code); if (res != CURLcode::CURLE_OK) { + // TODO: request error can come from HTTPS Status code toString() value. if (header_collection && !header_collection->header_collection.empty() && header_collection->header_collection.back().HasHeader("__RESPONSE_STATUS__")) { response->request_error = header_collection->header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); } else { @@ -463,6 +471,32 @@ class HTTPFSClient : public HTTPClient { private: unique_ptr curl; optional_ptr state; + + static std::mutex &GetRefLock() { + static std::mutex mtx; + return mtx; + } + + static void InitCurlGlobal() { + GetRefLock(); + if (httpfs_client_count == 0) { + curl_global_init(CURL_GLOBAL_DEFAULT); + } + ++httpfs_client_count; + } + + static void DestroyCurlGlobal() { + // TODO: when to call curl_global_cleanup() + // calling it on client destruction causes SSL errors when verification is on (due to many requests). + // GetRefLock(); + // if (httpfs_client_count == 0) { + // throw InternalException("Destroying Httpfs client that did not initialize CURL"); + // } + // --httpfs_client_count; + // if (httpfs_client_count == 0) { + // curl_global_cleanup(); + // } + } }; unique_ptr HTTPFSUtil::InitializeClient(HTTPParams &http_params, const string &proto_host_port) { From bd44ff2a11246de2e7788c05659b909584dd81c5 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 20 May 2025 15:25:10 +0200 Subject: [PATCH 27/44] remove verbose --- extension/httpfs/httpfs_client.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 13ebd6d..2d93c4d 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -294,8 +294,6 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); curl_easy_setopt(*curl, CURLOPT_HTTPGET, 0L); - curl_easy_setopt(*curl, CURLOPT_VERBOSE, 1L); - // Follow redirects curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); From 3da7bfff534272dd4cb00d44722ab3615331fb7b Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 21 May 2025 10:11:40 +0200 Subject: [PATCH 28/44] reuse many components --- extension/httpfs/httpfs_client.cpp | 112 +++++++++-------------------- 1 file changed, 35 insertions(+), 77 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 2d93c4d..1c4695e 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -59,7 +59,7 @@ static size_t RequestHeaderCallback(void *contents, size_t size, size_t nmemb, v } // If header starts with HTTP/... curl has followed a redirect and we have a new Header, - // so we clear all of the current header_collection + // so we push back a new header_collection and store headers from the redirect there. if (header.rfind("HTTP/", 0) == 0) { header_collection->header_collection.push_back(HTTPHeaders()); header_collection->header_collection.back().Insert("__RESPONSE_STATUS__", header); @@ -108,6 +108,7 @@ struct RequestInfo { string url = ""; string body = ""; uint16_t response_code = 0; + std::vector header_collection; }; @@ -126,6 +127,7 @@ class HTTPFSClient : public HTTPClient { InitCurlGlobal(); curl = make_uniq(bearer_token, SelectCURLCertPath()); + request_info = make_uniq(); // set curl options // follow redirects @@ -153,6 +155,15 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_CONNECTTIMEOUT, http_params.timeout); // accept content as-is (i.e no decompressing) curl_easy_setopt(*curl, CURLOPT_ACCEPT_ENCODING, "identity"); + // follow redirects + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + + // define the header callback + curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); + // define the write data callback (for get requests) + curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); if (!http_params.http_proxy.empty()) { curl_easy_setopt(*curl, CURLOPT_PROXY, StringUtil::Format("%s:%s", http_params.http_proxy, http_params.http_proxy_port).c_str()); @@ -174,28 +185,17 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); - auto request_info = make_uniq(); request_info->url = info.url; if (!info.params.extra_headers.empty()) { auto curl_params = TransformParamsCurl(info.params); request_info->url += "?" + curl_params; } - auto header_collector = make_uniq(); CURLcode res; { // If the same handle served a HEAD request, we must set NOBODY back to 0L to request content again curl_easy_setopt(*curl, CURLOPT_NOBODY, 0L); - - curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); - // write response data - curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); - // write response headers (different header collection for each redirect) - curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); - curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); res = curl->Execute(); } @@ -203,8 +203,8 @@ class HTTPFSClient : public HTTPClient { curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); idx_t bytes_received = 0; - if (header_collector && header_collector->header_collection.empty() && header_collector->header_collection.back().HasHeader("content-length")) { - bytes_received = std::stoi(header_collector->header_collection.back().GetHeaderValue("content-length")); + if (!request_info->header_collection.empty() && request_info->header_collection.back().HasHeader("content-length")) { + bytes_received = std::stoi(request_info->header_collection.back().GetHeaderValue("content-length")); D_ASSERT(bytes_received == request_info->body.size()); } else { bytes_received = request_info->body.size(); @@ -215,7 +215,7 @@ class HTTPFSClient : public HTTPClient { const char* data = request_info->body.c_str(); info.content_handler(const_data_ptr_cast(data), bytes_received); - return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); + return TransformResponseCurl(res); } unique_ptr Put(PutRequestInfo &info) override { @@ -225,7 +225,6 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); - auto request_info = make_uniq(); // Add content type header from info curl_headers.Add("Content-Type: " + info.content_type); // transform parameters @@ -235,39 +234,24 @@ class HTTPFSClient : public HTTPClient { request_info->url += "?" + curl_params; } - auto header_collector = make_uniq(); CURLcode res; { curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); - // Perform PUT curl_easy_setopt(*curl, CURLOPT_CUSTOMREQUEST, "PUT"); - // Include PUT body curl_easy_setopt(*curl, CURLOPT_POSTFIELDS, const_char_ptr_cast(info.buffer_in)); curl_easy_setopt(*curl, CURLOPT_POSTFIELDSIZE, info.buffer_in_len); - // Follow redirects if needed - curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); - - // Capture response body - curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); - - // Capture response headers - curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); - // Apply headers curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); - // Execute the request res = curl->Execute(); } curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); - return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); + return TransformResponseCurl(res); } unique_ptr Head(HeadRequestInfo &info) override { @@ -276,7 +260,6 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); - auto request_info = make_uniq(); request_info->url = info.url; // transform parameters if (!info.params.extra_headers.empty()) { @@ -285,7 +268,6 @@ class HTTPFSClient : public HTTPClient { } CURLcode res; - auto header_collector = make_uniq(); { // Set URL curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); @@ -294,17 +276,6 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); curl_easy_setopt(*curl, CURLOPT_HTTPGET, 0L); - // Follow redirects - curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); - - // set write function to collect body — no body expected, so safe to omit - curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); - - // Collect response headers (multiple header blocks for redirects) - curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); - // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -313,7 +284,7 @@ class HTTPFSClient : public HTTPClient { } curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); - return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); + return TransformResponseCurl(res); } unique_ptr Delete(DeleteRequestInfo &info) override { @@ -322,7 +293,6 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); - auto request_info = make_uniq(); // transform parameters request_info->url = info.url; if (!info.params.extra_headers.empty()) { @@ -331,7 +301,6 @@ class HTTPFSClient : public HTTPClient { } CURLcode res; - auto header_collector = make_uniq(); { // Set URL curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); @@ -342,14 +311,6 @@ class HTTPFSClient : public HTTPClient { // Follow redirects curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); - // Set write function to collect response body - curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); - - // Collect response headers (multiple header blocks for redirects) - curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); - // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -359,7 +320,7 @@ class HTTPFSClient : public HTTPClient { // Get HTTP response status code curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); - return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); + return TransformResponseCurl( res); } unique_ptr Post(PostRequestInfo &info) override { @@ -369,7 +330,6 @@ class HTTPFSClient : public HTTPClient { } auto curl_headers = TransformHeadersCurl(info.headers); - auto request_info = make_uniq(); const string content_type = "Content-Type: application/octet-stream"; curl_headers.Add(content_type.c_str()); // transform parameters @@ -380,7 +340,6 @@ class HTTPFSClient : public HTTPClient { } CURLcode res; - auto header_collector = make_uniq(); { curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); curl_easy_setopt(*curl, CURLOPT_POST, 1L); @@ -389,18 +348,6 @@ class HTTPFSClient : public HTTPClient { curl_easy_setopt(*curl, CURLOPT_POSTFIELDS, const_char_ptr_cast(info.buffer_in)); curl_easy_setopt(*curl, CURLOPT_POSTFIELDSIZE, info.buffer_in_len); - // Follow redirects - // TODO: should we follow redirects for POST? - curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); - - // Set write function to collect response body - curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); - curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); - - // Collect response headers (multiple header blocks for redirects) - curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); - curl_easy_setopt(*curl, CURLOPT_HEADERDATA, header_collector.get()); - // Add headers if any curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); @@ -411,7 +358,7 @@ class HTTPFSClient : public HTTPClient { curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); info.buffer_out = request_info->body; // Construct HTTPResponse - return TransformResponseCurl(std::move(request_info), std::move(header_collector), res); + return TransformResponseCurl( res); } private: @@ -444,13 +391,22 @@ class HTTPFSClient : public HTTPClient { return result; } - unique_ptr TransformResponseCurl(unique_ptr request_info, unique_ptr header_collection, CURLcode res) { + void ResetRequestInfo() { + // clear headers after transform + request_info->header_collection.clear(); + // reset request info. + request_info->body = ""; + request_info->url = ""; + request_info->response_code = 0; + } + + unique_ptr TransformResponseCurl(CURLcode res) { auto status_code = HTTPStatusCode(request_info->response_code); auto response = make_uniq(status_code); if (res != CURLcode::CURLE_OK) { // TODO: request error can come from HTTPS Status code toString() value. - if (header_collection && !header_collection->header_collection.empty() && header_collection->header_collection.back().HasHeader("__RESPONSE_STATUS__")) { - response->request_error = header_collection->header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); + if (!request_info->header_collection.empty() && request_info->header_collection.back().HasHeader("__RESPONSE_STATUS__")) { + response->request_error = request_info->header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); } else { response->request_error = curl_easy_strerror(res); } @@ -458,17 +414,19 @@ class HTTPFSClient : public HTTPClient { } response->body = request_info->body; response->url= request_info->url; - if (header_collection && !header_collection->header_collection.empty()) { - for (auto &header : header_collection->header_collection.back()) { + if (!request_info->header_collection.empty()) { + for (auto &header : request_info->header_collection.back()) { response->headers.Insert(header.first, header.second); } } + ResetRequestInfo(); return response; } private: unique_ptr curl; optional_ptr state; + unique_ptr request_info; static std::mutex &GetRefLock() { static std::mutex mtx; From 198288e62c8b7643ac012c6a2518760ddc6d0c23 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 21 May 2025 10:27:56 +0200 Subject: [PATCH 29/44] add test to check response headers --- test/sql/copy/csv/test_csv_httpfs.test | 5 ++++ test/sql/test_headers_parsed.test | 38 ++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 test/sql/test_headers_parsed.test diff --git a/test/sql/copy/csv/test_csv_httpfs.test b/test/sql/copy/csv/test_csv_httpfs.test index e1de167..1641a0e 100644 --- a/test/sql/copy/csv/test_csv_httpfs.test +++ b/test/sql/copy/csv/test_csv_httpfs.test @@ -9,6 +9,9 @@ require parquet statement ok PRAGMA enable_verification +statement ok +pragma enable_logging('HTTP'); + query II select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_role_type.parquet' order by all; ---- @@ -25,6 +28,8 @@ select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_ 11 production designer 12 guest +pragma enable_logging('HTTP'); + query IIIIIIIIIIIIIIIIII select * from 'https://github.com/duckdb/duckdb/raw/9cf66f950dde0173e1a863a7659b3ecf11bf3978/data/csv/customer.csv'; ---- diff --git a/test/sql/test_headers_parsed.test b/test/sql/test_headers_parsed.test new file mode 100644 index 0000000..3d331ed --- /dev/null +++ b/test/sql/test_headers_parsed.test @@ -0,0 +1,38 @@ +# name: test/sql/copy/csv/test_headers_parsed.test +# description: This test triggers the http prefetch mechanism. +# group: [csv] + +require httpfs + +require parquet + +statement ok +pragma enable_logging('HTTP'); + +query II +select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_role_type.parquet' order by all; +---- +1 actor +2 actress +3 producer +4 writer +5 cinematographer +6 composer +7 costume designer +8 director +9 editor +10 miscellaneous crew +11 production designer +12 guest + +query II +select response.status from duckdb_logs_parsed('HTTP') order by all; +---- +OK_200 +PartialContent_206 + +query II +select response.headers['__RESPONSE_STATUS__'] from duckdb_logs_parsed('HTTP') order by all; +---- +HTTP/2 200 +HTTP/2 206 \ No newline at end of file From 0f9f74928bcdef6fd02820bd5c30218169604eb3 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 21 May 2025 10:33:21 +0200 Subject: [PATCH 30/44] remove internal exception if certificate authority cannot be set --- extension/httpfs/httpfs_client.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 1c4695e..9931b87 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -94,8 +94,6 @@ static size_t RequestHeaderCallback(void *contents, size_t size, size_t nmemb, v } if (!cert_path.empty()) { curl_easy_setopt(curl, CURLOPT_CAINFO, cert_path.c_str()); - } else { - throw InternalException("Could not set certificate authority"); } } From 73103b1a5bfb64ed18457b973fd1b6f06738c25c Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 21 May 2025 10:37:16 +0200 Subject: [PATCH 31/44] fix test --- test/sql/test_headers_parsed.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/sql/test_headers_parsed.test b/test/sql/test_headers_parsed.test index 3d331ed..1044d45 100644 --- a/test/sql/test_headers_parsed.test +++ b/test/sql/test_headers_parsed.test @@ -25,7 +25,7 @@ select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_ 11 production designer 12 guest -query II +query I select response.status from duckdb_logs_parsed('HTTP') order by all; ---- OK_200 From afb0c2aec107174546789b508de82a45f441295f Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 21 May 2025 10:53:31 +0200 Subject: [PATCH 32/44] actually fix test --- test/sql/test_headers_parsed.test | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/test/sql/test_headers_parsed.test b/test/sql/test_headers_parsed.test index 1044d45..44a5121 100644 --- a/test/sql/test_headers_parsed.test +++ b/test/sql/test_headers_parsed.test @@ -31,8 +31,16 @@ select response.status from duckdb_logs_parsed('HTTP') order by all; OK_200 PartialContent_206 -query II -select response.headers['__RESPONSE_STATUS__'] from duckdb_logs_parsed('HTTP') order by all; + +# response status is either +# HTTP/2 200 +# HTTP/2 206 +# OR +# HTTP/1.1 200 OK +# HTTP/1.1 206 Partial Content +# depending on OS and CA (I think) +query I +select response.headers['__RESPONSE_STATUS__'] LIKE 'HTTP%20%' from duckdb_logs_parsed('HTTP') order by all; ---- -HTTP/2 200 -HTTP/2 206 \ No newline at end of file +true +true \ No newline at end of file From 4b32d40ed26e95d72d5f1e90439963dab5f6c505 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Wed, 21 May 2025 10:57:48 +0200 Subject: [PATCH 33/44] remove rogue pragma --- test/sql/copy/csv/test_csv_httpfs.test | 2 -- 1 file changed, 2 deletions(-) diff --git a/test/sql/copy/csv/test_csv_httpfs.test b/test/sql/copy/csv/test_csv_httpfs.test index 1641a0e..d62683e 100644 --- a/test/sql/copy/csv/test_csv_httpfs.test +++ b/test/sql/copy/csv/test_csv_httpfs.test @@ -28,8 +28,6 @@ select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_ 11 production designer 12 guest -pragma enable_logging('HTTP'); - query IIIIIIIIIIIIIIIIII select * from 'https://github.com/duckdb/duckdb/raw/9cf66f950dde0173e1a863a7659b3ecf11bf3978/data/csv/customer.csv'; ---- From 07a7afa8c44067dea6ba7d719fa8b0a34aa2304e Mon Sep 17 00:00:00 2001 From: Tmonster Date: Mon, 26 May 2025 10:41:57 +0200 Subject: [PATCH 34/44] re-run CI for after release From 3a93b424dc53a69ec2e2e55665a195cd020e73d1 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Tue, 17 Jun 2025 11:10:59 +0200 Subject: [PATCH 35/44] update main distribution pipeline --- .github/workflows/MainDistributionPipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index ec31856..d44e529 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -17,7 +17,7 @@ jobs: uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main with: extension_name: httpfs - duckdb_version: v1.3.0 + duckdb_version: v1.3.1 ci_tools_version: main duckdb-stable-deploy: @@ -27,6 +27,6 @@ jobs: secrets: inherit with: extension_name: httpfs - duckdb_version: v1.3.0 + duckdb_version: v1.3.1 ci_tools_version: main deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }} From 3251e7c5c90b4fbbe16a371e5679706e3f8beac9 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Wed, 2 Jul 2025 14:25:43 +0200 Subject: [PATCH 36/44] Override http_util only if not already named 'WasmHTTPUtils' --- extension/httpfs/httpfs_extension.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/extension/httpfs/httpfs_extension.cpp b/extension/httpfs/httpfs_extension.cpp index c9bc985..aab275b 100644 --- a/extension/httpfs/httpfs_extension.cpp +++ b/extension/httpfs/httpfs_extension.cpp @@ -61,7 +61,12 @@ static void LoadInternal(DatabaseInstance &instance) { // HuggingFace options config.AddExtensionOption("hf_max_per_page", "Debug option to limit number of items returned in list requests", LogicalType::UBIGINT, Value::UBIGINT(0)); - config.http_util = make_shared_ptr(); + + if (config.http_util && config.http_util->GetName() == "WasmHTTPUtils") { + // Already handled, do not override + } else { + config.http_util = make_shared_ptr(); + } auto provider = make_uniq(config); provider->SetAll(); From 454849554de81dcf6ef8cda780205cb0d504d4da Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Wed, 2 Jul 2025 15:44:20 +0200 Subject: [PATCH 37/44] Extract some code to hash_functions.cpp/hpp --- CMakeLists.txt | 2 ++ extension/httpfs/crypto.cpp | 23 +---------------- extension/httpfs/hash_functions.cpp | 28 +++++++++++++++++++++ extension/httpfs/include/hash_functions.hpp | 18 +++++++++++++ 4 files changed, 49 insertions(+), 22 deletions(-) create mode 100644 extension/httpfs/hash_functions.cpp create mode 100644 extension/httpfs/include/hash_functions.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 92d4547..40b0d3d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,7 @@ build_static_extension( extension/httpfs/httpfs_client.cpp extension/httpfs/http_state.cpp extension/httpfs/crypto.cpp + extension/httpfs/hash_functions.cpp extension/httpfs/create_secret_functions.cpp extension/httpfs/httpfs_extension.cpp) @@ -30,6 +31,7 @@ build_loadable_extension( extension/httpfs/httpfs_client.cpp extension/httpfs/http_state.cpp extension/httpfs/crypto.cpp + extension/httpfs/hash_functions.cpp extension/httpfs/create_secret_functions.cpp extension/httpfs/httpfs_extension.cpp) diff --git a/extension/httpfs/crypto.cpp b/extension/httpfs/crypto.cpp index 04bd795..3a89ca5 100644 --- a/extension/httpfs/crypto.cpp +++ b/extension/httpfs/crypto.cpp @@ -1,4 +1,5 @@ #include "crypto.hpp" +#include "hash_functions.hpp" #include "mbedtls_wrapper.hpp" #include #include "duckdb/common/common.hpp" @@ -9,28 +10,6 @@ namespace duckdb { -void sha256(const char *in, size_t in_len, hash_bytes &out) { - duckdb_mbedtls::MbedTlsWrapper::ComputeSha256Hash(in, in_len, (char *)out); -} - -void hmac256(const std::string &message, const char *secret, size_t secret_len, hash_bytes &out) { - duckdb_mbedtls::MbedTlsWrapper::Hmac256(secret, secret_len, message.data(), message.size(), (char *)out); -} - -void hmac256(std::string message, hash_bytes secret, hash_bytes &out) { - hmac256(message, (char *)secret, sizeof(hash_bytes), out); -} - -void hex256(hash_bytes &in, hash_str &out) { - const char *hex = "0123456789abcdef"; - unsigned char *pin = in; - unsigned char *pout = out; - for (; pin < in + sizeof(in); pout += 2, pin++) { - pout[0] = hex[(*pin >> 4) & 0xF]; - pout[1] = hex[*pin & 0xF]; - } -} - AESStateSSL::AESStateSSL(const std::string *key) : context(EVP_CIPHER_CTX_new()) { if (!(context)) { throw InternalException("AES GCM failed with initializing context"); diff --git a/extension/httpfs/hash_functions.cpp b/extension/httpfs/hash_functions.cpp new file mode 100644 index 0000000..1e6bb8f --- /dev/null +++ b/extension/httpfs/hash_functions.cpp @@ -0,0 +1,28 @@ +#include "mbedtls_wrapper.hpp" +#include "hash_functions.hpp" + +namespace duckdb { + +void sha256(const char *in, size_t in_len, hash_bytes &out) { + duckdb_mbedtls::MbedTlsWrapper::ComputeSha256Hash(in, in_len, (char *)out); +} + +void hmac256(const std::string &message, const char *secret, size_t secret_len, hash_bytes &out) { + duckdb_mbedtls::MbedTlsWrapper::Hmac256(secret, secret_len, message.data(), message.size(), (char *)out); +} + +void hmac256(std::string message, hash_bytes secret, hash_bytes &out) { + hmac256(message, (char *)secret, sizeof(hash_bytes), out); +} + +void hex256(hash_bytes &in, hash_str &out) { + const char *hex = "0123456789abcdef"; + unsigned char *pin = in; + unsigned char *pout = out; + for (; pin < in + sizeof(in); pout += 2, pin++) { + pout[0] = hex[(*pin >> 4) & 0xF]; + pout[1] = hex[*pin & 0xF]; + } +} + +} // namespace duckdb diff --git a/extension/httpfs/include/hash_functions.hpp b/extension/httpfs/include/hash_functions.hpp new file mode 100644 index 0000000..bfefe79 --- /dev/null +++ b/extension/httpfs/include/hash_functions.hpp @@ -0,0 +1,18 @@ +#pragma once + +#include "duckdb/common/helper.hpp" + +namespace duckdb { + +typedef unsigned char hash_bytes[32]; +typedef unsigned char hash_str[64]; + +void sha256(const char *in, size_t in_len, hash_bytes &out); + +void hmac256(const std::string &message, const char *secret, size_t secret_len, hash_bytes &out); + +void hmac256(std::string message, hash_bytes secret, hash_bytes &out); + +void hex256(hash_bytes &in, hash_str &out); + +} // namespace duckdb From ae7230f3f087a8783b978e2894712c00a13cfeac Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Wed, 2 Jul 2025 15:50:41 +0200 Subject: [PATCH 38/44] Feature select: no crytpo.cpp and no override of encryption_utils in Wasm --- CMakeLists.txt | 11 +++++++++-- extension/httpfs/httpfs_extension.cpp | 4 ++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 40b0d3d..5c68ea0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,11 @@ add_extension_definitions() include_directories(extension/httpfs/include ${DUCKDB_MODULE_BASE_DIR}/third_party/httplib) +if (NOT EMSCRIPTEN) + set(EXTRA_SOURCES "extension/httpfs/crypto.cpp") + add_definitions(-DOVERRIDE_ENCRYPTION_UTILS=1) +endif() + build_static_extension( httpfs extension/httpfs/hffs.cpp @@ -19,7 +24,8 @@ build_static_extension( extension/httpfs/crypto.cpp extension/httpfs/hash_functions.cpp extension/httpfs/create_secret_functions.cpp - extension/httpfs/httpfs_extension.cpp) + extension/httpfs/httpfs_extension.cpp + ${EXTRA_SOURCES}) set(PARAMETERS "-warnings") build_loadable_extension( @@ -33,7 +39,8 @@ build_loadable_extension( extension/httpfs/crypto.cpp extension/httpfs/hash_functions.cpp extension/httpfs/create_secret_functions.cpp - extension/httpfs/httpfs_extension.cpp) + extension/httpfs/httpfs_extension.cpp + ${EXTRA_SOURCES}) if(MINGW) set(OPENSSL_USE_STATIC_LIBS TRUE) diff --git a/extension/httpfs/httpfs_extension.cpp b/extension/httpfs/httpfs_extension.cpp index aab275b..945cdd4 100644 --- a/extension/httpfs/httpfs_extension.cpp +++ b/extension/httpfs/httpfs_extension.cpp @@ -6,7 +6,9 @@ #include "duckdb.hpp" #include "s3fs.hpp" #include "hffs.hpp" +#ifdef OVERRIDE_ENCRYPTION_UTILS #include "crypto.hpp" +#endif // OVERRIDE_ENCRYPTION_UTILS namespace duckdb { @@ -74,8 +76,10 @@ static void LoadInternal(DatabaseInstance &instance) { CreateS3SecretFunctions::Register(instance); CreateBearerTokenFunctions::Register(instance); +#ifdef OVERRIDE_ENCRYPTION_UTILS // set pointer to OpenSSL encryption state config.encryption_util = make_shared_ptr(); +#endif // OVERRIDE_ENCRYPTION_UTILS } void HttpfsExtension::Load(DuckDB &db) { LoadInternal(*db.instance); From bb583db996a23a903baea1d8dd67bcf0ffb2424b Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Wed, 18 Jun 2025 14:41:08 +0200 Subject: [PATCH 39/44] Bundle-in mbedtls --- extension_config.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/extension_config.cmake b/extension_config.cmake index 5881043..2664bc2 100644 --- a/extension_config.cmake +++ b/extension_config.cmake @@ -15,4 +15,5 @@ duckdb_extension_load(httpfs SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR} INCLUDE_DIR ${CMAKE_CURRENT_LIST_DIR}/extension/httpfs/include ${LOAD_HTTPFS_TESTS} + LINKED_LIBS "../../third_party/mbedtls/libduckdb_mbedtls.a" ) From 9bf98eff513037410650c7677a2707565e1de2d4 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Wed, 2 Jul 2025 15:55:39 +0200 Subject: [PATCH 40/44] Re-enable wasm compilation --- .github/workflows/MainDistributionPipeline.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index b06d213..496d87c 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -19,7 +19,6 @@ jobs: extension_name: httpfs duckdb_version: v1.3-ossivalis ci_tools_version: main - exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads' duckdb-stable-deploy: @@ -32,4 +31,3 @@ jobs: duckdb_version: v1.3-ossivalis ci_tools_version: main deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }} - exclude_archs: 'wasm_mvp;wasm_eh;wasm_threads' From 104bec6ca79cb02719940c43c7d27bce2c7d0b29 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Thu, 3 Jul 2025 09:14:34 +0200 Subject: [PATCH 41/44] Skip conditionally also httpfs_client.cpp --- CMakeLists.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5c68ea0..6791c0a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ include_directories(extension/httpfs/include ${DUCKDB_MODULE_BASE_DIR}/third_party/httplib) if (NOT EMSCRIPTEN) - set(EXTRA_SOURCES "extension/httpfs/crypto.cpp") + set(EXTRA_SOURCES extension/httpfs/crypto.cpp extension/httpfs/httpfs_client.cpp) add_definitions(-DOVERRIDE_ENCRYPTION_UTILS=1) endif() @@ -19,13 +19,12 @@ build_static_extension( extension/httpfs/hffs.cpp extension/httpfs/s3fs.cpp extension/httpfs/httpfs.cpp - extension/httpfs/httpfs_client.cpp extension/httpfs/http_state.cpp extension/httpfs/crypto.cpp extension/httpfs/hash_functions.cpp extension/httpfs/create_secret_functions.cpp extension/httpfs/httpfs_extension.cpp - ${EXTRA_SOURCES}) + ${EXTRA_SOURCES} ) set(PARAMETERS "-warnings") build_loadable_extension( @@ -34,13 +33,12 @@ build_loadable_extension( extension/httpfs/hffs.cpp extension/httpfs/s3fs.cpp extension/httpfs/httpfs.cpp - extension/httpfs/httpfs_client.cpp extension/httpfs/http_state.cpp extension/httpfs/crypto.cpp extension/httpfs/hash_functions.cpp extension/httpfs/create_secret_functions.cpp extension/httpfs/httpfs_extension.cpp - ${EXTRA_SOURCES}) + ${EXTRA_SOURCES} ) if(MINGW) set(OPENSSL_USE_STATIC_LIBS TRUE) From a2cc41149e4e164f853de0ba52c3ff6fad3e11cb Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Thu, 3 Jul 2025 11:16:40 +0200 Subject: [PATCH 42/44] Separate minimal HTTPFSUtil implementations into httpfs_client and httpfs_client_wasm --- CMakeLists.txt | 2 ++ extension/httpfs/httpfs.cpp | 5 +++++ extension/httpfs/httpfs_client.cpp | 4 ---- extension/httpfs/httpfs_client_wasm.cpp | 16 ++++++++++++++++ 4 files changed, 23 insertions(+), 4 deletions(-) create mode 100644 extension/httpfs/httpfs_client_wasm.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 6791c0a..024cb72 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,8 @@ include_directories(extension/httpfs/include if (NOT EMSCRIPTEN) set(EXTRA_SOURCES extension/httpfs/crypto.cpp extension/httpfs/httpfs_client.cpp) add_definitions(-DOVERRIDE_ENCRYPTION_UTILS=1) +else() + set(EXTRA_SOURCES extension/httpfs/httpfs_client_wasm.cpp) endif() build_static_extension( diff --git a/extension/httpfs/httpfs.cpp b/extension/httpfs/httpfs.cpp index 56c8b8e..20f85a6 100644 --- a/extension/httpfs/httpfs.cpp +++ b/extension/httpfs/httpfs.cpp @@ -727,4 +727,9 @@ void HTTPFileHandle::StoreClient(unique_ptr client) { HTTPFileHandle::~HTTPFileHandle() { DUCKDB_LOG_FILE_SYSTEM_CLOSE((*this)); }; + +string HTTPFSUtil::GetName() const { + return "HTTPFS"; +} + } // namespace duckdb diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 84eb457..3bf5a64 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -160,8 +160,4 @@ unordered_map HTTPFSUtil::ParseGetParameters(const string &text) return result; } -string HTTPFSUtil::GetName() const { - return "HTTPFS"; -} - } // namespace duckdb diff --git a/extension/httpfs/httpfs_client_wasm.cpp b/extension/httpfs/httpfs_client_wasm.cpp new file mode 100644 index 0000000..aaa22bb --- /dev/null +++ b/extension/httpfs/httpfs_client_wasm.cpp @@ -0,0 +1,16 @@ +#include "httpfs_client.hpp" +#include "http_state.hpp" + +namespace duckdb { + +unique_ptr HTTPFSUtil::InitializeClient(HTTPParams &http_params, const string &proto_host_port) { + throw InternalException("HTTPFSUtil::InitializeClient is not expected to be called"); +} + +unordered_map HTTPFSUtil::ParseGetParameters(const string &text) { + unordered_map result; + //TODO: HTTPFSUtil::ParseGetParameters is currently not implemented + return result; +} + +} // namespace duckdb From 95567d3235cba0ae3279c9e02063dbfa65711129 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Thu, 3 Jul 2025 11:54:20 +0200 Subject: [PATCH 43/44] Have curl and httplib implementation convive side-by-side --- CMakeLists.txt | 6 +- ...tpfs_client.cpp => httpfs_curl_client.cpp} | 16 +- extension/httpfs/httpfs_extension.cpp | 16 ++ extension/httpfs/httpfs_httplib_client.cpp | 167 ++++++++++++++++++ extension/httpfs/include/httpfs_client.hpp | 9 + 5 files changed, 204 insertions(+), 10 deletions(-) rename extension/httpfs/{httpfs_client.cpp => httpfs_curl_client.cpp} (96%) create mode 100644 extension/httpfs/httpfs_httplib_client.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 865b901..3aa0549 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,8 @@ build_static_extension( extension/httpfs/hffs.cpp extension/httpfs/s3fs.cpp extension/httpfs/httpfs.cpp - extension/httpfs/httpfs_client.cpp + extension/httpfs/httpfs_httplib_client.cpp + extension/httpfs/httpfs_curl_client.cpp extension/httpfs/http_state.cpp extension/httpfs/crypto.cpp extension/httpfs/create_secret_functions.cpp @@ -27,7 +28,8 @@ build_loadable_extension( extension/httpfs/hffs.cpp extension/httpfs/s3fs.cpp extension/httpfs/httpfs.cpp - extension/httpfs/httpfs_client.cpp + extension/httpfs/httpfs_httplib_client.cpp + extension/httpfs/httpfs_curl_client.cpp extension/httpfs/http_state.cpp extension/httpfs/crypto.cpp extension/httpfs/create_secret_functions.cpp diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_curl_client.cpp similarity index 96% rename from extension/httpfs/httpfs_client.cpp rename to extension/httpfs/httpfs_curl_client.cpp index 9931b87..d648269 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_curl_client.cpp @@ -112,9 +112,9 @@ struct RequestInfo { static idx_t httpfs_client_count = 0; -class HTTPFSClient : public HTTPClient { +class HTTPFSCurlClient : public HTTPClient { public: - HTTPFSClient(HTTPFSParams &http_params, const string &proto_host_port) { + HTTPFSCurlClient(HTTPFSParams &http_params, const string &proto_host_port) { auto bearer_token = ""; if (!http_params.bearer_token.empty()) { bearer_token = http_params.bearer_token.c_str(); @@ -173,7 +173,7 @@ class HTTPFSClient : public HTTPClient { } } - ~HTTPFSClient() { + ~HTTPFSCurlClient() { DestroyCurlGlobal(); } @@ -453,12 +453,12 @@ class HTTPFSClient : public HTTPClient { } }; -unique_ptr HTTPFSUtil::InitializeClient(HTTPParams &http_params, const string &proto_host_port) { - auto client = make_uniq(http_params.Cast(), proto_host_port); +unique_ptr HTTPFSCurlUtil::InitializeClient(HTTPParams &http_params, const string &proto_host_port) { + auto client = make_uniq(http_params.Cast(), proto_host_port); return std::move(client); } -unordered_map HTTPFSUtil::ParseGetParameters(const string &text) { +unordered_map HTTPFSCurlUtil::ParseGetParameters(const string &text) { unordered_map params; auto pos = text.find('?'); @@ -482,8 +482,8 @@ unordered_map HTTPFSUtil::ParseGetParameters(const string &text) return params; } -string HTTPFSUtil::GetName() const { - return "HTTPFS"; +string HTTPFSCurlUtil::GetName() const { + return "HTTPFS-Curl"; } } // namespace duckdb diff --git a/extension/httpfs/httpfs_extension.cpp b/extension/httpfs/httpfs_extension.cpp index c9bc985..c8b9485 100644 --- a/extension/httpfs/httpfs_extension.cpp +++ b/extension/httpfs/httpfs_extension.cpp @@ -2,6 +2,7 @@ #include "httpfs_extension.hpp" +#include "httpfs_client.hpp" #include "create_secret_functions.hpp" #include "duckdb.hpp" #include "s3fs.hpp" @@ -61,6 +62,21 @@ static void LoadInternal(DatabaseInstance &instance) { // HuggingFace options config.AddExtensionOption("hf_max_per_page", "Debug option to limit number of items returned in list requests", LogicalType::UBIGINT, Value::UBIGINT(0)); + + auto callback_httpfs_client_implementation = [](ClientContext& context, SetScope scope, Value& parameter) { + auto &config = DBConfig::GetConfig(context); + string value = StringValue::Get(parameter); + if (value == "curl" && (!config.http_util || config.http_util->GetName() != "HTTPFSUtil-Curl")) { + config.http_util = make_shared_ptr(); + } + if ((value == "httplib" || value == "default" )&& (!config.http_util || config.http_util->GetName() != "HTTPFSUtil")) { + config.http_util = make_shared_ptr(); + } + }; + config.AddExtensionOption("httpfs_client_implementation", "Select which is the HTTPUtil implementation to be used", + LogicalType::VARCHAR, "default", callback_httpfs_client_implementation); + + config.http_util = make_shared_ptr(); auto provider = make_uniq(config); diff --git a/extension/httpfs/httpfs_httplib_client.cpp b/extension/httpfs/httpfs_httplib_client.cpp new file mode 100644 index 0000000..84eb457 --- /dev/null +++ b/extension/httpfs/httpfs_httplib_client.cpp @@ -0,0 +1,167 @@ +#include "httpfs_client.hpp" +#include "http_state.hpp" + +#define CPPHTTPLIB_OPENSSL_SUPPORT +#include "httplib.hpp" + +namespace duckdb { + +class HTTPFSClient : public HTTPClient { +public: + HTTPFSClient(HTTPFSParams &http_params, const string &proto_host_port) { + client = make_uniq(proto_host_port); + client->set_follow_location(http_params.follow_location); + client->set_keep_alive(http_params.keep_alive); + if (!http_params.ca_cert_file.empty()) { + client->set_ca_cert_path(http_params.ca_cert_file.c_str()); + } + client->enable_server_certificate_verification(http_params.enable_server_cert_verification); + client->set_write_timeout(http_params.timeout, http_params.timeout_usec); + client->set_read_timeout(http_params.timeout, http_params.timeout_usec); + client->set_connection_timeout(http_params.timeout, http_params.timeout_usec); + client->set_decompress(false); + if (!http_params.bearer_token.empty()) { + client->set_bearer_token_auth(http_params.bearer_token.c_str()); + } + + if (!http_params.http_proxy.empty()) { + client->set_proxy(http_params.http_proxy, http_params.http_proxy_port); + + if (!http_params.http_proxy_username.empty()) { + client->set_proxy_basic_auth(http_params.http_proxy_username, http_params.http_proxy_password); + } + } + state = http_params.state; + } + + unique_ptr Get(GetRequestInfo &info) override { + if (state) { + state->get_count++; + } + auto headers = TransformHeaders(info.headers, info.params); + if (!info.response_handler && !info.content_handler) { + return TransformResult(client->Get(info.path, headers)); + } else { + return TransformResult(client->Get( + info.path.c_str(), headers, + [&](const duckdb_httplib_openssl::Response &response) { + auto http_response = TransformResponse(response); + return info.response_handler(*http_response); + }, + [&](const char *data, size_t data_length) { + if (state) { + state->total_bytes_received += data_length; + } + return info.content_handler(const_data_ptr_cast(data), data_length); + })); + } + } + unique_ptr Put(PutRequestInfo &info) override { + if (state) { + state->put_count++; + state->total_bytes_sent += info.buffer_in_len; + } + auto headers = TransformHeaders(info.headers, info.params); + return TransformResult(client->Put(info.path, headers, const_char_ptr_cast(info.buffer_in), info.buffer_in_len, + info.content_type)); + } + + unique_ptr Head(HeadRequestInfo &info) override { + if (state) { + state->head_count++; + } + auto headers = TransformHeaders(info.headers, info.params); + return TransformResult(client->Head(info.path, headers)); + } + + unique_ptr Delete(DeleteRequestInfo &info) override { + if (state) { + state->delete_count++; + } + auto headers = TransformHeaders(info.headers, info.params); + return TransformResult(client->Delete(info.path, headers)); + } + + unique_ptr Post(PostRequestInfo &info) override { + if (state) { + state->post_count++; + state->total_bytes_sent += info.buffer_in_len; + } + // We use a custom Request method here, because there is no Post call with a contentreceiver in httplib + duckdb_httplib_openssl::Request req; + req.method = "POST"; + req.path = info.path; + req.headers = TransformHeaders(info.headers, info.params); + req.headers.emplace("Content-Type", "application/octet-stream"); + req.content_receiver = [&](const char *data, size_t data_length, uint64_t /*offset*/, + uint64_t /*total_length*/) { + if (state) { + state->total_bytes_received += data_length; + } + info.buffer_out += string(data, data_length); + return true; + }; + req.body.assign(const_char_ptr_cast(info.buffer_in), info.buffer_in_len); + return TransformResult(client->send(req)); + } + +private: + duckdb_httplib_openssl::Headers TransformHeaders(const HTTPHeaders &header_map, const HTTPParams ¶ms) { + duckdb_httplib_openssl::Headers headers; + for (auto &entry : header_map) { + headers.insert(entry); + } + for (auto &entry : params.extra_headers) { + headers.insert(entry); + } + return headers; + } + + unique_ptr TransformResponse(const duckdb_httplib_openssl::Response &response) { + auto status_code = HTTPUtil::ToStatusCode(response.status); + auto result = make_uniq(status_code); + result->body = response.body; + result->reason = response.reason; + for (auto &entry : response.headers) { + result->headers.Insert(entry.first, entry.second); + } + return result; + } + + unique_ptr TransformResult(duckdb_httplib_openssl::Result &&res) { + if (res.error() == duckdb_httplib_openssl::Error::Success) { + auto &response = res.value(); + return TransformResponse(response); + } else { + auto result = make_uniq(HTTPStatusCode::INVALID); + result->request_error = to_string(res.error()); + return result; + } + } + +private: + unique_ptr client; + optional_ptr state; +}; + +unique_ptr HTTPFSUtil::InitializeClient(HTTPParams &http_params, const string &proto_host_port) { + auto client = make_uniq(http_params.Cast(), proto_host_port); + return std::move(client); +} + +unordered_map HTTPFSUtil::ParseGetParameters(const string &text) { + duckdb_httplib_openssl::Params query_params; + duckdb_httplib_openssl::detail::parse_query_text(text, query_params); + + unordered_map result; + for (auto &entry : query_params) { + result.emplace(std::move(entry.first), std::move(entry.second)); + } + return result; +} + +string HTTPFSUtil::GetName() const { + return "HTTPFS"; +} + +} // namespace duckdb diff --git a/extension/httpfs/include/httpfs_client.hpp b/extension/httpfs/include/httpfs_client.hpp index 11a48a4..d540ce8 100644 --- a/extension/httpfs/include/httpfs_client.hpp +++ b/extension/httpfs/include/httpfs_client.hpp @@ -37,6 +37,15 @@ class HTTPFSUtil : public HTTPUtil { string GetName() const override; }; +class HTTPFSCurlUtil : public HTTPFSUtil { +public: + unique_ptr InitializeClient(HTTPParams &http_params, const string &proto_host_port) override; + + static unordered_map ParseGetParameters(const string &text); + + string GetName() const override; +}; + class CURLHandle { public: CURLHandle(const string &token, const string &cert_path); From 401ff6be061b855af1bc707cd3269c0134610cc5 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Thu, 3 Jul 2025 12:21:34 +0200 Subject: [PATCH 44/44] Adapt test --- test/sql/test_headers_parsed.test | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/sql/test_headers_parsed.test b/test/sql/test_headers_parsed.test index 44a5121..317ec82 100644 --- a/test/sql/test_headers_parsed.test +++ b/test/sql/test_headers_parsed.test @@ -6,6 +6,9 @@ require httpfs require parquet +statement ok +SET httpfs_client_implementation='curl'; + statement ok pragma enable_logging('HTTP'); @@ -31,7 +34,6 @@ select response.status from duckdb_logs_parsed('HTTP') order by all; OK_200 PartialContent_206 - # response status is either # HTTP/2 200 # HTTP/2 206 @@ -43,4 +45,4 @@ query I select response.headers['__RESPONSE_STATUS__'] LIKE 'HTTP%20%' from duckdb_logs_parsed('HTTP') order by all; ---- true -true \ No newline at end of file +true