diff --git a/.github/workflows/MainDistributionPipeline.yml b/.github/workflows/MainDistributionPipeline.yml index ec31856..d44e529 100644 --- a/.github/workflows/MainDistributionPipeline.yml +++ b/.github/workflows/MainDistributionPipeline.yml @@ -17,7 +17,7 @@ jobs: uses: duckdb/extension-ci-tools/.github/workflows/_extension_distribution.yml@main with: extension_name: httpfs - duckdb_version: v1.3.0 + duckdb_version: v1.3.1 ci_tools_version: main duckdb-stable-deploy: @@ -27,6 +27,6 @@ jobs: secrets: inherit with: extension_name: httpfs - duckdb_version: v1.3.0 + duckdb_version: v1.3.1 ci_tools_version: main deploy_latest: ${{ startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main' }} diff --git a/.github/workflows/MinioTests.yml b/.github/workflows/MinioTests.yml index dca4c52..467b5b3 100644 --- a/.github/workflows/MinioTests.yml +++ b/.github/workflows/MinioTests.yml @@ -10,7 +10,7 @@ defaults: jobs: minio-tests: name: Minio Tests - runs-on: ubuntu-24.04 + runs-on: ubuntu-latest env: S3_TEST_SERVER_AVAILABLE: 1 AWS_DEFAULT_REGION: eu-west-1 @@ -18,7 +18,9 @@ jobs: AWS_SECRET_ACCESS_KEY: minio_duckdb_user_password DUCKDB_S3_ENDPOINT: duckdb-minio.com:9000 DUCKDB_S3_USE_SSL: false + CORE_EXTENSIONS: 'parquet;json' GEN: ninja + VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake VCPKG_TARGET_TRIPLET: x64-linux steps: diff --git a/CMakeLists.txt b/CMakeLists.txt index 92d4547..865b901 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,7 +38,9 @@ if(MINGW) endif() find_package(OpenSSL REQUIRED) +find_package(CURL REQUIRED) include_directories(${OPENSSL_INCLUDE_DIR}) +include_directories(${CURL_INCLUDE_DIRS}) if(EMSCRIPTEN) else() @@ -46,6 +48,11 @@ else() ${OPENSSL_LIBRARIES}) target_link_libraries(httpfs_extension duckdb_mbedtls ${OPENSSL_LIBRARIES}) + # Link dependencies into extension + target_link_libraries(httpfs_loadable_extension ${CURL_LIBRARIES}) + target_link_libraries(httpfs_extension ${CURL_LIBRARIES}) + + if(MINGW) find_package(ZLIB) target_link_libraries(httpfs_loadable_extension ZLIB::ZLIB -lcrypt32) @@ -53,6 +60,7 @@ else() endif() endif() + install( TARGETS httpfs_extension EXPORT "${DUCKDB_EXPORT_SET}" diff --git a/extension/httpfs/crypto.cpp b/extension/httpfs/crypto.cpp index 04bd795..72b3835 100644 --- a/extension/httpfs/crypto.cpp +++ b/extension/httpfs/crypto.cpp @@ -5,7 +5,16 @@ #include #define CPPHTTPLIB_OPENSSL_SUPPORT -#include "httplib.hpp" + +#include +#include +#include +#include +#include + +#if defined(_WIN32) && defined(OPENSSL_USE_APPLINK) +#include +#endif namespace duckdb { diff --git a/extension/httpfs/httpfs_client.cpp b/extension/httpfs/httpfs_client.cpp index 84eb457..9931b87 100644 --- a/extension/httpfs/httpfs_client.cpp +++ b/extension/httpfs/httpfs_client.cpp @@ -2,84 +2,323 @@ #include "http_state.hpp" #define CPPHTTPLIB_OPENSSL_SUPPORT -#include "httplib.hpp" + +#include +#include +#include "duckdb/common/exception/http_exception.hpp" namespace duckdb { +// we statically compile in libcurl, which means the cert file location of the build machine is the +// place curl will look. But not every distro has this file in the same location, so we search a +// number of common locations and use the first one we find. +static std::string certFileLocations[] = { + // Arch, Debian-based, Gentoo + "/etc/ssl/certs/ca-certificates.crt", + // RedHat 7 based + "/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem", + // Redhat 6 based + "/etc/pki/tls/certs/ca-bundle.crt", + // OpenSUSE + "/etc/ssl/ca-bundle.pem", + // Alpine + "/etc/ssl/cert.pem"}; + + +//! Grab the first path that exists, from a list of well-known locations +static std::string SelectCURLCertPath() { + for (std::string &caFile : certFileLocations) { + struct stat buf; + if (stat(caFile.c_str(), &buf) == 0) { + return caFile; + } + } + return std::string(); +} + +static std::string cert_path = SelectCURLCertPath(); + +static size_t RequestWriteCallback(void *contents, size_t size, size_t nmemb, void *userp) { + size_t totalSize = size * nmemb; + std::string* str = static_cast(userp); + str->append(static_cast(contents), totalSize); + return totalSize; +} + +static size_t RequestHeaderCallback(void *contents, size_t size, size_t nmemb, void *userp) { + size_t totalSize = size * nmemb; + std::string header(static_cast(contents), totalSize); + HeaderCollector* header_collection = static_cast(userp); + + // Trim trailing \r\n + if (!header.empty() && header.back() == '\n') { + header.pop_back(); + if (!header.empty() && header.back() == '\r') { + header.pop_back(); + } + } + + // If header starts with HTTP/... curl has followed a redirect and we have a new Header, + // so we push back a new header_collection and store headers from the redirect there. + if (header.rfind("HTTP/", 0) == 0) { + header_collection->header_collection.push_back(HTTPHeaders()); + header_collection->header_collection.back().Insert("__RESPONSE_STATUS__", header); + } + + size_t colonPos = header.find(':'); + + if (colonPos != std::string::npos) { + // Split the string into two parts + std::string part1 = header.substr(0, colonPos); + std::string part2 = header.substr(colonPos + 1); + if (part2.at(0) == ' ') { + part2.erase(0, 1); + } + + header_collection->header_collection.back().Insert(part1, part2); + } + // TODO: some headers may not follow standard response header formats. + // what to do in this case? Invalid does not mean we should abort. + + return totalSize; +} + + CURLHandle::CURLHandle(const string &token, const string &cert_path) { + curl = curl_easy_init(); + if (!curl) { + throw InternalException("Failed to initialize curl"); + } + if (!token.empty()) { + curl_easy_setopt(curl, CURLOPT_XOAUTH2_BEARER, token.c_str()); + curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BEARER); + } + if (!cert_path.empty()) { + curl_easy_setopt(curl, CURLOPT_CAINFO, cert_path.c_str()); + } +} + +CURLHandle::~CURLHandle() { + curl_easy_cleanup(curl); +} + + +struct RequestInfo { + string url = ""; + string body = ""; + uint16_t response_code = 0; + std::vector header_collection; +}; + + +static idx_t httpfs_client_count = 0; + class HTTPFSClient : public HTTPClient { public: HTTPFSClient(HTTPFSParams &http_params, const string &proto_host_port) { - client = make_uniq(proto_host_port); - client->set_follow_location(http_params.follow_location); - client->set_keep_alive(http_params.keep_alive); - if (!http_params.ca_cert_file.empty()) { - client->set_ca_cert_path(http_params.ca_cert_file.c_str()); - } - client->enable_server_certificate_verification(http_params.enable_server_cert_verification); - client->set_write_timeout(http_params.timeout, http_params.timeout_usec); - client->set_read_timeout(http_params.timeout, http_params.timeout_usec); - client->set_connection_timeout(http_params.timeout, http_params.timeout_usec); - client->set_decompress(false); + auto bearer_token = ""; if (!http_params.bearer_token.empty()) { - client->set_bearer_token_auth(http_params.bearer_token.c_str()); + bearer_token = http_params.bearer_token.c_str(); + } + state = http_params.state; + + // call curl_global_init if not already done by another HTTPFS Client + InitCurlGlobal(); + + curl = make_uniq(bearer_token, SelectCURLCertPath()); + request_info = make_uniq(); + + // set curl options + // follow redirects + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + + // Curl re-uses connections by default + if (!http_params.keep_alive) { + curl_easy_setopt(*curl, CURLOPT_FORBID_REUSE, 1L); + } + + // client->enable_server_certificate_verification(http_params.enable_server_cert_verification); + if (http_params.enable_server_cert_verification) { + curl_easy_setopt(*curl, CURLOPT_SSL_VERIFYPEER, 1L); // Verify the cert + curl_easy_setopt(*curl, CURLOPT_SSL_VERIFYHOST, 2L); // Verify that the cert matches the hostname } + // TODO: no global write timeout option, but you could put customize a timeout in the write functions + // or handle use CURLOPT_XFERINFOFUNCTION (progress callback) with CURLOPT_TIMEOUT_MS + // we could also set CURLOPT_LOW_SPEED_LIMIT and timeout if the speed is too low for + // too long. + + // set read timeout + curl_easy_setopt(*curl, CURLOPT_TIMEOUT, http_params.timeout); + // set connection timeout + curl_easy_setopt(*curl, CURLOPT_CONNECTTIMEOUT, http_params.timeout); + // accept content as-is (i.e no decompressing) + curl_easy_setopt(*curl, CURLOPT_ACCEPT_ENCODING, "identity"); + // follow redirects + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + + // define the header callback + curl_easy_setopt(*curl, CURLOPT_HEADERFUNCTION, RequestHeaderCallback); + curl_easy_setopt(*curl, CURLOPT_HEADERDATA, &request_info->header_collection); + // define the write data callback (for get requests) + curl_easy_setopt(*curl, CURLOPT_WRITEFUNCTION, RequestWriteCallback); + curl_easy_setopt(*curl, CURLOPT_WRITEDATA, &request_info->body); + if (!http_params.http_proxy.empty()) { - client->set_proxy(http_params.http_proxy, http_params.http_proxy_port); + curl_easy_setopt(*curl, CURLOPT_PROXY, StringUtil::Format("%s:%s", http_params.http_proxy, http_params.http_proxy_port).c_str()); if (!http_params.http_proxy_username.empty()) { - client->set_proxy_basic_auth(http_params.http_proxy_username, http_params.http_proxy_password); + curl_easy_setopt(*curl, CURLOPT_PROXYUSERNAME, http_params.http_proxy_username.c_str()); + curl_easy_setopt(*curl, CURLOPT_PROXYPASSWORD, http_params.http_proxy_password.c_str()); } } - state = http_params.state; + } + + ~HTTPFSClient() { + DestroyCurlGlobal(); } unique_ptr Get(GetRequestInfo &info) override { if (state) { state->get_count++; } - auto headers = TransformHeaders(info.headers, info.params); - if (!info.response_handler && !info.content_handler) { - return TransformResult(client->Get(info.path, headers)); + + auto curl_headers = TransformHeadersCurl(info.headers); + request_info->url = info.url; + if (!info.params.extra_headers.empty()) { + auto curl_params = TransformParamsCurl(info.params); + request_info->url += "?" + curl_params; + } + + CURLcode res; + { + // If the same handle served a HEAD request, we must set NOBODY back to 0L to request content again + curl_easy_setopt(*curl, CURLOPT_NOBODY, 0L); + curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); + res = curl->Execute(); + } + + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); + + idx_t bytes_received = 0; + if (!request_info->header_collection.empty() && request_info->header_collection.back().HasHeader("content-length")) { + bytes_received = std::stoi(request_info->header_collection.back().GetHeaderValue("content-length")); + D_ASSERT(bytes_received == request_info->body.size()); } else { - return TransformResult(client->Get( - info.path.c_str(), headers, - [&](const duckdb_httplib_openssl::Response &response) { - auto http_response = TransformResponse(response); - return info.response_handler(*http_response); - }, - [&](const char *data, size_t data_length) { - if (state) { - state->total_bytes_received += data_length; - } - return info.content_handler(const_data_ptr_cast(data), data_length); - })); + bytes_received = request_info->body.size(); + } + if (state) { + state->total_bytes_received += bytes_received; } + + const char* data = request_info->body.c_str(); + info.content_handler(const_data_ptr_cast(data), bytes_received); + return TransformResponseCurl(res); } + unique_ptr Put(PutRequestInfo &info) override { if (state) { state->put_count++; state->total_bytes_sent += info.buffer_in_len; } - auto headers = TransformHeaders(info.headers, info.params); - return TransformResult(client->Put(info.path, headers, const_char_ptr_cast(info.buffer_in), info.buffer_in_len, - info.content_type)); + + auto curl_headers = TransformHeadersCurl(info.headers); + // Add content type header from info + curl_headers.Add("Content-Type: " + info.content_type); + // transform parameters + request_info->url = info.url; + if (!info.params.extra_headers.empty()) { + auto curl_params = TransformParamsCurl(info.params); + request_info->url += "?" + curl_params; + } + + CURLcode res; + { + curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); + // Perform PUT + curl_easy_setopt(*curl, CURLOPT_CUSTOMREQUEST, "PUT"); + // Include PUT body + curl_easy_setopt(*curl, CURLOPT_POSTFIELDS, const_char_ptr_cast(info.buffer_in)); + curl_easy_setopt(*curl, CURLOPT_POSTFIELDSIZE, info.buffer_in_len); + + // Apply headers + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); + + res = curl->Execute(); + } + + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); + + return TransformResponseCurl(res); } unique_ptr Head(HeadRequestInfo &info) override { if (state) { state->head_count++; } - auto headers = TransformHeaders(info.headers, info.params); - return TransformResult(client->Head(info.path, headers)); + + auto curl_headers = TransformHeadersCurl(info.headers); + request_info->url = info.url; + // transform parameters + if (!info.params.extra_headers.empty()) { + auto curl_params = TransformParamsCurl(info.params); + request_info->url += "?" + curl_params; + } + + CURLcode res; + { + // Set URL + curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); + + // Perform HEAD request instead of GET + curl_easy_setopt(*curl, CURLOPT_NOBODY, 1L); + curl_easy_setopt(*curl, CURLOPT_HTTPGET, 0L); + + // Add headers if any + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); + + // Execute HEAD request + res = curl->Execute(); + } + + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); + return TransformResponseCurl(res); } unique_ptr Delete(DeleteRequestInfo &info) override { if (state) { state->delete_count++; } - auto headers = TransformHeaders(info.headers, info.params); - return TransformResult(client->Delete(info.path, headers)); + + auto curl_headers = TransformHeadersCurl(info.headers); + // transform parameters + request_info->url = info.url; + if (!info.params.extra_headers.empty()) { + auto curl_params = TransformParamsCurl(info.params); + request_info->url += "?" + curl_params; + } + + CURLcode res; + { + // Set URL + curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); + + // Set DELETE request method + curl_easy_setopt(*curl, CURLOPT_CUSTOMREQUEST, "DELETE"); + + // Follow redirects + curl_easy_setopt(*curl, CURLOPT_FOLLOWLOCATION, 1L); + + // Add headers if any + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); + + // Execute DELETE request + res = curl->Execute(); + } + + // Get HTTP response status code + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); + return TransformResponseCurl( res); } unique_ptr Post(PostRequestInfo &info) override { @@ -87,61 +326,131 @@ class HTTPFSClient : public HTTPClient { state->post_count++; state->total_bytes_sent += info.buffer_in_len; } - // We use a custom Request method here, because there is no Post call with a contentreceiver in httplib - duckdb_httplib_openssl::Request req; - req.method = "POST"; - req.path = info.path; - req.headers = TransformHeaders(info.headers, info.params); - req.headers.emplace("Content-Type", "application/octet-stream"); - req.content_receiver = [&](const char *data, size_t data_length, uint64_t /*offset*/, - uint64_t /*total_length*/) { - if (state) { - state->total_bytes_received += data_length; - } - info.buffer_out += string(data, data_length); - return true; - }; - req.body.assign(const_char_ptr_cast(info.buffer_in), info.buffer_in_len); - return TransformResult(client->send(req)); + + auto curl_headers = TransformHeadersCurl(info.headers); + const string content_type = "Content-Type: application/octet-stream"; + curl_headers.Add(content_type.c_str()); + // transform parameters + request_info->url = info.url; + if (!info.params.extra_headers.empty()) { + auto curl_params = TransformParamsCurl(info.params); + request_info->url += "?" + curl_params; + } + + CURLcode res; + { + curl_easy_setopt(*curl, CURLOPT_URL, request_info->url.c_str()); + curl_easy_setopt(*curl, CURLOPT_POST, 1L); + + // Set POST body + curl_easy_setopt(*curl, CURLOPT_POSTFIELDS, const_char_ptr_cast(info.buffer_in)); + curl_easy_setopt(*curl, CURLOPT_POSTFIELDSIZE, info.buffer_in_len); + + // Add headers if any + curl_easy_setopt(*curl, CURLOPT_HTTPHEADER, curl_headers ? curl_headers.headers : nullptr); + + // Execute POST request + res = curl->Execute(); + } + + curl_easy_getinfo(*curl, CURLINFO_RESPONSE_CODE, &request_info->response_code); + info.buffer_out = request_info->body; + // Construct HTTPResponse + return TransformResponseCurl( res); } private: - duckdb_httplib_openssl::Headers TransformHeaders(const HTTPHeaders &header_map, const HTTPParams ¶ms) { - duckdb_httplib_openssl::Headers headers; + CURLRequestHeaders TransformHeadersCurl(const HTTPHeaders &header_map) { + std::vector headers; for (auto &entry : header_map) { - headers.insert(entry); + const std::string new_header = entry.first + ": " + entry.second; + headers.push_back(new_header); } - for (auto &entry : params.extra_headers) { - headers.insert(entry); + CURLRequestHeaders curl_headers; + for (auto &header : headers) { + curl_headers.Add(header); } - return headers; + return curl_headers; } - unique_ptr TransformResponse(const duckdb_httplib_openssl::Response &response) { - auto status_code = HTTPUtil::ToStatusCode(response.status); - auto result = make_uniq(status_code); - result->body = response.body; - result->reason = response.reason; - for (auto &entry : response.headers) { - result->headers.Insert(entry.first, entry.second); + string TransformParamsCurl(const HTTPParams ¶ms) { + string result = ""; + unordered_map escaped_params; + bool first_param = true; + for (auto &entry : params.extra_headers) { + const string key = entry.first; + const string value = curl_easy_escape(*curl, entry.second.c_str(), 0); + if (!first_param) { + result += "&"; + } + result += key + "=" + value; + first_param = false; } return result; } - unique_ptr TransformResult(duckdb_httplib_openssl::Result &&res) { - if (res.error() == duckdb_httplib_openssl::Error::Success) { - auto &response = res.value(); - return TransformResponse(response); - } else { - auto result = make_uniq(HTTPStatusCode::INVALID); - result->request_error = to_string(res.error()); - return result; + void ResetRequestInfo() { + // clear headers after transform + request_info->header_collection.clear(); + // reset request info. + request_info->body = ""; + request_info->url = ""; + request_info->response_code = 0; + } + + unique_ptr TransformResponseCurl(CURLcode res) { + auto status_code = HTTPStatusCode(request_info->response_code); + auto response = make_uniq(status_code); + if (res != CURLcode::CURLE_OK) { + // TODO: request error can come from HTTPS Status code toString() value. + if (!request_info->header_collection.empty() && request_info->header_collection.back().HasHeader("__RESPONSE_STATUS__")) { + response->request_error = request_info->header_collection.back().GetHeaderValue("__RESPONSE_STATUS__"); + } else { + response->request_error = curl_easy_strerror(res); + } + return response; } + response->body = request_info->body; + response->url= request_info->url; + if (!request_info->header_collection.empty()) { + for (auto &header : request_info->header_collection.back()) { + response->headers.Insert(header.first, header.second); + } + } + ResetRequestInfo(); + return response; } private: - unique_ptr client; + unique_ptr curl; optional_ptr state; + unique_ptr request_info; + + static std::mutex &GetRefLock() { + static std::mutex mtx; + return mtx; + } + + static void InitCurlGlobal() { + GetRefLock(); + if (httpfs_client_count == 0) { + curl_global_init(CURL_GLOBAL_DEFAULT); + } + ++httpfs_client_count; + } + + static void DestroyCurlGlobal() { + // TODO: when to call curl_global_cleanup() + // calling it on client destruction causes SSL errors when verification is on (due to many requests). + // GetRefLock(); + // if (httpfs_client_count == 0) { + // throw InternalException("Destroying Httpfs client that did not initialize CURL"); + // } + // --httpfs_client_count; + // if (httpfs_client_count == 0) { + // curl_global_cleanup(); + // } + } }; unique_ptr HTTPFSUtil::InitializeClient(HTTPParams &http_params, const string &proto_host_port) { @@ -150,14 +459,27 @@ unique_ptr HTTPFSUtil::InitializeClient(HTTPParams &http_params, con } unordered_map HTTPFSUtil::ParseGetParameters(const string &text) { - duckdb_httplib_openssl::Params query_params; - duckdb_httplib_openssl::detail::parse_query_text(text, query_params); + unordered_map params; + + auto pos = text.find('?'); + if (pos == std::string::npos) return params; + + std::string query = text.substr(pos + 1); + std::stringstream ss(query); + std::string item; - unordered_map result; - for (auto &entry : query_params) { - result.emplace(std::move(entry.first), std::move(entry.second)); + while (std::getline(ss, item, '&')) { + auto eq_pos = item.find('='); + if (eq_pos != std::string::npos) { + std::string key = item.substr(0, eq_pos); + std::string value = StringUtil::URLDecode(item.substr(eq_pos + 1)); + params[key] = value; + } else { + params[item] = ""; // key with no value + } } - return result; + + return params; } string HTTPFSUtil::GetName() const { diff --git a/extension/httpfs/include/httpfs_client.hpp b/extension/httpfs/include/httpfs_client.hpp index 1d7620c..11a48a4 100644 --- a/extension/httpfs/include/httpfs_client.hpp +++ b/extension/httpfs/include/httpfs_client.hpp @@ -1,6 +1,7 @@ #pragma once #include "duckdb/common/http_util.hpp" +#include namespace duckdb { class HTTPLogger; @@ -36,4 +37,54 @@ class HTTPFSUtil : public HTTPUtil { string GetName() const override; }; +class CURLHandle { +public: + CURLHandle(const string &token, const string &cert_path); + ~CURLHandle(); + +public: + operator CURL *() { + return curl; + } + CURLcode Execute() { + return curl_easy_perform(curl); + } + +private: + CURL *curl = NULL; +}; + +class CURLRequestHeaders { +public: + CURLRequestHeaders(vector &input) { + for (auto &header : input) { + Add(header); + } + } + CURLRequestHeaders() {} + + ~CURLRequestHeaders() { + if (headers) { + curl_slist_free_all(headers); + } + headers = NULL; + } + operator bool() const { + return headers != NULL; + } + +public: + void Add(const string &header) { + headers = curl_slist_append(headers, header.c_str()); + } + +public: + curl_slist *headers = NULL; +}; + +struct HeaderCollector { + std::vector header_collection; +}; + + } // namespace duckdb diff --git a/test/sql/copy/csv/test_csv_httpfs.test b/test/sql/copy/csv/test_csv_httpfs.test index 869a0b6..d62683e 100644 --- a/test/sql/copy/csv/test_csv_httpfs.test +++ b/test/sql/copy/csv/test_csv_httpfs.test @@ -4,99 +4,43 @@ require httpfs +require parquet + statement ok PRAGMA enable_verification +statement ok +pragma enable_logging('HTTP'); -#FIXME this test fails: file is nonexistent -mode skip - -query IIIIII rowsort -SELECT * from read_csv_auto('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c'); ----- -2020 Allemagne Germany 26.1 53196.069 200601.2 -2020 Autriche Austria 18.0 4723.5 26215.8 -2020 Belgique Belgium 28.999999999999996 9436.1 32553.0 -2020 Bulgarie Bulgaria 11.600000000000001 1124.1 9698.7 -2020 Chypre Cyprus 0.0 0.0 1627.6 -2020 Croatie Croatia 16.3 1094.8 6726.3 -2020 Danemark Denmark 11.600000000000001 1579.0 13601.4 -2020 Espagne Spain 17.4 14211.7 81512.9 -2020 Estonie Estonia 8.5 241.1 2827.3 -2020 Finlande Finland 2.8000000000000003 692.3 24674.4 -2020 France France 20.3 28278.9 139375.8 -2020 Grèce Greece 5.800000000000001 896.5 15401.9 -2020 Hongrie Hungary 30.5 5486.7 17872.4 -2020 Irlande Ireland 17.4 1968.477 11296.601 -2020 Italie Italy 29.2 33042.585 113119.475 -2020 Lettonie Latvia 8.200000000000001 323.605 3926.131 -2020 Lituanie Lithuania 10.7 584.104 5457.728 -2020 Luxembourg Luxembourg 16.5 623.165 3786.785 -2020 Malte Malta 0.0 0.0 547.5 -2020 Pays-Bas Netherlands 37.1 16588.314 44682.656 -2020 Pologne Poland 13.5 9323.205 69135.018 -2020 Portugal Portugal 11.1 1814.878 16354.725 -2020 Roumanie Romania 23.7 5626.161 23712.653 -2020 Royaume-Uni United Kingdom 32.4 39311.416 121414.483 -2020 République tchèque Czech Republic 21.4 5187.282 24263.896 -2020 Slovaquie Slovakia 25.0 2564.876 10248.401 -2020 Slovénie Slovenia 12.1 590.243 4861.315 -2020 Suède Sweden 1.5 475.195 31311.413 -2020 UE 28 Europe 28 22.5 238152.4 1056907.5 -2021 Allemagne Germany 26.760345686044435 51812.567 193616.957 -2021 Autriche Austria 18.720006775926056 4645.795 24817.272 -2021 Belgique Belgium 29.279402721103864 9088.083 31039.168 -2021 Bulgarie Bulgaria 12.368015142641884 1176.537 9512.739 -2021 Chypre Cyprus 0.0 0.0 1528.558 -2021 Croatie Croatia 17.10389029082304 1100.12 6431.987 -2021 Danemark Denmark 11.485631727184947 1508.152 13130.771 -2021 Espagne Spain 19.10173955663722 13815.0 72323.256 -2021 Estonie Estonia 8.988278645659518 245.094 2726.818 -2021 Finlande Finland 2.9937725178230212 694.288 23191.074 -2021 France France 20.649030024470434 26465.646 128168.955 -2021 Grèce Greece 7.580480506088059 1097.87 14482.855 -2021 Hongrie Hungary 32.344729318831554 5693.164 17601.52 -2021 Irlande Ireland 18.020604987495144 1953.468 10840.191 -2021 Italie Italy 30.86368769746751 31807.236 103057.147 -2021 Lettonie Latvia 8.502139837843602 322.927 3798.185 -2021 Lituanie Lithuania 11.029023816606903 582.797 5284.212 -2021 Luxembourg Luxembourg 17.282784281000467 564.365 3265.475 -2021 Malte Malta 0.0 0.0 499.875 -2021 Pays-Bas Netherlands 37.61392206122467 15896.316 42261.788 -2021 Pologne Poland 13.146720200313602 9235.656 70250.647 -2021 Portugal Portugal 11.437926753365227 1740.3 15215.17 -2021 Roumanie Romania 24.909638477223016 5846.885 23472.38 -2021 République tchèque Czech Republic 21.716683280446812 5158.445 23753.374 -2021 Slovaquie Slovakia 25.253930010417324 2427.134 9610.916 -2021 Slovénie Slovenia 13.141683407321874 582.024 4428.839 -2021 Suède Sweden 1.497679952802663 471.085 31454.317 -2021 UE 27 UE 27 21.894190365821018 193930.95399999994 885764.4460000001 - -query IIIIII rowsort res -SELECT * from read_csv('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c',DELIM=';',Columns={'annee_de_reference':'VARCHAR','pays':'VARCHAR','label_en':'VARCHAR','part_du_gaz_naturel_dans_la_consommation_finale_d_energie0':'VARCHAR','consommation_finale_de_gaz_naturel_mtep':'VARCHAR','consommation_finale_d_energie_totale_mtep':'VARCHAR'}); - - -query IIIIII rowsort res -SELECT * from read_csv('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c',DELIM=';',Columns={'annee_de_reference':'VARCHAR','pays':'VARCHAR','label_en':'VARCHAR','part_du_gaz_naturel_dans_la_consommation_finale_d_energie0':'VARCHAR','consommation_finale_de_gaz_naturel_mtep':'VARCHAR','consommation_finale_d_energie_totale_mtep':'VARCHAR'}); - - -# Give it a try to a request that returns length 0 -query I -SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') ----- -1265 - -# Give it a try to a request that returns length 0 -query I -SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') +query II +select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_role_type.parquet' order by all; ---- -1265 +1 actor +2 actress +3 producer +4 writer +5 cinematographer +6 composer +7 costume designer +8 director +9 editor +10 miscellaneous crew +11 production designer +12 guest -# Give it a try to a request that returns length 0 -query I -SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') +query IIIIIIIIIIIIIIIIII +select * from 'https://github.com/duckdb/duckdb/raw/9cf66f950dde0173e1a863a7659b3ecf11bf3978/data/csv/customer.csv'; ---- -1265 +1 AAAAAAAABAAAAAAA 980124 7135 32946 2452238 2452208 Mr. Javier Lewis Y 9 12 1936 CHILE NULL Javier.Lewis@VFAxlnZEvOx.org 2452508 +2 AAAAAAAACAAAAAAA 819667 1461 31655 2452318 2452288 Dr. Amy Moses Y 9 4 1966 TOGO NULL Amy.Moses@Ovk9KjHH.com 2452318 +3 AAAAAAAADAAAAAAA 1473522 6247 48572 2449130 2449100 Miss Latisha Hamilton Y 18 9 1979 NIUE NULL Latisha.Hamilton@V.com 2452313 +4 AAAAAAAAEAAAAAAA 1703214 3986 39558 2450030 2450000 Dr. Michael White Y 7 6 1983 MEXICO NULL Michael.White@i.org 2452361 +5 AAAAAAAAFAAAAAAA 953372 4470 36368 2449438 2449408 Sir Robert Moran N 8 5 1956 FIJI NULL Robert.Moran@Hh.edu 2452469 +6 AAAAAAAAGAAAAAAA 213219 6374 27082 2451883 2451853 Ms. Brunilda Sharp Y 4 12 1925 SURINAME NULL Brunilda.Sharp@T3pylZEUQjm.org 2452430 +7 AAAAAAAAHAAAAAAA 68377 3219 44814 2451438 2451408 Ms. Fonda Wiles N 24 4 1985 GAMBIA NULL Fonda.Wiles@S9KnyEtz9hv.org 2452360 +8 AAAAAAAAIAAAAAAA 1215897 2471 16598 2449406 2449376 Sir Ollie Shipman N 26 12 1938 KOREA, REPUBLIC OF NULL Ollie.Shipman@be.org 2452334 +9 AAAAAAAAJAAAAAAA 1168667 1404 49388 2452275 2452245 Sir Karl Gilbert N 26 10 1966 MONTSERRAT NULL Karl.Gilbert@Crg5KyP2IxX9C4d6.edu 2452454 +10 AAAAAAAAKAAAAAAA 1207553 5143 19580 2451353 2451323 Ms. Albert Brunson N 15 10 1973 JORDAN NULL Albert.Brunson@62.com 2452641 #Add test for 5924 query IIIIII @@ -353,3 +297,94 @@ select * from read_csv_auto('https://csvbase.com/meripaterson/stock-exchanges'); 249 North America United States of America Members' Exchange NULL 2020-09-24 250 Africa Zimbabwe Victoria Falls Stock Exchange NULL 2020-11-01 251 Asia China Beijing Stock Exchange NULL 2021-12-27 + + +#FIXME this test fails: file is nonexistent +mode skip + +query IIIIII rowsort +SELECT * from read_csv_auto('https://github.com/duckdb/duckdb/raw/9cf66f950dde0173e1a863a7659b3ecf11bf3978/data/csv/customer.csv'); +---- +2020 Allemagne Germany 26.1 53196.069 200601.2 +2020 Autriche Austria 18.0 4723.5 26215.8 +2020 Belgique Belgium 28.999999999999996 9436.1 32553.0 +2020 Bulgarie Bulgaria 11.600000000000001 1124.1 9698.7 +2020 Chypre Cyprus 0.0 0.0 1627.6 +2020 Croatie Croatia 16.3 1094.8 6726.3 +2020 Danemark Denmark 11.600000000000001 1579.0 13601.4 +2020 Espagne Spain 17.4 14211.7 81512.9 +2020 Estonie Estonia 8.5 241.1 2827.3 +2020 Finlande Finland 2.8000000000000003 692.3 24674.4 +2020 France France 20.3 28278.9 139375.8 +2020 Grèce Greece 5.800000000000001 896.5 15401.9 +2020 Hongrie Hungary 30.5 5486.7 17872.4 +2020 Irlande Ireland 17.4 1968.477 11296.601 +2020 Italie Italy 29.2 33042.585 113119.475 +2020 Lettonie Latvia 8.200000000000001 323.605 3926.131 +2020 Lituanie Lithuania 10.7 584.104 5457.728 +2020 Luxembourg Luxembourg 16.5 623.165 3786.785 +2020 Malte Malta 0.0 0.0 547.5 +2020 Pays-Bas Netherlands 37.1 16588.314 44682.656 +2020 Pologne Poland 13.5 9323.205 69135.018 +2020 Portugal Portugal 11.1 1814.878 16354.725 +2020 Roumanie Romania 23.7 5626.161 23712.653 +2020 Royaume-Uni United Kingdom 32.4 39311.416 121414.483 +2020 République tchèque Czech Republic 21.4 5187.282 24263.896 +2020 Slovaquie Slovakia 25.0 2564.876 10248.401 +2020 Slovénie Slovenia 12.1 590.243 4861.315 +2020 Suède Sweden 1.5 475.195 31311.413 +2020 UE 28 Europe 28 22.5 238152.4 1056907.5 +2021 Allemagne Germany 26.760345686044435 51812.567 193616.957 +2021 Autriche Austria 18.720006775926056 4645.795 24817.272 +2021 Belgique Belgium 29.279402721103864 9088.083 31039.168 +2021 Bulgarie Bulgaria 12.368015142641884 1176.537 9512.739 +2021 Chypre Cyprus 0.0 0.0 1528.558 +2021 Croatie Croatia 17.10389029082304 1100.12 6431.987 +2021 Danemark Denmark 11.485631727184947 1508.152 13130.771 +2021 Espagne Spain 19.10173955663722 13815.0 72323.256 +2021 Estonie Estonia 8.988278645659518 245.094 2726.818 +2021 Finlande Finland 2.9937725178230212 694.288 23191.074 +2021 France France 20.649030024470434 26465.646 128168.955 +2021 Grèce Greece 7.580480506088059 1097.87 14482.855 +2021 Hongrie Hungary 32.344729318831554 5693.164 17601.52 +2021 Irlande Ireland 18.020604987495144 1953.468 10840.191 +2021 Italie Italy 30.86368769746751 31807.236 103057.147 +2021 Lettonie Latvia 8.502139837843602 322.927 3798.185 +2021 Lituanie Lithuania 11.029023816606903 582.797 5284.212 +2021 Luxembourg Luxembourg 17.282784281000467 564.365 3265.475 +2021 Malte Malta 0.0 0.0 499.875 +2021 Pays-Bas Netherlands 37.61392206122467 15896.316 42261.788 +2021 Pologne Poland 13.146720200313602 9235.656 70250.647 +2021 Portugal Portugal 11.437926753365227 1740.3 15215.17 +2021 Roumanie Romania 24.909638477223016 5846.885 23472.38 +2021 République tchèque Czech Republic 21.716683280446812 5158.445 23753.374 +2021 Slovaquie Slovakia 25.253930010417324 2427.134 9610.916 +2021 Slovénie Slovenia 13.141683407321874 582.024 4428.839 +2021 Suède Sweden 1.497679952802663 471.085 31454.317 +2021 UE 27 UE 27 21.894190365821018 193930.95399999994 885764.4460000001 + +query IIIIII rowsort res +SELECT * from read_csv('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c',DELIM=';',Columns={'annee_de_reference':'VARCHAR','pays':'VARCHAR','label_en':'VARCHAR','part_du_gaz_naturel_dans_la_consommation_finale_d_energie0':'VARCHAR','consommation_finale_de_gaz_naturel_mtep':'VARCHAR','consommation_finale_d_energie_totale_mtep':'VARCHAR'}); + + +query IIIIII rowsort res +SELECT * from read_csv('https://www.data.gouv.fr/fr/datasets/r/6d186965-f41b-41f3-9b23-88241cc6890c',DELIM=';',Columns={'annee_de_reference':'VARCHAR','pays':'VARCHAR','label_en':'VARCHAR','part_du_gaz_naturel_dans_la_consommation_finale_d_energie0':'VARCHAR','consommation_finale_de_gaz_naturel_mtep':'VARCHAR','consommation_finale_d_energie_totale_mtep':'VARCHAR'}); + + +# Give it a try to a request that returns length 0 +query I +SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') +---- +1265 + +# Give it a try to a request that returns length 0 +query I +SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') +---- +1265 + +# Give it a try to a request that returns length 0 +query I +SELECT count(*) from read_csv_auto('https://query1.finance.yahoo.com/v7/finance/download/^GSPC?period1=1512086400&period2=1670630400&interval=1d&events=history') +---- +1265 \ No newline at end of file diff --git a/test/sql/metadata_stats.test b/test/sql/metadata_stats.test new file mode 100644 index 0000000..4bc6c07 --- /dev/null +++ b/test/sql/metadata_stats.test @@ -0,0 +1,21 @@ +# name: test/sql/metadata_stats.test +# description: Test getting metadata stats +# group: [] + +require parquet + +require httpfs + +require json + +# Test Force download with server that doesn't want to give us the head +statement ok +FROM read_json('https://api.spring.io/projects/spring-boot/generations') + +statement ok +SET force_download=false; + +query II +explain analyze SELECT id, first_name, last_name, email FROM PARQUET_SCAN('https://raw.githubusercontent.com/duckdb/duckdb/main/data/parquet-testing/userdata1.parquet') +---- +analyzed_plan :.*GET: 2.* diff --git a/test/sql/secret/secret_refresh.test b/test/sql/secret/secret_refresh.test index 85c8738..696279e 100644 --- a/test/sql/secret/secret_refresh.test +++ b/test/sql/secret/secret_refresh.test @@ -78,10 +78,11 @@ CREATE SECRET s1 ( REFRESH 1 ) +# TODO: add FORBIDDEN back in once enum util for http status codes is merged into httpfs statement error FROM "s3://test-bucket/test-file.parquet" ---- -HTTP 403 +HTTP Error: Unable to connect to URL "s3://test-bucket/test-file.parquet": 403 () query I SELECT message[0:46] FROM duckdb_logs WHERE message like '%Successfully refreshed secret%' @@ -118,10 +119,11 @@ statement ok set s3_access_key_id='bogus' # Without secret this query will fail, but since there are no suitable secrets, no refresh attempt will be made +# TODO: add FORBIDDEN in once enum util for http status codes is merged into httpfs statement error FROM "s3://test-bucket/test-file.parquet" ---- -HTTP 403 +HTTP Error: Unable to connect to URL "s3://test-bucket/test-file.parquet": 403 () # -> log empty query II diff --git a/test/sql/test_headers_parsed.test b/test/sql/test_headers_parsed.test new file mode 100644 index 0000000..44a5121 --- /dev/null +++ b/test/sql/test_headers_parsed.test @@ -0,0 +1,46 @@ +# name: test/sql/copy/csv/test_headers_parsed.test +# description: This test triggers the http prefetch mechanism. +# group: [csv] + +require httpfs + +require parquet + +statement ok +pragma enable_logging('HTTP'); + +query II +select * from 'https://github.com/duckdb/duckdb-data/releases/download/v1.0/job_role_type.parquet' order by all; +---- +1 actor +2 actress +3 producer +4 writer +5 cinematographer +6 composer +7 costume designer +8 director +9 editor +10 miscellaneous crew +11 production designer +12 guest + +query I +select response.status from duckdb_logs_parsed('HTTP') order by all; +---- +OK_200 +PartialContent_206 + + +# response status is either +# HTTP/2 200 +# HTTP/2 206 +# OR +# HTTP/1.1 200 OK +# HTTP/1.1 206 Partial Content +# depending on OS and CA (I think) +query I +select response.headers['__RESPONSE_STATUS__'] LIKE 'HTTP%20%' from duckdb_logs_parsed('HTTP') order by all; +---- +true +true \ No newline at end of file diff --git a/vcpkg.json b/vcpkg.json index 3ed9a36..809e67b 100644 --- a/vcpkg.json +++ b/vcpkg.json @@ -1,5 +1,6 @@ { "dependencies": [ - "openssl" + "openssl", + "curl" ] }