diff --git a/.github/workflows/MinioTests.yml b/.github/workflows/MinioTests.yml
index dca4c52..ad07635 100644
--- a/.github/workflows/MinioTests.yml
+++ b/.github/workflows/MinioTests.yml
@@ -20,6 +20,8 @@ jobs:
       DUCKDB_S3_USE_SSL: false
       GEN: ninja
       VCPKG_TARGET_TRIPLET: x64-linux
+      PYTHON_HTTP_SERVER_URL: http://localhost:8008
+      PYTHON_HTTP_SERVER_DIR: /tmp/python_test_server

     steps:
     - uses: actions/checkout@v4
@@ -48,7 +50,9 @@ jobs:

     - name: Build
       shell: bash
-      run: make
+      run: |
+        echo -e "\nduckdb_extension_load(tpch)\n" >> extension_config.cmake
+        make

     - name: Start S3/HTTP test server
       shell: bash
@@ -60,6 +64,13 @@ jobs:
        source ./scripts/run_s3_test_server.sh
        sleep 30

+    - name: Run & Populate test server
+      shell: bash
+      run: |
+        mkdir -p $PYTHON_HTTP_SERVER_DIR
+        cd $PYTHON_HTTP_SERVER_DIR
+        python3 -m http.server 8008 &
+
     - name: Test
       shell: bash
       run: |
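The new CI step serves PYTHON_HTTP_SERVER_DIR with Python's built-in http.server, which never honors Range headers: every GET is answered with a 200 and the full file, so the Content-Length always mismatches a ranged read. That is exactly the condition the fallback in this patch detects. A minimal local stand-in for the fixture (a sketch; only the port and directory come from the workflow above, the rest is illustrative):

    # Serve PYTHON_HTTP_SERVER_DIR on port 8008, like the CI step above.
    # SimpleHTTPRequestHandler ignores Range headers and always returns the
    # whole file with status 200, which trips httpfs' Content-Length check.
    import functools
    import http.server
    import os
    import threading

    serve_dir = "/tmp/python_test_server"  # PYTHON_HTTP_SERVER_DIR
    os.makedirs(serve_dir, exist_ok=True)

    handler = functools.partial(http.server.SimpleHTTPRequestHandler, directory=serve_dir)
    server = http.server.ThreadingHTTPServer(("localhost", 8008), handler)
    threading.Thread(target=server.serve_forever, daemon=True).start()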
diff --git a/extension/httpfs/httpfs.cpp b/extension/httpfs/httpfs.cpp
index 34bcc5d..359fd32 100644
--- a/extension/httpfs/httpfs.cpp
+++ b/extension/httpfs/httpfs.cpp
@@ -47,6 +47,7 @@ unique_ptr<HTTPParams> HTTPFSUtil::InitializeParameters(optional_ptr
 	// Setting lookups
 	FileOpener::TryGetCurrentSetting(opener, "http_timeout", result->timeout, info);
 	FileOpener::TryGetCurrentSetting(opener, "force_download", result->force_download, info);
+	FileOpener::TryGetCurrentSetting(opener, "auto_fallback_to_full_download", result->auto_fallback_to_full_download, info);
 	FileOpener::TryGetCurrentSetting(opener, "http_retries", result->retries, info);
 	FileOpener::TryGetCurrentSetting(opener, "http_retry_wait_ms", result->retry_wait_ms, info);
 	FileOpener::TryGetCurrentSetting(opener, "http_retry_backoff", result->retry_backoff, info);
@@ -233,8 +234,7 @@ unique_ptr<HTTPResponse> HTTPFileSystem::GetRangeRequest(FileHandle &handle, str
 	if (response.HasHeader("Content-Length")) {
 		auto content_length = stoll(response.GetHeaderValue("Content-Length"));
 		if ((idx_t)content_length != buffer_out_len) {
-			throw HTTPException("HTTP GET error: Content-Length from server mismatches requested "
-			                    "range, server may not support range requests.");
+			RangeRequestNotSupportedException::Throw();
 		}
 	}
@@ -257,6 +257,8 @@ unique_ptr<HTTPResponse> HTTPFileSystem::GetRangeRequest(FileHandle &handle, str
 		return true;
 	});

+	get_request.try_request = true;
+
 	auto response = http_util.Request(get_request, http_client);

 	hfh.StoreClient(std::move(http_client));
@@ -349,9 +351,36 @@ unique_ptr<FileHandle> HTTPFileSystem::OpenFileExtended(const OpenFileInfo &file
 	return std::move(handle);
 }

-// Buffered read from http file.
-// Note that buffering is disabled when FileFlags::FILE_FLAGS_DIRECT_IO is set
-void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, idx_t location) {
+bool HTTPFileSystem::TryRangeRequest(FileHandle &handle, string url, HTTPHeaders header_map, idx_t file_offset, char *buffer_out, idx_t buffer_out_len) {
+	auto res = GetRangeRequest(handle, url, header_map, file_offset, buffer_out, buffer_out_len);
+
+	if (res) {
+		// Request succeeded. TODO: fix upstream so that 206 is considered a success
+		if (res->Success() || res->status == HTTPStatusCode::PartialContent_206 || res->status == HTTPStatusCode::Accepted_202) {
+			return true;
+		}
+
+		// Request failed and we have a request error
+		if (res->HasRequestError()) {
+			ErrorData error(res->GetRequestError());
+
+			// Special case: we can retry with a full file download
+			if (error.Type() == RangeRequestNotSupportedException::TYPE && error.RawMessage() == RangeRequestNotSupportedException::MESSAGE) {
+				auto &hfh = handle.Cast<HTTPFileHandle>();
+				if (hfh.http_params.auto_fallback_to_full_download) {
+					return false;
+				}
+
+				error.Throw();
+			}
+		}
+		throw HTTPException(*res, "Request returned HTTP %d for HTTP %s to '%s'",
+		                    static_cast<int>(res->status), EnumUtil::ToString(RequestType::GET_REQUEST), res->url);
+	}
+	throw IOException("Unknown error for HTTP %s to '%s'", EnumUtil::ToString(RequestType::GET_REQUEST), url);
+}
+
+bool HTTPFileSystem::ReadInternal(FileHandle &handle, void *buffer, int64_t nr_bytes, idx_t location) {
 	auto &hfh = handle.Cast<HTTPFileHandle>();

 	D_ASSERT(hfh.http_params.state);
@@ -362,7 +391,7 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id
 		memcpy(buffer, hfh.cached_file_handle->GetData() + location, nr_bytes);
 		DUCKDB_LOG_FILE_SYSTEM_READ(handle, nr_bytes, location);
 		hfh.file_offset = location + nr_bytes;
-		return;
+		return true;
 	}

 	idx_t to_read = nr_bytes;
@@ -371,7 +400,9 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id
 	// Don't buffer when DirectIO is set or when we are doing parallel reads
 	bool skip_buffer = hfh.flags.DirectIO() || hfh.flags.RequireParallelAccess();
 	if (skip_buffer && to_read > 0) {
-		GetRangeRequest(hfh, hfh.path, {}, location, (char *)buffer, to_read);
+		if (!TryRangeRequest(hfh, hfh.path, {}, location, (char *)buffer, to_read)) {
+			return false;
+		}
 		DUCKDB_LOG_FILE_SYSTEM_READ(handle, nr_bytes, location);
 		// Update handle status within critical section for parallel access.
 		if (hfh.flags.RequireParallelAccess()) {
@@ -379,13 +410,13 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id
 			hfh.buffer_available = 0;
 			hfh.buffer_idx = 0;
 			hfh.file_offset = location + nr_bytes;
-			return;
+			return true;
 		}

 		hfh.buffer_available = 0;
 		hfh.buffer_idx = 0;
 		hfh.file_offset = location + nr_bytes;
-		return;
+		return true;
 	}

 	if (location >= hfh.buffer_start && location < hfh.buffer_end) {
@@ -417,13 +448,17 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id

 		// Bypass buffer if we read more than buffer size
 		if (to_read > new_buffer_available) {
-			GetRangeRequest(hfh, hfh.path, {}, location + buffer_offset, (char *)buffer + buffer_offset, to_read);
+			if (!TryRangeRequest(hfh, hfh.path, {}, location + buffer_offset, (char *)buffer + buffer_offset, to_read)) {
+				return false;
+			}
 			hfh.buffer_available = 0;
 			hfh.buffer_idx = 0;
 			start_offset += to_read;
 			break;
 		} else {
-			GetRangeRequest(hfh, hfh.path, {}, start_offset, (char *)hfh.read_buffer.get(), new_buffer_available);
+			if (!TryRangeRequest(hfh, hfh.path, {}, start_offset, (char *)hfh.read_buffer.get(), new_buffer_available)) {
+				return false;
+			}
 			hfh.buffer_available = new_buffer_available;
 			hfh.buffer_idx = 0;
 			hfh.buffer_start = start_offset;
@@ -433,6 +468,32 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id
 	}
 	hfh.file_offset = location + nr_bytes;
 	DUCKDB_LOG_FILE_SYSTEM_READ(handle, nr_bytes, location);
+	return true;
+}
+
+// Buffered read from http file.
+// Note that buffering is disabled when FileFlags::FILE_FLAGS_DIRECT_IO is set
+void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, idx_t location) {
+	auto success = ReadInternal(handle, buffer, nr_bytes, location);
+	if (success) {
+		return;
+	}
+
+	// ReadInternal returned false. This means the regular path of querying the file with range requests failed. We
+	// will attempt to download the full file and retry.
+
+	if (handle.logger) {
+		DUCKDB_LOG_WARN(handle.logger, "Falling back to full file download for file '%s': the server does not support HTTP range requests. "
+		                               "Performance and memory usage are potentially degraded.", handle.path);
+	}
+
+	auto &hfh = handle.Cast<HTTPFileHandle>();
+
+	bool should_write_cache = false;
+	hfh.FullDownload(*this, should_write_cache);
+
+	if (!ReadInternal(handle, buffer, nr_bytes, location)) {
+		throw HTTPException("Failed to read from HTTP file after automatically retrying a full file download.");
+	}
 }

 int64_t HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes) {
@@ -644,6 +705,19 @@ void HTTPFileHandle::LoadFileInfo() {
 	initialized = true;
 }

+
+void HTTPFileHandle::TryAddLogger(FileOpener &opener) {
+	auto context = opener.TryGetClientContext();
+	if (context) {
+		logger = context->logger;
+		return;
+	}
+	auto database = opener.TryGetDatabase();
+	if (database) {
+		logger = database->GetLogManager().GlobalLoggerReference();
+	}
+}
+
 void HTTPFileHandle::Initialize(optional_ptr<FileOpener> opener) {
 	auto &hfs = file_system.Cast<HTTPFileSystem>();
 	http_params.state = HTTPState::TryGetState(opener);
diff --git a/extension/httpfs/httpfs_extension.cpp b/extension/httpfs/httpfs_extension.cpp
index c9bc985..1871126 100644
--- a/extension/httpfs/httpfs_extension.cpp
+++ b/extension/httpfs/httpfs_extension.cpp
@@ -26,6 +26,7 @@ static void LoadInternal(DatabaseInstance &instance) {
 	config.AddExtensionOption("http_retries", "HTTP retries on I/O error", LogicalType::UBIGINT, Value(3));
 	config.AddExtensionOption("http_retry_wait_ms", "Time between retries", LogicalType::UBIGINT, Value(100));
 	config.AddExtensionOption("force_download", "Forces upfront download of file", LogicalType::BOOLEAN, Value(false));
+	config.AddExtensionOption("auto_fallback_to_full_download", "Allows automatically falling back to a full file download when the server does not support range requests.", LogicalType::BOOLEAN, Value(true));
 	// Reduces the number of requests made while waiting, for example retry_wait_ms of 50 and backoff factor of 2 will
 	// result in wait times of 0 50 100 200 400...etc.
 	config.AddExtensionOption("http_retry_backoff", "Backoff factor for exponentially increasing retry wait time",
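With the option registered above (default true), a read from a server without range support now succeeds transparently. A sketch of the user-visible behavior via the DuckDB Python client (the URL is a placeholder for any range-less server, e.g. the fixture shown earlier):

    # Default behavior: the read falls back to a full download and only logs
    # a warning, which is queryable through duckdb_logs.
    import duckdb

    con = duckdb.connect()
    con.execute("PRAGMA enable_logging")
    con.execute("SELECT count(*) FROM 'http://localhost:8008/lineitem.csv'")  # placeholder URL
    warn_count = con.execute(
        "SELECT count(*) FROM duckdb_logs "
        "WHERE log_level = 'WARN' AND message LIKE '%Falling back to full%'"
    ).fetchone()[0]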
diff --git a/extension/httpfs/include/httpfs.hpp b/extension/httpfs/include/httpfs.hpp
index a6b8570..f654642 100644
--- a/extension/httpfs/include/httpfs.hpp
+++ b/extension/httpfs/include/httpfs.hpp
@@ -14,6 +14,19 @@

 namespace duckdb {

+class RangeRequestNotSupportedException {
+public:
+	// Do not construct; call the static Throw() instead: if this were thrown as its own exception type, DuckDB could not catch it.
+	explicit RangeRequestNotSupportedException() = delete;
+
+	static constexpr ExceptionType TYPE = ExceptionType::HTTP;
+	static constexpr const char *MESSAGE = "Content-Length from server mismatches requested range, server may not support range requests. You can try to resolve this by enabling `SET force_download=true`";
+
+	static void Throw() {
+		throw HTTPException(MESSAGE);
+	}
+};
+
 class HTTPClientCache {
 public:
 	//! Get a client from the client cache
@@ -49,6 +62,8 @@ class HTTPFileHandle : public FileHandle {
 	time_t last_modified;
 	string etag;
 	bool initialized = false;
+
+	bool auto_fallback_to_full_file_download = true;

 	// When using full file download, the full file will be written to a cached file handle
 	unique_ptr<CachedFileHandle> cached_file_handle;
@@ -84,7 +99,10 @@ class HTTPFileHandle : public FileHandle {
 	//! Perform a HEAD request to get the file info (if not yet loaded)
 	void LoadFileInfo();

-private:
+	//! TODO: make base function virtual?
+	void TryAddLogger(FileOpener &opener);
+
+public:
 	//! Fully downloads a file
 	void FullDownload(HTTPFileSystem &hfs, bool &should_write_cache);
 };
@@ -153,6 +171,8 @@ class HTTPFileSystem : public FileSystem {
 	}
 	virtual HTTPException GetHTTPError(FileHandle &, const HTTPResponse &response, const string &url);
+	bool TryRangeRequest(FileHandle &handle, string url, HTTPHeaders header_map, idx_t file_offset, char *buffer_out, idx_t buffer_out_len);
+	bool ReadInternal(FileHandle &handle, void *buffer, int64_t nr_bytes, idx_t location);

 protected:
 	virtual duckdb::unique_ptr<HTTPFileHandle> CreateHandle(const OpenFileInfo &file, FileOpenFlags flags,
diff --git a/extension/httpfs/include/httpfs_client.hpp b/extension/httpfs/include/httpfs_client.hpp
index 1d7620c..0b3f5ce 100644
--- a/extension/httpfs/include/httpfs_client.hpp
+++ b/extension/httpfs/include/httpfs_client.hpp
@@ -15,8 +15,10 @@ struct HTTPFSParams : public HTTPParams {
 	static constexpr bool DEFAULT_ENABLE_SERVER_CERT_VERIFICATION = false;
 	static constexpr uint64_t DEFAULT_HF_MAX_PER_PAGE = 0;
 	static constexpr bool DEFAULT_FORCE_DOWNLOAD = false;
+	static constexpr bool AUTO_FALLBACK_TO_FULL_DOWNLOAD = true;

 	bool force_download = DEFAULT_FORCE_DOWNLOAD;
+	bool auto_fallback_to_full_download = AUTO_FALLBACK_TO_FULL_DOWNLOAD;
 	bool enable_server_cert_verification = DEFAULT_ENABLE_SERVER_CERT_VERIFICATION;
 	idx_t hf_max_per_page = DEFAULT_HF_MAX_PER_PAGE;
 	string ca_cert_file;
diff --git a/test/sql/full_file_download_fallback.test b/test/sql/full_file_download_fallback.test
new file mode 100644
index 0000000..456e6bc
--- /dev/null
+++ b/test/sql/full_file_download_fallback.test
@@ -0,0 +1,46 @@
+# name: test/sql/full_file_download_fallback.test
+# group: [full_file_download]
+
+require parquet
+
+require httpfs
+
+require tpch
+
+require-env PYTHON_HTTP_SERVER_URL
+
+require-env PYTHON_HTTP_SERVER_DIR
+
+statement ok
+pragma enable_logging;
+
+statement ok
+call dbgen(sf=1);
+
+statement ok
+copy lineitem to '${PYTHON_HTTP_SERVER_DIR}/lineitem.csv'
+
+statement ok
+drop table lineitem;
+
+statement ok
+CREATE view lineitem AS FROM '${PYTHON_HTTP_SERVER_URL}/lineitem.csv';
+
+query I
+pragma tpch(6);
+----
+123141078.22829981
+
+query I
+select count(*) from duckdb_logs where log_level='WARN' and message like '%Falling back to full%'
+----
+2
+
+statement ok
+set auto_fallback_to_full_download=false
+
+statement error
+pragma tpch(6);
+----
+HTTP Error: Content-Length from server mismatches requested range, server may not support range requests. You can try to resolve this by enabling `SET force_download=true`
+
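The tail of the test above exercises the opt-out path; the equivalent flow from the Python client looks roughly like this (a sketch; the URL is again a placeholder for a server without range support):

    # With the fallback disabled, the old hard error returns;
    # force_download remains the manual workaround named in the message.
    import duckdb

    con = duckdb.connect()
    con.execute("SET auto_fallback_to_full_download = false")
    try:
        con.execute("SELECT count(*) FROM 'http://localhost:8008/lineitem.csv'")  # placeholder URL
    except duckdb.Error as err:
        assert "may not support range requests" in str(err)
        con.execute("SET force_download = true")
        con.execute("SELECT count(*) FROM 'http://localhost:8008/lineitem.csv'")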
diff --git a/test/sql/secret/secret_aws.test b/test/sql/secret/secret_aws.test
index 529c4bc..c2e3245 100644
--- a/test/sql/secret/secret_aws.test
+++ b/test/sql/secret/secret_aws.test
@@ -14,6 +14,8 @@ require-env DUCKDB_S3_ENDPOINT

 require-env DUCKDB_S3_USE_SSL

+set ignore_error_messages
+
 require httpfs

 require parquet
diff --git a/test/sql/secret/secret_refresh.test b/test/sql/secret/secret_refresh.test
index 85c8738..ea8cd64 100644
--- a/test/sql/secret/secret_refresh.test
+++ b/test/sql/secret/secret_refresh.test
@@ -14,6 +14,8 @@ require-env DUCKDB_S3_ENDPOINT

 require-env DUCKDB_S3_USE_SSL

+set ignore_error_messages
+
 require httpfs

 require parquet
diff --git a/test/sql/secret/secret_refresh_attach.test b/test/sql/secret/secret_refresh_attach.test
index c20881d..ffc2ccd 100644
--- a/test/sql/secret/secret_refresh_attach.test
+++ b/test/sql/secret/secret_refresh_attach.test
@@ -16,6 +16,8 @@ require-env DUCKDB_S3_USE_SSL

 require-env S3_ATTACH_DB

+set ignore_error_messages
+
 require httpfs

 require parquet