Automatic full file download fallback #92

Open · wants to merge 2 commits into main
13 changes: 12 additions & 1 deletion .github/workflows/MinioTests.yml
@@ -20,6 +20,8 @@ jobs:
DUCKDB_S3_USE_SSL: false
GEN: ninja
VCPKG_TARGET_TRIPLET: x64-linux
PYTHON_HTTP_SERVER_URL: http://localhost:8008
PYTHON_HTTP_SERVER_DIR: /tmp/python_test_server

steps:
- uses: actions/checkout@v4
@@ -48,7 +50,9 @@ jobs:

- name: Build
shell: bash
run: make
run: |
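# Load the tpch extension in the build: the new fallback test generates its data with dbgen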
echo -e "\nduckdb_extension_load(tpch)\n" >> extension_config.cmake
make

- name: Start S3/HTTP test server
shell: bash
@@ -60,6 +64,13 @@
source ./scripts/run_s3_test_server.sh
sleep 30

- name: Start Python HTTP test server
shell: bash
run: |
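# Python's http.server does not support HTTP Range requests, so reads through it exercise the full-download fallback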
mkdir -p $PYTHON_HTTP_SERVER_DIR
cd $PYTHON_HTTP_SERVER_DIR
python3 -m http.server 8008 &

- name: Test
shell: bash
run: |
96 changes: 85 additions & 11 deletions extension/httpfs/httpfs.cpp
@@ -47,6 +47,7 @@ unique_ptr<HTTPParams> HTTPFSUtil::InitializeParameters(optional_ptr<FileOpener>
// Setting lookups
FileOpener::TryGetCurrentSetting(opener, "http_timeout", result->timeout, info);
FileOpener::TryGetCurrentSetting(opener, "force_download", result->force_download, info);
FileOpener::TryGetCurrentSetting(opener, "auto_fallback_to_full_download", result->auto_fallback_to_full_download, info);
FileOpener::TryGetCurrentSetting(opener, "http_retries", result->retries, info);
FileOpener::TryGetCurrentSetting(opener, "http_retry_wait_ms", result->retry_wait_ms, info);
FileOpener::TryGetCurrentSetting(opener, "http_retry_backoff", result->retry_backoff, info);
@@ -233,8 +234,7 @@ unique_ptr<HTTPResponse> HTTPFileSystem::GetRangeRequest(FileHandle &handle, str
if (response.HasHeader("Content-Length")) {
auto content_length = stoll(response.GetHeaderValue("Content-Length"));
if ((idx_t)content_length != buffer_out_len) {
throw HTTPException("HTTP GET error: Content-Length from server mismatches requested "
"range, server may not support range requests.");
RangeRequestNotSupportedException::Throw();
}
}
}
@@ -257,6 +257,8 @@
return true;
});

get_request.try_request = true;

auto response = http_util.Request(get_request, http_client);

hfh.StoreClient(std::move(http_client));
@@ -349,9 +351,36 @@ unique_ptr<FileHandle> HTTPFileSystem::OpenFileExtended(const OpenFileInfo &file
return std::move(handle);
}

// Buffered read from http file.
// Note that buffering is disabled when FileFlags::FILE_FLAGS_DIRECT_IO is set
void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, idx_t location) {
bool HTTPFileSystem::TryRangeRequest(FileHandle &handle, string url, HTTPHeaders header_map, idx_t file_offset, char *buffer_out, idx_t buffer_out_len) {
auto res = GetRangeRequest(handle, url, header_map, file_offset, buffer_out, buffer_out_len);

if (res) {
// Request succeeded. TODO: fix upstream so that 206 Partial Content is also considered a success
if (res->Success() || res->status == HTTPStatusCode::PartialContent_206 || res->status == HTTPStatusCode::Accepted_202) {
return true;
}

// Request failed and we have a request error
if (res->HasRequestError()) {
ErrorData error(res->GetRequestError());

// Special case: we can do a retry with a full file download
if (error.Type() == RangeRequestNotSupportedException::TYPE && error.RawMessage() == RangeRequestNotSupportedException::MESSAGE) {
auto &hfh = handle.Cast<HTTPFileHandle>();
if (hfh.http_params.auto_fallback_to_full_download) {
return false;
}

error.Throw();
}
}
throw HTTPException(*res, "Request returned HTTP %d for HTTP %s to '%s'",
static_cast<int>(res->status), EnumUtil::ToString(RequestType::GET_REQUEST), res->url);
}
throw IOException("Unknown error for HTTP %s to '%s'", EnumUtil::ToString(RequestType::GET_REQUEST), url);
}

bool HTTPFileSystem::ReadInternal(FileHandle &handle, void *buffer, int64_t nr_bytes, idx_t location) {
auto &hfh = handle.Cast<HTTPFileHandle>();

D_ASSERT(hfh.http_params.state);
@@ -362,7 +391,7 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id
memcpy(buffer, hfh.cached_file_handle->GetData() + location, nr_bytes);
DUCKDB_LOG_FILE_SYSTEM_READ(handle, nr_bytes, location);
hfh.file_offset = location + nr_bytes;
return;
return true;
}

idx_t to_read = nr_bytes;
@@ -371,21 +400,23 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id
// Don't buffer when DirectIO is set or when we are doing parallel reads
bool skip_buffer = hfh.flags.DirectIO() || hfh.flags.RequireParallelAccess();
if (skip_buffer && to_read > 0) {
GetRangeRequest(hfh, hfh.path, {}, location, (char *)buffer, to_read);
if (!TryRangeRequest(hfh, hfh.path, {}, location, (char *)buffer, to_read)) {
return false;
}
DUCKDB_LOG_FILE_SYSTEM_READ(handle, nr_bytes, location);
// Update handle status within critical section for parallel access.
if (hfh.flags.RequireParallelAccess()) {
std::lock_guard<std::mutex> lck(hfh.mu);
hfh.buffer_available = 0;
hfh.buffer_idx = 0;
hfh.file_offset = location + nr_bytes;
return;
return true;
}

hfh.buffer_available = 0;
hfh.buffer_idx = 0;
hfh.file_offset = location + nr_bytes;
return;
return true;
}

if (location >= hfh.buffer_start && location < hfh.buffer_end) {
@@ -417,13 +448,17 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id

// Bypass buffer if we read more than buffer size
if (to_read > new_buffer_available) {
GetRangeRequest(hfh, hfh.path, {}, location + buffer_offset, (char *)buffer + buffer_offset, to_read);
if (!TryRangeRequest(hfh, hfh.path, {}, location + buffer_offset, (char *)buffer + buffer_offset, to_read)) {
return false;
}
hfh.buffer_available = 0;
hfh.buffer_idx = 0;
start_offset += to_read;
break;
} else {
GetRangeRequest(hfh, hfh.path, {}, start_offset, (char *)hfh.read_buffer.get(), new_buffer_available);
if (!TryRangeRequest(hfh, hfh.path, {}, start_offset, (char *)hfh.read_buffer.get(), new_buffer_available)) {
return false;
}
hfh.buffer_available = new_buffer_available;
hfh.buffer_idx = 0;
hfh.buffer_start = start_offset;
Expand All @@ -433,6 +468,32 @@ void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, id
}
hfh.file_offset = location + nr_bytes;
DUCKDB_LOG_FILE_SYSTEM_READ(handle, nr_bytes, location);
return true;
}

// Buffered read from http file.
// Note that buffering is disabled when FileFlags::FILE_FLAGS_DIRECT_IO is set
void HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes, idx_t location) {
auto success = ReadInternal(handle, buffer, nr_bytes, location);
if (success) {
return;
}

// ReadInternal returned false: the regular path of reading the file through range requests failed.
// Attempt to download the full file and retry.

if (handle.logger) {
DUCKDB_LOG_WARN(handle.logger, "Falling back to full file download for file '%s': the server does not support HTTP range requests. Performance and memory usage may be degraded.", handle.path);
}

auto &hfh = handle.Cast<HTTPFileHandle>();

bool should_write_cache = false;
hfh.FullDownload(*this, should_write_cache);

if (!ReadInternal(handle, buffer, nr_bytes, location)) {
throw HTTPException("Failed to read from HTTP file after automatically retrying a full file download.");
}
}

int64_t HTTPFileSystem::Read(FileHandle &handle, void *buffer, int64_t nr_bytes) {
@@ -644,6 +705,19 @@ void HTTPFileHandle::LoadFileInfo() {
initialized = true;
}


void HTTPFileHandle::TryAddLogger(FileOpener &opener) {
auto context = opener.TryGetClientContext();
if (context) {
logger = context->logger;
return;
}
auto database = opener.TryGetDatabase();
if (database) {
logger = database->GetLogManager().GlobalLoggerReference();
}
}

void HTTPFileHandle::Initialize(optional_ptr<FileOpener> opener) {
auto &hfs = file_system.Cast<HTTPFileSystem>();
http_params.state = HTTPState::TryGetState(opener);
1 change: 1 addition & 0 deletions extension/httpfs/httpfs_extension.cpp
@@ -26,6 +26,7 @@ static void LoadInternal(DatabaseInstance &instance) {
config.AddExtensionOption("http_retries", "HTTP retries on I/O error", LogicalType::UBIGINT, Value(3));
config.AddExtensionOption("http_retry_wait_ms", "Time between retries", LogicalType::UBIGINT, Value(100));
config.AddExtensionOption("force_download", "Forces upfront download of file", LogicalType::BOOLEAN, Value(false));
config.AddExtensionOption("auto_fallback_to_full_download", "Allows automatically falling back to full file downloads when possible.", LogicalType::BOOLEAN, Value(true));
// Reduces the number of requests made while waiting, for example retry_wait_ms of 50 and backoff factor of 2 will
// result in wait times of 0 50 100 200 400...etc.
config.AddExtensionOption("http_retry_backoff", "Backoff factor for exponentially increasing retry wait time",
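
A minimal usage sketch for the new option (assuming a standard DuckDB SQL session; `force_download` is the pre-existing option registered above):

SET auto_fallback_to_full_download = false; -- surface range-request failures as errors instead of downloading the full file
SET force_download = true;                  -- alternatively, force an upfront download for every file
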
22 changes: 21 additions & 1 deletion extension/httpfs/include/httpfs.hpp
@@ -14,6 +14,19 @@

namespace duckdb {

class RangeRequestNotSupportedException {
public:
// Do not throw this type directly (DuckDB could not catch it); call the static Throw() instead.
explicit RangeRequestNotSupportedException() = delete;

static constexpr ExceptionType TYPE = ExceptionType::HTTP;
static constexpr const char *MESSAGE = "Content-Length from server mismatches requested range, server may not support range requests. You can try to resolve this by enabling `SET force_download=true`";
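// Note: TryRangeRequest in httpfs.cpp matches on both TYPE and this exact MESSAGE string to detect
// the fallback case, so the message is part of the error contract.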

static void Throw() {
throw HTTPException(MESSAGE);
}
};

class HTTPClientCache {
public:
//! Get a client from the client cache
@@ -49,6 +62,8 @@ class HTTPFileHandle : public FileHandle {
time_t last_modified;
string etag;
bool initialized = false;

bool auto_fallback_to_full_file_download = true;

// When using full file download, the full file will be written to a cached file handle
unique_ptr<CachedFileHandle> cached_file_handle;
@@ -84,7 +99,10 @@ class HTTPFileHandle : public FileHandle {
//! Perform a HEAD request to get the file info (if not yet loaded)
void LoadFileInfo();

private:
//! TODO: make base function virtual?
void TryAddLogger(FileOpener &opener);

public:
//! Fully downloads a file
void FullDownload(HTTPFileSystem &hfs, bool &should_write_cache);
};
@@ -153,6 +171,8 @@ class HTTPFileSystem : public FileSystem {
}

virtual HTTPException GetHTTPError(FileHandle &, const HTTPResponse &response, const string &url);
bool TryRangeRequest(FileHandle &handle, string url, HTTPHeaders header_map, idx_t file_offset, char *buffer_out, idx_t buffer_out_len);
bool ReadInternal(FileHandle &handle, void *buffer, int64_t nr_bytes, idx_t location);

protected:
virtual duckdb::unique_ptr<HTTPFileHandle> CreateHandle(const OpenFileInfo &file, FileOpenFlags flags,
2 changes: 2 additions & 0 deletions extension/httpfs/include/httpfs_client.hpp
@@ -15,8 +15,10 @@ struct HTTPFSParams : public HTTPParams {
static constexpr bool DEFAULT_ENABLE_SERVER_CERT_VERIFICATION = false;
static constexpr uint64_t DEFAULT_HF_MAX_PER_PAGE = 0;
static constexpr bool DEFAULT_FORCE_DOWNLOAD = false;
static constexpr bool DEFAULT_AUTO_FALLBACK_TO_FULL_DOWNLOAD = true;

bool force_download = DEFAULT_FORCE_DOWNLOAD;
bool auto_fallback_to_full_download = DEFAULT_AUTO_FALLBACK_TO_FULL_DOWNLOAD;
bool enable_server_cert_verification = DEFAULT_ENABLE_SERVER_CERT_VERIFICATION;
idx_t hf_max_per_page = DEFAULT_HF_MAX_PER_PAGE;
string ca_cert_file;
46 changes: 46 additions & 0 deletions test/sql/full_file_download_fallback.test
@@ -0,0 +1,46 @@
# name: test/sql/full_file_download_fallback.test
# group: [full_file_download]

require parquet

require httpfs

require tpch

require-env PYTHON_HTTP_SERVER_URL

require-env PYTHON_HTTP_SERVER_DIR

statement ok
pragma enable_logging;

statement ok
call dbgen(sf=1);

statement ok
copy lineitem to '${PYTHON_HTTP_SERVER_DIR}/lineitem.csv'

statement ok
drop table lineitem;

statement ok
CREATE view lineitem AS FROM '${PYTHON_HTTP_SERVER_URL}/lineitem.csv';

query I
pragma tpch(6);
----
123141078.22829981

query I
select count(*) from duckdb_logs where log_level='WARN' and message like '%Falling back to full%'
----
2

statement ok
set auto_fallback_to_full_download=false

statement error
pragma tpch(6);
----
HTTP Error: Content-Length from server mismatches requested range, server may not support range requests. You can try to resolve this by enabling `SET force_download=true`

2 changes: 2 additions & 0 deletions test/sql/secret/secret_aws.test
@@ -14,6 +14,8 @@ require-env DUCKDB_S3_ENDPOINT

require-env DUCKDB_S3_USE_SSL

set ignore_error_messages

require httpfs

require parquet
2 changes: 2 additions & 0 deletions test/sql/secret/secret_refresh.test
@@ -14,6 +14,8 @@ require-env DUCKDB_S3_ENDPOINT

require-env DUCKDB_S3_USE_SSL

set ignore_error_messages

require httpfs

require parquet
2 changes: 2 additions & 0 deletions test/sql/secret/secret_refresh_attach.test
@@ -16,6 +16,8 @@ require-env DUCKDB_S3_USE_SSL

require-env S3_ATTACH_DB

set ignore_error_messages

require httpfs

require parquet