Skip to content

Commit cbd1844

Browse files
authored
feat: support for gcs storage (#520)
* chore: include opendal/services-gcs
* feat: basic gcs scaffolding
* feat: populate config parse with basic details
* feat: include docker-compose integration tests
* feat: add extra iceberg properties
* feat: add tests for gcs read/write — these are currently conditional tests with a todo comment using the test_with proc macro. More work needs to be done on investigating/potentially expanding OpenDAL to allow unauthenticated requests to fake-gcs-server. At the moment this always ends up reaching the final VM metadata check.
* chore: minor cleanup for compose todo
* fix: do not introduce new properties
* feat: infer bucket from path
* chore: add user-project const
* feat: add allow_anonymous for test
* chore: remove test-with dep
* feat: update with allow_anonymous functionality — this requires the OpenDAL allow_anonymous functionality with the GCS service to work.
* ci: use cargo sort
* chore: undo storage-gcs default feature
* feat: include disable_ params for GCS_NO_AUTH
* ci: use storage-all for async-std tests
* revert: use opendal from workspace — now that v0.49 has been released, this work does not need to pin to a particular version!
1 parent 257cdbd commit cbd1844

File tree

7 files changed

+250
-3
lines changed

7 files changed

+250
-3
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ jobs:
113113
run: cargo test --no-fail-fast --all-targets --all-features --workspace
114114

115115
- name: Async-std Test
116-
run: cargo test --no-fail-fast --all-targets --no-default-features --features "async-std" --features "storage-fs" --workspace
116+
run: cargo test --no-fail-fast --all-targets --no-default-features --features "async-std" --features "storage-all" --workspace
117117

118118
- name: Doc Test
119119
run: cargo test --no-fail-fast --doc --all-features --workspace

crates/iceberg/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,12 @@ keywords = ["iceberg"]
3030

3131
[features]
3232
default = ["storage-memory", "storage-fs", "storage-s3", "tokio"]
33-
storage-all = ["storage-memory", "storage-fs", "storage-s3"]
33+
storage-all = ["storage-memory", "storage-fs", "storage-s3", "storage-gcs"]
3434

3535
storage-memory = ["opendal/services-memory"]
3636
storage-fs = ["opendal/services-fs"]
3737
storage-s3 = ["opendal/services-s3"]
38+
storage-gcs = ["opendal/services-gcs"]
3839

3940
async-std = ["dep:async-std"]
4041
tokio = ["dep:tokio"]

crates/iceberg/src/io/mod.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,3 +82,7 @@ pub use storage_s3::*;
8282
mod storage_fs;
8383
#[cfg(feature = "storage-fs")]
8484
use storage_fs::*;
85+
#[cfg(feature = "storage-gcs")]
86+
mod storage_gcs;
87+
#[cfg(feature = "storage-gcs")]
88+
pub use storage_gcs::*;

crates/iceberg/src/io/storage.rs

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
use std::sync::Arc;
1919

20+
#[cfg(feature = "storage-gcs")]
21+
use opendal::services::GcsConfig;
2022
#[cfg(feature = "storage-s3")]
2123
use opendal::services::S3Config;
2224
use opendal::{Operator, Scheme};
@@ -38,6 +40,8 @@ pub(crate) enum Storage {
3840
scheme_str: String,
3941
config: Arc<S3Config>,
4042
},
43+
#[cfg(feature = "storage-gcs")]
44+
Gcs { config: Arc<GcsConfig> },
4145
}
4246

4347
impl Storage {
@@ -56,6 +60,10 @@ impl Storage {
5660
scheme_str,
5761
config: super::s3_config_parse(props)?.into(),
5862
}),
63+
#[cfg(feature = "storage-gcs")]
64+
Scheme::Gcs => Ok(Self::Gcs {
65+
config: super::gcs_config_parse(props)?.into(),
66+
}),
5967
_ => Err(Error::new(
6068
ErrorKind::FeatureUnsupported,
6169
format!("Constructing file io from scheme: {scheme} not supported now",),
@@ -117,7 +125,24 @@ impl Storage {
117125
))
118126
}
119127
}
120-
#[cfg(all(not(feature = "storage-s3"), not(feature = "storage-fs")))]
128+
#[cfg(feature = "storage-gcs")]
129+
Storage::Gcs { config } => {
130+
let operator = super::gcs_config_build(config, path)?;
131+
let prefix = format!("gs://{}/", operator.info().name());
132+
if path.starts_with(&prefix) {
133+
Ok((operator, &path[prefix.len()..]))
134+
} else {
135+
Err(Error::new(
136+
ErrorKind::DataInvalid,
137+
format!("Invalid gcs url: {}, should start with {}", path, prefix),
138+
))
139+
}
140+
}
141+
#[cfg(all(
142+
not(feature = "storage-s3"),
143+
not(feature = "storage-fs"),
144+
not(feature = "storage-gcs")
145+
))]
121146
_ => Err(Error::new(
122147
ErrorKind::FeatureUnsupported,
123148
"No storage service has been enabled",
@@ -131,6 +156,7 @@ impl Storage {
131156
"memory" => Ok(Scheme::Memory),
132157
"file" | "" => Ok(Scheme::Fs),
133158
"s3" | "s3a" => Ok(Scheme::S3),
159+
"gs" => Ok(Scheme::Gcs),
134160
s => Ok(s.parse::<Scheme>()?),
135161
}
136162
}
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
//! Google Cloud Storage properties
18+
19+
use std::collections::HashMap;
20+
21+
use opendal::services::GcsConfig;
22+
use opendal::Operator;
23+
use url::Url;
24+
25+
use crate::{Error, ErrorKind, Result};
26+
27+
// Reference: https://github.com/apache/iceberg/blob/main/gcp/src/main/java/org/apache/iceberg/gcp/GCPProperties.java
28+
29+
/// Google Cloud Project ID
30+
pub const GCS_PROJECT_ID: &str = "gcs.project-id";
31+
/// Google Cloud Storage endpoint
32+
pub const GCS_SERVICE_PATH: &str = "gcs.service.path";
33+
/// Google Cloud user project
34+
pub const GCS_USER_PROJECT: &str = "gcs.user-project";
35+
/// Allow unauthenticated requests
36+
pub const GCS_NO_AUTH: &str = "gcs.no-auth";
37+
38+
/// Parse iceberg properties to [`GcsConfig`].
39+
pub(crate) fn gcs_config_parse(mut m: HashMap<String, String>) -> Result<GcsConfig> {
40+
let mut cfg = GcsConfig::default();
41+
42+
if let Some(endpoint) = m.remove(GCS_SERVICE_PATH) {
43+
cfg.endpoint = Some(endpoint);
44+
}
45+
46+
if m.remove(GCS_NO_AUTH).is_some() {
47+
cfg.allow_anonymous = true;
48+
cfg.disable_vm_metadata = true;
49+
cfg.disable_config_load = true;
50+
}
51+
52+
Ok(cfg)
53+
}
54+
55+
/// Build a new OpenDAL [`Operator`] based on a provided [`GcsConfig`].
56+
pub(crate) fn gcs_config_build(cfg: &GcsConfig, path: &str) -> Result<Operator> {
57+
let url = Url::parse(path)?;
58+
let bucket = url.host_str().ok_or_else(|| {
59+
Error::new(
60+
ErrorKind::DataInvalid,
61+
format!("Invalid gcs url: {}, bucket is required", path),
62+
)
63+
})?;
64+
65+
let mut cfg = cfg.clone();
66+
cfg.bucket = bucket.to_string();
67+
Ok(Operator::from_config(cfg)?.finish())
68+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
services:
19+
gcs-server:
20+
image: fsouza/fake-gcs-server@sha256:36b0116fae5236e8def76ccb07761a9ca323e476f366a5f4bf449cac19deaf2d
21+
expose:
22+
- 4443
23+
command: --scheme http
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Integration tests for FileIO Google Cloud Storage (GCS).
19+
20+
use std::collections::HashMap;
21+
use std::net::SocketAddr;
22+
use std::sync::RwLock;
23+
24+
use bytes::Bytes;
25+
use ctor::{ctor, dtor};
26+
use iceberg::io::{FileIO, FileIOBuilder, GCS_NO_AUTH, GCS_SERVICE_PATH};
27+
use iceberg_test_utils::docker::DockerCompose;
28+
use iceberg_test_utils::{normalize_test_name, set_up};
29+
30+
static DOCKER_COMPOSE_ENV: RwLock<Option<DockerCompose>> = RwLock::new(None);
31+
static FAKE_GCS_PORT: u16 = 4443;
32+
static FAKE_GCS_BUCKET: &str = "test-bucket";
33+
34+
#[ctor]
35+
fn before_all() {
36+
let mut guard = DOCKER_COMPOSE_ENV.write().unwrap();
37+
let docker_compose = DockerCompose::new(
38+
normalize_test_name(module_path!()),
39+
format!("{}/testdata/file_io_gcs", env!("CARGO_MANIFEST_DIR")),
40+
);
41+
docker_compose.run();
42+
guard.replace(docker_compose);
43+
}
44+
45+
#[dtor]
46+
fn after_all() {
47+
let mut guard = DOCKER_COMPOSE_ENV.write().unwrap();
48+
guard.take();
49+
}
50+
51+
async fn get_file_io_gcs() -> FileIO {
52+
set_up();
53+
54+
let ip = DOCKER_COMPOSE_ENV
55+
.read()
56+
.unwrap()
57+
.as_ref()
58+
.unwrap()
59+
.get_container_ip("gcs-server");
60+
let addr = SocketAddr::new(ip, FAKE_GCS_PORT);
61+
62+
// A bucket must exist for FileIO
63+
create_bucket(FAKE_GCS_BUCKET, addr.to_string())
64+
.await
65+
.unwrap();
66+
67+
FileIOBuilder::new("gcs")
68+
.with_props(vec![
69+
(GCS_SERVICE_PATH, format!("http://{}", addr)),
70+
(GCS_NO_AUTH, "true".to_string()),
71+
])
72+
.build()
73+
.unwrap()
74+
}
75+
76+
// Create a bucket against the emulated GCS storage server.
77+
async fn create_bucket(name: &str, server_addr: String) -> anyhow::Result<()> {
78+
let mut bucket_data = HashMap::new();
79+
bucket_data.insert("name", name);
80+
81+
let client = reqwest::Client::new();
82+
let endpoint = format!("http://{}/storage/v1/b", server_addr);
83+
client.post(endpoint).json(&bucket_data).send().await?;
84+
Ok(())
85+
}
86+
87+
fn get_gs_path() -> String {
88+
format!("gs://{}", FAKE_GCS_BUCKET)
89+
}
90+
91+
#[tokio::test]
92+
async fn gcs_exists() {
93+
let file_io = get_file_io_gcs().await;
94+
assert!(file_io
95+
.is_exist(format!("{}/", get_gs_path()))
96+
.await
97+
.unwrap());
98+
}
99+
100+
#[tokio::test]
101+
async fn gcs_write() {
102+
let gs_file = format!("{}/write-file", get_gs_path());
103+
let file_io = get_file_io_gcs().await;
104+
let output = file_io.new_output(&gs_file).unwrap();
105+
output
106+
.write(bytes::Bytes::from_static(b"iceberg-gcs!"))
107+
.await
108+
.expect("Write to test output file");
109+
assert!(file_io.is_exist(gs_file).await.unwrap())
110+
}
111+
112+
#[tokio::test]
113+
async fn gcs_read() {
114+
let gs_file = format!("{}/read-gcs", get_gs_path());
115+
let file_io = get_file_io_gcs().await;
116+
let output = file_io.new_output(&gs_file).unwrap();
117+
output
118+
.write(bytes::Bytes::from_static(b"iceberg!"))
119+
.await
120+
.expect("Write to test output file");
121+
assert!(file_io.is_exist(&gs_file).await.unwrap());
122+
123+
let input = file_io.new_input(gs_file).unwrap();
124+
assert_eq!(input.read().await.unwrap(), Bytes::from_static(b"iceberg!"));
125+
}

0 commit comments

Comments
 (0)