diff --git a/MODULE.bazel b/MODULE.bazel index a763404..d0a18aa 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -21,16 +21,17 @@ module( repo_name = "com_google_array_record", ) -bazel_dep(name = "rules_proto", version = "7.0.2") -bazel_dep(name = "rules_python", version = "1.0.0") +bazel_dep(name = "rules_proto", version = "7.1.0") +bazel_dep(name = "rules_python", version = "1.4.1") bazel_dep(name = "platforms", version = "0.0.11") bazel_dep(name = "protobuf", version = "31.1") bazel_dep(name = "googletest", version = "1.15.2") -bazel_dep(name = "abseil-cpp", version = "20250127.0") +bazel_dep(name = "abseil-cpp", version = "20250127.1") bazel_dep(name = "abseil-py", version = "2.1.0") bazel_dep(name = "eigen", version = "3.4.0.bcr.3") -bazel_dep(name = "riegeli", version = "0.0.0-20241218-3385e3c") +bazel_dep(name = "riegeli", version = "0.0.0-20250717-5b2e77e") bazel_dep(name = "pybind11_bazel", version = "2.12.0") +bazel_dep(name = "google_cloud_cpp", version = "3.0.0-rc0") SUPPORTED_PYTHON_VERSIONS = [ "3.10", diff --git a/python/BUILD b/python/BUILD index 6e99502..d247e61 100644 --- a/python/BUILD +++ b/python/BUILD @@ -20,6 +20,8 @@ pybind_extension( "@riegeli//riegeli/base:initializer", "@riegeli//riegeli/bytes:fd_reader", "@riegeli//riegeli/bytes:fd_writer", + "@riegeli//riegeli/gcs:gcs_object", + "@riegeli//riegeli/gcs:gcs_reader", ], ) diff --git a/python/array_record_module.cc b/python/array_record_module.cc index 2bff909..88b8961 100644 --- a/python/array_record_module.cc +++ b/python/array_record_module.cc @@ -22,6 +22,7 @@ limitations under the License. #include #include "absl/status/status.h" +#include "absl/strings/match.h" #include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "cpp/array_record_reader.h" @@ -34,6 +35,8 @@ limitations under the License. #include "riegeli/base/maker.h" #include "riegeli/bytes/fd_reader.h" #include "riegeli/bytes/fd_writer.h" +#include "riegeli/gcs/gcs_object.h" +#include "riegeli/gcs/gcs_reader.h" namespace py = pybind11; @@ -50,10 +53,13 @@ PYBIND11_MODULE(array_record_module, m) { throw py::value_error( std::string(status_or_option.status().message())); } + riegeli::FdWriterBase::Options file_writer_options; + file_writer_options.set_buffer_size(size_t{16} << 20); // Release the GIL because IO is time consuming. py::gil_scoped_release scoped_release; return new array_record::ArrayRecordWriter( - riegeli::Maker(path), + riegeli::Maker( + path, std::move(file_writer_options)), status_or_option.value()); }), py::arg("path"), py::arg("options") = "") @@ -84,18 +90,29 @@ PYBIND11_MODULE(array_record_module, m) { std::string(status_or_option.status().message())); } riegeli::FdReaderBase::Options file_reader_options; + riegeli::GcsReader::Options gcs_reader_options; if (kwargs.contains("file_reader_buffer_size")) { auto file_reader_buffer_size = kwargs["file_reader_buffer_size"].cast(); file_reader_options.set_buffer_size(file_reader_buffer_size); + gcs_reader_options.set_buffer_size(file_reader_buffer_size); } // Release the GIL because IO is time consuming. py::gil_scoped_release scoped_release; - return new array_record::ArrayRecordReader( - riegeli::Maker( - path, std::move(file_reader_options)), - status_or_option.value(), - array_record::ArrayRecordGlobalPool()); + if (absl::StartsWith(path, "gs://")) { + return new array_record::ArrayRecordReader( + riegeli::Maker( + google::cloud::storage::Client(), + riegeli::GcsObject(path), std::move(gcs_reader_options)), + status_or_option.value(), + array_record::ArrayRecordGlobalPool()); + } else { + return new array_record::ArrayRecordReader( + riegeli::Maker( + path, std::move(file_reader_options)), + status_or_option.value(), + array_record::ArrayRecordGlobalPool()); + } }), py::arg("path"), py::arg("options") = "", R"( ArrayRecordReader for fast sequential or random access.