Skip to content
This repository was archived by the owner on Jul 16, 2024. It is now read-only.
Open
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 33 additions & 31 deletions greenplumpython/experimental/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import io
import pathlib
import tarfile
import tempfile
import uuid
from typing import get_type_hints

Expand Down Expand Up @@ -120,37 +121,38 @@ def _install_on_server(pkg_dir: str, requirements: str) -> str:

def _install_packages(db: gp.Database, requirements: str):
tmp_archive_name = f"tar_{uuid.uuid4().hex}"
# FIXME: Windows client is not supported yet.
local_dir = pathlib.Path("/") / "tmp" / tmp_archive_name / "pip"
local_dir.mkdir(parents=True)
cmd = [
sys.executable,
"-m",
"pip",
"download",
"--requirement",
"/dev/stdin",
"--dest",
local_dir,
]
try:
sp.check_output(cmd, text=True, stderr=sp.STDOUT, input=requirements)
except sp.CalledProcessError as e:
raise e from Exception(e.stdout)
_archive_and_upload(tmp_archive_name, [local_dir.resolve()], db)
extracted = db.apply(lambda: _extract_files(tmp_archive_name, "root"), column_name="cache_dir")
assert len(list(extracted)) == 1
server_dir = (
pathlib.Path("/")
/ "tmp"
/ tmp_archive_name
/ "extracted"
/ local_dir.relative_to(local_dir.root)
)
installed = extracted.apply(
lambda _: _install_on_server(server_dir.as_uri(), requirements), column_name="result"
)
assert len(list(installed)) == 1
with tempfile.TemporaryDirectory() as local_dir:
local_dir_path = pathlib.Path(local_dir)
cmd = [
sys.executable,
"-m",
"pip",
"download",
"--requirement",
"/dev/stdin",
"--dest",
str(local_dir_path),
]
try:
sp.check_output(cmd, text=True, stderr=sp.STDOUT, input=requirements)
except sp.CalledProcessError as e:
raise e from Exception(e.stdout)
_archive_and_upload(tmp_archive_name, [local_dir], db)
extracted = db.apply(
lambda: _extract_files(tmp_archive_name, "root"), column_name="cache_dir"
)
assert len(list(extracted)) == 1
server_dir = (
pathlib.Path("/")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't this need to be changed as well?

Copy link
Contributor Author

@xuebinsu xuebinsu Oct 24, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed per your suggestion.

However, while CI is good so far, I have some concerns:

  • The code gets much more complicated. Due to lacking of the delete parameter in tempfile.TemporaryDirectory(), we need to pin the directory object to ensure that it will not be deconstructed and then pass the necessary info around to quite a few UDFs.
  • Debugging is harder. Before the change, randomness only exists on client side. That means, after the SQL statement is built, running it multiple times anywhere will give the same result. This is especially good for debugging. But with tempfile.TemporaryDirectory(), the directory path will change for every run and it is hard to tell which directory is created in which run.
  • Because the returned DataFrame is lazily evaluated, we cannot remove the temp directory at the end of from_files(), this can cause flaky issue if the directory is removed by Python's GC before the DataFrame is evaluated.
  • Depending on a env variable means that user need to restart the database server for the changes to take effect. This is not easy if the server continues ingesting data. Would it be better to customize with a GUC? This does not requires creating an extension on server. For example, on Greenplum we can set an undefined GUC and read it like this:
[gpadmin@localhost GreenplumPython]$ gpconfig -c abc.defg -v hello --skipvalidation
20231023:21:20:17:3893841 gpconfig:localhost:gpadmin-[INFO]:-completed successfully with parameters '-c abc.defg -v hello --skipvalidation'
[gpadmin@localhost GreenplumPython]$ gpstop -afr
[gpadmin@localhost GreenplumPython]$ psql
psql (12.12)
Type "help" for help.

gpadmin=# show abc.defg;
 abc.defg 
----------
 hello
(1 row)

/ "tmp"
/ tmp_archive_name
/ "extracted"
/ local_dir_path.relative_to(local_dir_path.root)
)
installed = extracted.apply(
lambda _: _install_on_server(server_dir.as_uri(), requirements), column_name="result"
)
assert len(list(installed)) == 1


setattr(gp.Database, "install_packages", _install_packages)