Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 27 additions & 18 deletions pydatalab/src/pydatalab/remote_filesystems.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
def get_directory_structures(
directories: List[RemoteFilesystem],
invalidate_cache: Optional[bool] = None,
parallel: bool = False,
parallel: bool = True,
) -> List[Dict[str, Any]]:
"""For all registered top-level directories, call tree either
locally or remotely to get their directory structures, or access
Expand All @@ -36,6 +36,7 @@ def get_directory_structures(
if not directories:
return []

LOGGER.debug("Retrieving directory structures for %s mounts", len(directories))
if parallel:
return multiprocessing.Pool(max(min(len(directories), 8), 1)).map(
functools.partial(
Expand All @@ -45,7 +46,9 @@ def get_directory_structures(
directories,
)
else:
return [get_directory_structure(d, invalidate_cache=invalidate_cache) for d in directories]
result = [get_directory_structure(d, invalidate_cache=invalidate_cache) for d in directories]
LOGGER.debug("Returning directory structures")
return result


def get_directory_structure(
Expand Down Expand Up @@ -78,6 +81,8 @@ def get_directory_structure(

LOGGER.debug(f"Accessing directory structure of {directory}")

cache_age = datetime.timedelta()

try:
cached_dir_structure = _get_cached_directory_structure(directory)
cache_last_updated = None
Expand All @@ -93,6 +98,18 @@ def get_directory_structure(
f"Not invalidating cache as its age ({cache_age=}) is less than the configured {CONFIG.REMOTE_CACHE_MIN_AGE=}."
)

rescan = False
if not cached_dir_structure:
rescan = True
LOGGER.debug("No cache found for %s", directory.name)

elif cache_age > datetime.timedelta(minutes=CONFIG.REMOTE_CACHE_MAX_AGE):
rescan = True
LOGGER.debug("Dir should be invalidated as cache age is %s", cache_age)

elif cache_age > datetime.timedelta(minutes=CONFIG.REMOTE_CACHE_MIN_AGE) and invalidate_cache:
rescan = True
LOGGER.debug("Dir should be invalidated as cache age is %s", cache_age)
# If either:
# 1) no cache for this directory,
# 2) the cache is older than the max cache age and
Expand All @@ -101,17 +118,12 @@ def get_directory_structure(
# is older than the min age,
# AND, if no other process is updating the cache,
# then rebuild the cache.
if (
(not cached_dir_structure)
or (
invalidate_cache is not False
and cache_age > datetime.timedelta(minutes=CONFIG.REMOTE_CACHE_MAX_AGE)
)
or (
invalidate_cache
and cache_age > datetime.timedelta(minutes=CONFIG.REMOTE_CACHE_MIN_AGE)
if rescan:
LOGGER.debug(
"Remote filesystems cache miss for '%s': last updated %s",
directory.name,
cache_last_updated,
)
):
owns_lock = _acquire_lock_dir_structure(directory)
if owns_lock:
dir_structure = _get_latest_directory_structure(directory.path, directory.hostname)
Expand All @@ -120,11 +132,6 @@ def get_directory_structure(
directory,
dir_structure,
)
LOGGER.debug(
"Remote filesystems cache miss for '%s': last updated %s",
directory.name,
cache_last_updated,
)
status = "updated"
else:
if max_retries <= 0:
Expand All @@ -145,16 +152,18 @@ def get_directory_structure(
last_updated = last_updated.replace(tzinfo=datetime.timezone.utc)
dir_structure = cached_dir_structure["contents"]
LOGGER.debug(
"Remote filesystems cache hit for '%s': last updated %s",
"Remote filesystems cache hit for '%s': last updated %s, cache age %s",
directory.name,
last_updated,
cache_age,
)
status = "cached"

except Exception as exc:
dir_structure = [{"type": "error", "name": directory.name, "details": str(exc)}]
last_updated = datetime.datetime.now(tz=datetime.timezone.utc)
status = "error"
LOGGER.debug("Remote filesystems cache error for '%s': %s", directory.name, exc)

finally:
_release_lock_dir_structure(directory)
Expand Down
3 changes: 3 additions & 0 deletions pydatalab/src/pydatalab/routes/v0_1/remotes.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def list_remote_directories():
then it will be reconstructed.

"""
from pydatalab.logger import LOGGER
if not current_user.is_authenticated and not CONFIG.TESTING:
return (
jsonify(
Expand All @@ -60,6 +61,8 @@ def list_remote_directories():
400,
)

LOGGER.debug("Delving into remote directories with invalidate_cache=%s", invalidate_cache)

all_directory_structures = get_directory_structures(
CONFIG.REMOTE_FILESYSTEMS, invalidate_cache=invalidate_cache
)
Expand Down
Loading