From dde16f40ede511eeb233512614b513270eeba502 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 21 Jan 2025 18:27:34 +0000 Subject: [PATCH 1/3] Add some debug logging to remote filesystems --- pydatalab/src/pydatalab/remote_filesystems.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pydatalab/src/pydatalab/remote_filesystems.py b/pydatalab/src/pydatalab/remote_filesystems.py index 2628268b6..562bdb252 100644 --- a/pydatalab/src/pydatalab/remote_filesystems.py +++ b/pydatalab/src/pydatalab/remote_filesystems.py @@ -36,6 +36,7 @@ def get_directory_structures( if not directories: return [] + LOGGER.debug("Retrieving directory structures for %s mounts", len(directories)) if parallel: return multiprocessing.Pool(max(min(len(directories), 8), 1)).map( functools.partial( @@ -45,7 +46,9 @@ def get_directory_structures( directories, ) else: - return [get_directory_structure(d, invalidate_cache=invalidate_cache) for d in directories] + result = [get_directory_structure(d, invalidate_cache=invalidate_cache) for d in directories] + LOGGER.debug("Returning directory structures") + return result def get_directory_structure( @@ -155,6 +158,7 @@ def get_directory_structure( dir_structure = [{"type": "error", "name": directory.name, "details": str(exc)}] last_updated = datetime.datetime.now(tz=datetime.timezone.utc) status = "error" + LOGGER.debug("Remote filesystems cache error for '%s': %s", directory.name, exc) finally: _release_lock_dir_structure(directory) From 0ad73926e2a4bf0837a97b1038d399d66327c24c Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Wed, 22 Jan 2025 17:00:39 +0000 Subject: [PATCH 2/3] Refactor remote syncing --- pydatalab/src/pydatalab/remote_filesystems.py | 37 +++++++++++-------- .../src/pydatalab/routes/v0_1/remotes.py | 3 ++ 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/pydatalab/src/pydatalab/remote_filesystems.py b/pydatalab/src/pydatalab/remote_filesystems.py index 562bdb252..2f84902f0 100644 --- a/pydatalab/src/pydatalab/remote_filesystems.py +++ b/pydatalab/src/pydatalab/remote_filesystems.py @@ -81,6 +81,8 @@ def get_directory_structure( LOGGER.debug(f"Accessing directory structure of {directory}") + cache_age = datetime.timedelta() + try: cached_dir_structure = _get_cached_directory_structure(directory) cache_last_updated = None @@ -96,6 +98,18 @@ def get_directory_structure( f"Not invalidating cache as its age ({cache_age=}) is less than the configured {CONFIG.REMOTE_CACHE_MIN_AGE=}." ) + rescan = False + if not cached_dir_structure: + rescan = True + LOGGER.debug("No cache found for %s", directory.name) + + elif cache_age > datetime.timedelta(minutes=CONFIG.REMOTE_CACHE_MAX_AGE): + rescan = True + LOGGER.debug("Dir should be invalidated as cache age is %s", cache_age) + + elif cache_age > datetime.timedelta(minutes=CONFIG.REMOTE_CACHE_MIN_AGE) and invalidate_cache: + rescan = True + LOGGER.debug("Dir should be invalidated as cache age is %s", cache_age) # If either: # 1) no cache for this directory, # 2) the cache is older than the max cache age and @@ -104,17 +118,12 @@ def get_directory_structure( # is older than the min age, # AND, if no other processes is updating the cache, # then rebuild the cache. - if ( - (not cached_dir_structure) - or ( - invalidate_cache is not False - and cache_age > datetime.timedelta(minutes=CONFIG.REMOTE_CACHE_MAX_AGE) - ) - or ( - invalidate_cache - and cache_age > datetime.timedelta(minutes=CONFIG.REMOTE_CACHE_MIN_AGE) + if rescan: + LOGGER.debug( + "Remote filesystems cache miss for '%s': last updated %s", + directory.name, + cache_last_updated, ) - ): owns_lock = _acquire_lock_dir_structure(directory) if owns_lock: dir_structure = _get_latest_directory_structure(directory.path, directory.hostname) @@ -123,11 +132,6 @@ def get_directory_structure( directory, dir_structure, ) - LOGGER.debug( - "Remote filesystems cache miss for '%s': last updated %s", - directory.name, - cache_last_updated, - ) status = "updated" else: if max_retries <= 0: @@ -148,9 +152,10 @@ def get_directory_structure( last_updated = last_updated.replace(tzinfo=datetime.timezone.utc) dir_structure = cached_dir_structure["contents"] LOGGER.debug( - "Remote filesystems cache hit for '%s': last updated %s", + "Remote filesystems cache hit for '%s': last updated %s, cache age %s", directory.name, last_updated, + cache_age, ) status = "cached" diff --git a/pydatalab/src/pydatalab/routes/v0_1/remotes.py b/pydatalab/src/pydatalab/routes/v0_1/remotes.py index e92a443ed..5b4237ccd 100644 --- a/pydatalab/src/pydatalab/routes/v0_1/remotes.py +++ b/pydatalab/src/pydatalab/routes/v0_1/remotes.py @@ -34,6 +34,7 @@ def list_remote_directories(): then it will be reconstructed. """ + from pydatalab.logger import LOGGER if not current_user.is_authenticated and not CONFIG.TESTING: return ( jsonify( @@ -60,6 +61,8 @@ def list_remote_directories(): 400, ) + LOGGER.debug("Delving into remote directories with invalidate_cache=%s", invalidate_cache) + all_directory_structures = get_directory_structures( CONFIG.REMOTE_FILESYSTEMS, invalidate_cache=invalidate_cache ) From 5d025add370e554c2369e1e725cc1f498c00b269 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Tue, 28 Jan 2025 15:16:30 +0000 Subject: [PATCH 3/3] Turn on parallel remote fs syncing --- pydatalab/src/pydatalab/remote_filesystems.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydatalab/src/pydatalab/remote_filesystems.py b/pydatalab/src/pydatalab/remote_filesystems.py index 2f84902f0..0121f55fb 100644 --- a/pydatalab/src/pydatalab/remote_filesystems.py +++ b/pydatalab/src/pydatalab/remote_filesystems.py @@ -15,7 +15,7 @@ def get_directory_structures( directories: List[RemoteFilesystem], invalidate_cache: Optional[bool] = None, - parallel: bool = False, + parallel: bool = True, ) -> List[Dict[str, Any]]: """For all registered top-level directories, call tree either locally or remotely to get their directory structures, or access