From d9e2453ebee6be05d90ba3a4ce5b01f4ebdc4cc3 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Wed, 14 Jun 2023 11:24:35 -0700 Subject: [PATCH 01/33] ignore local files that make creep into image [PTV-1882] --- .dockerignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..532e66e4 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,3 @@ +# Even though the Dockerfile only copies over necessary files, certain +# artifacts may creep in that we should exclude from the image. +.DS_Store From ca6623652b0468491e5f09877253458b910777ef Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Wed, 14 Jun 2023 11:24:57 -0700 Subject: [PATCH 02/33] ignore more IDEs and convenient developer dir [PTV-1882] --- .gitignore | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 00b31a50..825fe74b 100644 --- a/.gitignore +++ b/.gitignore @@ -86,6 +86,7 @@ celerybeat-schedule .venv venv/ ENV/ +Pipfile # Spyder project settings .spyderproject @@ -100,8 +101,15 @@ ENV/ # mypy .mypy_cache/ +# IDEs +.idea +.fleet + +# misc + data/ .DS_Store .virtualenvs/ test.env -run_tests_single.sh \ No newline at end of file +run_tests_single.sh +local_work/ \ No newline at end of file From fce8e7c7b38e55ce678ee394de7f0113b104d608 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Wed, 14 Jun 2023 11:26:03 -0700 Subject: [PATCH 03/33] update dependencies, reorder alphabetically, adds pytest-asyncio (part of test fixes), gunicorn (part of server runtime improvements) [PTV-1882] --- requirements.txt | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/requirements.txt b/requirements.txt index e637e064..86ba2e17 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,19 @@ -uvloop==0.15.3 -aiohttp==3.7.4 +aiofiles==23.1.0 aiohttp_cors==0.7.0 -pytest==6.2.4 -aiofiles==0.3.1 -pytest-aiohttp==0.1.3 -coverage==4.4.2 -pytest-cov==2.5.1 -hypothesis==6.14.5 -globus-sdk==1.6.1 -python-dotenv==0.14.0 -frozendict==2.0.3 -pandas==1.3.2 -xlrd==2.0.1 -openpyxl==3.0.7 +aiohttp==3.8.4 +coverage==7.2.7 defusedxml==0.7.1 -python-magic==0.4.24 \ No newline at end of file +frozendict==2.3.8 +globus-sdk==3.20.1 +gunicorn==20.1.0 +hypothesis==6.76.0 +openpyxl==3.1.2 +pandas==2.0.2 +pytest-aiohttp==1.0.4 +pytest-asyncio==0.21.0 +pytest-cov==4.1.0 +pytest==7.3.1 +python-dotenv==1.0.0 +python-magic==0.4.27 +uvloop==0.17.0 +xlrd==2.0.1 # fairly out of date \ No newline at end of file From 5b277069baa61fa1e361520cfdac2c93be8a55a0 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Wed, 14 Jun 2023 11:27:32 -0700 Subject: [PATCH 04/33] Set of changes to support multiple fixes to upload, as well as necessary code fixes for clarity. [PTV-1882] All changed files are subject to black formatting, for uniformity and readability, and to remove linting errors. The Path class in utils was renamed to StagingPath as it conflicts with Python's path, and forced a renaming of that to PyPath or PathPy. Many changes in app.py, the home of all endpoints, and where half of the fixes occur. Refactored much of the config-related code into a new file config.py Hoisted many hard-coded values into constant-like variables at the module level. Moved app startup logic into main (from __main__) as it is now the entrypoint for gunicorn Applied stream-reading, md5-generation, text/binary recognition fixes to metadata.py. Look for PR comment to explain the changes in more detail. 
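The md5-generation fix mentioned above amounts to hashing the file incrementally over fixed-size chunks instead of loading it into memory in one read. A minimal synchronous sketch of that idea follows; the chunk size here is an illustrative default, and the real code in metadata.py is async and takes its read size from configuration:

    import hashlib

    def chunked_md5(file_path: str, chunk_size: int = 1 * 1000 * 1000) -> str:
        # Hash one chunk at a time so memory use stays bounded regardless of
        # how large the file is.
        md5 = hashlib.md5()
        with open(file_path, "rb") as handle:
            while True:
                chunk = handle.read(chunk_size)
                if not chunk:  # read() returns b"" at end of file
                    break
                md5.update(chunk)
        return md5.hexdigest()

In the actual patch the same read loop also feeds the line counter and the text/binary check, so each file is streamed only once.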
--- staging_service/JGIMetadata.py | 8 +- staging_service/__main__.py | 12 - staging_service/app.py | 423 +++++++++++++++++++++++---------- staging_service/config.py | 80 +++++++ staging_service/globus.py | 32 +-- staging_service/main.py | 19 ++ staging_service/metadata.py | 406 +++++++++++++++++++++++-------- staging_service/utils.py | 38 +-- 8 files changed, 741 insertions(+), 277 deletions(-) delete mode 100644 staging_service/__main__.py create mode 100644 staging_service/config.py create mode 100644 staging_service/main.py diff --git a/staging_service/JGIMetadata.py b/staging_service/JGIMetadata.py index 643be600..92825e16 100644 --- a/staging_service/JGIMetadata.py +++ b/staging_service/JGIMetadata.py @@ -1,13 +1,15 @@ -from .utils import Path +import os from json import JSONDecoder + import aiofiles -import os from aiohttp import web +from .utils import StagingPath + decoder = JSONDecoder() -async def read_metadata_for(path: Path): +async def read_metadata_for(path: StagingPath): if os.path.exists(path.jgi_metadata) and os.path.isfile(path.jgi_metadata): async with aiofiles.open(path.jgi_metadata, mode="r") as json: data = await json.read() diff --git a/staging_service/__main__.py b/staging_service/__main__.py deleted file mode 100644 index e1f31b14..00000000 --- a/staging_service/__main__.py +++ /dev/null @@ -1,12 +0,0 @@ -from .app import app_factory -from aiohttp import web -import asyncio -import os -import uvloop -import configparser - -asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) # for speed of event loop - -config = configparser.ConfigParser() -config.read(os.environ["KB_DEPLOYMENT_CONFIG"]) -web.run_app(app_factory(config), port=3000) diff --git a/staging_service/app.py b/staging_service/app.py index ac04fc16..373ffbdf 100644 --- a/staging_service/app.py +++ b/staging_service/app.py @@ -4,35 +4,42 @@ import shutil import sys from collections import defaultdict -from urllib.parse import parse_qs -from pathlib import Path as PathPy +from pathlib import Path +from urllib.parse import parse_qs, unquote, urlencode, urlunparse +import aiofiles import aiohttp_cors -from aiohttp import web - -from .app_error_formatter import format_import_spec_errors +from aiohttp import MultipartReader, web + +from staging_service.config import ( + UPLOAD_SAVE_STRATEGY_SAVE_TO_DESTINATION, + UPLOAD_SAVE_STRATEGY_TEMP_THEN_COPY, + get_config, get_max_content_length, + get_max_file_size, + get_read_chunk_size, + get_save_strategy, +) from .AutoDetectUtils import AutoDetectUtils from .JGIMetadata import read_metadata_for -from .auth2Client import KBaseAuth2 +from .app_error_formatter import format_import_spec_errors +from .autodetect.Mappings import CSV, EXCEL, TSV from .globus import assert_globusid_exists, is_globusid -from .metadata import some_metadata, dir_info, add_upa, similar -from .utils import Path, run_command, AclManager from .import_specifications.file_parser import ( ErrorType, FileTypeResolution, parse_import_specifications, ) -from .import_specifications.individual_parsers import parse_csv, parse_tsv, parse_excel from .import_specifications.file_writers import ( + ImportSpecWriteException, write_csv, - write_tsv, write_excel, - ImportSpecWriteException, + write_tsv, ) -from .autodetect.Mappings import CSV, TSV, EXCEL - +from .import_specifications.individual_parsers import parse_csv, parse_excel, parse_tsv +from .metadata import add_upa, dir_info, similar, some_metadata +from .utils import AclManager, StagingPath, auth_client, run_command 
-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +logging.basicConfig(stream=sys.stdout, level=logging.WARNING) routes = web.RouteTableDef() VERSION = "1.3.6" @@ -80,21 +87,27 @@ async def importer_mappings(request: web.Request) -> web.json_response: if len(file_list) == 0: raise web.HTTPBadRequest( text=f"must provide file_list field. Your provided qs: {request.query_string}", - ) + ) mappings = AutoDetectUtils.get_mappings(file_list) return web.json_response(data=mappings) -def _file_type_resolver(path: PathPy) -> FileTypeResolution: +def _file_type_resolver(path: Path) -> FileTypeResolution: fi = AutoDetectUtils.get_mappings([str(path)])["fileinfo"][0] # Here we assume that the first entry in the file_ext_type field is the entry # we want. Presumably secondary entries are less general. - ftype = fi['file_ext_type'][0] if fi['file_ext_type'] else None + ftype = fi["file_ext_type"][0] if fi["file_ext_type"] else None if ftype in _IMPSPEC_FILE_TO_PARSER: return FileTypeResolution(parser=_IMPSPEC_FILE_TO_PARSER[ftype]) else: - ext = fi['suffix'] if fi['suffix'] else path.suffix[1:] if path.suffix else path.name + ext = ( + fi["suffix"] + if fi["suffix"] + else path.suffix[1:] + if path.suffix + else path.name + ) return FileTypeResolution(unsupported_type=ext) @@ -113,17 +126,22 @@ async def bulk_specification(request: web.Request) -> web.json_response: files = [f.strip() for f in files if f.strip()] paths = {} for f in files: - p = Path.validate_path(username, f) - paths[PathPy(p.full_path)] = PathPy(p.user_path) + p = StagingPath.validate_path(username, f) + paths[Path(p.full_path)] = Path(p.user_path) # list(dict) returns a list of the dict keys in insertion order (py3.7+) res = parse_import_specifications( tuple(list(paths)), _file_type_resolver, - lambda e: logging.error("Unexpected error while parsing import specs", exc_info=e)) + lambda e: logging.error( + "Unexpected error while parsing import specs", exc_info=e + ), + ) if res.results: types = {dt: result.result for dt, result in res.results.items()} - files = {dt: {"file": str(paths[result.source.file]), "tab": result.source.tab} - for dt, result in res.results.items()} + files = { + dt: {"file": str(paths[result.source.file]), "tab": result.source.tab} + for dt, result in res.results.items() + } return web.json_response({"types": types, "files": files}) errtypes = {e.error for e in res.errors} errtext = json.dumps({"errors": format_import_spec_errors(res.errors, paths)}) @@ -168,29 +186,33 @@ async def write_bulk_specification(request: web.Request) -> web.json_response: # There should be a way to get aiohttp to handle this but I can't find it return _createJSONErrorResponse( f"Required content-type is {_APP_JSON}", - error_class=web.HTTPUnsupportedMediaType) + error_class=web.HTTPUnsupportedMediaType, + ) if not request.content_length: return _createJSONErrorResponse( "The content-length header is required and must be > 0", - error_class=web.HTTPLengthRequired) + error_class=web.HTTPLengthRequired, + ) # No need to check the max content length; the server already does that. 
See tests data = await request.json() if type(data) != dict: return _createJSONErrorResponse("The top level JSON element must be a mapping") - folder = data.get('output_directory') - type_ = data.get('output_file_type') + folder = data.get("output_directory") + type_ = data.get("output_file_type") if type(folder) != str: - return _createJSONErrorResponse("output_directory is required and must be a string") + return _createJSONErrorResponse( + "output_directory is required and must be a string" + ) writer = _IMPSPEC_FILE_TO_WRITER.get(type_) if not writer: return _createJSONErrorResponse(f"Invalid output_file_type: {type_}") - folder = Path.validate_path(username, folder) + folder = StagingPath.validate_path(username, folder) os.makedirs(folder.full_path, exist_ok=True) try: - files = writer(PathPy(folder.full_path), data.get("types")) + files = writer(Path(folder.full_path), data.get("types")) except ImportSpecWriteException as e: return _createJSONErrorResponse(e.args[0]) - new_files = {ty: str(PathPy(folder.user_path) / files[ty]) for ty in files} + new_files = {ty: str(Path(folder.user_path) / files[ty]) for ty in files} return web.json_response({"output_file_type": type_, "files_created": new_files}) @@ -202,8 +224,8 @@ def _createJSONErrorResponse(error_text: str, error_class=web.HTTPBadRequest): @routes.get("/add-acl-concierge") async def add_acl_concierge(request: web.Request): username = await authorize_request(request) - user_dir = Path.validate_path(username).full_path - concierge_path = f"{Path._CONCIERGE_PATH}/{username}/" + user_dir = StagingPath.validate_path(username).full_path + concierge_path = f"{StagingPath._CONCIERGE_PATH}/{username}/" aclm = AclManager() result = aclm.add_acl_concierge( shared_directory=user_dir, concierge_path=concierge_path @@ -211,16 +233,20 @@ async def add_acl_concierge(request: web.Request): result[ "msg" ] = f"Requesting Globus Perms for the following globus dir: {concierge_path}" - result[ - "link" - ] = f"https://app.globus.org/file-manager?destination_id={aclm.endpoint_id}&destination_path={concierge_path}" + + params = {"destination_id": aclm.endpoint_id, "destination_path": concierge_path} + + result["link"] = urlunparse( + ("https", "app.globus.org", "/file-manager", None, urlencode(params), None) + ) + return web.json_response(result) @routes.get("/add-acl") async def add_acl(request: web.Request): username = await authorize_request(request) - user_dir = Path.validate_path(username).full_path + user_dir = StagingPath.validate_path(username).full_path result = AclManager().add_acl(user_dir) return web.json_response(result) @@ -228,7 +254,7 @@ async def add_acl(request: web.Request): @routes.get("/remove-acl") async def remove_acl(request: web.Request): username = await authorize_request(request) - user_dir = Path.validate_path(username).full_path + user_dir = StagingPath.validate_path(username).full_path result = AclManager().remove_acl(user_dir) return web.json_response(result) @@ -253,14 +279,14 @@ async def file_lifetime(parameter_list): async def file_exists(request: web.Request): username = await authorize_request(request) query = request.match_info["query"] - user_dir = Path.validate_path(username) + user_dir = StagingPath.validate_path(username) try: show_hidden = request.query["showHidden"] if "true" == show_hidden or "True" == show_hidden: show_hidden = True else: show_hidden = False - except KeyError as no_query: + except KeyError: show_hidden = False # this scans the entire directory recursively just to see if one file exists... 
why? results = await dir_info(user_dir, show_hidden, query) @@ -282,7 +308,9 @@ async def list_files(request: web.Request): lists the contents of a directory and some details about them """ username = await authorize_request(request) - path = Path.validate_path(username, request.match_info.get("path", "")) + + path = StagingPath.validate_path(username, request.match_info.get("path", "")) + if not os.path.exists(path.full_path): raise web.HTTPNotFound( text="path {path} does not exist".format(path=path.user_path) @@ -291,15 +319,11 @@ async def list_files(request: web.Request): raise web.HTTPBadRequest( text="{path} is a file not a directory".format(path=path.full_path) ) - try: - show_hidden = request.query["showHidden"] - if "true" == show_hidden or "True" == show_hidden: - show_hidden = True - else: - show_hidden = False - except KeyError as no_query: - show_hidden = False + + show_hidden = request.query.get("showHidden", "false").lower() == "true" + data = await dir_info(path, show_hidden, recurse=True) + return web.json_response(data) @@ -309,7 +333,7 @@ async def download_files(request: web.Request): download a file """ username = await authorize_request(request) - path = Path.validate_path(username, request.match_info.get("path", "")) + path = StagingPath.validate_path(username, request.match_info.get("path", "")) if not os.path.exists(path.full_path): raise web.HTTPNotFound( text="path {path} does not exist".format(path=path.user_path) @@ -330,7 +354,7 @@ async def similar_files(request: web.Request): lists similar file path for given file """ username = await authorize_request(request) - path = Path.validate_path(username, request.match_info["path"]) + path = StagingPath.validate_path(username, request.match_info["path"]) if not os.path.exists(path.full_path): raise web.HTTPNotFound( text="path {path} does not exist".format(path=path.user_path) @@ -340,7 +364,7 @@ async def similar_files(request: web.Request): text="{path} is a directory not a file".format(path=path.full_path) ) - root = Path.validate_path(username, "") + root = StagingPath.validate_path(username, "") files = await dir_info(root, show_hidden=False, recurse=True) similar_files = list() @@ -363,14 +387,14 @@ async def search(request: web.Request): """ username = await authorize_request(request) query = request.match_info["query"] - user_dir = Path.validate_path(username) + user_dir = StagingPath.validate_path(username) try: show_hidden = request.query["showHidden"] if "true" == show_hidden or "True" == show_hidden: show_hidden = True else: show_hidden = False - except KeyError as no_query: + except KeyError: show_hidden = False results = await dir_info(user_dir, show_hidden, query) results.sort(key=lambda x: x["mtime"], reverse=True) @@ -384,7 +408,7 @@ async def get_metadata(request: web.Request): if it's a folder it returns stat data about the folder """ username = await authorize_request(request) - path = Path.validate_path(username, request.match_info["path"]) + path = StagingPath.validate_path(username, request.match_info["path"]) if not os.path.exists(path.full_path): raise web.HTTPNotFound( text="path {path} does not exist".format(path=path.user_path) @@ -398,80 +422,211 @@ async def get_jgi_metadata(request: web.Request): returns jgi metadata if associated with a file """ username = await authorize_request(request) - path = Path.validate_path(username, request.match_info["path"]) + path = StagingPath.validate_path(username, request.match_info["path"]) return web.json_response(await read_metadata_for(path)) 
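All of the handlers above resolve the user-supplied path through StagingPath.validate_path before touching the filesystem. A rough sketch of the mapping it produces, using made-up directory roots; in a real deployment _DATA_DIR and _META_DIR are injected from the service config:

    # Example values only; the real roots come from DATA_DIR and META_DIR in the config.
    StagingPath._DATA_DIR = "/data/bulk"
    StagingPath._META_DIR = "/data/metadata"

    path = StagingPath.validate_path("some_user", "genomes/ecoli.fna")
    assert path.user_path == "some_user/genomes/ecoli.fna"
    assert path.full_path == "/data/bulk/some_user/genomes/ecoli.fna"
    assert path.metadata_path == "/data/metadata/some_user/genomes/ecoli.fna"
    assert path.jgi_metadata == "/data/bulk/some_user/genomes/.ecoli.fna.jgi"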
-@routes.post("/upload") -async def upload_files_chunked(request: web.Request): - """ - uploads a file into the staging area +def _validate_filename(filename: str): """ - username = await authorize_request(request) - - if not request.has_body: - raise web.HTTPBadRequest(text="must provide destPath and uploads in body") - - reader = await request.multipart() - counter = 0 - user_file = None - destPath = None - while ( - counter < 100 - ): # TODO this is arbitrary to keep an attacker from creating infinite loop - # This loop handles the null parts that come in inbetween destpath and file - part = await reader.next() - - if part.name == "destPath": - destPath = await part.text() - elif part.name == "uploads": - user_file = part - break - else: - counter += 1 + Ensure that a given filename is acceptable for usage in the staging area. - if not (user_file and destPath): - raise web.HTTPBadRequest(text="must provide destPath and uploads in body") - - filename: str = user_file.filename + Raises http-response appropriate errors if the filename is invalid. + """ + # + # TODO: Are these really all the rules for filenames? Or are they just specific to the staging + # area? + # In any case, we should capture this in a separate function - ensure_valid_filename + # if filename.lstrip() != filename: raise web.HTTPForbidden( # forbidden isn't really the right code, should be 400 text="cannot upload file with name beginning with space" ) + if "," in filename: - raise web.HTTPForbidden( # for consistency we use 403 again + raise web.HTTPForbidden( # for consistency, we use 403 again text="cannot upload file with ',' in name" ) + # may want to make this configurable if we ever decide to add a hidden files toggle to # the staging area UI if filename.startswith("."): - raise web.HTTPForbidden( # for consistency we use 403 again + raise web.HTTPForbidden( # for consistency, we use 403 again text="cannot upload file with name beginning with '.'" ) - size = 0 - destPath = os.path.join(destPath, filename) - path = Path.validate_path(username, destPath) + +async def _handle_upload_save(stream: MultipartReader, destination_path: StagingPath): + """ + Handles the file upload stream from the /upload endpoint, saving the stream, ultimately, + to the given destination path. + + It honors a "save strategy", which is one of a set of defined methods for handling the upload. + See the function get_save_strategy() for details. + + It also honors the "chunk size", which is used for reading up to the chunk size from the stream + before saving. This constrains memory commitment, especially important for handling large + files and concurrent file upload. + """ + save_strategy = get_save_strategy() + chunk_size = get_read_chunk_size() + max_file_size = get_max_file_size() + if save_strategy == UPLOAD_SAVE_STRATEGY_TEMP_THEN_COPY: + async with aiofiles.tempfile.NamedTemporaryFile( + "wb", delete=True + ) as output_file: + # temp_file_name = output_file.name + actual_size = 0 + while True: + if actual_size > max_file_size: + raise web.HTTPBadRequest( + text=( + f"file size reached {actual_size} bytes which exceeds " + + f"the maximum allowed of {max_file_size} bytes" + ) + ) + + chunk = await stream.read_chunk(size=chunk_size) + + # Chunk is always a "bytes" object, but will be empty if there is nothing else + # to read, which is considered Falsy. 
+ if not chunk: + break + + actual_size += len(chunk) + + await output_file.write(chunk) + + async with aiofiles.tempfile.NamedTemporaryFile( + "rb", + delete=False, + dir=os.path.dirname(destination_path.full_path), + prefix=".upload.", + ) as copied_file: + shutil.copyfile(output_file.name, copied_file.name) + + shutil.move(copied_file.name, destination_path.full_path) + + elif save_strategy == UPLOAD_SAVE_STRATEGY_SAVE_TO_DESTINATION: + async with aiofiles.tempfile.NamedTemporaryFile( + "wb", + delete=False, + dir=os.path.dirname(destination_path.full_path), + prefix=".upload.", + ) as output_file: + temp_file_name = output_file.name + actual_size = 0 + while True: + if actual_size > max_file_size: + raise web.HTTPBadRequest( + text=( + f"file size reached {actual_size:,} bytes which exceeds " + + f"the maximum allowed of {max_file_size:,} bytes)" + ) + ) + + # A chunk size of 1MB seems to be a bit faster than the default of 8Kb. + chunk = await stream.read_chunk(size=chunk_size) + + # Chunk is always a "bytes" object, but will be empty if there is nothing else + # to read, which is considered Falsy. + if not chunk: + break + + actual_size += len(chunk) + + await output_file.write( + chunk, + ) + + shutil.move(temp_file_name, destination_path.full_path) + + +@routes.post("/upload") +async def upload_files_chunked(request: web.Request): + """ + uploads a file into the staging area + """ + # Ensures the current auth token is valid, and if so, returns the associated username. + # If not, will raise an exception + # Also, ensures that if there is a Globus account id available in the user's staging + # directory, it is valid. + username = await authorize_request(request) + + if not request.can_read_body: + raise web.HTTPBadRequest( + text="must provide destPath and uploads in body", + ) + + if request.content_length is None: + raise web.HTTPBadRequest(text="request must include a 'Content-Length' header") + + max_content_length = get_max_content_length() + if request.content_length > max_content_length: + raise web.HTTPBadRequest( + text=f"overall content length exceeds the maximum allowable of {max_content_length:,} bytes" + ) + + reader = await request.multipart() + + dest_path_part = await reader.next() + + # + # Get the destination path, which is actually the subdirectory in which the + # file should be placed within the user's staging directory. + # This field must be first, because the file must come last, in order to + # read the file stream. + # + if dest_path_part.name != "destPath": + raise web.HTTPBadRequest(text="must provide destPath in body") + + destination_directory = await dest_path_part.text() + + # + # Get the upload file part. We read the header bits, the get the + # read and save the file stream. + # + user_file_part = await reader.next() + + if user_file_part.name != "uploads": + raise web.HTTPBadRequest(text="must provide uploads in body") + + file_content_length = user_file_part.headers.get("content-length") + + filename: str = unquote(user_file_part.filename) + + _validate_filename(filename) + + destination_path = os.path.join(destination_directory, filename) + path = StagingPath.validate_path(username, destination_path) os.makedirs(os.path.dirname(path.full_path), exist_ok=True) - with open(path.full_path, "wb") as f: # TODO should we handle partial file uploads? - while True: - chunk = await user_file.read_chunk() - if not chunk: - break - size += len(chunk) - f.write(chunk) + # NB: errors result in an error response ... 
but the upload will be + # read until the end. This is just built into aiohttp web. + # From what I've read, it is considered unstable to have a server close + # the request stream if an error is encountered, thus aiohttp invokes the + # "release()" method to read the stream to completion, but does nothing with + # the results (other than read into the buffer and). + # It would require a more sophisticated implementation to work around this. + # E.g. an upload stream and a progress stream - they could be plain https + # requests - which would allow the progress stream to report errors which would + # then abort the upload stream. The upload stream could also spin upon an error, + # to avoid the useless bandwidth and cpu usage. This would also work with fetch, + # allowing us to ditch XHR. + await _handle_upload_save(user_file_part, path) + + # TODO: is this really necessary? Would a write failure possibly get here? if not os.path.exists(path.full_path): - error_msg = "We are sorry but upload was interrupted. Please try again.".format( - path=path.full_path - ) + error_msg = "We are sorry but upload was interrupted. Please try again." raise web.HTTPNotFound(text=error_msg) response = await some_metadata( path, + # these are fields to include in the result - the metadata stored + # in the file system has other fields too. desired_fields=["name", "path", "mtime", "size", "isFolder"], source="KBase upload", ) + return web.json_response([response]) @@ -481,22 +636,22 @@ async def define_UPA(request: web.Request): creates an UPA as a field in the metadata file corresponding to the filepath given """ username = await authorize_request(request) - path = Path.validate_path(username, request.match_info["path"]) + path = StagingPath.validate_path(username, request.match_info["path"]) if not os.path.exists(path.full_path or not os.path.isfile(path.full_path)): # TODO the security model here is to not care if someone wants to put in a false upa raise web.HTTPNotFound( text="no file found found on path {}".format(path.user_path) ) - if not request.has_body: - raise web.HTTPBadRequest(text="must provide UPA field in body") + if not request.can_read_body: + raise web.HTTPBadRequest(text="must provide 'UPA' field in body") body = await request.post() try: UPA = body["UPA"] - except KeyError as wrong_key: - raise web.HTTPBadRequest(text="must provide UPA field in body") + except KeyError: + raise web.HTTPBadRequest(text="must provide 'UPA' field in body") await add_upa(path, UPA) return web.Response( - text="succesfully updated UPA {UPA} for file {path}".format( + text="successfully updated UPA {UPA} for file {path}".format( UPA=UPA, path=path.user_path ) ) @@ -508,45 +663,51 @@ async def delete(request: web.Request): allows deletion of both directories and files """ username = await authorize_request(request) - path = Path.validate_path(username, request.match_info["path"]) + path = StagingPath.validate_path(username, request.match_info["path"]) + # make sure directory isn't home if path.user_path == username: raise web.HTTPForbidden(text="cannot delete home directory") + if is_globusid(path, username): raise web.HTTPForbidden(text="cannot delete protected file") + if os.path.isfile(path.full_path): os.remove(path.full_path) if os.path.exists(path.metadata_path): os.remove(path.metadata_path) + elif os.path.isdir(path.full_path): shutil.rmtree(path.full_path) if os.path.exists(path.metadata_path): shutil.rmtree(path.metadata_path) + else: raise web.HTTPNotFound( text="could not delete 
{path}".format(path=path.user_path) ) + return web.Response(text="successfully deleted {path}".format(path=path.user_path)) @routes.patch("/mv/{path:.+}") async def rename(request: web.Request): username = await authorize_request(request) - path = Path.validate_path(username, request.match_info["path"]) + path = StagingPath.validate_path(username, request.match_info["path"]) # make sure directory isn't home if path.user_path == username: raise web.HTTPForbidden(text="cannot rename or move home directory") if is_globusid(path, username): raise web.HTTPForbidden(text="cannot rename or move protected file") - if not request.has_body: + if not request.can_read_body: raise web.HTTPBadRequest(text="must provide newPath field in body") body = await request.post() try: new_path = body["newPath"] - except KeyError as wrong_key: + except KeyError: raise web.HTTPBadRequest(text="must provide newPath field in body") - new_path = Path.validate_path(username, new_path) + new_path = StagingPath.validate_path(username, new_path) if os.path.exists(path.full_path): if not os.path.exists(new_path.full_path): shutil.move(path.full_path, new_path.full_path) @@ -554,7 +715,7 @@ async def rename(request: web.Request): shutil.move(path.metadata_path, new_path.metadata_path) else: raise web.HTTPConflict( - text="{new_path} allready exists".format(new_path=new_path.user_path) + text="{new_path} already exists".format(new_path=new_path.user_path) ) else: raise web.HTTPNotFound(text="{path} not found".format(path=path.user_path)) @@ -568,7 +729,7 @@ async def rename(request: web.Request): @routes.patch("/decompress/{path:.+}") async def decompress(request: web.Request): username = await authorize_request(request) - path = Path.validate_path(username, request.match_info["path"]) + path = StagingPath.validate_path(username, request.match_info["path"]) # make sure the file can be decompressed filename, file_extension = os.path.splitext(path.full_path) filename, upper_file_extension = os.path.splitext(filename) @@ -578,11 +739,11 @@ async def decompress(request: web.Request): # 3 just overwrite and force destination = os.path.dirname(path.full_path) if ( - upper_file_extension == ".tar" and file_extension == ".gz" + upper_file_extension == ".tar" and file_extension == ".gz" ) or file_extension == ".tgz": await run_command("tar", "xzf", path.full_path, "-C", destination) elif upper_file_extension == ".tar" and ( - file_extension == ".bz" or file_extension == ".bz2" + file_extension == ".bz" or file_extension == ".bz2" ): await run_command("tar", "xjf", path.full_path, "-C", destination) elif file_extension == ".zip" or file_extension == ".ZIP": @@ -612,7 +773,7 @@ async def authorize_request(request): else: # this is a hack for prod because kbase_session won't get shared with the kbase.us domain token = request.cookies.get("kbase_session_backup") - username = await auth_client.get_user(token) + username = await auth_client().get_user(token) await assert_globusid_exists(username, token) return username @@ -641,17 +802,19 @@ def inject_config_dependencies(config): os.path.join(os.getcwd(), FILE_EXTENSION_MAPPINGS) ) - Path._DATA_DIR = DATA_DIR - Path._META_DIR = META_DIR - Path._CONCIERGE_PATH = CONCIERGE_PATH + StagingPath._DATA_DIR = DATA_DIR + StagingPath._META_DIR = META_DIR + StagingPath._CONCIERGE_PATH = CONCIERGE_PATH - if Path._DATA_DIR is None: + # TODO: why make these attributes "private" if they are not used that way? + # either provide a static setter method, or make them public, I'd say. 
+ if StagingPath._DATA_DIR is None: raise Exception("Please provide DATA_DIR in the config file ") - if Path._META_DIR is None: + if StagingPath._META_DIR is None: raise Exception("Please provide META_DIR in the config file ") - if Path._CONCIERGE_PATH is None: + if StagingPath._CONCIERGE_PATH is None: raise Exception("Please provide CONCIERGE_PATH in the config file ") if FILE_EXTENSION_MAPPINGS is None: @@ -664,8 +827,8 @@ def inject_config_dependencies(config): # if we start using the file ext type array for anything else this might need changes filetype = val["file_ext_type"][0] extensions[filetype].add(fileext) - for m in val['mappings']: - datatypes[m['id']].add(filetype) + for m in val["mappings"]: + datatypes[m["id"]].add(filetype) global _DATATYPE_MAPPINGS _DATATYPE_MAPPINGS = { "datatype_to_filetype": {k: sorted(datatypes[k]) for k in datatypes}, @@ -673,9 +836,13 @@ def inject_config_dependencies(config): } -def app_factory(config): +def app_factory(): app = web.Application(middlewares=[web.normalize_path_middleware()]) app.router.add_routes(routes) + + # TODO: IMO (eap) cors should not be configured here, as this app should + # never be exposed directly to the public. The proxy should control + # the CORS policy. cors = aiohttp_cors.setup( app, defaults={ @@ -688,8 +855,8 @@ def app_factory(config): for route in list(app.router.routes()): cors.add(route) + # Saves config values into various global things; see the implementation. + config = get_config() inject_config_dependencies(config) - global auth_client - auth_client = KBaseAuth2(config["staging_service"]["AUTH_URL"]) return app diff --git a/staging_service/config.py b/staging_service/config.py new file mode 100644 index 00000000..83f88486 --- /dev/null +++ b/staging_service/config.py @@ -0,0 +1,80 @@ +import os + +import configparser +from aiohttp import web + +DEFAULT_UPLOAD_MAX_FILE_SIZE = 5 * 1000 * 1000 * 1000 # aka, 5GB +DEFAULT_UPLOAD_READ_CHUNK_SIZE = 1 * 1000 * 1000 +UPLOAD_SAVE_STRATEGY_TEMP_THEN_COPY = "SAVE_STRATEGY_TEMP_THEN_COPY" +UPLOAD_SAVE_STRATEGY_SAVE_TO_DESTINATION = "SAVE_STRATEGY_SAVE_TO_DESTINATION" +DEFAULT_UPLOAD_SAVE_STRATEGY = UPLOAD_SAVE_STRATEGY_SAVE_TO_DESTINATION +# THe content length tweak represents the extra header overload for a request, which consists +# of the "fileDest" field and the header section of the upload file. 1K seems reasonable, and +# the whole business of capping the file size is approximate in any case. +# Casual testing shows 272 bytes for Chrome, without the actual destination file and file's +# filename, so 1728 bytes for the path and filename seems quite generous. +CONTENT_LENGTH_TWEAK = 2000 + + +_CONFIG = None + +def get_read_chunk_size() -> int: + """ + Returns the configured chunk size for reading from the + upload request stream. + """ + chunk_size = os.environ.get("UPLOAD_READ_CHUNK_SIZE") + if chunk_size is None: + return DEFAULT_UPLOAD_READ_CHUNK_SIZE + try: + return int(chunk_size) + except ValueError as ve: + raise web.HTTPInternalServerError( + text=f"Error parsing UPLOAD_READ_CHUNK_SIZE environment variable: {str(ve)}" + ) + + +def get_save_strategy() -> str: + """ + Returns the "save strategy" as defined by the environment variable "UPLOAD_SAVE_STRATEGY", + as well as the default value DEFAULT_SAVE_STRATEGY defined at the top of this file. 
+ """ + save_strategy = os.environ.get("UPLOAD_SAVE_STRATEGY") + + if save_strategy is None: + return DEFAULT_UPLOAD_SAVE_STRATEGY + + if save_strategy not in [ + UPLOAD_SAVE_STRATEGY_TEMP_THEN_COPY, + UPLOAD_SAVE_STRATEGY_SAVE_TO_DESTINATION, + ]: + raise web.HTTPInternalServerError( + text=f"Unsupported save strategy in configuration: '{save_strategy}'" + ) + + return save_strategy + + +def get_max_file_size() -> int: + max_file_size = os.environ.get("UPLOAD_MAX_FILE_SIZE") + if max_file_size is None: + return DEFAULT_UPLOAD_MAX_FILE_SIZE + try: + return int(max_file_size) + except ValueError as ve: + raise web.HTTPInternalServerError( + text=f"Error parsing UPLOAD_MAX_FILE_SIZE environment variable: {str(ve)}" + ) + + +def get_max_content_length() -> int: + max_file_size = get_max_file_size() + return max_file_size + CONTENT_LENGTH_TWEAK + + +def get_config(): + global _CONFIG + if _CONFIG is None: + _CONFIG = configparser.ConfigParser() + _CONFIG.read(os.environ["KB_DEPLOYMENT_CONFIG"]) + return _CONFIG diff --git a/staging_service/globus.py b/staging_service/globus.py index 54b41fb4..8cc9cc00 100644 --- a/staging_service/globus.py +++ b/staging_service/globus.py @@ -1,8 +1,10 @@ -from .utils import Path -import aiohttp -import aiofiles -import os import configparser +import os + +import aiofiles +import aiohttp + +from .utils import StagingPath def _get_authme_url(): @@ -18,6 +20,7 @@ def _get_authme_url(): async def _get_globus_ids(token): if not token: raise aiohttp.web.HTTPBadRequest(text="must supply token") + async with aiohttp.ClientSession() as session: auth2_me_url = _get_authme_url() async with session.get(auth2_me_url, headers={"Authorization": token}) as resp: @@ -37,30 +40,31 @@ async def _get_globus_ids(token): def _globus_id_path(username: str): - return Path.validate_path(username, ".globus_id") + return StagingPath.validate_path(username, ".globus_id") -def is_globusid(path: Path, username: str): +def is_globusid(path: StagingPath, username: str): return path.full_path == _globus_id_path(username) async def assert_globusid_exists(username, token): - """ ensures that a globus id exists if there is a valid one for user""" - - # make root dir - root = Path.validate_path(username, "") - if not os.path.exists(root.full_path): - os.makedirs(root.full_path) + """ensures that a globus id exists if there is a valid one for user""" path = _globus_id_path(username) - # check to see if file exists or is empty + + # Ensure the path to the globus file exists. In a deployment, this is the + # user's staging directory. 
+ os.makedirs(os.path.dirname(path.full_path), exist_ok=True) + + # Create the globus if not os.path.exists(path.full_path) or os.stat(path.full_path).st_size == 0: globus_ids = await _get_globus_ids(token) if len(globus_ids) == 0: return + # TODO in the future this should support writing multiple lines # such as the commented code below, for multiple linked accounts # text = '\n'.join(globus_ids) text = globus_ids[0] async with aiofiles.open(path.full_path, mode="w") as globus_file: - await globus_file.writelines(text) + await globus_file.write(text) diff --git a/staging_service/main.py b/staging_service/main.py new file mode 100644 index 00000000..db26e107 --- /dev/null +++ b/staging_service/main.py @@ -0,0 +1,19 @@ +import asyncio +# import configparser +# import os + +import uvloop + +from staging_service.app import app_factory + +# from aiohttp import web + +asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) # for speed of event loop + +# config = configparser.ConfigParser() +# config.read(os.environ["KB_DEPLOYMENT_CONFIG"]) +# web.run_app(app_factory(config), port=5000) + + +async def web_app(): + return app_factory() diff --git a/staging_service/metadata.py b/staging_service/metadata.py index 110a7716..5bea154a 100644 --- a/staging_service/metadata.py +++ b/staging_service/metadata.py @@ -1,18 +1,25 @@ +import hashlib +import os +import unicodedata +from difflib import SequenceMatcher from json import JSONDecoder, JSONEncoder + import aiofiles -from .utils import run_command, Path -import os from aiohttp import web -import hashlib -from difflib import SequenceMatcher -from itertools import islice + +from .utils import StagingPath decoder = JSONDecoder() encoder = JSONEncoder() +READ_BUFFER_SIZE = buffer_size = 1 * 1000 * 1000 +FILE_SNIPPET_SIZE = 1024 + -async def stat_data(path: Path) -> dict: +async def stat_data(path: StagingPath) -> dict: """ + Returns our version of file stats. + only call this on a validated full path """ file_stats = os.stat(path.full_path) @@ -27,106 +34,271 @@ async def stat_data(path: Path) -> dict: def _file_read_from_tail(file_path): - upper_bound = min(1024, os.stat(file_path).st_size) - with open(file_path, "r") as file: + """ + Returns a snippet of text from the end of the given file. + + The snippet will be either FILE_SNIPPET_SIZE or the + entire file if it is smaller than that. + + If the file is not proper utf-8 encoded, an error message + will be returned instead. + """ + upper_bound = min(FILE_SNIPPET_SIZE, os.stat(file_path).st_size) + with open(file_path, "r", encoding="utf-8") as file: file.seek(0, os.SEEK_END) file.seek(file.tell() - upper_bound, os.SEEK_SET) - return file.read() + try: + return file.read() + except UnicodeError: + # TODO: should just return None, and let the + # ui determine the message. + return "not a text file" def _file_read_from_head(file_path): - with open(file_path, "r") as file: - return file.read(1024) + """ + Returns a snippet of text from the beginning of the given file. + The snippet will be either FILE_SNIPPET_SIZE or the + entire file if it is smaller than that. 
-async def _generate_metadata(path: Path, source: str): - os.makedirs(os.path.dirname(path.metadata_path), exist_ok=True) - if os.path.exists(path.metadata_path): - async with aiofiles.open(path.metadata_path, mode="r") as extant: - data = await extant.read() - data = decoder.decode(data) - else: - data = {} - # first ouptut of md5sum is the checksum - if source: - data["source"] = source + If the file is not proper utf-8 encoded, an error message + will be returned instead. + """ + with open(file_path, "r", encoding="utf-8") as file: + try: + return file.read(FILE_SNIPPET_SIZE) + except UnicodeError: + # TODO: should just return None, and let the + # ui determine the message. + return "not a text file" + + +async def _read_metadata(metadata_path: str): + """ + Reads the file at the provided path, and decodes to JSON. + """ + async with aiofiles.open(metadata_path, mode="r", encoding="utf-8") as extant: + data = await extant.read() + return decoder.decode(data) + + +def is_text_string(bytes_to_check): + """ + Determines if the provided bytes is indeed a text string. + + It is considered a valid string if it is utf-8 encoded and + does not contain control characters. + + This determination is for the purpose of determining whether to + generate text snippets and report lines of text. + """ try: - md5 = hashlib.md5(open(path.full_path, "rb").read()).hexdigest() - except: - md5 = "n/a" - - data["md5"] = md5.split()[0] - # first output of wc is the count - data["lineCount"] = str(sum((1 for i in open(path.full_path, "rb")))) - try: # all things that expect a text file to decode output should be in this block - data["head"] = _file_read_from_head(path.full_path) - data["tail"] = _file_read_from_tail(path.full_path) - except: - data["head"] = "not text file" - data["tail"] = "not text file" - async with aiofiles.open(path.metadata_path, mode="w") as f: - await f.writelines(encoder.encode(data)) - return data + # If it decodes, it is indeed Unicode. + decoded = bytes_to_check.decode("utf-8") + + # But it may not be text, as control characters may be + # in there, and they are unlikely to be part of + # readable text, which is what we are after here, as we + # use this to enable line counting and snippets. + # We allow TAB (9), LF (10), CR (13) for obvious reasons. + for character in decoded: + if unicodedata.category(character) == "Cc" and not ord(character) in [ + 9, + 10, + 13, + ]: + return False + return True + except UnicodeError: + return False + + +async def _generate_metadata(path: StagingPath, source: str = None): + """ + Generates a metadata file for the associated data file. -async def add_upa(path: Path, UPA: str): + Note that it generates it unconditionally. For conditional generation, see _ensure_metadata. + """ + # Open metadatafile if it exists, otherwise create an + # empty dict and ensure there is a writable directory for it + # to be saved to. if os.path.exists(path.metadata_path): - async with aiofiles.open(path.metadata_path, mode="r") as extant: - data = await extant.read() - data = decoder.decode(data) + existing_metadata = await _read_metadata(path.metadata_path) + else: + os.makedirs(os.path.dirname(path.metadata_path), exist_ok=True) + existing_metadata = {} + + additional_metadata = {} + + # We record the modification time for the file, in milliseconds, in order + # to be able to determine if it has changed since the metadata was last + # generated. 
+ file_stats = os.stat(path.full_path) + additional_metadata["mtime"] = int(file_stats.st_mtime * 1000) + + # If source is missing (why would that be?), supply it with the one + # provided, or attempt to determine it. + if "source" not in existing_metadata: + if source is None: + source = _determine_source(path) + additional_metadata["source"] = source + + metadata = existing_metadata | additional_metadata + + # + # We want to know, roughly, if this is a text file or not ("binary"); + # setting to true initially is just part of the logic in the loop below. + # + is_text = True + + # A word on counting lines. We simply count line endings. With a catch. + # The final "line" may be of two forms. Either as counted here, with the + # last line ending being at the very end of the file, or the last bytes + # after the last line ending if the file ends without a terminal line ending. + # The latter adds, unfortunately, extra code and complexity. + line_count = 0 + + # Keeping the previous chunk of bytes read from the file is necessary for accurate + # line counting. + last_chunk_read = None + + chunk_count = 0 + + size = 0 + + md5 = hashlib.md5() + async with aiofiles.open(path.full_path, "rb") as fin: + while True: + chunk = await fin.read(READ_BUFFER_SIZE) + + # + # If attempt to read past end of file, a 0-length bytes is returned + # + if len(chunk) == 0: + break + + last_chunk_read = chunk + + size += len(chunk) + + chunk_count += 1 + + md5.update(chunk) + + # + # We use the first chunk to determine if the file is valid utf-8 text. + # The first chunk is 1MB, so this should be safe. + # + if chunk_count == 1: + is_text = is_text_string(chunk) + + # + # Let us not waste time counting lines if we already think this is not + # text. + # + if is_text: + line_count += chunk.count(b"\n") + + # + # If the last chunk does not end in a newline (LF), then it has a last line + # which is terminated by the end of the file. + # + if last_chunk_read is not None and last_chunk_read[-1] != ord(b"\n"): + line_count += 1 + + if is_text: + metadata["lineCount"] = line_count else: - data = await _generate_metadata(path) # TODO performance optimization - data["UPA"] = UPA - os.makedirs(os.path.dirname(path.metadata_path), exist_ok=True) - async with aiofiles.open(path.metadata_path, mode="w") as update: - await update.writelines(encoder.encode(data)) + metadata["lineCount"] = None + + metadata["md5"] = md5.hexdigest() + + if is_text: + metadata["head"] = _file_read_from_head(path.full_path) + metadata["tail"] = _file_read_from_tail(path.full_path) + else: + metadata["head"] = "not a text file" + metadata["tail"] = "not a text file" + + # Save metadata file + await _save_metadata(path, metadata) + + return metadata + + +async def _save_metadata(path: StagingPath, metadata: dict): + """ + Saves the given metadata dictionary into the metadata file associated with the given path + """ + async with aiofiles.open(path.metadata_path, mode="w") as f: + await f.writelines(encoder.encode(metadata)) + + +async def _update_metadata(path: StagingPath, additional_metadata: dict) -> dict: + """ + Updates the metadata indicated by the path with the additional metadata + provided. + + Assumes the metadata file exists, so it should always be called after + _ensure_metadata. 
+ """ + metadata = await _read_metadata(path.metadata_path) + + new_metdata = metadata | additional_metadata + + await _save_metadata(path, new_metdata) + + return new_metdata -def _determine_source(path: Path): +async def add_upa(path: StagingPath, UPA: str): + """ + Adds the provided UPA to the metadata for the given path. + + If the metadata does not yet exist, the metadata is generated. + """ + await _ensure_metadata(path) + await _update_metadata(path, {"UPA": UPA}) + + +def _determine_source(path: StagingPath): """ tries to determine the source of a file for which the source is unknown currently this works for JGI imported files only """ jgi_path = path.jgi_metadata - if os.path.exists(jgi_path) and os.path.isfile(jgi_path): + if ( + os.path.exists(jgi_path) + and os.path.isfile(jgi_path) + ): return "JGI import" else: return "Unknown" -async def _only_source(path: Path): - if os.path.exists(path.metadata_path): - async with aiofiles.open(path.metadata_path, mode="r") as extant: - try: - data = await extant.read() - data = decoder.decode(data) - except: - data = {} - else: - data = {} - if "source" in data: - return data["source"] - else: - data["source"] = _determine_source(path) - os.makedirs(os.path.dirname(path.metadata_path), exist_ok=True) - async with aiofiles.open(path.metadata_path, mode="w") as update: - await update.writelines(encoder.encode(data)) - return data["source"] - - async def dir_info( - path: Path, show_hidden: bool, query: str = "", recurse=True + path: StagingPath, show_hidden: bool, query: str = "", recurse=True ) -> list: """ - only call this on a validated full path + Returns a directory listing of the given path. + + If a query is provided, only files containing it will be included. + If show_hidden is True, any files with a "." prefix ("hidden files"), will + be included, otherwise they are omitted + If recurse is True, all subdirectories will be included. + + Only call this on a validated full path. """ response = [] for entry in os.scandir(path.full_path): - specific_path = Path.from_full_path(entry.path) + specific_path = StagingPath.from_full_path(entry.path) # maybe should never show the special .globus_id file ever? 
# or moving it somewhere outside the user directory would be better - if not show_hidden and entry.name.startswith('.'): + if not show_hidden and entry.name.startswith("."): continue + if entry.is_dir(): if query == "" or specific_path.user_path.find(query) != -1: response.append(await stat_data(specific_path)) @@ -134,11 +306,14 @@ async def dir_info( response.extend( await dir_info(specific_path, show_hidden, query, recurse) ) - if entry.is_file(): + elif entry.is_file(): if query == "" or specific_path.user_path.find(query) != -1: - data = await stat_data(specific_path) - data["source"] = await _only_source(specific_path) - response.append(data) + metadata, file_stats = await _ensure_metadata(specific_path) + # data = await stat_data(specific_path) + # data["source"] = await _only_source(specific_path) + file_stats["source"] = metadata["source"] + response.append(file_stats) + return response @@ -152,39 +327,59 @@ async def similar(file_name, comparing_file_name, similarity_cut_off): return matcher.ratio() >= similarity_cut_off -async def some_metadata(path: Path, desired_fields=False, source=None): +async def _ensure_metadata(path: StagingPath, source: str = None): """ - if desired fields isn't given as a list all fields will be returned - assumes full_path is valid path to a file - valid fields for desired_fields are: - md5, lineCount, head, tail, name, path, mtime, size, isFolder, UPA + Returns metdata if found, otherwise generates and returns it for the given file """ file_stats = await stat_data(path) - if file_stats["isFolder"]: - return file_stats - if (not os.path.exists(path.metadata_path)) or ( - os.stat(path.metadata_path).st_mtime < file_stats["mtime"] / 1000 - ): - # if metadata does not exist or older than file: regenerate - if source is None: # TODO BUGFIX this will overwrite any source in the file - source = _determine_source(path) - data = await _generate_metadata(path, source) - else: # metadata already exists and is up to date - async with aiofiles.open(path.metadata_path, mode="r") as f: - # make metadata fields local variables - data = await f.read() - data = decoder.decode(data) - # due to legacy code, some file has corrupted metadata file - # Also if a file is listed or checked for existence before the upload completes - # this code block will be triggered - expected_keys = ["source", "md5", "lineCount", "head", "tail"] - if not set(expected_keys) <= set(data.keys()): - if source is None and "source" not in data: - source = _determine_source(path) - data = await _generate_metadata(path, source) - data = {**data, **file_stats} + + if os.path.exists(path.metadata_path): + metadata = await _read_metadata(path.metadata_path) + + # + # These are required properties of the metadata, so if any are missing, + # due to corrupted metadata, old metadata generated during development, + # or due to updates in the metadata format, this ensures that we trigger + # a metadata rebuild. + # + built_in_keys = set(["source", "md5", "lineCount", "head", "tail", "mtime"]) + + # + # Note that we are checking if the built in keys are included in the metadata, + # not that they are the complete set of metadata (although they may be). + # + if set(built_in_keys) <= set(metadata.keys()): + # + # This is why we store the modification time of the file. If the + # file has been changed (as measured by the modification time), we + # regenerate the metadata. 
+ # + if metadata["mtime"] == file_stats["mtime"]: + return metadata, file_stats + + metadata = await _generate_metadata(path, source) + + return metadata, file_stats + + +async def some_metadata(path: StagingPath, desired_fields=False, source=None): + """ + Returns metadata with file stats merged, optionally filtered by a set of + desired fields. + + If the desired fields are not provided, all fields will be returned. + Valid fields for desired_fields are: + md5, lineCount, head, tail, source, name, path, mtime, size, isFolder, UPA + + Assumes full_path is valid path to a file + """ + metadata, file_stats = await _ensure_metadata(path, source) + + data = metadata | file_stats + if not desired_fields: return data + result = {} for key in desired_fields: try: @@ -193,4 +388,5 @@ async def some_metadata(path: Path, desired_fields=False, source=None): raise web.HTTPBadRequest( text="no data exists for key {key}".format(key=no_data.args) ) # TODO check this exception message + return result diff --git a/staging_service/utils.py b/staging_service/utils.py index 79841d65..da303681 100644 --- a/staging_service/utils.py +++ b/staging_service/utils.py @@ -4,14 +4,20 @@ import logging import os +from staging_service.auth2Client import KBaseAuth2 import globus_sdk from aiohttp.web import HTTPInternalServerError, HTTPOk +from staging_service.config import get_config + + +def auth_client(): + return KBaseAuth2(get_config()["staging_service"]["AUTH_URL"]) async def run_command(*args): """Run command in subprocess Example from: - http://asyncio.readthedocs.io/en/latest/subprocess.html + https://asyncio.readthedocs.io/en/latest/subprocess.html """ # Create subprocess process = await asyncio.create_subprocess_exec( @@ -31,15 +37,17 @@ async def run_command(*args): if process.returncode == 0: return stdout.decode().strip() else: - error_msg = "command {cmd} failed\nreturn code: {returncode}\nerror: {error}".format( - cmd=" ".join(args), - returncode=process.returncode, - error=stderr.decode().strip(), + error_msg = ( + "command {cmd} failed\nreturn code: {returncode}\nerror: {error}".format( + cmd=" ".join(args), + returncode=process.returncode, + error=stderr.decode().strip(), + ) ) raise HTTPInternalServerError(text=error_msg) -class Path(object): +class StagingPath(object): _META_DIR = None # expects to be set by config _DATA_DIR = None # expects to be set by config _CONCIERGE_PATH = None # expects to be set by config @@ -68,22 +76,22 @@ def validate_path(username: str, path: str = ""): while path.startswith("/"): path = path[1:] user_path = os.path.join(username, path) - full_path = os.path.join(Path._DATA_DIR, user_path) + full_path = os.path.join(StagingPath._DATA_DIR, user_path) - metadata_path = os.path.join(Path._META_DIR, user_path) + metadata_path = os.path.join(StagingPath._META_DIR, user_path) name = os.path.basename(path) jgi_metadata = os.path.join(os.path.dirname(full_path), "." + name + ".jgi") - return Path(full_path, metadata_path, user_path, name, jgi_metadata) + return StagingPath(full_path, metadata_path, user_path, name, jgi_metadata) @staticmethod def from_full_path(full_path: str): - user_path = full_path[len(Path._DATA_DIR) :] + user_path = full_path[len(StagingPath._DATA_DIR) :] if user_path.startswith("/"): user_path = user_path[1:] - metadata_path = os.path.join(Path._META_DIR, user_path) + metadata_path = os.path.join(StagingPath._META_DIR, user_path) name = os.path.basename(full_path) jgi_metadata = os.path.join(os.path.dirname(full_path), "." 
+ name + ".jgi") - return Path(full_path, metadata_path, user_path, name, jgi_metadata) + return StagingPath(full_path, metadata_path, user_path, name, jgi_metadata) class AclManager: @@ -167,7 +175,7 @@ def _add_acl(self, user_identity_id: str, shared_directory_basename: str): Attempt to add acl for the given user id and directory """ try: - resp = self.globus_transfer_client.add_endpoint_acl_rule( + self.globus_transfer_client.add_endpoint_acl_rule( self.endpoint_id, dict( DATA_TYPE="access", @@ -240,7 +248,7 @@ def _remove_acl(self, user_identity_id: str): text=json.dumps(response), content_type="application/json" ) - except globus_sdk.GlobusAPIError as error: + except globus_sdk.GlobusAPIErro: response = { "success": False, "error_type": "GlobusAPIError", @@ -258,7 +266,7 @@ def add_acl_concierge(self, shared_directory: str, concierge_path: str): :return: Result of attempt to add acl """ user_identity_id = self._get_globus_identity(shared_directory) - cp_full = f"{Path._DATA_DIR}/{concierge_path}" + cp_full = f"{StagingPath._DATA_DIR}/{concierge_path}" try: os.mkdir(cp_full) print(f"Attempting to create concierge dir {cp_full}") From 81b9b0900059f86f07775ec50fac3d0c9b22df33 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Wed, 14 Jun 2023 11:27:59 -0700 Subject: [PATCH 05/33] move to python 3.11, simplify and improve [PTV-1882] use python 3.11 since this is an opportunity simplify os package installation because it was unnecessarily long; more on line RUN line fewer layers. only COPY necessary directories - everything was copied before which was excessive, potentially insecure --- Dockerfile | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/Dockerfile b/Dockerfile index acbf5f1f..1a7839b0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,37 +1,30 @@ -FROM python:3.9-slim-buster +FROM python:3.11.4-slim-buster # ----------------------------------------- RUN mkdir -p /kb/deployment/lib -RUN apt-get update && apt-get install -y --no-install-recommends apt-utils -RUN apt-get install -y zip && \ - apt-get install -y unzip && \ - apt-get install -y bzip2 && \ - apt-get install -y libmagic-dev - - -RUN apt-get install -y htop wget +RUN apt-get update && apt-get install -y zip unzip bzip2 libmagic-dev htop wget +# Standard simplified python setup. 
 COPY ./requirements.txt /requirements.txt
-RUN pip install -r /requirements.txt
+RUN python -m pip install --upgrade pip && pip install -r /requirements.txt
 
-COPY ./ /kb/module
 COPY ./globus.cfg /etc/globus.cfg
 RUN touch /var/log/globus.log && chmod 777 /var/log/globus.log
 
-RUN cp -r /kb/module/staging_service /kb/deployment/lib
-RUN cp -r /kb/module/deployment /kb
+COPY ./staging_service /kb/deployment/lib/staging_service
+COPY ./deployment /kb/deployment
 
+# TODO: should be port 5000 to match other kbase services
 EXPOSE 3000
 
 WORKDIR /kb/deployment/lib
 
-
 # The BUILD_DATE value seem to bust the docker cache when the timestamp changes, move to
 # the end
 LABEL org.label-schema.build-date=$BUILD_DATE \
-    org.label-schema.vcs-url="https://github.com/kbase/staging_service.git" \
-    org.label-schema.vcs-ref=$VCS_REF \
-    org.label-schema.schema-version="1.1.8" \
-    us.kbase.vcs-branch=$BRANCH \
-    maintainer="Steve Chan sychan@lbl.gov"
+      org.label-schema.vcs-url="https://github.com/kbase/staging_service.git" \
+      org.label-schema.vcs-ref=$VCS_REF \
+      org.label-schema.schema-version="1.1.8" \
+      us.kbase.vcs-branch=$BRANCH \
+      maintainer="Steve Chan sychan@lbl.gov"
 
 ENTRYPOINT ["/kb/deployment/bin/entrypoint.sh"]

From 5894114ce94baadcd800fe4ba2429f6f85937e3c Mon Sep 17 00:00:00 2001
From: Erik Pearson
Date: Wed, 14 Jun 2023 11:33:50 -0700
Subject: [PATCH 06/33] refactor entrypoint to use gunicorn and remove local dev support [PTV-1882]

The production entrypoint should not serve a dual purpose - local dev and
prod runtime. The local dev support needs work anyway, as it relies on
Visual Studio Code, which is out of date.

Gunicorn is used now as it helps ameliorate stuck aiohttp servers and
relaunches crashed aiohttp servers. Its configuration uses env variables,
all with defaults (which may need to be thought about more deeply and/or
tested).
---
 deployment/bin/entrypoint.sh | 45 +++++++++++++++++++++++++++---------
 1 file changed, 34 insertions(+), 11 deletions(-)

diff --git a/deployment/bin/entrypoint.sh b/deployment/bin/entrypoint.sh
index 787b2446..5628a483 100755
--- a/deployment/bin/entrypoint.sh
+++ b/deployment/bin/entrypoint.sh
@@ -3,17 +3,40 @@
 # please also update the launch.json for vscode accordingly
 # the things that must be kept in sync are KB_DEPLOYMENT_CONFIG and PYTHONPATH
 
-#top section for local running
-DIR="$( cd "$( dirname "$0" )" && pwd )"
-if [ -d "$DIR/../../staging_service" ]; then
-    PYTHONPATH="$DIR/../../staging_service"
-    export KB_DEPLOYMENT_CONFIG="$DIR/../conf/local.cfg"
-    export FILE_LIFETIME="90"
-fi
-#bottom section for running inside docker
-if [ -d "kb/deployment/lib/staging_service" ]; then
-    PYTHONPATH="kb/deployment/lib/staging_service"
-    # environment variable for KB_DEPLOYMENT_CONFIG set in docker-compose.yml
+if [ ! -d "/kb/deployment/lib/staging_service" ]; then
+    echo "staging_service not installed in /kb/deployment/lib as expected"
+    exit 1
 fi
-python3 -m staging_service
\ No newline at end of file
+
+PYTHONPATH="/kb/deployment/lib/staging_service"
+
+# disabled in favor of running behind gunicorn
+# python3 -m staging_service
+
+# The port at which to run this service.
+# TODO: this should probably be 5000 in order to operate like other KBase services.
+# Who likes surprises like this?
+SERVICE_PORT="${SERVICE_PORT:-3000}"
+
+# See https://docs.gunicorn.org/en/latest/run.html
+# Recommended to have 2-4 workers per core. Default assumes 1 core.
+GUNICORN_WORKERS="${GUNICORN_WORKERS:-4}"
+
+# Workers silent for more than this many seconds are killed and restarted.
+# This setting affects the /upload endpoint, as it processes files after +# upload, calculating the md5 and potentially, in the future, additional tasks +# such as validating the file. +# Other endpoints may encounter a delay as well, as any endpoint which access +# metadata may trigger a metadata rebuild, including the md5. +# Default to 10 minutes, a generous amount of time to wait for this to complete. +# Alternative designs could solve this in a different way. +GUNICORN_TIMEOUT="${GUNICORN_TIMEOUT:-600}" + +# TODO: should be port 5000 to match other kbase services +gunicorn staging_service.main:web_app \ + --bind 0.0.0.0:"${SERVICE_PORT}" \ + --worker-class aiohttp.GunicornWebWorker \ + --pythonpath "${PYTHONPATH}" \ + --workers "${GUNICORN_WORKERS}" \ + --timeout "${GUNICORN_TIMEOUT}" \ No newline at end of file From 6e17a5cbe5242a015a88e34e14c51dcb9332d4cd Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Wed, 14 Jun 2023 11:35:19 -0700 Subject: [PATCH 07/33] testing fixes and improvements [PTV-1882] Add tests to cover new code (need to do more) Generate coverage report to terminal and html Test runner also now supports running specific tests. Fix async tests when update dependencies Improve auth client mocking (replace hand-crafted monkeypatch with pytest method mock) fix tests and code with obvious typos Replace silly aliasing of Optional to O with Optional Moved test_utils to test_helpers as test utils is a module in pytest! Fixed most warnings reported (mostly deprecations) --- run_tests.sh | 11 +- .../import_specifications/file_parser.py | 63 +- .../import_specifications/file_writers.py | 2 +- .../import_specifications/test_file_parser.py | 333 ++-- .../test_file_writers.py | 161 +- .../test_individual_parsers.py | 634 +++++--- tests/pyproject.toml | 3 +- tests/test_app.py | 1342 ++++++++++------- tests/test_autodetect.py | 2 +- tests/{test_utils.py => test_helpers.py} | 52 +- tests/test_metadata.py | 331 +++- 11 files changed, 1922 insertions(+), 1012 deletions(-) rename tests/{test_utils.py => test_helpers.py} (52%) diff --git a/run_tests.sh b/run_tests.sh index 43de2167..4958f8c5 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -2,4 +2,13 @@ DIR="$( cd "$( dirname "$0" )" && pwd )" export KB_DEPLOYMENT_CONFIG="$DIR/deployment/conf/testing.cfg" export FILE_LIFETIME="90" -python3 -m pytest -s -vv --cov=staging_service \ No newline at end of file +export TESTS="${1:-tests}" +echo +echo "****************************" +echo "**" +echo "** Running tests in ${TESTS}" +echo "**" +echo "****************************" +echo +python3 -m pytest -s -vv --cov=staging_service --cov-report term --cov-report html $TESTS + diff --git a/staging_service/import_specifications/file_parser.py b/staging_service/import_specifications/file_parser.py index 68dcaa6b..6b96a1af 100644 --- a/staging_service/import_specifications/file_parser.py +++ b/staging_service/import_specifications/file_parser.py @@ -6,10 +6,12 @@ from collections.abc import Callable from dataclasses import dataclass from enum import Enum +from pathlib import Path +from typing import Optional as O +from typing import Union + # TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26 from frozendict.core import frozendict -from pathlib import Path -from typing import Union, Optional as O # TODO should get mypy working at some point @@ -20,6 +22,7 @@ class ErrorType(Enum): """ The type of an error encountered when trying to parse import specifications. 
""" + FILE_NOT_FOUND = 1 PARSE_FAIL = 2 INCORRECT_COLUMN_COUNT = 3 @@ -37,6 +40,7 @@ class SpecificationSource: tab - the name of the spreadsheet file tab from which the import specification was obtained, if any. """ + file: Path tab: O[str] = None @@ -45,19 +49,24 @@ def __post_init__(self): raise ValueError("file is required") -_ERR_MESSAGE = 'message' -_ERR_SOURCE_1 = 'source_1' -_ERR_SOURCE_2 = 'source_2' +_ERR_MESSAGE = "message" +_ERR_SOURCE_1 = "source_1" +_ERR_SOURCE_2 = "source_2" _ERRTYPE_TO_REQ_ARGS = { ErrorType.FILE_NOT_FOUND: (_ERR_SOURCE_1,), ErrorType.PARSE_FAIL: (_ERR_MESSAGE, _ERR_SOURCE_1), ErrorType.INCORRECT_COLUMN_COUNT: (_ERR_MESSAGE, _ERR_SOURCE_1), - ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE: (_ERR_MESSAGE, _ERR_SOURCE_1, _ERR_SOURCE_2), + ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE: ( + _ERR_MESSAGE, + _ERR_SOURCE_1, + _ERR_SOURCE_2, + ), ErrorType.NO_FILES_PROVIDED: tuple(), ErrorType.OTHER: (_ERR_MESSAGE,), } + @dataclass(frozen=True) class Error: f""" @@ -94,7 +103,9 @@ def __post_init__(self): for attr in attrs: if not getattr(self, attr): # grammar sucks but this is not expected to be seen by end users so meh - raise ValueError(f"{', '.join(attrs)} is required for a {self.error.name} error") + raise ValueError( + f"{', '.join(attrs)} is required for a {self.error.name} error" + ) @dataclass(frozen=True) @@ -111,6 +122,7 @@ class ParseResult: expected that the class creator do that error checking. Users should use the parse_import_specifications method to create instances of this class. """ + source: SpecificationSource result: tuple[frozendict[str, PRIMITIVE_TYPE], ...] @@ -137,12 +149,13 @@ class ParseResults: expected that the class creator do that error checking. Users should use the parse_import_specifications method to create an instance of this class. """ + results: O[frozendict[str, ParseResult]] = None errors: O[tuple[Error, ...]] = None def __post_init__(self): if not (bool(self.results) ^ bool(self.errors)): # xnor - raise ValueError("Exectly one of results or errors must be supplied") + raise ValueError("Exactly one of results or errors must be supplied") # we assume here that the data is otherwise correctly created. @@ -156,18 +169,21 @@ class FileTypeResolution: parser - a parser for the file unsupported_type - the file type if the type is not a supported type. """ + parser: O[Callable[[Path], ParseResults]] = None unsupported_type: O[str] = None def __post_init__(self): if not (bool(self.parser) ^ bool(self.unsupported_type)): # xnor - raise ValueError("Exectly one of parser or unsupported_type must be supplied") + raise ValueError( + "Exactly one of parser or unsupported_type must be supplied" + ) def parse_import_specifications( paths: tuple[Path, ...], file_type_resolver: Callable[[Path], FileTypeResolution], - log_error: Callable[[Exception], None] + log_error: Callable[[Exception], None], ) -> ParseResults: """ Parse a set of import specification files and return the results. 
@@ -187,6 +203,7 @@ def parse_import_specifications( errors = tuple([Error(ErrorType.OTHER, str(e))]) return ParseResults(errors=errors) + def _parse( paths: tuple[Path, ...], file_type_resolver: Callable[[Path], FileTypeResolution], @@ -196,12 +213,14 @@ def _parse( for p in paths: file_type = file_type_resolver(p) if file_type.unsupported_type: - errors.append(Error( - ErrorType.PARSE_FAIL, - f"{file_type.unsupported_type} " + errors.append( + Error( + ErrorType.PARSE_FAIL, + f"{file_type.unsupported_type} " + "is not a supported file type for import specifications", - SpecificationSource(p) - )) + SpecificationSource(p), + ) + ) continue res = file_type.parser(p) if res.errors: @@ -209,12 +228,14 @@ def _parse( else: for data_type in res.results: if data_type in results: - errors.append(Error( - ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, - f"Data type {data_type} appears in two importer specification sources", - results[data_type].source, - res.results[data_type].source - )) + errors.append( + Error( + ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, + f"Data type {data_type} appears in two importer specification sources", + results[data_type].source, + res.results[data_type].source, + ) + ) else: results[data_type] = res.results[data_type] if errors: diff --git a/staging_service/import_specifications/file_writers.py b/staging_service/import_specifications/file_writers.py index 160b32e2..d4910c80 100644 --- a/staging_service/import_specifications/file_writers.py +++ b/staging_service/import_specifications/file_writers.py @@ -210,7 +210,7 @@ def write_excel(folder: Path, types: dict[str, dict[str, list[Any]]]) -> dict[st sheet.row_dimensions[2].hidden = True res[datatype] = filename # trash the automatically created sheet - wb.remove_sheet(wb[wb.sheetnames[0]]) + wb.remove(wb[wb.sheetnames[0]]) wb.save(folder / filename) return res diff --git a/tests/import_specifications/test_file_parser.py b/tests/import_specifications/test_file_parser.py index 857be960..81fe4cc0 100644 --- a/tests/import_specifications/test_file_parser.py +++ b/tests/import_specifications/test_file_parser.py @@ -1,26 +1,27 @@ # not much point in testing data classes unless there's custom logic in them from collections.abc import Callable -from frozendict import frozendict -from pytest import raises -from tests.test_utils import assert_exception_correct from pathlib import Path -from typing import Optional as O +from typing import Optional from unittest.mock import Mock, call +from frozendict import frozendict +from pytest import raises + from staging_service.import_specifications.file_parser import ( PRIMITIVE_TYPE, + Error, ErrorType, FileTypeResolution, - SpecificationSource, - Error, ParseResult, ParseResults, - parse_import_specifications + SpecificationSource, + parse_import_specifications, ) +from tests.test_helpers import assert_exception_correct -def spcsrc(path: str, tab: O[str]=None): +def spcsrc(path: str, tab: Optional[str] = None): return SpecificationSource(Path(path), tab) @@ -43,7 +44,7 @@ def test_SpecificationSource_init_fail(): specificationSource_init_fail(None, ValueError("file is required")) -def specificationSource_init_fail(file_: O[str], expected: Exception): +def specificationSource_init_fail(file_: Optional[str], expected: Exception): with raises(Exception) as got: SpecificationSource(file_) assert_exception_correct(got.value, expected) @@ -65,16 +66,16 @@ def test_FileTypeResolution_init_w_unsupported_type_success(): def test_FileTypeResolution_init_fail(): - err = "Exectly one of 
parser or unsupported_type must be supplied" + err = "Exactly one of parser or unsupported_type must be supplied" pr = ParseResults(errors=(Error(ErrorType.OTHER, "foo"),)) fileTypeResolution_init_fail(None, None, ValueError(err)) fileTypeResolution_init_fail(lambda path: pr, "mp-2", ValueError(err)) def fileTypeResolution_init_fail( - parser: O[Callable[[Path], ParseResults]], - unexpected_type: O[str], - expected: Exception + parser: Optional[Callable[[Path], ParseResults]], + unexpected_type: Optional[str], + expected: Exception, ): with raises(Exception) as got: FileTypeResolution(parser, unexpected_type) @@ -109,7 +110,9 @@ def test_Error_init_w_PARSE_FAIL_success(): def test_Error_init_w_INCORRECT_COLUMN_COUNT_success(): - e = Error(ErrorType.INCORRECT_COLUMN_COUNT, message="42", source_1=spcsrc("somefile")) + e = Error( + ErrorType.INCORRECT_COLUMN_COUNT, message="42", source_1=spcsrc("somefile") + ) assert e.error == ErrorType.INCORRECT_COLUMN_COUNT assert e.message == "42" @@ -119,7 +122,10 @@ def test_Error_init_w_INCORRECT_COLUMN_COUNT_success(): def test_Error_init_w_MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE_success(): e = Error( - ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, "foo", spcsrc("foo2"), spcsrc("yay") + ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, + "foo", + spcsrc("foo2"), + spcsrc("yay"), ) assert e.error == ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE @@ -158,31 +164,47 @@ def test_Error_init_w_OTHER_success(): def test_Error_init_fail(): # arguments are error type, message string, 1st source, 2nd source, exception error_init_fail(None, None, None, None, ValueError("error is required")) - error_init_fail(ErrorType.FILE_NOT_FOUND, None, None, None, ValueError( - "source_1 is required for a FILE_NOT_FOUND error")) + error_init_fail( + ErrorType.FILE_NOT_FOUND, + None, + None, + None, + ValueError("source_1 is required for a FILE_NOT_FOUND error"), + ) err = "message, source_1 is required for a PARSE_FAIL error" error_init_fail(ErrorType.PARSE_FAIL, None, spcsrc("wooo"), None, ValueError(err)) error_init_fail(ErrorType.PARSE_FAIL, "msg", None, None, ValueError(err)) err = "message, source_1 is required for a INCORRECT_COLUMN_COUNT error" - error_init_fail(ErrorType.INCORRECT_COLUMN_COUNT, None, spcsrc("whee"), None, ValueError(err)) - error_init_fail(ErrorType.INCORRECT_COLUMN_COUNT, "msg", None, None, ValueError(err)) + error_init_fail( + ErrorType.INCORRECT_COLUMN_COUNT, None, spcsrc("whee"), None, ValueError(err) + ) + error_init_fail( + ErrorType.INCORRECT_COLUMN_COUNT, "msg", None, None, ValueError(err) + ) ms = ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE - err = ("message, source_1, source_2 is required for a " - + "MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE error") + err = ( + "message, source_1, source_2 is required for a " + + "MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE error" + ) error_init_fail(ms, None, None, None, ValueError(err)) error_init_fail(ms, None, spcsrc("foo"), spcsrc("bar"), ValueError(err)) error_init_fail(ms, "msg", None, spcsrc("bar"), ValueError(err)) error_init_fail(ms, "msg", spcsrc("foo"), None, ValueError(err)) - error_init_fail(ErrorType.OTHER, None, None, None, ValueError( - "message is required for a OTHER error")) + error_init_fail( + ErrorType.OTHER, + None, + None, + None, + ValueError("message is required for a OTHER error"), + ) def error_init_fail( - errortype: O[ErrorType], - message: O[str], - source_1: O[SpecificationSource], - source_2: O[SpecificationSource], - expected: Exception + errortype: Optional[ErrorType], + 
message: Optional[str], + source_1: Optional[SpecificationSource], + source_2: Optional[SpecificationSource], + expected: Exception, ): with raises(Exception) as got: Error(errortype, message, source_1, source_2) @@ -198,30 +220,37 @@ def test_ParseResult_init_success(): def test_ParseResult_init_fail(): parseResult_init_fail(None, None, ValueError("source is required")) - parseResult_init_fail(None, (frozendict({"foo": "bar"}),), ValueError("source is required")) + parseResult_init_fail( + None, (frozendict({"foo": "bar"}),), ValueError("source is required") + ) parseResult_init_fail(spcsrc("foo"), None, ValueError("result is required")) def parseResult_init_fail( - source: O[SpecificationSource], - result: O[tuple[frozendict[str, PRIMITIVE_TYPE], ...]], - expected: Exception + source: Optional[SpecificationSource], + result: Optional[tuple[frozendict[str, PRIMITIVE_TYPE], ...]], + expected: Exception, ): with raises(Exception) as got: ParseResult(source, result) assert_exception_correct(got.value, expected) -PR_RESULTS = frozendict({"data_type": ParseResult( - spcsrc("some_file", "tab"), - (frozendict({"fasta_file": "foo.fa", "do_thing": 1}),) # make a tuple! -)}) +PR_RESULTS = frozendict( + { + "data_type": ParseResult( + spcsrc("some_file", "tab"), + (frozendict({"fasta_file": "foo.fa", "do_thing": 1}),), # make a tuple! + ) + } +) PR_ERROR = ( Error(ErrorType.OTHER, message="foo"), - Error(ErrorType.PARSE_FAIL, message="bar", source_1=spcsrc("some_file", "tab3")) + Error(ErrorType.PARSE_FAIL, message="bar", source_1=spcsrc("some_file", "tab3")), ) + def test_ParseResults_init_w_results_success(): results_copy = frozendict(PR_RESULTS) # prevent identity equality @@ -233,7 +262,7 @@ def test_ParseResults_init_w_results_success(): def test_ParseResults_init_w_error_success(): - errors_copy = tuple(PR_ERROR) # prevent identity equality + errors_copy = tuple(PR_ERROR) # prevent identity equality pr = ParseResults(errors=PR_ERROR) assert pr.results is None @@ -243,24 +272,27 @@ def test_ParseResults_init_w_error_success(): def test_ParseResults_init_fail(): - err = 'Exectly one of results or errors must be supplied' + err = "Exactly one of results or errors must be supplied" parseResults_init_fail(None, None, ValueError(err)) parseResults_init_fail(PR_RESULTS, PR_ERROR, ValueError(err)) def parseResults_init_fail( - results: O[frozendict[str, ParseResult]], - errors: O[tuple[Error, ...]], - expected: Exception + results: Optional[frozendict[str, ParseResult]], + errors: Optional[tuple[Error, ...]], + expected: Exception, ): with raises(Exception) as got: ParseResults(results, errors) assert_exception_correct(got.value, expected) -def _ftr(parser: Callable[[Path], ParseResults] = None, notype: str=None) -> FileTypeResolution: +def _ftr( + parser: Callable[[Path], ParseResults] = None, notype: str = None +) -> FileTypeResolution: return FileTypeResolution(parser, notype) + def _get_mocks(count: int) -> tuple[Mock, ...]: return (Mock() for _ in range(count)) @@ -270,44 +302,54 @@ def test_parse_import_specifications_success(): resolver.side_effect = [_ftr(parser1), _ftr(parser2)] - parser1.return_value = ParseResults(frozendict( - {"type1": ParseResult( - spcsrc("myfile.xlsx", "tab1"), - (frozendict({"foo": "bar"}), frozendict({"baz": "bat"})) - ), - "type2": ParseResult( - spcsrc("myfile.xlsx", "tab2"), - (frozendict({"whee": "whoo"}),) # tuple! 
- ) - } - )) - - parser2.return_value = ParseResults(frozendict( - {"type_other": ParseResult( - spcsrc("somefile.csv"), - (frozendict({"foo": "bar2"}), frozendict({"baz": "bat2"})) - ) - } - )) + parser1.return_value = ParseResults( + frozendict( + { + "type1": ParseResult( + spcsrc("myfile.xlsx", "tab1"), + (frozendict({"foo": "bar"}), frozendict({"baz": "bat"})), + ), + "type2": ParseResult( + spcsrc("myfile.xlsx", "tab2"), + (frozendict({"whee": "whoo"}),), # tuple! + ), + } + ) + ) + + parser2.return_value = ParseResults( + frozendict( + { + "type_other": ParseResult( + spcsrc("somefile.csv"), + (frozendict({"foo": "bar2"}), frozendict({"baz": "bat2"})), + ) + } + ) + ) res = parse_import_specifications( (Path("myfile.xlsx"), Path("somefile.csv")), resolver, logger ) - assert res == ParseResults(frozendict({ - "type1": ParseResult( - spcsrc("myfile.xlsx", "tab1"), - (frozendict({"foo": "bar"}), frozendict({"baz": "bat"})) - ), - "type2": ParseResult( - spcsrc("myfile.xlsx", "tab2"), - (frozendict({"whee": "whoo"}),) # tuple! - ), - "type_other": ParseResult( - spcsrc("somefile.csv"), - (frozendict({"foo": "bar2"}), frozendict({"baz": "bat2"})) - ) - })) + assert res == ParseResults( + frozendict( + { + "type1": ParseResult( + spcsrc("myfile.xlsx", "tab1"), + (frozendict({"foo": "bar"}), frozendict({"baz": "bat"})), + ), + "type2": ParseResult( + spcsrc("myfile.xlsx", "tab2"), + (frozendict({"whee": "whoo"}),), # tuple! + ), + "type_other": ParseResult( + spcsrc("somefile.csv"), + (frozendict({"foo": "bar2"}), frozendict({"baz": "bat2"})), + ), + } + ) + ) resolver.assert_has_calls([call(Path("myfile.xlsx")), call(Path("somefile.csv"))]) parser1.assert_called_once_with(Path("myfile.xlsx")) @@ -330,7 +372,7 @@ def test_parse_import_specification_resolver_exception(): resolver.side_effect = [_ftr(parser1), ArithmeticError("crapsticks")] # test that other errors aren't included in the result - parser1.return_value = ParseResults(errors=tuple([Error(ErrorType.OTHER, 'foo')])) + parser1.return_value = ParseResults(errors=tuple([Error(ErrorType.OTHER, "foo")])) res = parse_import_specifications( (Path("myfile.xlsx"), Path("somefile.csv")), resolver, logger @@ -360,32 +402,43 @@ def test_parse_import_specification_unsupported_type_and_parser_error(): resolver.side_effect = [_ftr(parser1), _ftr(parser2), _ftr(notype="JPEG")] # check that other errors are also returned, and the results are ignored - parser1.return_value = ParseResults(errors=tuple([ - Error(ErrorType.OTHER, 'foo'), - Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("foo.csv")) - ])) + parser1.return_value = ParseResults( + errors=tuple( + [ + Error(ErrorType.OTHER, "foo"), + Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("foo.csv")), + ] + ) + ) parser2.return_value = ParseResults( - frozendict({"foo": ParseResult(spcsrc("a"), tuple([frozendict({"a": "b"})]))})) + frozendict({"foo": ParseResult(spcsrc("a"), tuple([frozendict({"a": "b"})]))}) + ) res = parse_import_specifications( (Path("myfile.xlsx"), Path("somefile.csv"), Path("x.jpeg")), resolver, logger ) - assert res == ParseResults(errors=tuple([ - Error(ErrorType.OTHER, "foo"), - Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("foo.csv")), - Error( - ErrorType.PARSE_FAIL, - "JPEG is not a supported file type for import specifications", - spcsrc(Path("x.jpeg")) + assert res == ParseResults( + errors=tuple( + [ + Error(ErrorType.OTHER, "foo"), + Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("foo.csv")), + Error( + ErrorType.PARSE_FAIL, + "JPEG is not a supported file type 
for import specifications", + spcsrc(Path("x.jpeg")), + ), + ] ) - ])) + ) - resolver.assert_has_calls([ - call(Path("myfile.xlsx")), - call(Path("somefile.csv")), - call(Path("x.jpeg")), - ]) + resolver.assert_has_calls( + [ + call(Path("myfile.xlsx")), + call(Path("somefile.csv")), + call(Path("x.jpeg")), + ] + ) parser1.assert_called_once_with(Path("myfile.xlsx")) parser2.assert_called_once_with(Path("somefile.csv")) logger.assert_not_called() @@ -406,50 +459,66 @@ def test_parse_import_specification_multiple_specs_and_parser_error(): resolver.side_effect = [_ftr(parser1), _ftr(parser2), _ftr(parser3)] # check that other errors are also returned, and the results are ignored - parser1.return_value = ParseResults(errors=tuple([ - Error(ErrorType.OTHER, "other"), - Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("myfile.xlsx")) - ])) - parser2.return_value = ParseResults(frozendict( - {"foo": ParseResult(spcsrc("a1"), tuple([frozendict({"a": "b"})])), - "bar": ParseResult(spcsrc("b1"), tuple([frozendict({"a": "b"})])), - "baz": ParseResult(spcsrc("c1"), tuple([frozendict({"a": "b"})])) - }, - )) - parser3.return_value = ParseResults(frozendict( - {"foo2": ParseResult(spcsrc("a2"), tuple([frozendict({"a": "b"})])), - "bar": ParseResult(spcsrc("b2"), tuple([frozendict({"a": "b"})])), - "baz": ParseResult(spcsrc("c2"), tuple([frozendict({"a": "b"})])) - }, - )) + parser1.return_value = ParseResults( + errors=tuple( + [ + Error(ErrorType.OTHER, "other"), + Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("myfile.xlsx")), + ] + ) + ) + parser2.return_value = ParseResults( + frozendict( + { + "foo": ParseResult(spcsrc("a1"), tuple([frozendict({"a": "b"})])), + "bar": ParseResult(spcsrc("b1"), tuple([frozendict({"a": "b"})])), + "baz": ParseResult(spcsrc("c1"), tuple([frozendict({"a": "b"})])), + }, + ) + ) + parser3.return_value = ParseResults( + frozendict( + { + "foo2": ParseResult(spcsrc("a2"), tuple([frozendict({"a": "b"})])), + "bar": ParseResult(spcsrc("b2"), tuple([frozendict({"a": "b"})])), + "baz": ParseResult(spcsrc("c2"), tuple([frozendict({"a": "b"})])), + }, + ) + ) res = parse_import_specifications( (Path("myfile.xlsx"), Path("somefile.csv"), Path("x.tsv")), resolver, logger ) - assert res == ParseResults(errors=tuple([ - Error(ErrorType.OTHER, "other"), - Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("myfile.xlsx")), - Error( - ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, - "Data type bar appears in two importer specification sources", - spcsrc("b1"), - spcsrc("b2") - ), - Error( - ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, - "Data type baz appears in two importer specification sources", - spcsrc("c1"), - spcsrc("c2") + assert res == ParseResults( + errors=tuple( + [ + Error(ErrorType.OTHER, "other"), + Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("myfile.xlsx")), + Error( + ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, + "Data type bar appears in two importer specification sources", + spcsrc("b1"), + spcsrc("b2"), + ), + Error( + ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, + "Data type baz appears in two importer specification sources", + spcsrc("c1"), + spcsrc("c2"), + ), + ] ) - ])) + ) - resolver.assert_has_calls([ - call(Path("myfile.xlsx")), - call(Path("somefile.csv")), - call(Path("x.tsv")), - ]) + resolver.assert_has_calls( + [ + call(Path("myfile.xlsx")), + call(Path("somefile.csv")), + call(Path("x.tsv")), + ] + ) parser1.assert_called_once_with(Path("myfile.xlsx")) parser2.assert_called_once_with(Path("somefile.csv")) 
parser3.assert_called_once_with(Path("x.tsv")) - logger.assert_not_called() \ No newline at end of file + logger.assert_not_called() diff --git a/tests/import_specifications/test_file_writers.py b/tests/import_specifications/test_file_writers.py index 7f2cf26f..1bb1367b 100644 --- a/tests/import_specifications/test_file_writers.py +++ b/tests/import_specifications/test_file_writers.py @@ -1,25 +1,22 @@ import os import uuid - -import openpyxl - from collections.abc import Generator from pathlib import Path -from pytest import raises, fixture -from tests.test_app import FileUtil +import openpyxl +from pytest import fixture, raises from staging_service.import_specifications.file_writers import ( + ImportSpecWriteException, write_csv, - write_tsv, write_excel, - ImportSpecWriteException, + write_tsv, ) - -from tests.test_utils import ( +from tests.test_helpers import ( + FileUtil, assert_exception_correct, - check_file_contents, check_excel_contents, + check_file_contents, ) @@ -43,7 +40,7 @@ def temp_dir() -> Generator[Path, None, None]: "data": [ {"id3": "comma,comma", "id1": "yay!\ttab", "id2": 42}, {"id3": 56.78, "id1": "boo!", "id2": None}, - ] + ], }, "type2": { "order_and_display": [ @@ -52,7 +49,7 @@ def temp_dir() -> Generator[Path, None, None]: "data": [ {"id1": "foo"}, {"id1": 0}, - ] + ], }, "type3": { "order_and_display": [ @@ -60,8 +57,8 @@ def temp_dir() -> Generator[Path, None, None]: ("tab\tid", "xsv is only"), ("comma,id", "a template"), ], - "data": [] - } + "data": [], + }, } @@ -80,7 +77,7 @@ def test_write_csv(temp_dir: Path): '"thing,with,comma",other,thing\twith\ttabs\n', 'yay!\ttab,42,"comma,comma"\n', "boo!,,56.78\n", - ] + ], ) check_file_contents( temp_dir / "type2.csv", @@ -90,7 +87,7 @@ def test_write_csv(temp_dir: Path): "oh no I only have one column\n", "foo\n", "0\n", - ] + ], ) check_file_contents( temp_dir / "type3.csv", @@ -98,7 +95,7 @@ def test_write_csv(temp_dir: Path): "Data type: type3; Columns: 3; Version: 1\n", 'some_id,tab\tid,"comma,id"\n', "hey this,xsv is only,a template\n", - ] + ], ) @@ -117,7 +114,7 @@ def test_write_tsv(temp_dir: Path): 'thing,with,comma\tother\t"thing\twith\ttabs"\n', '"yay!\ttab"\t42\tcomma,comma\n', "boo!\t\t56.78\n", - ] + ], ) check_file_contents( temp_dir / "type2.tsv", @@ -127,7 +124,7 @@ def test_write_tsv(temp_dir: Path): "oh no I only have one column\n", "foo\n", "0\n", - ] + ], ) check_file_contents( temp_dir / "type3.tsv", @@ -135,7 +132,7 @@ def test_write_tsv(temp_dir: Path): "Data type: type3; Columns: 3; Version: 1\n", 'some_id\t"tab\tid"\tcomma,id\n', "hey this\txsv is only\ta template\n", - ] + ], ) @@ -179,7 +176,7 @@ def test_write_excel(temp_dir: Path): "type3", [ ["Data type: type3; Columns: 3; Version: 1", None, None], - ["some_id","tab\tid","comma,id"], + ["some_id", "tab\tid", "comma,id"], ["hey this", "xsv is only", "a template"], ], [8.0, 11.0, 10.0], @@ -193,81 +190,137 @@ def test_file_writers_fail(): file_writers_fail(p, None, E("The types value must be a mapping")) file_writers_fail(p, {}, E("At least one data type must be specified")) file_writers_fail( - p, {None: 1}, E("A data type cannot be a non-string or a whitespace only string")) + p, + {None: 1}, + E("A data type cannot be a non-string or a whitespace only string"), + ) file_writers_fail( p, {" \t ": 1}, - E("A data type cannot be a non-string or a whitespace only string")) + E("A data type cannot be a non-string or a whitespace only string"), + ) file_writers_fail(p, {"t": []}, E("The value for data type t must be a mapping")) 
file_writers_fail(p, {"t": 1}, E("The value for data type t must be a mapping")) file_writers_fail(p, {"t": {}}, E("Data type t missing order_and_display key")) file_writers_fail( p, {"t": {"order_and_display": {}, "data": []}}, - E("Data type t order_and_display value is not a list")) + E("Data type t order_and_display value is not a list"), + ) file_writers_fail( p, {"t": {"order_and_display": [], "data": []}}, - E("At least one entry is required for order_and_display for type t")) + E("At least one entry is required for order_and_display for type t"), + ) file_writers_fail( p, {"t": {"order_and_display": [["foo", "bar"]]}}, - E("Data type t missing data key")) + E("Data type t missing data key"), + ) file_writers_fail( p, {"t": {"order_and_display": [["foo", "bar"]], "data": "foo"}}, - E("Data type t data value is not a list")) + E("Data type t data value is not a list"), + ) file_writers_fail( p, - {"t": {"order_and_display": [["foo", "bar"], "baz"] , "data": []}}, - E("Invalid order_and_display entry for datatype t at index 1 - " - + "the entry is not a list")) + {"t": {"order_and_display": [["foo", "bar"], "baz"], "data": []}}, + E( + "Invalid order_and_display entry for datatype t at index 1 - " + + "the entry is not a list" + ), + ) file_writers_fail( p, - {"t": {"order_and_display": [("foo", "bar"), ["whee", "whoo"], ["baz"]] , "data": []}}, - E("Invalid order_and_display entry for datatype t at index 2 - " - + "expected 2 item list")) + { + "t": { + "order_and_display": [("foo", "bar"), ["whee", "whoo"], ["baz"]], + "data": [], + } + }, + E( + "Invalid order_and_display entry for datatype t at index 2 - " + + "expected 2 item list" + ), + ) file_writers_fail( p, - {"t": {"order_and_display": [("foo", "bar", "whee"), ["whee", "whoo"]] , "data": []}}, - E("Invalid order_and_display entry for datatype t at index 0 - " - + "expected 2 item list")) + { + "t": { + "order_and_display": [("foo", "bar", "whee"), ["whee", "whoo"]], + "data": [], + } + }, + E( + "Invalid order_and_display entry for datatype t at index 0 - " + + "expected 2 item list" + ), + ) for parm in [None, " \t ", 1]: file_writers_fail( p, {"t": {"order_and_display": [(parm, "foo"), ["whee", "whoo"]], "data": []}}, - E("Invalid order_and_display entry for datatype t at index 0 - " - + "parameter ID cannot be a non-string or a whitespace only string")) + E( + "Invalid order_and_display entry for datatype t at index 0 - " + + "parameter ID cannot be a non-string or a whitespace only string" + ), + ) file_writers_fail( p, {"t": {"order_and_display": [("bar", "foo"), ["whee", parm]], "data": []}}, - E("Invalid order_and_display entry for datatype t at index 1 - " - + "parameter display name cannot be a non-string or a whitespace only string")) + E( + "Invalid order_and_display entry for datatype t at index 1 - " + + "parameter display name cannot be a non-string or a whitespace only string" + ), + ) file_writers_fail( p, - {"t": {"order_and_display": [("bar", "foo"), ["whee", "whoo"]], "data": ["foo"]}}, - E("Data type t data row 0 is not a mapping")) + { + "t": { + "order_and_display": [("bar", "foo"), ["whee", "whoo"]], + "data": ["foo"], + } + }, + E("Data type t data row 0 is not a mapping"), + ) file_writers_fail( p, {"t": {"order_and_display": [("bar", "foo")], "data": [{"bar": 1}, []]}}, - E("Data type t data row 1 is not a mapping")) + E("Data type t data row 1 is not a mapping"), + ) file_writers_fail( p, - {"t": {"order_and_display": [("foo", "bar"), ["whee", "whoo"]], - "data": [{"foo": 1, "whee": 2}, {"foo": 
2}]}}, - E("Data type t data row 1 does not have the same keys as order_and_display")) + { + "t": { + "order_and_display": [("foo", "bar"), ["whee", "whoo"]], + "data": [{"foo": 1, "whee": 2}, {"foo": 2}], + } + }, + E("Data type t data row 1 does not have the same keys as order_and_display"), + ) file_writers_fail( p, - {"ty": {"order_and_display": [("foo", "bar"), ["whee", "whoo"]], - "data": [{"foo": 2, "whee": 3, 5: 4}, {"foo": 1, "whee": 2}]}}, - E("Data type ty data row 0 does not have the same keys as order_and_display")) + { + "ty": { + "order_and_display": [("foo", "bar"), ["whee", "whoo"]], + "data": [{"foo": 2, "whee": 3, 5: 4}, {"foo": 1, "whee": 2}], + } + }, + E("Data type ty data row 0 does not have the same keys as order_and_display"), + ) file_writers_fail( p, - {"ty": {"order_and_display": [("foo", "bar"), ["whee", "whoo"]], - "data": [{"foo": 2, "whee": 3}, {"foo": 1, "whee": []}]}}, - E("Data type ty data row 1's value for parameter whee " - + "is not a number or a string")) - + { + "ty": { + "order_and_display": [("foo", "bar"), ["whee", "whoo"]], + "data": [{"foo": 2, "whee": 3}, {"foo": 1, "whee": []}], + } + }, + E( + "Data type ty data row 1's value for parameter whee " + + "is not a number or a string" + ), + ) def file_writers_fail(path: Path, types: dict, expected: Exception): diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py index fe73a309..a8b20c1d 100644 --- a/tests/import_specifications/test_individual_parsers.py +++ b/tests/import_specifications/test_individual_parsers.py @@ -1,25 +1,23 @@ import os import uuid - from collections.abc import Callable, Generator +from pathlib import Path + # TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26 from frozendict.core import frozendict -from pathlib import Path from pytest import fixture - from staging_service.import_specifications.individual_parsers import ( - parse_csv, - parse_tsv, - parse_excel, - ErrorType, Error, - SpecificationSource, + ErrorType, ParseResult, - ParseResults + ParseResults, + SpecificationSource, + parse_csv, + parse_excel, + parse_tsv, ) - -from tests.test_app import FileUtil +from tests.test_helpers import FileUtil _TEST_DATA_DIR = (Path(__file__).parent / "test_data").resolve() @@ -33,45 +31,85 @@ def temp_dir() -> Generator[Path, None, None]: # FileUtil will auto delete after exiting + ########################################## # xSV tests ########################################## def test_xsv_parse_success(temp_dir: Path): - # Tests a special case where if an xSV file is opened by Excel and then resaved, + # Tests a special case where if an xSV file is opened by Excel and then resaved, # Excel will add separators to the end of the 1st header line. This previously caused # the parse to fail. 
- _xsv_parse_success(temp_dir, ',', parse_csv) - _xsv_parse_success(temp_dir, '\t', parse_tsv) + _xsv_parse_success(temp_dir, ",", parse_csv) + _xsv_parse_success(temp_dir, "\t", parse_tsv) -def _xsv_parse_success(temp_dir: Path, sep: str, parser: Callable[[Path], ParseResults]): +def _xsv_parse_success( + temp_dir: Path, sep: str, parser: Callable[[Path], ParseResults] +): s = sep input_ = temp_dir / str(uuid.uuid4()) with open(input_, "w") as test_file: - test_file.writelines([ - f"Data type: some_type; Columns: 4; Version: 1{s}{s}{s}\n", - f"spec1{s} spec2{s} spec3 {s} spec4\n", # test trimming - f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", - f"val1 {s} val2 {s} 7 {s} 3.2\n", # test trimming - f"val3 {s} val4{s} 1{s} 8.9\n", - f"val5 {s}{s}{s} 42.42\n", # test missing values w/o whitespace - f"val6 {s} {s} {s} 3.14\n" # test missing values w/ whitespace - ]) + test_file.writelines( + [ + f"Data type: some_type; Columns: 4; Version: 1{s}{s}{s}\n", + f"spec1{s} spec2{s} spec3 {s} spec4\n", # test trimming + f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", + f"val1 {s} val2 {s} 7 {s} 3.2\n", # test trimming + f"val3 {s} val4{s} 1{s} 8.9\n", + f"val5 {s}{s}{s} 42.42\n", # test missing values w/o whitespace + f"val6 {s} {s} {s} 3.14\n", # test missing values w/ whitespace + ] + ) res = parser(input_) - assert res == ParseResults(frozendict( - {"some_type": ParseResult(SpecificationSource(input_), - tuple([ - frozendict({"spec1": "val1", "spec2": "val2", "spec3": 7, "spec4": 3.2}), - frozendict({"spec1": "val3", "spec2": "val4", "spec3": 1, "spec4": 8.9}), - frozendict({"spec1": "val5", "spec2": None, "spec3": None, "spec4": 42.42}), - frozendict({"spec1": "val6", "spec2": None, "spec3": None, "spec4": 3.14}), - ]) - )} - )) + assert res == ParseResults( + frozendict( + { + "some_type": ParseResult( + SpecificationSource(input_), + tuple( + [ + frozendict( + { + "spec1": "val1", + "spec2": "val2", + "spec3": 7, + "spec4": 3.2, + } + ), + frozendict( + { + "spec1": "val3", + "spec2": "val4", + "spec3": 1, + "spec4": 8.9, + } + ), + frozendict( + { + "spec1": "val5", + "spec2": None, + "spec3": None, + "spec4": 42.42, + } + ), + frozendict( + { + "spec1": "val6", + "spec2": None, + "spec3": None, + "spec4": 3.14, + } + ), + ] + ), + ) + } + ) + ) def test_xsv_parse_success_nan_inf(temp_dir: Path): @@ -79,34 +117,66 @@ def test_xsv_parse_success_nan_inf(temp_dir: Path): # they will be returned from the service as barewords in JSON and will cause errors in # some parsers as the JSON spec does not support NaN and Inf (which it should but...) 
# See https://kbase-jira.atlassian.net/browse/PTV-1866 - _xsv_parse_success_nan_inf(temp_dir, ',', parse_csv) - _xsv_parse_success_nan_inf(temp_dir, '\t', parse_tsv) + _xsv_parse_success_nan_inf(temp_dir, ",", parse_csv) + _xsv_parse_success_nan_inf(temp_dir, "\t", parse_tsv) -def _xsv_parse_success_nan_inf(temp_dir: Path, sep: str, parser: Callable[[Path], ParseResults]): +def _xsv_parse_success_nan_inf( + temp_dir: Path, sep: str, parser: Callable[[Path], ParseResults] +): s = sep input_ = temp_dir / str(uuid.uuid4()) with open(input_, "w") as test_file: - test_file.writelines([ - f"Data type: some_nan_type; Columns: 4; Version: 1{s}{s}{s}\n", - f"spec1{s} -Inf{s} nan {s} inf\n", - f"Spec 1{s} inf{s} Spec 3{s} -inf\n", - f"inf {s} val2 {s} NaN {s} 3.2\n", - f"Inf {s} val4{s} -inf{s} 8.9\n", - f"val5 {s}-Inf{s}{s} nan\n", - ]) + test_file.writelines( + [ + f"Data type: some_nan_type; Columns: 4; Version: 1{s}{s}{s}\n", + f"spec1{s} -Inf{s} nan {s} inf\n", + f"Spec 1{s} inf{s} Spec 3{s} -inf\n", + f"inf {s} val2 {s} NaN {s} 3.2\n", + f"Inf {s} val4{s} -inf{s} 8.9\n", + f"val5 {s}-Inf{s}{s} nan\n", + ] + ) res = parser(input_) - assert res == ParseResults(frozendict( - {"some_nan_type": ParseResult(SpecificationSource(input_), - tuple([ - frozendict({"spec1": "inf", "-Inf": "val2", "nan": "NaN", "inf": 3.2}), - frozendict({"spec1": "Inf", "-Inf": "val4", "nan": "-inf", "inf": 8.9}), - frozendict({"spec1": "val5", "-Inf": "-Inf", "nan": None, "inf": "nan"}), - ]) - )} - )) + assert res == ParseResults( + frozendict( + { + "some_nan_type": ParseResult( + SpecificationSource(input_), + tuple( + [ + frozendict( + { + "spec1": "inf", + "-Inf": "val2", + "nan": "NaN", + "inf": 3.2, + } + ), + frozendict( + { + "spec1": "Inf", + "-Inf": "val4", + "nan": "-inf", + "inf": 8.9, + } + ), + frozendict( + { + "spec1": "val5", + "-Inf": "-Inf", + "nan": None, + "inf": "nan", + } + ), + ] + ), + ) + } + ) + ) def test_xsv_parse_success_with_numeric_headers(temp_dir: Path): @@ -114,8 +184,8 @@ def test_xsv_parse_success_with_numeric_headers(temp_dir: Path): Not a use case we expect but good to check numeric headers don't cause an unexpected error. 
""" - _xsv_parse_success_with_numeric_headers(temp_dir, ',', parse_csv) - _xsv_parse_success_with_numeric_headers(temp_dir, '\t', parse_tsv) + _xsv_parse_success_with_numeric_headers(temp_dir, ",", parse_csv) + _xsv_parse_success_with_numeric_headers(temp_dir, "\t", parse_tsv) def _xsv_parse_success_with_numeric_headers( @@ -124,21 +194,33 @@ def _xsv_parse_success_with_numeric_headers( s = sep input_ = temp_dir / str(uuid.uuid4()) with open(input_, "w") as test_file: - test_file.writelines([ - "Data type: some_type; Columns: 4; Version: 1\n", - f"1{s} 2.0{s} 3{s} 4.1\n", # test trimming - f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", - f"val3 {s} val4{s} 1{s} 8.9\n", - ]) + test_file.writelines( + [ + "Data type: some_type; Columns: 4; Version: 1\n", + f"1{s} 2.0{s} 3{s} 4.1\n", # test trimming + f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", + f"val3 {s} val4{s} 1{s} 8.9\n", + ] + ) res = parser(input_) - assert res == ParseResults(frozendict( - {"some_type": ParseResult(SpecificationSource(input_), - tuple([frozendict({"1": "val3", "2.0": "val4", "3": 1, "4.1": 8.9}), - ]) - )} - )) + assert res == ParseResults( + frozendict( + { + "some_type": ParseResult( + SpecificationSource(input_), + tuple( + [ + frozendict( + {"1": "val3", "2.0": "val4", "3": 1, "4.1": 8.9} + ), + ] + ), + ) + } + ) + ) def test_xsv_parse_success_with_internal_and_trailing_empty_lines(temp_dir: Path): @@ -146,8 +228,8 @@ def test_xsv_parse_success_with_internal_and_trailing_empty_lines(temp_dir: Path Test that leaving one or more empty lines in a csv/tsv file does not cause the parse to fail. This is easy to do accidentally and so will annoy users. """ - _xsv_parse_success_with_internal_and_trailing_empty_lines(temp_dir, ',', parse_csv) - _xsv_parse_success_with_internal_and_trailing_empty_lines(temp_dir, '\t', parse_tsv) + _xsv_parse_success_with_internal_and_trailing_empty_lines(temp_dir, ",", parse_csv) + _xsv_parse_success_with_internal_and_trailing_empty_lines(temp_dir, "\t", parse_tsv) def _xsv_parse_success_with_internal_and_trailing_empty_lines( @@ -156,46 +238,73 @@ def _xsv_parse_success_with_internal_and_trailing_empty_lines( s = sep input_ = temp_dir / str(uuid.uuid4()) with open(input_, "w") as test_file: - test_file.writelines([ - "Data type: other_type; Columns: 4; Version: 1\n", - f"spec1{s} spec2{s} spec3{s} spec4\n", - f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", - f"val3 {s} val4{s} 1{s} 8.9\n", - "\n", - f"val1 {s} val2{s} 7 {s} 3.2\n", - "\n", - "\n", - "\n", - ]) + test_file.writelines( + [ + "Data type: other_type; Columns: 4; Version: 1\n", + f"spec1{s} spec2{s} spec3{s} spec4\n", + f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", + f"val3 {s} val4{s} 1{s} 8.9\n", + "\n", + f"val1 {s} val2{s} 7 {s} 3.2\n", + "\n", + "\n", + "\n", + ] + ) res = parser(input_) - assert res == ParseResults(frozendict( - {"other_type": ParseResult(SpecificationSource(input_), - tuple([ - frozendict({"spec1": "val3", "spec2": "val4", "spec3": 1, "spec4": 8.9}), - frozendict({"spec1": "val1", "spec2": "val2", "spec3": 7, "spec4": 3.2}), - ]) - )} - )) + assert res == ParseResults( + frozendict( + { + "other_type": ParseResult( + SpecificationSource(input_), + tuple( + [ + frozendict( + { + "spec1": "val3", + "spec2": "val4", + "spec3": 1, + "spec4": 8.9, + } + ), + frozendict( + { + "spec1": "val1", + "spec2": "val2", + "spec3": 7, + "spec4": 3.2, + } + ), + ] + ), + ) + } + ) + ) def test_xsv_parse_fail_no_file(temp_dir: Path): input_ = temp_dir / str(uuid.uuid4()) with open(input_, "w") as test_file: - test_file.writelines([ 
- "Data type: other_type; Columns: 4; Version: 1\n", - "spec1, spec2, spec3, spec4\n", - "Spec 1, Spec 2, Spec 3, Spec 4\n", - "val1 , val2, 7 , 3.2\n", - ]) + test_file.writelines( + [ + "Data type: other_type; Columns: 4; Version: 1\n", + "spec1, spec2, spec3, spec4\n", + "Spec 1, Spec 2, Spec 3, Spec 4\n", + "val1 , val2, 7 , 3.2\n", + ] + ) input_ = Path(str(input_)[-1]) res = parse_csv(input_) - assert res == ParseResults(errors=tuple([ - Error(ErrorType.FILE_NOT_FOUND, source_1=SpecificationSource(input_)) - ])) + assert res == ParseResults( + errors=tuple( + [Error(ErrorType.FILE_NOT_FOUND, source_1=SpecificationSource(input_))] + ) + ) def test_xsv_parse_fail_binary_file(temp_dir: Path): @@ -203,12 +312,17 @@ def test_xsv_parse_fail_binary_file(temp_dir: Path): res = parse_csv(test_file) - assert res == ParseResults(errors=tuple([ - Error( - ErrorType.PARSE_FAIL, - "Not a text file: application/vnd.ms-excel", - source_1=SpecificationSource(test_file)) - ])) + assert res == ParseResults( + errors=tuple( + [ + Error( + ErrorType.PARSE_FAIL, + "Not a text file: application/vnd.ms-excel", + source_1=SpecificationSource(test_file), + ) + ] + ) + ) def test_xsv_parse_fail_directory(temp_dir: Path): @@ -217,9 +331,17 @@ def test_xsv_parse_fail_directory(temp_dir: Path): res = parse_tsv(test_file) - assert res == ParseResults(errors=tuple([Error( - ErrorType.PARSE_FAIL, "The given path is a directory", SpecificationSource(test_file) - )])) + assert res == ParseResults( + errors=tuple( + [ + Error( + ErrorType.PARSE_FAIL, + "The given path is a directory", + SpecificationSource(test_file), + ) + ] + ) + ) def _xsv_parse_fail( @@ -235,7 +357,9 @@ def _xsv_parse_fail( test_file.writelines(lines) res = parser(input_) - expected = ParseResults(errors=tuple([Error(err_type, message, SpecificationSource(input_))])) + expected = ParseResults( + errors=tuple([Error(err_type, message, SpecificationSource(input_))]) + ) assert res == expected @@ -244,19 +368,25 @@ def test_xsv_parse_fail_empty_file(temp_dir: Path): def test_xsv_parse_fail_bad_datatype_header(temp_dir: Path): - err = ('Invalid header; got "This is the wrong header", expected "Data type: ' - + '; Columns: ; Version: "') + err = ( + 'Invalid header; got "This is the wrong header", expected "Data type: ' + + '; Columns: ; Version: "' + ) _xsv_parse_fail(temp_dir, ["This is the wrong header"], parse_csv, err) def test_xsv_parse_fail_bad_version(temp_dir: Path): err = "Schema version 87 is larger than maximum processable version 1" - _xsv_parse_fail(temp_dir, ["Data type: foo; Columns: 22; Version: 87"], parse_csv, err) + _xsv_parse_fail( + temp_dir, ["Data type: foo; Columns: 22; Version: 87"], parse_csv, err + ) def test_xsv_parse_fail_missing_column_headers(temp_dir: Path): err = "Missing 2nd header line" - _xsv_parse_fail(temp_dir, ["Data type: foo; Columns: 3; Version: 1\n"], parse_csv, err) + _xsv_parse_fail( + temp_dir, ["Data type: foo; Columns: 3; Version: 1\n"], parse_csv, err + ) err = "Missing 3rd header line" lines = ["Data type: foo; Columns: 3; Version: 1\n", "head1, head2, head3\n"] @@ -275,8 +405,7 @@ def test_xsv_parse_fail_missing_column_header_entries(temp_dir: Path): def test_xsv_parse_fail_missing_data(temp_dir: Path): err = "No non-header data in file" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" - "head1, head2, head3\n", + "Data type: foo; Columns: 3; Version: 1\n" "head1, head2, head3\n", "Head 1, Head 2, Head 3\n", ] _xsv_parse_fail(temp_dir, lines, parse_csv, err) @@ -285,40 +414,35 @@ def 
test_xsv_parse_fail_missing_data(temp_dir: Path): def test_xsv_parse_fail_unequal_rows(temp_dir: Path): err = "Incorrect number of items in line 3, expected 3, got 2" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" - "head1, head2, head3\n", + "Data type: foo; Columns: 3; Version: 1\n" "head1, head2, head3\n", "Head 1, Head 2\n", ] _xsv_parse_fail(temp_dir, lines, parse_csv, err, ErrorType.INCORRECT_COLUMN_COUNT) err = "Incorrect number of items in line 3, expected 2, got 3" lines = [ - "Data type: foo; Columns: 2; Version: 1\n" - "head1\thead2\n", + "Data type: foo; Columns: 2; Version: 1\n" "head1\thead2\n", "Head 1\tHead 2\tHead 3\n", ] _xsv_parse_fail(temp_dir, lines, parse_tsv, err, ErrorType.INCORRECT_COLUMN_COUNT) err = "Incorrect number of items in line 2, expected 2, got 3" lines = [ - "Data type: foo; Columns: 2; Version: 1\n" - "head1, head2, head3\n", + "Data type: foo; Columns: 2; Version: 1\n" "head1, head2, head3\n", "Head 1, Head 2\n", ] _xsv_parse_fail(temp_dir, lines, parse_csv, err, ErrorType.INCORRECT_COLUMN_COUNT) err = "Incorrect number of items in line 2, expected 3, got 2" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" - "head1\thead2\n", + "Data type: foo; Columns: 3; Version: 1\n" "head1\thead2\n", "Head 1\tHead 2\tHead 3\n", ] _xsv_parse_fail(temp_dir, lines, parse_tsv, err, ErrorType.INCORRECT_COLUMN_COUNT) err = "Incorrect number of items in line 5, expected 3, got 4" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" - "head1, head2, head 3\n", + "Data type: foo; Columns: 3; Version: 1\n" "head1, head2, head 3\n", "Head 1, Head 2, Head 3\n", "1, 2, 3\n", "1, 2, 3, 4\n", @@ -328,8 +452,7 @@ def test_xsv_parse_fail_unequal_rows(temp_dir: Path): err = "Incorrect number of items in line 6, expected 3, got 2" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" - "head1\thead2\thead 3\n", + "Data type: foo; Columns: 3; Version: 1\n" "head1\thead2\thead 3\n", "Head 1\tHead 2\tHead 3\n", "1\t2\t3\n", "1\t2\t3\n", @@ -342,16 +465,14 @@ def test_xsv_parse_fail_unequal_rows(temp_dir: Path): def test_xsv_parse_fail_duplicate_headers(temp_dir: Path): err = "Duplicate header name in row 2: head3" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" - "head3, head2, head3\n", + "Data type: foo; Columns: 3; Version: 1\n" "head3, head2, head3\n", "Head 1, Head 2, Head 3\n", ] _xsv_parse_fail(temp_dir, lines, parse_csv, err) # test with duplicate dual headers lines = [ - "Data type: foo; Columns: 3; Version: 1\n" - "head3, head2, head3\n", + "Data type: foo; Columns: 3; Version: 1\n" "head3, head2, head3\n", "Head 3, Head 2, Head 3\n", ] _xsv_parse_fail(temp_dir, lines, parse_csv, err) @@ -389,20 +510,37 @@ def test_excel_parse_success(): res = parse_excel(ex) - assert res == ParseResults(frozendict({ - "type1": ParseResult(SpecificationSource(ex, "tab1"), ( - frozendict({"header1": "foo", "header2": 1, "header3": 6.7}), - frozendict({"header1": "bar", "header2": 2, "header3": 8.9}), - frozendict({"header1": "baz", "header2": None, "header3": 3.4}), - frozendict({"header1": "bat", "header2": 4, "header3": None}), - )), - "type2": ParseResult(SpecificationSource(ex, "tab2"), ( - frozendict({"h1": "golly gee", "2": 42, "h3": "super"}), - )), - "type3": ParseResult(SpecificationSource(ex, "tab3"), ( - frozendict({"head1": "some data", "head2": 1}), - )), - })) + assert res == ParseResults( + frozendict( + { + "type1": ParseResult( + SpecificationSource(ex, "tab1"), + ( + frozendict( + {"header1": "foo", "header2": 1, "header3": 6.7} + ), + frozendict( + 
{"header1": "bar", "header2": 2, "header3": 8.9} + ), + frozendict( + {"header1": "baz", "header2": None, "header3": 3.4} + ), + frozendict( + {"header1": "bat", "header2": 4, "header3": None} + ), + ), + ), + "type2": ParseResult( + SpecificationSource(ex, "tab2"), + (frozendict({"h1": "golly gee", "2": 42, "h3": "super"}),), + ), + "type3": ParseResult( + SpecificationSource(ex, "tab3"), + (frozendict({"head1": "some data", "head2": 1}),), + ), + } + ) + ) def test_excel_parse_success_nan_inf(): @@ -417,29 +555,36 @@ def test_excel_parse_success_nan_inf(): res = parse_excel(ex) - assert res == ParseResults(frozendict({ - "nan_type": ParseResult(SpecificationSource(ex, "tab1"), ( - frozendict({"header1": 1, "header2": None}), - frozendict({"header1": 2, "header2": None}), - frozendict({"header1": 3, "header2": None}), - frozendict({"header1": 4, "header2": "-1.#IND"}), - frozendict({"header1": 5, "header2": "-1.#QNAN"}), - frozendict({"header1": 6, "header2": "-NaN"}), - frozendict({"header1": 7, "header2": "-nan"}), - frozendict({"header1": 8, "header2": "1.#IND"}), - frozendict({"header1": 9, "header2": "1.#QNAN"}), - frozendict({"header1": 10, "header2": None}), - frozendict({"header1": 11, "header2": None}), - frozendict({"header1": 12, "header2": None}), - frozendict({"header1": 13, "header2": None}), - frozendict({"header1": 14, "header2": "NaN"}), - frozendict({"header1": 15, "header2": None}), - frozendict({"header1": 16, "header2": "nan"}), - frozendict({"header1": 17, "header2": None}), - frozendict({"header1": 18, "header2": None}), - frozendict({"header1": 19, "header2": "some stuff"}), - )), - })) + assert res == ParseResults( + frozendict( + { + "nan_type": ParseResult( + SpecificationSource(ex, "tab1"), + ( + frozendict({"header1": 1, "header2": None}), + frozendict({"header1": 2, "header2": None}), + frozendict({"header1": 3, "header2": None}), + frozendict({"header1": 4, "header2": "-1.#IND"}), + frozendict({"header1": 5, "header2": "-1.#QNAN"}), + frozendict({"header1": 6, "header2": "-NaN"}), + frozendict({"header1": 7, "header2": "-nan"}), + frozendict({"header1": 8, "header2": "1.#IND"}), + frozendict({"header1": 9, "header2": "1.#QNAN"}), + frozendict({"header1": 10, "header2": None}), + frozendict({"header1": 11, "header2": None}), + frozendict({"header1": 12, "header2": None}), + frozendict({"header1": 13, "header2": None}), + frozendict({"header1": 14, "header2": "NaN"}), + frozendict({"header1": 15, "header2": None}), + frozendict({"header1": 16, "header2": "nan"}), + frozendict({"header1": 17, "header2": None}), + frozendict({"header1": 18, "header2": None}), + frozendict({"header1": 19, "header2": "some stuff"}), + ), + ), + } + ) + ) def _excel_parse_fail( @@ -458,16 +603,24 @@ def _excel_parse_fail( if errors: assert res == ParseResults(errors=tuple(errors)) else: - assert res == ParseResults(errors=tuple([ - Error(ErrorType.PARSE_FAIL, message, source_1=SpecificationSource(test_file)) - ])) + assert res == ParseResults( + errors=tuple( + [ + Error( + ErrorType.PARSE_FAIL, + message, + source_1=SpecificationSource(test_file), + ) + ] + ) + ) def test_excel_parse_fail_no_file(): f = _get_test_file("testtabs3full2nodata1empty0.xls") - _excel_parse_fail(f, errors=[ - Error(ErrorType.FILE_NOT_FOUND, source_1=SpecificationSource(f)) - ]) + _excel_parse_fail( + f, errors=[Error(ErrorType.FILE_NOT_FOUND, source_1=SpecificationSource(f))] + ) def test_excel_parse_fail_directory(temp_dir): @@ -475,37 +628,52 @@ def test_excel_parse_fail_directory(temp_dir): f = 
temp_dir / d os.makedirs(f, exist_ok=True) err = "The given path is a directory" - _excel_parse_fail(f, errors=[Error(ErrorType.PARSE_FAIL, err, SpecificationSource(f))]) + _excel_parse_fail( + f, errors=[Error(ErrorType.PARSE_FAIL, err, SpecificationSource(f))] + ) def test_excel_parse_fail_empty_file(temp_dir: Path): - _xsv_parse_fail(temp_dir, [], parse_excel, "Not a supported Excel file type", extension=".xls") + _xsv_parse_fail( + temp_dir, [], parse_excel, "Not a supported Excel file type", extension=".xls" + ) def test_excel_parse_fail_non_excel_file(temp_dir: Path): lines = [ - "Data type: foo; Version: 1\n" - "head1, head2, head 3\n", + "Data type: foo; Version: 1\n" "head1, head2, head 3\n", "Head 1, Head 2, Head 3\n", "1, 2, 3\n", ] _xsv_parse_fail( - temp_dir, lines, parse_excel, "Not a supported Excel file type", extension=".xlsx") + temp_dir, + lines, + parse_excel, + "Not a supported Excel file type", + extension=".xlsx", + ) def test_excel_parse_1emptytab(): - _excel_parse_fail(_get_test_file("testtabs1empty.xls"), "No non-header data in file") + _excel_parse_fail( + _get_test_file("testtabs1empty.xls"), "No non-header data in file" + ) def test_excel_parse_fail_bad_datatype_header(): f = _get_test_file("testbadinitialheader.xls") - err1 = ('Invalid header; got "This header is wack, yo", expected "Data type: ' - + '; Columns: ; Version: "') + err1 = ( + 'Invalid header; got "This header is wack, yo", expected "Data type: ' + + '; Columns: ; Version: "' + ) err2 = "Schema version 2 is larger than maximum processable version 1" - _excel_parse_fail(f, errors=[ - Error(ErrorType.PARSE_FAIL, err1, SpecificationSource(f, "badheader1")), - Error(ErrorType.PARSE_FAIL, err2, SpecificationSource(f, "badheader2")), - ]) + _excel_parse_fail( + f, + errors=[ + Error(ErrorType.PARSE_FAIL, err1, SpecificationSource(f, "badheader1")), + Error(ErrorType.PARSE_FAIL, err2, SpecificationSource(f, "badheader2")), + ], + ) def test_excel_parse_fail_headers_only(): @@ -517,30 +685,63 @@ def test_excel_parse_fail_colliding_datatypes(): f = _get_test_file("testdatatypecollisions.xls") l = lambda t: f"Found datatype {t} in multiple tabs" err = ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE - _excel_parse_fail(f, errors=[ - Error(err, l("type2"), SpecificationSource(f, "dt2"), SpecificationSource(f, "dt2_2")), - Error(err, l("type3"), SpecificationSource(f, "dt3"), SpecificationSource(f, "dt3_2")), - Error(err, l("type2"), SpecificationSource(f, "dt2"), SpecificationSource(f, "dt2_3")), - ]) + _excel_parse_fail( + f, + errors=[ + Error( + err, + l("type2"), + SpecificationSource(f, "dt2"), + SpecificationSource(f, "dt2_2"), + ), + Error( + err, + l("type3"), + SpecificationSource(f, "dt3"), + SpecificationSource(f, "dt3_2"), + ), + Error( + err, + l("type2"), + SpecificationSource(f, "dt2"), + SpecificationSource(f, "dt2_3"), + ), + ], + ) def test_excel_parse_fail_duplicate_headers(): f = _get_test_file("testduplicateheaders.xlsx") l = lambda h: f"Duplicate header name in row 2: {h}" - _excel_parse_fail(f, errors=[ - Error(ErrorType.PARSE_FAIL, l("head1"), SpecificationSource(f, "dt2")), - Error(ErrorType.PARSE_FAIL, l("head2"), SpecificationSource(f, "dt3")), - ]) + _excel_parse_fail( + f, + errors=[ + Error(ErrorType.PARSE_FAIL, l("head1"), SpecificationSource(f, "dt2")), + Error(ErrorType.PARSE_FAIL, l("head2"), SpecificationSource(f, "dt3")), + ], + ) def test_excel_parse_fail_missing_header_item(): f = _get_test_file("testmissingheaderitem.xlsx") err1 = "Missing header entry in row 2, position 
3" err2 = "Missing header entry in row 2, position 2" - _excel_parse_fail(f, errors=[ - Error(ErrorType.PARSE_FAIL, err1, SpecificationSource(f, "missing header item error")), - Error(ErrorType.PARSE_FAIL, err2, SpecificationSource(f, "whitespace header item")), - ]) + _excel_parse_fail( + f, + errors=[ + Error( + ErrorType.PARSE_FAIL, + err1, + SpecificationSource(f, "missing header item error"), + ), + Error( + ErrorType.PARSE_FAIL, + err2, + SpecificationSource(f, "whitespace header item"), + ), + ], + ) + def test_excel_parse_fail_unequal_rows(): """ @@ -550,20 +751,23 @@ def test_excel_parse_fail_unequal_rows(): worry about off by one errors when filling in separator characters. """ f = _get_test_file("testunequalrows.xlsx") - _excel_parse_fail(f, errors=[ - Error( - ErrorType.INCORRECT_COLUMN_COUNT, - "Incorrect number of items in line 3, expected 2, got 3", - SpecificationSource(f, "2 cols, 3 human readable") - ), - Error( - ErrorType.INCORRECT_COLUMN_COUNT, - "Incorrect number of items in line 2, expected 2, got 3", - SpecificationSource(f, "2 cols, 3 spec IDs") - ), - Error( - ErrorType.INCORRECT_COLUMN_COUNT, - "Incorrect number of items in line 5, expected 3, got 4", - SpecificationSource(f, "3 cols, 4 data") - ), - ]) + _excel_parse_fail( + f, + errors=[ + Error( + ErrorType.INCORRECT_COLUMN_COUNT, + "Incorrect number of items in line 3, expected 2, got 3", + SpecificationSource(f, "2 cols, 3 human readable"), + ), + Error( + ErrorType.INCORRECT_COLUMN_COUNT, + "Incorrect number of items in line 2, expected 2, got 3", + SpecificationSource(f, "2 cols, 3 spec IDs"), + ), + Error( + ErrorType.INCORRECT_COLUMN_COUNT, + "Incorrect number of items in line 5, expected 3, got 4", + SpecificationSource(f, "3 cols, 4 data"), + ), + ], + ) diff --git a/tests/pyproject.toml b/tests/pyproject.toml index 8bc18712..a958f2f2 100644 --- a/tests/pyproject.toml +++ b/tests/pyproject.toml @@ -3,4 +3,5 @@ minversion = "6.0" addopts = "-ra -q" testpaths = [ "tests", -] \ No newline at end of file +] +asyncio_mode = "auto" \ No newline at end of file diff --git a/tests/test_app.py b/tests/test_app.py index eabec07e..81799eb1 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -1,20 +1,21 @@ import asyncio -import configparser import hashlib -import openpyxl import os -import pandas +import platform import shutil import string import time +from io import BytesIO from json import JSONDecoder from pathlib import Path -from urllib.parse import urlencode,unquote -from io import BytesIO from typing import Any +from unittest.mock import patch +from urllib.parse import unquote, urlencode - -from aiohttp import test_utils, FormData +import openpyxl +import pandas +import pytest +from aiohttp import FormData, test_utils from hypothesis import given, settings from hypothesis import strategies as st @@ -22,28 +23,24 @@ import staging_service.globus as globus import staging_service.utils as utils from staging_service.AutoDetectUtils import AutoDetectUtils - -from tests.test_utils import check_file_contents, check_excel_contents +from tests.test_helpers import ( + DATA_DIR, + META_DIR, + FileUtil, + check_excel_contents, + check_file_contents, +) if os.environ.get("KB_DEPLOYMENT_CONFIG") is None: - from tests.test_utils import bootstrap + from tests.test_helpers import bootstrap bootstrap() decoder = JSONDecoder() -config = configparser.ConfigParser() -config.read(os.environ["KB_DEPLOYMENT_CONFIG"]) -DATA_DIR = config["staging_service"]["DATA_DIR"] -META_DIR = config["staging_service"]["META_DIR"] -AUTH_URL 
= config["staging_service"]["AUTH_URL"] -if DATA_DIR.startswith("."): - DATA_DIR = os.path.normpath(os.path.join(os.getcwd(), DATA_DIR)) -if META_DIR.startswith("."): - META_DIR = os.path.normpath(os.path.join(os.getcwd(), META_DIR)) -utils.Path._DATA_DIR = DATA_DIR -utils.Path._META_DIR = META_DIR +utils.StagingPath._DATA_DIR = DATA_DIR +utils.StagingPath._META_DIR = META_DIR def asyncgiven(**kwargs): @@ -62,13 +59,15 @@ def aio_wrapper(*args, **kwargs): return real_decorator +# TODO: replace with real unittest mocking; this approach forces a bad design for the auth client; it is +# literally used in only one place in the codebase! def mock_auth_app(): - application = app.app_factory(config) - - async def mock_auth(*args, **kwargs): - return "testuser" + application = app.app_factory() - app.auth_client.get_user = mock_auth + # async def mock_auth(*args, **kwargs): + # return "testuser" + # + # auth_client().get_user = mock_auth async def mock_globus_id(*args, **kwargs): return ["testuser@globusid.org"] @@ -79,8 +78,13 @@ async def mock_globus_id(*args, **kwargs): return application +async def mock_get_user(username): + return username + + class AppClient: - def __init__(self, config, mock_username=None): + # TODO: the constructor params are unused ... probably can be removed ... try later. + def __init__(self): self.server = test_utils.TestServer(mock_auth_app()) async def __aenter__(self): @@ -93,34 +97,6 @@ async def __aexit__(self, *args): await self.client.close() -class FileUtil: - def __init__(self, base_dir=DATA_DIR): - self.base_dir = base_dir - - def __enter__(self): - os.makedirs(self.base_dir, exist_ok=True) - shutil.rmtree(self.base_dir) - os.makedirs(self.base_dir, exist_ok=False) - return self - - def __exit__(self, *args): - shutil.rmtree(self.base_dir) - - def make_file(self, path, contents): - path = os.path.join(self.base_dir, path) - with open(path, encoding="utf-8", mode="w") as f: - f.write(contents) - return path - - def make_dir(self, path): - path = os.path.join(self.base_dir, path) - os.makedirs(path, exist_ok=True) - return path - - def remove_dir(self, path): - shutil.rmtree(path) - - first_letter_alphabet = [c for c in string.ascii_lowercase + string.ascii_uppercase] username_alphabet = [ c for c in "_" + string.ascii_lowercase + string.ascii_uppercase + string.digits @@ -138,42 +114,70 @@ def remove_dir(self, path): def test_path_cases(username_first, username_rest): username = username_first + username_rest assert ( - username + "/foo/bar" == utils.Path.validate_path(username, "foo/bar").user_path + username + "/foo/bar" + == utils.StagingPath.validate_path(username, "foo/bar").user_path + ) + assert ( + username + "/baz" + == utils.StagingPath.validate_path(username, "foo/../bar/../baz").user_path + ) + assert ( + username + "/bar" + == utils.StagingPath.validate_path(username, "foo/../../../../bar").user_path + ) + assert ( + username + "/foo" + == utils.StagingPath.validate_path(username, "./foo").user_path + ) + assert ( + username + "/foo/bar" + == utils.StagingPath.validate_path(username, "../foo/bar").user_path + ) + assert ( + username + "/foo" + == utils.StagingPath.validate_path(username, "/../foo").user_path ) assert ( - username + "/baz" - == utils.Path.validate_path(username, "foo/../bar/../baz").user_path + username + "/" == utils.StagingPath.validate_path(username, "/foo/..").user_path ) assert ( - username + "/bar" - == utils.Path.validate_path(username, "foo/../../../../bar").user_path + username + "/foo" + == 
utils.StagingPath.validate_path(username, "/foo/.").user_path ) - assert username + "/foo" == utils.Path.validate_path(username, "./foo").user_path assert ( - username + "/foo/bar" - == utils.Path.validate_path(username, "../foo/bar").user_path + username + "/foo" == utils.StagingPath.validate_path(username, "foo/").user_path ) - assert username + "/foo" == utils.Path.validate_path(username, "/../foo").user_path - assert username + "/" == utils.Path.validate_path(username, "/foo/..").user_path - assert username + "/foo" == utils.Path.validate_path(username, "/foo/.").user_path - assert username + "/foo" == utils.Path.validate_path(username, "foo/").user_path - assert username + "/foo" == utils.Path.validate_path(username, "foo").user_path - assert username + "/foo" == utils.Path.validate_path(username, "/foo/").user_path - assert username + "/foo" == utils.Path.validate_path(username, "/foo").user_path - assert username + "/foo" == utils.Path.validate_path(username, "foo/.").user_path - assert username + "/" == utils.Path.validate_path(username, "").user_path - assert username + "/" == utils.Path.validate_path(username, "foo/..").user_path - assert username + "/" == utils.Path.validate_path(username, "/..../").user_path assert ( - username + "/stuff.ext" - == utils.Path.validate_path(username, "/stuff.ext").user_path + username + "/foo" == utils.StagingPath.validate_path(username, "foo").user_path + ) + assert ( + username + "/foo" + == utils.StagingPath.validate_path(username, "/foo/").user_path + ) + assert ( + username + "/foo" == utils.StagingPath.validate_path(username, "/foo").user_path + ) + assert ( + username + "/foo" + == utils.StagingPath.validate_path(username, "foo/.").user_path + ) + assert username + "/" == utils.StagingPath.validate_path(username, "").user_path + assert ( + username + "/" == utils.StagingPath.validate_path(username, "foo/..").user_path + ) + assert ( + username + "/" == utils.StagingPath.validate_path(username, "/..../").user_path + ) + assert ( + username + "/stuff.ext" + == utils.StagingPath.validate_path(username, "/stuff.ext").user_path ) @given(username_first_strat, username_strat, st.text()) def test_path_sanitation(username_first, username_rest, path): username = username_first + username_rest - validated = utils.Path.validate_path(username, path) + validated = utils.StagingPath.validate_path(username, path) assert validated.full_path.startswith(DATA_DIR) assert validated.user_path.startswith(username) assert validated.metadata_path.startswith(META_DIR) @@ -192,40 +196,56 @@ async def test_cmd(txt): assert "" == await utils.run_command("ls", d) f = fs.make_file("test/test2", txt) md5 = hashlib.md5(txt.encode("utf8")).hexdigest() - md52 = await utils.run_command("md5sum", f) - assert md5 == md52.split()[0] + os = platform.system() + if os == "Linux": + md52 = await utils.run_command("md5sum", f) + expected_md5 = md52.split()[0] + elif os == "Darwin": + md52 = await utils.run_command("md5", f) + expected_md5 = md52.split()[3] + + assert md5 == expected_md5 # For mac osx, copy md5 to md5sum and check 3rd element # assert md5 == md52.split()[3] -async def test_auth(): - async with AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_auth(get_user): + get_user.return_value = "testuser" + async with AppClient() as cli: resp = await cli.get("/test-auth") assert resp.status == 200 text = await resp.text() assert "I'm authenticated as" in text -async def test_service(): - async with 
AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_service(get_user): + get_user.return_value = "testuser" + async with AppClient() as cli: resp = await cli.get("/test-service") assert resp.status == 200 text = await resp.text() assert "staging service version" in text -async def test_jbi_metadata(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_jgi_metadata(get_user): txt = "testing text\n" username = "testuser" - jbi_metadata = '{"file_owner": "sdm", "added_date": "2013-08-12T00:21:53.844000"}' + get_user.return_value = username + jgi_metadata = '{"file_owner": "sdm", "added_date": "2013-08-12T00:21:53.844000"}' - async with AppClient(config, username) as cli: + async with AppClient() as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_jgi.fastq"), txt) - f_jgi = fs.make_file( - os.path.join(username, "test", ".test_jgi.fastq.jgi"), jbi_metadata + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_jgi.fastq"), txt) + fs.make_file( + os.path.join(username, "test", ".test_jgi.fastq.jgi"), jgi_metadata ) res1 = await cli.get( os.path.join("jgi-metadata", "test", "test_jgi.fastq"), @@ -247,13 +267,17 @@ async def test_jbi_metadata(): assert res1.status == 404 -async def test_metadata(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_metadata(get_user): txt = "testing text\n" username = "testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + + async with AppClient() as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) res1 = await cli.get( os.path.join("metadata", "test", "test_file_1"), headers={"Authorization": ""}, @@ -276,7 +300,7 @@ async def test_metadata(): assert set(json.keys()) >= set(expected_keys) assert json.get("source") == "Unknown" assert json.get("md5") == "e9018937ab54e6ce88b9e2dfe5053095" - assert json.get("lineCount") == "1" + assert json.get("lineCount") == 1 assert json.get("head") == "testing text\n" assert json.get("tail") == "testing text\n" assert json.get("name") == "test_file_1" @@ -306,7 +330,7 @@ async def test_metadata(): assert set(json.keys()) >= set(expected_keys) assert json.get("source") == "Unknown" assert json.get("md5") == "e9018937ab54e6ce88b9e2dfe5053095" - assert json.get("lineCount") == "1" + assert json.get("lineCount") == 1 assert json.get("head") == "testing text\n" assert json.get("tail") == "testing text\n" assert json.get("name") == "test_file_1" @@ -339,7 +363,7 @@ async def test_metadata(): assert set(json.keys()) >= set(expected_keys) assert json.get("source") == "Unknown" assert json.get("md5") == "e9018937ab54e6ce88b9e2dfe5053095" - assert json.get("lineCount") == "1" + assert json.get("lineCount") == 1 assert json.get("head") == "testing text\n" assert json.get("tail") == "testing text\n" assert json.get("name") == "test_file_1" @@ -347,13 +371,17 @@ async def test_metadata(): assert not json.get("isFolder") -async def test_define_UPA(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_define_UPA(get_user): txt = "testing text\n" username = 
"testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + + async with AppClient() as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) # generating metadata file res1 = await cli.get( os.path.join("metadata", "test", "test_file_1"), @@ -369,7 +397,10 @@ async def test_define_UPA(): ) assert res2.status == 200 json_text = await res2.text() - assert "succesfully updated UPA test_UPA" in json_text + assert ( + f"successfully updated UPA test_UPA for file {username}/test/test_file_1" + in json_text + ) # getting new metadata res3 = await cli.get( @@ -395,7 +426,7 @@ async def test_define_UPA(): assert set(json.keys()) >= set(expected_keys) assert json.get("source") == "Unknown" assert json.get("md5") == "e9018937ab54e6ce88b9e2dfe5053095" - assert json.get("lineCount") == "1" + assert json.get("lineCount") == 1 assert json.get("head") == "testing text\n" assert json.get("tail") == "testing text\n" assert json.get("name") == "test_file_1" @@ -412,7 +443,7 @@ async def test_define_UPA(): ) assert res4.status == 404 - # tesging missing body + # testing missing body res5 = await cli.post( os.path.join("define-upa", "test", "test_file_1"), headers={"Authorization": ""}, @@ -428,13 +459,16 @@ async def test_define_UPA(): assert res6.status == 400 -async def test_mv(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_mv(get_user): txt = "testing text\n" username = "testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) # list current test directory res1 = await cli.get( @@ -502,63 +536,71 @@ async def test_mv(): assert res7.status == 404 -async def test_delete(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_delete(get_user): txt = "testing text\n" username = "testuser" - async with AppClient(config, username) as cli: - with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) - - # list current test directory - res1 = await cli.get( - os.path.join("list", "test"), headers={"Authorization": ""} - ) - assert res1.status == 200 - json_text = await res1.text() - json = decoder.decode(json_text) - assert len(json) == 1 - assert json[0]["name"] == "test_file_1" - - res2 = await cli.delete( - os.path.join("delete", "test", "test_file_1"), - headers={"Authorization": ""}, - ) - assert res2.status == 200 - json_text = await res2.text() - assert "successfully deleted" in json_text - - # relist test directory - res3 = await cli.get( - os.path.join("list", "test"), headers={"Authorization": ""} - ) - assert res3.status == 200 - json_text = await res3.text() - json = decoder.decode(json_text) - assert len(json) == 0 + get_user.return_value = username + with patch("staging_service.auth2Client.KBaseAuth2") as mock_auth: + mock_auth.return_value.get_user.return_value = username + async with AppClient() as cli: + with FileUtil() as fs: + 
fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) - # testing moving root - # res4 = await cli.delete('/delete/', - # headers={'Authorization': ''}) - # assert res4.status == 403 + # list current test directory + res1 = await cli.get( + os.path.join("list", "test"), headers={"Authorization": ""} + ) + assert res1.status == 200 + json_text = await res1.text() + json = decoder.decode(json_text) + assert len(json) == 1 + assert json[0]["name"] == "test_file_1" + + res2 = await cli.delete( + os.path.join("delete", "test", "test_file_1"), + headers={"Authorization": ""}, + ) + assert res2.status == 200 + json_text = await res2.text() + assert "successfully deleted" in json_text - # testing non-existing file - res5 = await cli.delete( - os.path.join("delete", "test", "non_existing.test_file_2"), - headers={"Authorization": ""}, - ) - assert res5.status == 404 + # relist test directory + res3 = await cli.get( + os.path.join("list", "test"), headers={"Authorization": ""} + ) + assert res3.status == 200 + json_text = await res3.text() + json = decoder.decode(json_text) + assert len(json) == 0 + + # testing moving root + # res4 = await cli.delete('/delete/', + # headers={'Authorization': ''}) + # assert res4.status == 403 + + # testing non-existing file + res5 = await cli.delete( + os.path.join("delete", "test", "non_existing.test_file_2"), + headers={"Authorization": ""}, + ) + assert res5.status == 404 -async def test_list(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_list(get_user): txt = "testing text" username = "testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) - d2 = fs.make_dir(os.path.join(username, "test", "test_sub_dir")) - f3 = fs.make_file( + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + fs.make_dir(os.path.join(username, "test", "test_sub_dir")) + fs.make_file( os.path.join(username, "test", "test_sub_dir", "test_file_2"), txt ) res1 = await cli.get("list/..", headers={"Authorization": ""}) @@ -608,7 +650,7 @@ async def test_list(): assert sum(file_folder_count) == 1 # testing list dot-files - f4 = fs.make_file(os.path.join(username, "test", ".test_file_1"), txt) + fs.make_file(os.path.join(username, "test", ".test_file_1"), txt) # f5 = fs.make_file(os.path.join(username, 'test', '.globus_id'), txt) res6 = await cli.get("/list/", headers={"Authorization": ""}) assert res6.status == 200 @@ -637,13 +679,16 @@ async def test_list(): assert len(file_names) == 4 +@patch("staging_service.auth2Client.KBaseAuth2.get_user") @asyncgiven(txt=st.text()) -async def test_download(txt): +@pytest.mark.asyncio +async def test_download(get_user, txt): username = "testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) res = await cli.get( os.path.join("download", "test", "test_file_1"), @@ -654,11 +699,14 @@ async def test_download(txt): assert result_text == txt.encode() -async def 
test_download_errors(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_download_errors(get_user): username = "testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) + fs.make_dir(os.path.join(username, "test")) res1 = await cli.get("dwnload", headers={"Authorization": ""}) assert res1.status == 404 @@ -668,22 +716,25 @@ async def test_download_errors(): assert res2.status == 400 -async def test_similar(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_similar(get_user): txt = "testing text" username = "testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1.fq"), txt) - d1 = fs.make_dir(os.path.join(username, "test", "test_sub_dir")) - f1 = fs.make_file( + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1.fq"), txt) + fs.make_dir(os.path.join(username, "test", "test_sub_dir")) + fs.make_file( os.path.join(username, "test", "test_sub_dir", "test_file_2.fq"), txt ) - f2 = fs.make_file( + fs.make_file( os.path.join(username, "test", "test_sub_dir", "test_file_right.fq"), txt, ) - f3 = fs.make_file( + fs.make_file( os.path.join(username, "test", "test_sub_dir", "my_files"), txt ) @@ -709,24 +760,23 @@ async def test_similar(): assert res3.status == 400 -async def test_existence(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_existence(get_user): txt = "testing text" username = "testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) - d2 = fs.make_dir(os.path.join(username, "test", "test_sub_dir")) - f3 = fs.make_file( + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + fs.make_dir(os.path.join(username, "test", "test_sub_dir")) + fs.make_file( os.path.join(username, "test", "test_sub_dir", "test_file_2"), txt ) - d3 = fs.make_dir( - os.path.join(username, "test", "test_sub_dir", "test_file_1") - ) - d4 = fs.make_dir( - os.path.join(username, "test", "test_sub_dir", "test_sub_dir") - ) - f4 = fs.make_file( + fs.make_dir(os.path.join(username, "test", "test_sub_dir", "test_file_1")) + fs.make_dir(os.path.join(username, "test", "test_sub_dir", "test_sub_dir")) + fs.make_file( os.path.join( username, "test", "test_sub_dir", "test_sub_dir", "test_file_1" ), @@ -775,15 +825,18 @@ async def test_existence(): assert json["isFolder"] is False -async def test_search(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_search(get_user): txt = "testing text" username = "testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test1"), txt) - d2 = fs.make_dir(os.path.join(username, "test", "test2")) - f3 = fs.make_file(os.path.join(username, "test", 
"test2", "test3"), txt) + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test1"), txt) + fs.make_dir(os.path.join(username, "test", "test2")) + fs.make_file(os.path.join(username, "test", "test2", "test3"), txt) res1 = await cli.get("search/", headers={"Authorization": ""}) assert res1.status == 200 json_text = await res1.text() @@ -801,16 +854,19 @@ async def test_search(): assert len(json) == 2 -async def test_upload(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_upload(get_user): txt = "testing text\n" username = "testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + async with AppClient() as cli: # testing missing body res1 = await cli.post("upload", headers={"Authorization": ""}) assert res1.status == 400 with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) + fs.make_dir(os.path.join(username, "test")) f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) files = {"destPath": "/", "uploads": open(f, "rb")} @@ -825,8 +881,7 @@ async def test_upload(): async def _upload_file_fail_filename(filename: str, err: str): # Note two file uploads in a row causes a test error: # https://github.com/aio-libs/aiohttp/issues/3968 - async with AppClient(config, "fake") as cli: # username is ignored by AppClient - + async with AppClient() as cli: formdata = FormData() formdata.add_field("destPath", "/") formdata.add_field("uploads", BytesIO(b"sometext"), filename=filename) @@ -837,29 +892,42 @@ async def _upload_file_fail_filename(filename: str, err: str): assert res.status == 403 -async def test_upload_fail_leading_space(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_upload_fail_leading_space(get_user): + get_user.return_value = "fake" await _upload_file_fail_filename( - " test_file", "cannot upload file with name beginning with space") + " test_file", "cannot upload file with name beginning with space" + ) -async def test_upload_fail_dotfile(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_upload_fail_dotfile(get_user): + get_user.return_value = "whocares" await _upload_file_fail_filename( - ".test_file", "cannot upload file with name beginning with '.'") + ".test_file", "cannot upload file with name beginning with '.'" + ) -async def test_upload_fail_comma_in_file(): - await _upload_file_fail_filename( - "test,file", "cannot upload file with ',' in name") +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_upload_fail_comma_in_file(get_user): + get_user.return_value = "idunno" + await _upload_file_fail_filename("test,file", "cannot upload file with ',' in name") +@patch("staging_service.auth2Client.KBaseAuth2.get_user") @settings(deadline=None) @asyncgiven(contents=st.text()) -async def test_directory_decompression(contents): +@pytest.mark.asyncio +async def test_directory_decompression(get_user, contents): fname = "test" dirname = "dirname" username = "testuser" - path = utils.Path.validate_path(username, os.path.join(dirname, fname)) - path2 = utils.Path.validate_path(username, os.path.join(dirname, fname)) + get_user.return_value = username + path = utils.StagingPath.validate_path(username, os.path.join(dirname, fname)) + path2 = utils.StagingPath.validate_path(username, os.path.join(dirname, fname)) if path.user_path.endswith("/") or path2.user_path.endswith("/"): # invalid test 
case # TODO it should be faster if hypothesis could generate all cases except these @@ -873,7 +941,7 @@ async def test_directory_decompression(contents): ("bztar", ".tar.bz"), ("tar", ".tar"), ] - async with AppClient(config, username) as cli: + async with AppClient() as cli: for method, extension in methods: with FileUtil() as fs: d = fs.make_dir(os.path.join(username, dirname)) @@ -913,53 +981,61 @@ async def test_directory_decompression(contents): assert os.path.exists(f3) +@patch("staging_service.auth2Client.KBaseAuth2.get_user") @asyncgiven(contents=st.text()) -async def test_file_decompression(contents): +@pytest.mark.asyncio +async def test_file_decompression(get_user, contents): fname = "test" dirname = "dirname" username = "testuser" - path = utils.Path.validate_path(username, os.path.join(dirname, fname)) + get_user.return_value = username + path = utils.StagingPath.validate_path(username, os.path.join(dirname, fname)) if path.user_path.endswith("/"): # invalid test case # TODO it should be faster if hypothesis could generate all cases except these return methods = [("gzip", ".gz"), ("bzip2", ".bz2")] - async with AppClient(config, username) as cli: - for method, extension in methods: - with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, dirname)) - f1 = fs.make_file(path.user_path, contents) - name = fname + extension - await utils.run_command(method, f1) - # check to see that the original is gone - assert os.path.exists(d) - assert not os.path.exists(f1) - assert os.path.exists(os.path.join(d, name)) - resp = await cli.patch( - "/decompress/" + os.path.join(dirname, name), - headers={"Authorization": ""}, - ) - assert resp.status == 200 - text = await resp.text() - assert "succesfully decompressed" in text - assert name in text - # check to see if we got back what we started with for all files and directories - assert os.path.exists(d) - assert os.path.exists(f1) - - -async def test_importer_mappings(): + with patch("staging_service.auth2Client.KBaseAuth2") as mock_auth: + mock_auth.return_value.get_user.return_value = username + async with AppClient() as cli: + for method, extension in methods: + with FileUtil() as fs: + d = fs.make_dir(os.path.join(username, dirname)) + f1 = fs.make_file(path.user_path, contents) + name = fname + extension + await utils.run_command(method, f1) + # check to see that the original is gone + assert os.path.exists(d) + assert not os.path.exists(f1) + assert os.path.exists(os.path.join(d, name)) + resp = await cli.patch( + "/decompress/" + os.path.join(dirname, name), + headers={"Authorization": ""}, + ) + assert resp.status == 200 + text = await resp.text() + assert "succesfully decompressed" in text + assert name in text + # check to see if we got back what we started with for all files and directories + assert os.path.exists(d) + assert os.path.exists(f1) + + +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_importer_mappings(get_user): """ This tests calling with simple good cases, and some expected bad cases :return: """ username = "testuser" + get_user.return_value = username # Normal case, no match data = {"file_list": ["file1.txt"]} qs1 = urlencode(data, doseq=True) - async with AppClient(config, username) as cli: + async with AppClient() as cli: resp = await cli.get(f"importer_mappings/?{qs1}") assert resp.status == 200 text = await resp.json() @@ -971,7 +1047,7 @@ async def test_importer_mappings(): data = {"file_list": ["file1.txt", "file.tar.gz"]} qs2 = urlencode(data, 
doseq=True) - async with AppClient(config, username) as cli: + async with AppClient() as cli: resp = await cli.get(f"importer_mappings/?{qs2}", data=data) assert resp.status == 200 text = await resp.json() @@ -987,7 +1063,8 @@ async def test_importer_mappings(): # A dict is passed in data = {"file_list": [{}]} qs3 = urlencode(data, doseq=True) - async with AppClient(config, username) as cli: + + async with AppClient() as cli: resp = await cli.get(f"importer_mappings/?{qs3}", data=data) assert resp.status == 200 text = await resp.json() @@ -1005,62 +1082,80 @@ async def test_importer_mappings(): for data in bad_data: qsd = urlencode(data, doseq=True) - async with AppClient(config, username) as cli: + + async with AppClient() as cli: resp = await cli.get(f"importer_mappings/?{qsd}") assert resp.status == 400 text = await resp.text() - assert f"must provide file_list field. Your provided qs: {unquote(qsd)}" in text + assert ( + f"must provide file_list field. Your provided qs: {unquote(qsd)}" + in text + ) -async def test_bulk_specification_success(): - # In other tests a username is passed to AppClient but AppClient completely ignores it... - async with AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_bulk_specification_success(get_user): + username = "testuser" + get_user.return_value = username + + async with AppClient() as cli: with FileUtil() as fu: - fu.make_dir("testuser/somefolder") # testuser is hardcoded in the auth mock - base = Path(fu.base_dir) / "testuser" + fu.make_dir(f"{username}/somefolder") + base = Path(fu.base_dir) / username tsv = "genomes.tsv" with open(base / tsv, "w") as f: - f.writelines([ - "Data type: genomes; Columns: 3; Version: 1\n", - "spec1\tspec2\t spec3 \n", - "Spec 1\t Spec 2\t Spec 3\n", - "val1 \t ꔆ \t 7\n", - "val3\tval4\t1\n", - ]) + f.writelines( + [ + "Data type: genomes; Columns: 3; Version: 1\n", + "spec1\tspec2\t spec3 \n", + "Spec 1\t Spec 2\t Spec 3\n", + "val1 \t ꔆ \t 7\n", + "val3\tval4\t1\n", + ] + ) csv = "somefolder/breakfastcereals.csv" with open(base / csv, "w") as f: - f.writelines([ - "Data type: breakfastcereals; Columns: 3; Version: 1\n", - "s1,s2,s3\n", - "S 1,S 2,S 3\n", - "froot loops , puffin , gross\n", - "grape nuts , dietary fiber, also gross\n", - ]) + f.writelines( + [ + "Data type: breakfastcereals; Columns: 3; Version: 1\n", + "s1,s2,s3\n", + "S 1,S 2,S 3\n", + "froot loops , puffin , gross\n", + "grape nuts , dietary fiber, also gross\n", + ] + ) excel = "importspec.xlsx" with pandas.ExcelWriter(base / excel) as exw: - df = pandas.DataFrame([ - ["Data type: fruit_bats; Columns: 2; Version: 1"], - ["bat_name", "wing_count"], - ["Name of Bat", "Number of wings"], - ["George", 42], - ["Fred", 1.5] - ]) + df = pandas.DataFrame( + [ + ["Data type: fruit_bats; Columns: 2; Version: 1"], + ["bat_name", "wing_count"], + ["Name of Bat", "Number of wings"], + ["George", 42], + ["Fred", 1.5], + ] + ) df.to_excel(exw, sheet_name="bats", header=False, index=False) - df = pandas.DataFrame([ - ["Data type: tree_sloths; Columns: 2; Version: 1"], - ["entity_id", "preferred_food"], - ["Entity ID", "Preferred Food"], - ["That which ends all", "ꔆ"], - ]) + df = pandas.DataFrame( + [ + ["Data type: tree_sloths; Columns: 2; Version: 1"], + ["entity_id", "preferred_food"], + ["Entity ID", "Preferred Food"], + ["That which ends all", "ꔆ"], + ] + ) df.to_excel(exw, sheet_name="sloths", header=False, index=False) - resp = await cli.get(f"bulk_specification/?files={tsv} , {csv}, 
{excel} ") + resp = await cli.get( + f"bulk_specification/?files={tsv} , {csv}, {excel} " + ) jsn = await resp.json() assert jsn == { "types": { "genomes": [ {"spec1": "val1", "spec2": "ꔆ", "spec3": 7}, - {"spec1": "val3", "spec2": "val4", "spec3": 1} + {"spec1": "val3", "spec2": "val4", "spec3": 1}, ], "breakfastcereals": [ {"s1": "froot loops", "s2": "puffin", "s3": "gross"}, @@ -1072,22 +1167,29 @@ async def test_bulk_specification_success(): ], "tree_sloths": [ {"entity_id": "That which ends all", "preferred_food": "ꔆ"} - ] + ], }, "files": { "genomes": {"file": "testuser/genomes.tsv", "tab": None}, "breakfastcereals": { "file": "testuser/somefolder/breakfastcereals.csv", - "tab": None}, + "tab": None, + }, "fruit_bats": {"file": "testuser/importspec.xlsx", "tab": "bats"}, - "tree_sloths": {"file": "testuser/importspec.xlsx", "tab": "sloths"}, - } + "tree_sloths": { + "file": "testuser/importspec.xlsx", + "tab": "sloths", + }, + }, } assert resp.status == 200 -async def test_bulk_specification_fail_no_files(): - async with AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_bulk_specification_fail_no_files(get_user): + get_user.return_value = "dontcare" + async with AppClient() as cli: for f in ["", "?files=", "?files= , ,, , "]: resp = await cli.get(f"bulk_specification/{f}") jsn = await resp.json() @@ -1095,343 +1197,424 @@ async def test_bulk_specification_fail_no_files(): assert resp.status == 400 -async def test_bulk_specification_fail_not_found(): - async with AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_bulk_specification_fail_not_found(get_user): + username = "testuser" + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fu: - fu.make_dir("testuser/otherfolder") # testuser is hardcoded in the auth mock - base = Path(fu.base_dir) / "testuser" + fu.make_dir(f"{username}/otherfolder") + base = Path(fu.base_dir) / username tsv = "otherfolder/genomes.tsv" with open(base / tsv, "w") as f: - f.writelines([ - "Data type: genomes; Columns: 3; Version: 1\n", - "spec1\tspec2\t spec3 \n", - "Spec 1\t Spec 2\t Spec 3\n", - "val1 \t ꔆ \t 7\n", - ]) + f.writelines( + [ + "Data type: genomes; Columns: 3; Version: 1\n", + "spec1\tspec2\t spec3 \n", + "Spec 1\t Spec 2\t Spec 3\n", + "val1 \t ꔆ \t 7\n", + ] + ) resp = await cli.get(f"bulk_specification/?files={tsv},somefile.csv") jsn = await resp.json() - assert jsn == {"errors": [ - {"type": "cannot_find_file", "file": "testuser/somefile.csv"} - ]} + assert jsn == { + "errors": [ + {"type": "cannot_find_file", "file": f"{username}/somefile.csv"} + ] + } assert resp.status == 404 -async def test_bulk_specification_fail_parse_fail(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_bulk_specification_fail_parse_fail(get_user): """ Tests a number of different parse fail cases, including incorrect file types. Also tests that all the errors are combined correctly. 
""" - async with AppClient(config) as cli: + username = "testuser" + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fu: - fu.make_dir("testuser/otherfolder") # testuser is hardcoded in the auth mock - base = Path(fu.base_dir) / "testuser" + fu.make_dir(f"{username}/otherfolder") + base = Path(fu.base_dir) / username tsv = "otherfolder/genomes.tsv" # this one is fine with open(base / tsv, "w") as f: - f.writelines([ - "Data type: genomes; Columns: 3; Version: 1\n", - "spec1\tspec2\t spec3 \n", - "Spec 1\t Spec 2\t Spec 3\n", - "val1 \t ꔆ \t 7\n", - ]) + f.writelines( + [ + "Data type: genomes; Columns: 3; Version: 1\n", + "spec1\tspec2\t spec3 \n", + "Spec 1\t Spec 2\t Spec 3\n", + "val1 \t ꔆ \t 7\n", + ] + ) csv = "otherfolder/thing.csv" # this one has a misspelling in the header with open(base / csv, "w") as f: - f.writelines([ - "Dater type: breakfastcereals; Columns: 3; Version: 1\n", - "s1,s2,s3\n", - "S 1,S 2,S 3\n", - "froot loops , puffin , gross\n", - ]) + f.writelines( + [ + "Dater type: breakfastcereals; Columns: 3; Version: 1\n", + "s1,s2,s3\n", + "S 1,S 2,S 3\n", + "froot loops , puffin , gross\n", + ] + ) excel = "stuff.xlsx" with pandas.ExcelWriter(base / excel) as exw: # this one is fine - df = pandas.DataFrame([ - ["Data type: fruit_bats; Columns: 2; Version: 1"], - ["bat_name", "wing_count"], - ["Name of Bat", "Number of wings"], - ["George", 42], - ]) + df = pandas.DataFrame( + [ + ["Data type: fruit_bats; Columns: 2; Version: 1"], + ["bat_name", "wing_count"], + ["Name of Bat", "Number of wings"], + ["George", 42], + ] + ) df.to_excel(exw, sheet_name="bats", header=False, index=False) # this one is missing a parameter ID - df = pandas.DataFrame([ - ["Data type: tree_sloths; Columns: 2; Version: 1"], - ["", "preferred_food"], - ["ID", "Foods I like"], - ["Kevin Garibaldi", "Beeeaaaaans!"], - ]) + df = pandas.DataFrame( + [ + ["Data type: tree_sloths; Columns: 2; Version: 1"], + ["", "preferred_food"], + ["ID", "Foods I like"], + ["Kevin Garibaldi", "Beeeaaaaans!"], + ] + ) df.to_excel(exw, sheet_name="sloths", header=False, index=False) # this also tests a number of bad file extensions - no need to create files - resp = await cli.get(f"bulk_specification/?files={tsv},{csv},{excel}" - + ",badfile,badfile.fasta.gz,badfile.sra,badfile.sys,badfile.") + resp = await cli.get( + f"bulk_specification/?files={tsv},{csv},{excel}" + + ",badfile,badfile.fasta.gz,badfile.sra,badfile.sys,badfile." 
+ ) jsn = await resp.json() - assert jsn == {"errors": [ - {"type": "cannot_parse_file", - "message": 'Invalid header; got "Dater type: breakfastcereals; ' - + 'Columns: 3; Version: 1", expected "Data type: ; ' - + 'Columns: ; Version: "', - "file": "testuser/otherfolder/thing.csv", - "tab": None, - }, - {"type": "cannot_parse_file", - "message": "Missing header entry in row 2, position 1", - "file": "testuser/stuff.xlsx", - "tab": "sloths", - }, - {"type": "cannot_parse_file", - "message": "badfile is not a supported file type for import specifications", - "file": "testuser/badfile", - "tab": None, - }, - {"type": "cannot_parse_file", - "message": "fasta.gz is not a supported file type for import specifications", - "file": "testuser/badfile.fasta.gz", - "tab": None, - }, - {"type": "cannot_parse_file", - "message": "sra is not a supported file type for import specifications", - "file": "testuser/badfile.sra", - "tab": None, - }, - {"type": "cannot_parse_file", - "message": "sys is not a supported file type for import specifications", - "file": "testuser/badfile.sys", - "tab": None, - }, - {"type": "cannot_parse_file", - "message": "badfile. is not a supported file type for import specifications", - "file": "testuser/badfile.", - "tab": None, - }, - ]} + assert jsn == { + "errors": [ + { + "type": "cannot_parse_file", + "message": 'Invalid header; got "Dater type: breakfastcereals; ' + + 'Columns: 3; Version: 1", expected "Data type: ; ' + + 'Columns: ; Version: "', + "file": f"{username}/otherfolder/thing.csv", + "tab": None, + }, + { + "type": "cannot_parse_file", + "message": "Missing header entry in row 2, position 1", + "file": f"{username}/stuff.xlsx", + "tab": "sloths", + }, + { + "type": "cannot_parse_file", + "message": "badfile is not a supported file type for import specifications", + "file": f"{username}/badfile", + "tab": None, + }, + { + "type": "cannot_parse_file", + "message": "fasta.gz is not a supported file type for import specifications", + "file": f"{username}/badfile.fasta.gz", + "tab": None, + }, + { + "type": "cannot_parse_file", + "message": "sra is not a supported file type for import specifications", + "file": f"{username}/badfile.sra", + "tab": None, + }, + { + "type": "cannot_parse_file", + "message": "sys is not a supported file type for import specifications", + "file": f"{username}/badfile.sys", + "tab": None, + }, + { + "type": "cannot_parse_file", + "message": "badfile. 
is not a supported file type for import specifications", + "file": f"{username}/badfile.", + "tab": None, + }, + ] + } assert resp.status == 400 -async def test_bulk_specification_fail_column_count(): - async with AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_bulk_specification_fail_column_count(get_user): + username = "testuser" + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fu: - fu.make_dir("testuser") # testuser is hardcoded in the auth mock - base = Path(fu.base_dir) / "testuser" + fu.make_dir(username) + base = Path(fu.base_dir) / username tsv = "genomes.tsv" # this one is fine with open(base / tsv, "w") as f: - f.writelines([ - "Data type: genomes; Columns: 3; Version: 1\n", - "spec1\tspec2\t spec3 \n", - "Spec 1\t Spec 2\t Spec 3\n", - "val1 \t ꔆ \t 7\n", - ]) + f.writelines( + [ + "Data type: genomes; Columns: 3; Version: 1\n", + "spec1\tspec2\t spec3 \n", + "Spec 1\t Spec 2\t Spec 3\n", + "val1 \t ꔆ \t 7\n", + ] + ) csv = "thing.csv" # this one is missing a column in the last row with open(base / csv, "w") as f: - f.writelines([ - "Data type: breakfastcereals; Columns: 3; Version: 1\n", - "s1,s2,s3\n", - "S 1,S 2,S 3\n", - "froot loops , puffin\n", - ]) + f.writelines( + [ + "Data type: breakfastcereals; Columns: 3; Version: 1\n", + "s1,s2,s3\n", + "S 1,S 2,S 3\n", + "froot loops , puffin\n", + ] + ) excel = "stuff.xlsx" with pandas.ExcelWriter(base / excel) as exw: # this one has an extra column in the last row - df = pandas.DataFrame([ - ["Data type: fruit_bats; Columns: 2; Version: 1"], - ["bat_name", "wing_count"], - ["Name of Bat", "Number of wings"], - ["George", 42, 56], - ]) + df = pandas.DataFrame( + [ + ["Data type: fruit_bats; Columns: 2; Version: 1"], + ["bat_name", "wing_count"], + ["Name of Bat", "Number of wings"], + ["George", 42, 56], + ] + ) df.to_excel(exw, sheet_name="bats", header=False, index=False) # this one is fine - df = pandas.DataFrame([ - ["Data type: tree_sloths; Columns: 2; Version: 1"], - ["entity_id", "preferred_food"], - ["Entity ID", "Preferred Food"], - ["That which ends all", "ꔆ"], - ]) + df = pandas.DataFrame( + [ + ["Data type: tree_sloths; Columns: 2; Version: 1"], + ["entity_id", "preferred_food"], + ["Entity ID", "Preferred Food"], + ["That which ends all", "ꔆ"], + ] + ) df.to_excel(exw, sheet_name="sloths", header=False, index=False) - resp = await cli.get( - f"bulk_specification/?files={tsv},{csv},{excel}") + resp = await cli.get(f"bulk_specification/?files={tsv},{csv},{excel}") jsn = await resp.json() - assert jsn == {"errors": [ - {"type": "incorrect_column_count", - "message": "Incorrect number of items in line 4, expected 3, got 2", - "file": "testuser/thing.csv", - "tab": None, - }, - {"type": "incorrect_column_count", - "message": "Incorrect number of items in line 4, expected 2, got 3", - "file": "testuser/stuff.xlsx", - "tab": "bats", - }, - ]} + assert jsn == { + "errors": [ + { + "type": "incorrect_column_count", + "message": "Incorrect number of items in line 4, expected 3, got 2", + "file": f"{username}/thing.csv", + "tab": None, + }, + { + "type": "incorrect_column_count", + "message": "Incorrect number of items in line 4, expected 2, got 3", + "file": f"{username}/stuff.xlsx", + "tab": "bats", + }, + ] + } assert resp.status == 400 -async def test_bulk_specification_fail_multiple_specs_per_type(): - async with AppClient(config) as cli: +@pytest.mark.asyncio 
+@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_bulk_specification_fail_multiple_specs_per_type(get_user): + username = "testuser" + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fu: - fu.make_dir("testuser") # testuser is hardcoded in the auth mock - base = Path(fu.base_dir) / "testuser" + fu.make_dir(username) + base = Path(fu.base_dir) / username tsv = "genomes.tsv" # this one is fine with open(base / tsv, "w") as f: - f.writelines([ - "Data type: genomes; Columns: 3; Version: 1\n", - "spec1\tspec2\t spec3 \n", - "Spec 1\t Spec 2\t Spec 3\n", - "val1 \t ꔆ \t 7\n", - ]) + f.writelines( + [ + "Data type: genomes; Columns: 3; Version: 1\n", + "spec1\tspec2\t spec3 \n", + "Spec 1\t Spec 2\t Spec 3\n", + "val1 \t ꔆ \t 7\n", + ] + ) csv1 = "thing.csv" # this is the first of the breakfastcereals data sources, so fine with open(base / csv1, "w") as f: - f.writelines([ - "Data type: breakfastcereals; Columns: 3; Version: 1\n", - "s1,s2,s3\n", - "S 1,S 2,S 3\n", - "froot loops , puffin, whee\n", - ]) + f.writelines( + [ + "Data type: breakfastcereals; Columns: 3; Version: 1\n", + "s1,s2,s3\n", + "S 1,S 2,S 3\n", + "froot loops , puffin, whee\n", + ] + ) csv2 = "thing2.csv" # this data type is also breakfastcereals, so will cause an error with open(base / csv2, "w") as f: - f.writelines([ - "Data type: breakfastcereals; Columns: 2; Version: 1\n", - "s1,s2\n", - "S 1,S 2\n", - "froot loops , puffin\n", - ]) + f.writelines( + [ + "Data type: breakfastcereals; Columns: 2; Version: 1\n", + "s1,s2\n", + "S 1,S 2\n", + "froot loops , puffin\n", + ] + ) excel = "stuff.xlsx" with pandas.ExcelWriter(base / excel) as exw: # this data type is also breakfastcereals, so will cause an error - df = pandas.DataFrame([ - ["Data type: breakfastcereals; Columns: 2; Version: 1"], - ["bat_name", "wing_count"], - ["Name of Bat", "Number of wings"], - ["George", 42], - ]) + df = pandas.DataFrame( + [ + ["Data type: breakfastcereals; Columns: 2; Version: 1"], + ["bat_name", "wing_count"], + ["Name of Bat", "Number of wings"], + ["George", 42], + ] + ) df.to_excel(exw, sheet_name="bats", header=False, index=False) # this one is fine - df = pandas.DataFrame([ - ["Data type: tree_sloths; Columns: 2; Version: 1"], - ["entity_id", "preferred_food"], - ["Entity ID", "Preferred Food"], - ["That which ends all", "ꔆ"], - ]) + df = pandas.DataFrame( + [ + ["Data type: tree_sloths; Columns: 2; Version: 1"], + ["entity_id", "preferred_food"], + ["Entity ID", "Preferred Food"], + ["That which ends all", "ꔆ"], + ] + ) df.to_excel(exw, sheet_name="sloths", header=False, index=False) resp = await cli.get( - f"bulk_specification/?files={tsv},{csv1},{csv2},{excel}") + f"bulk_specification/?files={tsv},{csv1},{csv2},{excel}" + ) jsn = await resp.json() err = "Data type breakfastcereals appears in two importer specification sources" - assert jsn == {"errors": [ - {"type": "multiple_specifications_for_data_type", - "message": err, - "file_1": "testuser/thing.csv", - "tab_1": None, - "file_2": "testuser/thing2.csv", - "tab_2": None - }, - {"type": "multiple_specifications_for_data_type", - "message": err, - "file_1": "testuser/thing.csv", - "tab_1": None, - "file_2": "testuser/stuff.xlsx", - "tab_2": "bats" - }, - ]} + assert jsn == { + "errors": [ + { + "type": "multiple_specifications_for_data_type", + "message": err, + "file_1": f"{username}/thing.csv", + "tab_1": None, + "file_2": f"{username}/thing2.csv", + "tab_2": None, + }, + { + "type": 
"multiple_specifications_for_data_type", + "message": err, + "file_1": f"{username}/thing.csv", + "tab_1": None, + "file_2": f"{username}/stuff.xlsx", + "tab_2": "bats", + }, + ] + } assert resp.status == 400 -async def test_bulk_specification_fail_multiple_specs_per_type_excel(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_bulk_specification_fail_multiple_specs_per_type_excel(get_user): """ Test an excel file with an internal data type collision. This is the only case when all 5 error fields are filled out. """ - async with AppClient(config) as cli: + username = "testuser" + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fu: - fu.make_dir("testuser") # testuser is hardcoded in the auth mock - base = Path(fu.base_dir) / "testuser" + fu.make_dir(username) + base = Path(fu.base_dir) / username excel = "stuff.xlsx" with pandas.ExcelWriter(base / excel) as exw: - df = pandas.DataFrame([ - ["Data type: breakfastcereals; Columns: 2; Version: 1"], - ["bat_name", "wing_count"], - ["Name of Bat", "Number of wings"], - ["George", 42], - ]) + df = pandas.DataFrame( + [ + ["Data type: breakfastcereals; Columns: 2; Version: 1"], + ["bat_name", "wing_count"], + ["Name of Bat", "Number of wings"], + ["George", 42], + ] + ) df.to_excel(exw, sheet_name="bats", header=False, index=False) - df = pandas.DataFrame([ - ["Data type: tree_sloths; Columns: 2; Version: 1"], - ["entity_id", "preferred_food"], - ["Entity ID", "Preferred Food"], - ["That which ends all", "ꔆ"], - ]) + df = pandas.DataFrame( + [ + ["Data type: tree_sloths; Columns: 2; Version: 1"], + ["entity_id", "preferred_food"], + ["Entity ID", "Preferred Food"], + ["That which ends all", "ꔆ"], + ] + ) df.to_excel(exw, sheet_name="sloths", header=False, index=False) - df = pandas.DataFrame([ - ["Data type: breakfastcereals; Columns: 2; Version: 1"], - ["bat_name", "wing_count"], - ["Name of Bat", "Number of wings"], - ["George", 42], - ]) + df = pandas.DataFrame( + [ + ["Data type: breakfastcereals; Columns: 2; Version: 1"], + ["bat_name", "wing_count"], + ["Name of Bat", "Number of wings"], + ["George", 42], + ] + ) df.to_excel(exw, sheet_name="otherbats", header=False, index=False) resp = await cli.get(f"bulk_specification/?files={excel}") jsn = await resp.json() - assert jsn == {"errors": [ - {"type": "multiple_specifications_for_data_type", - "message": "Found datatype breakfastcereals in multiple tabs", - "file_1": "testuser/stuff.xlsx", - "tab_1": "bats", - "file_2": "testuser/stuff.xlsx", - "tab_2": "otherbats" - }, - ]} + assert jsn == { + "errors": [ + { + "type": "multiple_specifications_for_data_type", + "message": "Found datatype breakfastcereals in multiple tabs", + "file_1": f"{username}/stuff.xlsx", + "tab_1": "bats", + "file_2": f"{username}/stuff.xlsx", + "tab_2": "otherbats", + }, + ] + } assert resp.status == 400 _IMPORT_SPEC_TEST_DATA = { "genome": { - "order_and_display": [ - ["id1", "display1"], - ["id2", "display2"] - ], + "order_and_display": [["id1", "display1"], ["id2", "display2"]], "data": [ {"id1": 54, "id2": "boo"}, {"id1": 32, "id2": "yikes"}, - ] + ], }, "reads": { "order_and_display": [ ["name", "Reads File Name"], - ["inseam", "Reads inseam measurement in km"] + ["inseam", "Reads inseam measurement in km"], ], "data": [ {"name": "myreads.fa", "inseam": 0.1}, - ] - } + ], + }, } -async def test_write_bulk_specification_success_csv(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def 
test_write_bulk_specification_success_csv(get_user): # In other tests a username is passed to AppClient but AppClient completely ignores it... - async with AppClient(config) as cli: + username = "testuser" + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fu: - fu.make_dir("testuser") # testuser is hardcoded in the auth mock - resp = await cli.post("write_bulk_specification/", json= - { + fu.make_dir(username) + resp = await cli.post( + "write_bulk_specification/", + json={ "output_directory": "specs", "output_file_type": "CSV", "types": _IMPORT_SPEC_TEST_DATA, - }) + }, + ) js = await resp.json() assert js == { "output_file_type": "CSV", "files_created": { - "genome": "testuser/specs/genome.csv", - "reads": "testuser/specs/reads.csv" - } + "genome": f"{username}/specs/genome.csv", + "reads": f"{username}/specs/reads.csv", + }, } - base = Path(fu.base_dir) / "testuser" + base = Path(fu.base_dir) / username check_file_contents( base / "specs/genome.csv", [ @@ -1440,7 +1623,7 @@ async def test_write_bulk_specification_success_csv(): "display1,display2\n", "54,boo\n", "32,yikes\n", - ] + ], ) check_file_contents( base / "specs/reads.csv", @@ -1449,33 +1632,39 @@ async def test_write_bulk_specification_success_csv(): "name,inseam\n", "Reads File Name,Reads inseam measurement in km\n", "myreads.fa,0.1\n", - ] + ], ) -async def test_write_bulk_specification_success_tsv(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_success_tsv(get_user): # In other tests a username is passed to AppClient but AppClient completely ignores it... - async with AppClient(config) as cli: + username = "foo" + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fu: - fu.make_dir("testuser") # testuser is hardcoded in the auth mock + fu.make_dir(username) types = dict(_IMPORT_SPEC_TEST_DATA) - types['reads'] = dict(types['reads']) - types['reads']['data'] = [] - resp = await cli.post("write_bulk_specification", json= - { + types["reads"] = dict(types["reads"]) + types["reads"]["data"] = [] + resp = await cli.post( + "write_bulk_specification", + json={ "output_directory": "tsvspecs", "output_file_type": "TSV", "types": types, - }) + }, + ) js = await resp.json() assert js == { "output_file_type": "TSV", "files_created": { - "genome": "testuser/tsvspecs/genome.tsv", - "reads": "testuser/tsvspecs/reads.tsv" - } + "genome": f"{username}/tsvspecs/genome.tsv", + "reads": f"{username}/tsvspecs/reads.tsv", + }, } - base = Path(fu.base_dir) / "testuser" + base = Path(fu.base_dir) / username check_file_contents( base / "tsvspecs/genome.tsv", [ @@ -1484,7 +1673,7 @@ async def test_write_bulk_specification_success_tsv(): "display1\tdisplay2\n", "54\tboo\n", "32\tyikes\n", - ] + ], ) check_file_contents( base / "tsvspecs/reads.tsv", @@ -1492,30 +1681,37 @@ async def test_write_bulk_specification_success_tsv(): "Data type: reads; Columns: 2; Version: 1\n", "name\tinseam\n", "Reads File Name\tReads inseam measurement in km\n", - ] + ], ) -async def test_write_bulk_specification_success_excel(): - # In other tests a username is passed to AppClient but AppClient completely ignores it... 
- async with AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_success_excel(get_user): + username = "leaf" + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fu: - fu.make_dir("testuser") # testuser is hardcoded in the auth mock - resp = await cli.post("write_bulk_specification/", json= - { + fu.make_dir(username) + resp = await cli.post( + "write_bulk_specification/", + json={ "output_directory": "", "output_file_type": "EXCEL", "types": _IMPORT_SPEC_TEST_DATA, - }) + }, + ) js = await resp.json() assert js == { "output_file_type": "EXCEL", "files_created": { - "genome": "testuser/import_specification.xlsx", - "reads": "testuser/import_specification.xlsx", - } + "genome": f"{username}/import_specification.xlsx", + "reads": f"{username}/import_specification.xlsx", + }, } - wb = openpyxl.load_workbook(Path(fu.base_dir) / "testuser/import_specification.xlsx") + wb = openpyxl.load_workbook( + Path(fu.base_dir) / f"{username}/import_specification.xlsx" + ) assert wb.sheetnames == ["genome", "reads"] check_excel_contents( wb, @@ -1542,77 +1738,117 @@ async def test_write_bulk_specification_success_excel(): ) -async def test_write_bulk_specification_fail_wrong_data_type(): - async with AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_fail_wrong_data_type(get_user): + get_user.return_value = "dontcare" + async with AppClient() as cli: resp = await cli.post("write_bulk_specification/", data="foo") js = await resp.json() assert js == {"error": "Required content-type is application/json"} assert resp.status == 415 -async def test_write_bulk_specification_fail_no_content_length(): - async with AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_fail_no_content_length(get_user): + get_user.return_value = "whocares" + async with AppClient() as cli: resp = await cli.post( - "write_bulk_specification", headers={"content-type": "application/json"}) + "write_bulk_specification", headers={"content-type": "application/json"} + ) js = await resp.json() assert js == {"error": "The content-length header is required and must be > 0"} assert resp.status == 411 -async def test_write_bulk_specification_fail_large_input(): - async with AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_fail_large_input(get_user): + get_user.return_value = "idont" + async with AppClient() as cli: resp = await cli.post("write_bulk_specification/", json="a" * (1024 * 1024 - 2)) txt = await resp.text() # this seems to be a built in (somewhat inaccurate) server feature - assert txt == "Maximum request body size 1048576 exceeded, actual body size 1048576" + assert ( + txt + == "Maximum request body size 1048576 exceeded, actual body size 1048576" + ) assert resp.status == 413 async def _write_bulk_specification_json_fail(json: Any, err: str): - async with AppClient(config) as cli: + async with AppClient() as cli: resp = await cli.post("write_bulk_specification", json=json) js = await resp.json() assert js == {"error": err} assert resp.status == 400 -async def test_write_bulk_specification_fail_not_dict(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def 
test_write_bulk_specification_fail_not_dict(get_user): + get_user.return_value = "dontcare" await _write_bulk_specification_json_fail( - ["foo"], "The top level JSON element must be a mapping") + ["foo"], "The top level JSON element must be a mapping" + ) -async def test_write_bulk_specification_fail_no_output_dir(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_fail_no_output_dir(get_user): + get_user.return_value = "dontcare" await _write_bulk_specification_json_fail( - {}, "output_directory is required and must be a string") + {}, "output_directory is required and must be a string" + ) -async def test_write_bulk_specification_fail_wrong_type_for_output_dir(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_fail_wrong_type_for_output_dir(get_user): + get_user.return_value = "dontcare" await _write_bulk_specification_json_fail( - {"output_directory": 4}, "output_directory is required and must be a string") + {"output_directory": 4}, "output_directory is required and must be a string" + ) -async def test_write_bulk_specification_fail_no_file_type(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_fail_no_file_type(get_user): + get_user.return_value = "dontcare" await _write_bulk_specification_json_fail( - {"output_directory": "foo"}, "Invalid output_file_type: None") + {"output_directory": "foo"}, "Invalid output_file_type: None" + ) -async def test_write_bulk_specification_fail_wrong_file_type(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_fail_wrong_file_type(get_user): + get_user.return_value = "dontcare" await _write_bulk_specification_json_fail( - {"output_directory": "foo", "output_file_type": "XSV"}, "Invalid output_file_type: XSV") + {"output_directory": "foo", "output_file_type": "XSV"}, + "Invalid output_file_type: XSV", + ) -async def test_write_bulk_specification_fail_invalid_type_value(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_fail_invalid_type_value(get_user): + get_user.return_value = "dontcare" await _write_bulk_specification_json_fail( {"output_directory": "foo", "output_file_type": "CSV", "types": {"a": "fake"}}, - "The value for data type a must be a mapping" + "The value for data type a must be a mapping", ) -async def test_importer_filetypes(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_importer_filetypes(get_user): """ Only checks a few example entries since the list may expand over time """ - async with AppClient(config) as cli: + get_user.return_value = "dontcare" + async with AppClient() as cli: resp = await cli.get("importer_filetypes") js = await resp.json() assert set(js.keys()) == {"datatype_to_filetype", "filetype_to_extensions"} diff --git a/tests/test_autodetect.py b/tests/test_autodetect.py index 21bbbc82..4e4fe7f9 100644 --- a/tests/test_autodetect.py +++ b/tests/test_autodetect.py @@ -5,7 +5,7 @@ ) from staging_service.AutoDetectUtils import AutoDetectUtils from staging_service.app import inject_config_dependencies -from tests.test_utils import bootstrap_config +from tests.test_helpers import bootstrap_config @pytest.fixture(autouse=True, scope="module") diff --git a/tests/test_utils.py b/tests/test_helpers.py similarity 
index 52% rename from tests/test_utils.py rename to tests/test_helpers.py index 94143456..e6728943 100644 --- a/tests/test_utils.py +++ b/tests/test_helpers.py @@ -1,12 +1,24 @@ import configparser +import os +import shutil import traceback - -from dotenv import load_dotenv from pathlib import Path from typing import Any -import os import openpyxl +from dotenv import load_dotenv + +test_config = configparser.ConfigParser() +test_config.read(os.environ["KB_DEPLOYMENT_CONFIG"]) + +DATA_DIR = test_config["staging_service"]["DATA_DIR"] +META_DIR = test_config["staging_service"]["META_DIR"] +AUTH_URL = test_config["staging_service"]["AUTH_URL"] +if DATA_DIR.startswith("."): + DATA_DIR = os.path.normpath(os.path.join(os.getcwd(), DATA_DIR)) +if META_DIR.startswith("."): + META_DIR = os.path.normpath(os.path.join(os.getcwd(), META_DIR)) + def bootstrap(): test_env_0 = "../test.env" @@ -36,19 +48,49 @@ def assert_exception_correct(got: Exception, expected: Exception): assert got.args == expected.args, err assert type(got) == type(expected) + def check_file_contents(file: Path, lines: list[str]): with open(file) as f: assert f.readlines() == lines + def check_excel_contents( wb: openpyxl.Workbook, sheetname: str, contents: list[list[Any]], - column_widths: list[int] + column_widths: list[int], ): sheet = wb[sheetname] for i, row in enumerate(sheet.iter_rows()): assert [cell.value for cell in row] == contents[i] # presumably there's an easier way to do this, but it works so f it dims = [sheet.column_dimensions[dim].width for dim in sheet.column_dimensions] - assert dims == column_widths \ No newline at end of file + assert dims == column_widths + + +class FileUtil: + def __init__(self, base_dir=DATA_DIR): + self.base_dir = base_dir + + def __enter__(self): + os.makedirs(self.base_dir, exist_ok=True) + shutil.rmtree(self.base_dir) + os.makedirs(self.base_dir, exist_ok=False) + return self + + def __exit__(self, *args): + shutil.rmtree(self.base_dir) + + def make_file(self, path, contents): + path = os.path.join(self.base_dir, path) + with open(path, encoding="utf-8", mode="w") as f: + f.write(contents) + return path + + def make_dir(self, path): + path = os.path.join(self.base_dir, path) + os.makedirs(path, exist_ok=True) + return path + + def remove_dir(self, path): + shutil.rmtree(path) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 599f0cef..341b818f 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,24 +1,31 @@ """ Unit tests for the metadata handling routines. """ import json +import os +import random +import string import uuid - from collections.abc import Generator -from pathlib import Path as PyPath -from pytest import fixture - -from staging_service.utils import Path -from staging_service.metadata import some_metadata +from pathlib import Path -from tests.test_app import FileUtil +from aiohttp import web +from pytest import fixture, raises -# TODO TEST add more unit tests here. 
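# For context, a brief usage sketch for the FileUtil helper added above in
# tests/test_helpers.py (a hypothetical example, not part of the test suite): FileUtil
# scopes test files to DATA_DIR and deletes the whole tree again when the context
# manager exits.
import os
from tests.test_helpers import FileUtil


def sketch_fileutil_usage():
    with FileUtil() as fu:
        user_dir = fu.make_dir("some_user")                    # DATA_DIR/some_user
        notes = fu.make_file("some_user/notes.txt", "hello")   # returns the path written
        assert os.path.exists(user_dir) and os.path.exists(notes)
    # on exit the entire base directory is removed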
+from staging_service.metadata import ( + _determine_source, + _file_read_from_head, + _file_read_from_tail, + is_text_string, + some_metadata, +) +from staging_service.utils import StagingPath +from tests.test_helpers import FileUtil -@fixture(scope="module") -def temp_dir() -> Generator[PyPath, None, None]: +@fixture(scope="function") +def temp_dir() -> Generator[Path, None, None]: with FileUtil() as fu: - childdir = PyPath(fu.make_dir(str(uuid.uuid4()))).resolve() + childdir = Path(fu.make_dir(str(uuid.uuid4()))).resolve() yield childdir @@ -27,31 +34,27 @@ def temp_dir() -> Generator[PyPath, None, None]: async def test_incomplete_metadata_file_update(temp_dir: Path): """ - Tests the case where a file is listed or checked for existance prior to completing + Tests the case where a file is listed or checked for existence prior to completing upload, and then an UPA is added to the file. This previously caused incomplete metadata to be returned as the logic for whether to run the metadata regeneration code based on the contents of the current metadata file was incorrect. See https://kbase-jira.atlassian.net/browse/PTV-1767 """ await _incomplete_metadata_file_update( - temp_dir, - {"source": "some place", "UPA": "1/2/3"}, - "some place" - ) + temp_dir, {"source": "some place", "UPA": "1/2/3"}, "some place" + ) + + await _incomplete_metadata_file_update(temp_dir, {"UPA": "1/2/3"}, "Unknown") - await _incomplete_metadata_file_update( - temp_dir, - {"UPA": "1/2/3"}, - "Unknown" - ) async def _incomplete_metadata_file_update(temp_dir, metadict, source): - target = Path( - str(temp_dir / "full"), - str(temp_dir / "meta"), + target = StagingPath( + os.path.join(temp_dir, "full"), + os.path.join(temp_dir, "meta"), "user_path", "myfilename", - "super_fake_jgi_path") + "super_fake_jgi_path", + ) with open(target.full_path, "w") as p: p.writelines(make_test_lines(1, 6)) @@ -70,11 +73,283 @@ async def _incomplete_metadata_file_update(temp_dir, metadict, source): "tail": "".join(make_test_lines(2, 6)), "UPA": "1/2/3", "isFolder": False, - "lineCount": "5", + "lineCount": 5, "name": "myfilename", "path": "user_path", - "size": 1280 - } + "size": 1280, + } + def make_test_lines(start, stop): return [str(i) + "a" * (256 - len(str(i)) - 1) + "\n" for i in range(start, stop)] + + +async def _generate_binary_file(temp_dir): + target = StagingPath( + os.path.join(temp_dir, "full"), + os.path.join(temp_dir, "meta"), + "user_path", + "myfilename", + "", + ) + with open(target.full_path, "wb") as data_file: + data_file.write(b"\0\1\2\3") + + res = await some_metadata(target) + + assert "mtime" in res + assert isinstance(res["mtime"], int) + assert "lineCount" in res + assert res["lineCount"] is None + assert res["head"] == "not a text file" + assert res["tail"] == "not a text file" + + +async def test_binary_data(temp_dir: Path): + """ + Test case in which metadata is generated for a binary file. 
+ """ + await _generate_binary_file(temp_dir) + + +def test_is_text(): + should_be_text = b"hello I'm text" + should_not_be_text = b"\0\1\2\4" + is_almost_text = b"hello \0oops" + text_with_acceptable_control_characters = b"hello\tthis \nis a\rstring" + invalid_unicode = b"\ac2\xa3" + + assert is_text_string(should_be_text) is True + assert is_text_string(text_with_acceptable_control_characters) is True + assert is_text_string(should_not_be_text) is False + assert is_text_string(is_almost_text) is False + assert is_text_string(invalid_unicode) is False + + +async def test_determine_source_jgi_metadata_source_exists(temp_dir: Path): + staging_path = StagingPath( + os.path.join(temp_dir, "full"), + os.path.join(temp_dir, "meta"), + "user_path", + "myfilename", + os.path.join(temp_dir, "jgi_metadata"), + ) + + # In this test, the actual data doesn't matter, just the + # fact that a file exists. + with open(staging_path.jgi_metadata, "w") as data_file: + data_file.write("foo") + + assert _determine_source(staging_path) == "JGI import" + + +def _create_validated_staging_path(staging_dir: str, file_path: str): + # Model actual usage, in which we have: + # - a user + # - a directory dedicated to user data - each user has a subdirectory + # inside it with the name the same as their username + # - a directory dedicated to metadata, parallel to the data dir + # - some file, which is the target of our path + username = "some_user" + data_dir = os.path.join(staging_dir, "data") + metadata_dir = os.path.join(staging_dir, "metadata") + + # Must create the data directory for the user, as the + # jgi metadata file lives IN the data dir (even though KBase + # metadata files live in the metadata parallel directory) + os.makedirs(os.path.join(data_dir, username, os.path.dirname(file_path))) + + # Here we "configure" the class. Note that this is global for the class. + StagingPath._DATA_DIR = data_dir + StagingPath._META_DIR = metadata_dir + + # Note that "validate_path" does not actually validate the path, + # it just populates the object with the various paths based on + # the config above, the username, and a file, possibly on a path. + return StagingPath.validate_path(username, file_path) + + +def test_determine_source_jgi_metadata_source_exists_canonical(temp_dir: Path): + """ + Tests the case of a simple, top level file. 
+ """ + file_path = "foo.bar" + filename_base = os.path.basename(file_path) + staging_path = _create_validated_staging_path(temp_dir, file_path) + with open(staging_path.jgi_metadata, "w") as data_file: + data_file.write(f".{staging_path.full_path}.{filename_base}jgi") + + assert _determine_source(staging_path) == "JGI import" + + +def test_determine_source_jgi_metadata_source_exists_canonical_subdir(temp_dir: Path): + """ + Tests the case of a file in a subdirectory + """ + file_path = "some/foo.bar" + filename_base = os.path.basename(file_path) + staging_path = _create_validated_staging_path(temp_dir, file_path) + with open(staging_path.jgi_metadata, "w") as data_file: + data_file.write(f".{staging_path.full_path}.{filename_base}jgi") + + assert _determine_source(staging_path) == "JGI import" + + +def test_determine_source_jgi_metadata_source_exists_canonical_deepsubdir( + temp_dir: Path, +): + """ + Tests the case of a file in a deeply nested directory + """ + file_path = "some/deep/dark/scary/foo.bar" + filename_base = os.path.basename(file_path) + staging_path = _create_validated_staging_path(temp_dir, file_path) + with open(staging_path.jgi_metadata, "w") as data_file: + data_file.write(f".{staging_path.full_path}.{filename_base}jgi") + + assert _determine_source(staging_path) == "JGI import" + + +def test_determine_source_jgi_metadata_source_doesnt_exist(temp_dir: StagingPath): + # + # The _determine_source logic tests if the jgi_metadata exists for this + # StagingPath object. Unfortunately, it currently tests if the file indicated + # by the string exists and is a file. It always checks if the file exists, + # so it must be populated by SOME string. + # + # This may seem strange, but the reason is that a StagingPath is NEVER created + # with the constructor in the code itself, as we do here. It is always + # constructed by it's own methods, the only two methods it has, which + # blindly construct the path the file, whether it exists or not. + # + # I really don't quite understand this design, but that is the way it is, + # for now. + # + # So the conditions under which "no jgi metadata" is determined are: + # - path which is not associated with a file or directory (not found) + # - path associated with a directory + # + values_that_indicate_no_jgi_import_metadata_file = [ + "", # common case of leaving it empty if no metadata file possible + "zzz", # nonsensical file name + os.path.join( + temp_dir, "jgi_metadata" + ), # in the right place, but we still doesn't exist + os.path.join(temp_dir), # a directory + ] + for invalid_path in values_that_indicate_no_jgi_import_metadata_file: + staging_path = StagingPath( + os.path.join(temp_dir, "full"), + os.path.join(temp_dir, "meta"), + "user_path", + "myfilename", + invalid_path, + ) + assert _determine_source(staging_path) == "Unknown" + + +@fixture(scope="function") +def temp_dir2() -> Generator[Path, None, None]: + with FileUtil() as fu: + childdir = Path(fu.make_dir(str(uuid.uuid4()))).resolve() + + yield childdir + + +def make_random_string(string_length: str) -> str: + possible_letters = string.ascii_letters + return "".join(random.choice(possible_letters) for i in range(string_length)) + + +def make_random_non_text_bytes(string_length: str) -> bytes: + """ + We consider non-text to be invalid utf-8, or unicode + control characters other than 9, 10, 10. 
+ """ + possible_letters = string.ascii_letters + return "".join(random.choice(possible_letters) for i in range(string_length)) + + +def test_read_from_head_happy(tmp_path: Path): + """ + In which we read a text snippet successfully from the front of the file. + """ + cases = [1, 10, 100, 1000, 10000, 100000] + # we just happen to know this from metadata.py... + snippet_length = 1024 + for case in cases: + file_path = os.path.join(tmp_path, "foo.txt") + file_content = make_random_string(case) + with open(file_path, "w") as output_file: + output_file.write(file_content) + snippet = _file_read_from_head(file_path) + assert snippet == file_content[:snippet_length] + + +def test_read_from_tail_happy(tmp_path: Path): + """ + In which we read a text snippet successfully from the tail of the file. + """ + cases = [1, 10, 100, 1000, 10000, 100000] + # we just happen to know this from metadata.py... + snippet_length = 1024 + for case in cases: + file_path = os.path.join(tmp_path, "foo.txt") + file_content = make_random_string(case) + with open(file_path, "w") as output_file: + output_file.write(file_content) + snippet = _file_read_from_tail(file_path) + assert snippet == file_content[-snippet_length:] + + +def test_read_from_head_sad(tmp_path: Path): + """ + In which we attempt to read a snippet from a non-text (binary) + file, and get the default "error" string. + """ + cases = [b"\ac2\xa3"] + # we just happen to know this from metadata.py... + # snippet_length = 1024 + for case in cases: + file_path = os.path.join(tmp_path, "foo.txt") + file_content = case + with open(file_path, "wb") as output_file: + output_file.write(file_content) + snippet = _file_read_from_head(file_path) + assert snippet == "not a text file" + + +def test_read_from_tail_sad(tmp_path: Path): + """ + In which we attempt to read a snippet from a non-text (binary) + file, and get the default "error" string. + """ + cases = [b"\ac2\xa3"] + # we just happen to know this from metadata.py... + # snippet_length = 1024 + for case in cases: + file_path = os.path.join(tmp_path, "foo.txt") + file_content = case + with open(file_path, "wb") as output_file: + output_file.write(file_content) + snippet = _file_read_from_tail(file_path) + assert snippet == "not a text file" + + +async def test_invalid_desired_fields(temp_dir: Path): + staging_path = StagingPath( + os.path.join(temp_dir, "full"), + os.path.join(temp_dir, "meta"), + "user_path", + "myfilename", + "", + ) + + with open(staging_path.full_path, "w") as p: + p.write("foo") + + with raises(web.HTTPBadRequest) as ex: + await some_metadata(staging_path, ["foo"]) + assert str(ex.value) == "Bad Request" + # Okay, not a lovely error message, but it is what it is, and should be improved. 
+ assert ex.value.text == "no data exists for key ('foo',)" From 7c679b672438fe65b7ab6db0250c25ad232c5e39 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Wed, 14 Jun 2023 11:36:06 -0700 Subject: [PATCH 08/33] Run tests in GHA with Python 3.11 [PTV-1882] Matches the dockerfile changes --- .github/workflows/run_tests.yml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 2a2996a5..9e9f7120 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -1,22 +1,17 @@ # This workflow will install Python dependencies, run tests and lint with a single version of Python # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - name: Run Staging Service Tests - on: [pull_request] - jobs: build: - runs-on: ubuntu-latest - steps: - uses: actions/checkout@v2 - - name: Set up Python 3.9 + - name: Set up Python 3.11 uses: actions/setup-python@v2 with: - python-version: 3.9 + python-version: 3.11 - name: Install dependencies run: | python -m pip install --upgrade pip From 18aad93272d3b58a0abc49540c2ab3811b55c591 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Wed, 14 Jun 2023 11:39:11 -0700 Subject: [PATCH 09/33] Development tools [PTV-1882] Isolated development tools, to aid in development and testing within a container. Although the host-based server and testing works, it requires installation of the precise same compiler version, which is not really possible cross-platform (though pretty close). Also dependencies in linux (and in the specific version in deployment), do affect Python dependencies, so this reduces that variable greatly. (Of course, the way the service is run in a container in production will be different.) --- development/docker-compose-kbase-ui.yml | 27 +++++++++++++++++++++ development/scripts/README.md | 13 ++++++++++ development/scripts/generate-data-files.py | 15 ++++++++++++ development/scripts/list-uploads.sh | 11 +++++++++ development/scripts/run | 6 +++++ development/scripts/run-dev-server.sh | 6 +++++ development/tools/runner/Dockerfile | 26 ++++++++++++++++++++ development/tools/runner/docker-compose.yml | 10 ++++++++ development/tools/runner/entrypoint.sh | 17 +++++++++++++ 9 files changed, 131 insertions(+) create mode 100644 development/docker-compose-kbase-ui.yml create mode 100644 development/scripts/README.md create mode 100644 development/scripts/generate-data-files.py create mode 100755 development/scripts/list-uploads.sh create mode 100755 development/scripts/run create mode 100755 development/scripts/run-dev-server.sh create mode 100644 development/tools/runner/Dockerfile create mode 100644 development/tools/runner/docker-compose.yml create mode 100755 development/tools/runner/entrypoint.sh diff --git a/development/docker-compose-kbase-ui.yml b/development/docker-compose-kbase-ui.yml new file mode 100644 index 00000000..7efb10d4 --- /dev/null +++ b/development/docker-compose-kbase-ui.yml @@ -0,0 +1,27 @@ +# This docker compose file works with kbase-ui - specifically the kbase-dev +# network that allows proxying service requests from kbase-ui and the narrative. +version: "3.8" +networks: + kbase-dev: + name: kbase-dev +services: + staging_service: + hostname: staging_service + build: + context: . 
+ ports: + # TODO: should be port 5000 to match other kbase services + - "3010:3000" + volumes: + - "./data:/kb/deployment/lib/src/data" + - "./staging_service:/kb/deployment/lib/staging_service" + networks: + - kbase-dev + dns: + - 8.8.8.8 + environment: + - KB_DEPLOYMENT_CONFIG=/kb/deployment/conf/deployment.cfg + - FILE_LIFETIME=90 + # - SAVE_STRATEGY=SAVE_STRATEGY_TEMP_THEN_COPY + # - SAVE_STRATEGY=SAVE_STRATEGY_SAVE_TO_DESTINATION + # - CHUNK_SIZE= diff --git a/development/scripts/README.md b/development/scripts/README.md new file mode 100644 index 00000000..3f18d5b3 --- /dev/null +++ b/development/scripts/README.md @@ -0,0 +1,13 @@ +# Scripts + +## `generate-data-files.py` + +A small tool, an example function really, to generate files for manual testing of upload and download. I use it by copy/pasting into a Python REPL session in the directory in which I want to generate files, and just run the function with the required parameters. + +## `list-uploads.sh` + +Handy for listing the contents of the `./data` directory while developing the `/upload` endpoint of the server locally. It simply lists the directory every 0.5s. + +## `run` + +TODO diff --git a/development/scripts/generate-data-files.py b/development/scripts/generate-data-files.py new file mode 100644 index 00000000..854e6804 --- /dev/null +++ b/development/scripts/generate-data-files.py @@ -0,0 +1,15 @@ +import io + + +def generate_null_file(size: int, name: str): + """ + Generate a file full of 0 bytes, for testing different + file size scenarios. + + e.g. generate_null_file(5100000000, "5.1g.out") + """ + with open(name, "wb") as out: + bw = io.BufferedWriter(out, 10000000) + for i in range(size): + bw.write(b"\0") + bw.flush() diff --git a/development/scripts/list-uploads.sh b/development/scripts/list-uploads.sh new file mode 100755 index 00000000..08f6ef46 --- /dev/null +++ b/development/scripts/list-uploads.sh @@ -0,0 +1,11 @@ +# +# A handy script to display the contents of the data directory in a +# terminal window, refreshing every 0.5 seconds and excluding +# any macOS .DS_Store Desktop Services files. +# +while : +do + clear + find data -type f \( ! -name ".DS_Store" \) -ls + sleep 0.5 +done \ No newline at end of file diff --git a/development/scripts/run b/development/scripts/run new file mode 100755 index 00000000..c63ddd7d --- /dev/null +++ b/development/scripts/run @@ -0,0 +1,6 @@ +#!/bin/sh +docker compose \ + -f "development/tools/runner/docker-compose.yml" \ + --project-directory "${PWD}" \ + --project-name tools \ + run --rm runner "${@:1}" \ No newline at end of file diff --git a/development/scripts/run-dev-server.sh b/development/scripts/run-dev-server.sh new file mode 100755 index 00000000..16b9c467 --- /dev/null +++ b/development/scripts/run-dev-server.sh @@ -0,0 +1,6 @@ + +echo "Running server in development..." +docker compose \ + -f development/docker-compose-kbase-ui.yml \ + --project-directory "${PWD}" \ + run --rm staging_service \ No newline at end of file diff --git a/development/tools/runner/Dockerfile b/development/tools/runner/Dockerfile new file mode 100644 index 00000000..93fa3078 --- /dev/null +++ b/development/tools/runner/Dockerfile @@ -0,0 +1,26 @@ +FROM python:3.11.4-slim-buster +# ----------------------------------------- +RUN apt-get update && apt-get install -y zip unzip bzip2 libmagic-dev htop wget + +SHELL ["/bin/bash", "-c"] + +RUN mkdir -p /app && chmod -R a+rw /app + +WORKDIR /app + +# Standard simplified python setup. 
+COPY ./requirements.txt /app/requirements.txt +RUN python -m pip install --upgrade pip && \ + pip install --upgrade setuptools && \ + pip install -r /app/requirements.txt + +COPY ./globus.cfg /etc/globus.cfg +RUN touch /var/log/globus.log && chmod 777 /var/log/globus.log + +# ENV PATH="/root/.local/bin:$PATH" +# ENV PYTHONPATH="/kb/module/src" +ENV PATH="/app:${PATH}" + +ENTRYPOINT [ "/app/development/tools/runner/entrypoint.sh" ] + +CMD [ "bash" ] \ No newline at end of file diff --git a/development/tools/runner/docker-compose.yml b/development/tools/runner/docker-compose.yml new file mode 100644 index 00000000..f1b8e1ea --- /dev/null +++ b/development/tools/runner/docker-compose.yml @@ -0,0 +1,10 @@ +version: '3.8' +services: + runner: + build: + context: . + dockerfile: development/tools/runner/Dockerfile + volumes: + - .:/app + command: + - bash diff --git a/development/tools/runner/entrypoint.sh b/development/tools/runner/entrypoint.sh new file mode 100755 index 00000000..2ba1545b --- /dev/null +++ b/development/tools/runner/entrypoint.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -e + +# Nice to set this up for all future python code being run. +export PYTHONPATH="${PWD}/src" + +# +# This execs whatever is provided as a COMMAND to the container. By default, as established +# in the image via the Dockerfile, it is scripts/start-server.sh +# This mechanism, though, allows us to run anything inside the container. We use this +# for running tests, mypy, black, etc. through docker compose. +# Note that surrounding $@ with double quote causes the expansion (which starts with +# parameter 1 not 0) to result in each parameter being retained without splitting. +# Sort of like "$@" -> "$1" "$2" "$3" .. +# +exec "$@" From baa17da55e4ef856ac7eafd833b3e0188a25e991 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Wed, 14 Jun 2023 11:40:50 -0700 Subject: [PATCH 10/33] Standalone small testing doc [PTV-1882] This adds a small markdown file to document testing procedures, also covering the just-added development tools for running w/in a docker container. More docs should be added to cover, e.g., configuration, service architecture, etc. --- docs/testing.md | 64 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 docs/testing.md diff --git a/docs/testing.md b/docs/testing.md new file mode 100644 index 00000000..4b6a0af4 --- /dev/null +++ b/docs/testing.md @@ -0,0 +1,64 @@ +# Testing + +## In Container + +You can run tests in a container which uses the same base image and uses the same dependencies. (This container can also run other python tasks.) + +```shell +./development/scripts/run run_tests.sh +``` + +To run tests in individual test file you may supply the path to it. By default, the tests run against `tests/`. + +```shell +./development/scripts/run run_tests.sh tests/test_app.py +``` + +## On Host + +### install or activate python 3.11 + +E.g. macOS with macports: + +```shell +sudo port install python311 +sudo port select --set python python311 +``` + +### install venv + +```shell +python -m venv venv +``` + +### update pip + +```shell +python -m pip install --upgrade pip +``` + +### install deps + +```shell +pip install -r requirements.txt +``` + +### run tests + +```shell +./run_tests.sh +``` + +### Other + +#### cherrypick tests + +e.g. 
to just test app.py: + +```shell +./run_tests.sh tests/test_app.py +``` + +#### coverage + +check coverage in `htmlcov/index.html` From 9222cc3534c13d6c49557acd23843c6f160923cd Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Wed, 14 Jun 2023 16:44:35 -0700 Subject: [PATCH 11/33] making codacy, and me, happier [PTV-1882] --- Dockerfile | 7 ++-- development/scripts/generate-data-files.py | 2 +- development/scripts/list-uploads.sh | 2 ++ development/scripts/run-dev-server.sh | 1 + development/tools/runner/Dockerfile | 11 +++--- run_tests.sh | 2 +- staging_service/app.py | 40 +++++++++++----------- staging_service/metadata.py | 7 ++-- 8 files changed, 37 insertions(+), 35 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1a7839b0..d798ebc1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,14 @@ FROM python:3.11.4-slim-buster # ----------------------------------------- RUN mkdir -p /kb/deployment/lib -RUN apt-get update && apt-get install -y zip unzip bzip2 libmagic-dev htop wget +RUN apt-get update && \ + apt-get install -y --no-install-recommends zip=3.0-11+b1 unzip=6.0-23+deb10u3 bzip2=1.0.6-9.2~deb10u2 libmagic-dev=1:5.35-4+deb10u2 htop=2.2.0-1+b1 wget=1.20.1-1.1 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* # Standard simplified python setup. COPY ./requirements.txt /requirements.txt -RUN python -m pip install --upgrade pip && pip install -r /requirements.txt +RUN python -m pip install "pip==23.1.2" && pip install -r /requirements.txt COPY ./globus.cfg /etc/globus.cfg RUN touch /var/log/globus.log && chmod 777 /var/log/globus.log diff --git a/development/scripts/generate-data-files.py b/development/scripts/generate-data-files.py index 854e6804..ceebe63d 100644 --- a/development/scripts/generate-data-files.py +++ b/development/scripts/generate-data-files.py @@ -10,6 +10,6 @@ def generate_null_file(size: int, name: str): """ with open(name, "wb") as out: bw = io.BufferedWriter(out, 10000000) - for i in range(size): + for _ in range(size): bw.write(b"\0") bw.flush() diff --git a/development/scripts/list-uploads.sh b/development/scripts/list-uploads.sh index 08f6ef46..10f87f8e 100755 --- a/development/scripts/list-uploads.sh +++ b/development/scripts/list-uploads.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # # A handy script to display the contents of the data directory in a # terminal window, refreshing every 0.5 seconds and excluding diff --git a/development/scripts/run-dev-server.sh b/development/scripts/run-dev-server.sh index 16b9c467..6c2f6d65 100755 --- a/development/scripts/run-dev-server.sh +++ b/development/scripts/run-dev-server.sh @@ -1,3 +1,4 @@ +#!/bin/bash echo "Running server in development..." docker compose \ diff --git a/development/tools/runner/Dockerfile b/development/tools/runner/Dockerfile index 93fa3078..00cfbd93 100644 --- a/development/tools/runner/Dockerfile +++ b/development/tools/runner/Dockerfile @@ -1,6 +1,9 @@ FROM python:3.11.4-slim-buster # ----------------------------------------- -RUN apt-get update && apt-get install -y zip unzip bzip2 libmagic-dev htop wget +RUN apt-get update && \ + apt-get install -y --no-install-recommends zip=3.0-11+b1 unzip=6.0-23+deb10u3 bzip2=1.0.6-9.2~deb10u2 libmagic-dev=1:5.35-4+deb10u2 htop=2.2.0-1+b1 wget=1.20.1-1.1 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* SHELL ["/bin/bash", "-c"] @@ -10,15 +13,11 @@ WORKDIR /app # Standard simplified python setup. 
COPY ./requirements.txt /app/requirements.txt -RUN python -m pip install --upgrade pip && \ - pip install --upgrade setuptools && \ - pip install -r /app/requirements.txt +RUN python -m pip install "pip==23.1.2" && pip install -r /app/requirements.txt COPY ./globus.cfg /etc/globus.cfg RUN touch /var/log/globus.log && chmod 777 /var/log/globus.log -# ENV PATH="/root/.local/bin:$PATH" -# ENV PYTHONPATH="/kb/module/src" ENV PATH="/app:${PATH}" ENTRYPOINT [ "/app/development/tools/runner/entrypoint.sh" ] diff --git a/run_tests.sh b/run_tests.sh index 4958f8c5..52ea58d5 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -10,5 +10,5 @@ echo "** Running tests in ${TESTS}" echo "**" echo "****************************" echo -python3 -m pytest -s -vv --cov=staging_service --cov-report term --cov-report html $TESTS +python3 -m pytest -s -vv --cov=staging_service --cov-report term --cov-report html "${TESTS}" diff --git a/staging_service/app.py b/staging_service/app.py index 373ffbdf..c7c5920b 100644 --- a/staging_service/app.py +++ b/staging_service/app.py @@ -14,15 +14,16 @@ from staging_service.config import ( UPLOAD_SAVE_STRATEGY_SAVE_TO_DESTINATION, UPLOAD_SAVE_STRATEGY_TEMP_THEN_COPY, - get_config, get_max_content_length, + get_config, + get_max_content_length, get_max_file_size, get_read_chunk_size, get_save_strategy, ) -from .AutoDetectUtils import AutoDetectUtils -from .JGIMetadata import read_metadata_for + from .app_error_formatter import format_import_spec_errors from .autodetect.Mappings import CSV, EXCEL, TSV +from .AutoDetectUtils import AutoDetectUtils from .globus import assert_globusid_exists, is_globusid from .import_specifications.file_parser import ( ErrorType, @@ -36,6 +37,7 @@ write_tsv, ) from .import_specifications.individual_parsers import parse_csv, parse_excel, parse_tsv +from .JGIMetadata import read_metadata_for from .metadata import add_upa, dir_info, similar, some_metadata from .utils import AclManager, StagingPath, auth_client, run_command @@ -472,7 +474,7 @@ async def _handle_upload_save(stream: MultipartReader, destination_path: Staging max_file_size = get_max_file_size() if save_strategy == UPLOAD_SAVE_STRATEGY_TEMP_THEN_COPY: async with aiofiles.tempfile.NamedTemporaryFile( - "wb", delete=True + "wb", delete=True ) as output_file: # temp_file_name = output_file.name actual_size = 0 @@ -480,8 +482,8 @@ async def _handle_upload_save(stream: MultipartReader, destination_path: Staging if actual_size > max_file_size: raise web.HTTPBadRequest( text=( - f"file size reached {actual_size} bytes which exceeds " - + f"the maximum allowed of {max_file_size} bytes" + f"file size reached {actual_size} bytes which exceeds " + + f"the maximum allowed of {max_file_size} bytes" ) ) @@ -497,10 +499,10 @@ async def _handle_upload_save(stream: MultipartReader, destination_path: Staging await output_file.write(chunk) async with aiofiles.tempfile.NamedTemporaryFile( - "rb", - delete=False, - dir=os.path.dirname(destination_path.full_path), - prefix=".upload.", + "rb", + delete=False, + dir=os.path.dirname(destination_path.full_path), + prefix=".upload.", ) as copied_file: shutil.copyfile(output_file.name, copied_file.name) @@ -508,10 +510,10 @@ async def _handle_upload_save(stream: MultipartReader, destination_path: Staging elif save_strategy == UPLOAD_SAVE_STRATEGY_SAVE_TO_DESTINATION: async with aiofiles.tempfile.NamedTemporaryFile( - "wb", - delete=False, - dir=os.path.dirname(destination_path.full_path), - prefix=".upload.", + "wb", + delete=False, + 
dir=os.path.dirname(destination_path.full_path), + prefix=".upload.", ) as output_file: temp_file_name = output_file.name actual_size = 0 @@ -519,8 +521,8 @@ async def _handle_upload_save(stream: MultipartReader, destination_path: Staging if actual_size > max_file_size: raise web.HTTPBadRequest( text=( - f"file size reached {actual_size:,} bytes which exceeds " - + f"the maximum allowed of {max_file_size:,} bytes)" + f"file size reached {actual_size:,} bytes which exceeds " + + f"the maximum allowed of {max_file_size:,} bytes)" ) ) @@ -590,8 +592,6 @@ async def upload_files_chunked(request: web.Request): if user_file_part.name != "uploads": raise web.HTTPBadRequest(text="must provide uploads in body") - file_content_length = user_file_part.headers.get("content-length") - filename: str = unquote(user_file_part.filename) _validate_filename(filename) @@ -739,11 +739,11 @@ async def decompress(request: web.Request): # 3 just overwrite and force destination = os.path.dirname(path.full_path) if ( - upper_file_extension == ".tar" and file_extension == ".gz" + upper_file_extension == ".tar" and file_extension == ".gz" ) or file_extension == ".tgz": await run_command("tar", "xzf", path.full_path, "-C", destination) elif upper_file_extension == ".tar" and ( - file_extension == ".bz" or file_extension == ".bz2" + file_extension == ".bz" or file_extension == ".bz2" ): await run_command("tar", "xjf", path.full_path, "-C", destination) elif file_extension == ".zip" or file_extension == ".ZIP": diff --git a/staging_service/metadata.py b/staging_service/metadata.py index 5bea154a..3a233c54 100644 --- a/staging_service/metadata.py +++ b/staging_service/metadata.py @@ -168,7 +168,7 @@ async def _generate_metadata(path: StagingPath, source: str = None): size = 0 - md5 = hashlib.md5() + md5 = hashlib.md5() # noqua: S303 async with aiofiles.open(path.full_path, "rb") as fin: while True: chunk = await fin.read(READ_BUFFER_SIZE) @@ -269,10 +269,7 @@ def _determine_source(path: StagingPath): currently this works for JGI imported files only """ jgi_path = path.jgi_metadata - if ( - os.path.exists(jgi_path) - and os.path.isfile(jgi_path) - ): + if os.path.exists(jgi_path) and os.path.isfile(jgi_path): return "JGI import" else: return "Unknown" From 3651c5d458a8ef26362d6039c927bcb0c26b9635 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Wed, 14 Jun 2023 17:24:12 -0700 Subject: [PATCH 12/33] make random string generator deterministic [PTV-1882] --- tests/test_metadata.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 341b818f..74a32a6f 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -12,6 +12,7 @@ from pytest import fixture, raises from staging_service.metadata import ( + FILE_SNIPPET_SIZE, _determine_source, _file_read_from_head, _file_read_from_tail, @@ -257,15 +258,7 @@ def temp_dir2() -> Generator[Path, None, None]: def make_random_string(string_length: str) -> str: - possible_letters = string.ascii_letters - return "".join(random.choice(possible_letters) for i in range(string_length)) - - -def make_random_non_text_bytes(string_length: str) -> bytes: - """ - We consider non-text to be invalid utf-8, or unicode - control characters other than 9, 10, 10. 
- """ + random.seed(42) possible_letters = string.ascii_letters return "".join(random.choice(possible_letters) for i in range(string_length)) @@ -276,11 +269,11 @@ def test_read_from_head_happy(tmp_path: Path): """ cases = [1, 10, 100, 1000, 10000, 100000] # we just happen to know this from metadata.py... - snippet_length = 1024 + snippet_length = FILE_SNIPPET_SIZE for case in cases: file_path = os.path.join(tmp_path, "foo.txt") file_content = make_random_string(case) - with open(file_path, "w") as output_file: + with open(file_path, "w", encoding="utf-8") as output_file: output_file.write(file_content) snippet = _file_read_from_head(file_path) assert snippet == file_content[:snippet_length] @@ -292,11 +285,11 @@ def test_read_from_tail_happy(tmp_path: Path): """ cases = [1, 10, 100, 1000, 10000, 100000] # we just happen to know this from metadata.py... - snippet_length = 1024 + snippet_length = FILE_SNIPPET_SIZE for case in cases: file_path = os.path.join(tmp_path, "foo.txt") file_content = make_random_string(case) - with open(file_path, "w") as output_file: + with open(file_path, "w", encoding="utf-8") as output_file: output_file.write(file_content) snippet = _file_read_from_tail(file_path) assert snippet == file_content[-snippet_length:] From dc30414e40a56a8a08e26d87d0c53a942414b64d Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Wed, 14 Jun 2023 17:57:29 -0700 Subject: [PATCH 13/33] more changes to make codacy and me happy, more to come [PTV-1882] --- staging_service/app.py | 194 +++++++++++++++------------ staging_service/main.py | 12 +- staging_service/metadata.py | 10 +- staging_service/utils.py | 3 +- tests/test_app.py | 29 ++-- tests/test_autodetect.py | 258 +++++++++++++++++++++--------------- tests/test_metadata.py | 24 ++-- 7 files changed, 290 insertions(+), 240 deletions(-) diff --git a/staging_service/app.py b/staging_service/app.py index c7c5920b..232172d3 100644 --- a/staging_service/app.py +++ b/staging_service/app.py @@ -103,13 +103,12 @@ def _file_type_resolver(path: Path) -> FileTypeResolution: if ftype in _IMPSPEC_FILE_TO_PARSER: return FileTypeResolution(parser=_IMPSPEC_FILE_TO_PARSER[ftype]) else: - ext = ( - fi["suffix"] - if fi["suffix"] - else path.suffix[1:] - if path.suffix - else path.name - ) + if fi["suffix"]: + ext = fi["suffix"] + elif path.suffix: + ext = path.suffix[1:] + else: + ext = path.name return FileTypeResolution(unsupported_type=ext) @@ -434,11 +433,7 @@ def _validate_filename(filename: str): Raises http-response appropriate errors if the filename is invalid. """ - # - # TODO: Are these really all the rules for filenames? Or are they just specific to the staging - # area? - # In any case, we should capture this in a separate function - ensure_valid_filename - # + if filename.lstrip() != filename: raise web.HTTPForbidden( # forbidden isn't really the right code, should be 400 text="cannot upload file with name beginning with space" @@ -457,101 +452,122 @@ def _validate_filename(filename: str): ) -async def _handle_upload_save(stream: MultipartReader, destination_path: StagingPath): - """ - Handles the file upload stream from the /upload endpoint, saving the stream, ultimately, - to the given destination path. 
+async def _handle_upload_save_to_destination( + stream: MultipartReader, + destination_path: StagingPath, + chunk_size: int, + max_file_size: int, +): + async with aiofiles.tempfile.NamedTemporaryFile( + "wb", + delete=False, + dir=os.path.dirname(destination_path.full_path), + prefix=".upload.", + ) as output_file: + temp_file_name = output_file.name + actual_size = 0 + while True: + if actual_size > max_file_size: + raise web.HTTPBadRequest( + text=( + f"file size reached {actual_size:,} bytes which exceeds " + + f"the maximum allowed of {max_file_size:,} bytes)" + ) + ) - It honors a "save strategy", which is one of a set of defined methods for handling the upload. - See the function get_save_strategy() for details. + # A chunk size of 1MB seems to be a bit faster than the default of 8Kb. + chunk = await stream.read_chunk(size=chunk_size) - It also honors the "chunk size", which is used for reading up to the chunk size from the stream - before saving. This constrains memory commitment, especially important for handling large - files and concurrent file upload. - """ - save_strategy = get_save_strategy() - chunk_size = get_read_chunk_size() - max_file_size = get_max_file_size() - if save_strategy == UPLOAD_SAVE_STRATEGY_TEMP_THEN_COPY: - async with aiofiles.tempfile.NamedTemporaryFile( - "wb", delete=True - ) as output_file: - # temp_file_name = output_file.name - actual_size = 0 - while True: - if actual_size > max_file_size: - raise web.HTTPBadRequest( - text=( - f"file size reached {actual_size} bytes which exceeds " - + f"the maximum allowed of {max_file_size} bytes" - ) - ) + # Chunk is always a "bytes" object, but will be empty if there is nothing else + # to read, which is considered Falsy. + if not chunk: + break - chunk = await stream.read_chunk(size=chunk_size) + actual_size += len(chunk) - # Chunk is always a "bytes" object, but will be empty if there is nothing else - # to read, which is considered Falsy. - if not chunk: - break + await output_file.write( + chunk, + ) - actual_size += len(chunk) + shutil.move(temp_file_name, destination_path.full_path) + + +async def _handle_upload_save_to_temp( + stream: MultipartReader, + destination_path: StagingPath, + chunk_size: int, + max_file_size: int, +): + async with aiofiles.tempfile.NamedTemporaryFile("wb", delete=True) as output_file: + actual_size = 0 + while True: + if actual_size > max_file_size: + raise web.HTTPBadRequest( + text=( + f"file size reached {actual_size} bytes which exceeds " + + f"the maximum allowed of {max_file_size} bytes" + ) + ) - await output_file.write(chunk) + chunk = await stream.read_chunk(size=chunk_size) - async with aiofiles.tempfile.NamedTemporaryFile( - "rb", - delete=False, - dir=os.path.dirname(destination_path.full_path), - prefix=".upload.", - ) as copied_file: - shutil.copyfile(output_file.name, copied_file.name) + # Chunk is always a "bytes" object, but will be empty if there is nothing else + # to read, which is considered Falsy. 
+ if not chunk: + break - shutil.move(copied_file.name, destination_path.full_path) + actual_size += len(chunk) + + await output_file.write(chunk) - elif save_strategy == UPLOAD_SAVE_STRATEGY_SAVE_TO_DESTINATION: async with aiofiles.tempfile.NamedTemporaryFile( - "wb", + "rb", delete=False, dir=os.path.dirname(destination_path.full_path), prefix=".upload.", - ) as output_file: - temp_file_name = output_file.name - actual_size = 0 - while True: - if actual_size > max_file_size: - raise web.HTTPBadRequest( - text=( - f"file size reached {actual_size:,} bytes which exceeds " - + f"the maximum allowed of {max_file_size:,} bytes)" - ) - ) + ) as copied_file: + shutil.copyfile(output_file.name, copied_file.name) - # A chunk size of 1MB seems to be a bit faster than the default of 8Kb. - chunk = await stream.read_chunk(size=chunk_size) + shutil.move(copied_file.name, destination_path.full_path) - # Chunk is always a "bytes" object, but will be empty if there is nothing else - # to read, which is considered Falsy. - if not chunk: - break - actual_size += len(chunk) +async def _handle_upload_save(stream: MultipartReader, destination_path: StagingPath): + """ + Handles the file upload stream from the /upload endpoint, saving the stream, ultimately, + to the given destination path. - await output_file.write( - chunk, - ) + It honors a "save strategy", which is one of a set of defined methods for handling the upload. + See the function get_save_strategy() for details. - shutil.move(temp_file_name, destination_path.full_path) + It also honors the "chunk size", which is used for reading up to the chunk size from the stream + before saving. This constrains memory commitment, especially important for handling large + files and concurrent file upload. + """ + chunk_size = get_read_chunk_size() + max_file_size = get_max_file_size() + save_strategy = get_save_strategy() + if save_strategy == UPLOAD_SAVE_STRATEGY_TEMP_THEN_COPY: + await _handle_upload_save_to_temp( + stream, destination_path, chunk_size, max_file_size + ) + elif save_strategy == UPLOAD_SAVE_STRATEGY_SAVE_TO_DESTINATION: + await _handle_upload_save_to_destination( + stream, destination_path, chunk_size, max_file_size + ) @routes.post("/upload") async def upload_files_chunked(request: web.Request): """ - uploads a file into the staging area + Uploads a file into the staging area. + + Ensures the current auth token is valid, and if so, returns the associated username. + If not, will raise an exception + + Also, ensures that if there is a Globus account id available in the user's staging + directory, it is valid. """ - # Ensures the current auth token is valid, and if so, returns the associated username. - # If not, will raise an exception - # Also, ensures that if there is a Globus account id available in the user's staging - # directory, it is valid. + username = await authorize_request(request) if not request.can_read_body: @@ -614,7 +630,6 @@ async def upload_files_chunked(request: web.Request): # allowing us to ditch XHR. await _handle_upload_save(user_file_part, path) - # TODO: is this really necessary? Would a write failure possibly get here? if not os.path.exists(path.full_path): error_msg = "We are sorry but upload was interrupted. Please try again." 
raise web.HTTPNotFound(text=error_msg) @@ -638,10 +653,8 @@ async def define_UPA(request: web.Request): username = await authorize_request(request) path = StagingPath.validate_path(username, request.match_info["path"]) if not os.path.exists(path.full_path or not os.path.isfile(path.full_path)): - # TODO the security model here is to not care if someone wants to put in a false upa - raise web.HTTPNotFound( - text="no file found found on path {}".format(path.user_path) - ) + # The security model here is to not care if someone wants to put in a false upa + raise web.HTTPNotFound(text=f"no file found found on path {path.user_path}") if not request.can_read_body: raise web.HTTPBadRequest(text="must provide 'UPA' field in body") body = await request.post() @@ -837,6 +850,11 @@ def inject_config_dependencies(config): def app_factory(): + """ + Creates an aiohttp web application, with routes and configuration, and returns it. + + Used by the gunicorn server to spawn new workers. + """ app = web.Application(middlewares=[web.normalize_path_middleware()]) app.router.add_routes(routes) diff --git a/staging_service/main.py b/staging_service/main.py index db26e107..7c386622 100644 --- a/staging_service/main.py +++ b/staging_service/main.py @@ -1,19 +1,15 @@ import asyncio -# import configparser -# import os import uvloop from staging_service.app import app_factory -# from aiohttp import web - asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) # for speed of event loop -# config = configparser.ConfigParser() -# config.read(os.environ["KB_DEPLOYMENT_CONFIG"]) -# web.run_app(app_factory(config), port=5000) - async def web_app(): + """ + The gunicorn web app factory function, which in turn just calls the + aiohttp web server factory. + """ return app_factory() diff --git a/staging_service/metadata.py b/staging_service/metadata.py index 3a233c54..e2486a42 100644 --- a/staging_service/metadata.py +++ b/staging_service/metadata.py @@ -50,8 +50,7 @@ def _file_read_from_tail(file_path): try: return file.read() except UnicodeError: - # TODO: should just return None, and let the - # ui determine the message. + # Note that this string propagates all the way to the ui. return "not a text file" @@ -69,8 +68,7 @@ def _file_read_from_head(file_path): try: return file.read(FILE_SNIPPET_SIZE) except UnicodeError: - # TODO: should just return None, and let the - # ui determine the message. + # Note that this string propagates all the way to the ui. 
return "not a text file" @@ -382,8 +380,6 @@ async def some_metadata(path: StagingPath, desired_fields=False, source=None): try: result[key] = data[key] except KeyError as no_data: - raise web.HTTPBadRequest( - text="no data exists for key {key}".format(key=no_data.args) - ) # TODO check this exception message + raise web.HTTPBadRequest(text=f"no data exists for key {no_data.args}") return result diff --git a/staging_service/utils.py b/staging_service/utils.py index da303681..99ce0e90 100644 --- a/staging_service/utils.py +++ b/staging_service/utils.py @@ -4,16 +4,17 @@ import logging import os -from staging_service.auth2Client import KBaseAuth2 import globus_sdk from aiohttp.web import HTTPInternalServerError, HTTPOk +from staging_service.auth2Client import KBaseAuth2 from staging_service.config import get_config def auth_client(): return KBaseAuth2(get_config()["staging_service"]["AUTH_URL"]) + async def run_command(*args): """Run command in subprocess Example from: diff --git a/tests/test_app.py b/tests/test_app.py index 81799eb1..46e4acf6 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -23,13 +23,8 @@ import staging_service.globus as globus import staging_service.utils as utils from staging_service.AutoDetectUtils import AutoDetectUtils -from tests.test_helpers import ( - DATA_DIR, - META_DIR, - FileUtil, - check_excel_contents, - check_file_contents, -) +from tests.test_helpers import (DATA_DIR, META_DIR, FileUtil, + check_excel_contents, check_file_contents) if os.environ.get("KB_DEPLOYMENT_CONFIG") is None: from tests.test_helpers import bootstrap @@ -1104,7 +1099,7 @@ async def test_bulk_specification_success(get_user): fu.make_dir(f"{username}/somefolder") base = Path(fu.base_dir) / username tsv = "genomes.tsv" - with open(base / tsv, "w") as f: + with open(base / tsv, "w", encoding="utf-8") as f: f.writelines( [ "Data type: genomes; Columns: 3; Version: 1\n", @@ -1115,7 +1110,7 @@ async def test_bulk_specification_success(get_user): ] ) csv = "somefolder/breakfastcereals.csv" - with open(base / csv, "w") as f: + with open(base / csv, "w", encoding="utf-8") as f: f.writelines( [ "Data type: breakfastcereals; Columns: 3; Version: 1\n", @@ -1207,7 +1202,7 @@ async def test_bulk_specification_fail_not_found(get_user): fu.make_dir(f"{username}/otherfolder") base = Path(fu.base_dir) / username tsv = "otherfolder/genomes.tsv" - with open(base / tsv, "w") as f: + with open(base / tsv, "w", encoding="utf-8") as f: f.writelines( [ "Data type: genomes; Columns: 3; Version: 1\n", @@ -1241,7 +1236,7 @@ async def test_bulk_specification_fail_parse_fail(get_user): base = Path(fu.base_dir) / username tsv = "otherfolder/genomes.tsv" # this one is fine - with open(base / tsv, "w") as f: + with open(base / tsv, "w", encoding="utf-8") as f: f.writelines( [ "Data type: genomes; Columns: 3; Version: 1\n", @@ -1252,7 +1247,7 @@ async def test_bulk_specification_fail_parse_fail(get_user): ) csv = "otherfolder/thing.csv" # this one has a misspelling in the header - with open(base / csv, "w") as f: + with open(base / csv, "w", encoding="utf-8") as f: f.writelines( [ "Dater type: breakfastcereals; Columns: 3; Version: 1\n", @@ -1352,7 +1347,7 @@ async def test_bulk_specification_fail_column_count(get_user): base = Path(fu.base_dir) / username tsv = "genomes.tsv" # this one is fine - with open(base / tsv, "w") as f: + with open(base / tsv, "w", encoding="utf-8") as f: f.writelines( [ "Data type: genomes; Columns: 3; Version: 1\n", @@ -1363,7 +1358,7 @@ async def 
test_bulk_specification_fail_column_count(get_user): ) csv = "thing.csv" # this one is missing a column in the last row - with open(base / csv, "w") as f: + with open(base / csv, "w", encoding="utf-8") as f: f.writelines( [ "Data type: breakfastcereals; Columns: 3; Version: 1\n", @@ -1427,7 +1422,7 @@ async def test_bulk_specification_fail_multiple_specs_per_type(get_user): base = Path(fu.base_dir) / username tsv = "genomes.tsv" # this one is fine - with open(base / tsv, "w") as f: + with open(base / tsv, "w", encoding="utf-8") as f: f.writelines( [ "Data type: genomes; Columns: 3; Version: 1\n", @@ -1438,7 +1433,7 @@ async def test_bulk_specification_fail_multiple_specs_per_type(get_user): ) csv1 = "thing.csv" # this is the first of the breakfastcereals data sources, so fine - with open(base / csv1, "w") as f: + with open(base / csv1, "w", encoding="utf-8") as f: f.writelines( [ "Data type: breakfastcereals; Columns: 3; Version: 1\n", @@ -1449,7 +1444,7 @@ async def test_bulk_specification_fail_multiple_specs_per_type(get_user): ) csv2 = "thing2.csv" # this data type is also breakfastcereals, so will cause an error - with open(base / csv2, "w") as f: + with open(base / csv2, "w", encoding="utf-8") as f: f.writelines( [ "Data type: breakfastcereals; Columns: 2; Version: 1\n", diff --git a/tests/test_autodetect.py b/tests/test_autodetect.py index 4e4fe7f9..c41f82e1 100644 --- a/tests/test_autodetect.py +++ b/tests/test_autodetect.py @@ -1,10 +1,11 @@ import pytest + +from staging_service.app import inject_config_dependencies from staging_service.autodetect.GenerateMappings import ( - file_format_to_extension_mapping, extensions_mapping, + file_format_to_extension_mapping, ) from staging_service.AutoDetectUtils import AutoDetectUtils -from staging_service.app import inject_config_dependencies from tests.test_helpers import bootstrap_config @@ -36,11 +37,7 @@ def test_bad_filenames(): filename=filename ) assert possible_importers is None - assert fileinfo == { - "prefix": filename, - "suffix": None, - "file_ext_type": [] - } + assert fileinfo == {"prefix": filename, "suffix": None, "file_ext_type": []} def test_reasonable_filenames(): @@ -48,16 +45,17 @@ def test_reasonable_filenames(): Test variants "reasonable" filenames that are in the mappings Note: Some apps and mappings are not yet supported and are commented out """ - good_filenames = [] for heading in file_format_to_extension_mapping.keys(): extensions = file_format_to_extension_mapping[heading] for extension in extensions: - good_filenames.append(( - f"{heading}.{extension}", - heading.count("."), - extensions_mapping[extension]['file_ext_type'] - )) + good_filenames.append( + ( + f"{heading}.{extension}", + heading.count("."), + extensions_mapping[extension]["file_ext_type"], + ) + ) for filename, heading_dotcount, ext in good_filenames: for filename_variant in [ @@ -71,12 +69,16 @@ def test_reasonable_filenames(): ) assert possible_importers is not None expected_suffix = filename_variant.split(".", heading_dotcount + 1)[-1] - assert possible_importers == AutoDetectUtils._MAPPINGS["types"][ - expected_suffix.lower()]["mappings"], filename_variant + assert ( + possible_importers + == AutoDetectUtils._MAPPINGS["types"][expected_suffix.lower()][ + "mappings" + ] + ), filename_variant assert fileinfo == { - "prefix": filename_variant[:-len(expected_suffix) - 1], + "prefix": filename_variant[: -len(expected_suffix) - 1], "suffix": expected_suffix, - "file_ext_type": ext + "file_ext_type": ext, } @@ -85,72 +87,95 @@ def 
test_specific_filenames(): Test some made up filenames to check that multi-dot extensions are handled correctly """ test_data = [ - ("filename", (None, {"prefix": "filename", "suffix": None, "file_ext_type": []})), - ("file.name", (None, {"prefix": "file.name", "suffix": None, "file_ext_type": []})), - ("fil.en.ame", (None, {"prefix": "fil.en.ame", "suffix": None, "file_ext_type": []})), - ("file.gZ", ( - [{ - 'app_weight': 1, - 'id': 'decompress', - 'title': 'Decompress/Unpack', - }], - {"prefix": "file" , "suffix": "gZ", "file_ext_type": ['CompressedFileFormatArchive']}, - ) - ), - ("file.name.gZ", ( - [{ - 'app_weight': 1, - 'id': 'decompress', - 'title': 'Decompress/Unpack', - }], - {"prefix": "file.name", - "suffix": "gZ", - "file_ext_type": ['CompressedFileFormatArchive'] - }, - ) - ), - ("oscar_the_grouch_does_meth.FaStA.gz", ( - [ - {'app_weight': 1, - 'id': 'assembly', - 'title': 'Assembly', + ( + "filename", + (None, {"prefix": "filename", "suffix": None, "file_ext_type": []}), + ), + ( + "file.name", + (None, {"prefix": "file.name", "suffix": None, "file_ext_type": []}), + ), + ( + "fil.en.ame", + (None, {"prefix": "fil.en.ame", "suffix": None, "file_ext_type": []}), + ), + ( + "file.gZ", + ( + [ + { + "app_weight": 1, + "id": "decompress", + "title": "Decompress/Unpack", + } + ], + { + "prefix": "file", + "suffix": "gZ", + "file_ext_type": ["CompressedFileFormatArchive"], }, - {'app_weight': 1, - 'id': 'gff_genome', - 'title': 'GFF/FASTA Genome', + ), + ), + ( + "file.name.gZ", + ( + [ + { + "app_weight": 1, + "id": "decompress", + "title": "Decompress/Unpack", + } + ], + { + "prefix": "file.name", + "suffix": "gZ", + "file_ext_type": ["CompressedFileFormatArchive"], }, - {'app_weight': 1, - 'id': 'gff_metagenome', - 'title': 'GFF/FASTA MetaGenome', - } - ], - {"prefix": "oscar_the_grouch_does_meth", - "suffix": "FaStA.gz", - "file_ext_type": ["FASTA"] - }, - ) - ), - ("look.at.all.these.frigging.dots.gff2.gzip", ( - [ - {'app_weight': 1, - 'id': 'gff_genome', - 'title': 'GFF/FASTA Genome', - }, - {'app_weight': 1, - 'id': 'gff_metagenome', - 'title': 'GFF/FASTA MetaGenome', - } - ], - {"prefix": "look.at.all.these.frigging.dots", - "suffix": "gff2.gzip", - "file_ext_type": ["GFF"] - }, - ) - ) + ), + ), + ( + "oscar_the_grouch_does_meth.FaStA.gz", + ( + [ + {"app_weight": 1, "id": "assembly", "title": "Assembly"}, + {"app_weight": 1, "id": "gff_genome", "title": "GFF/FASTA Genome"}, + { + "app_weight": 1, + "id": "gff_metagenome", + "title": "GFF/FASTA MetaGenome", + }, + ], + { + "prefix": "oscar_the_grouch_does_meth", + "suffix": "FaStA.gz", + "file_ext_type": ["FASTA"], + }, + ), + ), + ( + "look.at.all.these.frigging.dots.gff2.gzip", + ( + [ + {"app_weight": 1, "id": "gff_genome", "title": "GFF/FASTA Genome"}, + { + "app_weight": 1, + "id": "gff_metagenome", + "title": "GFF/FASTA MetaGenome", + }, + ], + { + "prefix": "look.at.all.these.frigging.dots", + "suffix": "gff2.gzip", + "file_ext_type": ["GFF"], + }, + ), + ), ] for filename, importers in test_data: - assert AutoDetectUtils.determine_possible_importers(filename) == importers, filename + assert ( + AutoDetectUtils.determine_possible_importers(filename) == importers + ), filename def test_sra_mappings(): @@ -160,12 +185,15 @@ def test_sra_mappings(): """ sra_file = "test.sra" possible_importers, fileinfo = AutoDetectUtils.determine_possible_importers( - filename=sra_file) - assert possible_importers == [{ - 'id': 'sra_reads', - 'app_weight': 1, - 'title': 'SRA Reads', - }] + filename=sra_file + ) + assert 
possible_importers == [ + { + "id": "sra_reads", + "app_weight": 1, + "title": "SRA Reads", + } + ] assert fileinfo == {"prefix": "test", "suffix": "sra", "file_ext_type": ["SRA"]} @@ -176,16 +204,19 @@ def test_zip_mappings(): """ gz_file = "test.tar.gz" possible_importers, fileinfo = AutoDetectUtils.determine_possible_importers( - filename=gz_file) - assert possible_importers == [{ - 'id': 'decompress', - 'app_weight': 1, - 'title': 'Decompress/Unpack', - }] + filename=gz_file + ) + assert possible_importers == [ + { + "id": "decompress", + "app_weight": 1, + "title": "Decompress/Unpack", + } + ] assert fileinfo == { "prefix": "test", "suffix": "tar.gz", - 'file_ext_type': ['CompressedFileFormatArchive'] + "file_ext_type": ["CompressedFileFormatArchive"], } @@ -194,31 +225,38 @@ def test_get_mappings(): Basic test of the get mappings logic. Most of the logic is in determine_possible_importers which is throughly tested above. """ - assert AutoDetectUtils.get_mappings(["filename", "file.name.Gz", "some.dots.gff3.gz"]) == { + assert AutoDetectUtils.get_mappings( + ["filename", "file.name.Gz", "some.dots.gff3.gz"] + ) == { "mappings": [ None, - [{ - 'app_weight': 1, - 'id': 'decompress', - 'title': 'Decompress/Unpack', - }], [ - {'app_weight': 1, - 'id': 'gff_genome', - 'title': 'GFF/FASTA Genome', - }, - {'app_weight': 1, - 'id': 'gff_metagenome', - 'title': 'GFF/FASTA MetaGenome', + { + "app_weight": 1, + "id": "decompress", + "title": "Decompress/Unpack", } ], + [ + { + "app_weight": 1, + "id": "gff_genome", + "title": "GFF/FASTA Genome", + }, + { + "app_weight": 1, + "id": "gff_metagenome", + "title": "GFF/FASTA MetaGenome", + }, + ], ], "fileinfo": [ {"prefix": "filename", "suffix": None, "file_ext_type": []}, - {"prefix": "file.name", - "suffix": "Gz", - "file_ext_type": ['CompressedFileFormatArchive'] - }, - {"prefix": "some.dots", "suffix": "gff3.gz", "file_ext_type": ['GFF']}, - ] + { + "prefix": "file.name", + "suffix": "Gz", + "file_ext_type": ["CompressedFileFormatArchive"], + }, + {"prefix": "some.dots", "suffix": "gff3.gz", "file_ext_type": ["GFF"]}, + ], } diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 74a32a6f..be22d518 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -23,8 +23,13 @@ from tests.test_helpers import FileUtil -@fixture(scope="function") -def temp_dir() -> Generator[Path, None, None]: +@fixture(scope="function", name="temp_dir") +def temp_dir_fixture() -> Generator[Path, None, None]: + """ + A fixture to generate a unique subdirectory within the DATA_DIR + directory set in the test configuration (which defaults to the "data" + directory in the root of the repo) + """ with FileUtil() as fu: childdir = Path(fu.make_dir(str(uuid.uuid4()))).resolve() @@ -57,9 +62,10 @@ async def _incomplete_metadata_file_update(temp_dir, metadict, source): "super_fake_jgi_path", ) - with open(target.full_path, "w") as p: + with open(target.full_path, "w", encoding="utf-8") as p: p.writelines(make_test_lines(1, 6)) - with open(target.metadata_path, "w") as p: + + with open(target.metadata_path, "w", encoding="utf-8") as p: p.write(json.dumps(metadict)) res = await some_metadata(target) @@ -138,7 +144,7 @@ async def test_determine_source_jgi_metadata_source_exists(temp_dir: Path): # In this test, the actual data doesn't matter, just the # fact that a file exists. 
- with open(staging_path.jgi_metadata, "w") as data_file: + with open(staging_path.jgi_metadata, "w", encoding="utf-8") as data_file: data_file.write("foo") assert _determine_source(staging_path) == "JGI import" @@ -177,7 +183,7 @@ def test_determine_source_jgi_metadata_source_exists_canonical(temp_dir: Path): file_path = "foo.bar" filename_base = os.path.basename(file_path) staging_path = _create_validated_staging_path(temp_dir, file_path) - with open(staging_path.jgi_metadata, "w") as data_file: + with open(staging_path.jgi_metadata, "w", encoding="utf-8") as data_file: data_file.write(f".{staging_path.full_path}.{filename_base}jgi") assert _determine_source(staging_path) == "JGI import" @@ -190,7 +196,7 @@ def test_determine_source_jgi_metadata_source_exists_canonical_subdir(temp_dir: file_path = "some/foo.bar" filename_base = os.path.basename(file_path) staging_path = _create_validated_staging_path(temp_dir, file_path) - with open(staging_path.jgi_metadata, "w") as data_file: + with open(staging_path.jgi_metadata, "w", encoding="utf-8") as data_file: data_file.write(f".{staging_path.full_path}.{filename_base}jgi") assert _determine_source(staging_path) == "JGI import" @@ -205,7 +211,7 @@ def test_determine_source_jgi_metadata_source_exists_canonical_deepsubdir( file_path = "some/deep/dark/scary/foo.bar" filename_base = os.path.basename(file_path) staging_path = _create_validated_staging_path(temp_dir, file_path) - with open(staging_path.jgi_metadata, "w") as data_file: + with open(staging_path.jgi_metadata, "w", encoding="utf-8") as data_file: data_file.write(f".{staging_path.full_path}.{filename_base}jgi") assert _determine_source(staging_path) == "JGI import" @@ -338,7 +344,7 @@ async def test_invalid_desired_fields(temp_dir: Path): "", ) - with open(staging_path.full_path, "w") as p: + with open(staging_path.full_path, "w", encoding="utf-8") as p: p.write("foo") with raises(web.HTTPBadRequest) as ex: From 66def97f044f2e71440c03c8c03aa85ae9e13a90 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Thu, 15 Jun 2023 07:28:53 -0700 Subject: [PATCH 14/33] more refactoring to fix usage errors and warnings [PTV-1882] --- staging_service/AutoDetectUtils.py | 71 ++++++++++++++----- staging_service/app.py | 14 ++-- .../test_file_writers.py | 14 ++-- tests/test_app.py | 41 +++++------ tests/test_autodetect.py | 4 +- tests/test_helpers.py | 16 +++-- 6 files changed, 98 insertions(+), 62 deletions(-) diff --git a/staging_service/AutoDetectUtils.py b/staging_service/AutoDetectUtils.py index 97766b72..4b398120 100644 --- a/staging_service/AutoDetectUtils.py +++ b/staging_service/AutoDetectUtils.py @@ -2,14 +2,44 @@ This class is in charge of determining possible importers by determining the suffix of the filepath pulled in, and by looking up the appropriate mappings in the supported_apps_w_extensions.json file """ -from typing import Optional, Tuple, Dict +from typing import Any, Dict, Optional, Tuple + +Mappings = dict[str, Any] + +MappingsConfig = dict[str, Mappings] class AutoDetectUtils: - _MAPPINGS = None # expects to be set by config + _MAPPINGS: None | dict[str, Any] = None # expects to be set by config + + @classmethod + def set_mappings(cls, mappings_config: MappingsConfig): + cls._MAPPINGS = mappings_config["types"] + + @classmethod + def has_mappings(cls) -> bool: + if cls._MAPPINGS is not None: + return True + return False + + # @classmethod + # def get_mappings(cls) -> dict[str, Any]: + # return cls._MAPPINGS - @staticmethod - def determine_possible_importers(filename: str) -> 
Tuple[Optional[list], Dict[str, object]]: + @classmethod + def get_mappings_by_extension(cls, extension: str): + if cls._MAPPINGS is not None: + return cls._MAPPINGS.get(extension) + + @classmethod + def get_extension_mappings(cls): + return cls._MAPPINGS + + @classmethod + def determine_possible_importers( + cls, + filename: str, + ) -> Tuple[Optional[list], Dict[str, object]]: """ Given a filename, come up with a reference to all possible apps. :param filename: The filename to find applicable apps for @@ -18,31 +48,35 @@ def determine_possible_importers(filename: str) -> Tuple[Optional[list], Dict[st The fileinfo dict, containing: the file prefix the file suffix, if a suffix matched a mapping - the file types, if a suffix matched a mapping, otherwise an empty list + the file types, if a suffix matched a mapping, otherwise an empty list """ dotcount = filename.count(".") - if dotcount: + if dotcount and cls._MAPPINGS is not None: # preferentially choose the most specific suffix (e.g. longest) # to get file type mappings - m = AutoDetectUtils._MAPPINGS + for i in range(1, dotcount + 1): parts = filename.split(".", i) suffix = parts[-1].lower() - if suffix in m["types"]: + # pylint: disable=unsubscriptable-object + # added because it complains about mappings below. + if suffix in cls._MAPPINGS: # pylint: disable=E1135 prefix = ".".join(parts[0:i]) return ( - m["types"][suffix]["mappings"], - {"prefix": prefix, - "suffix": parts[-1], - "file_ext_type": m["types"][suffix]["file_ext_type"], - } + cls._MAPPINGS[suffix]["mappings"], + { + "prefix": prefix, + "suffix": parts[-1], + "file_ext_type": cls._MAPPINGS[suffix]["file_ext_type"], + }, ) return None, {"prefix": filename, "suffix": None, "file_ext_type": []} - @staticmethod - def get_mappings(file_list: list) -> dict: + @classmethod + def get_mappings(cls, file_list: list) -> dict: """ Given a list of files, get their mappings if they exist + :param file_list: A list of files :return: return a listing of apps, a listing of extension_mappings for each filename, and information about each file, currently the file prefix and the suffix used to @@ -51,11 +85,10 @@ def get_mappings(file_list: list) -> dict: mappings = [] fileinfo = [] for filename in file_list: - typemaps, fi = AutoDetectUtils.determine_possible_importers(filename) + typemaps, file_importers = cls.determine_possible_importers(filename) mappings.append(typemaps) - fileinfo.append(fi) - rv = { + fileinfo.append(file_importers) + return { "mappings": mappings, "fileinfo": fileinfo, } - return rv diff --git a/staging_service/app.py b/staging_service/app.py index 232172d3..37336f6f 100644 --- a/staging_service/app.py +++ b/staging_service/app.py @@ -63,11 +63,13 @@ @routes.get("/importer_filetypes/") -async def importer_filetypes(request: web.Request) -> web.json_response: +async def importer_filetypes(_: web.Request) -> web.json_response: """ Returns the file types for the configured datatypes. The returned JSON contains two keys: + * datatype_to_filetype, which maps import datatypes (like gff_genome) to their accepted filetypes (like [FASTA, GFF]) + * filetype_to_extensions, which maps file types (e.g. FASTA) to their extensions (e.g. *.fa, *.fasta, *.fa.gz, etc.) 
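# A hedged sketch of the JSON shape described in the importer_filetypes docstring
# above; the concrete datatypes, file types, and extensions come from the configured
# FILE_EXTENSION_MAPPINGS file, so the values here are illustrative only.
example_importer_filetypes_response = {
    "datatype_to_filetype": {"gff_genome": ["FASTA", "GFF"]},
    "filetype_to_extensions": {"FASTA": ["fa", "fasta", "fa.gz"]},
}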
@@ -350,7 +352,7 @@ async def download_files(request: web.Request): @routes.get("/similar/{path:.+}") -async def similar_files(request: web.Request): +async def get_similar_files(request: web.Request): """ lists similar file path for given file """ @@ -832,11 +834,13 @@ def inject_config_dependencies(config): if FILE_EXTENSION_MAPPINGS is None: raise Exception("Please provide FILE_EXTENSION_MAPPINGS in the config file ") - with open(FILE_EXTENSION_MAPPINGS) as f: - AutoDetectUtils._MAPPINGS = json.load(f) + with open( + FILE_EXTENSION_MAPPINGS, encoding="utf-8" + ) as file_extension_mappings_file: + AutoDetectUtils.set_mappings(json.load(file_extension_mappings_file)) datatypes = defaultdict(set) extensions = defaultdict(set) - for fileext, val in AutoDetectUtils._MAPPINGS["types"].items(): + for fileext, val in AutoDetectUtils.get_extension_mappings().items(): # if we start using the file ext type array for anything else this might need changes filetype = val["file_ext_type"][0] extensions[filetype].add(fileext) diff --git a/tests/import_specifications/test_file_writers.py b/tests/import_specifications/test_file_writers.py index 1bb1367b..b8c9e147 100644 --- a/tests/import_specifications/test_file_writers.py +++ b/tests/import_specifications/test_file_writers.py @@ -15,8 +15,8 @@ from tests.test_helpers import ( FileUtil, assert_exception_correct, + assert_file_contents, check_excel_contents, - check_file_contents, ) @@ -69,7 +69,7 @@ def test_write_csv(temp_dir: Path): "type2": "type2.csv", "type3": "type3.csv", } - check_file_contents( + assert_file_contents( temp_dir / "type1.csv", [ "Data type: type1; Columns: 3; Version: 1\n", @@ -79,7 +79,7 @@ def test_write_csv(temp_dir: Path): "boo!,,56.78\n", ], ) - check_file_contents( + assert_file_contents( temp_dir / "type2.csv", [ "Data type: type2; Columns: 1; Version: 1\n", @@ -89,7 +89,7 @@ def test_write_csv(temp_dir: Path): "0\n", ], ) - check_file_contents( + assert_file_contents( temp_dir / "type3.csv", [ "Data type: type3; Columns: 3; Version: 1\n", @@ -106,7 +106,7 @@ def test_write_tsv(temp_dir: Path): "type2": "type2.tsv", "type3": "type3.tsv", } - check_file_contents( + assert_file_contents( temp_dir / "type1.tsv", [ "Data type: type1; Columns: 3; Version: 1\n", @@ -116,7 +116,7 @@ def test_write_tsv(temp_dir: Path): "boo!\t\t56.78\n", ], ) - check_file_contents( + assert_file_contents( temp_dir / "type2.tsv", [ "Data type: type2; Columns: 1; Version: 1\n", @@ -126,7 +126,7 @@ def test_write_tsv(temp_dir: Path): "0\n", ], ) - check_file_contents( + assert_file_contents( temp_dir / "type3.tsv", [ "Data type: type3; Columns: 3; Version: 1\n", diff --git a/tests/test_app.py b/tests/test_app.py index 46e4acf6..95f1f441 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -24,11 +24,10 @@ import staging_service.utils as utils from staging_service.AutoDetectUtils import AutoDetectUtils from tests.test_helpers import (DATA_DIR, META_DIR, FileUtil, - check_excel_contents, check_file_contents) + assert_file_contents, bootstrap, + check_excel_contents) if os.environ.get("KB_DEPLOYMENT_CONFIG") is None: - from tests.test_helpers import bootstrap - bootstrap() decoder = JSONDecoder() @@ -39,7 +38,7 @@ def asyncgiven(**kwargs): - """alterantive to hypothesis.given decorator for async""" + """alternative to hypothesis.given decorator for async""" def real_decorator(fn): @given(**kwargs) @@ -54,16 +53,10 @@ def aio_wrapper(*args, **kwargs): return real_decorator -# TODO: replace with real unittest mocking; this approach forces a bad design 
for the auth client; it is -# literally used in only one place in the codebase! -def mock_auth_app(): +# TODO: replace with real unittest mocking +def mock_app(): application = app.app_factory() - # async def mock_auth(*args, **kwargs): - # return "testuser" - # - # auth_client().get_user = mock_auth - async def mock_globus_id(*args, **kwargs): return ["testuser@globusid.org"] @@ -73,14 +66,10 @@ async def mock_globus_id(*args, **kwargs): return application -async def mock_get_user(username): - return username - - class AppClient: - # TODO: the constructor params are unused ... probably can be removed ... try later. def __init__(self): - self.server = test_utils.TestServer(mock_auth_app()) + self.server = test_utils.TestServer(mock_app()) + self.client = None async def __aenter__(self): await self.server.start_server(loop=asyncio.get_event_loop()) @@ -89,7 +78,8 @@ async def __aenter__(self): async def __aexit__(self, *args): await self.server.close() - await self.client.close() + if self.client is not None: + await self.client.close() first_letter_alphabet = [c for c in string.ascii_lowercase + string.ascii_uppercase] @@ -1053,7 +1043,10 @@ async def test_importer_mappings(get_user): # Or we need to reload json file itself # unzip_mapping = AutoDetectUtils._MAPPINGS["apps"]["decompress/unpack"] - assert mappings[1][0] == AutoDetectUtils._MAPPINGS["types"]["gz"]["mappings"][0] + assert ( + mappings[1][0] + == AutoDetectUtils.get_mappings_by_extension("gz")["mappings"][0] + ) # A dict is passed in data = {"file_list": [{}]} @@ -1610,7 +1603,7 @@ async def test_write_bulk_specification_success_csv(get_user): }, } base = Path(fu.base_dir) / username - check_file_contents( + assert_file_contents( base / "specs/genome.csv", [ "Data type: genome; Columns: 2; Version: 1\n", @@ -1620,7 +1613,7 @@ async def test_write_bulk_specification_success_csv(get_user): "32,yikes\n", ], ) - check_file_contents( + assert_file_contents( base / "specs/reads.csv", [ "Data type: reads; Columns: 2; Version: 1\n", @@ -1660,7 +1653,7 @@ async def test_write_bulk_specification_success_tsv(get_user): }, } base = Path(fu.base_dir) / username - check_file_contents( + assert_file_contents( base / "tsvspecs/genome.tsv", [ "Data type: genome; Columns: 2; Version: 1\n", @@ -1670,7 +1663,7 @@ async def test_write_bulk_specification_success_tsv(get_user): "32\tyikes\n", ], ) - check_file_contents( + assert_file_contents( base / "tsvspecs/reads.tsv", [ "Data type: reads; Columns: 2; Version: 1\n", diff --git a/tests/test_autodetect.py b/tests/test_autodetect.py index c41f82e1..a5c2714d 100644 --- a/tests/test_autodetect.py +++ b/tests/test_autodetect.py @@ -21,7 +21,7 @@ def test_config(): #TODO Can test PATH injections as well :return: """ - mappings = AutoDetectUtils._MAPPINGS + mappings = AutoDetectUtils.has_mappings() assert mappings @@ -71,7 +71,7 @@ def test_reasonable_filenames(): expected_suffix = filename_variant.split(".", heading_dotcount + 1)[-1] assert ( possible_importers - == AutoDetectUtils._MAPPINGS["types"][expected_suffix.lower()][ + == AutoDetectUtils.get_mappings_by_extension(expected_suffix.lower())[ "mappings" ] ), filename_variant diff --git a/tests/test_helpers.py b/tests/test_helpers.py index e6728943..8f23525e 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -21,14 +21,20 @@ def bootstrap(): + """ + Attempts to load environment variables from a number of potentially + present files + + Not sure it actually does anything useful. 
+ """ test_env_0 = "../test.env" test_env_1 = "test.env" test_env_2 = "test/test.env" - for item in [test_env_0, test_env_1, test_env_2]: + for potential_env_file in [test_env_0, test_env_1, test_env_2]: try: - load_dotenv(item, verbose=True) - except Exception: + load_dotenv(potential_env_file, verbose=True) + except IOError: pass @@ -49,8 +55,8 @@ def assert_exception_correct(got: Exception, expected: Exception): assert type(got) == type(expected) -def check_file_contents(file: Path, lines: list[str]): - with open(file) as f: +def assert_file_contents(file: Path, lines: list[str]): + with open(file, "r", encoding="utf-8") as f: assert f.readlines() == lines From e6efb8068ac49426587109620ca5a4b2b8783569 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Thu, 15 Jun 2023 08:03:30 -0700 Subject: [PATCH 15/33] more code quality fixes [PTV-1882] I'll stop when the # of reported issues is low enough. Doesn't need to be perfect, just enough so that the # of issues to ignore is tractable to review. --- .../import_specifications/file_parser.py | 24 +-- .../individual_parsers.py | 202 +++++++++++------- staging_service/metadata.py | 5 +- .../import_specifications/test_file_parser.py | 150 +++++++++---- .../test_individual_parsers.py | 82 ++++--- 5 files changed, 293 insertions(+), 170 deletions(-) diff --git a/staging_service/import_specifications/file_parser.py b/staging_service/import_specifications/file_parser.py index 6b96a1af..d855af2c 100644 --- a/staging_service/import_specifications/file_parser.py +++ b/staging_service/import_specifications/file_parser.py @@ -7,13 +7,9 @@ from dataclasses import dataclass from enum import Enum from pathlib import Path -from typing import Optional as O -from typing import Union +from typing import Optional, Union -# TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26 -from frozendict.core import frozendict - -# TODO should get mypy working at some point +from frozendict import frozendict PRIMITIVE_TYPE = Union[str, int, float, bool, None] @@ -42,7 +38,7 @@ class SpecificationSource: """ file: Path - tab: O[str] = None + tab: Optional[str] = None def __post_init__(self): if not self.file: @@ -89,9 +85,9 @@ class Error: """ error: ErrorType - message: O[str] = None - source_1: O[SpecificationSource] = None - source_2: O[SpecificationSource] = None + message: Optional[str] = None + source_1: Optional[SpecificationSource] = None + source_2: Optional[SpecificationSource] = None def __post_init__(self): if not self.error: @@ -150,8 +146,8 @@ class ParseResults: parse_import_specifications method to create an instance of this class. """ - results: O[frozendict[str, ParseResult]] = None - errors: O[tuple[Error, ...]] = None + results: Optional[frozendict[str, ParseResult]] = None + errors: Optional[tuple[Error, ...]] = None def __post_init__(self): if not (bool(self.results) ^ bool(self.errors)): # xnor @@ -170,8 +166,8 @@ class FileTypeResolution: unsupported_type - the file type if the type is not a supported type. 
""" - parser: O[Callable[[Path], ParseResults]] = None - unsupported_type: O[str] = None + parser: Optional[Callable[[Path], ParseResults]] = None + unsupported_type: Optional[str] = None def __post_init__(self): if not (bool(self.parser) ^ bool(self.unsupported_type)): # xnor diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py index 61bf84b5..e714bd6c 100644 --- a/staging_service/import_specifications/individual_parsers.py +++ b/staging_service/import_specifications/individual_parsers.py @@ -3,23 +3,22 @@ """ import csv -import magic import math -import pandas import re - -# TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26 -from frozendict.core import frozendict from pathlib import Path -from typing import Optional as O, Union, Any +from typing import Any, Optional, Union + +import magic +import pandas +from frozendict import frozendict from staging_service.import_specifications.file_parser import ( PRIMITIVE_TYPE, - ErrorType, - SpecificationSource, Error, + ErrorType, ParseResult, ParseResults, + SpecificationSource, ) # this version defines the schema of the xsv and Excel files. @@ -31,10 +30,14 @@ _VERSION_STR = "Version:" _COLUMN_STR = "Columns:" _HEADER_SEP = ";" -_EXPECTED_HEADER = (f"{_DATA_TYPE} {_HEADER_SEP} " - + f"{_COLUMN_STR} {_HEADER_SEP} {_VERSION_STR} ") -_HEADER_REGEX = re.compile(f"{_DATA_TYPE} (\\w+){_HEADER_SEP} " - + f"{_COLUMN_STR} (\\d+){_HEADER_SEP} {_VERSION_STR} (\\d+)") +_EXPECTED_HEADER = ( + f"{_DATA_TYPE} {_HEADER_SEP} " + + f"{_COLUMN_STR} {_HEADER_SEP} {_VERSION_STR} " +) +_HEADER_REGEX = re.compile( + f"{_DATA_TYPE} (\\w+){_HEADER_SEP} " + + f"{_COLUMN_STR} (\\d+){_HEADER_SEP} {_VERSION_STR} (\\d+)" +) _MAGIC_TEXT_FILES = {"text/plain", "inode/x-empty", "application/csv", "text/csv"} @@ -60,24 +63,29 @@ class _ParseException(Exception): pass -def _parse_header(header: str, spec_source: SpecificationSource, maximum_version: int +def _parse_header( + header: str, spec_source: SpecificationSource, maximum_version: int ) -> tuple[str, int]: # return is (data type, column count) match = _HEADER_REGEX.fullmatch(header) if not match: - raise _ParseException(Error( - ErrorType.PARSE_FAIL, - f'Invalid header; got "{header}", expected "{_EXPECTED_HEADER}"', - spec_source - )) + raise _ParseException( + Error( + ErrorType.PARSE_FAIL, + f'Invalid header; got "{header}", expected "{_EXPECTED_HEADER}"', + spec_source, + ) + ) version = int(match[3]) if version > maximum_version: - raise _ParseException(Error( - ErrorType.PARSE_FAIL, - f"Schema version {version} is larger than maximum processable " - + f"version {maximum_version}", - spec_source - )) + raise _ParseException( + Error( + ErrorType.PARSE_FAIL, + f"Schema version {version} is larger than maximum processable " + + f"version {maximum_version}", + spec_source, + ) + ) return match[1], int(match[2]) @@ -86,23 +94,26 @@ def _csv_next( line_number: int, expected_line_count: Union[None, int], # None = skip columns check spec_source: SpecificationSource, - error: str + error: str, ) -> list[str]: try: line = next(input_) except StopIteration: raise _ParseException(Error(ErrorType.PARSE_FAIL, error, spec_source)) if expected_line_count and len(line) != expected_line_count: - raise _ParseException(Error( - ErrorType.INCORRECT_COLUMN_COUNT, - f"Incorrect number of items in line {line_number}, " - + f"expected {expected_line_count}, got {len(line)}", - spec_source)) + raise _ParseException( + 
Error( + ErrorType.INCORRECT_COLUMN_COUNT, + f"Incorrect number of items in line {line_number}, " + + f"expected {expected_line_count}, got {len(line)}", + spec_source, + ) + ) return line def _error(error: Error) -> ParseResults: - return ParseResults(errors = tuple([error])) + return ParseResults(errors=tuple([error])) def _normalize_pandas(val: PRIMITIVE_TYPE) -> PRIMITIVE_TYPE: @@ -141,18 +152,22 @@ def _normalize_headers( ret = [str(s).strip() if not pandas.isna(s) else None for s in headers] for i, name in enumerate(ret, start=1): if not name: - raise _ParseException(Error( - ErrorType.PARSE_FAIL, - f"Missing header entry in row {line_number}, position {i}", - spec_source - )) + raise _ParseException( + Error( + ErrorType.PARSE_FAIL, + f"Missing header entry in row {line_number}, position {i}", + spec_source, + ) + ) if name in seen: - raise _ParseException(Error( - ErrorType.PARSE_FAIL, - f"Duplicate header name in row {line_number}: {name}", - spec_source - )) + raise _ParseException( + Error( + ErrorType.PARSE_FAIL, + f"Duplicate header name in row {line_number}: {name}", + spec_source, + ) + ) seen.add(name) return ret @@ -162,8 +177,10 @@ def _parse_xsv(path: Path, sep: str) -> ParseResults: try: filetype = magic.from_file(str(path), mime=True) if filetype not in _MAGIC_TEXT_FILES: - return _error(Error(ErrorType.PARSE_FAIL, "Not a text file: " + filetype, spcsrc)) - with open(path, newline='') as input_: + return _error( + Error(ErrorType.PARSE_FAIL, "Not a text file: " + filetype, spcsrc) + ) + with open(path, newline="") as input_: rdr = csv.reader(input_, delimiter=sep) # let parser handle quoting dthd = _csv_next(rdr, 1, None, spcsrc, "Missing data type / version header") datatype, columns = _parse_header(dthd[0], spcsrc, _VERSION) @@ -176,34 +193,44 @@ def _parse_xsv(path: Path, sep: str) -> ParseResults: if len(row) != columns: # could collect errors (first 10?) and throw an exception with a list # lets wait and see if that's really needed - raise _ParseException(Error( - ErrorType.INCORRECT_COLUMN_COUNT, - f"Incorrect number of items in line {i}, " - + f"expected {columns}, got {len(row)}", - spcsrc)) - results.append(frozendict( - {param_ids[j]: _normalize_xsv(row[j]) for j in range(len(row))})) + raise _ParseException( + Error( + ErrorType.INCORRECT_COLUMN_COUNT, + f"Incorrect number of items in line {i}, " + + f"expected {columns}, got {len(row)}", + spcsrc, + ) + ) + results.append( + frozendict( + { + param_ids[j]: _normalize_xsv(row[j]) + for j in range(len(row)) + } + ) + ) if not results: - raise _ParseException(Error( - ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc)) - return ParseResults(frozendict( - {datatype: ParseResult(spcsrc, tuple(results))} - )) + raise _ParseException( + Error(ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc) + ) + return ParseResults(frozendict({datatype: ParseResult(spcsrc, tuple(results))})) except FileNotFoundError: return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc)) except IsADirectoryError: - return _error(Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc)) + return _error( + Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc) + ) except _ParseException as e: return _error(e.args[0]) def parse_csv(path: Path) -> ParseResults: - """ Parse the provided CSV file. """ + """Parse the provided CSV file.""" return _parse_xsv(path, ",") def parse_tsv(path: Path) -> ParseResults: - """ Parse the provided TSV file. 
""" + """Parse the provided TSV file.""" return _parse_xsv(path, "\t") @@ -214,17 +241,23 @@ def _process_excel_row( if pandas.isna(row[-1]): # inefficient, but premature optimization... row = row[0:-1] else: - raise _ParseException(Error( - ErrorType.INCORRECT_COLUMN_COUNT, - f"Incorrect number of items in line {rownum}, " - + f"expected {expected_columns}, got {len(row)}", - spcsrc)) + raise _ParseException( + Error( + ErrorType.INCORRECT_COLUMN_COUNT, + f"Incorrect number of items in line {rownum}, " + + f"expected {expected_columns}, got {len(row)}", + spcsrc, + ) + ) return row - -def _process_excel_tab(excel: pandas.ExcelFile, spcsrc: SpecificationSource -) -> (O[str], O[ParseResult]): - df = excel.parse(sheet_name=spcsrc.tab, na_values=_EXCEL_MISSING_VALUES, keep_default_na=False) + +def _process_excel_tab( + excel: pandas.ExcelFile, spcsrc: SpecificationSource +) -> (Optional[str], Optional[ParseResult]): + df = excel.parse( + sheet_name=spcsrc.tab, na_values=_EXCEL_MISSING_VALUES, keep_default_na=False + ) if df.shape[0] < 3: # might as well not error check headers in sheets with no data return (None, None) # at this point we know that at least 4 lines are present - expecting the data type header, @@ -239,8 +272,11 @@ def _process_excel_tab(excel: pandas.ExcelFile, spcsrc: SpecificationSource for i, row in enumerate(it, start=4): row = _process_excel_row(row, i, columns, spcsrc) if any(map(lambda x: not pandas.isna(x), row)): # skip empty rows - results.append(frozendict( - {param_ids[j]: _normalize_pandas(row[j]) for j in range(len(row))})) + results.append( + frozendict( + {param_ids[j]: _normalize_pandas(row[j]) for j in range(len(row))} + ) + ) return datatype, ParseResult(spcsrc, tuple(results)) @@ -262,12 +298,14 @@ def parse_excel(path: Path) -> ParseResults: if not datatype: continue elif datatype in results: - errors.append(Error( - ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, - f"Found datatype {datatype} in multiple tabs", - SpecificationSource(path, datatype_to_tab[datatype]), - spcsrc_tab, - )) + errors.append( + Error( + ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, + f"Found datatype {datatype} in multiple tabs", + SpecificationSource(path, datatype_to_tab[datatype]), + spcsrc_tab, + ) + ) else: datatype_to_tab[datatype] = tab results[datatype] = result @@ -276,11 +314,18 @@ def parse_excel(path: Path) -> ParseResults: except FileNotFoundError: return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc)) except IsADirectoryError: - return _error(Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc)) + return _error( + Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc) + ) except ValueError as e: if "Excel file format cannot be determined" in str(e): - return _error(Error( - ErrorType.PARSE_FAIL, "Not a supported Excel file type", source_1=spcsrc)) + return _error( + Error( + ErrorType.PARSE_FAIL, + "Not a supported Excel file type", + source_1=spcsrc, + ) + ) raise e # bail out, not sure what's wrong, not sure how to test either if errors: return ParseResults(errors=tuple(errors)) @@ -288,4 +333,3 @@ def parse_excel(path: Path) -> ParseResults: return ParseResults(frozendict(results)) else: return _error(Error(ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc)) - \ No newline at end of file diff --git a/staging_service/metadata.py b/staging_service/metadata.py index e2486a42..44418d3e 100644 --- a/staging_service/metadata.py +++ b/staging_service/metadata.py @@ -14,6 +14,7 @@ READ_BUFFER_SIZE = buffer_size = 1 * 
1000 * 1000 FILE_SNIPPET_SIZE = 1024 +NOT_TEXT_FILE_VALUE = "not a text file" async def stat_data(path: StagingPath) -> dict: @@ -51,7 +52,7 @@ def _file_read_from_tail(file_path): return file.read() except UnicodeError: # Note that this string propagates all the way to the ui. - return "not a text file" + return NOT_TEXT_FILE_VALUE def _file_read_from_head(file_path): @@ -69,7 +70,7 @@ def _file_read_from_head(file_path): return file.read(FILE_SNIPPET_SIZE) except UnicodeError: # Note that this string propagates all the way to the ui. - return "not a text file" + return NOT_TEXT_FILE_VALUE async def _read_metadata(metadata_path: str): diff --git a/tests/import_specifications/test_file_parser.py b/tests/import_specifications/test_file_parser.py index 81fe4cc0..4d0e9183 100644 --- a/tests/import_specifications/test_file_parser.py +++ b/tests/import_specifications/test_file_parser.py @@ -20,8 +20,12 @@ ) from tests.test_helpers import assert_exception_correct +# some test functions include "SpecificationSource" in the name, which violates +# snake_case rules, but is meaningful in this context. +# pylint: disable=C0103 -def spcsrc(path: str, tab: Optional[str] = None): + +def specification_source(path: str, tab: Optional[str] = None): return SpecificationSource(Path(path), tab) @@ -84,39 +88,45 @@ def fileTypeResolution_init_fail( def test_Error_init_w_FILE_NOT_FOUND_success(): # minimal - e = Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("foo")) + e = Error(ErrorType.FILE_NOT_FOUND, source_1=specification_source("foo")) assert e.error == ErrorType.FILE_NOT_FOUND assert e.message is None - assert e.source_1 == spcsrc("foo") + assert e.source_1 == specification_source("foo") assert e.source_2 is None # all - e = Error(ErrorType.FILE_NOT_FOUND, message="bar", source_1=spcsrc("foo")) + e = Error( + ErrorType.FILE_NOT_FOUND, message="bar", source_1=specification_source("foo") + ) assert e.error == ErrorType.FILE_NOT_FOUND assert e.message == "bar" - assert e.source_1 == spcsrc("foo") + assert e.source_1 == specification_source("foo") assert e.source_2 is None def test_Error_init_w_PARSE_FAIL_success(): - e = Error(ErrorType.PARSE_FAIL, message="foo", source_1=spcsrc("foo2")) + e = Error( + ErrorType.PARSE_FAIL, message="foo", source_1=specification_source("foo2") + ) assert e.error == ErrorType.PARSE_FAIL assert e.message == "foo" - assert e.source_1 == spcsrc("foo2") + assert e.source_1 == specification_source("foo2") assert e.source_2 is None def test_Error_init_w_INCORRECT_COLUMN_COUNT_success(): e = Error( - ErrorType.INCORRECT_COLUMN_COUNT, message="42", source_1=spcsrc("somefile") + ErrorType.INCORRECT_COLUMN_COUNT, + message="42", + source_1=specification_source("somefile"), ) assert e.error == ErrorType.INCORRECT_COLUMN_COUNT assert e.message == "42" - assert e.source_1 == spcsrc("somefile") + assert e.source_1 == specification_source("somefile") assert e.source_2 is None @@ -124,14 +134,14 @@ def test_Error_init_w_MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE_success(): e = Error( ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, "foo", - spcsrc("foo2"), - spcsrc("yay"), + specification_source("foo2"), + specification_source("yay"), ) assert e.error == ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE assert e.message == "foo" - assert e.source_1 == spcsrc("foo2") - assert e.source_2 == spcsrc("yay") + assert e.source_1 == specification_source("foo2") + assert e.source_2 == specification_source("yay") def test_Error_init_w_NO_FILES_PROVIDED_success(): @@ -153,11 +163,11 @@ def 
test_Error_init_w_OTHER_success(): assert e.source_2 is None # all - e = Error(ErrorType.OTHER, message="foo", source_1=spcsrc("wooo")) + e = Error(ErrorType.OTHER, message="foo", source_1=specification_source("wooo")) assert e.error == ErrorType.OTHER assert e.message == "foo" - assert e.source_1 == spcsrc("wooo") + assert e.source_1 == specification_source("wooo") assert e.source_2 is None @@ -172,11 +182,17 @@ def test_Error_init_fail(): ValueError("source_1 is required for a FILE_NOT_FOUND error"), ) err = "message, source_1 is required for a PARSE_FAIL error" - error_init_fail(ErrorType.PARSE_FAIL, None, spcsrc("wooo"), None, ValueError(err)) + error_init_fail( + ErrorType.PARSE_FAIL, None, specification_source("wooo"), None, ValueError(err) + ) error_init_fail(ErrorType.PARSE_FAIL, "msg", None, None, ValueError(err)) err = "message, source_1 is required for a INCORRECT_COLUMN_COUNT error" error_init_fail( - ErrorType.INCORRECT_COLUMN_COUNT, None, spcsrc("whee"), None, ValueError(err) + ErrorType.INCORRECT_COLUMN_COUNT, + None, + specification_source("whee"), + None, + ValueError(err), ) error_init_fail( ErrorType.INCORRECT_COLUMN_COUNT, "msg", None, None, ValueError(err) @@ -187,9 +203,15 @@ def test_Error_init_fail(): + "MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE error" ) error_init_fail(ms, None, None, None, ValueError(err)) - error_init_fail(ms, None, spcsrc("foo"), spcsrc("bar"), ValueError(err)) - error_init_fail(ms, "msg", None, spcsrc("bar"), ValueError(err)) - error_init_fail(ms, "msg", spcsrc("foo"), None, ValueError(err)) + error_init_fail( + ms, + None, + specification_source("foo"), + specification_source("bar"), + ValueError(err), + ) + error_init_fail(ms, "msg", None, specification_source("bar"), ValueError(err)) + error_init_fail(ms, "msg", specification_source("foo"), None, ValueError(err)) error_init_fail( ErrorType.OTHER, None, @@ -212,9 +234,9 @@ def error_init_fail( def test_ParseResult_init_success(): - pr = ParseResult(spcsrc("bar"), (frozendict({"foo": "bar"}),)) + pr = ParseResult(specification_source("bar"), (frozendict({"foo": "bar"}),)) - assert pr.source == spcsrc("bar") + assert pr.source == specification_source("bar") assert pr.result == (frozendict({"foo": "bar"}),) @@ -223,7 +245,9 @@ def test_ParseResult_init_fail(): parseResult_init_fail( None, (frozendict({"foo": "bar"}),), ValueError("source is required") ) - parseResult_init_fail(spcsrc("foo"), None, ValueError("result is required")) + parseResult_init_fail( + specification_source("foo"), None, ValueError("result is required") + ) def parseResult_init_fail( @@ -239,7 +263,7 @@ def parseResult_init_fail( PR_RESULTS = frozendict( { "data_type": ParseResult( - spcsrc("some_file", "tab"), + specification_source("some_file", "tab"), (frozendict({"fasta_file": "foo.fa", "do_thing": 1}),), # make a tuple! ) } @@ -247,7 +271,11 @@ def parseResult_init_fail( PR_ERROR = ( Error(ErrorType.OTHER, message="foo"), - Error(ErrorType.PARSE_FAIL, message="bar", source_1=spcsrc("some_file", "tab3")), + Error( + ErrorType.PARSE_FAIL, + message="bar", + source_1=specification_source("some_file", "tab3"), + ), ) @@ -306,11 +334,11 @@ def test_parse_import_specifications_success(): frozendict( { "type1": ParseResult( - spcsrc("myfile.xlsx", "tab1"), + specification_source("myfile.xlsx", "tab1"), (frozendict({"foo": "bar"}), frozendict({"baz": "bat"})), ), "type2": ParseResult( - spcsrc("myfile.xlsx", "tab2"), + specification_source("myfile.xlsx", "tab2"), (frozendict({"whee": "whoo"}),), # tuple! 
), } @@ -321,7 +349,7 @@ def test_parse_import_specifications_success(): frozendict( { "type_other": ParseResult( - spcsrc("somefile.csv"), + specification_source("somefile.csv"), (frozendict({"foo": "bar2"}), frozendict({"baz": "bat2"})), ) } @@ -336,15 +364,15 @@ def test_parse_import_specifications_success(): frozendict( { "type1": ParseResult( - spcsrc("myfile.xlsx", "tab1"), + specification_source("myfile.xlsx", "tab1"), (frozendict({"foo": "bar"}), frozendict({"baz": "bat"})), ), "type2": ParseResult( - spcsrc("myfile.xlsx", "tab2"), + specification_source("myfile.xlsx", "tab2"), (frozendict({"whee": "whoo"}),), # tuple! ), "type_other": ParseResult( - spcsrc("somefile.csv"), + specification_source("somefile.csv"), (frozendict({"foo": "bar2"}), frozendict({"baz": "bat2"})), ), } @@ -406,12 +434,20 @@ def test_parse_import_specification_unsupported_type_and_parser_error(): errors=tuple( [ Error(ErrorType.OTHER, "foo"), - Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("foo.csv")), + Error( + ErrorType.FILE_NOT_FOUND, source_1=specification_source("foo.csv") + ), ] ) ) parser2.return_value = ParseResults( - frozendict({"foo": ParseResult(spcsrc("a"), tuple([frozendict({"a": "b"})]))}) + frozendict( + { + "foo": ParseResult( + specification_source("a"), tuple([frozendict({"a": "b"})]) + ) + } + ) ) res = parse_import_specifications( @@ -422,11 +458,13 @@ def test_parse_import_specification_unsupported_type_and_parser_error(): errors=tuple( [ Error(ErrorType.OTHER, "foo"), - Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("foo.csv")), + Error( + ErrorType.FILE_NOT_FOUND, source_1=specification_source("foo.csv") + ), Error( ErrorType.PARSE_FAIL, "JPEG is not a supported file type for import specifications", - spcsrc(Path("x.jpeg")), + specification_source("x.jpeg"), ), ] ) @@ -463,25 +501,40 @@ def test_parse_import_specification_multiple_specs_and_parser_error(): errors=tuple( [ Error(ErrorType.OTHER, "other"), - Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("myfile.xlsx")), + Error( + ErrorType.FILE_NOT_FOUND, + source_1=specification_source("myfile.xlsx"), + ), ] ) ) parser2.return_value = ParseResults( frozendict( { - "foo": ParseResult(spcsrc("a1"), tuple([frozendict({"a": "b"})])), - "bar": ParseResult(spcsrc("b1"), tuple([frozendict({"a": "b"})])), - "baz": ParseResult(spcsrc("c1"), tuple([frozendict({"a": "b"})])), + "foo": ParseResult( + specification_source("a1"), tuple([frozendict({"a": "b"})]) + ), + "bar": ParseResult( + specification_source("b1"), tuple([frozendict({"a": "b"})]) + ), + "baz": ParseResult( + specification_source("c1"), tuple([frozendict({"a": "b"})]) + ), }, ) ) parser3.return_value = ParseResults( frozendict( { - "foo2": ParseResult(spcsrc("a2"), tuple([frozendict({"a": "b"})])), - "bar": ParseResult(spcsrc("b2"), tuple([frozendict({"a": "b"})])), - "baz": ParseResult(spcsrc("c2"), tuple([frozendict({"a": "b"})])), + "foo2": ParseResult( + specification_source("a2"), tuple([frozendict({"a": "b"})]) + ), + "bar": ParseResult( + specification_source("b2"), tuple([frozendict({"a": "b"})]) + ), + "baz": ParseResult( + specification_source("c2"), tuple([frozendict({"a": "b"})]) + ), }, ) ) @@ -494,18 +547,21 @@ def test_parse_import_specification_multiple_specs_and_parser_error(): errors=tuple( [ Error(ErrorType.OTHER, "other"), - Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("myfile.xlsx")), + Error( + ErrorType.FILE_NOT_FOUND, + source_1=specification_source("myfile.xlsx"), + ), Error( ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, "Data type bar appears 
in two importer specification sources", - spcsrc("b1"), - spcsrc("b2"), + specification_source("b1"), + specification_source("b2"), ), Error( ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, "Data type baz appears in two importer specification sources", - spcsrc("c1"), - spcsrc("c2"), + specification_source("c1"), + specification_source("c2"), ), ] ) diff --git a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py index a8b20c1d..12df6c84 100644 --- a/tests/import_specifications/test_individual_parsers.py +++ b/tests/import_specifications/test_individual_parsers.py @@ -3,8 +3,7 @@ from collections.abc import Callable, Generator from pathlib import Path -# TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26 -from frozendict.core import frozendict +from frozendict import frozendict from pytest import fixture from staging_service.import_specifications.individual_parsers import ( @@ -21,9 +20,12 @@ _TEST_DATA_DIR = (Path(__file__).parent / "test_data").resolve() +# Most test functions don't have docstrings, and probably never will. +# pylint: disable=C0116 -@fixture(scope="module") -def temp_dir() -> Generator[Path, None, None]: + +@fixture(scope="module", name="temp_dir") +def temp_dir_fixture() -> Generator[Path, None, None]: with FileUtil() as fu: childdir = Path(fu.make_dir(str(uuid.uuid4()))).resolve() @@ -50,7 +52,7 @@ def _xsv_parse_success( ): s = sep input_ = temp_dir / str(uuid.uuid4()) - with open(input_, "w") as test_file: + with open(input_, "w", encoding="utf-8") as test_file: test_file.writelines( [ f"Data type: some_type; Columns: 4; Version: 1{s}{s}{s}\n", @@ -126,7 +128,7 @@ def _xsv_parse_success_nan_inf( ): s = sep input_ = temp_dir / str(uuid.uuid4()) - with open(input_, "w") as test_file: + with open(input_, "w", encoding="utf-8") as test_file: test_file.writelines( [ f"Data type: some_nan_type; Columns: 4; Version: 1{s}{s}{s}\n", @@ -193,7 +195,7 @@ def _xsv_parse_success_with_numeric_headers( ): s = sep input_ = temp_dir / str(uuid.uuid4()) - with open(input_, "w") as test_file: + with open(input_, "w", encoding="utf-8") as test_file: test_file.writelines( [ "Data type: some_type; Columns: 4; Version: 1\n", @@ -237,7 +239,7 @@ def _xsv_parse_success_with_internal_and_trailing_empty_lines( ): s = sep input_ = temp_dir / str(uuid.uuid4()) - with open(input_, "w") as test_file: + with open(input_, "w", encoding="utf-8") as test_file: test_file.writelines( [ "Data type: other_type; Columns: 4; Version: 1\n", @@ -287,7 +289,7 @@ def _xsv_parse_success_with_internal_and_trailing_empty_lines( def test_xsv_parse_fail_no_file(temp_dir: Path): input_ = temp_dir / str(uuid.uuid4()) - with open(input_, "w") as test_file: + with open(input_, "w", encoding="utf-8") as test_file: test_file.writelines( [ "Data type: other_type; Columns: 4; Version: 1\n", @@ -307,7 +309,7 @@ def test_xsv_parse_fail_no_file(temp_dir: Path): ) -def test_xsv_parse_fail_binary_file(temp_dir: Path): +def test_xsv_parse_fail_binary_file(): test_file = _TEST_DATA_DIR / "testtabs3full2nodata1empty.xls" res = parse_csv(test_file) @@ -353,7 +355,7 @@ def _xsv_parse_fail( extension: str = "", ): input_ = temp_dir / (str(uuid.uuid4()) + extension) - with open(input_, "w") as test_file: + with open(input_, "w", encoding="utf-8") as test_file: test_file.writelines(lines) res = parser(input_) @@ -405,7 +407,8 @@ def test_xsv_parse_fail_missing_column_header_entries(temp_dir: Path): def 
test_xsv_parse_fail_missing_data(temp_dir: Path): err = "No non-header data in file" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" "head1, head2, head3\n", + "Data type: foo; Columns: 3; Version: 1\n", + "head1, head2, head3\n", "Head 1, Head 2, Head 3\n", ] _xsv_parse_fail(temp_dir, lines, parse_csv, err) @@ -414,35 +417,40 @@ def test_xsv_parse_fail_missing_data(temp_dir: Path): def test_xsv_parse_fail_unequal_rows(temp_dir: Path): err = "Incorrect number of items in line 3, expected 3, got 2" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" "head1, head2, head3\n", + "Data type: foo; Columns: 3; Version: 1\n", + "head1, head2, head3\n", "Head 1, Head 2\n", ] _xsv_parse_fail(temp_dir, lines, parse_csv, err, ErrorType.INCORRECT_COLUMN_COUNT) err = "Incorrect number of items in line 3, expected 2, got 3" lines = [ - "Data type: foo; Columns: 2; Version: 1\n" "head1\thead2\n", + "Data type: foo; Columns: 2; Version: 1\n", + "head1\thead2\n", "Head 1\tHead 2\tHead 3\n", ] _xsv_parse_fail(temp_dir, lines, parse_tsv, err, ErrorType.INCORRECT_COLUMN_COUNT) err = "Incorrect number of items in line 2, expected 2, got 3" lines = [ - "Data type: foo; Columns: 2; Version: 1\n" "head1, head2, head3\n", + "Data type: foo; Columns: 2; Version: 1\n", + "head1, head2, head3\n", "Head 1, Head 2\n", ] _xsv_parse_fail(temp_dir, lines, parse_csv, err, ErrorType.INCORRECT_COLUMN_COUNT) err = "Incorrect number of items in line 2, expected 3, got 2" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" "head1\thead2\n", + "Data type: foo; Columns: 3; Version: 1\n", + "head1\thead2\n", "Head 1\tHead 2\tHead 3\n", ] _xsv_parse_fail(temp_dir, lines, parse_tsv, err, ErrorType.INCORRECT_COLUMN_COUNT) err = "Incorrect number of items in line 5, expected 3, got 4" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" "head1, head2, head 3\n", + "Data type: foo; Columns: 3; Version: 1\n", + "head1, head2, head 3\n", "Head 1, Head 2, Head 3\n", "1, 2, 3\n", "1, 2, 3, 4\n", @@ -452,7 +460,8 @@ def test_xsv_parse_fail_unequal_rows(temp_dir: Path): err = "Incorrect number of items in line 6, expected 3, got 2" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" "head1\thead2\thead 3\n", + "Data type: foo; Columns: 3; Version: 1\n", + "head1\thead2\thead 3\n", "Head 1\tHead 2\tHead 3\n", "1\t2\t3\n", "1\t2\t3\n", @@ -465,14 +474,16 @@ def test_xsv_parse_fail_unequal_rows(temp_dir: Path): def test_xsv_parse_fail_duplicate_headers(temp_dir: Path): err = "Duplicate header name in row 2: head3" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" "head3, head2, head3\n", + "Data type: foo; Columns: 3; Version: 1\n", + "head3, head2, head3\n", "Head 1, Head 2, Head 3\n", ] _xsv_parse_fail(temp_dir, lines, parse_csv, err) # test with duplicate dual headers lines = [ - "Data type: foo; Columns: 3; Version: 1\n" "head3, head2, head3\n", + "Data type: foo; Columns: 3; Version: 1\n", + "head3, head2, head3\n", "Head 3, Head 2, Head 3\n", ] _xsv_parse_fail(temp_dir, lines, parse_csv, err) @@ -641,7 +652,8 @@ def test_excel_parse_fail_empty_file(temp_dir: Path): def test_excel_parse_fail_non_excel_file(temp_dir: Path): lines = [ - "Data type: foo; Version: 1\n" "head1, head2, head 3\n", + "Data type: foo; Version: 1\n", + "head1, head2, head 3\n", "Head 1, Head 2, Head 3\n", "1, 2, 3\n", ] @@ -683,26 +695,29 @@ def test_excel_parse_fail_headers_only(): def test_excel_parse_fail_colliding_datatypes(): f = _get_test_file("testdatatypecollisions.xls") - l = lambda t: f"Found datatype {t} in multiple tabs" + + def 
error_message(data_type_name): + return f"Found datatype {data_type_name} in multiple tabs" + err = ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE _excel_parse_fail( f, errors=[ Error( err, - l("type2"), + error_message("type2"), SpecificationSource(f, "dt2"), SpecificationSource(f, "dt2_2"), ), Error( err, - l("type3"), + error_message("type3"), SpecificationSource(f, "dt3"), SpecificationSource(f, "dt3_2"), ), Error( err, - l("type2"), + error_message("type2"), SpecificationSource(f, "dt2"), SpecificationSource(f, "dt2_3"), ), @@ -712,12 +727,23 @@ def test_excel_parse_fail_colliding_datatypes(): def test_excel_parse_fail_duplicate_headers(): f = _get_test_file("testduplicateheaders.xlsx") - l = lambda h: f"Duplicate header name in row 2: {h}" + + def error_message(header_name): + return f"Duplicate header name in row 2: {header_name}" + _excel_parse_fail( f, errors=[ - Error(ErrorType.PARSE_FAIL, l("head1"), SpecificationSource(f, "dt2")), - Error(ErrorType.PARSE_FAIL, l("head2"), SpecificationSource(f, "dt3")), + Error( + ErrorType.PARSE_FAIL, + error_message("head1"), + SpecificationSource(f, "dt2"), + ), + Error( + ErrorType.PARSE_FAIL, + error_message("head2"), + SpecificationSource(f, "dt3"), + ), ], ) From 1158081f4a93d6001aad68240bb07ec1ac21c59e Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Thu, 15 Jun 2023 08:39:36 -0700 Subject: [PATCH 16/33] just a few more linting fixes [PTV-1882] --- staging_service/AutoDetectUtils.py | 6 +- .../individual_parsers.py | 73 ++++++++++++------- staging_service/metadata.py | 2 - .../import_specifications/test_file_parser.py | 9 ++- tests/test_app.py | 2 +- 5 files changed, 57 insertions(+), 35 deletions(-) diff --git a/staging_service/AutoDetectUtils.py b/staging_service/AutoDetectUtils.py index 4b398120..ad1bfab9 100644 --- a/staging_service/AutoDetectUtils.py +++ b/staging_service/AutoDetectUtils.py @@ -8,6 +8,8 @@ MappingsConfig = dict[str, Mappings] +# pylint: disable=C0115,C0116 + class AutoDetectUtils: _MAPPINGS: None | dict[str, Any] = None # expects to be set by config @@ -22,10 +24,6 @@ def has_mappings(cls) -> bool: return True return False - # @classmethod - # def get_mappings(cls) -> dict[str, Any]: - # return cls._MAPPINGS - @classmethod def get_mappings_by_extension(cls, extension: str): if cls._MAPPINGS is not None: diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py index e714bd6c..68696eaf 100644 --- a/staging_service/import_specifications/individual_parsers.py +++ b/staging_service/import_specifications/individual_parsers.py @@ -6,7 +6,7 @@ import math import re from pathlib import Path -from typing import Any, Optional, Union +from typing import Any, Optional, Tuple, Union import magic import pandas @@ -173,20 +173,28 @@ def _normalize_headers( def _parse_xsv(path: Path, sep: str) -> ParseResults: - spcsrc = SpecificationSource(path) + specification_source = SpecificationSource(path) try: filetype = magic.from_file(str(path), mime=True) if filetype not in _MAGIC_TEXT_FILES: return _error( - Error(ErrorType.PARSE_FAIL, "Not a text file: " + filetype, spcsrc) + Error( + ErrorType.PARSE_FAIL, + "Not a text file: " + filetype, + specification_source, + ) ) - with open(path, newline="") as input_: + with open(path, newline="", encoding="utf-8") as input_: rdr = csv.reader(input_, delimiter=sep) # let parser handle quoting - dthd = _csv_next(rdr, 1, None, spcsrc, "Missing data type / version header") - datatype, columns = _parse_header(dthd[0], 
spcsrc, _VERSION) - hd1 = _csv_next(rdr, 2, columns, spcsrc, "Missing 2nd header line") - param_ids = _normalize_headers(hd1, 2, spcsrc) - _csv_next(rdr, 3, columns, spcsrc, "Missing 3rd header line") + dthd = _csv_next( + rdr, 1, None, specification_source, "Missing data type / version header" + ) + datatype, columns = _parse_header(dthd[0], specification_source, _VERSION) + hd1 = _csv_next( + rdr, 2, columns, specification_source, "Missing 2nd header line" + ) + param_ids = _normalize_headers(hd1, 2, specification_source) + _csv_next(rdr, 3, columns, specification_source, "Missing 3rd header line") results = [] for i, row in enumerate(rdr, start=4): if row: # skip empty rows @@ -198,7 +206,7 @@ def _parse_xsv(path: Path, sep: str) -> ParseResults: ErrorType.INCORRECT_COLUMN_COUNT, f"Incorrect number of items in line {i}, " + f"expected {columns}, got {len(row)}", - spcsrc, + specification_source, ) ) results.append( @@ -211,14 +219,24 @@ def _parse_xsv(path: Path, sep: str) -> ParseResults: ) if not results: raise _ParseException( - Error(ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc) + Error( + ErrorType.PARSE_FAIL, + "No non-header data in file", + specification_source, + ) ) - return ParseResults(frozendict({datatype: ParseResult(spcsrc, tuple(results))})) + return ParseResults( + frozendict({datatype: ParseResult(specification_source, tuple(results))}) + ) except FileNotFoundError: - return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc)) + return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=specification_source)) except IsADirectoryError: return _error( - Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc) + Error( + ErrorType.PARSE_FAIL, + "The given path is a directory", + specification_source, + ) ) except _ParseException as e: return _error(e.args[0]) @@ -235,7 +253,10 @@ def parse_tsv(path: Path) -> ParseResults: def _process_excel_row( - row: tuple[Any], rownum: int, expected_columns: int, spcsrc: SpecificationSource + row: tuple[Any], + rownum: int, + expected_columns: int, + specification_source: SpecificationSource, ) -> list[Any]: while len(row) > expected_columns: if pandas.isna(row[-1]): # inefficient, but premature optimization... 
@@ -246,38 +267,40 @@ def _process_excel_row( ErrorType.INCORRECT_COLUMN_COUNT, f"Incorrect number of items in line {rownum}, " + f"expected {expected_columns}, got {len(row)}", - spcsrc, + specification_source, ) ) return row def _process_excel_tab( - excel: pandas.ExcelFile, spcsrc: SpecificationSource -) -> (Optional[str], Optional[ParseResult]): + excel: pandas.ExcelFile, specification_source: SpecificationSource +) -> Tuple[Optional[str], Optional[ParseResult]]: df = excel.parse( - sheet_name=spcsrc.tab, na_values=_EXCEL_MISSING_VALUES, keep_default_na=False + sheet_name=specification_source.tab, + na_values=_EXCEL_MISSING_VALUES, + keep_default_na=False, ) if df.shape[0] < 3: # might as well not error check headers in sheets with no data return (None, None) # at this point we know that at least 4 lines are present - expecting the data type header, # parameter ID header, display name header, and at least one data row header = df.columns.get_level_values(0)[0] - datatype, columns = _parse_header(header, spcsrc, _VERSION) + datatype, columns = _parse_header(header, specification_source, _VERSION) it = df.itertuples(index=False, name=None) - hd1 = _process_excel_row(next(it), 2, columns, spcsrc) - param_ids = _normalize_headers(hd1, 2, spcsrc) - _process_excel_row(next(it), 3, columns, spcsrc) + hd1 = _process_excel_row(next(it), 2, columns, specification_source) + param_ids = _normalize_headers(hd1, 2, specification_source) + _process_excel_row(next(it), 3, columns, specification_source) results = [] for i, row in enumerate(it, start=4): - row = _process_excel_row(row, i, columns, spcsrc) + row = _process_excel_row(row, i, columns, specification_source) if any(map(lambda x: not pandas.isna(x), row)): # skip empty rows results.append( frozendict( {param_ids[j]: _normalize_pandas(row[j]) for j in range(len(row))} ) ) - return datatype, ParseResult(spcsrc, tuple(results)) + return datatype, ParseResult(specification_source, tuple(results)) def parse_excel(path: Path) -> ParseResults: diff --git a/staging_service/metadata.py b/staging_service/metadata.py index 44418d3e..0ef1e8ea 100644 --- a/staging_service/metadata.py +++ b/staging_service/metadata.py @@ -305,8 +305,6 @@ async def dir_info( elif entry.is_file(): if query == "" or specific_path.user_path.find(query) != -1: metadata, file_stats = await _ensure_metadata(specific_path) - # data = await stat_data(specific_path) - # data["source"] = await _only_source(specific_path) file_stats["source"] = metadata["source"] response.append(file_stats) diff --git a/tests/import_specifications/test_file_parser.py b/tests/import_specifications/test_file_parser.py index 4d0e9183..ad40c0e5 100644 --- a/tests/import_specifications/test_file_parser.py +++ b/tests/import_specifications/test_file_parser.py @@ -23,6 +23,7 @@ # some test functions include "SpecificationSource" in the name, which violates # snake_case rules, but is meaningful in this context. 
# pylint: disable=C0103 +# pylint: disable=C0116 def specification_source(path: str, tab: Optional[str] = None): @@ -55,10 +56,12 @@ def specificationSource_init_fail(file_: Optional[str], expected: Exception): def test_FileTypeResolution_init_w_parser_success(): - p = lambda path: ParseResults(errors=(Error(ErrorType.OTHER, "foo"),)) - ftr = FileTypeResolution(p) + def parser(_: str): + return ParseResults(errors=(Error(ErrorType.OTHER, "foo"),)) - assert ftr.parser is p # Here only identity equality makes sense + ftr = FileTypeResolution(parser) + + assert ftr.parser is parser # Here only identity equality makes sense assert ftr.unsupported_type is None diff --git a/tests/test_app.py b/tests/test_app.py index 95f1f441..12f4a0a0 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -358,7 +358,7 @@ async def test_metadata(get_user): @pytest.mark.asyncio @patch("staging_service.auth2Client.KBaseAuth2.get_user") -async def test_define_UPA(get_user): +async def test_define_UPA(get_user): # pylint: disable=C0103 txt = "testing text\n" username = "testuser" get_user.return_value = username From e3164c963672185e5db82e10db3a35761c0f3ea4 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Thu, 15 Jun 2023 08:53:18 -0700 Subject: [PATCH 17/33] a few more code quality fixes [PTV-1882] --- staging_service/app.py | 15 +++++++++------ staging_service/metadata.py | 4 ++-- tests/test_autodetect.py | 2 +- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/staging_service/app.py b/staging_service/app.py index 37336f6f..7268e9b0 100644 --- a/staging_service/app.py +++ b/staging_service/app.py @@ -821,19 +821,22 @@ def inject_config_dependencies(config): StagingPath._META_DIR = META_DIR StagingPath._CONCIERGE_PATH = CONCIERGE_PATH - # TODO: why make these attributes "private" if they are not used that way? - # either provide a static setter method, or make them public, I'd say. 
if StagingPath._DATA_DIR is None: - raise Exception("Please provide DATA_DIR in the config file ") + raise web.HTTPInternalServerError(text="DATA_DIR missing in the config file ") if StagingPath._META_DIR is None: - raise Exception("Please provide META_DIR in the config file ") + raise web.HTTPInternalServerError(text="META_DIR missing in the config file ") if StagingPath._CONCIERGE_PATH is None: - raise Exception("Please provide CONCIERGE_PATH in the config file ") + raise web.HTTPInternalServerError( + text="CONCIERGE_PATH missing in the config file " + ) if FILE_EXTENSION_MAPPINGS is None: - raise Exception("Please provide FILE_EXTENSION_MAPPINGS in the config file ") + raise web.HTTPInternalServerError( + text="FILE_EXTENSION_MAPPINGS missing in the config file " + ) + with open( FILE_EXTENSION_MAPPINGS, encoding="utf-8" ) as file_extension_mappings_file: diff --git a/staging_service/metadata.py b/staging_service/metadata.py index 0ef1e8ea..73fed669 100644 --- a/staging_service/metadata.py +++ b/staging_service/metadata.py @@ -24,13 +24,13 @@ async def stat_data(path: StagingPath) -> dict: only call this on a validated full path """ file_stats = os.stat(path.full_path) - isFolder = os.path.isdir(path.full_path) + is_folder = os.path.isdir(path.full_path) return { "name": path.name, "path": path.user_path, "mtime": int(file_stats.st_mtime * 1000), # given in seconds, want ms "size": file_stats.st_size, - "isFolder": isFolder, + "isFolder": is_folder, } diff --git a/tests/test_autodetect.py b/tests/test_autodetect.py index a5c2714d..32b24856 100644 --- a/tests/test_autodetect.py +++ b/tests/test_autodetect.py @@ -223,7 +223,7 @@ def test_zip_mappings(): def test_get_mappings(): """ Basic test of the get mappings logic. Most of the logic is in determine_possible_importers - which is throughly tested above. + which is thoroughly tested above. """ assert AutoDetectUtils.get_mappings( ["filename", "file.name.Gz", "some.dots.gff3.gz"] From 2fb8db07068b2b3d641ba82b32b7ac0c1c32d4f3 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Thu, 15 Jun 2023 09:05:00 -0700 Subject: [PATCH 18/33] last of the "critical" code smell issues? [PTV-1882] --- staging_service/metadata.py | 45 ++++++++++++------------------------- 1 file changed, 14 insertions(+), 31 deletions(-) diff --git a/staging_service/metadata.py b/staging_service/metadata.py index 73fed669..d3e9265b 100644 --- a/staging_service/metadata.py +++ b/staging_service/metadata.py @@ -12,8 +12,10 @@ decoder = JSONDecoder() encoder = JSONEncoder() -READ_BUFFER_SIZE = buffer_size = 1 * 1000 * 1000 +READ_BUFFER_SIZE = 1 * 1000 * 1000 FILE_SNIPPET_SIZE = 1024 + +# Set as the value for snippets for a non-text file NOT_TEXT_FILE_VALUE = "not a text file" @@ -137,19 +139,16 @@ async def _generate_metadata(path: StagingPath, source: str = None): file_stats = os.stat(path.full_path) additional_metadata["mtime"] = int(file_stats.st_mtime * 1000) - # If source is missing (why would that be?), supply it with the one + # If source is missing (why woudld that be?), supply it with the one # provided, or attempt to determine it. 
- if "source" not in existing_metadata: - if source is None: - source = _determine_source(path) - additional_metadata["source"] = source + if "source" not in existing_metadata and source is None: + source = _determine_source(path) + additional_metadata["source"] = source metadata = existing_metadata | additional_metadata - # # We want to know, roughly, if this is a text file or not ("binary"); # setting to true initially is just part of the logic in the loop below. - # is_text = True # A word on counting lines. We simply count line endings. With a catch. @@ -162,9 +161,7 @@ async def _generate_metadata(path: StagingPath, source: str = None): # Keeping the previous chunk of bytes read from the file is necessary for accurate # line counting. last_chunk_read = None - chunk_count = 0 - size = 0 md5 = hashlib.md5() # noqua: S303 @@ -172,31 +169,20 @@ async def _generate_metadata(path: StagingPath, source: str = None): while True: chunk = await fin.read(READ_BUFFER_SIZE) - # # If attempt to read past end of file, a 0-length bytes is returned - # if len(chunk) == 0: break last_chunk_read = chunk - size += len(chunk) - chunk_count += 1 - md5.update(chunk) - # # We use the first chunk to determine if the file is valid utf-8 text. # The first chunk is 1MB, so this should be safe. - # if chunk_count == 1: is_text = is_text_string(chunk) - # - # Let us not waste time counting lines if we already think this is not - # text. - # if is_text: line_count += chunk.count(b"\n") @@ -207,19 +193,16 @@ async def _generate_metadata(path: StagingPath, source: str = None): if last_chunk_read is not None and last_chunk_read[-1] != ord(b"\n"): line_count += 1 - if is_text: - metadata["lineCount"] = line_count - else: - metadata["lineCount"] = None + metadata["lineCount"] = line_count if is_text else None metadata["md5"] = md5.hexdigest() - if is_text: - metadata["head"] = _file_read_from_head(path.full_path) - metadata["tail"] = _file_read_from_tail(path.full_path) - else: - metadata["head"] = "not a text file" - metadata["tail"] = "not a text file" + metadata["head"] = ( + _file_read_from_head(path.full_path) if is_text else NOT_TEXT_FILE_VALUE + ) + metadata["tail"] = ( + _file_read_from_tail(path.full_path) if is_text else NOT_TEXT_FILE_VALUE + ) # Save metadata file await _save_metadata(path, metadata) From 680d81f6542c7317c205e3497d9b0ae0c96171f6 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Thu, 15 Jun 2023 09:10:33 -0700 Subject: [PATCH 19/33] couple more for sonarcloud [PTV-1882] --- .../import_specifications/test_file_parser.py | 4 ++-- tests/test_app.py | 19 ++++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/tests/import_specifications/test_file_parser.py b/tests/import_specifications/test_file_parser.py index ad40c0e5..57dd8a38 100644 --- a/tests/import_specifications/test_file_parser.py +++ b/tests/import_specifications/test_file_parser.py @@ -46,10 +46,10 @@ def test_SpecificationSource_init_success(): def test_SpecificationSource_init_fail(): # could inline this, but might as well follow the same pattern as all the other tests - specificationSource_init_fail(None, ValueError("file is required")) + assert_SpecificationSource_init_fail(None, ValueError("file is required")) -def specificationSource_init_fail(file_: Optional[str], expected: Exception): +def assert_SpecificationSource_init_fail(file_: Optional[str], expected: Exception): with raises(Exception) as got: SpecificationSource(file_) assert_exception_correct(got.value, expected) diff --git a/tests/test_app.py 
b/tests/test_app.py index 12f4a0a0..d9089b6c 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -23,9 +23,14 @@ import staging_service.globus as globus import staging_service.utils as utils from staging_service.AutoDetectUtils import AutoDetectUtils -from tests.test_helpers import (DATA_DIR, META_DIR, FileUtil, - assert_file_contents, bootstrap, - check_excel_contents) +from tests.test_helpers import ( + DATA_DIR, + META_DIR, + FileUtil, + assert_file_contents, + bootstrap, + check_excel_contents, +) if os.environ.get("KB_DEPLOYMENT_CONFIG") is None: bootstrap() @@ -358,15 +363,15 @@ async def test_metadata(get_user): @pytest.mark.asyncio @patch("staging_service.auth2Client.KBaseAuth2.get_user") -async def test_define_UPA(get_user): # pylint: disable=C0103 +async def test_define_upa(get_user): txt = "testing text\n" username = "testuser" get_user.return_value = username async with AppClient() as cli: - with FileUtil() as fs: - fs.make_dir(os.path.join(username, "test")) - fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + with FileUtil() as file_util: + file_util.make_dir(os.path.join(username, "test")) + file_util.make_file(os.path.join(username, "test", "test_file_1"), txt) # generating metadata file res1 = await cli.get( os.path.join("metadata", "test", "test_file_1"), From e53e1c30c357fd8a700b686fe9c8df257d169657 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Thu, 15 Jun 2023 09:19:27 -0700 Subject: [PATCH 20/33] finally fix the file-existence-check race condition [PTV-1882] --- staging_service/metadata.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/staging_service/metadata.py b/staging_service/metadata.py index d3e9265b..096c2bcb 100644 --- a/staging_service/metadata.py +++ b/staging_service/metadata.py @@ -84,6 +84,18 @@ async def _read_metadata(metadata_path: str): return decoder.decode(data) +async def _get_metadata(metadata_path: str): + """ + Gets the file metadata from the file system or, if it doesn't exist, + return an empty dict. + """ + try: + return await _read_metadata(metadata_path) + except FileNotFoundError: + os.makedirs(os.path.dirname(metadata_path), exist_ok=True) + return {} + + def is_text_string(bytes_to_check): """ Determines if the provided bytes is indeed a text string. @@ -125,11 +137,7 @@ async def _generate_metadata(path: StagingPath, source: str = None): # Open metadatafile if it exists, otherwise create an # empty dict and ensure there is a writable directory for it # to be saved to. 
- if os.path.exists(path.metadata_path): - existing_metadata = await _read_metadata(path.metadata_path) - else: - os.makedirs(os.path.dirname(path.metadata_path), exist_ok=True) - existing_metadata = {} + existing_metadata = await _get_metadata(path.metadata_path) additional_metadata = {} From 4479bc275554fd8d3e96a09174e638cf726b3b48 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Thu, 15 Jun 2023 10:41:54 -0700 Subject: [PATCH 21/33] few more code quality fixes [PTV-1882] --- staging_service/metadata.py | 15 ++-- .../import_specifications/test_file_parser.py | 71 +++++++++++-------- tests/test_metadata.py | 20 +++--- 3 files changed, 61 insertions(+), 45 deletions(-) diff --git a/staging_service/metadata.py b/staging_service/metadata.py index 096c2bcb..ca264412 100644 --- a/staging_service/metadata.py +++ b/staging_service/metadata.py @@ -18,6 +18,9 @@ # Set as the value for snippets for a non-text file NOT_TEXT_FILE_VALUE = "not a text file" +SOURCE_JGI_IMPORT = "JGI import" +SOURCE_UNKNOWN = "Unknown" + async def stat_data(path: StagingPath) -> dict: """ @@ -116,7 +119,7 @@ def is_text_string(bytes_to_check): # use this to enable line counting and snippets. # We allow TAB (9), LF (10), CR (13) for obvious reasons. for character in decoded: - if unicodedata.category(character) == "Cc" and not ord(character) in [ + if unicodedata.category(character) == "Cc" and ord(character) not in [ 9, 10, 13, @@ -170,7 +173,6 @@ async def _generate_metadata(path: StagingPath, source: str = None): # line counting. last_chunk_read = None chunk_count = 0 - size = 0 md5 = hashlib.md5() # noqua: S303 async with aiofiles.open(path.full_path, "rb") as fin: @@ -182,7 +184,6 @@ async def _generate_metadata(path: StagingPath, source: str = None): break last_chunk_read = chunk - size += len(chunk) chunk_count += 1 md5.update(chunk) @@ -243,14 +244,14 @@ async def _update_metadata(path: StagingPath, additional_metadata: dict) -> dict return new_metdata -async def add_upa(path: StagingPath, UPA: str): +async def add_upa(path: StagingPath, upa: str): """ Adds the provided UPA to the metadata for the given path. If the metadata does not yet exist, the metadata is generated. """ await _ensure_metadata(path) - await _update_metadata(path, {"UPA": UPA}) + await _update_metadata(path, {"UPA": upa}) def _determine_source(path: StagingPath): @@ -260,9 +261,9 @@ def _determine_source(path: StagingPath): """ jgi_path = path.jgi_metadata if os.path.exists(jgi_path) and os.path.isfile(jgi_path): - return "JGI import" + return SOURCE_JGI_IMPORT else: - return "Unknown" + return SOURCE_UNKNOWN async def dir_info( diff --git a/tests/import_specifications/test_file_parser.py b/tests/import_specifications/test_file_parser.py index 57dd8a38..3453dcbb 100644 --- a/tests/import_specifications/test_file_parser.py +++ b/tests/import_specifications/test_file_parser.py @@ -20,6 +20,11 @@ ) from tests.test_helpers import assert_exception_correct +TEST_EXCEL_FILE = "myfile.xlsx" + +# This file does not exist, it is used for a mock, appears in many locations. +TEST_MOCK_CSV_FILE_NAME = "somefile.csv" + # some test functions include "SpecificationSource" in the name, which violates # snake_case rules, but is meaningful in this context. 
# pylint: disable=C0103 @@ -49,9 +54,9 @@ def test_SpecificationSource_init_fail(): assert_SpecificationSource_init_fail(None, ValueError("file is required")) -def assert_SpecificationSource_init_fail(file_: Optional[str], expected: Exception): +def assert_SpecificationSource_init_fail(file: Optional[str], expected: Exception): with raises(Exception) as got: - SpecificationSource(file_) + SpecificationSource(file) assert_exception_correct(got.value, expected) @@ -124,12 +129,12 @@ def test_Error_init_w_INCORRECT_COLUMN_COUNT_success(): e = Error( ErrorType.INCORRECT_COLUMN_COUNT, message="42", - source_1=specification_source("somefile"), + source_1=specification_source("some_other_file"), ) assert e.error == ErrorType.INCORRECT_COLUMN_COUNT assert e.message == "42" - assert e.source_1 == specification_source("somefile") + assert e.source_1 == specification_source("some_other_file") assert e.source_2 is None @@ -337,11 +342,11 @@ def test_parse_import_specifications_success(): frozendict( { "type1": ParseResult( - specification_source("myfile.xlsx", "tab1"), + specification_source(TEST_EXCEL_FILE, "tab1"), (frozendict({"foo": "bar"}), frozendict({"baz": "bat"})), ), "type2": ParseResult( - specification_source("myfile.xlsx", "tab2"), + specification_source(TEST_EXCEL_FILE, "tab2"), (frozendict({"whee": "whoo"}),), # tuple! ), } @@ -352,7 +357,7 @@ def test_parse_import_specifications_success(): frozendict( { "type_other": ParseResult( - specification_source("somefile.csv"), + specification_source(TEST_MOCK_CSV_FILE_NAME), (frozendict({"foo": "bar2"}), frozendict({"baz": "bat2"})), ) } @@ -360,31 +365,33 @@ def test_parse_import_specifications_success(): ) res = parse_import_specifications( - (Path("myfile.xlsx"), Path("somefile.csv")), resolver, logger + (Path(TEST_EXCEL_FILE), Path(TEST_MOCK_CSV_FILE_NAME)), resolver, logger ) assert res == ParseResults( frozendict( { "type1": ParseResult( - specification_source("myfile.xlsx", "tab1"), + specification_source(TEST_EXCEL_FILE, "tab1"), (frozendict({"foo": "bar"}), frozendict({"baz": "bat"})), ), "type2": ParseResult( - specification_source("myfile.xlsx", "tab2"), + specification_source(TEST_EXCEL_FILE, "tab2"), (frozendict({"whee": "whoo"}),), # tuple! 
), "type_other": ParseResult( - specification_source("somefile.csv"), + specification_source(TEST_MOCK_CSV_FILE_NAME), (frozendict({"foo": "bar2"}), frozendict({"baz": "bat2"})), ), } ) ) - resolver.assert_has_calls([call(Path("myfile.xlsx")), call(Path("somefile.csv"))]) - parser1.assert_called_once_with(Path("myfile.xlsx")) - parser2.assert_called_once_with(Path("somefile.csv")) + resolver.assert_has_calls( + [call(Path(TEST_EXCEL_FILE)), call(Path(TEST_MOCK_CSV_FILE_NAME))] + ) + parser1.assert_called_once_with(Path(TEST_EXCEL_FILE)) + parser2.assert_called_once_with(Path(TEST_MOCK_CSV_FILE_NAME)) logger.assert_not_called() @@ -406,13 +413,15 @@ def test_parse_import_specification_resolver_exception(): parser1.return_value = ParseResults(errors=tuple([Error(ErrorType.OTHER, "foo")])) res = parse_import_specifications( - (Path("myfile.xlsx"), Path("somefile.csv")), resolver, logger + (Path(TEST_EXCEL_FILE), Path(TEST_MOCK_CSV_FILE_NAME)), resolver, logger ) assert res == ParseResults(errors=tuple([Error(ErrorType.OTHER, "crapsticks")])) - resolver.assert_has_calls([call(Path("myfile.xlsx")), call(Path("somefile.csv"))]) - parser1.assert_called_once_with(Path("myfile.xlsx")) + resolver.assert_has_calls( + [call(Path(TEST_EXCEL_FILE)), call(Path(TEST_MOCK_CSV_FILE_NAME))] + ) + parser1.assert_called_once_with(Path(TEST_EXCEL_FILE)) # In [1]: ArithmeticError("a") == ArithmeticError("a") # Out[1]: False # so assert_called_once_with doesn't work @@ -454,7 +463,9 @@ def test_parse_import_specification_unsupported_type_and_parser_error(): ) res = parse_import_specifications( - (Path("myfile.xlsx"), Path("somefile.csv"), Path("x.jpeg")), resolver, logger + (Path(TEST_EXCEL_FILE), Path(TEST_MOCK_CSV_FILE_NAME), Path("x.jpeg")), + resolver, + logger, ) assert res == ParseResults( @@ -475,13 +486,13 @@ def test_parse_import_specification_unsupported_type_and_parser_error(): resolver.assert_has_calls( [ - call(Path("myfile.xlsx")), - call(Path("somefile.csv")), + call(Path(TEST_EXCEL_FILE)), + call(Path(TEST_MOCK_CSV_FILE_NAME)), call(Path("x.jpeg")), ] ) - parser1.assert_called_once_with(Path("myfile.xlsx")) - parser2.assert_called_once_with(Path("somefile.csv")) + parser1.assert_called_once_with(Path(TEST_EXCEL_FILE)) + parser2.assert_called_once_with(Path(TEST_MOCK_CSV_FILE_NAME)) logger.assert_not_called() @@ -506,7 +517,7 @@ def test_parse_import_specification_multiple_specs_and_parser_error(): Error(ErrorType.OTHER, "other"), Error( ErrorType.FILE_NOT_FOUND, - source_1=specification_source("myfile.xlsx"), + source_1=specification_source(TEST_EXCEL_FILE), ), ] ) @@ -543,7 +554,9 @@ def test_parse_import_specification_multiple_specs_and_parser_error(): ) res = parse_import_specifications( - (Path("myfile.xlsx"), Path("somefile.csv"), Path("x.tsv")), resolver, logger + (Path(TEST_EXCEL_FILE), Path(TEST_MOCK_CSV_FILE_NAME), Path("x.tsv")), + resolver, + logger, ) assert res == ParseResults( @@ -552,7 +565,7 @@ def test_parse_import_specification_multiple_specs_and_parser_error(): Error(ErrorType.OTHER, "other"), Error( ErrorType.FILE_NOT_FOUND, - source_1=specification_source("myfile.xlsx"), + source_1=specification_source(TEST_EXCEL_FILE), ), Error( ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, @@ -572,12 +585,12 @@ def test_parse_import_specification_multiple_specs_and_parser_error(): resolver.assert_has_calls( [ - call(Path("myfile.xlsx")), - call(Path("somefile.csv")), + call(Path(TEST_EXCEL_FILE)), + call(Path(TEST_MOCK_CSV_FILE_NAME)), call(Path("x.tsv")), ] ) - 
parser1.assert_called_once_with(Path("myfile.xlsx")) - parser2.assert_called_once_with(Path("somefile.csv")) + parser1.assert_called_once_with(Path(TEST_EXCEL_FILE)) + parser2.assert_called_once_with(Path(TEST_MOCK_CSV_FILE_NAME)) parser3.assert_called_once_with(Path("x.tsv")) logger.assert_not_called() diff --git a/tests/test_metadata.py b/tests/test_metadata.py index be22d518..3f43fcfa 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -13,6 +13,8 @@ from staging_service.metadata import ( FILE_SNIPPET_SIZE, + NOT_TEXT_FILE_VALUE, + SOURCE_JGI_IMPORT, _determine_source, _file_read_from_head, _file_read_from_tail, @@ -108,8 +110,8 @@ async def _generate_binary_file(temp_dir): assert isinstance(res["mtime"], int) assert "lineCount" in res assert res["lineCount"] is None - assert res["head"] == "not a text file" - assert res["tail"] == "not a text file" + assert res["head"] == NOT_TEXT_FILE_VALUE + assert res["tail"] == NOT_TEXT_FILE_VALUE async def test_binary_data(temp_dir: Path): @@ -147,7 +149,7 @@ async def test_determine_source_jgi_metadata_source_exists(temp_dir: Path): with open(staging_path.jgi_metadata, "w", encoding="utf-8") as data_file: data_file.write("foo") - assert _determine_source(staging_path) == "JGI import" + assert _determine_source(staging_path) == SOURCE_JGI_IMPORT def _create_validated_staging_path(staging_dir: str, file_path: str): @@ -186,7 +188,7 @@ def test_determine_source_jgi_metadata_source_exists_canonical(temp_dir: Path): with open(staging_path.jgi_metadata, "w", encoding="utf-8") as data_file: data_file.write(f".{staging_path.full_path}.{filename_base}jgi") - assert _determine_source(staging_path) == "JGI import" + assert _determine_source(staging_path) == SOURCE_JGI_IMPORT def test_determine_source_jgi_metadata_source_exists_canonical_subdir(temp_dir: Path): @@ -199,7 +201,7 @@ def test_determine_source_jgi_metadata_source_exists_canonical_subdir(temp_dir: with open(staging_path.jgi_metadata, "w", encoding="utf-8") as data_file: data_file.write(f".{staging_path.full_path}.{filename_base}jgi") - assert _determine_source(staging_path) == "JGI import" + assert _determine_source(staging_path) == SOURCE_JGI_IMPORT def test_determine_source_jgi_metadata_source_exists_canonical_deepsubdir( @@ -214,7 +216,7 @@ def test_determine_source_jgi_metadata_source_exists_canonical_deepsubdir( with open(staging_path.jgi_metadata, "w", encoding="utf-8") as data_file: data_file.write(f".{staging_path.full_path}.{filename_base}jgi") - assert _determine_source(staging_path) == "JGI import" + assert _determine_source(staging_path) == SOURCE_JGI_IMPORT def test_determine_source_jgi_metadata_source_doesnt_exist(temp_dir: StagingPath): @@ -266,7 +268,7 @@ def temp_dir2() -> Generator[Path, None, None]: def make_random_string(string_length: str) -> str: random.seed(42) possible_letters = string.ascii_letters - return "".join(random.choice(possible_letters) for i in range(string_length)) + return "".join(random.choice(possible_letters) for _ in range(string_length)) def test_read_from_head_happy(tmp_path: Path): @@ -315,7 +317,7 @@ def test_read_from_head_sad(tmp_path: Path): with open(file_path, "wb") as output_file: output_file.write(file_content) snippet = _file_read_from_head(file_path) - assert snippet == "not a text file" + assert snippet == NOT_TEXT_FILE_VALUE def test_read_from_tail_sad(tmp_path: Path): @@ -332,7 +334,7 @@ def test_read_from_tail_sad(tmp_path: Path): with open(file_path, "wb") as output_file: output_file.write(file_content) snippet = 
_file_read_from_tail(file_path) - assert snippet == "not a text file" + assert snippet == NOT_TEXT_FILE_VALUE async def test_invalid_desired_fields(temp_dir: Path): From 1afa4d1eac4654316d49f0ff445d13f5a1b6d883 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Thu, 15 Jun 2023 11:41:11 -0700 Subject: [PATCH 22/33] more code quality fixes, thanks to sonarlint [PTV-1882] --- .../import_specifications/test_file_parser.py | 60 +++++----- .../test_file_writers.py | 4 +- tests/test_app.py | 19 +-- tests/test_app_error_formatter.py | 108 ++++++++++-------- tests/test_metadata.py | 8 +- 5 files changed, 106 insertions(+), 93 deletions(-) diff --git a/tests/import_specifications/test_file_parser.py b/tests/import_specifications/test_file_parser.py index 3453dcbb..8b010654 100644 --- a/tests/import_specifications/test_file_parser.py +++ b/tests/import_specifications/test_file_parser.py @@ -35,7 +35,7 @@ def specification_source(path: str, tab: Optional[str] = None): return SpecificationSource(Path(path), tab) -def test_SpecificationSource_init_success(): +def test_specification_source_init_success(): # minimal ss = SpecificationSource(Path("foo")) @@ -49,18 +49,18 @@ def test_SpecificationSource_init_success(): assert ss.tab == "tabbytab" -def test_SpecificationSource_init_fail(): +def test_specification_source_init_fail(): # could inline this, but might as well follow the same pattern as all the other tests - assert_SpecificationSource_init_fail(None, ValueError("file is required")) + assert_specification_source_init_fail(None, ValueError("file is required")) -def assert_SpecificationSource_init_fail(file: Optional[str], expected: Exception): +def assert_specification_source_init_fail(file: Optional[str], expected: Exception): with raises(Exception) as got: SpecificationSource(file) assert_exception_correct(got.value, expected) -def test_FileTypeResolution_init_w_parser_success(): +def test_file_type_resolution_init_w_parser_success(): def parser(_: str): return ParseResults(errors=(Error(ErrorType.OTHER, "foo"),)) @@ -70,21 +70,21 @@ def parser(_: str): assert ftr.unsupported_type is None -def test_FileTypeResolution_init_w_unsupported_type_success(): +def test_file_type_resolution_init_w_unsupported_type_success(): ftr = FileTypeResolution(unsupported_type="sys") assert ftr.parser is None assert ftr.unsupported_type == "sys" -def test_FileTypeResolution_init_fail(): +def test_file_type_resolution_init_fail(): err = "Exactly one of parser or unsupported_type must be supplied" pr = ParseResults(errors=(Error(ErrorType.OTHER, "foo"),)) - fileTypeResolution_init_fail(None, None, ValueError(err)) - fileTypeResolution_init_fail(lambda path: pr, "mp-2", ValueError(err)) + assert_file_type_resolution_init_fail(None, None, ValueError(err)) + assert_file_type_resolution_init_fail(lambda path: pr, "mp-2", ValueError(err)) -def fileTypeResolution_init_fail( +def assert_file_type_resolution_init_fail( parser: Optional[Callable[[Path], ParseResults]], unexpected_type: Optional[str], expected: Exception, @@ -94,7 +94,7 @@ def fileTypeResolution_init_fail( assert_exception_correct(got.value, expected) -def test_Error_init_w_FILE_NOT_FOUND_success(): +def test_error_init_w_file_not_found_success(): # minimal e = Error(ErrorType.FILE_NOT_FOUND, source_1=specification_source("foo")) @@ -114,7 +114,7 @@ def test_Error_init_w_FILE_NOT_FOUND_success(): assert e.source_2 is None -def test_Error_init_w_PARSE_FAIL_success(): +def test_error_init_w_parse_fail_success(): e = Error( ErrorType.PARSE_FAIL, message="foo", 
source_1=specification_source("foo2") ) @@ -125,7 +125,7 @@ def test_Error_init_w_PARSE_FAIL_success(): assert e.source_2 is None -def test_Error_init_w_INCORRECT_COLUMN_COUNT_success(): +def test_error_init_w_incorrect_column_count_success(): e = Error( ErrorType.INCORRECT_COLUMN_COUNT, message="42", @@ -138,7 +138,7 @@ def test_Error_init_w_INCORRECT_COLUMN_COUNT_success(): assert e.source_2 is None -def test_Error_init_w_MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE_success(): +def test_error_init_w_multiple_specifications_for_data_type_success(): e = Error( ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, "foo", @@ -152,7 +152,7 @@ def test_Error_init_w_MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE_success(): assert e.source_2 == specification_source("yay") -def test_Error_init_w_NO_FILES_PROVIDED_success(): +def test_error_init_w_no_files_provided_success(): e = Error(ErrorType.NO_FILES_PROVIDED) assert e.error == ErrorType.NO_FILES_PROVIDED @@ -161,7 +161,7 @@ def test_Error_init_w_NO_FILES_PROVIDED_success(): assert e.source_2 is None -def test_Error_init_w_OTHER_success(): +def test_error_init_w_other_success(): # minimal e = Error(ErrorType.OTHER, message="foo") @@ -179,7 +179,7 @@ def test_Error_init_w_OTHER_success(): assert e.source_2 is None -def test_Error_init_fail(): +def test_error_init_fail(): # arguments are error type, message string, 1st source, 2nd source, exception error_init_fail(None, None, None, None, ValueError("error is required")) error_init_fail( @@ -241,24 +241,24 @@ def error_init_fail( assert_exception_correct(got.value, expected) -def test_ParseResult_init_success(): +def test_parse_result_init_success(): pr = ParseResult(specification_source("bar"), (frozendict({"foo": "bar"}),)) assert pr.source == specification_source("bar") assert pr.result == (frozendict({"foo": "bar"}),) -def test_ParseResult_init_fail(): - parseResult_init_fail(None, None, ValueError("source is required")) - parseResult_init_fail( +def test_parse_result_init_fail(): + assert_parse_result_init_fail(None, None, ValueError("source is required")) + assert_parse_result_init_fail( None, (frozendict({"foo": "bar"}),), ValueError("source is required") ) - parseResult_init_fail( + assert_parse_result_init_fail( specification_source("foo"), None, ValueError("result is required") ) -def parseResult_init_fail( +def assert_parse_result_init_fail( source: Optional[SpecificationSource], result: Optional[tuple[frozendict[str, PRIMITIVE_TYPE], ...]], expected: Exception, @@ -287,7 +287,7 @@ def parseResult_init_fail( ) -def test_ParseResults_init_w_results_success(): +def test_parse_results_init_w_results_success(): results_copy = frozendict(PR_RESULTS) # prevent identity equality pr = ParseResults(PR_RESULTS) @@ -297,7 +297,7 @@ def test_ParseResults_init_w_results_success(): assert pr == ParseResults(results_copy) -def test_ParseResults_init_w_error_success(): +def test_parse_results_init_w_error_success(): errors_copy = tuple(PR_ERROR) # prevent identity equality pr = ParseResults(errors=PR_ERROR) @@ -307,16 +307,16 @@ def test_ParseResults_init_w_error_success(): assert pr == ParseResults(errors=errors_copy) -def test_ParseResults_init_fail(): +def test_parse_results_init_fail(): err = "Exactly one of results or errors must be supplied" - parseResults_init_fail(None, None, ValueError(err)) - parseResults_init_fail(PR_RESULTS, PR_ERROR, ValueError(err)) + assert_parse_results_init_fail(None, None, ValueError(err)) + assert_parse_results_init_fail(PR_RESULTS, PR_ERROR, ValueError(err)) -def 
parseResults_init_fail( +def assert_parse_results_init_fail( results: Optional[frozendict[str, ParseResult]], errors: Optional[tuple[Error, ...]], - expected: Exception, + expected: ValueError, ): with raises(Exception) as got: ParseResults(results, errors) diff --git a/tests/import_specifications/test_file_writers.py b/tests/import_specifications/test_file_writers.py index b8c9e147..44ee28ee 100644 --- a/tests/import_specifications/test_file_writers.py +++ b/tests/import_specifications/test_file_writers.py @@ -19,6 +19,8 @@ check_excel_contents, ) +# pylint: disable=C0116 + @fixture(scope="module") def temp_dir() -> Generator[Path, None, None]: @@ -323,7 +325,7 @@ def test_file_writers_fail(): ) -def file_writers_fail(path: Path, types: dict, expected: Exception): +def file_writers_fail(path: Path | None, types: dict | None, expected: Exception): with raises(Exception) as got: write_csv(path, types) assert_exception_correct(got.value, expected) diff --git a/tests/test_app.py b/tests/test_app.py index d9089b6c..b3648d97 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -41,6 +41,8 @@ utils.StagingPath._DATA_DIR = DATA_DIR utils.StagingPath._META_DIR = META_DIR +# pylint: disable=C0116 + def asyncgiven(**kwargs): """alternative to hypothesis.given decorator for async""" @@ -641,7 +643,7 @@ async def test_list(get_user): # testing list dot-files fs.make_file(os.path.join(username, "test", ".test_file_1"), txt) - # f5 = fs.make_file(os.path.join(username, 'test', '.globus_id'), txt) + res6 = await cli.get("/list/", headers={"Authorization": ""}) assert res6.status == 200 json_text = await res6.text() @@ -1047,7 +1049,6 @@ async def test_importer_mappings(get_user): # As we update the app mappings this test may need to be changed # Or we need to reload json file itself - # unzip_mapping = AutoDetectUtils._MAPPINGS["apps"]["decompress/unpack"] assert ( mappings[1][0] == AutoDetectUtils.get_mappings_by_extension("gz")["mappings"][0] @@ -1119,7 +1120,7 @@ async def test_bulk_specification_success(get_user): ] ) excel = "importspec.xlsx" - with pandas.ExcelWriter(base / excel) as exw: + with pandas.ExcelWriter(base / excel) as exw: # pylint: disable=E0110 df = pandas.DataFrame( [ ["Data type: fruit_bats; Columns: 2; Version: 1"], @@ -1255,7 +1256,7 @@ async def test_bulk_specification_fail_parse_fail(get_user): ] ) excel = "stuff.xlsx" - with pandas.ExcelWriter(base / excel) as exw: + with pandas.ExcelWriter(base / excel) as exw: # pylint: disable=E0110 # this one is fine df = pandas.DataFrame( [ @@ -1366,7 +1367,7 @@ async def test_bulk_specification_fail_column_count(get_user): ] ) excel = "stuff.xlsx" - with pandas.ExcelWriter(base / excel) as exw: + with pandas.ExcelWriter(base / excel) as exw: # pylint: disable=E0110 # this one has an extra column in the last row df = pandas.DataFrame( [ @@ -1452,7 +1453,7 @@ async def test_bulk_specification_fail_multiple_specs_per_type(get_user): ] ) excel = "stuff.xlsx" - with pandas.ExcelWriter(base / excel) as exw: + with pandas.ExcelWriter(base / excel) as exw: # pylint: disable=E0110 # this data type is also breakfastcereals, so will cause an error df = pandas.DataFrame( [ @@ -1516,7 +1517,7 @@ async def test_bulk_specification_fail_multiple_specs_per_type_excel(get_user): fu.make_dir(username) base = Path(fu.base_dir) / username excel = "stuff.xlsx" - with pandas.ExcelWriter(base / excel) as exw: + with pandas.ExcelWriter(base / excel) as exw: # pylint: disable=E0110 df = pandas.DataFrame( [ ["Data type: breakfastcereals; Columns: 2; Version: 
1"], @@ -1734,7 +1735,7 @@ async def test_write_bulk_specification_success_excel(get_user): @pytest.mark.asyncio @patch("staging_service.auth2Client.KBaseAuth2.get_user") async def test_write_bulk_specification_fail_wrong_data_type(get_user): - get_user.return_value = "dontcare" + get_user.return_value = "do_not_care" async with AppClient() as cli: resp = await cli.post("write_bulk_specification/", data="foo") js = await resp.json() @@ -1758,7 +1759,7 @@ async def test_write_bulk_specification_fail_no_content_length(get_user): @pytest.mark.asyncio @patch("staging_service.auth2Client.KBaseAuth2.get_user") async def test_write_bulk_specification_fail_large_input(get_user): - get_user.return_value = "idont" + get_user.return_value = "i_do_not" async with AppClient() as cli: resp = await cli.post("write_bulk_specification/", json="a" * (1024 * 1024 - 2)) txt = await resp.text() diff --git a/tests/test_app_error_formatter.py b/tests/test_app_error_formatter.py index a700fd9b..5a1aa3ff 100644 --- a/tests/test_app_error_formatter.py +++ b/tests/test_app_error_formatter.py @@ -1,7 +1,13 @@ from pathlib import Path from staging_service.app_error_formatter import format_import_spec_errors -from staging_service.import_specifications.file_parser import ErrorType, Error, SpecificationSource +from staging_service.import_specifications.file_parser import ( + Error, + ErrorType, + SpecificationSource, +) + +# pylint: disable=C0116 def _ss(file: str, tab: str = None) -> SpecificationSource: @@ -15,10 +21,11 @@ def test_format_import_spec_errors_no_input(): def test_format_import_spec_errors_one_error(): errors = [Error(ErrorType.OTHER, "foobar")] assert format_import_spec_errors(errors, {}) == [ - {"type": "unexpected_error", - "message": "foobar", - "file": None, - } + { + "type": "unexpected_error", + "message": "foobar", + "file": None, + } ] @@ -31,7 +38,7 @@ def test_format_import_spec_errors_all_the_errors_no_tabs(): ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, "foobar4", _ss("file4"), - _ss("file5") + _ss("file5"), ), Error(ErrorType.NO_FILES_PROVIDED), Error(ErrorType.FILE_NOT_FOUND, source_1=_ss("file6")), @@ -45,31 +52,31 @@ def test_format_import_spec_errors_all_the_errors_no_tabs(): Path("file6"): Path("f6"), } assert format_import_spec_errors(errors, paths) == [ - {"type": "unexpected_error", - "message": "foobar1", - "file": "f1", - }, - {"type": "cannot_parse_file", - "message": "foobar2", - "file": "f2", - "tab": None - }, - {"type": "incorrect_column_count", - "message": "foobar3", - "file": "f3", - "tab": None, - }, - {"type": "multiple_specifications_for_data_type", - "message": "foobar4", - "file_1": "f4", - "tab_1": None, - "file_2": "f5", - "tab_2": None, - }, + { + "type": "unexpected_error", + "message": "foobar1", + "file": "f1", + }, + {"type": "cannot_parse_file", "message": "foobar2", "file": "f2", "tab": None}, + { + "type": "incorrect_column_count", + "message": "foobar3", + "file": "f3", + "tab": None, + }, + { + "type": "multiple_specifications_for_data_type", + "message": "foobar4", + "file_1": "f4", + "tab_1": None, + "file_2": "f5", + "tab_2": None, + }, {"type": "no_files_provided"}, - {"type": "cannot_find_file", - "file": "f6", - }, + { + "type": "cannot_find_file", + "file": "f6", + }, ] @@ -81,7 +88,7 @@ def test_format_import_spec_errors_all_the_errors_with_tabs(): ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, "foobar3", _ss("file3", "tab3"), - _ss("file4", "tab4") + _ss("file4", "tab4"), ), ] paths = { @@ -91,21 +98,24 @@ def 
test_format_import_spec_errors_all_the_errors_with_tabs(): Path("file4"): Path("f4"), } assert format_import_spec_errors(errors, paths) == [ - {"type": "cannot_parse_file", - "message": "foobar1", - "file": "f1", - "tab": "tab1" - }, - {"type": "incorrect_column_count", - "message": "foobar2", - "file": "f2", - "tab": "tab2", - }, - {"type": "multiple_specifications_for_data_type", - "message": "foobar3", - "file_1": "f3", - "tab_1": "tab3", - "file_2": "f4", - "tab_2": "tab4", - }, - ] \ No newline at end of file + { + "type": "cannot_parse_file", + "message": "foobar1", + "file": "f1", + "tab": "tab1", + }, + { + "type": "incorrect_column_count", + "message": "foobar2", + "file": "f2", + "tab": "tab2", + }, + { + "type": "multiple_specifications_for_data_type", + "message": "foobar3", + "file_1": "f3", + "tab_1": "tab3", + "file_2": "f4", + "tab_2": "tab4", + }, + ] diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 3f43fcfa..7c070f61 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -279,7 +279,7 @@ def test_read_from_head_happy(tmp_path: Path): # we just happen to know this from metadata.py... snippet_length = FILE_SNIPPET_SIZE for case in cases: - file_path = os.path.join(tmp_path, "foo.txt") + file_path = os.path.join(tmp_path, "happy_head.txt") file_content = make_random_string(case) with open(file_path, "w", encoding="utf-8") as output_file: output_file.write(file_content) @@ -295,7 +295,7 @@ def test_read_from_tail_happy(tmp_path: Path): # we just happen to know this from metadata.py... snippet_length = FILE_SNIPPET_SIZE for case in cases: - file_path = os.path.join(tmp_path, "foo.txt") + file_path = os.path.join(tmp_path, "happy_tail.txt") file_content = make_random_string(case) with open(file_path, "w", encoding="utf-8") as output_file: output_file.write(file_content) @@ -312,7 +312,7 @@ def test_read_from_head_sad(tmp_path: Path): # we just happen to know this from metadata.py... # snippet_length = 1024 for case in cases: - file_path = os.path.join(tmp_path, "foo.txt") + file_path = os.path.join(tmp_path, "sad_head.txt") file_content = case with open(file_path, "wb") as output_file: output_file.write(file_content) @@ -329,7 +329,7 @@ def test_read_from_tail_sad(tmp_path: Path): # we just happen to know this from metadata.py... 
# snippet_length = 1024 for case in cases: - file_path = os.path.join(tmp_path, "foo.txt") + file_path = os.path.join(tmp_path, "sad_tail.txt") file_content = case with open(file_path, "wb") as output_file: output_file.write(file_content) From ba42da082231d050e9d2e16e48e764e527aceec5 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Thu, 15 Jun 2023 11:42:25 -0700 Subject: [PATCH 23/33] fix wrong parameter [PTV-1882] --- staging_service/app.py | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/staging_service/app.py b/staging_service/app.py index 7268e9b0..7b18a852 100644 --- a/staging_service/app.py +++ b/staging_service/app.py @@ -11,32 +11,23 @@ import aiohttp_cors from aiohttp import MultipartReader, web -from staging_service.config import ( - UPLOAD_SAVE_STRATEGY_SAVE_TO_DESTINATION, - UPLOAD_SAVE_STRATEGY_TEMP_THEN_COPY, - get_config, - get_max_content_length, - get_max_file_size, - get_read_chunk_size, - get_save_strategy, -) +from staging_service.config import (UPLOAD_SAVE_STRATEGY_SAVE_TO_DESTINATION, + UPLOAD_SAVE_STRATEGY_TEMP_THEN_COPY, + get_config, get_max_content_length, + get_max_file_size, get_read_chunk_size, + get_save_strategy) from .app_error_formatter import format_import_spec_errors from .autodetect.Mappings import CSV, EXCEL, TSV from .AutoDetectUtils import AutoDetectUtils from .globus import assert_globusid_exists, is_globusid -from .import_specifications.file_parser import ( - ErrorType, - FileTypeResolution, - parse_import_specifications, -) -from .import_specifications.file_writers import ( - ImportSpecWriteException, - write_csv, - write_excel, - write_tsv, -) -from .import_specifications.individual_parsers import parse_csv, parse_excel, parse_tsv +from .import_specifications.file_parser import (ErrorType, FileTypeResolution, + parse_import_specifications) +from .import_specifications.file_writers import (ImportSpecWriteException, + write_csv, write_excel, + write_tsv) +from .import_specifications.individual_parsers import (parse_csv, parse_excel, + parse_tsv) from .JGIMetadata import read_metadata_for from .metadata import add_upa, dir_info, similar, some_metadata from .utils import AclManager, StagingPath, auth_client, run_command @@ -274,7 +265,7 @@ async def test_auth(request: web.Request): @routes.get("/file-lifetime") -async def file_lifetime(parameter_list): +async def file_lifetime(_: web.Request): return web.Response(text=os.environ["FILE_LIFETIME"]) From 61a2f0390c476711df2ff1f95d2053a2b5bc0471 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Thu, 15 Jun 2023 11:56:35 -0700 Subject: [PATCH 24/33] don't run local tool container as root [PTV-1882] --- development/tools/runner/Dockerfile | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/development/tools/runner/Dockerfile b/development/tools/runner/Dockerfile index 00cfbd93..5f3b5e7f 100644 --- a/development/tools/runner/Dockerfile +++ b/development/tools/runner/Dockerfile @@ -7,7 +7,7 @@ RUN apt-get update && \ SHELL ["/bin/bash", "-c"] -RUN mkdir -p /app && chmod -R a+rw /app +RUN mkdir -p /app WORKDIR /app @@ -15,11 +15,13 @@ WORKDIR /app COPY ./requirements.txt /app/requirements.txt RUN python -m pip install "pip==23.1.2" && pip install -r /app/requirements.txt -COPY ./globus.cfg /etc/globus.cfg -RUN touch /var/log/globus.log && chmod 777 /var/log/globus.log - ENV PATH="/app:${PATH}" +# Run as a regular user, not root. 
+RUN addgroup --system kbapp && \ + adduser --system --ingroup kbapp kbapp && \ + chown -R kbapp:kbapp /app + ENTRYPOINT [ "/app/development/tools/runner/entrypoint.sh" ] CMD [ "bash" ] \ No newline at end of file From 428a928747143ee131fe708a9e88e8ca59949b1f Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Thu, 15 Jun 2023 12:11:15 -0700 Subject: [PATCH 25/33] does NOSONAR work? [PTV-1882] --- tests/test_metadata.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 7c070f61..4bef7ce7 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -268,7 +268,9 @@ def temp_dir2() -> Generator[Path, None, None]: def make_random_string(string_length: str) -> str: random.seed(42) possible_letters = string.ascii_letters - return "".join(random.choice(possible_letters) for _ in range(string_length)) + return "".join( + random.choice(possible_letters) for _ in range(string_length) # NOSONAR + ) def test_read_from_head_happy(tmp_path: Path): From b3608630c55a2bb384f6d4687c766316ebd57c7f Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Thu, 15 Jun 2023 12:15:52 -0700 Subject: [PATCH 26/33] exclude usage of md5() fom sonarcloud [PTV-1882] --- staging_service/metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/staging_service/metadata.py b/staging_service/metadata.py index ca264412..6f68bc39 100644 --- a/staging_service/metadata.py +++ b/staging_service/metadata.py @@ -174,7 +174,7 @@ async def _generate_metadata(path: StagingPath, source: str = None): last_chunk_read = None chunk_count = 0 - md5 = hashlib.md5() # noqua: S303 + md5 = hashlib.md5() # NOSONAR async with aiofiles.open(path.full_path, "rb") as fin: while True: chunk = await fin.read(READ_BUFFER_SIZE) From d2ff902e964a4a4f039917742625b14945c1721b Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Thu, 15 Jun 2023 12:35:27 -0700 Subject: [PATCH 27/33] run as kbapp [PTV-1882] --- development/tools/runner/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/development/tools/runner/Dockerfile b/development/tools/runner/Dockerfile index 5f3b5e7f..d2c4f374 100644 --- a/development/tools/runner/Dockerfile +++ b/development/tools/runner/Dockerfile @@ -22,6 +22,8 @@ RUN addgroup --system kbapp && \ adduser --system --ingroup kbapp kbapp && \ chown -R kbapp:kbapp /app +USER kbapp + ENTRYPOINT [ "/app/development/tools/runner/entrypoint.sh" ] CMD [ "bash" ] \ No newline at end of file From 571de6f655426d86ba74125dbefee2d13e918d23 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Thu, 15 Jun 2023 15:59:05 -0700 Subject: [PATCH 28/33] codacy can get some satisfaction? 
[PTV-1882] --- staging_service/metadata.py | 3 ++- tests/test_helpers.py | 2 +- tests/test_metadata.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/staging_service/metadata.py b/staging_service/metadata.py index 6f68bc39..51a92d91 100644 --- a/staging_service/metadata.py +++ b/staging_service/metadata.py @@ -174,7 +174,8 @@ async def _generate_metadata(path: StagingPath, source: str = None): last_chunk_read = None chunk_count = 0 - md5 = hashlib.md5() # NOSONAR + # Below we disable first sonarcloud, second bandit (codacy) + md5 = hashlib.md5() # NOSONAR nosec async with aiofiles.open(path.full_path, "rb") as fin: while True: chunk = await fin.read(READ_BUFFER_SIZE) diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 8f23525e..5b7ed70e 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -52,7 +52,7 @@ def bootstrap_config(): def assert_exception_correct(got: Exception, expected: Exception): err = "".join(traceback.TracebackException.from_exception(got).format()) assert got.args == expected.args, err - assert type(got) == type(expected) + assert isinstance(got, type(expected)) def assert_file_contents(file: Path, lines: list[str]): diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 4bef7ce7..2b0cf58b 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -269,7 +269,7 @@ def make_random_string(string_length: str) -> str: random.seed(42) possible_letters = string.ascii_letters return "".join( - random.choice(possible_letters) for _ in range(string_length) # NOSONAR + random.choice(possible_letters) for _ in range(string_length) # NOSONAR nosec ) From 00a0d43195dd5be4ca7d90fde9f04679aac11593 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Thu, 15 Jun 2023 17:02:25 -0700 Subject: [PATCH 29/33] maybe this will make bandit happy [PTV-1882] --- staging_service/metadata.py | 2 +- tests/test_metadata.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/staging_service/metadata.py b/staging_service/metadata.py index 51a92d91..24c44f9d 100644 --- a/staging_service/metadata.py +++ b/staging_service/metadata.py @@ -175,7 +175,7 @@ async def _generate_metadata(path: StagingPath, source: str = None): chunk_count = 0 # Below we disable first sonarcloud, second bandit (codacy) - md5 = hashlib.md5() # NOSONAR nosec + md5 = hashlib.md5() # NOSONAR #nosec async with aiofiles.open(path.full_path, "rb") as fin: while True: chunk = await fin.read(READ_BUFFER_SIZE) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 2b0cf58b..22baf494 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -269,7 +269,7 @@ def make_random_string(string_length: str) -> str: random.seed(42) possible_letters = string.ascii_letters return "".join( - random.choice(possible_letters) for _ in range(string_length) # NOSONAR nosec + random.choice(possible_letters) for _ in range(string_length) # NOSONAR #nosec ) From 9ad78e447d452da19ed97e1e75195294bf601aab Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Fri, 16 Jun 2023 16:15:38 -0700 Subject: [PATCH 30/33] paper over corrupt metadata file (race condition), write json not writelines [PTV-1882] --- staging_service/metadata.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/staging_service/metadata.py b/staging_service/metadata.py index 24c44f9d..1b47c43d 100644 --- a/staging_service/metadata.py +++ b/staging_service/metadata.py @@ -1,4 +1,5 @@ import hashlib +import json import os import unicodedata from difflib import 
SequenceMatcher @@ -82,9 +83,14 @@ async def _read_metadata(metadata_path: str): """ Reads the file at the provided path, and decodes to JSON. """ - async with aiofiles.open(metadata_path, mode="r", encoding="utf-8") as extant: - data = await extant.read() - return decoder.decode(data) + try: + async with aiofiles.open(metadata_path, mode="r", encoding="utf-8") as extant: + data = await extant.read() + return decoder.decode(data) + except json.decoder.JSONDecodeError: + # The metadata file is either corrupted or is currently being written. + # TODO: can we distinguish these cases, or redesign things? + return {} async def _get_metadata(metadata_path: str): @@ -224,8 +230,8 @@ async def _save_metadata(path: StagingPath, metadata: dict): """ Saves the given metadata dictionary into the metadata file associated with the given path """ - async with aiofiles.open(path.metadata_path, mode="w") as f: - await f.writelines(encoder.encode(metadata)) + async with aiofiles.open(path.metadata_path, mode="w") as metadata_file: + await metadata_file.write(encoder.encode(metadata)) async def _update_metadata(path: StagingPath, additional_metadata: dict) -> dict: From e22b98a75200a8b5566e9ad461fb2a6703963a3a Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Fri, 16 Jun 2023 16:20:23 -0700 Subject: [PATCH 31/33] handle temp file better, as well as canceled uploads [PTV-1882] --- staging_service/app.py | 66 +++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 26 deletions(-) diff --git a/staging_service/app.py b/staging_service/app.py index 7b18a852..4c0ec5fd 100644 --- a/staging_service/app.py +++ b/staging_service/app.py @@ -451,38 +451,47 @@ async def _handle_upload_save_to_destination( chunk_size: int, max_file_size: int, ): - async with aiofiles.tempfile.NamedTemporaryFile( - "wb", - delete=False, - dir=os.path.dirname(destination_path.full_path), - prefix=".upload.", - ) as output_file: - temp_file_name = output_file.name - actual_size = 0 - while True: - if actual_size > max_file_size: - raise web.HTTPBadRequest( - text=( - f"file size reached {actual_size:,} bytes which exceeds " - + f"the maximum allowed of {max_file_size:,} bytes)" + try: + async with aiofiles.tempfile.NamedTemporaryFile( + "wb", + delete=True, + dir=os.path.dirname(destination_path.full_path), + prefix=".upload.", + ) as output_file: + temp_file_name = output_file.name + actual_size = 0 + while True: + if actual_size > max_file_size: + raise web.HTTPBadRequest( + text=( + f"file size reached {actual_size:,} bytes which exceeds " + + f"the maximum allowed of {max_file_size:,} bytes)" + ) ) - ) - # A chunk size of 1MB seems to be a bit faster than the default of 8Kb. - chunk = await stream.read_chunk(size=chunk_size) + # A chunk size of 1MB seems to be a bit faster than the default of 8Kb. + chunk = await stream.read_chunk(size=chunk_size) - # Chunk is always a "bytes" object, but will be empty if there is nothing else - # to read, which is considered Falsy. - if not chunk: - break + # Chunk is always a "bytes" object, but will be empty if there is nothing else + # to read, which is considered Falsy. 
+ if not chunk: + break - actual_size += len(chunk) + actual_size += len(chunk) - await output_file.write( - chunk, - ) + await output_file.write( + chunk, + ) - shutil.move(temp_file_name, destination_path.full_path) + shutil.move(temp_file_name, destination_path.full_path) + except FileNotFoundError: + # this is a normal condition after the file is moved + pass + except ConnectionResetError: + # this is a normal condition if the file transfer is canceled, + # or the Narrative is closed during upload. + # Although it may indicate other problems as well. + pass async def _handle_upload_save_to_temp( @@ -623,6 +632,10 @@ async def upload_files_chunked(request: web.Request): # allowing us to ditch XHR. await _handle_upload_save(user_file_part, path) + # This may be triggered by an canceled or otherwise interrupted upload. + # Note that now that we save to temp then move/copy to destination, this + # works, prior to this the file was written directly, which could leave a partial + # file. if not os.path.exists(path.full_path): error_msg = "We are sorry but upload was interrupted. Please try again." raise web.HTTPNotFound(text=error_msg) @@ -779,6 +792,7 @@ async def authorize_request(request): else: # this is a hack for prod because kbase_session won't get shared with the kbase.us domain token = request.cookies.get("kbase_session_backup") + username = await auth_client().get_user(token) await assert_globusid_exists(username, token) return username From 3140c0a711a166a2e6b214b7389da02ebda99315 Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Fri, 16 Jun 2023 16:20:49 -0700 Subject: [PATCH 32/33] add section for running dev server [PTV-1882] --- docs/testing.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/testing.md b/docs/testing.md index 4b6a0af4..7bc8b7a4 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -37,6 +37,14 @@ python -m venv venv python -m pip install --upgrade pip ``` +## Running Dev Server + +For integration tests, the server can be stood up inside a docker container: + +```shell +./development/scripts/run-dev-server.sh +``` + ### install deps ```shell From 5c36473778c3524869c3f066227df5c1af5b67cd Mon Sep 17 00:00:00 2001 From: Erik Pearson Date: Fri, 16 Jun 2023 16:21:22 -0700 Subject: [PATCH 33/33] dev 'run' command improvement [PTV-1882] --- development/scripts/run | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/development/scripts/run b/development/scripts/run index c63ddd7d..219486f2 100755 --- a/development/scripts/run +++ b/development/scripts/run @@ -1,6 +1,8 @@ #!/bin/sh +PROJECT_DIRECTORY=${DIR:-$PWD} +echo "Project Directory: ${PROJECT_DIRECTORY}" docker compose \ - -f "development/tools/runner/docker-compose.yml" \ - --project-directory "${PWD}" \ + -f "${PROJECT_DIRECTORY}/development/tools/runner/docker-compose.yml" \ + --project-directory "${PROJECT_DIRECTORY}" \ --project-name tools \ run --rm runner "${@:1}" \ No newline at end of file
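
A brief usage sketch for the updated `run` helper in the final patch above. The `DIR` override and the script path come from the patch itself; the specific commands passed to the runner (for example `pytest`) are illustrative assumptions, not part of the change.

```shell
# From the repository root; DIR defaults to $PWD, as in the patched script
./development/scripts/run pytest tests

# Point the runner at another checkout by overriding DIR
DIR=/path/to/staging_service ./development/scripts/run bash
```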