diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..532e66e4 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,3 @@ +# Even though the Dockerfile only copies over necessary files, certain +# artifacts may creep in that we should exclude from the image. +.DS_Store diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index 2a2996a5..9e9f7120 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -1,22 +1,17 @@ # This workflow will install Python dependencies, run tests and lint with a single version of Python # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions - name: Run Staging Service Tests - on: [pull_request] - jobs: build: - runs-on: ubuntu-latest - steps: - uses: actions/checkout@v2 - - name: Set up Python 3.9 + - name: Set up Python 3.11 uses: actions/setup-python@v2 with: - python-version: 3.9 + python-version: 3.11 - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.gitignore b/.gitignore index 00b31a50..825fe74b 100644 --- a/.gitignore +++ b/.gitignore @@ -86,6 +86,7 @@ celerybeat-schedule .venv venv/ ENV/ +Pipfile # Spyder project settings .spyderproject @@ -100,8 +101,15 @@ ENV/ # mypy .mypy_cache/ +# IDEs +.idea +.fleet + +# misc + data/ .DS_Store .virtualenvs/ test.env -run_tests_single.sh \ No newline at end of file +run_tests_single.sh +local_work/ \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index acbf5f1f..d798ebc1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,37 +1,33 @@ -FROM python:3.9-slim-buster +FROM python:3.11.4-slim-buster # ----------------------------------------- RUN mkdir -p /kb/deployment/lib -RUN apt-get update && apt-get install -y --no-install-recommends apt-utils -RUN apt-get install -y zip && \ - apt-get install -y unzip && \ - apt-get install -y bzip2 && \ - apt-get install -y libmagic-dev - - -RUN apt-get install -y htop wget +RUN apt-get update && \ + apt-get install -y --no-install-recommends zip=3.0-11+b1 unzip=6.0-23+deb10u3 bzip2=1.0.6-9.2~deb10u2 libmagic-dev=1:5.35-4+deb10u2 htop=2.2.0-1+b1 wget=1.20.1-1.1 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* +# Standard simplified python setup. 
COPY ./requirements.txt /requirements.txt -RUN pip install -r /requirements.txt +RUN python -m pip install "pip==23.1.2" && pip install -r /requirements.txt -COPY ./ /kb/module COPY ./globus.cfg /etc/globus.cfg RUN touch /var/log/globus.log && chmod 777 /var/log/globus.log -RUN cp -r /kb/module/staging_service /kb/deployment/lib -RUN cp -r /kb/module/deployment /kb +COPY ./staging_service /kb/deployment/lib/staging_service +COPY ./deployment /kb/deployment +# TODO: should be port 5000 to match other kbase services EXPOSE 3000 WORKDIR /kb/deployment/lib - # The BUILD_DATE value seem to bust the docker cache when the timestamp changes, move to # the end LABEL org.label-schema.build-date=$BUILD_DATE \ - org.label-schema.vcs-url="https://github.com/kbase/staging_service.git" \ - org.label-schema.vcs-ref=$VCS_REF \ - org.label-schema.schema-version="1.1.8" \ - us.kbase.vcs-branch=$BRANCH \ - maintainer="Steve Chan sychan@lbl.gov" + org.label-schema.vcs-url="https://github.com/kbase/staging_service.git" \ + org.label-schema.vcs-ref=$VCS_REF \ + org.label-schema.schema-version="1.1.8" \ + us.kbase.vcs-branch=$BRANCH \ + maintainer="Steve Chan sychan@lbl.gov" ENTRYPOINT ["/kb/deployment/bin/entrypoint.sh"] diff --git a/deployment/bin/entrypoint.sh b/deployment/bin/entrypoint.sh index 787b2446..5628a483 100755 --- a/deployment/bin/entrypoint.sh +++ b/deployment/bin/entrypoint.sh @@ -3,17 +3,40 @@ # please also update the launch.json for vscode accordingly # the things that must be kept in sync are KB_DEPLOYMENT_CONFIG and PYTHONPATH -#top section for local running -DIR="$( cd "$( dirname "$0" )" && pwd )" -if [ -d "$DIR/../../staging_service" ]; then - PYTHONPATH="$DIR/../../staging_service" - export KB_DEPLOYMENT_CONFIG="$DIR/../conf/local.cfg" - export FILE_LIFETIME="90" -fi -#bottom section for running inside docker if [ -d "kb/deployment/lib/staging_service" ]; then - PYTHONPATH="kb/deployment/lib/staging_service" - # environment variable for KB_DEPLOYMENT_CONFIG set in docker-compose.yml + echo "staging_service not installed in /kb/deployment/lib as expected" + exit 1 fi -python3 -m staging_service \ No newline at end of file + +PYTHONPATH="kb/deployment/lib/staging_service" + +# disabled in favor of running behind gunicorn +# python3 -m staging_service + +# The port at which to run this service. +# TODO: this should probably be 5000 in order to operate like other KBase services. +# Who likes surprises like this? +SERVICE_PORT="${SERVICE_PORT:-3000}" + +# See https://docs.gunicorn.org/en/latest/run.html +# Recommended to have 2-4 workers per core. Default assumes 1 core. +GUNICORN_WORKERS="${GUNICORN_WORKERS:-4}" + +# Workers silent for more than this many seconds are killed and restarted. +# This setting affects the /upload endpoint, as it processes files after +# upload, calculating the md5 and potentially, in the future, additional tasks +# such as validating the file. +# Other endpoints may encounter a delay as well, as any endpoint which access +# metadata may trigger a metadata rebuild, including the md5. +# Default to 10 minutes, a generous amount of time to wait for this to complete. +# Alternative designs could solve this in a different way. 
+GUNICORN_TIMEOUT="${GUNICORN_TIMEOUT:-600}" + +# TODO: should be port 5000 to match other kbase services +gunicorn staging_service.main:web_app \ + --bind 0.0.0.0:"${SERVICE_PORT}" \ + --worker-class aiohttp.GunicornWebWorker \ + --pythonpath "${PYTHONPATH}" \ + --workers "${GUNICORN_WORKERS}" \ + --timeout "${GUNICORN_TIMEOUT}" \ No newline at end of file diff --git a/development/docker-compose-kbase-ui.yml b/development/docker-compose-kbase-ui.yml new file mode 100644 index 00000000..7efb10d4 --- /dev/null +++ b/development/docker-compose-kbase-ui.yml @@ -0,0 +1,27 @@ +# This docker compose file works with kbase-ui - specifically the kbase-dev +# network that allows proxying service requests from kbase-ui and the narrative. +version: "3.8" +networks: + kbase-dev: + name: kbase-dev +services: + staging_service: + hostname: staging_service + build: + context: . + ports: + # TODO: should be port 5000 to match other kbase services + - "3010:3000" + volumes: + - "./data:/kb/deployment/lib/src/data" + - "./staging_service:/kb/deployment/lib/staging_service" + networks: + - kbase-dev + dns: + - 8.8.8.8 + environment: + - KB_DEPLOYMENT_CONFIG=/kb/deployment/conf/deployment.cfg + - FILE_LIFETIME=90 + # - SAVE_STRATEGY=SAVE_STRATEGY_TEMP_THEN_COPY + # - SAVE_STRATEGY=SAVE_STRATEGY_SAVE_TO_DESTINATION + # - CHUNK_SIZE= diff --git a/development/scripts/README.md b/development/scripts/README.md new file mode 100644 index 00000000..3f18d5b3 --- /dev/null +++ b/development/scripts/README.md @@ -0,0 +1,13 @@ +# Scripts + +## `generate-data-files.py` + +A small tool, an example function really, to generate files for manual testing of upload and download. I use it by copy/pasting into a Python REPL session in the directory in which I want to generate files, and just run the function with the required parameters. + +## `list-uploads.sh` + +Handy for listing the contents of the `./data` directory while developing the `/upload` endpoint of the server locally. It simply lists the directory every 0.5s. + +## `run` + +TODO diff --git a/development/scripts/generate-data-files.py b/development/scripts/generate-data-files.py new file mode 100644 index 00000000..ceebe63d --- /dev/null +++ b/development/scripts/generate-data-files.py @@ -0,0 +1,15 @@ +import io + + +def generate_null_file(size: int, name: str): + """ + Generate a file full of 0 bytes, for testing different + file size scenarios. + + e.g. generate_null_file(5100000000, "5.1g.out") + """ + with open(name, "wb") as out: + bw = io.BufferedWriter(out, 10000000) + for _ in range(size): + bw.write(b"\0") + bw.flush() diff --git a/development/scripts/list-uploads.sh b/development/scripts/list-uploads.sh new file mode 100755 index 00000000..10f87f8e --- /dev/null +++ b/development/scripts/list-uploads.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# +# A handy script to display the contents of the data directory in a +# terminal window, refreshing every 0.5 seconds and excluding +# any macOS .DS_Store Desktop Services files. +# +while : +do + clear + find data -type f \( ! 
-name ".DS_Store" \) -ls + sleep 0.5 +done \ No newline at end of file diff --git a/development/scripts/run b/development/scripts/run new file mode 100755 index 00000000..219486f2 --- /dev/null +++ b/development/scripts/run @@ -0,0 +1,8 @@ +#!/bin/sh +PROJECT_DIRECTORY=${DIR:-$PWD} +echo "Project Directory: ${PROJECT_DIRECTORY}" +docker compose \ + -f "${PROJECT_DIRECTORY}/development/tools/runner/docker-compose.yml" \ + --project-directory "${PROJECT_DIRECTORY}" \ + --project-name tools \ + run --rm runner "${@:1}" \ No newline at end of file diff --git a/development/scripts/run-dev-server.sh b/development/scripts/run-dev-server.sh new file mode 100755 index 00000000..6c2f6d65 --- /dev/null +++ b/development/scripts/run-dev-server.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +echo "Running server in development..." +docker compose \ + -f development/docker-compose-kbase-ui.yml \ + --project-directory "${PWD}" \ + run --rm staging_service \ No newline at end of file diff --git a/development/tools/runner/Dockerfile b/development/tools/runner/Dockerfile new file mode 100644 index 00000000..d2c4f374 --- /dev/null +++ b/development/tools/runner/Dockerfile @@ -0,0 +1,29 @@ +FROM python:3.11.4-slim-buster +# ----------------------------------------- +RUN apt-get update && \ + apt-get install -y --no-install-recommends zip=3.0-11+b1 unzip=6.0-23+deb10u3 bzip2=1.0.6-9.2~deb10u2 libmagic-dev=1:5.35-4+deb10u2 htop=2.2.0-1+b1 wget=1.20.1-1.1 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +SHELL ["/bin/bash", "-c"] + +RUN mkdir -p /app + +WORKDIR /app + +# Standard simplified python setup. +COPY ./requirements.txt /app/requirements.txt +RUN python -m pip install "pip==23.1.2" && pip install -r /app/requirements.txt + +ENV PATH="/app:${PATH}" + +# Run as a regular user, not root. +RUN addgroup --system kbapp && \ + adduser --system --ingroup kbapp kbapp && \ + chown -R kbapp:kbapp /app + +USER kbapp + +ENTRYPOINT [ "/app/development/tools/runner/entrypoint.sh" ] + +CMD [ "bash" ] \ No newline at end of file diff --git a/development/tools/runner/docker-compose.yml b/development/tools/runner/docker-compose.yml new file mode 100644 index 00000000..f1b8e1ea --- /dev/null +++ b/development/tools/runner/docker-compose.yml @@ -0,0 +1,10 @@ +version: '3.8' +services: + runner: + build: + context: . + dockerfile: development/tools/runner/Dockerfile + volumes: + - .:/app + command: + - bash diff --git a/development/tools/runner/entrypoint.sh b/development/tools/runner/entrypoint.sh new file mode 100755 index 00000000..2ba1545b --- /dev/null +++ b/development/tools/runner/entrypoint.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +set -e + +# Nice to set this up for all future python code being run. +export PYTHONPATH="${PWD}/src" + +# +# This execs whatever is provided as a COMMAND to the container. By default, as established +# in the image via the Dockerfile, it is scripts/start-server.sh +# This mechanism, though, allows us to run anything inside the container. We use this +# for running tests, mypy, black, etc. through docker compose. +# Note that surrounding $@ with double quote causes the expansion (which starts with +# parameter 1 not 0) to result in each parameter being retained without splitting. +# Sort of like "$@" -> "$1" "$2" "$3" .. 
+# +exec "$@" diff --git a/docs/testing.md b/docs/testing.md new file mode 100644 index 00000000..7bc8b7a4 --- /dev/null +++ b/docs/testing.md @@ -0,0 +1,72 @@ +# Testing + +## In Container + +You can run tests in a container which uses the same base image and uses the same dependencies. (This container can also run other python tasks.) + +```shell +./development/scripts/run run_tests.sh +``` + +To run tests in individual test file you may supply the path to it. By default, the tests run against `tests/`. + +```shell +./development/scripts/run run_tests.sh tests/test_app.py +``` + +## On Host + +### install or activate python 3.11 + +E.g. macOS with macports: + +```shell +sudo port install python311 +sudo port select --set python python311 +``` + +### install venv + +```shell +python -m venv venv +``` + +### update pip + +```shell +python -m pip install --upgrade pip +``` + +## Running Dev Server + +For integration tests, the server can be stood up inside a docker container: + +```shell +./development/scripts/run-dev-server.sh +``` + +### install deps + +```shell +pip install -r requirements.txt +``` + +### run tests + +```shell +./run_tests.sh +``` + +### Other + +#### cherrypick tests + +e.g. to just test app.py: + +```shell +./run_tests.sh tests/test_app.py +``` + +#### coverage + +check coverage in `htmlcov/index.html` diff --git a/requirements.txt b/requirements.txt index e637e064..86ba2e17 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,19 @@ -uvloop==0.15.3 -aiohttp==3.7.4 +aiofiles==23.1.0 aiohttp_cors==0.7.0 -pytest==6.2.4 -aiofiles==0.3.1 -pytest-aiohttp==0.1.3 -coverage==4.4.2 -pytest-cov==2.5.1 -hypothesis==6.14.5 -globus-sdk==1.6.1 -python-dotenv==0.14.0 -frozendict==2.0.3 -pandas==1.3.2 -xlrd==2.0.1 -openpyxl==3.0.7 +aiohttp==3.8.4 +coverage==7.2.7 defusedxml==0.7.1 -python-magic==0.4.24 \ No newline at end of file +frozendict==2.3.8 +globus-sdk==3.20.1 +gunicorn==20.1.0 +hypothesis==6.76.0 +openpyxl==3.1.2 +pandas==2.0.2 +pytest-aiohttp==1.0.4 +pytest-asyncio==0.21.0 +pytest-cov==4.1.0 +pytest==7.3.1 +python-dotenv==1.0.0 +python-magic==0.4.27 +uvloop==0.17.0 +xlrd==2.0.1 # fairly out of date \ No newline at end of file diff --git a/run_tests.sh b/run_tests.sh index 43de2167..52ea58d5 100755 --- a/run_tests.sh +++ b/run_tests.sh @@ -2,4 +2,13 @@ DIR="$( cd "$( dirname "$0" )" && pwd )" export KB_DEPLOYMENT_CONFIG="$DIR/deployment/conf/testing.cfg" export FILE_LIFETIME="90" -python3 -m pytest -s -vv --cov=staging_service \ No newline at end of file +export TESTS="${1:-tests}" +echo +echo "****************************" +echo "**" +echo "** Running tests in ${TESTS}" +echo "**" +echo "****************************" +echo +python3 -m pytest -s -vv --cov=staging_service --cov-report term --cov-report html "${TESTS}" + diff --git a/staging_service/AutoDetectUtils.py b/staging_service/AutoDetectUtils.py index 97766b72..ad1bfab9 100644 --- a/staging_service/AutoDetectUtils.py +++ b/staging_service/AutoDetectUtils.py @@ -2,14 +2,42 @@ This class is in charge of determining possible importers by determining the suffix of the filepath pulled in, and by looking up the appropriate mappings in the supported_apps_w_extensions.json file """ -from typing import Optional, Tuple, Dict +from typing import Any, Dict, Optional, Tuple + +Mappings = dict[str, Any] + +MappingsConfig = dict[str, Mappings] + +# pylint: disable=C0115,C0116 class AutoDetectUtils: - _MAPPINGS = None # expects to be set by config + _MAPPINGS: None | dict[str, Any] = None # expects to be set by config + + 
@classmethod + def set_mappings(cls, mappings_config: MappingsConfig): + cls._MAPPINGS = mappings_config["types"] + + @classmethod + def has_mappings(cls) -> bool: + if cls._MAPPINGS is not None: + return True + return False - @staticmethod - def determine_possible_importers(filename: str) -> Tuple[Optional[list], Dict[str, object]]: + @classmethod + def get_mappings_by_extension(cls, extension: str): + if cls._MAPPINGS is not None: + return cls._MAPPINGS.get(extension) + + @classmethod + def get_extension_mappings(cls): + return cls._MAPPINGS + + @classmethod + def determine_possible_importers( + cls, + filename: str, + ) -> Tuple[Optional[list], Dict[str, object]]: """ Given a filename, come up with a reference to all possible apps. :param filename: The filename to find applicable apps for @@ -18,31 +46,35 @@ def determine_possible_importers(filename: str) -> Tuple[Optional[list], Dict[st The fileinfo dict, containing: the file prefix the file suffix, if a suffix matched a mapping - the file types, if a suffix matched a mapping, otherwise an empty list + the file types, if a suffix matched a mapping, otherwise an empty list """ dotcount = filename.count(".") - if dotcount: + if dotcount and cls._MAPPINGS is not None: # preferentially choose the most specific suffix (e.g. longest) # to get file type mappings - m = AutoDetectUtils._MAPPINGS + for i in range(1, dotcount + 1): parts = filename.split(".", i) suffix = parts[-1].lower() - if suffix in m["types"]: + # pylint: disable=unsubscriptable-object + # added because it complains about mappings below. + if suffix in cls._MAPPINGS: # pylint: disable=E1135 prefix = ".".join(parts[0:i]) return ( - m["types"][suffix]["mappings"], - {"prefix": prefix, - "suffix": parts[-1], - "file_ext_type": m["types"][suffix]["file_ext_type"], - } + cls._MAPPINGS[suffix]["mappings"], + { + "prefix": prefix, + "suffix": parts[-1], + "file_ext_type": cls._MAPPINGS[suffix]["file_ext_type"], + }, ) return None, {"prefix": filename, "suffix": None, "file_ext_type": []} - @staticmethod - def get_mappings(file_list: list) -> dict: + @classmethod + def get_mappings(cls, file_list: list) -> dict: """ Given a list of files, get their mappings if they exist + :param file_list: A list of files :return: return a listing of apps, a listing of extension_mappings for each filename, and information about each file, currently the file prefix and the suffix used to @@ -51,11 +83,10 @@ def get_mappings(file_list: list) -> dict: mappings = [] fileinfo = [] for filename in file_list: - typemaps, fi = AutoDetectUtils.determine_possible_importers(filename) + typemaps, file_importers = cls.determine_possible_importers(filename) mappings.append(typemaps) - fileinfo.append(fi) - rv = { + fileinfo.append(file_importers) + return { "mappings": mappings, "fileinfo": fileinfo, } - return rv diff --git a/staging_service/JGIMetadata.py b/staging_service/JGIMetadata.py index 643be600..92825e16 100644 --- a/staging_service/JGIMetadata.py +++ b/staging_service/JGIMetadata.py @@ -1,13 +1,15 @@ -from .utils import Path +import os from json import JSONDecoder + import aiofiles -import os from aiohttp import web +from .utils import StagingPath + decoder = JSONDecoder() -async def read_metadata_for(path: Path): +async def read_metadata_for(path: StagingPath): if os.path.exists(path.jgi_metadata) and os.path.isfile(path.jgi_metadata): async with aiofiles.open(path.jgi_metadata, mode="r") as json: data = await json.read() diff --git a/staging_service/__main__.py b/staging_service/__main__.py deleted 
file mode 100644 index e1f31b14..00000000 --- a/staging_service/__main__.py +++ /dev/null @@ -1,12 +0,0 @@ -from .app import app_factory -from aiohttp import web -import asyncio -import os -import uvloop -import configparser - -asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) # for speed of event loop - -config = configparser.ConfigParser() -config.read(os.environ["KB_DEPLOYMENT_CONFIG"]) -web.run_app(app_factory(config), port=3000) diff --git a/staging_service/app.py b/staging_service/app.py index ac04fc16..4c0ec5fd 100644 --- a/staging_service/app.py +++ b/staging_service/app.py @@ -4,35 +4,35 @@ import shutil import sys from collections import defaultdict -from urllib.parse import parse_qs -from pathlib import Path as PathPy +from pathlib import Path +from urllib.parse import parse_qs, unquote, urlencode, urlunparse +import aiofiles import aiohttp_cors -from aiohttp import web +from aiohttp import MultipartReader, web + +from staging_service.config import (UPLOAD_SAVE_STRATEGY_SAVE_TO_DESTINATION, + UPLOAD_SAVE_STRATEGY_TEMP_THEN_COPY, + get_config, get_max_content_length, + get_max_file_size, get_read_chunk_size, + get_save_strategy) from .app_error_formatter import format_import_spec_errors +from .autodetect.Mappings import CSV, EXCEL, TSV from .AutoDetectUtils import AutoDetectUtils -from .JGIMetadata import read_metadata_for -from .auth2Client import KBaseAuth2 from .globus import assert_globusid_exists, is_globusid -from .metadata import some_metadata, dir_info, add_upa, similar -from .utils import Path, run_command, AclManager -from .import_specifications.file_parser import ( - ErrorType, - FileTypeResolution, - parse_import_specifications, -) -from .import_specifications.individual_parsers import parse_csv, parse_tsv, parse_excel -from .import_specifications.file_writers import ( - write_csv, - write_tsv, - write_excel, - ImportSpecWriteException, -) -from .autodetect.Mappings import CSV, TSV, EXCEL - - -logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +from .import_specifications.file_parser import (ErrorType, FileTypeResolution, + parse_import_specifications) +from .import_specifications.file_writers import (ImportSpecWriteException, + write_csv, write_excel, + write_tsv) +from .import_specifications.individual_parsers import (parse_csv, parse_excel, + parse_tsv) +from .JGIMetadata import read_metadata_for +from .metadata import add_upa, dir_info, similar, some_metadata +from .utils import AclManager, StagingPath, auth_client, run_command + +logging.basicConfig(stream=sys.stdout, level=logging.WARNING) routes = web.RouteTableDef() VERSION = "1.3.6" @@ -54,11 +54,13 @@ @routes.get("/importer_filetypes/") -async def importer_filetypes(request: web.Request) -> web.json_response: +async def importer_filetypes(_: web.Request) -> web.json_response: """ Returns the file types for the configured datatypes. The returned JSON contains two keys: + * datatype_to_filetype, which maps import datatypes (like gff_genome) to their accepted filetypes (like [FASTA, GFF]) + * filetype_to_extensions, which maps file types (e.g. FASTA) to their extensions (e.g. *.fa, *.fasta, *.fa.gz, etc.) @@ -80,21 +82,26 @@ async def importer_mappings(request: web.Request) -> web.json_response: if len(file_list) == 0: raise web.HTTPBadRequest( text=f"must provide file_list field. 
Your provided qs: {request.query_string}", - ) + ) mappings = AutoDetectUtils.get_mappings(file_list) return web.json_response(data=mappings) -def _file_type_resolver(path: PathPy) -> FileTypeResolution: +def _file_type_resolver(path: Path) -> FileTypeResolution: fi = AutoDetectUtils.get_mappings([str(path)])["fileinfo"][0] # Here we assume that the first entry in the file_ext_type field is the entry # we want. Presumably secondary entries are less general. - ftype = fi['file_ext_type'][0] if fi['file_ext_type'] else None + ftype = fi["file_ext_type"][0] if fi["file_ext_type"] else None if ftype in _IMPSPEC_FILE_TO_PARSER: return FileTypeResolution(parser=_IMPSPEC_FILE_TO_PARSER[ftype]) else: - ext = fi['suffix'] if fi['suffix'] else path.suffix[1:] if path.suffix else path.name + if fi["suffix"]: + ext = fi["suffix"] + elif path.suffix: + ext = path.suffix[1:] + else: + ext = path.name return FileTypeResolution(unsupported_type=ext) @@ -113,17 +120,22 @@ async def bulk_specification(request: web.Request) -> web.json_response: files = [f.strip() for f in files if f.strip()] paths = {} for f in files: - p = Path.validate_path(username, f) - paths[PathPy(p.full_path)] = PathPy(p.user_path) + p = StagingPath.validate_path(username, f) + paths[Path(p.full_path)] = Path(p.user_path) # list(dict) returns a list of the dict keys in insertion order (py3.7+) res = parse_import_specifications( tuple(list(paths)), _file_type_resolver, - lambda e: logging.error("Unexpected error while parsing import specs", exc_info=e)) + lambda e: logging.error( + "Unexpected error while parsing import specs", exc_info=e + ), + ) if res.results: types = {dt: result.result for dt, result in res.results.items()} - files = {dt: {"file": str(paths[result.source.file]), "tab": result.source.tab} - for dt, result in res.results.items()} + files = { + dt: {"file": str(paths[result.source.file]), "tab": result.source.tab} + for dt, result in res.results.items() + } return web.json_response({"types": types, "files": files}) errtypes = {e.error for e in res.errors} errtext = json.dumps({"errors": format_import_spec_errors(res.errors, paths)}) @@ -168,29 +180,33 @@ async def write_bulk_specification(request: web.Request) -> web.json_response: # There should be a way to get aiohttp to handle this but I can't find it return _createJSONErrorResponse( f"Required content-type is {_APP_JSON}", - error_class=web.HTTPUnsupportedMediaType) + error_class=web.HTTPUnsupportedMediaType, + ) if not request.content_length: return _createJSONErrorResponse( "The content-length header is required and must be > 0", - error_class=web.HTTPLengthRequired) + error_class=web.HTTPLengthRequired, + ) # No need to check the max content length; the server already does that. 
See tests data = await request.json() if type(data) != dict: return _createJSONErrorResponse("The top level JSON element must be a mapping") - folder = data.get('output_directory') - type_ = data.get('output_file_type') + folder = data.get("output_directory") + type_ = data.get("output_file_type") if type(folder) != str: - return _createJSONErrorResponse("output_directory is required and must be a string") + return _createJSONErrorResponse( + "output_directory is required and must be a string" + ) writer = _IMPSPEC_FILE_TO_WRITER.get(type_) if not writer: return _createJSONErrorResponse(f"Invalid output_file_type: {type_}") - folder = Path.validate_path(username, folder) + folder = StagingPath.validate_path(username, folder) os.makedirs(folder.full_path, exist_ok=True) try: - files = writer(PathPy(folder.full_path), data.get("types")) + files = writer(Path(folder.full_path), data.get("types")) except ImportSpecWriteException as e: return _createJSONErrorResponse(e.args[0]) - new_files = {ty: str(PathPy(folder.user_path) / files[ty]) for ty in files} + new_files = {ty: str(Path(folder.user_path) / files[ty]) for ty in files} return web.json_response({"output_file_type": type_, "files_created": new_files}) @@ -202,8 +218,8 @@ def _createJSONErrorResponse(error_text: str, error_class=web.HTTPBadRequest): @routes.get("/add-acl-concierge") async def add_acl_concierge(request: web.Request): username = await authorize_request(request) - user_dir = Path.validate_path(username).full_path - concierge_path = f"{Path._CONCIERGE_PATH}/{username}/" + user_dir = StagingPath.validate_path(username).full_path + concierge_path = f"{StagingPath._CONCIERGE_PATH}/{username}/" aclm = AclManager() result = aclm.add_acl_concierge( shared_directory=user_dir, concierge_path=concierge_path @@ -211,16 +227,20 @@ async def add_acl_concierge(request: web.Request): result[ "msg" ] = f"Requesting Globus Perms for the following globus dir: {concierge_path}" - result[ - "link" - ] = f"https://app.globus.org/file-manager?destination_id={aclm.endpoint_id}&destination_path={concierge_path}" + + params = {"destination_id": aclm.endpoint_id, "destination_path": concierge_path} + + result["link"] = urlunparse( + ("https", "app.globus.org", "/file-manager", None, urlencode(params), None) + ) + return web.json_response(result) @routes.get("/add-acl") async def add_acl(request: web.Request): username = await authorize_request(request) - user_dir = Path.validate_path(username).full_path + user_dir = StagingPath.validate_path(username).full_path result = AclManager().add_acl(user_dir) return web.json_response(result) @@ -228,7 +248,7 @@ async def add_acl(request: web.Request): @routes.get("/remove-acl") async def remove_acl(request: web.Request): username = await authorize_request(request) - user_dir = Path.validate_path(username).full_path + user_dir = StagingPath.validate_path(username).full_path result = AclManager().remove_acl(user_dir) return web.json_response(result) @@ -245,7 +265,7 @@ async def test_auth(request: web.Request): @routes.get("/file-lifetime") -async def file_lifetime(parameter_list): +async def file_lifetime(_: web.Request): return web.Response(text=os.environ["FILE_LIFETIME"]) @@ -253,14 +273,14 @@ async def file_lifetime(parameter_list): async def file_exists(request: web.Request): username = await authorize_request(request) query = request.match_info["query"] - user_dir = Path.validate_path(username) + user_dir = StagingPath.validate_path(username) try: show_hidden = request.query["showHidden"] if "true" == 
show_hidden or "True" == show_hidden: show_hidden = True else: show_hidden = False - except KeyError as no_query: + except KeyError: show_hidden = False # this scans the entire directory recursively just to see if one file exists... why? results = await dir_info(user_dir, show_hidden, query) @@ -282,7 +302,9 @@ async def list_files(request: web.Request): lists the contents of a directory and some details about them """ username = await authorize_request(request) - path = Path.validate_path(username, request.match_info.get("path", "")) + + path = StagingPath.validate_path(username, request.match_info.get("path", "")) + if not os.path.exists(path.full_path): raise web.HTTPNotFound( text="path {path} does not exist".format(path=path.user_path) @@ -291,15 +313,11 @@ async def list_files(request: web.Request): raise web.HTTPBadRequest( text="{path} is a file not a directory".format(path=path.full_path) ) - try: - show_hidden = request.query["showHidden"] - if "true" == show_hidden or "True" == show_hidden: - show_hidden = True - else: - show_hidden = False - except KeyError as no_query: - show_hidden = False + + show_hidden = request.query.get("showHidden", "false").lower() == "true" + data = await dir_info(path, show_hidden, recurse=True) + return web.json_response(data) @@ -309,7 +327,7 @@ async def download_files(request: web.Request): download a file """ username = await authorize_request(request) - path = Path.validate_path(username, request.match_info.get("path", "")) + path = StagingPath.validate_path(username, request.match_info.get("path", "")) if not os.path.exists(path.full_path): raise web.HTTPNotFound( text="path {path} does not exist".format(path=path.user_path) @@ -325,12 +343,12 @@ async def download_files(request: web.Request): @routes.get("/similar/{path:.+}") -async def similar_files(request: web.Request): +async def get_similar_files(request: web.Request): """ lists similar file path for given file """ username = await authorize_request(request) - path = Path.validate_path(username, request.match_info["path"]) + path = StagingPath.validate_path(username, request.match_info["path"]) if not os.path.exists(path.full_path): raise web.HTTPNotFound( text="path {path} does not exist".format(path=path.user_path) @@ -340,7 +358,7 @@ async def similar_files(request: web.Request): text="{path} is a directory not a file".format(path=path.full_path) ) - root = Path.validate_path(username, "") + root = StagingPath.validate_path(username, "") files = await dir_info(root, show_hidden=False, recurse=True) similar_files = list() @@ -363,14 +381,14 @@ async def search(request: web.Request): """ username = await authorize_request(request) query = request.match_info["query"] - user_dir = Path.validate_path(username) + user_dir = StagingPath.validate_path(username) try: show_hidden = request.query["showHidden"] if "true" == show_hidden or "True" == show_hidden: show_hidden = True else: show_hidden = False - except KeyError as no_query: + except KeyError: show_hidden = False results = await dir_info(user_dir, show_hidden, query) results.sort(key=lambda x: x["mtime"], reverse=True) @@ -384,7 +402,7 @@ async def get_metadata(request: web.Request): if it's a folder it returns stat data about the folder """ username = await authorize_request(request) - path = Path.validate_path(username, request.match_info["path"]) + path = StagingPath.validate_path(username, request.match_info["path"]) if not os.path.exists(path.full_path): raise web.HTTPNotFound( text="path {path} does not 
exist".format(path=path.user_path) @@ -398,80 +416,238 @@ async def get_jgi_metadata(request: web.Request): returns jgi metadata if associated with a file """ username = await authorize_request(request) - path = Path.validate_path(username, request.match_info["path"]) + path = StagingPath.validate_path(username, request.match_info["path"]) return web.json_response(await read_metadata_for(path)) -@routes.post("/upload") -async def upload_files_chunked(request: web.Request): - """ - uploads a file into the staging area +def _validate_filename(filename: str): """ - username = await authorize_request(request) - - if not request.has_body: - raise web.HTTPBadRequest(text="must provide destPath and uploads in body") - - reader = await request.multipart() - counter = 0 - user_file = None - destPath = None - while ( - counter < 100 - ): # TODO this is arbitrary to keep an attacker from creating infinite loop - # This loop handles the null parts that come in inbetween destpath and file - part = await reader.next() - - if part.name == "destPath": - destPath = await part.text() - elif part.name == "uploads": - user_file = part - break - else: - counter += 1 + Ensure that a given filename is acceptable for usage in the staging area. - if not (user_file and destPath): - raise web.HTTPBadRequest(text="must provide destPath and uploads in body") + Raises http-response appropriate errors if the filename is invalid. + """ - filename: str = user_file.filename if filename.lstrip() != filename: raise web.HTTPForbidden( # forbidden isn't really the right code, should be 400 text="cannot upload file with name beginning with space" ) + if "," in filename: - raise web.HTTPForbidden( # for consistency we use 403 again + raise web.HTTPForbidden( # for consistency, we use 403 again text="cannot upload file with ',' in name" ) + # may want to make this configurable if we ever decide to add a hidden files toggle to # the staging area UI if filename.startswith("."): - raise web.HTTPForbidden( # for consistency we use 403 again + raise web.HTTPForbidden( # for consistency, we use 403 again text="cannot upload file with name beginning with '.'" ) - size = 0 - destPath = os.path.join(destPath, filename) - path = Path.validate_path(username, destPath) - os.makedirs(os.path.dirname(path.full_path), exist_ok=True) - with open(path.full_path, "wb") as f: # TODO should we handle partial file uploads? + +async def _handle_upload_save_to_destination( + stream: MultipartReader, + destination_path: StagingPath, + chunk_size: int, + max_file_size: int, +): + try: + async with aiofiles.tempfile.NamedTemporaryFile( + "wb", + delete=True, + dir=os.path.dirname(destination_path.full_path), + prefix=".upload.", + ) as output_file: + temp_file_name = output_file.name + actual_size = 0 + while True: + if actual_size > max_file_size: + raise web.HTTPBadRequest( + text=( + f"file size reached {actual_size:,} bytes which exceeds " + + f"the maximum allowed of {max_file_size:,} bytes)" + ) + ) + + # A chunk size of 1MB seems to be a bit faster than the default of 8Kb. + chunk = await stream.read_chunk(size=chunk_size) + + # Chunk is always a "bytes" object, but will be empty if there is nothing else + # to read, which is considered Falsy. 
+ if not chunk: + break + + actual_size += len(chunk) + + await output_file.write( + chunk, + ) + + shutil.move(temp_file_name, destination_path.full_path) + except FileNotFoundError: + # this is a normal condition after the file is moved + pass + except ConnectionResetError: + # this is a normal condition if the file transfer is canceled, + # or the Narrative is closed during upload. + # Although it may indicate other problems as well. + pass + + +async def _handle_upload_save_to_temp( + stream: MultipartReader, + destination_path: StagingPath, + chunk_size: int, + max_file_size: int, +): + async with aiofiles.tempfile.NamedTemporaryFile("wb", delete=True) as output_file: + actual_size = 0 while True: - chunk = await user_file.read_chunk() + if actual_size > max_file_size: + raise web.HTTPBadRequest( + text=( + f"file size reached {actual_size} bytes which exceeds " + + f"the maximum allowed of {max_file_size} bytes" + ) + ) + + chunk = await stream.read_chunk(size=chunk_size) + + # Chunk is always a "bytes" object, but will be empty if there is nothing else + # to read, which is considered Falsy. if not chunk: break - size += len(chunk) - f.write(chunk) - if not os.path.exists(path.full_path): - error_msg = "We are sorry but upload was interrupted. Please try again.".format( - path=path.full_path + actual_size += len(chunk) + + await output_file.write(chunk) + + async with aiofiles.tempfile.NamedTemporaryFile( + "rb", + delete=False, + dir=os.path.dirname(destination_path.full_path), + prefix=".upload.", + ) as copied_file: + shutil.copyfile(output_file.name, copied_file.name) + + shutil.move(copied_file.name, destination_path.full_path) + + +async def _handle_upload_save(stream: MultipartReader, destination_path: StagingPath): + """ + Handles the file upload stream from the /upload endpoint, saving the stream, ultimately, + to the given destination path. + + It honors a "save strategy", which is one of a set of defined methods for handling the upload. + See the function get_save_strategy() for details. + + It also honors the "chunk size", which is used for reading up to the chunk size from the stream + before saving. This constrains memory commitment, especially important for handling large + files and concurrent file upload. + """ + chunk_size = get_read_chunk_size() + max_file_size = get_max_file_size() + save_strategy = get_save_strategy() + if save_strategy == UPLOAD_SAVE_STRATEGY_TEMP_THEN_COPY: + await _handle_upload_save_to_temp( + stream, destination_path, chunk_size, max_file_size + ) + elif save_strategy == UPLOAD_SAVE_STRATEGY_SAVE_TO_DESTINATION: + await _handle_upload_save_to_destination( + stream, destination_path, chunk_size, max_file_size + ) + + +@routes.post("/upload") +async def upload_files_chunked(request: web.Request): + """ + Uploads a file into the staging area. + + Ensures the current auth token is valid, and if so, returns the associated username. + If not, will raise an exception + + Also, ensures that if there is a Globus account id available in the user's staging + directory, it is valid. 
+ """ + + username = await authorize_request(request) + + if not request.can_read_body: + raise web.HTTPBadRequest( + text="must provide destPath and uploads in body", ) + + if request.content_length is None: + raise web.HTTPBadRequest(text="request must include a 'Content-Length' header") + + max_content_length = get_max_content_length() + if request.content_length > max_content_length: + raise web.HTTPBadRequest( + text=f"overall content length exceeds the maximum allowable of {max_content_length:,} bytes" + ) + + reader = await request.multipart() + + dest_path_part = await reader.next() + + # + # Get the destination path, which is actually the subdirectory in which the + # file should be placed within the user's staging directory. + # This field must be first, because the file must come last, in order to + # read the file stream. + # + if dest_path_part.name != "destPath": + raise web.HTTPBadRequest(text="must provide destPath in body") + + destination_directory = await dest_path_part.text() + + # + # Get the upload file part. We read the header bits, the get the + # read and save the file stream. + # + user_file_part = await reader.next() + + if user_file_part.name != "uploads": + raise web.HTTPBadRequest(text="must provide uploads in body") + + filename: str = unquote(user_file_part.filename) + + _validate_filename(filename) + + destination_path = os.path.join(destination_directory, filename) + path = StagingPath.validate_path(username, destination_path) + os.makedirs(os.path.dirname(path.full_path), exist_ok=True) + + # NB: errors result in an error response ... but the upload will be + # read until the end. This is just built into aiohttp web. + # From what I've read, it is considered unstable to have a server close + # the request stream if an error is encountered, thus aiohttp invokes the + # "release()" method to read the stream to completion, but does nothing with + # the results (other than read into the buffer and). + # It would require a more sophisticated implementation to work around this. + # E.g. an upload stream and a progress stream - they could be plain https + # requests - which would allow the progress stream to report errors which would + # then abort the upload stream. The upload stream could also spin upon an error, + # to avoid the useless bandwidth and cpu usage. This would also work with fetch, + # allowing us to ditch XHR. + await _handle_upload_save(user_file_part, path) + + # This may be triggered by an canceled or otherwise interrupted upload. + # Note that now that we save to temp then move/copy to destination, this + # works, prior to this the file was written directly, which could leave a partial + # file. + if not os.path.exists(path.full_path): + error_msg = "We are sorry but upload was interrupted. Please try again." raise web.HTTPNotFound(text=error_msg) response = await some_metadata( path, + # these are fields to include in the result - the metadata stored + # in the file system has other fields too. 
desired_fields=["name", "path", "mtime", "size", "isFolder"], source="KBase upload", ) + return web.json_response([response]) @@ -481,22 +657,20 @@ async def define_UPA(request: web.Request): creates an UPA as a field in the metadata file corresponding to the filepath given """ username = await authorize_request(request) - path = Path.validate_path(username, request.match_info["path"]) + path = StagingPath.validate_path(username, request.match_info["path"]) if not os.path.exists(path.full_path or not os.path.isfile(path.full_path)): - # TODO the security model here is to not care if someone wants to put in a false upa - raise web.HTTPNotFound( - text="no file found found on path {}".format(path.user_path) - ) - if not request.has_body: - raise web.HTTPBadRequest(text="must provide UPA field in body") + # The security model here is to not care if someone wants to put in a false upa + raise web.HTTPNotFound(text=f"no file found found on path {path.user_path}") + if not request.can_read_body: + raise web.HTTPBadRequest(text="must provide 'UPA' field in body") body = await request.post() try: UPA = body["UPA"] - except KeyError as wrong_key: - raise web.HTTPBadRequest(text="must provide UPA field in body") + except KeyError: + raise web.HTTPBadRequest(text="must provide 'UPA' field in body") await add_upa(path, UPA) return web.Response( - text="succesfully updated UPA {UPA} for file {path}".format( + text="successfully updated UPA {UPA} for file {path}".format( UPA=UPA, path=path.user_path ) ) @@ -508,45 +682,51 @@ async def delete(request: web.Request): allows deletion of both directories and files """ username = await authorize_request(request) - path = Path.validate_path(username, request.match_info["path"]) + path = StagingPath.validate_path(username, request.match_info["path"]) + # make sure directory isn't home if path.user_path == username: raise web.HTTPForbidden(text="cannot delete home directory") + if is_globusid(path, username): raise web.HTTPForbidden(text="cannot delete protected file") + if os.path.isfile(path.full_path): os.remove(path.full_path) if os.path.exists(path.metadata_path): os.remove(path.metadata_path) + elif os.path.isdir(path.full_path): shutil.rmtree(path.full_path) if os.path.exists(path.metadata_path): shutil.rmtree(path.metadata_path) + else: raise web.HTTPNotFound( text="could not delete {path}".format(path=path.user_path) ) + return web.Response(text="successfully deleted {path}".format(path=path.user_path)) @routes.patch("/mv/{path:.+}") async def rename(request: web.Request): username = await authorize_request(request) - path = Path.validate_path(username, request.match_info["path"]) + path = StagingPath.validate_path(username, request.match_info["path"]) # make sure directory isn't home if path.user_path == username: raise web.HTTPForbidden(text="cannot rename or move home directory") if is_globusid(path, username): raise web.HTTPForbidden(text="cannot rename or move protected file") - if not request.has_body: + if not request.can_read_body: raise web.HTTPBadRequest(text="must provide newPath field in body") body = await request.post() try: new_path = body["newPath"] - except KeyError as wrong_key: + except KeyError: raise web.HTTPBadRequest(text="must provide newPath field in body") - new_path = Path.validate_path(username, new_path) + new_path = StagingPath.validate_path(username, new_path) if os.path.exists(path.full_path): if not os.path.exists(new_path.full_path): shutil.move(path.full_path, new_path.full_path) @@ -554,7 +734,7 @@ async def 
rename(request: web.Request): shutil.move(path.metadata_path, new_path.metadata_path) else: raise web.HTTPConflict( - text="{new_path} allready exists".format(new_path=new_path.user_path) + text="{new_path} already exists".format(new_path=new_path.user_path) ) else: raise web.HTTPNotFound(text="{path} not found".format(path=path.user_path)) @@ -568,7 +748,7 @@ async def rename(request: web.Request): @routes.patch("/decompress/{path:.+}") async def decompress(request: web.Request): username = await authorize_request(request) - path = Path.validate_path(username, request.match_info["path"]) + path = StagingPath.validate_path(username, request.match_info["path"]) # make sure the file can be decompressed filename, file_extension = os.path.splitext(path.full_path) filename, upper_file_extension = os.path.splitext(filename) @@ -612,7 +792,8 @@ async def authorize_request(request): else: # this is a hack for prod because kbase_session won't get shared with the kbase.us domain token = request.cookies.get("kbase_session_backup") - username = await auth_client.get_user(token) + + username = await auth_client().get_user(token) await assert_globusid_exists(username, token) return username @@ -641,31 +822,38 @@ def inject_config_dependencies(config): os.path.join(os.getcwd(), FILE_EXTENSION_MAPPINGS) ) - Path._DATA_DIR = DATA_DIR - Path._META_DIR = META_DIR - Path._CONCIERGE_PATH = CONCIERGE_PATH + StagingPath._DATA_DIR = DATA_DIR + StagingPath._META_DIR = META_DIR + StagingPath._CONCIERGE_PATH = CONCIERGE_PATH - if Path._DATA_DIR is None: - raise Exception("Please provide DATA_DIR in the config file ") + if StagingPath._DATA_DIR is None: + raise web.HTTPInternalServerError(text="DATA_DIR missing in the config file ") - if Path._META_DIR is None: - raise Exception("Please provide META_DIR in the config file ") + if StagingPath._META_DIR is None: + raise web.HTTPInternalServerError(text="META_DIR missing in the config file ") - if Path._CONCIERGE_PATH is None: - raise Exception("Please provide CONCIERGE_PATH in the config file ") + if StagingPath._CONCIERGE_PATH is None: + raise web.HTTPInternalServerError( + text="CONCIERGE_PATH missing in the config file " + ) if FILE_EXTENSION_MAPPINGS is None: - raise Exception("Please provide FILE_EXTENSION_MAPPINGS in the config file ") - with open(FILE_EXTENSION_MAPPINGS) as f: - AutoDetectUtils._MAPPINGS = json.load(f) + raise web.HTTPInternalServerError( + text="FILE_EXTENSION_MAPPINGS missing in the config file " + ) + + with open( + FILE_EXTENSION_MAPPINGS, encoding="utf-8" + ) as file_extension_mappings_file: + AutoDetectUtils.set_mappings(json.load(file_extension_mappings_file)) datatypes = defaultdict(set) extensions = defaultdict(set) - for fileext, val in AutoDetectUtils._MAPPINGS["types"].items(): + for fileext, val in AutoDetectUtils.get_extension_mappings().items(): # if we start using the file ext type array for anything else this might need changes filetype = val["file_ext_type"][0] extensions[filetype].add(fileext) - for m in val['mappings']: - datatypes[m['id']].add(filetype) + for m in val["mappings"]: + datatypes[m["id"]].add(filetype) global _DATATYPE_MAPPINGS _DATATYPE_MAPPINGS = { "datatype_to_filetype": {k: sorted(datatypes[k]) for k in datatypes}, @@ -673,9 +861,18 @@ def inject_config_dependencies(config): } -def app_factory(config): +def app_factory(): + """ + Creates an aiohttp web application, with routes and configuration, and returns it. + + Used by the gunicorn server to spawn new workers. 
+ """ app = web.Application(middlewares=[web.normalize_path_middleware()]) app.router.add_routes(routes) + + # TODO: IMO (eap) cors should not be configured here, as this app should + # never be exposed directly to the public. The proxy should control + # the CORS policy. cors = aiohttp_cors.setup( app, defaults={ @@ -688,8 +885,8 @@ def app_factory(config): for route in list(app.router.routes()): cors.add(route) + # Saves config values into various global things; see the implementation. + config = get_config() inject_config_dependencies(config) - global auth_client - auth_client = KBaseAuth2(config["staging_service"]["AUTH_URL"]) return app diff --git a/staging_service/config.py b/staging_service/config.py new file mode 100644 index 00000000..83f88486 --- /dev/null +++ b/staging_service/config.py @@ -0,0 +1,80 @@ +import os + +import configparser +from aiohttp import web + +DEFAULT_UPLOAD_MAX_FILE_SIZE = 5 * 1000 * 1000 * 1000 # aka, 5GB +DEFAULT_UPLOAD_READ_CHUNK_SIZE = 1 * 1000 * 1000 +UPLOAD_SAVE_STRATEGY_TEMP_THEN_COPY = "SAVE_STRATEGY_TEMP_THEN_COPY" +UPLOAD_SAVE_STRATEGY_SAVE_TO_DESTINATION = "SAVE_STRATEGY_SAVE_TO_DESTINATION" +DEFAULT_UPLOAD_SAVE_STRATEGY = UPLOAD_SAVE_STRATEGY_SAVE_TO_DESTINATION +# THe content length tweak represents the extra header overload for a request, which consists +# of the "fileDest" field and the header section of the upload file. 1K seems reasonable, and +# the whole business of capping the file size is approximate in any case. +# Casual testing shows 272 bytes for Chrome, without the actual destination file and file's +# filename, so 1728 bytes for the path and filename seems quite generous. +CONTENT_LENGTH_TWEAK = 2000 + + +_CONFIG = None + +def get_read_chunk_size() -> int: + """ + Returns the configured chunk size for reading from the + upload request stream. + """ + chunk_size = os.environ.get("UPLOAD_READ_CHUNK_SIZE") + if chunk_size is None: + return DEFAULT_UPLOAD_READ_CHUNK_SIZE + try: + return int(chunk_size) + except ValueError as ve: + raise web.HTTPInternalServerError( + text=f"Error parsing UPLOAD_READ_CHUNK_SIZE environment variable: {str(ve)}" + ) + + +def get_save_strategy() -> str: + """ + Returns the "save strategy" as defined by the environment variable "UPLOAD_SAVE_STRATEGY", + as well as the default value DEFAULT_SAVE_STRATEGY defined at the top of this file. 
+ """ + save_strategy = os.environ.get("UPLOAD_SAVE_STRATEGY") + + if save_strategy is None: + return DEFAULT_UPLOAD_SAVE_STRATEGY + + if save_strategy not in [ + UPLOAD_SAVE_STRATEGY_TEMP_THEN_COPY, + UPLOAD_SAVE_STRATEGY_SAVE_TO_DESTINATION, + ]: + raise web.HTTPInternalServerError( + text=f"Unsupported save strategy in configuration: '{save_strategy}'" + ) + + return save_strategy + + +def get_max_file_size() -> int: + max_file_size = os.environ.get("UPLOAD_MAX_FILE_SIZE") + if max_file_size is None: + return DEFAULT_UPLOAD_MAX_FILE_SIZE + try: + return int(max_file_size) + except ValueError as ve: + raise web.HTTPInternalServerError( + text=f"Error parsing UPLOAD_MAX_FILE_SIZE environment variable: {str(ve)}" + ) + + +def get_max_content_length() -> int: + max_file_size = get_max_file_size() + return max_file_size + CONTENT_LENGTH_TWEAK + + +def get_config(): + global _CONFIG + if _CONFIG is None: + _CONFIG = configparser.ConfigParser() + _CONFIG.read(os.environ["KB_DEPLOYMENT_CONFIG"]) + return _CONFIG diff --git a/staging_service/globus.py b/staging_service/globus.py index 54b41fb4..8cc9cc00 100644 --- a/staging_service/globus.py +++ b/staging_service/globus.py @@ -1,8 +1,10 @@ -from .utils import Path -import aiohttp -import aiofiles -import os import configparser +import os + +import aiofiles +import aiohttp + +from .utils import StagingPath def _get_authme_url(): @@ -18,6 +20,7 @@ def _get_authme_url(): async def _get_globus_ids(token): if not token: raise aiohttp.web.HTTPBadRequest(text="must supply token") + async with aiohttp.ClientSession() as session: auth2_me_url = _get_authme_url() async with session.get(auth2_me_url, headers={"Authorization": token}) as resp: @@ -37,30 +40,31 @@ async def _get_globus_ids(token): def _globus_id_path(username: str): - return Path.validate_path(username, ".globus_id") + return StagingPath.validate_path(username, ".globus_id") -def is_globusid(path: Path, username: str): +def is_globusid(path: StagingPath, username: str): return path.full_path == _globus_id_path(username) async def assert_globusid_exists(username, token): - """ ensures that a globus id exists if there is a valid one for user""" - - # make root dir - root = Path.validate_path(username, "") - if not os.path.exists(root.full_path): - os.makedirs(root.full_path) + """ensures that a globus id exists if there is a valid one for user""" path = _globus_id_path(username) - # check to see if file exists or is empty + + # Ensure the path to the globus file exists. In a deployment, this is the + # user's staging directory. 
+ os.makedirs(os.path.dirname(path.full_path), exist_ok=True) + + # Create the globus if not os.path.exists(path.full_path) or os.stat(path.full_path).st_size == 0: globus_ids = await _get_globus_ids(token) if len(globus_ids) == 0: return + # TODO in the future this should support writing multiple lines # such as the commented code below, for multiple linked accounts # text = '\n'.join(globus_ids) text = globus_ids[0] async with aiofiles.open(path.full_path, mode="w") as globus_file: - await globus_file.writelines(text) + await globus_file.write(text) diff --git a/staging_service/import_specifications/file_parser.py b/staging_service/import_specifications/file_parser.py index 68dcaa6b..d855af2c 100644 --- a/staging_service/import_specifications/file_parser.py +++ b/staging_service/import_specifications/file_parser.py @@ -6,12 +6,10 @@ from collections.abc import Callable from dataclasses import dataclass from enum import Enum -# TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26 -from frozendict.core import frozendict from pathlib import Path -from typing import Union, Optional as O +from typing import Optional, Union -# TODO should get mypy working at some point +from frozendict import frozendict PRIMITIVE_TYPE = Union[str, int, float, bool, None] @@ -20,6 +18,7 @@ class ErrorType(Enum): """ The type of an error encountered when trying to parse import specifications. """ + FILE_NOT_FOUND = 1 PARSE_FAIL = 2 INCORRECT_COLUMN_COUNT = 3 @@ -37,27 +36,33 @@ class SpecificationSource: tab - the name of the spreadsheet file tab from which the import specification was obtained, if any. """ + file: Path - tab: O[str] = None + tab: Optional[str] = None def __post_init__(self): if not self.file: raise ValueError("file is required") -_ERR_MESSAGE = 'message' -_ERR_SOURCE_1 = 'source_1' -_ERR_SOURCE_2 = 'source_2' +_ERR_MESSAGE = "message" +_ERR_SOURCE_1 = "source_1" +_ERR_SOURCE_2 = "source_2" _ERRTYPE_TO_REQ_ARGS = { ErrorType.FILE_NOT_FOUND: (_ERR_SOURCE_1,), ErrorType.PARSE_FAIL: (_ERR_MESSAGE, _ERR_SOURCE_1), ErrorType.INCORRECT_COLUMN_COUNT: (_ERR_MESSAGE, _ERR_SOURCE_1), - ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE: (_ERR_MESSAGE, _ERR_SOURCE_1, _ERR_SOURCE_2), + ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE: ( + _ERR_MESSAGE, + _ERR_SOURCE_1, + _ERR_SOURCE_2, + ), ErrorType.NO_FILES_PROVIDED: tuple(), ErrorType.OTHER: (_ERR_MESSAGE,), } + @dataclass(frozen=True) class Error: f""" @@ -80,9 +85,9 @@ class Error: """ error: ErrorType - message: O[str] = None - source_1: O[SpecificationSource] = None - source_2: O[SpecificationSource] = None + message: Optional[str] = None + source_1: Optional[SpecificationSource] = None + source_2: Optional[SpecificationSource] = None def __post_init__(self): if not self.error: @@ -94,7 +99,9 @@ def __post_init__(self): for attr in attrs: if not getattr(self, attr): # grammar sucks but this is not expected to be seen by end users so meh - raise ValueError(f"{', '.join(attrs)} is required for a {self.error.name} error") + raise ValueError( + f"{', '.join(attrs)} is required for a {self.error.name} error" + ) @dataclass(frozen=True) @@ -111,6 +118,7 @@ class ParseResult: expected that the class creator do that error checking. Users should use the parse_import_specifications method to create instances of this class. """ + source: SpecificationSource result: tuple[frozendict[str, PRIMITIVE_TYPE], ...] @@ -137,12 +145,13 @@ class ParseResults: expected that the class creator do that error checking. 
Users should use the parse_import_specifications method to create an instance of this class. """ - results: O[frozendict[str, ParseResult]] = None - errors: O[tuple[Error, ...]] = None + + results: Optional[frozendict[str, ParseResult]] = None + errors: Optional[tuple[Error, ...]] = None def __post_init__(self): if not (bool(self.results) ^ bool(self.errors)): # xnor - raise ValueError("Exectly one of results or errors must be supplied") + raise ValueError("Exactly one of results or errors must be supplied") # we assume here that the data is otherwise correctly created. @@ -156,18 +165,21 @@ class FileTypeResolution: parser - a parser for the file unsupported_type - the file type if the type is not a supported type. """ - parser: O[Callable[[Path], ParseResults]] = None - unsupported_type: O[str] = None + + parser: Optional[Callable[[Path], ParseResults]] = None + unsupported_type: Optional[str] = None def __post_init__(self): if not (bool(self.parser) ^ bool(self.unsupported_type)): # xnor - raise ValueError("Exectly one of parser or unsupported_type must be supplied") + raise ValueError( + "Exactly one of parser or unsupported_type must be supplied" + ) def parse_import_specifications( paths: tuple[Path, ...], file_type_resolver: Callable[[Path], FileTypeResolution], - log_error: Callable[[Exception], None] + log_error: Callable[[Exception], None], ) -> ParseResults: """ Parse a set of import specification files and return the results. @@ -187,6 +199,7 @@ def parse_import_specifications( errors = tuple([Error(ErrorType.OTHER, str(e))]) return ParseResults(errors=errors) + def _parse( paths: tuple[Path, ...], file_type_resolver: Callable[[Path], FileTypeResolution], @@ -196,12 +209,14 @@ def _parse( for p in paths: file_type = file_type_resolver(p) if file_type.unsupported_type: - errors.append(Error( - ErrorType.PARSE_FAIL, - f"{file_type.unsupported_type} " + errors.append( + Error( + ErrorType.PARSE_FAIL, + f"{file_type.unsupported_type} " + "is not a supported file type for import specifications", - SpecificationSource(p) - )) + SpecificationSource(p), + ) + ) continue res = file_type.parser(p) if res.errors: @@ -209,12 +224,14 @@ def _parse( else: for data_type in res.results: if data_type in results: - errors.append(Error( - ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, - f"Data type {data_type} appears in two importer specification sources", - results[data_type].source, - res.results[data_type].source - )) + errors.append( + Error( + ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, + f"Data type {data_type} appears in two importer specification sources", + results[data_type].source, + res.results[data_type].source, + ) + ) else: results[data_type] = res.results[data_type] if errors: diff --git a/staging_service/import_specifications/file_writers.py b/staging_service/import_specifications/file_writers.py index 160b32e2..d4910c80 100644 --- a/staging_service/import_specifications/file_writers.py +++ b/staging_service/import_specifications/file_writers.py @@ -210,7 +210,7 @@ def write_excel(folder: Path, types: dict[str, dict[str, list[Any]]]) -> dict[st sheet.row_dimensions[2].hidden = True res[datatype] = filename # trash the automatically created sheet - wb.remove_sheet(wb[wb.sheetnames[0]]) + wb.remove(wb[wb.sheetnames[0]]) wb.save(folder / filename) return res diff --git a/staging_service/import_specifications/individual_parsers.py b/staging_service/import_specifications/individual_parsers.py index 61bf84b5..68696eaf 100644 --- 
a/staging_service/import_specifications/individual_parsers.py +++ b/staging_service/import_specifications/individual_parsers.py @@ -3,23 +3,22 @@ """ import csv -import magic import math -import pandas import re - -# TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26 -from frozendict.core import frozendict from pathlib import Path -from typing import Optional as O, Union, Any +from typing import Any, Optional, Tuple, Union + +import magic +import pandas +from frozendict import frozendict from staging_service.import_specifications.file_parser import ( PRIMITIVE_TYPE, - ErrorType, - SpecificationSource, Error, + ErrorType, ParseResult, ParseResults, + SpecificationSource, ) # this version defines the schema of the xsv and Excel files. @@ -31,10 +30,14 @@ _VERSION_STR = "Version:" _COLUMN_STR = "Columns:" _HEADER_SEP = ";" -_EXPECTED_HEADER = (f"{_DATA_TYPE} {_HEADER_SEP} " - + f"{_COLUMN_STR} {_HEADER_SEP} {_VERSION_STR} ") -_HEADER_REGEX = re.compile(f"{_DATA_TYPE} (\\w+){_HEADER_SEP} " - + f"{_COLUMN_STR} (\\d+){_HEADER_SEP} {_VERSION_STR} (\\d+)") +_EXPECTED_HEADER = ( + f"{_DATA_TYPE} {_HEADER_SEP} " + + f"{_COLUMN_STR} {_HEADER_SEP} {_VERSION_STR} " +) +_HEADER_REGEX = re.compile( + f"{_DATA_TYPE} (\\w+){_HEADER_SEP} " + + f"{_COLUMN_STR} (\\d+){_HEADER_SEP} {_VERSION_STR} (\\d+)" +) _MAGIC_TEXT_FILES = {"text/plain", "inode/x-empty", "application/csv", "text/csv"} @@ -60,24 +63,29 @@ class _ParseException(Exception): pass -def _parse_header(header: str, spec_source: SpecificationSource, maximum_version: int +def _parse_header( + header: str, spec_source: SpecificationSource, maximum_version: int ) -> tuple[str, int]: # return is (data type, column count) match = _HEADER_REGEX.fullmatch(header) if not match: - raise _ParseException(Error( - ErrorType.PARSE_FAIL, - f'Invalid header; got "{header}", expected "{_EXPECTED_HEADER}"', - spec_source - )) + raise _ParseException( + Error( + ErrorType.PARSE_FAIL, + f'Invalid header; got "{header}", expected "{_EXPECTED_HEADER}"', + spec_source, + ) + ) version = int(match[3]) if version > maximum_version: - raise _ParseException(Error( - ErrorType.PARSE_FAIL, - f"Schema version {version} is larger than maximum processable " - + f"version {maximum_version}", - spec_source - )) + raise _ParseException( + Error( + ErrorType.PARSE_FAIL, + f"Schema version {version} is larger than maximum processable " + + f"version {maximum_version}", + spec_source, + ) + ) return match[1], int(match[2]) @@ -86,23 +94,26 @@ def _csv_next( line_number: int, expected_line_count: Union[None, int], # None = skip columns check spec_source: SpecificationSource, - error: str + error: str, ) -> list[str]: try: line = next(input_) except StopIteration: raise _ParseException(Error(ErrorType.PARSE_FAIL, error, spec_source)) if expected_line_count and len(line) != expected_line_count: - raise _ParseException(Error( - ErrorType.INCORRECT_COLUMN_COUNT, - f"Incorrect number of items in line {line_number}, " - + f"expected {expected_line_count}, got {len(line)}", - spec_source)) + raise _ParseException( + Error( + ErrorType.INCORRECT_COLUMN_COUNT, + f"Incorrect number of items in line {line_number}, " + + f"expected {expected_line_count}, got {len(line)}", + spec_source, + ) + ) return line def _error(error: Error) -> ParseResults: - return ParseResults(errors = tuple([error])) + return ParseResults(errors=tuple([error])) def _normalize_pandas(val: PRIMITIVE_TYPE) -> PRIMITIVE_TYPE: @@ -141,107 +152,155 @@ def _normalize_headers( ret = 
[str(s).strip() if not pandas.isna(s) else None for s in headers] for i, name in enumerate(ret, start=1): if not name: - raise _ParseException(Error( - ErrorType.PARSE_FAIL, - f"Missing header entry in row {line_number}, position {i}", - spec_source - )) + raise _ParseException( + Error( + ErrorType.PARSE_FAIL, + f"Missing header entry in row {line_number}, position {i}", + spec_source, + ) + ) if name in seen: - raise _ParseException(Error( - ErrorType.PARSE_FAIL, - f"Duplicate header name in row {line_number}: {name}", - spec_source - )) + raise _ParseException( + Error( + ErrorType.PARSE_FAIL, + f"Duplicate header name in row {line_number}: {name}", + spec_source, + ) + ) seen.add(name) return ret def _parse_xsv(path: Path, sep: str) -> ParseResults: - spcsrc = SpecificationSource(path) + specification_source = SpecificationSource(path) try: filetype = magic.from_file(str(path), mime=True) if filetype not in _MAGIC_TEXT_FILES: - return _error(Error(ErrorType.PARSE_FAIL, "Not a text file: " + filetype, spcsrc)) - with open(path, newline='') as input_: + return _error( + Error( + ErrorType.PARSE_FAIL, + "Not a text file: " + filetype, + specification_source, + ) + ) + with open(path, newline="", encoding="utf-8") as input_: rdr = csv.reader(input_, delimiter=sep) # let parser handle quoting - dthd = _csv_next(rdr, 1, None, spcsrc, "Missing data type / version header") - datatype, columns = _parse_header(dthd[0], spcsrc, _VERSION) - hd1 = _csv_next(rdr, 2, columns, spcsrc, "Missing 2nd header line") - param_ids = _normalize_headers(hd1, 2, spcsrc) - _csv_next(rdr, 3, columns, spcsrc, "Missing 3rd header line") + dthd = _csv_next( + rdr, 1, None, specification_source, "Missing data type / version header" + ) + datatype, columns = _parse_header(dthd[0], specification_source, _VERSION) + hd1 = _csv_next( + rdr, 2, columns, specification_source, "Missing 2nd header line" + ) + param_ids = _normalize_headers(hd1, 2, specification_source) + _csv_next(rdr, 3, columns, specification_source, "Missing 3rd header line") results = [] for i, row in enumerate(rdr, start=4): if row: # skip empty rows if len(row) != columns: # could collect errors (first 10?) 
and throw an exception with a list # lets wait and see if that's really needed - raise _ParseException(Error( - ErrorType.INCORRECT_COLUMN_COUNT, - f"Incorrect number of items in line {i}, " - + f"expected {columns}, got {len(row)}", - spcsrc)) - results.append(frozendict( - {param_ids[j]: _normalize_xsv(row[j]) for j in range(len(row))})) + raise _ParseException( + Error( + ErrorType.INCORRECT_COLUMN_COUNT, + f"Incorrect number of items in line {i}, " + + f"expected {columns}, got {len(row)}", + specification_source, + ) + ) + results.append( + frozendict( + { + param_ids[j]: _normalize_xsv(row[j]) + for j in range(len(row)) + } + ) + ) if not results: - raise _ParseException(Error( - ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc)) - return ParseResults(frozendict( - {datatype: ParseResult(spcsrc, tuple(results))} - )) + raise _ParseException( + Error( + ErrorType.PARSE_FAIL, + "No non-header data in file", + specification_source, + ) + ) + return ParseResults( + frozendict({datatype: ParseResult(specification_source, tuple(results))}) + ) except FileNotFoundError: - return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc)) + return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=specification_source)) except IsADirectoryError: - return _error(Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc)) + return _error( + Error( + ErrorType.PARSE_FAIL, + "The given path is a directory", + specification_source, + ) + ) except _ParseException as e: return _error(e.args[0]) def parse_csv(path: Path) -> ParseResults: - """ Parse the provided CSV file. """ + """Parse the provided CSV file.""" return _parse_xsv(path, ",") def parse_tsv(path: Path) -> ParseResults: - """ Parse the provided TSV file. """ + """Parse the provided TSV file.""" return _parse_xsv(path, "\t") def _process_excel_row( - row: tuple[Any], rownum: int, expected_columns: int, spcsrc: SpecificationSource + row: tuple[Any], + rownum: int, + expected_columns: int, + specification_source: SpecificationSource, ) -> list[Any]: while len(row) > expected_columns: if pandas.isna(row[-1]): # inefficient, but premature optimization... 
row = row[0:-1] else: - raise _ParseException(Error( - ErrorType.INCORRECT_COLUMN_COUNT, - f"Incorrect number of items in line {rownum}, " - + f"expected {expected_columns}, got {len(row)}", - spcsrc)) + raise _ParseException( + Error( + ErrorType.INCORRECT_COLUMN_COUNT, + f"Incorrect number of items in line {rownum}, " + + f"expected {expected_columns}, got {len(row)}", + specification_source, + ) + ) return row - -def _process_excel_tab(excel: pandas.ExcelFile, spcsrc: SpecificationSource -) -> (O[str], O[ParseResult]): - df = excel.parse(sheet_name=spcsrc.tab, na_values=_EXCEL_MISSING_VALUES, keep_default_na=False) + +def _process_excel_tab( + excel: pandas.ExcelFile, specification_source: SpecificationSource +) -> Tuple[Optional[str], Optional[ParseResult]]: + df = excel.parse( + sheet_name=specification_source.tab, + na_values=_EXCEL_MISSING_VALUES, + keep_default_na=False, + ) if df.shape[0] < 3: # might as well not error check headers in sheets with no data return (None, None) # at this point we know that at least 4 lines are present - expecting the data type header, # parameter ID header, display name header, and at least one data row header = df.columns.get_level_values(0)[0] - datatype, columns = _parse_header(header, spcsrc, _VERSION) + datatype, columns = _parse_header(header, specification_source, _VERSION) it = df.itertuples(index=False, name=None) - hd1 = _process_excel_row(next(it), 2, columns, spcsrc) - param_ids = _normalize_headers(hd1, 2, spcsrc) - _process_excel_row(next(it), 3, columns, spcsrc) + hd1 = _process_excel_row(next(it), 2, columns, specification_source) + param_ids = _normalize_headers(hd1, 2, specification_source) + _process_excel_row(next(it), 3, columns, specification_source) results = [] for i, row in enumerate(it, start=4): - row = _process_excel_row(row, i, columns, spcsrc) + row = _process_excel_row(row, i, columns, specification_source) if any(map(lambda x: not pandas.isna(x), row)): # skip empty rows - results.append(frozendict( - {param_ids[j]: _normalize_pandas(row[j]) for j in range(len(row))})) - return datatype, ParseResult(spcsrc, tuple(results)) + results.append( + frozendict( + {param_ids[j]: _normalize_pandas(row[j]) for j in range(len(row))} + ) + ) + return datatype, ParseResult(specification_source, tuple(results)) def parse_excel(path: Path) -> ParseResults: @@ -262,12 +321,14 @@ def parse_excel(path: Path) -> ParseResults: if not datatype: continue elif datatype in results: - errors.append(Error( - ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, - f"Found datatype {datatype} in multiple tabs", - SpecificationSource(path, datatype_to_tab[datatype]), - spcsrc_tab, - )) + errors.append( + Error( + ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, + f"Found datatype {datatype} in multiple tabs", + SpecificationSource(path, datatype_to_tab[datatype]), + spcsrc_tab, + ) + ) else: datatype_to_tab[datatype] = tab results[datatype] = result @@ -276,11 +337,18 @@ def parse_excel(path: Path) -> ParseResults: except FileNotFoundError: return _error(Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc)) except IsADirectoryError: - return _error(Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc)) + return _error( + Error(ErrorType.PARSE_FAIL, "The given path is a directory", spcsrc) + ) except ValueError as e: if "Excel file format cannot be determined" in str(e): - return _error(Error( - ErrorType.PARSE_FAIL, "Not a supported Excel file type", source_1=spcsrc)) + return _error( + Error( + ErrorType.PARSE_FAIL, + "Not a supported Excel 
file type", + source_1=spcsrc, + ) + ) raise e # bail out, not sure what's wrong, not sure how to test either if errors: return ParseResults(errors=tuple(errors)) @@ -288,4 +356,3 @@ def parse_excel(path: Path) -> ParseResults: return ParseResults(frozendict(results)) else: return _error(Error(ErrorType.PARSE_FAIL, "No non-header data in file", spcsrc)) - \ No newline at end of file diff --git a/staging_service/main.py b/staging_service/main.py new file mode 100644 index 00000000..7c386622 --- /dev/null +++ b/staging_service/main.py @@ -0,0 +1,15 @@ +import asyncio + +import uvloop + +from staging_service.app import app_factory + +asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) # for speed of event loop + + +async def web_app(): + """ + The gunicorn web app factory function, which in turn just calls the + aiohttp web server factory. + """ + return app_factory() diff --git a/staging_service/metadata.py b/staging_service/metadata.py index 110a7716..1b47c43d 100644 --- a/staging_service/metadata.py +++ b/staging_service/metadata.py @@ -1,132 +1,299 @@ +import hashlib +import json +import os +import unicodedata +from difflib import SequenceMatcher from json import JSONDecoder, JSONEncoder + import aiofiles -from .utils import run_command, Path -import os from aiohttp import web -import hashlib -from difflib import SequenceMatcher -from itertools import islice + +from .utils import StagingPath decoder = JSONDecoder() encoder = JSONEncoder() +READ_BUFFER_SIZE = 1 * 1000 * 1000 +FILE_SNIPPET_SIZE = 1024 + +# Set as the value for snippets for a non-text file +NOT_TEXT_FILE_VALUE = "not a text file" + +SOURCE_JGI_IMPORT = "JGI import" +SOURCE_UNKNOWN = "Unknown" + -async def stat_data(path: Path) -> dict: +async def stat_data(path: StagingPath) -> dict: """ + Returns our version of file stats. + only call this on a validated full path """ file_stats = os.stat(path.full_path) - isFolder = os.path.isdir(path.full_path) + is_folder = os.path.isdir(path.full_path) return { "name": path.name, "path": path.user_path, "mtime": int(file_stats.st_mtime * 1000), # given in seconds, want ms "size": file_stats.st_size, - "isFolder": isFolder, + "isFolder": is_folder, } def _file_read_from_tail(file_path): - upper_bound = min(1024, os.stat(file_path).st_size) - with open(file_path, "r") as file: + """ + Returns a snippet of text from the end of the given file. + + The snippet will be either FILE_SNIPPET_SIZE or the + entire file if it is smaller than that. + + If the file is not proper utf-8 encoded, an error message + will be returned instead. + """ + upper_bound = min(FILE_SNIPPET_SIZE, os.stat(file_path).st_size) + with open(file_path, "r", encoding="utf-8") as file: file.seek(0, os.SEEK_END) file.seek(file.tell() - upper_bound, os.SEEK_SET) - return file.read() + try: + return file.read() + except UnicodeError: + # Note that this string propagates all the way to the ui. + return NOT_TEXT_FILE_VALUE def _file_read_from_head(file_path): - with open(file_path, "r") as file: - return file.read(1024) + """ + Returns a snippet of text from the beginning of the given file. + The snippet will be either FILE_SNIPPET_SIZE or the + entire file if it is smaller than that. 
-async def _generate_metadata(path: Path, source: str): - os.makedirs(os.path.dirname(path.metadata_path), exist_ok=True) - if os.path.exists(path.metadata_path): - async with aiofiles.open(path.metadata_path, mode="r") as extant: - data = await extant.read() - data = decoder.decode(data) - else: - data = {} - # first ouptut of md5sum is the checksum - if source: - data["source"] = source + If the file is not proper utf-8 encoded, an error message + will be returned instead. + """ + with open(file_path, "r", encoding="utf-8") as file: + try: + return file.read(FILE_SNIPPET_SIZE) + except UnicodeError: + # Note that this string propagates all the way to the ui. + return NOT_TEXT_FILE_VALUE + + +async def _read_metadata(metadata_path: str): + """ + Reads the file at the provided path, and decodes to JSON. + """ try: - md5 = hashlib.md5(open(path.full_path, "rb").read()).hexdigest() - except: - md5 = "n/a" - - data["md5"] = md5.split()[0] - # first output of wc is the count - data["lineCount"] = str(sum((1 for i in open(path.full_path, "rb")))) - try: # all things that expect a text file to decode output should be in this block - data["head"] = _file_read_from_head(path.full_path) - data["tail"] = _file_read_from_tail(path.full_path) - except: - data["head"] = "not text file" - data["tail"] = "not text file" - async with aiofiles.open(path.metadata_path, mode="w") as f: - await f.writelines(encoder.encode(data)) - return data - - -async def add_upa(path: Path, UPA: str): - if os.path.exists(path.metadata_path): - async with aiofiles.open(path.metadata_path, mode="r") as extant: + async with aiofiles.open(metadata_path, mode="r", encoding="utf-8") as extant: data = await extant.read() - data = decoder.decode(data) - else: - data = await _generate_metadata(path) # TODO performance optimization - data["UPA"] = UPA - os.makedirs(os.path.dirname(path.metadata_path), exist_ok=True) - async with aiofiles.open(path.metadata_path, mode="w") as update: - await update.writelines(encoder.encode(data)) + return decoder.decode(data) + except json.decoder.JSONDecodeError: + # The metadata file is either corrupted or is currently being written. + # TODO: can we distinguish these cases, or redesign things? + return {} + + +async def _get_metadata(metadata_path: str): + """ + Gets the file metadata from the file system or, if it doesn't exist, + return an empty dict. + """ + try: + return await _read_metadata(metadata_path) + except FileNotFoundError: + os.makedirs(os.path.dirname(metadata_path), exist_ok=True) + return {} + + +def is_text_string(bytes_to_check): + """ + Determines if the provided bytes is indeed a text string. + + It is considered a valid string if it is utf-8 encoded and + does not contain control characters. + + This determination is for the purpose of determining whether to + generate text snippets and report lines of text. + """ + try: + # If it decodes, it is indeed Unicode. + decoded = bytes_to_check.decode("utf-8") + + # But it may not be text, as control characters may be + # in there, and they are unlikely to be part of + # readable text, which is what we are after here, as we + # use this to enable line counting and snippets. + # We allow TAB (9), LF (10), CR (13) for obvious reasons. 
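# A self-contained sketch of the "is this text?" rule explained above: the bytes
# must decode as UTF-8 and contain no control characters other than TAB (9),
# LF (10) and CR (13). The sample inputs are hypothetical.
import unicodedata


def looks_like_text(raw: bytes) -> bool:
    try:
        decoded = raw.decode("utf-8")
    except UnicodeError:
        return False
    allowed_controls = {9, 10, 13}
    return all(
        unicodedata.category(ch) != "Cc" or ord(ch) in allowed_controls
        for ch in decoded
    )


# looks_like_text(b"id\tname\nfoo\t42\n")   -> True
# looks_like_text(bytes([0x00, 0x01, 0x02])) -> False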
+ for character in decoded: + if unicodedata.category(character) == "Cc" and ord(character) not in [ + 9, + 10, + 13, + ]: + return False + + return True + except UnicodeError: + return False + + +async def _generate_metadata(path: StagingPath, source: str = None): + """ + Generates a metadata file for the associated data file. + + Note that it generates it unconditionally. For conditional generation, see _ensure_metadata. + """ + # Open the metadata file if it exists, otherwise create an + # empty dict and ensure there is a writable directory for it + # to be saved to. + existing_metadata = await _get_metadata(path.metadata_path) + + additional_metadata = {} + + # We record the modification time for the file, in milliseconds, in order + # to be able to determine if it has changed since the metadata was last + # generated. + file_stats = os.stat(path.full_path) + additional_metadata["mtime"] = int(file_stats.st_mtime * 1000) + + # If source is missing (why would that be?), supply it with the one + # provided, or attempt to determine it. + if "source" not in existing_metadata and source is None: + source = _determine_source(path) + additional_metadata["source"] = source + + metadata = existing_metadata | additional_metadata + + # We want to know, roughly, if this is a text file or not ("binary"); + # setting to true initially is just part of the logic in the loop below. + is_text = True + + # A word on counting lines. We simply count line endings. With a catch. + # The final "line" may be of two forms. Either as counted here, with the + # last line ending being at the very end of the file, or the last bytes + # after the last line ending if the file ends without a terminal line ending. + # The latter adds, unfortunately, extra code and complexity. + line_count = 0 + # Keeping the previous chunk of bytes read from the file is necessary for accurate + # line counting. + last_chunk_read = None + chunk_count = 0 -def _determine_source(path: Path): + # Below we disable first sonarcloud, second bandit (codacy) + md5 = hashlib.md5() # NOSONAR #nosec + async with aiofiles.open(path.full_path, "rb") as fin: + while True: + chunk = await fin.read(READ_BUFFER_SIZE) + + # If we attempt to read past the end of the file, a 0-length bytes object is returned + if len(chunk) == 0: + break + + last_chunk_read = chunk + chunk_count += 1 + md5.update(chunk) + + # We use the first chunk to determine if the file is valid utf-8 text. + # The first chunk is 1MB, so this should be safe. + if chunk_count == 1: + is_text = is_text_string(chunk) + + if is_text: + line_count += chunk.count(b"\n") + + # + # If the last chunk does not end in a newline (LF), then it has a last line + # which is terminated by the end of the file. 
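# A minimal synchronous sketch of the chunked hashing and line counting
# described above: stream the file in fixed-size chunks, update the MD5 (used
# as a checksum, not for security) as each chunk arrives, count newlines, and
# add one for a final line that lacks a trailing newline. The chunk size and
# file path are illustrative, not the service's configuration.
import hashlib


def md5_and_line_count(file_path: str, chunk_size: int = 1_000_000) -> tuple[str, int]:
    digest = hashlib.md5()  # checksum only, mirroring the NOSONAR/nosec note above
    line_count = 0
    last_chunk = None
    with open(file_path, "rb") as fin:
        while True:
            chunk = fin.read(chunk_size)
            if not chunk:  # a zero-length read means end of file
                break
            last_chunk = chunk
            digest.update(chunk)
            line_count += chunk.count(b"\n")
    if last_chunk is not None and not last_chunk.endswith(b"\n"):
        line_count += 1  # final line terminated by end of file, not by LF
    return digest.hexdigest(), line_count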
+ # + if last_chunk_read is not None and last_chunk_read[-1] != ord(b"\n"): + line_count += 1 + + metadata["lineCount"] = line_count if is_text else None + + metadata["md5"] = md5.hexdigest() + + metadata["head"] = ( + _file_read_from_head(path.full_path) if is_text else NOT_TEXT_FILE_VALUE + ) + metadata["tail"] = ( + _file_read_from_tail(path.full_path) if is_text else NOT_TEXT_FILE_VALUE + ) + + # Save metadata file + await _save_metadata(path, metadata) + + return metadata + + +async def _save_metadata(path: StagingPath, metadata: dict): + """ + Saves the given metadata dictionary into the metadata file associated with the given path + """ + async with aiofiles.open(path.metadata_path, mode="w") as metadata_file: + await metadata_file.write(encoder.encode(metadata)) + + +async def _update_metadata(path: StagingPath, additional_metadata: dict) -> dict: + """ + Updates the metadata indicated by the path with the additional metadata + provided. + + Assumes the metadata file exists, so it should always be called after + _ensure_metadata. + """ + metadata = await _read_metadata(path.metadata_path) + + new_metadata = metadata | additional_metadata + + await _save_metadata(path, new_metadata) + + return new_metadata + + +async def add_upa(path: StagingPath, upa: str): + """ + Adds the provided UPA to the metadata for the given path. + + If the metadata does not yet exist, the metadata is generated. + """ + await _ensure_metadata(path) + await _update_metadata(path, {"UPA": upa}) + + +def _determine_source(path: StagingPath): """ tries to determine the source of a file for which the source is unknown currently this works for JGI imported files only """ jgi_path = path.jgi_metadata if os.path.exists(jgi_path) and os.path.isfile(jgi_path): - return "JGI import" + return SOURCE_JGI_IMPORT else: - return "Unknown" - - -async def _only_source(path: Path): - if os.path.exists(path.metadata_path): - async with aiofiles.open(path.metadata_path, mode="r") as extant: - try: - data = await extant.read() - data = decoder.decode(data) - except: - data = {} - else: - data = {} - if "source" in data: - return data["source"] - else: - data["source"] = _determine_source(path) - os.makedirs(os.path.dirname(path.metadata_path), exist_ok=True) - async with aiofiles.open(path.metadata_path, mode="w") as update: - await update.writelines(encoder.encode(data)) - return data["source"] + return SOURCE_UNKNOWN async def dir_info( - path: Path, show_hidden: bool, query: str = "", recurse=True + path: StagingPath, show_hidden: bool, query: str = "", recurse=True ) -> list: """ - only call this on a validated full path + Returns a directory listing of the given path. + + If a query is provided, only files containing it will be included. + If show_hidden is True, any files with a "." prefix ("hidden files") will + be included; otherwise they are omitted. + If recurse is True, all subdirectories will be included. + + Only call this on a validated full path. + """ response = [] for entry in os.scandir(path.full_path): - specific_path = Path.from_full_path(entry.path) + specific_path = StagingPath.from_full_path(entry.path) # maybe should never show the special .globus_id file ever? 
# or moving it somewhere outside the user directory would be better - if not show_hidden and entry.name.startswith('.'): + if not show_hidden and entry.name.startswith("."): continue + if entry.is_dir(): if query == "" or specific_path.user_path.find(query) != -1: response.append(await stat_data(specific_path)) @@ -134,11 +301,12 @@ async def dir_info( response.extend( await dir_info(specific_path, show_hidden, query, recurse) ) - if entry.is_file(): + elif entry.is_file(): if query == "" or specific_path.user_path.find(query) != -1: - data = await stat_data(specific_path) - data["source"] = await _only_source(specific_path) - response.append(data) + metadata, file_stats = await _ensure_metadata(specific_path) + file_stats["source"] = metadata["source"] + response.append(file_stats) + return response @@ -152,45 +320,64 @@ async def similar(file_name, comparing_file_name, similarity_cut_off): return matcher.ratio() >= similarity_cut_off -async def some_metadata(path: Path, desired_fields=False, source=None): +async def _ensure_metadata(path: StagingPath, source: str = None): """ - if desired fields isn't given as a list all fields will be returned - assumes full_path is valid path to a file - valid fields for desired_fields are: - md5, lineCount, head, tail, name, path, mtime, size, isFolder, UPA + Returns metadata if found; otherwise generates and returns it for the given file. """ file_stats = await stat_data(path) - if file_stats["isFolder"]: - return file_stats - if (not os.path.exists(path.metadata_path)) or ( - os.stat(path.metadata_path).st_mtime < file_stats["mtime"] / 1000 - ): - # if metadata does not exist or older than file: regenerate - if source is None: # TODO BUGFIX this will overwrite any source in the file - source = _determine_source(path) - data = await _generate_metadata(path, source) - else: # metadata already exists and is up to date - async with aiofiles.open(path.metadata_path, mode="r") as f: - # make metadata fields local variables - data = await f.read() - data = decoder.decode(data) - # due to legacy code, some file has corrupted metadata file - # Also if a file is listed or checked for existence before the upload completes - # this code block will be triggered - expected_keys = ["source", "md5", "lineCount", "head", "tail"] - if not set(expected_keys) <= set(data.keys()): - if source is None and "source" not in data: - source = _determine_source(path) - data = await _generate_metadata(path, source) - data = {**data, **file_stats} + + if os.path.exists(path.metadata_path): + metadata = await _read_metadata(path.metadata_path) + + # + # These are required properties of the metadata, so if any are missing, + # due to corrupted metadata, old metadata generated during development, + # or due to updates in the metadata format, this ensures that we trigger + # a metadata rebuild. + # + built_in_keys = set(["source", "md5", "lineCount", "head", "tail", "mtime"]) + + # + # Note that we are checking if the built in keys are included in the metadata, + # not that they are the complete set of metadata (although they may be). + # + if set(built_in_keys) <= set(metadata.keys()): + # + # This is why we store the modification time of the file. If the + # file has been changed (as measured by the modification time), we + # regenerate the metadata. 
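# A small sketch of the staleness test described above: stored metadata is only
# reused when it carries all required keys and its recorded mtime matches the
# file's current mtime in milliseconds. The dictionaries below are hypothetical.
REQUIRED_KEYS = {"source", "md5", "lineCount", "head", "tail", "mtime"}


def metadata_is_current(metadata: dict, file_mtime_ms: int) -> bool:
    if not REQUIRED_KEYS <= metadata.keys():
        return False  # corrupted, legacy, or older-format metadata -> rebuild
    return metadata["mtime"] == file_mtime_ms


# metadata_is_current(
#     {"source": "Unknown", "md5": "d41d...", "lineCount": 3,
#      "head": "a", "tail": "b", "mtime": 1700000000000},
#     1700000000000,
# )  -> True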
+ # + if metadata["mtime"] == file_stats["mtime"]: + return metadata, file_stats + + metadata = await _generate_metadata(path, source) + + return metadata, file_stats + + +async def some_metadata(path: StagingPath, desired_fields=False, source=None): + """ + Returns metadata with file stats merged, optionally filtered by a set of + desired fields. + + If the desired fields are not provided, all fields will be returned. + Valid fields for desired_fields are: + md5, lineCount, head, tail, source, name, path, mtime, size, isFolder, UPA + + Assumes full_path is valid path to a file + """ + metadata, file_stats = await _ensure_metadata(path, source) + + data = metadata | file_stats + if not desired_fields: return data + result = {} for key in desired_fields: try: result[key] = data[key] except KeyError as no_data: - raise web.HTTPBadRequest( - text="no data exists for key {key}".format(key=no_data.args) - ) # TODO check this exception message + raise web.HTTPBadRequest(text=f"no data exists for key {no_data.args}") + return result diff --git a/staging_service/utils.py b/staging_service/utils.py index 79841d65..99ce0e90 100644 --- a/staging_service/utils.py +++ b/staging_service/utils.py @@ -7,11 +7,18 @@ import globus_sdk from aiohttp.web import HTTPInternalServerError, HTTPOk +from staging_service.auth2Client import KBaseAuth2 +from staging_service.config import get_config + + +def auth_client(): + return KBaseAuth2(get_config()["staging_service"]["AUTH_URL"]) + async def run_command(*args): """Run command in subprocess Example from: - http://asyncio.readthedocs.io/en/latest/subprocess.html + https://asyncio.readthedocs.io/en/latest/subprocess.html """ # Create subprocess process = await asyncio.create_subprocess_exec( @@ -31,15 +38,17 @@ async def run_command(*args): if process.returncode == 0: return stdout.decode().strip() else: - error_msg = "command {cmd} failed\nreturn code: {returncode}\nerror: {error}".format( - cmd=" ".join(args), - returncode=process.returncode, - error=stderr.decode().strip(), + error_msg = ( + "command {cmd} failed\nreturn code: {returncode}\nerror: {error}".format( + cmd=" ".join(args), + returncode=process.returncode, + error=stderr.decode().strip(), + ) ) raise HTTPInternalServerError(text=error_msg) -class Path(object): +class StagingPath(object): _META_DIR = None # expects to be set by config _DATA_DIR = None # expects to be set by config _CONCIERGE_PATH = None # expects to be set by config @@ -68,22 +77,22 @@ def validate_path(username: str, path: str = ""): while path.startswith("/"): path = path[1:] user_path = os.path.join(username, path) - full_path = os.path.join(Path._DATA_DIR, user_path) + full_path = os.path.join(StagingPath._DATA_DIR, user_path) - metadata_path = os.path.join(Path._META_DIR, user_path) + metadata_path = os.path.join(StagingPath._META_DIR, user_path) name = os.path.basename(path) jgi_metadata = os.path.join(os.path.dirname(full_path), "." + name + ".jgi") - return Path(full_path, metadata_path, user_path, name, jgi_metadata) + return StagingPath(full_path, metadata_path, user_path, name, jgi_metadata) @staticmethod def from_full_path(full_path: str): - user_path = full_path[len(Path._DATA_DIR) :] + user_path = full_path[len(StagingPath._DATA_DIR) :] if user_path.startswith("/"): user_path = user_path[1:] - metadata_path = os.path.join(Path._META_DIR, user_path) + metadata_path = os.path.join(StagingPath._META_DIR, user_path) name = os.path.basename(full_path) jgi_metadata = os.path.join(os.path.dirname(full_path), "." 
+ name + ".jgi") - return Path(full_path, metadata_path, user_path, name, jgi_metadata) + return StagingPath(full_path, metadata_path, user_path, name, jgi_metadata) class AclManager: @@ -167,7 +176,7 @@ def _add_acl(self, user_identity_id: str, shared_directory_basename: str): Attempt to add acl for the given user id and directory """ try: - resp = self.globus_transfer_client.add_endpoint_acl_rule( + self.globus_transfer_client.add_endpoint_acl_rule( self.endpoint_id, dict( DATA_TYPE="access", @@ -240,7 +249,7 @@ def _remove_acl(self, user_identity_id: str): text=json.dumps(response), content_type="application/json" ) - except globus_sdk.GlobusAPIError as error: + except globus_sdk.GlobusAPIError as error: response = { "success": False, "error_type": "GlobusAPIError", @@ -258,7 +267,7 @@ def add_acl_concierge(self, shared_directory: str, concierge_path: str): :return: Result of attempt to add acl """ user_identity_id = self._get_globus_identity(shared_directory) - cp_full = f"{Path._DATA_DIR}/{concierge_path}" + cp_full = f"{StagingPath._DATA_DIR}/{concierge_path}" try: os.mkdir(cp_full) print(f"Attempting to create concierge dir {cp_full}") diff --git a/tests/import_specifications/test_file_parser.py b/tests/import_specifications/test_file_parser.py index 857be960..8b010654 100644 --- a/tests/import_specifications/test_file_parser.py +++ b/tests/import_specifications/test_file_parser.py @@ -1,30 +1,41 @@ # not much point in testing data classes unless there's custom logic in them from collections.abc import Callable -from frozendict import frozendict -from pytest import raises -from tests.test_utils import assert_exception_correct from pathlib import Path -from typing import Optional as O +from typing import Optional from unittest.mock import Mock, call +from frozendict import frozendict +from pytest import raises + from staging_service.import_specifications.file_parser import ( PRIMITIVE_TYPE, + Error, ErrorType, FileTypeResolution, - SpecificationSource, - Error, ParseResult, ParseResults, - parse_import_specifications + SpecificationSource, + parse_import_specifications, ) +from tests.test_helpers import assert_exception_correct + +TEST_EXCEL_FILE = "myfile.xlsx" + +# This file does not exist; it is used for a mock and appears in many locations. +TEST_MOCK_CSV_FILE_NAME = "somefile.csv" + +# some test functions include "SpecificationSource" in the name, which violates +# snake_case rules, but is meaningful in this context. 
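# For orientation while reading the tests below, a hedged sketch of the shape of
# a successful parse, built with the same dataclasses the parser returns
# (assumes the staging_service package is importable); the file name, tab and
# row values are hypothetical.
from pathlib import Path

from frozendict import frozendict

from staging_service.import_specifications.file_parser import (
    ParseResult,
    ParseResults,
    SpecificationSource,
)

EXAMPLE_RESULTS = ParseResults(
    frozendict(
        {
            "genome": ParseResult(
                SpecificationSource(Path("import_spec.xlsx"), "genome"),
                # one frozendict per data row, always wrapped in a tuple
                (frozendict({"fasta_file": "foo.fa", "do_thing": 1}),),
            )
        }
    )
)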
+# pylint: disable=C0103 +# pylint: disable=C0116 -def spcsrc(path: str, tab: O[str]=None): +def specification_source(path: str, tab: Optional[str] = None): return SpecificationSource(Path(path), tab) -def test_SpecificationSource_init_success(): +def test_specification_source_init_success(): # minimal ss = SpecificationSource(Path("foo")) @@ -38,97 +49,110 @@ def test_SpecificationSource_init_success(): assert ss.tab == "tabbytab" -def test_SpecificationSource_init_fail(): +def test_specification_source_init_fail(): # could inline this, but might as well follow the same pattern as all the other tests - specificationSource_init_fail(None, ValueError("file is required")) + assert_specification_source_init_fail(None, ValueError("file is required")) -def specificationSource_init_fail(file_: O[str], expected: Exception): +def assert_specification_source_init_fail(file: Optional[str], expected: Exception): with raises(Exception) as got: - SpecificationSource(file_) + SpecificationSource(file) assert_exception_correct(got.value, expected) -def test_FileTypeResolution_init_w_parser_success(): - p = lambda path: ParseResults(errors=(Error(ErrorType.OTHER, "foo"),)) - ftr = FileTypeResolution(p) +def test_file_type_resolution_init_w_parser_success(): + def parser(_: str): + return ParseResults(errors=(Error(ErrorType.OTHER, "foo"),)) - assert ftr.parser is p # Here only identity equality makes sense + ftr = FileTypeResolution(parser) + + assert ftr.parser is parser # Here only identity equality makes sense assert ftr.unsupported_type is None -def test_FileTypeResolution_init_w_unsupported_type_success(): +def test_file_type_resolution_init_w_unsupported_type_success(): ftr = FileTypeResolution(unsupported_type="sys") assert ftr.parser is None assert ftr.unsupported_type == "sys" -def test_FileTypeResolution_init_fail(): - err = "Exectly one of parser or unsupported_type must be supplied" +def test_file_type_resolution_init_fail(): + err = "Exactly one of parser or unsupported_type must be supplied" pr = ParseResults(errors=(Error(ErrorType.OTHER, "foo"),)) - fileTypeResolution_init_fail(None, None, ValueError(err)) - fileTypeResolution_init_fail(lambda path: pr, "mp-2", ValueError(err)) + assert_file_type_resolution_init_fail(None, None, ValueError(err)) + assert_file_type_resolution_init_fail(lambda path: pr, "mp-2", ValueError(err)) -def fileTypeResolution_init_fail( - parser: O[Callable[[Path], ParseResults]], - unexpected_type: O[str], - expected: Exception +def assert_file_type_resolution_init_fail( + parser: Optional[Callable[[Path], ParseResults]], + unexpected_type: Optional[str], + expected: Exception, ): with raises(Exception) as got: FileTypeResolution(parser, unexpected_type) assert_exception_correct(got.value, expected) -def test_Error_init_w_FILE_NOT_FOUND_success(): +def test_error_init_w_file_not_found_success(): # minimal - e = Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("foo")) + e = Error(ErrorType.FILE_NOT_FOUND, source_1=specification_source("foo")) assert e.error == ErrorType.FILE_NOT_FOUND assert e.message is None - assert e.source_1 == spcsrc("foo") + assert e.source_1 == specification_source("foo") assert e.source_2 is None # all - e = Error(ErrorType.FILE_NOT_FOUND, message="bar", source_1=spcsrc("foo")) + e = Error( + ErrorType.FILE_NOT_FOUND, message="bar", source_1=specification_source("foo") + ) assert e.error == ErrorType.FILE_NOT_FOUND assert e.message == "bar" - assert e.source_1 == spcsrc("foo") + assert e.source_1 == specification_source("foo") assert 
e.source_2 is None -def test_Error_init_w_PARSE_FAIL_success(): - e = Error(ErrorType.PARSE_FAIL, message="foo", source_1=spcsrc("foo2")) +def test_error_init_w_parse_fail_success(): + e = Error( + ErrorType.PARSE_FAIL, message="foo", source_1=specification_source("foo2") + ) assert e.error == ErrorType.PARSE_FAIL assert e.message == "foo" - assert e.source_1 == spcsrc("foo2") + assert e.source_1 == specification_source("foo2") assert e.source_2 is None -def test_Error_init_w_INCORRECT_COLUMN_COUNT_success(): - e = Error(ErrorType.INCORRECT_COLUMN_COUNT, message="42", source_1=spcsrc("somefile")) +def test_error_init_w_incorrect_column_count_success(): + e = Error( + ErrorType.INCORRECT_COLUMN_COUNT, + message="42", + source_1=specification_source("some_other_file"), + ) assert e.error == ErrorType.INCORRECT_COLUMN_COUNT assert e.message == "42" - assert e.source_1 == spcsrc("somefile") + assert e.source_1 == specification_source("some_other_file") assert e.source_2 is None -def test_Error_init_w_MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE_success(): +def test_error_init_w_multiple_specifications_for_data_type_success(): e = Error( - ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, "foo", spcsrc("foo2"), spcsrc("yay") + ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, + "foo", + specification_source("foo2"), + specification_source("yay"), ) assert e.error == ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE assert e.message == "foo" - assert e.source_1 == spcsrc("foo2") - assert e.source_2 == spcsrc("yay") + assert e.source_1 == specification_source("foo2") + assert e.source_2 == specification_source("yay") -def test_Error_init_w_NO_FILES_PROVIDED_success(): +def test_error_init_w_no_files_provided_success(): e = Error(ErrorType.NO_FILES_PROVIDED) assert e.error == ErrorType.NO_FILES_PROVIDED @@ -137,7 +161,7 @@ def test_Error_init_w_NO_FILES_PROVIDED_success(): assert e.source_2 is None -def test_Error_init_w_OTHER_success(): +def test_error_init_w_other_success(): # minimal e = Error(ErrorType.OTHER, message="foo") @@ -147,82 +171,123 @@ def test_Error_init_w_OTHER_success(): assert e.source_2 is None # all - e = Error(ErrorType.OTHER, message="foo", source_1=spcsrc("wooo")) + e = Error(ErrorType.OTHER, message="foo", source_1=specification_source("wooo")) assert e.error == ErrorType.OTHER assert e.message == "foo" - assert e.source_1 == spcsrc("wooo") + assert e.source_1 == specification_source("wooo") assert e.source_2 is None -def test_Error_init_fail(): +def test_error_init_fail(): # arguments are error type, message string, 1st source, 2nd source, exception error_init_fail(None, None, None, None, ValueError("error is required")) - error_init_fail(ErrorType.FILE_NOT_FOUND, None, None, None, ValueError( - "source_1 is required for a FILE_NOT_FOUND error")) + error_init_fail( + ErrorType.FILE_NOT_FOUND, + None, + None, + None, + ValueError("source_1 is required for a FILE_NOT_FOUND error"), + ) err = "message, source_1 is required for a PARSE_FAIL error" - error_init_fail(ErrorType.PARSE_FAIL, None, spcsrc("wooo"), None, ValueError(err)) + error_init_fail( + ErrorType.PARSE_FAIL, None, specification_source("wooo"), None, ValueError(err) + ) error_init_fail(ErrorType.PARSE_FAIL, "msg", None, None, ValueError(err)) err = "message, source_1 is required for a INCORRECT_COLUMN_COUNT error" - error_init_fail(ErrorType.INCORRECT_COLUMN_COUNT, None, spcsrc("whee"), None, ValueError(err)) - error_init_fail(ErrorType.INCORRECT_COLUMN_COUNT, "msg", None, None, ValueError(err)) + error_init_fail( + 
ErrorType.INCORRECT_COLUMN_COUNT, + None, + specification_source("whee"), + None, + ValueError(err), + ) + error_init_fail( + ErrorType.INCORRECT_COLUMN_COUNT, "msg", None, None, ValueError(err) + ) ms = ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE - err = ("message, source_1, source_2 is required for a " - + "MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE error") + err = ( + "message, source_1, source_2 is required for a " + + "MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE error" + ) error_init_fail(ms, None, None, None, ValueError(err)) - error_init_fail(ms, None, spcsrc("foo"), spcsrc("bar"), ValueError(err)) - error_init_fail(ms, "msg", None, spcsrc("bar"), ValueError(err)) - error_init_fail(ms, "msg", spcsrc("foo"), None, ValueError(err)) - error_init_fail(ErrorType.OTHER, None, None, None, ValueError( - "message is required for a OTHER error")) + error_init_fail( + ms, + None, + specification_source("foo"), + specification_source("bar"), + ValueError(err), + ) + error_init_fail(ms, "msg", None, specification_source("bar"), ValueError(err)) + error_init_fail(ms, "msg", specification_source("foo"), None, ValueError(err)) + error_init_fail( + ErrorType.OTHER, + None, + None, + None, + ValueError("message is required for a OTHER error"), + ) def error_init_fail( - errortype: O[ErrorType], - message: O[str], - source_1: O[SpecificationSource], - source_2: O[SpecificationSource], - expected: Exception + errortype: Optional[ErrorType], + message: Optional[str], + source_1: Optional[SpecificationSource], + source_2: Optional[SpecificationSource], + expected: Exception, ): with raises(Exception) as got: Error(errortype, message, source_1, source_2) assert_exception_correct(got.value, expected) -def test_ParseResult_init_success(): - pr = ParseResult(spcsrc("bar"), (frozendict({"foo": "bar"}),)) +def test_parse_result_init_success(): + pr = ParseResult(specification_source("bar"), (frozendict({"foo": "bar"}),)) - assert pr.source == spcsrc("bar") + assert pr.source == specification_source("bar") assert pr.result == (frozendict({"foo": "bar"}),) -def test_ParseResult_init_fail(): - parseResult_init_fail(None, None, ValueError("source is required")) - parseResult_init_fail(None, (frozendict({"foo": "bar"}),), ValueError("source is required")) - parseResult_init_fail(spcsrc("foo"), None, ValueError("result is required")) +def test_parse_result_init_fail(): + assert_parse_result_init_fail(None, None, ValueError("source is required")) + assert_parse_result_init_fail( + None, (frozendict({"foo": "bar"}),), ValueError("source is required") + ) + assert_parse_result_init_fail( + specification_source("foo"), None, ValueError("result is required") + ) -def parseResult_init_fail( - source: O[SpecificationSource], - result: O[tuple[frozendict[str, PRIMITIVE_TYPE], ...]], - expected: Exception +def assert_parse_result_init_fail( + source: Optional[SpecificationSource], + result: Optional[tuple[frozendict[str, PRIMITIVE_TYPE], ...]], + expected: Exception, ): with raises(Exception) as got: ParseResult(source, result) assert_exception_correct(got.value, expected) -PR_RESULTS = frozendict({"data_type": ParseResult( - spcsrc("some_file", "tab"), - (frozendict({"fasta_file": "foo.fa", "do_thing": 1}),) # make a tuple! -)}) +PR_RESULTS = frozendict( + { + "data_type": ParseResult( + specification_source("some_file", "tab"), + (frozendict({"fasta_file": "foo.fa", "do_thing": 1}),), # make a tuple! 
+ ) + } +) PR_ERROR = ( Error(ErrorType.OTHER, message="foo"), - Error(ErrorType.PARSE_FAIL, message="bar", source_1=spcsrc("some_file", "tab3")) + Error( + ErrorType.PARSE_FAIL, + message="bar", + source_1=specification_source("some_file", "tab3"), + ), ) -def test_ParseResults_init_w_results_success(): + +def test_parse_results_init_w_results_success(): results_copy = frozendict(PR_RESULTS) # prevent identity equality pr = ParseResults(PR_RESULTS) @@ -232,8 +297,8 @@ def test_ParseResults_init_w_results_success(): assert pr == ParseResults(results_copy) -def test_ParseResults_init_w_error_success(): - errors_copy = tuple(PR_ERROR) # prevent identity equality +def test_parse_results_init_w_error_success(): + errors_copy = tuple(PR_ERROR) # prevent identity equality pr = ParseResults(errors=PR_ERROR) assert pr.results is None @@ -242,25 +307,28 @@ def test_ParseResults_init_w_error_success(): assert pr == ParseResults(errors=errors_copy) -def test_ParseResults_init_fail(): - err = 'Exectly one of results or errors must be supplied' - parseResults_init_fail(None, None, ValueError(err)) - parseResults_init_fail(PR_RESULTS, PR_ERROR, ValueError(err)) +def test_parse_results_init_fail(): + err = "Exactly one of results or errors must be supplied" + assert_parse_results_init_fail(None, None, ValueError(err)) + assert_parse_results_init_fail(PR_RESULTS, PR_ERROR, ValueError(err)) -def parseResults_init_fail( - results: O[frozendict[str, ParseResult]], - errors: O[tuple[Error, ...]], - expected: Exception +def assert_parse_results_init_fail( + results: Optional[frozendict[str, ParseResult]], + errors: Optional[tuple[Error, ...]], + expected: ValueError, ): with raises(Exception) as got: ParseResults(results, errors) assert_exception_correct(got.value, expected) -def _ftr(parser: Callable[[Path], ParseResults] = None, notype: str=None) -> FileTypeResolution: +def _ftr( + parser: Callable[[Path], ParseResults] = None, notype: str = None +) -> FileTypeResolution: return FileTypeResolution(parser, notype) + def _get_mocks(count: int) -> tuple[Mock, ...]: return (Mock() for _ in range(count)) @@ -270,48 +338,60 @@ def test_parse_import_specifications_success(): resolver.side_effect = [_ftr(parser1), _ftr(parser2)] - parser1.return_value = ParseResults(frozendict( - {"type1": ParseResult( - spcsrc("myfile.xlsx", "tab1"), - (frozendict({"foo": "bar"}), frozendict({"baz": "bat"})) - ), - "type2": ParseResult( - spcsrc("myfile.xlsx", "tab2"), - (frozendict({"whee": "whoo"}),) # tuple! - ) - } - )) - - parser2.return_value = ParseResults(frozendict( - {"type_other": ParseResult( - spcsrc("somefile.csv"), - (frozendict({"foo": "bar2"}), frozendict({"baz": "bat2"})) - ) - } - )) + parser1.return_value = ParseResults( + frozendict( + { + "type1": ParseResult( + specification_source(TEST_EXCEL_FILE, "tab1"), + (frozendict({"foo": "bar"}), frozendict({"baz": "bat"})), + ), + "type2": ParseResult( + specification_source(TEST_EXCEL_FILE, "tab2"), + (frozendict({"whee": "whoo"}),), # tuple! 
+ ), + } + ) + ) + + parser2.return_value = ParseResults( + frozendict( + { + "type_other": ParseResult( + specification_source(TEST_MOCK_CSV_FILE_NAME), + (frozendict({"foo": "bar2"}), frozendict({"baz": "bat2"})), + ) + } + ) + ) res = parse_import_specifications( - (Path("myfile.xlsx"), Path("somefile.csv")), resolver, logger + (Path(TEST_EXCEL_FILE), Path(TEST_MOCK_CSV_FILE_NAME)), resolver, logger ) - assert res == ParseResults(frozendict({ - "type1": ParseResult( - spcsrc("myfile.xlsx", "tab1"), - (frozendict({"foo": "bar"}), frozendict({"baz": "bat"})) - ), - "type2": ParseResult( - spcsrc("myfile.xlsx", "tab2"), - (frozendict({"whee": "whoo"}),) # tuple! - ), - "type_other": ParseResult( - spcsrc("somefile.csv"), - (frozendict({"foo": "bar2"}), frozendict({"baz": "bat2"})) - ) - })) - - resolver.assert_has_calls([call(Path("myfile.xlsx")), call(Path("somefile.csv"))]) - parser1.assert_called_once_with(Path("myfile.xlsx")) - parser2.assert_called_once_with(Path("somefile.csv")) + assert res == ParseResults( + frozendict( + { + "type1": ParseResult( + specification_source(TEST_EXCEL_FILE, "tab1"), + (frozendict({"foo": "bar"}), frozendict({"baz": "bat"})), + ), + "type2": ParseResult( + specification_source(TEST_EXCEL_FILE, "tab2"), + (frozendict({"whee": "whoo"}),), # tuple! + ), + "type_other": ParseResult( + specification_source(TEST_MOCK_CSV_FILE_NAME), + (frozendict({"foo": "bar2"}), frozendict({"baz": "bat2"})), + ), + } + ) + ) + + resolver.assert_has_calls( + [call(Path(TEST_EXCEL_FILE)), call(Path(TEST_MOCK_CSV_FILE_NAME))] + ) + parser1.assert_called_once_with(Path(TEST_EXCEL_FILE)) + parser2.assert_called_once_with(Path(TEST_MOCK_CSV_FILE_NAME)) logger.assert_not_called() @@ -330,16 +410,18 @@ def test_parse_import_specification_resolver_exception(): resolver.side_effect = [_ftr(parser1), ArithmeticError("crapsticks")] # test that other errors aren't included in the result - parser1.return_value = ParseResults(errors=tuple([Error(ErrorType.OTHER, 'foo')])) + parser1.return_value = ParseResults(errors=tuple([Error(ErrorType.OTHER, "foo")])) res = parse_import_specifications( - (Path("myfile.xlsx"), Path("somefile.csv")), resolver, logger + (Path(TEST_EXCEL_FILE), Path(TEST_MOCK_CSV_FILE_NAME)), resolver, logger ) assert res == ParseResults(errors=tuple([Error(ErrorType.OTHER, "crapsticks")])) - resolver.assert_has_calls([call(Path("myfile.xlsx")), call(Path("somefile.csv"))]) - parser1.assert_called_once_with(Path("myfile.xlsx")) + resolver.assert_has_calls( + [call(Path(TEST_EXCEL_FILE)), call(Path(TEST_MOCK_CSV_FILE_NAME))] + ) + parser1.assert_called_once_with(Path(TEST_EXCEL_FILE)) # In [1]: ArithmeticError("a") == ArithmeticError("a") # Out[1]: False # so assert_called_once_with doesn't work @@ -360,34 +442,57 @@ def test_parse_import_specification_unsupported_type_and_parser_error(): resolver.side_effect = [_ftr(parser1), _ftr(parser2), _ftr(notype="JPEG")] # check that other errors are also returned, and the results are ignored - parser1.return_value = ParseResults(errors=tuple([ - Error(ErrorType.OTHER, 'foo'), - Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("foo.csv")) - ])) + parser1.return_value = ParseResults( + errors=tuple( + [ + Error(ErrorType.OTHER, "foo"), + Error( + ErrorType.FILE_NOT_FOUND, source_1=specification_source("foo.csv") + ), + ] + ) + ) parser2.return_value = ParseResults( - frozendict({"foo": ParseResult(spcsrc("a"), tuple([frozendict({"a": "b"})]))})) + frozendict( + { + "foo": ParseResult( + specification_source("a"), 
tuple([frozendict({"a": "b"})]) + ) + } + ) + ) res = parse_import_specifications( - (Path("myfile.xlsx"), Path("somefile.csv"), Path("x.jpeg")), resolver, logger + (Path(TEST_EXCEL_FILE), Path(TEST_MOCK_CSV_FILE_NAME), Path("x.jpeg")), + resolver, + logger, ) - assert res == ParseResults(errors=tuple([ - Error(ErrorType.OTHER, "foo"), - Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("foo.csv")), - Error( - ErrorType.PARSE_FAIL, - "JPEG is not a supported file type for import specifications", - spcsrc(Path("x.jpeg")) + assert res == ParseResults( + errors=tuple( + [ + Error(ErrorType.OTHER, "foo"), + Error( + ErrorType.FILE_NOT_FOUND, source_1=specification_source("foo.csv") + ), + Error( + ErrorType.PARSE_FAIL, + "JPEG is not a supported file type for import specifications", + specification_source("x.jpeg"), + ), + ] ) - ])) - - resolver.assert_has_calls([ - call(Path("myfile.xlsx")), - call(Path("somefile.csv")), - call(Path("x.jpeg")), - ]) - parser1.assert_called_once_with(Path("myfile.xlsx")) - parser2.assert_called_once_with(Path("somefile.csv")) + ) + + resolver.assert_has_calls( + [ + call(Path(TEST_EXCEL_FILE)), + call(Path(TEST_MOCK_CSV_FILE_NAME)), + call(Path("x.jpeg")), + ] + ) + parser1.assert_called_once_with(Path(TEST_EXCEL_FILE)) + parser2.assert_called_once_with(Path(TEST_MOCK_CSV_FILE_NAME)) logger.assert_not_called() @@ -406,50 +511,86 @@ def test_parse_import_specification_multiple_specs_and_parser_error(): resolver.side_effect = [_ftr(parser1), _ftr(parser2), _ftr(parser3)] # check that other errors are also returned, and the results are ignored - parser1.return_value = ParseResults(errors=tuple([ - Error(ErrorType.OTHER, "other"), - Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("myfile.xlsx")) - ])) - parser2.return_value = ParseResults(frozendict( - {"foo": ParseResult(spcsrc("a1"), tuple([frozendict({"a": "b"})])), - "bar": ParseResult(spcsrc("b1"), tuple([frozendict({"a": "b"})])), - "baz": ParseResult(spcsrc("c1"), tuple([frozendict({"a": "b"})])) - }, - )) - parser3.return_value = ParseResults(frozendict( - {"foo2": ParseResult(spcsrc("a2"), tuple([frozendict({"a": "b"})])), - "bar": ParseResult(spcsrc("b2"), tuple([frozendict({"a": "b"})])), - "baz": ParseResult(spcsrc("c2"), tuple([frozendict({"a": "b"})])) - }, - )) + parser1.return_value = ParseResults( + errors=tuple( + [ + Error(ErrorType.OTHER, "other"), + Error( + ErrorType.FILE_NOT_FOUND, + source_1=specification_source(TEST_EXCEL_FILE), + ), + ] + ) + ) + parser2.return_value = ParseResults( + frozendict( + { + "foo": ParseResult( + specification_source("a1"), tuple([frozendict({"a": "b"})]) + ), + "bar": ParseResult( + specification_source("b1"), tuple([frozendict({"a": "b"})]) + ), + "baz": ParseResult( + specification_source("c1"), tuple([frozendict({"a": "b"})]) + ), + }, + ) + ) + parser3.return_value = ParseResults( + frozendict( + { + "foo2": ParseResult( + specification_source("a2"), tuple([frozendict({"a": "b"})]) + ), + "bar": ParseResult( + specification_source("b2"), tuple([frozendict({"a": "b"})]) + ), + "baz": ParseResult( + specification_source("c2"), tuple([frozendict({"a": "b"})]) + ), + }, + ) + ) res = parse_import_specifications( - (Path("myfile.xlsx"), Path("somefile.csv"), Path("x.tsv")), resolver, logger + (Path(TEST_EXCEL_FILE), Path(TEST_MOCK_CSV_FILE_NAME), Path("x.tsv")), + resolver, + logger, ) - assert res == ParseResults(errors=tuple([ - Error(ErrorType.OTHER, "other"), - Error(ErrorType.FILE_NOT_FOUND, source_1=spcsrc("myfile.xlsx")), - Error( - 
ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, - "Data type bar appears in two importer specification sources", - spcsrc("b1"), - spcsrc("b2") - ), - Error( - ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, - "Data type baz appears in two importer specification sources", - spcsrc("c1"), - spcsrc("c2") + assert res == ParseResults( + errors=tuple( + [ + Error(ErrorType.OTHER, "other"), + Error( + ErrorType.FILE_NOT_FOUND, + source_1=specification_source(TEST_EXCEL_FILE), + ), + Error( + ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, + "Data type bar appears in two importer specification sources", + specification_source("b1"), + specification_source("b2"), + ), + Error( + ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, + "Data type baz appears in two importer specification sources", + specification_source("c1"), + specification_source("c2"), + ), + ] ) - ])) - - resolver.assert_has_calls([ - call(Path("myfile.xlsx")), - call(Path("somefile.csv")), - call(Path("x.tsv")), - ]) - parser1.assert_called_once_with(Path("myfile.xlsx")) - parser2.assert_called_once_with(Path("somefile.csv")) + ) + + resolver.assert_has_calls( + [ + call(Path(TEST_EXCEL_FILE)), + call(Path(TEST_MOCK_CSV_FILE_NAME)), + call(Path("x.tsv")), + ] + ) + parser1.assert_called_once_with(Path(TEST_EXCEL_FILE)) + parser2.assert_called_once_with(Path(TEST_MOCK_CSV_FILE_NAME)) parser3.assert_called_once_with(Path("x.tsv")) - logger.assert_not_called() \ No newline at end of file + logger.assert_not_called() diff --git a/tests/import_specifications/test_file_writers.py b/tests/import_specifications/test_file_writers.py index 7f2cf26f..44ee28ee 100644 --- a/tests/import_specifications/test_file_writers.py +++ b/tests/import_specifications/test_file_writers.py @@ -1,27 +1,26 @@ import os import uuid - -import openpyxl - from collections.abc import Generator from pathlib import Path -from pytest import raises, fixture -from tests.test_app import FileUtil +import openpyxl +from pytest import fixture, raises from staging_service.import_specifications.file_writers import ( + ImportSpecWriteException, write_csv, - write_tsv, write_excel, - ImportSpecWriteException, + write_tsv, ) - -from tests.test_utils import ( +from tests.test_helpers import ( + FileUtil, assert_exception_correct, - check_file_contents, + assert_file_contents, check_excel_contents, ) +# pylint: disable=C0116 + @fixture(scope="module") def temp_dir() -> Generator[Path, None, None]: @@ -43,7 +42,7 @@ def temp_dir() -> Generator[Path, None, None]: "data": [ {"id3": "comma,comma", "id1": "yay!\ttab", "id2": 42}, {"id3": 56.78, "id1": "boo!", "id2": None}, - ] + ], }, "type2": { "order_and_display": [ @@ -52,7 +51,7 @@ def temp_dir() -> Generator[Path, None, None]: "data": [ {"id1": "foo"}, {"id1": 0}, - ] + ], }, "type3": { "order_and_display": [ @@ -60,8 +59,8 @@ def temp_dir() -> Generator[Path, None, None]: ("tab\tid", "xsv is only"), ("comma,id", "a template"), ], - "data": [] - } + "data": [], + }, } @@ -72,7 +71,7 @@ def test_write_csv(temp_dir: Path): "type2": "type2.csv", "type3": "type3.csv", } - check_file_contents( + assert_file_contents( temp_dir / "type1.csv", [ "Data type: type1; Columns: 3; Version: 1\n", @@ -80,9 +79,9 @@ def test_write_csv(temp_dir: Path): '"thing,with,comma",other,thing\twith\ttabs\n', 'yay!\ttab,42,"comma,comma"\n', "boo!,,56.78\n", - ] + ], ) - check_file_contents( + assert_file_contents( temp_dir / "type2.csv", [ "Data type: type2; Columns: 1; Version: 1\n", @@ -90,15 +89,15 @@ def test_write_csv(temp_dir: Path): "oh no I only 
have one column\n", "foo\n", "0\n", - ] + ], ) - check_file_contents( + assert_file_contents( temp_dir / "type3.csv", [ "Data type: type3; Columns: 3; Version: 1\n", 'some_id,tab\tid,"comma,id"\n', "hey this,xsv is only,a template\n", - ] + ], ) @@ -109,7 +108,7 @@ def test_write_tsv(temp_dir: Path): "type2": "type2.tsv", "type3": "type3.tsv", } - check_file_contents( + assert_file_contents( temp_dir / "type1.tsv", [ "Data type: type1; Columns: 3; Version: 1\n", @@ -117,9 +116,9 @@ def test_write_tsv(temp_dir: Path): 'thing,with,comma\tother\t"thing\twith\ttabs"\n', '"yay!\ttab"\t42\tcomma,comma\n', "boo!\t\t56.78\n", - ] + ], ) - check_file_contents( + assert_file_contents( temp_dir / "type2.tsv", [ "Data type: type2; Columns: 1; Version: 1\n", @@ -127,15 +126,15 @@ def test_write_tsv(temp_dir: Path): "oh no I only have one column\n", "foo\n", "0\n", - ] + ], ) - check_file_contents( + assert_file_contents( temp_dir / "type3.tsv", [ "Data type: type3; Columns: 3; Version: 1\n", 'some_id\t"tab\tid"\tcomma,id\n', "hey this\txsv is only\ta template\n", - ] + ], ) @@ -179,7 +178,7 @@ def test_write_excel(temp_dir: Path): "type3", [ ["Data type: type3; Columns: 3; Version: 1", None, None], - ["some_id","tab\tid","comma,id"], + ["some_id", "tab\tid", "comma,id"], ["hey this", "xsv is only", "a template"], ], [8.0, 11.0, 10.0], @@ -193,84 +192,140 @@ def test_file_writers_fail(): file_writers_fail(p, None, E("The types value must be a mapping")) file_writers_fail(p, {}, E("At least one data type must be specified")) file_writers_fail( - p, {None: 1}, E("A data type cannot be a non-string or a whitespace only string")) + p, + {None: 1}, + E("A data type cannot be a non-string or a whitespace only string"), + ) file_writers_fail( p, {" \t ": 1}, - E("A data type cannot be a non-string or a whitespace only string")) + E("A data type cannot be a non-string or a whitespace only string"), + ) file_writers_fail(p, {"t": []}, E("The value for data type t must be a mapping")) file_writers_fail(p, {"t": 1}, E("The value for data type t must be a mapping")) file_writers_fail(p, {"t": {}}, E("Data type t missing order_and_display key")) file_writers_fail( p, {"t": {"order_and_display": {}, "data": []}}, - E("Data type t order_and_display value is not a list")) + E("Data type t order_and_display value is not a list"), + ) file_writers_fail( p, {"t": {"order_and_display": [], "data": []}}, - E("At least one entry is required for order_and_display for type t")) + E("At least one entry is required for order_and_display for type t"), + ) file_writers_fail( p, {"t": {"order_and_display": [["foo", "bar"]]}}, - E("Data type t missing data key")) + E("Data type t missing data key"), + ) file_writers_fail( p, {"t": {"order_and_display": [["foo", "bar"]], "data": "foo"}}, - E("Data type t data value is not a list")) + E("Data type t data value is not a list"), + ) file_writers_fail( p, - {"t": {"order_and_display": [["foo", "bar"], "baz"] , "data": []}}, - E("Invalid order_and_display entry for datatype t at index 1 - " - + "the entry is not a list")) + {"t": {"order_and_display": [["foo", "bar"], "baz"], "data": []}}, + E( + "Invalid order_and_display entry for datatype t at index 1 - " + + "the entry is not a list" + ), + ) file_writers_fail( p, - {"t": {"order_and_display": [("foo", "bar"), ["whee", "whoo"], ["baz"]] , "data": []}}, - E("Invalid order_and_display entry for datatype t at index 2 - " - + "expected 2 item list")) + { + "t": { + "order_and_display": [("foo", "bar"), ["whee", "whoo"], ["baz"]], + 
"data": [], + } + }, + E( + "Invalid order_and_display entry for datatype t at index 2 - " + + "expected 2 item list" + ), + ) file_writers_fail( p, - {"t": {"order_and_display": [("foo", "bar", "whee"), ["whee", "whoo"]] , "data": []}}, - E("Invalid order_and_display entry for datatype t at index 0 - " - + "expected 2 item list")) + { + "t": { + "order_and_display": [("foo", "bar", "whee"), ["whee", "whoo"]], + "data": [], + } + }, + E( + "Invalid order_and_display entry for datatype t at index 0 - " + + "expected 2 item list" + ), + ) for parm in [None, " \t ", 1]: file_writers_fail( p, {"t": {"order_and_display": [(parm, "foo"), ["whee", "whoo"]], "data": []}}, - E("Invalid order_and_display entry for datatype t at index 0 - " - + "parameter ID cannot be a non-string or a whitespace only string")) + E( + "Invalid order_and_display entry for datatype t at index 0 - " + + "parameter ID cannot be a non-string or a whitespace only string" + ), + ) file_writers_fail( p, {"t": {"order_and_display": [("bar", "foo"), ["whee", parm]], "data": []}}, - E("Invalid order_and_display entry for datatype t at index 1 - " - + "parameter display name cannot be a non-string or a whitespace only string")) + E( + "Invalid order_and_display entry for datatype t at index 1 - " + + "parameter display name cannot be a non-string or a whitespace only string" + ), + ) file_writers_fail( p, - {"t": {"order_and_display": [("bar", "foo"), ["whee", "whoo"]], "data": ["foo"]}}, - E("Data type t data row 0 is not a mapping")) + { + "t": { + "order_and_display": [("bar", "foo"), ["whee", "whoo"]], + "data": ["foo"], + } + }, + E("Data type t data row 0 is not a mapping"), + ) file_writers_fail( p, {"t": {"order_and_display": [("bar", "foo")], "data": [{"bar": 1}, []]}}, - E("Data type t data row 1 is not a mapping")) + E("Data type t data row 1 is not a mapping"), + ) file_writers_fail( p, - {"t": {"order_and_display": [("foo", "bar"), ["whee", "whoo"]], - "data": [{"foo": 1, "whee": 2}, {"foo": 2}]}}, - E("Data type t data row 1 does not have the same keys as order_and_display")) + { + "t": { + "order_and_display": [("foo", "bar"), ["whee", "whoo"]], + "data": [{"foo": 1, "whee": 2}, {"foo": 2}], + } + }, + E("Data type t data row 1 does not have the same keys as order_and_display"), + ) file_writers_fail( p, - {"ty": {"order_and_display": [("foo", "bar"), ["whee", "whoo"]], - "data": [{"foo": 2, "whee": 3, 5: 4}, {"foo": 1, "whee": 2}]}}, - E("Data type ty data row 0 does not have the same keys as order_and_display")) + { + "ty": { + "order_and_display": [("foo", "bar"), ["whee", "whoo"]], + "data": [{"foo": 2, "whee": 3, 5: 4}, {"foo": 1, "whee": 2}], + } + }, + E("Data type ty data row 0 does not have the same keys as order_and_display"), + ) file_writers_fail( p, - {"ty": {"order_and_display": [("foo", "bar"), ["whee", "whoo"]], - "data": [{"foo": 2, "whee": 3}, {"foo": 1, "whee": []}]}}, - E("Data type ty data row 1's value for parameter whee " - + "is not a number or a string")) - + { + "ty": { + "order_and_display": [("foo", "bar"), ["whee", "whoo"]], + "data": [{"foo": 2, "whee": 3}, {"foo": 1, "whee": []}], + } + }, + E( + "Data type ty data row 1's value for parameter whee " + + "is not a number or a string" + ), + ) -def file_writers_fail(path: Path, types: dict, expected: Exception): +def file_writers_fail(path: Path | None, types: dict | None, expected: Exception): with raises(Exception) as got: write_csv(path, types) assert_exception_correct(got.value, expected) diff --git 
a/tests/import_specifications/test_individual_parsers.py b/tests/import_specifications/test_individual_parsers.py index fe73a309..12df6c84 100644 --- a/tests/import_specifications/test_individual_parsers.py +++ b/tests/import_specifications/test_individual_parsers.py @@ -1,31 +1,31 @@ import os import uuid - from collections.abc import Callable, Generator -# TODO update to C impl when fixed: https://github.com/Marco-Sulla/python-frozendict/issues/26 -from frozendict.core import frozendict from pathlib import Path -from pytest import fixture +from frozendict import frozendict +from pytest import fixture from staging_service.import_specifications.individual_parsers import ( - parse_csv, - parse_tsv, - parse_excel, - ErrorType, Error, - SpecificationSource, + ErrorType, ParseResult, - ParseResults + ParseResults, + SpecificationSource, + parse_csv, + parse_excel, + parse_tsv, ) - -from tests.test_app import FileUtil +from tests.test_helpers import FileUtil _TEST_DATA_DIR = (Path(__file__).parent / "test_data").resolve() +# Most test functions don't have docstrings, and probably never will. +# pylint: disable=C0116 -@fixture(scope="module") -def temp_dir() -> Generator[Path, None, None]: + +@fixture(scope="module", name="temp_dir") +def temp_dir_fixture() -> Generator[Path, None, None]: with FileUtil() as fu: childdir = Path(fu.make_dir(str(uuid.uuid4()))).resolve() @@ -33,45 +33,85 @@ def temp_dir() -> Generator[Path, None, None]: # FileUtil will auto delete after exiting + ########################################## # xSV tests ########################################## def test_xsv_parse_success(temp_dir: Path): - # Tests a special case where if an xSV file is opened by Excel and then resaved, + # Tests a special case where if an xSV file is opened by Excel and then resaved, # Excel will add separators to the end of the 1st header line. This previously caused # the parse to fail. 
- _xsv_parse_success(temp_dir, ',', parse_csv) - _xsv_parse_success(temp_dir, '\t', parse_tsv) + _xsv_parse_success(temp_dir, ",", parse_csv) + _xsv_parse_success(temp_dir, "\t", parse_tsv) -def _xsv_parse_success(temp_dir: Path, sep: str, parser: Callable[[Path], ParseResults]): +def _xsv_parse_success( + temp_dir: Path, sep: str, parser: Callable[[Path], ParseResults] +): s = sep input_ = temp_dir / str(uuid.uuid4()) - with open(input_, "w") as test_file: - test_file.writelines([ - f"Data type: some_type; Columns: 4; Version: 1{s}{s}{s}\n", - f"spec1{s} spec2{s} spec3 {s} spec4\n", # test trimming - f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", - f"val1 {s} val2 {s} 7 {s} 3.2\n", # test trimming - f"val3 {s} val4{s} 1{s} 8.9\n", - f"val5 {s}{s}{s} 42.42\n", # test missing values w/o whitespace - f"val6 {s} {s} {s} 3.14\n" # test missing values w/ whitespace - ]) + with open(input_, "w", encoding="utf-8") as test_file: + test_file.writelines( + [ + f"Data type: some_type; Columns: 4; Version: 1{s}{s}{s}\n", + f"spec1{s} spec2{s} spec3 {s} spec4\n", # test trimming + f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", + f"val1 {s} val2 {s} 7 {s} 3.2\n", # test trimming + f"val3 {s} val4{s} 1{s} 8.9\n", + f"val5 {s}{s}{s} 42.42\n", # test missing values w/o whitespace + f"val6 {s} {s} {s} 3.14\n", # test missing values w/ whitespace + ] + ) res = parser(input_) - assert res == ParseResults(frozendict( - {"some_type": ParseResult(SpecificationSource(input_), - tuple([ - frozendict({"spec1": "val1", "spec2": "val2", "spec3": 7, "spec4": 3.2}), - frozendict({"spec1": "val3", "spec2": "val4", "spec3": 1, "spec4": 8.9}), - frozendict({"spec1": "val5", "spec2": None, "spec3": None, "spec4": 42.42}), - frozendict({"spec1": "val6", "spec2": None, "spec3": None, "spec4": 3.14}), - ]) - )} - )) + assert res == ParseResults( + frozendict( + { + "some_type": ParseResult( + SpecificationSource(input_), + tuple( + [ + frozendict( + { + "spec1": "val1", + "spec2": "val2", + "spec3": 7, + "spec4": 3.2, + } + ), + frozendict( + { + "spec1": "val3", + "spec2": "val4", + "spec3": 1, + "spec4": 8.9, + } + ), + frozendict( + { + "spec1": "val5", + "spec2": None, + "spec3": None, + "spec4": 42.42, + } + ), + frozendict( + { + "spec1": "val6", + "spec2": None, + "spec3": None, + "spec4": 3.14, + } + ), + ] + ), + ) + } + ) + ) def test_xsv_parse_success_nan_inf(temp_dir: Path): @@ -79,34 +119,66 @@ def test_xsv_parse_success_nan_inf(temp_dir: Path): # they will be returned from the service as barewords in JSON and will cause errors in # some parsers as the JSON spec does not support NaN and Inf (which it should but...) 
# See https://kbase-jira.atlassian.net/browse/PTV-1866 - _xsv_parse_success_nan_inf(temp_dir, ',', parse_csv) - _xsv_parse_success_nan_inf(temp_dir, '\t', parse_tsv) + _xsv_parse_success_nan_inf(temp_dir, ",", parse_csv) + _xsv_parse_success_nan_inf(temp_dir, "\t", parse_tsv) -def _xsv_parse_success_nan_inf(temp_dir: Path, sep: str, parser: Callable[[Path], ParseResults]): +def _xsv_parse_success_nan_inf( + temp_dir: Path, sep: str, parser: Callable[[Path], ParseResults] +): s = sep input_ = temp_dir / str(uuid.uuid4()) - with open(input_, "w") as test_file: - test_file.writelines([ - f"Data type: some_nan_type; Columns: 4; Version: 1{s}{s}{s}\n", - f"spec1{s} -Inf{s} nan {s} inf\n", - f"Spec 1{s} inf{s} Spec 3{s} -inf\n", - f"inf {s} val2 {s} NaN {s} 3.2\n", - f"Inf {s} val4{s} -inf{s} 8.9\n", - f"val5 {s}-Inf{s}{s} nan\n", - ]) + with open(input_, "w", encoding="utf-8") as test_file: + test_file.writelines( + [ + f"Data type: some_nan_type; Columns: 4; Version: 1{s}{s}{s}\n", + f"spec1{s} -Inf{s} nan {s} inf\n", + f"Spec 1{s} inf{s} Spec 3{s} -inf\n", + f"inf {s} val2 {s} NaN {s} 3.2\n", + f"Inf {s} val4{s} -inf{s} 8.9\n", + f"val5 {s}-Inf{s}{s} nan\n", + ] + ) res = parser(input_) - assert res == ParseResults(frozendict( - {"some_nan_type": ParseResult(SpecificationSource(input_), - tuple([ - frozendict({"spec1": "inf", "-Inf": "val2", "nan": "NaN", "inf": 3.2}), - frozendict({"spec1": "Inf", "-Inf": "val4", "nan": "-inf", "inf": 8.9}), - frozendict({"spec1": "val5", "-Inf": "-Inf", "nan": None, "inf": "nan"}), - ]) - )} - )) + assert res == ParseResults( + frozendict( + { + "some_nan_type": ParseResult( + SpecificationSource(input_), + tuple( + [ + frozendict( + { + "spec1": "inf", + "-Inf": "val2", + "nan": "NaN", + "inf": 3.2, + } + ), + frozendict( + { + "spec1": "Inf", + "-Inf": "val4", + "nan": "-inf", + "inf": 8.9, + } + ), + frozendict( + { + "spec1": "val5", + "-Inf": "-Inf", + "nan": None, + "inf": "nan", + } + ), + ] + ), + ) + } + ) + ) def test_xsv_parse_success_with_numeric_headers(temp_dir: Path): @@ -114,8 +186,8 @@ def test_xsv_parse_success_with_numeric_headers(temp_dir: Path): Not a use case we expect but good to check numeric headers don't cause an unexpected error. 
""" - _xsv_parse_success_with_numeric_headers(temp_dir, ',', parse_csv) - _xsv_parse_success_with_numeric_headers(temp_dir, '\t', parse_tsv) + _xsv_parse_success_with_numeric_headers(temp_dir, ",", parse_csv) + _xsv_parse_success_with_numeric_headers(temp_dir, "\t", parse_tsv) def _xsv_parse_success_with_numeric_headers( @@ -123,22 +195,34 @@ def _xsv_parse_success_with_numeric_headers( ): s = sep input_ = temp_dir / str(uuid.uuid4()) - with open(input_, "w") as test_file: - test_file.writelines([ - "Data type: some_type; Columns: 4; Version: 1\n", - f"1{s} 2.0{s} 3{s} 4.1\n", # test trimming - f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", - f"val3 {s} val4{s} 1{s} 8.9\n", - ]) + with open(input_, "w", encoding="utf-8") as test_file: + test_file.writelines( + [ + "Data type: some_type; Columns: 4; Version: 1\n", + f"1{s} 2.0{s} 3{s} 4.1\n", # test trimming + f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", + f"val3 {s} val4{s} 1{s} 8.9\n", + ] + ) res = parser(input_) - assert res == ParseResults(frozendict( - {"some_type": ParseResult(SpecificationSource(input_), - tuple([frozendict({"1": "val3", "2.0": "val4", "3": 1, "4.1": 8.9}), - ]) - )} - )) + assert res == ParseResults( + frozendict( + { + "some_type": ParseResult( + SpecificationSource(input_), + tuple( + [ + frozendict( + {"1": "val3", "2.0": "val4", "3": 1, "4.1": 8.9} + ), + ] + ), + ) + } + ) + ) def test_xsv_parse_success_with_internal_and_trailing_empty_lines(temp_dir: Path): @@ -146,8 +230,8 @@ def test_xsv_parse_success_with_internal_and_trailing_empty_lines(temp_dir: Path Test that leaving one or more empty lines in a csv/tsv file does not cause the parse to fail. This is easy to do accidentally and so will annoy users. """ - _xsv_parse_success_with_internal_and_trailing_empty_lines(temp_dir, ',', parse_csv) - _xsv_parse_success_with_internal_and_trailing_empty_lines(temp_dir, '\t', parse_tsv) + _xsv_parse_success_with_internal_and_trailing_empty_lines(temp_dir, ",", parse_csv) + _xsv_parse_success_with_internal_and_trailing_empty_lines(temp_dir, "\t", parse_tsv) def _xsv_parse_success_with_internal_and_trailing_empty_lines( @@ -155,60 +239,92 @@ def _xsv_parse_success_with_internal_and_trailing_empty_lines( ): s = sep input_ = temp_dir / str(uuid.uuid4()) - with open(input_, "w") as test_file: - test_file.writelines([ - "Data type: other_type; Columns: 4; Version: 1\n", - f"spec1{s} spec2{s} spec3{s} spec4\n", - f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", - f"val3 {s} val4{s} 1{s} 8.9\n", - "\n", - f"val1 {s} val2{s} 7 {s} 3.2\n", - "\n", - "\n", - "\n", - ]) + with open(input_, "w", encoding="utf-8") as test_file: + test_file.writelines( + [ + "Data type: other_type; Columns: 4; Version: 1\n", + f"spec1{s} spec2{s} spec3{s} spec4\n", + f"Spec 1{s} Spec 2{s} Spec 3{s} Spec 4\n", + f"val3 {s} val4{s} 1{s} 8.9\n", + "\n", + f"val1 {s} val2{s} 7 {s} 3.2\n", + "\n", + "\n", + "\n", + ] + ) res = parser(input_) - assert res == ParseResults(frozendict( - {"other_type": ParseResult(SpecificationSource(input_), - tuple([ - frozendict({"spec1": "val3", "spec2": "val4", "spec3": 1, "spec4": 8.9}), - frozendict({"spec1": "val1", "spec2": "val2", "spec3": 7, "spec4": 3.2}), - ]) - )} - )) + assert res == ParseResults( + frozendict( + { + "other_type": ParseResult( + SpecificationSource(input_), + tuple( + [ + frozendict( + { + "spec1": "val3", + "spec2": "val4", + "spec3": 1, + "spec4": 8.9, + } + ), + frozendict( + { + "spec1": "val1", + "spec2": "val2", + "spec3": 7, + "spec4": 3.2, + } + ), + ] + ), + ) + } + ) + ) def 
test_xsv_parse_fail_no_file(temp_dir: Path): input_ = temp_dir / str(uuid.uuid4()) - with open(input_, "w") as test_file: - test_file.writelines([ - "Data type: other_type; Columns: 4; Version: 1\n", - "spec1, spec2, spec3, spec4\n", - "Spec 1, Spec 2, Spec 3, Spec 4\n", - "val1 , val2, 7 , 3.2\n", - ]) + with open(input_, "w", encoding="utf-8") as test_file: + test_file.writelines( + [ + "Data type: other_type; Columns: 4; Version: 1\n", + "spec1, spec2, spec3, spec4\n", + "Spec 1, Spec 2, Spec 3, Spec 4\n", + "val1 , val2, 7 , 3.2\n", + ] + ) input_ = Path(str(input_)[-1]) res = parse_csv(input_) - assert res == ParseResults(errors=tuple([ - Error(ErrorType.FILE_NOT_FOUND, source_1=SpecificationSource(input_)) - ])) + assert res == ParseResults( + errors=tuple( + [Error(ErrorType.FILE_NOT_FOUND, source_1=SpecificationSource(input_))] + ) + ) -def test_xsv_parse_fail_binary_file(temp_dir: Path): +def test_xsv_parse_fail_binary_file(): test_file = _TEST_DATA_DIR / "testtabs3full2nodata1empty.xls" res = parse_csv(test_file) - assert res == ParseResults(errors=tuple([ - Error( - ErrorType.PARSE_FAIL, - "Not a text file: application/vnd.ms-excel", - source_1=SpecificationSource(test_file)) - ])) + assert res == ParseResults( + errors=tuple( + [ + Error( + ErrorType.PARSE_FAIL, + "Not a text file: application/vnd.ms-excel", + source_1=SpecificationSource(test_file), + ) + ] + ) + ) def test_xsv_parse_fail_directory(temp_dir: Path): @@ -217,9 +333,17 @@ def test_xsv_parse_fail_directory(temp_dir: Path): res = parse_tsv(test_file) - assert res == ParseResults(errors=tuple([Error( - ErrorType.PARSE_FAIL, "The given path is a directory", SpecificationSource(test_file) - )])) + assert res == ParseResults( + errors=tuple( + [ + Error( + ErrorType.PARSE_FAIL, + "The given path is a directory", + SpecificationSource(test_file), + ) + ] + ) + ) def _xsv_parse_fail( @@ -231,11 +355,13 @@ def _xsv_parse_fail( extension: str = "", ): input_ = temp_dir / (str(uuid.uuid4()) + extension) - with open(input_, "w") as test_file: + with open(input_, "w", encoding="utf-8") as test_file: test_file.writelines(lines) res = parser(input_) - expected = ParseResults(errors=tuple([Error(err_type, message, SpecificationSource(input_))])) + expected = ParseResults( + errors=tuple([Error(err_type, message, SpecificationSource(input_))]) + ) assert res == expected @@ -244,19 +370,25 @@ def test_xsv_parse_fail_empty_file(temp_dir: Path): def test_xsv_parse_fail_bad_datatype_header(temp_dir: Path): - err = ('Invalid header; got "This is the wrong header", expected "Data type: ' - + '; Columns: ; Version: "') + err = ( + 'Invalid header; got "This is the wrong header", expected "Data type: ' + + '; Columns: ; Version: "' + ) _xsv_parse_fail(temp_dir, ["This is the wrong header"], parse_csv, err) def test_xsv_parse_fail_bad_version(temp_dir: Path): err = "Schema version 87 is larger than maximum processable version 1" - _xsv_parse_fail(temp_dir, ["Data type: foo; Columns: 22; Version: 87"], parse_csv, err) + _xsv_parse_fail( + temp_dir, ["Data type: foo; Columns: 22; Version: 87"], parse_csv, err + ) def test_xsv_parse_fail_missing_column_headers(temp_dir: Path): err = "Missing 2nd header line" - _xsv_parse_fail(temp_dir, ["Data type: foo; Columns: 3; Version: 1\n"], parse_csv, err) + _xsv_parse_fail( + temp_dir, ["Data type: foo; Columns: 3; Version: 1\n"], parse_csv, err + ) err = "Missing 3rd header line" lines = ["Data type: foo; Columns: 3; Version: 1\n", "head1, head2, head3\n"] @@ -275,7 +407,7 @@ def 
test_xsv_parse_fail_missing_column_header_entries(temp_dir: Path): def test_xsv_parse_fail_missing_data(temp_dir: Path): err = "No non-header data in file" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" + "Data type: foo; Columns: 3; Version: 1\n", "head1, head2, head3\n", "Head 1, Head 2, Head 3\n", ] @@ -285,7 +417,7 @@ def test_xsv_parse_fail_missing_data(temp_dir: Path): def test_xsv_parse_fail_unequal_rows(temp_dir: Path): err = "Incorrect number of items in line 3, expected 3, got 2" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" + "Data type: foo; Columns: 3; Version: 1\n", "head1, head2, head3\n", "Head 1, Head 2\n", ] @@ -293,7 +425,7 @@ def test_xsv_parse_fail_unequal_rows(temp_dir: Path): err = "Incorrect number of items in line 3, expected 2, got 3" lines = [ - "Data type: foo; Columns: 2; Version: 1\n" + "Data type: foo; Columns: 2; Version: 1\n", "head1\thead2\n", "Head 1\tHead 2\tHead 3\n", ] @@ -301,7 +433,7 @@ def test_xsv_parse_fail_unequal_rows(temp_dir: Path): err = "Incorrect number of items in line 2, expected 2, got 3" lines = [ - "Data type: foo; Columns: 2; Version: 1\n" + "Data type: foo; Columns: 2; Version: 1\n", "head1, head2, head3\n", "Head 1, Head 2\n", ] @@ -309,7 +441,7 @@ def test_xsv_parse_fail_unequal_rows(temp_dir: Path): err = "Incorrect number of items in line 2, expected 3, got 2" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" + "Data type: foo; Columns: 3; Version: 1\n", "head1\thead2\n", "Head 1\tHead 2\tHead 3\n", ] @@ -317,7 +449,7 @@ def test_xsv_parse_fail_unequal_rows(temp_dir: Path): err = "Incorrect number of items in line 5, expected 3, got 4" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" + "Data type: foo; Columns: 3; Version: 1\n", "head1, head2, head 3\n", "Head 1, Head 2, Head 3\n", "1, 2, 3\n", @@ -328,7 +460,7 @@ def test_xsv_parse_fail_unequal_rows(temp_dir: Path): err = "Incorrect number of items in line 6, expected 3, got 2" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" + "Data type: foo; Columns: 3; Version: 1\n", "head1\thead2\thead 3\n", "Head 1\tHead 2\tHead 3\n", "1\t2\t3\n", @@ -342,7 +474,7 @@ def test_xsv_parse_fail_unequal_rows(temp_dir: Path): def test_xsv_parse_fail_duplicate_headers(temp_dir: Path): err = "Duplicate header name in row 2: head3" lines = [ - "Data type: foo; Columns: 3; Version: 1\n" + "Data type: foo; Columns: 3; Version: 1\n", "head3, head2, head3\n", "Head 1, Head 2, Head 3\n", ] @@ -350,7 +482,7 @@ def test_xsv_parse_fail_duplicate_headers(temp_dir: Path): # test with duplicate dual headers lines = [ - "Data type: foo; Columns: 3; Version: 1\n" + "Data type: foo; Columns: 3; Version: 1\n", "head3, head2, head3\n", "Head 3, Head 2, Head 3\n", ] @@ -389,20 +521,37 @@ def test_excel_parse_success(): res = parse_excel(ex) - assert res == ParseResults(frozendict({ - "type1": ParseResult(SpecificationSource(ex, "tab1"), ( - frozendict({"header1": "foo", "header2": 1, "header3": 6.7}), - frozendict({"header1": "bar", "header2": 2, "header3": 8.9}), - frozendict({"header1": "baz", "header2": None, "header3": 3.4}), - frozendict({"header1": "bat", "header2": 4, "header3": None}), - )), - "type2": ParseResult(SpecificationSource(ex, "tab2"), ( - frozendict({"h1": "golly gee", "2": 42, "h3": "super"}), - )), - "type3": ParseResult(SpecificationSource(ex, "tab3"), ( - frozendict({"head1": "some data", "head2": 1}), - )), - })) + assert res == ParseResults( + frozendict( + { + "type1": ParseResult( + SpecificationSource(ex, "tab1"), + ( + frozendict( + {"header1": "foo", 
"header2": 1, "header3": 6.7} + ), + frozendict( + {"header1": "bar", "header2": 2, "header3": 8.9} + ), + frozendict( + {"header1": "baz", "header2": None, "header3": 3.4} + ), + frozendict( + {"header1": "bat", "header2": 4, "header3": None} + ), + ), + ), + "type2": ParseResult( + SpecificationSource(ex, "tab2"), + (frozendict({"h1": "golly gee", "2": 42, "h3": "super"}),), + ), + "type3": ParseResult( + SpecificationSource(ex, "tab3"), + (frozendict({"head1": "some data", "head2": 1}),), + ), + } + ) + ) def test_excel_parse_success_nan_inf(): @@ -417,29 +566,36 @@ def test_excel_parse_success_nan_inf(): res = parse_excel(ex) - assert res == ParseResults(frozendict({ - "nan_type": ParseResult(SpecificationSource(ex, "tab1"), ( - frozendict({"header1": 1, "header2": None}), - frozendict({"header1": 2, "header2": None}), - frozendict({"header1": 3, "header2": None}), - frozendict({"header1": 4, "header2": "-1.#IND"}), - frozendict({"header1": 5, "header2": "-1.#QNAN"}), - frozendict({"header1": 6, "header2": "-NaN"}), - frozendict({"header1": 7, "header2": "-nan"}), - frozendict({"header1": 8, "header2": "1.#IND"}), - frozendict({"header1": 9, "header2": "1.#QNAN"}), - frozendict({"header1": 10, "header2": None}), - frozendict({"header1": 11, "header2": None}), - frozendict({"header1": 12, "header2": None}), - frozendict({"header1": 13, "header2": None}), - frozendict({"header1": 14, "header2": "NaN"}), - frozendict({"header1": 15, "header2": None}), - frozendict({"header1": 16, "header2": "nan"}), - frozendict({"header1": 17, "header2": None}), - frozendict({"header1": 18, "header2": None}), - frozendict({"header1": 19, "header2": "some stuff"}), - )), - })) + assert res == ParseResults( + frozendict( + { + "nan_type": ParseResult( + SpecificationSource(ex, "tab1"), + ( + frozendict({"header1": 1, "header2": None}), + frozendict({"header1": 2, "header2": None}), + frozendict({"header1": 3, "header2": None}), + frozendict({"header1": 4, "header2": "-1.#IND"}), + frozendict({"header1": 5, "header2": "-1.#QNAN"}), + frozendict({"header1": 6, "header2": "-NaN"}), + frozendict({"header1": 7, "header2": "-nan"}), + frozendict({"header1": 8, "header2": "1.#IND"}), + frozendict({"header1": 9, "header2": "1.#QNAN"}), + frozendict({"header1": 10, "header2": None}), + frozendict({"header1": 11, "header2": None}), + frozendict({"header1": 12, "header2": None}), + frozendict({"header1": 13, "header2": None}), + frozendict({"header1": 14, "header2": "NaN"}), + frozendict({"header1": 15, "header2": None}), + frozendict({"header1": 16, "header2": "nan"}), + frozendict({"header1": 17, "header2": None}), + frozendict({"header1": 18, "header2": None}), + frozendict({"header1": 19, "header2": "some stuff"}), + ), + ), + } + ) + ) def _excel_parse_fail( @@ -458,16 +614,24 @@ def _excel_parse_fail( if errors: assert res == ParseResults(errors=tuple(errors)) else: - assert res == ParseResults(errors=tuple([ - Error(ErrorType.PARSE_FAIL, message, source_1=SpecificationSource(test_file)) - ])) + assert res == ParseResults( + errors=tuple( + [ + Error( + ErrorType.PARSE_FAIL, + message, + source_1=SpecificationSource(test_file), + ) + ] + ) + ) def test_excel_parse_fail_no_file(): f = _get_test_file("testtabs3full2nodata1empty0.xls") - _excel_parse_fail(f, errors=[ - Error(ErrorType.FILE_NOT_FOUND, source_1=SpecificationSource(f)) - ]) + _excel_parse_fail( + f, errors=[Error(ErrorType.FILE_NOT_FOUND, source_1=SpecificationSource(f))] + ) def test_excel_parse_fail_directory(temp_dir): @@ -475,37 +639,53 @@ def 
test_excel_parse_fail_directory(temp_dir): f = temp_dir / d os.makedirs(f, exist_ok=True) err = "The given path is a directory" - _excel_parse_fail(f, errors=[Error(ErrorType.PARSE_FAIL, err, SpecificationSource(f))]) + _excel_parse_fail( + f, errors=[Error(ErrorType.PARSE_FAIL, err, SpecificationSource(f))] + ) def test_excel_parse_fail_empty_file(temp_dir: Path): - _xsv_parse_fail(temp_dir, [], parse_excel, "Not a supported Excel file type", extension=".xls") + _xsv_parse_fail( + temp_dir, [], parse_excel, "Not a supported Excel file type", extension=".xls" + ) def test_excel_parse_fail_non_excel_file(temp_dir: Path): lines = [ - "Data type: foo; Version: 1\n" + "Data type: foo; Version: 1\n", "head1, head2, head 3\n", "Head 1, Head 2, Head 3\n", "1, 2, 3\n", ] _xsv_parse_fail( - temp_dir, lines, parse_excel, "Not a supported Excel file type", extension=".xlsx") + temp_dir, + lines, + parse_excel, + "Not a supported Excel file type", + extension=".xlsx", + ) def test_excel_parse_1emptytab(): - _excel_parse_fail(_get_test_file("testtabs1empty.xls"), "No non-header data in file") + _excel_parse_fail( + _get_test_file("testtabs1empty.xls"), "No non-header data in file" + ) def test_excel_parse_fail_bad_datatype_header(): f = _get_test_file("testbadinitialheader.xls") - err1 = ('Invalid header; got "This header is wack, yo", expected "Data type: ' - + '; Columns: ; Version: "') + err1 = ( + 'Invalid header; got "This header is wack, yo", expected "Data type: ' + + '; Columns: ; Version: "' + ) err2 = "Schema version 2 is larger than maximum processable version 1" - _excel_parse_fail(f, errors=[ - Error(ErrorType.PARSE_FAIL, err1, SpecificationSource(f, "badheader1")), - Error(ErrorType.PARSE_FAIL, err2, SpecificationSource(f, "badheader2")), - ]) + _excel_parse_fail( + f, + errors=[ + Error(ErrorType.PARSE_FAIL, err1, SpecificationSource(f, "badheader1")), + Error(ErrorType.PARSE_FAIL, err2, SpecificationSource(f, "badheader2")), + ], + ) def test_excel_parse_fail_headers_only(): @@ -515,32 +695,79 @@ def test_excel_parse_fail_headers_only(): def test_excel_parse_fail_colliding_datatypes(): f = _get_test_file("testdatatypecollisions.xls") - l = lambda t: f"Found datatype {t} in multiple tabs" + + def error_message(data_type_name): + return f"Found datatype {data_type_name} in multiple tabs" + err = ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE - _excel_parse_fail(f, errors=[ - Error(err, l("type2"), SpecificationSource(f, "dt2"), SpecificationSource(f, "dt2_2")), - Error(err, l("type3"), SpecificationSource(f, "dt3"), SpecificationSource(f, "dt3_2")), - Error(err, l("type2"), SpecificationSource(f, "dt2"), SpecificationSource(f, "dt2_3")), - ]) + _excel_parse_fail( + f, + errors=[ + Error( + err, + error_message("type2"), + SpecificationSource(f, "dt2"), + SpecificationSource(f, "dt2_2"), + ), + Error( + err, + error_message("type3"), + SpecificationSource(f, "dt3"), + SpecificationSource(f, "dt3_2"), + ), + Error( + err, + error_message("type2"), + SpecificationSource(f, "dt2"), + SpecificationSource(f, "dt2_3"), + ), + ], + ) def test_excel_parse_fail_duplicate_headers(): f = _get_test_file("testduplicateheaders.xlsx") - l = lambda h: f"Duplicate header name in row 2: {h}" - _excel_parse_fail(f, errors=[ - Error(ErrorType.PARSE_FAIL, l("head1"), SpecificationSource(f, "dt2")), - Error(ErrorType.PARSE_FAIL, l("head2"), SpecificationSource(f, "dt3")), - ]) + + def error_message(header_name): + return f"Duplicate header name in row 2: {header_name}" + + _excel_parse_fail( + f, + errors=[ + 
Error( + ErrorType.PARSE_FAIL, + error_message("head1"), + SpecificationSource(f, "dt2"), + ), + Error( + ErrorType.PARSE_FAIL, + error_message("head2"), + SpecificationSource(f, "dt3"), + ), + ], + ) def test_excel_parse_fail_missing_header_item(): f = _get_test_file("testmissingheaderitem.xlsx") err1 = "Missing header entry in row 2, position 3" err2 = "Missing header entry in row 2, position 2" - _excel_parse_fail(f, errors=[ - Error(ErrorType.PARSE_FAIL, err1, SpecificationSource(f, "missing header item error")), - Error(ErrorType.PARSE_FAIL, err2, SpecificationSource(f, "whitespace header item")), - ]) + _excel_parse_fail( + f, + errors=[ + Error( + ErrorType.PARSE_FAIL, + err1, + SpecificationSource(f, "missing header item error"), + ), + Error( + ErrorType.PARSE_FAIL, + err2, + SpecificationSource(f, "whitespace header item"), + ), + ], + ) + def test_excel_parse_fail_unequal_rows(): """ @@ -550,20 +777,23 @@ def test_excel_parse_fail_unequal_rows(): worry about off by one errors when filling in separator characters. """ f = _get_test_file("testunequalrows.xlsx") - _excel_parse_fail(f, errors=[ - Error( - ErrorType.INCORRECT_COLUMN_COUNT, - "Incorrect number of items in line 3, expected 2, got 3", - SpecificationSource(f, "2 cols, 3 human readable") - ), - Error( - ErrorType.INCORRECT_COLUMN_COUNT, - "Incorrect number of items in line 2, expected 2, got 3", - SpecificationSource(f, "2 cols, 3 spec IDs") - ), - Error( - ErrorType.INCORRECT_COLUMN_COUNT, - "Incorrect number of items in line 5, expected 3, got 4", - SpecificationSource(f, "3 cols, 4 data") - ), - ]) + _excel_parse_fail( + f, + errors=[ + Error( + ErrorType.INCORRECT_COLUMN_COUNT, + "Incorrect number of items in line 3, expected 2, got 3", + SpecificationSource(f, "2 cols, 3 human readable"), + ), + Error( + ErrorType.INCORRECT_COLUMN_COUNT, + "Incorrect number of items in line 2, expected 2, got 3", + SpecificationSource(f, "2 cols, 3 spec IDs"), + ), + Error( + ErrorType.INCORRECT_COLUMN_COUNT, + "Incorrect number of items in line 5, expected 3, got 4", + SpecificationSource(f, "3 cols, 4 data"), + ), + ], + ) diff --git a/tests/pyproject.toml b/tests/pyproject.toml index 8bc18712..a958f2f2 100644 --- a/tests/pyproject.toml +++ b/tests/pyproject.toml @@ -3,4 +3,5 @@ minversion = "6.0" addopts = "-ra -q" testpaths = [ "tests", -] \ No newline at end of file +] +asyncio_mode = "auto" \ No newline at end of file diff --git a/tests/test_app.py b/tests/test_app.py index eabec07e..b3648d97 100644 --- a/tests/test_app.py +++ b/tests/test_app.py @@ -1,20 +1,21 @@ import asyncio -import configparser import hashlib -import openpyxl import os -import pandas +import platform import shutil import string import time +from io import BytesIO from json import JSONDecoder from pathlib import Path -from urllib.parse import urlencode,unquote -from io import BytesIO from typing import Any +from unittest.mock import patch +from urllib.parse import unquote, urlencode - -from aiohttp import test_utils, FormData +import openpyxl +import pandas +import pytest +from aiohttp import FormData, test_utils from hypothesis import given, settings from hypothesis import strategies as st @@ -22,32 +23,29 @@ import staging_service.globus as globus import staging_service.utils as utils from staging_service.AutoDetectUtils import AutoDetectUtils - -from tests.test_utils import check_file_contents, check_excel_contents +from tests.test_helpers import ( + DATA_DIR, + META_DIR, + FileUtil, + assert_file_contents, + bootstrap, + check_excel_contents, +) if 
os.environ.get("KB_DEPLOYMENT_CONFIG") is None: - from tests.test_utils import bootstrap - bootstrap() decoder = JSONDecoder() -config = configparser.ConfigParser() -config.read(os.environ["KB_DEPLOYMENT_CONFIG"]) -DATA_DIR = config["staging_service"]["DATA_DIR"] -META_DIR = config["staging_service"]["META_DIR"] -AUTH_URL = config["staging_service"]["AUTH_URL"] -if DATA_DIR.startswith("."): - DATA_DIR = os.path.normpath(os.path.join(os.getcwd(), DATA_DIR)) -if META_DIR.startswith("."): - META_DIR = os.path.normpath(os.path.join(os.getcwd(), META_DIR)) -utils.Path._DATA_DIR = DATA_DIR -utils.Path._META_DIR = META_DIR +utils.StagingPath._DATA_DIR = DATA_DIR +utils.StagingPath._META_DIR = META_DIR + +# pylint: disable=C0116 def asyncgiven(**kwargs): - """alterantive to hypothesis.given decorator for async""" + """alternative to hypothesis.given decorator for async""" def real_decorator(fn): @given(**kwargs) @@ -62,13 +60,9 @@ def aio_wrapper(*args, **kwargs): return real_decorator -def mock_auth_app(): - application = app.app_factory(config) - - async def mock_auth(*args, **kwargs): - return "testuser" - - app.auth_client.get_user = mock_auth +# TODO: replace with real unittest mocking +def mock_app(): + application = app.app_factory() async def mock_globus_id(*args, **kwargs): return ["testuser@globusid.org"] @@ -80,8 +74,9 @@ async def mock_globus_id(*args, **kwargs): class AppClient: - def __init__(self, config, mock_username=None): - self.server = test_utils.TestServer(mock_auth_app()) + def __init__(self): + self.server = test_utils.TestServer(mock_app()) + self.client = None async def __aenter__(self): await self.server.start_server(loop=asyncio.get_event_loop()) @@ -90,35 +85,8 @@ async def __aenter__(self): async def __aexit__(self, *args): await self.server.close() - await self.client.close() - - -class FileUtil: - def __init__(self, base_dir=DATA_DIR): - self.base_dir = base_dir - - def __enter__(self): - os.makedirs(self.base_dir, exist_ok=True) - shutil.rmtree(self.base_dir) - os.makedirs(self.base_dir, exist_ok=False) - return self - - def __exit__(self, *args): - shutil.rmtree(self.base_dir) - - def make_file(self, path, contents): - path = os.path.join(self.base_dir, path) - with open(path, encoding="utf-8", mode="w") as f: - f.write(contents) - return path - - def make_dir(self, path): - path = os.path.join(self.base_dir, path) - os.makedirs(path, exist_ok=True) - return path - - def remove_dir(self, path): - shutil.rmtree(path) + if self.client is not None: + await self.client.close() first_letter_alphabet = [c for c in string.ascii_lowercase + string.ascii_uppercase] @@ -138,42 +106,70 @@ def remove_dir(self, path): def test_path_cases(username_first, username_rest): username = username_first + username_rest assert ( - username + "/foo/bar" == utils.Path.validate_path(username, "foo/bar").user_path + username + "/foo/bar" + == utils.StagingPath.validate_path(username, "foo/bar").user_path + ) + assert ( + username + "/baz" + == utils.StagingPath.validate_path(username, "foo/../bar/../baz").user_path + ) + assert ( + username + "/bar" + == utils.StagingPath.validate_path(username, "foo/../../../../bar").user_path + ) + assert ( + username + "/foo" + == utils.StagingPath.validate_path(username, "./foo").user_path + ) + assert ( + username + "/foo/bar" + == utils.StagingPath.validate_path(username, "../foo/bar").user_path + ) + assert ( + username + "/foo" + == utils.StagingPath.validate_path(username, "/../foo").user_path + ) + assert ( + username + "/" == 
utils.StagingPath.validate_path(username, "/foo/..").user_path + ) + assert ( + username + "/foo" + == utils.StagingPath.validate_path(username, "/foo/.").user_path + ) + assert ( + username + "/foo" == utils.StagingPath.validate_path(username, "foo/").user_path ) assert ( - username + "/baz" - == utils.Path.validate_path(username, "foo/../bar/../baz").user_path + username + "/foo" == utils.StagingPath.validate_path(username, "foo").user_path ) assert ( - username + "/bar" - == utils.Path.validate_path(username, "foo/../../../../bar").user_path + username + "/foo" + == utils.StagingPath.validate_path(username, "/foo/").user_path ) - assert username + "/foo" == utils.Path.validate_path(username, "./foo").user_path assert ( - username + "/foo/bar" - == utils.Path.validate_path(username, "../foo/bar").user_path + username + "/foo" == utils.StagingPath.validate_path(username, "/foo").user_path ) - assert username + "/foo" == utils.Path.validate_path(username, "/../foo").user_path - assert username + "/" == utils.Path.validate_path(username, "/foo/..").user_path - assert username + "/foo" == utils.Path.validate_path(username, "/foo/.").user_path - assert username + "/foo" == utils.Path.validate_path(username, "foo/").user_path - assert username + "/foo" == utils.Path.validate_path(username, "foo").user_path - assert username + "/foo" == utils.Path.validate_path(username, "/foo/").user_path - assert username + "/foo" == utils.Path.validate_path(username, "/foo").user_path - assert username + "/foo" == utils.Path.validate_path(username, "foo/.").user_path - assert username + "/" == utils.Path.validate_path(username, "").user_path - assert username + "/" == utils.Path.validate_path(username, "foo/..").user_path - assert username + "/" == utils.Path.validate_path(username, "/..../").user_path assert ( - username + "/stuff.ext" - == utils.Path.validate_path(username, "/stuff.ext").user_path + username + "/foo" + == utils.StagingPath.validate_path(username, "foo/.").user_path + ) + assert username + "/" == utils.StagingPath.validate_path(username, "").user_path + assert ( + username + "/" == utils.StagingPath.validate_path(username, "foo/..").user_path + ) + assert ( + username + "/" == utils.StagingPath.validate_path(username, "/..../").user_path + ) + assert ( + username + "/stuff.ext" + == utils.StagingPath.validate_path(username, "/stuff.ext").user_path ) @given(username_first_strat, username_strat, st.text()) def test_path_sanitation(username_first, username_rest, path): username = username_first + username_rest - validated = utils.Path.validate_path(username, path) + validated = utils.StagingPath.validate_path(username, path) assert validated.full_path.startswith(DATA_DIR) assert validated.user_path.startswith(username) assert validated.metadata_path.startswith(META_DIR) @@ -192,40 +188,56 @@ async def test_cmd(txt): assert "" == await utils.run_command("ls", d) f = fs.make_file("test/test2", txt) md5 = hashlib.md5(txt.encode("utf8")).hexdigest() - md52 = await utils.run_command("md5sum", f) - assert md5 == md52.split()[0] + os = platform.system() + if os == "Linux": + md52 = await utils.run_command("md5sum", f) + expected_md5 = md52.split()[0] + elif os == "Darwin": + md52 = await utils.run_command("md5", f) + expected_md5 = md52.split()[3] + + assert md5 == expected_md5 # For mac osx, copy md5 to md5sum and check 3rd element # assert md5 == md52.split()[3] -async def test_auth(): - async with AppClient(config) as cli: +@pytest.mark.asyncio 
+@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_auth(get_user): + get_user.return_value = "testuser" + async with AppClient() as cli: resp = await cli.get("/test-auth") assert resp.status == 200 text = await resp.text() assert "I'm authenticated as" in text -async def test_service(): - async with AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_service(get_user): + get_user.return_value = "testuser" + async with AppClient() as cli: resp = await cli.get("/test-service") assert resp.status == 200 text = await resp.text() assert "staging service version" in text -async def test_jbi_metadata(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_jgi_metadata(get_user): txt = "testing text\n" username = "testuser" - jbi_metadata = '{"file_owner": "sdm", "added_date": "2013-08-12T00:21:53.844000"}' + get_user.return_value = username + jgi_metadata = '{"file_owner": "sdm", "added_date": "2013-08-12T00:21:53.844000"}' - async with AppClient(config, username) as cli: + async with AppClient() as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_jgi.fastq"), txt) - f_jgi = fs.make_file( - os.path.join(username, "test", ".test_jgi.fastq.jgi"), jbi_metadata + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_jgi.fastq"), txt) + fs.make_file( + os.path.join(username, "test", ".test_jgi.fastq.jgi"), jgi_metadata ) res1 = await cli.get( os.path.join("jgi-metadata", "test", "test_jgi.fastq"), @@ -247,13 +259,17 @@ async def test_jbi_metadata(): assert res1.status == 404 -async def test_metadata(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_metadata(get_user): txt = "testing text\n" username = "testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + + async with AppClient() as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) res1 = await cli.get( os.path.join("metadata", "test", "test_file_1"), headers={"Authorization": ""}, @@ -276,7 +292,7 @@ async def test_metadata(): assert set(json.keys()) >= set(expected_keys) assert json.get("source") == "Unknown" assert json.get("md5") == "e9018937ab54e6ce88b9e2dfe5053095" - assert json.get("lineCount") == "1" + assert json.get("lineCount") == 1 assert json.get("head") == "testing text\n" assert json.get("tail") == "testing text\n" assert json.get("name") == "test_file_1" @@ -306,7 +322,7 @@ async def test_metadata(): assert set(json.keys()) >= set(expected_keys) assert json.get("source") == "Unknown" assert json.get("md5") == "e9018937ab54e6ce88b9e2dfe5053095" - assert json.get("lineCount") == "1" + assert json.get("lineCount") == 1 assert json.get("head") == "testing text\n" assert json.get("tail") == "testing text\n" assert json.get("name") == "test_file_1" @@ -339,7 +355,7 @@ async def test_metadata(): assert set(json.keys()) >= set(expected_keys) assert json.get("source") == "Unknown" assert json.get("md5") == "e9018937ab54e6ce88b9e2dfe5053095" - assert json.get("lineCount") == "1" + assert json.get("lineCount") == 1 assert json.get("head") == "testing text\n" assert json.get("tail") == 
"testing text\n" assert json.get("name") == "test_file_1" @@ -347,13 +363,17 @@ async def test_metadata(): assert not json.get("isFolder") -async def test_define_UPA(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_define_upa(get_user): txt = "testing text\n" username = "testuser" - async with AppClient(config, username) as cli: - with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + get_user.return_value = username + + async with AppClient() as cli: + with FileUtil() as file_util: + file_util.make_dir(os.path.join(username, "test")) + file_util.make_file(os.path.join(username, "test", "test_file_1"), txt) # generating metadata file res1 = await cli.get( os.path.join("metadata", "test", "test_file_1"), @@ -369,7 +389,10 @@ async def test_define_UPA(): ) assert res2.status == 200 json_text = await res2.text() - assert "succesfully updated UPA test_UPA" in json_text + assert ( + f"successfully updated UPA test_UPA for file {username}/test/test_file_1" + in json_text + ) # getting new metadata res3 = await cli.get( @@ -395,7 +418,7 @@ async def test_define_UPA(): assert set(json.keys()) >= set(expected_keys) assert json.get("source") == "Unknown" assert json.get("md5") == "e9018937ab54e6ce88b9e2dfe5053095" - assert json.get("lineCount") == "1" + assert json.get("lineCount") == 1 assert json.get("head") == "testing text\n" assert json.get("tail") == "testing text\n" assert json.get("name") == "test_file_1" @@ -412,7 +435,7 @@ async def test_define_UPA(): ) assert res4.status == 404 - # tesging missing body + # testing missing body res5 = await cli.post( os.path.join("define-upa", "test", "test_file_1"), headers={"Authorization": ""}, @@ -428,13 +451,16 @@ async def test_define_UPA(): assert res6.status == 400 -async def test_mv(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_mv(get_user): txt = "testing text\n" username = "testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) # list current test directory res1 = await cli.get( @@ -502,63 +528,71 @@ async def test_mv(): assert res7.status == 404 -async def test_delete(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_delete(get_user): txt = "testing text\n" username = "testuser" - async with AppClient(config, username) as cli: - with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) - - # list current test directory - res1 = await cli.get( - os.path.join("list", "test"), headers={"Authorization": ""} - ) - assert res1.status == 200 - json_text = await res1.text() - json = decoder.decode(json_text) - assert len(json) == 1 - assert json[0]["name"] == "test_file_1" - - res2 = await cli.delete( - os.path.join("delete", "test", "test_file_1"), - headers={"Authorization": ""}, - ) - assert res2.status == 200 - json_text = await res2.text() - assert "successfully deleted" in json_text - - # relist test directory - res3 = await cli.get( - os.path.join("list", "test"), headers={"Authorization": ""} - ) 
- assert res3.status == 200 - json_text = await res3.text() - json = decoder.decode(json_text) - assert len(json) == 0 + get_user.return_value = username + with patch("staging_service.auth2Client.KBaseAuth2") as mock_auth: + mock_auth.return_value.get_user.return_value = username + async with AppClient() as cli: + with FileUtil() as fs: + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) - # testing moving root - # res4 = await cli.delete('/delete/', - # headers={'Authorization': ''}) - # assert res4.status == 403 + # list current test directory + res1 = await cli.get( + os.path.join("list", "test"), headers={"Authorization": ""} + ) + assert res1.status == 200 + json_text = await res1.text() + json = decoder.decode(json_text) + assert len(json) == 1 + assert json[0]["name"] == "test_file_1" + + res2 = await cli.delete( + os.path.join("delete", "test", "test_file_1"), + headers={"Authorization": ""}, + ) + assert res2.status == 200 + json_text = await res2.text() + assert "successfully deleted" in json_text - # testing non-existing file - res5 = await cli.delete( - os.path.join("delete", "test", "non_existing.test_file_2"), - headers={"Authorization": ""}, - ) - assert res5.status == 404 + # relist test directory + res3 = await cli.get( + os.path.join("list", "test"), headers={"Authorization": ""} + ) + assert res3.status == 200 + json_text = await res3.text() + json = decoder.decode(json_text) + assert len(json) == 0 + + # testing moving root + # res4 = await cli.delete('/delete/', + # headers={'Authorization': ''}) + # assert res4.status == 403 + + # testing non-existing file + res5 = await cli.delete( + os.path.join("delete", "test", "non_existing.test_file_2"), + headers={"Authorization": ""}, + ) + assert res5.status == 404 -async def test_list(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_list(get_user): txt = "testing text" username = "testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) - d2 = fs.make_dir(os.path.join(username, "test", "test_sub_dir")) - f3 = fs.make_file( + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + fs.make_dir(os.path.join(username, "test", "test_sub_dir")) + fs.make_file( os.path.join(username, "test", "test_sub_dir", "test_file_2"), txt ) res1 = await cli.get("list/..", headers={"Authorization": ""}) @@ -608,8 +642,8 @@ async def test_list(): assert sum(file_folder_count) == 1 # testing list dot-files - f4 = fs.make_file(os.path.join(username, "test", ".test_file_1"), txt) - # f5 = fs.make_file(os.path.join(username, 'test', '.globus_id'), txt) + fs.make_file(os.path.join(username, "test", ".test_file_1"), txt) + res6 = await cli.get("/list/", headers={"Authorization": ""}) assert res6.status == 200 json_text = await res6.text() @@ -637,13 +671,16 @@ async def test_list(): assert len(file_names) == 4 +@patch("staging_service.auth2Client.KBaseAuth2.get_user") @asyncgiven(txt=st.text()) -async def test_download(txt): +@pytest.mark.asyncio +async def test_download(get_user, txt): username = "testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fs: - d = 
fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) res = await cli.get( os.path.join("download", "test", "test_file_1"), @@ -654,11 +691,14 @@ async def test_download(txt): assert result_text == txt.encode() -async def test_download_errors(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_download_errors(get_user): username = "testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) + fs.make_dir(os.path.join(username, "test")) res1 = await cli.get("dwnload", headers={"Authorization": ""}) assert res1.status == 404 @@ -668,22 +708,25 @@ async def test_download_errors(): assert res2.status == 400 -async def test_similar(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_similar(get_user): txt = "testing text" username = "testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1.fq"), txt) - d1 = fs.make_dir(os.path.join(username, "test", "test_sub_dir")) - f1 = fs.make_file( + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1.fq"), txt) + fs.make_dir(os.path.join(username, "test", "test_sub_dir")) + fs.make_file( os.path.join(username, "test", "test_sub_dir", "test_file_2.fq"), txt ) - f2 = fs.make_file( + fs.make_file( os.path.join(username, "test", "test_sub_dir", "test_file_right.fq"), txt, ) - f3 = fs.make_file( + fs.make_file( os.path.join(username, "test", "test_sub_dir", "my_files"), txt ) @@ -709,24 +752,23 @@ async def test_similar(): assert res3.status == 400 -async def test_existence(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_existence(get_user): txt = "testing text" username = "testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) - d2 = fs.make_dir(os.path.join(username, "test", "test_sub_dir")) - f3 = fs.make_file( + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test_file_1"), txt) + fs.make_dir(os.path.join(username, "test", "test_sub_dir")) + fs.make_file( os.path.join(username, "test", "test_sub_dir", "test_file_2"), txt ) - d3 = fs.make_dir( - os.path.join(username, "test", "test_sub_dir", "test_file_1") - ) - d4 = fs.make_dir( - os.path.join(username, "test", "test_sub_dir", "test_sub_dir") - ) - f4 = fs.make_file( + fs.make_dir(os.path.join(username, "test", "test_sub_dir", "test_file_1")) + fs.make_dir(os.path.join(username, "test", "test_sub_dir", "test_sub_dir")) + fs.make_file( os.path.join( username, "test", "test_sub_dir", "test_sub_dir", "test_file_1" ), @@ -775,15 +817,18 @@ async def test_existence(): assert json["isFolder"] is False -async def test_search(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_search(get_user): txt = 
"testing text" username = "testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) - f = fs.make_file(os.path.join(username, "test", "test1"), txt) - d2 = fs.make_dir(os.path.join(username, "test", "test2")) - f3 = fs.make_file(os.path.join(username, "test", "test2", "test3"), txt) + fs.make_dir(os.path.join(username, "test")) + fs.make_file(os.path.join(username, "test", "test1"), txt) + fs.make_dir(os.path.join(username, "test", "test2")) + fs.make_file(os.path.join(username, "test", "test2", "test3"), txt) res1 = await cli.get("search/", headers={"Authorization": ""}) assert res1.status == 200 json_text = await res1.text() @@ -801,16 +846,19 @@ async def test_search(): assert len(json) == 2 -async def test_upload(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_upload(get_user): txt = "testing text\n" username = "testuser" - async with AppClient(config, username) as cli: + get_user.return_value = username + async with AppClient() as cli: # testing missing body res1 = await cli.post("upload", headers={"Authorization": ""}) assert res1.status == 400 with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, "test")) + fs.make_dir(os.path.join(username, "test")) f = fs.make_file(os.path.join(username, "test", "test_file_1"), txt) files = {"destPath": "/", "uploads": open(f, "rb")} @@ -825,8 +873,7 @@ async def test_upload(): async def _upload_file_fail_filename(filename: str, err: str): # Note two file uploads in a row causes a test error: # https://github.com/aio-libs/aiohttp/issues/3968 - async with AppClient(config, "fake") as cli: # username is ignored by AppClient - + async with AppClient() as cli: formdata = FormData() formdata.add_field("destPath", "/") formdata.add_field("uploads", BytesIO(b"sometext"), filename=filename) @@ -837,29 +884,42 @@ async def _upload_file_fail_filename(filename: str, err: str): assert res.status == 403 -async def test_upload_fail_leading_space(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_upload_fail_leading_space(get_user): + get_user.return_value = "fake" await _upload_file_fail_filename( - " test_file", "cannot upload file with name beginning with space") + " test_file", "cannot upload file with name beginning with space" + ) -async def test_upload_fail_dotfile(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_upload_fail_dotfile(get_user): + get_user.return_value = "whocares" await _upload_file_fail_filename( - ".test_file", "cannot upload file with name beginning with '.'") + ".test_file", "cannot upload file with name beginning with '.'" + ) -async def test_upload_fail_comma_in_file(): - await _upload_file_fail_filename( - "test,file", "cannot upload file with ',' in name") +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_upload_fail_comma_in_file(get_user): + get_user.return_value = "idunno" + await _upload_file_fail_filename("test,file", "cannot upload file with ',' in name") +@patch("staging_service.auth2Client.KBaseAuth2.get_user") @settings(deadline=None) @asyncgiven(contents=st.text()) -async def test_directory_decompression(contents): +@pytest.mark.asyncio +async def test_directory_decompression(get_user, contents): fname = "test" dirname = "dirname" username = "testuser" - path = 
utils.Path.validate_path(username, os.path.join(dirname, fname)) - path2 = utils.Path.validate_path(username, os.path.join(dirname, fname)) + get_user.return_value = username + path = utils.StagingPath.validate_path(username, os.path.join(dirname, fname)) + path2 = utils.StagingPath.validate_path(username, os.path.join(dirname, fname)) if path.user_path.endswith("/") or path2.user_path.endswith("/"): # invalid test case # TODO it should be faster if hypothesis could generate all cases except these @@ -873,7 +933,7 @@ async def test_directory_decompression(contents): ("bztar", ".tar.bz"), ("tar", ".tar"), ] - async with AppClient(config, username) as cli: + async with AppClient() as cli: for method, extension in methods: with FileUtil() as fs: d = fs.make_dir(os.path.join(username, dirname)) @@ -913,53 +973,61 @@ async def test_directory_decompression(contents): assert os.path.exists(f3) +@patch("staging_service.auth2Client.KBaseAuth2.get_user") @asyncgiven(contents=st.text()) -async def test_file_decompression(contents): +@pytest.mark.asyncio +async def test_file_decompression(get_user, contents): fname = "test" dirname = "dirname" username = "testuser" - path = utils.Path.validate_path(username, os.path.join(dirname, fname)) + get_user.return_value = username + path = utils.StagingPath.validate_path(username, os.path.join(dirname, fname)) if path.user_path.endswith("/"): # invalid test case # TODO it should be faster if hypothesis could generate all cases except these return methods = [("gzip", ".gz"), ("bzip2", ".bz2")] - async with AppClient(config, username) as cli: - for method, extension in methods: - with FileUtil() as fs: - d = fs.make_dir(os.path.join(username, dirname)) - f1 = fs.make_file(path.user_path, contents) - name = fname + extension - await utils.run_command(method, f1) - # check to see that the original is gone - assert os.path.exists(d) - assert not os.path.exists(f1) - assert os.path.exists(os.path.join(d, name)) - resp = await cli.patch( - "/decompress/" + os.path.join(dirname, name), - headers={"Authorization": ""}, - ) - assert resp.status == 200 - text = await resp.text() - assert "succesfully decompressed" in text - assert name in text - # check to see if we got back what we started with for all files and directories - assert os.path.exists(d) - assert os.path.exists(f1) - - -async def test_importer_mappings(): + with patch("staging_service.auth2Client.KBaseAuth2") as mock_auth: + mock_auth.return_value.get_user.return_value = username + async with AppClient() as cli: + for method, extension in methods: + with FileUtil() as fs: + d = fs.make_dir(os.path.join(username, dirname)) + f1 = fs.make_file(path.user_path, contents) + name = fname + extension + await utils.run_command(method, f1) + # check to see that the original is gone + assert os.path.exists(d) + assert not os.path.exists(f1) + assert os.path.exists(os.path.join(d, name)) + resp = await cli.patch( + "/decompress/" + os.path.join(dirname, name), + headers={"Authorization": ""}, + ) + assert resp.status == 200 + text = await resp.text() + assert "succesfully decompressed" in text + assert name in text + # check to see if we got back what we started with for all files and directories + assert os.path.exists(d) + assert os.path.exists(f1) + + +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_importer_mappings(get_user): """ This tests calling with simple good cases, and some expected bad cases :return: """ username = "testuser" + get_user.return_value = 
username # Normal case, no match data = {"file_list": ["file1.txt"]} qs1 = urlencode(data, doseq=True) - async with AppClient(config, username) as cli: + async with AppClient() as cli: resp = await cli.get(f"importer_mappings/?{qs1}") assert resp.status == 200 text = await resp.json() @@ -971,7 +1039,7 @@ async def test_importer_mappings(): data = {"file_list": ["file1.txt", "file.tar.gz"]} qs2 = urlencode(data, doseq=True) - async with AppClient(config, username) as cli: + async with AppClient() as cli: resp = await cli.get(f"importer_mappings/?{qs2}", data=data) assert resp.status == 200 text = await resp.json() @@ -981,13 +1049,16 @@ async def test_importer_mappings(): # As we update the app mappings this test may need to be changed # Or we need to reload json file itself - # unzip_mapping = AutoDetectUtils._MAPPINGS["apps"]["decompress/unpack"] - assert mappings[1][0] == AutoDetectUtils._MAPPINGS["types"]["gz"]["mappings"][0] + assert ( + mappings[1][0] + == AutoDetectUtils.get_mappings_by_extension("gz")["mappings"][0] + ) # A dict is passed in data = {"file_list": [{}]} qs3 = urlencode(data, doseq=True) - async with AppClient(config, username) as cli: + + async with AppClient() as cli: resp = await cli.get(f"importer_mappings/?{qs3}", data=data) assert resp.status == 200 text = await resp.json() @@ -1005,62 +1076,80 @@ async def test_importer_mappings(): for data in bad_data: qsd = urlencode(data, doseq=True) - async with AppClient(config, username) as cli: + + async with AppClient() as cli: resp = await cli.get(f"importer_mappings/?{qsd}") assert resp.status == 400 text = await resp.text() - assert f"must provide file_list field. Your provided qs: {unquote(qsd)}" in text + assert ( + f"must provide file_list field. Your provided qs: {unquote(qsd)}" + in text + ) -async def test_bulk_specification_success(): - # In other tests a username is passed to AppClient but AppClient completely ignores it... 
- async with AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_bulk_specification_success(get_user): + username = "testuser" + get_user.return_value = username + + async with AppClient() as cli: with FileUtil() as fu: - fu.make_dir("testuser/somefolder") # testuser is hardcoded in the auth mock - base = Path(fu.base_dir) / "testuser" + fu.make_dir(f"{username}/somefolder") + base = Path(fu.base_dir) / username tsv = "genomes.tsv" - with open(base / tsv, "w") as f: - f.writelines([ - "Data type: genomes; Columns: 3; Version: 1\n", - "spec1\tspec2\t spec3 \n", - "Spec 1\t Spec 2\t Spec 3\n", - "val1 \t ꔆ \t 7\n", - "val3\tval4\t1\n", - ]) + with open(base / tsv, "w", encoding="utf-8") as f: + f.writelines( + [ + "Data type: genomes; Columns: 3; Version: 1\n", + "spec1\tspec2\t spec3 \n", + "Spec 1\t Spec 2\t Spec 3\n", + "val1 \t ꔆ \t 7\n", + "val3\tval4\t1\n", + ] + ) csv = "somefolder/breakfastcereals.csv" - with open(base / csv, "w") as f: - f.writelines([ - "Data type: breakfastcereals; Columns: 3; Version: 1\n", - "s1,s2,s3\n", - "S 1,S 2,S 3\n", - "froot loops , puffin , gross\n", - "grape nuts , dietary fiber, also gross\n", - ]) + with open(base / csv, "w", encoding="utf-8") as f: + f.writelines( + [ + "Data type: breakfastcereals; Columns: 3; Version: 1\n", + "s1,s2,s3\n", + "S 1,S 2,S 3\n", + "froot loops , puffin , gross\n", + "grape nuts , dietary fiber, also gross\n", + ] + ) excel = "importspec.xlsx" - with pandas.ExcelWriter(base / excel) as exw: - df = pandas.DataFrame([ - ["Data type: fruit_bats; Columns: 2; Version: 1"], - ["bat_name", "wing_count"], - ["Name of Bat", "Number of wings"], - ["George", 42], - ["Fred", 1.5] - ]) + with pandas.ExcelWriter(base / excel) as exw: # pylint: disable=E0110 + df = pandas.DataFrame( + [ + ["Data type: fruit_bats; Columns: 2; Version: 1"], + ["bat_name", "wing_count"], + ["Name of Bat", "Number of wings"], + ["George", 42], + ["Fred", 1.5], + ] + ) df.to_excel(exw, sheet_name="bats", header=False, index=False) - df = pandas.DataFrame([ - ["Data type: tree_sloths; Columns: 2; Version: 1"], - ["entity_id", "preferred_food"], - ["Entity ID", "Preferred Food"], - ["That which ends all", "ꔆ"], - ]) + df = pandas.DataFrame( + [ + ["Data type: tree_sloths; Columns: 2; Version: 1"], + ["entity_id", "preferred_food"], + ["Entity ID", "Preferred Food"], + ["That which ends all", "ꔆ"], + ] + ) df.to_excel(exw, sheet_name="sloths", header=False, index=False) - resp = await cli.get(f"bulk_specification/?files={tsv} , {csv}, {excel} ") + resp = await cli.get( + f"bulk_specification/?files={tsv} , {csv}, {excel} " + ) jsn = await resp.json() assert jsn == { "types": { "genomes": [ {"spec1": "val1", "spec2": "ꔆ", "spec3": 7}, - {"spec1": "val3", "spec2": "val4", "spec3": 1} + {"spec1": "val3", "spec2": "val4", "spec3": 1}, ], "breakfastcereals": [ {"s1": "froot loops", "s2": "puffin", "s3": "gross"}, @@ -1072,22 +1161,29 @@ async def test_bulk_specification_success(): ], "tree_sloths": [ {"entity_id": "That which ends all", "preferred_food": "ꔆ"} - ] + ], }, "files": { "genomes": {"file": "testuser/genomes.tsv", "tab": None}, "breakfastcereals": { "file": "testuser/somefolder/breakfastcereals.csv", - "tab": None}, + "tab": None, + }, "fruit_bats": {"file": "testuser/importspec.xlsx", "tab": "bats"}, - "tree_sloths": {"file": "testuser/importspec.xlsx", "tab": "sloths"}, - } + "tree_sloths": { + "file": "testuser/importspec.xlsx", + "tab": "sloths", + }, + }, } assert resp.status 
== 200 -async def test_bulk_specification_fail_no_files(): - async with AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_bulk_specification_fail_no_files(get_user): + get_user.return_value = "dontcare" + async with AppClient() as cli: for f in ["", "?files=", "?files= , ,, , "]: resp = await cli.get(f"bulk_specification/{f}") jsn = await resp.json() @@ -1095,344 +1191,425 @@ async def test_bulk_specification_fail_no_files(): assert resp.status == 400 -async def test_bulk_specification_fail_not_found(): - async with AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_bulk_specification_fail_not_found(get_user): + username = "testuser" + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fu: - fu.make_dir("testuser/otherfolder") # testuser is hardcoded in the auth mock - base = Path(fu.base_dir) / "testuser" + fu.make_dir(f"{username}/otherfolder") + base = Path(fu.base_dir) / username tsv = "otherfolder/genomes.tsv" - with open(base / tsv, "w") as f: - f.writelines([ - "Data type: genomes; Columns: 3; Version: 1\n", - "spec1\tspec2\t spec3 \n", - "Spec 1\t Spec 2\t Spec 3\n", - "val1 \t ꔆ \t 7\n", - ]) + with open(base / tsv, "w", encoding="utf-8") as f: + f.writelines( + [ + "Data type: genomes; Columns: 3; Version: 1\n", + "spec1\tspec2\t spec3 \n", + "Spec 1\t Spec 2\t Spec 3\n", + "val1 \t ꔆ \t 7\n", + ] + ) resp = await cli.get(f"bulk_specification/?files={tsv},somefile.csv") jsn = await resp.json() - assert jsn == {"errors": [ - {"type": "cannot_find_file", "file": "testuser/somefile.csv"} - ]} + assert jsn == { + "errors": [ + {"type": "cannot_find_file", "file": f"{username}/somefile.csv"} + ] + } assert resp.status == 404 -async def test_bulk_specification_fail_parse_fail(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_bulk_specification_fail_parse_fail(get_user): """ Tests a number of different parse fail cases, including incorrect file types. Also tests that all the errors are combined correctly. 
""" - async with AppClient(config) as cli: + username = "testuser" + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fu: - fu.make_dir("testuser/otherfolder") # testuser is hardcoded in the auth mock - base = Path(fu.base_dir) / "testuser" + fu.make_dir(f"{username}/otherfolder") + base = Path(fu.base_dir) / username tsv = "otherfolder/genomes.tsv" # this one is fine - with open(base / tsv, "w") as f: - f.writelines([ - "Data type: genomes; Columns: 3; Version: 1\n", - "spec1\tspec2\t spec3 \n", - "Spec 1\t Spec 2\t Spec 3\n", - "val1 \t ꔆ \t 7\n", - ]) + with open(base / tsv, "w", encoding="utf-8") as f: + f.writelines( + [ + "Data type: genomes; Columns: 3; Version: 1\n", + "spec1\tspec2\t spec3 \n", + "Spec 1\t Spec 2\t Spec 3\n", + "val1 \t ꔆ \t 7\n", + ] + ) csv = "otherfolder/thing.csv" # this one has a misspelling in the header - with open(base / csv, "w") as f: - f.writelines([ - "Dater type: breakfastcereals; Columns: 3; Version: 1\n", - "s1,s2,s3\n", - "S 1,S 2,S 3\n", - "froot loops , puffin , gross\n", - ]) + with open(base / csv, "w", encoding="utf-8") as f: + f.writelines( + [ + "Dater type: breakfastcereals; Columns: 3; Version: 1\n", + "s1,s2,s3\n", + "S 1,S 2,S 3\n", + "froot loops , puffin , gross\n", + ] + ) excel = "stuff.xlsx" - with pandas.ExcelWriter(base / excel) as exw: + with pandas.ExcelWriter(base / excel) as exw: # pylint: disable=E0110 # this one is fine - df = pandas.DataFrame([ - ["Data type: fruit_bats; Columns: 2; Version: 1"], - ["bat_name", "wing_count"], - ["Name of Bat", "Number of wings"], - ["George", 42], - ]) + df = pandas.DataFrame( + [ + ["Data type: fruit_bats; Columns: 2; Version: 1"], + ["bat_name", "wing_count"], + ["Name of Bat", "Number of wings"], + ["George", 42], + ] + ) df.to_excel(exw, sheet_name="bats", header=False, index=False) # this one is missing a parameter ID - df = pandas.DataFrame([ - ["Data type: tree_sloths; Columns: 2; Version: 1"], - ["", "preferred_food"], - ["ID", "Foods I like"], - ["Kevin Garibaldi", "Beeeaaaaans!"], - ]) + df = pandas.DataFrame( + [ + ["Data type: tree_sloths; Columns: 2; Version: 1"], + ["", "preferred_food"], + ["ID", "Foods I like"], + ["Kevin Garibaldi", "Beeeaaaaans!"], + ] + ) df.to_excel(exw, sheet_name="sloths", header=False, index=False) # this also tests a number of bad file extensions - no need to create files - resp = await cli.get(f"bulk_specification/?files={tsv},{csv},{excel}" - + ",badfile,badfile.fasta.gz,badfile.sra,badfile.sys,badfile.") + resp = await cli.get( + f"bulk_specification/?files={tsv},{csv},{excel}" + + ",badfile,badfile.fasta.gz,badfile.sra,badfile.sys,badfile." 
+ ) jsn = await resp.json() - assert jsn == {"errors": [ - {"type": "cannot_parse_file", - "message": 'Invalid header; got "Dater type: breakfastcereals; ' - + 'Columns: 3; Version: 1", expected "Data type: ; ' - + 'Columns: ; Version: "', - "file": "testuser/otherfolder/thing.csv", - "tab": None, - }, - {"type": "cannot_parse_file", - "message": "Missing header entry in row 2, position 1", - "file": "testuser/stuff.xlsx", - "tab": "sloths", - }, - {"type": "cannot_parse_file", - "message": "badfile is not a supported file type for import specifications", - "file": "testuser/badfile", - "tab": None, - }, - {"type": "cannot_parse_file", - "message": "fasta.gz is not a supported file type for import specifications", - "file": "testuser/badfile.fasta.gz", - "tab": None, - }, - {"type": "cannot_parse_file", - "message": "sra is not a supported file type for import specifications", - "file": "testuser/badfile.sra", - "tab": None, - }, - {"type": "cannot_parse_file", - "message": "sys is not a supported file type for import specifications", - "file": "testuser/badfile.sys", - "tab": None, - }, - {"type": "cannot_parse_file", - "message": "badfile. is not a supported file type for import specifications", - "file": "testuser/badfile.", - "tab": None, - }, - ]} + assert jsn == { + "errors": [ + { + "type": "cannot_parse_file", + "message": 'Invalid header; got "Dater type: breakfastcereals; ' + + 'Columns: 3; Version: 1", expected "Data type: ; ' + + 'Columns: ; Version: "', + "file": f"{username}/otherfolder/thing.csv", + "tab": None, + }, + { + "type": "cannot_parse_file", + "message": "Missing header entry in row 2, position 1", + "file": f"{username}/stuff.xlsx", + "tab": "sloths", + }, + { + "type": "cannot_parse_file", + "message": "badfile is not a supported file type for import specifications", + "file": f"{username}/badfile", + "tab": None, + }, + { + "type": "cannot_parse_file", + "message": "fasta.gz is not a supported file type for import specifications", + "file": f"{username}/badfile.fasta.gz", + "tab": None, + }, + { + "type": "cannot_parse_file", + "message": "sra is not a supported file type for import specifications", + "file": f"{username}/badfile.sra", + "tab": None, + }, + { + "type": "cannot_parse_file", + "message": "sys is not a supported file type for import specifications", + "file": f"{username}/badfile.sys", + "tab": None, + }, + { + "type": "cannot_parse_file", + "message": "badfile. 
is not a supported file type for import specifications", + "file": f"{username}/badfile.", + "tab": None, + }, + ] + } assert resp.status == 400 -async def test_bulk_specification_fail_column_count(): - async with AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_bulk_specification_fail_column_count(get_user): + username = "testuser" + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fu: - fu.make_dir("testuser") # testuser is hardcoded in the auth mock - base = Path(fu.base_dir) / "testuser" + fu.make_dir(username) + base = Path(fu.base_dir) / username tsv = "genomes.tsv" # this one is fine - with open(base / tsv, "w") as f: - f.writelines([ - "Data type: genomes; Columns: 3; Version: 1\n", - "spec1\tspec2\t spec3 \n", - "Spec 1\t Spec 2\t Spec 3\n", - "val1 \t ꔆ \t 7\n", - ]) + with open(base / tsv, "w", encoding="utf-8") as f: + f.writelines( + [ + "Data type: genomes; Columns: 3; Version: 1\n", + "spec1\tspec2\t spec3 \n", + "Spec 1\t Spec 2\t Spec 3\n", + "val1 \t ꔆ \t 7\n", + ] + ) csv = "thing.csv" # this one is missing a column in the last row - with open(base / csv, "w") as f: - f.writelines([ - "Data type: breakfastcereals; Columns: 3; Version: 1\n", - "s1,s2,s3\n", - "S 1,S 2,S 3\n", - "froot loops , puffin\n", - ]) + with open(base / csv, "w", encoding="utf-8") as f: + f.writelines( + [ + "Data type: breakfastcereals; Columns: 3; Version: 1\n", + "s1,s2,s3\n", + "S 1,S 2,S 3\n", + "froot loops , puffin\n", + ] + ) excel = "stuff.xlsx" - with pandas.ExcelWriter(base / excel) as exw: + with pandas.ExcelWriter(base / excel) as exw: # pylint: disable=E0110 # this one has an extra column in the last row - df = pandas.DataFrame([ - ["Data type: fruit_bats; Columns: 2; Version: 1"], - ["bat_name", "wing_count"], - ["Name of Bat", "Number of wings"], - ["George", 42, 56], - ]) + df = pandas.DataFrame( + [ + ["Data type: fruit_bats; Columns: 2; Version: 1"], + ["bat_name", "wing_count"], + ["Name of Bat", "Number of wings"], + ["George", 42, 56], + ] + ) df.to_excel(exw, sheet_name="bats", header=False, index=False) # this one is fine - df = pandas.DataFrame([ - ["Data type: tree_sloths; Columns: 2; Version: 1"], - ["entity_id", "preferred_food"], - ["Entity ID", "Preferred Food"], - ["That which ends all", "ꔆ"], - ]) + df = pandas.DataFrame( + [ + ["Data type: tree_sloths; Columns: 2; Version: 1"], + ["entity_id", "preferred_food"], + ["Entity ID", "Preferred Food"], + ["That which ends all", "ꔆ"], + ] + ) df.to_excel(exw, sheet_name="sloths", header=False, index=False) - resp = await cli.get( - f"bulk_specification/?files={tsv},{csv},{excel}") + resp = await cli.get(f"bulk_specification/?files={tsv},{csv},{excel}") jsn = await resp.json() - assert jsn == {"errors": [ - {"type": "incorrect_column_count", - "message": "Incorrect number of items in line 4, expected 3, got 2", - "file": "testuser/thing.csv", - "tab": None, - }, - {"type": "incorrect_column_count", - "message": "Incorrect number of items in line 4, expected 2, got 3", - "file": "testuser/stuff.xlsx", - "tab": "bats", - }, - ]} + assert jsn == { + "errors": [ + { + "type": "incorrect_column_count", + "message": "Incorrect number of items in line 4, expected 3, got 2", + "file": f"{username}/thing.csv", + "tab": None, + }, + { + "type": "incorrect_column_count", + "message": "Incorrect number of items in line 4, expected 2, got 3", + "file": f"{username}/stuff.xlsx", + "tab": "bats", + }, + ] + } assert resp.status == 
400 -async def test_bulk_specification_fail_multiple_specs_per_type(): - async with AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_bulk_specification_fail_multiple_specs_per_type(get_user): + username = "testuser" + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fu: - fu.make_dir("testuser") # testuser is hardcoded in the auth mock - base = Path(fu.base_dir) / "testuser" + fu.make_dir(username) + base = Path(fu.base_dir) / username tsv = "genomes.tsv" # this one is fine - with open(base / tsv, "w") as f: - f.writelines([ - "Data type: genomes; Columns: 3; Version: 1\n", - "spec1\tspec2\t spec3 \n", - "Spec 1\t Spec 2\t Spec 3\n", - "val1 \t ꔆ \t 7\n", - ]) + with open(base / tsv, "w", encoding="utf-8") as f: + f.writelines( + [ + "Data type: genomes; Columns: 3; Version: 1\n", + "spec1\tspec2\t spec3 \n", + "Spec 1\t Spec 2\t Spec 3\n", + "val1 \t ꔆ \t 7\n", + ] + ) csv1 = "thing.csv" # this is the first of the breakfastcereals data sources, so fine - with open(base / csv1, "w") as f: - f.writelines([ - "Data type: breakfastcereals; Columns: 3; Version: 1\n", - "s1,s2,s3\n", - "S 1,S 2,S 3\n", - "froot loops , puffin, whee\n", - ]) + with open(base / csv1, "w", encoding="utf-8") as f: + f.writelines( + [ + "Data type: breakfastcereals; Columns: 3; Version: 1\n", + "s1,s2,s3\n", + "S 1,S 2,S 3\n", + "froot loops , puffin, whee\n", + ] + ) csv2 = "thing2.csv" # this data type is also breakfastcereals, so will cause an error - with open(base / csv2, "w") as f: - f.writelines([ - "Data type: breakfastcereals; Columns: 2; Version: 1\n", - "s1,s2\n", - "S 1,S 2\n", - "froot loops , puffin\n", - ]) + with open(base / csv2, "w", encoding="utf-8") as f: + f.writelines( + [ + "Data type: breakfastcereals; Columns: 2; Version: 1\n", + "s1,s2\n", + "S 1,S 2\n", + "froot loops , puffin\n", + ] + ) excel = "stuff.xlsx" - with pandas.ExcelWriter(base / excel) as exw: + with pandas.ExcelWriter(base / excel) as exw: # pylint: disable=E0110 # this data type is also breakfastcereals, so will cause an error - df = pandas.DataFrame([ - ["Data type: breakfastcereals; Columns: 2; Version: 1"], - ["bat_name", "wing_count"], - ["Name of Bat", "Number of wings"], - ["George", 42], - ]) + df = pandas.DataFrame( + [ + ["Data type: breakfastcereals; Columns: 2; Version: 1"], + ["bat_name", "wing_count"], + ["Name of Bat", "Number of wings"], + ["George", 42], + ] + ) df.to_excel(exw, sheet_name="bats", header=False, index=False) # this one is fine - df = pandas.DataFrame([ - ["Data type: tree_sloths; Columns: 2; Version: 1"], - ["entity_id", "preferred_food"], - ["Entity ID", "Preferred Food"], - ["That which ends all", "ꔆ"], - ]) + df = pandas.DataFrame( + [ + ["Data type: tree_sloths; Columns: 2; Version: 1"], + ["entity_id", "preferred_food"], + ["Entity ID", "Preferred Food"], + ["That which ends all", "ꔆ"], + ] + ) df.to_excel(exw, sheet_name="sloths", header=False, index=False) resp = await cli.get( - f"bulk_specification/?files={tsv},{csv1},{csv2},{excel}") + f"bulk_specification/?files={tsv},{csv1},{csv2},{excel}" + ) jsn = await resp.json() err = "Data type breakfastcereals appears in two importer specification sources" - assert jsn == {"errors": [ - {"type": "multiple_specifications_for_data_type", - "message": err, - "file_1": "testuser/thing.csv", - "tab_1": None, - "file_2": "testuser/thing2.csv", - "tab_2": None - }, - {"type": "multiple_specifications_for_data_type", - "message": err, - 
"file_1": "testuser/thing.csv", - "tab_1": None, - "file_2": "testuser/stuff.xlsx", - "tab_2": "bats" - }, - ]} + assert jsn == { + "errors": [ + { + "type": "multiple_specifications_for_data_type", + "message": err, + "file_1": f"{username}/thing.csv", + "tab_1": None, + "file_2": f"{username}/thing2.csv", + "tab_2": None, + }, + { + "type": "multiple_specifications_for_data_type", + "message": err, + "file_1": f"{username}/thing.csv", + "tab_1": None, + "file_2": f"{username}/stuff.xlsx", + "tab_2": "bats", + }, + ] + } assert resp.status == 400 -async def test_bulk_specification_fail_multiple_specs_per_type_excel(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_bulk_specification_fail_multiple_specs_per_type_excel(get_user): """ Test an excel file with an internal data type collision. This is the only case when all 5 error fields are filled out. """ - async with AppClient(config) as cli: + username = "testuser" + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fu: - fu.make_dir("testuser") # testuser is hardcoded in the auth mock - base = Path(fu.base_dir) / "testuser" + fu.make_dir(username) + base = Path(fu.base_dir) / username excel = "stuff.xlsx" - with pandas.ExcelWriter(base / excel) as exw: - df = pandas.DataFrame([ - ["Data type: breakfastcereals; Columns: 2; Version: 1"], - ["bat_name", "wing_count"], - ["Name of Bat", "Number of wings"], - ["George", 42], - ]) + with pandas.ExcelWriter(base / excel) as exw: # pylint: disable=E0110 + df = pandas.DataFrame( + [ + ["Data type: breakfastcereals; Columns: 2; Version: 1"], + ["bat_name", "wing_count"], + ["Name of Bat", "Number of wings"], + ["George", 42], + ] + ) df.to_excel(exw, sheet_name="bats", header=False, index=False) - df = pandas.DataFrame([ - ["Data type: tree_sloths; Columns: 2; Version: 1"], - ["entity_id", "preferred_food"], - ["Entity ID", "Preferred Food"], - ["That which ends all", "ꔆ"], - ]) + df = pandas.DataFrame( + [ + ["Data type: tree_sloths; Columns: 2; Version: 1"], + ["entity_id", "preferred_food"], + ["Entity ID", "Preferred Food"], + ["That which ends all", "ꔆ"], + ] + ) df.to_excel(exw, sheet_name="sloths", header=False, index=False) - df = pandas.DataFrame([ - ["Data type: breakfastcereals; Columns: 2; Version: 1"], - ["bat_name", "wing_count"], - ["Name of Bat", "Number of wings"], - ["George", 42], - ]) + df = pandas.DataFrame( + [ + ["Data type: breakfastcereals; Columns: 2; Version: 1"], + ["bat_name", "wing_count"], + ["Name of Bat", "Number of wings"], + ["George", 42], + ] + ) df.to_excel(exw, sheet_name="otherbats", header=False, index=False) resp = await cli.get(f"bulk_specification/?files={excel}") jsn = await resp.json() - assert jsn == {"errors": [ - {"type": "multiple_specifications_for_data_type", - "message": "Found datatype breakfastcereals in multiple tabs", - "file_1": "testuser/stuff.xlsx", - "tab_1": "bats", - "file_2": "testuser/stuff.xlsx", - "tab_2": "otherbats" - }, - ]} + assert jsn == { + "errors": [ + { + "type": "multiple_specifications_for_data_type", + "message": "Found datatype breakfastcereals in multiple tabs", + "file_1": f"{username}/stuff.xlsx", + "tab_1": "bats", + "file_2": f"{username}/stuff.xlsx", + "tab_2": "otherbats", + }, + ] + } assert resp.status == 400 _IMPORT_SPEC_TEST_DATA = { "genome": { - "order_and_display": [ - ["id1", "display1"], - ["id2", "display2"] - ], + "order_and_display": [["id1", "display1"], ["id2", "display2"]], "data": [ {"id1": 54, "id2": "boo"}, 
{"id1": 32, "id2": "yikes"}, - ] + ], }, "reads": { "order_and_display": [ ["name", "Reads File Name"], - ["inseam", "Reads inseam measurement in km"] + ["inseam", "Reads inseam measurement in km"], ], "data": [ {"name": "myreads.fa", "inseam": 0.1}, - ] - } + ], + }, } -async def test_write_bulk_specification_success_csv(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_success_csv(get_user): # In other tests a username is passed to AppClient but AppClient completely ignores it... - async with AppClient(config) as cli: + username = "testuser" + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fu: - fu.make_dir("testuser") # testuser is hardcoded in the auth mock - resp = await cli.post("write_bulk_specification/", json= - { + fu.make_dir(username) + resp = await cli.post( + "write_bulk_specification/", + json={ "output_directory": "specs", "output_file_type": "CSV", "types": _IMPORT_SPEC_TEST_DATA, - }) + }, + ) js = await resp.json() assert js == { "output_file_type": "CSV", "files_created": { - "genome": "testuser/specs/genome.csv", - "reads": "testuser/specs/reads.csv" - } + "genome": f"{username}/specs/genome.csv", + "reads": f"{username}/specs/reads.csv", + }, } - base = Path(fu.base_dir) / "testuser" - check_file_contents( + base = Path(fu.base_dir) / username + assert_file_contents( base / "specs/genome.csv", [ "Data type: genome; Columns: 2; Version: 1\n", @@ -1440,43 +1617,49 @@ async def test_write_bulk_specification_success_csv(): "display1,display2\n", "54,boo\n", "32,yikes\n", - ] + ], ) - check_file_contents( + assert_file_contents( base / "specs/reads.csv", [ "Data type: reads; Columns: 2; Version: 1\n", "name,inseam\n", "Reads File Name,Reads inseam measurement in km\n", "myreads.fa,0.1\n", - ] + ], ) -async def test_write_bulk_specification_success_tsv(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_success_tsv(get_user): # In other tests a username is passed to AppClient but AppClient completely ignores it... 
- async with AppClient(config) as cli: + username = "foo" + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fu: - fu.make_dir("testuser") # testuser is hardcoded in the auth mock + fu.make_dir(username) types = dict(_IMPORT_SPEC_TEST_DATA) - types['reads'] = dict(types['reads']) - types['reads']['data'] = [] - resp = await cli.post("write_bulk_specification", json= - { + types["reads"] = dict(types["reads"]) + types["reads"]["data"] = [] + resp = await cli.post( + "write_bulk_specification", + json={ "output_directory": "tsvspecs", "output_file_type": "TSV", "types": types, - }) + }, + ) js = await resp.json() assert js == { "output_file_type": "TSV", "files_created": { - "genome": "testuser/tsvspecs/genome.tsv", - "reads": "testuser/tsvspecs/reads.tsv" - } + "genome": f"{username}/tsvspecs/genome.tsv", + "reads": f"{username}/tsvspecs/reads.tsv", + }, } - base = Path(fu.base_dir) / "testuser" - check_file_contents( + base = Path(fu.base_dir) / username + assert_file_contents( base / "tsvspecs/genome.tsv", [ "Data type: genome; Columns: 2; Version: 1\n", @@ -1484,38 +1667,45 @@ async def test_write_bulk_specification_success_tsv(): "display1\tdisplay2\n", "54\tboo\n", "32\tyikes\n", - ] + ], ) - check_file_contents( + assert_file_contents( base / "tsvspecs/reads.tsv", [ "Data type: reads; Columns: 2; Version: 1\n", "name\tinseam\n", "Reads File Name\tReads inseam measurement in km\n", - ] + ], ) -async def test_write_bulk_specification_success_excel(): - # In other tests a username is passed to AppClient but AppClient completely ignores it... - async with AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_success_excel(get_user): + username = "leaf" + get_user.return_value = username + async with AppClient() as cli: with FileUtil() as fu: - fu.make_dir("testuser") # testuser is hardcoded in the auth mock - resp = await cli.post("write_bulk_specification/", json= - { + fu.make_dir(username) + resp = await cli.post( + "write_bulk_specification/", + json={ "output_directory": "", "output_file_type": "EXCEL", "types": _IMPORT_SPEC_TEST_DATA, - }) + }, + ) js = await resp.json() assert js == { "output_file_type": "EXCEL", "files_created": { - "genome": "testuser/import_specification.xlsx", - "reads": "testuser/import_specification.xlsx", - } + "genome": f"{username}/import_specification.xlsx", + "reads": f"{username}/import_specification.xlsx", + }, } - wb = openpyxl.load_workbook(Path(fu.base_dir) / "testuser/import_specification.xlsx") + wb = openpyxl.load_workbook( + Path(fu.base_dir) / f"{username}/import_specification.xlsx" + ) assert wb.sheetnames == ["genome", "reads"] check_excel_contents( wb, @@ -1542,77 +1732,117 @@ async def test_write_bulk_specification_success_excel(): ) -async def test_write_bulk_specification_fail_wrong_data_type(): - async with AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_fail_wrong_data_type(get_user): + get_user.return_value = "do_not_care" + async with AppClient() as cli: resp = await cli.post("write_bulk_specification/", data="foo") js = await resp.json() assert js == {"error": "Required content-type is application/json"} assert resp.status == 415 -async def test_write_bulk_specification_fail_no_content_length(): - async with AppClient(config) as cli: +@pytest.mark.asyncio 
+@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_fail_no_content_length(get_user): + get_user.return_value = "whocares" + async with AppClient() as cli: resp = await cli.post( - "write_bulk_specification", headers={"content-type": "application/json"}) + "write_bulk_specification", headers={"content-type": "application/json"} + ) js = await resp.json() assert js == {"error": "The content-length header is required and must be > 0"} assert resp.status == 411 -async def test_write_bulk_specification_fail_large_input(): - async with AppClient(config) as cli: +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_fail_large_input(get_user): + get_user.return_value = "i_do_not" + async with AppClient() as cli: resp = await cli.post("write_bulk_specification/", json="a" * (1024 * 1024 - 2)) txt = await resp.text() # this seems to be a built in (somewhat inaccurate) server feature - assert txt == "Maximum request body size 1048576 exceeded, actual body size 1048576" + assert ( + txt + == "Maximum request body size 1048576 exceeded, actual body size 1048576" + ) assert resp.status == 413 async def _write_bulk_specification_json_fail(json: Any, err: str): - async with AppClient(config) as cli: + async with AppClient() as cli: resp = await cli.post("write_bulk_specification", json=json) js = await resp.json() assert js == {"error": err} assert resp.status == 400 -async def test_write_bulk_specification_fail_not_dict(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_fail_not_dict(get_user): + get_user.return_value = "dontcare" await _write_bulk_specification_json_fail( - ["foo"], "The top level JSON element must be a mapping") + ["foo"], "The top level JSON element must be a mapping" + ) -async def test_write_bulk_specification_fail_no_output_dir(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_fail_no_output_dir(get_user): + get_user.return_value = "dontcare" await _write_bulk_specification_json_fail( - {}, "output_directory is required and must be a string") + {}, "output_directory is required and must be a string" + ) -async def test_write_bulk_specification_fail_wrong_type_for_output_dir(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_fail_wrong_type_for_output_dir(get_user): + get_user.return_value = "dontcare" await _write_bulk_specification_json_fail( - {"output_directory": 4}, "output_directory is required and must be a string") + {"output_directory": 4}, "output_directory is required and must be a string" + ) -async def test_write_bulk_specification_fail_no_file_type(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_fail_no_file_type(get_user): + get_user.return_value = "dontcare" await _write_bulk_specification_json_fail( - {"output_directory": "foo"}, "Invalid output_file_type: None") + {"output_directory": "foo"}, "Invalid output_file_type: None" + ) -async def test_write_bulk_specification_fail_wrong_file_type(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_fail_wrong_file_type(get_user): + get_user.return_value = "dontcare" await _write_bulk_specification_json_fail( - {"output_directory": "foo", 
"output_file_type": "XSV"}, "Invalid output_file_type: XSV") + {"output_directory": "foo", "output_file_type": "XSV"}, + "Invalid output_file_type: XSV", + ) -async def test_write_bulk_specification_fail_invalid_type_value(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_write_bulk_specification_fail_invalid_type_value(get_user): + get_user.return_value = "dontcare" await _write_bulk_specification_json_fail( {"output_directory": "foo", "output_file_type": "CSV", "types": {"a": "fake"}}, - "The value for data type a must be a mapping" + "The value for data type a must be a mapping", ) -async def test_importer_filetypes(): +@pytest.mark.asyncio +@patch("staging_service.auth2Client.KBaseAuth2.get_user") +async def test_importer_filetypes(get_user): """ Only checks a few example entries since the list may expand over time """ - async with AppClient(config) as cli: + get_user.return_value = "dontcare" + async with AppClient() as cli: resp = await cli.get("importer_filetypes") js = await resp.json() assert set(js.keys()) == {"datatype_to_filetype", "filetype_to_extensions"} diff --git a/tests/test_app_error_formatter.py b/tests/test_app_error_formatter.py index a700fd9b..5a1aa3ff 100644 --- a/tests/test_app_error_formatter.py +++ b/tests/test_app_error_formatter.py @@ -1,7 +1,13 @@ from pathlib import Path from staging_service.app_error_formatter import format_import_spec_errors -from staging_service.import_specifications.file_parser import ErrorType, Error, SpecificationSource +from staging_service.import_specifications.file_parser import ( + Error, + ErrorType, + SpecificationSource, +) + +# pylint: disable=C0116 def _ss(file: str, tab: str = None) -> SpecificationSource: @@ -15,10 +21,11 @@ def test_format_import_spec_errors_no_input(): def test_format_import_spec_errors_one_error(): errors = [Error(ErrorType.OTHER, "foobar")] assert format_import_spec_errors(errors, {}) == [ - {"type": "unexpected_error", - "message": "foobar", - "file": None, - } + { + "type": "unexpected_error", + "message": "foobar", + "file": None, + } ] @@ -31,7 +38,7 @@ def test_format_import_spec_errors_all_the_errors_no_tabs(): ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, "foobar4", _ss("file4"), - _ss("file5") + _ss("file5"), ), Error(ErrorType.NO_FILES_PROVIDED), Error(ErrorType.FILE_NOT_FOUND, source_1=_ss("file6")), @@ -45,31 +52,31 @@ def test_format_import_spec_errors_all_the_errors_no_tabs(): Path("file6"): Path("f6"), } assert format_import_spec_errors(errors, paths) == [ - {"type": "unexpected_error", - "message": "foobar1", - "file": "f1", - }, - {"type": "cannot_parse_file", - "message": "foobar2", - "file": "f2", - "tab": None - }, - {"type": "incorrect_column_count", - "message": "foobar3", - "file": "f3", - "tab": None, - }, - {"type": "multiple_specifications_for_data_type", - "message": "foobar4", - "file_1": "f4", - "tab_1": None, - "file_2": "f5", - "tab_2": None, - }, + { + "type": "unexpected_error", + "message": "foobar1", + "file": "f1", + }, + {"type": "cannot_parse_file", "message": "foobar2", "file": "f2", "tab": None}, + { + "type": "incorrect_column_count", + "message": "foobar3", + "file": "f3", + "tab": None, + }, + { + "type": "multiple_specifications_for_data_type", + "message": "foobar4", + "file_1": "f4", + "tab_1": None, + "file_2": "f5", + "tab_2": None, + }, {"type": "no_files_provided"}, - {"type": "cannot_find_file", - "file": "f6", - }, + { + "type": "cannot_find_file", + "file": "f6", + }, ] @@ -81,7 +88,7 @@ def 
test_format_import_spec_errors_all_the_errors_with_tabs(): ErrorType.MULTIPLE_SPECIFICATIONS_FOR_DATA_TYPE, "foobar3", _ss("file3", "tab3"), - _ss("file4", "tab4") + _ss("file4", "tab4"), ), ] paths = { @@ -91,21 +98,24 @@ def test_format_import_spec_errors_all_the_errors_with_tabs(): Path("file4"): Path("f4"), } assert format_import_spec_errors(errors, paths) == [ - {"type": "cannot_parse_file", - "message": "foobar1", - "file": "f1", - "tab": "tab1" - }, - {"type": "incorrect_column_count", - "message": "foobar2", - "file": "f2", - "tab": "tab2", - }, - {"type": "multiple_specifications_for_data_type", - "message": "foobar3", - "file_1": "f3", - "tab_1": "tab3", - "file_2": "f4", - "tab_2": "tab4", - }, - ] \ No newline at end of file + { + "type": "cannot_parse_file", + "message": "foobar1", + "file": "f1", + "tab": "tab1", + }, + { + "type": "incorrect_column_count", + "message": "foobar2", + "file": "f2", + "tab": "tab2", + }, + { + "type": "multiple_specifications_for_data_type", + "message": "foobar3", + "file_1": "f3", + "tab_1": "tab3", + "file_2": "f4", + "tab_2": "tab4", + }, + ] diff --git a/tests/test_autodetect.py b/tests/test_autodetect.py index 21bbbc82..32b24856 100644 --- a/tests/test_autodetect.py +++ b/tests/test_autodetect.py @@ -1,11 +1,12 @@ import pytest + +from staging_service.app import inject_config_dependencies from staging_service.autodetect.GenerateMappings import ( - file_format_to_extension_mapping, extensions_mapping, + file_format_to_extension_mapping, ) from staging_service.AutoDetectUtils import AutoDetectUtils -from staging_service.app import inject_config_dependencies -from tests.test_utils import bootstrap_config +from tests.test_helpers import bootstrap_config @pytest.fixture(autouse=True, scope="module") @@ -20,7 +21,7 @@ def test_config(): #TODO Can test PATH injections as well :return: """ - mappings = AutoDetectUtils._MAPPINGS + mappings = AutoDetectUtils.has_mappings() assert mappings @@ -36,11 +37,7 @@ def test_bad_filenames(): filename=filename ) assert possible_importers is None - assert fileinfo == { - "prefix": filename, - "suffix": None, - "file_ext_type": [] - } + assert fileinfo == {"prefix": filename, "suffix": None, "file_ext_type": []} def test_reasonable_filenames(): @@ -48,16 +45,17 @@ def test_reasonable_filenames(): Test variants "reasonable" filenames that are in the mappings Note: Some apps and mappings are not yet supported and are commented out """ - good_filenames = [] for heading in file_format_to_extension_mapping.keys(): extensions = file_format_to_extension_mapping[heading] for extension in extensions: - good_filenames.append(( - f"{heading}.{extension}", - heading.count("."), - extensions_mapping[extension]['file_ext_type'] - )) + good_filenames.append( + ( + f"{heading}.{extension}", + heading.count("."), + extensions_mapping[extension]["file_ext_type"], + ) + ) for filename, heading_dotcount, ext in good_filenames: for filename_variant in [ @@ -71,12 +69,16 @@ def test_reasonable_filenames(): ) assert possible_importers is not None expected_suffix = filename_variant.split(".", heading_dotcount + 1)[-1] - assert possible_importers == AutoDetectUtils._MAPPINGS["types"][ - expected_suffix.lower()]["mappings"], filename_variant + assert ( + possible_importers + == AutoDetectUtils.get_mappings_by_extension(expected_suffix.lower())[ + "mappings" + ] + ), filename_variant assert fileinfo == { - "prefix": filename_variant[:-len(expected_suffix) - 1], + "prefix": filename_variant[: -len(expected_suffix) - 1], "suffix": 
expected_suffix, - "file_ext_type": ext + "file_ext_type": ext, } @@ -85,72 +87,95 @@ def test_specific_filenames(): Test some made up filenames to check that multi-dot extensions are handled correctly """ test_data = [ - ("filename", (None, {"prefix": "filename", "suffix": None, "file_ext_type": []})), - ("file.name", (None, {"prefix": "file.name", "suffix": None, "file_ext_type": []})), - ("fil.en.ame", (None, {"prefix": "fil.en.ame", "suffix": None, "file_ext_type": []})), - ("file.gZ", ( - [{ - 'app_weight': 1, - 'id': 'decompress', - 'title': 'Decompress/Unpack', - }], - {"prefix": "file" , "suffix": "gZ", "file_ext_type": ['CompressedFileFormatArchive']}, - ) - ), - ("file.name.gZ", ( - [{ - 'app_weight': 1, - 'id': 'decompress', - 'title': 'Decompress/Unpack', - }], - {"prefix": "file.name", - "suffix": "gZ", - "file_ext_type": ['CompressedFileFormatArchive'] - }, - ) - ), - ("oscar_the_grouch_does_meth.FaStA.gz", ( - [ - {'app_weight': 1, - 'id': 'assembly', - 'title': 'Assembly', + ( + "filename", + (None, {"prefix": "filename", "suffix": None, "file_ext_type": []}), + ), + ( + "file.name", + (None, {"prefix": "file.name", "suffix": None, "file_ext_type": []}), + ), + ( + "fil.en.ame", + (None, {"prefix": "fil.en.ame", "suffix": None, "file_ext_type": []}), + ), + ( + "file.gZ", + ( + [ + { + "app_weight": 1, + "id": "decompress", + "title": "Decompress/Unpack", + } + ], + { + "prefix": "file", + "suffix": "gZ", + "file_ext_type": ["CompressedFileFormatArchive"], }, - {'app_weight': 1, - 'id': 'gff_genome', - 'title': 'GFF/FASTA Genome', + ), + ), + ( + "file.name.gZ", + ( + [ + { + "app_weight": 1, + "id": "decompress", + "title": "Decompress/Unpack", + } + ], + { + "prefix": "file.name", + "suffix": "gZ", + "file_ext_type": ["CompressedFileFormatArchive"], }, - {'app_weight': 1, - 'id': 'gff_metagenome', - 'title': 'GFF/FASTA MetaGenome', - } - ], - {"prefix": "oscar_the_grouch_does_meth", - "suffix": "FaStA.gz", - "file_ext_type": ["FASTA"] - }, - ) - ), - ("look.at.all.these.frigging.dots.gff2.gzip", ( - [ - {'app_weight': 1, - 'id': 'gff_genome', - 'title': 'GFF/FASTA Genome', - }, - {'app_weight': 1, - 'id': 'gff_metagenome', - 'title': 'GFF/FASTA MetaGenome', - } - ], - {"prefix": "look.at.all.these.frigging.dots", - "suffix": "gff2.gzip", - "file_ext_type": ["GFF"] - }, - ) - ) + ), + ), + ( + "oscar_the_grouch_does_meth.FaStA.gz", + ( + [ + {"app_weight": 1, "id": "assembly", "title": "Assembly"}, + {"app_weight": 1, "id": "gff_genome", "title": "GFF/FASTA Genome"}, + { + "app_weight": 1, + "id": "gff_metagenome", + "title": "GFF/FASTA MetaGenome", + }, + ], + { + "prefix": "oscar_the_grouch_does_meth", + "suffix": "FaStA.gz", + "file_ext_type": ["FASTA"], + }, + ), + ), + ( + "look.at.all.these.frigging.dots.gff2.gzip", + ( + [ + {"app_weight": 1, "id": "gff_genome", "title": "GFF/FASTA Genome"}, + { + "app_weight": 1, + "id": "gff_metagenome", + "title": "GFF/FASTA MetaGenome", + }, + ], + { + "prefix": "look.at.all.these.frigging.dots", + "suffix": "gff2.gzip", + "file_ext_type": ["GFF"], + }, + ), + ), ] for filename, importers in test_data: - assert AutoDetectUtils.determine_possible_importers(filename) == importers, filename + assert ( + AutoDetectUtils.determine_possible_importers(filename) == importers + ), filename def test_sra_mappings(): @@ -160,12 +185,15 @@ def test_sra_mappings(): """ sra_file = "test.sra" possible_importers, fileinfo = AutoDetectUtils.determine_possible_importers( - filename=sra_file) - assert possible_importers == [{ - 'id': 'sra_reads', - 
'app_weight': 1, - 'title': 'SRA Reads', - }] + filename=sra_file + ) + assert possible_importers == [ + { + "id": "sra_reads", + "app_weight": 1, + "title": "SRA Reads", + } + ] assert fileinfo == {"prefix": "test", "suffix": "sra", "file_ext_type": ["SRA"]} @@ -176,49 +204,59 @@ def test_zip_mappings(): """ gz_file = "test.tar.gz" possible_importers, fileinfo = AutoDetectUtils.determine_possible_importers( - filename=gz_file) - assert possible_importers == [{ - 'id': 'decompress', - 'app_weight': 1, - 'title': 'Decompress/Unpack', - }] + filename=gz_file + ) + assert possible_importers == [ + { + "id": "decompress", + "app_weight": 1, + "title": "Decompress/Unpack", + } + ] assert fileinfo == { "prefix": "test", "suffix": "tar.gz", - 'file_ext_type': ['CompressedFileFormatArchive'] + "file_ext_type": ["CompressedFileFormatArchive"], } def test_get_mappings(): """ Basic test of the get mappings logic. Most of the logic is in determine_possible_importers - which is throughly tested above. + which is thoroughly tested above. """ - assert AutoDetectUtils.get_mappings(["filename", "file.name.Gz", "some.dots.gff3.gz"]) == { + assert AutoDetectUtils.get_mappings( + ["filename", "file.name.Gz", "some.dots.gff3.gz"] + ) == { "mappings": [ None, - [{ - 'app_weight': 1, - 'id': 'decompress', - 'title': 'Decompress/Unpack', - }], [ - {'app_weight': 1, - 'id': 'gff_genome', - 'title': 'GFF/FASTA Genome', - }, - {'app_weight': 1, - 'id': 'gff_metagenome', - 'title': 'GFF/FASTA MetaGenome', + { + "app_weight": 1, + "id": "decompress", + "title": "Decompress/Unpack", } ], + [ + { + "app_weight": 1, + "id": "gff_genome", + "title": "GFF/FASTA Genome", + }, + { + "app_weight": 1, + "id": "gff_metagenome", + "title": "GFF/FASTA MetaGenome", + }, + ], ], "fileinfo": [ {"prefix": "filename", "suffix": None, "file_ext_type": []}, - {"prefix": "file.name", - "suffix": "Gz", - "file_ext_type": ['CompressedFileFormatArchive'] - }, - {"prefix": "some.dots", "suffix": "gff3.gz", "file_ext_type": ['GFF']}, - ] + { + "prefix": "file.name", + "suffix": "Gz", + "file_ext_type": ["CompressedFileFormatArchive"], + }, + {"prefix": "some.dots", "suffix": "gff3.gz", "file_ext_type": ["GFF"]}, + ], } diff --git a/tests/test_helpers.py b/tests/test_helpers.py new file mode 100644 index 00000000..5b7ed70e --- /dev/null +++ b/tests/test_helpers.py @@ -0,0 +1,102 @@ +import configparser +import os +import shutil +import traceback +from pathlib import Path +from typing import Any + +import openpyxl +from dotenv import load_dotenv + +test_config = configparser.ConfigParser() +test_config.read(os.environ["KB_DEPLOYMENT_CONFIG"]) + +DATA_DIR = test_config["staging_service"]["DATA_DIR"] +META_DIR = test_config["staging_service"]["META_DIR"] +AUTH_URL = test_config["staging_service"]["AUTH_URL"] +if DATA_DIR.startswith("."): + DATA_DIR = os.path.normpath(os.path.join(os.getcwd(), DATA_DIR)) +if META_DIR.startswith("."): + META_DIR = os.path.normpath(os.path.join(os.getcwd(), META_DIR)) + + +def bootstrap(): + """ + Attempts to load environment variables from a number of potentially + present files + + Not sure it actually does anything useful. 
+ """ + test_env_0 = "../test.env" + test_env_1 = "test.env" + test_env_2 = "test/test.env" + + for potential_env_file in [test_env_0, test_env_1, test_env_2]: + try: + load_dotenv(potential_env_file, verbose=True) + except IOError: + pass + + +def bootstrap_config(): + bootstrap() + config_filepath = os.environ["KB_DEPLOYMENT_CONFIG"] + if not os.path.exists(config_filepath): + raise FileNotFoundError(config_filepath) + + config = configparser.ConfigParser() + config.read(config_filepath) + return config + + +def assert_exception_correct(got: Exception, expected: Exception): + err = "".join(traceback.TracebackException.from_exception(got).format()) + assert got.args == expected.args, err + assert isinstance(got, type(expected)) + + +def assert_file_contents(file: Path, lines: list[str]): + with open(file, "r", encoding="utf-8") as f: + assert f.readlines() == lines + + +def check_excel_contents( + wb: openpyxl.Workbook, + sheetname: str, + contents: list[list[Any]], + column_widths: list[int], +): + sheet = wb[sheetname] + for i, row in enumerate(sheet.iter_rows()): + assert [cell.value for cell in row] == contents[i] + # presumably there's an easier way to do this, but it works so f it + dims = [sheet.column_dimensions[dim].width for dim in sheet.column_dimensions] + assert dims == column_widths + + +class FileUtil: + def __init__(self, base_dir=DATA_DIR): + self.base_dir = base_dir + + def __enter__(self): + os.makedirs(self.base_dir, exist_ok=True) + shutil.rmtree(self.base_dir) + os.makedirs(self.base_dir, exist_ok=False) + return self + + def __exit__(self, *args): + shutil.rmtree(self.base_dir) + + def make_file(self, path, contents): + path = os.path.join(self.base_dir, path) + with open(path, encoding="utf-8", mode="w") as f: + f.write(contents) + return path + + def make_dir(self, path): + path = os.path.join(self.base_dir, path) + os.makedirs(path, exist_ok=True) + return path + + def remove_dir(self, path): + shutil.rmtree(path) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 599f0cef..22baf494 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -1,24 +1,39 @@ """ Unit tests for the metadata handling routines. """ import json +import os +import random +import string import uuid - from collections.abc import Generator -from pathlib import Path as PyPath -from pytest import fixture - -from staging_service.utils import Path -from staging_service.metadata import some_metadata +from pathlib import Path -from tests.test_app import FileUtil +from aiohttp import web +from pytest import fixture, raises -# TODO TEST add more unit tests here. 
+from staging_service.metadata import ( + FILE_SNIPPET_SIZE, + NOT_TEXT_FILE_VALUE, + SOURCE_JGI_IMPORT, + _determine_source, + _file_read_from_head, + _file_read_from_tail, + is_text_string, + some_metadata, +) +from staging_service.utils import StagingPath +from tests.test_helpers import FileUtil -@fixture(scope="module") -def temp_dir() -> Generator[PyPath, None, None]: +@fixture(scope="function", name="temp_dir") +def temp_dir_fixture() -> Generator[Path, None, None]: + """ + A fixture to generate a unique subdirectory within the DATA_DIR + directory set in the test configuration (which defaults to the "data" + directory in the root of the repo) + """ with FileUtil() as fu: - childdir = PyPath(fu.make_dir(str(uuid.uuid4()))).resolve() + childdir = Path(fu.make_dir(str(uuid.uuid4()))).resolve() yield childdir @@ -27,35 +42,32 @@ def temp_dir() -> Generator[PyPath, None, None]: async def test_incomplete_metadata_file_update(temp_dir: Path): """ - Tests the case where a file is listed or checked for existance prior to completing + Tests the case where a file is listed or checked for existence prior to completing upload, and then an UPA is added to the file. This previously caused incomplete metadata to be returned as the logic for whether to run the metadata regeneration code based on the contents of the current metadata file was incorrect. See https://kbase-jira.atlassian.net/browse/PTV-1767 """ await _incomplete_metadata_file_update( - temp_dir, - {"source": "some place", "UPA": "1/2/3"}, - "some place" - ) + temp_dir, {"source": "some place", "UPA": "1/2/3"}, "some place" + ) + + await _incomplete_metadata_file_update(temp_dir, {"UPA": "1/2/3"}, "Unknown") - await _incomplete_metadata_file_update( - temp_dir, - {"UPA": "1/2/3"}, - "Unknown" - ) async def _incomplete_metadata_file_update(temp_dir, metadict, source): - target = Path( - str(temp_dir / "full"), - str(temp_dir / "meta"), + target = StagingPath( + os.path.join(temp_dir, "full"), + os.path.join(temp_dir, "meta"), "user_path", "myfilename", - "super_fake_jgi_path") + "super_fake_jgi_path", + ) - with open(target.full_path, "w") as p: + with open(target.full_path, "w", encoding="utf-8") as p: p.writelines(make_test_lines(1, 6)) - with open(target.metadata_path, "w") as p: + + with open(target.metadata_path, "w", encoding="utf-8") as p: p.write(json.dumps(metadict)) res = await some_metadata(target) @@ -70,11 +82,277 @@ async def _incomplete_metadata_file_update(temp_dir, metadict, source): "tail": "".join(make_test_lines(2, 6)), "UPA": "1/2/3", "isFolder": False, - "lineCount": "5", + "lineCount": 5, "name": "myfilename", "path": "user_path", - "size": 1280 - } + "size": 1280, + } + def make_test_lines(start, stop): return [str(i) + "a" * (256 - len(str(i)) - 1) + "\n" for i in range(start, stop)] + + +async def _generate_binary_file(temp_dir): + target = StagingPath( + os.path.join(temp_dir, "full"), + os.path.join(temp_dir, "meta"), + "user_path", + "myfilename", + "", + ) + with open(target.full_path, "wb") as data_file: + data_file.write(b"\0\1\2\3") + + res = await some_metadata(target) + + assert "mtime" in res + assert isinstance(res["mtime"], int) + assert "lineCount" in res + assert res["lineCount"] is None + assert res["head"] == NOT_TEXT_FILE_VALUE + assert res["tail"] == NOT_TEXT_FILE_VALUE + + +async def test_binary_data(temp_dir: Path): + """ + Test case in which metadata is generated for a binary file. 
+    """
+    await _generate_binary_file(temp_dir)
+
+
+def test_is_text():
+    should_be_text = b"hello I'm text"
+    should_not_be_text = b"\0\1\2\4"
+    is_almost_text = b"hello \0oops"
+    text_with_acceptable_control_characters = b"hello\tthis \nis a\rstring"
+    invalid_unicode = b"\ac2\xa3"
+
+    assert is_text_string(should_be_text) is True
+    assert is_text_string(text_with_acceptable_control_characters) is True
+    assert is_text_string(should_not_be_text) is False
+    assert is_text_string(is_almost_text) is False
+    assert is_text_string(invalid_unicode) is False
+
+
+async def test_determine_source_jgi_metadata_source_exists(temp_dir: Path):
+    staging_path = StagingPath(
+        os.path.join(temp_dir, "full"),
+        os.path.join(temp_dir, "meta"),
+        "user_path",
+        "myfilename",
+        os.path.join(temp_dir, "jgi_metadata"),
+    )
+
+    # In this test the actual data doesn't matter, just the fact that a file exists.
+    with open(staging_path.jgi_metadata, "w", encoding="utf-8") as data_file:
+        data_file.write("foo")
+
+    assert _determine_source(staging_path) == SOURCE_JGI_IMPORT
+
+
+def _create_validated_staging_path(staging_dir: str, file_path: str):
+    # Model actual usage, in which we have:
+    # - a user
+    # - a directory dedicated to user data - each user has a subdirectory
+    #   inside it named after their username
+    # - a directory dedicated to metadata, parallel to the data dir
+    # - some file, which is the target of our path
+    username = "some_user"
+    data_dir = os.path.join(staging_dir, "data")
+    metadata_dir = os.path.join(staging_dir, "metadata")
+
+    # The user's data directory must be created, as the JGI metadata file
+    # lives IN the data dir (even though KBase metadata files live in the
+    # parallel metadata directory).
+    os.makedirs(os.path.join(data_dir, username, os.path.dirname(file_path)))
+
+    # Here we "configure" the class. Note that this configuration is global to
+    # the class (a possible isolation fixture is sketched after this helper).
+    StagingPath._DATA_DIR = data_dir
+    StagingPath._META_DIR = metadata_dir
+
+    # Note that "validate_path" does not actually validate the path; it just
+    # populates the object with the various paths based on the config above,
+    # the username, and a file, possibly nested in a subdirectory.
+    return StagingPath.validate_path(username, file_path)
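+
+
+# The _DATA_DIR / _META_DIR configuration set in the helper above is global to
+# the StagingPath class and is never reset by these tests. Purely as a sketch
+# (this fixture is an assumption, not something the tests below rely on), that
+# state could be isolated per test along these lines:
+@fixture(scope="function")
+def isolated_staging_path_config(tmp_path: Path) -> Generator[None, None, None]:
+    # Save whatever configuration the class currently carries (the attributes
+    # may be unset, hence getattr), point it at a throwaway directory, and
+    # restore it afterwards so tests cannot leak state into one another.
+    saved_data_dir = getattr(StagingPath, "_DATA_DIR", None)
+    saved_meta_dir = getattr(StagingPath, "_META_DIR", None)
+    StagingPath._DATA_DIR = str(tmp_path / "data")
+    StagingPath._META_DIR = str(tmp_path / "metadata")
+    yield
+    StagingPath._DATA_DIR = saved_data_dir
+    StagingPath._META_DIR = saved_meta_dir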
+
+
+def test_determine_source_jgi_metadata_source_exists_canonical(temp_dir: Path):
+    """
+    Tests the case of a simple, top-level file.
+    """
+    file_path = "foo.bar"
+    filename_base = os.path.basename(file_path)
+    staging_path = _create_validated_staging_path(temp_dir, file_path)
+    with open(staging_path.jgi_metadata, "w", encoding="utf-8") as data_file:
+        data_file.write(f".{staging_path.full_path}.{filename_base}jgi")
+
+    assert _determine_source(staging_path) == SOURCE_JGI_IMPORT
+
+
+def test_determine_source_jgi_metadata_source_exists_canonical_subdir(temp_dir: Path):
+    """
+    Tests the case of a file in a subdirectory.
+    """
+    file_path = "some/foo.bar"
+    filename_base = os.path.basename(file_path)
+    staging_path = _create_validated_staging_path(temp_dir, file_path)
+    with open(staging_path.jgi_metadata, "w", encoding="utf-8") as data_file:
+        data_file.write(f".{staging_path.full_path}.{filename_base}jgi")
+
+    assert _determine_source(staging_path) == SOURCE_JGI_IMPORT
+
+
+def test_determine_source_jgi_metadata_source_exists_canonical_deepsubdir(
+    temp_dir: Path,
+):
+    """
+    Tests the case of a file in a deeply nested directory.
+    """
+    file_path = "some/deep/dark/scary/foo.bar"
+    filename_base = os.path.basename(file_path)
+    staging_path = _create_validated_staging_path(temp_dir, file_path)
+    with open(staging_path.jgi_metadata, "w", encoding="utf-8") as data_file:
+        data_file.write(f".{staging_path.full_path}.{filename_base}jgi")
+
+    assert _determine_source(staging_path) == SOURCE_JGI_IMPORT
+
+
+def test_determine_source_jgi_metadata_source_doesnt_exist(temp_dir: Path):
+    #
+    # The _determine_source logic tests whether the jgi_metadata file exists for
+    # this StagingPath object. More precisely, it tests whether the path given by
+    # the jgi_metadata string exists and is a file, so the attribute must always
+    # be populated with SOME string.
+    #
+    # This may seem strange, but the reason is that a StagingPath is NEVER created
+    # with the constructor in the application code itself, as we do here. It is
+    # always constructed by its own methods, the only two methods it has, which
+    # blindly construct the path to the file, whether it exists or not.
+    #
+    # I really don't quite understand this design, but that is the way it is,
+    # for now.
+    #
+    # So the conditions under which "no jgi metadata" is determined are
+    # (see the sketch after this test):
+    # - a path which is not associated with a file or directory (not found)
+    # - a path associated with a directory
+    #
+    values_that_indicate_no_jgi_import_metadata_file = [
+        "",  # common case of leaving it empty if no metadata file is possible
+        "zzz",  # nonsensical file name
+        os.path.join(
+            temp_dir, "jgi_metadata"
+        ),  # in the right place, but the file still doesn't exist
+        os.path.join(temp_dir),  # a directory
+    ]
+    for invalid_path in values_that_indicate_no_jgi_import_metadata_file:
+        staging_path = StagingPath(
+            os.path.join(temp_dir, "full"),
+            os.path.join(temp_dir, "meta"),
+            "user_path",
+            "myfilename",
+            invalid_path,
+        )
+        assert _determine_source(staging_path) == "Unknown"
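+
+
+# A minimal sketch (a hypothetical helper, not used by any test in this module)
+# of the rule described above, assuming _determine_source only inspects the
+# jgi_metadata path of the StagingPath it is given:
+def _expected_source_for(jgi_metadata_path: str) -> str:
+    # A regular file at the jgi_metadata path marks the upload as a JGI import;
+    # anything else (empty string, nonexistent path, directory) is "Unknown".
+    if os.path.isfile(jgi_metadata_path):
+        return SOURCE_JGI_IMPORT
+    return "Unknown"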
+
+
+@fixture(scope="function")
+def temp_dir2() -> Generator[Path, None, None]:
+    with FileUtil() as fu:
+        childdir = Path(fu.make_dir(str(uuid.uuid4()))).resolve()
+
+        yield childdir
+
+
+def make_random_string(string_length: int) -> str:
+    # Seeding makes the generated content deterministic from run to run.
+    random.seed(42)
+    possible_letters = string.ascii_letters
+    return "".join(
+        random.choice(possible_letters) for _ in range(string_length)  # NOSONAR #nosec
+    )
+
+
+def test_read_from_head_happy(tmp_path: Path):
+    """
+    In which we read a text snippet successfully from the front of the file.
+    """
+    cases = [1, 10, 100, 1000, 10000, 100000]
+    # FILE_SNIPPET_SIZE is the snippet length used by metadata.py
+    snippet_length = FILE_SNIPPET_SIZE
+    for case in cases:
+        file_path = os.path.join(tmp_path, "happy_head.txt")
+        file_content = make_random_string(case)
+        with open(file_path, "w", encoding="utf-8") as output_file:
+            output_file.write(file_content)
+        snippet = _file_read_from_head(file_path)
+        assert snippet == file_content[:snippet_length]
+
+
+def test_read_from_tail_happy(tmp_path: Path):
+    """
+    In which we read a text snippet successfully from the tail of the file.
+    """
+    cases = [1, 10, 100, 1000, 10000, 100000]
+    # FILE_SNIPPET_SIZE is the snippet length used by metadata.py
+    snippet_length = FILE_SNIPPET_SIZE
+    for case in cases:
+        file_path = os.path.join(tmp_path, "happy_tail.txt")
+        file_content = make_random_string(case)
+        with open(file_path, "w", encoding="utf-8") as output_file:
+            output_file.write(file_content)
+        snippet = _file_read_from_tail(file_path)
+        assert snippet == file_content[-snippet_length:]
+
+
+def test_read_from_head_sad(tmp_path: Path):
+    """
+    In which we attempt to read a snippet from a non-text (binary)
+    file, and get the default "error" string.
+    """
+    cases = [b"\ac2\xa3"]
+    for case in cases:
+        file_path = os.path.join(tmp_path, "sad_head.txt")
+        file_content = case
+        with open(file_path, "wb") as output_file:
+            output_file.write(file_content)
+        snippet = _file_read_from_head(file_path)
+        assert snippet == NOT_TEXT_FILE_VALUE
+
+
+def test_read_from_tail_sad(tmp_path: Path):
+    """
+    In which we attempt to read a snippet from a non-text (binary)
+    file, and get the default "error" string.
+    """
+    cases = [b"\ac2\xa3"]
+    for case in cases:
+        file_path = os.path.join(tmp_path, "sad_tail.txt")
+        file_content = case
+        with open(file_path, "wb") as output_file:
+            output_file.write(file_content)
+        snippet = _file_read_from_tail(file_path)
+        assert snippet == NOT_TEXT_FILE_VALUE
+
+
+async def test_invalid_desired_fields(temp_dir: Path):
+    staging_path = StagingPath(
+        os.path.join(temp_dir, "full"),
+        os.path.join(temp_dir, "meta"),
+        "user_path",
+        "myfilename",
+        "",
+    )
+
+    with open(staging_path.full_path, "w", encoding="utf-8") as p:
+        p.write("foo")
+
+    with raises(web.HTTPBadRequest) as ex:
+        await some_metadata(staging_path, ["foo"])
+    assert str(ex.value) == "Bad Request"
+    # Not a lovely error message, but it is the current behavior; it should be improved.
+ assert ex.value.text == "no data exists for key ('foo',)" diff --git a/tests/test_utils.py b/tests/test_utils.py deleted file mode 100644 index 94143456..00000000 --- a/tests/test_utils.py +++ /dev/null @@ -1,54 +0,0 @@ -import configparser -import traceback - -from dotenv import load_dotenv -from pathlib import Path -from typing import Any -import os - -import openpyxl - -def bootstrap(): - test_env_0 = "../test.env" - test_env_1 = "test.env" - test_env_2 = "test/test.env" - - for item in [test_env_0, test_env_1, test_env_2]: - try: - load_dotenv(item, verbose=True) - except Exception: - pass - - -def bootstrap_config(): - bootstrap() - config_filepath = os.environ["KB_DEPLOYMENT_CONFIG"] - if not os.path.exists(config_filepath): - raise FileNotFoundError(config_filepath) - - config = configparser.ConfigParser() - config.read(config_filepath) - return config - - -def assert_exception_correct(got: Exception, expected: Exception): - err = "".join(traceback.TracebackException.from_exception(got).format()) - assert got.args == expected.args, err - assert type(got) == type(expected) - -def check_file_contents(file: Path, lines: list[str]): - with open(file) as f: - assert f.readlines() == lines - -def check_excel_contents( - wb: openpyxl.Workbook, - sheetname: str, - contents: list[list[Any]], - column_widths: list[int] -): - sheet = wb[sheetname] - for i, row in enumerate(sheet.iter_rows()): - assert [cell.value for cell in row] == contents[i] - # presumably there's an easier way to do this, but it works so f it - dims = [sheet.column_dimensions[dim].width for dim in sheet.column_dimensions] - assert dims == column_widths \ No newline at end of file