diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..a5116a3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,32 @@ +--- +name: Bug report +about: Report a bug you've found in this project +title: '' +labels: bug +assignees: '' + +--- + +**Description** +A clear and concise description of what the bug is. + +**Expected Behaviour** +What you expected to happen instead. + +**To Reproduce** +Steps to reproduce the bug. + +**Error Log** +Paste any relevant error logs below: +``` + +``` + +**Screenshots** +Add screenshots to illustrate the bug if you want. + +**Your Setup** + - Commit/version of this repo: + +**Anything Else?** +... diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..bc634e0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,17 @@ +--- +name: Feature request +about: Suggest a feature or improvement for this project +title: '' +labels: enhancement +assignees: '' + +--- + +**Overview** +Give a brief description of what the feature or improvement should do and why. + +**Possible Solutions** +Do you have any ideas for how you'd want to implement it? + +**Anything Else?** +Add any other information here. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..f936383 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,4 @@ +- [ ] I have read the [section on commits and pull requests](https://github.com/NaturalHistoryMuseum/splitgill/blob/main/CONTRIBUTING.md#commits-and-pull-requests) in `CONTRIBUTING.md` + + +Describe your changes, tagging relevant issues where possible. diff --git a/.github/SUPPORT.md b/.github/SUPPORT.md index 92ea969..00159f0 100644 --- a/.github/SUPPORT.md +++ b/.github/SUPPORT.md @@ -3,7 +3,10 @@ ## Documentation - [splitgill documentation](https://splitgill.readthedocs.io) +## Issues +- [Issues for splitgill](https://github.com/NaturalHistoryMuseum/splitgill/issues) +- [The NHM on GitHub](https://github.com/NaturalHistoryMuseum) + ## Contact Us - [Gitter](https://gitter.im/nhm-data-portal/lobby) - [Email _data@nhm.ac.uk_](mailto:data@nhm.ac.uk) -- [Twitter](https://twitter.com/nhm_data) diff --git a/.github/workflows/pull-requests.yml b/.github/workflows/pull-requests.yml new file mode 100644 index 0000000..134e108 --- /dev/null +++ b/.github/workflows/pull-requests.yml @@ -0,0 +1,27 @@ +name: Validate pull requests + +on: + pull_request: + types: [opened, edited, reopened, synchronize] + +jobs: + validate-commits: + name: Validate commit messages + runs-on: ubuntu-latest + steps: + - name: Checkout source code + uses: actions/checkout@v4 + - name: Check commit message format + uses: webiny/action-conventional-commits@v1.3.0 + with: + allowed-commit-types: 'feat,fix,refactor,perf,docs,style,test,build,ci,chore,new,patch,revert,ui,merge' + pre-commit: + name: Run pre-commit checks + runs-on: ubuntu-latest + steps: + - name: Checkout source code + uses: actions/checkout@v4 + - name: Setup Python + uses: actions/setup-python@v5 + - name: Run pre-commit + uses: pre-commit/action@v3.0.1 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2d4525a..19b0b7e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ exclude: /(vendor|dist)/ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v6.0.0 hooks: - id: check-merge-conflict
- id: detect-private-key @@ -11,18 +11,18 @@ repos: exclude: ^tests/helpers/ - id: trailing-whitespace - repo: https://github.com/commitizen-tools/commitizen - rev: v3.30.0 + rev: v4.11.0 hooks: - id: commitizen additional_dependencies: ['cz-nhm'] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.7.1 + rev: v0.14.10 hooks: - id: ruff args: [ '--fix', '--select', 'I', '--select', 'F401', '--fix-only' ] - id: ruff-format - repo: https://github.com/PyCQA/docformatter - rev: eb1df34 + rev: v1.7.7 hooks: - id: docformatter args: [ '-i', '--config', './pyproject.toml' ] diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 78f3b26..734d94a 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -7,7 +7,10 @@ build: python: install: - - requirements: docs/requirements.txt + - method: pip + path: . + extra_requirements: + - docs mkdocs: configuration: mkdocs.yml diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000..e14c03a --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,19 @@ +cff-version: 1.2.0 +title: splitgill +message: >- + If you use this software, please cite it using the + metadata from this file. +type: software +authors: + - name: Natural History Museum + city: London + country: GB + alias: NHM + email: data@nhm.ac.uk +repository-code: 'https://github.com/NaturalHistoryMuseum/splitgill' +abstract: Versioned search library using MongoDB and Elasticsearch. +keywords: + - search + - nhm +license: GPL-3.0-or-later +version: 3.1.1 diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..2bdd451 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,132 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. 
+ +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +[data@nhm.ac.uk](mailto:data@nhm.ac.uk). +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at +[https://www.contributor-covenant.org/translations][translations]. 
+ +[homepage]: https://www.contributor-covenant.org +[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq +[translations]: https://www.contributor-covenant.org/translations diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..7dcb685 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,182 @@ +# Contributing + +Hi! Thanks for considering contributing to splitgill. :sauropod: + +We welcome suggestions, bug reports, and code or documentation changes. This document will give you a quick overview of how to make one of these contributions. + +If you're completely new to Git, GitHub, or contributing to open source, you might find it helpful to read some other material first, like [W3's Git Tutorial](https://www.w3schools.com/git), [GitHub Quickstart](https://docs.github.com/en/get-started/quickstart), or [How to Contribute to Open Source](https://opensource.guide/how-to-contribute). This document does assume some familiarity with these concepts for brevity, but we're happy to help anyone who's new or needs further clarification. + + +## Quick links + +- [Code of Conduct](./CODE_OF_CONDUCT.md) +- [Support](./.github/SUPPORT.md) +- [Issues](https://github.com/NaturalHistoryMuseum/splitgill/issues) + + +## Table of contents + +1. [Introduction](#introduction) +2. [Questions](#questions) +3. [Suggestions and bug reports](#suggestions-and-bug-reports) +4. [Commits and pull requests](#commits-and-pull-requests) + 1. [Commits](#commits) + 2. [Code changes and style guide](#code-changes-and-style-guide) + 3. [Documentation changes](#documentation-changes) + + +## Introduction + +### Code of Conduct + +We have a [Code of Conduct](./CODE_OF_CONDUCT.md), which all contributors and community participants are expected to adhere to. + +### Official maintainers + +This repository and [several others](https://github.com/search?q=props.maintainers:%22science%20data%20platforms%22+org:NaturalHistoryMuseum&type=repositories) are maintained by the Science Data Platforms team at the Natural History Museum, London. This is a very small team, so contributions are very welcome! + +The current core team consists of: +- Ginger ([@alycejenni](https://github.com/alycejenni)) - Principal Software Engineer + +## Questions + +Before asking your question, have you checked: +- [The README](./README.md) +- [The documentation](https://splitgill.readthedocs.io) +- [The existing issues](https://github.com/NaturalHistoryMuseum/splitgill/issues?q=is:issue) + +If none of those answer your question, try contacting us before raising it as an issue on GitHub. You can find places to contact us in [SUPPORT.md](./.github/SUPPORT.md). + + +## Suggestions and bug reports + +Suggestions, feature requests, and bug reports are all submitted as [GitHub issues](https://docs.github.com/en/issues). + +See GitHub's documentation for the basics on [how to create an issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/creating-an-issue). Before you create an issue, please check the [existing issues](https://github.com/NaturalHistoryMuseum/splitgill/issues?q=is:issue) to see if it's already been raised. + +### Good bug reports + +We've provided a template for bug reports. It's not 100% necessary to follow this template exactly, but it does demonstrate the kind of information that will be useful to anyone trying to fix the problem. 
The most important information is anything that will allow someone to try to recreate your issue on their own system. + +### Good feature suggestions + +We've also provided a template for feature suggestions. This is fairly sparse, and again isn't completely necessary to follow exactly. Please just try to provide as much information about your idea as possible. + + +## Commits and pull requests + +If you want to contribute a change to any of the files in the repository, whether it's code, documentation, or repository meta files like `.gitignore`, you'll need to make commits and pull requests. + +The process is generally as follows: + +1. [Fork the repository](https://docs.github.com/en/get-started/quickstart/fork-a-repo). +2. Create a new branch to work on by branching off `dev`. We name ours in the format `user/feature-name`, e.g. `ginger/deprecate-ckanpackager`. Try to only work on one topic (e.g. implementing one feature, or fixing a couple of related bugs) per branch. +3. Make your changes, and commit often; each commit should only contain one change. See below for specifics on how to word your commits. +4. Push your changes back to your fork. +5. [Open a pull request](https://docs.github.com/en/get-started/quickstart/contributing-to-projects#making-a-pull-request) in this repository, with the base branch set to **dev** and the compare branch set to your new branch. Provide a summary of your changes in the description. +6. There are several automated checks that will run when you open the pull request. Try to make all of them pass. If you do not at least _attempt_ to make them pass, we will not merge your pull request. + 1. Tests. Update them so that they pass, if necessary. New tests are always welcome in any pull request, but if you have added a new feature that has decreased the coverage, new tests are required. + 2. Commit format validation. If you have not followed the conventional commits format for one or more of your commits, this will fail. + 3. Code format validation. If you have not formatted your code correctly (using Ruff and docformatter), this will fail. +7. Wait for feedback from one of the core maintainers. If it's been a week or so and we haven't responded, we may not have seen it. You can find other places to contact us in [SUPPORT.md](./.github/SUPPORT.md). + +### Commits + +Our commits follow the [Conventional Commits](https://www.conventionalcommits.org) style for consistency and to make it easy to generate changelogs. Please follow this style when you make your commits. We have our own slightly tweaked version of the default style called [`cz-nhm`](https://github.com/NaturalHistoryMuseum/cz-nhm). It's very similar with just a few additions. + +Commits are formatted as follows - not every line is required: +``` +<type>(<scope>): <subject> + +<body> + +BREAKING CHANGE: <details> + +Closes: <issue numbers> +``` + +e.g. +``` +fix(actions): write a short lowercase description of what you did + +Give a bit more context or detail about the changes. + +Closes: #1, #3 +``` + +Or, a very basic but still completely valid commit: +``` +feat: add an exciting new feature +``` + +The tools described below are very useful for generating these messages and sticking to the structure, so we _highly_ recommend using them. + +#### Tools + +We use a few tools to help us with code standardisation and keeping to the conventional commits style. When making a commit, the two key tools to know about are: + +1. [commitizen](https://commitizen-tools.github.io/commitizen) +2.
[pre-commit](https://pre-commit.com) + +Both tools are Python-based and can be installed with `pip`. You'll probably want to use something like [pipx](https://pypa.github.io/pipx) or [venv](https://docs.python.org/3/library/venv.html) instead of installing them globally, but for simplicity, the instructions below are just for pip. + +##### commitizen + +commitizen is a CLI tool for creating commits in the conventional commits style. + +To install with `pip`: +```shell +# NB: cz-nhm must be installed in the same environment as commitizen +pip install commitizen cz-nhm +``` + +Then to make a commit: +```shell +cz c +# and follow the prompts +``` + +##### pre-commit + +pre-commit is a tool that runs a variety of checks and modifications before a commit is made. You can check the [.pre-commit-config.yaml](./.pre-commit-config.yaml) file to see exactly what it's currently configured to do for this repository, but of particular note: + +- reformats Python code with [Ruff](https://docs.astral.sh/ruff) +- reformats docstrings with [docformatter](https://github.com/PyCQA/docformatter) +- checks your commit message is correctly formatted + +To install with `pip`: +```shell +pip install pre-commit +``` + +When installed, the checks will be run on all staged files when you try to make a commit. If any of them fail or make any modifications, the commit will be abandoned and you will have to write the message again. Because of this, it's probably best to run the checks on the staged files manually before you even attempt a commit: +```shell +pre-commit run +``` + +Don't forget to stage any modifications that it makes! Once it runs without failing, then you can make your commit. + +Something to remember is that empty docstrings will cause conflicts between Ruff and docformatter and the checks will fail repeatedly - so don't leave your docstrings empty! + +### Code changes and style guide + +We generally use external style guides and tools to help us maintain standardised code. Ruff and docformatter will be run with pre-commit. + +#### Python + +We use [Ruff](https://docs.astral.sh/ruff) to format our code, using defaults for everything except quote style (we use single quotes). + +We also _mostly_ use [CKAN's style](http://docs.ckan.org/en/latest/contributing/python.html), with the following exceptions: +- prefer `f''` strings over `.format()` +- don't use `u''` strings +- use double quotes for docstrings, not single quotes + +### Documentation changes + +Our documentation is generated using [MkDocs](https://www.mkdocs.org), then hosted on [Read the Docs](https://about.readthedocs.com). You can view the current documentation for this repository at [splitgill.readthedocs.io](https://splitgill.readthedocs.io). + +Most of the documentation for this repository is generated automatically by pulling _docstrings_ from the Python code. This documentation is placed in the "API" subfolder in the rendered output. + +There are also a few pages written as standalone `.md` files, e.g. [installation.md](./docs/installation.md), or as a subfolder, e.g. [usage](./docs/usage/index.md). You can also edit these or create new pages. In most of our extensions these are still very sparse or automatically generated from content in the README, but the extension `ckanext-versioned-datastore` has a [good example of more complex documentation](https://github.com/NaturalHistoryMuseum/ckanext-versioned-datastore/tree/main/docs/usage/downloads).
+ +Once you've made your changes, follow the commit and pull request guidelines as you would for a code change. You will almost certainly be using the `docs:` commit prefix. diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index f9bd145..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -include requirements.txt diff --git a/docs/_scripts/gen_api_pages.py b/docs/_scripts/gen_api_pages.py index 4bfaa9c..9ea960a 100644 --- a/docs/_scripts/gen_api_pages.py +++ b/docs/_scripts/gen_api_pages.py @@ -1,6 +1,5 @@ # !/usr/bin/env python # encoding: utf-8 - """ Generate the code reference pages and navigation. @@ -35,7 +34,7 @@ nav[parts] = doc_path.as_posix() - with mkdocs_gen_files.open(full_doc_path, "w") as fd: + with mkdocs_gen_files.open(full_doc_path, 'w') as fd: ident = '.'.join(parts) fd.write(f'::: splitgill.{ident}') diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 310cdc8..0000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -mkdocs -mkdocstrings[python] -mkdocs-material -mkdocs-gen-files -mkdocs-section-index -mkdocs-include-markdown-plugin diff --git a/pyproject.toml b/pyproject.toml index 40f1008..6c2e69a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,12 +75,38 @@ tag_format = "v$version" update_changelog_on_bump = true changelog_incremental = false version_files = [ - "pyproject.toml:version" + "pyproject.toml:version", + "CITATION.cff:^version" ] -[tool.pylint] -max-line-length = 88 -disable = ["C0114", "R0903"] +[tool.ruff] +target-version = "py38" + +[tool.ruff.format] +quote-style = "single" + +[tool.ruff.lint] +select = [ + "E", + "F", + "I", + "D" +] +ignore = [ + "D100", + "D104", + "D200", + "D202", + "D203", + "D205", + "D206", + "D212", + "D300", + "D401", + "E111", + "E114", + "E117" +] [tool.docformatter] wrap-summaries = 88 diff --git a/splitgill/diffing.py b/splitgill/diffing.py index 14d9d58..6913e92 100644 --- a/splitgill/diffing.py +++ b/splitgill/diffing.py @@ -1,21 +1,21 @@ import abc from collections import deque from dataclasses import dataclass -from datetime import datetime, date +from datetime import date, datetime from itertools import zip_longest from typing import ( - Iterable, - Tuple, Any, - Union, - NamedTuple, - Dict, + Collection, Deque, - List, - TypeVar, + Dict, Generic, + Iterable, + List, + NamedTuple, Optional, - Collection, + Tuple, + TypeVar, + Union, ) import regex as rx @@ -24,23 +24,23 @@ # strftime formats used to turn datetime and date objects into strings before data # enters MongoDB (see prepare_data), these are based on ISO 8601 -DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%f%z" -DATE_FORMAT = "%Y-%m-%d" +DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%S.%f%z' +DATE_FORMAT = '%Y-%m-%d' # when we turn a naive datetime into a string using the DATETIME_FORMAT above, %z won't # appear meaning we can't strptime with the same format. This is annoying, so here's a # strptime format that can parse this native result -NAIVE_DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%f" +NAIVE_DATETIME_FORMAT = '%Y-%m-%dT%H:%M:%S.%f' # a convenient tuple of all the datetime formats Splitgill uses internally SG_DATE_FORMATS = (DATETIME_FORMAT, DATE_FORMAT, NAIVE_DATETIME_FORMAT) # this regex matches invalid characters which we would like to remove from all string # values as they are ingested into the system. It matches unicode control characters # (i.e. category C*) but not \n, \r, or \t). 
-invalid_value_char_regex = rx.compile(r"[^\P{C}\n\r\t]") +invalid_value_char_regex = rx.compile(r'[^\P{C}\n\r\t]') # this regex matches invalid characters which we would like to remove from all field # names as they are ingested into the system. It matches all unicode control characters, # so it's a bit stricter than the value regex which allows new lines and tabs -invalid_key_char_regex = rx.compile(r"[^\P{C}]") +invalid_key_char_regex = rx.compile(r'[^\P{C}]') def prepare_data( @@ -68,7 +68,7 @@ def prepare_data( if isinstance(value, str): # replace any invalid characters in the string with the empty string - return invalid_value_char_regex.sub("", value) + return invalid_value_char_regex.sub('', value) if isinstance(value, (int, float, bool)): return value @@ -114,16 +114,16 @@ def prepare_field_name(name: Any) -> str: :param name: the field name :return: a clean str field name """ - clean_name = invalid_key_char_regex.sub("", str(name)).replace(".", "_").strip() + clean_name = invalid_key_char_regex.sub('', str(name)).replace('.', '_').strip() # if this results in the empty string, replace with a hyphen if not clean_name: - return "-" + return '-' if name == DATA_ID_FIELD: return DATA_ID_FIELD - if clean_name[0] == "_": - clean_name = f"-{clean_name[1:]}" + if clean_name[0] == '_': + clean_name = f'-{clean_name[1:]}' return clean_name @@ -138,7 +138,7 @@ class DiffOp(NamedTuple): ops: Dict[str, Any] -_T = TypeVar("_T") +_T = TypeVar('_T') @dataclass @@ -152,7 +152,7 @@ class Comparison(abc.ABC, Generic[_T]): right: _T @abc.abstractmethod - def compare(self) -> Tuple[Optional[DiffOp], List["Comparison"]]: + def compare(self) -> Tuple[Optional[DiffOp], List['Comparison']]: """ Compare the two objects and return a 2-tuple containing a DiffOp and a list of further comparisons which need to be handled. If no differences are found, then @@ -168,7 +168,7 @@ class DictComparison(Comparison[dict]): A comparison between two dicts. """ - def compare(self) -> Tuple[Optional[DiffOp], List["Comparison"]]: + def compare(self) -> Tuple[Optional[DiffOp], List['Comparison']]: """ Compares the two dicts and return a 2-tuple containing a DiffOp and a list of further comparisons which need to be handled. If no differences are found, then @@ -184,11 +184,11 @@ def compare(self) -> Tuple[Optional[DiffOp], List["Comparison"]]: key: value for key, value in self.right.items() if key not in self.left } if new_values: - ops["dn"] = new_values + ops['dn'] = new_values deleted_keys = [key for key in self.left if key not in self.right] if deleted_keys: - ops["dd"] = deleted_keys + ops['dd'] = deleted_keys changes = {} for key, left_value in self.left.items(): @@ -211,7 +211,7 @@ def compare(self) -> Tuple[Optional[DiffOp], List["Comparison"]]: else: changes[key] = right_value if changes: - ops["dc"] = changes + ops['dc'] = changes return DiffOp(self.path, ops) if ops else None, further_comparisons @@ -222,7 +222,7 @@ class ListComparison(Comparison[list]): A comparison between two lists. """ - def compare(self) -> Tuple[DiffOp, List["Comparison"]]: + def compare(self) -> Tuple[DiffOp, List['Comparison']]: """ Compares the two lists and return a 2-tuple containing a DiffOp and a list of further comparisons which need to be handled. 
If no differences are found, then @@ -244,13 +244,13 @@ def compare(self) -> Tuple[DiffOp, List["Comparison"]]: if left_value is missing: # the right list is longer, so store all the new values so that they can # just be added to the left list to patch it, and stop - ops["ln"] = self.right[index:] + ops['ln'] = self.right[index:] break elif right_value is missing: # the left value is longer, so store the index from which elements in # the left list will be deleted to shorten it to the length of the right # list, and stop - ops["ld"] = index + ops['ld'] = index break else: # a change in the values at this index in each list, check for nested @@ -267,7 +267,7 @@ def compare(self) -> Tuple[DiffOp, List["Comparison"]]: else: changes.append((index, right_value)) if changes: - ops["lc"] = changes + ops['lc'] = changes return DiffOp(self.path, ops) if ops else None, further_comparisons @@ -300,7 +300,7 @@ def diff(base: dict, new: dict) -> Iterable[DiffOp]: return if not isinstance(base, dict) or not isinstance(new, dict): - raise DiffingTypeComparisonException("Both base and new must be dicts") + raise DiffingTypeComparisonException('Both base and new must be dicts') # todo: we could write a shortcut when one of the dicts is empty @@ -334,7 +334,7 @@ def patch(base: dict, ops: Collection[DiffOp]) -> dict: :param base: the starting dict :param ops: the DiffOps to apply to the base dict (can be pure tuples, doesn't have - to be DiffOp namedtuples) + to be DiffOp namedtuples) :return: a new dict with the changes applied """ # nothing to do @@ -355,21 +355,21 @@ def patch(base: dict, ops: Collection[DiffOp]) -> dict: target = target_copy # dict ops - if "dc" in op: - target.update(op["dc"]) - if "dn" in op: - target.update(op["dn"]) - if "dd" in op: - for key in op["dd"]: + if 'dc' in op: + target.update(op['dc']) + if 'dn' in op: + target.update(op['dn']) + if 'dd' in op: + for key in op['dd']: del target[key] # list ops - if "lc" in op: - for index, value in op["lc"]: + if 'lc' in op: + for index, value in op['lc']: target[index] = value - if "ln" in op: - target.extend(op["ln"]) - if "ld" in op: - del target[op["ld"] :] + if 'ln' in op: + target.extend(op['ln']) + if 'ld' in op: + del target[op['ld'] :] return new diff --git a/splitgill/indexing/fields.py b/splitgill/indexing/fields.py index b665812..42532a8 100644 --- a/splitgill/indexing/fields.py +++ b/splitgill/indexing/fields.py @@ -1,14 +1,15 @@ import dataclasses from collections import Counter from enum import auto -from typing import Optional, List, Counter as CounterType, Union +from typing import Counter as CounterType +from typing import List, Optional, Union from strenum import LowercaseStrEnum, StrEnum # special field containing the record's ID so that it can be used for searching etc. # Internally, Splitgill doesn't need this, but it's included for user convenience in # every record as data._id / parsed._id. 
-DATA_ID_FIELD = "_id" +DATA_ID_FIELD = '_id' class DocumentField(LowercaseStrEnum): @@ -53,21 +54,21 @@ class ParsedType(StrEnum): """ # the unparsed raw field value - UNPARSED = "_u" + UNPARSED = '_u' # the number field - NUMBER = "_n" + NUMBER = '_n' # the date field - DATE = "_d" + DATE = '_d' # the boolean field - BOOLEAN = "_b" + BOOLEAN = '_b' # the text field - TEXT = "_t" + TEXT = '_t' # the keyword case-insensitive field - KEYWORD = "_k" + KEYWORD = '_k' # the geo point field (shape centroid, will always be a point) - GEO_POINT = "_gp" + GEO_POINT = '_gp' # the geo shape field (full shape, could be point, linestring, or polygon) - GEO_SHAPE = "_gs" + GEO_SHAPE = '_gs' def path_to(self, field: str, full: bool = True) -> str: """ @@ -75,7 +76,7 @@ def path_to(self, field: str, full: bool = True) -> str: :param field: the name (including dots if needed) of the field :param full: whether to prepend the parsed field name to the path or not - (default: True) + (default: True) :return: the path """ return parsed_path(field, self, full) @@ -93,16 +94,16 @@ def parsed_path( :param field: the name (including dots if needed) of the field :param parsed_type: the parsed type (default: None) :param full: whether to prepend the parsed field name to the path or not (default: - True) + True) :return: the path """ if parsed_type is not None: - path = f"{field}.{parsed_type}" + path = f'{field}.{parsed_type}' else: path = field if full: - return f"{DocumentField.DATA}.{path}" + return f'{DocumentField.DATA}.{path}' else: return path @@ -117,13 +118,13 @@ class DataType(LowercaseStrEnum): The types represented here should match the output of diffing.prepare_data. """ - NONE = "#n" - STR = "#s" - INT = "#i" - FLOAT = "#f" - BOOL = "#b" - LIST = "#l" - DICT = "#d" + NONE = '#n' + STR = '#s' + INT = '#i' + FLOAT = '#f' + BOOL = '#b' + LIST = '#l' + DICT = '#d' @classmethod def type_for(cls, value: Union[str, int, float, bool, dict, list, None]): @@ -136,9 +137,9 @@ def type_for(cls, value: Union[str, int, float, bool, dict, list, None]): """ if value is not None and not isinstance(value, valid_data_types): raise TypeError( - f"Type ({type(value)}) of value ({value}) not valid DataType" + f'Type ({type(value)}) of value ({value}) not valid DataType' ) - return DataType(f"#{type(value).__name__[0].lower()}") + return DataType(f'#{type(value).__name__[0].lower()}') @dataclasses.dataclass @@ -155,21 +156,21 @@ class DataField: # given types type_counts: CounterType[DataType] = dataclasses.field(default_factory=Counter) # the parent data field (if None, this is a field at the root of the record data) - parent: Optional["DataField"] = None + parent: Optional['DataField'] = None # the immediate descendants of this field (will only have values if this field # appears as a list or dict - children: List["DataField"] = dataclasses.field(default_factory=list) + children: List['DataField'] = dataclasses.field(default_factory=list) def add(self, type_names: str, count: int): """ Add the given type count data to this field. :param type_names: the types this field is seen as a string of their names - separated by commas. + separated by commas. 
:param count: the number of records with this combination of types """ self.count += count - for name in type_names.split(","): + for name in type_names.split(','): self.type_counts[DataType(name)] += count def is_type(self, *data_types: DataType) -> bool: @@ -178,7 +179,7 @@ def is_type(self, *data_types: DataType) -> bool: :param data_types: the data types to be checked :return: True if the field is an instance of one of the given data types, False - if not + if not """ return any(self.type_counts[data_type] > 0 for data_type in data_types) @@ -188,7 +189,7 @@ def has_children(self) -> bool: @property def name(self) -> str: - return self.path.split(".")[-1] + return self.path.split('.')[-1] @property def is_none(self) -> bool: @@ -267,11 +268,11 @@ def parsed_path(self) -> str: :return: a str path """ - return ".".join(filter(None, self.path.split("."))) + return '.'.join(filter(None, self.path.split('.'))) @property def is_list_element(self) -> bool: - return self.name == "" + return self.name == '' @dataclasses.dataclass @@ -289,11 +290,11 @@ def add(self, type_names: str, count: int): Add the given type count data to this field. :param type_names: the types this field is seen as a string of their names - separated by commas. + separated by commas. :param count: the number of records with this combination of types """ self.count += count - for raw_type in type_names.split(","): + for raw_type in type_names.split(','): self.type_counts[ParsedType(raw_type)] += count def is_type(self, *parsed_types: ParsedType) -> bool: @@ -302,13 +303,13 @@ def is_type(self, *parsed_types: ParsedType) -> bool: :param parsed_types: the parsed types to be checked :return: True if the field is an instance of one of the given parsed types, - False if not + False if not """ return any(self.type_counts[parsed_type] > 0 for parsed_type in parsed_types) @property def name(self) -> str: - return self.path.split(".")[-1] + return self.path.split('.')[-1] @property def is_text(self) -> bool: diff --git a/splitgill/indexing/geo.py b/splitgill/indexing/geo.py index 82a09b6..408cd85 100644 --- a/splitgill/indexing/geo.py +++ b/splitgill/indexing/geo.py @@ -1,13 +1,13 @@ import re from functools import lru_cache from itertools import chain -from typing import Optional, Iterable +from typing import Iterable, Optional import orjson from cytoolz.itertoolz import sliding_window -from fastnumbers import try_float, RAISE +from fastnumbers import RAISE, try_float from pyproj import CRS, Transformer -from shapely import Point, LineString, Polygon, from_wkt, from_geojson, to_wkt +from shapely import LineString, Point, Polygon, from_geojson, from_wkt, to_wkt from shapely.geometry.base import BaseGeometry from shapely.ops import transform @@ -45,8 +45,8 @@ def match_hints(data: dict, hints: Iterable[GeoFieldHint]) -> dict: matches = {} for hint in hints: - longitude = data.get(hint.lon_field, "") - latitude = data.get(hint.lat_field, "") + longitude = data.get(hint.lon_field, '') + latitude = data.get(hint.lat_field, '') try: point = Point(longitude, latitude) except (ValueError, TypeError): @@ -59,7 +59,7 @@ def match_hints(data: dict, hints: Iterable[GeoFieldHint]) -> dict: if hint.radius_field: try: radius = try_float( - data.get(hint.radius_field, ""), on_fail=RAISE, nan=RAISE, inf=RAISE + data.get(hint.radius_field, ''), on_fail=RAISE, nan=RAISE, inf=RAISE ) # only need to make a circle if the radius is greater than 0, this also # means we ignore negative radius values @@ -94,15 +94,15 @@ def create_polygon_circle( :param 
longitude: a longitude float value :param radius_in_metres: a radius in metres value :param segments: the quad_segs parameter to pass when creating the circle around the - point (the number of segments used will be 4*segments). + point (the number of segments used will be 4*segments). :return: a Polygon """ if radius_in_metres <= 0: - raise ValueError("Uncertainty cannot be <= 0") + raise ValueError('Uncertainty cannot be <= 0') # thanks to https://gis.stackexchange.com/a/289923 for this! aeqd_proj = CRS.from_proj4( - f"+proj=aeqd +lat_0={latitude} +lon_0={longitude} +x_0=0 +y_0=0" + f'+proj=aeqd +lat_0={latitude} +lon_0={longitude} +x_0=0 +y_0=0' ) tfmr = Transformer.from_proj(aeqd_proj, aeqd_proj.geodetic_crs) buf = Point(0, 0).buffer(radius_in_metres, quad_segs=segments) @@ -110,7 +110,7 @@ def create_polygon_circle( polygon = Polygon(transform(tfmr.transform, buf).exterior.coords[::-1]) # confirm that we've created something sensible if not is_shape_valid(polygon) or not is_winding_valid(polygon): - raise ValueError("Invalid circle generated") + raise ValueError('Invalid circle generated') return polygon @@ -135,11 +135,11 @@ def match_geojson(candidate: dict) -> Optional[dict]: :return: returns a dict ready for indexing or None """ # check to make sure trying to get GeoJSON out of this dict is even worth trying - if "type" not in candidate or "coordinates" not in candidate: + if 'type' not in candidate or 'coordinates' not in candidate: return None shape: Optional[BaseGeometry] = from_geojson( - orjson.dumps(candidate), on_invalid="ignore" + orjson.dumps(candidate), on_invalid='ignore' ) if shape is None or not is_shape_valid(shape): return None @@ -151,7 +151,7 @@ def match_geojson(candidate: dict) -> Optional[dict]: return to_parsed_dict(shape.centroid, shape) -wkt_start_check_regex = re.compile(r"(point|linestring|polygon) ", re.IGNORECASE) +wkt_start_check_regex = re.compile(r'(point|linestring|polygon) ', re.IGNORECASE) def match_wkt(candidate: str) -> Optional[dict]: @@ -167,7 +167,7 @@ def match_wkt(candidate: str) -> Optional[dict]: if wkt_start_check_regex.match(candidate) is None: return None - shape: Optional[BaseGeometry] = from_wkt(candidate, on_invalid="ignore") + shape: Optional[BaseGeometry] = from_wkt(candidate, on_invalid='ignore') if shape is None or not is_shape_valid(shape): return None diff --git a/splitgill/indexing/index.py b/splitgill/indexing/index.py index 50cb556..cdabc04 100644 --- a/splitgill/indexing/index.py +++ b/splitgill/indexing/index.py @@ -52,15 +52,15 @@ class IndexNames: def __init__(self, name: str): self.name = name # the base name of all indices for this database - self.base = f"data-{name}" + self.base = f'data-{name}' # the latest index name - self.latest = f"{self.base}-latest" + self.latest = f'{self.base}-latest' # the archive indices base name - self.arc_base = f"{self.base}-arc" + self.arc_base = f'{self.base}-arc' # wildcard name to catch all data indices (so both latest and all arcs) - self.wildcard = f"{self.base}-*" + self.wildcard = f'{self.base}-*' # wildcard name to catch all arc indices - self.arc_wildcard = f"{self.arc_base}-*" + self.arc_wildcard = f'{self.arc_base}-*' def get_arc(self, index: int) -> str: """ @@ -69,7 +69,7 @@ def get_arc(self, index: int) -> str: :param index: the archive index :return: the index name """ - return f"{self.arc_base}-{index}" + return f'{self.arc_base}-{index}' @dataclass @@ -102,12 +102,12 @@ class IndexOp(BulkOp): def serialise(self) -> str: # index ops are 2 lines, first the action 
metadata and then the document return ( - dumps({"index": {"_index": self.index, "_id": self.doc_id}}) - + b"\n" + dumps({'index': {'_index': self.index, '_id': self.doc_id}}) + + b'\n' # we have to use OPT_NON_STR_KEYS because we're using StrEnums and orjson # doesn't work with them :( + dumps(self.document, option=OPT_NON_STR_KEYS) - ).decode("utf-8") + ).decode('utf-8') @dataclass @@ -134,7 +134,7 @@ class RecordVersion: version: int parsed: ParsedData # pointer to the next RecordVersion - next: Optional["RecordVersion"] = None + next: Optional['RecordVersion'] = None # if this version has been deleted, this is set with the version it was deleted at deleted_at: Optional[int] = None @@ -164,14 +164,14 @@ def create_doc(self) -> dict: doc = { DocumentField.ID: self.record_id, DocumentField.VERSION: self.version, - DocumentField.VERSIONS: {"gte": self.version}, + DocumentField.VERSIONS: {'gte': self.version}, DocumentField.DATA: self.parsed.parsed, DocumentField.DATA_TYPES: self.parsed.data_types, DocumentField.PARSED_TYPES: self.parsed.parsed_types, } if self.version_end is not None: doc[DocumentField.NEXT] = self.version_end - doc[DocumentField.VERSIONS]["lt"] = self.version_end + doc[DocumentField.VERSIONS]['lt'] = self.version_end return doc def __eq__(self, other: Any) -> bool: @@ -205,7 +205,7 @@ class RecordVersions: @classmethod def build( cls, record: MongoRecord, all_options: Dict[int, ParsingOptions] - ) -> "RecordVersions": + ) -> 'RecordVersions': """ Build a new RecordVersion object using the data in the given record and all the available options. diff --git a/splitgill/indexing/options.py b/splitgill/indexing/options.py index 263b0e4..9bf61bd 100644 --- a/splitgill/indexing/options.py +++ b/splitgill/indexing/options.py @@ -1,7 +1,7 @@ from typing import Optional, Set -from splitgill.diffing import DATETIME_FORMAT, DATE_FORMAT, NAIVE_DATETIME_FORMAT -from splitgill.model import ParsingOptions, GeoFieldHint +from splitgill.diffing import DATE_FORMAT, DATETIME_FORMAT, NAIVE_DATETIME_FORMAT +from splitgill.model import GeoFieldHint, ParsingOptions class ParsingOptionsBuilder: @@ -18,7 +18,7 @@ def __init__(self, based_on: Optional[ParsingOptions] = None): # somewhat sensible representative idea to users of what the number actually is # and how it can be searched. This format will produce string representations of # numbers in scientific notation if it decides it needs to - self._float_format: str = "{0:.15g}" + self._float_format: str = '{0:.15g}' self._true_values: Set[str] = set() self._false_values: Set[str] = set() # add the formats we use for datetime and date objects during ingest by default @@ -51,7 +51,7 @@ def build(self) -> ParsingOptions: self._float_format, ) - def with_true_value(self, value: str) -> "ParsingOptionsBuilder": + def with_true_value(self, value: str) -> 'ParsingOptionsBuilder': """ Add the given value to the set of strings that means True and return self (for easy chaining). The value is lowercased before adding it to the set of accepted @@ -67,7 +67,7 @@ def with_true_value(self, value: str) -> "ParsingOptionsBuilder": self._true_values.add(value.lower()) return self - def with_false_value(self, value: str) -> "ParsingOptionsBuilder": + def with_false_value(self, value: str) -> 'ParsingOptionsBuilder': """ Add the given value to the set of strings that means False and return self (for easy chaining). 
The value is lowercased before adding it to the set of accepted @@ -83,7 +83,7 @@ def with_false_value(self, value: str) -> "ParsingOptionsBuilder": self._false_values.add(value.lower()) return self - def with_date_format(self, date_format: str) -> "ParsingOptionsBuilder": + def with_date_format(self, date_format: str) -> 'ParsingOptionsBuilder': """ Add the given date format to the set of date formats to parse and return self (for easing chaining). The date format should be one that datetime.strptime can @@ -105,7 +105,7 @@ def with_geo_hint( longitude_field: str, radius_field: Optional[str] = None, segments: int = 16, - ) -> "ParsingOptionsBuilder": + ) -> 'ParsingOptionsBuilder': """ Add the given lat/lon/radius field combination as a hint for the existence of a geo parsable field. The radius field name is optional. A segments parameter can @@ -135,7 +135,7 @@ def with_geo_hint( :param longitude_field: the name of the longitude field :param radius_field: the name of the radius field (optional) :param segments: the number of segments to use when creating the circle - (optional, defaults to 16) + (optional, defaults to 16) :return: self """ if latitude_field and longitude_field: @@ -144,7 +144,7 @@ def with_geo_hint( self._geo_hints.add(hint) return self - def with_keyword_length(self, keyword_length: int) -> "ParsingOptionsBuilder": + def with_keyword_length(self, keyword_length: int) -> 'ParsingOptionsBuilder': """ Sets the maximum keyword length which will be used when indexing. Any strings longer than this value will be trimmed down before they are sent to @@ -167,14 +167,14 @@ def with_keyword_length(self, keyword_length: int) -> "ParsingOptionsBuilder": """ if keyword_length < 1: # 0 would mean no keyword values would be indexed, minus numbers are silly - raise ValueError("Keyword length must be greater than 0") + raise ValueError('Keyword length must be greater than 0') if keyword_length > 32766: # lucerne has a term byte-length limit of ~32k so cap at that - raise ValueError("Keyword length must be less than 32766") + raise ValueError('Keyword length must be less than 32766') self._keyword_length = keyword_length return self - def with_float_format(self, float_format: str) -> "ParsingOptionsBuilder": + def with_float_format(self, float_format: str) -> 'ParsingOptionsBuilder': """ Sets the format string to use when converting a float to a string for indexing. The string will have its format() method called during indexing with the float @@ -186,7 +186,7 @@ def with_float_format(self, float_format: str) -> "ParsingOptionsBuilder": self._float_format = float_format return self - def clear_date_formats(self) -> "ParsingOptionsBuilder": + def clear_date_formats(self) -> 'ParsingOptionsBuilder': """ Clears out the date formats in this builder. Note that this will remove the default formats which handle the default way Splitgill handles datetime and date @@ -197,7 +197,7 @@ def clear_date_formats(self) -> "ParsingOptionsBuilder": self._date_formats.clear() return self - def reset_date_formats(self) -> "ParsingOptionsBuilder": + def reset_date_formats(self) -> 'ParsingOptionsBuilder': """ Reset the date formats in this builder back to the default set. @@ -209,7 +209,7 @@ def reset_date_formats(self) -> "ParsingOptionsBuilder": self._date_formats.add(NAIVE_DATETIME_FORMAT) return self - def clear_true_values(self) -> "ParsingOptionsBuilder": + def clear_true_values(self) -> 'ParsingOptionsBuilder': """ Clear out all true values in this builder. 
@@ -218,7 +218,7 @@ def clear_true_values(self) -> "ParsingOptionsBuilder": self._true_values.clear() return self - def clear_false_values(self) -> "ParsingOptionsBuilder": + def clear_false_values(self) -> 'ParsingOptionsBuilder': """ Clear out all false values in this builder. @@ -227,7 +227,7 @@ def clear_false_values(self) -> "ParsingOptionsBuilder": self._false_values.clear() return self - def clear_geo_hints(self) -> "ParsingOptionsBuilder": + def clear_geo_hints(self) -> 'ParsingOptionsBuilder': """ Clear out all geo hints in this builder. diff --git a/splitgill/indexing/parser.py b/splitgill/indexing/parser.py index 55dedff..439414f 100644 --- a/splitgill/indexing/parser.py +++ b/splitgill/indexing/parser.py @@ -1,11 +1,11 @@ from functools import lru_cache from itertools import groupby -from typing import Union, NamedTuple, Tuple +from typing import NamedTuple, Tuple, Union from fastnumbers import try_float -from splitgill.indexing.fields import ParsedType, DataType -from splitgill.indexing.geo import match_geojson, match_wkt, match_hints +from splitgill.indexing.fields import DataType, ParsedType +from splitgill.indexing.geo import match_geojson, match_hints, match_wkt from splitgill.model import ParsingOptions from splitgill.utils import parse_to_timestamp @@ -40,14 +40,14 @@ def parse(data: dict, options: ParsingOptions) -> ParsedData: # aggregations faster as there are fewer unique values) parsed_types.sort() parsed_types = [ - f"{path}.{','.join(pt.rsplit('.', 1)[1] for pt in group)}" - for path, group in groupby(parsed_types, lambda pt: pt.rsplit(".", 1)[0]) + f'{path}.{",".join(pt.rsplit(".", 1)[1] for pt in group)}' + for path, group in groupby(parsed_types, lambda pt: pt.rsplit('.', 1)[0]) ] data_types.sort() data_types = [ - f"{path}.{','.join(dt.rsplit('.', 1)[1] for dt in group)}" - for path, group in groupby(data_types, lambda dt: dt.rsplit(".", 1)[0]) + f'{path}.{",".join(dt.rsplit(".", 1)[1] for dt in group)}' + for path, group in groupby(data_types, lambda dt: dt.rsplit('.', 1)[0]) ] return ParsedData(parsed, data_types, parsed_types) @@ -63,7 +63,7 @@ def parse_dict(data: dict, options: ParsingOptions, check_geojson: bool) -> Pars :return: a ParsedData named tuple """ parsed = {} - data_types = [f"{key}.{DataType.type_for(value)}" for key, value in data.items()] + data_types = [f'{key}.{DataType.type_for(value)}' for key, value in data.items()] parsed_types = [] if check_geojson: @@ -80,25 +80,25 @@ def parse_dict(data: dict, options: ParsingOptions, check_geojson: bool) -> Pars parsed[key], dts, pts = parse_dict(value, options, True) else: parsed[key], dts, pts = parse_list(value, options) - data_types.extend(f"{key}.{dt}" for dt in dts) - parsed_types.extend(f"{key}.{pt}" for pt in pts) + data_types.extend(f'{key}.{dt}' for dt in dts) + parsed_types.extend(f'{key}.{pt}' for pt in pts) else: if value is None: parsed[key] = {ParsedType.UNPARSED: None} continue if not str(value): - parsed[key] = {ParsedType.UNPARSED: ""} + parsed[key] = {ParsedType.UNPARSED: ''} continue parsed_value = parse_value(value, options) parsed[key] = parsed_value - parsed_types.extend(f"{key}.{k}" for k in parsed_value.keys()) + parsed_types.extend(f'{key}.{k}' for k in parsed_value.keys()) hint_matches = match_hints(data, options.geo_hints) for key, geo_data in hint_matches.items(): # we want to add the geo data to the key's parsed data but the parsed dict is # a cached response from parse_value, so we have to make a copy parsed[key] = {**parsed[key], **geo_data} - 
parsed_types.extend(f"{key}.{k}" for k in geo_data.keys()) + parsed_types.extend(f'{key}.{k}' for k in geo_data.keys()) return ParsedData(parsed, data_types, parsed_types) @@ -113,7 +113,7 @@ def parse_list(data: list, options: ParsingOptions) -> Tuple[list, set, set]: :return: a list of parsed values, a set of parsed types, and a set of data types """ parsed: list = [None] * len(data) - data_types = {f".{DataType.type_for(value)}" for value in data} + data_types = {f'.{DataType.type_for(value)}' for value in data} parsed_types = set() for index, value in enumerate(data): @@ -124,7 +124,7 @@ def parse_list(data: list, options: ParsingOptions) -> Tuple[list, set, set]: parsed[index], dts, pts = parse_dict(value, options, True) else: parsed[index], dts, pts = parse_list(value, options) - data_types.update(f".{dt}" for dt in dts) + data_types.update(f'.{dt}' for dt in dts) # elasticsearch completely flattens lists so when adding the parsed types we # just ignore the hierarchy and store the types directly in our set parsed_types.update(pts) diff --git a/splitgill/indexing/syncing.py b/splitgill/indexing/syncing.py index f525334..e70114a 100644 --- a/splitgill/indexing/syncing.py +++ b/splitgill/indexing/syncing.py @@ -37,7 +37,7 @@ class BulkOpException(Exception): """ def __init__(self, errors: List[dict]): - super().__init__(f"{len(errors)} errors during bulk index. Sample: {errors[0]}") + super().__init__(f'{len(errors)} errors during bulk index. Sample: {errors[0]}') self.errors = errors @@ -117,7 +117,7 @@ def write_ops( # return the indices back to their original settings as defined in the template client.indices.put_settings( - body={"index": {"refresh_interval": None, "number_of_replicas": None}}, + body={'index': {'refresh_interval': None, 'number_of_replicas': None}}, index=result.indices, ) @@ -136,7 +136,7 @@ async def setup_indices(client: AsyncElasticsearch, indices: Iterable[str]): if not await client.indices.exists(index=index): await client.indices.create(index=index) await client.indices.put_settings( - body={"index": {"refresh_interval": -1, "number_of_replicas": 0}}, + body={'index': {'refresh_interval': -1, 'number_of_replicas': 0}}, index=index, ) @@ -240,15 +240,15 @@ async def worker(client: AsyncElasticsearch, task_queue: Queue) -> Tuple[int, in raise errors = [] - for item in response["items"]: + for item in response['items']: action = next(iter(item.keys())) - error = item[action].get("error", None) + error = item[action].get('error', None) if error: errors.append(item) else: - if action == "index": + if action == 'index': indexed += 1 - elif action == "delete": + elif action == 'delete': deleted += 1 if errors: diff --git a/splitgill/indexing/templates.py b/splitgill/indexing/templates.py index a70ae11..0b87cfc 100644 --- a/splitgill/indexing/templates.py +++ b/splitgill/indexing/templates.py @@ -10,11 +10,11 @@ def create_templates(client: Elasticsearch): :param client: an Elasticsearch client object """ client.indices.put_index_template( - name="arc-data-template", + name='arc-data-template', body=get_arc_template(), ) client.indices.put_index_template( - name="latest-data-template", + name='latest-data-template', body=get_latest_template(), ) @@ -29,7 +29,7 @@ def get_latest_template() -> dict: :return: the template as a dict """ return _get_template( - pattern="data-*-latest", + pattern='data-*-latest', shards=5, # must be higher priority than the arc template otherwise the patterns can't be # resolved when they both match an index name @@ -46,7 +46,7 @@ def 
get_arc_template() -> dict: :return: the template as a dict """ return _get_template( - pattern="data-*-arc-*", + pattern='data-*-arc-*', shards=1, priority=700, ) @@ -54,74 +54,74 @@ def get_arc_template() -> dict: def _get_template(pattern: str, shards: int, priority: int) -> dict: return { - "index_patterns": [pattern], - "priority": priority, - "template": { - "settings": { - "analysis": { - "normalizer": { - "lowercase_normalizer": { - "type": "custom", - "char_filter": [], - "filter": ["lowercase"], + 'index_patterns': [pattern], + 'priority': priority, + 'template': { + 'settings': { + 'analysis': { + 'normalizer': { + 'lowercase_normalizer': { + 'type': 'custom', + 'char_filter': [], + 'filter': ['lowercase'], } } }, - "index": { - "codec": "best_compression", - "number_of_shards": shards, - "number_of_replicas": 1, - "mapping": { - "total_fields": { + 'index': { + 'codec': 'best_compression', + 'number_of_shards': shards, + 'number_of_replicas': 1, + 'mapping': { + 'total_fields': { # this essentially means a maximum of around 500-600 fields, # but in reality the number of fields a record is indexed # into depends on how many values are recognised as geo or # list values and how many data types the values are parsed # into - "limit": 4000, + 'limit': 4000, }, }, }, }, - "mappings": { + 'mappings': { # we're handling dates ourselves so none of this please - "date_detection": False, + 'date_detection': False, # this is off by default anyway but just to make sure - "numeric_detection": False, - "properties": { - DocumentField.ID: {"type": "keyword"}, - DocumentField.VERSION: {"type": "date", "format": "epoch_millis"}, - DocumentField.NEXT: {"type": "date", "format": "epoch_millis"}, + 'numeric_detection': False, + 'properties': { + DocumentField.ID: {'type': 'keyword'}, + DocumentField.VERSION: {'type': 'date', 'format': 'epoch_millis'}, + DocumentField.NEXT: {'type': 'date', 'format': 'epoch_millis'}, DocumentField.VERSIONS: { - "type": "date_range", - "format": "epoch_millis", + 'type': 'date_range', + 'format': 'epoch_millis', }, - DocumentField.DATA_TYPES: {"type": "keyword"}, - DocumentField.PARSED_TYPES: {"type": "keyword"}, + DocumentField.DATA_TYPES: {'type': 'keyword'}, + DocumentField.PARSED_TYPES: {'type': 'keyword'}, # the text value of each field will be copied into this field for # easy querying (see the dynamic keyword_field below) - DocumentField.ALL_TEXT: {"type": "text"}, + DocumentField.ALL_TEXT: {'type': 'text'}, # the geo point value of each geo field will be copied into this # field for easy querying and map making (see the dynamic # keyword_field below) DocumentField.ALL_POINTS: { - "type": "geo_point", - "ignore_z_value": True, + 'type': 'geo_point', + 'ignore_z_value': True, }, # the geo shape value of each geo field will be copied into this # field for easy querying (see the dynamic keyword_field below) DocumentField.ALL_SHAPES: { - "type": "geo_shape", - "ignore_z_value": True, + 'type': 'geo_shape', + 'ignore_z_value': True, }, }, - "dynamic_templates": [ + 'dynamic_templates': [ # define all the parsed data types { - "data_unparsed": { - "match_pattern": "simple", - "path_match": ParsedType.UNPARSED.path_to("*", full=True), - "mapping": { + 'data_unparsed': { + 'match_pattern': 'simple', + 'path_match': ParsedType.UNPARSED.path_to('*', full=True), + 'mapping': { # setting enabled to false stops elasticsearch indexing # this field which means we can pass any value into it # (defining the type as an object is meaningless, but @@ -131,84 +131,84 @@ def 
_get_template(pattern: str, shards: int, priority: int) -> dict: # record to record and version to version so we can't # ensure it'll be the same type all the time and cannot # index it. - "type": "object", - "enabled": False, + 'type': 'object', + 'enabled': False, }, }, }, { - "data_geo_point": { - "match_pattern": "simple", - "path_match": ParsedType.GEO_POINT.path_to("*", full=True), - "mapping": { - "type": "geo_point", - "ignore_z_value": True, + 'data_geo_point': { + 'match_pattern': 'simple', + 'path_match': ParsedType.GEO_POINT.path_to('*', full=True), + 'mapping': { + 'type': 'geo_point', + 'ignore_z_value': True, # copy the value of this field into the all_points field # (note that this forces us to use WKT to define the # points in this field because elasticsearch can't do a # copy_to on objects, only values) - "copy_to": DocumentField.ALL_POINTS, + 'copy_to': DocumentField.ALL_POINTS, }, }, }, { - "data_geo_shape": { - "match_pattern": "simple", - "path_match": ParsedType.GEO_SHAPE.path_to("*", full=True), - "mapping": { - "type": "geo_shape", - "ignore_z_value": True, + 'data_geo_shape': { + 'match_pattern': 'simple', + 'path_match': ParsedType.GEO_SHAPE.path_to('*', full=True), + 'mapping': { + 'type': 'geo_shape', + 'ignore_z_value': True, # copy the value of this field into the all_shapes field # (note that this forces us to use WKT to define the # points in this field because elasticsearch can't do a # copy_to on objects, only values) - "copy_to": DocumentField.ALL_SHAPES, + 'copy_to': DocumentField.ALL_SHAPES, }, }, }, { - "data_text": { - "match_pattern": "simple", - "path_match": ParsedType.TEXT.path_to("*", full=True), - "mapping": { - "type": "text", + 'data_text': { + 'match_pattern': 'simple', + 'path_match': ParsedType.TEXT.path_to('*', full=True), + 'mapping': { + 'type': 'text', # copy the text value of this field into the all text # field - "copy_to": DocumentField.ALL_TEXT, + 'copy_to': DocumentField.ALL_TEXT, }, }, }, { - "data_keyword": { - "match_pattern": "simple", - "path_match": ParsedType.KEYWORD.path_to("*", full=True), - "mapping": { - "type": "keyword", + 'data_keyword': { + 'match_pattern': 'simple', + 'path_match': ParsedType.KEYWORD.path_to('*', full=True), + 'mapping': { + 'type': 'keyword', # lowercase the text when storing it, this allows # case-insensitive usage - "normalizer": "lowercase_normalizer", + 'normalizer': 'lowercase_normalizer', }, }, }, { - "data_number": { - "match_pattern": "simple", - "path_match": ParsedType.NUMBER.path_to("*", full=True), - "mapping": {"type": "double"}, + 'data_number': { + 'match_pattern': 'simple', + 'path_match': ParsedType.NUMBER.path_to('*', full=True), + 'mapping': {'type': 'double'}, }, }, { - "data_date": { - "match_pattern": "simple", - "path_match": ParsedType.DATE.path_to("*", full=True), - "mapping": {"type": "date", "format": "epoch_millis"}, + 'data_date': { + 'match_pattern': 'simple', + 'path_match': ParsedType.DATE.path_to('*', full=True), + 'mapping': {'type': 'date', 'format': 'epoch_millis'}, }, }, { - "data_boolean": { - "match_pattern": "simple", - "path_match": ParsedType.BOOLEAN.path_to("*", full=True), - "mapping": {"type": "boolean"}, + 'data_boolean': { + 'match_pattern': 'simple', + 'path_match': ParsedType.BOOLEAN.path_to('*', full=True), + 'mapping': {'type': 'boolean'}, }, }, ], diff --git a/splitgill/ingest.py b/splitgill/ingest.py index 39b5fc6..964eee1 100644 --- a/splitgill/ingest.py +++ b/splitgill/ingest.py @@ -1,12 +1,12 @@ from itertools import islice -from typing 
import Iterable, Union, Optional +from typing import Iterable, Optional, Union -from pymongo import InsertOne, UpdateOne, DeleteOne +from pymongo import DeleteOne, InsertOne, UpdateOne from pymongo.collection import Collection -from splitgill.diffing import prepare_data, diff, prepare_field_name +from splitgill.diffing import diff, prepare_data, prepare_field_name from splitgill.indexing.fields import DATA_ID_FIELD -from splitgill.model import Record, MongoRecord +from splitgill.model import MongoRecord, Record from splitgill.utils import partition MongoBulkOp = Union[InsertOne, UpdateOne, DeleteOne] @@ -67,13 +67,12 @@ def generate_ops( :param data_collection: the data collection containing any existing records :param records: the records to generate insert/update ops for :param modified_field: optional field containing a modified date. If this parameter - is specified, the check to see if there are any changes - between the old and new versions of the data will ignore this - field (if there are other fields that have changed, then a - full diff is generated with these fields included). Defaults - to None, indicating no modified field should be used. + is specified, the check to see if there are any changes between the old and new + versions of the data will ignore this field (if there are other fields that have + changed, then a full diff is generated with these fields included). Defaults to + None, indicating no modified field should be used. :param find_size: the number of records look up at a time. This corresponds directly - to the size of the $in query ID list. Defaults to 100. + to the size of the $in query ID list. Defaults to 100. :return: yields bulk Mongo ops """ # todo: refactor this, it's a bit messy @@ -81,8 +80,8 @@ def generate_ops( records_by_id = {record.id: record for record in chunk} # find if any of the records to be added/updated already exist in the collection existing = { - doc["id"]: MongoRecord(**doc) - for doc in data_collection.find({"id": {"$in": list(records_by_id)}}) + doc['id']: MongoRecord(**doc) + for doc in data_collection.find({'id': {'$in': list(records_by_id)}}) } # shortcut if no records exist @@ -90,9 +89,9 @@ def generate_ops( yield from ( InsertOne( { - "id": record.id, - "data": prepare_record_data(record), - "version": None, + 'id': record.id, + 'data': prepare_record_data(record), + 'version': None, } ) for record in records_by_id.values() @@ -111,7 +110,7 @@ def generate_ops( if record_id not in existing: # the record is new, insert and carry on to the next - yield InsertOne({"id": record_id, "data": new_data, "version": None}) + yield InsertOne({'id': record_id, 'data': new_data, 'version': None}) continue existing_record = existing[record_id] @@ -122,11 +121,11 @@ def generate_ops( if not existing_record.diffs: if not record.data: # the uncommitted record is being deleted, so delete it! 
- yield DeleteOne({"id": record.id}) + yield DeleteOne({'id': record.id}) elif any(diff(new_data, existing_record.data)): # the current record has one uncommitted version of the data and # no previous versions, just replace its data with the new data - yield UpdateOne({"id": record.id}, {"$set": {"data": new_data}}) + yield UpdateOne({'id': record.id}, {'$set': {'data': new_data}}) # the existing and new data are the same, nothing to do continue else: @@ -160,15 +159,15 @@ def generate_ops( # the existing record has been updated, yield the op necessary to # update it in mongo yield UpdateOne( - {"id": record.id}, + {'id': record.id}, { - "$set": { + '$set': { # set new latest data - "data": new_data, + 'data': new_data, # set version to None to indicate the change is uncommitted - "version": None, + 'version': None, # add diff at previous version - f"diffs.{existing_record.version}": changes, + f'diffs.{existing_record.version}': changes, }, }, ) @@ -188,11 +187,11 @@ def generate_rollback_ops(data_collection: Collection) -> Iterable[MongoBulkOp]: :param data_collection: the data collection to operate on :return: yields bulk Mongo ops """ - for doc in data_collection.find({"version": None}): + for doc in data_collection.find({'version': None}): record = MongoRecord(**doc) if not record.diffs: # the record is just uncommitted data and nothing else, just delete it - yield DeleteOne({"id": record.id}) + yield DeleteOne({'id': record.id}) else: # there is uncommitted data on this record, roll it back and then update op = revert_record(record) @@ -214,7 +213,7 @@ def revert_record(record: MongoRecord) -> Optional[UpdateOne]: returned as you shouldn't be reverting committed data, that breaks Splitgill! :return: an UpdateOne object if there was a previous version to revert to and - therefore the revert was completed, None if not + therefore the revert was completed, None if not """ if not record.diffs or record.version is not None: return None @@ -222,15 +221,15 @@ def revert_record(record: MongoRecord) -> Optional[UpdateOne]: record.version, record.data = next(islice(record.iter(), 1, None), None) del record.diffs[str(record.version)] return UpdateOne( - {"id": record.id}, + {'id': record.id}, { - "$set": { + '$set': { # update the data and the version - "data": record.data, - "version": record.version, + 'data': record.data, + 'version': record.version, }, # delete the entry from the diffs, or delete the diffs completely if the # version we just reverted back to was the only previous version - "$unset": {"diffs" if not record.diffs else f"diffs.{record.version}": ""}, + '$unset': {'diffs' if not record.diffs else f'diffs.{record.version}': ''}, }, ) diff --git a/splitgill/locking.py b/splitgill/locking.py index b32a2e8..57a4c1b 100644 --- a/splitgill/locking.py +++ b/splitgill/locking.py @@ -28,7 +28,7 @@ def __init__(self, lock_collection: Collection): """ self.lock_collection = lock_collection # does nothing if this already exists - self.lock_collection.create_index("lock_id", unique=True) + self.lock_collection.create_index('lock_id', unique=True) def acquire(self, lock_id: str, raise_on_fail: bool = False, **kwargs) -> bool: """ @@ -41,17 +41,17 @@ def acquire(self, lock_id: str, raise_on_fail: bool = False, **kwargs) -> bool: :param lock_id: the ID of the lock to acquire :param raise_on_fail: if True, raises an AlreadyLocked exception if the lock - can't be acquired. Default: False. + can't be acquired. Default: False. 
:return: True if the lock was acquired, False if not """ try: doc = { - "lock_id": lock_id, - "locked_at": datetime.now(timezone.utc), - "locked_by": platform.node(), + 'lock_id': lock_id, + 'locked_at': datetime.now(timezone.utc), + 'locked_by': platform.node(), } if kwargs: - doc["data"] = kwargs + doc['data'] = kwargs self.lock_collection.insert_one(doc) except DuplicateKeyError: if raise_on_fail: @@ -66,7 +66,7 @@ def release(self, lock_id: str): :param lock_id: the ID of the lock to release """ - self.lock_collection.delete_one({"lock_id": lock_id}) + self.lock_collection.delete_one({'lock_id': lock_id}) def is_locked(self, lock_id: str) -> bool: """ @@ -85,7 +85,7 @@ def get_metadata(self, lock_id: str) -> Optional[dict]: :param lock_id: :return: """ - return self.lock_collection.find_one({"lock_id": lock_id}) + return self.lock_collection.find_one({'lock_id': lock_id}) @contextmanager def lock(self, lock_id: str, **kwargs): diff --git a/splitgill/manager.py b/splitgill/manager.py index 633607b..7e0a418 100644 --- a/splitgill/manager.py +++ b/splitgill/manager.py @@ -25,8 +25,8 @@ from splitgill.search import version_query from splitgill.utils import iter_terms, now, partition -OPTIONS_COLLECTION_NAME = "options" -LOCKS_COLLECTION_NAME = "locks" +OPTIONS_COLLECTION_NAME = 'options' +LOCKS_COLLECTION_NAME = 'locks' class SplitgillClient: @@ -39,14 +39,14 @@ def __init__( self, mongo: MongoClient, elasticsearch: Elasticsearch, - mongo_database_name: str = "sg", + mongo_database_name: str = 'sg', ): self.mongo = mongo self.elasticsearch = elasticsearch self.mongo_database_name = mongo_database_name self.lock_manager = LockManager(self.get_lock_collection()) - def get_database(self, name: str) -> "SplitgillDatabase": + def get_database(self, name: str) -> 'SplitgillDatabase': """ Returns a SplitgillDatabase object. 
@@ -78,7 +78,7 @@ def get_data_collection(self, name: str) -> Collection: :param name: the name of the Splitgill database :return: a pymongo Collection object """ - return self.get_mongo_database().get_collection(f"data-{name}") + return self.get_mongo_database().get_collection(f'data-{name}') def get_lock_collection(self) -> Collection: """ @@ -96,9 +96,9 @@ class SearchVersion(Enum): """ # searches the latest data - latest = "latest" + latest = 'latest' # searches all data - all = "all" + all = 'all' class SplitgillDatabase: @@ -150,12 +150,12 @@ def get_arc_status(self) -> ArcStatus: ) if result: latest_arc_index = max( - int(arc_index_name.split("-")[-1]) for arc_index_name in result.keys() + int(arc_index_name.split('-')[-1]) for arc_index_name in result.keys() ) count_result = self._client.elasticsearch.count( index=self.indices.get_arc(latest_arc_index) ) - return ArcStatus(latest_arc_index, count_result["count"]) + return ArcStatus(latest_arc_index, count_result['count']) else: return ArcStatus(0, 0) @@ -167,13 +167,13 @@ def get_committed_version(self) -> Optional[int]: :return: the max version or None """ - sort = [("version", DESCENDING)] + sort = [('version', DESCENDING)] last_data = self.data_collection.find_one(sort=sort) - last_options = self.options_collection.find_one({"name": self.name}, sort=sort) + last_options = self.options_collection.find_one({'name': self.name}, sort=sort) - last_data_version = last_data.get("version") if last_data is not None else None + last_data_version = last_data.get('version') if last_data is not None else None last_options_version = ( - last_options.get("version") if last_options is not None else None + last_options.get('version') if last_options is not None else None ) # there's no committed data or options if last_data_version is None and last_options_version is None: @@ -197,13 +197,13 @@ def get_elasticsearch_version(self) -> Optional[int]: version = None for field in (DocumentField.VERSION, DocumentField.NEXT): result = self._client.elasticsearch.search( - aggs={"max_version": {"max": {"field": field}}}, + aggs={'max_version': {'max': {'field': field}}}, size=0, # search all indices so that we catch deletes which won't have a # document in latest index=self.indices.wildcard, ) - value = get_in(("aggregations", "max_version", "value"), result, None) + value = get_in(('aggregations', 'max_version', 'value'), result, None) if value is not None and (version is None or value > version): version = value @@ -236,7 +236,7 @@ def has_data(self) -> bool: :return: True if there is data, False if not """ - return self.data_collection.find_one({"version": {"$ne": None}}) is not None + return self.data_collection.find_one({'version': {'$ne': None}}) is not None def has_options(self) -> bool: """ @@ -245,7 +245,7 @@ def has_options(self) -> bool: :return: True if there is options, False if not """ - return self.options_collection.find_one({"version": {"$ne": None}}) is not None + return self.options_collection.find_one({'version': {'$ne': None}}) is not None def commit(self) -> Optional[int]: """ @@ -260,7 +260,7 @@ def commit(self) -> Optional[int]: """ # todo: global now? # todo: transaction/rollback? Can't do this without replicasets so who knows? 
- with self.locker.lock(self.name, stage="commit"): + with self.locker.lock(self.name, stage='commit'): if not self.has_uncommitted_data() and not self.has_uncommitted_options(): # nothing to commit, so nothing to do return None @@ -276,7 +276,7 @@ def commit(self) -> Optional[int]: # update the uncommitted data and options in a transaction for collection in [self.data_collection, self.options_collection]: collection.update_many( - filter={"version": None}, update={"$set": {"version": version}} + filter={'version': None}, update={'$set': {'version': version}} ) return version @@ -314,7 +314,7 @@ def ingest( """ # this does nothing if the indexes already exist self.data_collection.create_indexes( - [IndexModel([("id", ASCENDING)]), IndexModel([("version", DESCENDING)])] + [IndexModel([('id', ASCENDING)]), IndexModel([('version', DESCENDING)])] ) result = IngestResult() @@ -361,10 +361,10 @@ def update_options(self, options: ParsingOptions, commit=True) -> Optional[int]: # either the options are completely new or they differ from the existing # options, write a fresh entry new_doc = { - "name": self.name, + 'name': self.name, # version = None to indicate this is an uncommitted change - "version": None, - "options": options.to_doc(), + 'version': None, + 'options': options.to_doc(), } self.options_collection.insert_one(new_doc) @@ -381,7 +381,7 @@ def rollback_options(self) -> int: :return: the number of documents deleted """ - return self.options_collection.delete_many({"version": None}).deleted_count + return self.options_collection.delete_many({'version': None}).deleted_count def rollback_records(self): """ @@ -401,7 +401,7 @@ def has_uncommitted_data(self) -> bool: :return: returns True if there are any uncommitted records, False if not """ - return self.data_collection.find_one({"version": None}) is not None + return self.data_collection.find_one({'version': None}) is not None def has_uncommitted_options(self) -> bool: """ @@ -409,7 +409,7 @@ def has_uncommitted_options(self) -> bool: :return: returns True if there are any uncommitted options, False if not """ - return self.options_collection.find_one({"version": None}) is not None + return self.options_collection.find_one({'version': None}) is not None def get_options(self, include_uncommitted=False) -> Dict[int, ParsingOptions]: """ @@ -420,9 +420,9 @@ def get_options(self, include_uncommitted=False) -> Dict[int, ParsingOptions]: :return: a dict of versions and options """ return { - doc["version"]: ParsingOptions.from_doc(doc["options"]) - for doc in self.options_collection.find({"name": self.name}) - if include_uncommitted or doc["version"] is not None + doc['version']: ParsingOptions.from_doc(doc['options']) + for doc in self.options_collection.find({'name': self.name}) + if include_uncommitted or doc['version'] is not None } def iter_records(self, **find_kwargs) -> Iterable[MongoRecord]: @@ -477,7 +477,7 @@ def sync( last_sync = self.get_elasticsearch_version() if not resync else None if last_sync is None: # elasticsearch has nothing so find all committed records - find_filter = {"version": {"$ne": None}} + find_filter = {'version': {'$ne': None}} else: committed_version = self.get_committed_version() if last_sync >= committed_version: @@ -487,10 +487,10 @@ def sync( if any(version > last_sync for version in all_options): # there's an options change ahead, this means we need to check all # records again, so filter out committed records only - find_filter = {"version": {"$ne": None}} + find_filter = {'version': {'$ne': None}} else: 
# find all the updated records that haven't had their updates synced yet - find_filter = {"version": {"$gt": last_sync}} + find_filter = {'version': {'$gt': last_sync}} if resync: # delete all arcs @@ -547,7 +547,7 @@ def resync_arcs(self, bulk_options: Optional[BulkOptions] = None) -> WriteResult # deleted them all self.get_arc_status(), # get all committed records - self.iter_records(filter={"version": {"$ne": None}}), + self.iter_records(filter={'version': {'$ne': None}}), # get all committed options self.get_options(include_uncommitted=False), None, @@ -657,7 +657,7 @@ def get_data_fields( # create the basic field objects and add type counts for term in iter_terms(search, DocumentField.DATA_TYPES): - path, raw_types = term.value.rsplit(".", 1) + path, raw_types = term.value.rsplit('.', 1) if path not in fields: fields[path] = DataField(path) fields[path].add(raw_types, term.count) @@ -666,10 +666,10 @@ def get_data_fields( for field in fields.values(): if not field.is_container: continue - target_dot_count = field.path.count(".") + 1 + target_dot_count = field.path.count('.') + 1 for child in fields.values(): - if child.path.count(".") == target_dot_count and child.path.startswith( - f"{field.path}." + if child.path.count('.') == target_dot_count and child.path.startswith( + f'{field.path}.' ): field.children.append(child) child.parent = field @@ -679,7 +679,7 @@ def get_data_fields( # reverse order of the order we want them applied. # descending depth (so fields closest to the root first) data_fields = sorted( - fields.values(), key=lambda f: f.path.count("."), reverse=True + fields.values(), key=lambda f: f.path.count('.'), reverse=True ) # ascending alphabetical order data_fields.sort(key=lambda f: f.path) @@ -708,7 +708,7 @@ def get_parsed_fields( # create the basic field objects and add type counts for term in iter_terms(search, DocumentField.PARSED_TYPES): - path, raw_types = term.value.rsplit(".", 1) + path, raw_types = term.value.rsplit('.', 1) if path not in fields: fields[path] = ParsedField(path) fields[path].add(raw_types, term.count) @@ -718,7 +718,7 @@ def get_parsed_fields( # in the reverse order of the order we want them applied. 
# descending depth (so fields closest to the root first) parsed_fields = sorted( - fields.values(), key=lambda f: f.path.count("."), reverse=True + fields.values(), key=lambda f: f.path.count('.'), reverse=True ) # ascending alphabetical order parsed_fields.sort(key=lambda f: f.path) diff --git a/splitgill/model.py b/splitgill/model.py index 4bbf623..4193cc7 100644 --- a/splitgill/model.py +++ b/splitgill/model.py @@ -1,12 +1,12 @@ -from dataclasses import dataclass, field, astuple +from dataclasses import astuple, dataclass, field from itertools import chain -from typing import Dict, Iterable, NamedTuple, List, Optional, FrozenSet, Any +from typing import Any, Dict, FrozenSet, Iterable, List, NamedTuple, Optional from uuid import uuid4 from bson import ObjectId from pymongo.results import BulkWriteResult -from splitgill.diffing import patch, DiffOp +from splitgill.diffing import DiffOp, patch @dataclass @@ -29,15 +29,15 @@ def is_delete(self) -> bool: return not self.data @staticmethod - def new(data: dict) -> "Record": + def new(data: dict) -> 'Record': return Record(str(uuid4()), data) @staticmethod - def delete(record_id: str) -> "Record": + def delete(record_id: str) -> 'Record': return Record(record_id, {}) -VersionedData = NamedTuple("VersionedData", version=Optional[int], data=dict) +VersionedData = NamedTuple('VersionedData', version=Optional[int], data=dict) @dataclass @@ -100,7 +100,7 @@ def iter(self) -> Iterable[VersionedData]: with the latest data and working back to the first version. :return: VersionedData (version: int, data: dict) named tuples in descending - version order + version order """ yield VersionedData(self.version, self.data) base = self.data @@ -171,23 +171,23 @@ class ParsingOptions: def to_doc(self) -> dict: return { - "true_values": list(self.true_values), - "false_values": list(self.false_values), - "date_formats": list(self.date_formats), - "geo_hints": [astuple(hint) for hint in self.geo_hints], - "keyword_length": self.keyword_length, - "float_format": self.float_format, + 'true_values': list(self.true_values), + 'false_values': list(self.false_values), + 'date_formats': list(self.date_formats), + 'geo_hints': [astuple(hint) for hint in self.geo_hints], + 'keyword_length': self.keyword_length, + 'float_format': self.float_format, } @classmethod - def from_doc(cls, doc: dict) -> "ParsingOptions": + def from_doc(cls, doc: dict) -> 'ParsingOptions': return ParsingOptions( - frozenset(doc["true_values"]), - frozenset(doc["false_values"]), - frozenset(doc["date_formats"]), - frozenset(GeoFieldHint(*params) for params in doc["geo_hints"]), - doc["keyword_length"], - doc["float_format"], + frozenset(doc['true_values']), + frozenset(doc['false_values']), + frozenset(doc['date_formats']), + frozenset(GeoFieldHint(*params) for params in doc['geo_hints']), + doc['keyword_length'], + doc['float_format'], ) diff --git a/splitgill/search.py b/splitgill/search.py index 60ad026..8c14d4a 100644 --- a/splitgill/search.py +++ b/splitgill/search.py @@ -1,15 +1,15 @@ import datetime from collections import defaultdict -from typing import Dict, Union, Optional +from typing import Dict, Optional, Union from elasticsearch_dsl import Q from elasticsearch_dsl.query import Bool, Query from splitgill.indexing.fields import ( + DATA_ID_FIELD, DocumentField, ParsedType, parsed_path, - DATA_ID_FIELD, ) from splitgill.utils import to_timestamp @@ -52,7 +52,7 @@ def version_query(version: int) -> Query: :param version: the requested version :return: an elasticsearch-dsl Query object 
""" - return Q("term", **{DocumentField.VERSIONS: version}) + return Q('term', **{DocumentField.VERSIONS: version}) def index_specific_version_filter(indexes_and_versions: Dict[str, int]) -> Query: @@ -86,13 +86,13 @@ def index_specific_version_filter(indexes_and_versions: Dict[str, int]) -> Query if len(indexes) == 1: # there's only one index requiring this version so use a term query filters.append( - Bool(filter=[Q("term", _index=indexes[0]), version_filter]) + Bool(filter=[Q('term', _index=indexes[0]), version_filter]) ) else: # there are a few indexes using this version, query them using terms as # a group filters.append( - Bool(filter=[Q("terms", _index=indexes), version_filter]) + Bool(filter=[Q('terms', _index=indexes), version_filter]) ) return Bool(should=filters, minimum_should_match=1) @@ -104,7 +104,7 @@ def has_geo() -> Query: :return: an exists Query object """ - return Q("exists", field=DocumentField.ALL_POINTS) + return Q('exists', field=DocumentField.ALL_POINTS) def exists_query(field: str) -> Query: @@ -114,7 +114,7 @@ def exists_query(field: str) -> Query: :param field: the field path :return: an exists query on the field using the full parsed path """ - return Q("exists", field=parsed_path(field, parsed_type=None, full=True)) + return Q('exists', field=parsed_path(field, parsed_type=None, full=True)) def infer_parsed_type( @@ -137,7 +137,7 @@ def infer_parsed_type( elif isinstance(value, (datetime.date, datetime.datetime)): return ParsedType.DATE else: - raise ValueError(f"Unexpected type {type(value)}") + raise ValueError(f'Unexpected type {type(value)}') def term_query( @@ -162,7 +162,7 @@ def term_query( if parsed_type == ParsedType.DATE and isinstance(value, datetime.date): value = to_timestamp(value) - return Q("term", **{parsed_path(field, parsed_type=parsed_type, full=True): value}) + return Q('term', **{parsed_path(field, parsed_type=parsed_type, full=True): value}) def match_query(query: str, field: Optional[str] = None, **match_kwargs) -> Query: @@ -180,7 +180,7 @@ def match_query(query: str, field: Optional[str] = None, **match_kwargs) -> Quer path = ALL_TEXT else: path = text(field) - return Q("match", **{path: {"query": query, **match_kwargs}}) + return Q('match', **{path: {'query': query, **match_kwargs}}) def range_query( @@ -209,7 +209,7 @@ def range_query( """ range_inner = {} for_inference = None - for key, value in zip(["gte", "lt", "gt", "lte"], [gte, lt, gt, lte]): + for key, value in zip(['gte', 'lt', 'gt', 'lte'], [gte, lt, gt, lte]): if value is None: continue if for_inference is None: @@ -221,7 +221,7 @@ def range_query( range_inner[key] = value if not range_inner: - raise ValueError("You must provide at least one of the lt/lte/gt/gte values") + raise ValueError('You must provide at least one of the lt/lte/gt/gte values') if parsed_type is None: parsed_type = infer_parsed_type(for_inference) @@ -229,7 +229,7 @@ def range_query( range_inner.update(range_kwargs) return Q( - "range", **{parsed_path(field, parsed_type=parsed_type, full=True): range_inner} + 'range', **{parsed_path(field, parsed_type=parsed_type, full=True): range_inner} ) @@ -247,13 +247,13 @@ def rebuild_data(parsed_data: dict) -> dict: def rebuild_dict_or_list( - value: Union[dict, list] + value: Union[dict, list], ) -> Union[int, str, bool, float, dict, list, None]: """ Rebuild a dict or a list inside the parsed dict. 
:param value: a dict which can either be for structure or a value, or a list of - either value or structure dicts + either value or structure dicts :return: a dict, list, or value """ if isinstance(value, dict): @@ -267,7 +267,7 @@ def rebuild_dict_or_list( return { key: rebuild_dict_or_list(value) for key, value in value.items() - if not key.startswith("_") or key == DATA_ID_FIELD + if not key.startswith('_') or key == DATA_ID_FIELD } elif isinstance(value, list): # pass each element of the list through this function diff --git a/splitgill/utils.py b/splitgill/utils.py index 86c562b..ccfde05 100644 --- a/splitgill/utils.py +++ b/splitgill/utils.py @@ -1,12 +1,11 @@ from dataclasses import dataclass -from datetime import datetime, timezone, date +from datetime import date, datetime, timezone from itertools import islice from time import time -from typing import Iterable, Union, List, Any +from typing import Iterable, Union from cytoolz import get_in -from elasticsearch_dsl import Search, A -from elasticsearch_dsl.aggs import Agg +from elasticsearch_dsl import A, Search def to_timestamp(moment: Union[datetime, date]) -> int: @@ -21,7 +20,7 @@ def to_timestamp(moment: Union[datetime, date]) -> int: :param moment: a datetime or date object :return: the timestamp (number of milliseconds between the UNIX epoch and the - moment) as an int + moment) as an int """ if isinstance(moment, datetime): return int(moment.timestamp() * 1000) @@ -43,7 +42,7 @@ def parse_to_timestamp( :param datetime_format: the format as a string :param tz: the timezone to use (default: UTC) :return: the parsed datetime as the number of milliseconds since the UNIX epoch as - an int + an int """ dt = datetime.strptime(datetime_string, datetime_format) if dt.tzinfo is None: @@ -103,21 +102,21 @@ def iter_terms(search: Search, field: str, chunk_size: int = 50) -> Iterable[Ter # search to work with agg_search = search[:0] agg_search.aggs.bucket( - "values", - "composite", + 'values', + 'composite', size=chunk_size, - sources={"value": A("terms", field=field)}, + sources={'value': A('terms', field=field)}, ) if after is not None: - agg_search.aggs["values"].after = after + agg_search.aggs['values'].after = after result = agg_search.execute().aggs.to_dict() - buckets = get_in(("values", "buckets"), result, []) - after = get_in(("values", "after_key"), result, None) + buckets = get_in(('values', 'buckets'), result, []) + after = get_in(('values', 'after_key'), result, None) if not buckets: break else: yield from ( - Term(bucket["key"]["value"], bucket["doc_count"]) for bucket in buckets + Term(bucket['key']['value'], bucket['doc_count']) for bucket in buckets ) diff --git a/tests/conftest.py b/tests/conftest.py index b7e417c..fbe980d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,11 +1,11 @@ import json -from os import getenv from contextlib import suppress +from os import getenv from typing import List import pytest from elastic_transport import NodeConfig -from elasticsearch import Elasticsearch, AsyncElasticsearch +from elasticsearch import AsyncElasticsearch, Elasticsearch from pymongo import MongoClient from pymongo.collection import Collection from pymongo.database import Database @@ -18,7 +18,7 @@ @pytest.fixture def mongo_client() -> MongoClient: - with MongoClient(getenv("SPLITGILL_MONGO_HOST", "mongo"), 27017) as client: + with MongoClient(getenv('SPLITGILL_MONGO_HOST', 'mongo'), 27017) as client: yield client database_names = client.list_database_names() for name in database_names: @@ -31,26 +31,26 @@ def 
mongo_client() -> MongoClient: @pytest.fixture def mongo_database(mongo_client: MongoClient) -> Database: - yield mongo_client["test"] + yield mongo_client['test'] @pytest.fixture def mongo_collection(mongo_database: Database) -> Collection: - yield mongo_database["test"] + yield mongo_database['test'] @pytest.fixture def es_node_configs() -> List[NodeConfig]: - node_configs = [NodeConfig("http", getenv("SPLITGILL_ES_HOST", "es"), 9200)] + node_configs = [NodeConfig('http', getenv('SPLITGILL_ES_HOST', 'es'), 9200)] yield node_configs with Elasticsearch(node_configs) as es: - es.indices.delete(index="*") - index_templates = es.indices.get_index_template(name="*") - for index_template in index_templates["index_templates"]: + es.indices.delete(index='*') + index_templates = es.indices.get_index_template(name='*') + for index_template in index_templates['index_templates']: with suppress(Exception): - es.indices.delete_index_template(name=index_template["name"]) + es.indices.delete_index_template(name=index_template['name']) @pytest.fixture @@ -76,27 +76,27 @@ def splitgill( @pytest.fixture def database(splitgill: SplitgillClient) -> SplitgillDatabase: - return splitgill.get_database("test-db") + return splitgill.get_database('test-db') @pytest.fixture def geojson_point() -> dict: - return {"type": "Point", "coordinates": [30, 10]} + return {'type': 'Point', 'coordinates': [30, 10]} @pytest.fixture def geojson_linestring() -> dict: return { - "type": "LineString", - "coordinates": [[10, 10], [20, 10], [20, 20]], + 'type': 'LineString', + 'coordinates': [[10, 10], [20, 10], [20, 20]], } @pytest.fixture def geojson_polygon() -> dict: return { - "type": "Polygon", - "coordinates": [[[10, 10], [20, 10], [20, 20], [10, 20], [10, 10]]], + 'type': 'Polygon', + 'coordinates': [[[10, 10], [20, 10], [20, 20], [10, 20], [10, 10]]], } @@ -104,8 +104,8 @@ def geojson_polygon() -> dict: def geojson_holed_polygon() -> dict: # this is a lovely square with an hourglass like shape hole return { - "type": "Polygon", - "coordinates": [ + 'type': 'Polygon', + 'coordinates': [ [[10, 10], [20, 10], [20, 20], [10, 20], [10, 10]], [[12, 12], [14, 15], [12, 18], [18, 18], [16, 15], [18, 12], [12, 12]], ], @@ -137,26 +137,26 @@ def basic_options() -> ParsingOptions: return ( ParsingOptionsBuilder() .with_keyword_length(8191) - .with_float_format("{0:.15g}") - .with_true_value("true") - .with_true_value("yes") - .with_true_value("y") - .with_false_value("false") - .with_false_value("no") - .with_false_value("n") - .with_date_format("%Y-%m-%d") - .with_date_format("%Y-%m-%dT%H:%M:%S") - .with_date_format("%Y-%m-%dT%H:%M:%S.%f") - .with_date_format("%Y-%m-%d %H:%M:%S") - .with_date_format("%Y-%m-%d %H:%M:%S.%f") - .with_date_format("%Y-%m-%dT%H:%M:%S%z") - .with_date_format("%Y-%m-%dT%H:%M:%S.%f%z") - .with_date_format("%Y-%m-%d %H:%M:%S%z") - .with_date_format("%Y-%m-%d %H:%M:%S.%f%z") - .with_geo_hint("lat", "lon") - .with_geo_hint("latitude", "longitude", "radius") + .with_float_format('{0:.15g}') + .with_true_value('true') + .with_true_value('yes') + .with_true_value('y') + .with_false_value('false') + .with_false_value('no') + .with_false_value('n') + .with_date_format('%Y-%m-%d') + .with_date_format('%Y-%m-%dT%H:%M:%S') + .with_date_format('%Y-%m-%dT%H:%M:%S.%f') + .with_date_format('%Y-%m-%d %H:%M:%S') + .with_date_format('%Y-%m-%d %H:%M:%S.%f') + .with_date_format('%Y-%m-%dT%H:%M:%S%z') + .with_date_format('%Y-%m-%dT%H:%M:%S.%f%z') + .with_date_format('%Y-%m-%d %H:%M:%S%z') + .with_date_format('%Y-%m-%d 
%H:%M:%S.%f%z') + .with_geo_hint('lat', 'lon') + .with_geo_hint('latitude', 'longitude', 'radius') .with_geo_hint( - "decimalLatitude", "decimalLongitude", "coordinateUncertaintyInMeters" + 'decimalLatitude', 'decimalLongitude', 'coordinateUncertaintyInMeters' ) .build() ) diff --git a/tests/indexing/test_fields.py b/tests/indexing/test_fields.py index fe6e783..a4af17b 100644 --- a/tests/indexing/test_fields.py +++ b/tests/indexing/test_fields.py @@ -1,24 +1,24 @@ from datetime import datetime -from typing import List, Any +from typing import Any, List import pytest from splitgill.indexing.fields import ( - ParsedType, - parsed_path, - DocumentField, - DataType, DataField, + DataType, + DocumentField, ParsedField, + ParsedType, + parsed_path, ) -@pytest.mark.parametrize("parsed_type", ParsedType) +@pytest.mark.parametrize('parsed_type', ParsedType) def test_parsed_path(parsed_type: ParsedType): - field = "a.field.in.the.record" + field = 'a.field.in.the.record' - full = f"{DocumentField.DATA}.{field}.{parsed_type}" - relative = f"{field}.{parsed_type}" + full = f'{DocumentField.DATA}.{field}.{parsed_type}' + relative = f'{field}.{parsed_type}' assert parsed_path(field, parsed_type=parsed_type, full=True) == full assert parsed_type.path_to(field, full=True) == full @@ -28,13 +28,13 @@ def test_parsed_path(parsed_type: ParsedType): def test_parse_path_no_parsed_type(): - assert parsed_path("a.field.in.the.record", None, False) == "a.field.in.the.record" + assert parsed_path('a.field.in.the.record', None, False) == 'a.field.in.the.record' class TestDataTypeTypeFor: def test_str(self): - assert DataType.type_for("beans") == DataType.STR - assert DataType.type_for("") == DataType.STR + assert DataType.type_for('beans') == DataType.STR + assert DataType.type_for('') == DataType.STR def test_int(self): assert DataType.type_for(4) == DataType.INT @@ -49,7 +49,7 @@ def test_bool(self): def test_dict(self): assert DataType.type_for({}) == DataType.DICT - assert DataType.type_for({"a": 5}) == DataType.DICT + assert DataType.type_for({'a': 5}) == DataType.DICT def test_list(self): assert DataType.type_for([]) == DataType.LIST @@ -65,7 +65,7 @@ def test_invalid(self): tuple(), # the not sensible tests object(), - type("TestClass", (), {}), + type('TestClass', (), {}), ..., ] for value in invalid: @@ -75,27 +75,27 @@ def test_invalid(self): class TestDataField: def test_name(self): - assert DataField("field").name == "field" - assert DataField("a.field").name == "field" - assert DataField("b.c.e.d.t.h..a.field").name == "field" + assert DataField('field').name == 'field' + assert DataField('a.field').name == 'field' + assert DataField('b.c.e.d.t.h..a.field').name == 'field' def test_parsed_path(self): - assert DataField("field").parsed_path == "field" - assert DataField("a.field").parsed_path == "a.field" - assert DataField("b.c.e.d.t.h..a.field").parsed_path == "b.c.e.d.t.h.a.field" - assert DataField("a.....field").parsed_path == "a.field" + assert DataField('field').parsed_path == 'field' + assert DataField('a.field').parsed_path == 'a.field' + assert DataField('b.c.e.d.t.h..a.field').parsed_path == 'b.c.e.d.t.h.a.field' + assert DataField('a.....field').parsed_path == 'a.field' def test_is_list_element(self): - assert not DataField("field").is_list_element - assert not DataField("a.field").is_list_element - assert not DataField("a.b").is_list_element - assert DataField("a.").is_list_element - assert DataField("a..").is_list_element - assert DataField("a..b.").is_list_element + assert not 
DataField('field').is_list_element + assert not DataField('a.field').is_list_element + assert not DataField('a.b').is_list_element + assert DataField('a.').is_list_element + assert DataField('a..').is_list_element + assert DataField('a..b.').is_list_element def test_add(self): - df = DataField("field") - df.add(",".join([DataType.STR, DataType.INT]), 3) + df = DataField('field') + df.add(','.join([DataType.STR, DataType.INT]), 3) df.add(DataType.FLOAT, 4) assert df.is_type(DataType.STR, DataType.INT, DataType.FLOAT) assert df.is_float @@ -106,13 +106,13 @@ def test_add(self): class TestParsedField: def test_name(self): - assert ParsedField("field").name == "field" - assert ParsedField("a.field").name == "field" - assert ParsedField("b.c.e.d.t.h.a.field").name == "field" + assert ParsedField('field').name == 'field' + assert ParsedField('a.field').name == 'field' + assert ParsedField('b.c.e.d.t.h.a.field').name == 'field' def test_add(self): - pf = ParsedField("field") - pf.add(",".join([ParsedType.NUMBER, ParsedType.DATE]), 3) + pf = ParsedField('field') + pf.add(','.join([ParsedType.NUMBER, ParsedType.DATE]), 3) pf.add(ParsedType.TEXT, 4) pf.add(ParsedType.KEYWORD, 24) assert pf.is_type( diff --git a/tests/indexing/test_geo.py b/tests/indexing/test_geo.py index 47f2894..39f4744 100644 --- a/tests/indexing/test_geo.py +++ b/tests/indexing/test_geo.py @@ -2,38 +2,38 @@ import math import pytest -from shapely import Point, MultiPoint, LineString, Polygon, from_geojson +from shapely import LineString, MultiPoint, Point, Polygon, from_geojson from shapely.geometry.base import BaseGeometry from splitgill.indexing.fields import ParsedType from splitgill.indexing.geo import ( create_polygon_circle, - is_winding_valid, - match_hints, is_shape_valid, + is_winding_valid, match_geojson, + match_hints, match_wkt, ) from splitgill.model import GeoFieldHint -hint = GeoFieldHint("lat", "lon", "rad") +hint = GeoFieldHint('lat', 'lon', 'rad') class TestMatchHints: def test_invalid_latitude(self): - assert not match_hints({"lat": "1000", "lon": "23"}, [hint]) + assert not match_hints({'lat': '1000', 'lon': '23'}, [hint]) def test_missing_latitude(self): - assert not match_hints({"lat": None, "lon": "23"}, [hint]) + assert not match_hints({'lat': None, 'lon': '23'}, [hint]) def test_invalid_longitude(self): - assert not match_hints({"lat": "23", "lon": "1000"}, [hint]) + assert not match_hints({'lat': '23', 'lon': '1000'}, [hint]) def test_missing_longitude(self): - assert not match_hints({"lat": "23", "lon": None}, [hint]) + assert not match_hints({'lat': '23', 'lon': None}, [hint]) def test_invalid_radius(self): - matched = match_hints({"lat": "23", "lon": "24", "rad": "-1"}, [hint]) + matched = match_hints({'lat': '23', 'lon': '24', 'rad': '-1'}, [hint]) geo_data = matched[hint.lat_field] # check something was returned assert geo_data @@ -41,7 +41,7 @@ def test_invalid_radius(self): assert geo_data[ParsedType.GEO_POINT] == geo_data[ParsedType.GEO_SHAPE] def test_0_radius(self): - matched = match_hints({"lat": "23", "lon": "24", "rad": "0"}, [hint]) + matched = match_hints({'lat': '23', 'lon': '24', 'rad': '0'}, [hint]) geo_data = matched[hint.lat_field] # check something was returned assert geo_data @@ -49,12 +49,12 @@ def test_0_radius(self): assert geo_data[ParsedType.GEO_POINT] == geo_data[ParsedType.GEO_SHAPE] def test_valid_without_radius(self): - matches = match_hints({"lat": "51.496111", "lon": "-0.176111"}, [hint]) + matches = match_hints({'lat': '51.496111', 'lon': '-0.176111'}, [hint]) assert 
len(matches) == 1 geo_data = matches[hint.lat_field] - assert geo_data[ParsedType.GEO_POINT] == "POINT (-0.176111 51.496111)" - assert geo_data[ParsedType.GEO_SHAPE] == "POINT (-0.176111 51.496111)" + assert geo_data[ParsedType.GEO_POINT] == 'POINT (-0.176111 51.496111)' + assert geo_data[ParsedType.GEO_SHAPE] == 'POINT (-0.176111 51.496111)' def test_valid_with_radius(self): lat = 51.496111 @@ -62,13 +62,13 @@ def test_valid_with_radius(self): rad = 10.5 segments = 16 circle = create_polygon_circle(lat, lon, rad, segments) - h = GeoFieldHint("lat", "lon", "rad", segments) + h = GeoFieldHint('lat', 'lon', 'rad', segments) - matches = match_hints({"lat": lat, "lon": lon, "rad": rad}, [h]) + matches = match_hints({'lat': lat, 'lon': lon, 'rad': rad}, [h]) assert len(matches) == 1 geo_data = matches[h.lat_field] - assert geo_data[ParsedType.GEO_POINT] == f"POINT ({lon} {lat})" + assert geo_data[ParsedType.GEO_POINT] == f'POINT ({lon} {lat})' assert geo_data[ParsedType.GEO_SHAPE] == circle.wkt def test_segments_is_passed_correctly(self): @@ -79,17 +79,17 @@ def test_segments_is_passed_correctly(self): segments_2 = 128 circle_1 = create_polygon_circle(lat, lon, rad, segments_1) circle_2 = create_polygon_circle(lat, lon, rad, segments_2) - h_1 = GeoFieldHint("lat", "lon", "rad", segments_1) - h_2 = GeoFieldHint("lat", "lon", "rad", segments_2) + h_1 = GeoFieldHint('lat', 'lon', 'rad', segments_1) + h_2 = GeoFieldHint('lat', 'lon', 'rad', segments_2) - matches_1 = match_hints({"lat": lat, "lon": lon, "rad": rad}, [h_1]) + matches_1 = match_hints({'lat': lat, 'lon': lon, 'rad': rad}, [h_1]) geo_data_1 = matches_1[h_1.lat_field] - assert geo_data_1[ParsedType.GEO_POINT] == f"POINT ({lon} {lat})" + assert geo_data_1[ParsedType.GEO_POINT] == f'POINT ({lon} {lat})' assert geo_data_1[ParsedType.GEO_SHAPE] == circle_1.wkt - matches_2 = match_hints({"lat": lat, "lon": lon, "rad": rad}, [h_2]) + matches_2 = match_hints({'lat': lat, 'lon': lon, 'rad': rad}, [h_2]) geo_data_2 = matches_2[h_2.lat_field] - assert geo_data_2[ParsedType.GEO_POINT] == f"POINT ({lon} {lat})" + assert geo_data_2[ParsedType.GEO_POINT] == f'POINT ({lon} {lat})' assert geo_data_2[ParsedType.GEO_SHAPE] == circle_2.wkt assert geo_data_1[ParsedType.GEO_POINT] == geo_data_2[ParsedType.GEO_POINT] @@ -99,16 +99,16 @@ def test_segments_is_passed_correctly(self): is_shape_valid_scenarios = [ (Point(0, 0), True), # invalid longitudes - (Point("NaN", 0), False), + (Point('NaN', 0), False), (Point(math.nan, 0), False), - (Point("inf", 0), False), + (Point('inf', 0), False), (Point(math.inf, 0), False), (Point(190, 0), False), (Point(-190, 0), False), # invalid latitudes - (Point(0, "NaN"), False), + (Point(0, 'NaN'), False), (Point(0, math.nan), False), - (Point(0, "inf"), False), + (Point(0, 'inf'), False), (Point(0, math.inf), False), (Point(0, 100), False), (Point(0, -100), False), @@ -141,7 +141,7 @@ def test_segments_is_passed_correctly(self): ] -@pytest.mark.parametrize(("shape", "is_valid"), is_shape_valid_scenarios) +@pytest.mark.parametrize(('shape', 'is_valid'), is_shape_valid_scenarios) def test_is_shape_valid(shape: BaseGeometry, is_valid: bool): assert is_shape_valid(shape) == is_valid @@ -184,86 +184,86 @@ def test_valid_point(self, geojson_point: dict, wkt_point: str): def test_invalid_point_with_elevation(self, geojson_point: dict, wkt_point: str): data = geojson_point.copy() - data["coordinates"] = (*data["coordinates"], 2000.6) + data['coordinates'] = (*data['coordinates'], 2000.6) assert match_geojson(data) is None def 
test_invalid_with_too_many_points(self, geojson_point: dict): data = geojson_point.copy() - data["coordinates"] = (*data["coordinates"], "2000.6", "2004.2") + data['coordinates'] = (*data['coordinates'], '2000.6', '2004.2') assert match_geojson(data) is None def test_invalid_point_too_few_points(self, geojson_point: dict): data = geojson_point.copy() - data["coordinates"] = (data["coordinates"][0],) + data['coordinates'] = (data['coordinates'][0],) assert match_geojson(data) is None def test_invalid_point_bad_lat(self): - data = {"type": "Point", "coordinates": ("30.0", "100.0")} + data = {'type': 'Point', 'coordinates': ('30.0', '100.0')} assert match_geojson(data) is None def test_invalid_point_bad_lon(self): - data = {"type": "Point", "coordinates": ("-190.0", "100.0")} + data = {'type': 'Point', 'coordinates': ('-190.0', '100.0')} assert match_geojson(data) is None def test_invalid_point_bad_lat_cause_its_a_random_string(self): - data = {"type": "Point", "coordinates": ("80", "garbage!")} + data = {'type': 'Point', 'coordinates': ('80', 'garbage!')} assert match_geojson(data) is None def test_invalid_point_bad_lon_cause_its_none(self): - data = {"type": "Point", "coordinates": (None, "100.0")} + data = {'type': 'Point', 'coordinates': (None, '100.0')} assert match_geojson(data) is None def test_invalid_point_bad_lon_cause_its_empty_string(self): - data = {"type": "Point", "coordinates": ("", "100.0")} + data = {'type': 'Point', 'coordinates': ('', '100.0')} assert match_geojson(data) is None def test_valid_linestring(self, geojson_linestring: dict, wkt_linestring: str): parsed = match_geojson(geojson_linestring) - assert parsed[ParsedType.GEO_POINT] == "POINT (17.5 12.5)" + assert parsed[ParsedType.GEO_POINT] == 'POINT (17.5 12.5)' assert parsed[ParsedType.GEO_SHAPE] == wkt_linestring def test_invalid_linestring_too_few_points(self, geojson_linestring: dict): data = geojson_linestring.copy() - data["coordinates"] = [data["coordinates"][0]] + data['coordinates'] = [data['coordinates'][0]] assert match_geojson(data) is None def test_invalid_linestring_bad_lat(self): data = { - "type": "LineString", - "coordinates": (("30.0", "100.0"), ("30.0", "10.0")), + 'type': 'LineString', + 'coordinates': (('30.0', '100.0'), ('30.0', '10.0')), } assert match_geojson(data) is None def test_invalid_linestring_bad_lon(self): data = { - "type": "LineString", - "coordinates": (("-190.0", "100.0"), ("30.0", "10.0")), + 'type': 'LineString', + 'coordinates': (('-190.0', '100.0'), ('30.0', '10.0')), } assert match_geojson(data) is None def test_valid_polygon(self, geojson_polygon: dict, wkt_polygon: str): parsed = match_geojson(geojson_polygon) - assert parsed[ParsedType.GEO_POINT] == "POINT (15 15)" + assert parsed[ParsedType.GEO_POINT] == 'POINT (15 15)' assert parsed[ParsedType.GEO_SHAPE] == wkt_polygon def test_valid_linear_and_hole_winding_polygon( self, geojson_holed_polygon: dict, wkt_holed_polygon: str ): parsed = match_geojson(geojson_holed_polygon) - assert parsed[ParsedType.GEO_POINT] == "POINT (15 15)" + assert parsed[ParsedType.GEO_POINT] == 'POINT (15 15)' assert parsed[ParsedType.GEO_SHAPE] == wkt_holed_polygon def test_invalid_not_closed_polygon_in_linear_ring(self, geojson_polygon: dict): polygon = geojson_polygon.copy() # remove the last coordinate - del polygon["coordinates"][0][-1] + del polygon['coordinates'][0][-1] assert match_geojson(polygon) is None def test_invalid_not_closed_polygon_in_hole(self, geojson_holed_polygon: dict): # in the linear ring polygon = 
geojson_holed_polygon.copy() # remove the last coordinate - del polygon["coordinates"][1][-1] + del polygon['coordinates'][1][-1] assert match_geojson(polygon) is None def test_invalid_linear_but_valid_hole_winding_polygon( @@ -271,9 +271,9 @@ def test_invalid_linear_but_valid_hole_winding_polygon( ): polygon = geojson_holed_polygon.copy() # reverse the linear ring winding direction - polygon["coordinates"] = ( - polygon["coordinates"][0][::-1], - polygon["coordinates"][1], + polygon['coordinates'] = ( + polygon['coordinates'][0][::-1], + polygon['coordinates'][1], ) assert match_geojson(polygon) is None @@ -282,21 +282,21 @@ def test_valid_linear_but_invalid_hole_winding_polygon( ): polygon = geojson_holed_polygon.copy() # reverse the hole winding direction - polygon["coordinates"] = ( - polygon["coordinates"][0], - polygon["coordinates"][1][::-1], + polygon['coordinates'] = ( + polygon['coordinates'][0], + polygon['coordinates'][1][::-1], ) assert match_geojson(polygon) is None class TestMatchWKT: def test_ignore_silly(self): - assert match_wkt("beans on toast") is None + assert match_wkt('beans on toast') is None def test_empty(self): - assert match_wkt("point empty") is None - assert match_wkt("linestring empty") is None - assert match_wkt("polygon empty") is None + assert match_wkt('point empty') is None + assert match_wkt('linestring empty') is None + assert match_wkt('polygon empty') is None def test_valid_point(self, wkt_point: str): parsed = match_wkt(wkt_point) @@ -304,62 +304,62 @@ def test_valid_point(self, wkt_point: str): assert parsed[ParsedType.GEO_SHAPE] == wkt_point def test_valid_point_with_elevation(self, wkt_point: str): - test_point = "point (30 10 50)" + test_point = 'point (30 10 50)' parsed = match_wkt(test_point) assert parsed[ParsedType.GEO_POINT] == wkt_point assert parsed[ParsedType.GEO_SHAPE] == wkt_point def test_invalid_with_too_many_points(self, geojson_point: dict): - assert match_wkt("point (20 30 40 50 60 70)") is None + assert match_wkt('point (20 30 40 50 60 70)') is None def test_invalid_point_too_few_points(self, geojson_point: dict): - assert match_wkt("point (20)") is None + assert match_wkt('point (20)') is None def test_invalid_point_bad_lat(self): - data = "point (30.0 100.0)" + data = 'point (30.0 100.0)' assert match_wkt(data) is None def test_invalid_point_bad_lon(self): - data = "point (-190.0 100.0)" + data = 'point (-190.0 100.0)' assert match_wkt(data) is None def test_invalid_point_bad_lat_cause_its_a_random_string(self): - data = "point (30.0 garbage)" + data = 'point (30.0 garbage)' assert match_wkt(data) is None def test_valid_linestring(self, wkt_linestring: str): parsed = match_wkt(wkt_linestring) - assert parsed[ParsedType.GEO_POINT] == "POINT (17.5 12.5)" + assert parsed[ParsedType.GEO_POINT] == 'POINT (17.5 12.5)' assert parsed[ParsedType.GEO_SHAPE] == wkt_linestring def test_invalid_linestring_too_few_points(self): - data = "linestring (10 10)" + data = 'linestring (10 10)' assert match_wkt(data) is None def test_invalid_linestring_bad_lat(self): - data = "linestring (30 100, 30 10)" + data = 'linestring (30 100, 30 10)' assert match_wkt(data) is None def test_invalid_linestring_bad_lon(self): - data = "linestring (-190 100, 30 10)" + data = 'linestring (-190 100, 30 10)' assert match_wkt(data) is None def test_valid_polygon(self, wkt_polygon: str): parsed = match_wkt(wkt_polygon) - assert parsed[ParsedType.GEO_POINT] == "POINT (15 15)" + assert parsed[ParsedType.GEO_POINT] == 'POINT (15 15)' assert parsed[ParsedType.GEO_SHAPE] 
== wkt_polygon def test_valid_linear_and_hole_winding_polygon(self, wkt_holed_polygon: str): parsed = match_wkt(wkt_holed_polygon) - assert parsed[ParsedType.GEO_POINT] == "POINT (15 15)" + assert parsed[ParsedType.GEO_POINT] == 'POINT (15 15)' assert parsed[ParsedType.GEO_SHAPE] == wkt_holed_polygon def test_invalid_not_closed_polygon_in_linear_ring(self): - polygon = "POLYGON ((30 10, 40 40, 20 40, 10 20))" + polygon = 'POLYGON ((30 10, 40 40, 20 40, 10 20))' assert match_wkt(polygon) is None def test_invalid_not_closed_polygon_in_hole(self): - polygon = "POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20))" + polygon = 'POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20))' assert match_wkt(polygon) is None @@ -368,6 +368,6 @@ def test_is_winding_valid(geojson_holed_polygon: dict): assert is_winding_valid(shape) bad = geojson_holed_polygon.copy() - bad["coordinates"][1] = list(reversed(bad["coordinates"][1])) + bad['coordinates'][1] = list(reversed(bad['coordinates'][1])) bad_shape = from_geojson(json.dumps(bad)) assert not is_winding_valid(bad_shape) diff --git a/tests/indexing/test_index.py b/tests/indexing/test_index.py index 04d0835..229ad52 100644 --- a/tests/indexing/test_index.py +++ b/tests/indexing/test_index.py @@ -25,22 +25,22 @@ def test_index_names(): - indices = IndexNames("test") - assert indices.name == "test" - assert indices.base == "data-test" - assert indices.latest == "data-test-latest" - assert indices.arc_base == "data-test-arc" - assert indices.wildcard == "data-test-*" - assert indices.arc_wildcard == "data-test-arc-*" - assert indices.get_arc(0) == "data-test-arc-0" - assert indices.get_arc(4289) == "data-test-arc-4289" + indices = IndexNames('test') + assert indices.name == 'test' + assert indices.base == 'data-test' + assert indices.latest == 'data-test-latest' + assert indices.arc_base == 'data-test-arc' + assert indices.wildcard == 'data-test-*' + assert indices.arc_wildcard == 'data-test-arc-*' + assert indices.get_arc(0) == 'data-test-arc-0' + assert indices.get_arc(4289) == 'data-test-arc-4289' def setup_scenario( splitgill: SplitgillClient, records: Dict[str, Dict[int, dict]], options: Dict[int, ParsingOptions], - database_name="test", + database_name='test', ) -> SplitgillDatabase: database = SplitgillDatabase(database_name, splitgill) @@ -83,7 +83,7 @@ def check_op( assert isinstance(op, IndexOp) if next_version is not None: assert op.document[DocumentField.NEXT] == next_version - assert op.document[DocumentField.VERSIONS]["lt"] == next_version + assert op.document[DocumentField.VERSIONS]['lt'] == next_version assert op.index == index_names.get_arc(0) else: assert DocumentField.NEXT not in op.document @@ -92,7 +92,7 @@ def check_op( assert op.document[DocumentField.ID] == record_id assert op.document[DocumentField.VERSION] == version - assert op.document[DocumentField.VERSIONS]["gte"] == version + assert op.document[DocumentField.VERSIONS]['gte'] == version # copy the data and add the record ID for checks data = data.copy() @@ -110,7 +110,7 @@ def test_after_beyond_data_version( # this shouldn't happen, but might as well check it! database = setup_scenario( splitgill, - records={"r1": {10: {"x": 5}}}, + records={'r1': {10: {'x': 5}}}, options={8: basic_options}, ) @@ -131,7 +131,7 @@ def test_after_beyond_options_version(self, splitgill: SplitgillClient): # this shouldn't happen, but might as well check it! 
database = setup_scenario( splitgill, - records={"r1": {10: {"x": 5}}}, + records={'r1': {10: {'x': 5}}}, options={ 8: builder.with_keyword_length(256).build(), 12: builder.with_keyword_length(4).build(), @@ -152,18 +152,18 @@ def test_after_beyond_options_version(self, splitgill: SplitgillClient): def test_mix(self, splitgill: SplitgillClient): builder = ParsingOptionsBuilder() data = { - 2: {"x": 5.4}, - 4: {"x": 3.8}, - 8: {"x": 1.4}, - 9: {"x": 9.6}, + 2: {'x': 5.4}, + 4: {'x': 3.8}, + 8: {'x': 1.4}, + 9: {'x': 9.6}, } options = { - 1: builder.with_float_format("{0:.4f}").build(), - 5: builder.with_float_format("{0:.2f}").build(), - 7: builder.with_float_format("{0:.6f}").build(), - 10: builder.with_float_format("{0:.10f}").build(), + 1: builder.with_float_format('{0:.4f}').build(), + 5: builder.with_float_format('{0:.2f}').build(), + 7: builder.with_float_format('{0:.6f}').build(), + 10: builder.with_float_format('{0:.10f}').build(), } - database = setup_scenario(splitgill, {"r1": data}, options) + database = setup_scenario(splitgill, {'r1': data}, options) ops = list( generate_index_ops( @@ -175,31 +175,31 @@ def test_mix(self, splitgill: SplitgillClient): ) ) assert len(ops) == 7 - check_op(database.indices, ops[6], "r1", 10, data[9], options[10]) + check_op(database.indices, ops[6], 'r1', 10, data[9], options[10]) check_op( - database.indices, ops[5], "r1", 9, data[9], options[7], next_version=10 + database.indices, ops[5], 'r1', 9, data[9], options[7], next_version=10 ) - check_op(database.indices, ops[4], "r1", 8, data[8], options[7], next_version=9) - check_op(database.indices, ops[3], "r1", 7, data[4], options[7], next_version=8) - check_op(database.indices, ops[2], "r1", 5, data[4], options[5], next_version=7) - check_op(database.indices, ops[1], "r1", 4, data[4], options[1], next_version=5) - check_op(database.indices, ops[0], "r1", 2, data[2], options[1], next_version=4) + check_op(database.indices, ops[4], 'r1', 8, data[8], options[7], next_version=9) + check_op(database.indices, ops[3], 'r1', 7, data[4], options[7], next_version=8) + check_op(database.indices, ops[2], 'r1', 5, data[4], options[5], next_version=7) + check_op(database.indices, ops[1], 'r1', 4, data[4], options[1], next_version=5) + check_op(database.indices, ops[0], 'r1', 2, data[2], options[1], next_version=4) def test_delete(self, splitgill: SplitgillClient): builder = ParsingOptionsBuilder() data = { - 2: {"x": 5.4}, + 2: {'x': 5.4}, 4: {}, - 8: {"x": 1.4}, + 8: {'x': 1.4}, 9: {}, } options = { - 1: builder.with_float_format("{0:.4f}").build(), - 5: builder.with_float_format("{0:.2f}").build(), - 7: builder.with_float_format("{0:.6f}").build(), - 10: builder.with_float_format("{0:.10f}").build(), + 1: builder.with_float_format('{0:.4f}').build(), + 5: builder.with_float_format('{0:.2f}').build(), + 7: builder.with_float_format('{0:.6f}').build(), + 10: builder.with_float_format('{0:.10f}').build(), } - database = setup_scenario(splitgill, {"r1": data}, options) + database = setup_scenario(splitgill, {'r1': data}, options) ops = list( generate_index_ops( @@ -216,16 +216,16 @@ def test_delete(self, splitgill: SplitgillClient): # we're syncing from the start, no delete op should be created and therefore we # should only get 2 ops assert len(ops) == 2 - check_op(database.indices, ops[0], "r1", 2, data[2], options[1], next_version=4) - check_op(database.indices, ops[1], "r1", 8, data[8], options[7], next_version=9) + check_op(database.indices, ops[0], 'r1', 2, data[2], options[1], next_version=4) + 
check_op(database.indices, ops[1], 'r1', 8, data[8], options[7], next_version=9) def test_delete_with_after(self, splitgill: SplitgillClient): builder = ParsingOptionsBuilder() options = { - 2: builder.with_float_format("{0:.4f}").build(), + 2: builder.with_float_format('{0:.4f}').build(), } - database = setup_scenario(splitgill, {"r1": {2: {"x": 5.4}}}, options) + database = setup_scenario(splitgill, {'r1': {2: {'x': 5.4}}}, options) first_ops = list( generate_index_ops( database.indices, @@ -236,7 +236,7 @@ def test_delete_with_after(self, splitgill: SplitgillClient): ) ) - database = setup_scenario(splitgill, {"r1": {4: {}}}, options) + database = setup_scenario(splitgill, {'r1': {4: {}}}, options) second_ops = list( generate_index_ops( database.indices, @@ -248,16 +248,16 @@ def test_delete_with_after(self, splitgill: SplitgillClient): ) assert len(first_ops) == 1 - check_op(database.indices, first_ops[0], "r1", 2, {"x": 5.4}, options[2]) + check_op(database.indices, first_ops[0], 'r1', 2, {'x': 5.4}, options[2]) assert len(second_ops) == 2 - check_delete_op(database.indices, second_ops[0], "r1") + check_delete_op(database.indices, second_ops[0], 'r1') check_op( database.indices, second_ops[1], - "r1", + 'r1', 2, - {"x": 5.4}, + {'x': 5.4}, options[2], next_version=4, ) @@ -265,18 +265,18 @@ def test_delete_with_after(self, splitgill: SplitgillClient): def test_after_between_versions(self, splitgill: SplitgillClient): builder = ParsingOptionsBuilder() data = { - 2: {"x": 5.4}, - 4: {"x": 2.7}, - 8: {"x": 1.4}, - 9: {"x": 0.1}, + 2: {'x': 5.4}, + 4: {'x': 2.7}, + 8: {'x': 1.4}, + 9: {'x': 0.1}, } options = { - 1: builder.with_float_format("{0:.4f}").build(), - 5: builder.with_float_format("{0:.2f}").build(), - 7: builder.with_float_format("{0:.6f}").build(), - 10: builder.with_float_format("{0:.10f}").build(), + 1: builder.with_float_format('{0:.4f}').build(), + 5: builder.with_float_format('{0:.2f}').build(), + 7: builder.with_float_format('{0:.6f}').build(), + 10: builder.with_float_format('{0:.10f}').build(), } - database = setup_scenario(splitgill, {"r1": data}, options) + database = setup_scenario(splitgill, {'r1': data}, options) # set after to 6 as we have no data nor options versions at 6 ops = list( @@ -289,29 +289,29 @@ def test_after_between_versions(self, splitgill: SplitgillClient): ) ) assert len(ops) == 5 - check_op(database.indices, ops[4], "r1", 10, data[9], options[10]) + check_op(database.indices, ops[4], 'r1', 10, data[9], options[10]) check_op( - database.indices, ops[3], "r1", 9, data[9], options[7], next_version=10 + database.indices, ops[3], 'r1', 9, data[9], options[7], next_version=10 ) - check_op(database.indices, ops[2], "r1", 8, data[8], options[7], next_version=9) - check_op(database.indices, ops[1], "r1", 7, data[4], options[7], next_version=8) - check_op(database.indices, ops[0], "r1", 5, data[4], options[5], next_version=7) + check_op(database.indices, ops[2], 'r1', 8, data[8], options[7], next_version=9) + check_op(database.indices, ops[1], 'r1', 7, data[4], options[7], next_version=8) + check_op(database.indices, ops[0], 'r1', 5, data[4], options[5], next_version=7) def test_after_at_both_versions(self, splitgill: SplitgillClient): builder = ParsingOptionsBuilder() data = { - 2: {"x": 5.4}, - 5: {"x": 2.7}, - 8: {"x": 1.4}, - 9: {"x": 0.1}, + 2: {'x': 5.4}, + 5: {'x': 2.7}, + 8: {'x': 1.4}, + 9: {'x': 0.1}, } options = { - 1: builder.with_float_format("{0:.4f}").build(), - 5: builder.with_float_format("{0:.2f}").build(), - 7: 
builder.with_float_format("{0:.6f}").build(), - 10: builder.with_float_format("{0:.10f}").build(), + 1: builder.with_float_format('{0:.4f}').build(), + 5: builder.with_float_format('{0:.2f}').build(), + 7: builder.with_float_format('{0:.6f}').build(), + 10: builder.with_float_format('{0:.10f}').build(), } - database = setup_scenario(splitgill, {"r1": data}, options) + database = setup_scenario(splitgill, {'r1': data}, options) # set after to 5 which we have a version of data and options at ops = list( @@ -324,28 +324,28 @@ def test_after_at_both_versions(self, splitgill: SplitgillClient): ) ) assert len(ops) == 5 - check_op(database.indices, ops[4], "r1", 10, data[9], options[10]) + check_op(database.indices, ops[4], 'r1', 10, data[9], options[10]) check_op( - database.indices, ops[3], "r1", 9, data[9], options[7], next_version=10 + database.indices, ops[3], 'r1', 9, data[9], options[7], next_version=10 ) - check_op(database.indices, ops[2], "r1", 8, data[8], options[7], next_version=9) - check_op(database.indices, ops[1], "r1", 7, data[5], options[7], next_version=8) - check_op(database.indices, ops[0], "r1", 5, data[5], options[5], next_version=7) + check_op(database.indices, ops[2], 'r1', 8, data[8], options[7], next_version=9) + check_op(database.indices, ops[1], 'r1', 7, data[5], options[7], next_version=8) + check_op(database.indices, ops[0], 'r1', 5, data[5], options[5], next_version=7) def test_after_new_data(self, splitgill: SplitgillClient): builder = ParsingOptionsBuilder() data = { - 2: {"x": 5.4}, - 4: {"x": 2.7}, - 8: {"x": 1.4}, - 9: {"x": 0.1}, + 2: {'x': 5.4}, + 4: {'x': 2.7}, + 8: {'x': 1.4}, + 9: {'x': 0.1}, } options = { - 1: builder.with_float_format("{0:.4f}").build(), - 5: builder.with_float_format("{0:.2f}").build(), - 7: builder.with_float_format("{0:.6f}").build(), + 1: builder.with_float_format('{0:.4f}').build(), + 5: builder.with_float_format('{0:.2f}').build(), + 7: builder.with_float_format('{0:.6f}').build(), } - database = setup_scenario(splitgill, {"r1": data}, options) + database = setup_scenario(splitgill, {'r1': data}, options) # set after to 8, this should just mean version 9 of the data is found as new ops = list( @@ -360,24 +360,24 @@ def test_after_new_data(self, splitgill: SplitgillClient): # should get 2 ops, one to update the latest index and one pushing the old # latest down to the non-latest data indices assert len(ops) == 2 - check_op(database.indices, ops[1], "r1", 9, data[9], options[7]) - check_op(database.indices, ops[0], "r1", 8, data[8], options[7], next_version=9) + check_op(database.indices, ops[1], 'r1', 9, data[9], options[7]) + check_op(database.indices, ops[0], 'r1', 8, data[8], options[7], next_version=9) def test_after_new_options(self, splitgill: SplitgillClient): builder = ParsingOptionsBuilder() data = { - 2: {"x": 5.4}, - 4: {"x": 2.7}, - 8: {"x": 1.4}, - 9: {"x": 0.1}, + 2: {'x': 5.4}, + 4: {'x': 2.7}, + 8: {'x': 1.4}, + 9: {'x': 0.1}, } options = { - 1: builder.with_float_format("{0:.4f}").build(), - 5: builder.with_float_format("{0:.2f}").build(), - 7: builder.with_float_format("{0:.6f}").build(), - 10: builder.with_float_format("{0:.3f}").build(), + 1: builder.with_float_format('{0:.4f}').build(), + 5: builder.with_float_format('{0:.2f}').build(), + 7: builder.with_float_format('{0:.6f}').build(), + 10: builder.with_float_format('{0:.3f}').build(), } - database = setup_scenario(splitgill, {"r1": data}, options) + database = setup_scenario(splitgill, {'r1': data}, options) # set after to 9, this should just mean version 10 
of the options is found as # new @@ -393,26 +393,26 @@ def test_after_new_options(self, splitgill: SplitgillClient): # should get 2 ops, one to update the latest index and one pushing the old # latest down to the non-latest data indices assert len(ops) == 2 - check_op(database.indices, ops[1], "r1", 10, data[9], options[10]) + check_op(database.indices, ops[1], 'r1', 10, data[9], options[10]) check_op( - database.indices, ops[0], "r1", 9, data[9], options[7], next_version=10 + database.indices, ops[0], 'r1', 9, data[9], options[7], next_version=10 ) def test_after_new_both(self, splitgill: SplitgillClient): builder = ParsingOptionsBuilder() data = { - 2: {"x": 5.4}, - 4: {"x": 2.7}, - 8: {"x": 1.4}, - 9: {"x": 0.1}, + 2: {'x': 5.4}, + 4: {'x': 2.7}, + 8: {'x': 1.4}, + 9: {'x': 0.1}, } options = { - 1: builder.with_float_format("{0:.4f}").build(), - 5: builder.with_float_format("{0:.2f}").build(), - 7: builder.with_float_format("{0:.6f}").build(), - 9: builder.with_float_format("{0:.3f}").build(), + 1: builder.with_float_format('{0:.4f}').build(), + 5: builder.with_float_format('{0:.2f}').build(), + 7: builder.with_float_format('{0:.6f}').build(), + 9: builder.with_float_format('{0:.3f}').build(), } - database = setup_scenario(splitgill, {"r1": data}, options) + database = setup_scenario(splitgill, {'r1': data}, options) # set after to 8, this should just mean version 10 of the options and data is # found as new @@ -428,15 +428,15 @@ def test_after_new_both(self, splitgill: SplitgillClient): # should get 2 ops, one to update the latest index and one pushing the old # latest down to the non-latest data indices assert len(ops) == 2 - check_op(database.indices, ops[1], "r1", 9, data[9], options[9]) - check_op(database.indices, ops[0], "r1", 8, data[8], options[7], next_version=9) + check_op(database.indices, ops[1], 'r1', 9, data[9], options[9]) + check_op(database.indices, ops[0], 'r1', 8, data[8], options[7], next_version=9) def test_just_latest( self, splitgill: SplitgillClient, basic_options: ParsingOptions ): - data = {1: {"x": 5.4}} + data = {1: {'x': 5.4}} options = {1: basic_options} - database = setup_scenario(splitgill, {"r1": data}, options) + database = setup_scenario(splitgill, {'r1': data}, options) ops = list( generate_index_ops( @@ -448,25 +448,25 @@ def test_just_latest( ) ) assert len(ops) == 1 - check_op(database.indices, ops[0], "r1", 1, data[1], options[1]) + check_op(database.indices, ops[0], 'r1', 1, data[1], options[1]) def test_meta_geo(self, splitgill: SplitgillClient): builder = ( ParsingOptionsBuilder() .with_keyword_length(256) - .with_float_format("{0:.15g}") + .with_float_format('{0:.15g}') ) data = { 1: { - "x": "beans", - "lat": 4, - "lon": 10, - "location": {"type": "Point", "coordinates": [100.4, 0.1]}, + 'x': 'beans', + 'lat': 4, + 'lon': 10, + 'location': {'type': 'Point', 'coordinates': [100.4, 0.1]}, } } - options = {1: builder.with_geo_hint("lat", "lon").build()} + options = {1: builder.with_geo_hint('lat', 'lon').build()} - database = setup_scenario(splitgill, {"r1": data}, options) + database = setup_scenario(splitgill, {'r1': data}, options) ops = list( generate_index_ops( @@ -479,13 +479,13 @@ def test_meta_geo(self, splitgill: SplitgillClient): ) assert len(ops) == 1 - check_op(database.indices, ops[0], "r1", 1, data[1], options[1]) + check_op(database.indices, ops[0], 'r1', 1, data[1], options[1]) def test_arcs_from_nothing(self, splitgill: SplitgillClient): options = ( ParsingOptionsBuilder() .with_keyword_length(256) - 
.with_float_format("{0:.15g}") + .with_float_format('{0:.15g}') ).build() records = 1000 @@ -493,14 +493,14 @@ def test_arcs_from_nothing(self, splitgill: SplitgillClient): arc_max_size = 50 data = { - f"r-{i}": {v: {"x": f"value at {v}"} for v in range(1, versions + 1)} + f'r-{i}': {v: {'x': f'value at {v}'} for v in range(1, versions + 1)} for i in range(records) } database = setup_scenario(splitgill, data, {1: options}) # sync with the arc max size we want - with patch("splitgill.indexing.index.MAX_DOCS_PER_ARC", arc_max_size): + with patch('splitgill.indexing.index.MAX_DOCS_PER_ARC', arc_max_size): database.sync() # get the indices used @@ -519,45 +519,45 @@ def test_arcs_from_nothing(self, splitgill: SplitgillClient): assert indices == sorted(expected) assert database.search().count() == records # the latest arc might not be full - assert splitgill.elasticsearch.count(index=indices[0])["count"] <= arc_max_size + assert splitgill.elasticsearch.count(index=indices[0])['count'] <= arc_max_size # all the other arcs will be though for index in indices[1:-1]: - assert splitgill.elasticsearch.count(index=index)["count"] == arc_max_size + assert splitgill.elasticsearch.count(index=index)['count'] == arc_max_size assert ( - database.search(version=SearchVersion.all).filter(id_query("r-123")).count() + database.search(version=SearchVersion.all).filter(id_query('r-123')).count() == versions ) def test_arcs_size_change_bigger(self, splitgill: SplitgillClient): - database = splitgill.get_database("test") - record_id = "r-1" + database = splitgill.get_database('test') + record_id = 'r-1' # goes to arc-0 - database.ingest([Record(record_id, {"a": 4})], commit=True) + database.ingest([Record(record_id, {'a': 4})], commit=True) # goes to arc-0 - database.ingest([Record(record_id, {"a": 7})], commit=True) + database.ingest([Record(record_id, {'a': 7})], commit=True) # goes to arc-1 - database.ingest([Record(record_id, {"a": 9})], commit=True) + database.ingest([Record(record_id, {'a': 9})], commit=True) # goes to arc-1 - database.ingest([Record(record_id, {"a": 3})], commit=True) + database.ingest([Record(record_id, {'a': 3})], commit=True) # goes to latest, then after second sync goes to arc-1 - database.ingest([Record(record_id, {"a": 8})], commit=True) + database.ingest([Record(record_id, {'a': 8})], commit=True) - with patch("splitgill.indexing.index.MAX_DOCS_PER_ARC", 2): + with patch('splitgill.indexing.index.MAX_DOCS_PER_ARC', 2): database.sync() # goes to arc-1 - database.ingest([Record(record_id, {"a": 2})], commit=True) + database.ingest([Record(record_id, {'a': 2})], commit=True) # goes to arc-2 - database.ingest([Record(record_id, {"a": 1})], commit=True) + database.ingest([Record(record_id, {'a': 1})], commit=True) # goes to arc-2 - database.ingest([Record(record_id, {"a": 0})], commit=True) + database.ingest([Record(record_id, {'a': 0})], commit=True) # goes to arc-2 - database.ingest([Record(record_id, {"a": 4})], commit=True) + database.ingest([Record(record_id, {'a': 4})], commit=True) # goes to latest - database.ingest([Record(record_id, {"a": 9})], commit=True) + database.ingest([Record(record_id, {'a': 9})], commit=True) - with patch("splitgill.indexing.index.MAX_DOCS_PER_ARC", 4): + with patch('splitgill.indexing.index.MAX_DOCS_PER_ARC', 4): database.sync() search = Search(using=splitgill.elasticsearch) @@ -567,34 +567,34 @@ def test_arcs_size_change_bigger(self, splitgill: SplitgillClient): assert search.index(database.indices.get_arc(2)).count() == 3 def 
test_arcs_size_change_smaller(self, splitgill: SplitgillClient): - database = splitgill.get_database("test") - record_id = "r-1" + database = splitgill.get_database('test') + record_id = 'r-1' # goes to arc-0 - database.ingest([Record(record_id, {"a": 4})], commit=True) + database.ingest([Record(record_id, {'a': 4})], commit=True) # goes to arc-0 - database.ingest([Record(record_id, {"a": 7})], commit=True) + database.ingest([Record(record_id, {'a': 7})], commit=True) # goes to arc-0 - database.ingest([Record(record_id, {"a": 9})], commit=True) + database.ingest([Record(record_id, {'a': 9})], commit=True) # goes to arc-0 - database.ingest([Record(record_id, {"a": 3})], commit=True) + database.ingest([Record(record_id, {'a': 3})], commit=True) # goes to latest, then after second sync goes to arc-1 - database.ingest([Record(record_id, {"a": 8})], commit=True) + database.ingest([Record(record_id, {'a': 8})], commit=True) - with patch("splitgill.indexing.index.MAX_DOCS_PER_ARC", 5): + with patch('splitgill.indexing.index.MAX_DOCS_PER_ARC', 5): database.sync() # goes to arc-1 - database.ingest([Record(record_id, {"a": 2})], commit=True) + database.ingest([Record(record_id, {'a': 2})], commit=True) # goes to arc-2 - database.ingest([Record(record_id, {"a": 1})], commit=True) + database.ingest([Record(record_id, {'a': 1})], commit=True) # goes to arc-2 - database.ingest([Record(record_id, {"a": 0})], commit=True) + database.ingest([Record(record_id, {'a': 0})], commit=True) # goes to arc-3 - database.ingest([Record(record_id, {"a": 4})], commit=True) + database.ingest([Record(record_id, {'a': 4})], commit=True) # goes to latest - database.ingest([Record(record_id, {"a": 9})], commit=True) + database.ingest([Record(record_id, {'a': 9})], commit=True) - with patch("splitgill.indexing.index.MAX_DOCS_PER_ARC", 2): + with patch('splitgill.indexing.index.MAX_DOCS_PER_ARC', 2): database.sync() search = Search(using=splitgill.elasticsearch) @@ -607,21 +607,21 @@ def test_arcs_size_change_smaller(self, splitgill: SplitgillClient): def test_arcs_size_1(self, splitgill: SplitgillClient): # an arc size of 1 would never actually be used, but it is a helpful edge case # to test to make sure the logic is sound - database = splitgill.get_database("test") - record_id = "r-1" - database.ingest([Record(record_id, {"a": 4})], commit=True) - database.ingest([Record(record_id, {"a": 7})], commit=True) - database.ingest([Record(record_id, {"a": 9})], commit=True) - database.ingest([Record(record_id, {"a": 3})], commit=True) - database.ingest([Record(record_id, {"a": 8})], commit=True) - with patch("splitgill.indexing.index.MAX_DOCS_PER_ARC", 1): + database = splitgill.get_database('test') + record_id = 'r-1' + database.ingest([Record(record_id, {'a': 4})], commit=True) + database.ingest([Record(record_id, {'a': 7})], commit=True) + database.ingest([Record(record_id, {'a': 9})], commit=True) + database.ingest([Record(record_id, {'a': 3})], commit=True) + database.ingest([Record(record_id, {'a': 8})], commit=True) + with patch('splitgill.indexing.index.MAX_DOCS_PER_ARC', 1): database.sync() - database.ingest([Record(record_id, {"a": 2})], commit=True) - database.ingest([Record(record_id, {"a": 1})], commit=True) - database.ingest([Record(record_id, {"a": 0})], commit=True) - database.ingest([Record(record_id, {"a": 4})], commit=True) - database.ingest([Record(record_id, {"a": 9})], commit=True) - with patch("splitgill.indexing.index.MAX_DOCS_PER_ARC", 1): + database.ingest([Record(record_id, {'a': 2})], commit=True) + 
database.ingest([Record(record_id, {'a': 1})], commit=True) + database.ingest([Record(record_id, {'a': 0})], commit=True) + database.ingest([Record(record_id, {'a': 4})], commit=True) + database.ingest([Record(record_id, {'a': 9})], commit=True) + with patch('splitgill.indexing.index.MAX_DOCS_PER_ARC', 1): database.sync() search = Search(using=splitgill.elasticsearch) @@ -638,22 +638,22 @@ def test_handling_of_non_impactful_options(self, splitgill: SplitgillClient): case where it's tricky to figure out the right versions of the data to make ops for. """ - database = splitgill.get_database("test") - doc_id = "r-1" + database = splitgill.get_database('test') + doc_id = 'r-1' # create an initial version with some parsing options - with freeze_time("2025-05-01 09:00:00"): + with freeze_time('2025-05-01 09:00:00'): database.update_options(ParsingOptionsBuilder().build(), commit=False) - database.ingest([Record(doc_id, {"x": 5})], commit=True) + database.ingest([Record(doc_id, {'x': 5})], commit=True) # update the parsing options in a way which doesn't impact the resulting parsed data - with freeze_time("2025-05-01 11:00:00"): + with freeze_time('2025-05-01 11:00:00'): database.update_options( ParsingOptionsBuilder().with_keyword_length(10).build(), commit=True ) # update the data - with freeze_time("2025-05-01 12:00:00"): - database.ingest([Record(doc_id, {"x": 7})], commit=True) + with freeze_time('2025-05-01 12:00:00'): + database.ingest([Record(doc_id, {'x': 7})], commit=True) ops = list( generate_index_ops( @@ -672,32 +672,32 @@ def test_handling_of_non_impactful_options(self, splitgill: SplitgillClient): # the first op has the first data/option version assert ops[0].document[DocumentField.VERSION] == 1746090000000 assert ops[0].document[DocumentField.VERSIONS] == { - "gte": 1746090000000, + 'gte': 1746090000000, # but the op doesn't have next version as the middle options change at 11am, # it's the next data change at 12 noon because the options change doesn't change # the resulting document - "lt": 1746100800000, + 'lt': 1746100800000, } assert ops[1].index == database.indices.latest assert ops[1].doc_id == doc_id assert ops[1].document[DocumentField.VERSION] == 1746100800000 - assert ops[1].document[DocumentField.VERSIONS] == {"gte": 1746100800000} + assert ops[1].document[DocumentField.VERSIONS] == {'gte': 1746100800000} def test_delete_op(): - op = DeleteOp("test-index", "record-1") + op = DeleteOp('test-index', 'record-1') assert op.serialise() == json.dumps( - {"delete": {"_index": "test-index", "_id": "record-1"}}, separators=(",", ":") + {'delete': {'_index': 'test-index', '_id': 'record-1'}}, separators=(',', ':') ) def test_index_op(): - op = IndexOp("test-index", "record-1", {"x": "beans", "y": "beans", "z": 4.689221}) + op = IndexOp('test-index', 'record-1', {'x': 'beans', 'y': 'beans', 'z': 4.689221}) metadata = json.dumps( - {"index": {"_index": "test-index", "_id": "record-1"}}, separators=(",", ":") + {'index': {'_index': 'test-index', '_id': 'record-1'}}, separators=(',', ':') ) data = json.dumps( - {"x": "beans", "y": "beans", "z": 4.689221}, separators=(",", ":") + {'x': 'beans', 'y': 'beans', 'z': 4.689221}, separators=(',', ':') ) - assert op.serialise() == f"{metadata}\n{data}" + assert op.serialise() == f'{metadata}\n{data}' diff --git a/tests/indexing/test_options.py b/tests/indexing/test_options.py index b3591b3..eec60f2 100644 --- a/tests/indexing/test_options.py +++ b/tests/indexing/test_options.py @@ -7,12 +7,12 @@ class TestParsingOptionsBuilder: def 
test_with_geo_hint(self): builder = ParsingOptionsBuilder() - builder.with_geo_hint("lat", "lon").with_geo_hint("x", "y", "rad", 12) + builder.with_geo_hint('lat', 'lon').with_geo_hint('x', 'y', 'rad', 12) - assert GeoFieldHint("lat", "lon") in builder._geo_hints - assert GeoFieldHint("x", "y", "rad", 12) in builder._geo_hints + assert GeoFieldHint('lat', 'lon') in builder._geo_hints + assert GeoFieldHint('x', 'y', 'rad', 12) in builder._geo_hints - another_ref = builder.with_geo_hint("lat", "lon") + another_ref = builder.with_geo_hint('lat', 'lon') assert len(builder._geo_hints) == 2 # check the chaining works properly @@ -20,24 +20,24 @@ def test_with_geo_hint(self): def test_with_true_value(self): builder = ParsingOptionsBuilder() - builder.with_true_value("aye") + builder.with_true_value('aye') builder.with_true_value(None) - assert "aye" in builder._true_values + assert 'aye' in builder._true_values assert len(builder._true_values) == 1 def test_with_false_value(self): builder = ParsingOptionsBuilder() - builder.with_false_value("narp") + builder.with_false_value('narp') builder.with_false_value(None) - assert "narp" in builder._false_values + assert 'narp' in builder._false_values assert len(builder._false_values) == 1 def test_with_date_format(self): builder = ParsingOptionsBuilder() base_count = len(builder._date_formats) - builder.with_date_format("%Y") + builder.with_date_format('%Y') builder.with_date_format(None) - assert "%Y" in builder._date_formats + assert '%Y' in builder._date_formats assert len(builder._date_formats) == 1 + base_count def test_keyword_length(self): diff --git a/tests/indexing/test_parser.py b/tests/indexing/test_parser.py index e8d33b5..b136a25 100644 --- a/tests/indexing/test_parser.py +++ b/tests/indexing/test_parser.py @@ -1,12 +1,12 @@ -from datetime import datetime, date, timezone, timedelta +from datetime import date, datetime, timedelta, timezone from itertools import chain import pytest from shapely import from_wkt from splitgill.diffing import prepare_data -from splitgill.indexing.fields import ParsedType, DataType -from splitgill.indexing.geo import match_hints, match_geojson +from splitgill.indexing.fields import DataType, ParsedType +from splitgill.indexing.geo import match_geojson, match_hints from splitgill.indexing.options import ParsingOptionsBuilder from splitgill.indexing.parser import parse, parse_value from splitgill.model import ParsingOptions @@ -42,181 +42,181 @@ def pt(path: str, *types: ParsedType, include_text: bool = True) -> str: if include_text: types.append(ParsedType.KEYWORD) types.append(ParsedType.TEXT) - return f"{path}.{','.join(sorted(types))}" + return f'{path}.{",".join(sorted(types))}' def dt(path: str, *types: DataType) -> str: - return f"{path}.{','.join(sorted(types))}" + return f'{path}.{",".join(sorted(types))}' class TestParse: def test_no_nesting(self, basic_options: ParsingOptions): - data = {"x": "beans"} + data = {'x': 'beans'} parsed_data = parse(data, basic_options) - assert parsed_data.parsed == {"x": parse_value("beans", basic_options)} - assert parsed_data.data_types == [dt("x", DataType.STR)] - assert parsed_data.parsed_types == [pt("x")] + assert parsed_data.parsed == {'x': parse_value('beans', basic_options)} + assert parsed_data.data_types == [dt('x', DataType.STR)] + assert parsed_data.parsed_types == [pt('x')] def test_list_of_strings(self, basic_options: ParsingOptions): - data = {"x": ["beans", "lemons", "goats"]} + data = {'x': ['beans', 'lemons', 'goats']} parsed_data = parse(data, 
basic_options) assert parsed_data.parsed == { - "x": [parse_value(value, basic_options) for value in data["x"]] + 'x': [parse_value(value, basic_options) for value in data['x']] } assert parsed_data.data_types == [ - dt("x", DataType.LIST), - dt("x.", DataType.STR), + dt('x', DataType.LIST), + dt('x.', DataType.STR), ] - assert parsed_data.parsed_types == [pt("x")] + assert parsed_data.parsed_types == [pt('x')] def test_list_of_dicts(self, basic_options: ParsingOptions): - data = {"x": [{"a": 4}, {"a": 5}, {"a": 6}]} + data = {'x': [{'a': 4}, {'a': 5}, {'a': 6}]} parsed_data = parse(data, basic_options) assert parsed_data.parsed == { - "x": [ - {"a": parse_value(4, basic_options)}, - {"a": parse_value(5, basic_options)}, - {"a": parse_value(6, basic_options)}, + 'x': [ + {'a': parse_value(4, basic_options)}, + {'a': parse_value(5, basic_options)}, + {'a': parse_value(6, basic_options)}, ] } assert parsed_data.data_types == [ - dt("x", DataType.LIST), - dt("x.", DataType.DICT), - dt("x..a", DataType.INT), + dt('x', DataType.LIST), + dt('x.', DataType.DICT), + dt('x..a', DataType.INT), ] - assert parsed_data.parsed_types == [pt("x.a", ParsedType.NUMBER)] + assert parsed_data.parsed_types == [pt('x.a', ParsedType.NUMBER)] def test_list_of_lists(self, basic_options: ParsingOptions): - data = {"x": [[1, 2, 3], [4, 5, 6]]} + data = {'x': [[1, 2, 3], [4, 5, 6]]} parsed_data = parse(data, basic_options) assert parsed_data.parsed == { - "x": [ + 'x': [ [parse_value(value, basic_options) for value in [1, 2, 3]], [parse_value(value, basic_options) for value in [4, 5, 6]], ] } assert parsed_data.data_types == [ - dt("x", DataType.LIST), - dt("x.", DataType.LIST), - dt("x..", DataType.INT), + dt('x', DataType.LIST), + dt('x.', DataType.LIST), + dt('x..', DataType.INT), ] - assert parsed_data.parsed_types == [pt("x", ParsedType.NUMBER)] + assert parsed_data.parsed_types == [pt('x', ParsedType.NUMBER)] def test_nested_dict(self, basic_options: ParsingOptions): - data = {"x": "beans", "y": {"a": "5", "b": "buckets!"}} + data = {'x': 'beans', 'y': {'a': '5', 'b': 'buckets!'}} parsed_data = parse(data, basic_options) assert parsed_data.parsed == { - "x": parse_value("beans", basic_options), - "y": { - "a": parse_value("5", basic_options), - "b": parse_value("buckets!", basic_options), + 'x': parse_value('beans', basic_options), + 'y': { + 'a': parse_value('5', basic_options), + 'b': parse_value('buckets!', basic_options), }, } assert parsed_data.data_types == [ - dt("x", DataType.STR), - dt("y", DataType.DICT), - dt("y.a", DataType.STR), - dt("y.b", DataType.STR), + dt('x', DataType.STR), + dt('y', DataType.DICT), + dt('y.a', DataType.STR), + dt('y.b', DataType.STR), ] assert parsed_data.parsed_types == [ - pt("x"), - pt("y.a", ParsedType.NUMBER), - pt("y.b"), + pt('x'), + pt('y.a', ParsedType.NUMBER), + pt('y.b'), ] def test_nested_mix(self, basic_options: ParsingOptions): data = { - "x": ["4", "6", [{"a": ["1", "2"]}, {"a": ["6", "1"]}]], - "y": {"t": [{"x": 4}, {"x": 1}, {"x": 5.6}]}, + 'x': ['4', '6', [{'a': ['1', '2']}, {'a': ['6', '1']}]], + 'y': {'t': [{'x': 4}, {'x': 1}, {'x': 5.6}]}, } parsed_data = parse(data, basic_options) assert parsed_data.parsed == { - "x": [ - parse_value("4", basic_options), - parse_value("6", basic_options), + 'x': [ + parse_value('4', basic_options), + parse_value('6', basic_options), [ { - "a": [ - parse_value("1", basic_options), - parse_value("2", basic_options), + 'a': [ + parse_value('1', basic_options), + parse_value('2', basic_options), ] }, { - "a": [ - 
parse_value("6", basic_options), - parse_value("1", basic_options), + 'a': [ + parse_value('6', basic_options), + parse_value('1', basic_options), ] }, ], ], - "y": { - "t": [ - {"x": parse_value(4, basic_options)}, - {"x": parse_value(1, basic_options)}, - {"x": parse_value(5.6, basic_options)}, + 'y': { + 't': [ + {'x': parse_value(4, basic_options)}, + {'x': parse_value(1, basic_options)}, + {'x': parse_value(5.6, basic_options)}, ] }, } assert parsed_data.data_types == [ - dt("x", DataType.LIST), - dt("x.", DataType.STR, DataType.LIST), - dt("x..", DataType.DICT), - dt("x...a", DataType.LIST), - dt("x...a.", DataType.STR), - dt("y", DataType.DICT), - dt("y.t", DataType.LIST), - dt("y.t.", DataType.DICT), - dt("y.t..x", DataType.INT, DataType.FLOAT), + dt('x', DataType.LIST), + dt('x.', DataType.STR, DataType.LIST), + dt('x..', DataType.DICT), + dt('x...a', DataType.LIST), + dt('x...a.', DataType.STR), + dt('y', DataType.DICT), + dt('y.t', DataType.LIST), + dt('y.t.', DataType.DICT), + dt('y.t..x', DataType.INT, DataType.FLOAT), ] assert parsed_data.parsed_types == [ - pt("x", ParsedType.NUMBER), - pt("x.a", ParsedType.NUMBER), - pt("y.t.x", ParsedType.NUMBER), + pt('x', ParsedType.NUMBER), + pt('x.a', ParsedType.NUMBER), + pt('y.t.x', ParsedType.NUMBER), ] def test_geo_hinted_fields(self, basic_options: ParsingOptions): data = { - "x": "something", - "y": "somewhere", - "decimalLatitude": 14.897, - "decimalLongitude": -87.956, + 'x': 'something', + 'y': 'somewhere', + 'decimalLatitude': 14.897, + 'decimalLongitude': -87.956, } parsed_data = parse(data, basic_options) geo_data = next(iter(match_hints(data, basic_options.geo_hints).values())) assert parsed_data.parsed == { - "x": parse_value("something", basic_options), - "y": parse_value("somewhere", basic_options), - "decimalLatitude": {**parse_value(14.897, basic_options), **geo_data}, - "decimalLongitude": parse_value(-87.956, basic_options), + 'x': parse_value('something', basic_options), + 'y': parse_value('somewhere', basic_options), + 'decimalLatitude': {**parse_value(14.897, basic_options), **geo_data}, + 'decimalLongitude': parse_value(-87.956, basic_options), } def test_geojson_field( self, geojson_point: dict, wkt_point: str, basic_options: ParsingOptions ): data = { - "x": geojson_point, - "y": "somewhere", + 'x': geojson_point, + 'y': 'somewhere', } parsed_data = parse(data, basic_options) assert parsed_data.parsed == { - "x": { + 'x': { ParsedType.GEO_POINT: wkt_point, ParsedType.GEO_SHAPE: wkt_point, - "type": parse_value("Point", basic_options), - "coordinates": [ + 'type': parse_value('Point', basic_options), + 'coordinates': [ parse_value(30, basic_options), parse_value(10, basic_options), ], }, - "y": parse_value("somewhere", basic_options), + 'y': parse_value('somewhere', basic_options), } def test_geojson_at_root_not_recognised( @@ -225,8 +225,8 @@ def test_geojson_at_root_not_recognised( parsed_data = parse(geojson_point, basic_options) assert parsed_data.parsed == { - "type": parse_value("Point", basic_options), - "coordinates": [ + 'type': parse_value('Point', basic_options), + 'coordinates': [ parse_value(30.0, basic_options), parse_value(10.0, basic_options), ], @@ -241,7 +241,7 @@ def test_geojson_field_list( basic_options: ParsingOptions, ): data = { - "x": [ + 'x': [ geojson_point, geojson_linestring, geojson_polygon, @@ -251,7 +251,7 @@ def test_geojson_field_list( parsed_data = parse(data, basic_options) assert parsed_data.parsed == { - "x": [ + 'x': [ { **parse(value, basic_options).parsed, 
**match_geojson(value), @@ -266,41 +266,41 @@ def test_geojson_field_list( } def test_dict_with_nulls(self, basic_options: ParsingOptions): - data = {"a": "hello", "b": None, "c": ""} + data = {'a': 'hello', 'b': None, 'c': ''} parsed_data = parse(data, basic_options) assert parsed_data.parsed == { - "a": parse_value("hello", basic_options), - "b": {ParsedType.UNPARSED: None}, - "c": {ParsedType.UNPARSED: ""}, + 'a': parse_value('hello', basic_options), + 'b': {ParsedType.UNPARSED: None}, + 'c': {ParsedType.UNPARSED: ''}, } assert parsed_data.data_types == [ - f"a.{DataType.STR}", - f"b.{DataType.NONE}", - f"c.{DataType.STR}", + f'a.{DataType.STR}', + f'b.{DataType.NONE}', + f'c.{DataType.STR}', ] - assert parsed_data.parsed_types == [pt("a")] + assert parsed_data.parsed_types == [pt('a')] def test_list_with_nulls(self, basic_options: ParsingOptions): - data = {"a": ["hello", None, ""]} + data = {'a': ['hello', None, '']} parsed_data = parse(data, basic_options) assert parsed_data.parsed == { - "a": [parse_value("hello", basic_options), None, None] + 'a': [parse_value('hello', basic_options), None, None] } assert parsed_data.data_types == [ - dt("a", DataType.LIST), - dt("a.", DataType.NONE, DataType.STR), + dt('a', DataType.LIST), + dt('a.', DataType.NONE, DataType.STR), ] - assert parsed_data.parsed_types == [pt("a")] + assert parsed_data.parsed_types == [pt('a')] class TestParseValue: def test_normal_text(self, basic_options: ParsingOptions): - assert parse_value("banana", basic_options) == { - ParsedType.UNPARSED: "banana", - ParsedType.TEXT: "banana", - ParsedType.KEYWORD: "banana", + assert parse_value('banana', basic_options) == { + ParsedType.UNPARSED: 'banana', + ParsedType.TEXT: 'banana', + ParsedType.KEYWORD: 'banana', } def test_bools(self, basic_options: ParsingOptions): @@ -324,56 +324,56 @@ def test_bools(self, basic_options: ParsingOptions): } def test_number(self, basic_options: ParsingOptions): - assert parse_value("5.3", basic_options) == { - ParsedType.UNPARSED: "5.3", - ParsedType.TEXT: "5.3", - ParsedType.KEYWORD: "5.3", + assert parse_value('5.3', basic_options) == { + ParsedType.UNPARSED: '5.3', + ParsedType.TEXT: '5.3', + ParsedType.KEYWORD: '5.3', ParsedType.NUMBER: 5.3, } - assert parse_value("70", basic_options) == { - ParsedType.UNPARSED: "70", - ParsedType.TEXT: "70", - ParsedType.KEYWORD: "70", + assert parse_value('70', basic_options) == { + ParsedType.UNPARSED: '70', + ParsedType.TEXT: '70', + ParsedType.KEYWORD: '70', ParsedType.NUMBER: 70.0, } - assert parse_value("70.0", basic_options) == { - ParsedType.UNPARSED: "70.0", - ParsedType.TEXT: "70.0", - ParsedType.KEYWORD: "70.0", + assert parse_value('70.0', basic_options) == { + ParsedType.UNPARSED: '70.0', + ParsedType.TEXT: '70.0', + ParsedType.KEYWORD: '70.0', ParsedType.NUMBER: 70.0, } assert parse_value(4, basic_options) == { ParsedType.UNPARSED: 4, - ParsedType.TEXT: "4", - ParsedType.KEYWORD: "4", + ParsedType.TEXT: '4', + ParsedType.KEYWORD: '4', ParsedType.NUMBER: 4, } assert parse_value(16.04, basic_options) == { ParsedType.UNPARSED: 16.04, - ParsedType.TEXT: "16.04", - ParsedType.KEYWORD: "16.04", + ParsedType.TEXT: '16.04', + ParsedType.KEYWORD: '16.04', ParsedType.NUMBER: 16.04, } assert parse_value(16.042245342119813456, basic_options) == { ParsedType.UNPARSED: 16.042245342119813456, - ParsedType.TEXT: "16.0422453421198", - ParsedType.KEYWORD: "16.0422453421198", + ParsedType.TEXT: '16.0422453421198', + ParsedType.KEYWORD: '16.0422453421198', ParsedType.NUMBER: 16.042245342119813456, } - 
assert parse_value("1.2312e-20", basic_options) == { - ParsedType.UNPARSED: "1.2312e-20", - ParsedType.TEXT: "1.2312e-20", - ParsedType.KEYWORD: "1.2312e-20", + assert parse_value('1.2312e-20', basic_options) == { + ParsedType.UNPARSED: '1.2312e-20', + ParsedType.TEXT: '1.2312e-20', + ParsedType.KEYWORD: '1.2312e-20', ParsedType.NUMBER: 1.2312e-20, } def test_invalid_numbers(self, basic_options: ParsingOptions): - assert ParsedType.NUMBER.value not in parse_value("5.3.4", basic_options) - assert ParsedType.NUMBER.value not in parse_value("NaN", basic_options) - assert ParsedType.NUMBER.value not in parse_value("inf", basic_options) + assert ParsedType.NUMBER.value not in parse_value('5.3.4', basic_options) + assert ParsedType.NUMBER.value not in parse_value('NaN', basic_options) + assert ParsedType.NUMBER.value not in parse_value('inf', basic_options) def test_date_date_and_time(self, basic_options: ParsingOptions): - value = "2005-07-02 20:16:47.458301" + value = '2005-07-02 20:16:47.458301' assert parse_value(value, basic_options) == { ParsedType.UNPARSED: value, @@ -381,22 +381,22 @@ def test_date_date_and_time(self, basic_options: ParsingOptions): ParsedType.KEYWORD: value, ParsedType.DATE: to_timestamp( # check the timestamp is converted correctly, it'll be UTC so add +00:00 - datetime.fromisoformat(f"{value}+00:00") + datetime.fromisoformat(f'{value}+00:00') ), } def test_date_date_and_time_and_tz(self, basic_options: ParsingOptions): - assert parse_value("2005-07-02 20:16:47.103+05:00", basic_options) == { - ParsedType.UNPARSED: "2005-07-02 20:16:47.103+05:00", - ParsedType.TEXT: "2005-07-02 20:16:47.103+05:00", - ParsedType.KEYWORD: "2005-07-02 20:16:47.103+05:00", + assert parse_value('2005-07-02 20:16:47.103+05:00', basic_options) == { + ParsedType.UNPARSED: '2005-07-02 20:16:47.103+05:00', + ParsedType.TEXT: '2005-07-02 20:16:47.103+05:00', + ParsedType.KEYWORD: '2005-07-02 20:16:47.103+05:00', ParsedType.DATE: to_timestamp( - datetime.fromisoformat("2005-07-02T20:16:47.103000+05:00") + datetime.fromisoformat('2005-07-02T20:16:47.103000+05:00') ), } def test_date_just_a_date(self, basic_options: ParsingOptions): - value = "2005-07-02" + value = '2005-07-02' assert parse_value(value, basic_options) == { ParsedType.UNPARSED: value, @@ -404,18 +404,18 @@ def test_date_just_a_date(self, basic_options: ParsingOptions): ParsedType.KEYWORD: value, ParsedType.DATE: to_timestamp( # use midnight UTC - datetime.fromisoformat(f"{value}T00:00:00+00:00") + datetime.fromisoformat(f'{value}T00:00:00+00:00') ), } @pytest.mark.parametrize( - "value,epoch", + 'value,epoch', [ # RFC 3339 - ("1996-12-19T16:39:57-08:00", 851042397000), - ("1990-12-31T23:59:59+00:00", 662687999000), + ('1996-12-19T16:39:57-08:00', 851042397000), + ('1990-12-31T23:59:59+00:00', 662687999000), # dates - ("2012-05-03", 1336003200000), + ('2012-05-03', 1336003200000), ], ) def test_date_formats(self, value: str, epoch: int, basic_options: ParsingOptions): @@ -423,9 +423,9 @@ def test_date_formats(self, value: str, epoch: int, basic_options: ParsingOption assert parsed[ParsedType.DATE.value] == epoch def test_date_formats_that_we_want_ignore(self, basic_options: ParsingOptions): - assert ParsedType.DATE.value not in parse_value("12:04:23", basic_options) + assert ParsedType.DATE.value not in parse_value('12:04:23', basic_options) assert ParsedType.DATE.value not in parse_value( - "2007-03-01T13:00:00Z.2008-05-11T15:30:00Z", basic_options + '2007-03-01T13:00:00Z.2008-05-11T15:30:00Z', basic_options ) def 
test_caching_of_bools_and_ints(self, basic_options: ParsingOptions): diff --git a/tests/indexing/test_templates.py b/tests/indexing/test_templates.py index 855d498..ca7fc41 100644 --- a/tests/indexing/test_templates.py +++ b/tests/indexing/test_templates.py @@ -1,5 +1,3 @@ -from uuid import uuid4 - import pytest from elasticsearch import Elasticsearch @@ -22,28 +20,28 @@ def test_index_template_usage(elasticsearch_client: Elasticsearch): # picked. This index name hits this clash and therefore checks that the priorities # have been set up correctly and the correct template is chosen. resp1 = elasticsearch_client.indices.simulate_index_template( - name="data-beans-arc-latest" + name='data-beans-arc-latest' ) - assert resp1.body["template"]["settings"]["index"]["number_of_shards"] == "5" + assert resp1.body['template']['settings']['index']['number_of_shards'] == '5' # normal index names resp2 = elasticsearch_client.indices.simulate_index_template( - name="data-5788f3e2-6e71-4ecb-aa04-cfba6da1a691-latest" + name='data-5788f3e2-6e71-4ecb-aa04-cfba6da1a691-latest' ) - assert resp2.body["template"]["settings"]["index"]["number_of_shards"] == "5" + assert resp2.body['template']['settings']['index']['number_of_shards'] == '5' resp3 = elasticsearch_client.indices.simulate_index_template( - name="data-5788f3e2-6e71-4ecb-aa04-cfba6da1a691-arc-0" + name='data-5788f3e2-6e71-4ecb-aa04-cfba6da1a691-arc-0' ) - assert resp3.body["template"]["settings"]["index"]["number_of_shards"] == "1" + assert resp3.body['template']['settings']['index']['number_of_shards'] == '1' def test_all_text(splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) - records = [Record.new({"a": "banana", "b": "apple", "c": 5.8, "d": True})] + database = SplitgillDatabase('test', splitgill) + records = [Record.new({'a': 'banana', 'b': 'apple', 'c': 5.8, 'd': True})] database.ingest(records, commit=True) database.sync() - queries = ["banana", "apple", "5.8", "true"] + queries = ['banana', 'apple', '5.8', 'true'] for query in queries: count = database.search().filter(match_query(query)).count() assert count == 1 @@ -53,16 +51,16 @@ def test_all_text(splitgill: SplitgillClient): def database_with_geo(splitgill: SplitgillClient): records = [ # a couple of wkt points - Record("r1", {"a": "POINT (22.8 3.3)", "b": "POINT (11.3 19.5)"}), + Record('r1', {'a': 'POINT (22.8 3.3)', 'b': 'POINT (11.3 19.5)'}), # one geojson point - Record("r2", {"a": {"coordinates": [14.2, -15.6], "type": "Point"}}), + Record('r2', {'a': {'coordinates': [14.2, -15.6], 'type': 'Point'}}), # a wkt polygon (centre is 7.15 6.1) Record( - "r3", - {"a": "POLYGON ((-2.6 14.3, -2.6 -2.1, 16.9 -2.1, 16.9 14.3, -2.6 14.3))"}, + 'r3', + {'a': 'POLYGON ((-2.6 14.3, -2.6 -2.1, 16.9 -2.1, 16.9 14.3, -2.6 14.3))'}, ), ] - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) database.ingest(records, commit=True) database.sync() return database @@ -75,18 +73,18 @@ def test_simple(self, database_with_geo: SplitgillDatabase): database_with_geo.search() .source(DocumentField.ID) .filter( - "geo_bounding_box", + 'geo_bounding_box', **{ DocumentField.ALL_POINTS: { - "top_left": [9, -11], - "bottom_right": [18, -18], + 'top_left': [9, -11], + 'bottom_right': [18, -18], } }, ) .execute() ) assert len(resp.hits.hits) == 1 - assert resp.hits.hits[0]._source[DocumentField.ID] == "r2" + assert resp.hits.hits[0]._source[DocumentField.ID] == 'r2' def test_shape_miss(self, database_with_geo: SplitgillDatabase): # this will find 
r1 but not r3 because although the search area contains r3's @@ -95,17 +93,17 @@ def test_shape_miss(self, database_with_geo: SplitgillDatabase): database_with_geo.search() .source(DocumentField.ID) .filter( - "geo_bounding_box", + 'geo_bounding_box', **{ DocumentField.ALL_POINTS: { - "top_left": [7, 23], - "bottom_right": [15, 11], + 'top_left': [7, 23], + 'bottom_right': [15, 11], } }, ) .execute() ) - assert resp.hits.hits[0]._source[DocumentField.ID] == "r1" + assert resp.hits.hits[0]._source[DocumentField.ID] == 'r1' def test_shape_hit(self, database_with_geo: SplitgillDatabase): # this will find r1 and r3 because the search area contains r3's point (i.e. @@ -114,19 +112,19 @@ def test_shape_hit(self, database_with_geo: SplitgillDatabase): database_with_geo.search() .source(DocumentField.ID) .filter( - "geo_bounding_box", + 'geo_bounding_box', **{ DocumentField.ALL_POINTS: { - "top_left": [1.7, 24.1], - "bottom_right": [13.8, 2.6], + 'top_left': [1.7, 24.1], + 'bottom_right': [13.8, 2.6], } }, ) .execute() ) assert sorted(h._source[DocumentField.ID] for h in resp.hits.hits) == [ - "r1", - "r3", + 'r1', + 'r3', ] @@ -137,35 +135,35 @@ def test_simple(self, database_with_geo: SplitgillDatabase): database_with_geo.search() .source(DocumentField.ID) .filter( - "geo_bounding_box", + 'geo_bounding_box', **{ DocumentField.ALL_SHAPES: { - "top_left": [9, -11], - "bottom_right": [18, -18], + 'top_left': [9, -11], + 'bottom_right': [18, -18], } }, ) .execute() ) assert len(resp.hits.hits) == 1 - assert resp.hits.hits[0]._source[DocumentField.ID] == "r2" + assert resp.hits.hits[0]._source[DocumentField.ID] == 'r2' def test_shape(self, database_with_geo: SplitgillDatabase): resp = ( database_with_geo.search() .source(DocumentField.ID) .filter( - "geo_bounding_box", + 'geo_bounding_box', **{ DocumentField.ALL_SHAPES: { - "top_left": [7, 23], - "bottom_right": [15, 11], + 'top_left': [7, 23], + 'bottom_right': [15, 11], } }, ) .execute() ) assert sorted(h._source[DocumentField.ID] for h in resp.hits.hits) == [ - "r1", - "r3", + 'r1', + 'r3', ] diff --git a/tests/test_diffing.py b/tests/test_diffing.py index 4c333e1..acd8600 100644 --- a/tests/test_diffing.py +++ b/tests/test_diffing.py @@ -1,16 +1,16 @@ -from datetime import datetime, timezone, timedelta, date +from datetime import date, datetime, timedelta, timezone from decimal import Decimal import pytest from splitgill.diffing import ( - prepare_data, - diff, - DiffOp, - DiffingTypeComparisonException, - patch, DictComparison, + DiffingTypeComparisonException, + DiffOp, ListComparison, + diff, + patch, + prepare_data, prepare_field_name, ) @@ -20,11 +20,11 @@ def test_none(self): assert prepare_data(None) is None def test_str(self): - assert prepare_data("beans") == "beans" - assert prepare_data("beans\tand\rlemons\neh?") == "beans\tand\rlemons\neh?" - assert prepare_data("beans\x07andabell") == "beansandabell" + assert prepare_data('beans') == 'beans' + assert prepare_data('beans\tand\rlemons\neh?') == 'beans\tand\rlemons\neh?' + assert prepare_data('beans\x07andabell') == 'beansandabell' assert ( - prepare_data("bea\x07ns\tand\rlem\x00ons\neh?") == "beans\tand\rlemons\neh?" + prepare_data('bea\x07ns\tand\rlem\x00ons\neh?') == 'beans\tand\rlemons\neh?' ) def test_numbers(self): @@ -33,8 +33,8 @@ def test_numbers(self): assert prepare_data(-20.5012) == -20.5012 assert prepare_data(0) == 0 # only ints and float please! 
- assert prepare_data(complex(3, 4)) == "(3+4j)" - assert prepare_data(Decimal("3.4")) == "3.4" + assert prepare_data(complex(3, 4)) == '(3+4j)' + assert prepare_data(Decimal('3.4')) == '3.4' def test_bool(self): assert prepare_data(True) is True @@ -42,172 +42,172 @@ def test_bool(self): def test_datetime(self): naive_no_ms = datetime(2020, 5, 18, 15, 16, 56) - assert prepare_data(naive_no_ms) == "2020-05-18T15:16:56.000000" + assert prepare_data(naive_no_ms) == '2020-05-18T15:16:56.000000' naive_with_ms = datetime(2020, 5, 18, 15, 16, 56, 2908) - assert prepare_data(naive_with_ms) == "2020-05-18T15:16:56.002908" + assert prepare_data(naive_with_ms) == '2020-05-18T15:16:56.002908' minus_3_hours = timezone(timedelta(hours=-3)) with_tz_no_ms = datetime(2020, 5, 18, 15, 16, 56, tzinfo=minus_3_hours) - assert prepare_data(with_tz_no_ms) == "2020-05-18T15:16:56.000000-0300" + assert prepare_data(with_tz_no_ms) == '2020-05-18T15:16:56.000000-0300' with_tz_with_ms = datetime(2020, 5, 18, 15, 16, 56, 2908, tzinfo=minus_3_hours) - assert prepare_data(with_tz_with_ms) == "2020-05-18T15:16:56.002908-0300" + assert prepare_data(with_tz_with_ms) == '2020-05-18T15:16:56.002908-0300' def test_date(self): - assert prepare_data(date(2024, 3, 10)) == "2024-03-10" + assert prepare_data(date(2024, 3, 10)) == '2024-03-10' def test_dict(self): assert prepare_data({}) == {} - assert prepare_data({"x": None}) == {"x": None} - assert prepare_data({"x": "beans"}) == {"x": "beans"} - assert prepare_data({"x": "4"}) == {"x": prepare_data("4")} - assert prepare_data({3: True}) == {"3": prepare_data(True)} - assert prepare_data({4: {6: 1}}) == {"4": {"6": prepare_data(1)}} - assert prepare_data({"x.y": "4"}) == {"x_y": prepare_data("4")} - assert prepare_data({"x\ny": "4"}) == {"xy": prepare_data("4")} - assert prepare_data({"x\ny.n ": "4"}) == {"xy_n": prepare_data("4")} + assert prepare_data({'x': None}) == {'x': None} + assert prepare_data({'x': 'beans'}) == {'x': 'beans'} + assert prepare_data({'x': '4'}) == {'x': prepare_data('4')} + assert prepare_data({3: True}) == {'3': prepare_data(True)} + assert prepare_data({4: {6: 1}}) == {'4': {'6': prepare_data(1)}} + assert prepare_data({'x.y': '4'}) == {'x_y': prepare_data('4')} + assert prepare_data({'x\ny': '4'}) == {'xy': prepare_data('4')} + assert prepare_data({'x\ny.n ': '4'}) == {'xy_n': prepare_data('4')} def test_list(self): assert prepare_data([]) == [] assert prepare_data([3, None, 5]) == [3, None, 5] assert prepare_data([1, 2, 3]) == [1, 2, 3] - assert prepare_data([1, True, "3"]) == [1, True, "3"] + assert prepare_data([1, True, '3']) == [1, True, '3'] def test_set(self): assert prepare_data(set()) == [] - prepared = prepare_data({1, 2, 3, "beans", None}) + prepared = prepare_data({1, 2, 3, 'beans', None}) assert isinstance(prepared, list) assert 1 in prepared assert 2 in prepared assert 3 in prepared - assert "beans" in prepared + assert 'beans' in prepared assert None in prepared def test_tuple(self): assert prepare_data(tuple()) == [] assert prepare_data((3, None, 5)) == [3, None, 5] assert prepare_data((1, 2, 3)) == [1, 2, 3] - assert prepare_data((1, True, "3")) == [1, True, "3"] + assert prepare_data((1, True, '3')) == [1, True, '3'] def test_fallback(self): class A: def __str__(self): - return "beans" + return 'beans' - assert prepare_data(A()) == "beans" + assert prepare_data(A()) == 'beans' def test_mix(self): prepared = prepare_data( { - "x": "4", - "y": True, - "z": [1, 2, 3], - "a": { - "x": ["4", 20.7], - "y": datetime(2020, 5, 18, 15, 16, 
56), + 'x': '4', + 'y': True, + 'z': [1, 2, 3], + 'a': { + 'x': ['4', 20.7], + 'y': datetime(2020, 5, 18, 15, 16, 56), }, - "b": [{"x": 1}, {"x": "4.2"}], + 'b': [{'x': 1}, {'x': '4.2'}], } ) assert prepared == { - "x": "4", - "y": True, - "z": [1, 2, 3], - "a": { - "x": ["4", 20.7], - "y": "2020-05-18T15:16:56.000000", + 'x': '4', + 'y': True, + 'z': [1, 2, 3], + 'a': { + 'x': ['4', 20.7], + 'y': '2020-05-18T15:16:56.000000', }, - "b": [{"x": 1}, {"x": "4.2"}], + 'b': [{'x': 1}, {'x': '4.2'}], } def test_prepare_field_name(): # not a str - assert prepare_field_name(5) == "5" + assert prepare_field_name(5) == '5' # a dot! - assert prepare_field_name("x.y") == "x_y" + assert prepare_field_name('x.y') == 'x_y' # padded with whitespace - assert prepare_field_name(" x ") == "x" + assert prepare_field_name(' x ') == 'x' # lots of dots - assert prepare_field_name(".x.y.z.1.2.") == "-x_y_z_1_2_" + assert prepare_field_name('.x.y.z.1.2.') == '-x_y_z_1_2_' # a mix of horrors - assert prepare_field_name("\nx.\ty\r \x07fowien") == "x_y fowien" + assert prepare_field_name('\nx.\ty\r \x07fowien') == 'x_y fowien' # an empty name - assert prepare_field_name("") == "-" + assert prepare_field_name('') == '-' # a name which becomes empty after removing all the junk - assert prepare_field_name(" \t \x07 ") == "-" + assert prepare_field_name(' \t \x07 ') == '-' # a starting _ is converted to a - - assert prepare_field_name("_beans") == "-beans" + assert prepare_field_name('_beans') == '-beans' class TestDiff: def test_equal(self): - base = {"x": "4"} - new = {"x": "4"} + base = {'x': '4'} + new = {'x': '4'} assert list(diff(base, new)) == [] def test_equal_is(self): - base = new = {"x": "4"} + base = new = {'x': '4'} assert list(diff(base, new)) == [] def test_not_dicts(self): with pytest.raises(DiffingTypeComparisonException): - list(diff(("1", "2", "3"), {"a": "4"})) + list(diff(('1', '2', '3'), {'a': '4'})) with pytest.raises(DiffingTypeComparisonException): - list(diff({"a": "4"}, ("1", "2", "3"))) + list(diff({'a': '4'}, ('1', '2', '3'))) with pytest.raises(DiffingTypeComparisonException): - list(diff("4", "beans")) + list(diff('4', 'beans')) def test_dict_new(self): - base = {"a": "4"} - new = {"a": "4", "b": "3"} - assert list(diff(base, new)) == [DiffOp(tuple(), {"dn": {"b": "3"}})] + base = {'a': '4'} + new = {'a': '4', 'b': '3'} + assert list(diff(base, new)) == [DiffOp(tuple(), {'dn': {'b': '3'}})] def test_dict_delete(self): - base = {"a": "4", "b": "3"} - new = {"a": "4"} - assert list(diff(base, new)) == [DiffOp(tuple(), {"dd": ["b"]})] + base = {'a': '4', 'b': '3'} + new = {'a': '4'} + assert list(diff(base, new)) == [DiffOp(tuple(), {'dd': ['b']})] def test_dict_change(self): - base = {"a": "4", "b": "3"} - new = {"a": "4", "b": "6"} - assert list(diff(base, new)) == [DiffOp(tuple(), {"dc": {"b": "6"}})] + base = {'a': '4', 'b': '3'} + new = {'a': '4', 'b': '6'} + assert list(diff(base, new)) == [DiffOp(tuple(), {'dc': {'b': '6'}})] def test_list_new(self): - base = {"a": ["1", "2", "3"]} - new = {"a": ["1", "2", "3", "4", "5"]} - assert list(diff(base, new)) == [DiffOp(("a",), {"ln": ["4", "5"]})] + base = {'a': ['1', '2', '3']} + new = {'a': ['1', '2', '3', '4', '5']} + assert list(diff(base, new)) == [DiffOp(('a',), {'ln': ['4', '5']})] def test_list_delete(self): - base = {"a": ["1", "2", "3", "4", "5"]} - new = {"a": ["1", "2", "3"]} - assert list(diff(base, new)) == [DiffOp(("a",), {"ld": 3})] + base = {'a': ['1', '2', '3', '4', '5']} + new = {'a': ['1', '2', '3']} + assert 
list(diff(base, new)) == [DiffOp(('a',), {'ld': 3})] def test_list_change(self): - base = {"a": ["1", "2", "3", "4", "5"]} - new = {"a": ["1", "2", "3", "10", "5"]} - assert list(diff(base, new)) == [DiffOp(("a",), {"lc": [(3, "10")]})] + base = {'a': ['1', '2', '3', '4', '5']} + new = {'a': ['1', '2', '3', '10', '5']} + assert list(diff(base, new)) == [DiffOp(('a',), {'lc': [(3, '10')]})] def test_list_with_embeds(self): base = { - "a": [{"y": "4"}, {"z": "5"}], - "b": [["1", "2", "3"], ["4", "5", "6"], ["7", "8", "9"]], + 'a': [{'y': '4'}, {'z': '5'}], + 'b': [['1', '2', '3'], ['4', '5', '6'], ['7', '8', '9']], } new = { - "a": [{"y": "4"}, {"z": "3"}], - "b": [["1", "10", "3"], ["4", "5", "6"], ["4", "8", "9"]], + 'a': [{'y': '4'}, {'z': '3'}], + 'b': [['1', '10', '3'], ['4', '5', '6'], ['4', '8', '9']], } assert list(diff(base, new)) == [ - DiffOp(path=("a", 1), ops={"dc": {"z": "3"}}), - DiffOp(path=("b", 0), ops={"lc": [(1, "10")]}), - DiffOp(path=("b", 2), ops={"lc": [(0, "4")]}), + DiffOp(path=('a', 1), ops={'dc': {'z': '3'}}), + DiffOp(path=('b', 0), ops={'lc': [(1, '10')]}), + DiffOp(path=('b', 2), ops={'lc': [(0, '4')]}), ] def test_dict_embeds(self): - base = {"a": {"b": "5", "c": "6"}} - new = {"a": {"a": "2", "c": "4"}} + base = {'a': {'b': '5', 'c': '6'}} + new = {'a': {'a': '2', 'c': '4'}} assert list(diff(base, new)) == [ - DiffOp(path=("a",), ops={"dn": {"a": "2"}, "dd": ["b"], "dc": {"c": "4"}}) + DiffOp(path=('a',), ops={'dn': {'a': '2'}, 'dd': ['b'], 'dc': {'c': '4'}}) ] @@ -217,71 +217,71 @@ def test_same(self): assert op is None assert len(more) == 0 - op, more = DictComparison(tuple(), {"a": 4}, {"a": 4}).compare() + op, more = DictComparison(tuple(), {'a': 4}, {'a': 4}).compare() assert op is None assert len(more) == 0 - base = {"a": 4} + base = {'a': 4} op, more = DictComparison(tuple(), base, base).compare() assert op is None assert len(more) == 0 def test_dn(self): - comp = DictComparison(tuple(), {"a": 4}, {"a": 4, "b": 3, "c": 8}) + comp = DictComparison(tuple(), {'a': 4}, {'a': 4, 'b': 3, 'c': 8}) op, more = comp.compare() assert op.path == tuple() - assert op.ops == {"dn": {"b": 3, "c": 8}} + assert op.ops == {'dn': {'b': 3, 'c': 8}} assert len(more) == 0 def test_dd(self): - comp = DictComparison(tuple(), {"a": 4, "b": 3, "c": 8}, {"a": 4}) + comp = DictComparison(tuple(), {'a': 4, 'b': 3, 'c': 8}, {'a': 4}) op, more = comp.compare() assert op.path == tuple() - assert op.ops == {"dd": ["b", "c"]} + assert op.ops == {'dd': ['b', 'c']} assert len(more) == 0 def test_dc(self): - comp = DictComparison(tuple(), {"a": 4, "b": 3}, {"a": 1, "b": 9}) + comp = DictComparison(tuple(), {'a': 4, 'b': 3}, {'a': 1, 'b': 9}) op, more = comp.compare() assert op.path == tuple() - assert op.ops == {"dc": {"a": 1, "b": 9}} + assert op.ops == {'dc': {'a': 1, 'b': 9}} assert len(more) == 0 def test_nested_dicts(self): # both dicts - op, more = DictComparison(("x",), {"a": {"x": 3}}, {"a": {"x": 4}}).compare() + op, more = DictComparison(('x',), {'a': {'x': 3}}, {'a': {'x': 4}}).compare() assert op is None - assert more == [DictComparison(("x", "a"), {"x": 3}, {"x": 4})] + assert more == [DictComparison(('x', 'a'), {'x': 3}, {'x': 4})] def test_nested_dict_and_not(self): # one a dict, one not - op, more = DictComparison(tuple(), {"a": {"x": 3}}, {"a": "x"}).compare() + op, more = DictComparison(tuple(), {'a': {'x': 3}}, {'a': 'x'}).compare() assert op.path == tuple() - assert op.ops == {"dc": {"a": "x"}} + assert op.ops == {'dc': {'a': 'x'}} assert len(more) == 0 # one not a 
dict, one a dict - op, more = DictComparison(tuple(), {"a": "x"}, {"a": {"x": 3}}).compare() + op, more = DictComparison(tuple(), {'a': 'x'}, {'a': {'x': 3}}).compare() assert op.path == tuple() - assert op.ops == {"dc": {"a": {"x": 3}}} + assert op.ops == {'dc': {'a': {'x': 3}}} assert len(more) == 0 def test_nested_lists(self): # both lists - op, more = DictComparison(("x",), {"a": [1, 2, 3]}, {"a": [1, 2, 4]}).compare() + op, more = DictComparison(('x',), {'a': [1, 2, 3]}, {'a': [1, 2, 4]}).compare() assert op is None - assert more == [ListComparison(("x", "a"), [1, 2, 3], [1, 2, 4])] + assert more == [ListComparison(('x', 'a'), [1, 2, 3], [1, 2, 4])] def test_nested_list_and_not(self): # both dicts - op, more = DictComparison(tuple(), {"a": [1, 2, 3]}, {"a": "x"}).compare() + op, more = DictComparison(tuple(), {'a': [1, 2, 3]}, {'a': 'x'}).compare() assert op.path == tuple() - assert op.ops == {"dc": {"a": "x"}} + assert op.ops == {'dc': {'a': 'x'}} assert len(more) == 0 - op, more = DictComparison(tuple(), {"a": "x"}, {"a": [1, 2, 3]}).compare() + op, more = DictComparison(tuple(), {'a': 'x'}, {'a': [1, 2, 3]}).compare() assert op.path == tuple() - assert op.ops == {"dc": {"a": [1, 2, 3]}} + assert op.ops == {'dc': {'a': [1, 2, 3]}} assert len(more) == 0 @@ -304,135 +304,135 @@ def test_ln(self): comp = ListComparison(tuple(), [1, 2, 3], [1, 2, 3, 4, 5]) op, more = comp.compare() assert op.path == tuple() - assert op.ops == {"ln": [4, 5]} + assert op.ops == {'ln': [4, 5]} assert len(more) == 0 def test_ld(self): comp = ListComparison(tuple(), [1, 2, 3, 4, 5], [1, 2, 3]) op, more = comp.compare() assert op.path == tuple() - assert op.ops == {"ld": 3} + assert op.ops == {'ld': 3} assert len(more) == 0 def test_lc(self): - comp = ListComparison(tuple(), [1, 2, 3, 4, 5], [1, 9, 3, "b", 5]) + comp = ListComparison(tuple(), [1, 2, 3, 4, 5], [1, 9, 3, 'b', 5]) op, more = comp.compare() assert op.path == tuple() - assert op.ops == {"lc": [(1, 9), (3, "b")]} + assert op.ops == {'lc': [(1, 9), (3, 'b')]} assert len(more) == 0 def test_nested_dicts(self): # both dicts - op, more = ListComparison(("x",), ["b", {"x": 1}], ["b", {"x": 2}]).compare() + op, more = ListComparison(('x',), ['b', {'x': 1}], ['b', {'x': 2}]).compare() assert op is None - assert more == [DictComparison(("x", 1), {"x": 1}, {"x": 2})] + assert more == [DictComparison(('x', 1), {'x': 1}, {'x': 2})] def test_nested_dict_and_not(self): # one a dict, one not - op, more = ListComparison(tuple(), ["b", {"x": 1}], ["b", "x"]).compare() + op, more = ListComparison(tuple(), ['b', {'x': 1}], ['b', 'x']).compare() assert op.path == tuple() - assert op.ops == {"lc": [(1, "x")]} + assert op.ops == {'lc': [(1, 'x')]} assert len(more) == 0 # one not a dict, one a dict - op, more = ListComparison(tuple(), ["b", "x"], ["b", {"x": 1}]).compare() + op, more = ListComparison(tuple(), ['b', 'x'], ['b', {'x': 1}]).compare() assert op.path == tuple() - assert op.ops == {"lc": [(1, {"x": 1})]} + assert op.ops == {'lc': [(1, {'x': 1})]} assert len(more) == 0 def test_nested_lists(self): # both lists op, more = ListComparison( - ("x",), [1, [9, 8, 7], 2], [1, [9, 8, 6], 2] + ('x',), [1, [9, 8, 7], 2], [1, [9, 8, 6], 2] ).compare() assert op is None - assert more == [ListComparison(("x", 1), [9, 8, 7], [9, 8, 6])] + assert more == [ListComparison(('x', 1), [9, 8, 7], [9, 8, 6])] def test_nested_list_and_not(self): # one a list, one not - op, more = ListComparison(tuple(), ["a", [1, 2, 3]], ["a", "x"]).compare() + op, more = ListComparison(tuple(), 
['a', [1, 2, 3]], ['a', 'x']).compare() assert op.path == tuple() - assert op.ops == {"lc": [(1, "x")]} + assert op.ops == {'lc': [(1, 'x')]} assert len(more) == 0 # one not a list, one a dict - op, more = ListComparison(tuple(), ["a", "x"], ["a", [1, 2, 3]]).compare() + op, more = ListComparison(tuple(), ['a', 'x'], ['a', [1, 2, 3]]).compare() assert op.path == tuple() - assert op.ops == {"lc": [(1, [1, 2, 3])]} + assert op.ops == {'lc': [(1, [1, 2, 3])]} assert len(more) == 0 # todo: make this a more complete, systematic set of scenarios patching_scenarios = [ # a basic example - ({"x": "4"}, {"x": "5"}), - ({"x": True}, {"x": 5}), + ({'x': '4'}, {'x': '5'}), + ({'x': True}, {'x': 5}), # basic lists # ld - ({"x": ["1", "2", "3"]}, {"x": ["1", "5"]}), - ({"x": [False, 5, "hello"]}, {"x": [True, 5]}), + ({'x': ['1', '2', '3']}, {'x': ['1', '5']}), + ({'x': [False, 5, 'hello']}, {'x': [True, 5]}), # ln - ({"x": ["1", "2", "3"]}, {"x": ["1", "2", "3", "4"]}), - ({"x": [1, 2, 3.4]}, {"x": [1, 2, 3.4, 4]}), + ({'x': ['1', '2', '3']}, {'x': ['1', '2', '3', '4']}), + ({'x': [1, 2, 3.4]}, {'x': [1, 2, 3.4, 4]}), # lc - ({"x": ["1", "2", "3"]}, {"x": ["1", "5", "3"]}), - ({"x": ["1", 2, "3"]}, {"x": ["1", 5, "3"]}), - ({"x": ["1", None, "3"]}, {"x": ["1", False, "3"]}), + ({'x': ['1', '2', '3']}, {'x': ['1', '5', '3']}), + ({'x': ['1', 2, '3']}, {'x': ['1', 5, '3']}), + ({'x': ['1', None, '3']}, {'x': ['1', False, '3']}), # basic dicts - ({"x": {"y": "5"}}, {"x": {"y": "6"}}), - ({"x": {"y": 5}}, {"x": {"y": 6}}), + ({'x': {'y': '5'}}, {'x': {'y': '6'}}), + ({'x': {'y': 5}}, {'x': {'y': 6}}), # dc - ({"x": "4"}, {"x": "6"}), - ({"x": "4"}, {"x": 6}), + ({'x': '4'}, {'x': '6'}), + ({'x': '4'}, {'x': 6}), # dn - ({"x": "4"}, {"x": "6", "y": "10"}), - ({"x": "4"}, {"x": 6, "y": False}), + ({'x': '4'}, {'x': '6', 'y': '10'}), + ({'x': '4'}, {'x': 6, 'y': False}), # dd - ({"x": "4", "y": "10"}, {"x": "4"}), - ({"x": 4.523, "y": "10"}, {"x": 4.523}), + ({'x': '4', 'y': '10'}, {'x': '4'}), + ({'x': 4.523, 'y': '10'}, {'x': 4.523}), # list becomes str - ({"x": ["1", "2", "3"]}, {"x": "543"}), + ({'x': ['1', '2', '3']}, {'x': '543'}), # dict becomes str - ({"x": {"y": "4"}}, {"x": "543"}), + ({'x': {'y': '4'}}, {'x': '543'}), # str becomes list - ({"x": "543"}, {"x": ["1", "2", "3"]}), + ({'x': '543'}, {'x': ['1', '2', '3']}), # str becomes dict - ({"x": "543"}, {"x": {"y": "4"}}), + ({'x': '543'}, {'x': {'y': '4'}}), # list becomes dict - ({"x": ["1", "2", "3"]}, {"x": {"y": "1"}}), + ({'x': ['1', '2', '3']}, {'x': {'y': '1'}}), # dict becomes list - ({"x": {"y": "1"}}, {"x": ["1", "2", "3"]}), + ({'x': {'y': '1'}}, {'x': ['1', '2', '3']}), # dict becomes list in dict - ({"x": {"y": {"z": "43"}}}, {"x": {"y": ["1", "2", "3"]}}), + ({'x': {'y': {'z': '43'}}}, {'x': {'y': ['1', '2', '3']}}), # list of lists ( - {"x": [["1", "2", "3"], ["4", "5", "6"], ["7", "8", "9"]]}, - {"x": [["1", "2", "4"], ["4", "10", "6"], ["0", "8", "9"]]}, + {'x': [['1', '2', '3'], ['4', '5', '6'], ['7', '8', '9']]}, + {'x': [['1', '2', '4'], ['4', '10', '6'], ['0', '8', '9']]}, ), # list of dicts - ({"x": [{"y": "5"}, {"y": "7"}]}, {"x": [{"y": "3"}, {"y": "7"}]}), + ({'x': [{'y': '5'}, {'y': '7'}]}, {'x': [{'y': '3'}, {'y': '7'}]}), # list of dicts with lists and changing types ( - {"x": [{"y": ["1", "2", "3"]}, {"y": ["7", "8"]}]}, - {"x": [{"y": "nope"}, {"y": ["3", "8"]}]}, + {'x': [{'y': ['1', '2', '3']}, {'y': ['7', '8']}]}, + {'x': [{'y': 'nope'}, {'y': ['3', '8']}]}, ), # list of lists becomes list of not-lists (and vice 
versa) ( - {"x": [["1", "2", "3"], ["4", "5", "6"], ["7", "8", "9"]]}, - {"x": ["not", "a", "tuple"]}, + {'x': [['1', '2', '3'], ['4', '5', '6'], ['7', '8', '9']]}, + {'x': ['not', 'a', 'tuple']}, ), ( - {"x": ["not", "a", "tuple"]}, - {"x": [["1", "2", "3"], ["4", "5", "6"], ["7", "8", "9"]]}, + {'x': ['not', 'a', 'tuple']}, + {'x': [['1', '2', '3'], ['4', '5', '6'], ['7', '8', '9']]}, ), ] class TestPatch: def test_empty(self): - assert patch({"c": "4"}, []) == {"c": "4"} + assert patch({'c': '4'}, []) == {'c': '4'} - @pytest.mark.parametrize(("base", "new"), patching_scenarios) + @pytest.mark.parametrize(('base', 'new'), patching_scenarios) def test_patching(self, base: dict, new: dict): diff_ops = list(diff(base, new)) patched_base = patch(base, diff_ops) diff --git a/tests/test_ingest.py b/tests/test_ingest.py index 8ace857..8e1d77b 100644 --- a/tests/test_ingest.py +++ b/tests/test_ingest.py @@ -11,11 +11,11 @@ def create_random_record() -> Record: - return Record(str(uuid4()), {"x": randint(0, 1000), "y": str(uuid4())}) + return Record(str(uuid4()), {'x': randint(0, 1000), 'y': str(uuid4())}) def commit_helper(data_collection: Collection, version: int): - data_collection.update_many({"version": None}, {"$set": {"version": version}}) + data_collection.update_many({'version': None}, {'$set': {'version': version}}) class TestGenerateOps: @@ -33,36 +33,36 @@ def test_with_all_new_unique_records(self, mongo_collection: Collection): assert all(isinstance(op, InsertOne) for op in ops) for record, op in zip(records, ops): - assert op._doc["id"] == record.id - assert op._doc["version"] is None - assert op._doc["data"] == prepare_record_data(record) + assert op._doc['id'] == record.id + assert op._doc['version'] is None + assert op._doc['data'] == prepare_record_data(record) def test_with_all_new_but_some_repeating_records( self, mongo_collection: Collection ): records = [ - Record("6", {"x": 81}), - Record("1", {"x": 5}), - Record("1", {"x": 6}), - Record("2", {"x": 5}), - Record("1", {"x": 7}), + Record('6', {'x': 81}), + Record('1', {'x': 5}), + Record('1', {'x': 6}), + Record('2', {'x': 5}), + Record('1', {'x': 7}), ] ops = list(generate_ops(mongo_collection, records)) assert len(ops) == 3 for record, op in zip((records[0], records[4], records[3]), ops): - assert op._doc["id"] == record.id - assert op._doc["version"] is None - assert op._doc["data"] == prepare_record_data(record) + assert op._doc['id'] == record.id + assert op._doc['version'] is None + assert op._doc['data'] == prepare_record_data(record) def test_update_existing_records(self, mongo_collection: Collection): # add some records old_version = 4 old_records = [ - Record("1", {"x": 1}), - Record("2", {"x": 2}), - Record("3", {"x": 3}), + Record('1', {'x': 1}), + Record('2', {'x': 2}), + Record('3', {'x': 3}), ] mongo_collection.bulk_write(list(generate_ops(mongo_collection, old_records))) commit_helper(mongo_collection, old_version) @@ -71,10 +71,10 @@ def test_update_existing_records(self, mongo_collection: Collection): # update some of them new_records = [ # no update to record 1 - Record("1", {"x": 1}), + Record('1', {'x': 1}), # updates to records 2 and 3 though - Record("2", {"x": 10}), - Record("3", {"x": 185}), + Record('2', {'x': 10}), + Record('3', {'x': 185}), ] ops = list(generate_ops(mongo_collection, new_records)) mongo_collection.bulk_write(ops) @@ -84,16 +84,16 @@ def test_update_existing_records(self, mongo_collection: Collection): # all the ops should be UpdateOnes assert all(isinstance(op, UpdateOne) for op 
in ops) # there should be 2 changed records - assert mongo_collection.count_documents({"version": None}) == 2 + assert mongo_collection.count_documents({'version': None}) == 2 for new_record, old_record in zip(new_records[1:], old_records[1:]): # sanity check assert new_record.id == old_record.id - doc = mongo_collection.find_one({"id": new_record.id}) - assert doc["version"] is None - assert doc["data"] == prepare_record_data(new_record) + doc = mongo_collection.find_one({'id': new_record.id}) + assert doc['version'] is None + assert doc['data'] == prepare_record_data(new_record) # to compare the diff we have to convert the tuples into lists - assert doc["diffs"][str(old_version)] == [ + assert doc['diffs'][str(old_version)] == [ [list(diff_op.path), diff_op.ops] for diff_op in diff( prepare_record_data(new_record), prepare_record_data(old_record) @@ -109,9 +109,9 @@ def test_lots_of_records(self, mongo_collection: Collection): assert all(isinstance(op, InsertOne) for op in ops) for record, op in zip(records, ops): - assert op._doc["id"] == record.id - assert op._doc["version"] is None - assert op._doc["data"] == prepare_record_data(record) + assert op._doc['id'] == record.id + assert op._doc['version'] is None + assert op._doc['data'] == prepare_record_data(record) def test_delete_of_non_existent_record(self, mongo_collection: Collection): records = [Record.new({})] @@ -120,7 +120,7 @@ def test_delete_of_non_existent_record(self, mongo_collection: Collection): def test_handling_uncommitted(self, mongo_collection: Collection): # add some data without committing it - record = Record("1", {"x": 4}) + record = Record('1', {'x': 4}) mongo_collection.bulk_write(list(generate_ops(mongo_collection, [record]))) assert mongo_collection.count_documents({}) == 1 @@ -128,15 +128,15 @@ def test_handling_uncommitted(self, mongo_collection: Collection): assert not list(generate_ops(mongo_collection, [record])) # change the data without committing it - record.data["x"] = 5 + record.data['x'] = 5 mongo_collection.bulk_write(list(generate_ops(mongo_collection, [record]))) assert mongo_collection.count_documents({}) == 1 - assert mongo_collection.find_one({"id": "1"})["data"] == { - DATA_ID_FIELD: "1", - "x": 5, + assert mongo_collection.find_one({'id': '1'})['data'] == { + DATA_ID_FIELD: '1', + 'x': 5, } - assert mongo_collection.find_one({"id": "1"})["version"] is None - assert "diffs" not in mongo_collection.find_one({"id": "1"}) + assert mongo_collection.find_one({'id': '1'})['version'] is None + assert 'diffs' not in mongo_collection.find_one({'id': '1'}) # delete the uncommitted data record.data = {} @@ -145,32 +145,32 @@ def test_handling_uncommitted(self, mongo_collection: Collection): def test_handling_uncommitted_with_diffs(self, mongo_collection: Collection): # add some data and commit it - record = Record("1", {"x": 4}) + record = Record('1', {'x': 4}) mongo_collection.bulk_write(list(generate_ops(mongo_collection, [record]))) commit_helper(mongo_collection, 6) # change the data without committing it - record.data["x"] = 5 + record.data['x'] = 5 mongo_collection.bulk_write(list(generate_ops(mongo_collection, [record]))) # change the data back to the previous version's data - record.data["x"] = 4 + record.data['x'] = 4 mongo_collection.bulk_write(list(generate_ops(mongo_collection, [record]))) assert mongo_collection.count_documents({}) == 1 - assert mongo_collection.find_one({"id": "1"})["data"] == { - DATA_ID_FIELD: "1", - "x": 4, + assert mongo_collection.find_one({'id': '1'})['data'] == { + 
DATA_ID_FIELD: '1', + 'x': 4, } - assert mongo_collection.find_one({"id": "1"})["version"] == 6 - assert "diffs" not in mongo_collection.find_one({"id": "1"}) + assert mongo_collection.find_one({'id': '1'})['version'] == 6 + assert 'diffs' not in mongo_collection.find_one({'id': '1'}) def test_modified_is_ignored_when_provided(self, mongo_collection: Collection): # add some records old_version = 4 old_records = [ - Record("1", {"x": 1, "modified": "2024-02-22T15:11:03+00:00"}), - Record("2", {"x": 2, "modified": "2021-02-22T15:12:07+00:00"}), - Record("3", {"x": 3, "modified": "2021-02-22T15:27:32+00:00"}), + Record('1', {'x': 1, 'modified': '2024-02-22T15:11:03+00:00'}), + Record('2', {'x': 2, 'modified': '2021-02-22T15:12:07+00:00'}), + Record('3', {'x': 3, 'modified': '2021-02-22T15:27:32+00:00'}), ] mongo_collection.bulk_write(list(generate_ops(mongo_collection, old_records))) commit_helper(mongo_collection, old_version) @@ -179,12 +179,12 @@ def test_modified_is_ignored_when_provided(self, mongo_collection: Collection): # update some of them new_records = [ # no update to record 1 (modified is different, but other data is the same) - Record("1", {"x": 1, "modified": "2024-02-22T16:11:03+00:00"}), + Record('1', {'x': 1, 'modified': '2024-02-22T16:11:03+00:00'}), # updates to records 2 and 3 though (both modified and x have changed) - Record("2", {"x": 10, "modified": "2024-02-22T16:11:03+00:00"}), - Record("3", {"x": 185, "modified": "2024-02-22T16:11:03+00:00"}), + Record('2', {'x': 10, 'modified': '2024-02-22T16:11:03+00:00'}), + Record('3', {'x': 185, 'modified': '2024-02-22T16:11:03+00:00'}), ] - ops = list(generate_ops(mongo_collection, new_records, "modified")) + ops = list(generate_ops(mongo_collection, new_records, 'modified')) mongo_collection.bulk_write(ops) # number of records shouldn't have changed @@ -192,16 +192,16 @@ def test_modified_is_ignored_when_provided(self, mongo_collection: Collection): # all the ops should be UpdateOnes assert all(isinstance(op, UpdateOne) for op in ops) # there should be 2 changed records - assert mongo_collection.count_documents({"version": None}) == 2 + assert mongo_collection.count_documents({'version': None}) == 2 for new_record, old_record in zip(new_records[1:], old_records[1:]): # sanity check assert new_record.id == old_record.id - doc = mongo_collection.find_one({"id": new_record.id}) - assert doc["version"] is None - assert doc["data"] == prepare_record_data(new_record) + doc = mongo_collection.find_one({'id': new_record.id}) + assert doc['version'] is None + assert doc['data'] == prepare_record_data(new_record) # to compare the diff we have to convert the tuples into lists - assert doc["diffs"][str(old_version)] == [ + assert doc['diffs'][str(old_version)] == [ [list(diff_op.path), diff_op.ops] for diff_op in diff( prepare_record_data(new_record), prepare_record_data(old_record) diff --git a/tests/test_locking.py b/tests/test_locking.py index 34a07f3..513f25c 100644 --- a/tests/test_locking.py +++ b/tests/test_locking.py @@ -5,76 +5,76 @@ from freezegun import freeze_time from pymongo.collection import Collection -from splitgill.locking import LockManager, AlreadyLocked +from splitgill.locking import AlreadyLocked, LockManager class TestLockManager: def test_is_locked(self, mongo_collection: Collection): lock_manager = LockManager(mongo_collection) - assert lock_manager.acquire("test") - assert lock_manager.is_locked("test") - assert not lock_manager.is_locked("a different lock") - lock_manager.release("test") - assert not 
lock_manager.is_locked("test") + assert lock_manager.acquire('test') + assert lock_manager.is_locked('test') + assert not lock_manager.is_locked('a different lock') + lock_manager.release('test') + assert not lock_manager.is_locked('test') def test_acquired_ok(self, mongo_collection: Collection): lock_manager = LockManager(mongo_collection) - assert lock_manager.acquire("test") - lock_manager.release("test") - assert not lock_manager.is_locked("test") + assert lock_manager.acquire('test') + lock_manager.release('test') + assert not lock_manager.is_locked('test') - @freeze_time("2024-03-20 21:27:23") + @freeze_time('2024-03-20 21:27:23') def test_get_metadata(self, mongo_collection: Collection): lock_manager = LockManager(mongo_collection) - data = {"under_test": True, "number": 4, "banana": [1, 2, 3]} - lock_manager.acquire("test", **data) - metadata = lock_manager.get_metadata("test") + data = {'under_test': True, 'number': 4, 'banana': [1, 2, 3]} + lock_manager.acquire('test', **data) + metadata = lock_manager.get_metadata('test') # check there is an _id and also remove it so that we can check the metadata # without it - assert metadata.pop("_id", False) + assert metadata.pop('_id', False) assert metadata == { - "lock_id": "test", - "locked_by": platform.node(), - "locked_at": datetime(2024, 3, 20, 21, 27, 23), - "data": data, + 'lock_id': 'test', + 'locked_by': platform.node(), + 'locked_at': datetime(2024, 3, 20, 21, 27, 23), + 'data': data, } - lock_manager.release("test") - assert not lock_manager.is_locked("test") + lock_manager.release('test') + assert not lock_manager.is_locked('test') def test_acquired_double_lock(self, mongo_collection: Collection): lock_manager = LockManager(mongo_collection) - assert lock_manager.acquire("test") - assert not lock_manager.acquire("test") - lock_manager.release("test") - assert not lock_manager.is_locked("test") + assert lock_manager.acquire('test') + assert not lock_manager.acquire('test') + lock_manager.release('test') + assert not lock_manager.is_locked('test') def test_acquired_double_lock_raise(self, mongo_collection: Collection): lock_manager = LockManager(mongo_collection) - assert lock_manager.acquire("test") + assert lock_manager.acquire('test') with pytest.raises(AlreadyLocked): - assert lock_manager.acquire("test", raise_on_fail=True) - lock_manager.release("test") - assert not lock_manager.is_locked("test") + assert lock_manager.acquire('test', raise_on_fail=True) + lock_manager.release('test') + assert not lock_manager.is_locked('test') def test_lock(self, mongo_collection: Collection): lock_manager = LockManager(mongo_collection) - with lock_manager.lock("test", wow="pop!"): - assert lock_manager.is_locked("test") - assert lock_manager.get_metadata("test")["data"]["wow"] == "pop!" - assert not lock_manager.is_locked("test") + with lock_manager.lock('test', wow='pop!'): + assert lock_manager.is_locked('test') + assert lock_manager.get_metadata('test')['data']['wow'] == 'pop!' 
+ assert not lock_manager.is_locked('test') def test_lock_already_locked(self, mongo_collection: Collection): lock_manager = LockManager(mongo_collection) - assert lock_manager.acquire("test") + assert lock_manager.acquire('test') with pytest.raises(AlreadyLocked): - with lock_manager.lock("test"): + with lock_manager.lock('test'): pass - lock_manager.release("test") - assert not lock_manager.is_locked("test") + lock_manager.release('test') + assert not lock_manager.is_locked('test') def test_lock_interrupted(self, mongo_collection: Collection): lock_manager = LockManager(mongo_collection) - with pytest.raises(Exception, match="oh no!"): - with lock_manager.lock("test"): - raise Exception("oh no!") - assert not lock_manager.is_locked("test") + with pytest.raises(Exception, match='oh no!'): + with lock_manager.lock('test'): + raise Exception('oh no!') + assert not lock_manager.is_locked('test') diff --git a/tests/test_manager.py b/tests/test_manager.py index ac72d34..5289c01 100644 --- a/tests/test_manager.py +++ b/tests/test_manager.py @@ -35,25 +35,25 @@ class TestSplitgillClient: def test_database(self, splitgill: SplitgillClient): - assert splitgill.get_mongo_database().name == "sg" + assert splitgill.get_mongo_database().name == 'sg' def test_custom_database( self, mongo_client: MongoClient, elasticsearch_client: Elasticsearch ): client = SplitgillClient( - mongo_client, elasticsearch_client, mongo_database_name="test" + mongo_client, elasticsearch_client, mongo_database_name='test' ) - assert client.get_mongo_database().name == "test" + assert client.get_mongo_database().name == 'test' def test_get_data_collection(self, splitgill: SplitgillClient): - name = "test" - assert splitgill.get_data_collection(name).name == f"data-{name}" + name = 'test' + assert splitgill.get_data_collection(name).name == f'data-{name}' def test_get_options_collection(self, splitgill: SplitgillClient): assert splitgill.get_options_collection().name == OPTIONS_COLLECTION_NAME def test_get_database(self, splitgill: SplitgillClient): - name = "test" + name = 'test' assert ( splitgill.get_database(name).name == SplitgillDatabase(name, splitgill).name ) @@ -61,46 +61,46 @@ def test_get_database(self, splitgill: SplitgillClient): class TestSplitgillDatabaseGetCommittedVersion: def test_no_data_no_options(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) assert database.get_committed_version() is None def test_uncommitted_data(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) records = [ - Record.new({"x": 4}), - Record.new({"x": 89}), - Record.new({"x": 5}), + Record.new({'x': 4}), + Record.new({'x': 89}), + Record.new({'x': 5}), ] database.ingest(records, commit=False) assert database.get_committed_version() is None - @freeze_time("2012-01-14 12:00:01") + @freeze_time('2012-01-14 12:00:01') def test_committed_data(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) records = [ - Record.new({"x": 4}), - Record.new({"x": 89}), - Record.new({"x": 5}), + Record.new({'x': 4}), + Record.new({'x': 89}), + Record.new({'x': 5}), ] database.ingest(records, commit=True) assert database.get_committed_version() == 1326542401000 - @freeze_time("2012-01-14 12:00:01") + @freeze_time('2012-01-14 12:00:01') def test_mixed_data(self, splitgill: SplitgillClient): - database = 
SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) version = 1326542401000 records = [ - Record.new({"x": 4}), - Record.new({"x": 89}), - Record.new({"x": 5}), + Record.new({'x': 4}), + Record.new({'x': 89}), + Record.new({'x': 5}), ] database.ingest(records, commit=True) assert database.get_committed_version() == version more_records = [ # this one is new - Record.new({"x": 1}), + Record.new({'x': 1}), # this one is an update to one of the ones above - Record(records[0].id, {"x": 100}), + Record(records[0].id, {'x': 100}), ] database.ingest(more_records, commit=False) assert database.get_committed_version() == version @@ -108,74 +108,74 @@ def test_mixed_data(self, splitgill: SplitgillClient): def test_uncommitted_options( self, splitgill: SplitgillClient, basic_options: ParsingOptions ): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) database.update_options(basic_options, commit=False) assert database.get_committed_version() is None - @freeze_time("2012-01-14 12:00:01") + @freeze_time('2012-01-14 12:00:01') def test_committed_options( self, splitgill: SplitgillClient, basic_options: ParsingOptions ): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) database.update_options(basic_options, commit=True) assert database.get_committed_version() == 1326542401000 - @freeze_time("2012-01-14 12:00:01") + @freeze_time('2012-01-14 12:00:01') def test_mixed_options( self, splitgill: SplitgillClient, basic_options_builder: ParsingOptionsBuilder ): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) version = 1326542401000 options = basic_options_builder.build() database.update_options(options, commit=True) assert database.get_committed_version() == version - new_options = basic_options_builder.with_true_value("aye").build() + new_options = basic_options_builder.with_true_value('aye').build() database.update_options(new_options, commit=False) assert database.get_committed_version() == version def test_mixed_both( self, splitgill: SplitgillClient, basic_options_builder: ParsingOptionsBuilder ): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) records = [ - Record.new({"x": 4}), - Record.new({"x": 89}), - Record.new({"x": 5}), + Record.new({'x': 4}), + Record.new({'x': 89}), + Record.new({'x': 5}), ] database.ingest(records, commit=False) database.update_options(basic_options_builder.build(), commit=False) # add the new stuff - with freeze_time("2012-01-14 12:00:01"): + with freeze_time('2012-01-14 12:00:01'): version = database.commit() assert database.get_committed_version() == version # update the records - with freeze_time("2012-01-14 12:00:05"): - new_records = [Record.new({"x": 4})] + with freeze_time('2012-01-14 12:00:05'): + new_records = [Record.new({'x': 4})] database.ingest(new_records, commit=True) assert database.get_committed_version() == 1326542405000 # update the options - with freeze_time("2012-01-14 12:00:09"): - new_options = basic_options_builder.with_true_value("aye").build() + with freeze_time('2012-01-14 12:00:09'): + new_options = basic_options_builder.with_true_value('aye').build() database.update_options(new_options, commit=True) assert database.get_committed_version() == 1326542409000 class TestGetElasticsearchVersion: def test_no_docs(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = 
SplitgillDatabase('test', splitgill) assert database.get_elasticsearch_version() is None def test_with_docs(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) versions = [] for _ in range(5): versions.append( - database.ingest([Record.new({"x": 4})], commit=True).version + database.ingest([Record.new({'x': 4})], commit=True).version ) # just to ensure the versions are different have a nap. They will be, cause # Python slow, but this guarantees it @@ -192,12 +192,12 @@ def test_with_deletes(self, splitgill: SplitgillClient): # of data or a delete (ensuring that we are getting the latest version from both # the version and the next fields) - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) versions = [] for _ in range(5): versions.append( - database.ingest([Record.new({"x": 4})], commit=True).version + database.ingest([Record.new({'x': 4})], commit=True).version ) # just to ensure the versions are different have a nap. They will be, cause # Python slow, but this guarantees it @@ -214,42 +214,42 @@ def test_with_deletes(self, splitgill: SplitgillClient): class TestCommit: def test_nothing_to_commit(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) assert database.commit() is None - @freeze_time("2012-01-14 12:00:01") + @freeze_time('2012-01-14 12:00:01') def test_new_records(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) - database.ingest([Record.new({"x": 5})], commit=False) + database = SplitgillDatabase('test', splitgill) + database.ingest([Record.new({'x': 5})], commit=False) assert database.commit() == 1326542401000 - @freeze_time("2012-01-14 12:00:01") + @freeze_time('2012-01-14 12:00:01') def test_new_options( self, splitgill: SplitgillClient, basic_options: ParsingOptions ): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) database.update_options(basic_options, commit=False) assert database.commit() == 1326542401000 - @freeze_time("2012-01-14 12:00:01") + @freeze_time('2012-01-14 12:00:01') def test_both(self, splitgill: SplitgillClient, basic_options: ParsingOptions): - database = SplitgillDatabase("test", splitgill) - database.ingest([Record.new({"x": 5})], commit=False) + database = SplitgillDatabase('test', splitgill) + database.ingest([Record.new({'x': 5})], commit=False) database.update_options(basic_options, commit=False) assert database.commit() == 1326542401000 class TestIngest: def test_no_records(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) database.ingest([]) assert database.get_committed_version() is None - @freeze_time("2012-01-14 12:00:01") + @freeze_time('2012-01-14 12:00:01') def test_with_records(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) count = 103 - record_iter = (Record.new({"x": i}) for i in range(count)) + record_iter = (Record.new({'x': i}) for i in range(count)) database.ingest(record_iter, commit=True) @@ -257,69 +257,69 @@ def test_with_records(self, splitgill: SplitgillClient): assert database.get_committed_version() == 1326542401000 def test_same_record(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) - record = Record("r1", {"x": 5, "y": 
False, "z": [1, 2, 3]}) + database = SplitgillDatabase('test', splitgill) + record = Record('r1', {'x': 5, 'y': False, 'z': [1, 2, 3]}) database.ingest([record], commit=True) - added_record = database.data_collection.find_one({"id": "r1"}) - assert added_record["data"] == {**record.data, DATA_ID_FIELD: "r1"} - assert "diffs" not in added_record + added_record = database.data_collection.find_one({'id': 'r1'}) + assert added_record['data'] == {**record.data, DATA_ID_FIELD: 'r1'} + assert 'diffs' not in added_record # add the same record again database.ingest([record], commit=True) - added_record_again = database.data_collection.find_one({"id": "r1"}) + added_record_again = database.data_collection.find_one({'id': 'r1'}) assert added_record == added_record_again def test_same_record_tuples_and_lists(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) - record = Record("r1", {"x": (1, 2, 3)}) - clean_data = {"x": [1, 2, 3], DATA_ID_FIELD: "r1"} + database = SplitgillDatabase('test', splitgill) + record = Record('r1', {'x': (1, 2, 3)}) + clean_data = {'x': [1, 2, 3], DATA_ID_FIELD: 'r1'} database.ingest([record], commit=True) - added_record = database.data_collection.find_one({"id": "r1"}) - assert added_record["data"] == clean_data - assert "diffs" not in added_record + added_record = database.data_collection.find_one({'id': 'r1'}) + assert added_record['data'] == clean_data + assert 'diffs' not in added_record # add the same record again database.ingest([record], commit=True) - added_record_again = database.data_collection.find_one({"id": "r1"}) + added_record_again = database.data_collection.find_one({'id': 'r1'}) assert added_record == added_record_again # add the same record again with a list instead of a tuple this time record.data = clean_data database.ingest([record], commit=True) - added_record_again = database.data_collection.find_one({"id": "r1"}) + added_record_again = database.data_collection.find_one({'id': 'r1'}) assert added_record == added_record_again - @freeze_time("2012-01-14 12:00:01") + @freeze_time('2012-01-14 12:00:01') def test_commit_and_is_default(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) - database.ingest([Record.new({"x": 10})]) + database.ingest([Record.new({'x': 10})]) assert database.get_committed_version() == 1326542401000 def test_no_commit(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) - database.ingest([Record.new({"x": 10})], commit=False) + database.ingest([Record.new({'x': 10})], commit=False) assert database.get_committed_version() is None def test_no_commit_when_error(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) # force bulk write to error when called - database.data_collection.bulk_write = MagicMock(side_effect=Exception("oh no!")) + database.data_collection.bulk_write = MagicMock(side_effect=Exception('oh no!')) - with pytest.raises(Exception, match="oh no!"): - database.ingest([Record.new({"x": 10})], commit=True) + with pytest.raises(Exception, match='oh no!'): + database.ingest([Record.new({'x': 10})], commit=True) assert database.get_committed_version() is None class TestSync: def test_nothing_to_sync(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) result = 
database.sync() assert not splitgill.elasticsearch.indices.exists(index=database.indices.latest) @@ -327,9 +327,9 @@ def test_nothing_to_sync(self, splitgill: SplitgillClient): assert not result.indices def test_everything_to_sync_many_workers(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) - records = [Record.new({"x": i}) for i in range(1000)] + records = [Record.new({'x': i}) for i in range(1000)] database.ingest(records, commit=True) # these are silly numbers, but it'll make sure it works at least! @@ -342,22 +342,22 @@ def test_everything_to_sync_many_workers(self, splitgill: SplitgillClient): assert result.indices == [database.indices.latest] def test_one_sync_then_another(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) version_1_time = datetime(2020, 7, 2, tzinfo=timezone.utc) version_1_records = [ - Record("r1", {"x": 5}), - Record("r2", {"x": 10}), - Record("r3", {"x": 15}), - Record("r4", {"x": -1}), - Record("r5", {"x": 1098}), + Record('r1', {'x': 5}), + Record('r2', {'x': 10}), + Record('r3', {'x': 15}), + Record('r4', {'x': -1}), + Record('r5', {'x': 1098}), ] # add some records at a specific version with freeze_time(version_1_time): database.ingest(version_1_records, commit=True) database.sync() assert ( - splitgill.elasticsearch.count(index=database.indices.latest)["count"] == 5 + splitgill.elasticsearch.count(index=database.indices.latest)['count'] == 5 ) assert database.get_current_indices() == [database.indices.latest] @@ -365,9 +365,9 @@ def test_one_sync_then_another(self, splitgill: SplitgillClient): version_2_time = datetime(2020, 7, 3, tzinfo=timezone.utc) version_2_records = [ # a new record - Record("another", {"x": 7}), + Record('another', {'x': 7}), # an update to the first record in the version 1 set of records - Record("r1", {"x": 6}), + Record('r1', {'x': 6}), ] # update the records with freeze_time(version_2_time): @@ -390,15 +390,15 @@ def test_one_sync_then_another(self, splitgill: SplitgillClient): ] def test_sync_with_delete(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) version_1_time = datetime(2020, 7, 2, tzinfo=timezone.utc) version_1_records = [ - Record("r1", {"x": 5}), - Record("r2", {"x": 10}), - Record("r3", {"x": 15}), - Record("r4", {"x": -1}), - Record("r5", {"x": 1098}), + Record('r1', {'x': 5}), + Record('r2', {'x': 10}), + Record('r3', {'x': 15}), + Record('r4', {'x': -1}), + Record('r5', {'x': 1098}), ] # add some records at a specific version with freeze_time(version_1_time): @@ -413,9 +413,9 @@ def test_sync_with_delete(self, splitgill: SplitgillClient): version_2_time = datetime(2020, 7, 3, tzinfo=timezone.utc) version_2_records = [ # a new record - Record("another", {"x": 7}), + Record('another', {'x': 7}), # a delete to the second record in the version 1 set of records - Record("r2", {}), + Record('r2', {}), ] # update the records with freeze_time(version_2_time): @@ -436,25 +436,25 @@ def test_sync_with_delete(self, splitgill: SplitgillClient): # the second record shouldn't be in the latest index assert not splitgill.elasticsearch.exists( - id="r2", index=database.indices.latest + id='r2', index=database.indices.latest ) # but it should be in the old index assert ( database.search(SearchVersion.all) - .filter(id_query("r2")) + .filter(id_query('r2')) 
.filter(version_query(to_timestamp(version_1_time))) .count() == 1 ) def test_sync_delete_non_existent(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) database.ingest( [ - Record.new({"x": 5}), - Record.new({"x": 10}), - Record.new({"x": 15}), + Record.new({'x': 5}), + Record.new({'x': 10}), + Record.new({'x': 15}), # a delete Record.new({}), ], @@ -476,19 +476,19 @@ def mock_parse(*args, **kwargs): if called < 4: return parse(*args, **kwargs) else: - raise Exception("Something went wrong... on purpose!") + raise Exception('Something went wrong... on purpose!') - with patch("splitgill.indexing.index.parse", side_effect=mock_parse): - database = SplitgillDatabase("test", splitgill) + with patch('splitgill.indexing.index.parse', side_effect=mock_parse): + database = SplitgillDatabase('test', splitgill) records = [ - Record.new({"x": 5}), - Record.new({"x": 10}), - Record.new({"x": 15}), - Record.new({"x": 8}), + Record.new({'x': 5}), + Record.new({'x': 10}), + Record.new({'x': 15}), + Record.new({'x': 8}), ] database.ingest(records) - with pytest.raises(Exception, match="Something went wrong... on purpose!"): + with pytest.raises(Exception, match='Something went wrong... on purpose!'): # add them one at a time so that some docs actually get to elasticsearch database.sync(BulkOptions(chunk_size=1)) @@ -500,8 +500,8 @@ def mock_parse(*args, **kwargs): assert ( splitgill.elasticsearch.indices.get_settings(index=database.indices.latest)[ database.indices.latest - ]["settings"]["index"]["refresh_interval"] - == "-1" + ]['settings']['index']['refresh_interval'] + == '-1' ) # run another sync which doesn't error (we're outside of the patch context) @@ -515,8 +515,8 @@ def mock_parse(*args, **kwargs): assert ( splitgill.elasticsearch.indices.get_settings(index=database.indices.latest)[ database.indices.latest - ]["settings"]["index"].get("refresh_interval") - != "-1" + ]['settings']['index'].get('refresh_interval') + != '-1' ) def test_incomplete_is_not_searchable_until_refresh( @@ -532,19 +532,19 @@ def mock_parse(*args, **kwargs): if called < 4: return parse(*args, **kwargs) else: - raise Exception("Something went wrong... on purpose!") + raise Exception('Something went wrong... on purpose!') - with patch("splitgill.indexing.index.parse", side_effect=mock_parse): - database = SplitgillDatabase("test", splitgill) + with patch('splitgill.indexing.index.parse', side_effect=mock_parse): + database = SplitgillDatabase('test', splitgill) records = [ - Record.new({"x": 5}), - Record.new({"x": 10}), - Record.new({"x": 15}), - Record.new({"x": 8}), + Record.new({'x': 5}), + Record.new({'x': 10}), + Record.new({'x': 15}), + Record.new({'x': 8}), ] database.ingest(records) - with pytest.raises(Exception, match="Something went wrong... on purpose!"): + with pytest.raises(Exception, match='Something went wrong... 
on purpose!'): # add them one at a time so that some docs actually get to elasticsearch database.sync( BulkOptions(worker_count=1, chunk_size=1, buffer_multiplier=1) @@ -557,13 +557,13 @@ def mock_parse(*args, **kwargs): assert database.search().count() < 4 def test_resync_rogue_deletions(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) records = [ - Record.new({"x": 5}), - Record.new({"x": 10}), - Record.new({"x": 15}), - Record.new({"x": -1}), - Record.new({"x": 1098}), + Record.new({'x': 5}), + Record.new({'x': 10}), + Record.new({'x': 15}), + Record.new({'x': -1}), + Record.new({'x': 1098}), ] database.ingest(records, commit=True) @@ -572,7 +572,7 @@ def test_resync_rogue_deletions(self, splitgill: SplitgillClient): # delete a couple of documents to cause mayhem database.search().params(refresh=True).filter( - "terms", **{DocumentField.ID: [records[1].id, records[4].id]} + 'terms', **{DocumentField.ID: [records[1].id, records[4].id]} ).delete() # check we deleted them assert database.search().count() == len(records) - 2 @@ -582,25 +582,25 @@ def test_resync_rogue_deletions(self, splitgill: SplitgillClient): assert database.search().count() == len(records) def test_resync_over_existing(self, splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) records = [ - Record("r1", {"x": 5}), - Record("r2", {"x": 10}), - Record("r3", {"x": 15}), - Record("r4", {"x": -1}), - Record("r5", {"x": 1098}), + Record('r1', {'x': 5}), + Record('r2', {'x': 10}), + Record('r3', {'x': 15}), + Record('r4', {'x': -1}), + Record('r5', {'x': 1098}), ] database.ingest(records, commit=True) new_records = [ - Record("r2", {"x": "arms!"}), - Record("r5", {"x": "egg!"}), + Record('r2', {'x': 'arms!'}), + Record('r5', {'x': 'egg!'}), ] database.ingest(new_records, commit=True) # force the sync to create arc-0 and arc-1 so that we can test all the arcs get # deleted when we resync - with patch("splitgill.indexing.index.MAX_DOCS_PER_ARC", 1): + with patch('splitgill.indexing.index.MAX_DOCS_PER_ARC', 1): database.sync(resync=False) assert database.search().count() == 5 @@ -613,7 +613,7 @@ def test_resync_over_existing(self, splitgill: SplitgillClient): def test_search(splitgill: SplitgillClient): - database = SplitgillDatabase("test", splitgill) + database = SplitgillDatabase('test', splitgill) client = splitgill.elasticsearch latest = [database.indices.latest] @@ -632,18 +632,18 @@ def test_search(splitgill: SplitgillClient): assert database.search(version=5)._index == wildcard assert database.search(version=5)._using == client assert database.search(version=5).to_dict() == { - "query": {"bool": {"filter": [version_query(5).to_dict()]}} + 'query': {'bool': {'filter': [version_query(5).to_dict()]}} } # data in index and 5 is less than latest, so should create a search over everything # at version 5 - database.ingest([Record.new({"x": 5})]) + database.ingest([Record.new({'x': 5})]) database.sync() assert 5 < database.get_elasticsearch_version() assert database.search(version=5)._index == wildcard assert database.search(version=5)._using == client assert database.search(version=5).to_dict() == { - "query": {"bool": {"filter": [version_query(5).to_dict()]}} + 'query': {'bool': {'filter': [version_query(5).to_dict()]}} } # data in index and version requested is above latest so should just use latest @@ -727,12 +727,12 @@ def check_data_fields(actual: List[DataField], 
expected: List[DataField]): class TestGetFieldsMethods: def test_int(self, database: SplitgillDatabase): records = [ - Record.new({"a": 5}), - Record.new({"a": 10}), - Record.new({"b": 15}), - Record.new({"b": -1}), - Record.new({"b": 1098}), - Record.new({"c": 33}), + Record.new({'a': 5}), + Record.new({'a': 10}), + Record.new({'b': 15}), + Record.new({'b': -1}), + Record.new({'b': 1098}), + Record.new({'c': 33}), ] database.ingest(records, commit=True) database.sync() @@ -741,28 +741,28 @@ def test_int(self, database: SplitgillDatabase): assert len(data_fields) == 4 assert data_fields == [ id_df(6), - df("b", 3, i=3), - df("a", 2, i=2), - df("c", 1, i=1), + df('b', 3, i=3), + df('a', 2, i=2), + df('c', 1, i=1), ] parsed_fields = database.get_parsed_fields() assert len(parsed_fields) == 4 assert parsed_fields == [ id_pf(6), - pf("b", 3, t=3, n=3), - pf("a", 2, t=2, n=2), - pf("c", 1, t=1, n=1), + pf('b', 3, t=3, n=3), + pf('a', 2, t=2, n=2), + pf('c', 1, t=1, n=1), ] def test_float(self, database: SplitgillDatabase): records = [ - Record.new({"a": 5.4}), - Record.new({"a": 10.1}), - Record.new({"b": 15.0}), - Record.new({"b": -1.8}), - Record.new({"b": 1098.124235}), - Record.new({"c": 33.6}), + Record.new({'a': 5.4}), + Record.new({'a': 10.1}), + Record.new({'b': 15.0}), + Record.new({'b': -1.8}), + Record.new({'b': 1098.124235}), + Record.new({'c': 33.6}), ] database.ingest(records, commit=True) database.sync() @@ -771,32 +771,32 @@ def test_float(self, database: SplitgillDatabase): assert len(data_fields) == 4 assert data_fields == [ id_df(6), - df("b", 3, f=3), - df("a", 2, f=2), - df("c", 1, f=1), + df('b', 3, f=3), + df('a', 2, f=2), + df('c', 1, f=1), ] parsed_fields = database.get_parsed_fields() assert len(parsed_fields) == 4 assert parsed_fields == [ id_pf(6), - pf("b", 3, t=3, n=3), - pf("a", 2, t=2, n=2), - pf("c", 1, t=1, n=1), + pf('b', 3, t=3, n=3), + pf('a', 2, t=2, n=2), + pf('c', 1, t=1, n=1), ] def test_date(self, database: SplitgillDatabase): records = [ - Record.new({"a": "2010-01-06"}), - Record.new({"a": "2010-01-06T13:11:47+05:00"}), - Record.new({"b": "2010-01-06 13:11:47"}), + Record.new({'a': '2010-01-06'}), + Record.new({'a': '2010-01-06T13:11:47+05:00'}), + Record.new({'b': '2010-01-06 13:11:47'}), ] database.ingest(records, commit=True) database.update_options( ParsingOptionsBuilder() - .with_date_format("%Y-%m-%d") - .with_date_format("%Y-%m-%dT%H:%M:%S%z") - .with_date_format("%Y-%m-%d %H:%M:%S") + .with_date_format('%Y-%m-%d') + .with_date_format('%Y-%m-%dT%H:%M:%S%z') + .with_date_format('%Y-%m-%d %H:%M:%S') .build() ) database.sync() @@ -805,26 +805,26 @@ def test_date(self, database: SplitgillDatabase): assert len(data_fields) == 3 assert data_fields == [ id_df(3), - df("a", 2, s=2), - df("b", 1, s=1), + df('a', 2, s=2), + df('b', 1, s=1), ] parsed_fields = database.get_parsed_fields() assert len(parsed_fields) == 3 assert parsed_fields == [ id_pf(3), - pf("a", 2, d=2, t=2), - pf("b", 1, d=1, t=1), + pf('a', 2, d=2, t=2), + pf('b', 1, d=1, t=1), ] def test_bool(self, database: SplitgillDatabase): records = [ - Record.new({"a": True}), - Record.new({"a": False}), - Record.new({"b": True}), - Record.new({"b": True}), - Record.new({"b": False}), - Record.new({"c": False}), + Record.new({'a': True}), + Record.new({'a': False}), + Record.new({'b': True}), + Record.new({'b': True}), + Record.new({'b': False}), + Record.new({'c': False}), ] database.ingest(records, commit=True) database.sync() @@ -833,28 +833,28 @@ def test_bool(self, database: 
SplitgillDatabase): assert len(data_fields) == 4 assert data_fields == [ id_df(6), - df("b", 3, b=3), - df("a", 2, b=2), - df("c", 1, b=1), + df('b', 3, b=3), + df('a', 2, b=2), + df('c', 1, b=1), ] parsed_fields = database.get_parsed_fields() assert len(parsed_fields) == 4 assert parsed_fields == [ id_pf(6), - pf("b", 3, t=3, b=3), - pf("a", 2, t=2, b=2), - pf("c", 1, t=1, b=1), + pf('b', 3, t=3, b=3), + pf('a', 2, t=2, b=2), + pf('c', 1, t=1, b=1), ] def test_str(self, database: SplitgillDatabase): records = [ - Record.new({"a": "beans"}), - Record.new({"a": "hammers"}), - Record.new({"b": "eggs"}), - Record.new({"b": "llamas"}), - Record.new({"b": "goats"}), - Record.new({"c": "books"}), + Record.new({'a': 'beans'}), + Record.new({'a': 'hammers'}), + Record.new({'b': 'eggs'}), + Record.new({'b': 'llamas'}), + Record.new({'b': 'goats'}), + Record.new({'c': 'books'}), ] database.ingest(records, commit=True) database.sync() @@ -863,28 +863,28 @@ def test_str(self, database: SplitgillDatabase): assert len(data_fields) == 4 assert data_fields == [ id_df(6), - df("b", 3, s=3), - df("a", 2, s=2), - df("c", 1, s=1), + df('b', 3, s=3), + df('a', 2, s=2), + df('c', 1, s=1), ] parsed_fields = database.get_parsed_fields() assert len(parsed_fields) == 4 assert parsed_fields == [ id_pf(6), - pf("b", 3, t=3), - pf("a", 2, t=2), - pf("c", 1, t=1), + pf('b', 3, t=3), + pf('a', 2, t=2), + pf('c', 1, t=1), ] def test_dict(self, database: SplitgillDatabase): records = [ - Record.new({"topA": {"a": 4}}), - Record.new({"topA": {"a": 5}}), - Record.new({"topB": {"a": 6}}), - Record.new({"topB": {"a": 7}}), - Record.new({"topB": {"a": 8}}), - Record.new({"topC": {"a": 9}}), + Record.new({'topA': {'a': 4}}), + Record.new({'topA': {'a': 5}}), + Record.new({'topB': {'a': 6}}), + Record.new({'topB': {'a': 7}}), + Record.new({'topB': {'a': 8}}), + Record.new({'topC': {'a': 9}}), ] database.ingest(records, commit=True) database.sync() @@ -895,12 +895,12 @@ def test_dict(self, database: SplitgillDatabase): data_fields, [ id_df(6), - df("topB", 3, d=3), - df("topB.a", 3, i=3), - df("topA", 2, d=2), - df("topA.a", 2, i=2), - df("topC", 1, d=1), - df("topC.a", 1, i=1), + df('topB', 3, d=3), + df('topB.a', 3, i=3), + df('topA', 2, d=2), + df('topA.a', 2, i=2), + df('topC', 1, d=1), + df('topC.a', 1, i=1), ], ) @@ -908,17 +908,17 @@ def test_dict(self, database: SplitgillDatabase): assert len(parsed_fields) == 4 assert parsed_fields == [ id_pf(6), - pf("topB.a", 3, t=3, n=3), - pf("topA.a", 2, t=2, n=2), - pf("topC.a", 1, t=1, n=1), + pf('topB.a', 3, t=3, n=3), + pf('topA.a', 2, t=2, n=2), + pf('topC.a', 1, t=1, n=1), ] def test_list(self, database: SplitgillDatabase): records = [ - Record.new({"a": [1, 2, 3]}), - Record.new({"a": [1, "beans", 3]}), - Record.new({"a": [1, False, True]}), - Record.new({"a": [5.4]}), + Record.new({'a': [1, 2, 3]}), + Record.new({'a': [1, 'beans', 3]}), + Record.new({'a': [1, False, True]}), + Record.new({'a': [5.4]}), ] database.ingest(records, commit=True) database.sync() @@ -929,8 +929,8 @@ def test_list(self, database: SplitgillDatabase): data_fields, [ id_df(4), - df("a", 4, l=4), - df("a.", 4, i=3, f=1, s=1, b=1), + df('a', 4, l=4), + df('a.', 4, i=3, f=1, s=1, b=1), ], ) @@ -938,17 +938,17 @@ def test_list(self, database: SplitgillDatabase): assert len(parsed_fields) == 2 assert parsed_fields == [ id_pf(4), - pf("a", 4, t=4, n=4, b=1), + pf('a', 4, t=4, n=4, b=1), ] def test_mix(self, database: SplitgillDatabase): records = [ - Record.new({"a": 5}), - Record.new({"a": 50.1}), - 
Record.new({"b": "beans!"}), - Record.new({"b": [1, 2, 3]}), - Record.new({"b": {"x": 5.4, "y": True}}), - Record.new({"b": {"x": "lemonade!", "y": False}}), + Record.new({'a': 5}), + Record.new({'a': 50.1}), + Record.new({'b': 'beans!'}), + Record.new({'b': [1, 2, 3]}), + Record.new({'b': {'x': 5.4, 'y': True}}), + Record.new({'b': {'x': 'lemonade!', 'y': False}}), ] database.ingest(records, commit=True) database.sync() @@ -959,11 +959,11 @@ def test_mix(self, database: SplitgillDatabase): data_fields, [ id_df(6), - df("b", 4, s=1, l=1, d=2), - df("a", 2, i=1, f=1), - df("b.x", 2, f=1, s=1), - df("b.y", 2, b=2), - df("b.", 1, i=1), + df('b', 4, s=1, l=1, d=2), + df('a', 2, i=1, f=1), + df('b.x', 2, f=1, s=1), + df('b.y', 2, b=2), + df('b.', 1, i=1), ], ) @@ -971,16 +971,16 @@ def test_mix(self, database: SplitgillDatabase): assert len(parsed_fields) == 5 assert parsed_fields == [ id_pf(6), - pf("a", 2, t=2, n=2), - pf("b", 2, t=2, n=1), - pf("b.x", 2, t=2, n=1), - pf("b.y", 2, t=2, b=2), + pf('a', 2, t=2, n=2), + pf('b', 2, t=2, n=1), + pf('b.x', 2, t=2, n=1), + pf('b.y', 2, t=2, b=2), ] def test_list_of_dicts(self, database: SplitgillDatabase): records = [ - Record.new({"a": [{"a": 5}, {"a": 5.4}, {"b": True}]}), - Record.new({"a": [{"a": "beans"}, {"a": 5.4}, {"b": 3.9}]}), + Record.new({'a': [{'a': 5}, {'a': 5.4}, {'b': True}]}), + Record.new({'a': [{'a': 'beans'}, {'a': 5.4}, {'b': 3.9}]}), ] database.ingest(records, commit=True) database.sync() @@ -991,10 +991,10 @@ def test_list_of_dicts(self, database: SplitgillDatabase): data_fields, [ id_df(2), - df("a", 2, l=2), - df("a.", 2, d=2), - df("a..a", 2, i=1, f=2, s=1), - df("a..b", 2, b=1, f=1), + df('a', 2, l=2), + df('a.', 2, d=2), + df('a..a', 2, i=1, f=2, s=1), + df('a..b', 2, b=1, f=1), ], ) @@ -1002,15 +1002,15 @@ def test_list_of_dicts(self, database: SplitgillDatabase): assert len(parsed_fields) == 3 assert parsed_fields == [ id_pf(2), - pf("a.a", 2, t=2, n=2), - pf("a.b", 2, t=2, b=1, n=1), + pf('a.a', 2, t=2, n=2), + pf('a.b', 2, t=2, b=1, n=1), ] def test_list_of_lists(self, database: SplitgillDatabase): records = [ - Record.new({"a": [[1, 2, 3], [4, 5, 6], 9]}), - Record.new({"a": [[9, 8, 7], [6, 5, 4], "organs"]}), - Record.new({"a": [[1, 2, 3], [4, 5, 6], [True, False]]}), + Record.new({'a': [[1, 2, 3], [4, 5, 6], 9]}), + Record.new({'a': [[9, 8, 7], [6, 5, 4], 'organs']}), + Record.new({'a': [[1, 2, 3], [4, 5, 6], [True, False]]}), ] database.ingest(records, commit=True) database.sync() @@ -1021,9 +1021,9 @@ def test_list_of_lists(self, database: SplitgillDatabase): data_fields, [ id_df(3), - df("a", 3, l=3), - df("a.", 3, l=3, i=1, s=1), - df("a..", 3, b=1, i=3), + df('a', 3, l=3), + df('a.', 3, l=3, i=1, s=1), + df('a..', 3, b=1, i=3), ], ) @@ -1031,12 +1031,12 @@ def test_list_of_lists(self, database: SplitgillDatabase): assert len(parsed_fields) == 2 assert parsed_fields == [ id_pf(3), - pf("a", 3, t=3, n=3, b=1), + pf('a', 3, t=3, n=3, b=1), ] def test_deep_nesting(self, database: SplitgillDatabase): # ew - records = [Record.new({"a": {"b": [[{"c": [{"d": 5}]}]]}})] + records = [Record.new({'a': {'b': [[{'c': [{'d': 5}]}]]}})] database.ingest(records, commit=True) database.sync() @@ -1046,30 +1046,30 @@ def test_deep_nesting(self, database: SplitgillDatabase): data_fields, [ id_df(1), - df("a", 1, d=1), - df("a.b", 1, l=1), - df("a.b.", 1, l=1), - df("a.b..", 1, d=1), - df("a.b...c", 1, l=1), - df("a.b...c.", 1, d=1), - df("a.b...c..d", 1, i=1), + df('a', 1, d=1), + df('a.b', 1, l=1), + df('a.b.', 1, l=1), + 
df('a.b..', 1, d=1), + df('a.b...c', 1, l=1), + df('a.b...c.', 1, d=1), + df('a.b...c..d', 1, i=1), ], ) parsed_fields = database.get_parsed_fields() assert len(parsed_fields) == 2 - assert parsed_fields == [id_pf(1), pf("a.b.c.d", 1, t=1, n=1)] + assert parsed_fields == [id_pf(1), pf('a.b.c.d', 1, t=1, n=1)] def test_version(self, database: SplitgillDatabase): # add some records with integer values version_1_time = datetime(2020, 7, 2, tzinfo=timezone.utc) - version_1_records = [Record("r1", {"x": 5}), Record("r2", {"x": 10})] + version_1_records = [Record('r1', {'x': 5}), Record('r2', {'x': 10})] with freeze_time(version_1_time): database.ingest(version_1_records, commit=True) database.sync() # the next day all the record values become bools, wild stuff version_2_time = datetime(2020, 7, 3, tzinfo=timezone.utc) - version_2_records = [Record("r1", {"x": True}), Record("r2", {"x": False})] + version_2_records = [Record('r1', {'x': True}), Record('r2', {'x': False})] with freeze_time(version_2_time): database.ingest(version_2_records, commit=True) database.sync() @@ -1077,25 +1077,25 @@ def test_version(self, database: SplitgillDatabase): # check the latest version where the values are all bools data_fields = database.get_data_fields() assert len(data_fields) == 2 - assert data_fields == [id_df(2), df("x", 2, b=2)] + assert data_fields == [id_df(2), df('x', 2, b=2)] parsed_fields = database.get_parsed_fields() assert len(parsed_fields) == 2 - assert parsed_fields == [id_pf(2), pf("x", 2, t=2, b=2)] + assert parsed_fields == [id_pf(2), pf('x', 2, t=2, b=2)] # then check the old version where the values are ints data_fields = database.get_data_fields(version=to_timestamp(version_1_time)) assert len(data_fields) == 2 - assert data_fields == [id_df(2), df("x", 2, i=2)] + assert data_fields == [id_df(2), df('x', 2, i=2)] parsed_fields = database.get_parsed_fields(version=to_timestamp(version_1_time)) assert len(parsed_fields) == 2 - assert parsed_fields == [id_pf(2), pf("x", 2, t=2, n=2)] + assert parsed_fields == [id_pf(2), pf('x', 2, t=2, n=2)] def test_with_filter(self, database: SplitgillDatabase): records = [ - Record.new({"a": 1, "b": True}), - Record.new({"a": 2, "b": 5.3}), - Record.new({"a": 3, "b": "beans!"}), - Record.new({"a": 4, "b": "armpit"}), + Record.new({'a': 1, 'b': True}), + Record.new({'a': 2, 'b': 5.3}), + Record.new({'a': 3, 'b': 'beans!'}), + Record.new({'a': 4, 'b': 'armpit'}), ] database.ingest(records, commit=True) database.sync() @@ -1105,41 +1105,41 @@ def test_with_filter(self, database: SplitgillDatabase): assert len(data_fields) == 3 assert data_fields == [ id_df(4), - df("a", 4, i=4), - df("b", 4, f=1, b=1, s=2), + df('a', 4, i=4), + df('b', 4, f=1, b=1, s=2), ] parsed_fields = database.get_parsed_fields() assert len(parsed_fields) == 3 assert parsed_fields == [ id_pf(4), - pf("a", 4, t=4, n=4), - pf("b", 4, t=4, b=1, n=1), + pf('a', 4, t=4, n=4), + pf('b', 4, t=4, b=1, n=1), ] # now check with some filters - query = term_query("a", 1) + query = term_query('a', 1) data_fields = database.get_data_fields(query=query) parsed_fields = database.get_parsed_fields(query=query) - assert data_fields == [id_df(1), df("a", 1, i=1), df("b", 1, b=1)] - assert parsed_fields == [id_pf(1), pf("a", 1, t=1, n=1), pf("b", 1, t=1, b=1)] + assert data_fields == [id_df(1), df('a', 1, i=1), df('b', 1, b=1)] + assert parsed_fields == [id_pf(1), pf('a', 1, t=1, n=1), pf('b', 1, t=1, b=1)] - query = term_query("a", 2) + query = term_query('a', 2) data_fields = 
database.get_data_fields(query=query) parsed_fields = database.get_parsed_fields(query=query) - assert data_fields == [id_df(1), df("a", 1, i=1), df("b", 1, f=1)] - assert parsed_fields == [id_pf(1), pf("a", 1, t=1, n=1), pf("b", 1, t=1, n=1)] + assert data_fields == [id_df(1), df('a', 1, i=1), df('b', 1, f=1)] + assert parsed_fields == [id_pf(1), pf('a', 1, t=1, n=1), pf('b', 1, t=1, n=1)] - query = term_query("a", 3) + query = term_query('a', 3) data_fields = database.get_data_fields(query=query) parsed_fields = database.get_parsed_fields(query=query) - assert data_fields == [id_df(1), df("a", 1, i=1), df("b", 1, s=1)] - assert parsed_fields == [id_pf(1), pf("a", 1, t=1, n=1), pf("b", 1, t=1)] + assert data_fields == [id_df(1), df('a', 1, i=1), df('b', 1, s=1)] + assert parsed_fields == [id_pf(1), pf('a', 1, t=1, n=1), pf('b', 1, t=1)] - query = term_query("a", 4) + query = term_query('a', 4) data_fields = database.get_data_fields(query=query) parsed_fields = database.get_parsed_fields(query=query) - assert data_fields == [id_df(1), df("a", 1, i=1), df("b", 1, s=1)] - assert parsed_fields == [id_pf(1), pf("a", 1, t=1, n=1), pf("b", 1, t=1)] + assert data_fields == [id_df(1), df('a', 1, i=1), df('b', 1, s=1)] + assert parsed_fields == [id_pf(1), pf('a', 1, t=1, n=1), pf('b', 1, t=1)] def test_geo_in_parsed_fields( self, database: SplitgillDatabase, geojson_point: dict, wkt_point: str @@ -1147,17 +1147,17 @@ def test_geo_in_parsed_fields( records = [ Record.new( { - "geojson": geojson_point, - "wkt": wkt_point, - "lat": 30, - "lon": 60, - "rad": 100, + 'geojson': geojson_point, + 'wkt': wkt_point, + 'lat': 30, + 'lon': 60, + 'rad': 100, } ), ] database.ingest(records, commit=False) database.update_options( - ParsingOptionsBuilder().with_geo_hint("lat", "lon", "rad").build(), + ParsingOptionsBuilder().with_geo_hint('lat', 'lon', 'rad').build(), commit=False, ) database.commit() @@ -1167,20 +1167,20 @@ def test_geo_in_parsed_fields( assert len(parsed_fields) == 8 assert parsed_fields == [ id_pf(1), - pf("geojson", 1, g=1, has_original=False), - pf("geojson.coordinates", 1, n=1, t=1), - pf("geojson.type", 1, t=1), - pf("lat", 1, g=1, t=1, n=1), - pf("lon", 1, t=1, n=1), - pf("rad", 1, t=1, n=1), - pf("wkt", 1, g=1, t=1), + pf('geojson', 1, g=1, has_original=False), + pf('geojson.coordinates', 1, n=1, t=1), + pf('geojson.type', 1, t=1), + pf('lat', 1, g=1, t=1, n=1), + pf('lon', 1, t=1, n=1), + pf('rad', 1, t=1, n=1), + pf('wkt', 1, g=1, t=1), ] def test_counts(self, database: SplitgillDatabase): records = [ - Record.new({"a": [1, True]}), - Record.new({"a": [1, 4.5]}), - Record.new({"a": "beans"}), + Record.new({'a': [1, True]}), + Record.new({'a': [1, 4.5]}), + Record.new({'a': 'beans'}), ] database.ingest(records, commit=True) database.sync() @@ -1192,39 +1192,39 @@ def test_counts(self, database: SplitgillDatabase): [ id_df(3), # 3 fields have an "a" field - df("a", 3, l=2, s=1), + df('a', 3, l=2, s=1), # 2 fields have lists under "a" - df("a.", 2, i=2, b=1, f=1), + df('a.', 2, i=2, b=1, f=1), ], ) parsed_fields = database.get_parsed_fields() assert len(parsed_fields) == 2 - assert parsed_fields == [id_pf(3), pf("a", 3, n=2, b=1, t=3)] + assert parsed_fields == [id_pf(3), pf('a', 3, n=2, b=1, t=3)] def test_hierarchy(self, database: SplitgillDatabase): records = [ - Record.new({"a": "beans"}), - Record.new({"b": {"c": 4, "d": True}}), - Record.new({"e": ["beans"]}), - Record.new({"f": [{"g": 3, "h": {"i": "beans"}}]}), + Record.new({'a': 'beans'}), + Record.new({'b': {'c': 4, 'd': True}}), 
+ Record.new({'e': ['beans']}), + Record.new({'f': [{'g': 3, 'h': {'i': 'beans'}}]}), ] database.ingest(records, commit=True) database.sync() # these are the data fields we expect record_id = id_df(4) - a = df("a", 1, s=1) - b = df("b", 1, d=1) - b_c = df("b.c", 1, i=1) - b_d = df("b.d", 1, b=1) - e = df("e", 1, l=1) - e_ = df("e.", 1, s=1) - f = df("f", 1, l=1) - f_ = df("f.", 1, d=1) - f__g = df("f..g", 1, i=1) - f__h = df("f..h", 1, d=1) - f__h_i = df("f..h.i", 1, s=1) + a = df('a', 1, s=1) + b = df('b', 1, d=1) + b_c = df('b.c', 1, i=1) + b_d = df('b.d', 1, b=1) + e = df('e', 1, l=1) + e_ = df('e.', 1, s=1) + f = df('f', 1, l=1) + f_ = df('f.', 1, d=1) + f__g = df('f..g', 1, i=1) + f__h = df('f..h', 1, d=1) + f__h_i = df('f..h.i', 1, s=1) # these are the relationships we expect b.children.append(b_c) @@ -1272,7 +1272,7 @@ def test_hierarchy(self, database: SplitgillDatabase): def test_get_rounded_version(splitgill: SplitgillClient): - database = splitgill.get_database("test") + database = splitgill.get_database('test') # test with no versions assert database.get_rounded_version(8) is None @@ -1280,7 +1280,7 @@ def test_get_rounded_version(splitgill: SplitgillClient): # create some versions for version in [4, 5, 9]: with freeze_time(datetime.fromtimestamp(version / 1000, timezone.utc)): - database.ingest([Record.new({"a": 4})]) + database.ingest([Record.new({'a': 4})]) database.sync() # check before the first version @@ -1308,18 +1308,18 @@ def test_get_rounded_version(splitgill: SplitgillClient): def test_get_versions(splitgill: SplitgillClient): - database = splitgill.get_database("test") + database = splitgill.get_database('test') assert database.get_versions() == [] - record_id = "test-1" + record_id = 'test-1' versions = [ - (4, {"a": 1}), - (5, {"a": 7}), + (4, {'a': 1}), + (5, {'a': 7}), # delete (7, {}), # it's back! 
:O - (9, {"a": 4}), + (9, {'a': 4}), # lastly, delete the record to check next is also being considered (15, {}), ] @@ -1338,27 +1338,27 @@ def test_get_versions(splitgill: SplitgillClient): class TestArcStatus: def test_empty(self, splitgill: SplitgillClient): - database = splitgill.get_database("test") + database = splitgill.get_database('test') status = database.get_arc_status() assert status.index == 0 assert status.count == 0 def test_basic(self, splitgill: SplitgillClient): - database = splitgill.get_database("test") + database = splitgill.get_database('test') - record_id = "r-1" + record_id = 'r-1' # this will go into arc-0 - database.ingest([Record(record_id, {"a": 4})], commit=True) + database.ingest([Record(record_id, {'a': 4})], commit=True) # this will go into arc-0 - database.ingest([Record(record_id, {"a": 7})], commit=True) + database.ingest([Record(record_id, {'a': 7})], commit=True) # this will go into arc-0 - database.ingest([Record(record_id, {"a": 9})], commit=True) + database.ingest([Record(record_id, {'a': 9})], commit=True) # this will go into arc-1 - database.ingest([Record(record_id, {"a": 3})], commit=True) + database.ingest([Record(record_id, {'a': 3})], commit=True) # this'll be the latest record - database.ingest([Record(record_id, {"a": 8})], commit=True) + database.ingest([Record(record_id, {'a': 8})], commit=True) - with patch("splitgill.indexing.index.MAX_DOCS_PER_ARC", 3): + with patch('splitgill.indexing.index.MAX_DOCS_PER_ARC', 3): database.sync() status = database.get_arc_status() @@ -1374,54 +1374,54 @@ def test_basic(self, splitgill: SplitgillClient): ) def test_repeat_ingest(self, splitgill: SplitgillClient): - database = splitgill.get_database("test") + database = splitgill.get_database('test') - record_id = "r-1" + record_id = 'r-1' # this will go into arc-0 - database.ingest([Record(record_id, {"a": 4})], commit=True) + database.ingest([Record(record_id, {'a': 4})], commit=True) # this will go into arc-0 - database.ingest([Record(record_id, {"a": 7})], commit=True) + database.ingest([Record(record_id, {'a': 7})], commit=True) # this will go into arc-0 - database.ingest([Record(record_id, {"a": 9})], commit=True) + database.ingest([Record(record_id, {'a': 9})], commit=True) # this will go into arc-1 - database.ingest([Record(record_id, {"a": 3})], commit=True) + database.ingest([Record(record_id, {'a': 3})], commit=True) # this will go into arc-1 - database.ingest([Record(record_id, {"a": 8})], commit=True) + database.ingest([Record(record_id, {'a': 8})], commit=True) - with patch("splitgill.indexing.index.MAX_DOCS_PER_ARC", 3): + with patch('splitgill.indexing.index.MAX_DOCS_PER_ARC', 3): database.sync() # this will go into arc-1 - database.ingest([Record(record_id, {"a": 1})], commit=True) + database.ingest([Record(record_id, {'a': 1})], commit=True) # this will go into arc-2 - database.ingest([Record(record_id, {"a": 4})], commit=True) + database.ingest([Record(record_id, {'a': 4})], commit=True) # this will go into arc-2 - database.ingest([Record(record_id, {"a": 9})], commit=True) + database.ingest([Record(record_id, {'a': 9})], commit=True) # this will go into latest - database.ingest([Record(record_id, {"a": 2})], commit=True) + database.ingest([Record(record_id, {'a': 2})], commit=True) - with patch("splitgill.indexing.index.MAX_DOCS_PER_ARC", 3): + with patch('splitgill.indexing.index.MAX_DOCS_PER_ARC', 3): database.sync() assert database.get_arc_status() == ArcStatus(2, 2) def test_resync_arcs(splitgill: SplitgillClient): - database = 
splitgill.get_database("test") - database.ingest([Record(f"r-{i}", {"a": 4}) for i in range(1000)], commit=True) - database.ingest([Record(f"r-{i}", {"a": 7}) for i in range(1000)], commit=True) - database.ingest([Record(f"r-{i}", {"a": 5}) for i in range(400)], commit=True) + database = splitgill.get_database('test') + database.ingest([Record(f'r-{i}', {'a': 4}) for i in range(1000)], commit=True) + database.ingest([Record(f'r-{i}', {'a': 7}) for i in range(1000)], commit=True) + database.ingest([Record(f'r-{i}', {'a': 5}) for i in range(400)], commit=True) - with patch("splitgill.indexing.index.MAX_DOCS_PER_ARC", 349): + with patch('splitgill.indexing.index.MAX_DOCS_PER_ARC', 349): database.sync() database.resync_arcs() count = database.search(version=SearchVersion.all).count() r_5_count = ( - database.search(version=SearchVersion.all).filter(id_query("r-5")).count() + database.search(version=SearchVersion.all).filter(id_query('r-5')).count() ) r_780_count = ( - database.search(version=SearchVersion.all).filter(id_query("r-780")).count() + database.search(version=SearchVersion.all).filter(id_query('r-780')).count() ) assert count == 2400 assert r_5_count == 3 diff --git a/tests/test_model.py b/tests/test_model.py index 29d491c..e1d6849 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -4,10 +4,10 @@ from splitgill.diffing import diff from splitgill.model import ( - MongoRecord, - VersionedData, GeoFieldHint, + MongoRecord, ParsingOptions, + VersionedData, ) @@ -27,14 +27,14 @@ def test_is_deleted_true(self): assert record.is_deleted def test_is_deleted_false(self): - record = create_mongo_record(1, {"x": "beans"}) + record = create_mongo_record(1, {'x': 'beans'}) assert not record.is_deleted def test_iter(self): data = [ - VersionedData(10, {"a": "1", "b": "2"}), - VersionedData(7, {"a": "4", "b": "1"}), - VersionedData(2, {"c": "1"}), + VersionedData(10, {'a': '1', 'b': '2'}), + VersionedData(7, {'a': '4', 'b': '1'}), + VersionedData(2, {'c': '1'}), ] record = create_mongo_record(data[0].version, data[0].data, *data[1:]) @@ -44,9 +44,9 @@ def test_iter(self): def test_get_versions(self): data = [ - VersionedData(10, {"a": "1", "b": "2"}), - VersionedData(7, {"a": "4", "b": "1"}), - VersionedData(2, {"c": "1"}), + VersionedData(10, {'a': '1', 'b': '2'}), + VersionedData(7, {'a': '4', 'b': '1'}), + VersionedData(2, {'c': '1'}), ] record = create_mongo_record(data[0].version, data[0].data, *data[1:]) @@ -59,23 +59,23 @@ def test_get_versions(self): class TestGeoFieldHint: def test_hash(self): hints = set() - hints.add(GeoFieldHint("latitude", "longitude")) - hints.add(GeoFieldHint("latitude", "longitude")) - hints.add(GeoFieldHint("latitude", "longitude", "radius")) - hints.add(GeoFieldHint("latitude", "longitude", None)) - hints.add(GeoFieldHint("lat", "lon")) + hints.add(GeoFieldHint('latitude', 'longitude')) + hints.add(GeoFieldHint('latitude', 'longitude')) + hints.add(GeoFieldHint('latitude', 'longitude', 'radius')) + hints.add(GeoFieldHint('latitude', 'longitude', None)) + hints.add(GeoFieldHint('lat', 'lon')) assert len(hints) == 2 def test_eq(self): # should be equal based on the lat, nothing else - assert GeoFieldHint("lat", "lon") == GeoFieldHint("lat", "lon") - assert GeoFieldHint("lat", "lon") == GeoFieldHint("lat", "difflon") - assert GeoFieldHint("lat", "lon", None) == GeoFieldHint("lat", "lon", "rad") + assert GeoFieldHint('lat', 'lon') == GeoFieldHint('lat', 'lon') + assert GeoFieldHint('lat', 'lon') == GeoFieldHint('lat', 'difflon') + assert GeoFieldHint('lat', 
'lon', None) == GeoFieldHint('lat', 'lon', 'rad') class TestParsingOptions: def test_from_to_doc_empty(self): options = ParsingOptions( - frozenset(), frozenset(), frozenset(), frozenset(), 256, "{0:.15g}" + frozenset(), frozenset(), frozenset(), frozenset(), 256, '{0:.15g}' ) assert options == ParsingOptions.from_doc(options.to_doc()) diff --git a/tests/test_search.py b/tests/test_search.py index 6d573c3..fa41ecc 100644 --- a/tests/test_search.py +++ b/tests/test_search.py @@ -6,163 +6,163 @@ from splitgill.indexing.options import ParsingOptionsBuilder from splitgill.indexing.parser import parse from splitgill.search import ( - term_query, - number, - date, - boolean, - match_query, ALL_TEXT, - text, + boolean, + date, keyword, + match_query, + number, range_query, rebuild_data, + term_query, + text, ) from splitgill.utils import to_timestamp class TestTermQuery: def test_no_infer(self): - value = "banana" - q = term_query("beans.toast", value, ParsedType.NUMBER) - assert q.to_dict() == {"term": {number("beans.toast"): value}} + value = 'banana' + q = term_query('beans.toast', value, ParsedType.NUMBER) + assert q.to_dict() == {'term': {number('beans.toast'): value}} def test_datetimes_are_converted(self): dt = datetime(2019, 6, 4, 14, 9, 45, tzinfo=timezone.utc) ms = to_timestamp(dt) # this should convert the datetime to an epoch - q = term_query("beans.toast", dt, ParsedType.DATE) - assert q.to_dict() == {"term": {date("beans.toast"): ms}} + q = term_query('beans.toast', dt, ParsedType.DATE) + assert q.to_dict() == {'term': {date('beans.toast'): ms}} # this should not touch the value as it's not a datetime - q = term_query("beans.toast", ms, ParsedType.DATE) - assert q.to_dict() == {"term": {date("beans.toast"): ms}} + q = term_query('beans.toast', ms, ParsedType.DATE) + assert q.to_dict() == {'term': {date('beans.toast'): ms}} # if the parsed type is inferred, it should still convert the datetime - q = term_query("beans.toast", dt) - assert q.to_dict() == {"term": {date("beans.toast"): ms}} + q = term_query('beans.toast', dt) + assert q.to_dict() == {'term': {date('beans.toast'): ms}} def test_infer_number(self): - assert term_query("beans.toast", 4).to_dict() == { - "term": {number("beans.toast"): 4} + assert term_query('beans.toast', 4).to_dict() == { + 'term': {number('beans.toast'): 4} } - assert term_query("beans.toast", 9.2).to_dict() == { - "term": {number("beans.toast"): 9.2} + assert term_query('beans.toast', 9.2).to_dict() == { + 'term': {number('beans.toast'): 9.2} } def test_infer_boolean(self): - assert term_query("beans.toast", True).to_dict() == { - "term": {boolean("beans.toast"): True} + assert term_query('beans.toast', True).to_dict() == { + 'term': {boolean('beans.toast'): True} } - assert term_query("beans.toast", False).to_dict() == { - "term": {boolean("beans.toast"): False} + assert term_query('beans.toast', False).to_dict() == { + 'term': {boolean('beans.toast'): False} } def test_infer_date(self): dt = datetime(2019, 6, 4, 14, 9, 45, tzinfo=timezone.utc) ms = to_timestamp(dt) - assert term_query("beans.toast", dt).to_dict() == { - "term": {date("beans.toast"): ms} + assert term_query('beans.toast', dt).to_dict() == { + 'term': {date('beans.toast'): ms} } def test_infer_str(self): - assert term_query("beans.toast", "hello!").to_dict() == { - "term": {keyword("beans.toast"): "hello!"} + assert term_query('beans.toast', 'hello!').to_dict() == { + 'term': {keyword('beans.toast'): 'hello!'} } def test_bad_type(self): with pytest.raises(ValueError): - 
term_query("beans.toast", object()) + term_query('beans.toast', object()) class TestMatchQuery: def test_all_text(self): - assert match_query("banana").to_dict() == { - "match": {ALL_TEXT: {"query": "banana"}} + assert match_query('banana').to_dict() == { + 'match': {ALL_TEXT: {'query': 'banana'}} } - assert match_query("banana", fuzziness="AUTO").to_dict() == { - "match": {ALL_TEXT: {"query": "banana", "fuzziness": "AUTO"}} + assert match_query('banana', fuzziness='AUTO').to_dict() == { + 'match': {ALL_TEXT: {'query': 'banana', 'fuzziness': 'AUTO'}} } def test_a_field(self): - assert match_query("banana", "beans.toast").to_dict() == { - "match": {text("beans.toast"): {"query": "banana"}} + assert match_query('banana', 'beans.toast').to_dict() == { + 'match': {text('beans.toast'): {'query': 'banana'}} } - assert match_query("banana", "beans.toast", fuzziness="AUTO").to_dict() == { - "match": {text("beans.toast"): {"query": "banana", "fuzziness": "AUTO"}} + assert match_query('banana', 'beans.toast', fuzziness='AUTO').to_dict() == { + 'match': {text('beans.toast'): {'query': 'banana', 'fuzziness': 'AUTO'}} } class TestRangeQuery: def test_int(self): - assert range_query("beans.toast", 4, 10).to_dict() == { - "range": { - number("beans.toast"): { - "gte": 4, - "lt": 10, + assert range_query('beans.toast', 4, 10).to_dict() == { + 'range': { + number('beans.toast'): { + 'gte': 4, + 'lt': 10, } } } def test_float(self): - assert range_query("beans.toast", 4.5, 10.2).to_dict() == { - "range": { - number("beans.toast"): { - "gte": 4.5, - "lt": 10.2, + assert range_query('beans.toast', 4.5, 10.2).to_dict() == { + 'range': { + number('beans.toast'): { + 'gte': 4.5, + 'lt': 10.2, } } } def test_number_mix(self): - assert range_query("beans.toast", 4.5, 10).to_dict() == { - "range": { - number("beans.toast"): { - "gte": 4.5, - "lt": 10, + assert range_query('beans.toast', 4.5, 10).to_dict() == { + 'range': { + number('beans.toast'): { + 'gte': 4.5, + 'lt': 10, } } } - assert range_query("beans.toast", 4, 10.6).to_dict() == { - "range": { - number("beans.toast"): { - "gte": 4, - "lt": 10.6, + assert range_query('beans.toast', 4, 10.6).to_dict() == { + 'range': { + number('beans.toast'): { + 'gte': 4, + 'lt': 10.6, } } } def test_lte_gte(self): - assert range_query("beans.toast", gte=4, lte=10).to_dict() == { - "range": { - number("beans.toast"): { - "gte": 4, - "lte": 10, + assert range_query('beans.toast', gte=4, lte=10).to_dict() == { + 'range': { + number('beans.toast'): { + 'gte': 4, + 'lte': 10, } } } - assert range_query("beans.toast", gt=4, lt=10).to_dict() == { - "range": { - number("beans.toast"): { - "gt": 4, - "lt": 10, + assert range_query('beans.toast', gt=4, lt=10).to_dict() == { + 'range': { + number('beans.toast'): { + 'gt': 4, + 'lt': 10, } } } - assert range_query("beans.toast", gte=4, lt=10).to_dict() == { - "range": { - number("beans.toast"): { - "gte": 4, - "lt": 10, + assert range_query('beans.toast', gte=4, lt=10).to_dict() == { + 'range': { + number('beans.toast'): { + 'gte': 4, + 'lt': 10, } } } - assert range_query("beans.toast", gt=4, lte=10).to_dict() == { - "range": { - number("beans.toast"): { - "gt": 4, - "lte": 10, + assert range_query('beans.toast', gt=4, lte=10).to_dict() == { + 'range': { + number('beans.toast'): { + 'gt': 4, + 'lte': 10, } } } @@ -171,11 +171,11 @@ def test_datetime(self): gte = datetime(2020, 1, 2, 3, 4, 5, tzinfo=timezone.utc) lt = datetime(2020, 1, 2, 3, 4, 5, tzinfo=timezone.utc) - assert range_query("beans.toast", gte, lt).to_dict() == { - 
"range": { - date("beans.toast"): { - "gte": to_timestamp(gte), - "lt": to_timestamp(lt), + assert range_query('beans.toast', gte, lt).to_dict() == { + 'range': { + date('beans.toast'): { + 'gte': to_timestamp(gte), + 'lt': to_timestamp(lt), } } } @@ -184,38 +184,38 @@ def test_date(self): gte = datetime(2020, 1, 2) lt = datetime(2020, 1, 2) - assert range_query("beans.toast", gte, lt).to_dict() == { - "range": { - date("beans.toast"): { - "gte": to_timestamp(gte), - "lt": to_timestamp(lt), + assert range_query('beans.toast', gte, lt).to_dict() == { + 'range': { + date('beans.toast'): { + 'gte': to_timestamp(gte), + 'lt': to_timestamp(lt), } } } rebuild_data_scenarios = [ - {"_id": "1", "x": 4, "y": None, "z": ""}, - {"_id": "1", "x": 4}, - {"_id": "2", "x": 4.2394823749823798423}, - {"_id": "3", "x": [1, 2, 3]}, - {"_id": "4", "x": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}, - {"_id": "5", "x": {"y": 5, "z": 4.6}}, - {"_id": "6", "x": {"y": [1, 2, 3], "z": [4, 5, 6, [7, 8, 9, {"y": "beans"}]]}}, - {"_id": "7", "a geojson point": {"type": "Point", "coordinates": [30, 10]}}, + {'_id': '1', 'x': 4, 'y': None, 'z': ''}, + {'_id': '1', 'x': 4}, + {'_id': '2', 'x': 4.2394823749823798423}, + {'_id': '3', 'x': [1, 2, 3]}, + {'_id': '4', 'x': [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}, + {'_id': '5', 'x': {'y': 5, 'z': 4.6}}, + {'_id': '6', 'x': {'y': [1, 2, 3], 'z': [4, 5, 6, [7, 8, 9, {'y': 'beans'}]]}}, + {'_id': '7', 'a geojson point': {'type': 'Point', 'coordinates': [30, 10]}}, { - "_id": "8", - "anicelistofgeojson": [ - {"type": "Point", "coordinates": [30, 10]}, - {"type": "Point", "coordinates": [20, 20]}, - {"type": "Point", "coordinates": [10, 30]}, + '_id': '8', + 'anicelistofgeojson': [ + {'type': 'Point', 'coordinates': [30, 10]}, + {'type': 'Point', 'coordinates': [20, 20]}, + {'type': 'Point', 'coordinates': [10, 30]}, ], }, - {"_id": "9", "egg": [1, None, 3]}, + {'_id': '9', 'egg': [1, None, 3]}, ] -@pytest.mark.parametrize("data", rebuild_data_scenarios) +@pytest.mark.parametrize('data', rebuild_data_scenarios) def test_rebuild(data: dict): options = ParsingOptionsBuilder().build() parsed = parse(data, options) diff --git a/tests/test_syncing.py b/tests/test_syncing.py index bd91562..89e4ef0 100644 --- a/tests/test_syncing.py +++ b/tests/test_syncing.py @@ -33,7 +33,7 @@ def test_write_result(): class TestWorker: async def test_timeout(self): queue = Queue() - queue.put_nowait([IndexOp("test", "doc1", {"x": 4})]) + queue.put_nowait([IndexOp('test', 'doc1', {'x': 4})]) mock_client = AsyncMock( bulk=AsyncMock(side_effect=ConnectionTimeout("doesn't matter")) @@ -43,7 +43,7 @@ async def test_timeout(self): async def test_timeout_backoff_and_then_succeed(self): queue = Queue() - queue.put_nowait([IndexOp("test", "doc1", {"x": 4})]) + queue.put_nowait([IndexOp('test', 'doc1', {'x': 4})]) queue.put_nowait(None) mock_client = AsyncMock( @@ -51,7 +51,7 @@ async def test_timeout_backoff_and_then_succeed(self): side_effect=[ ConnectionTimeout("doesn't matter"), ConnectionTimeout("doesn't matter"), - {"items": [{"index": {}}, {"index": {}}, {"delete": {}}]}, + {'items': [{'index': {}}, {'index': {}}, {'delete': {}}]}, ] ) ) @@ -61,13 +61,13 @@ async def test_timeout_backoff_and_then_succeed(self): async def test_counting(self): queue = Queue() - queue.put_nowait([IndexOp("test", "doc1", {"x": 4})]) - queue.put_nowait([IndexOp("test", "doc1", {"x": 2})]) + queue.put_nowait([IndexOp('test', 'doc1', {'x': 4})]) + queue.put_nowait([IndexOp('test', 'doc1', {'x': 2})]) queue.put_nowait(None) mock_client = 
AsyncMock( bulk=AsyncMock( - return_value={"items": [{"index": {}}, {"index": {}}, {"delete": {}}]} + return_value={'items': [{'index': {}}, {'index': {}}, {'delete': {}}]} ) ) indexed, deleted = await worker(mock_client, queue) @@ -76,12 +76,12 @@ async def test_counting(self): async def test_errors(self): queue = Queue() - queue.put_nowait([IndexOp("test", "doc1", {"x": 4})]) + queue.put_nowait([IndexOp('test', 'doc1', {'x': 4})]) - error_item = {"index": {"error": ["oh no!"]}} + error_item = {'index': {'error': ['oh no!']}} mock_client = AsyncMock( bulk=AsyncMock( - return_value={"items": [{"index": {}}, error_item, {"delete": {}}]} + return_value={'items': [{'index': {}}, error_item, {'delete': {}}]} ) ) with pytest.raises(BulkOpException) as e: @@ -104,7 +104,7 @@ async def test_errors(self): tasks = set() async def error(): - raise Exception("oh no!") + raise Exception('oh no!') # this task won't be complete by the time we check task_1 = create_task(sleep(2)) @@ -115,7 +115,7 @@ async def error(): # this task will raise an error! tasks.add(create_task(error())) await sleep(0) - with pytest.raises(Exception, match="oh no!"): + with pytest.raises(Exception, match='oh no!'): check_for_errors(tasks) # make sure we don't leave any tasks hanging around await gather(task_1, task_2) @@ -123,7 +123,7 @@ async def error(): def test_refresh_attempts(): mock_client = MagicMock( - indices=MagicMock(refresh=MagicMock(side_effect=ConnectionTimeout("nope"))) + indices=MagicMock(refresh=MagicMock(side_effect=ConnectionTimeout('nope'))) ) attempts = 9 with pytest.raises(ConnectionTimeout): @@ -139,14 +139,14 @@ def test_write_errors(elasticsearch_client: Elasticsearch): options = BulkOptions(chunk_size=4, worker_count=8) # create 7000 ops - ops = [IndexOp("test", f"doc-{i}", {"x": i}) for i in range(7000)] + ops = [IndexOp('test', f'doc-{i}', {'x': i}) for i in range(7000)] # replace one of the ops with an op that causes an error, in this case we pass a # list instead of a dict as the record. 
This is a nice test because it will # serialise, but to something that Elasticsearch won't accept as a document so it # will raise an error - ops[5419] = IndexOp("test", "doc-error", []) + ops[5419] = IndexOp('test', 'doc-error', []) - with pytest.raises(BulkOpException, match="1 errors during bulk index"): + with pytest.raises(BulkOpException, match='1 errors during bulk index'): write_ops(elasticsearch_client, ops, options) diff --git a/tests/test_utils.py b/tests/test_utils.py index e75e1bb..9f70510 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,8 +1,8 @@ -from datetime import datetime, timedelta, timezone, date +from datetime import date, datetime, timedelta, timezone from freezegun import freeze_time -from splitgill.utils import to_timestamp, parse_to_timestamp, now, partition +from splitgill.utils import now, parse_to_timestamp, partition, to_timestamp class TestToTimestamp: @@ -34,18 +34,18 @@ def test_date(self): class TestParseTimestamp: def test_default_no_tz(self): - assert parse_to_timestamp("2012-01-14", "%Y-%m-%d") == 1326499200000 + assert parse_to_timestamp('2012-01-14', '%Y-%m-%d') == 1326499200000 def test_default_no_tz_is_utc(self): - no_tz = parse_to_timestamp("2012-01-14", "%Y-%m-%d") - with_utc_tz = parse_to_timestamp("2012-01-14", "%Y-%m-%d", timezone.utc) + no_tz = parse_to_timestamp('2012-01-14', '%Y-%m-%d') + with_utc_tz = parse_to_timestamp('2012-01-14', '%Y-%m-%d', timezone.utc) assert no_tz == with_utc_tz def test_different_tz(self): five_hours_behind = timezone(timedelta(hours=-5)) assert ( parse_to_timestamp( - "2012-01-14 15:30:54", "%Y-%m-%d %H:%M:%S", five_hours_behind + '2012-01-14 15:30:54', '%Y-%m-%d %H:%M:%S', five_hours_behind ) == 1326573054000 ) @@ -54,7 +54,7 @@ def test_when_format_has_tz(self): # if UTC was used instead of the tz in the formatted string, we'd expect to get # 1326555054000 as the result assert ( - parse_to_timestamp("2012-01-14 15:30:54 +0300", "%Y-%m-%d %H:%M:%S %z") + parse_to_timestamp('2012-01-14 15:30:54 +0300', '%Y-%m-%d %H:%M:%S %z') == 1326544254000 ) @@ -64,13 +64,13 @@ def test_when_format_has_tz_and_we_give_tz(self): # it is ignored because the timezone is specified in the formatted string assert ( parse_to_timestamp( - "2012-01-14 15:30:54 +0300", "%Y-%m-%d %H:%M:%S %z", ten_hours_ahead + '2012-01-14 15:30:54 +0300', '%Y-%m-%d %H:%M:%S %z', ten_hours_ahead ) == 1326544254000 ) -@freeze_time("2012-01-14 12:00:01") +@freeze_time('2012-01-14 12:00:01') def test_now(): assert now() == 1326542401000