4 changes: 3 additions & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.5.0
current_version = 1.0.0
commit = True
tag = True
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>(rc|dev))(?P<build>\d+))?
@@ -18,3 +18,5 @@ values =
first_value = 1

[bumpversion:file:pyproject.toml]
search = version = "{current_version}"
replace = version = "{new_version}"
8 changes: 5 additions & 3 deletions .github/workflows/coverage.yaml
@@ -1,5 +1,4 @@
name: Coverage

on:
push:
branches:
@@ -9,20 +8,23 @@ on:
pull_request:
branches:
- main

- dev
jobs:
coverage:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
- name: Set up Python 3.9
uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install poetry
uses: abatilo/actions-poetry@v2
- name: Setup a local virtual environment
run: |
poetry config virtualenvs.create true --local
poetry config virtualenvs.in-project true --local
poetry lock
- uses: actions/cache@v3
name: Define a cache for the virtual environment based on the dependencies lock file
with:
11 changes: 8 additions & 3 deletions .github/workflows/lint.yaml
@@ -1,21 +1,26 @@
name: Lint

on:
push:

pull_request:
branches:
- main
- dev
jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
- name: Set up Python 3.9
uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install poetry
uses: abatilo/actions-poetry@v2
- name: Setup a local virtual environment
run: |
poetry config virtualenvs.create true --local
poetry config virtualenvs.in-project true --local
poetry lock
- uses: actions/cache@v3
name: Define a cache for the virtual environment based on the dependencies lock file
with:
5 changes: 3 additions & 2 deletions .github/workflows/main.yaml
@@ -5,7 +5,6 @@ on:
- main
- dev
- issue/**

jobs:
validate_focus:
runs-on: ubuntu-latest
@@ -14,8 +13,10 @@ jobs:
steps:
- name: Check out repository code
uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
- name: Set up Python 3.9
uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install poetry
uses: abatilo/actions-poetry@v2
- name: Setup a local virtual environment
8 changes: 4 additions & 4 deletions .github/workflows/publish.yaml
@@ -4,22 +4,22 @@ on:
tags:
- 'v\d\.\d\.\d'
- 'v\d\.\d\.\d-(dev|rc)\d'

jobs:
publish:
permissions:
id-token: write
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python 3.8
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.8
python-version: 3.11
- name: Install poetry
uses: abatilo/actions-poetry@v2
- name: Install dependencies
run: |
poetry build
find -type l -exec bash -c 'ln -f "$(readlink -m "$0")" "$0"' {} \;
poetry build --format=sdist
- name: Publish package distributions to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
12 changes: 5 additions & 7 deletions .github/workflows/unittest.yaml
@@ -1,5 +1,4 @@
name: Unittest

on:
push:
branches:
@@ -9,15 +8,14 @@ on:
pull_request:
branches:
- main

- dev
jobs:
test:
runs-on: ubuntu-latest

runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: [ "3.8", "3.9", "3.10", "3.11" ]

python-version: [ "3.9", "3.10", "3.11", "3.12" ]
os: [ windows-latest, ubuntu-latest, macos-latest ]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
@@ -35,7 +33,7 @@ jobs:
name: Define a cache for the virtual environment based on the dependencies lock file
with:
path: ./.venv
key: venv-${{ hashFiles('poetry.lock') }}
key: venv-${{ hashFiles('poetry.lock') }}-${{ matrix.os }}-${{ matrix.python-version }}
- name: Install dependencies
run: |
poetry install
26 changes: 25 additions & 1 deletion README.md
@@ -10,7 +10,7 @@ tbd

### Prerequisites

- Python 3.8+
- Python 3.9+
- Poetry (Package & Dependency Manager)

### Installation
@@ -40,6 +40,22 @@ Using Poetry, you can install the project's dependencies with:
```bash
poetry install
```
#### 4. Downgrade Multimethod

By default, Multimethod 2.0 is installed, but focus_validator works with 1.9.1:

```bash
poetry add "multimethod@1.9.1"
```

#### 5. Install Virtual Environment Shell

The shell plugin is not installed by default, so you need to install it:

```bash
poetry self add poetry-plugin-shell
```


## Usage

@@ -67,6 +83,14 @@ poetry run pytest

Ensure you have `pytest` defined as a development dependency in your `pyproject.toml`.

If you are running on a legacy CPU and the tests crash inside the polars library, run the following locally only:

```bash
poetry add polars-lts-cpu
```

This installs a polars build that matches your system hardware. It should NOT be committed back into the repository.

## License

This project is licensed under the MIT License - see the `LICENSE` file for details.
21 changes: 21 additions & 0 deletions build.py
@@ -0,0 +1,21 @@
import os
import pathlib
import shutil
import yaml

def copy_rules(basedir):
with open(os.path.join(basedir, 'version_sets.yaml'), 'r') as file:
version_sets = yaml.safe_load(file)

for version, base_files in version_sets.items():
dest = os.path.join(basedir, 'version_sets', version)
if os.path.exists(dest):
shutil.rmtree(dest)
pathlib.Path(dest).mkdir(parents=True)
for f in base_files:
src_file = os.path.join(basedir, 'base_rule_definitions', f)
dest_file = os.path.join(dest, f)
shutil.copyfile(src_file, dest_file)

if __name__ == "__main__":
copy_rules(basedir='focus_validator/rules')
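For context, `copy_rules` above assumes that `version_sets.yaml` maps each version-set name to the list of base rule definition files to copy. A minimal sketch of that assumed structure; the version label and file names below are hypothetical, not taken from the repository:

```python
import yaml

# Hypothetical contents of focus_validator/rules/version_sets.yaml:
# a mapping of version-set name -> base rule definition file names.
example = """
"1.0":
  - billed_cost.yaml
  - charge_category.yaml
"""

version_sets = yaml.safe_load(example)
for version, base_files in version_sets.items():
    # copy_rules would create version_sets/<version>/ and copy each listed
    # file into it from base_rule_definitions/.
    print(version, base_files)
```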
25 changes: 24 additions & 1 deletion focus_validator/config_objects/common.py
@@ -1,7 +1,8 @@
from enum import Enum
from typing import List, Literal

from pydantic import BaseModel
import sqlglot
from pydantic import BaseModel, field_validator


class AllowNullsCheck(BaseModel):
@@ -12,6 +13,22 @@ class ValueInCheck(BaseModel):
value_in: List[str]


class SQLQueryCheck(BaseModel):
sql_query: str

@field_validator("sql_query")
def check_sql_query(cls, sql_query):
returned_columns = [
column.alias
for column in sqlglot.parse_one(sql_query).find_all(sqlglot.exp.Alias)
]

assert returned_columns == [
"check_output"
], "SQL query must only return a column called 'check_output'"
return sql_query


SIMPLE_CHECKS = Literal["check_unique", "column_required"]


@@ -20,6 +37,7 @@ class DataTypes(Enum):
DECIMAL = "decimal"
DATETIME = "datetime"
CURRENCY_CODE = "currency-code"
STRINGIFIED_JSON_OBJECT = "stringified-json-object"


class DataTypeCheck(BaseModel):
@@ -50,3 +68,8 @@ def generate_check_friendly_name(check, column_id):
return f"{column_id} does not allow null values."
elif isinstance(check, DataTypeCheck):
return f"{column_id} requires values of type {check.data_type.value}."
elif isinstance(check, SQLQueryCheck):
sql_query = " ".join([word.strip() for word in check.sql_query.split()])
return f"{column_id} requires values that return true when evaluated by the following SQL query: {sql_query}"
else:
raise NotImplementedError(f"Check {check} not implemented.")
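The `SQLQueryCheck` validator added above only accepts queries whose aliased output is a single column named `check_output`. A minimal sketch of that check, assuming `sqlglot` is installed; the query text itself is hypothetical:

```python
import sqlglot

# Hypothetical rule query; only the alias of the returned column matters here.
sql_query = "SELECT BilledCost >= 0 AS check_output FROM df"

returned_columns = [
    column.alias
    for column in sqlglot.parse_one(sql_query).find_all(sqlglot.exp.Alias)
]
assert returned_columns == ["check_output"]
```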
@@ -1,7 +1,10 @@
import os
from itertools import groupby
from typing import Dict, List, Optional, Set, Union

import pandas as pd
import pandera as pa
import sqlglot
from pandera.api.pandas.types import PandasDtypeInputTypes

from focus_validator.config_objects import ChecklistObject, InvalidRule, Rule
@@ -10,11 +13,25 @@
ChecklistObjectStatus,
DataTypeCheck,
DataTypes,
SQLQueryCheck,
ValueInCheck,
)
from focus_validator.config_objects.override import Override
from focus_validator.exceptions import FocusNotImplementedError

# The group index column adds a per-row index that is included in the groupby keys; without it, the default
# groupby does not carry every row of the dataframe forward, so row numbers are lost.
GROUP_INDEX_COLUMN = "group_index_column"


def __groupby_fnc__(df: pd.DataFrame, column_alias: List[str]):
"""
Custom groupby function to be used with pandera check_sql_query, allowing null values
Default groupby function does not allow null values
"""
df[GROUP_INDEX_COLUMN] = range(0, len(df))
return df.groupby(column_alias + [GROUP_INDEX_COLUMN], dropna=False)


class FocusToPanderaSchemaConverter:
@staticmethod
@@ -40,9 +57,22 @@ def __generate_pandera_check__(rule: Rule, check_id):
return pa.Check.check_value_in(
allowed_values=check.value_in, error=error_string
)
elif isinstance(check, SQLQueryCheck):
column_alias = [
column.alias_or_name
for column in sqlglot.parse_one(check.sql_query).find_all(
sqlglot.exp.Column
)
]
return pa.Check.check_sql_query(
sql_query=check.sql_query,
error=error_string,
column_alias=column_alias,
groupby=lambda df: __groupby_fnc__(df=df, column_alias=column_alias),
)
elif isinstance(check, AllowNullsCheck):
return pa.Check.check_not_null(
error=error_string, ignore_na=False, allow_nulls=check.allow_nulls
error=error_string, ignore_na=check.allow_nulls
)
else:
raise FocusNotImplementedError(
@@ -77,6 +107,14 @@ def __generate_column_definition__(
error=f"{rule.check_id}:::Ensures that column is of {data_type.value} type.",
)
)
elif data_type == DataTypes.STRINGIFIED_JSON_OBJECT:
pandera_type = None
column_checks.append(
pa.Check.check_stringified_json_object_dtype(
ignore_na=True,
error=f"{rule.check_id}:::Ensures that column is of {data_type.value} type.",
)
)
else:
pandera_type = pa.String

@@ -151,7 +189,7 @@ def generate_pandera_schema(
for rule in rules:
if isinstance(rule, InvalidRule):
checklist[rule.rule_path] = ChecklistObject(
check_name=rule.rule_path,
check_name=os.path.splitext(os.path.basename(rule.rule_path))[0],
column_id="Unknown",
error=f"{rule.error_type}: {rule.error}",
status=ChecklistObjectStatus.ERRORED,
@@ -180,4 +218,7 @@ def generate_pandera_schema(
overrides=overrides,
schema_dict=schema_dict,
)
return pa.DataFrameSchema(schema_dict, strict=False), checklist
return (
pa.DataFrameSchema(schema_dict, strict=False),
checklist,
)
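The `__groupby_fnc__` helper added above exists because pandas drops rows with null group keys by default, which would lose row numbers in the check output. A minimal sketch of that behaviour using pandas directly; the column name and values are hypothetical:

```python
import pandas as pd

GROUP_INDEX_COLUMN = "group_index_column"

# Hypothetical data with one null value in the grouped column.
df = pd.DataFrame({"BilledCost": [1.0, None, 3.0]})

# Default groupby drops the null key, so one row disappears from the result.
assert len(df.groupby(["BilledCost"])) == 2

# Adding a per-row index column and grouping with dropna=False keeps every row
# in its own group, so row numbers survive for the check output.
df[GROUP_INDEX_COLUMN] = range(0, len(df))
assert len(df.groupby(["BilledCost", GROUP_INDEX_COLUMN], dropna=False)) == len(df)
```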
2 changes: 1 addition & 1 deletion focus_validator/config_objects/override.py
@@ -1,7 +1,7 @@
from typing import List

import yaml
from pydantic import BaseModel
from pydantic.v1 import BaseModel


class Override(BaseModel):