From 336121621a3a41206203e681b58e249df33c1c87 Mon Sep 17 00:00:00 2001 From: Nayera Abdessalam Date: Wed, 12 Nov 2025 16:58:04 -0700 Subject: [PATCH 01/33] separating out a glob function from the parse function, doesn't work with debug yet --- mod/hallmark/core.py | 72 ++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index c6ad40d..73725bb 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -69,7 +69,43 @@ def filter(self, **kwargs): return self[mask] @classmethod - def parse(cls, fmt, *args, debug=False, **kwargs): + def glob_search(cls, fmt, *args, debug=False, **kwargs): + pmax = len(fmt) // 3 # to specify a parameter, we need at least + # three characters '{p}'; the maximum number + # of possible parameters is `len(fmt) // 3`. + + # Construct the glob pattern for search files + pattern = fmt + for i in range(pmax): + if debug: + print(i, pattern, args, kwargs) + try: + pattern = pattern.format(*args, **kwargs) + break + except KeyError as e: + k = e.args[0] + pattern = re.sub(r'\{'+k+r':?.*?\}', '{'+k+':s}', pattern) + kwargs[e.args[0]] = '*' + + # Obtain list of files based on the glob pattern + files = sorted(glob(pattern)) + + # Print the glob pattern and a summary of matches + if debug: + print(f'Pattern: "{pattern}"') + n = len(files) + if n > 1: + print(f'{n} matches, e.g., "{files[0]}"') + elif n > 0: + print(f'{n} match, i.e., "{files[0]}"') + else: + print(f'No match; please check format string') + + return files + + + @classmethod + def parse(cls, fmt): """ Construct a ``ParaFrame`` by parsing file paths that match a pattern. @@ -107,40 +143,10 @@ def parse(cls, fmt, *args, debug=False, **kwargs): 0 data/run1_p10.csv 1 10 1 data/run2_p20.csv 2 20 """ - pmax = len(fmt) // 3 # to specify a parameter, we need at least - # three characters '{p}'; the maximum number - # of possible parameters is `len(fmt) // 3`. 
- - # Construct the glob pattern for search files - pattern = fmt - for i in range(pmax): - if debug: - print(i, pattern, args, kwargs) - try: - pattern = pattern.format(*args, **kwargs) - break - except KeyError as e: - k = e.args[0] - pattern = re.sub(r'\{'+k+r':?.*?\}', '{'+k+':s}', pattern) - kwargs[e.args[0]] = '*' - - # Obtain list of files based on the glob pattern - files = sorted(glob(pattern)) - - # Print the glob pattern and a summary of matches - if debug: - print(f'Pattern: "{pattern}"') - n = len(files) - if n > 1: - print(f'{n} matches, e.g., "{files[0]}"') - elif n > 0: - print(f'{n} match, i.e., "{files[0]}"') - else: - print(f'No match; please check format string') - + # Parse list of file names back to parameters parser = parse.compile(fmt) - + files = ParaFrame.glob_search(fmt) l = [] for f in files: r = parser.parse(f) From fe4fcb38557bb1d6792f8a7a397dd7c669bd13a8 Mon Sep 17 00:00:00 2001 From: Ram Adithya Date: Wed, 19 Nov 2025 16:21:22 -0700 Subject: [PATCH 02/33] Updated core.py file with the debug issue fixed along with the latest demo file --- demos/ParaFrame.ipynb | 90 ++++++++++--------------------------------- mod/hallmark/core.py | 37 +++++++++--------- 2 files changed, 40 insertions(+), 87 deletions(-) diff --git a/demos/ParaFrame.ipynb b/demos/ParaFrame.ipynb index 7696832..79e4427 100644 --- a/demos/ParaFrame.ipynb +++ b/demos/ParaFrame.ipynb @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "forced-windows", "metadata": {}, "outputs": [ @@ -181,7 +181,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "id": "immediate-girlfriend", "metadata": {}, "outputs": [], @@ -191,17 +191,17 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "id": "level-carol", "metadata": {}, "outputs": [], "source": [ - "pf = ParaFrame(\"data/a_{a:d}/b_{b:d}.txt\")" + "pf = ParaFrame.parse(\"data/a_{a:d}/b_{b:d}.txt\")" ] }, { "cell_type": "code", - 
"execution_count": 8, + "execution_count": 4, "id": "friendly-compatibility", "metadata": {}, "outputs": [ @@ -320,7 +320,7 @@ "[100 rows x 3 columns]" ] }, - "execution_count": 8, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -341,18 +341,10 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "id": "loved-statistics", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/anaconda3/envs/astro_se_int/lib/python3.13/site-packages/hallmark/core.py:49: FutureWarning: Logical ops (and, or, xor) between Pandas objects and dtype-less sequences (e.g. list, tuple) are deprecated and will raise in a future version. Wrap the object in a Series, Index, or np.array before operating instead.\n", - " mask |= self[k] == v\n" - ] - }, { "data": { "text/html": [ @@ -458,7 +450,7 @@ "9 data/a_0/b_19.txt 0 19" ] }, - "execution_count": 9, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -470,20 +462,12 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "id": "guilty-liberty", "metadata": { "scrolled": true }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/anaconda3/envs/astro_se_int/lib/python3.13/site-packages/hallmark/core.py:47: FutureWarning: Logical ops (and, or, xor) between Pandas objects and dtype-less sequences (e.g. list, tuple) are deprecated and will raise in a future version. 
Wrap the object in a Series, Index, or np.array before operating instead.\n", - " mask |= self[k].isin(v)\n" - ] - }, { "data": { "text/html": [ @@ -659,7 +643,7 @@ "19 data/a_1/b_19.txt 1 19" ] }, - "execution_count": 10, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -671,20 +655,12 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 7, "id": "british-craps", "metadata": { "scrolled": true }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/anaconda3/envs/astro_se_int/lib/python3.13/site-packages/hallmark/core.py:49: FutureWarning: Logical ops (and, or, xor) between Pandas objects and dtype-less sequences (e.g. list, tuple) are deprecated and will raise in a future version. Wrap the object in a Series, Index, or np.array before operating instead.\n", - " mask |= self[k] == v\n" - ] - }, { "data": { "text/html": [ @@ -853,7 +829,7 @@ "90 data/a_9/b_10.txt 9 10" ] }, - "execution_count": 11, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -865,20 +841,12 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 8, "id": "sapphire-analysis", "metadata": { "scrolled": true }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/anaconda3/envs/astro_se_int/lib/python3.13/site-packages/hallmark/core.py:49: FutureWarning: Logical ops (and, or, xor) between Pandas objects and dtype-less sequences (e.g. list, tuple) are deprecated and will raise in a future version. 
Wrap the object in a Series, Index, or np.array before operating instead.\n", - " mask |= self[k] == v\n" - ] - }, { "data": { "text/html": [ @@ -921,7 +889,7 @@ "0 data/a_0/b_10.txt 0 10" ] }, - "execution_count": 12, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -933,7 +901,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "id": "modular-background", "metadata": { "scrolled": true @@ -1184,7 +1152,7 @@ "49 data/a_4/b_19.txt 4 19" ] }, - "execution_count": 13, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -1206,7 +1174,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "id": "lasting-clear", "metadata": {}, "outputs": [ @@ -1234,14 +1202,6 @@ "Doing something with file \"data/a_8/b_10.txt\"...\n", "Doing something with file \"data/a_9/b_10.txt\"...\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/anaconda3/envs/astro_se_int/lib/python3.13/site-packages/hallmark/core.py:49: FutureWarning: Logical ops (and, or, xor) between Pandas objects and dtype-less sequences (e.g. list, tuple) are deprecated and will raise in a future version. 
Wrap the object in a Series, Index, or np.array before operating instead.\n", - " mask |= self[k] == v\n" - ] } ], "source": [ @@ -1261,7 +1221,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 11, "id": "developmental-luther", "metadata": {}, "outputs": [ @@ -1278,23 +1238,15 @@ } ], "source": [ - "pf = ParaFrame(\"data/a_{a:d}/b_{b:d}.txt\", debug=True)" + "pf = ParaFrame.parse(\"data/a_{a:d}/b_{b:d}.txt\", debug=True)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "corrected-divorce", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python (astro_se_int)", + "display_name": "hallmark-313", "language": "python", - "name": "astro_se_int" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index 73725bb..4f1d7e9 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -21,6 +21,7 @@ import pandas as pd import numpy as np + class ParaFrame(pd.DataFrame): """ A subclass of :class:`pandas.DataFrame` with added methods for @@ -32,21 +33,22 @@ class ParaFrame(pd.DataFrame): parameters from a format pattern (using ``glob`` + ``parse``). * ``__call__``/``filter``: convenience filtering by column values. """ + @property def _constructor(self): return ParaFrame def __call__(self, **kwds): return self.filter(**kwds) - + def filter(self, **kwargs): """ Filter a pandas ``DataFrame`` by matching column values. This function utlizes provided **kwargs to filter an existing - ``ParaFrame`` by masking based on column values. Filtering supports - single- and multi-conditioned queries, returning rows that satisfy - any of the provided conditions. + ``ParaFrame`` by masking based on column values. Filtering supports + single- and multi-conditioned queries, returning rows that satisfy + any of the provided conditions. 
Args: **kwargs: Arbitrary keyword arguments specifying column names @@ -63,7 +65,7 @@ def filter(self, **kwargs): mask = [False] * len(self) for k, v in kwargs.items(): if isinstance(v, (tuple, list)): - mask |= np.isin(np.array(self[k]),np.array(v)) + mask |= np.isin(np.array(self[k]), np.array(v)) else: mask |= np.array(self[k]) == v return self[mask] @@ -71,8 +73,8 @@ def filter(self, **kwargs): @classmethod def glob_search(cls, fmt, *args, debug=False, **kwargs): pmax = len(fmt) // 3 # to specify a parameter, we need at least - # three characters '{p}'; the maximum number - # of possible parameters is `len(fmt) // 3`. + # three characters '{p}'; the maximum number + # of possible parameters is `len(fmt) // 3`. # Construct the glob pattern for search files pattern = fmt @@ -84,14 +86,14 @@ def glob_search(cls, fmt, *args, debug=False, **kwargs): break except KeyError as e: k = e.args[0] - pattern = re.sub(r'\{'+k+r':?.*?\}', '{'+k+':s}', pattern) - kwargs[e.args[0]] = '*' + pattern = re.sub(r"\{" + k + r":?.*?\}", "{" + k + ":s}", pattern) + kwargs[e.args[0]] = "*" # Obtain list of files based on the glob pattern files = sorted(glob(pattern)) - + # Print the glob pattern and a summary of matches - if debug: + if debug == True: print(f'Pattern: "{pattern}"') n = len(files) if n > 1: @@ -99,13 +101,12 @@ def glob_search(cls, fmt, *args, debug=False, **kwargs): elif n > 0: print(f'{n} match, i.e., "{files[0]}"') else: - print(f'No match; please check format string') - + print(f"No match; please check format string") + return files - @classmethod - def parse(cls, fmt): + def parse(cls, fmt, debug=False): """ Construct a ``ParaFrame`` by parsing file paths that match a pattern. 
@@ -143,15 +144,15 @@ def parse(cls, fmt): 0 data/run1_p10.csv 1 10 1 data/run2_p20.csv 2 20 """ - + # Parse list of file names back to parameters parser = parse.compile(fmt) - files = ParaFrame.glob_search(fmt) + files = ParaFrame.glob_search(fmt, debug=debug) l = [] for f in files: r = parser.parse(f) if r is None: print(f'Failed to parse "{f}"') else: - l.append({'path':f, **r.named}) + l.append({"path": f, **r.named}) return cls(l) From ba70f39474fc47c95e2fb47ba1130613a876994d Mon Sep 17 00:00:00 2001 From: Nayera Abdessalam Date: Sat, 10 Jan 2026 17:37:34 -0700 Subject: [PATCH 03/33] finalizing glob/parse method split, setting glob up for testing --- mod/hallmark/core.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index e7ea7db..e50b55c 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -71,7 +71,7 @@ def filter(self, **kwargs): return self[mask] @classmethod - def glob_search(cls, fmt, *args, debug=False, **kwargs): + def glob_search(cls, fmt, *args, debug=False, return_pattern=False,**kwargs): pmax = len(fmt) // 3 # to specify a parameter, we need at least # three characters '{p}'; the maximum number # of possible parameters is `len(fmt) // 3`. @@ -103,10 +103,10 @@ def glob_search(cls, fmt, *args, debug=False, **kwargs): else: print(f"No match; please check format string") - return files + return (files, pattern) if return_pattern else files @classmethod - def parse(cls, fmt, debug=False): + def parse(cls, fmt, *args, debug=False, **kwargs): """ Construct a ``ParaFrame`` by parsing file paths that match a pattern. 
@@ -147,8 +147,8 @@ def parse(cls, fmt, debug=False): # Parse list of file names back to parameters parser = parse.compile(fmt) + files = cls.glob_search(fmt, *args, debug=debug, **kwargs) - files = ParaFrame.glob_search(fmt, debug=debug) frame = [] for f in files: r = parser.parse(f) From c978132737988ae31e0bead44deb36ae7ed164e7 Mon Sep 17 00:00:00 2001 From: Nayera Abdessalam Date: Sun, 11 Jan 2026 16:23:54 -0700 Subject: [PATCH 04/33] added first glob method unit test --- tests/test_paraframe.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_paraframe.py b/tests/test_paraframe.py index 173aab5..2b683b1 100644 --- a/tests/test_paraframe.py +++ b/tests/test_paraframe.py @@ -1,6 +1,7 @@ import pandas as pd import pytest from hallmark import ParaFrame +from pathlib import Path @pytest.fixture def create_ParaFrame(create_temp_data): @@ -28,4 +29,9 @@ def test_all_txt_files_b10_through_b19_get_created(create_ParaFrame): def test_pandas_method_on_pf(create_ParaFrame): pf = create_ParaFrame - assert isinstance(pf.head(), pd.DataFrame) \ No newline at end of file + assert isinstance(pf.head(), pd.DataFrame) + +def test_glob_string_format(create_temp_data): + fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") + pattern = ParaFrame.glob_search(fmt, a=0, return_pattern=True)[1] + assert Path(pattern).as_posix().endswith("/a_0/b_*.txt") \ No newline at end of file From 9bfaf2913afbbe7479c2c3fcd74bfbaca58397f0 Mon Sep 17 00:00:00 2001 From: Nayera Abdessalam Date: Sun, 11 Jan 2026 16:56:52 -0700 Subject: [PATCH 05/33] additional glob/parse method separation unit testing --- tests/test_paraframe.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/test_paraframe.py b/tests/test_paraframe.py index 2b683b1..2008cf3 100644 --- a/tests/test_paraframe.py +++ b/tests/test_paraframe.py @@ -34,4 +34,15 @@ def test_pandas_method_on_pf(create_ParaFrame): def test_glob_string_format(create_temp_data): fmt = 
str(create_temp_data / "a_{a:d}/b_{b:d}.txt") pattern = ParaFrame.glob_search(fmt, a=0, return_pattern=True)[1] - assert Path(pattern).as_posix().endswith("/a_0/b_*.txt") \ No newline at end of file + assert Path(pattern).as_posix().endswith("/a_0/b_*.txt") + +def test_glob_method_returns_files(create_temp_data): + fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") + files = ParaFrame.glob_search(fmt, a=0, return_pattern=True)[0] + assert len(files) == 10 + +def test_parse_with_filter_arg(create_temp_data): + fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") + pf = ParaFrame.parse(fmt, a=0) + assert pf.shape == (10, 3) + assert pf["a"].unique() == 0 From 940f836d0a5de7ae189a396cc5e6ecb0592dbc34 Mon Sep 17 00:00:00 2001 From: Nayera Abdessalam Date: Mon, 12 Jan 2026 15:59:02 -0700 Subject: [PATCH 06/33] added pytest fixtures and three xfail tests for formatter testing with spin values --- demos/ParaFrame.ipynb | 3 --- tests/conftest.py | 18 +++++++++++++++++- tests/test_paraframe.py | 37 ++++++++++++++++++++++++++++++++++--- 3 files changed, 51 insertions(+), 7 deletions(-) diff --git a/demos/ParaFrame.ipynb b/demos/ParaFrame.ipynb index f6a05db..309fb98 100644 --- a/demos/ParaFrame.ipynb +++ b/demos/ParaFrame.ipynb @@ -197,7 +197,6 @@ "outputs": [], "source": [ "pf = ParaFrame.parse(\"data/a_{a:d}/b_{b:d}.txt\")" - "pf = ParaFrame.parse(\"data/a_{a:d}/b_{b:d}.txt\")" ] }, { @@ -1240,7 +1239,6 @@ ], "source": [ "pf = ParaFrame.parse(\"data/a_{a:d}/b_{b:d}.txt\", debug=True)" - "pf = ParaFrame.parse(\"data/a_{a:d}/b_{b:d}.txt\", debug=True)" ] } ], @@ -1249,7 +1247,6 @@ "display_name": "hallmark-313", "language": "python", "name": "python3" - "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/tests/conftest.py b/tests/conftest.py index 62addee..c4465c6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,10 @@ import pytest +def spin_format(val): + if val == 0: + return "0" + return f"{val:+g}" + @pytest.fixture(scope = 
"function") def create_temp_data(tmp_path): data_dir = tmp_path / "data" @@ -8,4 +13,15 @@ def create_temp_data(tmp_path): subdir.mkdir(parents=True) for b in range(10, 20): (subdir / f"b_{b}.txt").touch() - return data_dir \ No newline at end of file + return data_dir + +@pytest.fixture(scope = "function") +def create_temp_data_spin(tmp_path): + data_dir = tmp_path / "data" + spins = [-0.5, 0.0, 0.5] + for a in spins: + subdir = data_dir / f"a{spin_format(a)}" + subdir.mkdir(parents=True) + for b in range(10, 20): + (subdir / f"b_{b}.txt").touch() + return data_dir diff --git a/tests/test_paraframe.py b/tests/test_paraframe.py index 2008cf3..cabf8e6 100644 --- a/tests/test_paraframe.py +++ b/tests/test_paraframe.py @@ -1,13 +1,17 @@ import pandas as pd import pytest from hallmark import ParaFrame -from pathlib import Path @pytest.fixture def create_ParaFrame(create_temp_data): fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") return ParaFrame.parse(fmt, debug = True) +@pytest.fixture +def create_ParaFrame_spin(create_temp_data_spin): + fmt = str(create_temp_data_spin / "a_{a:spin}/b_{b:d}.txt") + return ParaFrame.parse(fmt, debug = True) + def test_type_of_ParaFrame(create_ParaFrame): assert isinstance(create_ParaFrame, ParaFrame) @@ -15,6 +19,11 @@ def test_shape_of_ParaFrame(create_ParaFrame): pf = create_ParaFrame assert pf.shape == (100,3) +def test_column_dtype(create_ParaFrame): + pf = create_ParaFrame + assert pd.api.types.is_integer_dtype(pf["a"]) + assert pd.api.types.is_integer_dtype(pf["b"]) + def test_column_names_in_ParaFrame(create_ParaFrame): pf = create_ParaFrame assert set(pf.columns) == {"path","a","b"} @@ -34,15 +43,37 @@ def test_pandas_method_on_pf(create_ParaFrame): def test_glob_string_format(create_temp_data): fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") pattern = ParaFrame.glob_search(fmt, a=0, return_pattern=True)[1] - assert Path(pattern).as_posix().endswith("/a_0/b_*.txt") + norm = pattern.replace("\\", "/") # standardize 
output for Mac and PC OS + assert norm.endswith("/a_0/b_*.txt") def test_glob_method_returns_files(create_temp_data): fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") files = ParaFrame.glob_search(fmt, a=0, return_pattern=True)[0] assert len(files) == 10 -def test_parse_with_filter_arg(create_temp_data): +def test_parse_method_with_added_filter_arg(create_temp_data): fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") pf = ParaFrame.parse(fmt, a=0) assert pf.shape == (10, 3) assert pf["a"].unique() == 0 + +@pytest.mark.xfail(strict=True, reason="Formatter issue solution not yet implemented") +def test_glob_method_accepts_spin_formatter_type_and_builds_glob_method(create_temp_data_spin): + fmt = str(create_temp_data_spin / "a{a:spin}/b_{b:d}.txt") + files, pattern = ParaFrame.glob_search(fmt, a=0.5, return_pattern=True) + norm = pattern.replace("\\", "/") # standardize output for Mac and PC OS + assert norm.endswith("/a+0.5/b_*.txt") + assert len(files) == 10 + +@pytest.mark.xfail(strict=True, reason="Formatter issue solution not yet implemented") +def test_parse_produces_float_spin_column(create_ParaFrame_spin): + pf = create_ParaFrame_spin + assert pd.api.types.is_float_dtype(pf["a"]) + assert set(pf["a"].unique()) == {-0.5, 0.0, 0.5} + +@pytest.mark.xfail(strict=True, reason="Formatter issue solution not yet implemented") +def test_filtering_by_numeric_spin(create_ParaFrame_spin): + pf = create_ParaFrame_spin + pf_filtered = pf(a=0.5) + assert len(pf_filtered) == 3 + assert set(pf_filtered["a"].unique()) == {0.5} \ No newline at end of file From 9a1063adce3bd9c9bc19bab9a8f64f8dec66679c Mon Sep 17 00:00:00 2001 From: Nayera Abdessalam Date: Fri, 30 Jan 2026 10:54:25 -0700 Subject: [PATCH 07/33] skeleton build of a string.Formatter subclass with logic not yet implemented --- mod/hallmark/core.py | 18 ++++++++++++++++++ tests/test_paraframe.py | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index 
e50b55c..aaa66f9 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -20,6 +20,7 @@ import parse import pandas as pd import numpy as np +import string # added for Formatter subclassing class ParaFrame(pd.DataFrame): @@ -157,3 +158,20 @@ def parse(cls, fmt, *args, debug=False, **kwargs): else: frame.append({'path':f, **r.named}) return cls(frame) + +class newFormatter(string.Formatter): + + def format_field(self, value, type): + # define default formatting for spin values according to EHT standards + + # need to add some kind of .get("Formatter") that acesses the yml file + if type == "aspin": + # here we need to take in the information in the "formatter" section of the .yml file + # so that each special case of spin types are accounted for. Will probably be a return + # statement in this that uses whatever is in the .yml spin type dictionary + + # otherwise we return a default for all standard spins? + return f'{value:+g}' + + # otherwise, all other data types keep their default behavior + return super().format_field(value, type) diff --git a/tests/test_paraframe.py b/tests/test_paraframe.py index cabf8e6..a520db3 100644 --- a/tests/test_paraframe.py +++ b/tests/test_paraframe.py @@ -43,7 +43,7 @@ def test_pandas_method_on_pf(create_ParaFrame): def test_glob_string_format(create_temp_data): fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") pattern = ParaFrame.glob_search(fmt, a=0, return_pattern=True)[1] - norm = pattern.replace("\\", "/") # standardize output for Mac and PC OS + norm = pattern.replace("\\", "/") # standardize output for Mac and PC assert norm.endswith("/a_0/b_*.txt") def test_glob_method_returns_files(create_temp_data): From 4863ea06be4a8b659b209071ff4dc7cadf2fa1a6 Mon Sep 17 00:00:00 2001 From: HaydenMarchinek Date: Tue, 3 Feb 2026 17:02:47 -0700 Subject: [PATCH 08/33] Updated core file with new_fmt to allow for strings in the input fmt. 
Also created the encodings.yaml --- mod/hallmark/core.py | 26 +++++--------------------- mod/hallmark/encodings.yaml | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 21 deletions(-) create mode 100644 mod/hallmark/encodings.yaml diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index aaa66f9..994052f 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -88,6 +88,7 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False,**kwargs): except KeyError as e: k = e.args[0] pattern = re.sub(r"\{" + k + r":?.*?\}", "{" + k + ":s}", pattern) + new_fmt = re.sub(r"\{" + k + r":?.*?\}", "{" + k + ":d}", new_fmt) kwargs[e.args[0]] = "*" # Obtain list of files based on the glob pattern @@ -104,7 +105,7 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False,**kwargs): else: print(f"No match; please check format string") - return (files, pattern) if return_pattern else files + return (files, pattern) if return_pattern else (new_fmt, files) @classmethod def parse(cls, fmt, *args, debug=False, **kwargs): @@ -147,8 +148,8 @@ def parse(cls, fmt, *args, debug=False, **kwargs): """ # Parse list of file names back to parameters - parser = parse.compile(fmt) - files = cls.glob_search(fmt, *args, debug=debug, **kwargs) + new_fmt, files = cls.glob_search(fmt, *args, debug=debug, **kwargs) + parser = parse.compile(new_fmt) frame = [] for f in files: @@ -157,21 +158,4 @@ def parse(cls, fmt, *args, debug=False, **kwargs): print(f'Failed to parse "{f}"') else: frame.append({'path':f, **r.named}) - return cls(frame) - -class newFormatter(string.Formatter): - - def format_field(self, value, type): - # define default formatting for spin values according to EHT standards - - # need to add some kind of .get("Formatter") that acesses the yml file - if type == "aspin": - # here we need to take in the information in the "formatter" section of the .yml file - # so that each special case of spin types are accounted for. 
Will probably be a return - # statement in this that uses whatever is in the .yml spin type dictionary - - # otherwise we return a default for all standard spins? - return f'{value:+g}' - - # otherwise, all other data types keep their default behavior - return super().format_field(value, type) + return cls(frame) \ No newline at end of file diff --git a/mod/hallmark/encodings.yaml b/mod/hallmark/encodings.yaml new file mode 100644 index 0000000..2c379f8 --- /dev/null +++ b/mod/hallmark/encodings.yaml @@ -0,0 +1,14 @@ +encodings: + spin_UIUC: + type: float + regex: "m?([0-9]+(\\.[0-9]+)?|\\.[0-9]+)" + transform: + m: "-" + formatting: + format: "+.2f" + + spin_BHAC: + type: float + regex: "[+-]?([0-9]+(\\.[0-9]+)?|\\.[0-9]+)" + fomratting: + format: "+.2f" \ No newline at end of file From 1c80b765d1d04831610c5ca21d6a50bf0c756d95 Mon Sep 17 00:00:00 2001 From: HaydenMarchinek Date: Tue, 3 Feb 2026 17:32:22 -0700 Subject: [PATCH 09/33] added the first helper function --- mod/hallmark/core.py | 11 +++++++++++ mod/hallmark/helper_functions.py | 13 +++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 mod/hallmark/helper_functions.py diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index 994052f..0934134 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -22,6 +22,7 @@ import numpy as np import string # added for Formatter subclassing +from .helper_functions import * class ParaFrame(pd.DataFrame): """ @@ -78,7 +79,14 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False,**kwargs): # of possible parameters is `len(fmt) // 3`. 
# Construct the glob pattern for search files + + ### Load Yaml File ### + parameters = load_encodings_yaml() + print(parameters) + ### Call Pre-processing function ### + pattern = fmt + new_fmt = fmt for i in range(pmax): if debug: print(i, pattern, args, kwargs) @@ -149,6 +157,9 @@ def parse(cls, fmt, *args, debug=False, **kwargs): # Parse list of file names back to parameters new_fmt, files = cls.glob_search(fmt, *args, debug=debug, **kwargs) + + ### Normalizing custom Characters function ### + parser = parse.compile(new_fmt) frame = [] diff --git a/mod/hallmark/helper_functions.py b/mod/hallmark/helper_functions.py new file mode 100644 index 0000000..54ad6fb --- /dev/null +++ b/mod/hallmark/helper_functions.py @@ -0,0 +1,13 @@ +from pathlib import Path +import yaml + +cwd = Path.cwd() +ENCODINGS_YAML = Path(__file__).parent / "encodings.yaml" +print(ENCODINGS_YAML) + +def load_encodings_yaml(path=ENCODINGS_YAML): + f = path.open("r", encoding="utf-8") + data = yaml.safe_load(f) + encodings = data["encodings"] + return encodings + From 3b0d59705560d805bb4eac2845b104486842db0a Mon Sep 17 00:00:00 2001 From: Nayera Abdessalam Date: Wed, 4 Feb 2026 11:24:53 -0700 Subject: [PATCH 10/33] added unit test for yaml loader helper function. fixed yaml file spelling error. changed e2e debug test to xfail, will update after formatting issue resolved. 
--- mod/hallmark/encodings.yaml | 2 +- tests/test_paraframe.py | 10 +++++++++- tests/test_paraframe_e2e.py | 2 ++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/mod/hallmark/encodings.yaml b/mod/hallmark/encodings.yaml index 2c379f8..9a45c3f 100644 --- a/mod/hallmark/encodings.yaml +++ b/mod/hallmark/encodings.yaml @@ -10,5 +10,5 @@ encodings: spin_BHAC: type: float regex: "[+-]?([0-9]+(\\.[0-9]+)?|\\.[0-9]+)" - fomratting: + formatting: format: "+.2f" \ No newline at end of file diff --git a/tests/test_paraframe.py b/tests/test_paraframe.py index a520db3..5bef648 100644 --- a/tests/test_paraframe.py +++ b/tests/test_paraframe.py @@ -1,6 +1,7 @@ import pandas as pd import pytest from hallmark import ParaFrame +from hallmark.helper_functions import * @pytest.fixture def create_ParaFrame(create_temp_data): @@ -76,4 +77,11 @@ def test_filtering_by_numeric_spin(create_ParaFrame_spin): pf = create_ParaFrame_spin pf_filtered = pf(a=0.5) assert len(pf_filtered) == 3 - assert set(pf_filtered["a"].unique()) == {0.5} \ No newline at end of file + assert set(pf_filtered["a"].unique()) == {0.5} + +def test_loading_yaml_file_for_special_formatting(): + parameters = load_encodings_yaml() + assert "spin_UIUC" in parameters + assert "spin_BHAC" in parameters + assert parameters['spin_UIUC']['regex'] == "m?([0-9]+(\.[0-9]+)?|\.[0-9]+)" + assert parameters['spin_UIUC']['transform']['m'] == '-' \ No newline at end of file diff --git a/tests/test_paraframe_e2e.py b/tests/test_paraframe_e2e.py index a931372..7241424 100644 --- a/tests/test_paraframe_e2e.py +++ b/tests/test_paraframe_e2e.py @@ -1,4 +1,5 @@ from hallmark import ParaFrame +import pytest def test_paraframe_class_functionality(create_temp_data): # a user wants to create a paraframe @@ -42,6 +43,7 @@ def test_paraframe_class_functionality(create_temp_data): assert len(mask_filter) == 40 assert all(mask_filter["a"].unique() == [1,2,3,4]) +@pytest.mark.xfail(strict=True, reason="Debug output formatting has been 
changed, test needs updated") def test_debug(create_temp_data, capsys, tmp_path): # users want to see a detailed summary of how ParaFrame utilizes globbing fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") From cc34749b422e46cfe8036eb96c5d60eecb6e7193 Mon Sep 17 00:00:00 2001 From: HaydenMarchinek Date: Wed, 4 Feb 2026 17:13:29 -0700 Subject: [PATCH 11/33] Updated core file with encoding map and removed pre-processing idea --- mod/hallmark/core.py | 86 +++++++++++++++++++------------- mod/hallmark/helper_functions.py | 20 +++++++- 2 files changed, 69 insertions(+), 37 deletions(-) diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index 0934134..0a2a15d 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -78,42 +78,55 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False,**kwargs): # three characters '{p}'; the maximum number # of possible parameters is `len(fmt) // 3`. - # Construct the glob pattern for search files - - ### Load Yaml File ### + # Load and read Yaml file parameters = load_encodings_yaml() print(parameters) - ### Call Pre-processing function ### + # Construct the glob pattern for search files pattern = fmt - new_fmt = fmt - for i in range(pmax): - if debug: - print(i, pattern, args, kwargs) - try: - pattern = pattern.format(*args, **kwargs) - break - except KeyError as e: - k = e.args[0] - pattern = re.sub(r"\{" + k + r":?.*?\}", "{" + k + ":s}", pattern) - new_fmt = re.sub(r"\{" + k + r":?.*?\}", "{" + k + ":d}", new_fmt) - kwargs[e.args[0]] = "*" - - # Obtain list of files based on the glob pattern - files = sorted(glob(pattern)) - - # Print the glob pattern and a summary of matches - if debug == True: - print(f'Pattern: "{pattern}"') - n = len(files) - if n > 1: - print(f'{n} matches, e.g., "{files[0]}"') - elif n > 0: - print(f'{n} match, i.e., "{files[0]}"') - else: - print(f"No match; please check format string") - - return (files, pattern) if return_pattern else (new_fmt, files) + fmt_d = fmt + + ### Call 
Preprocessing Function ### + custom_key_encoding_map = encoding_map(fmt) + encoding = custom_key_encoding_map['aspin'] + + if encoding in parameters: + # if "__HM_PREPROCESSED__" not in fmt: + # raise ValueError( + # "Format string was not preprocessed. " + # "Call preprocess_fmt(fmt) before glob_search()." + # ) + + for i in range(pmax): + if debug: + print(i, pattern, args, kwargs) + try: + pattern = pattern.format(*args, **kwargs) + break + except KeyError as e: + k = e.args[0] + pattern = re.sub(r"\{" + k + r":?.*?\}", "{" + k + ":s}", pattern) + fmt_s = pattern + fmt_d = re.sub(r"\{" + k + r":?.*?\}", "{" + k + ":d}", fmt_d) + kwargs[e.args[0]] = "*" + + # Obtain list of files based on the glob pattern + globbed_files = sorted(glob(pattern)) + + # Print the glob pattern and a summary of matches + if debug == True: + print(f'Pattern: "{pattern}"') + n = len(globbed_files) + if n > 1: + print(f'{n} matches, e.g., "{globbed_files[0]}"') + elif n > 0: + print(f'{n} match, i.e., "{globbed_files[0]}"') + else: + print(f"No match; please check format string") + + return (globbed_files, pattern) if return_pattern else (fmt_d, fmt_s, custom_key_encoding_map, globbed_files) + else: + raise KeyError("Custom Key not defined in config file!") @classmethod def parse(cls, fmt, *args, debug=False, **kwargs): @@ -155,15 +168,18 @@ def parse(cls, fmt, *args, debug=False, **kwargs): 1 data/run2_p20.csv 2 20 """ + ### Call Preprocessing Function ### + + # Parse list of file names back to parameters - new_fmt, files = cls.glob_search(fmt, *args, debug=debug, **kwargs) + fmt_d, fmt_s, custom_key_encoding_map, globbed_files = cls.glob_search(fmt, *args, debug=debug, **kwargs) ### Normalizing custom Characters function ### - parser = parse.compile(new_fmt) + parser = parse.compile(fmt_s) frame = [] - for f in files: + for f in globbed_files: r = parser.parse(f) if r is None: print(f'Failed to parse "{f}"') diff --git a/mod/hallmark/helper_functions.py 
b/mod/hallmark/helper_functions.py index 54ad6fb..5e4f204 100644 --- a/mod/hallmark/helper_functions.py +++ b/mod/hallmark/helper_functions.py @@ -1,9 +1,12 @@ from pathlib import Path import yaml +import re + +# tmp_fmt = "/{mag:d}a{aspin@spin_UIUC}_w{win:d}/img_s{snapshot:d}_Rh{Rhigh:d}_i{inc:d}.h5" -cwd = Path.cwd() ENCODINGS_YAML = Path(__file__).parent / "encodings.yaml" -print(ENCODINGS_YAML) + +_ENCODING_RE = re.compile(r"\{(\w+):@(\w+)\}") def load_encodings_yaml(path=ENCODINGS_YAML): f = path.open("r", encoding="utf-8") @@ -11,3 +14,16 @@ def load_encodings_yaml(path=ENCODINGS_YAML): encodings = data["encodings"] return encodings +def pre_process_fmt(fmt): + return None + +def encoding_map(fmt): + encoding_map = {} + matches = list(_ENCODING_RE.finditer(fmt)) + print(matches) + for m in matches: + # print(m.group(0)) + name = m.group(1) + encoding = m.group(2) + encoding_map[name] = encoding + return encoding_map \ No newline at end of file From 552a21c4434b7443e8920b165877fc75c42feb30 Mon Sep 17 00:00:00 2001 From: HaydenMarchinek Date: Mon, 9 Feb 2026 18:54:56 -0700 Subject: [PATCH 12/33] added new version of yaml file that allows multiple fmt's --- mod/hallmark/encodings.yaml | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/mod/hallmark/encodings.yaml b/mod/hallmark/encodings.yaml index 9a45c3f..05239db 100644 --- a/mod/hallmark/encodings.yaml +++ b/mod/hallmark/encodings.yaml @@ -1,14 +1,16 @@ -encodings: - spin_UIUC: - type: float - regex: "m?([0-9]+(\\.[0-9]+)?|\\.[0-9]+)" - transform: - m: "-" - formatting: - format: "+.2f" +data: + - fmt : "" + encoding: + regex: "m([0-9]+(\\.[0-9]+)?|\\.[0-9]+)" + transform: + m: "-" + formatting: + format: "+.2f" - spin_BHAC: - type: float - regex: "[+-]?([0-9]+(\\.[0-9]+)?|\\.[0-9]+)" - formatting: - format: "+.2f" \ No newline at end of file + # - fmt2 : "" + # encoding2: + # regex2: "m([0-9]+(\\.[0-9]+)?|\\.[0-9]+)" + # transform: + # m: "-" + # formatting: + # 
format: "+.2f" From 1810fd4c2b5fcc602960c1e4e8c82bd80389603b Mon Sep 17 00:00:00 2001 From: HaydenMarchinek Date: Tue, 10 Feb 2026 17:09:22 -0700 Subject: [PATCH 13/33] updated the core file, encodings yaml file, and helper functions to allow custom aspin types --- mod/hallmark/core.py | 98 ++++++++++++++------------------ mod/hallmark/encodings.yaml | 21 +++---- mod/hallmark/helper_functions.py | 36 ++++++------ 3 files changed, 66 insertions(+), 89 deletions(-) diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index 0a2a15d..5652a49 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -73,63 +73,53 @@ def filter(self, **kwargs): return self[mask] @classmethod - def glob_search(cls, fmt, *args, debug=False, return_pattern=False,**kwargs): + def glob_search(cls, index = 0, *args, debug=False, return_pattern=False,**kwargs): + + # Load and read Yaml file + yaml_encodings = load_encodings_yaml(index) + fmt = yaml_encodings["fmt"] + print(fmt) + pmax = len(fmt) // 3 # to specify a parameter, we need at least # three characters '{p}'; the maximum number # of possible parameters is `len(fmt) // 3`. - # Load and read Yaml file - parameters = load_encodings_yaml() - print(parameters) - # Construct the glob pattern for search files pattern = fmt fmt_d = fmt - ### Call Preprocessing Function ### - custom_key_encoding_map = encoding_map(fmt) - encoding = custom_key_encoding_map['aspin'] - - if encoding in parameters: - # if "__HM_PREPROCESSED__" not in fmt: - # raise ValueError( - # "Format string was not preprocessed. " - # "Call preprocess_fmt(fmt) before glob_search()." 
- # ) - - for i in range(pmax): - if debug: - print(i, pattern, args, kwargs) - try: - pattern = pattern.format(*args, **kwargs) - break - except KeyError as e: - k = e.args[0] - pattern = re.sub(r"\{" + k + r":?.*?\}", "{" + k + ":s}", pattern) - fmt_s = pattern - fmt_d = re.sub(r"\{" + k + r":?.*?\}", "{" + k + ":d}", fmt_d) - kwargs[e.args[0]] = "*" - - # Obtain list of files based on the glob pattern - globbed_files = sorted(glob(pattern)) - - # Print the glob pattern and a summary of matches - if debug == True: - print(f'Pattern: "{pattern}"') - n = len(globbed_files) - if n > 1: - print(f'{n} matches, e.g., "{globbed_files[0]}"') - elif n > 0: - print(f'{n} match, i.e., "{globbed_files[0]}"') - else: - print(f"No match; please check format string") - - return (globbed_files, pattern) if return_pattern else (fmt_d, fmt_s, custom_key_encoding_map, globbed_files) - else: - raise KeyError("Custom Key not defined in config file!") + + for i in range(pmax): + if debug: + print(i, pattern, args, kwargs) + try: + pattern = pattern.format(*args, **kwargs) + break + except KeyError as e: + k = e.args[0] + pattern = re.sub(r"\{" + k + r":?.*?\}", "{" + k + ":s}", pattern) + fmt_s = pattern + fmt_d = re.sub(r"\{" + k + r":?.*?\}", "{" + k + ":d}", fmt_d) + kwargs[e.args[0]] = "*" + + # Obtain list of files based on the glob pattern + globbed_files = sorted(glob(pattern)) + + # Print the glob pattern and a summary of matches + if debug == True: + print(f'Pattern: "{pattern}"') + n = len(globbed_files) + if n > 1: + print(f'{n} matches, e.g., "{globbed_files[0]}"') + elif n > 0: + print(f'{n} match, i.e., "{globbed_files[0]}"') + else: + print(f"No match; please check format string") + + return (globbed_files, pattern) if return_pattern else (yaml_encodings, fmt_d, globbed_files) @classmethod - def parse(cls, fmt, *args, debug=False, **kwargs): + def parse(cls, index = 0, *args, debug=False, **kwargs,): """ Construct a ``ParaFrame`` by parsing file paths that match a 
pattern. @@ -167,20 +157,16 @@ def parse(cls, fmt, *args, debug=False, **kwargs): 0 data/run1_p10.csv 1 10 1 data/run2_p20.csv 2 20 """ - - ### Call Preprocessing Function ### - - + # Parse list of file names back to parameters - fmt_d, fmt_s, custom_key_encoding_map, globbed_files = cls.glob_search(fmt, *args, debug=debug, **kwargs) - - ### Normalizing custom Characters function ### - parser = parse.compile(fmt_s) + yaml_encodings, fmt_d, globbed_files = cls.glob_search(index, *args, debug=debug, **kwargs) + parser = parse.compile(fmt_d) frame = [] for f in globbed_files: - r = parser.parse(f) + f_new = regex_sub(f, yaml_encodings) + r = parser.parse(f_new) if r is None: print(f'Failed to parse "{f}"') else: diff --git a/mod/hallmark/encodings.yaml b/mod/hallmark/encodings.yaml index 05239db..f5d0c2e 100644 --- a/mod/hallmark/encodings.yaml +++ b/mod/hallmark/encodings.yaml @@ -1,16 +1,11 @@ data: - - fmt : "" + - fmt : "data/{mag:d}_mag{aspin}_w{win:d}" encoding: - regex: "m([0-9]+(\\.[0-9]+)?|\\.[0-9]+)" - transform: - m: "-" - formatting: - format: "+.2f" + aspin: "m([0-9]+(\\.[0-9]+)?|\\.[0-9]+)" + - # - fmt2 : "" - # encoding2: - # regex2: "m([0-9]+(\\.[0-9]+)?|\\.[0-9]+)" - # transform: - # m: "-" - # formatting: - # format: "+.2f" + - fmt : "data/a_{a:d}/b_{b:d}.txt" + encoding: + aspin: "" + + diff --git a/mod/hallmark/helper_functions.py b/mod/hallmark/helper_functions.py index 5e4f204..5e6ae1f 100644 --- a/mod/hallmark/helper_functions.py +++ b/mod/hallmark/helper_functions.py @@ -2,28 +2,24 @@ import yaml import re -# tmp_fmt = "/{mag:d}a{aspin@spin_UIUC}_w{win:d}/img_s{snapshot:d}_Rh{Rhigh:d}_i{inc:d}.h5" - ENCODINGS_YAML = Path(__file__).parent / "encodings.yaml" -_ENCODING_RE = re.compile(r"\{(\w+):@(\w+)\}") - -def load_encodings_yaml(path=ENCODINGS_YAML): +def load_encodings_yaml(index = 0, path=ENCODINGS_YAML): f = path.open("r", encoding="utf-8") - data = yaml.safe_load(f) - encodings = data["encodings"] - return encodings + yaml_file = 
yaml.safe_load(f) + encodings = yaml_file["data"] + return encodings[index] + +def regex_sub(f, yaml_encodings): + fmt = f + regex = yaml_encodings["encoding"]["aspin"] + + if re.search(regex, fmt) and len(regex)>0: + matches = re.finditer(regex, fmt) + for match in matches: + k = match.group(0) + k_num = "-" + str(match.group(1)) + fmt = re.sub(k,k_num , fmt) -def pre_process_fmt(fmt): - return None + return fmt -def encoding_map(fmt): - encoding_map = {} - matches = list(_ENCODING_RE.finditer(fmt)) - print(matches) - for m in matches: - # print(m.group(0)) - name = m.group(1) - encoding = m.group(2) - encoding_map[name] = encoding - return encoding_map \ No newline at end of file From 7b36db0c25abb5c0b5568d7588309bbc326cbdf5 Mon Sep 17 00:00:00 2001 From: HaydenMarchinek Date: Tue, 10 Feb 2026 18:06:13 -0700 Subject: [PATCH 14/33] Updated core file to allow integer and floats in parse. Added PyTOML to dependencies. --- mod/hallmark/core.py | 12 +++++------- pyproject.toml | 1 + 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index 5652a49..636c14d 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -78,7 +78,6 @@ def glob_search(cls, index = 0, *args, debug=False, return_pattern=False,**kwarg # Load and read Yaml file yaml_encodings = load_encodings_yaml(index) fmt = yaml_encodings["fmt"] - print(fmt) pmax = len(fmt) // 3 # to specify a parameter, we need at least # three characters '{p}'; the maximum number @@ -86,7 +85,7 @@ def glob_search(cls, index = 0, *args, debug=False, return_pattern=False,**kwarg # Construct the glob pattern for search files pattern = fmt - fmt_d = fmt + fmt_g = fmt for i in range(pmax): @@ -98,8 +97,7 @@ def glob_search(cls, index = 0, *args, debug=False, return_pattern=False,**kwarg except KeyError as e: k = e.args[0] pattern = re.sub(r"\{" + k + r":?.*?\}", "{" + k + ":s}", pattern) - fmt_s = pattern - fmt_d = re.sub(r"\{" + k + r":?.*?\}", "{" + k + ":d}", fmt_d) + 
fmt_g = re.sub(r"\{" + k + r":?.*?\}", "{" + k + ":g}", fmt_g) kwargs[e.args[0]] = "*" # Obtain list of files based on the glob pattern @@ -116,7 +114,7 @@ def glob_search(cls, index = 0, *args, debug=False, return_pattern=False,**kwarg else: print(f"No match; please check format string") - return (globbed_files, pattern) if return_pattern else (yaml_encodings, fmt_d, globbed_files) + return (globbed_files, pattern) if return_pattern else (yaml_encodings, fmt_g, globbed_files) @classmethod def parse(cls, index = 0, *args, debug=False, **kwargs,): @@ -160,8 +158,8 @@ def parse(cls, index = 0, *args, debug=False, **kwargs,): # Parse list of file names back to parameters - yaml_encodings, fmt_d, globbed_files = cls.glob_search(index, *args, debug=debug, **kwargs) - parser = parse.compile(fmt_d) + yaml_encodings, fmt_g, globbed_files = cls.glob_search(index, *args, debug=debug, **kwargs) + parser = parse.compile(fmt_g) frame = [] for f in globbed_files: diff --git a/pyproject.toml b/pyproject.toml index 2c38eb2..4decfea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ dependencies =[ "numpy", "pandas", "parse", + "PyYAML", ] [tool.setuptools.packages.find] From 1d0a0f24bcfc6ee0accd6141b24099b2b921a06c Mon Sep 17 00:00:00 2001 From: Nayera Abdessalam Date: Wed, 11 Feb 2026 11:07:54 -0700 Subject: [PATCH 15/33] adapting tests and yaml file for updates to core --- mod/hallmark/encodings.yaml | 3 +++ tests/test_paraframe.py | 47 +++++++++++++++++++------------------ 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/mod/hallmark/encodings.yaml b/mod/hallmark/encodings.yaml index f5d0c2e..f87c1fb 100644 --- a/mod/hallmark/encodings.yaml +++ b/mod/hallmark/encodings.yaml @@ -8,4 +8,7 @@ data: encoding: aspin: "" + - fmt : "data/a{aspin}/b_{b:d}.txt" + encoding: + aspin: "" diff --git a/tests/test_paraframe.py b/tests/test_paraframe.py index 5bef648..c226a03 100644 --- a/tests/test_paraframe.py +++ b/tests/test_paraframe.py @@ -5,13 +5,13 @@ 
@pytest.fixture def create_ParaFrame(create_temp_data): - fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - return ParaFrame.parse(fmt, debug = True) + #fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") + return ParaFrame.parse(1, debug = True) @pytest.fixture def create_ParaFrame_spin(create_temp_data_spin): - fmt = str(create_temp_data_spin / "a_{a:spin}/b_{b:d}.txt") - return ParaFrame.parse(fmt, debug = True) + #fmt = str(create_temp_data_spin / "a_{a:spin}/b_{b:d}.txt") + return ParaFrame.parse(2, debug = True) def test_type_of_ParaFrame(create_ParaFrame): assert isinstance(create_ParaFrame, ParaFrame) @@ -20,10 +20,10 @@ def test_shape_of_ParaFrame(create_ParaFrame): pf = create_ParaFrame assert pf.shape == (100,3) -def test_column_dtype(create_ParaFrame): - pf = create_ParaFrame - assert pd.api.types.is_integer_dtype(pf["a"]) - assert pd.api.types.is_integer_dtype(pf["b"]) +# def test_column_dtype(create_ParaFrame): +# pf = create_ParaFrame +# assert pd.api.types.is_integer_gtype(pf["a"]) +# assert pd.api.types.is_integer_gtype(pf["b"]) def test_column_names_in_ParaFrame(create_ParaFrame): pf = create_ParaFrame @@ -42,26 +42,27 @@ def test_pandas_method_on_pf(create_ParaFrame): assert isinstance(pf.head(), pd.DataFrame) def test_glob_string_format(create_temp_data): - fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - pattern = ParaFrame.glob_search(fmt, a=0, return_pattern=True)[1] + #fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") + pattern = ParaFrame.glob_search(1, a=0, return_pattern=True)[1] norm = pattern.replace("\\", "/") # standardize output for Mac and PC assert norm.endswith("/a_0/b_*.txt") def test_glob_method_returns_files(create_temp_data): - fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - files = ParaFrame.glob_search(fmt, a=0, return_pattern=True)[0] + #fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") + files = ParaFrame.glob_search(1, a=0, return_pattern=True)[0] assert len(files) == 10 def 
test_parse_method_with_added_filter_arg(create_temp_data): - fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - pf = ParaFrame.parse(fmt, a=0) + #fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") + pf = ParaFrame.parse(1, a=0) assert pf.shape == (10, 3) assert pf["a"].unique() == 0 - + +# update these to take parameters from the example fmt in helper_functions @pytest.mark.xfail(strict=True, reason="Formatter issue solution not yet implemented") def test_glob_method_accepts_spin_formatter_type_and_builds_glob_method(create_temp_data_spin): - fmt = str(create_temp_data_spin / "a{a:spin}/b_{b:d}.txt") - files, pattern = ParaFrame.glob_search(fmt, a=0.5, return_pattern=True) + #fmt = str(create_temp_data_spin / "a{a:spin}/b_{b:d}.txt") + files, pattern = ParaFrame.glob_search(2, a=0.5, return_pattern=True) norm = pattern.replace("\\", "/") # standardize output for Mac and PC OS assert norm.endswith("/a+0.5/b_*.txt") assert len(files) == 10 @@ -79,9 +80,9 @@ def test_filtering_by_numeric_spin(create_ParaFrame_spin): assert len(pf_filtered) == 3 assert set(pf_filtered["a"].unique()) == {0.5} -def test_loading_yaml_file_for_special_formatting(): - parameters = load_encodings_yaml() - assert "spin_UIUC" in parameters - assert "spin_BHAC" in parameters - assert parameters['spin_UIUC']['regex'] == "m?([0-9]+(\.[0-9]+)?|\.[0-9]+)" - assert parameters['spin_UIUC']['transform']['m'] == '-' \ No newline at end of file +# def test_loading_yaml_file_for_special_formatting(): +# parameters = load_encodings_yaml() +# assert "spin_UIUC" in parameters +# assert "spin_BHAC" in parameters +# assert parameters['spin_UIUC']['regex'] == "m?([0-9]+(\.[0-9]+)?|\.[0-9]+)" +# assert parameters['spin_UIUC']['transform']['m'] == '-' \ No newline at end of file From 8d8df5c730e75219c96e5a33ea4ee66b66e26014 Mon Sep 17 00:00:00 2001 From: Ram Adithya Date: Wed, 11 Feb 2026 12:02:54 -0700 Subject: [PATCH 16/33] Started updating tests to match with the current Paraframe indexing method --- 
mod/hallmark/encodings.yaml => encodings.yaml | 0 mod/hallmark/core.py | 15 +++++++---- mod/hallmark/helper_functions.py | 3 +-- tests/conftest.py | 4 +++ tests/test_paraframe.py | 26 ++++++++----------- 5 files changed, 26 insertions(+), 22 deletions(-) rename mod/hallmark/encodings.yaml => encodings.yaml (100%) diff --git a/mod/hallmark/encodings.yaml b/encodings.yaml similarity index 100% rename from mod/hallmark/encodings.yaml rename to encodings.yaml diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index 636c14d..2d5f457 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -73,11 +73,16 @@ def filter(self, **kwargs): return self[mask] @classmethod - def glob_search(cls, index = 0, *args, debug=False, return_pattern=False,**kwargs): + def glob_search(cls, index = 0, _test_fmt = None, *args, debug=False, return_pattern=False,**kwargs): # Load and read Yaml file - yaml_encodings = load_encodings_yaml(index) - fmt = yaml_encodings["fmt"] + if _test_fmt != None: + fmt = _test_fmt + + yaml_encodings = load_encodings_yaml(index,path=Path("/tmp/encoding_tmp.yaml")) + else: + yaml_encodings = load_encodings_yaml(index) + fmt = yaml_encodings["fmt"] pmax = len(fmt) // 3 # to specify a parameter, we need at least # three characters '{p}'; the maximum number @@ -117,7 +122,7 @@ def glob_search(cls, index = 0, *args, debug=False, return_pattern=False,**kwarg return (globbed_files, pattern) if return_pattern else (yaml_encodings, fmt_g, globbed_files) @classmethod - def parse(cls, index = 0, *args, debug=False, **kwargs,): + def parse(cls, index = 0, _test_fmt = None, *args, debug=False, **kwargs,): """ Construct a ``ParaFrame`` by parsing file paths that match a pattern. 
@@ -158,7 +163,7 @@ def parse(cls, index = 0, *args, debug=False, **kwargs,): # Parse list of file names back to parameters - yaml_encodings, fmt_g, globbed_files = cls.glob_search(index, *args, debug=debug, **kwargs) + yaml_encodings, fmt_g, globbed_files = cls.glob_search(index,_test_fmt, *args, debug=debug, **kwargs) parser = parse.compile(fmt_g) frame = [] diff --git a/mod/hallmark/helper_functions.py b/mod/hallmark/helper_functions.py index 5e6ae1f..5fa4003 100644 --- a/mod/hallmark/helper_functions.py +++ b/mod/hallmark/helper_functions.py @@ -2,7 +2,7 @@ import yaml import re -ENCODINGS_YAML = Path(__file__).parent / "encodings.yaml" +ENCODINGS_YAML = Path(__file__).parents[2] / "encodings.yaml" def load_encodings_yaml(index = 0, path=ENCODINGS_YAML): f = path.open("r", encoding="utf-8") @@ -13,7 +13,6 @@ def load_encodings_yaml(index = 0, path=ENCODINGS_YAML): def regex_sub(f, yaml_encodings): fmt = f regex = yaml_encodings["encoding"]["aspin"] - if re.search(regex, fmt) and len(regex)>0: matches = re.finditer(regex, fmt) for match in matches: diff --git a/tests/conftest.py b/tests/conftest.py index c4465c6..3a3b4bc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,10 @@ import pytest +import shutil + +shutil.copy2('encodings.yaml', '/tmp/encoding_tmp.yaml') def spin_format(val): + if val == 0: return "0" return f"{val:+g}" diff --git a/tests/test_paraframe.py b/tests/test_paraframe.py index c226a03..66a54ed 100644 --- a/tests/test_paraframe.py +++ b/tests/test_paraframe.py @@ -5,13 +5,13 @@ @pytest.fixture def create_ParaFrame(create_temp_data): - #fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - return ParaFrame.parse(1, debug = True) + fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") + return ParaFrame.parse(1,_test_fmt = fmt, debug = True) @pytest.fixture def create_ParaFrame_spin(create_temp_data_spin): - #fmt = str(create_temp_data_spin / "a_{a:spin}/b_{b:d}.txt") - return ParaFrame.parse(2, debug = True) + fmt = 
str(create_temp_data_spin / "a_{a:spin}/b_{b:d}.txt") + return ParaFrame.parse(1,_test_fmt = fmt, debug = True) def test_type_of_ParaFrame(create_ParaFrame): assert isinstance(create_ParaFrame, ParaFrame) @@ -42,19 +42,19 @@ def test_pandas_method_on_pf(create_ParaFrame): assert isinstance(pf.head(), pd.DataFrame) def test_glob_string_format(create_temp_data): - #fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - pattern = ParaFrame.glob_search(1, a=0, return_pattern=True)[1] + fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") + pattern = ParaFrame.glob_search(1,_test_fmt=fmt, a=0, return_pattern=True)[1] norm = pattern.replace("\\", "/") # standardize output for Mac and PC assert norm.endswith("/a_0/b_*.txt") def test_glob_method_returns_files(create_temp_data): - #fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - files = ParaFrame.glob_search(1, a=0, return_pattern=True)[0] + fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") + files = ParaFrame.glob_search(1,_test_fmt=fmt, a=0, return_pattern=True)[0] assert len(files) == 10 def test_parse_method_with_added_filter_arg(create_temp_data): - #fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - pf = ParaFrame.parse(1, a=0) + fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") + pf = ParaFrame.parse(1,_test_fmt=fmt, a=0) assert pf.shape == (10, 3) assert pf["a"].unique() == 0 @@ -81,8 +81,4 @@ def test_filtering_by_numeric_spin(create_ParaFrame_spin): assert set(pf_filtered["a"].unique()) == {0.5} # def test_loading_yaml_file_for_special_formatting(): -# parameters = load_encodings_yaml() -# assert "spin_UIUC" in parameters -# assert "spin_BHAC" in parameters -# assert parameters['spin_UIUC']['regex'] == "m?([0-9]+(\.[0-9]+)?|\.[0-9]+)" -# assert parameters['spin_UIUC']['transform']['m'] == '-' \ No newline at end of file +# parameters = load_encodings_yaml(index=1,path = "/tmp/encoding_tmp.yaml") From 21bed1bba9a8d585d5f4f3dcdac3c7a89572be5e Mon Sep 17 00:00:00 2001 From: Rohin Sant Date: Fri, 13 Feb 
2026 12:34:42 -0700 Subject: [PATCH 17/33] Added documentation to the helper_functions.py file and made test_paraframe_e2e.py compatible with the inclusion of the yaml file --- mod/hallmark/helper_functions.py | 49 +++++++++++++++++++++++++------- tests/test_paraframe_e2e.py | 2 +- 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/mod/hallmark/helper_functions.py b/mod/hallmark/helper_functions.py index 5fa4003..3baf09b 100644 --- a/mod/hallmark/helper_functions.py +++ b/mod/hallmark/helper_functions.py @@ -2,23 +2,52 @@ import yaml import re +# Specifies the path to encodings.yaml using __file__ as the current script +# and moving two directories above using parents[2] and appending "encodings.yaml" to path. ENCODINGS_YAML = Path(__file__).parents[2] / "encodings.yaml" def load_encodings_yaml(index = 0, path=ENCODINGS_YAML): - f = path.open("r", encoding="utf-8") - yaml_file = yaml.safe_load(f) - encodings = yaml_file["data"] + """ + Load encoding rules from a YAML configuration file. + + Parameters: + index : int + Which encoding entry (specification) to return from the YAML file. + Default is 0, which is the first entry. + path : Path + Path to the encodings.yaml file. + + Returns: + dict + A single encoding configuration containing rules such as the regex commands. + """ + f = path.open("r", encoding="utf-8") # Opens the yaml file path + yaml_file = yaml.safe_load(f) # Safely loads in the yaml data + encodings = yaml_file["data"] # Extracts the encodings from the yaml file. return encodings[index] def regex_sub(f, yaml_encodings): - fmt = f - regex = yaml_encodings["encoding"]["aspin"] - if re.search(regex, fmt) and len(regex)>0: + """ + Apply a regex substitution rule to a string using YAML-defined encoding. + + Parameters: + f : str + The input filepath as a string. + yaml_encodings : dict + An encoding dictionary loaded from YAML. + + Returns: + str + The transformed string after applying regex substitutions. 
+ """ + fmt = f # Assigns the format specified in the yaml file + regex = yaml_encodings["encoding"]["aspin"] # Extracts the regex from the yaml file + if re.search(regex, fmt) and len(regex)>0: # Proceeds if regex is not empty and finds what the regex intends to find matches = re.finditer(regex, fmt) - for match in matches: - k = match.group(0) - k_num = "-" + str(match.group(1)) - fmt = re.sub(k,k_num , fmt) + for match in matches: # Iterating through the matches + k = match.group(0) # Entire matched substring + k_num = "-" + str(match.group(1)) # Attaches '-' at the start of the first group + fmt = re.sub(k,k_num , fmt) # Replaces it with the substituted string return fmt diff --git a/tests/test_paraframe_e2e.py b/tests/test_paraframe_e2e.py index 7241424..a2ded00 100644 --- a/tests/test_paraframe_e2e.py +++ b/tests/test_paraframe_e2e.py @@ -4,7 +4,7 @@ def test_paraframe_class_functionality(create_temp_data): # a user wants to create a paraframe fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - pf = ParaFrame.parse(fmt) + pf = ParaFrame.parse(1, _test_fmt = fmt) # users wants to filter files to see those with a = 0 scalar_filter = pf(a=0) From 704ba840cb9cbab62b31e5a7446d035e9078b334 Mon Sep 17 00:00:00 2001 From: Nayera Abdessalam Date: Fri, 13 Feb 2026 12:58:50 -0700 Subject: [PATCH 18/33] updated current spin format unit tests with correct formatter solution and removed xfail. added helper function test for load_encodings_yaml(). 
--- tests/test_paraframe.py | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/tests/test_paraframe.py b/tests/test_paraframe.py index 66a54ed..9d8be9f 100644 --- a/tests/test_paraframe.py +++ b/tests/test_paraframe.py @@ -10,7 +10,7 @@ def create_ParaFrame(create_temp_data): @pytest.fixture def create_ParaFrame_spin(create_temp_data_spin): - fmt = str(create_temp_data_spin / "a_{a:spin}/b_{b:d}.txt") + fmt = str(create_temp_data_spin / "a{aspin}/b_{b:d}.txt") return ParaFrame.parse(1,_test_fmt = fmt, debug = True) def test_type_of_ParaFrame(create_ParaFrame): @@ -20,10 +20,10 @@ def test_shape_of_ParaFrame(create_ParaFrame): pf = create_ParaFrame assert pf.shape == (100,3) -# def test_column_dtype(create_ParaFrame): -# pf = create_ParaFrame -# assert pd.api.types.is_integer_gtype(pf["a"]) -# assert pd.api.types.is_integer_gtype(pf["b"]) +def test_column_dtype(create_ParaFrame): + pf = create_ParaFrame + assert pd.api.types.is_float_dtype(pf["a"]) + assert pd.api.types.is_float_dtype(pf["b"]) def test_column_names_in_ParaFrame(create_ParaFrame): pf = create_ParaFrame @@ -57,28 +57,30 @@ def test_parse_method_with_added_filter_arg(create_temp_data): pf = ParaFrame.parse(1,_test_fmt=fmt, a=0) assert pf.shape == (10, 3) assert pf["a"].unique() == 0 - -# update these to take parameters from the example fmt in helper_functions -@pytest.mark.xfail(strict=True, reason="Formatter issue solution not yet implemented") + + def test_glob_method_accepts_spin_formatter_type_and_builds_glob_method(create_temp_data_spin): - #fmt = str(create_temp_data_spin / "a{a:spin}/b_{b:d}.txt") - files, pattern = ParaFrame.glob_search(2, a=0.5, return_pattern=True) + fmt = str(create_temp_data_spin / "a{aspin}/b_{b:d}.txt") + files, pattern = ParaFrame.glob_search(2, _test_fmt=fmt, aspin="+0.5", return_pattern=True) norm = pattern.replace("\\", "/") # standardize output for Mac and PC OS assert norm.endswith("/a+0.5/b_*.txt") assert 
len(files) == 10 -@pytest.mark.xfail(strict=True, reason="Formatter issue solution not yet implemented") +#@pytest.mark.xfail(strict=True, reason="Formatter issue solution not yet implemented") def test_parse_produces_float_spin_column(create_ParaFrame_spin): pf = create_ParaFrame_spin - assert pd.api.types.is_float_dtype(pf["a"]) - assert set(pf["a"].unique()) == {-0.5, 0.0, 0.5} + assert pd.api.types.is_float_dtype(pf["aspin"]) + assert set(pf["aspin"].unique()) == {-0.5, 0.0, 0.5} -@pytest.mark.xfail(strict=True, reason="Formatter issue solution not yet implemented") +#@pytest.mark.xfail(strict=True, reason="Formatter issue solution not yet implemented") def test_filtering_by_numeric_spin(create_ParaFrame_spin): pf = create_ParaFrame_spin - pf_filtered = pf(a=0.5) - assert len(pf_filtered) == 3 - assert set(pf_filtered["a"].unique()) == {0.5} + pf_filtered = pf(aspin=0.5) + assert len(pf_filtered) == 10 + assert set(pf_filtered["aspin"].unique()) == {0.5} -# def test_loading_yaml_file_for_special_formatting(): -# parameters = load_encodings_yaml(index=1,path = "/tmp/encoding_tmp.yaml") +def test_loading_yaml_file_for_test_spin_formatting_contents(): + params = load_encodings_yaml(index=2,path = Path("/tmp/encoding_tmp.yaml")) + assert "fmt" in params + assert "encoding" in params + assert "aspin" in params["encoding"] \ No newline at end of file From 80564e07ce93b1caf4b224a82968ed5f970221fc Mon Sep 17 00:00:00 2001 From: Nayera Abdessalam Date: Fri, 13 Feb 2026 13:46:16 -0700 Subject: [PATCH 19/33] added unit tests for "m" formatting of spin types. fixed encodings regex to include .h5 for first fmt. 
added new pytest fixture for spin data with "m" --- encodings.yaml | 2 +- tests/conftest.py | 13 +++++++++++++ tests/test_paraframe.py | 23 +++++++++++++++++++---- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/encodings.yaml b/encodings.yaml index f87c1fb..6f337d6 100644 --- a/encodings.yaml +++ b/encodings.yaml @@ -1,5 +1,5 @@ data: - - fmt : "data/{mag:d}_mag{aspin}_w{win:d}" + - fmt : "data/{mag:d}_mag{aspin}_w{win:d}.h5" encoding: aspin: "m([0-9]+(\\.[0-9]+)?|\\.[0-9]+)" diff --git a/tests/conftest.py b/tests/conftest.py index 3a3b4bc..86b7170 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -29,3 +29,16 @@ def create_temp_data_spin(tmp_path): for b in range(10, 20): (subdir / f"b_{b}.txt").touch() return data_dir + +@pytest.fixture(scope = "function") +def create_temp_data_spin_with_m(tmp_path): + data_dir = tmp_path / "data" + data_dir.mkdir(parents=True, exist_ok=True) + spins = ["m0.5", "0", "0.5"] + + for mag in range(0, 2): + for aspin in spins: + for win in range(10, 20): + file_name = f"{mag}_mag{aspin}_w{win}.h5" + (data_dir / file_name).touch() + return data_dir \ No newline at end of file diff --git a/tests/test_paraframe.py b/tests/test_paraframe.py index 9d8be9f..abb649c 100644 --- a/tests/test_paraframe.py +++ b/tests/test_paraframe.py @@ -13,6 +13,11 @@ def create_ParaFrame_spin(create_temp_data_spin): fmt = str(create_temp_data_spin / "a{aspin}/b_{b:d}.txt") return ParaFrame.parse(1,_test_fmt = fmt, debug = True) +@pytest.fixture +def create_ParaFrame_spin_with_m(create_temp_data_spin_with_m): + fmt = str(create_temp_data_spin_with_m / '{mag:d}_mag{aspin}_w{win:d}.h5') + return ParaFrame.parse(0,_test_fmt = fmt, debug = True) + def test_type_of_ParaFrame(create_ParaFrame): assert isinstance(create_ParaFrame, ParaFrame) @@ -58,7 +63,6 @@ def test_parse_method_with_added_filter_arg(create_temp_data): assert pf.shape == (10, 3) assert pf["a"].unique() == 0 - def 
test_glob_method_accepts_spin_formatter_type_and_builds_glob_method(create_temp_data_spin): fmt = str(create_temp_data_spin / "a{aspin}/b_{b:d}.txt") files, pattern = ParaFrame.glob_search(2, _test_fmt=fmt, aspin="+0.5", return_pattern=True) @@ -66,13 +70,11 @@ def test_glob_method_accepts_spin_formatter_type_and_builds_glob_method(create_t assert norm.endswith("/a+0.5/b_*.txt") assert len(files) == 10 -#@pytest.mark.xfail(strict=True, reason="Formatter issue solution not yet implemented") def test_parse_produces_float_spin_column(create_ParaFrame_spin): pf = create_ParaFrame_spin assert pd.api.types.is_float_dtype(pf["aspin"]) assert set(pf["aspin"].unique()) == {-0.5, 0.0, 0.5} -#@pytest.mark.xfail(strict=True, reason="Formatter issue solution not yet implemented") def test_filtering_by_numeric_spin(create_ParaFrame_spin): pf = create_ParaFrame_spin pf_filtered = pf(aspin=0.5) @@ -83,4 +85,17 @@ def test_loading_yaml_file_for_test_spin_formatting_contents(): params = load_encodings_yaml(index=2,path = Path("/tmp/encoding_tmp.yaml")) assert "fmt" in params assert "encoding" in params - assert "aspin" in params["encoding"] \ No newline at end of file + assert "aspin" in params["encoding"] + +def test_m_type_for_spin_data_with_yaml_regex(create_temp_data_spin_with_m): + fmt = str(create_temp_data_spin_with_m / "{mag:d}_mag{aspin}_w{win:d}.h5") + pf = ParaFrame.parse(0,_test_fmt = fmt, debug = True) + pf_filtered = pf(aspin=-0.5) + assert len(pf_filtered) == 20 + assert set(pf_filtered["aspin"].unique()) == {-0.5} + +def test_m_type_for_spin_data_with_multiple_filters(create_temp_data_spin_with_m): + fmt = str(create_temp_data_spin_with_m / "{mag:d}_mag{aspin}_w{win:d}.h5") + pf = ParaFrame.parse(0,_test_fmt = fmt, debug = True) + pf_filtered = pf(aspin=[-0.5,0.0]) + assert len(pf_filtered) == 40 \ No newline at end of file From d0d82f74f607527d33e71b085adea1c28d9db67b Mon Sep 17 00:00:00 2001 From: Nayera Abdessalam Date: Fri, 13 Feb 2026 15:12:44 -0700 Subject: 
[PATCH 20/33] revert paraframe creation back to using just fmt and no index --- tests/test_paraframe.py | 18 +++++++++--------- tests/test_paraframe_e2e.py | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/test_paraframe.py b/tests/test_paraframe.py index abb649c..fae2eba 100644 --- a/tests/test_paraframe.py +++ b/tests/test_paraframe.py @@ -6,17 +6,17 @@ @pytest.fixture def create_ParaFrame(create_temp_data): fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - return ParaFrame.parse(1,_test_fmt = fmt, debug = True) + return ParaFrame.parse(fmt, debug = True) @pytest.fixture def create_ParaFrame_spin(create_temp_data_spin): fmt = str(create_temp_data_spin / "a{aspin}/b_{b:d}.txt") - return ParaFrame.parse(1,_test_fmt = fmt, debug = True) + return ParaFrame.parse(fmt, debug = True) @pytest.fixture def create_ParaFrame_spin_with_m(create_temp_data_spin_with_m): fmt = str(create_temp_data_spin_with_m / '{mag:d}_mag{aspin}_w{win:d}.h5') - return ParaFrame.parse(0,_test_fmt = fmt, debug = True) + return ParaFrame.parse(fmt, debug = True) def test_type_of_ParaFrame(create_ParaFrame): assert isinstance(create_ParaFrame, ParaFrame) @@ -54,18 +54,18 @@ def test_glob_string_format(create_temp_data): def test_glob_method_returns_files(create_temp_data): fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - files = ParaFrame.glob_search(1,_test_fmt=fmt, a=0, return_pattern=True)[0] + files = ParaFrame.glob_search(fmt, a=0, return_pattern=True)[0] assert len(files) == 10 def test_parse_method_with_added_filter_arg(create_temp_data): fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - pf = ParaFrame.parse(1,_test_fmt=fmt, a=0) + pf = ParaFrame.parse(fmt, a=0) assert pf.shape == (10, 3) assert pf["a"].unique() == 0 def test_glob_method_accepts_spin_formatter_type_and_builds_glob_method(create_temp_data_spin): fmt = str(create_temp_data_spin / "a{aspin}/b_{b:d}.txt") - files, pattern = ParaFrame.glob_search(2, _test_fmt=fmt, aspin="+0.5", 
return_pattern=True) + files, pattern = ParaFrame.glob_search(fmt, aspin="+0.5", return_pattern=True) norm = pattern.replace("\\", "/") # standardize output for Mac and PC OS assert norm.endswith("/a+0.5/b_*.txt") assert len(files) == 10 @@ -82,20 +82,20 @@ def test_filtering_by_numeric_spin(create_ParaFrame_spin): assert set(pf_filtered["aspin"].unique()) == {0.5} def test_loading_yaml_file_for_test_spin_formatting_contents(): - params = load_encodings_yaml(index=2,path = Path("/tmp/encoding_tmp.yaml")) + params = load_encodings_yaml(path = Path("/tmp/encoding_tmp.yaml")) # default fmt should still be the first one assert "fmt" in params assert "encoding" in params assert "aspin" in params["encoding"] def test_m_type_for_spin_data_with_yaml_regex(create_temp_data_spin_with_m): fmt = str(create_temp_data_spin_with_m / "{mag:d}_mag{aspin}_w{win:d}.h5") - pf = ParaFrame.parse(0,_test_fmt = fmt, debug = True) + pf = ParaFrame.parse(fmt, debug = True) pf_filtered = pf(aspin=-0.5) assert len(pf_filtered) == 20 assert set(pf_filtered["aspin"].unique()) == {-0.5} def test_m_type_for_spin_data_with_multiple_filters(create_temp_data_spin_with_m): fmt = str(create_temp_data_spin_with_m / "{mag:d}_mag{aspin}_w{win:d}.h5") - pf = ParaFrame.parse(0,_test_fmt = fmt, debug = True) + pf = ParaFrame.parse(fmt, debug = True) pf_filtered = pf(aspin=[-0.5,0.0]) assert len(pf_filtered) == 40 \ No newline at end of file diff --git a/tests/test_paraframe_e2e.py b/tests/test_paraframe_e2e.py index a2ded00..7241424 100644 --- a/tests/test_paraframe_e2e.py +++ b/tests/test_paraframe_e2e.py @@ -4,7 +4,7 @@ def test_paraframe_class_functionality(create_temp_data): # a user wants to create a paraframe fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - pf = ParaFrame.parse(1, _test_fmt = fmt) + pf = ParaFrame.parse(fmt) # users wants to filter files to see those with a = 0 scalar_filter = pf(a=0) From 01a85697e8d65f5f410dfdf2fdcb6505eca5e797 Mon Sep 17 00:00:00 2001 From: Ram Adithya Date: 
Fri, 13 Feb 2026 16:35:11 -0700 Subject: [PATCH 21/33] Attempted to revert back to fmt as an argument with test modifications --- mod/hallmark/core.py | 32 +++++++++++++++++++------------- mod/hallmark/helper_functions.py | 17 +++++++++++++---- tests/conftest.py | 2 +- tests/test_paraframe.py | 20 +++++++++++--------- tests/test_paraframe_e2e.py | 2 +- 5 files changed, 45 insertions(+), 28 deletions(-) diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index 2d5f457..d892cd5 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -20,7 +20,6 @@ import parse import pandas as pd import numpy as np -import string # added for Formatter subclassing from .helper_functions import * @@ -73,16 +72,9 @@ def filter(self, **kwargs): return self[mask] @classmethod - def glob_search(cls, index = 0, _test_fmt = None, *args, debug=False, return_pattern=False,**kwargs): + def glob_search(cls, fmt, *args, debug=False, return_pattern=False,**kwargs): # Load and read Yaml file - if _test_fmt != None: - fmt = _test_fmt - - yaml_encodings = load_encodings_yaml(index,path=Path("/tmp/encoding_tmp.yaml")) - else: - yaml_encodings = load_encodings_yaml(index) - fmt = yaml_encodings["fmt"] pmax = len(fmt) // 3 # to specify a parameter, we need at least # three characters '{p}'; the maximum number @@ -119,10 +111,10 @@ def glob_search(cls, index = 0, _test_fmt = None, *args, debug=False, return_pat else: print(f"No match; please check format string") - return (globbed_files, pattern) if return_pattern else (yaml_encodings, fmt_g, globbed_files) + return (globbed_files, pattern) if return_pattern else ( globbed_files, fmt_g) @classmethod - def parse(cls, index = 0, _test_fmt = None, *args, debug=False, **kwargs,): + def parse(cls, fmt, _tmp_test = None, encoding = False, *args, debug=False, **kwargs,): """ Construct a ``ParaFrame`` by parsing file paths that match a pattern. 
@@ -163,15 +155,29 @@ def parse(cls, index = 0, _test_fmt = None, *args, debug=False, **kwargs,): # Parse list of file names back to parameters - yaml_encodings, fmt_g, globbed_files = cls.glob_search(index,_test_fmt, *args, debug=debug, **kwargs) + globbed_files, fmt_g = cls.glob_search(fmt, *args, debug=debug, **kwargs) parser = parse.compile(fmt_g) frame = [] + if encoding == True: + if _tmp_test != None: + + + encoding_data = load_encodings_yaml(_tmp_test) + else: + encoding_data = load_encodings_yaml(fmt) + + + for f in globbed_files: - f_new = regex_sub(f, yaml_encodings) + if encoding == True: + f_new = regex_sub(f, encoding_data) + else: + f_new = f r = parser.parse(f_new) if r is None: print(f'Failed to parse "{f}"') else: frame.append({'path':f, **r.named}) + print(frame) return cls(frame) \ No newline at end of file diff --git a/mod/hallmark/helper_functions.py b/mod/hallmark/helper_functions.py index 3baf09b..d358b0e 100644 --- a/mod/hallmark/helper_functions.py +++ b/mod/hallmark/helper_functions.py @@ -6,7 +6,7 @@ # and moving two directories above using parents[2] and appending "encodings.yaml" to path. ENCODINGS_YAML = Path(__file__).parents[2] / "encodings.yaml" -def load_encodings_yaml(index = 0, path=ENCODINGS_YAML): +def load_encodings_yaml(fmt, path=ENCODINGS_YAML): """ Load encoding rules from a YAML configuration file. @@ -22,9 +22,16 @@ def load_encodings_yaml(index = 0, path=ENCODINGS_YAML): A single encoding configuration containing rules such as the regex commands. """ f = path.open("r", encoding="utf-8") # Opens the yaml file path + yaml_file = yaml.safe_load(f) # Safely loads in the yaml data - encodings = yaml_file["data"] # Extracts the encodings from the yaml file. - return encodings[index] + yaml_file_data = yaml_file["data"] # Extracts the encodings from the yaml file. 
+ print("Loaded!") + for i in range(len(yaml_file_data)): + if yaml_file_data[i]['fmt'] == fmt: + aspin_encoding = yaml_file_data[i]['encoding']['aspin'] + + + return aspin_encoding def regex_sub(f, yaml_encodings): """ @@ -41,7 +48,8 @@ def regex_sub(f, yaml_encodings): The transformed string after applying regex substitutions. """ fmt = f # Assigns the format specified in the yaml file - regex = yaml_encodings["encoding"]["aspin"] # Extracts the regex from the yaml file + + regex = yaml_encodings # Extracts the regex from the yaml file if re.search(regex, fmt) and len(regex)>0: # Proceeds if regex is not empty and finds what the regex intends to find matches = re.finditer(regex, fmt) for match in matches: # Iterating through the matches @@ -51,3 +59,4 @@ def regex_sub(f, yaml_encodings): return fmt +print(load_encodings_yaml("data/{mag:d}_mag{aspin}_w{win:d}.h5")) \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 86b7170..c2f4213 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -33,7 +33,7 @@ def create_temp_data_spin(tmp_path): @pytest.fixture(scope = "function") def create_temp_data_spin_with_m(tmp_path): data_dir = tmp_path / "data" - data_dir.mkdir(parents=True, exist_ok=True) + data_dir.mkdir(parents=True) spins = ["m0.5", "0", "0.5"] for mag in range(0, 2): diff --git a/tests/test_paraframe.py b/tests/test_paraframe.py index fae2eba..0187e28 100644 --- a/tests/test_paraframe.py +++ b/tests/test_paraframe.py @@ -16,7 +16,7 @@ def create_ParaFrame_spin(create_temp_data_spin): @pytest.fixture def create_ParaFrame_spin_with_m(create_temp_data_spin_with_m): fmt = str(create_temp_data_spin_with_m / '{mag:d}_mag{aspin}_w{win:d}.h5') - return ParaFrame.parse(fmt, debug = True) + return ParaFrame.parse(fmt,encoding=True, debug = True) def test_type_of_ParaFrame(create_ParaFrame): assert isinstance(create_ParaFrame, ParaFrame) @@ -48,7 +48,7 @@ def test_pandas_method_on_pf(create_ParaFrame): def 
test_glob_string_format(create_temp_data): fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - pattern = ParaFrame.glob_search(1,_test_fmt=fmt, a=0, return_pattern=True)[1] + pattern = ParaFrame.glob_search(fmt, a=0, return_pattern=True)[1] norm = pattern.replace("\\", "/") # standardize output for Mac and PC assert norm.endswith("/a_0/b_*.txt") @@ -81,21 +81,23 @@ def test_filtering_by_numeric_spin(create_ParaFrame_spin): assert len(pf_filtered) == 10 assert set(pf_filtered["aspin"].unique()) == {0.5} -def test_loading_yaml_file_for_test_spin_formatting_contents(): - params = load_encodings_yaml(path = Path("/tmp/encoding_tmp.yaml")) # default fmt should still be the first one - assert "fmt" in params - assert "encoding" in params - assert "aspin" in params["encoding"] +# def test_loading_yaml_file_for_test_spin_formatting_contents(create_temp_data_spin_with_m): +# fmt = str(create_temp_data_spin_with_m / "{mag:d}_mag{aspin}_w{win:d}.h5") +# params = load_encodings_yaml(fmt,path = Path("/tmp/encoding_tmp.yaml")) # default fmt should still be the first one + +# assert "encoding" in params +# assert "aspin" in params["encoding"] def test_m_type_for_spin_data_with_yaml_regex(create_temp_data_spin_with_m): fmt = str(create_temp_data_spin_with_m / "{mag:d}_mag{aspin}_w{win:d}.h5") - pf = ParaFrame.parse(fmt, debug = True) + pf = ParaFrame.parse(fmt, _tmp_test = "data/{mag:d}_mag{aspin}_w{win:d}.h5", encoding= True, debug = True) + pf_filtered = pf(aspin=-0.5) assert len(pf_filtered) == 20 assert set(pf_filtered["aspin"].unique()) == {-0.5} def test_m_type_for_spin_data_with_multiple_filters(create_temp_data_spin_with_m): fmt = str(create_temp_data_spin_with_m / "{mag:d}_mag{aspin}_w{win:d}.h5") - pf = ParaFrame.parse(fmt, debug = True) + pf = ParaFrame.parse(fmt, _tmp_test = "data/{mag:d}_mag{aspin}_w{win:d}.h5",encoding=True, debug = True) pf_filtered = pf(aspin=[-0.5,0.0]) assert len(pf_filtered) == 40 \ No newline at end of file diff --git 
a/tests/test_paraframe_e2e.py b/tests/test_paraframe_e2e.py index 7241424..bb3051d 100644 --- a/tests/test_paraframe_e2e.py +++ b/tests/test_paraframe_e2e.py @@ -43,7 +43,7 @@ def test_paraframe_class_functionality(create_temp_data): assert len(mask_filter) == 40 assert all(mask_filter["a"].unique() == [1,2,3,4]) -@pytest.mark.xfail(strict=True, reason="Debug output formatting has been changed, test needs updated") +# @pytest.mark.xfail(strict=True, reason="Debug output formatting has been changed, test needs updated") def test_debug(create_temp_data, capsys, tmp_path): # users want to see a detailed summary of how ParaFrame utilizes globbing fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") From 5ab3089538fe93c6a3b7ef974274aea4e7c63443 Mon Sep 17 00:00:00 2001 From: Ram Adithya Date: Mon, 16 Feb 2026 16:37:31 -0700 Subject: [PATCH 22/33] Attempted to make a cleaner version for the encoding map --- mod/hallmark/core.py | 53 ++++++++++--------- mod/hallmark/helper_functions.py | 88 +++++++++++++------------------- 2 files changed, 65 insertions(+), 76 deletions(-) diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index d892cd5..a90338c 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - from glob import glob import re @@ -72,19 +71,33 @@ def filter(self, **kwargs): return self[mask] @classmethod - def glob_search(cls, fmt, *args, debug=False, return_pattern=False,**kwargs): - - # Load and read Yaml file - + def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=False, **kwargs): pmax = len(fmt) // 3 # to specify a parameter, we need at least # three characters '{p}'; the maximum number # of possible parameters is `len(fmt) // 3`. 
+ yaml_encodings = find_spec_by_fmt(fmt) + + if yaml_encodings is None: + raise ValueError(f"Error: The format '{fmt}' is missing from encodings.yaml.") + + needs_encoding = False + enc_dict = yaml_encodings.get("encoding", {}) + + for key in enc_dict: + if enc_dict[key] != "": + needs_encoding = True + + if needs_encoding == True and encoding == False: + raise ValueError(f"Error: '{fmt}' has a regex spec, so you must use encoding=True") + + if needs_encoding == False and encoding == True: + raise ValueError(f"Error: '{fmt}' does not have a regex spec, so you must use encoding=False") + # Construct the glob pattern for search files pattern = fmt fmt_g = fmt - for i in range(pmax): if debug: print(i, pattern, args, kwargs) @@ -111,10 +124,13 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False,**kwargs): else: print(f"No match; please check format string") - return (globbed_files, pattern) if return_pattern else ( globbed_files, fmt_g) + if return_pattern: + return (globbed_files, pattern) + else: + return (yaml_encodings, fmt_g, globbed_files) @classmethod - def parse(cls, fmt, _tmp_test = None, encoding = False, *args, debug=False, **kwargs,): + def parse(cls, fmt, *args, debug=False, encoding=False, **kwargs): """ Construct a ``ParaFrame`` by parsing file paths that match a pattern. 
@@ -152,32 +168,21 @@ def parse(cls, fmt, _tmp_test = None, encoding = False, *args, debug=False, **kw 0 data/run1_p10.csv 1 10 1 data/run2_p20.csv 2 20 """ - # Parse list of file names back to parameters + yaml_encodings, fmt_g, globbed_files = cls.glob_search(fmt, *args, debug=debug, encoding=encoding, **kwargs) - globbed_files, fmt_g = cls.glob_search(fmt, *args, debug=debug, **kwargs) parser = parse.compile(fmt_g) frame = [] - if encoding == True: - if _tmp_test != None: - - - encoding_data = load_encodings_yaml(_tmp_test) - else: - encoding_data = load_encodings_yaml(fmt) - - - for f in globbed_files: - if encoding == True: - f_new = regex_sub(f, encoding_data) + if encoding: + f_new = regex_sub(f, yaml_encodings) else: f_new = f + r = parser.parse(f_new) if r is None: print(f'Failed to parse "{f}"') else: - frame.append({'path':f, **r.named}) - print(frame) + frame.append({'path': f, **r.named}) return cls(frame) \ No newline at end of file diff --git a/mod/hallmark/helper_functions.py b/mod/hallmark/helper_functions.py index d358b0e..f1e3234 100644 --- a/mod/hallmark/helper_functions.py +++ b/mod/hallmark/helper_functions.py @@ -2,61 +2,45 @@ import yaml import re -# Specifies the path to encodings.yaml using __file__ as the current script -# and moving two directories above using parents[2] and appending "encodings.yaml" to path. ENCODINGS_YAML = Path(__file__).parents[2] / "encodings.yaml" -def load_encodings_yaml(fmt, path=ENCODINGS_YAML): - """ - Load encoding rules from a YAML configuration file. - - Parameters: - index : int - Which encoding entry (specification) to return from the YAML file. - Default is 0, which is the first entry. - path : Path - Path to the encodings.yaml file. - - Returns: - dict - A single encoding configuration containing rules such as the regex commands. 
- """ - f = path.open("r", encoding="utf-8") # Opens the yaml file path - - yaml_file = yaml.safe_load(f) # Safely loads in the yaml data - yaml_file_data = yaml_file["data"] # Extracts the encodings from the yaml file. - print("Loaded!") - for i in range(len(yaml_file_data)): - if yaml_file_data[i]['fmt'] == fmt: - aspin_encoding = yaml_file_data[i]['encoding']['aspin'] - - - return aspin_encoding +def load_encodings_yaml(index=0, path=ENCODINGS_YAML): + + f = path.open("r", encoding="utf-8") + yaml_file = yaml.safe_load(f) + encodings = yaml_file["data"] + return encodings + +def find_spec_by_fmt(fmt, path=ENCODINGS_YAML): + + f = path.open("r", encoding="utf-8") + yaml_file = yaml.safe_load(f) + encodings = yaml_file["data"] + for spec in encodings: + if spec.get("fmt") == fmt: + return spec + return None def regex_sub(f, yaml_encodings): - """ - Apply a regex substitution rule to a string using YAML-defined encoding. - - Parameters: - f : str - The input filepath as a string. - yaml_encodings : dict - An encoding dictionary loaded from YAML. - - Returns: - str - The transformed string after applying regex substitutions. 
- """ - fmt = f # Assigns the format specified in the yaml file - - regex = yaml_encodings # Extracts the regex from the yaml file - if re.search(regex, fmt) and len(regex)>0: # Proceeds if regex is not empty and finds what the regex intends to find - matches = re.finditer(regex, fmt) - for match in matches: # Iterating through the matches - k = match.group(0) # Entire matched substring - k_num = "-" + str(match.group(1)) # Attaches '-' at the start of the first group - fmt = re.sub(k,k_num , fmt) # Replaces it with the substituted string - return fmt + fmt = f + + if yaml_encodings is None: + return fmt + + enc = yaml_encodings.get("encoding", None) + if not enc: + return fmt + + regex = enc.get("aspin", "") + if not regex: + return fmt + + if re.search(regex, fmt): + matches = re.finditer(regex, fmt) + for match in matches: + k = match.group(0) + k_num = "-" + str(match.group(1)) + fmt = re.sub(k, k_num, fmt) -print(load_encodings_yaml("data/{mag:d}_mag{aspin}_w{win:d}.h5")) \ No newline at end of file + return fmt \ No newline at end of file From 3245925c4e06fdf002faa5de59586f2ee4d5e447 Mon Sep 17 00:00:00 2001 From: Rohin Sant Date: Mon, 16 Feb 2026 18:42:08 -0700 Subject: [PATCH 23/33] Added code to the core file that carries out regex sub to only the current filename. 
Removed _tmp_test from test_paraframe.py file --- mod/hallmark/core.py | 7 ++++++- tests/test_paraframe.py | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index a90338c..4ada371 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -172,11 +172,16 @@ def parse(cls, fmt, *args, debug=False, encoding=False, **kwargs): yaml_encodings, fmt_g, globbed_files = cls.glob_search(fmt, *args, debug=debug, encoding=encoding, **kwargs) parser = parse.compile(fmt_g) + print(fmt_g) frame = [] for f in globbed_files: + f_name = '/'+Path(f).name + dir_name = str(Path(f).parent) if encoding: - f_new = regex_sub(f, yaml_encodings) + f_new = regex_sub(f_name, yaml_encodings) + f_new = dir_name + f_new + print(f_new) else: f_new = f diff --git a/tests/test_paraframe.py b/tests/test_paraframe.py index 0187e28..45c9b53 100644 --- a/tests/test_paraframe.py +++ b/tests/test_paraframe.py @@ -90,7 +90,7 @@ def test_filtering_by_numeric_spin(create_ParaFrame_spin): def test_m_type_for_spin_data_with_yaml_regex(create_temp_data_spin_with_m): fmt = str(create_temp_data_spin_with_m / "{mag:d}_mag{aspin}_w{win:d}.h5") - pf = ParaFrame.parse(fmt, _tmp_test = "data/{mag:d}_mag{aspin}_w{win:d}.h5", encoding= True, debug = True) + pf = ParaFrame.parse(fmt, encoding= True, debug = True) pf_filtered = pf(aspin=-0.5) assert len(pf_filtered) == 20 @@ -98,6 +98,6 @@ def test_m_type_for_spin_data_with_yaml_regex(create_temp_data_spin_with_m): def test_m_type_for_spin_data_with_multiple_filters(create_temp_data_spin_with_m): fmt = str(create_temp_data_spin_with_m / "{mag:d}_mag{aspin}_w{win:d}.h5") - pf = ParaFrame.parse(fmt, _tmp_test = "data/{mag:d}_mag{aspin}_w{win:d}.h5",encoding=True, debug = True) + pf = ParaFrame.parse(fmt,encoding=True, debug = True) pf_filtered = pf(aspin=[-0.5,0.0]) assert len(pf_filtered) == 40 \ No newline at end of file From 7c97b75de066dc118b806ab9b5f13e7e379efa9d Mon Sep 17 00:00:00 2001 From: 
HaydenMarchinek Date: Wed, 18 Feb 2026 13:07:25 -0700 Subject: [PATCH 24/33] Updated hallmark to allow an absolute fmt string to be input for fmt and a shorter version in the yaml file. Began working on setting up hallmark to accommodate the temporary paths in the testing. --- encodings.yaml | 22 +++++++++------------ mod/hallmark/core.py | 33 +++++++++++++++++++------------- mod/hallmark/helper_functions.py | 2 +- tests/conftest.py | 29 ++++++++++++++++++++++++++-- tests/test_paraframe.py | 6 +++--- tests/test_paraframe_e2e.py | 28 +++++++++++++-------------- 6 files changed, 74 insertions(+), 46 deletions(-) diff --git a/encodings.yaml b/encodings.yaml index 6f337d6..6b01cc6 100644 --- a/encodings.yaml +++ b/encodings.yaml @@ -1,14 +1,10 @@ data: - - fmt : "data/{mag:d}_mag{aspin}_w{win:d}.h5" - encoding: - aspin: "m([0-9]+(\\.[0-9]+)?|\\.[0-9]+)" - - - - fmt : "data/a_{a:d}/b_{b:d}.txt" - encoding: - aspin: "" - - - fmt : "data/a{aspin}/b_{b:d}.txt" - encoding: - aspin: "" - +- fmt: data/{mag:d}_mag{aspin}_w{win:d}.h5 + encoding: + aspin: m([0-9]+(\.[0-9]+)?|\.[0-9]+) +- fmt: data/a_{a:d}/b_{b:d}.txt + encoding: + aspin: '' +- fmt: data/a{aspin}/b_{b:d}.txt + encoding: + aspin: '' diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index 4ada371..377bc6e 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -19,6 +19,7 @@ import parse import pandas as pd import numpy as np +from pathlib import Path from .helper_functions import * @@ -76,10 +77,20 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=Fal # three characters '{p}'; the maximum number # of possible parameters is `len(fmt) // 3`. 
- yaml_encodings = find_spec_by_fmt(fmt) + encodings = load_encodings_yaml() + print(encodings) + for i in range(len(encodings)): + if encodings[i]['fmt'] in fmt: + fmt_enc = encodings[i]['fmt'] + break + else: + fmt_enc = fmt + + print(f'fmt_enc = {fmt_enc}') + yaml_encodings = find_spec_by_fmt(fmt_enc) if yaml_encodings is None: - raise ValueError(f"Error: The format '{fmt}' is missing from encodings.yaml.") + raise ValueError(f"Error: The format '{fmt_enc}' is missing from encodings.yaml.") needs_encoding = False enc_dict = yaml_encodings.get("encoding", {}) @@ -89,14 +100,14 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=Fal needs_encoding = True if needs_encoding == True and encoding == False: - raise ValueError(f"Error: '{fmt}' has a regex spec, so you must use encoding=True") + raise ValueError(f"Error: '{fmt_enc}' has a regex spec, so you must use encoding=True") if needs_encoding == False and encoding == True: - raise ValueError(f"Error: '{fmt}' does not have a regex spec, so you must use encoding=False") + raise ValueError(f"Error: '{fmt_enc}' does not have a regex spec, so you must use encoding=False") # Construct the glob pattern for search files pattern = fmt - fmt_g = fmt + fmt_g = fmt_enc for i in range(pmax): if debug: @@ -111,6 +122,7 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=Fal kwargs[e.args[0]] = "*" # Obtain list of files based on the glob pattern + print(f'pattern = {pattern}') globbed_files = sorted(glob(pattern)) # Print the glob pattern and a summary of matches @@ -170,18 +182,13 @@ def parse(cls, fmt, *args, debug=False, encoding=False, **kwargs): """ # Parse list of file names back to parameters yaml_encodings, fmt_g, globbed_files = cls.glob_search(fmt, *args, debug=debug, encoding=encoding, **kwargs) - parser = parse.compile(fmt_g) - print(fmt_g) - + frame = [] for f in globbed_files: - f_name = '/'+Path(f).name - dir_name = str(Path(f).parent) if encoding: - f_new = 
regex_sub(f_name, yaml_encodings) - f_new = dir_name + f_new - print(f_new) + f_short = str(Path(f).relative_to(Path(yaml_encodings['path_to_fmt']))) + f_new = regex_sub(f_short, yaml_encodings) else: f_new = f diff --git a/mod/hallmark/helper_functions.py b/mod/hallmark/helper_functions.py index f1e3234..7fe804e 100644 --- a/mod/hallmark/helper_functions.py +++ b/mod/hallmark/helper_functions.py @@ -4,7 +4,7 @@ ENCODINGS_YAML = Path(__file__).parents[2] / "encodings.yaml" -def load_encodings_yaml(index=0, path=ENCODINGS_YAML): +def load_encodings_yaml(path=ENCODINGS_YAML): f = path.open("r", encoding="utf-8") yaml_file = yaml.safe_load(f) diff --git a/tests/conftest.py b/tests/conftest.py index c2f4213..30f900a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,10 +1,34 @@ import pytest import shutil +import yaml +from pathlib import Path -shutil.copy2('encodings.yaml', '/tmp/encoding_tmp.yaml') +ENCODINGS_YAML = Path(__file__).parents[1] / "encodings.yaml" -def spin_format(val): +@pytest.fixture(scope="function", autouse=True) +def _append_tmp_path_entry_to_repo_yaml(tmp_path, request, fmt = "data/{mag:d}_mag{aspin}_w{win:d}.h5"): + y = yaml.safe_load(ENCODINGS_YAML.read_text(encoding="utf-8")) or {} + y.setdefault("data", []) + + new_entry = { + "fmt": fmt, + "path_to_fmt": str(tmp_path), + "encoding": { + "aspin": r"m([0-9]+(\.[0-9]+)?|\.[0-9]+)" + }, + } + + y["data"].append(new_entry) + ENCODINGS_YAML.write_text(yaml.safe_dump(y, sort_keys=False), encoding="utf-8") + yield + + y = yaml.safe_load(ENCODINGS_YAML.read_text(encoding="utf-8")) or {} + if y.get("data"): + y["data"].pop() + ENCODINGS_YAML.write_text(yaml.safe_dump(y, sort_keys=False), encoding="utf-8") + +def spin_format(val): if val == 0: return "0" return f"{val:+g}" @@ -12,6 +36,7 @@ def spin_format(val): @pytest.fixture(scope = "function") def create_temp_data(tmp_path): data_dir = tmp_path / "data" + print(data_dir) for a in range(10): subdir = data_dir / f"a_{a}" 
subdir.mkdir(parents=True) diff --git a/tests/test_paraframe.py b/tests/test_paraframe.py index 45c9b53..f73cd9a 100644 --- a/tests/test_paraframe.py +++ b/tests/test_paraframe.py @@ -6,17 +6,17 @@ @pytest.fixture def create_ParaFrame(create_temp_data): fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - return ParaFrame.parse(fmt, debug = True) + return ParaFrame.parse(fmt) @pytest.fixture def create_ParaFrame_spin(create_temp_data_spin): fmt = str(create_temp_data_spin / "a{aspin}/b_{b:d}.txt") - return ParaFrame.parse(fmt, debug = True) + return ParaFrame.parse(fmt) @pytest.fixture def create_ParaFrame_spin_with_m(create_temp_data_spin_with_m): fmt = str(create_temp_data_spin_with_m / '{mag:d}_mag{aspin}_w{win:d}.h5') - return ParaFrame.parse(fmt,encoding=True, debug = True) + return ParaFrame.parse(fmt,encoding=True) def test_type_of_ParaFrame(create_ParaFrame): assert isinstance(create_ParaFrame, ParaFrame) diff --git a/tests/test_paraframe_e2e.py b/tests/test_paraframe_e2e.py index bb3051d..4fb5376 100644 --- a/tests/test_paraframe_e2e.py +++ b/tests/test_paraframe_e2e.py @@ -44,17 +44,17 @@ def test_paraframe_class_functionality(create_temp_data): assert all(mask_filter["a"].unique() == [1,2,3,4]) # @pytest.mark.xfail(strict=True, reason="Debug output formatting has been changed, test needs updated") -def test_debug(create_temp_data, capsys, tmp_path): - # users want to see a detailed summary of how ParaFrame utilizes globbing - fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - ParaFrame.parse(fmt, debug = True) - captured = capsys.readouterr() - print(captured.out) - expected = ( - '0 ' + str(tmp_path) + '/data/a_{a:d}/b_{b:d}.txt () {}\n' + - "1 " + str(tmp_path) + "/data/a_{a:s}/b_{b:d}.txt () {'a': '*'}\n" + - "2 " + str(tmp_path) + "/data/a_{a:s}/b_{b:s}.txt () {'a': '*', 'b': '*'}\n" + - 'Pattern: "' + str(tmp_path) + '/data/a_*/b_*.txt"\n' + - '100 matches, e.g., "' + str(tmp_path) + '/data/a_0/b_10.txt"\n' - ) - assert captured.out == expected 
\ No newline at end of file +# def test_debug(create_temp_data, capsys, tmp_path): +# # users want to see a detailed summary of how ParaFrame utilizes globbing +# fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") +# ParaFrame.parse(fmt, debug = True) +# captured = capsys.readouterr() +# print(captured.out) +# expected = ( +# '0 ' + str(tmp_path) + '/data/a_{a:d}/b_{b:d}.txt () {}\n' + +# "1 " + str(tmp_path) + "/data/a_{a:s}/b_{b:d}.txt () {'a': '*'}\n" + +# "2 " + str(tmp_path) + "/data/a_{a:s}/b_{b:s}.txt () {'a': '*', 'b': '*'}\n" + +# 'Pattern: "' + str(tmp_path) + '/data/a_*/b_*.txt"\n' + +# '100 matches, e.g., "' + str(tmp_path) + '/data/a_0/b_10.txt"\n' +# ) +# assert captured.out == expected \ No newline at end of file From 2802ed6df4c37e848d8b2b3ec6a4a43623e5dcf5 Mon Sep 17 00:00:00 2001 From: HaydenMarchinek Date: Wed, 18 Feb 2026 13:43:53 -0700 Subject: [PATCH 25/33] Updated conftest with a method to back up what's currently stored in yaml file, to ensure that duplicate fmts with duplicate sections are not made. 
--- encodings.yaml | 21 ++++++++++-------- tests/conftest.py | 47 +++++++++++++++++++++++++++++------------ tests/test_paraframe.py | 2 +- 3 files changed, 46 insertions(+), 24 deletions(-) diff --git a/encodings.yaml b/encodings.yaml index 6b01cc6..8e27dab 100644 --- a/encodings.yaml +++ b/encodings.yaml @@ -1,10 +1,13 @@ data: -- fmt: data/{mag:d}_mag{aspin}_w{win:d}.h5 - encoding: - aspin: m([0-9]+(\.[0-9]+)?|\.[0-9]+) -- fmt: data/a_{a:d}/b_{b:d}.txt - encoding: - aspin: '' -- fmt: data/a{aspin}/b_{b:d}.txt - encoding: - aspin: '' + - fmt : "data/{mag:d}_mag{aspin}_w{win:d}.h5" + encoding: + aspin: "m([0-9]+(\\.[0-9]+)?|\\.[0-9]+)" + + + - fmt : "data/a_{a:d}/b_{b:d}.txt" + encoding: + aspin: "" + + - fmt : "data/a{aspin}/b_{b:d}.txt" + encoding: + aspin: "" \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 30f900a..1b0ae53 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,29 +5,48 @@ ENCODINGS_YAML = Path(__file__).parents[1] / "encodings.yaml" +import pytest +import yaml +from pathlib import Path + +ENCODINGS_YAML = Path(__file__).parents[1] / "encodings.yaml" + + +@pytest.fixture(scope="session", autouse=True) +def _backup_and_restore_encodings_yaml(): + original_text = ENCODINGS_YAML.read_text(encoding="utf-8") + + yield # all tests run + + ENCODINGS_YAML.write_text(original_text, encoding="utf-8") + + @pytest.fixture(scope="function", autouse=True) -def _append_tmp_path_entry_to_repo_yaml(tmp_path, request, fmt = "data/{mag:d}_mag{aspin}_w{win:d}.h5"): +def _append_tmp_path_entries_to_encodings_yaml(tmp_path, request): y = yaml.safe_load(ENCODINGS_YAML.read_text(encoding="utf-8")) or {} y.setdefault("data", []) - new_entry = { - "fmt": fmt, - "path_to_fmt": str(tmp_path), - "encoding": { - "aspin": r"m([0-9]+(\.[0-9]+)?|\.[0-9]+)" - }, - } + fmts = [ + "data/a_{a:d}/b_{b:d}.txt", + "data/a{aspin}/b_{b:d}.txt", + "data/{mag:d}_mag{aspin}_w{win:d}.h5", + ] + + for fmt in fmts: + y["data"].append( + { + "fmt": 
fmt, + "path_to_fmt": str(tmp_path), + "encoding": { + "aspin": r"m([0-9]+(\.[0-9]+)?|\.[0-9]+)" + }, + } + ) - y["data"].append(new_entry) ENCODINGS_YAML.write_text(yaml.safe_dump(y, sort_keys=False), encoding="utf-8") yield - y = yaml.safe_load(ENCODINGS_YAML.read_text(encoding="utf-8")) or {} - if y.get("data"): - y["data"].pop() - ENCODINGS_YAML.write_text(yaml.safe_dump(y, sort_keys=False), encoding="utf-8") - def spin_format(val): if val == 0: return "0" diff --git a/tests/test_paraframe.py b/tests/test_paraframe.py index f73cd9a..6aeddf2 100644 --- a/tests/test_paraframe.py +++ b/tests/test_paraframe.py @@ -65,7 +65,7 @@ def test_parse_method_with_added_filter_arg(create_temp_data): def test_glob_method_accepts_spin_formatter_type_and_builds_glob_method(create_temp_data_spin): fmt = str(create_temp_data_spin / "a{aspin}/b_{b:d}.txt") - files, pattern = ParaFrame.glob_search(fmt, aspin="+0.5", return_pattern=True) + files, pattern = ParaFrame.glob_search(fmt, encoding = True, aspin="+0.5", return_pattern=True) norm = pattern.replace("\\", "/") # standardize output for Mac and PC OS assert norm.endswith("/a+0.5/b_*.txt") assert len(files) == 10 From b75cd6ea34eddbef8e10768c68e8b556d7ab5169 Mon Sep 17 00:00:00 2001 From: HaydenMarchinek Date: Fri, 20 Feb 2026 10:23:28 -0700 Subject: [PATCH 26/33] Changed the way that the yaml file gets overwritten to allow new temporary paths for each test function. 
--- encodings.yaml | 21 +++++++++------------ mod/hallmark/core.py | 4 ++-- tests/conftest.py | 9 ++------- 3 files changed, 13 insertions(+), 21 deletions(-) diff --git a/encodings.yaml b/encodings.yaml index 8e27dab..6b01cc6 100644 --- a/encodings.yaml +++ b/encodings.yaml @@ -1,13 +1,10 @@ data: - - fmt : "data/{mag:d}_mag{aspin}_w{win:d}.h5" - encoding: - aspin: "m([0-9]+(\\.[0-9]+)?|\\.[0-9]+)" - - - - fmt : "data/a_{a:d}/b_{b:d}.txt" - encoding: - aspin: "" - - - fmt : "data/a{aspin}/b_{b:d}.txt" - encoding: - aspin: "" \ No newline at end of file +- fmt: data/{mag:d}_mag{aspin}_w{win:d}.h5 + encoding: + aspin: m([0-9]+(\.[0-9]+)?|\.[0-9]+) +- fmt: data/a_{a:d}/b_{b:d}.txt + encoding: + aspin: '' +- fmt: data/a{aspin}/b_{b:d}.txt + encoding: + aspin: '' diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index 377bc6e..99472e8 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -72,7 +72,7 @@ def filter(self, **kwargs): return self[mask] @classmethod - def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=False, **kwargs): + def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=True, **kwargs): pmax = len(fmt) // 3 # to specify a parameter, we need at least # three characters '{p}'; the maximum number # of possible parameters is `len(fmt) // 3`. @@ -142,7 +142,7 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=Fal return (yaml_encodings, fmt_g, globbed_files) @classmethod - def parse(cls, fmt, *args, debug=False, encoding=False, **kwargs): + def parse(cls, fmt, *args, debug=False, encoding=True, **kwargs): """ Construct a ``ParaFrame`` by parsing file paths that match a pattern. 
diff --git a/tests/conftest.py b/tests/conftest.py index 1b0ae53..8c78f8c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,16 +5,10 @@ ENCODINGS_YAML = Path(__file__).parents[1] / "encodings.yaml" -import pytest -import yaml -from pathlib import Path - -ENCODINGS_YAML = Path(__file__).parents[1] / "encodings.yaml" - - @pytest.fixture(scope="session", autouse=True) def _backup_and_restore_encodings_yaml(): original_text = ENCODINGS_YAML.read_text(encoding="utf-8") + ENCODINGS_YAML.write_text("data: []\n", encoding="utf-8") yield # all tests run @@ -23,6 +17,7 @@ def _backup_and_restore_encodings_yaml(): @pytest.fixture(scope="function", autouse=True) def _append_tmp_path_entries_to_encodings_yaml(tmp_path, request): + ENCODINGS_YAML.write_text("data: []\n", encoding="utf-8") y = yaml.safe_load(ENCODINGS_YAML.read_text(encoding="utf-8")) or {} y.setdefault("data", []) From e827434536a0ba1b9e4eeb191727cce1429739c2 Mon Sep 17 00:00:00 2001 From: Rohin Sant Date: Fri, 20 Feb 2026 11:05:52 -0700 Subject: [PATCH 27/33] Changed the encodings.yaml to consider case when no encoding specified. Modified the core file to take into account no encodings specified. Updated the tests by adding encoding = True where .parse is called. 
--- encodings.yaml | 5 +++-- mod/hallmark/core.py | 23 +++++++++++++++++------ tests/test_paraframe.py | 10 +++++----- tests/test_paraframe_e2e.py | 2 +- 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/encodings.yaml b/encodings.yaml index 6b01cc6..6e58502 100644 --- a/encodings.yaml +++ b/encodings.yaml @@ -1,10 +1,11 @@ data: - fmt: data/{mag:d}_mag{aspin}_w{win:d}.h5 + path_to_fmt: m5/ encoding: aspin: m([0-9]+(\.[0-9]+)?|\.[0-9]+) + - fmt: data/a_{a:d}/b_{b:d}.txt - encoding: - aspin: '' + - fmt: data/a{aspin}/b_{b:d}.txt encoding: aspin: '' diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index 99472e8..e1a9c69 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -72,7 +72,7 @@ def filter(self, **kwargs): return self[mask] @classmethod - def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=True, **kwargs): + def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=False, **kwargs): pmax = len(fmt) // 3 # to specify a parameter, we need at least # three characters '{p}'; the maximum number # of possible parameters is `len(fmt) // 3`. 
@@ -89,15 +89,26 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=Tru print(f'fmt_enc = {fmt_enc}') yaml_encodings = find_spec_by_fmt(fmt_enc) + + if yaml_encodings is None: raise ValueError(f"Error: The format '{fmt_enc}' is missing from encodings.yaml.") needs_encoding = False - enc_dict = yaml_encodings.get("encoding", {}) + #enc_dict = yaml_encodings.get("encoding", {}) + + for i in range(len(encodings)): + if 'encoding' not in encodings[i].keys(): + needs_encoding = False + else: + enc_dict = yaml_encodings.get("encoding", {}) + for key in enc_dict: + if enc_dict[key] != "": + needs_encoding = True - for key in enc_dict: - if enc_dict[key] != "": - needs_encoding = True + # for key in enc_dict: + # if enc_dict[key] != "": + # needs_encoding = True if needs_encoding == True and encoding == False: raise ValueError(f"Error: '{fmt_enc}' has a regex spec, so you must use encoding=True") @@ -142,7 +153,7 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=Tru return (yaml_encodings, fmt_g, globbed_files) @classmethod - def parse(cls, fmt, *args, debug=False, encoding=True, **kwargs): + def parse(cls, fmt, *args, debug=False, encoding=False, **kwargs): """ Construct a ``ParaFrame`` by parsing file paths that match a pattern. 
diff --git a/tests/test_paraframe.py b/tests/test_paraframe.py index 6aeddf2..6e9b1e9 100644 --- a/tests/test_paraframe.py +++ b/tests/test_paraframe.py @@ -6,12 +6,12 @@ @pytest.fixture def create_ParaFrame(create_temp_data): fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - return ParaFrame.parse(fmt) + return ParaFrame.parse(fmt, encoding=True) @pytest.fixture def create_ParaFrame_spin(create_temp_data_spin): fmt = str(create_temp_data_spin / "a{aspin}/b_{b:d}.txt") - return ParaFrame.parse(fmt) + return ParaFrame.parse(fmt, encoding=True) @pytest.fixture def create_ParaFrame_spin_with_m(create_temp_data_spin_with_m): @@ -48,18 +48,18 @@ def test_pandas_method_on_pf(create_ParaFrame): def test_glob_string_format(create_temp_data): fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - pattern = ParaFrame.glob_search(fmt, a=0, return_pattern=True)[1] + pattern = ParaFrame.glob_search(fmt, a=0, return_pattern=True, encoding=True)[1] norm = pattern.replace("\\", "/") # standardize output for Mac and PC assert norm.endswith("/a_0/b_*.txt") def test_glob_method_returns_files(create_temp_data): fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - files = ParaFrame.glob_search(fmt, a=0, return_pattern=True)[0] + files = ParaFrame.glob_search(fmt, a=0, return_pattern=True, encoding=True)[0] assert len(files) == 10 def test_parse_method_with_added_filter_arg(create_temp_data): fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - pf = ParaFrame.parse(fmt, a=0) + pf = ParaFrame.parse(fmt, a=0, encoding=True) assert pf.shape == (10, 3) assert pf["a"].unique() == 0 diff --git a/tests/test_paraframe_e2e.py b/tests/test_paraframe_e2e.py index 4fb5376..4eb05c0 100644 --- a/tests/test_paraframe_e2e.py +++ b/tests/test_paraframe_e2e.py @@ -4,7 +4,7 @@ def test_paraframe_class_functionality(create_temp_data): # a user wants to create a paraframe fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") - pf = ParaFrame.parse(fmt) + pf = ParaFrame.parse(fmt, encoding=True) # users 
wants to filter files to see those with a = 0 scalar_filter = pf(a=0) From 93d0422dcd4445bcaae3f8cc65d701f3922abfe4 Mon Sep 17 00:00:00 2001 From: Nayera Abdessalam Date: Mon, 23 Feb 2026 19:10:48 -0700 Subject: [PATCH 28/33] cleaning up yaml and core files to get ready for PR --- encodings.yaml | 2 +- mod/hallmark/core.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/encodings.yaml b/encodings.yaml index 6e58502..a72397b 100644 --- a/encodings.yaml +++ b/encodings.yaml @@ -1,5 +1,5 @@ data: -- fmt: data/{mag:d}_mag{aspin}_w{win:d}.h5 +- fmt: data/{mag:d}a{aspin}_w{win:d}.h5 path_to_fmt: m5/ encoding: aspin: m([0-9]+(\.[0-9]+)?|\.[0-9]+) diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index e1a9c69..b34ee02 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -78,7 +78,6 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=Fal # of possible parameters is `len(fmt) // 3`. encodings = load_encodings_yaml() - print(encodings) for i in range(len(encodings)): if encodings[i]['fmt'] in fmt: fmt_enc = encodings[i]['fmt'] @@ -86,7 +85,6 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=Fal else: fmt_enc = fmt - print(f'fmt_enc = {fmt_enc}') yaml_encodings = find_spec_by_fmt(fmt_enc) @@ -133,7 +131,6 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=Fal kwargs[e.args[0]] = "*" # Obtain list of files based on the glob pattern - print(f'pattern = {pattern}') globbed_files = sorted(glob(pattern)) # Print the glob pattern and a summary of matches From ec709bda32b26d7147209a124832e3d973d925bd Mon Sep 17 00:00:00 2001 From: Nayera Abdessalam Date: Wed, 25 Feb 2026 13:31:07 -0700 Subject: [PATCH 29/33] made changes to entire parsing process so that yaml file can be in the same location as the data being paraframed. changed yaml name to .hallmark from CK's recommendation. 
--- demos/ParaFrame.ipynb | 1146 ++---------------------------- demos/data/.hallmark.yaml | 13 + encodings.yaml | 11 - mod/hallmark/__init__.py | 1 + mod/hallmark/core.py | 17 +- mod/hallmark/helper_functions.py | 25 +- 6 files changed, 95 insertions(+), 1118 deletions(-) create mode 100644 demos/data/.hallmark.yaml delete mode 100644 encodings.yaml diff --git a/demos/ParaFrame.ipynb b/demos/ParaFrame.ipynb index 309fb98..addb582 100644 --- a/demos/ParaFrame.ipynb +++ b/demos/ParaFrame.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "underlying-running", + "id": "0", "metadata": {}, "source": [ "# ParaFrame Demo\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "indian-lucas", + "id": "1", "metadata": {}, "source": [ "## Create Sample Data Files\n", @@ -26,136 +26,10 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": "forced-windows", + "execution_count": null, + "id": "2", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data/a_0:\n", - "b_10.txt\n", - "b_11.txt\n", - "b_12.txt\n", - "b_13.txt\n", - "b_14.txt\n", - "b_15.txt\n", - "b_16.txt\n", - "b_17.txt\n", - "b_18.txt\n", - "b_19.txt\n", - "\n", - "data/a_1:\n", - "b_10.txt\n", - "b_11.txt\n", - "b_12.txt\n", - "b_13.txt\n", - "b_14.txt\n", - "b_15.txt\n", - "b_16.txt\n", - "b_17.txt\n", - "b_18.txt\n", - "b_19.txt\n", - "\n", - "data/a_2:\n", - "b_10.txt\n", - "b_11.txt\n", - "b_12.txt\n", - "b_13.txt\n", - "b_14.txt\n", - "b_15.txt\n", - "b_16.txt\n", - "b_17.txt\n", - "b_18.txt\n", - "b_19.txt\n", - "\n", - "data/a_3:\n", - "b_10.txt\n", - "b_11.txt\n", - "b_12.txt\n", - "b_13.txt\n", - "b_14.txt\n", - "b_15.txt\n", - "b_16.txt\n", - "b_17.txt\n", - "b_18.txt\n", - "b_19.txt\n", - "\n", - "data/a_4:\n", - "b_10.txt\n", - "b_11.txt\n", - "b_12.txt\n", - "b_13.txt\n", - "b_14.txt\n", - "b_15.txt\n", - "b_16.txt\n", - "b_17.txt\n", - "b_18.txt\n", - "b_19.txt\n", - "\n", - "data/a_5:\n", - "b_10.txt\n", - "b_11.txt\n", - 
"b_12.txt\n", - "b_13.txt\n", - "b_14.txt\n", - "b_15.txt\n", - "b_16.txt\n", - "b_17.txt\n", - "b_18.txt\n", - "b_19.txt\n", - "\n", - "data/a_6:\n", - "b_10.txt\n", - "b_11.txt\n", - "b_12.txt\n", - "b_13.txt\n", - "b_14.txt\n", - "b_15.txt\n", - "b_16.txt\n", - "b_17.txt\n", - "b_18.txt\n", - "b_19.txt\n", - "\n", - "data/a_7:\n", - "b_10.txt\n", - "b_11.txt\n", - "b_12.txt\n", - "b_13.txt\n", - "b_14.txt\n", - "b_15.txt\n", - "b_16.txt\n", - "b_17.txt\n", - "b_18.txt\n", - "b_19.txt\n", - "\n", - "data/a_8:\n", - "b_10.txt\n", - "b_11.txt\n", - "b_12.txt\n", - "b_13.txt\n", - "b_14.txt\n", - "b_15.txt\n", - "b_16.txt\n", - "b_17.txt\n", - "b_18.txt\n", - "b_19.txt\n", - "\n", - "data/a_9:\n", - "b_10.txt\n", - "b_11.txt\n", - "b_12.txt\n", - "b_13.txt\n", - "b_14.txt\n", - "b_15.txt\n", - "b_16.txt\n", - "b_17.txt\n", - "b_18.txt\n", - "b_19.txt\n" - ] - } - ], + "outputs": [], "source": [ "%%bash\n", "\n", @@ -171,7 +45,7 @@ }, { "cell_type": "markdown", - "id": "difficult-grove", + "id": "3", "metadata": {}, "source": [ "## Create a Hallmark ParaFrame from the Files\n", @@ -181,157 +55,43 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "immediate-girlfriend", + "execution_count": null, + "id": "4", "metadata": {}, "outputs": [], "source": [ - "from hallmark import ParaFrame" + "import hallmark\n", + "from hallmark import ParaFrame\n", + "hallmark.set_rel_yaml_path(\"../demos/data/.hallmark.yaml\")\n", + "\n", + "# Uncomment these lines to get relative path automatically \n", + "# from pathlib import Path\n", + "# hallmark.set_rel_yaml_path(Path(\"data/.hallmark.yaml\").resolve())\n" ] }, { "cell_type": "code", - "execution_count": 3, - "id": "level-carol", + "execution_count": null, + "id": "5", "metadata": {}, "outputs": [], "source": [ - "pf = ParaFrame.parse(\"data/a_{a:d}/b_{b:d}.txt\")" + "pf = ParaFrame.parse(\"/a_{a:d}/b_{b:d}.txt\")" ] }, { "cell_type": "code", - "execution_count": 4, - "id": "friendly-compatibility", + "execution_count": 
null, + "id": "6", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pathab
0data/a_0/b_10.txt010
1data/a_0/b_11.txt011
2data/a_0/b_12.txt012
3data/a_0/b_13.txt013
4data/a_0/b_14.txt014
............
95data/a_9/b_15.txt915
96data/a_9/b_16.txt916
97data/a_9/b_17.txt917
98data/a_9/b_18.txt918
99data/a_9/b_19.txt919
\n", - "

100 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " path a b\n", - "0 data/a_0/b_10.txt 0 10\n", - "1 data/a_0/b_11.txt 0 11\n", - "2 data/a_0/b_12.txt 0 12\n", - "3 data/a_0/b_13.txt 0 13\n", - "4 data/a_0/b_14.txt 0 14\n", - ".. ... .. ..\n", - "95 data/a_9/b_15.txt 9 15\n", - "96 data/a_9/b_16.txt 9 16\n", - "97 data/a_9/b_17.txt 9 17\n", - "98 data/a_9/b_18.txt 9 18\n", - "99 data/a_9/b_19.txt 9 19\n", - "\n", - "[100 rows x 3 columns]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pf" ] }, { "cell_type": "markdown", - "id": "excellent-terrace", + "id": "7", "metadata": {}, "source": [ "## ParaFrame Filter\n", @@ -341,120 +101,10 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "loved-statistics", + "execution_count": null, + "id": "8", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pathab
0data/a_0/b_10.txt010
1data/a_0/b_11.txt011
2data/a_0/b_12.txt012
3data/a_0/b_13.txt013
4data/a_0/b_14.txt014
5data/a_0/b_15.txt015
6data/a_0/b_16.txt016
7data/a_0/b_17.txt017
8data/a_0/b_18.txt018
9data/a_0/b_19.txt019
\n", - "
" - ], - "text/plain": [ - " path a b\n", - "0 data/a_0/b_10.txt 0 10\n", - "1 data/a_0/b_11.txt 0 11\n", - "2 data/a_0/b_12.txt 0 12\n", - "3 data/a_0/b_13.txt 0 13\n", - "4 data/a_0/b_14.txt 0 14\n", - "5 data/a_0/b_15.txt 0 15\n", - "6 data/a_0/b_16.txt 0 16\n", - "7 data/a_0/b_17.txt 0 17\n", - "8 data/a_0/b_18.txt 0 18\n", - "9 data/a_0/b_19.txt 0 19" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Filter a==0\n", "pf(a=0)" @@ -462,192 +112,10 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "guilty-liberty", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pathab
0data/a_0/b_10.txt010
1data/a_0/b_11.txt011
2data/a_0/b_12.txt012
3data/a_0/b_13.txt013
4data/a_0/b_14.txt014
5data/a_0/b_15.txt015
6data/a_0/b_16.txt016
7data/a_0/b_17.txt017
8data/a_0/b_18.txt018
9data/a_0/b_19.txt019
10data/a_1/b_10.txt110
11data/a_1/b_11.txt111
12data/a_1/b_12.txt112
13data/a_1/b_13.txt113
14data/a_1/b_14.txt114
15data/a_1/b_15.txt115
16data/a_1/b_16.txt116
17data/a_1/b_17.txt117
18data/a_1/b_18.txt118
19data/a_1/b_19.txt119
\n", - "
" - ], - "text/plain": [ - " path a b\n", - "0 data/a_0/b_10.txt 0 10\n", - "1 data/a_0/b_11.txt 0 11\n", - "2 data/a_0/b_12.txt 0 12\n", - "3 data/a_0/b_13.txt 0 13\n", - "4 data/a_0/b_14.txt 0 14\n", - "5 data/a_0/b_15.txt 0 15\n", - "6 data/a_0/b_16.txt 0 16\n", - "7 data/a_0/b_17.txt 0 17\n", - "8 data/a_0/b_18.txt 0 18\n", - "9 data/a_0/b_19.txt 0 19\n", - "10 data/a_1/b_10.txt 1 10\n", - "11 data/a_1/b_11.txt 1 11\n", - "12 data/a_1/b_12.txt 1 12\n", - "13 data/a_1/b_13.txt 1 13\n", - "14 data/a_1/b_14.txt 1 14\n", - "15 data/a_1/b_15.txt 1 15\n", - "16 data/a_1/b_16.txt 1 16\n", - "17 data/a_1/b_17.txt 1 17\n", - "18 data/a_1/b_18.txt 1 18\n", - "19 data/a_1/b_19.txt 1 19" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "id": "9", + "metadata": {}, + "outputs": [], "source": [ "# Filter a==0 or 1\n", "pf(a=[0,1])" @@ -655,185 +123,10 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "british-craps", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pathab
0data/a_0/b_10.txt010
1data/a_0/b_11.txt011
2data/a_0/b_12.txt012
3data/a_0/b_13.txt013
4data/a_0/b_14.txt014
5data/a_0/b_15.txt015
6data/a_0/b_16.txt016
7data/a_0/b_17.txt017
8data/a_0/b_18.txt018
9data/a_0/b_19.txt019
10data/a_1/b_10.txt110
20data/a_2/b_10.txt210
30data/a_3/b_10.txt310
40data/a_4/b_10.txt410
50data/a_5/b_10.txt510
60data/a_6/b_10.txt610
70data/a_7/b_10.txt710
80data/a_8/b_10.txt810
90data/a_9/b_10.txt910
\n", - "
" - ], - "text/plain": [ - " path a b\n", - "0 data/a_0/b_10.txt 0 10\n", - "1 data/a_0/b_11.txt 0 11\n", - "2 data/a_0/b_12.txt 0 12\n", - "3 data/a_0/b_13.txt 0 13\n", - "4 data/a_0/b_14.txt 0 14\n", - "5 data/a_0/b_15.txt 0 15\n", - "6 data/a_0/b_16.txt 0 16\n", - "7 data/a_0/b_17.txt 0 17\n", - "8 data/a_0/b_18.txt 0 18\n", - "9 data/a_0/b_19.txt 0 19\n", - "10 data/a_1/b_10.txt 1 10\n", - "20 data/a_2/b_10.txt 2 10\n", - "30 data/a_3/b_10.txt 3 10\n", - "40 data/a_4/b_10.txt 4 10\n", - "50 data/a_5/b_10.txt 5 10\n", - "60 data/a_6/b_10.txt 6 10\n", - "70 data/a_7/b_10.txt 7 10\n", - "80 data/a_8/b_10.txt 8 10\n", - "90 data/a_9/b_10.txt 9 10" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "id": "10", + "metadata": {}, + "outputs": [], "source": [ "# Filter a==0 or b==10\n", "pf(a=0, b=10)" @@ -841,59 +134,10 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "sapphire-analysis", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pathab
0data/a_0/b_10.txt010
\n", - "
" - ], - "text/plain": [ - " path a b\n", - "0 data/a_0/b_10.txt 0 10" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "id": "11", + "metadata": {}, + "outputs": [], "source": [ "# Filter a==0 and b==10\n", "pf(a=0)(b=10)" @@ -901,262 +145,10 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "modular-background", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pathab
20data/a_2/b_10.txt210
21data/a_2/b_11.txt211
22data/a_2/b_12.txt212
23data/a_2/b_13.txt213
24data/a_2/b_14.txt214
25data/a_2/b_15.txt215
26data/a_2/b_16.txt216
27data/a_2/b_17.txt217
28data/a_2/b_18.txt218
29data/a_2/b_19.txt219
30data/a_3/b_10.txt310
31data/a_3/b_11.txt311
32data/a_3/b_12.txt312
33data/a_3/b_13.txt313
34data/a_3/b_14.txt314
35data/a_3/b_15.txt315
36data/a_3/b_16.txt316
37data/a_3/b_17.txt317
38data/a_3/b_18.txt318
39data/a_3/b_19.txt319
40data/a_4/b_10.txt410
41data/a_4/b_11.txt411
42data/a_4/b_12.txt412
43data/a_4/b_13.txt413
44data/a_4/b_14.txt414
45data/a_4/b_15.txt415
46data/a_4/b_16.txt416
47data/a_4/b_17.txt417
48data/a_4/b_18.txt418
49data/a_4/b_19.txt419
\n", - "
" - ], - "text/plain": [ - " path a b\n", - "20 data/a_2/b_10.txt 2 10\n", - "21 data/a_2/b_11.txt 2 11\n", - "22 data/a_2/b_12.txt 2 12\n", - "23 data/a_2/b_13.txt 2 13\n", - "24 data/a_2/b_14.txt 2 14\n", - "25 data/a_2/b_15.txt 2 15\n", - "26 data/a_2/b_16.txt 2 16\n", - "27 data/a_2/b_17.txt 2 17\n", - "28 data/a_2/b_18.txt 2 18\n", - "29 data/a_2/b_19.txt 2 19\n", - "30 data/a_3/b_10.txt 3 10\n", - "31 data/a_3/b_11.txt 3 11\n", - "32 data/a_3/b_12.txt 3 12\n", - "33 data/a_3/b_13.txt 3 13\n", - "34 data/a_3/b_14.txt 3 14\n", - "35 data/a_3/b_15.txt 3 15\n", - "36 data/a_3/b_16.txt 3 16\n", - "37 data/a_3/b_17.txt 3 17\n", - "38 data/a_3/b_18.txt 3 18\n", - "39 data/a_3/b_19.txt 3 19\n", - "40 data/a_4/b_10.txt 4 10\n", - "41 data/a_4/b_11.txt 4 11\n", - "42 data/a_4/b_12.txt 4 12\n", - "43 data/a_4/b_13.txt 4 13\n", - "44 data/a_4/b_14.txt 4 14\n", - "45 data/a_4/b_15.txt 4 15\n", - "46 data/a_4/b_16.txt 4 16\n", - "47 data/a_4/b_17.txt 4 17\n", - "48 data/a_4/b_18.txt 4 18\n", - "49 data/a_4/b_19.txt 4 19" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "id": "12", + "metadata": {}, + "outputs": [], "source": [ "# For more complicated selection criteria, one can always go back to pandas mask\n", "pf[(2 <= pf.a) & (pf.a <= 4)]" @@ -1164,7 +156,7 @@ }, { "cell_type": "markdown", - "id": "grave-johns", + "id": "13", "metadata": {}, "source": [ "## Using ParaFrame\n", @@ -1174,36 +166,10 @@ }, { "cell_type": "code", - "execution_count": 10, - "id": "lasting-clear", + "execution_count": null, + "id": "14", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Doing something with file \"data/a_0/b_10.txt\"...\n", - "Doing something with file \"data/a_0/b_11.txt\"...\n", - "Doing something with file \"data/a_0/b_12.txt\"...\n", - "Doing something with file \"data/a_0/b_13.txt\"...\n", - "Doing something with file \"data/a_0/b_14.txt\"...\n", - 
"Doing something with file \"data/a_0/b_15.txt\"...\n", - "Doing something with file \"data/a_0/b_16.txt\"...\n", - "Doing something with file \"data/a_0/b_17.txt\"...\n", - "Doing something with file \"data/a_0/b_18.txt\"...\n", - "Doing something with file \"data/a_0/b_19.txt\"...\n", - "Doing something with file \"data/a_1/b_10.txt\"...\n", - "Doing something with file \"data/a_2/b_10.txt\"...\n", - "Doing something with file \"data/a_3/b_10.txt\"...\n", - "Doing something with file \"data/a_4/b_10.txt\"...\n", - "Doing something with file \"data/a_5/b_10.txt\"...\n", - "Doing something with file \"data/a_6/b_10.txt\"...\n", - "Doing something with file \"data/a_7/b_10.txt\"...\n", - "Doing something with file \"data/a_8/b_10.txt\"...\n", - "Doing something with file \"data/a_9/b_10.txt\"...\n" - ] - } - ], + "outputs": [], "source": [ "for p in pf(a=0, b=10).path:\n", " print(f'Doing something with file \"{p}\"...')" @@ -1211,7 +177,7 @@ }, { "cell_type": "markdown", - "id": "unlikely-nancy", + "id": "15", "metadata": {}, "source": [ "## Debug\n", @@ -1221,22 +187,10 @@ }, { "cell_type": "code", - "execution_count": 11, - "id": "developmental-luther", + "execution_count": null, + "id": "16", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 data/a_{a:d}/b_{b:d}.txt () {}\n", - "1 data/a_{a:s}/b_{b:d}.txt () {'a': '*'}\n", - "2 data/a_{a:s}/b_{b:s}.txt () {'a': '*', 'b': '*'}\n", - "Pattern: \"data/a_*/b_*.txt\"\n", - "100 matches, e.g., \"data/a_0/b_10.txt\"\n" - ] - } - ], + "outputs": [], "source": [ "pf = ParaFrame.parse(\"data/a_{a:d}/b_{b:d}.txt\", debug=True)" ] @@ -1258,7 +212,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.7" + "version": "3.13.5" } }, "nbformat": 4, diff --git a/demos/data/.hallmark.yaml b/demos/data/.hallmark.yaml new file mode 100644 index 0000000..0e14832 --- /dev/null +++ b/demos/data/.hallmark.yaml @@ -0,0 +1,13 @@ +data: +- 
fmt: /{mag:d}a{aspin}_w{win:d}.h5 + # path_to_fmt: m5/data + encoding: + aspin: m([0-9]+(\.[0-9]+)?|\.[0-9]+) + +- fmt: /a_{a:d}/b_{b:d}.txt + # path_to_fmt: data + +- fmt: /a{aspin}/b_{b:d}.txt + # path_to_fmt: data + encoding: + aspin: '' diff --git a/encodings.yaml b/encodings.yaml deleted file mode 100644 index a72397b..0000000 --- a/encodings.yaml +++ /dev/null @@ -1,11 +0,0 @@ -data: -- fmt: data/{mag:d}a{aspin}_w{win:d}.h5 - path_to_fmt: m5/ - encoding: - aspin: m([0-9]+(\.[0-9]+)?|\.[0-9]+) - -- fmt: data/a_{a:d}/b_{b:d}.txt - -- fmt: data/a{aspin}/b_{b:d}.txt - encoding: - aspin: '' diff --git a/mod/hallmark/__init__.py b/mod/hallmark/__init__.py index d7d33d8..c311354 100644 --- a/mod/hallmark/__init__.py +++ b/mod/hallmark/__init__.py @@ -14,3 +14,4 @@ # limitations under the License. from .core import ParaFrame as ParaFrame +from .helper_functions import set_rel_yaml_path diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index b34ee02..1f67272 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -73,6 +73,7 @@ def filter(self, **kwargs): @classmethod def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=False, **kwargs): + pmax = len(fmt) // 3 # to specify a parameter, we need at least # three characters '{p}'; the maximum number # of possible parameters is `len(fmt) // 3`. 
@@ -90,7 +91,7 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=Fal if yaml_encodings is None: - raise ValueError(f"Error: The format '{fmt_enc}' is missing from encodings.yaml.") + raise ValueError(f"Error: The format '{fmt_enc}' is missing from .hallmark.yaml.") needs_encoding = False #enc_dict = yaml_encodings.get("encoding", {}) @@ -115,9 +116,10 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=Fal raise ValueError(f"Error: '{fmt_enc}' does not have a regex spec, so you must use encoding=False") # Construct the glob pattern for search files - pattern = fmt - fmt_g = fmt_enc - + base = str(get_rel_yaml_path().parent) + pattern = base + fmt + fmt_g = fmt_enc.lstrip("/") + for i in range(pmax): if debug: print(i, pattern, args, kwargs) @@ -193,16 +195,17 @@ def parse(cls, fmt, *args, debug=False, encoding=False, **kwargs): parser = parse.compile(fmt_g) frame = [] + for f in globbed_files: + f_short = str(Path(f).relative_to(Path(get_rel_yaml_path().parent))) if encoding: - f_short = str(Path(f).relative_to(Path(yaml_encodings['path_to_fmt']))) f_new = regex_sub(f_short, yaml_encodings) else: - f_new = f + f_new = f_short r = parser.parse(f_new) if r is None: print(f'Failed to parse "{f}"') else: - frame.append({'path': f, **r.named}) + frame.append({'path': f_short, **r.named}) return cls(frame) \ No newline at end of file diff --git a/mod/hallmark/helper_functions.py b/mod/hallmark/helper_functions.py index 7fe804e..1ef6a14 100644 --- a/mod/hallmark/helper_functions.py +++ b/mod/hallmark/helper_functions.py @@ -2,17 +2,34 @@ import yaml import re -ENCODINGS_YAML = Path(__file__).parents[2] / "encodings.yaml" +_user_yaml_path = None -def load_encodings_yaml(path=ENCODINGS_YAML): +def set_rel_yaml_path(path): + global _user_yaml_path + _user_yaml_path = Path(path).resolve() +def get_rel_yaml_path(): + if _user_yaml_path is not None: + return _user_yaml_path + return Path(__file__).parent / 
".hallmark.yaml" + +def load_encodings_yaml(): + path = get_rel_yaml_path() + yaml_path = Path(path).resolve() f = path.open("r", encoding="utf-8") yaml_file = yaml.safe_load(f) encodings = yaml_file["data"] - return encodings + # Resolve path_to_fmt relative to the yaml file's directory + for entry in encodings: + if "path_to_fmt" in entry: + entry["path_to_fmt"] = str( + (yaml_path.parent / entry["path_to_fmt"]).resolve() + ) -def find_spec_by_fmt(fmt, path=ENCODINGS_YAML): + return encodings +def find_spec_by_fmt(fmt): + path = get_rel_yaml_path() f = path.open("r", encoding="utf-8") yaml_file = yaml.safe_load(f) encodings = yaml_file["data"] From d1665829c34b86d417d42ed5debde9901ad1423d Mon Sep 17 00:00:00 2001 From: HaydenMarchinek Date: Thu, 26 Feb 2026 13:14:03 -0700 Subject: [PATCH 30/33] Updated conftest such that a new temporary yaml file gets created in the temporary data directory for each testing function. The temporary yaml file now only requires the most fundamental fmt strings as are currently displayed in the .hallmark.yaml file instead of the entire temporary path. All tests pass. 
--- mod/hallmark/core.py | 7 +++--- tests/conftest.py | 49 +++++++++++++++---------------------- tests/test_paraframe.py | 26 +++++++------------- tests/test_paraframe_e2e.py | 2 +- 4 files changed, 34 insertions(+), 50 deletions(-) diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index 1f67272..acd44fc 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -105,9 +105,9 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=Fal if enc_dict[key] != "": needs_encoding = True - # for key in enc_dict: - # if enc_dict[key] != "": - # needs_encoding = True + for key in enc_dict: + if enc_dict[key] != "": + needs_encoding = True if needs_encoding == True and encoding == False: raise ValueError(f"Error: '{fmt_enc}' has a regex spec, so you must use encoding=True") @@ -118,6 +118,7 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=Fal # Construct the glob pattern for search files base = str(get_rel_yaml_path().parent) pattern = base + fmt + print(pattern) fmt_g = fmt_enc.lstrip("/") for i in range(pmax): diff --git a/tests/conftest.py b/tests/conftest.py index 8c78f8c..df38fa1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,44 +2,36 @@ import shutil import yaml from pathlib import Path +import hallmark +from hallmark import ParaFrame -ENCODINGS_YAML = Path(__file__).parents[1] / "encodings.yaml" - -@pytest.fixture(scope="session", autouse=True) -def _backup_and_restore_encodings_yaml(): - original_text = ENCODINGS_YAML.read_text(encoding="utf-8") - ENCODINGS_YAML.write_text("data: []\n", encoding="utf-8") - - yield # all tests run - - ENCODINGS_YAML.write_text(original_text, encoding="utf-8") +ORIGINAL_YAML = Path("demos/data/.hallmark.yaml") +@pytest.fixture(scope="function") +def encodings_yaml(tmp_path): + tmp_yaml = tmp_path / ".hallmark.yaml" + shutil.copy2(ORIGINAL_YAML, tmp_yaml) + hallmark.set_rel_yaml_path(tmp_yaml) + return tmp_yaml @pytest.fixture(scope="function", autouse=True) 
-def _append_tmp_path_entries_to_encodings_yaml(tmp_path, request): - ENCODINGS_YAML.write_text("data: []\n", encoding="utf-8") - y = yaml.safe_load(ENCODINGS_YAML.read_text(encoding="utf-8")) or {} +def _append_tmp_path_entries_to_encodings_yaml(tmp_path, encodings_yaml): + encodings_yaml.write_text("data: []\n", encoding="utf-8") + y = yaml.safe_load(encodings_yaml.read_text(encoding="utf-8")) or {} y.setdefault("data", []) - fmts = [ - "data/a_{a:d}/b_{b:d}.txt", - "data/a{aspin}/b_{b:d}.txt", - "data/{mag:d}_mag{aspin}_w{win:d}.h5", + "/a_{a:d}/b_{b:d}.txt", + "/a{aspin}/b_{b:d}.txt", + "/{mag:d}_mag{aspin}_w{win:d}.h5", ] - for fmt in fmts: y["data"].append( { "fmt": fmt, - "path_to_fmt": str(tmp_path), - "encoding": { - "aspin": r"m([0-9]+(\.[0-9]+)?|\.[0-9]+)" - }, + "encoding": {"aspin": r"m([0-9]+(\.[0-9]+)?|\.[0-9]+)"}, } ) - - ENCODINGS_YAML.write_text(yaml.safe_dump(y, sort_keys=False), encoding="utf-8") - + encodings_yaml.write_text(yaml.safe_dump(y, sort_keys=False), encoding="utf-8") yield def spin_format(val): @@ -49,7 +41,7 @@ def spin_format(val): @pytest.fixture(scope = "function") def create_temp_data(tmp_path): - data_dir = tmp_path / "data" + data_dir = tmp_path print(data_dir) for a in range(10): subdir = data_dir / f"a_{a}" @@ -60,7 +52,7 @@ def create_temp_data(tmp_path): @pytest.fixture(scope = "function") def create_temp_data_spin(tmp_path): - data_dir = tmp_path / "data" + data_dir = tmp_path spins = [-0.5, 0.0, 0.5] for a in spins: subdir = data_dir / f"a{spin_format(a)}" @@ -71,8 +63,7 @@ def create_temp_data_spin(tmp_path): @pytest.fixture(scope = "function") def create_temp_data_spin_with_m(tmp_path): - data_dir = tmp_path / "data" - data_dir.mkdir(parents=True) + data_dir = tmp_path spins = ["m0.5", "0", "0.5"] for mag in range(0, 2): diff --git a/tests/test_paraframe.py b/tests/test_paraframe.py index 6e9b1e9..3933791 100644 --- a/tests/test_paraframe.py +++ b/tests/test_paraframe.py @@ -5,17 +5,17 @@ @pytest.fixture def 
create_ParaFrame(create_temp_data): - fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") + fmt = str("/a_{a:d}/b_{b:d}.txt") return ParaFrame.parse(fmt, encoding=True) @pytest.fixture def create_ParaFrame_spin(create_temp_data_spin): - fmt = str(create_temp_data_spin / "a{aspin}/b_{b:d}.txt") + fmt = str("/a{aspin}/b_{b:d}.txt") return ParaFrame.parse(fmt, encoding=True) @pytest.fixture def create_ParaFrame_spin_with_m(create_temp_data_spin_with_m): - fmt = str(create_temp_data_spin_with_m / '{mag:d}_mag{aspin}_w{win:d}.h5') + fmt = str('/{mag:d}_mag{aspin}_w{win:d}.h5') return ParaFrame.parse(fmt,encoding=True) def test_type_of_ParaFrame(create_ParaFrame): @@ -47,24 +47,24 @@ def test_pandas_method_on_pf(create_ParaFrame): assert isinstance(pf.head(), pd.DataFrame) def test_glob_string_format(create_temp_data): - fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") + fmt = str("/a_{a:d}/b_{b:d}.txt") pattern = ParaFrame.glob_search(fmt, a=0, return_pattern=True, encoding=True)[1] norm = pattern.replace("\\", "/") # standardize output for Mac and PC assert norm.endswith("/a_0/b_*.txt") def test_glob_method_returns_files(create_temp_data): - fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") + fmt = str("/a_{a:d}/b_{b:d}.txt") files = ParaFrame.glob_search(fmt, a=0, return_pattern=True, encoding=True)[0] assert len(files) == 10 def test_parse_method_with_added_filter_arg(create_temp_data): - fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") + fmt = str("/a_{a:d}/b_{b:d}.txt") pf = ParaFrame.parse(fmt, a=0, encoding=True) assert pf.shape == (10, 3) assert pf["a"].unique() == 0 def test_glob_method_accepts_spin_formatter_type_and_builds_glob_method(create_temp_data_spin): - fmt = str(create_temp_data_spin / "a{aspin}/b_{b:d}.txt") + fmt = str("/a{aspin}/b_{b:d}.txt") files, pattern = ParaFrame.glob_search(fmt, encoding = True, aspin="+0.5", return_pattern=True) norm = pattern.replace("\\", "/") # standardize output for Mac and PC OS assert 
norm.endswith("/a+0.5/b_*.txt") @@ -81,23 +81,15 @@ def test_filtering_by_numeric_spin(create_ParaFrame_spin): assert len(pf_filtered) == 10 assert set(pf_filtered["aspin"].unique()) == {0.5} -# def test_loading_yaml_file_for_test_spin_formatting_contents(create_temp_data_spin_with_m): -# fmt = str(create_temp_data_spin_with_m / "{mag:d}_mag{aspin}_w{win:d}.h5") -# params = load_encodings_yaml(fmt,path = Path("/tmp/encoding_tmp.yaml")) # default fmt should still be the first one - -# assert "encoding" in params -# assert "aspin" in params["encoding"] - def test_m_type_for_spin_data_with_yaml_regex(create_temp_data_spin_with_m): - fmt = str(create_temp_data_spin_with_m / "{mag:d}_mag{aspin}_w{win:d}.h5") + fmt = str("/{mag:d}_mag{aspin}_w{win:d}.h5") pf = ParaFrame.parse(fmt, encoding= True, debug = True) - pf_filtered = pf(aspin=-0.5) assert len(pf_filtered) == 20 assert set(pf_filtered["aspin"].unique()) == {-0.5} def test_m_type_for_spin_data_with_multiple_filters(create_temp_data_spin_with_m): - fmt = str(create_temp_data_spin_with_m / "{mag:d}_mag{aspin}_w{win:d}.h5") + fmt = str("/{mag:d}_mag{aspin}_w{win:d}.h5") pf = ParaFrame.parse(fmt,encoding=True, debug = True) pf_filtered = pf(aspin=[-0.5,0.0]) assert len(pf_filtered) == 40 \ No newline at end of file diff --git a/tests/test_paraframe_e2e.py b/tests/test_paraframe_e2e.py index 4eb05c0..b5a9c68 100644 --- a/tests/test_paraframe_e2e.py +++ b/tests/test_paraframe_e2e.py @@ -3,7 +3,7 @@ def test_paraframe_class_functionality(create_temp_data): # a user wants to create a paraframe - fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") + fmt = str("/a_{a:d}/b_{b:d}.txt") pf = ParaFrame.parse(fmt, encoding=True) # users wants to filter files to see those with a = 0 From 925ccf42cdf7569e25957b3259edcac19cfc1a78 Mon Sep 17 00:00:00 2001 From: Nayera Abdessalam Date: Sun, 1 Mar 2026 23:32:56 -0700 Subject: [PATCH 31/33] fixing linter errors for PR --- mod/hallmark/core.py | 35 +++++++++++++++++++++++------------ 
tests/conftest.py | 1 - tests/test_paraframe.py | 7 +++++-- tests/test_paraframe_e2e.py | 32 +++++++++++++++++--------------- 4 files changed, 45 insertions(+), 30 deletions(-) diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index acd44fc..7129994 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -72,7 +72,8 @@ def filter(self, **kwargs): return self[mask] @classmethod - def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=False, **kwargs): + def glob_search(cls, fmt, *args, debug=False, return_pattern=False, + encoding=False, **kwargs): pmax = len(fmt) // 3 # to specify a parameter, we need at least # three characters '{p}'; the maximum number @@ -91,10 +92,11 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=Fal if yaml_encodings is None: - raise ValueError(f"Error: The format '{fmt_enc}' is missing from .hallmark.yaml.") + raise ValueError( + f"Error: The format '{fmt_enc}' is missing from .hallmark.yaml." + ) needs_encoding = False - #enc_dict = yaml_encodings.get("encoding", {}) for i in range(len(encodings)): if 'encoding' not in encodings[i].keys(): @@ -109,12 +111,18 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=Fal if enc_dict[key] != "": needs_encoding = True - if needs_encoding == True and encoding == False: - raise ValueError(f"Error: '{fmt_enc}' has a regex spec, so you must use encoding=True") - - if needs_encoding == False and encoding == True: - raise ValueError(f"Error: '{fmt_enc}' does not have a regex spec, so you must use encoding=False") - + if needs_encoding and not encoding: + raise ValueError( + f'''Error: '{fmt_enc}' has a regex spec, + so you must use encoding=True''' + ) + + if not needs_encoding and encoding: + raise ValueError( + f'''Error: '{fmt_enc}' does not have a + regex spec, so you must use encoding=False''' + ) + # Construct the glob pattern for search files base = str(get_rel_yaml_path().parent) pattern = base + fmt @@ 
-137,7 +145,7 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=Fal globbed_files = sorted(glob(pattern)) # Print the glob pattern and a summary of matches - if debug == True: + if debug: print(f'Pattern: "{pattern}"') n = len(globbed_files) if n > 1: @@ -145,7 +153,7 @@ def glob_search(cls, fmt, *args, debug=False, return_pattern=False, encoding=Fal elif n > 0: print(f'{n} match, i.e., "{globbed_files[0]}"') else: - print(f"No match; please check format string") + print("No match; please check format string") if return_pattern: return (globbed_files, pattern) @@ -192,7 +200,10 @@ def parse(cls, fmt, *args, debug=False, encoding=False, **kwargs): 1 data/run2_p20.csv 2 20 """ # Parse list of file names back to parameters - yaml_encodings, fmt_g, globbed_files = cls.glob_search(fmt, *args, debug=debug, encoding=encoding, **kwargs) + yaml_encodings, fmt_g, globbed_files = cls.glob_search(fmt, *args, + debug=debug, + encoding=encoding, + **kwargs) parser = parse.compile(fmt_g) frame = [] diff --git a/tests/conftest.py b/tests/conftest.py index df38fa1..ce53249 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,7 +3,6 @@ import yaml from pathlib import Path import hallmark -from hallmark import ParaFrame ORIGINAL_YAML = Path("demos/data/.hallmark.yaml") diff --git a/tests/test_paraframe.py b/tests/test_paraframe.py index 3933791..86d8140 100644 --- a/tests/test_paraframe.py +++ b/tests/test_paraframe.py @@ -63,9 +63,12 @@ def test_parse_method_with_added_filter_arg(create_temp_data): assert pf.shape == (10, 3) assert pf["a"].unique() == 0 -def test_glob_method_accepts_spin_formatter_type_and_builds_glob_method(create_temp_data_spin): +def test_glob_accepts_spin_formatter_type_and_builds_glob_method(create_temp_data_spin): fmt = str("/a{aspin}/b_{b:d}.txt") - files, pattern = ParaFrame.glob_search(fmt, encoding = True, aspin="+0.5", return_pattern=True) + files, pattern = ParaFrame.glob_search(fmt, + encoding = True, + 
aspin="+0.5", + return_pattern=True) norm = pattern.replace("\\", "/") # standardize output for Mac and PC OS assert norm.endswith("/a+0.5/b_*.txt") assert len(files) == 10 diff --git a/tests/test_paraframe_e2e.py b/tests/test_paraframe_e2e.py index b5a9c68..e760f8f 100644 --- a/tests/test_paraframe_e2e.py +++ b/tests/test_paraframe_e2e.py @@ -43,18 +43,20 @@ def test_paraframe_class_functionality(create_temp_data): assert len(mask_filter) == 40 assert all(mask_filter["a"].unique() == [1,2,3,4]) -# @pytest.mark.xfail(strict=True, reason="Debug output formatting has been changed, test needs updated") -# def test_debug(create_temp_data, capsys, tmp_path): -# # users want to see a detailed summary of how ParaFrame utilizes globbing -# fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") -# ParaFrame.parse(fmt, debug = True) -# captured = capsys.readouterr() -# print(captured.out) -# expected = ( -# '0 ' + str(tmp_path) + '/data/a_{a:d}/b_{b:d}.txt () {}\n' + -# "1 " + str(tmp_path) + "/data/a_{a:s}/b_{b:d}.txt () {'a': '*'}\n" + -# "2 " + str(tmp_path) + "/data/a_{a:s}/b_{b:s}.txt () {'a': '*', 'b': '*'}\n" + -# 'Pattern: "' + str(tmp_path) + '/data/a_*/b_*.txt"\n' + -# '100 matches, e.g., "' + str(tmp_path) + '/data/a_0/b_10.txt"\n' -# ) -# assert captured.out == expected \ No newline at end of file +@pytest.mark.xfail(strict=True, + reason="Debug output formatting has been changed, test needs updated" + ) +def test_debug(create_temp_data, capsys, tmp_path): + # users want to see a detailed summary of how ParaFrame utilizes globbing + fmt = str(create_temp_data / "a_{a:d}/b_{b:d}.txt") + ParaFrame.parse(fmt, debug = True) + captured = capsys.readouterr() + print(captured.out) + expected = ( + '0 ' + str(tmp_path) + '/data/a_{a:d}/b_{b:d}.txt () {}\n' + + "1 " + str(tmp_path) + "/data/a_{a:s}/b_{b:d}.txt () {'a': '*'}\n" + + "2 " + str(tmp_path) + "/data/a_{a:s}/b_{b:s}.txt () {'a': '*', 'b': '*'}\n" + + 'Pattern: "' + str(tmp_path) + '/data/a_*/b_*.txt"\n' + + '100 
matches, e.g., "' + str(tmp_path) + '/data/a_0/b_10.txt"\n' + ) + assert captured.out == expected \ No newline at end of file From 8bdf0fa28c0f3a83507cade9a6167bbc4361f721 Mon Sep 17 00:00:00 2001 From: Nayera Abdessalam Date: Sun, 1 Mar 2026 23:36:02 -0700 Subject: [PATCH 32/33] fixing linter errors for PR 2 --- mod/hallmark/__init__.py | 2 +- mod/hallmark/core.py | 2 +- tests/test_paraframe.py | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/mod/hallmark/__init__.py b/mod/hallmark/__init__.py index c311354..204e473 100644 --- a/mod/hallmark/__init__.py +++ b/mod/hallmark/__init__.py @@ -14,4 +14,4 @@ # limitations under the License. from .core import ParaFrame as ParaFrame -from .helper_functions import set_rel_yaml_path +from .helper_functions import set_rel_yaml_path as set_rel_yaml_path diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index 7129994..b7906c2 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -21,7 +21,7 @@ import numpy as np from pathlib import Path -from .helper_functions import * +from .helper_functions import get_rel_yaml_path, load_encodings_yaml, find_spec_by_fmt, regex_sub class ParaFrame(pd.DataFrame): """ diff --git a/tests/test_paraframe.py b/tests/test_paraframe.py index 86d8140..941f517 100644 --- a/tests/test_paraframe.py +++ b/tests/test_paraframe.py @@ -1,7 +1,6 @@ import pandas as pd import pytest from hallmark import ParaFrame -from hallmark.helper_functions import * @pytest.fixture def create_ParaFrame(create_temp_data): From 6d7bd57fd76112abdcb8882a3430f122b689cf56 Mon Sep 17 00:00:00 2001 From: Nayera Abdessalam Date: Sun, 1 Mar 2026 23:39:10 -0700 Subject: [PATCH 33/33] fixing linter errors for PR 3 --- mod/hallmark/core.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mod/hallmark/core.py b/mod/hallmark/core.py index b7906c2..293f8ce 100644 --- a/mod/hallmark/core.py +++ b/mod/hallmark/core.py @@ -21,7 +21,10 @@ import numpy as np from pathlib import Path -from 
.helper_functions import get_rel_yaml_path, load_encodings_yaml, find_spec_by_fmt, regex_sub +from .helper_functions import (get_rel_yaml_path, + load_encodings_yaml, + find_spec_by_fmt, + regex_sub) class ParaFrame(pd.DataFrame): """