diff --git a/pyproject.toml b/pyproject.toml index bdfecd610..e1e364030 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -509,9 +509,8 @@ path = [ # Pd Module - Pandas utilities # Use: pip install scitex[pd] -pd = [ - "xarray", -] +# Real implementation lives in the standalone scitex-pd package. +pd = ["scitex-pd>=0.1.0"] # PLT Module - Plotting utilities # Use: pip install scitex[plt] diff --git a/src/scitex/pd/__init__.py b/src/scitex/pd/__init__.py index a25836643..5fca929df 100755 --- a/src/scitex/pd/__init__.py +++ b/src/scitex/pd/__init__.py @@ -1,46 +1,20 @@ -#!/usr/bin/env python3 -"""Scitex pd module.""" +"""SciTeX pd — thin compatibility shim for scitex-pd. -from ._find_indi import find_indi -from ._find_pval import _find_pval_col, find_pval -from ._force_df import force_df -from ._from_xyz import from_xyz -from ._get_unique import get_unique -from ._ignore_SettingWithCopyWarning import ( - ignore_setting_with_copy_warning, - ignore_SettingWithCopyWarning, -) -from ._melt_cols import melt_cols -from ._merge_columns import merge_cols, merge_columns -from ._mv import mv, mv_to_first, mv_to_last -from ._replace import replace -from ._round import round -from ._slice import slice -from ._sort import sort -from ._to_numeric import to_numeric -from ._to_xy import to_xy -from ._to_xyz import to_xyz +Aliases ``scitex.pd`` to the standalone ``scitex_pd`` package via ``sys.modules``. +``scitex.pd is scitex_pd``. -__all__ = [ - "find_indi", - "find_pval", - "_find_pval_col", - "force_df", - "from_xyz", - "get_unique", - "ignore_SettingWithCopyWarning", - "ignore_setting_with_copy_warning", - "melt_cols", - "merge_cols", - "merge_columns", - "mv", - "mv_to_first", - "mv_to_last", - "replace", - "round", - "slice", - "sort", - "to_numeric", - "to_xy", - "to_xyz", -] +Install: ``pip install scitex[pd]`` (or ``pip install scitex-pd``). +See: https://github.com/ywatanabe1989/scitex-pd +""" + +import sys as _sys + +try: + import scitex_pd as _real +except ImportError as _e: # pragma: no cover + raise ImportError( + "scitex.pd requires the 'scitex-pd' package. " + "Install with: pip install scitex[pd] (or: pip install scitex-pd)" + ) from _e + +_sys.modules[__name__] = _real diff --git a/src/scitex/pd/_find_indi.py b/src/scitex/pd/_find_indi.py deleted file mode 100755 index a814f806f..000000000 --- a/src/scitex/pd/_find_indi.py +++ /dev/null @@ -1,125 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-11-05 08:11:05 (ywatanabe)" -# File: ./scitex_repo/src/scitex/pd/_find_indi.py - -from typing import Dict, List, Union - -import pandas as pd - -# def find_indi(df: pd.DataFrame, conditions: Dict[str, Union[str, int, float, List]]) -> pd.Series: -# """Finds indices of rows that satisfy all given conditions in a DataFrame. - -# Example -# ------- -# >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': ['x', 'y', 'x']}) -# >>> conditions = {'A': [1, 2], 'B': 'x'} -# >>> result = find_indi(df, conditions) -# >>> print(result) -# 0 True -# 1 False -# 2 False -# dtype: bool - -# Parameters -# ---------- -# df : pd.DataFrame -# Input DataFrame to search in -# conditions : Dict[str, Union[str, int, float, List]] -# Dictionary of column names and their target values - -# Returns -# ------- -# pd.Series -# Boolean Series indicating which rows satisfy all conditions - -# Raises -# ------ -# KeyError -# If any column in conditions is not found in DataFrame -# """ -# if not all(col in df.columns for col in conditions): -# missing_cols = [col for col in conditions if col not in df.columns] -# raise KeyError(f"Columns not found in DataFrame: {missing_cols}") - -# condition_series = [] -# for key, value in conditions.items(): -# if isinstance(value, (list, tuple)): -# condition_series.append(df[key].isin(value)) -# else: -# condition_series.append(df[key] == value) - -# return pd.concat(condition_series, axis=1).all(axis=1) - - -def find_indi( - df: pd.DataFrame, conditions: Dict[str, Union[str, int, float, List]] -) -> List[int]: - """Finds indices of rows that satisfy conditions, handling NaN values. - - Example - ------- - >>> df = pd.DataFrame({'A': [1, 2, None], 'B': ['x', 'y', 'x']}) - >>> conditions = {'A': [1, None], 'B': 'x'} - >>> result = find_indi(df, conditions) - - Parameters - ---------- - df : pd.DataFrame - Input DataFrame - conditions : Dict[str, Union[str, int, float, List]] - Column conditions - - Returns - ------- - List[int] - List of integer indices of matching rows - """ - if not conditions: - return [] - - if not all(col in df.columns for col in conditions): - missing_cols = [col for col in conditions if col not in df.columns] - raise KeyError(f"Columns not found in DataFrame: {missing_cols}") - - condition_series = [] - for key, value in conditions.items(): - if isinstance(value, (list, tuple)): - # Handle NaN in lists - has_na = False - try: - # Check for None - if None in value: - has_na = True - # Check for pd.NA (may raise TypeError) - elif any(v is pd.NA for v in value): - has_na = True - # Check for np.nan - elif any(pd.isna(v) for v in value): - has_na = True - except (TypeError, ValueError): - # If any check fails, try alternative approach - has_na = any( - pd.isna(v) if not isinstance(v, str) else False for v in value - ) - - if has_na: - condition = df[key].isin(value) | df[key].isna() - else: - condition = df[key].isin(value) - else: - # Handle single NaN value - if pd.isna(value): - condition = df[key].isna() - else: - condition = df[key] == value - condition_series.append(condition) - - if condition_series: - mask = pd.concat(condition_series, axis=1).all(axis=1) - return df.index[mask].tolist() - else: - return [] - - -# EOF diff --git a/src/scitex/pd/_find_pval.py b/src/scitex/pd/_find_pval.py deleted file mode 100755 index 4c2856c74..000000000 --- a/src/scitex/pd/_find_pval.py +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-11-03 03:25:00 (ywatanabe)" -# File: ./scitex_repo/src/scitex/pd/_find_pval.py - -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-10-06 11:09:07 (ywatanabe)" -# /home/ywatanabe/proj/_scitex_repo_openhands/src/scitex/stats/_find_pval_col.py - -""" -Functionality: - - Identifies column name(s) in a DataFrame or keys in other data structures that correspond to p-values -Input: - - pandas DataFrame, numpy array, list, or dict -Output: - - String or list of strings representing the identified p-value column name(s) or key(s), or None if not found -Prerequisites: - - pandas, numpy libraries -""" - -import re -from typing import Dict, List, Optional, Union - -import numpy as np -import pandas as pd - - -def find_pval( - data: Union[pd.DataFrame, np.ndarray, List, Dict], multiple: bool = True -) -> Union[Optional[str], List[str]]: - """ - Find p-value column name(s) or key(s) in various data structures. - - Example: - -------- - >>> df = pd.DataFrame({'p_value': [0.05, 0.01], 'pval': [0.1, 0.001], 'other': [1, 2]}) - >>> find_pval(df) - ['p_value', 'pval'] - >>> find_pval(df, multiple=False) - 'p_value' - - Parameters: - ----------- - data : Union[pd.DataFrame, np.ndarray, List, Dict] - Data structure to search for p-value column or key - multiple : bool, optional - If True, return all matches; if False, return only the first match (default is True) - - Returns: - -------- - Union[Optional[str], List[str]] - Name(s) of the column(s) or key(s) that match p-value patterns, or None if not found - """ - if isinstance(data, pd.DataFrame): - return _find_pval_col(data, multiple) - elif isinstance(data, (np.ndarray, list, dict)): - return _find_pval(data, multiple) - else: - raise ValueError("Input must be a pandas DataFrame, numpy array, list, or dict") - - -def _find_pval( - data: Union[np.ndarray, List, Dict], multiple: bool -) -> Union[Optional[str], List[str]]: - pattern = re.compile(r"p[-_]?val(ue)?(?!.*stars)", re.IGNORECASE) - matches = [] - - if isinstance(data, dict): - matches = [key for key in data.keys() if pattern.search(str(key))] - elif ( - isinstance(data, (np.ndarray, list)) - and len(data) > 0 - and isinstance(data[0], dict) - ): - matches = [key for key in data[0].keys() if pattern.search(str(key))] - - return matches if multiple else (matches[0] if matches else None) - - -def _find_pval_col( - df: pd.DataFrame, multiple: bool = False -) -> Union[Optional[str], List[str]]: - """ - Find p-value column name(s) in a DataFrame. - - Example: - -------- - >>> df = pd.DataFrame({'p_value': [0.05, 0.01], 'pval': [0.1, 0.001], 'other': [1, 2]}) - >>> find_pval_col(df) - ['p_value', 'pval'] - >>> find_pval_col(df, multiple=False) - 'p_value' - - Parameters: - ----------- - df : pd.DataFrame - DataFrame to search for p-value column(s) - multiple : bool, optional - If True, return all matches; if False, return only the first match (default is False) - - Returns: - -------- - Union[Optional[str], List[str]] - Name(s) of the column(s) that match p-value patterns, or None if not found - """ - pattern = re.compile(r"p[-_]?val(ue)?(?!.*stars)", re.IGNORECASE) - matches = [col for col in df.columns if pattern.search(str(col))] - - return matches if multiple else (matches[0] if matches else None) - - -# EOF diff --git a/src/scitex/pd/_force_df.py b/src/scitex/pd/_force_df.py deleted file mode 100755 index 66605a96d..000000000 --- a/src/scitex/pd/_force_df.py +++ /dev/null @@ -1,154 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Timestamp: "2025-04-27 19:59:11 (ywatanabe)" -# File: /ssh:sp:/home/ywatanabe/proj/scitex_repo/src/scitex/pd/_force_df.py -# ---------------------------------------- -import os - -__FILE__ = "./src/scitex/pd/_force_df.py" -__DIR__ = os.path.dirname(__FILE__) -# ---------------------------------------- - -import numpy as np -import pandas as pd - -from scitex.types import is_listed_X - - -def force_df(data, filler=np.nan): - """ - Convert various data types to pandas DataFrame. - - Parameters - ---------- - data : various - The data to convert to DataFrame. Can be DataFrame, Series, ndarray, - list, tuple, dict, scalar value, etc. - filler : any, optional - Value to use for filling missing values, by default np.nan - - Returns - ------- - pd.DataFrame - Data converted to DataFrame - - Examples - -------- - >>> import scitex - >>> import pandas as pd - >>> import numpy as np - - # DataFrame input returns the same DataFrame - >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) - >>> scitex.pd.force_df(df) is df - True - - # Series input is converted to DataFrame - >>> series = pd.Series([1, 2, 3], name='test') - >>> scitex.pd.force_df(series) - test - 0 1 - 1 2 - 2 3 - - # NumPy array input is converted to DataFrame - >>> arr = np.array([1, 2, 3]) - >>> scitex.pd.force_df(arr) - value - 0 1 - 1 2 - 2 3 - - # Scalar values are converted to single-value DataFrames - >>> scitex.pd.force_df(42) - value - 0 42 - - # Lists and tuples are converted to DataFrame - >>> scitex.pd.force_df([1, 2, 3]) - value - 0 1 - 1 2 - 2 3 - - # Dictionaries are converted to DataFrame with appropriate handling - # of different length values - >>> data = {'A': [1, 2, 3], 'B': [4, 5]} - >>> scitex.pd.force_df(data) - A B - 0 1 4 - 1 2 5 - 2 3 NaN - """ - # Return None as empty DataFrame - if data is None: - return pd.DataFrame() - - # Return DataFrame as is - if isinstance(data, pd.DataFrame): - return data - - # Convert Series to DataFrame - if isinstance(data, pd.Series): - return data.to_frame() - - # Convert numpy array to DataFrame - if isinstance(data, np.ndarray): - # Handle 1D array - if data.ndim == 1: - return pd.DataFrame(data, columns=["value"]) - # Handle 2D array - elif data.ndim == 2: - return pd.DataFrame(data) - # Handle higher dimensional arrays - else: - shape = data.shape - reshaped = data.reshape(shape[0], -1) - return pd.DataFrame(reshaped) - - # Handle scalar values (int, float, str, etc.) - if isinstance(data, (int, float, str, bool)): - return pd.DataFrame([data], columns=["value"]) - - # Handle lists and tuples - if isinstance(data, (list, tuple)): - # Handle list of lists/arrays -> DataFrame - if len(data) > 0 and isinstance(data[0], (list, tuple, np.ndarray)): - return pd.DataFrame(data) - # Handle simple list/tuple -> single column DataFrame - else: - return pd.DataFrame(data, columns=["value"]) - - # Continue with the original implementation for dictionaries - if isinstance(data, dict): - # Original implementation - permutable_dict = data.copy() - - # Get the lengths - max_len = 0 - for k, v in permutable_dict.items(): - # Check if v is an iterable (but not string) or treat as single length otherwise - if isinstance(v, (str, int, float)) or not hasattr(v, "__len__"): - length = 1 - else: - length = len(v) - max_len = max(max_len, length) - - # Replace with appropriately filled list - for k, v in permutable_dict.items(): - if isinstance(v, (str, int, float)) or not hasattr(v, "__len__"): - permutable_dict[k] = [v] + [filler] * (max_len - 1) - else: - permutable_dict[k] = list(v) + [filler] * (max_len - len(v)) - - # Puts them into a DataFrame - return pd.DataFrame(permutable_dict) - - # For any other iterable type - try: - return pd.DataFrame(list(data), columns=["value"]) - except: - raise TypeError(f"Cannot convert object of type {type(data)} to DataFrame") - - -# EOF diff --git a/src/scitex/pd/_from_xyz.py b/src/scitex/pd/_from_xyz.py deleted file mode 100755 index c0fb5efff..000000000 --- a/src/scitex/pd/_from_xyz.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-09-26 07:22:18 (ywatanabe)" -# /home/ywatanabe/proj/_scitex_repo_openhands/src/scitex/pd/_from_xyz.py - -import numpy as np -import pandas as pd - - -def from_xyz(data_frame, x=None, y=None, z=None, square=False): - """ - Convert a DataFrame with 'x', 'y', 'z' format into a heatmap DataFrame. - - Example - ------- - import pandas as pd - data = pd.DataFrame({ - 'col1': ['A', 'B', 'C', 'A'], - 'col2': ['X', 'Y', 'Z', 'Y'], - 'p_val': [0.01, 0.05, 0.001, 0.1] - }) - data = data.rename(columns={"col1": "x", "col2": "y", "p_val": "z"}) - result = from_xyz(data) - print(result) - - Parameters - ---------- - data_frame : pandas.DataFrame - Input DataFrame with columns for x, y, and z values. - x : str, optional - Name of the column to use as x-axis. Defaults to 'x'. - y : str, optional - Name of the column to use as y-axis. Defaults to 'y'. - z : str, optional - Name of the column to use as z-values. Defaults to 'z'. - square : bool, optional - If True, force the output to be a square matrix. Defaults to False. - - Returns - ------- - pandas.DataFrame - A DataFrame in heatmap/pivot format. - """ - x = x or "x" - y = y or "y" - z = z or "z" - - heatmap = pd.pivot_table(data_frame, values=z, index=y, columns=x, aggfunc="first") - - if square: - # Make it square by including all unique labels - all_labels = sorted(set(heatmap.index) | set(heatmap.columns)) - heatmap = heatmap.reindex(index=all_labels, columns=all_labels) - - heatmap = heatmap.fillna(0) - - return heatmap - - -if __name__ == "__main__": - np.random.seed(42) - stats = pd.DataFrame( - { - "col1": np.random.choice(["A", "B", "C"], 100), - "col2": np.random.choice(["X", "Y", "Z"], 100), - "p_val": np.random.rand(100), - } - ) - stats = stats.rename(columns={"col1": "x", "col2": "y", "p_val": "z"}) - result = from_xyz(stats) - print(result) diff --git a/src/scitex/pd/_get_unique.py b/src/scitex/pd/_get_unique.py deleted file mode 100755 index 89853e661..000000000 --- a/src/scitex/pd/_get_unique.py +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Timestamp: "2025-09-18 18:42:11 (ywatanabe)" -# File: /ssh:sp:/home/ywatanabe/proj/scitex_repo/src/scitex/pd/_get_unique.py -# ---------------------------------------- -from __future__ import annotations - -import os - -__FILE__ = __file__ -__DIR__ = os.path.dirname(__FILE__) -# ---------------------------------------- - -""" -Extract unique values from DataFrame columns. -""" - -from typing import Any, Optional - -import pandas as pd - - -def get_unique( - df: pd.DataFrame, - column: str, - default: Optional[Any] = None, - raise_on_multiple: bool = False, -) -> Any: - """Get value from column if it contains a unique value. - - Args: - df: DataFrame to extract from - column: Column name to check - default: Default value if column doesn't exist or has multiple unique values - raise_on_multiple: If True, raise ValueError when multiple unique values exist - - Returns: - The unique value if exactly one exists, otherwise default value - - Examples: - >>> import pandas as pd - >>> df = pd.DataFrame({'patient_id': ['P01', 'P01', 'P01']}) - >>> get_unique(df, 'patient_id') - 'P01' - - >>> df = pd.DataFrame({'patient_id': ['P01', 'P02']}) - >>> get_unique(df, 'patient_id', default='Unknown') - 'Unknown' - - >>> # Raise error on multiple values - >>> get_unique(df, 'patient_id', raise_on_multiple=True) - ValueError: Column 'patient_id' has 2 unique values: ['P01', 'P02'] - """ - if column not in df.columns: - if raise_on_multiple: - raise KeyError(f"Column '{column}' not found in DataFrame") - return default - - unique_values = df[column].unique() - - if len(unique_values) == 1: - return unique_values[0] - - if len(unique_values) > 1 and raise_on_multiple: - raise ValueError( - f"Column '{column}' has {len(unique_values)} unique values: " - f"{list(unique_values[:5])}" - ) - - return default - - -if __name__ == "__main__": - # Test the function - import pandas as pd - - # Test case 1: Unique value - df1 = pd.DataFrame({"patient_id": ["P01", "P01", "P01"]}) - assert get_unique(df1, "patient_id") == "P01" - print("✓ Test 1 passed: Unique value extracted") - - # Test case 2: Multiple values with default - df2 = pd.DataFrame({"patient_id": ["P01", "P02"]}) - assert get_unique(df2, "patient_id", default="Unknown") == "Unknown" - print("✓ Test 2 passed: Default returned for multiple values") - - # Test case 3: Missing column - assert get_unique(df1, "missing_col", default="N/A") == "N/A" - print("✓ Test 3 passed: Default returned for missing column") - - # Test case 4: Raise on multiple - try: - get_unique(df2, "patient_id", raise_on_multiple=True) - assert False, "Should have raised ValueError" - except ValueError as e: - assert "has 2 unique values" in str(e) - print("✓ Test 4 passed: ValueError raised for multiple values") - - print("\nAll tests passed!") - -# EOF diff --git a/src/scitex/pd/_ignore_SettingWithCopyWarning.py b/src/scitex/pd/_ignore_SettingWithCopyWarning.py deleted file mode 100755 index 56b59e804..000000000 --- a/src/scitex/pd/_ignore_SettingWithCopyWarning.py +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-11-05 07:35:30 (ywatanabe)" -# File: ./scitex_repo/src/scitex/pd/_ignore_.py - -import warnings -from contextlib import contextmanager - - -@contextmanager -def ignore_setting_with_copy_warning(): - """ - Context manager to temporarily ignore pandas SettingWithCopyWarning. - - Example - ------- - >>> with ignore_SettingWithCopyWarning(): - ... df['column'] = new_values # No warning will be shown - """ - try: - from pandas.errors import SettingWithCopyWarning - except ImportError: - from pandas.core.common import SettingWithCopyWarning - - # Save current warning filters - with warnings.catch_warnings(): - warnings.simplefilter(action="ignore", category=SettingWithCopyWarning) - yield - - -# Backward compatibility -ignore_SettingWithCopyWarning = ignore_setting_with_copy_warning # Deprecated - -# EOF diff --git a/src/scitex/pd/_melt_cols.py b/src/scitex/pd/_melt_cols.py deleted file mode 100755 index e2249aa68..000000000 --- a/src/scitex/pd/_melt_cols.py +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-10-05 23:04:16 (ywatanabe)" -# /home/ywatanabe/proj/_scitex_repo_openhands/src/scitex/pd/_melt_cols.py - - -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-10-05 23:03:39 (ywatanabe)" -# /home/ywatanabe/proj/_scitex_repo_openhands/src/scitex/pd/_melt_cols.py - -from typing import List, Optional - -import pandas as pd - - -def melt_cols( - df: pd.DataFrame, cols: List[str], id_columns: Optional[List[str]] = None -) -> pd.DataFrame: - """ - Melt specified columns while preserving links to other data in a DataFrame. - - Example - ------- - >>> data = pd.DataFrame({ - ... 'id': [1, 2, 3], - ... 'name': ['Alice', 'Bob', 'Charlie'], - ... 'score_1': [85, 90, 78], - ... 'score_2': [92, 88, 95] - ... }) - >>> melted = melt_cols(data, cols=['score_1', 'score_2']) - >>> print(melted) - id name variable value - 0 1 Alice score_1 85 - 1 2 Bob score_1 90 - 2 3 Charlie score_1 78 - 3 1 Alice score_2 92 - 4 2 Bob score_2 88 - 5 3 Charlie score_2 95 - - Parameters - ---------- - df : pd.DataFrame - Input DataFrame - cols : List[str] - Columns to be melted - id_columns : Optional[List[str]], default None - Columns to preserve as identifiers. If None, all columns not in 'cols' are used. - - Returns - ------- - pd.DataFrame - Melted DataFrame with preserved identifier columns - - Raises - ------ - ValueError - If cols are not present in the DataFrame - """ - missing_melt = set(cols) - set(df.columns) - if missing_melt: - raise ValueError(f"Columns not found in DataFrame: {missing_melt}") - - if id_columns is None: - id_columns = [col for col in df.columns if col not in cols] - - df_copy = df.reset_index(drop=True) - df_copy["global_index"] = df_copy.index - - # Use a different value_name if "value" is one of the columns being melted - value_name = "value" if "value" not in cols else "melted_value" - melted_df = df_copy[cols + ["global_index"]].melt( - id_vars=["global_index"], value_name=value_name - ) - if id_columns: - formatted_df = melted_df.merge( - df_copy[id_columns + ["global_index"]], on="global_index" - ) - return formatted_df.drop("global_index", axis=1) - else: - # No id columns to merge, just return melted data without global_index - return melted_df.drop("global_index", axis=1) diff --git a/src/scitex/pd/_merge_columns.py b/src/scitex/pd/_merge_columns.py deleted file mode 100755 index 93e2dc158..000000000 --- a/src/scitex/pd/_merge_columns.py +++ /dev/null @@ -1,222 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-11-05 07:37:09 (ywatanabe)" -# File: ./scitex_repo/src/scitex/pd/_merge_columns.py - -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-10-07 12:03:29 (ywatanabe)" -# ./src/scitex/pd/_merge_cols.py - -from typing import List, Tuple, Union - -import pandas as pd - - -def merge_columns( - df: pd.DataFrame, - *args: Union[str, List[str], Tuple[str, ...]], - sep: str = None, - sep1: str = "_", - sep2: str = "-", - name: str = "merged", -) -> pd.DataFrame: - """Creates a new column by joining specified columns. - - Example - ------- - >>> df = pd.DataFrame({ - ... 'A': [0, 5, 10], - ... 'B': [1, 6, 11], - ... 'C': [2, 7, 12] - ... }) - >>> # Simple concatenation with separator - >>> merge_columns(df, 'A', 'B', sep=' ') - A B C A_B - 0 0 1 2 0 1 - 1 5 6 7 5 6 - 2 10 11 12 10 11 - - >>> # With column labels - >>> merge_columns(df, 'A', 'B', sep1='_', sep2='-') - A B C A_B - 0 0 1 2 A-0_B-1 - 1 5 6 7 A-5_B-6 - 2 10 11 12 A-10_B-11 - - Parameters - ---------- - df : pd.DataFrame - Input DataFrame - *args : Union[str, List[str], Tuple[str, ...]] - Column names to join - sep : str, optional - Simple separator for values only (overrides sep1/sep2) - sep1 : str, optional - Separator between column-value pairs, by default "_" - sep2 : str, optional - Separator between column name and value, by default "-" - name : str, optional - Name for the merged column, by default "merged" - - Returns - ------- - pd.DataFrame - DataFrame with added merged column - """ - _df = df.copy() - columns = args[0] if len(args) == 1 and isinstance(args[0], (list, tuple)) else args - - if not columns: - raise ValueError("No columns specified for merging") - - if not all(col in _df.columns for col in columns): - missing = [col for col in columns if col not in _df.columns] - raise KeyError(f"Columns not found in DataFrame: {missing}") - - # Handle empty DataFrame case - if len(_df) == 0: - # Determine column name - if name == "merged" and sep is not None: - new_col_name = "_".join(columns) - else: - new_col_name = name - # Create empty Series with the correct name - _df[new_col_name] = pd.Series(dtype=str) - return _df - - if sep is not None: - # Simple value concatenation - merged_col = ( - _df[list(columns)] - .astype(str) - .apply( - lambda row: sep.join(row.values), - axis=1, - ) - ) - else: - # Concatenation with column labels - merged_col = _df[list(columns)].apply( - lambda row: sep1.join(f"{col}{sep2}{val}" for col, val in row.items()), - axis=1, - ) - - # Determine column name - if name == "merged" and sep is not None: - # When using simple separator and default name, use joined column names - new_col_name = "_".join(columns) - else: - # Use provided name or default - new_col_name = name - - _df[new_col_name] = merged_col - return _df - - -merge_cols = merge_columns - -# EOF - -# #!./env/bin/python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-10-07 12:03:29 (ywatanabe)" -# # ./src/scitex/pd/_merge_cols.py - - -# def merge_columns(df, *args, sep1="_", sep2="-", name="merged"): -# """ -# Join specified columns with their labels. - -# Example: -# import pandas as pd -# import numpy as np - -# df = pd.DataFrame( -# data=np.arange(25).reshape(5, 5), -# columns=["A", "B", "C", "D", "E"], -# ) - -# df1 = merge_columns(df, "A", "B", sep1="_", sep2="-") -# df2 = merge_columns(df, ["A", "B"], sep1="_", sep2="-") -# assert (df1 == df2).all().all() # True - -# # A B C D E A_B -# # 0 0 1 2 3 4 A-0_B-1 -# # 1 5 6 7 8 9 A-5_B-6 -# # 2 10 11 12 13 14 A-10_B-11 -# # 3 15 16 17 18 19 A-15_B-16 -# # 4 20 21 22 23 24 A-20_B-21 - - -# Parameters -# ---------- -# df : pandas.DataFrame -# Input DataFrame -# *args : str or list -# Column names to join, either as separate arguments or a single list -# sep1 : str, optional -# Separator for joining column names, default "_" -# sep2 : str, optional -# Separator between column name and value, default "-" - -# Returns -# ------- -# pandas.DataFrame -# DataFrame with added merged column -# """ -# _df = df.copy() -# columns = ( -# args[0] -# if len(args) == 1 and isinstance(args[0], (list, tuple)) -# else args -# ) -# merged_col = _df[list(columns)].apply( -# lambda row: sep1.join(f"{col}{sep2}{val}" for col, val in row.items()), -# axis=1, -# ) - -# new_col_name = sep1.join(columns) if not name else str(name) -# _df[new_col_name] = merged_col -# return _df - - -# merge_cols = merge_columns - -# # def merge_columns(_df, *columns): -# # """ -# # Add merged columns in string. - -# # DF = pd.DataFrame(data=np.arange(25).reshape(5,5), -# # columns=["A", "B", "C", "D", "E"], -# # ) - -# # print(DF) - -# # # A B C D E -# # # 0 0 1 2 3 4 -# # # 1 5 6 7 8 9 -# # # 2 10 11 12 13 14 -# # # 3 15 16 17 18 19 -# # # 4 20 21 22 23 24 - -# # print(merge_columns(DF, "A", "B", "C")) - -# # # A B C D E A_B_C -# # # 0 0 1 2 3 4 0_1_2 -# # # 1 5 6 7 8 9 5_6_7 -# # # 2 10 11 12 13 14 10_11_12 -# # # 3 15 16 17 18 19 15_16_17 -# # # 4 20 21 22 23 24 20_21_22 -# # """ -# # from copy import deepcopy - -# # df = deepcopy(_df) -# # merged = deepcopy(df[columns[0]]) # initialization -# # for c in columns[1:]: -# # merged = scitex.ai.utils.merge_labels(list(merged), deepcopy(df[c])) -# # df.loc[:, scitex.gen.connect_strs(columns)] = merged -# # return df - - -# EOF diff --git a/src/scitex/pd/_mv.py b/src/scitex/pd/_mv.py deleted file mode 100755 index 7be5ffc8e..000000000 --- a/src/scitex/pd/_mv.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-11-05 07:39:12 (ywatanabe)" -# File: ./scitex_repo/src/scitex/pd/_mv.py - - -def mv(df, key, position, axis=1): - """ - Move a row or column to a specified position in a DataFrame. - - Args: - df (pandas.DataFrame): The input DataFrame. - key (str): The label of the row or column to move. - position (int): The position to move the row or column to. - axis (int, optional): 0 for rows, 1 for columns. Defaults to 1. - - Returns: - pandas.DataFrame: A new DataFrame with the row or column moved. - """ - if axis == 0: - items = df.index.tolist() - else: - items = df.columns.tolist() - items.remove(key) - - if position < 0: - position += len(items) + 1 - - items.insert(position, key) - return df.reindex(items, axis=axis) - - -def mv_to_first(df, key, axis=1): - """ - Move a row or column to the first position in a DataFrame. - - Args: - df (pandas.DataFrame): The input DataFrame. - key (str): The label of the row or column to move. - axis (int, optional): 0 for rows, 1 for columns. Defaults to 1. - - Returns: - pandas.DataFrame: A new DataFrame with the row or column moved to the first position. - """ - return mv(df, key, 0, axis) - - -def mv_to_last(df, key, axis=1): - """ - Move a row or column to the last position in a DataFrame. - - Args: - df (pandas.DataFrame): The input DataFrame. - key (str): The label of the row or column to move. - axis (int, optional): 0 for rows, 1 for columns. Defaults to 1. - - Returns: - pandas.DataFrame: A new DataFrame with the row or column moved to the last position. - """ - return mv(df, key, -1, axis) - - -# EOF diff --git a/src/scitex/pd/_replace.py b/src/scitex/pd/_replace.py deleted file mode 100755 index 770c1e107..000000000 --- a/src/scitex/pd/_replace.py +++ /dev/null @@ -1,62 +0,0 @@ -#!./env/bin/python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-08-29 23:08:35 (ywatanabe)" -# ./src/scitex/pd/_replace.py - - -def replace(dataframe, old_value, new_value=None, regex=False, cols=None): - """ - Replace values in a DataFrame. - - Example - ------- - import pandas as pd - df = pd.DataFrame({'A': ['abc-123', 'def-456'], 'B': ['ghi-789', 'jkl-012']}) - - # Replace single value - df_replaced = replace(df, 'abc', 'xyz') - - # Replace with dictionary - replace_dict = {'-': '_', '1': 'one'} - df_replaced = replace(df, replace_dict, cols=['A']) - print(df_replaced) - - Parameters - ---------- - dataframe : pandas.DataFrame - Input DataFrame to modify. - old_value : str, dict - If str, the value to replace (requires new_value). - If dict, mapping of old values (keys) to new values (values). - new_value : str, optional - New value to replace old_value with. Required if old_value is str. - regex : bool, optional - If True, treat replacement keys as regular expressions. Default is False. - cols : list of str, optional - List of column names to apply replacements. If None, apply to all columns. - - Returns - ------- - pandas.DataFrame - DataFrame with specified replacements applied. - """ - dataframe = dataframe.copy() - - # Handle different input formats - if isinstance(old_value, dict): - replace_dict = old_value - else: - if new_value is None: - raise ValueError("new_value must be provided when old_value is not a dict") - replace_dict = {old_value: new_value} - - # Apply replacements to all columns if cols not specified - if cols is None: - # Use pandas replace method for all columns - return dataframe.replace(replace_dict, regex=regex) - else: - # Apply to specific columns - for column in cols: - if column in dataframe.columns: - dataframe[column] = dataframe[column].replace(replace_dict, regex=regex) - return dataframe diff --git a/src/scitex/pd/_round.py b/src/scitex/pd/_round.py deleted file mode 100755 index c8b8ef483..000000000 --- a/src/scitex/pd/_round.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-10-06 11:13:00 (ywatanabe)" -# /home/ywatanabe/proj/_scitex_repo_openhands/src/scitex/pd/_round.py - -import numpy as np -import pandas as pd - - -def round(df: pd.DataFrame, factor: int = 3) -> pd.DataFrame: - """ - Round numeric values in a DataFrame to a specified number of decimal places. - - Example - ------- - >>> df = pd.DataFrame({'A': [1.23456, 2.34567], 'B': ['abc', 'def'], 'C': [3, 4]}) - >>> round(df, 2) - A B C - 0 1.23 abc 3 - 1 2.35 def 4 - - Parameters - ---------- - df : pd.DataFrame - Input DataFrame - factor : int, optional - Number of decimal places to round to (default is 3) - - Returns - ------- - pd.DataFrame - DataFrame with rounded numeric values - """ - - def custom_round(column): - # Skip non-numeric types like datetime, categorical, string - if pd.api.types.is_datetime64_any_dtype(column): - return column - if pd.api.types.is_categorical_dtype(column): - return column - if pd.api.types.is_string_dtype(column): - return column - # Note: boolean types are allowed to be converted to numeric - if ( - pd.api.types.is_object_dtype(column) - and not pd.api.types.is_numeric_dtype(column) - and not pd.api.types.is_bool_dtype(column) - ): - return column - - try: - # Handle boolean columns explicitly - if pd.api.types.is_bool_dtype(column): - return column.astype(int) - - numeric_column = pd.to_numeric(column, errors="coerce") - if np.issubdtype(numeric_column.dtype, np.integer): - return numeric_column.astype(int) - - # For float columns, round first - rounded = numeric_column.round(factor) - - # If factor is 0 and all values are whole numbers, convert to int - if factor == 0 and (rounded % 1 == 0).all() and not rounded.isna().any(): - return rounded.astype(int) - - return rounded - - except (ValueError, TypeError): - return column - - return df.apply(custom_round) - - -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-10-05 20:40:32 (ywatanabe)" -# /home/ywatanabe/proj/_scitex_repo_openhands/src/scitex/pd/_round.py - -# import numpy as np - -# def round(df, factor=3): -# return df.apply(lambda x: x.round(factor) if np.issubdtype(x.dtype, np.number) else x) - - -# def round(df, factor=3): -# def custom_round(x): -# try: -# numeric_x = pd.to_numeric(x, errors='raise') -# if np.issubdtype(numeric_x.dtype, np.integer): -# return numeric_x -# else: -# return numeric_x.apply(lambda y: float(f'{y:.{factor}g}')) -# except (ValueError, TypeError): -# return x - -# return df.apply(custom_round) diff --git a/src/scitex/pd/_skills/SKILL.md b/src/scitex/pd/_skills/SKILL.md deleted file mode 100644 index 173f3c611..000000000 --- a/src/scitex/pd/_skills/SKILL.md +++ /dev/null @@ -1,62 +0,0 @@ ---- -name: stx.pd -description: Pandas DataFrame utilities for filtering, reshaping, merging, and scientific data manipulation. ---- - -# stx.pd - -The `stx.pd` module provides utility functions for pandas DataFrames tailored to scientific data analysis workflows. It extends pandas with helpers for indicator-based indexing, p-value column detection, melting, and coordinate-based reshaping. - -## Python API - -```python -import scitex as stx -import pandas as pd - -df = pd.DataFrame({"x": [1,2,3], "y": [4,5,6], "p_val": [0.01, 0.05, 0.5]}) - -# Boolean indicator indexing -indi = stx.pd.find_indi(df, col="group", values=["A", "B"]) -subset = df[indi] - -# Find p-value columns automatically -p_col = stx.pd.find_pval(df) - -# Force to DataFrame -df = stx.pd.force_df(my_array_or_list) - -# Coordinate reshaping -df = stx.pd.from_xyz(x_array, y_array, z_matrix) -x, y, z = stx.pd.to_xyz(df) - -# Get unique values as list -unique_vals = stx.pd.get_unique(df["group"]) - -# Melt multiple columns -melted = stx.pd.melt_cols(df, cols=["col1", "col2"], id_vars=["id"]) - -# Merge columns into one -merged = stx.pd.merge_cols(df, cols=["first", "last"], sep="_") - -# Column reordering -df = stx.pd.mv(df, col="important_col", position=0) -df = stx.pd.mv_to_first(df, "id") -df = stx.pd.mv_to_last(df, "notes") - -# Type utilities -df = stx.pd.to_numeric(df, cols=["value"]) - -# Suppress SettingWithCopyWarning -with stx.pd.ignore_SettingWithCopyWarning(): - df["new_col"] = df["old_col"] * 2 -``` - -## Key Features - -- `find_indi(df, col, values)` — boolean index for filtering rows -- `find_pval(df)` — auto-detect p-value column name -- `force_df(obj)` — coerce arrays/dicts/lists to DataFrame -- `from_xyz` / `to_xyz` — convert between XYZ arrays and pivot tables -- `melt_cols` / `merge_cols` / `mv` — DataFrame reshaping and column management -- `ignore_SettingWithCopyWarning` — context manager for pandas warning suppression -- `to_numeric` / `round` / `slice` / `sort` / `replace` — common data operations diff --git a/src/scitex/pd/_skills/cleaning.md b/src/scitex/pd/_skills/cleaning.md deleted file mode 100644 index 4879b5930..000000000 --- a/src/scitex/pd/_skills/cleaning.md +++ /dev/null @@ -1,157 +0,0 @@ ---- -description: Value replacement (replace), numeric rounding preserving non-numeric columns (round), and sort with optional custom category order (sort). ---- - -# Data Cleaning - -## replace - -Replace values in a DataFrame, with optional column scoping and regex support. - -```python -replace(dataframe, old_value, new_value=None, regex=False, cols=None) -> pd.DataFrame -``` - -**Parameters** - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `dataframe` | `pd.DataFrame` | required | Input DataFrame (not modified in place) | -| `old_value` | `str` or `dict` | required | Value to replace (with `new_value`), or a mapping `{old: new, …}` | -| `new_value` | `any` | `None` | Replacement value; required when `old_value` is not a dict | -| `regex` | `bool` | `False` | Treat `old_value` keys as regex patterns | -| `cols` | `list[str]` or `None` | `None` | Columns to apply replacement to; `None` applies to all columns | - -Always returns a copy; original is unchanged. - -**Examples** - -```python -import scitex as stx -import pandas as pd - -df = pd.DataFrame({'A': ['abc-123', 'def-456'], 'B': ['ghi-789', 'jkl-012']}) - -# Single replacement across all columns -stx.pd.replace(df, '-', '_') -# A B -# 0 abc_123 ghi_789 -# 1 def_456 jkl_012 - -# Dict-based multi-replacement, column-scoped -stx.pd.replace(df, {'-': '_', 'abc': 'xyz'}, cols=['A']) -# A B -# 0 xyz_123 ghi-789 -# 1 def_456 jkl-012 - -# Regex replacement -stx.pd.replace(df, r'\d+', 'NUM', regex=True) -# A B -# 0 abc-NUM ghi-NUM -# 1 def-NUM jkl-NUM -``` - ---- - -## round - -Round all numeric columns in a DataFrame to a fixed number of decimal places, leaving non-numeric columns unchanged. - -```python -round(df, factor=3) -> pd.DataFrame -``` - -**Parameters** - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `df` | `pd.DataFrame` | required | Input DataFrame | -| `factor` | `int` | `3` | Number of decimal places | - -**Column-type handling** - -| Column dtype | Behaviour | -|-------------|-----------| -| Datetime | Left unchanged | -| Categorical | Left unchanged | -| String / object (non-numeric) | Left unchanged | -| Boolean | Converted to `int` (0/1) | -| Integer | Left as integer (no float promotion) | -| Float | Rounded to `factor` decimal places | -| `factor=0` and all whole numbers | Converted to `int` | - -**Examples** - -```python -import scitex as stx -import pandas as pd - -df = pd.DataFrame({ - 'score': [1.23456, 2.34567], - 'label': ['a', 'b'], - 'count': [3, 4], -}) - -stx.pd.round(df, 2) -# score label count -# 0 1.23 a 3 -# 1 2.35 b 4 - -stx.pd.round(df, 0) -# score label count -# 0 1 a 3 -# 1 2 b 4 -``` - ---- - -## sort - -Sort a DataFrame by one or more columns, with optional custom category ordering. The sort-key columns are moved to the front of the result. - -```python -sort(dataframe, by=None, ascending=True, inplace=False, kind="quicksort", - na_position="last", ignore_index=False, key=None, orders=None) -> pd.DataFrame -``` - -**Parameters** - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `dataframe` | `pd.DataFrame` | required | DataFrame to sort | -| `by` | `str` or `list[str]` or `None` | `None` | Column(s) to sort by; when `None` and `orders` is set, uses `orders.keys()` | -| `ascending` | `bool` or `list[bool]` | `True` | Sort direction | -| `inplace` | `bool` | `False` | Update original DataFrame in place (partial — index not updated correctly; prefer `False`) | -| `kind` | `str` | `"quicksort"` | Sorting algorithm passed to pandas | -| `na_position` | `str` | `"last"` | `"first"` or `"last"` for NaN placement | -| `ignore_index` | `bool` | `False` | Reset index to 0, 1, … in output | -| `key` | `callable` or `None` | `None` | Applied to values before sorting (overridden when `orders` is set) | -| `orders` | `dict[str, list]` or `None` | `None` | Custom sort order per column; creates `pd.Categorical` internally | - -**Column reordering side-effect:** the columns specified in `by` are moved to the front of the returned DataFrame. - -**Examples** - -```python -import scitex as stx -import pandas as pd - -df = pd.DataFrame({'A': ['foo', 'bar', 'baz'], 'B': [3, 2, 1]}) - -# Standard ascending sort -stx.pd.sort(df, by='B') -# B A -# 1 1.0 baz -# ... - -# Custom category order -custom = {'A': ['bar', 'baz', 'foo']} -stx.pd.sort(df, orders=custom) -# A B -# 1 bar 2 -# 2 baz 1 -# 0 foo 3 - -# Multi-column sort -stx.pd.sort(df, by=['A', 'B'], ascending=[True, False]) -``` diff --git a/src/scitex/pd/_skills/columns.md b/src/scitex/pd/_skills/columns.md deleted file mode 100644 index 2ef166ea3..000000000 --- a/src/scitex/pd/_skills/columns.md +++ /dev/null @@ -1,166 +0,0 @@ ---- -description: Column reordering (mv, mv_to_first, mv_to_last), column concatenation into a label string (merge_columns / merge_cols), and column melting (melt_cols). ---- - -# Column Operations - -## mv / mv_to_first / mv_to_last - -Move a column (or row) to any position within a DataFrame without altering data. - -```python -mv(df, key, position, axis=1) -> pd.DataFrame -mv_to_first(df, key, axis=1) -> pd.DataFrame -mv_to_last(df, key, axis=1) -> pd.DataFrame -``` - -**Parameters** - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `df` | `pd.DataFrame` | required | Input DataFrame | -| `key` | `str` | required | Column or row label to move | -| `position` | `int` | required | Target 0-based position; negative indices are resolved relative to the final length | -| `axis` | `int` | `1` | `1` = columns (default), `0` = rows | - -`mv_to_first` is shorthand for `mv(df, key, 0)`. -`mv_to_last` is shorthand for `mv(df, key, -1)`. - -**Examples** - -```python -import scitex as stx -import pandas as pd - -df = pd.DataFrame({'A': [1], 'B': [2], 'C': [3], 'D': [4]}) - -# Move 'C' to position 1 -stx.pd.mv(df, 'C', 1).columns.tolist() -# ['A', 'C', 'B', 'D'] - -# Bring 'D' to front -stx.pd.mv_to_first(df, 'D').columns.tolist() -# ['D', 'A', 'B', 'C'] - -# Send 'A' to back -stx.pd.mv_to_last(df, 'A').columns.tolist() -# ['B', 'C', 'D', 'A'] - -# Move a row (axis=0) -df2 = pd.DataFrame({'val': [10, 20, 30]}, index=['a', 'b', 'c']) -stx.pd.mv(df2, 'c', 0, axis=0).index.tolist() -# ['c', 'a', 'b'] -``` - ---- - -## merge_columns / merge_cols - -Create a new string column by combining the values of existing columns. Two modes are available: plain value concatenation (with `sep`) or labelled concatenation (with `sep1`/`sep2`). - -`merge_cols` is an alias for `merge_columns`. - -```python -merge_columns(df, *args, sep=None, sep1="_", sep2="-", name="merged") -> pd.DataFrame -``` - -**Parameters** - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `df` | `pd.DataFrame` | required | Input DataFrame | -| `*args` | `str` or `list`/`tuple` of `str` | required | Column names to combine; may be given as positional arguments or a single list/tuple | -| `sep` | `str` | `None` | When provided: simple value-only concatenation with this separator; new column is named `"A_B"` (joined column names) | -| `sep1` | `str` | `"_"` | Separator between `col-value` pairs when `sep=None` | -| `sep2` | `str` | `"-"` | Separator between column name and its value when `sep=None` | -| `name` | `str` | `"merged"` | Explicit name for the new column (overrides auto-naming when `sep` is set) | - -**Output column naming** - -- `sep` provided + `name` left at default `"merged"` → new column is `"_".join(columns)` (e.g. `"A_B"`) -- `sep` provided + `name` given explicitly → uses that name -- `sep=None` → uses `name` (default `"merged"`) - -**Examples** - -```python -import scitex as stx -import pandas as pd - -df = pd.DataFrame({'A': [0, 5], 'B': [1, 6], 'C': [2, 7]}) - -# Plain concatenation — new column named 'A_B' -stx.pd.merge_columns(df, 'A', 'B', sep=' ') -# A B C A_B -# 0 0 1 2 0 1 -# 1 5 6 7 5 6 - -# Labelled concatenation (default) — new column named 'merged' -stx.pd.merge_columns(df, 'A', 'B') -# A B C merged -# 0 0 1 2 A-0_B-1 -# 1 5 6 7 A-5_B-6 - -# Pass columns as a list -stx.pd.merge_columns(df, ['A', 'B', 'C'], sep='-') -# A B C A_B_C -# 0 0 1 2 0-1-2 -# 1 5 6 7 5-6-7 - -# Alias -stx.pd.merge_cols(df, 'A', 'C', sep='|') -``` - ---- - -## melt_cols - -Melt a selected subset of columns while preserving all other identifier columns (long-format expansion). - -```python -melt_cols(df, cols, id_columns=None) -> pd.DataFrame -``` - -**Parameters** - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `df` | `pd.DataFrame` | required | Input DataFrame | -| `cols` | `list[str]` | required | Columns to melt (become `variable` / `value` rows) | -| `id_columns` | `list[str]` or `None` | `None` | Columns to keep as identifiers; defaults to all columns not in `cols` | - -The output contains: -- All `id_columns` repeated for each melted column -- A `variable` column with the original column name -- A `value` column with the cell value (renamed `melted_value` if `"value"` is one of the melted columns) - -**Example** - -```python -import scitex as stx -import pandas as pd - -data = pd.DataFrame({ - 'id': [1, 2], - 'name': ['Alice', 'Bob'], - 'score_1': [85, 90], - 'score_2': [92, 88], -}) - -result = stx.pd.melt_cols(data, cols=['score_1', 'score_2']) -# id name variable value -# 0 1 Alice score_1 85 -# 1 2 Bob score_1 90 -# 2 1 Alice score_2 92 -# 3 2 Bob score_2 88 - -# Restrict identifiers explicitly -result2 = stx.pd.melt_cols(data, cols=['score_1', 'score_2'], id_columns=['id']) -# id variable value -# 0 1 score_1 85 -# ... -``` - -**Error handling** - -Raises `ValueError` if any column in `cols` is not present in `df`. diff --git a/src/scitex/pd/_skills/conversion.md b/src/scitex/pd/_skills/conversion.md deleted file mode 100644 index 568d85781..000000000 --- a/src/scitex/pd/_skills/conversion.md +++ /dev/null @@ -1,115 +0,0 @@ ---- -description: Convert arbitrary Python objects to DataFrames (force_df) and coerce columns to numeric types (to_numeric). ---- - -# Data Conversion - -## force_df - -Converts virtually any Python object into a `pd.DataFrame`. Returns the input unchanged if it is already a DataFrame. Handles `None` as an empty DataFrame. - -```python -force_df(data, filler=np.nan) -> pd.DataFrame -``` - -**Parameters** - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `data` | any | required | Object to convert | -| `filler` | any | `np.nan` | Fill value for missing entries when building from an uneven dict | - -**Input type handling** - -| Input type | Behaviour | -|------------|-----------| -| `None` | Returns `pd.DataFrame()` | -| `pd.DataFrame` | Returned as-is (no copy) | -| `pd.Series` | `.to_frame()` | -| `np.ndarray` 1-D | Single column named `"value"` | -| `np.ndarray` 2-D | Columns 0, 1, 2, … | -| `np.ndarray` N-D | Reshaped to `(shape[0], -1)` | -| `int`, `float`, `str`, `bool` | Single row, column named `"value"` | -| `list` / `tuple` of scalars | Single column named `"value"` | -| `list` of `list`/`tuple`/`ndarray` | Multi-column DataFrame | -| `dict` | Columns from keys; unequal lengths padded with `filler` | -| Other iterable | `pd.DataFrame(list(data), columns=["value"])` | - -**Examples** - -```python -import scitex as stx -import numpy as np -import pandas as pd - -# Scalar -stx.pd.force_df(42) -# value -# 0 42 - -# 1-D array -stx.pd.force_df(np.array([1, 2, 3])) -# value -# 0 1 -# 1 2 -# 2 3 - -# Uneven dict — short column padded with NaN -stx.pd.force_df({'A': [1, 2, 3], 'B': [4, 5]}) -# A B -# 0 1 4.0 -# 1 2 5.0 -# 2 3 NaN - -# DataFrame is returned unchanged -df = pd.DataFrame({'x': [1]}) -assert stx.pd.force_df(df) is df -``` - ---- - -## to_numeric - -Attempts to convert every column in a DataFrame to a numeric dtype. Non-convertible columns are handled according to the `errors` parameter. - -```python -to_numeric(df, errors="coerce") -> pd.DataFrame -``` - -**Parameters** - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `df` | `pd.DataFrame` | required | Input DataFrame | -| `errors` | `str` | `"coerce"` | `"coerce"` → invalid values become `NaN`; `"ignore"` → non-numeric columns kept unchanged; `"raise"` → raises on invalid values | - -**Behaviour detail** - -- Operates on a copy; never modifies the original. -- A column that converts entirely to `NaN` while the original had values is treated as a pure-string column: - - `errors="ignore"` keeps it as-is. - - `errors="coerce"` still replaces it with all-`NaN`. -- Integer columns are left as integers; no float promotion occurs for already-integer data. - -**Examples** - -```python -import scitex as stx -import pandas as pd - -df = pd.DataFrame({'a': ['1', '2', '3'], 'b': ['x', 'y', 'z'], 'c': [1.1, 2.2, 3.3]}) - -# Default: coerce — 'b' becomes NaN -stx.pd.to_numeric(df) -# a b c -# 0 1.0 NaN 1.1 -# 1 2.0 NaN 2.2 -# 2 3.0 NaN 3.3 - -# ignore — pure string column 'b' kept -stx.pd.to_numeric(df, errors="ignore") -# a b c -# 0 1 x 1.1 -# 1 2 y 2.2 -# 2 3 z 3.3 -``` diff --git a/src/scitex/pd/_skills/filtering.md b/src/scitex/pd/_skills/filtering.md deleted file mode 100644 index 630879531..000000000 --- a/src/scitex/pd/_skills/filtering.md +++ /dev/null @@ -1,111 +0,0 @@ ---- -description: Row selection with multi-column conditions including NaN-safe matching (find_indi) and combined row/column slicing (slice). ---- - -# Filtering and Slicing - -## find_indi - -Find row indices where all given column conditions are satisfied simultaneously. Handles `NaN` / `None` / `pd.NA` values in both the DataFrame and the condition values. - -```python -find_indi(df, conditions) -> list[int] -``` - -**Parameters** - -| Parameter | Type | Description | -|-----------|------|-------------| -| `df` | `pd.DataFrame` | Input DataFrame | -| `conditions` | `dict[str, str | int | float | list]` | Mapping of column names to required values. A list value uses `isin`; a scalar uses `==`. `NaN`/`None` in the list or as a scalar value matches `NaN` rows. | - -**Returns** a plain Python `list` of integer positional indices (`.tolist()` of the boolean mask index). - -**Raises** `KeyError` if any key in `conditions` is not a column of `df`. - -Returns `[]` for an empty `conditions` dict. - -**Examples** - -```python -import scitex as stx -import pandas as pd - -df = pd.DataFrame({'A': [1, 2, 3, 1], 'B': ['x', 'y', 'x', 'z']}) - -# Single-value condition -stx.pd.find_indi(df, {'B': 'x'}) -# [0, 2] - -# List condition (isin) -stx.pd.find_indi(df, {'A': [1, 2]}) -# [0, 1, 3] - -# Combined conditions (AND logic) -stx.pd.find_indi(df, {'A': [1, 2], 'B': 'x'}) -# [0] - -# NaN matching -df2 = pd.DataFrame({'A': [1, None, 3], 'B': ['x', 'x', 'y']}) -stx.pd.find_indi(df2, {'A': [1, None], 'B': 'x'}) -# [0, 1] ← row with None matches because None is in the list -``` - ---- - -## slice - -Slice rows and/or columns from a DataFrame in one call. Combines index-based slicing, condition-based row selection, and column selection. - -```python -slice(df, conditions=None, columns=None) -> pd.DataFrame -``` - -**Parameters** - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `df` | `pd.DataFrame` | required | Input DataFrame | -| `conditions` | `builtins.slice`, `dict`, or `None` | `None` | `slice` object → `iloc`-based row slicing; `dict` → passed to `find_indi` for condition-based row selection; `None` → no row filtering | -| `columns` | `list[str]` or `None` | `None` | Columns to keep; applied after row filtering | - -Always returns a copy; the original DataFrame is not modified. - -**Examples** - -```python -import scitex as stx -import pandas as pd - -df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': ['x', 'y', 'x', 'y'], 'C': [10, 20, 30, 40]}) - -# Slice first two rows by position -stx.pd.slice(df, slice(0, 2)) -# A B C -# 0 1 x 10 -# 1 2 y 20 - -# Slice by condition -stx.pd.slice(df, {'B': 'x'}) -# A B C -# 0 1 x 10 -# 2 3 x 30 - -# Condition + column selection -stx.pd.slice(df, {'B': 'y'}, columns=['A', 'C']) -# A C -# 1 2 20 -# 3 4 40 - -# Column selection only (no row filtering) -stx.pd.slice(df, columns=['A', 'B']) -# A B -# 0 1 x -# 1 2 y -# 2 3 x -# 3 4 y -``` - -**Relationship to find_indi** - -`slice` internally delegates condition-based row selection to `find_indi`, so all NaN-safe matching rules described there apply here too. diff --git a/src/scitex/pd/_skills/search.md b/src/scitex/pd/_skills/search.md deleted file mode 100644 index ffbbb92f1..000000000 --- a/src/scitex/pd/_skills/search.md +++ /dev/null @@ -1,113 +0,0 @@ ---- -description: Locate p-value columns by name pattern (find_pval) and extract a scalar from a column that should be constant (get_unique). ---- - -# Value Search - -## find_pval - -Identify which columns (or dict keys) hold p-values by matching names against the pattern `p[-_]?val(ue)?` (case-insensitive). Names containing `"stars"` are excluded so significance-star columns are not confused with raw p-values. - -```python -find_pval(data, multiple=True) -> str | list[str] | None -``` - -**Parameters** - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `data` | `pd.DataFrame`, `np.ndarray`, `list`, or `dict` | required | Data structure to inspect | -| `multiple` | `bool` | `True` | `True` → return all matching names as a list; `False` → return only the first match as a string (or `None`) | - -**Dispatch behaviour** - -- `pd.DataFrame` → scans `df.columns` -- `dict` → scans dict keys -- `list` / `np.ndarray` whose first element is a `dict` → scans that dict's keys -- Other types → raises `ValueError` - -**Internal helper** `_find_pval_col(df, multiple=False)` is also exported for direct DataFrame use. - -**Examples** - -```python -import scitex as stx -import pandas as pd - -df = pd.DataFrame({'p_value': [0.05], 'pval': [0.01], 'p_stars': ['*'], 'other': [1]}) - -# All matches -stx.pd.find_pval(df) -# ['p_value', 'pval'] ← 'p_stars' is excluded by the (?!.*stars) negative lookahead - -# First match only -stx.pd.find_pval(df, multiple=False) -# 'p_value' - -# Dict input -d = {'pvalue': 0.05, 'effect_size': 0.3} -stx.pd.find_pval(d) -# ['pvalue'] - -# Typical workflow: find column, then apply correction -col = stx.pd.find_pval(results_df, multiple=False) -if col: - corrected = results_df[col] * len(results_df) # Bonferroni -``` - ---- - -## get_unique - -Return the single unique value from a column, or a default value when the column is missing or contains more than one distinct value. Useful when a grouped DataFrame is expected to have a constant metadata column (e.g. subject ID, session label). - -```python -get_unique(df, column, default=None, raise_on_multiple=False) -> any -``` - -**Parameters** - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `df` | `pd.DataFrame` | required | DataFrame to inspect | -| `column` | `str` | required | Column name to check | -| `default` | `any` | `None` | Value returned when the column is absent or has multiple unique values | -| `raise_on_multiple` | `bool` | `False` | If `True`, raises `ValueError` when > 1 unique value exists; raises `KeyError` for missing column | - -**Logic** - -1. Column absent → return `default` (or raise `KeyError` if `raise_on_multiple=True`) -2. Exactly one unique value → return it -3. Multiple unique values → return `default` (or raise `ValueError` if `raise_on_multiple=True`) - -The error message for multiple values includes the first five unique values for debugging. - -**Examples** - -```python -import scitex as stx -import pandas as pd - -# Constant column — returns the single value -df = pd.DataFrame({'subject': ['S01', 'S01', 'S01'], 'value': [1, 2, 3]}) -stx.pd.get_unique(df, 'subject') -# 'S01' - -# Mixed column — returns default -df2 = pd.DataFrame({'subject': ['S01', 'S02'], 'value': [1, 2]}) -stx.pd.get_unique(df2, 'subject', default='mixed') -# 'mixed' - -# Missing column -stx.pd.get_unique(df, 'session', default='unknown') -# 'unknown' - -# Strict mode -stx.pd.get_unique(df2, 'subject', raise_on_multiple=True) -# ValueError: Column 'subject' has 2 unique values: ['S01', 'S02'] - -# Typical per-group metadata extraction -for _, group in df.groupby('subject'): - subject_id = stx.pd.get_unique(group, 'subject', raise_on_multiple=True) - process(subject_id, group) -``` diff --git a/src/scitex/pd/_skills/shape.md b/src/scitex/pd/_skills/shape.md deleted file mode 100644 index 6ab6358fe..000000000 --- a/src/scitex/pd/_skills/shape.md +++ /dev/null @@ -1,133 +0,0 @@ ---- -description: Convert between wide matrix format and long (x, y, z) row format using to_xyz, from_xyz, and to_xy. ---- - -# Shape Transformation - -Three functions form a complementary trio for switching between wide (pivot/heatmap) and long (triplet) DataFrame layouts. - -## to_xyz - -Converts a wide DataFrame (matrix / heatmap layout) to long format. Each cell becomes one row with columns `x` (row index), `y` (column name), and `z` (value). - -```python -to_xyz(data_frame) -> pd.DataFrame -``` - -The output column names inherit from `data_frame.index.name` (→ `x` axis) and `data_frame.columns.name` (→ `y` axis). If those names are `None`, `"x"` and `"y"` are used. - -**Example** - -```python -import scitex as stx -import pandas as pd - -wide = pd.DataFrame( - {'col_A': [1, 2], 'col_B': [3, 4]}, - index=['row_0', 'row_1'] -) -long = stx.pd.to_xyz(wide) -# x y z -# 0 row_0 col_A 1 -# 1 row_1 col_A 2 -# 2 row_0 col_B 3 -# 3 row_1 col_B 4 -``` - ---- - -## from_xyz - -Converts a long-format DataFrame (triplets) back to a wide pivot table (heatmap layout). This is the inverse of `to_xyz`. - -```python -from_xyz(data_frame, x=None, y=None, z=None, square=False) -> pd.DataFrame -``` - -**Parameters** - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `data_frame` | `pd.DataFrame` | required | Long-format input with x, y, z columns | -| `x` | `str` | `"x"` | Column used as pivot columns axis | -| `y` | `str` | `"y"` | Column used as pivot index axis | -| `z` | `str` | `"z"` | Column used as values | -| `square` | `bool` | `False` | If `True`, forces a square output by unioning index and column labels; missing cells are filled with `0` | - -Missing cells are filled with `0` via `fillna(0)`. When multiple rows share the same (x, y) pair the first occurrence wins (`aggfunc="first"`). - -**Example** - -```python -import scitex as stx -import pandas as pd - -long = pd.DataFrame({ - 'x': ['A', 'B', 'A'], - 'y': ['X', 'X', 'Y'], - 'z': [0.01, 0.05, 0.1] -}) -wide = stx.pd.from_xyz(long) -# y A B -# x -# X 0.01 0.05 -# Y 0.10 0.00 ← missing cell filled with 0 - -# Square output (union of A, B and X, Y labels): -stx.pd.from_xyz(long, square=True) -``` - -**Custom column names** - -```python -df = pd.DataFrame({'row': ['r1', 'r2'], 'col': ['c1', 'c1'], 'val': [10, 20]}) -stx.pd.from_xyz(df, x='col', y='row', z='val') -``` - ---- - -## to_xy - -Converts a *square* wide DataFrame to long format. Behaves similarly to `to_xyz` but requires the DataFrame to be square (`shape[0] == shape[1]`) and reconciles mismatched index/column labels before expanding. - -```python -to_xy(data_frame) -> pd.DataFrame -``` - -**Constraint:** input must be square. If index and columns differ, one must be a default integer range — that range is replaced by the other. - -**Output columns:** `["x", "y", "z"]` - -**Example** - -```python -import scitex as stx -import pandas as pd -import numpy as np - -square = pd.DataFrame( - np.array([[1, 2], [3, 4]]), - index=['A', 'B'], - columns=['A', 'B'] -) -result = stx.pd.to_xy(square) -# x y z -# 0 A A 1 -# 1 B A 3 -# 2 A B 2 -# 3 B B 4 -``` - ---- - -## Workflow: round-trip - -```python -import scitex as stx - -# Start from wide -wide = stx.pd.from_xyz(long_df) # long → wide -long_again = stx.pd.to_xyz(wide) # wide → long - -# For symmetric matrices use to_xy instead of to_xyz -``` diff --git a/src/scitex/pd/_skills/warnings.md b/src/scitex/pd/_skills/warnings.md deleted file mode 100644 index a5c2010f4..000000000 --- a/src/scitex/pd/_skills/warnings.md +++ /dev/null @@ -1,38 +0,0 @@ ---- -description: Context manager to suppress pandas SettingWithCopyWarning for a block of code. ---- - -# Warnings - -## ignore_setting_with_copy_warning - -Context manager that temporarily silences `pandas.errors.SettingWithCopyWarning` (or the equivalent `pandas.core.common.SettingWithCopyWarning` on older pandas versions). - -The canonical name is `ignore_setting_with_copy_warning`. The PascalCase alias `ignore_SettingWithCopyWarning` is retained for backward compatibility but is deprecated. - -```python -@contextmanager -ignore_setting_with_copy_warning() -``` - -**When to use** - -This warning fires when pandas detects an assignment to a DataFrame slice that may or may not modify the original. If you have already verified correctness (e.g. you are intentionally modifying a view, or the slice is used as a temporary), wrapping the block suppresses noise without hiding real bugs elsewhere. - -**Examples** - -```python -import scitex as stx - -# Suppress warning for a specific assignment block -with stx.pd.ignore_setting_with_copy_warning(): - df['column'] = new_values - -# Deprecated alias (still functional) -with stx.pd.ignore_SettingWithCopyWarning(): - df['column'] = new_values -``` - -**Implementation note** - -Uses `warnings.catch_warnings()` + `warnings.simplefilter("ignore", SettingWithCopyWarning)` internally, so warning filters are fully restored on context exit even if an exception is raised. diff --git a/src/scitex/pd/_slice.py b/src/scitex/pd/_slice.py deleted file mode 100755 index b85f583fb..000000000 --- a/src/scitex/pd/_slice.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-11-05 07:45:00 (ywatanabe)" -# File: ./scitex_repo/src/scitex/pd/_slice.py - -import builtins -from typing import Dict, List, Optional, Union - -import pandas as pd - -from ._find_indi import find_indi - - -def slice( - df: pd.DataFrame, - conditions: Union[ - builtins.slice, Dict[str, Union[str, int, float, List]], None - ] = None, - columns: Optional[List[str]] = None, -) -> pd.DataFrame: - """Slices DataFrame rows and/or columns. - - Example - ------- - >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': ['x', 'y', 'x']}) - >>> # Slice by row indices - >>> result = slice(df, slice(0, 2)) - >>> # Slice by conditions - >>> result = slice(df, {'A': [1, 2], 'B': 'x'}) - >>> # Slice columns - >>> result = slice(df, columns=['A']) - - Parameters - ---------- - df : pd.DataFrame - Input DataFrame to slice - conditions : slice, Dict, or None - Either a slice object for row indices, or a dictionary of column conditions - columns : List[str], optional - List of column names to select - - Returns - ------- - pd.DataFrame - Sliced DataFrame - """ - result = df.copy() - - # Handle row slicing - if isinstance(conditions, builtins.slice): - result = result.iloc[conditions] - elif isinstance(conditions, dict): - indices = find_indi(result, conditions) - result = result.loc[indices] - - # Handle column slicing - if columns is not None: - result = result[columns] - - return result - - -# EOF diff --git a/src/scitex/pd/_sort.py b/src/scitex/pd/_sort.py deleted file mode 100755 index 76fce4778..000000000 --- a/src/scitex/pd/_sort.py +++ /dev/null @@ -1,91 +0,0 @@ -#!./env/bin/python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-08-25 09:35:39 (ywatanabe)" -# ./src/scitex/pd/_sort.py - -import pandas as pd - - -def sort( - dataframe, - by=None, - ascending=True, - inplace=False, - kind="quicksort", - na_position="last", - ignore_index=False, - key=None, - orders=None, -): - """ - Sort DataFrame by specified column(s) with optional custom ordering and column reordering. - - Example - ------- - import pandas as pd - df = pd.DataFrame({'A': ['foo', 'bar', 'baz'], 'B': [3, 2, 1]}) - custom_order = {'A': ['bar', 'baz', 'foo']} - sorted_df = sort(df, by=None, orders=custom_order) - print(sorted_df) - - Parameters - ---------- - dataframe : pandas.DataFrame - The DataFrame to sort. - by : str or list of str, optional - Name(s) of column(s) to sort by. - ascending : bool or list of bool, default True - Sort ascending vs. descending. - inplace : bool, default False - If True, perform operation in-place. - kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' - Choice of sorting algorithm. - na_position : {'first', 'last'}, default 'last' - Puts NaNs at the beginning if 'first'; 'last' puts NaNs at the end. - ignore_index : bool, default False - If True, the resulting axis will be labeled 0, 1, …, n - 1. - key : callable, optional - Apply the key function to the values before sorting. - orders : dict, optional - Dictionary of column names and their custom sort orders. - - Returns - ------- - pandas.DataFrame - Sorted DataFrame with reordered columns. - """ - if orders: - by = [by] if isinstance(by, str) else list(orders.keys()) if by is None else by - - def apply_custom_order(column): - return ( - pd.Categorical(column, categories=orders[column.name], ordered=True) - if column.name in orders - else column - ) - - key = apply_custom_order - elif isinstance(by, str): - by = [by] - - sorted_df = dataframe.sort_values( - by=by, - ascending=ascending, - inplace=False, - kind=kind, - na_position=na_position, - ignore_index=ignore_index, - key=key, - ) - - # Reorder columns - if by: - other_columns = [col for col in sorted_df.columns if col not in by] - sorted_df = sorted_df[by + other_columns] - - if inplace: - dataframe.update(sorted_df) - dataframe.reindex(columns=sorted_df.columns) - return dataframe - else: - return sorted_df diff --git a/src/scitex/pd/_to_numeric.py b/src/scitex/pd/_to_numeric.py deleted file mode 100755 index 63ae22237..000000000 --- a/src/scitex/pd/_to_numeric.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-11-08 04:35:31 (ywatanabe)" -# File: ./scitex_repo/src/scitex/pd/_to_numeric.py - -import pandas as pd - - -def to_numeric(df, errors="coerce"): - """Convert all possible columns in a DataFrame to numeric types. - - Parameters - ---------- - df : pd.DataFrame - Input DataFrame - errors : str, optional - How to handle errors. 'coerce' (default) converts invalid values to NaN, - 'ignore' leaves non-numeric columns unchanged, 'raise' raises exceptions. - - Returns - ------- - pd.DataFrame - DataFrame with numeric columns converted - """ - df_copy = df.copy() - for col in df_copy.columns: - # First try to convert - original_col = df_copy[col] - converted_col = pd.to_numeric(df_copy[col], errors="coerce") - - # Check if conversion resulted in all NaN when original had values - if converted_col.isna().all() and not original_col.isna().all(): - # This is likely a pure string column - if errors == "ignore": - # Keep original for pure string columns - continue - else: - # For coerce, still apply it - df_copy[col] = converted_col - elif not converted_col.equals(original_col): - # Conversion changed something - if errors == "ignore": - # Only convert if it doesn't introduce new NaNs - if converted_col.isna().sum() == original_col.isna().sum(): - df_copy[col] = converted_col - elif errors == "coerce": - df_copy[col] = converted_col - elif errors == "raise": - df_copy[col] = pd.to_numeric(df_copy[col], errors="raise") - return df_copy - - -# EOF diff --git a/src/scitex/pd/_to_xy.py b/src/scitex/pd/_to_xy.py deleted file mode 100755 index 4a39b5bd1..000000000 --- a/src/scitex/pd/_to_xy.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/./env/bin/python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-09-03 07:01:31 (ywatanabe)" -# ./src/scitex/pd/_to_xy.py - -import numpy as np -import pandas as pd - -import scitex - - -def to_xy(data_frame): - """ - Convert a heatmap DataFrame into x, y, z format. - - Ensure the index and columns are the same, and if either exists, replace with that. - - Example - ------- - data_frame = pd.DataFrame(...) # Your DataFrame here - out = to_xy(data_frame) - print(out) - - Parameters - ---------- - data_frame : pandas.DataFrame - The input DataFrame to be converted. - - Returns - ------- - pandas.DataFrame - A DataFrame formatted with columns ['x', 'y', 'z'] - """ - assert data_frame.shape[0] == data_frame.shape[1] - - if not data_frame.index.equals(data_frame.columns): - if (data_frame.index == np.array(range(len(data_frame.index)))).all(): - data_frame.columns = data_frame.index - elif (data_frame.columns == np.array(range(len(data_frame.columns)))).all(): - data_frame.index = data_frame.columns - else: - ValueError - # else: - # ValueError "Either of index or columns has to be passed" - - formatted_data_frames = [] - - for column in data_frame.columns: - column_data_frame = data_frame[column] - y_label = column_data_frame.name - column_data_frame = pd.DataFrame(column_data_frame) - column_data_frame["x"] = column_data_frame.index - column_data_frame["y"] = y_label - column_data_frame = column_data_frame.reset_index().drop(columns=["index"]) - column_data_frame = column_data_frame.rename(columns={y_label: "z"}) - column_data_frame = scitex.pd.mv(column_data_frame, "z", -1) - formatted_data_frames.append(column_data_frame) - - return pd.concat(formatted_data_frames, ignore_index=True) diff --git a/src/scitex/pd/_to_xyz.py b/src/scitex/pd/_to_xyz.py deleted file mode 100755 index a603426cd..000000000 --- a/src/scitex/pd/_to_xyz.py +++ /dev/null @@ -1,111 +0,0 @@ -#!/./env/bin/python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-09-28 11:17:22 (ywatanabe)" -# ./src/scitex/pd/_to_xyz.py - -import numpy as np -import pandas as pd - -import scitex - - -def to_xyz(data_frame): - """ - Convert a DataFrame into x, y, z format (long format). - - Transforms a DataFrame from wide format (matrix/heatmap) to long format - where each value becomes a row with x (row index), y (column name), - and z (value) columns. - - Example - ------- - data_frame = pd.DataFrame(...) # Your DataFrame here - out = to_xyz(data_frame) - print(out) - - Parameters - ---------- - data_frame : pandas.DataFrame - The input DataFrame to be converted. - - Returns - ------- - pandas.DataFrame - A DataFrame formatted with columns ['x', 'y', 'z'] - """ - x_name = data_frame.index.name or "x" - y_name = data_frame.columns.name or "y" - - formatted_data_frames = [] - - for column in data_frame.columns: - column_data_frame = data_frame[column] - formatted_data = pd.DataFrame( - { - x_name: column_data_frame.index, - y_name: column, - "z": column_data_frame.values, - } - ) - formatted_data_frames.append(formatted_data) - - result = pd.concat(formatted_data_frames, ignore_index=True) - - # Ensure column order is x, y, z - col_order = [x_name, y_name, "z"] - result = result[col_order] - - return result - - -# def to_xyz(data_frame): -# """ -# Convert a heatmap DataFrame into x, y, z format. - -# Ensure the index and columns are the same, and if either exists, replace with that. - -# Example -# ------- -# data_frame = pd.DataFrame(...) # Your DataFrame here -# out = to_xy(data_frame) -# print(out) - -# Parameters -# ---------- -# data_frame : pandas.DataFrame -# The input DataFrame to be converted. - -# Returns -# ------- -# pandas.DataFrame -# A DataFrame formatted with columns ['x', 'y', 'z'] -# """ -# assert data_frame.shape[0] == data_frame.shape[1] - -# if not data_frame.index.equals(data_frame.columns): - -# if (data_frame.index == np.array(range(len(data_frame.index)))).all(): -# data_frame.columns = data_frame.index -# elif ( -# data_frame.columns == np.array(range(len(data_frame.columns))) -# ).all(): -# data_frame.index = data_frame.columns -# else: -# raise ValueError("Either index or columns must be a range of integers") - -# formatted_data_frames = [] - -# for column in data_frame.columns: -# column_data_frame = data_frame[column] -# y_label = column_data_frame.name -# column_data_frame = pd.DataFrame(column_data_frame) -# column_data_frame["x"] = column_data_frame.index -# column_data_frame["y"] = y_label -# column_data_frame = column_data_frame.reset_index().drop( -# columns=["index"] -# ) -# column_data_frame = column_data_frame.rename(columns={y_label: "z"}) -# column_data_frame = scitex.pd.mv(column_data_frame, "z", -1) -# formatted_data_frames.append(column_data_frame) - -# return pd.concat(formatted_data_frames, ignore_index=True) diff --git a/tests/scitex/pd/test__find_indi.py b/tests/scitex/pd/test__find_indi.py deleted file mode 100644 index 61bdc3871..000000000 --- a/tests/scitex/pd/test__find_indi.py +++ /dev/null @@ -1,480 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-11-05 10:00:00 (ywatanabe)" -# File: ./tests/scitex/pd/test__find_indi.py - -import os -import sys -import tempfile -from unittest.mock import MagicMock, Mock, patch - -import numpy as np -import pandas as pd -import pytest - - -class TestFindIndiBasic: - """Test basic functionality of find_indi.""" - - def test_single_condition_string(self): - """Test finding indices with single string condition.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": ["x", "y", "x", "z"], "B": [1, 2, 3, 4]}) - conditions = {"A": "x"} - result = find_indi(df, conditions) - - assert isinstance(result, list) - assert result == [0, 2] - - def test_single_condition_number(self): - """Test finding indices with single numeric condition.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [1, 2, 3, 2], "B": ["a", "b", "c", "d"]}) - conditions = {"A": 2} - result = find_indi(df, conditions) - - assert result == [1, 3] - - def test_multiple_conditions(self): - """Test finding indices with multiple conditions.""" - from scitex.pd import find_indi - - df = pd.DataFrame( - {"A": [1, 2, 1, 2], "B": ["x", "x", "y", "y"], "C": [10, 20, 30, 40]} - ) - conditions = {"A": 1, "B": "x"} - result = find_indi(df, conditions) - - assert result == [0] - - def test_list_condition(self): - """Test finding indices with list condition.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [1, 2, 3, 4, 5], "B": ["a", "b", "c", "d", "e"]}) - conditions = {"A": [1, 3, 5]} - result = find_indi(df, conditions) - - assert result == [0, 2, 4] - - def test_mixed_conditions(self): - """Test finding indices with mixed single and list conditions.""" - from scitex.pd import find_indi - - df = pd.DataFrame( - { - "A": [1, 2, 3, 1, 2], - "B": ["x", "y", "z", "x", "y"], - "C": [100, 200, 300, 400, 500], - } - ) - conditions = {"A": [1, 2], "B": "x"} - result = find_indi(df, conditions) - - assert result == [0, 3] - - -class TestFindIndiNaNHandling: - """Test NaN handling in find_indi.""" - - def test_nan_in_dataframe(self): - """Test handling NaN values in DataFrame.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [1, 2, np.nan, 4], "B": ["x", "y", "z", "w"]}) - conditions = {"A": 2} - result = find_indi(df, conditions) - - assert result == [1] - - def test_nan_in_condition_single(self): - """Test finding NaN values with single condition.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [1, np.nan, 3, np.nan], "B": ["a", "b", "c", "d"]}) - conditions = {"A": np.nan} - result = find_indi(df, conditions) - - assert result == [1, 3] - - def test_nan_in_condition_list(self): - """Test finding NaN values in list condition.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [1, np.nan, 3, 4, np.nan]}) - conditions = {"A": [1, np.nan]} - result = find_indi(df, conditions) - - assert result == [0, 1, 4] - - def test_none_in_condition(self): - """Test finding None values.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [1, None, 3, None], "B": ["a", "b", "c", "d"]}) - conditions = {"A": None} - result = find_indi(df, conditions) - - assert result == [1, 3] - - def test_pd_na_in_condition(self): - """Test finding pd.NA values.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [1, pd.NA, 3, pd.NA]}, dtype="Int64") - conditions = {"A": pd.NA} - result = find_indi(df, conditions) - - assert result == [1, 3] - - -class TestFindIndiEdgeCases: - """Test edge cases in find_indi.""" - - def test_empty_dataframe(self): - """Test with empty DataFrame.""" - from scitex.pd import find_indi - - df = pd.DataFrame() - conditions = {} - result = find_indi(df, conditions) - - assert result == [] - - def test_empty_conditions(self): - """Test with empty conditions.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [1, 2, 3], "B": ["x", "y", "z"]}) - conditions = {} - result = find_indi(df, conditions) - - assert result == [] - - def test_no_matches(self): - """Test when no rows match conditions.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [1, 2, 3], "B": ["x", "y", "z"]}) - conditions = {"A": 999} - result = find_indi(df, conditions) - - assert result == [] - - def test_all_matches(self): - """Test when all rows match conditions.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [1, 1, 1], "B": ["x", "x", "x"]}) - conditions = {"A": 1, "B": "x"} - result = find_indi(df, conditions) - - assert result == [0, 1, 2] - - def test_custom_index(self): - """Test with custom DataFrame index.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [1, 2, 3], "B": ["x", "y", "z"]}, index=[10, 20, 30]) - conditions = {"A": 2} - result = find_indi(df, conditions) - - assert result == [20] - - -class TestFindIndiErrorHandling: - """Test error handling in find_indi.""" - - def test_column_not_found(self): - """Test KeyError when column not in DataFrame.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [1, 2, 3], "B": ["x", "y", "z"]}) - conditions = {"C": 1} - - with pytest.raises(KeyError, match="Columns not found in DataFrame: \\['C'\\]"): - find_indi(df, conditions) - - def test_multiple_columns_not_found(self): - """Test KeyError with multiple missing columns.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [1, 2, 3]}) - conditions = {"B": 1, "C": 2} - - with pytest.raises(KeyError, match="Columns not found in DataFrame"): - find_indi(df, conditions) - - -class TestFindIndiDataTypes: - """Test find_indi with various data types.""" - - def test_float_values(self): - """Test with float values.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [1.1, 2.2, 3.3, 2.2]}) - conditions = {"A": 2.2} - result = find_indi(df, conditions) - - assert result == [1, 3] - - def test_boolean_values(self): - """Test with boolean values.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [True, False, True, False]}) - conditions = {"A": True} - result = find_indi(df, conditions) - - assert result == [0, 2] - - def test_datetime_values(self): - """Test with datetime values.""" - from scitex.pd import find_indi - - dates = pd.to_datetime(["2021-01-01", "2021-01-02", "2021-01-01"]) - df = pd.DataFrame({"date": dates}) - conditions = {"date": pd.Timestamp("2021-01-01")} - result = find_indi(df, conditions) - - assert result == [0, 2] - - def test_categorical_values(self): - """Test with categorical values.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": pd.Categorical(["cat", "dog", "cat", "bird"])}) - conditions = {"A": "cat"} - result = find_indi(df, conditions) - - assert result == [0, 2] - - -class TestFindIndiComplexScenarios: - """Test complex scenarios with find_indi.""" - - def test_multiple_columns_multiple_values(self): - """Test with multiple columns and multiple values.""" - from scitex.pd import find_indi - - df = pd.DataFrame( - { - "A": [1, 2, 3, 4, 5], - "B": ["x", "y", "z", "x", "y"], - "C": [10, 20, 30, 40, 50], - } - ) - conditions = {"A": [1, 2, 3], "B": ["x", "y"]} - result = find_indi(df, conditions) - - assert result == [0, 1] - - def test_large_dataframe(self): - """Test with large DataFrame.""" - from scitex.pd import find_indi - - n = 10000 - df = pd.DataFrame( - { - "A": np.random.randint(0, 10, n), - "B": np.random.choice(["x", "y", "z"], n), - "C": np.random.rand(n), - } - ) - conditions = {"A": 5, "B": "x"} - result = find_indi(df, conditions) - - # Verify result manually - expected = df[(df["A"] == 5) & (df["B"] == "x")].index.tolist() - assert result == expected - - def test_tuple_condition(self): - """Test with tuple condition (should work like list).""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [1, 2, 3, 4, 5]}) - conditions = {"A": (2, 4)} - result = find_indi(df, conditions) - - assert result == [1, 3] - - def test_mixed_types_in_list(self): - """Test with mixed types in list condition.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [1, "2", 3, "4", 5]}) - conditions = {"A": [1, "2", 3]} - result = find_indi(df, conditions) - - assert result == [0, 1, 2] - - -class TestFindIndiDocumentationExamples: - """Test examples from documentation.""" - - def test_docstring_example(self): - """Test the example from the docstring.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [1, 2, None], "B": ["x", "y", "x"]}) - conditions = {"A": [1, None], "B": "x"} - result = find_indi(df, conditions) - - # Should find rows where A is 1 or None AND B is 'x' - assert result == [0, 2] - - def test_original_commented_example(self): - """Test example from commented code.""" - from scitex.pd import find_indi - - df = pd.DataFrame({"A": [1, 2, 3], "B": ["x", "y", "x"]}) - conditions = {"A": [1, 2], "B": "x"} - result = find_indi(df, conditions) - - # Should find rows where A is 1 or 2 AND B is 'x' - assert result == [0] - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_find_indi.py -# -------------------------------------------------------------------------------- -# #!/usr/bin/env python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-11-05 08:11:05 (ywatanabe)" -# # File: ./scitex_repo/src/scitex/pd/_find_indi.py -# -# from typing import Dict, List, Union -# -# import pandas as pd -# -# -# # def find_indi(df: pd.DataFrame, conditions: Dict[str, Union[str, int, float, List]]) -> pd.Series: -# # """Finds indices of rows that satisfy all given conditions in a DataFrame. -# -# # Example -# # ------- -# # >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': ['x', 'y', 'x']}) -# # >>> conditions = {'A': [1, 2], 'B': 'x'} -# # >>> result = find_indi(df, conditions) -# # >>> print(result) -# # 0 True -# # 1 False -# # 2 False -# # dtype: bool -# -# # Parameters -# # ---------- -# # df : pd.DataFrame -# # Input DataFrame to search in -# # conditions : Dict[str, Union[str, int, float, List]] -# # Dictionary of column names and their target values -# -# # Returns -# # ------- -# # pd.Series -# # Boolean Series indicating which rows satisfy all conditions -# -# # Raises -# # ------ -# # KeyError -# # If any column in conditions is not found in DataFrame -# # """ -# # if not all(col in df.columns for col in conditions): -# # missing_cols = [col for col in conditions if col not in df.columns] -# # raise KeyError(f"Columns not found in DataFrame: {missing_cols}") -# -# # condition_series = [] -# # for key, value in conditions.items(): -# # if isinstance(value, (list, tuple)): -# # condition_series.append(df[key].isin(value)) -# # else: -# # condition_series.append(df[key] == value) -# -# # return pd.concat(condition_series, axis=1).all(axis=1) -# -# -# def find_indi( -# df: pd.DataFrame, conditions: Dict[str, Union[str, int, float, List]] -# ) -> List[int]: -# """Finds indices of rows that satisfy conditions, handling NaN values. -# -# Example -# ------- -# >>> df = pd.DataFrame({'A': [1, 2, None], 'B': ['x', 'y', 'x']}) -# >>> conditions = {'A': [1, None], 'B': 'x'} -# >>> result = find_indi(df, conditions) -# -# Parameters -# ---------- -# df : pd.DataFrame -# Input DataFrame -# conditions : Dict[str, Union[str, int, float, List]] -# Column conditions -# -# Returns -# ------- -# List[int] -# List of integer indices of matching rows -# """ -# if not conditions: -# return [] -# -# if not all(col in df.columns for col in conditions): -# missing_cols = [col for col in conditions if col not in df.columns] -# raise KeyError(f"Columns not found in DataFrame: {missing_cols}") -# -# condition_series = [] -# for key, value in conditions.items(): -# if isinstance(value, (list, tuple)): -# # Handle NaN in lists -# has_na = False -# try: -# # Check for None -# if None in value: -# has_na = True -# # Check for pd.NA (may raise TypeError) -# elif any(v is pd.NA for v in value): -# has_na = True -# # Check for np.nan -# elif any(pd.isna(v) for v in value): -# has_na = True -# except (TypeError, ValueError): -# # If any check fails, try alternative approach -# has_na = any( -# pd.isna(v) if not isinstance(v, str) else False for v in value -# ) -# -# if has_na: -# condition = df[key].isin(value) | df[key].isna() -# else: -# condition = df[key].isin(value) -# else: -# # Handle single NaN value -# if pd.isna(value): -# condition = df[key].isna() -# else: -# condition = df[key] == value -# condition_series.append(condition) -# -# if condition_series: -# mask = pd.concat(condition_series, axis=1).all(axis=1) -# return df.index[mask].tolist() -# else: -# return [] -# -# -# # EOF - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_find_indi.py -# -------------------------------------------------------------------------------- diff --git a/tests/scitex/pd/test__find_pval.py b/tests/scitex/pd/test__find_pval.py deleted file mode 100644 index 36c39b02d..000000000 --- a/tests/scitex/pd/test__find_pval.py +++ /dev/null @@ -1,492 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-11-03 10:00:00 (ywatanabe)" -# File: ./tests/scitex/pd/test__find_pval.py - -import os -import sys -import tempfile -from unittest.mock import MagicMock, Mock, patch - -import numpy as np -import pandas as pd -import pytest - - -class TestFindPvalDataFrame: - """Test find_pval with DataFrame inputs.""" - - def test_single_pval_column(self): - """Test finding single p-value column.""" - from scitex.pd import find_pval - - df = pd.DataFrame({"p_value": [0.05, 0.01], "other": [1, 2]}) - result = find_pval(df, multiple=False) - - assert result == "p_value" - - def test_multiple_pval_columns(self): - """Test finding multiple p-value columns.""" - from scitex.pd import find_pval - - df = pd.DataFrame( - { - "p_value": [0.05, 0.01], - "pval": [0.1, 0.001], - "p-val": [0.2, 0.02], - "other": [1, 2], - } - ) - result = find_pval(df, multiple=True) - - assert isinstance(result, list) - assert set(result) == {"p_value", "pval", "p-val"} - - def test_pvalue_variations(self): - """Test various p-value column name variations.""" - from scitex.pd import find_pval - - df = pd.DataFrame( - { - "pval": [0.1], - "p_val": [0.2], - "p-val": [0.3], - "pvalue": [0.4], - "p_value": [0.5], - "p-value": [0.6], - "Pval": [0.7], - "PVALUE": [0.8], - "P_Value": [0.9], - } - ) - result = find_pval(df, multiple=True) - - assert len(result) == 9 - assert all(col in result for col in df.columns) - - def test_no_pval_columns(self): - """Test when no p-value columns exist.""" - from scitex.pd import find_pval - - df = pd.DataFrame({"alpha": [0.05], "beta": [0.1], "gamma": [1]}) - result = find_pval(df, multiple=False) - - assert result is None - - def test_no_pval_columns_multiple(self): - """Test when no p-value columns exist with multiple=True.""" - from scitex.pd import find_pval - - df = pd.DataFrame({"alpha": [0.05], "beta": [0.1], "gamma": [1]}) - result = find_pval(df, multiple=True) - - assert result == [] - - def test_pval_stars_exclusion(self): - """Test that p-value stars columns are excluded.""" - from scitex.pd import find_pval - - df = pd.DataFrame( - { - "p_value": [0.05], - "pval_stars": ["*"], - "p_value_stars": ["**"], - "pvalstars": ["***"], - } - ) - result = find_pval(df, multiple=True) - - assert result == ["p_value"] - - def test_empty_dataframe(self): - """Test with empty DataFrame.""" - from scitex.pd import find_pval - - df = pd.DataFrame() - result = find_pval(df, multiple=True) - - assert result == [] - - -class TestFindPvalDict: - """Test find_pval with dictionary inputs.""" - - def test_dict_single_match(self): - """Test finding p-value key in dictionary.""" - from scitex.pd import find_pval - - data = {"p_value": 0.05, "coefficient": 1.2, "se": 0.1} - result = find_pval(data, multiple=False) - - assert result == "p_value" - - def test_dict_multiple_matches(self): - """Test finding multiple p-value keys in dictionary.""" - from scitex.pd import find_pval - - data = {"p_value": 0.05, "pval": 0.01, "p-val": 0.02, "coefficient": 1.2} - result = find_pval(data, multiple=True) - - assert set(result) == {"p_value", "pval", "p-val"} - - def test_dict_no_matches(self): - """Test dictionary with no p-value keys.""" - from scitex.pd import find_pval - - data = {"alpha": 0.05, "beta": 0.1, "gamma": 1} - result = find_pval(data, multiple=False) - - assert result is None - - def test_nested_dict(self): - """Test with nested dictionary structure.""" - from scitex.pd import find_pval - - data = {"results": {"p_value": 0.05}, "p_val": 0.01} - result = find_pval(data, multiple=True) - - # Should only find top-level keys - assert result == ["p_val"] - - -class TestFindPvalList: - """Test find_pval with list inputs.""" - - def test_list_of_dicts(self): - """Test list of dictionaries.""" - from scitex.pd import find_pval - - data = [ - {"p_value": 0.05, "coef": 1.2}, - {"p_value": 0.01, "coef": 2.3}, - {"p_value": 0.001, "coef": 3.4}, - ] - result = find_pval(data, multiple=False) - - assert result == "p_value" - - def test_list_of_dicts_multiple_pvals(self): - """Test list of dictionaries with multiple p-value keys.""" - from scitex.pd import find_pval - - data = [ - {"p_value": 0.05, "pval": 0.06, "coef": 1.2}, - {"p_value": 0.01, "pval": 0.02, "coef": 2.3}, - ] - result = find_pval(data, multiple=True) - - assert set(result) == {"p_value", "pval"} - - def test_empty_list(self): - """Test empty list.""" - from scitex.pd import find_pval - - data = [] - result = find_pval(data, multiple=True) - - assert result == [] - - def test_list_of_non_dicts(self): - """Test list of non-dictionary items.""" - from scitex.pd import find_pval - - data = [1, 2, 3, 4] - result = find_pval(data, multiple=False) - - assert result is None - - -class TestFindPvalNumPy: - """Test find_pval with numpy array inputs.""" - - def test_numpy_array_of_dicts(self): - """Test numpy array containing dictionaries.""" - from scitex.pd import find_pval - - data = np.array( - [{"p_value": 0.05, "stat": 2.1}, {"p_value": 0.01, "stat": 3.2}] - ) - result = find_pval(data, multiple=False) - - assert result == "p_value" - - def test_numpy_structured_array(self): - """Test with numpy structured array.""" - from scitex.pd import find_pval - - # Regular numpy arrays don't have column names - data = np.array([1, 2, 3]) - result = find_pval(data, multiple=False) - - assert result is None - - def test_numpy_empty_array(self): - """Test with empty numpy array.""" - from scitex.pd import find_pval - - data = np.array([]) - result = find_pval(data, multiple=True) - - assert result == [] - - -class TestFindPvalEdgeCases: - """Test edge cases and error handling.""" - - def test_case_insensitive(self): - """Test case-insensitive matching.""" - from scitex.pd import find_pval - - df = pd.DataFrame( - {"P_VALUE": [0.05], "Pval": [0.01], "P-Val": [0.02], "PVALUE": [0.03]} - ) - result = find_pval(df, multiple=True) - - assert len(result) == 4 - - def test_numeric_column_names(self): - """Test with numeric column names.""" - from scitex.pd import find_pval - - df = pd.DataFrame({0: [1, 2], 1: [3, 4], "p_value": [0.05, 0.01]}) - result = find_pval(df, multiple=False) - - assert result == "p_value" - - def test_special_characters(self): - """Test column names with special characters.""" - from scitex.pd import find_pval - - df = pd.DataFrame( - {"p.value": [0.05], "p$val": [0.01], "p_value!": [0.02], "normal": [1]} - ) - result = find_pval(df, multiple=True) - - # The regex pattern requires 'p' followed by optional '-' or '_', then 'val' - # So 'p.value' and 'p$val' won't match, but 'p_value!' will - assert "p_value!" in result - assert len(result) == 1 - - def test_invalid_input_type(self): - """Test with invalid input type.""" - from scitex.pd import find_pval - - with pytest.raises(ValueError, match="Input must be a pandas DataFrame"): - find_pval("invalid_input") - - def test_partial_matches(self): - """Test that partial matches work correctly.""" - from scitex.pd import find_pval - - df = pd.DataFrame( - { - "pval_test": [0.05], - "test_pvalue": [0.01], - "my_p_value_column": [0.02], - "not_related": [1], - } - ) - result = find_pval(df, multiple=True) - - assert len(result) == 3 - assert "not_related" not in result - - -class TestFindPvalDocumentation: - """Test examples from documentation.""" - - def test_docstring_example_multiple(self): - """Test the multiple=True example from docstring.""" - from scitex.pd import find_pval - - df = pd.DataFrame( - {"p_value": [0.05, 0.01], "pval": [0.1, 0.001], "other": [1, 2]} - ) - result = find_pval(df) # default multiple=True - - assert set(result) == {"p_value", "pval"} - - def test_docstring_example_single(self): - """Test the multiple=False example from docstring.""" - from scitex.pd import find_pval - - df = pd.DataFrame( - {"p_value": [0.05, 0.01], "pval": [0.1, 0.001], "other": [1, 2]} - ) - result = find_pval(df, multiple=False) - - assert result == "p_value" - - def test_function_alias(self): - """Test that _find_pval_col works directly.""" - from scitex.pd import _find_pval_col - - df = pd.DataFrame({"p_value": [0.05], "data": [10]}) - result = _find_pval_col(df, multiple=False) - - assert result == "p_value" - - -class TestFindPvalIntegration: - """Integration tests with real-world scenarios.""" - - def test_statistical_results_dataframe(self): - """Test with typical statistical results DataFrame.""" - from scitex.pd import find_pval - - df = pd.DataFrame( - { - "variable": ["age", "gender", "treatment"], - "coefficient": [0.5, -0.3, 1.2], - "std_error": [0.1, 0.2, 0.3], - "t_statistic": [5.0, -1.5, 4.0], - "p_value": [0.001, 0.134, 0.002], - "confidence_lower": [0.3, -0.7, 0.6], - "confidence_upper": [0.7, 0.1, 1.8], - } - ) - result = find_pval(df, multiple=False) - - assert result == "p_value" - - def test_multiple_test_results(self): - """Test with multiple test results format.""" - from scitex.pd import find_pval - - results = [ - {"test": "t-test", "statistic": 2.5, "pval": 0.012}, - {"test": "chi-square", "statistic": 5.3, "pval": 0.021}, - {"test": "anova", "statistic": 3.8, "pval": 0.052}, - ] - result = find_pval(results) - - assert result == ["pval"] - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_find_pval.py -# -------------------------------------------------------------------------------- -# #!/usr/bin/env python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-11-03 03:25:00 (ywatanabe)" -# # File: ./scitex_repo/src/scitex/pd/_find_pval.py -# -# #!/usr/bin/env python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-10-06 11:09:07 (ywatanabe)" -# # /home/ywatanabe/proj/_scitex_repo_openhands/src/scitex/stats/_find_pval_col.py -# -# """ -# Functionality: -# - Identifies column name(s) in a DataFrame or keys in other data structures that correspond to p-values -# Input: -# - pandas DataFrame, numpy array, list, or dict -# Output: -# - String or list of strings representing the identified p-value column name(s) or key(s), or None if not found -# Prerequisites: -# - pandas, numpy libraries -# """ -# -# import re -# from typing import Dict, List, Optional, Union -# -# import numpy as np -# import pandas as pd -# -# -# def find_pval( -# data: Union[pd.DataFrame, np.ndarray, List, Dict], multiple: bool = True -# ) -> Union[Optional[str], List[str]]: -# """ -# Find p-value column name(s) or key(s) in various data structures. -# -# Example: -# -------- -# >>> df = pd.DataFrame({'p_value': [0.05, 0.01], 'pval': [0.1, 0.001], 'other': [1, 2]}) -# >>> find_pval(df) -# ['p_value', 'pval'] -# >>> find_pval(df, multiple=False) -# 'p_value' -# -# Parameters: -# ----------- -# data : Union[pd.DataFrame, np.ndarray, List, Dict] -# Data structure to search for p-value column or key -# multiple : bool, optional -# If True, return all matches; if False, return only the first match (default is True) -# -# Returns: -# -------- -# Union[Optional[str], List[str]] -# Name(s) of the column(s) or key(s) that match p-value patterns, or None if not found -# """ -# if isinstance(data, pd.DataFrame): -# return _find_pval_col(data, multiple) -# elif isinstance(data, (np.ndarray, list, dict)): -# return _find_pval(data, multiple) -# else: -# raise ValueError("Input must be a pandas DataFrame, numpy array, list, or dict") -# -# -# def _find_pval( -# data: Union[np.ndarray, List, Dict], multiple: bool -# ) -> Union[Optional[str], List[str]]: -# pattern = re.compile(r"p[-_]?val(ue)?(?!.*stars)", re.IGNORECASE) -# matches = [] -# -# if isinstance(data, dict): -# matches = [key for key in data.keys() if pattern.search(str(key))] -# elif ( -# isinstance(data, (np.ndarray, list)) -# and len(data) > 0 -# and isinstance(data[0], dict) -# ): -# matches = [key for key in data[0].keys() if pattern.search(str(key))] -# -# return matches if multiple else (matches[0] if matches else None) -# -# -# def _find_pval_col( -# df: pd.DataFrame, multiple: bool = False -# ) -> Union[Optional[str], List[str]]: -# """ -# Find p-value column name(s) in a DataFrame. -# -# Example: -# -------- -# >>> df = pd.DataFrame({'p_value': [0.05, 0.01], 'pval': [0.1, 0.001], 'other': [1, 2]}) -# >>> find_pval_col(df) -# ['p_value', 'pval'] -# >>> find_pval_col(df, multiple=False) -# 'p_value' -# -# Parameters: -# ----------- -# df : pd.DataFrame -# DataFrame to search for p-value column(s) -# multiple : bool, optional -# If True, return all matches; if False, return only the first match (default is False) -# -# Returns: -# -------- -# Union[Optional[str], List[str]] -# Name(s) of the column(s) that match p-value patterns, or None if not found -# """ -# pattern = re.compile(r"p[-_]?val(ue)?(?!.*stars)", re.IGNORECASE) -# matches = [col for col in df.columns if pattern.search(str(col))] -# -# return matches if multiple else (matches[0] if matches else None) -# -# -# # EOF - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_find_pval.py -# -------------------------------------------------------------------------------- diff --git a/tests/scitex/pd/test__force_df.py b/tests/scitex/pd/test__force_df.py deleted file mode 100644 index 05dc188a2..000000000 --- a/tests/scitex/pd/test__force_df.py +++ /dev/null @@ -1,533 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2025-04-27 20:00:00 (ywatanabe)" -# File: ./tests/scitex/pd/test__force_df.py - -import os -import sys -import tempfile -from unittest.mock import MagicMock, Mock, patch - -import numpy as np -import pandas as pd -import pytest - - -class TestForceDfBasic: - """Test basic functionality of force_df.""" - - def test_dict_to_dataframe(self): - """Test converting dictionary to DataFrame.""" - from scitex.pd import force_df - - data = {"a": [1, 2, 3], "b": [4, 5, 6]} - result = force_df(data) - - assert isinstance(result, pd.DataFrame) - assert result.shape == (3, 2) - assert list(result.columns) == ["a", "b"] - assert result["a"].tolist() == [1, 2, 3] - assert result["b"].tolist() == [4, 5, 6] - - def test_dataframe_passthrough(self): - """Test that DataFrame is returned unchanged.""" - from scitex.pd import force_df - - df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) - result = force_df(df) - - assert isinstance(result, pd.DataFrame) - pd.testing.assert_frame_equal(result, df) - - def test_series_to_dataframe(self): - """Test converting Series to DataFrame.""" - from scitex.pd import force_df - - series = pd.Series([1, 2, 3], name="data") - result = force_df(series) - - assert isinstance(result, pd.DataFrame) - assert result.shape == (3, 1) - assert result.columns[0] == "data" - assert result["data"].tolist() == [1, 2, 3] - - def test_list_to_dataframe(self): - """Test converting list to DataFrame.""" - from scitex.pd import force_df - - data = [1, 2, 3, 4, 5] - result = force_df(data) - - assert isinstance(result, pd.DataFrame) - assert result.shape == (5, 1) - assert result.columns[0] == "value" - assert result["value"].tolist() == [1, 2, 3, 4, 5] - - def test_tuple_to_dataframe(self): - """Test converting tuple to DataFrame.""" - from scitex.pd import force_df - - data = (10, 20, 30) - result = force_df(data) - - assert isinstance(result, pd.DataFrame) - assert result.shape == (3, 1) - assert result.columns[0] == "value" - assert result["value"].tolist() == [10, 20, 30] - - -class TestForceDfNumPy: - """Test force_df with numpy arrays.""" - - def test_1d_array_to_dataframe(self): - """Test converting 1D numpy array to DataFrame.""" - from scitex.pd import force_df - - arr = np.array([1, 2, 3, 4]) - result = force_df(arr) - - assert isinstance(result, pd.DataFrame) - assert result.shape == (4, 1) - assert result.columns[0] == "value" - assert result["value"].tolist() == [1, 2, 3, 4] - - def test_2d_array_to_dataframe(self): - """Test converting 2D numpy array to DataFrame.""" - from scitex.pd import force_df - - arr = np.array([[1, 2, 3], [4, 5, 6]]) - result = force_df(arr) - - assert isinstance(result, pd.DataFrame) - assert result.shape == (2, 3) - assert list(result.columns) == [0, 1, 2] - assert result[0].tolist() == [1, 4] - assert result[1].tolist() == [2, 5] - assert result[2].tolist() == [3, 6] - - def test_empty_array(self): - """Test with empty numpy array.""" - from scitex.pd import force_df - - arr = np.array([]) - result = force_df(arr) - - assert isinstance(result, pd.DataFrame) - assert result.shape == (0, 1) - - -class TestForceDfMixedLengths: - """Test force_df with mixed-length data.""" - - def test_dict_mixed_lengths_default_filler(self): - """Test dictionary with mixed-length values using default filler.""" - from scitex.pd import force_df - - data = {"a": [1, 2, 3], "b": [4, 5], "c": [6]} - result = force_df(data) - - assert result.shape == (3, 3) - assert result["a"].tolist() == [1, 2, 3] - # NaN values need special comparison - b_values = result["b"].tolist() - assert b_values[0] == 4 - assert b_values[1] == 5 - assert pd.isna(b_values[2]) - assert pd.isna(result["b"].iloc[2]) - assert result["c"].iloc[0] == 6 - assert pd.isna(result["c"].iloc[1]) - assert pd.isna(result["c"].iloc[2]) - - def test_dict_mixed_lengths_custom_filler(self): - """Test dictionary with mixed-length values using custom filler.""" - from scitex.pd import force_df - - data = {"a": [1, 2, 3], "b": [4, 5], "c": [6]} - result = force_df(data, filler=0) - - assert result.shape == (3, 3) - assert result["b"].tolist() == [4, 5, 0] - assert result["c"].tolist() == [6, 0, 0] - - def test_scalar_values_in_dict(self): - """Test dictionary with scalar values.""" - from scitex.pd import force_df - - data = {"a": 1, "b": [2, 3, 4], "c": "hello"} - result = force_df(data) - - assert result.shape == (3, 3) - assert result["a"].iloc[0] == 1 - assert pd.isna(result["a"].iloc[1]) - assert pd.isna(result["a"].iloc[2]) - assert result["b"].tolist() == [2, 3, 4] - assert result["c"].iloc[0] == "hello" - assert pd.isna(result["c"].iloc[1]) - - -class TestForceDfListedSeries: - """Test force_df with list of Series.""" - - def test_list_of_series(self): - """Test that list of Series is handled (though not ideally).""" - from scitex.pd import force_df - - series1 = pd.Series({"a": 1, "b": 2}) - series2 = pd.Series({"a": 3, "b": 4}) - series3 = pd.Series({"a": 5, "b": 6}) - - # The current implementation treats Series as complex objects - # and creates a DataFrame with NaN values - result = force_df([series1, series2, series3]) - - assert isinstance(result, pd.DataFrame) - assert result.shape == (3, 1) - assert result.columns[0] == "value" - # The values are NaN because Series objects aren't handled properly - assert result["value"].isna().all() - - def test_list_of_series_workaround(self): - """Test workaround for list of Series.""" - from scitex.pd import force_df - - series1 = pd.Series({"a": 1, "b": 2}) - series2 = pd.Series({"a": 3, "b": 4}) - - # Workaround: manually convert to dict - data = {"row_0": series1.to_dict(), "row_1": series2.to_dict()} - - # This creates a transposed result - result = force_df(data) - - assert isinstance(result, pd.DataFrame) - # The result will have row_0, row_1 as columns - assert "row_0" in result.columns - assert "row_1" in result.columns - - -class TestForceDfEdgeCases: - """Test edge cases for force_df.""" - - def test_empty_dict(self): - """Test with empty dictionary.""" - from scitex.pd import force_df - - result = force_df({}) - - assert isinstance(result, pd.DataFrame) - assert result.shape == (0, 0) - - def test_nested_structures(self): - """Test with nested structures.""" - from scitex.pd import force_df - - data = {"a": [1, 2], "b": [[3, 4], [5, 6]]} - result = force_df(data) - - assert result.shape == (2, 2) - assert result["a"].tolist() == [1, 2] - assert result["b"].iloc[0] == [3, 4] - assert result["b"].iloc[1] == [5, 6] - - def test_mixed_types(self): - """Test with mixed data types.""" - from scitex.pd import force_df - - data = { - "int": [1, 2, 3], - "float": [1.1, 2.2, 3.3], - "str": ["a", "b", "c"], - "bool": [True, False, True], - "none": [None, None, None], - } - result = force_df(data) - - assert result.shape == (3, 5) - assert result["int"].dtype == "int64" - assert result["float"].dtype == "float64" - assert result["str"].dtype == "object" - assert result["bool"].dtype == "bool" - - def test_single_value_dict(self): - """Test with single-value dictionary.""" - from scitex.pd import force_df - - data = {"a": 42} - result = force_df(data) - - assert result.shape == (1, 1) - assert result["a"].iloc[0] == 42 - - -class TestForceDfSpecialCases: - """Test special cases and behaviors.""" - - def test_series_without_name(self): - """Test Series without name.""" - from scitex.pd import force_df - - series = pd.Series([1, 2, 3]) - result = force_df(series) - - assert isinstance(result, pd.DataFrame) - assert result.shape == (3, 1) - assert result.columns[0] == 0 # Default column name - - def test_dict_with_none_values(self): - """Test dictionary with None values.""" - from scitex.pd import force_df - - data = {"a": None, "b": [1, 2, 3]} - result = force_df(data) - - assert result.shape == (3, 2) - # When None is extended with np.nan filler, it becomes [None, nan, nan] - # but pandas may convert None to nan - assert pd.isna(result["a"].iloc[0]) - assert pd.isna(result["a"].iloc[1]) - assert pd.isna(result["a"].iloc[2]) - - def test_dict_with_string_keys(self): - """Test dictionary with various string keys.""" - from scitex.pd import force_df - - data = { - "column_1": [1, 2], - "Column 2": [3, 4], - "3rdColumn": [5, 6], - "col-4": [7, 8], - } - result = force_df(data) - - assert set(result.columns) == set(data.keys()) - assert result["column_1"].tolist() == [1, 2] - assert result["Column 2"].tolist() == [3, 4] - - def test_custom_filler_types(self): - """Test various custom filler types.""" - from scitex.pd import force_df - - # Test with string filler - data = {"a": [1], "b": [2, 3]} - result = force_df(data, filler="missing") - assert result["a"].iloc[1] == "missing" - - # Test with None filler - data = {"a": [1], "b": [2, 3]} - result = force_df(data, filler=None) - # Pandas may convert None to nan in numeric columns - assert pd.isna(result["a"].iloc[1]) - - # Test with custom object filler - custom_obj = object() - data = {"a": [1], "b": [2, 3]} - result = force_df(data, filler=custom_obj) - assert result["a"].iloc[1] is custom_obj - - -class TestForceDfIntegration: - """Integration tests for force_df.""" - - def test_real_world_scenario(self): - """Test with realistic data scenario.""" - from scitex.pd import force_df - - # Simulating data from different sources with varying lengths - data = { - "experiment_id": [1, 2, 3], - "measurements": [10.5, 20.3], - "status": "completed", - "notes": ["good", "better", "best", "excellent"], - } - - result = force_df(data, filler="N/A") - - assert result.shape == (4, 4) - assert result["experiment_id"].tolist() == [1, 2, 3, "N/A"] - assert result["measurements"].tolist() == [10.5, 20.3, "N/A", "N/A"] - assert result["status"].tolist() == ["completed", "N/A", "N/A", "N/A"] - assert result["notes"].tolist() == ["good", "better", "best", "excellent"] - - def test_chained_operations(self): - """Test force_df in chained operations.""" - from scitex.pd import force_df - - # Start with mixed data - data = {"a": [1, 2], "b": [3, 4, 5]} - - # Convert to DataFrame and perform operations - result = force_df(data) - result = result.fillna(0) # Replace NaN with 0 - result["sum"] = result["a"] + result["b"] - - assert result["sum"].tolist() == [4, 6, 5] - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_force_df.py -# -------------------------------------------------------------------------------- -# #!/usr/bin/env python3 -# # -*- coding: utf-8 -*- -# # Timestamp: "2025-04-27 19:59:11 (ywatanabe)" -# # File: /ssh:sp:/home/ywatanabe/proj/scitex_repo/src/scitex/pd/_force_df.py -# # ---------------------------------------- -# import os -# -# __FILE__ = "./src/scitex/pd/_force_df.py" -# __DIR__ = os.path.dirname(__FILE__) -# # ---------------------------------------- -# -# import numpy as np -# import pandas as pd -# -# from scitex.types import is_listed_X -# -# -# def force_df(data, filler=np.nan): -# """ -# Convert various data types to pandas DataFrame. -# -# Parameters -# ---------- -# data : various -# The data to convert to DataFrame. Can be DataFrame, Series, ndarray, -# list, tuple, dict, scalar value, etc. -# filler : any, optional -# Value to use for filling missing values, by default np.nan -# -# Returns -# ------- -# pd.DataFrame -# Data converted to DataFrame -# -# Examples -# -------- -# >>> import scitex -# >>> import pandas as pd -# >>> import numpy as np -# -# # DataFrame input returns the same DataFrame -# >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]}) -# >>> scitex.pd.force_df(df) is df -# True -# -# # Series input is converted to DataFrame -# >>> series = pd.Series([1, 2, 3], name='test') -# >>> scitex.pd.force_df(series) -# test -# 0 1 -# 1 2 -# 2 3 -# -# # NumPy array input is converted to DataFrame -# >>> arr = np.array([1, 2, 3]) -# >>> scitex.pd.force_df(arr) -# value -# 0 1 -# 1 2 -# 2 3 -# -# # Scalar values are converted to single-value DataFrames -# >>> scitex.pd.force_df(42) -# value -# 0 42 -# -# # Lists and tuples are converted to DataFrame -# >>> scitex.pd.force_df([1, 2, 3]) -# value -# 0 1 -# 1 2 -# 2 3 -# -# # Dictionaries are converted to DataFrame with appropriate handling -# # of different length values -# >>> data = {'A': [1, 2, 3], 'B': [4, 5]} -# >>> scitex.pd.force_df(data) -# A B -# 0 1 4 -# 1 2 5 -# 2 3 NaN -# """ -# # Return None as empty DataFrame -# if data is None: -# return pd.DataFrame() -# -# # Return DataFrame as is -# if isinstance(data, pd.DataFrame): -# return data -# -# # Convert Series to DataFrame -# if isinstance(data, pd.Series): -# return data.to_frame() -# -# # Convert numpy array to DataFrame -# if isinstance(data, np.ndarray): -# # Handle 1D array -# if data.ndim == 1: -# return pd.DataFrame(data, columns=["value"]) -# # Handle 2D array -# elif data.ndim == 2: -# return pd.DataFrame(data) -# # Handle higher dimensional arrays -# else: -# shape = data.shape -# reshaped = data.reshape(shape[0], -1) -# return pd.DataFrame(reshaped) -# -# # Handle scalar values (int, float, str, etc.) -# if isinstance(data, (int, float, str, bool)): -# return pd.DataFrame([data], columns=["value"]) -# -# # Handle lists and tuples -# if isinstance(data, (list, tuple)): -# # Handle list of lists/arrays -> DataFrame -# if len(data) > 0 and isinstance(data[0], (list, tuple, np.ndarray)): -# return pd.DataFrame(data) -# # Handle simple list/tuple -> single column DataFrame -# else: -# return pd.DataFrame(data, columns=["value"]) -# -# # Continue with the original implementation for dictionaries -# if isinstance(data, dict): -# # Original implementation -# permutable_dict = data.copy() -# -# # Get the lengths -# max_len = 0 -# for k, v in permutable_dict.items(): -# # Check if v is an iterable (but not string) or treat as single length otherwise -# if isinstance(v, (str, int, float)) or not hasattr(v, "__len__"): -# length = 1 -# else: -# length = len(v) -# max_len = max(max_len, length) -# -# # Replace with appropriately filled list -# for k, v in permutable_dict.items(): -# if isinstance(v, (str, int, float)) or not hasattr(v, "__len__"): -# permutable_dict[k] = [v] + [filler] * (max_len - 1) -# else: -# permutable_dict[k] = list(v) + [filler] * (max_len - len(v)) -# -# # Puts them into a DataFrame -# return pd.DataFrame(permutable_dict) -# -# # For any other iterable type -# try: -# return pd.DataFrame(list(data), columns=["value"]) -# except: -# raise TypeError(f"Cannot convert object of type {type(data)} to DataFrame") -# -# -# # EOF - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_force_df.py -# -------------------------------------------------------------------------------- diff --git a/tests/scitex/pd/test__from_xyz.py b/tests/scitex/pd/test__from_xyz.py deleted file mode 100644 index a85cdc9d3..000000000 --- a/tests/scitex/pd/test__from_xyz.py +++ /dev/null @@ -1,476 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-11-03 10:30:00 (ywatanabe)" -# File: ./tests/scitex/pd/test__from_xyz.py - -import os -import sys -from unittest.mock import Mock, patch - -import numpy as np -import pandas as pd -import pytest - -# Add the project root to the path -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../../src")) - -from scitex.pd import from_xyz - - -class TestBasicFunctionality: - """Test basic functionality of from_xyz.""" - - def test_simple_xyz_conversion(self): - """Test basic x, y, z conversion to pivot table.""" - data = pd.DataFrame( - {"x": ["A", "B", "C", "A"], "y": ["X", "Y", "Z", "Y"], "z": [1, 2, 3, 4]} - ) - result = from_xyz(data) - - assert isinstance(result, pd.DataFrame) - assert "A" in result.columns - assert "X" in result.index - assert result.loc["Y", "A"] == 4 - assert result.loc["Y", "B"] == 2 - assert result.loc["Z", "C"] == 3 - - def test_custom_column_names(self): - """Test with custom column names.""" - data = pd.DataFrame( - {"col1": ["A", "B", "C"], "col2": ["X", "Y", "Z"], "values": [10, 20, 30]} - ) - result = from_xyz(data, x="col1", y="col2", z="values") - - assert result.loc["X", "A"] == 10 - assert result.loc["Y", "B"] == 20 - assert result.loc["Z", "C"] == 30 - - def test_missing_values_filled_with_zero(self): - """Test that missing values are filled with 0.""" - data = pd.DataFrame({"x": ["A", "B"], "y": ["X", "Y"], "z": [1, 2]}) - result = from_xyz(data) - - # Check that non-existent combinations are 0 - assert result.loc["X", "B"] == 0 - assert result.loc["Y", "A"] == 0 - - def test_duplicate_xy_pairs(self): - """Test handling of duplicate x,y pairs (uses first value).""" - data = pd.DataFrame( - {"x": ["A", "A", "A"], "y": ["X", "X", "X"], "z": [1, 2, 3]} - ) - result = from_xyz(data) - - # Should use first value due to aggfunc='first' - assert result.loc["X", "A"] == 1 - - -class TestSquareMatrix: - """Test square matrix functionality.""" - - def test_square_false_default(self): - """Test that square=False produces non-square matrix by default.""" - data = pd.DataFrame( - {"x": ["A", "B", "C"], "y": ["X", "Y", "Y"], "z": [1, 2, 3]} - ) - result = from_xyz(data, square=False) - - # Should have 2 rows (X, Y) and 3 columns (A, B, C) - assert result.shape == (2, 3) - assert list(result.index) == ["X", "Y"] - assert list(result.columns) == ["A", "B", "C"] - - def test_square_true_creates_square_matrix(self): - """Test that square=True creates a square matrix.""" - data = pd.DataFrame( - {"x": ["A", "B", "C"], "y": ["X", "Y", "Y"], "z": [1, 2, 3]} - ) - result = from_xyz(data, square=True) - - # Should create square matrix with all unique labels - all_labels = ["A", "B", "C", "X", "Y"] - assert result.shape == (5, 5) - assert list(result.index) == all_labels - assert list(result.columns) == all_labels - - def test_square_with_identical_labels(self): - """Test square matrix when x and y have same labels.""" - data = pd.DataFrame( - {"x": ["A", "B", "C", "A"], "y": ["B", "C", "A", "C"], "z": [1, 2, 3, 4]} - ) - result = from_xyz(data, square=True) - - # Should be 3x3 matrix - assert result.shape == (3, 3) - assert set(result.index) == set(result.columns) == {"A", "B", "C"} - assert result.loc["B", "A"] == 1 - assert result.loc["C", "B"] == 2 - assert result.loc["A", "C"] == 3 - assert result.loc["C", "A"] == 4 - - -class TestDataTypes: - """Test handling of different data types.""" - - def test_numeric_labels(self): - """Test with numeric x and y labels.""" - data = pd.DataFrame( - {"x": [1, 2, 3, 1], "y": [10, 20, 30, 20], "z": [0.1, 0.2, 0.3, 0.4]} - ) - result = from_xyz(data) - - assert result.loc[20, 1] == 0.4 - assert result.loc[20, 2] == 0.2 - assert result.loc[30, 3] == 0.3 - - def test_mixed_types_labels(self): - """Test with mixed type labels.""" - data = pd.DataFrame( - { - "x": [1, "B", 3.14, 1], - "y": ["alpha", "beta", "gamma", "beta"], - "z": [10, 20, 30, 40], - } - ) - result = from_xyz(data) - - assert result.loc["beta", 1] == 40 - assert result.loc["beta", "B"] == 20 - assert result.loc["gamma", 3.14] == 30 - - def test_float_z_values(self): - """Test with float z values.""" - data = pd.DataFrame( - {"x": ["A", "B", "C"], "y": ["X", "Y", "Z"], "z": [1.5, 2.7, 3.9]} - ) - result = from_xyz(data) - - assert result.loc["X", "A"] == 1.5 - assert result.loc["Y", "B"] == 2.7 - assert result.loc["Z", "C"] == 3.9 - - def test_string_z_values(self): - """Test with string z values.""" - data = pd.DataFrame({"x": ["A", "B"], "y": ["X", "Y"], "z": ["high", "low"]}) - result = from_xyz(data) - - assert result.loc["X", "A"] == "high" - assert result.loc["Y", "B"] == "low" - # Missing values should be 0, not '0' - assert result.loc["X", "B"] == 0 - - -class TestEdgeCases: - """Test edge cases and error conditions.""" - - def test_empty_dataframe(self): - """Test with empty DataFrame.""" - data = pd.DataFrame({"x": [], "y": [], "z": []}) - result = from_xyz(data) - - assert isinstance(result, pd.DataFrame) - assert result.empty - - def test_single_row(self): - """Test with single row.""" - data = pd.DataFrame({"x": ["A"], "y": ["X"], "z": [42]}) - result = from_xyz(data) - - assert result.shape == (1, 1) - assert result.loc["X", "A"] == 42 - - def test_missing_columns(self): - """Test error when required columns are missing.""" - data = pd.DataFrame({"a": [1], "b": [2]}) - - with pytest.raises(KeyError): - from_xyz(data) # Should fail looking for 'x', 'y', 'z' - - def test_nan_values(self): - """Test handling of NaN values.""" - data = pd.DataFrame( - {"x": ["A", "B", "C"], "y": ["X", "Y", "Z"], "z": [1, np.nan, 3]} - ) - result = from_xyz(data) - - # NaN values are dropped by pivot_table, so row Y and column B don't exist - assert result.shape == (2, 2) # Only X,Z rows and A,C columns - assert result.loc["X", "A"] == 1 - assert result.loc["Z", "C"] == 3 - # Check that Y row and B column don't exist - assert "Y" not in result.index - assert "B" not in result.columns - - def test_none_in_labels(self): - """Test with None in x or y labels.""" - data = pd.DataFrame( - {"x": ["A", None, "C"], "y": ["X", "Y", None], "z": [1, 2, 3]} - ) - result = from_xyz(data) - - # None values are dropped by pivot_table - assert result.shape == (1, 1) # Only X,A remains - assert result.loc["X", "A"] == 1 - # Check that rows/columns with None are excluded - assert None not in result.index - assert None not in result.columns - assert "Y" not in result.index # Row with None in x is dropped - assert "C" not in result.columns # Column with None in y is dropped - - -class TestAggregation: - """Test aggregation behavior.""" - - def test_first_aggregation(self): - """Test that 'first' aggregation is used.""" - data = pd.DataFrame( - {"x": ["A", "A", "A"], "y": ["X", "X", "X"], "z": [1, 2, 3]} - ) - result = from_xyz(data) - - # Should take first value - assert result.loc["X", "A"] == 1 - - def test_multiple_duplicates(self): - """Test with multiple duplicate x,y pairs.""" - data = pd.DataFrame( - { - "x": ["A", "B", "A", "B", "A"], - "y": ["X", "Y", "X", "Y", "X"], - "z": [1, 2, 3, 4, 5], - } - ) - result = from_xyz(data) - - assert result.loc["X", "A"] == 1 # First occurrence - assert result.loc["Y", "B"] == 2 # First occurrence - - def test_order_preservation(self): - """Test that order of first occurrence is preserved.""" - data = pd.DataFrame( - {"x": ["C", "B", "A"], "y": ["Z", "Y", "X"], "z": [3, 2, 1]} - ) - result = from_xyz(data) - - # Columns and index should be sorted - assert list(result.columns) == ["A", "B", "C"] - assert list(result.index) == ["X", "Y", "Z"] - - -class TestRealWorldScenarios: - """Test real-world use cases.""" - - def test_statistical_pvalues_matrix(self): - """Test creating p-value matrix from statistical tests.""" - data = pd.DataFrame( - { - "x": ["gene1", "gene2", "gene3", "gene1", "gene2"], - "y": [ - "condition1", - "condition1", - "condition1", - "condition2", - "condition2", - ], - "z": [0.01, 0.05, 0.001, 0.1, 0.02], - } - ) - result = from_xyz(data) - - assert result.shape == (2, 3) - assert result.loc["condition1", "gene1"] == 0.01 - assert result.loc["condition1", "gene3"] == 0.001 - assert result.loc["condition2", "gene2"] == 0.02 - assert result.loc["condition2", "gene3"] == 0 # Missing combination - - def test_correlation_matrix_construction(self): - """Test constructing correlation matrix.""" - # Upper triangle of correlation matrix - data = pd.DataFrame( - { - "x": ["A", "A", "A", "B", "B", "C"], - "y": ["A", "B", "C", "B", "C", "C"], - "z": [1.0, 0.8, 0.6, 1.0, 0.7, 1.0], - } - ) - result = from_xyz(data, square=True) - - # Should create symmetric matrix - assert result.shape == (3, 3) - assert result.loc["A", "A"] == 1.0 - assert result.loc["B", "A"] == 0.8 # From A-B pair - assert result.loc["C", "B"] == 0.7 # From B-C pair - - def test_contingency_table(self): - """Test creating contingency table.""" - data = pd.DataFrame( - { - "x": ["Yes", "No", "Yes", "No", "Yes"], - "y": ["Group1", "Group1", "Group2", "Group2", "Group1"], - "z": [15, 10, 20, 5, 5], # counts - } - ) - result = from_xyz(data) - - assert result.loc["Group1", "Yes"] == 15 - assert result.loc["Group1", "No"] == 10 - assert result.loc["Group2", "Yes"] == 20 - assert result.loc["Group2", "No"] == 5 - - -class TestDocstringExample: - """Test the example from the docstring.""" - - def test_docstring_example(self): - """Test the exact example from the docstring.""" - data = pd.DataFrame( - { - "col1": ["A", "B", "C", "A"], - "col2": ["X", "Y", "Z", "Y"], - "p_val": [0.01, 0.05, 0.001, 0.1], - } - ) - data = data.rename(columns={"col1": "x", "col2": "y", "p_val": "z"}) - result = from_xyz(data) - - assert result.loc["X", "A"] == 0.01 - assert result.loc["Y", "B"] == 0.05 - assert result.loc["Z", "C"] == 0.001 - assert result.loc["Y", "A"] == 0.1 - - # Check filled values - assert result.loc["X", "B"] == 0 - assert result.loc["X", "C"] == 0 - - -class TestLargeDatasets: - """Test with larger datasets.""" - - def test_large_sparse_matrix(self): - """Test with large sparse data.""" - # Create sparse data - np.random.seed(42) - n_points = 1000 - x_vals = np.random.choice(list("ABCDEFGHIJ"), n_points) - y_vals = np.random.choice(list("KLMNOPQRST"), n_points) - z_vals = np.random.rand(n_points) - - data = pd.DataFrame({"x": x_vals, "y": y_vals, "z": z_vals}) - result = from_xyz(data) - - assert result.shape == (10, 10) # 10 unique x and y values - assert (result >= 0).all().all() # All values non-negative - assert (result <= 1).all().all() # All values <= 1 (including fills) - - def test_performance_with_categories(self): - """Test performance with categorical data.""" - # Using categories can improve performance - # Create data where all combinations exist - x_vals = [] - y_vals = [] - z_vals = [] - for x in ["A", "B", "C"]: - for y in ["X", "Y", "Z"]: - x_vals.extend([x] * 100) - y_vals.extend([y] * 100) - z_vals.extend(np.random.rand(100)) - - data = pd.DataFrame( - { - "x": pd.Categorical(x_vals), - "y": pd.Categorical(y_vals), - "z": z_vals, - } - ) - result = from_xyz(data) - - assert result.shape == (3, 3) - # All positions should have values (due to all combinations being present) - assert (result != 0).all().all() - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_from_xyz.py -# -------------------------------------------------------------------------------- -# #!/usr/bin/env python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-09-26 07:22:18 (ywatanabe)" -# # /home/ywatanabe/proj/_scitex_repo_openhands/src/scitex/pd/_from_xyz.py -# -# import pandas as pd -# import numpy as np -# -# -# def from_xyz(data_frame, x=None, y=None, z=None, square=False): -# """ -# Convert a DataFrame with 'x', 'y', 'z' format into a heatmap DataFrame. -# -# Example -# ------- -# import pandas as pd -# data = pd.DataFrame({ -# 'col1': ['A', 'B', 'C', 'A'], -# 'col2': ['X', 'Y', 'Z', 'Y'], -# 'p_val': [0.01, 0.05, 0.001, 0.1] -# }) -# data = data.rename(columns={"col1": "x", "col2": "y", "p_val": "z"}) -# result = from_xyz(data) -# print(result) -# -# Parameters -# ---------- -# data_frame : pandas.DataFrame -# Input DataFrame with columns for x, y, and z values. -# x : str, optional -# Name of the column to use as x-axis. Defaults to 'x'. -# y : str, optional -# Name of the column to use as y-axis. Defaults to 'y'. -# z : str, optional -# Name of the column to use as z-values. Defaults to 'z'. -# square : bool, optional -# If True, force the output to be a square matrix. Defaults to False. -# -# Returns -# ------- -# pandas.DataFrame -# A DataFrame in heatmap/pivot format. -# """ -# x = x or "x" -# y = y or "y" -# z = z or "z" -# -# heatmap = pd.pivot_table(data_frame, values=z, index=y, columns=x, aggfunc="first") -# -# if square: -# # Make it square by including all unique labels -# all_labels = sorted(set(heatmap.index) | set(heatmap.columns)) -# heatmap = heatmap.reindex(index=all_labels, columns=all_labels) -# -# heatmap = heatmap.fillna(0) -# -# return heatmap -# -# -# if __name__ == "__main__": -# np.random.seed(42) -# stats = pd.DataFrame( -# { -# "col1": np.random.choice(["A", "B", "C"], 100), -# "col2": np.random.choice(["X", "Y", "Z"], 100), -# "p_val": np.random.rand(100), -# } -# ) -# stats = stats.rename(columns={"col1": "x", "col2": "y", "p_val": "z"}) -# result = from_xyz(stats) -# print(result) - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_from_xyz.py -# -------------------------------------------------------------------------------- diff --git a/tests/scitex/pd/test__get_unique.py b/tests/scitex/pd/test__get_unique.py deleted file mode 100644 index 97e5e0c3e..000000000 --- a/tests/scitex/pd/test__get_unique.py +++ /dev/null @@ -1,116 +0,0 @@ -# Add your tests here - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_get_unique.py -# -------------------------------------------------------------------------------- -# #!/usr/bin/env python3 -# # -*- coding: utf-8 -*- -# # Timestamp: "2025-09-18 18:42:11 (ywatanabe)" -# # File: /ssh:sp:/home/ywatanabe/proj/scitex_repo/src/scitex/pd/_get_unique.py -# # ---------------------------------------- -# from __future__ import annotations -# import os -# -# __FILE__ = __file__ -# __DIR__ = os.path.dirname(__FILE__) -# # ---------------------------------------- -# -# """ -# Extract unique values from DataFrame columns. -# """ -# -# from typing import Any, Optional -# -# import pandas as pd -# -# -# def get_unique( -# df: pd.DataFrame, -# column: str, -# default: Optional[Any] = None, -# raise_on_multiple: bool = False, -# ) -> Any: -# """Get value from column if it contains a unique value. -# -# Args: -# df: DataFrame to extract from -# column: Column name to check -# default: Default value if column doesn't exist or has multiple unique values -# raise_on_multiple: If True, raise ValueError when multiple unique values exist -# -# Returns: -# The unique value if exactly one exists, otherwise default value -# -# Examples: -# >>> import pandas as pd -# >>> df = pd.DataFrame({'patient_id': ['P01', 'P01', 'P01']}) -# >>> get_unique(df, 'patient_id') -# 'P01' -# -# >>> df = pd.DataFrame({'patient_id': ['P01', 'P02']}) -# >>> get_unique(df, 'patient_id', default='Unknown') -# 'Unknown' -# -# >>> # Raise error on multiple values -# >>> get_unique(df, 'patient_id', raise_on_multiple=True) -# ValueError: Column 'patient_id' has 2 unique values: ['P01', 'P02'] -# """ -# if column not in df.columns: -# if raise_on_multiple: -# raise KeyError(f"Column '{column}' not found in DataFrame") -# return default -# -# unique_values = df[column].unique() -# -# if len(unique_values) == 1: -# return unique_values[0] -# -# if len(unique_values) > 1 and raise_on_multiple: -# raise ValueError( -# f"Column '{column}' has {len(unique_values)} unique values: " -# f"{list(unique_values[:5])}" -# ) -# -# return default -# -# -# if __name__ == "__main__": -# # Test the function -# import pandas as pd -# -# # Test case 1: Unique value -# df1 = pd.DataFrame({"patient_id": ["P01", "P01", "P01"]}) -# assert get_unique(df1, "patient_id") == "P01" -# print("✓ Test 1 passed: Unique value extracted") -# -# # Test case 2: Multiple values with default -# df2 = pd.DataFrame({"patient_id": ["P01", "P02"]}) -# assert get_unique(df2, "patient_id", default="Unknown") == "Unknown" -# print("✓ Test 2 passed: Default returned for multiple values") -# -# # Test case 3: Missing column -# assert get_unique(df1, "missing_col", default="N/A") == "N/A" -# print("✓ Test 3 passed: Default returned for missing column") -# -# # Test case 4: Raise on multiple -# try: -# get_unique(df2, "patient_id", raise_on_multiple=True) -# assert False, "Should have raised ValueError" -# except ValueError as e: -# assert "has 2 unique values" in str(e) -# print("✓ Test 4 passed: ValueError raised for multiple values") -# -# print("\nAll tests passed!") -# -# # EOF - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_get_unique.py -# -------------------------------------------------------------------------------- diff --git a/tests/scitex/pd/test__ignore_SettingWithCopyWarning.py b/tests/scitex/pd/test__ignore_SettingWithCopyWarning.py deleted file mode 100644 index 7b5a1ef0c..000000000 --- a/tests/scitex/pd/test__ignore_SettingWithCopyWarning.py +++ /dev/null @@ -1,413 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-11-05 08:00:00 (ywatanabe)" -# File: ./tests/scitex/pd/test__ignore_SettingWithCopyWarning.py - -import os -import sys -import tempfile -import warnings -from unittest.mock import MagicMock, Mock, patch - -import numpy as np -import pandas as pd -import pytest - - -class TestIgnoreSettingWithCopyWarningBasic: - """Test basic functionality of ignore_setting_with_copy_warning.""" - - def test_suppress_warning_on_slice_assignment(self): - """Test that SettingWithCopyWarning is suppressed during slice assignment.""" - from scitex.pd import ignore_setting_with_copy_warning - - # Create a DataFrame and a view that would normally trigger warning - df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - df_view = df[df["A"] > 1] - - # This would normally trigger SettingWithCopyWarning - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - with ignore_setting_with_copy_warning(): - df_view["B"] = 99 # This should not produce warning - - # Check no warnings were raised - setting_warnings = [ - warning for warning in w if "SettingWithCopy" in str(warning.category) - ] - assert len(setting_warnings) == 0 - - def test_warning_raised_without_context_manager(self): - """Test that warning is raised when not using context manager.""" - # Create a DataFrame and a view that triggers warning - df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - df_view = df[df["A"] > 1] - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - try: - # This should trigger SettingWithCopyWarning - df_view["B"] = 99 - except Exception: - # Some pandas versions might raise, others just warn - pass - - # In many cases, the warning is raised - # Note: behavior may vary by pandas version - - def test_loc_assignment_with_context_manager(self): - """Test .loc assignment with context manager.""" - from scitex.pd import ignore_setting_with_copy_warning - - df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - subset = df[["A"]] # This creates a view - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - with ignore_setting_with_copy_warning(): - subset.loc[:, "A"] = 100 - - setting_warnings = [ - warning for warning in w if "SettingWithCopy" in str(warning.category) - ] - assert len(setting_warnings) == 0 - - -class TestBackwardCompatibility: - """Test backward compatibility with old function name.""" - - def test_old_function_name_works(self): - """Test that ignore_SettingWithCopyWarning (old name) still works.""" - from scitex.pd import ignore_SettingWithCopyWarning - - df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - df_view = df[df["A"] > 1] - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - # Use old function name - with ignore_SettingWithCopyWarning(): - df_view["B"] = 99 - - setting_warnings = [ - warning for warning in w if "SettingWithCopy" in str(warning.category) - ] - assert len(setting_warnings) == 0 - - def test_both_names_are_same_function(self): - """Test that both function names refer to the same function.""" - from scitex.pd import ( - ignore_setting_with_copy_warning, - ignore_SettingWithCopyWarning, - ) - - # They should be the same object - assert ignore_setting_with_copy_warning is ignore_SettingWithCopyWarning - - -class TestComplexScenarios: - """Test complex DataFrame manipulation scenarios.""" - - def test_chained_indexing(self): - """Test suppression with chained indexing.""" - from scitex.pd import ignore_setting_with_copy_warning - - df = pd.DataFrame( - {"A": [1, 2, 3, 4], "B": [5, 6, 7, 8], "C": ["x", "y", "z", "w"]} - ) - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - with ignore_setting_with_copy_warning(): - # Chained indexing that would normally warn - df[df["A"] > 2]["B"] = 999 - - setting_warnings = [ - warning for warning in w if "SettingWithCopy" in str(warning.category) - ] - assert len(setting_warnings) == 0 - - def test_multiple_operations(self): - """Test multiple operations within context manager.""" - from scitex.pd import ignore_setting_with_copy_warning - - df = pd.DataFrame({"A": range(10), "B": range(10, 20), "C": range(20, 30)}) - - view1 = df[df["A"] < 5] - view2 = df[["B", "C"]] - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - with ignore_setting_with_copy_warning(): - view1["B"] = -1 - view2["C"] = -2 - view1.loc[:, "C"] = -3 - - setting_warnings = [ - warning for warning in w if "SettingWithCopy" in str(warning.category) - ] - assert len(setting_warnings) == 0 - - def test_nested_dataframes(self): - """Test with nested DataFrame operations.""" - from scitex.pd import ignore_setting_with_copy_warning - - df = pd.DataFrame({"group": ["A", "A", "B", "B"], "value": [1, 2, 3, 4]}) - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - with ignore_setting_with_copy_warning(): - for group in df["group"].unique(): - group_df = df[df["group"] == group] - group_df["value"] = group_df["value"] * 10 - - setting_warnings = [ - warning for warning in w if "SettingWithCopy" in str(warning.category) - ] - assert len(setting_warnings) == 0 - - -class TestWarningRestoration: - """Test that warning settings are properly restored.""" - - def test_warnings_restored_after_context(self): - """Test that warning filters are restored after context manager exits.""" - from scitex.pd import ignore_setting_with_copy_warning - - # Get initial warning filters - initial_filters = warnings.filters.copy() - - # Use context manager - with ignore_setting_with_copy_warning(): - # Inside context, SettingWithCopyWarning should be ignored - pass - - # After context, filters should be restored - # Note: exact comparison might fail due to internal changes - # but the important thing is warnings work normally again - - # Test that we can still catch warnings after - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - warnings.warn("Test warning", UserWarning) - - assert len(w) == 1 - assert issubclass(w[0].category, UserWarning) - - def test_exception_in_context_restores_warnings(self): - """Test that warnings are restored even if exception occurs.""" - from scitex.pd import ignore_setting_with_copy_warning - - with pytest.raises(ValueError): - with ignore_setting_with_copy_warning(): - raise ValueError("Test exception") - - # Warnings should still work normally after exception - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - warnings.warn("Test warning", UserWarning) - - assert len(w) == 1 - - -class TestEdgeCases: - """Test edge cases and special scenarios.""" - - def test_empty_context(self): - """Test context manager with no operations.""" - from scitex.pd import ignore_setting_with_copy_warning - - # Should not raise any errors - with ignore_setting_with_copy_warning(): - pass - - def test_non_pandas_operations(self): - """Test that non-pandas operations work normally.""" - from scitex.pd import ignore_setting_with_copy_warning - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - with ignore_setting_with_copy_warning(): - # Regular numpy operations - arr = np.array([1, 2, 3]) - arr[0] = 999 - - # Regular Python operations - lst = [1, 2, 3] - lst[0] = 999 - - # Other warnings should still work - warnings.warn("Test warning", UserWarning) - - # Should have the UserWarning but no SettingWithCopyWarning - assert len(w) == 1 - assert issubclass(w[0].category, UserWarning) - - def test_multiple_context_managers(self): - """Test using multiple context managers.""" - from scitex.pd import ignore_setting_with_copy_warning - - df1 = pd.DataFrame({"A": [1, 2, 3]}) - df2 = pd.DataFrame({"B": [4, 5, 6]}) - - view1 = df1[df1["A"] > 1] - view2 = df2[df2["B"] < 6] - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - with ignore_setting_with_copy_warning(): - view1["A"] = 10 - - with ignore_setting_with_copy_warning(): - view2["B"] = 20 - - setting_warnings = [ - warning for warning in w if "SettingWithCopy" in str(warning.category) - ] - assert len(setting_warnings) == 0 - - -class TestRealWorldUsage: - """Test real-world usage patterns.""" - - def test_data_cleaning_workflow(self): - """Test typical data cleaning workflow.""" - from scitex.pd import ignore_setting_with_copy_warning - - # Create sample data - df = pd.DataFrame( - { - "id": range(100), - "value": np.random.randn(100), - "category": np.random.choice(["A", "B", "C"], 100), - } - ) - - # Filter data - df_filtered = df[df["value"] > 0] - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - with ignore_setting_with_copy_warning(): - # Clean data without warnings - df_filtered["value"] = df_filtered["value"].round(2) - df_filtered["processed"] = True - df_filtered.loc[df_filtered["category"] == "A", "special"] = "yes" - - setting_warnings = [ - warning for warning in w if "SettingWithCopy" in str(warning.category) - ] - assert len(setting_warnings) == 0 - - def test_iterative_updates(self): - """Test iterative DataFrame updates.""" - from scitex.pd import ignore_setting_with_copy_warning - - df = pd.DataFrame( - { - "date": pd.date_range("2023-01-01", periods=30), - "value": np.random.randn(30), - } - ) - - # Create view - january = df[df["date"].dt.month == 1] - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - with ignore_setting_with_copy_warning(): - # Update values iteratively - for i in range(len(january)): - if january.iloc[i]["value"] < 0: - january.iloc[i, january.columns.get_loc("value")] = 0 - - setting_warnings = [ - warning for warning in w if "SettingWithCopy" in str(warning.category) - ] - assert len(setting_warnings) == 0 - - -class TestDocstringExample: - """Test the example from the docstring.""" - - def test_docstring_example(self): - """Test exact example from docstring.""" - from scitex.pd import ignore_setting_with_copy_warning - - # Create a situation that would trigger warning - df = pd.DataFrame({"column": [1, 2, 3], "other": [4, 5, 6]}) - df_subset = df[df["column"] > 1] - new_values = [10, 20] - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - - # Example from docstring - with ignore_setting_with_copy_warning(): - df_subset["column"] = new_values # No warning will be shown - - setting_warnings = [ - warning for warning in w if "SettingWithCopy" in str(warning.category) - ] - assert len(setting_warnings) == 0 - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_ignore_SettingWithCopyWarning.py -# -------------------------------------------------------------------------------- -# #!/usr/bin/env python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-11-05 07:35:30 (ywatanabe)" -# # File: ./scitex_repo/src/scitex/pd/_ignore_.py -# -# import warnings -# from contextlib import contextmanager -# -# -# @contextmanager -# def ignore_setting_with_copy_warning(): -# """ -# Context manager to temporarily ignore pandas SettingWithCopyWarning. -# -# Example -# ------- -# >>> with ignore_SettingWithCopyWarning(): -# ... df['column'] = new_values # No warning will be shown -# """ -# try: -# from pandas.errors import SettingWithCopyWarning -# except ImportError: -# from pandas.core.common import SettingWithCopyWarning -# -# # Save current warning filters -# with warnings.catch_warnings(): -# warnings.simplefilter(action="ignore", category=SettingWithCopyWarning) -# yield -# -# -# # Backward compatibility -# ignore_SettingWithCopyWarning = ignore_setting_with_copy_warning # Deprecated -# -# # EOF - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_ignore_SettingWithCopyWarning.py -# -------------------------------------------------------------------------------- diff --git a/tests/scitex/pd/test__melt_cols.py b/tests/scitex/pd/test__melt_cols.py deleted file mode 100644 index 199d70124..000000000 --- a/tests/scitex/pd/test__melt_cols.py +++ /dev/null @@ -1,513 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-11-03 10:45:00 (ywatanabe)" -# File: ./tests/scitex/pd/test__melt_cols.py - -import os -import sys -from unittest.mock import Mock, patch - -import numpy as np -import pandas as pd -import pytest - -# Add the project root to the path -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../../src")) - -from scitex.pd import melt_cols - - -class TestBasicFunctionality: - """Test basic functionality of melt_cols.""" - - def test_simple_melt(self): - """Test basic melting of columns.""" - df = pd.DataFrame( - { - "id": [1, 2, 3], - "name": ["A", "B", "C"], - "score_1": [10, 20, 30], - "score_2": [15, 25, 35], - } - ) - - result = melt_cols(df, cols=["score_1", "score_2"]) - - assert len(result) == 6 # 3 rows * 2 columns - assert "variable" in result.columns - assert "value" in result.columns - assert "id" in result.columns - assert "name" in result.columns - - # Check first few rows - assert result.iloc[0]["id"] == 1 - assert result.iloc[0]["name"] == "A" - assert result.iloc[0]["variable"] == "score_1" - assert result.iloc[0]["value"] == 10 - - def test_single_column_melt(self): - """Test melting a single column.""" - df = pd.DataFrame({"id": [1, 2], "value": [100, 200]}) - - result = melt_cols(df, cols=["value"]) - - assert len(result) == 2 - assert result["variable"].unique() == ["value"] - # When melting a column named "value", the melted values are in "melted_value" - assert list(result["melted_value"]) == [100, 200] - assert list(result["id"]) == [1, 2] - - def test_multiple_id_columns(self): - """Test with multiple identifier columns.""" - df = pd.DataFrame( - { - "year": [2020, 2021, 2022], - "month": [1, 2, 3], - "category": ["A", "B", "C"], - "sales": [100, 200, 300], - "costs": [50, 100, 150], - } - ) - - result = melt_cols(df, cols=["sales", "costs"]) - - assert len(result) == 6 - # All identifier columns should be preserved - assert "year" in result.columns - assert "month" in result.columns - assert "category" in result.columns - - # Check data integrity - sales_rows = result[result["variable"] == "sales"] - assert list(sales_rows["value"]) == [100, 200, 300] - - -class TestIdColumnParameter: - """Test id_columns parameter functionality.""" - - def test_explicit_id_columns(self): - """Test with explicitly specified id columns.""" - df = pd.DataFrame( - { - "id": [1, 2], - "name": ["A", "B"], - "extra": ["X", "Y"], - "val1": [10, 20], - "val2": [30, 40], - } - ) - - result = melt_cols(df, cols=["val1", "val2"], id_columns=["id", "name"]) - - # Should only have specified id columns - assert "id" in result.columns - assert "name" in result.columns - assert "extra" not in result.columns - assert len(result) == 4 - - def test_empty_id_columns(self): - """Test with empty id columns list.""" - df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}) - - result = melt_cols(df, cols=["a", "b"], id_columns=[]) - - # Should only have variable and value columns - assert set(result.columns) == {"variable", "value"} - assert len(result) == 4 - - def test_auto_id_columns(self): - """Test automatic detection of id columns.""" - df = pd.DataFrame( - { - "id": [1, 2, 3], - "group": ["A", "B", "C"], - "metric1": [10, 20, 30], - "metric2": [40, 50, 60], - "metric3": [70, 80, 90], - } - ) - - result = melt_cols(df, cols=["metric1", "metric2", "metric3"]) - - # Should automatically use non-melted columns as id - assert "id" in result.columns - assert "group" in result.columns - assert len(result) == 9 # 3 rows * 3 metrics - - -class TestDataTypes: - """Test handling of different data types.""" - - def test_mixed_data_types(self): - """Test with mixed data types in melted columns.""" - df = pd.DataFrame( - { - "id": [1, 2], - "int_col": [10, 20], - "float_col": [1.5, 2.5], - "str_col": ["a", "b"], - "bool_col": [True, False], - } - ) - - result = melt_cols(df, cols=["int_col", "float_col", "str_col", "bool_col"]) - - assert len(result) == 8 # 2 rows * 4 columns - # Value column should handle mixed types - assert 10 in result["value"].values - assert 1.5 in result["value"].values - assert "a" in result["value"].values - assert True in result["value"].values - - def test_datetime_columns(self): - """Test with datetime columns.""" - df = pd.DataFrame( - { - "id": [1, 2], - "date1": pd.to_datetime(["2023-01-01", "2023-01-02"]), - "date2": pd.to_datetime(["2023-02-01", "2023-02-02"]), - } - ) - - result = melt_cols(df, cols=["date1", "date2"]) - - assert len(result) == 4 - assert pd.api.types.is_datetime64_any_dtype(result["value"]) - - def test_categorical_data(self): - """Test with categorical data.""" - df = pd.DataFrame( - { - "id": [1, 2, 3], - "cat1": pd.Categorical(["A", "B", "C"]), - "cat2": pd.Categorical(["X", "Y", "Z"]), - } - ) - - result = melt_cols(df, cols=["cat1", "cat2"]) - - assert len(result) == 6 - # When melting multiple categorical columns, pandas converts to object dtype - assert result["value"].dtype == object - # But the values themselves are still from the original categories - assert set(result["value"]) == {"A", "B", "C", "X", "Y", "Z"} - - -class TestEdgeCases: - """Test edge cases and error conditions.""" - - def test_missing_columns_error(self): - """Test error when specified columns don't exist.""" - df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) - - with pytest.raises(ValueError, match="Columns not found"): - melt_cols(df, cols=["c", "d"]) - - def test_partial_missing_columns(self): - """Test error with partially missing columns.""" - df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}) - - with pytest.raises(ValueError, match="Columns not found.*{'d'}"): - melt_cols(df, cols=["a", "b", "d"]) - - def test_empty_dataframe(self): - """Test with empty DataFrame.""" - df = pd.DataFrame() - - with pytest.raises(ValueError, match="Columns not found"): - melt_cols(df, cols=["a"]) - - def test_single_row(self): - """Test with single row DataFrame.""" - df = pd.DataFrame({"id": [1], "val1": [10], "val2": [20]}) - - result = melt_cols(df, cols=["val1", "val2"]) - - assert len(result) == 2 - assert list(result["value"]) == [10, 20] - - def test_all_columns_melted(self): - """Test melting all columns.""" - df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}) - - result = melt_cols(df, cols=["a", "b", "c"]) - - # Should have no id columns except for internal index handling - assert set(result.columns) == {"variable", "value"} - assert len(result) == 6 - - -class TestNullHandling: - """Test handling of null values.""" - - def test_null_in_melted_columns(self): - """Test null values in columns being melted.""" - df = pd.DataFrame( - {"id": [1, 2, 3], "val1": [10, np.nan, 30], "val2": [np.nan, 20, np.nan]} - ) - - result = melt_cols(df, cols=["val1", "val2"]) - - assert len(result) == 6 - assert result["value"].isna().sum() == 3 - - def test_null_in_id_columns(self): - """Test null values in identifier columns.""" - df = pd.DataFrame( - {"id": [1, np.nan, 3], "name": ["A", "B", None], "value": [10, 20, 30]} - ) - - result = melt_cols(df, cols=["value"]) - - assert len(result) == 3 - assert pd.isna(result.iloc[1]["id"]) - assert result.iloc[2]["name"] is None - - -class TestIndexHandling: - """Test DataFrame index handling.""" - - def test_non_default_index(self): - """Test with non-default index.""" - df = pd.DataFrame( - {"val1": [10, 20, 30], "val2": [40, 50, 60]}, index=["a", "b", "c"] - ) - - result = melt_cols(df, cols=["val1", "val2"]) - - # Should reset index and handle properly - assert len(result) == 6 - assert result.index.tolist() == list(range(6)) - - def test_multiindex(self): - """Test with MultiIndex DataFrame.""" - arrays = [["A", "A", "B", "B"], [1, 2, 1, 2]] - index = pd.MultiIndex.from_arrays(arrays, names=("letter", "number")) - df = pd.DataFrame( - {"val1": [10, 20, 30, 40], "val2": [50, 60, 70, 80]}, index=index - ) - - result = melt_cols(df, cols=["val1", "val2"]) - - # Should handle MultiIndex by resetting - assert len(result) == 8 - assert isinstance(result.index, pd.RangeIndex) - - -class TestOrderPreservation: - """Test order preservation in results.""" - - def test_row_order_preservation(self): - """Test that row order is preserved.""" - df = pd.DataFrame( - { - "id": [3, 1, 2], - "name": ["C", "A", "B"], - "val1": [30, 10, 20], - "val2": [60, 40, 50], - } - ) - - result = melt_cols(df, cols=["val1", "val2"]) - - # First 3 rows should be val1 in original order - val1_rows = result[result["variable"] == "val1"] - assert list(val1_rows["id"]) == [3, 1, 2] - assert list(val1_rows["value"]) == [30, 10, 20] - - def test_column_order_in_result(self): - """Test column order in result.""" - df = pd.DataFrame({"z": [1, 2], "a": [3, 4], "val1": [5, 6], "val2": [7, 8]}) - - result = melt_cols(df, cols=["val1", "val2"], id_columns=["a", "z"]) - - # Check that columns are in a sensible order - cols = list(result.columns) - assert "variable" in cols - assert "value" in cols - assert "a" in cols - assert "z" in cols - - -class TestRealWorldScenarios: - """Test real-world use cases.""" - - def test_time_series_reshape(self): - """Test reshaping time series data.""" - df = pd.DataFrame( - { - "date": pd.date_range("2023-01-01", periods=3), - "location": ["NYC", "LA", "CHI"], - "temp_morning": [32, 65, 40], - "temp_afternoon": [45, 78, 55], - "temp_evening": [38, 70, 42], - } - ) - - result = melt_cols(df, cols=["temp_morning", "temp_afternoon", "temp_evening"]) - - assert len(result) == 9 # 3 locations * 3 time periods - # Check that date and location are preserved for each measurement - nyc_temps = result[result["location"] == "NYC"] - assert len(nyc_temps) == 3 - assert set(nyc_temps["variable"]) == { - "temp_morning", - "temp_afternoon", - "temp_evening", - } - - def test_survey_data_reshape(self): - """Test reshaping survey response data.""" - df = pd.DataFrame( - { - "respondent_id": [1, 2, 3], - "age": [25, 35, 45], - "gender": ["M", "F", "M"], - "q1_satisfaction": [4, 5, 3], - "q2_satisfaction": [5, 4, 4], - "q3_satisfaction": [3, 5, 2], - } - ) - - result = melt_cols( - df, cols=["q1_satisfaction", "q2_satisfaction", "q3_satisfaction"] - ) - - assert len(result) == 9 - # All respondent info should be preserved - assert "respondent_id" in result.columns - assert "age" in result.columns - assert "gender" in result.columns - - # Check specific respondent - resp1 = result[result["respondent_id"] == 1] - assert list(resp1["value"]) == [4, 5, 3] - - -class TestDocstringExample: - """Test the example from the docstring.""" - - def test_docstring_example(self): - """Test exact example from docstring.""" - data = pd.DataFrame( - { - "id": [1, 2, 3], - "name": ["Alice", "Bob", "Charlie"], - "score_1": [85, 90, 78], - "score_2": [92, 88, 95], - } - ) - - melted = melt_cols(data, cols=["score_1", "score_2"]) - - # Check structure - assert len(melted) == 6 - assert set(melted.columns) == {"id", "name", "variable", "value"} - - # Check specific values from docstring output - assert melted.iloc[0]["id"] == 1 - assert melted.iloc[0]["name"] == "Alice" - assert melted.iloc[0]["variable"] == "score_1" - assert melted.iloc[0]["value"] == 85 - - assert melted.iloc[3]["id"] == 1 - assert melted.iloc[3]["name"] == "Alice" - assert melted.iloc[3]["variable"] == "score_2" - assert melted.iloc[3]["value"] == 92 - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_melt_cols.py -# -------------------------------------------------------------------------------- -# #!/usr/bin/env python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-10-05 23:04:16 (ywatanabe)" -# # /home/ywatanabe/proj/_scitex_repo_openhands/src/scitex/pd/_melt_cols.py -# -# -# #!/usr/bin/env python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-10-05 23:03:39 (ywatanabe)" -# # /home/ywatanabe/proj/_scitex_repo_openhands/src/scitex/pd/_melt_cols.py -# -# from typing import List, Optional -# import pandas as pd -# -# -# def melt_cols( -# df: pd.DataFrame, cols: List[str], id_columns: Optional[List[str]] = None -# ) -> pd.DataFrame: -# """ -# Melt specified columns while preserving links to other data in a DataFrame. -# -# Example -# ------- -# >>> data = pd.DataFrame({ -# ... 'id': [1, 2, 3], -# ... 'name': ['Alice', 'Bob', 'Charlie'], -# ... 'score_1': [85, 90, 78], -# ... 'score_2': [92, 88, 95] -# ... }) -# >>> melted = melt_cols(data, cols=['score_1', 'score_2']) -# >>> print(melted) -# id name variable value -# 0 1 Alice score_1 85 -# 1 2 Bob score_1 90 -# 2 3 Charlie score_1 78 -# 3 1 Alice score_2 92 -# 4 2 Bob score_2 88 -# 5 3 Charlie score_2 95 -# -# Parameters -# ---------- -# df : pd.DataFrame -# Input DataFrame -# cols : List[str] -# Columns to be melted -# id_columns : Optional[List[str]], default None -# Columns to preserve as identifiers. If None, all columns not in 'cols' are used. -# -# Returns -# ------- -# pd.DataFrame -# Melted DataFrame with preserved identifier columns -# -# Raises -# ------ -# ValueError -# If cols are not present in the DataFrame -# """ -# missing_melt = set(cols) - set(df.columns) -# if missing_melt: -# raise ValueError(f"Columns not found in DataFrame: {missing_melt}") -# -# if id_columns is None: -# id_columns = [col for col in df.columns if col not in cols] -# -# df_copy = df.reset_index(drop=True) -# df_copy["global_index"] = df_copy.index -# -# # Use a different value_name if "value" is one of the columns being melted -# value_name = "value" if "value" not in cols else "melted_value" -# melted_df = df_copy[cols + ["global_index"]].melt( -# id_vars=["global_index"], value_name=value_name -# ) -# if id_columns: -# formatted_df = melted_df.merge( -# df_copy[id_columns + ["global_index"]], on="global_index" -# ) -# return formatted_df.drop("global_index", axis=1) -# else: -# # No id columns to merge, just return melted data without global_index -# return melted_df.drop("global_index", axis=1) - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_melt_cols.py -# -------------------------------------------------------------------------------- diff --git a/tests/scitex/pd/test__merge_columns.py b/tests/scitex/pd/test__merge_columns.py deleted file mode 100644 index 9d23f1a81..000000000 --- a/tests/scitex/pd/test__merge_columns.py +++ /dev/null @@ -1,620 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-11-03 11:00:00 (ywatanabe)" -# File: ./tests/scitex/pd/test__merge_columns.py - -import os -import sys -from unittest.mock import Mock, patch - -import numpy as np -import pandas as pd -import pytest - -# Add the project root to the path -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../../src")) - -from scitex.pd import merge_cols, merge_columns - - -class TestBasicFunctionality: - """Test basic functionality of merge_columns.""" - - def test_simple_merge_with_sep(self): - """Test basic merging with simple separator.""" - df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) - - result = merge_columns(df, "A", "B", sep=" ") - - assert "A_B" in result.columns - assert list(result["A_B"]) == ["1 4", "2 5", "3 6"] - # Original columns preserved - assert "A" in result.columns - assert "B" in result.columns - - def test_merge_with_column_labels(self): - """Test merging with column labels (default behavior).""" - df = pd.DataFrame({"A": [0, 5, 10], "B": [1, 6, 11]}) - - result = merge_columns(df, "A", "B") - - assert "merged" in result.columns - assert result["merged"].iloc[0] == "A-0_B-1" - assert result["merged"].iloc[1] == "A-5_B-6" - assert result["merged"].iloc[2] == "A-10_B-11" - - def test_merge_multiple_columns(self): - """Test merging more than two columns.""" - df = pd.DataFrame({"X": [1, 2], "Y": [3, 4], "Z": [5, 6]}) - - result = merge_columns(df, "X", "Y", "Z", sep=",") - - assert "X_Y_Z" in result.columns - assert result["X_Y_Z"].iloc[0] == "1,3,5" - assert result["X_Y_Z"].iloc[1] == "2,4,6" - - def test_merge_cols_alias(self): - """Test that merge_cols is an alias for merge_columns.""" - df = pd.DataFrame({"A": [1], "B": [2]}) - - result1 = merge_columns(df, "A", "B", sep=" ") - result2 = merge_cols(df, "A", "B", sep=" ") - - pd.testing.assert_frame_equal(result1, result2) - - -class TestParameterVariations: - """Test different parameter combinations.""" - - def test_custom_separators(self): - """Test custom sep1 and sep2 parameters.""" - df = pd.DataFrame({"col1": [10, 20], "col2": [30, 40]}) - - result = merge_columns(df, "col1", "col2", sep1=" & ", sep2="=") - - assert result["merged"].iloc[0] == "col1=10 & col2=30" - assert result["merged"].iloc[1] == "col1=20 & col2=40" - - def test_custom_name(self): - """Test custom name for merged column.""" - df = pd.DataFrame({"first": ["John", "Jane"], "last": ["Doe", "Smith"]}) - - result = merge_columns(df, "first", "last", sep=" ", name="full_name") - - assert "full_name" in result.columns - assert result["full_name"].iloc[0] == "John Doe" - assert result["full_name"].iloc[1] == "Jane Smith" - - def test_list_input(self): - """Test passing columns as a list.""" - df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) - - result = merge_columns(df, ["A", "B", "C"], sep="-") - - assert "A_B_C" in result.columns - assert result["A_B_C"].iloc[0] == "1-3-5" - - def test_tuple_input(self): - """Test passing columns as a tuple.""" - df = pd.DataFrame({"X": [7, 8], "Y": [9, 10]}) - - result = merge_columns(df, ("X", "Y"), sep="/") - - assert "X_Y" in result.columns - assert result["X_Y"].iloc[0] == "7/9" - - -class TestDataTypes: - """Test handling of different data types.""" - - def test_numeric_columns(self): - """Test merging numeric columns.""" - df = pd.DataFrame({"int_col": [1, 2, 3], "float_col": [1.5, 2.5, 3.5]}) - - result = merge_columns(df, "int_col", "float_col", sep=" | ") - - assert result["int_col_float_col"].iloc[0] == "1 | 1.5" - assert result["int_col_float_col"].iloc[1] == "2 | 2.5" - - def test_mixed_types(self): - """Test merging columns with mixed types.""" - df = pd.DataFrame( - { - "str": ["a", "b"], - "int": [1, 2], - "float": [3.14, 2.71], - "bool": [True, False], - } - ) - - result = merge_columns(df, "str", "int", "float", "bool", sep=",") - - assert "str_int_float_bool" in result.columns - assert result["str_int_float_bool"].iloc[0] == "a,1,3.14,True" - assert result["str_int_float_bool"].iloc[1] == "b,2,2.71,False" - - def test_datetime_columns(self): - """Test merging datetime columns.""" - df = pd.DataFrame( - { - "date": pd.to_datetime(["2023-01-01", "2023-01-02"]), - "time": ["10:00", "11:00"], - } - ) - - result = merge_columns(df, "date", "time", sep=" ") - - assert "date_time" in result.columns - # Datetime will be converted to string - assert "2023-01-01" in result["date_time"].iloc[0] - assert "10:00" in result["date_time"].iloc[0] - - def test_null_values(self): - """Test handling of null values.""" - df = pd.DataFrame({"A": [1, None, 3], "B": ["x", "y", None]}) - - result = merge_columns(df, "A", "B", sep="-") - - # When numeric columns contain None, they become float before string conversion - assert result["A_B"].iloc[0] == "1.0-x" - assert result["A_B"].iloc[1] == "nan-y" # None becomes NaN in numeric column - assert ( - result["A_B"].iloc[2] == "3.0-None" - ) # None stays as 'None' in string column - - -class TestEdgeCases: - """Test edge cases and error conditions.""" - - def test_no_columns_error(self): - """Test error when no columns specified.""" - df = pd.DataFrame({"A": [1, 2]}) - - with pytest.raises(ValueError, match="No columns specified"): - merge_columns(df) - - def test_missing_columns_error(self): - """Test error when columns don't exist.""" - df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) - - with pytest.raises(KeyError, match="Columns not found.*\\['C', 'D'\\]"): - merge_columns(df, "A", "C", "D") - - def test_single_column(self): - """Test merging a single column (edge case).""" - df = pd.DataFrame({"A": [1, 2, 3]}) - - result = merge_columns(df, "A", sep=" ") - - assert "A" in result.columns # Original column name when single column - # When only one column, it just converts to string - assert list(result["A"]) == ["1", "2", "3"] - - def test_empty_dataframe(self): - """Test with empty DataFrame.""" - df = pd.DataFrame({"A": [], "B": []}) - - result = merge_columns(df, "A", "B", sep=" ") - - assert "A_B" in result.columns - assert len(result) == 0 - - def test_large_number_of_columns(self): - """Test merging many columns.""" - # Create DataFrame with 10 columns - data = {f"col{i}": list(range(3)) for i in range(10)} - df = pd.DataFrame(data) - - cols = [f"col{i}" for i in range(10)] - result = merge_columns(df, *cols, sep=",") - - expected_name = "_".join(cols) - assert expected_name in result.columns - # First row should be '0,0,0,...' - assert result[expected_name].iloc[0] == ",".join(["0"] * 10) - - -class TestSpecialCharacters: - """Test handling of special characters.""" - - def test_columns_with_spaces(self): - """Test columns with spaces in names.""" - df = pd.DataFrame( - {"First Name": ["John", "Jane"], "Last Name": ["Doe", "Smith"]} - ) - - result = merge_columns(df, "First Name", "Last Name", sep=" ") - - assert "First Name_Last Name" in result.columns - assert result["First Name_Last Name"].iloc[0] == "John Doe" - - def test_special_separators(self): - """Test with special character separators.""" - df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) - - # Test various special separators - result1 = merge_columns(df, "A", "B", sep="||") - assert result1["A_B"].iloc[0] == "1||3" - - result2 = merge_columns(df, "A", "B", sep="\t") - assert result2["A_B"].iloc[0] == "1\t3" - - result3 = merge_columns(df, "A", "B", sep="\n") - assert result3["A_B"].iloc[0] == "1\n3" - - def test_unicode_content(self): - """Test with Unicode content.""" - df = pd.DataFrame( - {"name": ["José", "François"], "city": ["São Paulo", "Montréal"]} - ) - - result = merge_columns(df, "name", "city", sep=" - ") - - assert result["name_city"].iloc[0] == "José - São Paulo" - assert result["name_city"].iloc[1] == "François - Montréal" - - -class TestDocstringExamples: - """Test examples from the docstring.""" - - def test_docstring_example_simple(self): - """Test first docstring example with simple separator.""" - df = pd.DataFrame({"A": [0, 5, 10], "B": [1, 6, 11], "C": [2, 7, 12]}) - - result = merge_columns(df, "A", "B", sep=" ") - - assert result["A_B"].iloc[0] == "0 1" - assert result["A_B"].iloc[1] == "5 6" - assert result["A_B"].iloc[2] == "10 11" - - def test_docstring_example_labels(self): - """Test second docstring example with column labels.""" - df = pd.DataFrame({"A": [0, 5, 10], "B": [1, 6, 11], "C": [2, 7, 12]}) - - result = merge_columns(df, "A", "B", sep1="_", sep2="-") - - assert result["merged"].iloc[0] == "A-0_B-1" - assert result["merged"].iloc[1] == "A-5_B-6" - assert result["merged"].iloc[2] == "A-10_B-11" - - -class TestRealWorldScenarios: - """Test real-world use cases.""" - - def test_address_concatenation(self): - """Test concatenating address fields.""" - df = pd.DataFrame( - { - "street": ["123 Main St", "456 Oak Ave"], - "city": ["New York", "Los Angeles"], - "state": ["NY", "CA"], - "zip": ["10001", "90001"], - } - ) - - result = merge_columns( - df, "street", "city", "state", "zip", sep=", ", name="full_address" - ) - - assert result["full_address"].iloc[0] == "123 Main St, New York, NY, 10001" - assert result["full_address"].iloc[1] == "456 Oak Ave, Los Angeles, CA, 90001" - - def test_creating_composite_keys(self): - """Test creating composite keys for database operations.""" - df = pd.DataFrame( - { - "year": [2023, 2023, 2024], - "month": [1, 2, 1], - "category": ["A", "B", "A"], - "subcategory": ["X", "Y", "Z"], - } - ) - - result = merge_columns( - df, - "year", - "month", - "category", - "subcategory", - sep="_", - name="composite_key", - ) - - assert result["composite_key"].iloc[0] == "2023_1_A_X" - assert result["composite_key"].iloc[1] == "2023_2_B_Y" - assert result["composite_key"].iloc[2] == "2024_1_A_Z" - - def test_log_message_creation(self): - """Test creating formatted log messages.""" - df = pd.DataFrame( - { - "timestamp": ["2023-01-01 10:00:00", "2023-01-01 10:01:00"], - "level": ["INFO", "ERROR"], - "message": ["Process started", "Connection failed"], - } - ) - - result = merge_columns( - df, "timestamp", "level", "message", sep1=" | ", sep2=": ", name="log_entry" - ) - - expected1 = ( - "timestamp: 2023-01-01 10:00:00 | level: INFO | message: Process started" - ) - expected2 = ( - "timestamp: 2023-01-01 10:01:00 | level: ERROR | message: Connection failed" - ) - - assert result["log_entry"].iloc[0] == expected1 - assert result["log_entry"].iloc[1] == expected2 - - -class TestPerformance: - """Test performance-related scenarios.""" - - def test_large_dataframe(self): - """Test with a reasonably large DataFrame.""" - n_rows = 10000 - df = pd.DataFrame( - { - "A": range(n_rows), - "B": range(n_rows, 2 * n_rows), - "C": [f"str_{i}" for i in range(n_rows)], - } - ) - - result = merge_columns(df, "A", "B", "C", sep="-") - - assert len(result) == n_rows - assert result["A_B_C"].iloc[0] == "0-10000-str_0" - assert result["A_B_C"].iloc[-1] == f"{n_rows-1}-{2*n_rows-1}-str_{n_rows-1}" - - def test_no_copy_modification(self): - """Test that original DataFrame is not modified.""" - df = pd.DataFrame({"X": [1, 2, 3], "Y": [4, 5, 6]}) - - original_columns = list(df.columns) - result = merge_columns(df, "X", "Y", sep=" ") - - # Original DataFrame should be unchanged - assert list(df.columns) == original_columns - assert "X_Y" not in df.columns - # Result should have the new column - assert "X_Y" in result.columns - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_merge_columns.py -# -------------------------------------------------------------------------------- -# #!/usr/bin/env python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-11-05 07:37:09 (ywatanabe)" -# # File: ./scitex_repo/src/scitex/pd/_merge_columns.py -# -# #!/usr/bin/env python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-10-07 12:03:29 (ywatanabe)" -# # ./src/scitex/pd/_merge_cols.py -# -# from typing import Union, List, Tuple -# import pandas as pd -# -# -# def merge_columns( -# df: pd.DataFrame, -# *args: Union[str, List[str], Tuple[str, ...]], -# sep: str = None, -# sep1: str = "_", -# sep2: str = "-", -# name: str = "merged", -# ) -> pd.DataFrame: -# """Creates a new column by joining specified columns. -# -# Example -# ------- -# >>> df = pd.DataFrame({ -# ... 'A': [0, 5, 10], -# ... 'B': [1, 6, 11], -# ... 'C': [2, 7, 12] -# ... }) -# >>> # Simple concatenation with separator -# >>> merge_columns(df, 'A', 'B', sep=' ') -# A B C A_B -# 0 0 1 2 0 1 -# 1 5 6 7 5 6 -# 2 10 11 12 10 11 -# -# >>> # With column labels -# >>> merge_columns(df, 'A', 'B', sep1='_', sep2='-') -# A B C A_B -# 0 0 1 2 A-0_B-1 -# 1 5 6 7 A-5_B-6 -# 2 10 11 12 A-10_B-11 -# -# Parameters -# ---------- -# df : pd.DataFrame -# Input DataFrame -# *args : Union[str, List[str], Tuple[str, ...]] -# Column names to join -# sep : str, optional -# Simple separator for values only (overrides sep1/sep2) -# sep1 : str, optional -# Separator between column-value pairs, by default "_" -# sep2 : str, optional -# Separator between column name and value, by default "-" -# name : str, optional -# Name for the merged column, by default "merged" -# -# Returns -# ------- -# pd.DataFrame -# DataFrame with added merged column -# """ -# _df = df.copy() -# columns = args[0] if len(args) == 1 and isinstance(args[0], (list, tuple)) else args -# -# if not columns: -# raise ValueError("No columns specified for merging") -# -# if not all(col in _df.columns for col in columns): -# missing = [col for col in columns if col not in _df.columns] -# raise KeyError(f"Columns not found in DataFrame: {missing}") -# -# # Handle empty DataFrame case -# if len(_df) == 0: -# # Determine column name -# if name == "merged" and sep is not None: -# new_col_name = "_".join(columns) -# else: -# new_col_name = name -# # Create empty Series with the correct name -# _df[new_col_name] = pd.Series(dtype=str) -# return _df -# -# if sep is not None: -# # Simple value concatenation -# merged_col = ( -# _df[list(columns)] -# .astype(str) -# .apply( -# lambda row: sep.join(row.values), -# axis=1, -# ) -# ) -# else: -# # Concatenation with column labels -# merged_col = _df[list(columns)].apply( -# lambda row: sep1.join(f"{col}{sep2}{val}" for col, val in row.items()), -# axis=1, -# ) -# -# # Determine column name -# if name == "merged" and sep is not None: -# # When using simple separator and default name, use joined column names -# new_col_name = "_".join(columns) -# else: -# # Use provided name or default -# new_col_name = name -# -# _df[new_col_name] = merged_col -# return _df -# -# -# merge_cols = merge_columns -# -# # EOF -# -# # #!./env/bin/python3 -# # # -*- coding: utf-8 -*- -# # # Time-stamp: "2024-10-07 12:03:29 (ywatanabe)" -# # # ./src/scitex/pd/_merge_cols.py -# -# -# # def merge_columns(df, *args, sep1="_", sep2="-", name="merged"): -# # """ -# # Join specified columns with their labels. -# -# # Example: -# # import pandas as pd -# # import numpy as np -# -# # df = pd.DataFrame( -# # data=np.arange(25).reshape(5, 5), -# # columns=["A", "B", "C", "D", "E"], -# # ) -# -# # df1 = merge_columns(df, "A", "B", sep1="_", sep2="-") -# # df2 = merge_columns(df, ["A", "B"], sep1="_", sep2="-") -# # assert (df1 == df2).all().all() # True -# -# # # A B C D E A_B -# # # 0 0 1 2 3 4 A-0_B-1 -# # # 1 5 6 7 8 9 A-5_B-6 -# # # 2 10 11 12 13 14 A-10_B-11 -# # # 3 15 16 17 18 19 A-15_B-16 -# # # 4 20 21 22 23 24 A-20_B-21 -# -# -# # Parameters -# # ---------- -# # df : pandas.DataFrame -# # Input DataFrame -# # *args : str or list -# # Column names to join, either as separate arguments or a single list -# # sep1 : str, optional -# # Separator for joining column names, default "_" -# # sep2 : str, optional -# # Separator between column name and value, default "-" -# -# # Returns -# # ------- -# # pandas.DataFrame -# # DataFrame with added merged column -# # """ -# # _df = df.copy() -# # columns = ( -# # args[0] -# # if len(args) == 1 and isinstance(args[0], (list, tuple)) -# # else args -# # ) -# # merged_col = _df[list(columns)].apply( -# # lambda row: sep1.join(f"{col}{sep2}{val}" for col, val in row.items()), -# # axis=1, -# # ) -# -# # new_col_name = sep1.join(columns) if not name else str(name) -# # _df[new_col_name] = merged_col -# # return _df -# -# -# # merge_cols = merge_columns -# -# # # def merge_columns(_df, *columns): -# # # """ -# # # Add merged columns in string. -# -# # # DF = pd.DataFrame(data=np.arange(25).reshape(5,5), -# # # columns=["A", "B", "C", "D", "E"], -# # # ) -# -# # # print(DF) -# -# # # # A B C D E -# # # # 0 0 1 2 3 4 -# # # # 1 5 6 7 8 9 -# # # # 2 10 11 12 13 14 -# # # # 3 15 16 17 18 19 -# # # # 4 20 21 22 23 24 -# -# # # print(merge_columns(DF, "A", "B", "C")) -# -# # # # A B C D E A_B_C -# # # # 0 0 1 2 3 4 0_1_2 -# # # # 1 5 6 7 8 9 5_6_7 -# # # # 2 10 11 12 13 14 10_11_12 -# # # # 3 15 16 17 18 19 15_16_17 -# # # # 4 20 21 22 23 24 20_21_22 -# # # """ -# # # from copy import deepcopy -# -# # # df = deepcopy(_df) -# # # merged = deepcopy(df[columns[0]]) # initialization -# # # for c in columns[1:]: -# # # merged = scitex.ai.utils.merge_labels(list(merged), deepcopy(df[c])) -# # # df.loc[:, scitex.gen.connect_strs(columns)] = merged -# # # return df -# -# -# # EOF - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_merge_columns.py -# -------------------------------------------------------------------------------- diff --git a/tests/scitex/pd/test__mv.py b/tests/scitex/pd/test__mv.py deleted file mode 100644 index e0e1cf554..000000000 --- a/tests/scitex/pd/test__mv.py +++ /dev/null @@ -1,464 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-11-03 11:15:00 (ywatanabe)" -# File: ./tests/scitex/pd/test__mv.py - -import os -import sys -from unittest.mock import Mock, patch - -import numpy as np -import pandas as pd -import pytest - -# Add the project root to the path -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../../src")) - -from scitex.pd import mv, mv_to_first, mv_to_last - - -class TestMvBasicFunctionality: - """Test basic functionality of mv function.""" - - def test_move_column_to_position(self): - """Test moving a column to a specific position.""" - df = pd.DataFrame( - {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9], "D": [10, 11, 12]} - ) - - # Move column B to position 2 - result = mv(df, "B", 2) - - assert list(result.columns) == ["A", "C", "B", "D"] - # Data should be preserved - assert result["B"].tolist() == [4, 5, 6] - - def test_move_column_to_first(self): - """Test moving a column to the first position.""" - df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) - - result = mv(df, "C", 0) - - assert list(result.columns) == ["C", "A", "B"] - assert result["C"].tolist() == [5, 6] - - def test_move_column_to_last(self): - """Test moving a column to the last position.""" - df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) - - result = mv(df, "A", -1) - - assert list(result.columns) == ["B", "C", "A"] - assert result["A"].tolist() == [1, 2] - - def test_move_row(self): - """Test moving a row to a specific position.""" - df = pd.DataFrame( - {"col1": [1, 2, 3, 4], "col2": [5, 6, 7, 8]}, index=["a", "b", "c", "d"] - ) - - # Move row 'c' to position 1 - result = mv(df, "c", 1, axis=0) - - assert list(result.index) == ["a", "c", "b", "d"] - # Data should be preserved - assert result.loc["c", "col1"] == 3 - assert result.loc["c", "col2"] == 7 - - -class TestMvNegativePositions: - """Test negative position handling.""" - - def test_negative_position_columns(self): - """Test negative positions for column movement.""" - df = pd.DataFrame({"A": [1], "B": [2], "C": [3], "D": [4], "E": [5]}) - - # -1 should be last position - result = mv(df, "B", -1) - assert list(result.columns) == ["A", "C", "D", "E", "B"] - - # -2 should be second to last - result = mv(df, "B", -2) - assert list(result.columns) == ["A", "C", "D", "B", "E"] - - # -3 should be third from last - result = mv(df, "B", -3) - assert list(result.columns) == ["A", "C", "B", "D", "E"] - - def test_negative_position_rows(self): - """Test negative positions for row movement.""" - df = pd.DataFrame({"col": [1, 2, 3, 4]}, index=["a", "b", "c", "d"]) - - # Move 'b' to -1 (last) - result = mv(df, "b", -1, axis=0) - assert list(result.index) == ["a", "c", "d", "b"] - - # Move 'b' to -2 (second to last) - result = mv(df, "b", -2, axis=0) - assert list(result.index) == ["a", "c", "b", "d"] - - -class TestMvToFirst: - """Test mv_to_first function.""" - - def test_mv_to_first_column(self): - """Test moving column to first position.""" - df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6], "D": [7, 8]}) - - result = mv_to_first(df, "C") - - assert list(result.columns) == ["C", "A", "B", "D"] - assert result["C"].tolist() == [5, 6] - - def test_mv_to_first_already_first(self): - """Test moving first column to first (no change).""" - df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) - - result = mv_to_first(df, "A") - - assert list(result.columns) == ["A", "B"] - - def test_mv_to_first_row(self): - """Test moving row to first position.""" - df = pd.DataFrame({"col": [1, 2, 3, 4]}, index=["a", "b", "c", "d"]) - - result = mv_to_first(df, "c", axis=0) - - assert list(result.index) == ["c", "a", "b", "d"] - assert result.loc["c", "col"] == 3 - - -class TestMvToLast: - """Test mv_to_last function.""" - - def test_mv_to_last_column(self): - """Test moving column to last position.""" - df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6], "D": [7, 8]}) - - result = mv_to_last(df, "B") - - assert list(result.columns) == ["A", "C", "D", "B"] - assert result["B"].tolist() == [3, 4] - - def test_mv_to_last_already_last(self): - """Test moving last column to last (no change).""" - df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) - - result = mv_to_last(df, "B") - - assert list(result.columns) == ["A", "B"] - - def test_mv_to_last_row(self): - """Test moving row to last position.""" - df = pd.DataFrame({"col": [1, 2, 3, 4]}, index=["a", "b", "c", "d"]) - - result = mv_to_last(df, "b", axis=0) - - assert list(result.index) == ["a", "c", "d", "b"] - assert result.loc["b", "col"] == 2 - - -class TestEdgeCases: - """Test edge cases and error conditions.""" - - def test_nonexistent_column(self): - """Test moving non-existent column.""" - df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}) - - with pytest.raises(ValueError): - mv(df, "C", 0) - - def test_nonexistent_row(self): - """Test moving non-existent row.""" - df = pd.DataFrame({"A": [1, 2]}, index=["a", "b"]) - - with pytest.raises(ValueError): - mv(df, "c", 0, axis=0) - - def test_single_column_dataframe(self): - """Test with single column DataFrame.""" - df = pd.DataFrame({"A": [1, 2, 3]}) - - # Moving the only column should work but have no effect - result = mv(df, "A", 0) - assert list(result.columns) == ["A"] - - result = mv(df, "A", -1) - assert list(result.columns) == ["A"] - - def test_empty_dataframe(self): - """Test with empty DataFrame.""" - df = pd.DataFrame() - - # Should handle gracefully even though there's nothing to move - # This will raise because there are no columns - with pytest.raises(ValueError): - mv(df, "A", 0) - - def test_position_out_of_bounds(self): - """Test with position beyond bounds.""" - df = pd.DataFrame({"A": [1], "B": [2], "C": [3]}) - - # Position beyond end should place at end - result = mv(df, "A", 10) - assert list(result.columns) == ["B", "C", "A"] - - # Large negative position should place at beginning - result = mv(df, "C", -10) - assert list(result.columns) == ["C", "A", "B"] - - -class TestDataTypes: - """Test with different data types.""" - - def test_mixed_column_types(self): - """Test with mixed column data types.""" - df = pd.DataFrame( - { - "int": [1, 2, 3], - "float": [1.1, 2.2, 3.3], - "str": ["a", "b", "c"], - "bool": [True, False, True], - "date": pd.date_range("2023-01-01", periods=3), - } - ) - - result = mv(df, "bool", 1) - - # Check order - assert list(result.columns) == ["int", "bool", "float", "str", "date"] - # Check data integrity - assert result["bool"].tolist() == [True, False, True] - assert result["float"].tolist() == [1.1, 2.2, 3.3] - - def test_categorical_index(self): - """Test with categorical index.""" - df = pd.DataFrame( - {"A": [1, 2, 3], "B": [4, 5, 6]}, index=pd.Categorical(["x", "y", "z"]) - ) - - result = mv(df, "y", 0, axis=0) - - assert list(result.index) == ["y", "x", "z"] - # Note: pandas reindex doesn't preserve CategoricalIndex type - assert isinstance(result.index, pd.Index) - - def test_multiindex_columns(self): - """Test with MultiIndex columns.""" - # Create MultiIndex columns - arrays = [["A", "A", "B", "B"], ["X", "Y", "X", "Y"]] - columns = pd.MultiIndex.from_arrays(arrays) - df = pd.DataFrame(np.random.randn(3, 4), columns=columns) - - # Move specific column - result = mv(df, ("B", "X"), 0) - - assert result.columns[0] == ("B", "X") - assert result.columns[1] == ("A", "X") - - -class TestIndexPreservation: - """Test that indices and data are properly preserved.""" - - def test_preserve_column_attributes(self): - """Test that column attributes are preserved.""" - df = pd.DataFrame( - { - "A": pd.Series([1, 2, 3], name="A"), - "B": pd.Series([4, 5, 6], name="B"), - "C": pd.Series([7, 8, 9], name="C"), - } - ) - - result = mv(df, "B", 0) - - # Check that column names are preserved - assert result["B"].name == "B" - assert result["A"].name == "A" - - def test_preserve_index_name(self): - """Test that index names are preserved.""" - df = pd.DataFrame( - {"A": [1, 2, 3], "B": [4, 5, 6]}, - index=pd.Index(["x", "y", "z"], name="my_index"), - ) - - result = mv(df, "y", 0, axis=0) - - assert result.index.name == "my_index" - assert list(result.index) == ["y", "x", "z"] - - def test_no_data_modification(self): - """Test that original DataFrame is not modified.""" - df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) - - original_columns = list(df.columns) - result = mv(df, "B", 0) - - # Original should be unchanged - assert list(df.columns) == original_columns - # Result should be different - assert list(result.columns) != original_columns - - -class TestComplexScenarios: - """Test complex real-world scenarios.""" - - def test_reorganize_dataframe_columns(self): - """Test reorganizing DataFrame columns for analysis.""" - df = pd.DataFrame( - { - "id": [1, 2, 3], - "name": ["Alice", "Bob", "Charlie"], - "age": [25, 30, 35], - "score": [95, 87, 92], - "category": ["A", "B", "A"], - } - ) - - # Move id to first, category to second - result = mv_to_first(df, "category") - result = mv_to_first(result, "id") - - assert list(result.columns) == ["id", "category", "name", "age", "score"] - - def test_multiple_moves(self): - """Test multiple sequential moves.""" - df = pd.DataFrame({"A": [1], "B": [2], "C": [3], "D": [4], "E": [5]}) - - # Rearrange columns - result = mv(df, "E", 0) # E to first: ['E', 'A', 'B', 'C', 'D'] - result = mv(result, "C", 2) # C to position 2: ['E', 'A', 'C', 'B', 'D'] - result = mv(result, "A", -1) # A to last: ['E', 'C', 'B', 'D', 'A'] - - assert list(result.columns) == ["E", "C", "B", "D", "A"] - - def test_pivot_style_reorganization(self): - """Test reorganizing for pivot-style analysis.""" - df = pd.DataFrame( - { - "value1": [10, 20, 30], - "value2": [40, 50, 60], - "group": ["A", "B", "C"], - "subgroup": ["X", "Y", "Z"], - "metric1": [1.1, 2.2, 3.3], - "metric2": [4.4, 5.5, 6.6], - } - ) - - # Move grouping columns to front - result = mv_to_first(df, "subgroup") - result = mv_to_first(result, "group") - - expected_order = ["group", "subgroup", "value1", "value2", "metric1", "metric2"] - assert list(result.columns) == expected_order - - -class TestNaNAndSpecialValues: - """Test handling of NaN and special values.""" - - def test_dataframe_with_nan(self): - """Test moving columns containing NaN values.""" - df = pd.DataFrame( - {"A": [1, np.nan, 3], "B": [np.nan, 5, 6], "C": [7, 8, np.nan]} - ) - - result = mv(df, "B", 0) - - assert list(result.columns) == ["B", "A", "C"] - # NaN values should be preserved - assert pd.isna(result["B"].iloc[0]) - assert result["B"].iloc[1] == 5 - - def test_datetime_with_nat(self): - """Test with datetime columns containing NaT.""" - df = pd.DataFrame( - { - "dates": pd.to_datetime(["2023-01-01", pd.NaT, "2023-01-03"]), - "values": [1, 2, 3], - } - ) - - result = mv_to_last(df, "dates") - - assert list(result.columns) == ["values", "dates"] - assert pd.isna(result["dates"].iloc[1]) - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_mv.py -# -------------------------------------------------------------------------------- -# #!/usr/bin/env python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-11-05 07:39:12 (ywatanabe)" -# # File: ./scitex_repo/src/scitex/pd/_mv.py -# -# -# def mv(df, key, position, axis=1): -# """ -# Move a row or column to a specified position in a DataFrame. -# -# Args: -# df (pandas.DataFrame): The input DataFrame. -# key (str): The label of the row or column to move. -# position (int): The position to move the row or column to. -# axis (int, optional): 0 for rows, 1 for columns. Defaults to 1. -# -# Returns: -# pandas.DataFrame: A new DataFrame with the row or column moved. -# """ -# if axis == 0: -# items = df.index.tolist() -# else: -# items = df.columns.tolist() -# items.remove(key) -# -# if position < 0: -# position += len(items) + 1 -# -# items.insert(position, key) -# return df.reindex(items, axis=axis) -# -# -# def mv_to_first(df, key, axis=1): -# """ -# Move a row or column to the first position in a DataFrame. -# -# Args: -# df (pandas.DataFrame): The input DataFrame. -# key (str): The label of the row or column to move. -# axis (int, optional): 0 for rows, 1 for columns. Defaults to 1. -# -# Returns: -# pandas.DataFrame: A new DataFrame with the row or column moved to the first position. -# """ -# return mv(df, key, 0, axis) -# -# -# def mv_to_last(df, key, axis=1): -# """ -# Move a row or column to the last position in a DataFrame. -# -# Args: -# df (pandas.DataFrame): The input DataFrame. -# key (str): The label of the row or column to move. -# axis (int, optional): 0 for rows, 1 for columns. Defaults to 1. -# -# Returns: -# pandas.DataFrame: A new DataFrame with the row or column moved to the last position. -# """ -# return mv(df, key, -1, axis) -# -# -# # EOF - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_mv.py -# -------------------------------------------------------------------------------- diff --git a/tests/scitex/pd/test__replace.py b/tests/scitex/pd/test__replace.py deleted file mode 100644 index 1b33568d8..000000000 --- a/tests/scitex/pd/test__replace.py +++ /dev/null @@ -1,458 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-11-03 11:30:00 (ywatanabe)" -# File: ./tests/scitex/pd/test__replace.py - -import os -import sys -from unittest.mock import Mock, patch - -import numpy as np -import pandas as pd -import pytest - -# Add the project root to the path -sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../../../src")) - -from scitex.pd import replace - - -class TestBasicReplacements: - """Test basic replacement functionality.""" - - def test_simple_string_replacement(self): - """Test simple string replacement with old_value and new_value.""" - df = pd.DataFrame( - {"A": ["apple", "banana", "apple"], "B": ["orange", "apple", "grape"]} - ) - - result = replace(df, "apple", "pear") - - assert result["A"].tolist() == ["pear", "banana", "pear"] - assert result["B"].tolist() == ["orange", "pear", "grape"] - - def test_numeric_replacement(self): - """Test replacement of numeric values.""" - df = pd.DataFrame({"A": [1, 2, 3, 1], "B": [4, 1, 6, 7]}) - - result = replace(df, 1, 99) - - assert result["A"].tolist() == [99, 2, 3, 99] - assert result["B"].tolist() == [4, 99, 6, 7] - - def test_dict_replacement(self): - """Test replacement using dictionary mapping.""" - df = pd.DataFrame({"A": ["a", "b", "c"], "B": ["x", "y", "z"]}) - - replace_dict = {"a": "alpha", "b": "beta", "x": "X", "z": "Z"} - result = replace(df, replace_dict) - - assert result["A"].tolist() == ["alpha", "beta", "c"] - assert result["B"].tolist() == ["X", "y", "Z"] - - def test_specific_columns_replacement(self): - """Test replacement in specific columns only.""" - df = pd.DataFrame( - { - "A": ["test", "test", "other"], - "B": ["test", "test", "test"], - "C": ["test", "other", "test"], - } - ) - - result = replace(df, "test", "replaced", cols=["A", "C"]) - - assert result["A"].tolist() == ["replaced", "replaced", "other"] - assert result["B"].tolist() == ["test", "test", "test"] # B unchanged - assert result["C"].tolist() == ["replaced", "other", "replaced"] - - -class TestRegexReplacements: - """Test regex-based replacements.""" - - def test_simple_regex_replacement(self): - """Test simple regex pattern replacement.""" - df = pd.DataFrame( - { - "A": ["abc-123", "def-456", "ghi-789"], - "B": ["test-001", "test-002", "test-003"], - } - ) - - result = replace(df, r"-\d+", "", regex=True) - - assert result["A"].tolist() == ["abc", "def", "ghi"] - assert result["B"].tolist() == ["test", "test", "test"] - - def test_regex_dict_replacement(self): - """Test regex replacement with dictionary.""" - df = pd.DataFrame( - { - "A": ["email@domain.com", "user@test.org"], - "B": ["phone: 123-456", "tel: 789-012"], - } - ) - - replace_dict = { - r"@.*\.com": "@company.com", - r"@.*\.org": "@organization.org", - r"\d{3}-\d{3}": "XXX-XXX", - } - result = replace(df, replace_dict, regex=True) - - assert result["A"].tolist() == ["email@company.com", "user@organization.org"] - assert result["B"].tolist() == ["phone: XXX-XXX", "tel: XXX-XXX"] - - def test_regex_special_characters(self): - """Test regex replacement with special characters.""" - df = pd.DataFrame( - {"A": ["$100.00", "$250.50", "$1000.99"], "B": ["#tag1", "#tag2", "#tag3"]} - ) - - result = replace(df, r"\$|\.", "", regex=True) - - assert result["A"].tolist() == ["10000", "25050", "100099"] - assert result["B"].tolist() == ["#tag1", "#tag2", "#tag3"] - - -class TestDataTypes: - """Test replacements with different data types.""" - - def test_mixed_type_dataframe(self): - """Test replacement in DataFrame with mixed types.""" - df = pd.DataFrame( - { - "int": [1, 2, 3, 1], - "float": [1.0, 2.5, 1.0, 3.5], - "str": ["1", "2", "1", "3"], - "bool": [True, False, True, False], - } - ) - - result = replace(df, 1, 99) - - assert result["int"].tolist() == [99, 2, 3, 99] - assert result["float"].tolist() == [99.0, 2.5, 99.0, 3.5] - # Pandas doesn't replace string '1' when looking for numeric 1 - assert result["str"].tolist() == ["1", "2", "1", "3"] - # Pandas doesn't replace True when looking for numeric 1 - assert result["bool"].tolist() == [True, False, True, False] - - def test_nan_replacement(self): - """Test replacement of NaN values.""" - df = pd.DataFrame({"A": [1, np.nan, 3, np.nan], "B": ["a", "b", np.nan, "d"]}) - - result = replace(df, np.nan, 0) - - assert result["A"].tolist() == [1, 0, 3, 0] - assert result["B"].tolist() == ["a", "b", 0, "d"] - - def test_none_replacement(self): - """Test replacement of None values.""" - df = pd.DataFrame({"A": [1, None, 3], "B": ["a", None, "c"]}) - - result = replace(df, None, "missing") - - # In numeric columns, None becomes NaN, so replacing None doesn't affect it - assert result["A"][0] == 1.0 - assert pd.isna(result["A"][1]) # Still NaN - assert result["A"][2] == 3.0 - # In string columns, None is preserved and can be replaced - assert result["B"].tolist() == ["a", "missing", "c"] - - def test_datetime_replacement(self): - """Test replacement in datetime columns.""" - df = pd.DataFrame( - { - "dates": pd.to_datetime(["2023-01-01", "2023-01-02", "2023-01-01"]), - "values": [1, 2, 3], - } - ) - - old_date = pd.to_datetime("2023-01-01") - new_date = pd.to_datetime("2023-01-15") - - result = replace(df, old_date, new_date) - - expected = pd.to_datetime(["2023-01-15", "2023-01-02", "2023-01-15"]) - pd.testing.assert_series_equal( - result["dates"], pd.Series(expected, name="dates") - ) - - -class TestEdgeCases: - """Test edge cases and error conditions.""" - - def test_no_new_value_error(self): - """Test error when new_value not provided with string old_value.""" - df = pd.DataFrame({"A": [1, 2, 3]}) - - with pytest.raises(ValueError, match="new_value must be provided"): - replace(df, "old") - - def test_empty_dataframe(self): - """Test replacement on empty DataFrame.""" - df = pd.DataFrame() - - result = replace(df, "old", "new") - - assert result.empty - assert isinstance(result, pd.DataFrame) - - def test_nonexistent_column(self): - """Test replacement with non-existent column specified.""" - df = pd.DataFrame({"A": [1, 2, 3]}) - - # Should not raise error, just skip non-existent column - result = replace(df, 1, 99, cols=["A", "B", "C"]) - - assert result["A"].tolist() == [99, 2, 3] - assert list(result.columns) == ["A"] - - def test_no_matches(self): - """Test when no values match replacement criteria.""" - df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]}) - - result = replace(df, 99, 100) - - # Should return unchanged DataFrame - pd.testing.assert_frame_equal(result, df) - - def test_empty_replace_dict(self): - """Test with empty replacement dictionary.""" - df = pd.DataFrame({"A": [1, 2, 3]}) - - result = replace(df, {}) - - pd.testing.assert_frame_equal(result, df) - - -class TestComplexScenarios: - """Test complex real-world scenarios.""" - - def test_data_cleaning_workflow(self): - """Test typical data cleaning workflow.""" - df = pd.DataFrame( - { - "name": ["John Doe", "Jane Smith", "Bob Johnson"], - "phone": ["123-456-7890", "(555) 123-4567", "999.888.7777"], - "email": ["john@example.com", "jane@test.org", "bob@company.com"], - } - ) - - # Clean phone numbers - phone_replacements = { - r"[^\d]": "", # Remove non-digits - } - result = replace(df, phone_replacements, regex=True, cols=["phone"]) - - assert result["phone"].tolist() == ["1234567890", "5551234567", "9998887777"] - # Other columns unchanged - assert result["name"].tolist() == df["name"].tolist() - - def test_categorical_mapping(self): - """Test replacing categories with standardized values.""" - df = pd.DataFrame( - { - "size": ["S", "small", "M", "medium", "L", "large"], - "color": ["red", "RED", "Blue", "BLUE", "green", "GREEN"], - } - ) - - size_map = { - "S": "Small", - "small": "Small", - "M": "Medium", - "medium": "Medium", - "L": "Large", - "large": "Large", - } - - result = replace(df, size_map, cols=["size"]) - - expected = ["Small", "Small", "Medium", "Medium", "Large", "Large"] - assert result["size"].tolist() == expected - - def test_multiple_replacements_same_column(self): - """Test multiple replacements in sequence.""" - df = pd.DataFrame( - {"text": ["Hello World!", "Python Programming", "Data Science"]} - ) - - # Use regex=True for substring replacement - result = replace(df, "Hello", "Hi", regex=True) - result = replace(result, "Programming", "Coding", regex=True) - result = replace(result, "!", ".", regex=True) - - expected = ["Hi World.", "Python Coding", "Data Science"] - assert result["text"].tolist() == expected - - -class TestDocstringExample: - """Test the example from the docstring.""" - - def test_docstring_example_simple(self): - """Test simple replacement from docstring.""" - df = pd.DataFrame({"A": ["abc-123", "def-456"], "B": ["ghi-789", "jkl-012"]}) - - # Use regex=True for substring replacement - df_replaced = replace(df, "abc", "xyz", regex=True) - - assert df_replaced["A"].iloc[0] == "xyz-123" - assert df_replaced["A"].iloc[1] == "def-456" - - def test_docstring_example_dict(self): - """Test dictionary replacement from docstring.""" - df = pd.DataFrame({"A": ["abc-123", "def-456"], "B": ["ghi-789", "jkl-012"]}) - - replace_dict = {"-": "_", "1": "one"} - df_replaced = replace(df, replace_dict, regex=True, cols=["A"]) - - # Should replace - with _ and 1 with one in column A only - assert df_replaced["A"].iloc[0] == "abc_one23" - assert df_replaced["A"].iloc[1] == "def_456" - assert df_replaced["B"].iloc[0] == "ghi-789" # B unchanged - - -class TestPreservation: - """Test that original DataFrame is not modified.""" - - def test_original_unchanged(self): - """Test that original DataFrame is not modified.""" - df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]}) - - original_a = df["A"].copy() - original_b = df["B"].copy() - - result = replace(df, 1, 99) - - # Original should be unchanged - pd.testing.assert_series_equal(df["A"], original_a) - pd.testing.assert_series_equal(df["B"], original_b) - - # Result should be different - assert result["A"].iloc[0] == 99 - assert df["A"].iloc[0] == 1 - - def test_index_preserved(self): - """Test that DataFrame index is preserved.""" - df = pd.DataFrame({"A": [1, 2, 3]}, index=["x", "y", "z"]) - - result = replace(df, 2, 99) - - assert list(result.index) == ["x", "y", "z"] - assert result.loc["y", "A"] == 99 - - def test_column_order_preserved(self): - """Test that column order is preserved.""" - df = pd.DataFrame({"Z": [1, 2], "A": [3, 4], "M": [5, 6]}) - - result = replace(df, 1, 99) - - assert list(result.columns) == ["Z", "A", "M"] - - -class TestLargeDatasets: - """Test with larger datasets.""" - - def test_large_dataframe(self): - """Test replacement on large DataFrame.""" - n = 10000 - df = pd.DataFrame( - { - "A": np.random.choice(["a", "b", "c"], n), - "B": np.random.randint(0, 10, n), - "C": np.random.choice(["x", "y", "z"], n), - } - ) - - result = replace(df, {"a": "alpha", "b": "beta", 5: 555}) - - # Check replacements worked - assert "alpha" in result["A"].values - assert "beta" in result["A"].values - assert "c" in result["A"].values - assert "a" not in result["A"].values - assert "b" not in result["A"].values - - if 5 in df["B"].values: - assert 555 in result["B"].values - assert 5 not in result["B"].values - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_replace.py -# -------------------------------------------------------------------------------- -# #!./env/bin/python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-08-29 23:08:35 (ywatanabe)" -# # ./src/scitex/pd/_replace.py -# -# -# def replace(dataframe, old_value, new_value=None, regex=False, cols=None): -# """ -# Replace values in a DataFrame. -# -# Example -# ------- -# import pandas as pd -# df = pd.DataFrame({'A': ['abc-123', 'def-456'], 'B': ['ghi-789', 'jkl-012']}) -# -# # Replace single value -# df_replaced = replace(df, 'abc', 'xyz') -# -# # Replace with dictionary -# replace_dict = {'-': '_', '1': 'one'} -# df_replaced = replace(df, replace_dict, cols=['A']) -# print(df_replaced) -# -# Parameters -# ---------- -# dataframe : pandas.DataFrame -# Input DataFrame to modify. -# old_value : str, dict -# If str, the value to replace (requires new_value). -# If dict, mapping of old values (keys) to new values (values). -# new_value : str, optional -# New value to replace old_value with. Required if old_value is str. -# regex : bool, optional -# If True, treat replacement keys as regular expressions. Default is False. -# cols : list of str, optional -# List of column names to apply replacements. If None, apply to all columns. -# -# Returns -# ------- -# pandas.DataFrame -# DataFrame with specified replacements applied. -# """ -# dataframe = dataframe.copy() -# -# # Handle different input formats -# if isinstance(old_value, dict): -# replace_dict = old_value -# else: -# if new_value is None: -# raise ValueError("new_value must be provided when old_value is not a dict") -# replace_dict = {old_value: new_value} -# -# # Apply replacements to all columns if cols not specified -# if cols is None: -# # Use pandas replace method for all columns -# return dataframe.replace(replace_dict, regex=regex) -# else: -# # Apply to specific columns -# for column in cols: -# if column in dataframe.columns: -# dataframe[column] = dataframe[column].replace(replace_dict, regex=regex) -# return dataframe - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_replace.py -# -------------------------------------------------------------------------------- diff --git a/tests/scitex/pd/test__round.py b/tests/scitex/pd/test__round.py deleted file mode 100644 index fb08a074d..000000000 --- a/tests/scitex/pd/test__round.py +++ /dev/null @@ -1,548 +0,0 @@ -#!/usr/bin/env python3 -# Time-stamp: "2025-05-31 20:45:00 (ywatanabe)" -# /data/gpfs/projects/punim2354/ywatanabe/.claude-worktree/scitex_repo/tests/scitex/pd/test__round.py - - -""" -Comprehensive tests for scitex.pd.round function. -""" - -import numpy as np -import pandas as pd -import pytest - - -class TestRound: - """Test class for round function.""" - - def test_basic_float_rounding(self): - """Test basic rounding of float values.""" - from scitex.pd import round - - df = pd.DataFrame( - {"A": [1.23456, 2.34567, 3.45678], "B": [4.56789, 5.67890, 6.78901]} - ) - - result = round(df, factor=2) - expected = pd.DataFrame({"A": [1.23, 2.35, 3.46], "B": [4.57, 5.68, 6.79]}) - - pd.testing.assert_frame_equal(result, expected) - - def test_default_factor(self): - """Test rounding with default factor of 3.""" - from scitex.pd import round - - df = pd.DataFrame({"value": [1.234567, 2.345678, 3.456789]}) - - result = round(df) - expected = pd.DataFrame({"value": [1.235, 2.346, 3.457]}) - - pd.testing.assert_frame_equal(result, expected) - - def test_mixed_types(self): - """Test rounding with mixed data types.""" - from scitex.pd import round - - df = pd.DataFrame( - { - "float": [1.23456, 2.34567], - "int": [3, 4], - "str": ["abc", "def"], - "bool": [True, False], - } - ) - - result = round(df, factor=2) - expected = pd.DataFrame( - { - "float": [1.23, 2.35], - "int": [3, 4], - "str": ["abc", "def"], - "bool": [1, 0], # Booleans are converted to int by pd.to_numeric - } - ) - - pd.testing.assert_frame_equal(result, expected) - - def test_integer_preservation(self): - """Test that integer columns remain as integers.""" - from scitex.pd import round - - df = pd.DataFrame({"A": [1, 2, 3, 4, 5], "B": [10, 20, 30, 40, 50]}) - - result = round(df, factor=2) - - assert result["A"].dtype == np.int64 - assert result["B"].dtype == np.int64 - pd.testing.assert_frame_equal(result, df) - - def test_zero_decimal_places(self): - """Test rounding to zero decimal places.""" - from scitex.pd import round - - df = pd.DataFrame({"A": [1.4, 2.5, 3.6], "B": [4.4, 5.5, 6.6]}) - - result = round(df, factor=0) - expected = pd.DataFrame({"A": [1, 2, 4], "B": [4, 6, 7]}) - - pd.testing.assert_frame_equal(result, expected) - - def test_large_factor(self): - """Test rounding with large factor value.""" - from scitex.pd import round - - df = pd.DataFrame({"A": [1.123456789, 2.234567890]}) - - result = round(df, factor=6) - expected = pd.DataFrame({"A": [1.123457, 2.234568]}) - - pd.testing.assert_frame_equal(result, expected) - - def test_negative_values(self): - """Test rounding negative values.""" - from scitex.pd import round - - df = pd.DataFrame( - {"A": [-1.23456, -2.34567, -3.45678], "B": [1.23456, -2.34567, 3.45678]} - ) - - result = round(df, factor=2) - expected = pd.DataFrame({"A": [-1.23, -2.35, -3.46], "B": [1.23, -2.35, 3.46]}) - - pd.testing.assert_frame_equal(result, expected) - - def test_nan_handling(self): - """Test handling of NaN values - columns with NaN are not rounded due to comparison issue.""" - from scitex.pd import round - - df = pd.DataFrame( - { - "A": [1.234, np.nan, 3.456], - "B": [np.nan, 2.345, np.nan], - "C": [1.234, 2.345, 3.456], # No NaN - } - ) - - result = round(df, factor=2) - # NaN values are preserved, non-NaN values are rounded - expected = pd.DataFrame( - { - "A": [1.23, np.nan, 3.46], # Rounded, NaN preserved - "B": [np.nan, 2.35, np.nan], # Rounded, NaN preserved - "C": [1.23, 2.35, 3.46], # Rounded correctly - no NaN - } - ) - - pd.testing.assert_frame_equal(result, expected) - - def test_inf_handling(self): - """Test handling of infinity values - columns with inf are not rounded.""" - from scitex.pd import round - - df = pd.DataFrame( - { - "A": [1.234, np.inf, -np.inf], - "B": [np.inf, 2.345, -np.inf], - "C": [1.234, 2.345, 3.456], # No inf - } - ) - - result = round(df, factor=2) - # inf values are preserved, finite values are rounded - expected = pd.DataFrame( - { - "A": [1.23, np.inf, -np.inf], # Rounded, inf preserved - "B": [np.inf, 2.35, -np.inf], # Rounded, inf preserved - "C": [1.23, 2.35, 3.46], # Rounded correctly - no inf - } - ) - - pd.testing.assert_frame_equal(result, expected) - - def test_empty_dataframe(self): - """Test rounding empty DataFrame.""" - from scitex.pd import round - - df = pd.DataFrame() - result = round(df, factor=2) - pd.testing.assert_frame_equal(result, df) - - def test_single_column(self): - """Test rounding single column DataFrame.""" - from scitex.pd import round - - df = pd.DataFrame({"values": [1.234567, 2.345678, 3.456789]}) - result = round(df, factor=3) - expected = pd.DataFrame({"values": [1.235, 2.346, 3.457]}) - - pd.testing.assert_frame_equal(result, expected) - - def test_datetime_columns(self): - """Test that datetime columns are preserved.""" - from scitex.pd import round - - dates = pd.date_range("2024-01-01", periods=3) - df = pd.DataFrame({"date": dates, "value": [1.23456, 2.34567, 3.45678]}) - - result = round(df, factor=2) - expected = pd.DataFrame({"date": dates, "value": [1.23, 2.35, 3.46]}) - - pd.testing.assert_frame_equal(result, expected) - - def test_categorical_columns(self): - """Test that categorical columns are preserved.""" - from scitex.pd import round - - df = pd.DataFrame( - { - "category": pd.Categorical(["A", "B", "C"]), - "value": [1.23456, 2.34567, 3.45678], - } - ) - - result = round(df, factor=2) - expected = pd.DataFrame( - {"category": pd.Categorical(["A", "B", "C"]), "value": [1.23, 2.35, 3.46]} - ) - - pd.testing.assert_frame_equal(result, expected) - - def test_scientific_notation(self): - """Test rounding values in scientific notation.""" - from scitex.pd import round - - df = pd.DataFrame( - {"A": [1.234e-5, 2.345e-5, 3.456e-5], "B": [1.234e5, 2.345e5, 3.456e5]} - ) - - result = round(df, factor=3) - # Values less than 0.001 will be rounded to 0.0 when rounding to 3 decimal places - # Large values remain unchanged as they have no decimal component - expected = pd.DataFrame( - {"A": [0.0, 0.0, 0.0], "B": [123400.0, 234500.0, 345600.0]} - ) - - pd.testing.assert_frame_equal(result, expected) - - def test_very_small_values(self): - """Test rounding very small values.""" - from scitex.pd import round - - df = pd.DataFrame({"A": [0.000123456, 0.000234567, 0.000345678]}) - - result = round(df, factor=3) - # Rounding to 3 decimal places means values < 0.001 become 0.0 - expected = pd.DataFrame({"A": [0.0, 0.0, 0.0]}) - - pd.testing.assert_frame_equal(result, expected) - - def test_roundable_to_int(self): - """Test values that can be converted to integers after rounding.""" - from scitex.pd import round - - df = pd.DataFrame( - {"A": [1.00001, 2.00002, 3.00003], "B": [4.99999, 5.99998, 6.99997]} - ) - - result = round(df, factor=0) - expected = pd.DataFrame({"A": [1, 2, 3], "B": [5, 6, 7]}) - - pd.testing.assert_frame_equal(result, expected) - - def test_multiindex_dataframe(self): - """Test rounding DataFrame with MultiIndex.""" - from scitex.pd import round - - arrays = [["A", "A", "B", "B"], [1, 2, 1, 2]] - index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"]) - df = pd.DataFrame({"value": [1.23456, 2.34567, 3.45678, 4.56789]}, index=index) - - result = round(df, factor=2) - expected = pd.DataFrame({"value": [1.23, 2.35, 3.46, 4.57]}, index=index) - - pd.testing.assert_frame_equal(result, expected) - - def test_mixed_numeric_string(self): - """Test DataFrame with numeric strings - object dtype columns are not converted.""" - from scitex.pd import round - - df = pd.DataFrame( - {"A": ["1.234", "2.345", "3.456"], "B": [1.234, 2.345, 3.456]} - ) - - result = round(df, factor=2) - # Object dtype columns with strings are NOT converted - returned unchanged - # Only proper float columns are rounded - expected = pd.DataFrame( - {"A": ["1.234", "2.345", "3.456"], "B": [1.23, 2.35, 3.46]} - ) - - pd.testing.assert_frame_equal(result, expected) - - def test_none_values(self): - """Test handling of None values - NaN is preserved, other values are rounded.""" - from scitex.pd import round - - df = pd.DataFrame( - { - "A": [1.234, None, 3.456], - "B": [None, 2.345, None], - "C": [1.234, 2.345, 3.456], # No None - } - ) - - result = round(df, factor=2) - # None converts to NaN which is preserved, other values are rounded - expected = pd.DataFrame( - { - "A": [1.23, np.nan, 3.46], # Rounded, NaN preserved - "B": [np.nan, 2.35, np.nan], # Rounded, NaN preserved - "C": [1.23, 2.35, 3.46], # Rounded correctly - } - ) - - pd.testing.assert_frame_equal(result, expected) - - def test_object_dtype_with_numbers(self): - """Test object dtype columns - object dtype columns are returned unchanged.""" - from scitex.pd import round - - df = pd.DataFrame( - { - "A": pd.Series([1.234, 2.345, 3.456], dtype="object"), - "B": pd.Series(["a", "b", "c"], dtype="object"), - } - ) - - result = round(df, factor=2) - # Object dtype columns are returned unchanged (even if they contain numbers) - expected = pd.DataFrame( - { - "A": pd.Series([1.234, 2.345, 3.456], dtype="object"), - "B": pd.Series(["a", "b", "c"], dtype="object"), - } - ) - - pd.testing.assert_frame_equal(result, expected) - - def test_series_like_behavior(self): - """Test that function preserves column order and names.""" - from scitex.pd import round - - df = pd.DataFrame({"Z": [1.234], "A": [2.345], "M": [3.456]}) - - result = round(df, factor=2) - - assert list(result.columns) == ["Z", "A", "M"] - assert result["Z"][0] == 1.23 - assert result["A"][0] == 2.35 - assert result["M"][0] == 3.46 - - def test_large_dataframe_performance(self): - """Test performance with large DataFrame.""" - from scitex.pd import round - - # Create large DataFrame - np.random.seed(42) - df = pd.DataFrame(np.random.randn(1000, 10)) - - result = round(df, factor=3) - - # Check shape is preserved - assert result.shape == df.shape - - # Spot check some values are rounded correctly - assert abs(result.iloc[0, 0] - np.round(df.iloc[0, 0], 3)) < 1e-10 - - def test_factor_one(self): - """Test rounding with factor=1.""" - from scitex.pd import round - - df = pd.DataFrame({"A": [1.234, 2.567, 3.891]}) - - result = round(df, factor=1) - expected = pd.DataFrame({"A": [1.2, 2.6, 3.9]}) - - pd.testing.assert_frame_equal(result, expected) - - def test_complex_mixed_data(self): - """Test complex DataFrame with various types.""" - from scitex.pd import round - - df = pd.DataFrame( - { - "floats": [1.23456, 2.34567, np.nan], - "floats_no_nan": [1.23456, 2.34567, 3.45678], - "ints": [1, 2, 3], - "strings": ["a", "b", "c"], - "bools": [True, False, True], - "mixed": [1.234, "text", None], - } - ) - - result = round(df, factor=2) - expected = pd.DataFrame( - { - "floats": [1.23, 2.35, np.nan], # Rounded, NaN preserved - "floats_no_nan": [1.23, 2.35, 3.46], # Rounded correctly - "ints": [1, 2, 3], - "strings": ["a", "b", "c"], - "bools": [1, 0, 1], # Booleans are converted to int by pd.to_numeric - "mixed": [ - 1.234, - "text", - None, - ], # Object dtype with mixed types - returned unchanged - } - ) - - pd.testing.assert_frame_equal(result, expected) - - def test_edge_case_rounding(self): - """Test edge cases in rounding (0.5 cases).""" - from scitex.pd import round - - df = pd.DataFrame({"A": [1.125, 2.225, 3.335, 4.445, 5.555]}) - - result = round(df, factor=2) - # Python uses banker's rounding (round to even) - expected = pd.DataFrame({"A": [1.12, 2.22, 3.34, 4.44, 5.56]}) - - pd.testing.assert_frame_equal(result, expected) - - def test_preserve_index(self): - """Test that DataFrame index is preserved.""" - from scitex.pd import round - - df = pd.DataFrame({"A": [1.234, 2.345, 3.456]}, index=["x", "y", "z"]) - - result = round(df, factor=2) - - assert list(result.index) == ["x", "y", "z"] - assert result.loc["x", "A"] == 1.23 - - def test_column_specific_behavior(self): - """Test that rounding is applied column-wise.""" - from scitex.pd import round - - df = pd.DataFrame( - {"precise": [1.123456789, 2.234567890], "rough": [100.1, 200.2]} - ) - - result = round(df, factor=4) - expected = pd.DataFrame({"precise": [1.1235, 2.2346], "rough": [100.1, 200.2]}) - - pd.testing.assert_frame_equal(result, expected) - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_round.py -# -------------------------------------------------------------------------------- -# #!/usr/bin/env python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-10-06 11:13:00 (ywatanabe)" -# # /home/ywatanabe/proj/_scitex_repo_openhands/src/scitex/pd/_round.py -# -# import numpy as np -# import pandas as pd -# -# -# def round(df: pd.DataFrame, factor: int = 3) -> pd.DataFrame: -# """ -# Round numeric values in a DataFrame to a specified number of decimal places. -# -# Example -# ------- -# >>> df = pd.DataFrame({'A': [1.23456, 2.34567], 'B': ['abc', 'def'], 'C': [3, 4]}) -# >>> round(df, 2) -# A B C -# 0 1.23 abc 3 -# 1 2.35 def 4 -# -# Parameters -# ---------- -# df : pd.DataFrame -# Input DataFrame -# factor : int, optional -# Number of decimal places to round to (default is 3) -# -# Returns -# ------- -# pd.DataFrame -# DataFrame with rounded numeric values -# """ -# -# def custom_round(column): -# # Skip non-numeric types like datetime, categorical, string -# if pd.api.types.is_datetime64_any_dtype(column): -# return column -# if pd.api.types.is_categorical_dtype(column): -# return column -# if pd.api.types.is_string_dtype(column): -# return column -# # Note: boolean types are allowed to be converted to numeric -# if ( -# pd.api.types.is_object_dtype(column) -# and not pd.api.types.is_numeric_dtype(column) -# and not pd.api.types.is_bool_dtype(column) -# ): -# return column -# -# try: -# # Handle boolean columns explicitly -# if pd.api.types.is_bool_dtype(column): -# return column.astype(int) -# -# numeric_column = pd.to_numeric(column, errors="coerce") -# if np.issubdtype(numeric_column.dtype, np.integer): -# return numeric_column.astype(int) -# -# # For float columns, round first -# rounded = numeric_column.round(factor) -# -# # If factor is 0 and all values are whole numbers, convert to int -# if factor == 0 and (rounded % 1 == 0).all() and not rounded.isna().any(): -# return rounded.astype(int) -# -# return rounded -# -# except (ValueError, TypeError): -# return column -# -# return df.apply(custom_round) -# -# -# #!/usr/bin/env python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-10-05 20:40:32 (ywatanabe)" -# # /home/ywatanabe/proj/_scitex_repo_openhands/src/scitex/pd/_round.py -# -# # import numpy as np -# -# # def round(df, factor=3): -# # return df.apply(lambda x: x.round(factor) if np.issubdtype(x.dtype, np.number) else x) -# -# -# # def round(df, factor=3): -# # def custom_round(x): -# # try: -# # numeric_x = pd.to_numeric(x, errors='raise') -# # if np.issubdtype(numeric_x.dtype, np.integer): -# # return numeric_x -# # else: -# # return numeric_x.apply(lambda y: float(f'{y:.{factor}g}')) -# # except (ValueError, TypeError): -# # return x -# -# # return df.apply(custom_round) - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_round.py -# -------------------------------------------------------------------------------- diff --git a/tests/scitex/pd/test__slice.py b/tests/scitex/pd/test__slice.py deleted file mode 100644 index e445218af..000000000 --- a/tests/scitex/pd/test__slice.py +++ /dev/null @@ -1,502 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Time-stamp: "2024-11-05 08:00:00 (ywatanabe)" -# File: ./tests/scitex/pd/test__slice.py - -import builtins -import os -import sys -import tempfile -from unittest.mock import MagicMock, Mock, patch - -import numpy as np -import pandas as pd -import pytest - - -class TestSliceBasic: - """Test basic functionality of slice function.""" - - def test_slice_by_indices(self): - """Test slicing DataFrame by row indices using slice object.""" - from scitex.pd import slice - - df = pd.DataFrame({"A": [1, 2, 3, 4, 5], "B": ["a", "b", "c", "d", "e"]}) - - result = slice(df, builtins.slice(1, 4)) - - assert len(result) == 3 - assert result["A"].tolist() == [2, 3, 4] - assert result["B"].tolist() == ["b", "c", "d"] - assert result.index.tolist() == [1, 2, 3] - - def test_slice_from_start(self): - """Test slicing from start of DataFrame.""" - from scitex.pd import slice - - df = pd.DataFrame({"A": [10, 20, 30, 40], "B": [100, 200, 300, 400]}) - - result = slice(df, builtins.slice(None, 2)) - - assert len(result) == 2 - assert result["A"].tolist() == [10, 20] - assert result["B"].tolist() == [100, 200] - - def test_slice_to_end(self): - """Test slicing to end of DataFrame.""" - from scitex.pd import slice - - df = pd.DataFrame({"A": [1, 2, 3, 4, 5], "B": [10, 20, 30, 40, 50]}) - - result = slice(df, builtins.slice(3, None)) - - assert len(result) == 2 - assert result["A"].tolist() == [4, 5] - assert result["B"].tolist() == [40, 50] - - def test_slice_with_step(self): - """Test slicing with step parameter.""" - from scitex.pd import slice - - df = pd.DataFrame({"A": list(range(10)), "B": list(range(10, 20))}) - - result = slice(df, builtins.slice(0, 10, 2)) - - assert len(result) == 5 - assert result["A"].tolist() == [0, 2, 4, 6, 8] - assert result["B"].tolist() == [10, 12, 14, 16, 18] - - -class TestSliceByConditions: - """Test slicing by conditions using dictionary.""" - - def test_single_condition(self): - """Test slicing with single condition.""" - from scitex.pd import slice - - df = pd.DataFrame({"A": [1, 2, 3, 2, 1], "B": ["x", "y", "z", "y", "x"]}) - - result = slice(df, {"A": 2}) - - assert len(result) == 2 - assert result["A"].tolist() == [2, 2] - assert result["B"].tolist() == ["y", "y"] - assert result.index.tolist() == [1, 3] - - def test_multiple_conditions(self): - """Test slicing with multiple conditions.""" - from scitex.pd import slice - - df = pd.DataFrame( - { - "A": [1, 1, 2, 2, 3], - "B": ["x", "y", "x", "y", "x"], - "C": [10, 20, 30, 40, 50], - } - ) - - result = slice(df, {"A": 2, "B": "x"}) - - assert len(result) == 1 - assert result["A"].tolist() == [2] - assert result["B"].tolist() == ["x"] - assert result["C"].tolist() == [30] - - def test_list_condition(self): - """Test slicing with list values in conditions.""" - from scitex.pd import slice - - df = pd.DataFrame({"A": [1, 2, 3, 4, 5], "B": ["a", "b", "c", "d", "e"]}) - - result = slice(df, {"A": [2, 4, 5]}) - - assert len(result) == 3 - assert result["A"].tolist() == [2, 4, 5] - assert result["B"].tolist() == ["b", "d", "e"] - - -class TestColumnSlicing: - """Test column selection functionality.""" - - def test_select_single_column(self): - """Test selecting single column.""" - from scitex.pd import slice - - df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) - - result = slice(df, columns=["B"]) - - assert list(result.columns) == ["B"] - assert result["B"].tolist() == [4, 5, 6] - - def test_select_multiple_columns(self): - """Test selecting multiple columns.""" - from scitex.pd import slice - - df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6], "D": [7, 8]}) - - result = slice(df, columns=["A", "C", "D"]) - - assert list(result.columns) == ["A", "C", "D"] - assert "B" not in result.columns - - def test_reorder_columns(self): - """Test that column order follows the specified list.""" - from scitex.pd import slice - - df = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) - - result = slice(df, columns=["C", "A", "B"]) - - assert list(result.columns) == ["C", "A", "B"] - - -class TestCombinedSlicing: - """Test combining row and column slicing.""" - - def test_slice_rows_and_columns(self): - """Test slicing both rows and columns.""" - from scitex.pd import slice - - df = pd.DataFrame( - { - "A": [1, 2, 3, 4, 5], - "B": [10, 20, 30, 40, 50], - "C": ["a", "b", "c", "d", "e"], - "D": [100, 200, 300, 400, 500], - } - ) - - result = slice(df, builtins.slice(1, 4), columns=["B", "C"]) - - assert len(result) == 3 - assert list(result.columns) == ["B", "C"] - assert result["B"].tolist() == [20, 30, 40] - assert result["C"].tolist() == ["b", "c", "d"] - - def test_conditions_and_columns(self): - """Test using conditions and column selection together.""" - from scitex.pd import slice - - df = pd.DataFrame( - { - "category": ["A", "B", "A", "B", "A"], - "value": [10, 20, 30, 40, 50], - "extra1": [1, 2, 3, 4, 5], - "extra2": [6, 7, 8, 9, 10], - } - ) - - result = slice(df, {"category": "A"}, columns=["category", "value"]) - - assert len(result) == 3 - assert list(result.columns) == ["category", "value"] - assert result["value"].tolist() == [10, 30, 50] - - -class TestEdgeCases: - """Test edge cases and error handling.""" - - def test_empty_dataframe(self): - """Test slicing empty DataFrame.""" - from scitex.pd import slice - - df = pd.DataFrame() - result = slice(df, builtins.slice(0, 10)) - - assert result.empty - assert isinstance(result, pd.DataFrame) - - def test_no_conditions(self): - """Test with no slicing conditions.""" - from scitex.pd import slice - - df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - result = slice(df) - - pd.testing.assert_frame_equal(result, df) - - def test_no_matching_conditions(self): - """Test when conditions match no rows.""" - from scitex.pd import slice - - df = pd.DataFrame({"A": [1, 2, 3], "B": ["x", "y", "z"]}) - - result = slice(df, {"A": 999}) - - assert len(result) == 0 - assert list(result.columns) == ["A", "B"] - - def test_out_of_bounds_slice(self): - """Test slice indices beyond DataFrame bounds.""" - from scitex.pd import slice - - df = pd.DataFrame({"A": [1, 2, 3]}) - - # Slice beyond bounds should work without error - result = slice(df, builtins.slice(10, 20)) - assert len(result) == 0 - - result = slice(df, builtins.slice(-10, -5)) - assert len(result) == 0 - - def test_negative_slice_indices(self): - """Test negative indices in slice.""" - from scitex.pd import slice - - df = pd.DataFrame({"A": [1, 2, 3, 4, 5], "B": ["a", "b", "c", "d", "e"]}) - - result = slice(df, builtins.slice(-3, -1)) - - assert len(result) == 2 - assert result["A"].tolist() == [3, 4] - assert result["B"].tolist() == ["c", "d"] - - -class TestDataTypes: - """Test with various data types.""" - - def test_mixed_dtypes(self): - """Test slicing DataFrame with mixed data types.""" - from scitex.pd import slice - - df = pd.DataFrame( - { - "int": [1, 2, 3, 4], - "float": [1.1, 2.2, 3.3, 4.4], - "str": ["a", "b", "c", "d"], - "bool": [True, False, True, False], - "date": pd.date_range("2023-01-01", periods=4), - } - ) - - result = slice(df, builtins.slice(1, 3)) - - assert len(result) == 2 - assert result["int"].tolist() == [2, 3] - assert result["float"].tolist() == [2.2, 3.3] - assert result["bool"].tolist() == [False, True] - - def test_nan_values(self): - """Test slicing with NaN values.""" - from scitex.pd import slice - - df = pd.DataFrame( - {"A": [1, np.nan, 3, np.nan, 5], "B": ["a", "b", np.nan, "d", "e"]} - ) - - # Slice should preserve NaN values - result = slice(df, builtins.slice(1, 4)) - - assert len(result) == 3 - assert pd.isna(result["A"].iloc[0]) - assert result["A"].iloc[1] == 3 - assert pd.isna(result["A"].iloc[2]) - - -class TestIndexPreservation: - """Test DataFrame index handling.""" - - def test_custom_index_preservation(self): - """Test that custom index is preserved.""" - from scitex.pd import slice - - df = pd.DataFrame({"A": [1, 2, 3, 4]}, index=["w", "x", "y", "z"]) - - result = slice(df, builtins.slice(1, 3)) - - assert list(result.index) == ["x", "y"] - assert result.loc["x", "A"] == 2 - assert result.loc["y", "A"] == 3 - - def test_multiindex(self): - """Test slicing with MultiIndex.""" - from scitex.pd import slice - - arrays = [["A", "A", "B", "B"], [1, 2, 1, 2]] - index = pd.MultiIndex.from_arrays(arrays) - df = pd.DataFrame({"value": [10, 20, 30, 40]}, index=index) - - result = slice(df, builtins.slice(1, 3)) - - assert len(result) == 2 - assert result["value"].tolist() == [20, 30] - - -class TestRealWorldScenarios: - """Test real-world usage scenarios.""" - - def test_data_filtering_workflow(self): - """Test typical data filtering workflow.""" - from scitex.pd import slice - - # Sample sales data - df = pd.DataFrame( - { - "date": pd.date_range("2023-01-01", periods=10), - "product": ["A", "B", "A", "C", "B", "A", "C", "B", "A", "C"], - "quantity": [10, 20, 15, 5, 25, 30, 10, 35, 20, 15], - "revenue": [100, 400, 150, 75, 500, 300, 150, 700, 200, 225], - } - ) - - # Filter for product A with revenue > 100 - result = slice(df, {"product": "A"}, columns=["date", "product", "revenue"]) - result = result[result["revenue"] > 100] - - assert len(result) == 3 - assert all(result["product"] == "A") - assert all(result["revenue"] > 100) - - def test_time_series_window(self): - """Test extracting time series window.""" - from scitex.pd import slice - - df = pd.DataFrame( - { - "timestamp": pd.date_range("2023-01-01", periods=100, freq="H"), - "value": np.random.randn(100), - } - ) - - # Get specific time window - result = slice(df, builtins.slice(24, 48)) # Hours 24-47 - - assert len(result) == 24 - assert result["timestamp"].iloc[0].hour == 0 # Next day start - assert result["timestamp"].iloc[0].day == 2 - - -class TestDocstringExamples: - """Test examples from the docstring.""" - - def test_docstring_slice_example(self): - """Test slice example from docstring.""" - from scitex.pd import slice - - df = pd.DataFrame({"A": [1, 2, 3], "B": ["x", "y", "x"]}) - - # Slice by row indices - result = slice(df, builtins.slice(0, 2)) - assert len(result) == 2 - assert result["A"].tolist() == [1, 2] - assert result["B"].tolist() == ["x", "y"] - - def test_docstring_conditions_example(self): - """Test conditions example from docstring.""" - from scitex.pd import slice - - df = pd.DataFrame({"A": [1, 2, 3], "B": ["x", "y", "x"]}) - - # Slice by conditions - result = slice(df, {"A": [1, 2], "B": "x"}) - assert len(result) == 1 - assert result["A"].tolist() == [1] - assert result["B"].tolist() == ["x"] - - def test_docstring_columns_example(self): - """Test columns example from docstring.""" - from scitex.pd import slice - - df = pd.DataFrame({"A": [1, 2, 3], "B": ["x", "y", "x"]}) - - # Slice columns - result = slice(df, columns=["A"]) - assert list(result.columns) == ["A"] - assert len(result) == 3 # All rows preserved - - -class TestCopyBehavior: - """Test that slice returns a copy, not a view.""" - - def test_returns_copy(self): - """Test that modifications to result don't affect original.""" - from scitex.pd import slice - - df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) - - result = slice(df, builtins.slice(0, 2)) - result["A"] = [99, 98] - - # Original should be unchanged - assert df["A"].tolist() == [1, 2, 3] - assert result["A"].tolist() == [99, 98] - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_slice.py -# -------------------------------------------------------------------------------- -# #!/usr/bin/env python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-11-05 07:45:00 (ywatanabe)" -# # File: ./scitex_repo/src/scitex/pd/_slice.py -# -# from typing import Dict, Union, List, Optional -# import builtins -# -# import pandas as pd -# -# from ._find_indi import find_indi -# -# -# def slice( -# df: pd.DataFrame, -# conditions: Union[ -# builtins.slice, Dict[str, Union[str, int, float, List]], None -# ] = None, -# columns: Optional[List[str]] = None, -# ) -> pd.DataFrame: -# """Slices DataFrame rows and/or columns. -# -# Example -# ------- -# >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': ['x', 'y', 'x']}) -# >>> # Slice by row indices -# >>> result = slice(df, slice(0, 2)) -# >>> # Slice by conditions -# >>> result = slice(df, {'A': [1, 2], 'B': 'x'}) -# >>> # Slice columns -# >>> result = slice(df, columns=['A']) -# -# Parameters -# ---------- -# df : pd.DataFrame -# Input DataFrame to slice -# conditions : slice, Dict, or None -# Either a slice object for row indices, or a dictionary of column conditions -# columns : List[str], optional -# List of column names to select -# -# Returns -# ------- -# pd.DataFrame -# Sliced DataFrame -# """ -# result = df.copy() -# -# # Handle row slicing -# if isinstance(conditions, builtins.slice): -# result = result.iloc[conditions] -# elif isinstance(conditions, dict): -# indices = find_indi(result, conditions) -# result = result.loc[indices] -# -# # Handle column slicing -# if columns is not None: -# result = result[columns] -# -# return result -# -# -# # EOF - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_slice.py -# -------------------------------------------------------------------------------- diff --git a/tests/scitex/pd/test__sort.py b/tests/scitex/pd/test__sort.py deleted file mode 100644 index 0bf2b8f5f..000000000 --- a/tests/scitex/pd/test__sort.py +++ /dev/null @@ -1,339 +0,0 @@ -#!/usr/bin/env python3 -# Timestamp: "2025-06-01 19:55:00 (ywatanabe)" -# File: ./tests/scitex/pd/test__sort.py - -""" -Test module for scitex.pd.sort function. -""" - -import numpy as np -import pandas as pd -import pytest -from pandas.testing import assert_frame_equal - - -class TestSort: - """Test class for sort function.""" - - @pytest.fixture - def sample_df(self): - """Create a sample DataFrame for testing.""" - return pd.DataFrame( - { - "A": ["foo", "bar", "baz", "qux"], - "B": [3, 1, 4, 2], - "C": [2.5, 1.2, 3.8, np.nan], - } - ) - - @pytest.fixture - def df_with_nulls(self): - """Create a DataFrame with null values.""" - return pd.DataFrame({"A": ["a", None, "c", "b"], "B": [1, 2, np.nan, 4]}) - - def test_import(self): - """Test that sort can be imported.""" - from scitex.pd import sort - - assert callable(sort) - - def test_basic_sort_by_column(self, sample_df): - """Test basic sorting by a single column.""" - from scitex.pd import sort - - # Sort by column B - result = sort(sample_df, by="B") - assert list(result["B"]) == [1, 2, 3, 4] - assert list(result["A"]) == ["bar", "qux", "foo", "baz"] - - def test_sort_descending(self, sample_df): - """Test sorting in descending order.""" - from scitex.pd import sort - - result = sort(sample_df, by="B", ascending=False) - assert list(result["B"]) == [4, 3, 2, 1] - assert list(result["A"]) == ["baz", "foo", "qux", "bar"] - - def test_sort_multiple_columns(self): - """Test sorting by multiple columns.""" - from scitex.pd import sort - - df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [4, 3, 2, 1]}) - - result = sort(df, by=["A", "B"]) - assert list(result["B"]) == [3, 4, 1, 2] - - def test_sort_with_mixed_ascending(self): - """Test sorting with mixed ascending/descending orders.""" - from scitex.pd import sort - - df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [3, 4, 1, 2]}) - - result = sort(df, by=["A", "B"], ascending=[True, False]) - assert list(result["B"]) == [4, 3, 2, 1] - - def test_sort_with_na_position(self, df_with_nulls): - """Test sorting with NaN position control.""" - from scitex.pd import sort - - # NaN last (default) - result = sort(df_with_nulls, by="B", na_position="last") - assert pd.isna(result["B"].iloc[-1]) - - # NaN first - result = sort(df_with_nulls, by="B", na_position="first") - assert pd.isna(result["B"].iloc[0]) - - def test_sort_ignore_index(self, sample_df): - """Test sorting with index reset.""" - from scitex.pd import sort - - # First, set a custom index - sample_df.index = [10, 20, 30, 40] - - # Sort without ignore_index - result = sort(sample_df, by="B", ignore_index=False) - assert list(result.index) == [20, 40, 10, 30] - - # Sort with ignore_index - result = sort(sample_df, by="B", ignore_index=True) - assert list(result.index) == [0, 1, 2, 3] - - def test_sort_with_custom_orders(self): - """Test sorting with custom category orders.""" - from scitex.pd import sort - - df = pd.DataFrame( - {"A": ["small", "medium", "large", "small", "large"], "B": [1, 2, 3, 4, 5]} - ) - - custom_order = {"A": ["small", "medium", "large"]} - result = sort(df, orders=custom_order) - - # Check that 'small' comes before 'medium' and 'medium' before 'large' - a_values = result["A"].tolist() - assert a_values == ["small", "small", "medium", "large", "large"] - - def test_sort_with_multiple_custom_orders(self): - """Test sorting with custom orders for multiple columns.""" - from scitex.pd import sort - - df = pd.DataFrame( - { - "Size": ["L", "S", "M", "L", "S"], - "Priority": ["high", "low", "medium", "low", "high"], - } - ) - - custom_order = {"Size": ["S", "M", "L"], "Priority": ["low", "medium", "high"]} - result = sort(df, orders=custom_order) - - # First 2 should be 'S', last 2 should be 'L' - assert list(result["Size"][:2]) == ["S", "S"] - assert list(result["Size"][-2:]) == ["L", "L"] - - def test_sort_inplace(self, sample_df): - """Test in-place sorting - returns same object but update doesn't reorder rows.""" - from scitex.pd import sort - - original_id = id(sample_df) - result = sort(sample_df, by="B", inplace=True) - - # Should return the same object reference - assert id(result) == original_id - # Note: The inplace implementation uses update() which doesn't reorder rows, - # so the original order is preserved (this is a limitation of the implementation) - assert list(result["B"]) == [3, 1, 4, 2] # Original order - - def test_column_reordering(self, sample_df): - """Test that sorted columns are moved to the front.""" - from scitex.pd import sort - - result = sort(sample_df, by="B") - assert list(result.columns) == ["B", "A", "C"] - - # Multiple columns - result = sort(sample_df, by=["C", "B"]) - assert list(result.columns) == ["C", "B", "A"] - - def test_sort_with_key_function(self): - """Test sorting with a key function.""" - from scitex.pd import sort - - df = pd.DataFrame( - {"A": ["apple", "Banana", "cherry", "Date"], "B": [1, 2, 3, 4]} - ) - - # Sort case-insensitive - result = sort(df, by="A", key=lambda x: x.str.lower()) - assert list(result["A"]) == ["apple", "Banana", "cherry", "Date"] - - def test_different_sort_algorithms(self, sample_df): - """Test different sorting algorithms.""" - from scitex.pd import sort - - for algorithm in ["quicksort", "mergesort", "heapsort", "stable"]: - result = sort(sample_df, by="B", kind=algorithm) - assert list(result["B"]) == [1, 2, 3, 4] - - def test_empty_dataframe(self): - """Test sorting an empty DataFrame with columns.""" - from scitex.pd import sort - - # Empty DataFrame without columns cannot be sorted (no `by` parameter) - # Empty with columns can be sorted - df = pd.DataFrame(columns=["A", "B"]) - result = sort(df, by="A") - assert result.empty - assert list(result.columns) == ["A", "B"] - - def test_single_row_dataframe(self): - """Test sorting a single-row DataFrame.""" - from scitex.pd import sort - - df = pd.DataFrame({"A": [1], "B": [2]}) - result = sort(df, by="A") - assert_frame_equal(result, df) - - def test_sort_no_by_parameter(self, sample_df): - """Test sorting without specifying 'by' parameter.""" - from scitex.pd import sort - - # Should use all columns when orders is provided - orders = {"A": ["bar", "baz", "foo", "qux"]} - result = sort(sample_df, orders=orders) - assert list(result["A"]) == ["bar", "baz", "foo", "qux"] - - def test_error_handling(self): - """Test error handling for invalid inputs.""" - from scitex.pd import sort - - df = pd.DataFrame({"A": [1, 2, 3]}) - - # Non-existent column - with pytest.raises(KeyError): - sort(df, by="NonExistent") - - @pytest.mark.parametrize( - "input_type,expected_error", - [ - ([1, 2, 3], AttributeError), # List instead of DataFrame - ("not a dataframe", AttributeError), # String - (123, AttributeError), # Integer - ], - ) - def test_invalid_input_types(self, input_type, expected_error): - """Test that invalid input types raise appropriate errors.""" - from scitex.pd import sort - - with pytest.raises(expected_error): - sort(input_type, by="A") - - -# -------------------------------------------------------------------------------- - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_sort.py -# -------------------------------------------------------------------------------- -# #!./env/bin/python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-08-25 09:35:39 (ywatanabe)" -# # ./src/scitex/pd/_sort.py -# -# import pandas as pd -# -# -# def sort( -# dataframe, -# by=None, -# ascending=True, -# inplace=False, -# kind="quicksort", -# na_position="last", -# ignore_index=False, -# key=None, -# orders=None, -# ): -# """ -# Sort DataFrame by specified column(s) with optional custom ordering and column reordering. -# -# Example -# ------- -# import pandas as pd -# df = pd.DataFrame({'A': ['foo', 'bar', 'baz'], 'B': [3, 2, 1]}) -# custom_order = {'A': ['bar', 'baz', 'foo']} -# sorted_df = sort(df, by=None, orders=custom_order) -# print(sorted_df) -# -# Parameters -# ---------- -# dataframe : pandas.DataFrame -# The DataFrame to sort. -# by : str or list of str, optional -# Name(s) of column(s) to sort by. -# ascending : bool or list of bool, default True -# Sort ascending vs. descending. -# inplace : bool, default False -# If True, perform operation in-place. -# kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' -# Choice of sorting algorithm. -# na_position : {'first', 'last'}, default 'last' -# Puts NaNs at the beginning if 'first'; 'last' puts NaNs at the end. -# ignore_index : bool, default False -# If True, the resulting axis will be labeled 0, 1, …, n - 1. -# key : callable, optional -# Apply the key function to the values before sorting. -# orders : dict, optional -# Dictionary of column names and their custom sort orders. -# -# Returns -# ------- -# pandas.DataFrame -# Sorted DataFrame with reordered columns. -# """ -# if orders: -# by = [by] if isinstance(by, str) else list(orders.keys()) if by is None else by -# -# def apply_custom_order(column): -# return ( -# pd.Categorical(column, categories=orders[column.name], ordered=True) -# if column.name in orders -# else column -# ) -# -# key = apply_custom_order -# elif isinstance(by, str): -# by = [by] -# -# sorted_df = dataframe.sort_values( -# by=by, -# ascending=ascending, -# inplace=False, -# kind=kind, -# na_position=na_position, -# ignore_index=ignore_index, -# key=key, -# ) -# -# # Reorder columns -# if by: -# other_columns = [col for col in sorted_df.columns if col not in by] -# sorted_df = sorted_df[by + other_columns] -# -# if inplace: -# dataframe.update(sorted_df) -# dataframe.reindex(columns=sorted_df.columns) -# return dataframe -# else: -# return sorted_df - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_sort.py -# -------------------------------------------------------------------------------- diff --git a/tests/scitex/pd/test__to_numeric.py b/tests/scitex/pd/test__to_numeric.py deleted file mode 100644 index 03171cd97..000000000 --- a/tests/scitex/pd/test__to_numeric.py +++ /dev/null @@ -1,327 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -# Timestamp: "2025-06-01 20:00:00 (ywatanabe)" -# File: ./tests/scitex/pd/test__to_numeric.py - -""" -Test module for scitex.pd.to_numeric function. -""" - -import numpy as np -import pandas as pd -import pytest -from pandas.testing import assert_frame_equal, assert_series_equal - - -class TestToNumeric: - """Test class for to_numeric function.""" - - @pytest.fixture - def mixed_df(self): - """Create a DataFrame with mixed types.""" - return pd.DataFrame( - { - "int_str": ["1", "2", "3", "4"], - "float_str": ["1.5", "2.5", "3.5", "4.5"], - "mixed": ["1", "2.5", "three", "4"], - "pure_str": ["a", "b", "c", "d"], - "already_int": [1, 2, 3, 4], - "already_float": [1.1, 2.2, 3.3, 4.4], - "with_nan": ["1", "2", np.nan, "4"], - } - ) - - @pytest.fixture - def datetime_df(self): - """Create a DataFrame with datetime strings.""" - return pd.DataFrame( - { - "dates": ["2021-01-01", "2021-01-02", "2021-01-03"], - "times": ["10:30:00", "11:45:00", "12:00:00"], - "numbers": ["100", "200", "300"], - } - ) - - def test_import(self): - """Test that to_numeric can be imported.""" - from scitex.pd import to_numeric - - assert callable(to_numeric) - - def test_basic_conversion(self, mixed_df): - """Test basic numeric conversion with coerce.""" - from scitex.pd import to_numeric - - result = to_numeric(mixed_df) - - # Check conversions - assert result["int_str"].dtype in [np.int64, np.float64] - assert result["float_str"].dtype == np.float64 - assert result["already_int"].dtype in [np.int64, np.float64] - assert result["already_float"].dtype == np.float64 - - # Check values - assert list(result["int_str"]) == [1, 2, 3, 4] - assert list(result["float_str"]) == [1.5, 2.5, 3.5, 4.5] - - def test_coerce_mode(self, mixed_df): - """Test coerce mode converts invalid values to NaN.""" - from scitex.pd import to_numeric - - result = to_numeric(mixed_df, errors="coerce") - - # Mixed column should have NaN for 'three' - assert pd.isna(result["mixed"].iloc[2]) - assert result["mixed"].iloc[0] == 1 - assert result["mixed"].iloc[1] == 2.5 - assert result["mixed"].iloc[3] == 4 - - # Pure string column should be all NaN - assert result["pure_str"].isna().all() - - def test_ignore_mode(self, mixed_df): - """Test ignore mode leaves non-numeric columns unchanged.""" - from scitex.pd import to_numeric - - result = to_numeric(mixed_df, errors="ignore") - - # Numeric strings should be converted - assert result["int_str"].dtype in [np.int64, np.float64] - assert result["float_str"].dtype == np.float64 - - # Pure string column should remain unchanged - assert result["pure_str"].dtype == object - assert list(result["pure_str"]) == ["a", "b", "c", "d"] - - # Mixed column should remain unchanged (has non-numeric values) - assert result["mixed"].dtype == object - assert list(result["mixed"]) == ["1", "2.5", "three", "4"] - - def test_raise_mode(self): - """Test raise mode raises exception on invalid conversion.""" - from scitex.pd import to_numeric - - df = pd.DataFrame({"valid": ["1", "2", "3"], "invalid": ["1", "two", "3"]}) - - # Should raise ValueError for invalid column - with pytest.raises(ValueError): - to_numeric(df, errors="raise") - - def test_with_nan_values(self, mixed_df): - """Test handling of NaN values.""" - from scitex.pd import to_numeric - - result = to_numeric(mixed_df) - - # with_nan column should preserve NaN - assert pd.isna(result["with_nan"].iloc[2]) - assert result["with_nan"].iloc[0] == 1 - assert result["with_nan"].iloc[1] == 2 - assert result["with_nan"].iloc[3] == 4 - - def test_empty_dataframe(self): - """Test with empty DataFrame.""" - from scitex.pd import to_numeric - - df = pd.DataFrame() - result = to_numeric(df) - assert result.empty - - # Empty with columns - df = pd.DataFrame(columns=["A", "B"]) - result = to_numeric(df) - assert list(result.columns) == ["A", "B"] - assert result.empty - - def test_single_column_dataframe(self): - """Test with single column DataFrame.""" - from scitex.pd import to_numeric - - df = pd.DataFrame({"A": ["1", "2", "3"]}) - result = to_numeric(df) - assert result["A"].dtype in [np.int64, np.float64] - assert list(result["A"]) == [1, 2, 3] - - def test_scientific_notation(self): - """Test conversion of scientific notation strings.""" - from scitex.pd import to_numeric - - df = pd.DataFrame( - {"sci": ["1e3", "2.5e-2", "3E+4"], "normal": ["1000", "0.025", "30000"]} - ) - - result = to_numeric(df) - assert result["sci"].iloc[0] == 1000 - assert result["sci"].iloc[1] == 0.025 - assert result["sci"].iloc[2] == 30000 - - def test_boolean_strings(self): - """Test conversion of boolean-like strings.""" - from scitex.pd import to_numeric - - df = pd.DataFrame( - {"bool_str": ["True", "False", "True"], "bool_num": ["1", "0", "1"]} - ) - - result = to_numeric(df, errors="coerce") - # 'True'/'False' strings should become NaN with coerce - assert result["bool_str"].isna().all() - # '1'/'0' should convert to numbers - assert list(result["bool_num"]) == [1, 0, 1] - - def test_preserve_dtypes_when_possible(self): - """Test that already numeric columns preserve their dtypes.""" - from scitex.pd import to_numeric - - df = pd.DataFrame( - { - "int32": pd.array([1, 2, 3], dtype="int32"), - "float32": pd.array([1.1, 2.2, 3.3], dtype="float32"), - "int64": pd.array([1, 2, 3], dtype="int64"), - "float64": pd.array([1.1, 2.2, 3.3], dtype="float64"), - } - ) - - result = to_numeric(df) - # Types might be promoted but should remain numeric - for col in df.columns: - assert pd.api.types.is_numeric_dtype(result[col]) - - def test_whitespace_handling(self): - """Test handling of whitespace in numeric strings.""" - from scitex.pd import to_numeric - - df = pd.DataFrame( - { - "with_spaces": [" 1 ", " 2.5 ", "3", " 4.0"], - "with_tabs": ["\t1\t", "2\t", "\t3", "4\t\t"], - } - ) - - result = to_numeric(df) - assert list(result["with_spaces"]) == [1, 2.5, 3, 4.0] - assert list(result["with_tabs"]) == [1, 2, 3, 4] - - def test_currency_symbols(self): - """Test handling of currency symbols.""" - from scitex.pd import to_numeric - - df = pd.DataFrame( - { - "dollars": ["$100", "$200.50", "$300"], - "pounds": ["£100", "£200.50", "£300"], - } - ) - - # Currency symbols should result in NaN with coerce - result = to_numeric(df, errors="coerce") - assert result["dollars"].isna().all() - assert result["pounds"].isna().all() - - def test_percentage_strings(self): - """Test handling of percentage strings.""" - from scitex.pd import to_numeric - - df = pd.DataFrame( - {"percent": ["10%", "20.5%", "30%"], "decimal": ["0.1", "0.205", "0.3"]} - ) - - result = to_numeric(df, errors="coerce") - # Percentage strings should become NaN - assert result["percent"].isna().all() - # Decimal strings should convert - assert list(result["decimal"]) == [0.1, 0.205, 0.3] - - def test_copy_behavior(self, mixed_df): - """Test that the function returns a copy, not modifying the original.""" - from scitex.pd import to_numeric - - original_values = mixed_df["int_str"].copy() - result = to_numeric(mixed_df) - - # Original should be unchanged - assert mixed_df["int_str"].dtype == object - assert_series_equal(mixed_df["int_str"], original_values) - - # Result should be numeric - assert result["int_str"].dtype in [np.int64, np.float64] - - @pytest.mark.parametrize("errors", ["coerce", "ignore"]) - def test_consistent_behavior(self, errors): - """Test consistent behavior across different error modes.""" - from scitex.pd import to_numeric - - df = pd.DataFrame({"nums": ["1", "2", "3"], "mixed": ["1", "a", "3"]}) - - result = to_numeric(df, errors=errors) - # Nums column should always be converted - assert pd.api.types.is_numeric_dtype(result["nums"]) - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_to_numeric.py -# -------------------------------------------------------------------------------- -# #!/usr/bin/env python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-11-08 04:35:31 (ywatanabe)" -# # File: ./scitex_repo/src/scitex/pd/_to_numeric.py -# -# import pandas as pd -# -# -# def to_numeric(df, errors="coerce"): -# """Convert all possible columns in a DataFrame to numeric types. -# -# Parameters -# ---------- -# df : pd.DataFrame -# Input DataFrame -# errors : str, optional -# How to handle errors. 'coerce' (default) converts invalid values to NaN, -# 'ignore' leaves non-numeric columns unchanged, 'raise' raises exceptions. -# -# Returns -# ------- -# pd.DataFrame -# DataFrame with numeric columns converted -# """ -# df_copy = df.copy() -# for col in df_copy.columns: -# # First try to convert -# original_col = df_copy[col] -# converted_col = pd.to_numeric(df_copy[col], errors="coerce") -# -# # Check if conversion resulted in all NaN when original had values -# if converted_col.isna().all() and not original_col.isna().all(): -# # This is likely a pure string column -# if errors == "ignore": -# # Keep original for pure string columns -# continue -# else: -# # For coerce, still apply it -# df_copy[col] = converted_col -# elif not converted_col.equals(original_col): -# # Conversion changed something -# if errors == "ignore": -# # Only convert if it doesn't introduce new NaNs -# if converted_col.isna().sum() == original_col.isna().sum(): -# df_copy[col] = converted_col -# elif errors == "coerce": -# df_copy[col] = converted_col -# elif errors == "raise": -# df_copy[col] = pd.to_numeric(df_copy[col], errors="raise") -# return df_copy -# -# -# # EOF - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_to_numeric.py -# -------------------------------------------------------------------------------- diff --git a/tests/scitex/pd/test__to_xy.py b/tests/scitex/pd/test__to_xy.py deleted file mode 100644 index 8bfdccb4f..000000000 --- a/tests/scitex/pd/test__to_xy.py +++ /dev/null @@ -1,308 +0,0 @@ -#!/usr/bin/env python3 -# Timestamp: "2025-06-01 20:05:00 (ywatanabe)" -# File: ./tests/scitex/pd/test__to_xy.py - -""" -Test module for scitex.pd.to_xy function. -""" - -import numpy as np -import pandas as pd -import pytest -from pandas.testing import assert_frame_equal - - -class TestToXY: - """Test class for to_xy function.""" - - @pytest.fixture - def square_df(self): - """Create a square DataFrame for testing.""" - data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - return pd.DataFrame(data, index=["A", "B", "C"], columns=["A", "B", "C"]) - - @pytest.fixture - def numeric_index_df(self): - """Create a DataFrame with numeric index and named columns.""" - data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - return pd.DataFrame(data, index=[0, 1, 2], columns=["A", "B", "C"]) - - @pytest.fixture - def numeric_columns_df(self): - """Create a DataFrame with named index and numeric columns.""" - data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - return pd.DataFrame(data, index=["A", "B", "C"], columns=[0, 1, 2]) - - def test_import(self): - """Test that to_xy can be imported.""" - from scitex.pd import to_xy - - assert callable(to_xy) - - def test_basic_conversion(self, square_df): - """Test basic conversion of square DataFrame.""" - from scitex.pd import to_xy - - result = to_xy(square_df) - - # Check shape - assert result.shape == (9, 3) # 3x3 matrix = 9 rows - - # Check columns - assert list(result.columns) == ["x", "y", "z"] - - # Check values - expected_values = [ - ("A", "A", 1), - ("B", "A", 4), - ("C", "A", 7), - ("A", "B", 2), - ("B", "B", 5), - ("C", "B", 8), - ("A", "C", 3), - ("B", "C", 6), - ("C", "C", 9), - ] - - for i, (x, y, z) in enumerate(expected_values): - assert result.iloc[i]["x"] == x - assert result.iloc[i]["y"] == y - assert result.iloc[i]["z"] == z - - def test_numeric_index_replacement(self, numeric_index_df): - """Test behavior when index is numeric and columns are named. - - The source code sets columns = index when index is numeric [0,1,2], - so both become [0, 1, 2]. - """ - from scitex.pd import to_xy - - result = to_xy(numeric_index_df) - - # After replacement, both index and columns become [0, 1, 2] - assert result.shape == (9, 3) - - # x and y values are now 0, 1, 2 (not 'A', 'B', 'C') - unique_x = sorted(result["x"].unique()) - assert unique_x == [0, 1, 2] - - # Check y values - unique_y = sorted(result["y"].unique()) - assert unique_y == [0, 1, 2] - - def test_numeric_columns_replacement(self, numeric_columns_df): - """Test behavior when columns are numeric and index is named. - - The source code sets index = columns when columns is numeric [0,1,2], - so both become [0, 1, 2]. - """ - from scitex.pd import to_xy - - result = to_xy(numeric_columns_df) - - # After replacement, both index and columns become [0, 1, 2] - assert result.shape == (9, 3) - - # x and y values are now 0, 1, 2 (not 'A', 'B', 'C') - unique_y = sorted(result["y"].unique()) - assert unique_y == [0, 1, 2] - - def test_non_square_dataframe(self): - """Test that non-square DataFrame raises assertion error.""" - from scitex.pd import to_xy - - # Create non-square DataFrame - df = pd.DataFrame(np.array([[1, 2], [3, 4], [5, 6]])) - - with pytest.raises(AssertionError): - to_xy(df) - - def test_identity_matrix(self): - """Test conversion of identity matrix.""" - from scitex.pd import to_xy - - # Create identity matrix - data = np.eye(3) - df = pd.DataFrame(data, index=["A", "B", "C"], columns=["A", "B", "C"]) - - result = to_xy(df) - - # Check diagonal values are 1 - diagonal = result[(result["x"] == result["y"])] - assert all(diagonal["z"] == 1.0) - - # Check off-diagonal values are 0 - off_diagonal = result[(result["x"] != result["y"])] - assert all(off_diagonal["z"] == 0.0) - - def test_single_element_dataframe(self): - """Test conversion of 1x1 DataFrame.""" - from scitex.pd import to_xy - - df = pd.DataFrame([[42]], index=["A"], columns=["A"]) - result = to_xy(df) - - assert result.shape == (1, 3) - assert result.iloc[0]["x"] == "A" - assert result.iloc[0]["y"] == "A" - assert result.iloc[0]["z"] == 42 - - def test_with_nan_values(self): - """Test handling of NaN values.""" - from scitex.pd import to_xy - - data = np.array([[1, np.nan, 3], [4, 5, np.nan], [np.nan, 8, 9]]) - df = pd.DataFrame(data, index=["A", "B", "C"], columns=["A", "B", "C"]) - - result = to_xy(df) - - # Check that NaN values are preserved - nan_rows = result[result["z"].isna()] - assert len(nan_rows) == 3 - - def test_column_order_preserved(self, square_df): - """Test that the order of columns is preserved in output.""" - from scitex.pd import to_xy - - result = to_xy(square_df) - - # Group by 'y' to check order - y_values = [] - for _, group in result.groupby("y", sort=False): - y_values.append(group["y"].iloc[0]) - - assert y_values == ["A", "B", "C"] - - def test_index_order_preserved(self, square_df): - """Test that the order of index is preserved in output.""" - from scitex.pd import to_xy - - result = to_xy(square_df) - - # Check first three rows (should be column 'A') - first_col_x = list(result.iloc[:3]["x"]) - assert first_col_x == ["A", "B", "C"] - - def test_with_duplicate_index_names(self): - """Test behavior with duplicate index/column names. - - When columns have duplicates, df[column] returns a DataFrame instead of - Series, which causes AttributeError since DataFrame has no .name attribute. - This is a limitation of the source code. - """ - from scitex.pd import to_xy - - # Duplicate column names cause issues - df[column] returns DataFrame - data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - df = pd.DataFrame(data, index=["A", "A", "B"], columns=["A", "A", "B"]) - - # Duplicate columns cause AttributeError (DataFrame has no .name) - with pytest.raises(AttributeError): - to_xy(df) - - def test_mismatched_index_columns_no_error(self): - """Test behavior with mismatched non-numeric index/columns. - - The source code has `ValueError` without `raise`, so no exception is raised. - The function proceeds and produces output using the mismatched labels. - """ - from scitex.pd import to_xy - - # Both index and columns are non-numeric but different - data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - df = pd.DataFrame(data, index=["A", "B", "C"], columns=["X", "Y", "Z"]) - - # No exception is raised (bug in source - ValueError without raise) - # The function proceeds with mismatched labels - result = to_xy(df) - assert result.shape == (9, 3) - # x values come from the index, y from columns - assert set(result["x"].unique()) == {"A", "B", "C"} - assert set(result["y"].unique()) == {"X", "Y", "Z"} - - @pytest.mark.parametrize("dtype", [int, float, np.float32, np.float64]) - def test_different_dtypes(self, dtype): - """Test conversion with different data types.""" - from scitex.pd import to_xy - - data = np.array([[1, 2], [3, 4]], dtype=dtype) - df = pd.DataFrame(data, index=["A", "B"], columns=["A", "B"]) - - result = to_xy(df) - assert result.shape == (4, 3) - # Z values should maintain numeric type - assert pd.api.types.is_numeric_dtype(result["z"]) - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_to_xy.py -# -------------------------------------------------------------------------------- -# #!/./env/bin/python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-09-03 07:01:31 (ywatanabe)" -# # ./src/scitex/pd/_to_xy.py -# -# import scitex -# import numpy as np -# import pandas as pd -# -# -# def to_xy(data_frame): -# """ -# Convert a heatmap DataFrame into x, y, z format. -# -# Ensure the index and columns are the same, and if either exists, replace with that. -# -# Example -# ------- -# data_frame = pd.DataFrame(...) # Your DataFrame here -# out = to_xy(data_frame) -# print(out) -# -# Parameters -# ---------- -# data_frame : pandas.DataFrame -# The input DataFrame to be converted. -# -# Returns -# ------- -# pandas.DataFrame -# A DataFrame formatted with columns ['x', 'y', 'z'] -# """ -# assert data_frame.shape[0] == data_frame.shape[1] -# -# if not data_frame.index.equals(data_frame.columns): -# if (data_frame.index == np.array(range(len(data_frame.index)))).all(): -# data_frame.columns = data_frame.index -# elif (data_frame.columns == np.array(range(len(data_frame.columns)))).all(): -# data_frame.index = data_frame.columns -# else: -# ValueError -# # else: -# # ValueError "Either of index or columns has to be passed" -# -# formatted_data_frames = [] -# -# for column in data_frame.columns: -# column_data_frame = data_frame[column] -# y_label = column_data_frame.name -# column_data_frame = pd.DataFrame(column_data_frame) -# column_data_frame["x"] = column_data_frame.index -# column_data_frame["y"] = y_label -# column_data_frame = column_data_frame.reset_index().drop(columns=["index"]) -# column_data_frame = column_data_frame.rename(columns={y_label: "z"}) -# column_data_frame = scitex.pd.mv(column_data_frame, "z", -1) -# formatted_data_frames.append(column_data_frame) -# -# return pd.concat(formatted_data_frames, ignore_index=True) - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_to_xy.py -# -------------------------------------------------------------------------------- diff --git a/tests/scitex/pd/test__to_xyz.py b/tests/scitex/pd/test__to_xyz.py deleted file mode 100644 index 674f5e232..000000000 --- a/tests/scitex/pd/test__to_xyz.py +++ /dev/null @@ -1,398 +0,0 @@ -#!/usr/bin/env python3 -# Timestamp: "2025-06-01 20:10:00 (ywatanabe)" -# File: ./tests/scitex/pd/test__to_xyz.py - -""" -Test module for scitex.pd.to_xyz function. -""" - -import numpy as np -import pandas as pd -import pytest -from pandas.testing import assert_frame_equal - - -class TestToXYZ: - """Test class for to_xyz function.""" - - @pytest.fixture - def rectangular_df(self): - """Create a rectangular DataFrame for testing.""" - data = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]) - return pd.DataFrame(data, index=["A", "B", "C"], columns=["W", "X", "Y", "Z"]) - - @pytest.fixture - def named_axes_df(self): - """Create a DataFrame with named index and columns.""" - data = np.array([[1, 2], [3, 4]]) - df = pd.DataFrame(data, index=["row1", "row2"], columns=["col1", "col2"]) - df.index.name = "rows" - df.columns.name = "cols" - return df - - @pytest.fixture - def numeric_df(self): - """Create a DataFrame with numeric index and columns.""" - data = np.array([[10, 20, 30], [40, 50, 60]]) - return pd.DataFrame(data) - - def test_import(self): - """Test that to_xyz can be imported.""" - from scitex.pd import to_xyz - - assert callable(to_xyz) - - def test_basic_conversion(self, rectangular_df): - """Test basic conversion of rectangular DataFrame.""" - from scitex.pd import to_xyz - - result = to_xyz(rectangular_df) - - # Check shape: 3 rows × 4 columns = 12 rows - assert result.shape == (12, 3) - - # Check columns (should be 'x', 'y', 'z' when no names) - assert list(result.columns) == ["x", "y", "z"] - - # Check first few values - expected_first_rows = [ - ("A", "W", 1), - ("B", "W", 5), - ("C", "W", 9), - ("A", "X", 2), - ("B", "X", 6), - ("C", "X", 10), - ] - - for i, (x, y, z) in enumerate(expected_first_rows[:6]): - assert result.iloc[i]["x"] == x - assert result.iloc[i]["y"] == y - assert result.iloc[i]["z"] == z - - def test_named_axes(self, named_axes_df): - """Test conversion with named index and columns.""" - from scitex.pd import to_xyz - - result = to_xyz(named_axes_df) - - # Check column names use the axis names - assert list(result.columns) == ["rows", "cols", "z"] - - # Check values - assert result.shape == (4, 3) - expected_values = [ - ("row1", "col1", 1), - ("row2", "col1", 3), - ("row1", "col2", 2), - ("row2", "col2", 4), - ] - - for i, (row, col, z) in enumerate(expected_values): - assert result.iloc[i]["rows"] == row - assert result.iloc[i]["cols"] == col - assert result.iloc[i]["z"] == z - - def test_numeric_indices(self, numeric_df): - """Test conversion with numeric index and columns.""" - from scitex.pd import to_xyz - - result = to_xyz(numeric_df) - - # Should have default column names - assert list(result.columns) == ["x", "y", "z"] - - # Check shape - assert result.shape == (6, 3) # 2×3 = 6 rows - - # Check that numeric indices are preserved - assert result["x"].iloc[0] == 0 - assert result["y"].iloc[0] == 0 - assert result["z"].iloc[0] == 10 - - def test_single_column_dataframe(self): - """Test conversion of single column DataFrame.""" - from scitex.pd import to_xyz - - df = pd.DataFrame({"A": [1, 2, 3]}, index=["x1", "x2", "x3"]) - result = to_xyz(df) - - assert result.shape == (3, 3) - assert list(result["x"]) == ["x1", "x2", "x3"] - assert list(result["y"]) == ["A", "A", "A"] - assert list(result["z"]) == [1, 2, 3] - - def test_single_row_dataframe(self): - """Test conversion of single row DataFrame.""" - from scitex.pd import to_xyz - - df = pd.DataFrame([[1, 2, 3]], columns=["A", "B", "C"], index=["row1"]) - result = to_xyz(df) - - assert result.shape == (3, 3) - assert list(result["x"]) == ["row1", "row1", "row1"] - assert list(result["y"]) == ["A", "B", "C"] - assert list(result["z"]) == [1, 2, 3] - - def test_with_nan_values(self): - """Test handling of NaN values.""" - from scitex.pd import to_xyz - - df = pd.DataFrame( - {"A": [1, np.nan, 3], "B": [np.nan, 5, 6], "C": [7, 8, np.nan]}, - index=["X", "Y", "Z"], - ) - - result = to_xyz(df) - - # NaN values should be preserved - assert result.shape == (9, 3) - nan_count = result["z"].isna().sum() - assert nan_count == 3 - - def test_empty_dataframe(self): - """Test conversion of empty DataFrame. - - Empty DataFrames with no columns cause pd.concat to fail with - 'No objects to concatenate'. This is expected behavior. - """ - from scitex.pd import to_xyz - - # Empty DataFrame with no columns raises ValueError - df = pd.DataFrame() - with pytest.raises(ValueError, match="No objects to concatenate"): - to_xyz(df) - - # DataFrame with structure but no rows works - produces empty result - df = pd.DataFrame(columns=["A", "B"], index=[]) - result = to_xyz(df) - assert result.empty - assert list(result.columns) == ["x", "y", "z"] - - def test_column_order(self, rectangular_df): - """Test that columns are processed in order.""" - from scitex.pd import to_xyz - - result = to_xyz(rectangular_df) - - # Group by y and check order - y_values = result["y"].unique() - assert list(y_values) == ["W", "X", "Y", "Z"] - - def test_index_preservation(self): - """Test that index values are preserved correctly.""" - from scitex.pd import to_xyz - - # Create DataFrame with string index - df = pd.DataFrame( - {"col1": [100, 200], "col2": [300, 400]}, index=["first", "second"] - ) - - result = to_xyz(df) - - # Check that index values appear in x column - x_values = sorted(result["x"].unique()) - assert x_values == ["first", "second"] - - def test_mixed_types(self): - """Test conversion with mixed data types.""" - from scitex.pd import to_xyz - - df = pd.DataFrame( - { - "int_col": [1, 2, 3], - "float_col": [1.1, 2.2, 3.3], - "str_col": ["a", "b", "c"], - }, - index=["r1", "r2", "r3"], - ) - - result = to_xyz(df) - - assert result.shape == (9, 3) - # Check that different types are preserved in z column - int_vals = result[result["y"] == "int_col"]["z"].tolist() - assert int_vals == [1, 2, 3] - - str_vals = result[result["y"] == "str_col"]["z"].tolist() - assert str_vals == ["a", "b", "c"] - - def test_multiindex_not_supported(self): - """Test behavior with MultiIndex (current implementation doesn't handle specially). - - For MultiIndex, index.name is None (level names are in index.names), - so the x column is named 'x'. The MultiIndex values become tuples. - """ - from scitex.pd import to_xyz - - # Create DataFrame with MultiIndex - arrays = [["A", "A", "B", "B"], [1, 2, 1, 2]] - index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"]) - df = pd.DataFrame({"col": [10, 20, 30, 40]}, index=index) - - result = to_xyz(df) - # MultiIndex becomes tuples in the 'x' column (not 'first') - assert result.shape == (4, 3) - assert list(result.columns) == ["x", "y", "z"] - # x values are tuples representing the MultiIndex - assert isinstance(result["x"].iloc[0], tuple) - assert result["x"].iloc[0] == ("A", 1) - - def test_datetime_index(self): - """Test conversion with datetime index.""" - from scitex.pd import to_xyz - - dates = pd.date_range("2021-01-01", periods=3) - df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=dates) - - result = to_xyz(df) - - assert result.shape == (6, 3) - # Check that datetime values are preserved - assert pd.api.types.is_datetime64_any_dtype(result["x"]) - - @pytest.mark.parametrize("nrows,ncols", [(1, 10), (10, 1), (5, 5), (3, 7)]) - def test_various_shapes(self, nrows, ncols): - """Test conversion with various DataFrame shapes.""" - from scitex.pd import to_xyz - - data = np.arange(nrows * ncols).reshape(nrows, ncols) - df = pd.DataFrame(data) - - result = to_xyz(df) - assert result.shape == (nrows * ncols, 3) - assert len(result["z"]) == nrows * ncols - - def test_no_square_requirement(self): - """Test that to_xyz doesn't require square DataFrame (unlike to_xy).""" - from scitex.pd import to_xyz - - # Non-square DataFrame should work fine - df = pd.DataFrame(np.arange(12).reshape(3, 4)) - result = to_xyz(df) - assert result.shape == (12, 3) - - -if __name__ == "__main__": - import os - - import pytest - - pytest.main([os.path.abspath(__file__)]) - -# -------------------------------------------------------------------------------- -# Start of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_to_xyz.py -# -------------------------------------------------------------------------------- -# #!/./env/bin/python3 -# # -*- coding: utf-8 -*- -# # Time-stamp: "2024-09-28 11:17:22 (ywatanabe)" -# # ./src/scitex/pd/_to_xyz.py -# -# import scitex -# import numpy as np -# import pandas as pd -# -# -# def to_xyz(data_frame): -# """ -# Convert a DataFrame into x, y, z format (long format). -# -# Transforms a DataFrame from wide format (matrix/heatmap) to long format -# where each value becomes a row with x (row index), y (column name), -# and z (value) columns. -# -# Example -# ------- -# data_frame = pd.DataFrame(...) # Your DataFrame here -# out = to_xyz(data_frame) -# print(out) -# -# Parameters -# ---------- -# data_frame : pandas.DataFrame -# The input DataFrame to be converted. -# -# Returns -# ------- -# pandas.DataFrame -# A DataFrame formatted with columns ['x', 'y', 'z'] -# """ -# x_name = data_frame.index.name or "x" -# y_name = data_frame.columns.name or "y" -# -# formatted_data_frames = [] -# -# for column in data_frame.columns: -# column_data_frame = data_frame[column] -# formatted_data = pd.DataFrame( -# { -# x_name: column_data_frame.index, -# y_name: column, -# "z": column_data_frame.values, -# } -# ) -# formatted_data_frames.append(formatted_data) -# -# result = pd.concat(formatted_data_frames, ignore_index=True) -# -# # Ensure column order is x, y, z -# col_order = [x_name, y_name, "z"] -# result = result[col_order] -# -# return result -# -# -# # def to_xyz(data_frame): -# # """ -# # Convert a heatmap DataFrame into x, y, z format. -# -# # Ensure the index and columns are the same, and if either exists, replace with that. -# -# # Example -# # ------- -# # data_frame = pd.DataFrame(...) # Your DataFrame here -# # out = to_xy(data_frame) -# # print(out) -# -# # Parameters -# # ---------- -# # data_frame : pandas.DataFrame -# # The input DataFrame to be converted. -# -# # Returns -# # ------- -# # pandas.DataFrame -# # A DataFrame formatted with columns ['x', 'y', 'z'] -# # """ -# # assert data_frame.shape[0] == data_frame.shape[1] -# -# # if not data_frame.index.equals(data_frame.columns): -# -# # if (data_frame.index == np.array(range(len(data_frame.index)))).all(): -# # data_frame.columns = data_frame.index -# # elif ( -# # data_frame.columns == np.array(range(len(data_frame.columns))) -# # ).all(): -# # data_frame.index = data_frame.columns -# # else: -# # raise ValueError("Either index or columns must be a range of integers") -# -# # formatted_data_frames = [] -# -# # for column in data_frame.columns: -# # column_data_frame = data_frame[column] -# # y_label = column_data_frame.name -# # column_data_frame = pd.DataFrame(column_data_frame) -# # column_data_frame["x"] = column_data_frame.index -# # column_data_frame["y"] = y_label -# # column_data_frame = column_data_frame.reset_index().drop( -# # columns=["index"] -# # ) -# # column_data_frame = column_data_frame.rename(columns={y_label: "z"}) -# # column_data_frame = scitex.pd.mv(column_data_frame, "z", -1) -# # formatted_data_frames.append(column_data_frame) -# -# # return pd.concat(formatted_data_frames, ignore_index=True) - -# -------------------------------------------------------------------------------- -# End of Source Code from: /home/ywatanabe/proj/scitex-code/src/scitex/pd/_to_xyz.py -# --------------------------------------------------------------------------------