This document provides detailed API documentation for all dskit classes and functions.
class dskit:
"""Main dskit class with method chaining support."""
def __init__(self, data=None):
"""Initialize dskit instance.
Parameters:
-----------
data : pd.DataFrame, optional
Initial DataFrame to work with
"""
@staticmethod
def load(filepath: str, **kwargs) -> 'dskit':
"""Load data from file and return dskit instance."""
def fix_dtypes(self) -> 'dskit':
"""Fix data types automatically."""
def rename_columns_auto(self) -> 'dskit':
"""Clean column names automatically."""
def fill_missing(self, strategy: str = 'auto', fill_value=None) -> 'dskit':
"""Fill missing values with specified strategy."""
def remove_outliers(self, method: str = 'iqr', threshold: float = 1.5) -> 'dskit':
"""Remove outliers using specified method."""def load(filepath: str, **kwargs) -> pd.DataFrame:
"""Universal data loader with automatic format detection.
Parameters:
-----------
filepath : str
Path to the data file
**kwargs : dict
Additional parameters passed to the appropriate pandas reader
Returns:
--------
pd.DataFrame
Loaded DataFrame
Supported Formats:
------------------
- CSV: .csv
- Excel: .xlsx, .xls
- JSON: .json
- Parquet: .parquet
Examples:
---------
>>> df = dskit.load("data.csv")
>>> df = dskit.load("data.xlsx", sheet_name="Sheet1")
>>> df = dskit.load("data.json", orient="records")
"""
def read_folder(folder_path: str, file_type: str = 'csv', **kwargs) -> pd.DataFrame:
"""Read and combine multiple files from a folder.
Parameters:
-----------
folder_path : str
Path to the folder containing files
file_type : str, default 'csv'
Type of files to read ('csv', 'xlsx', 'json', 'parquet')
**kwargs : dict
Additional parameters for file reading
Returns:
--------
pd.DataFrame
Combined DataFrame from all files
Examples:
---------
>>> df = dskit.read_folder("data_folder/", file_type="csv")
>>> df = dskit.read_folder("excel_files/", file_type="xlsx")
"""
def save(df: pd.DataFrame, filepath: str, **kwargs) -> None:
"""Save DataFrame to file with format auto-detection.
Parameters:
-----------
df : pd.DataFrame
DataFrame to save
filepath : str
Output file path
**kwargs : dict
Additional parameters for saving
Examples:
---------
>>> dskit.save(df, "output.csv")
>>> dskit.save(df, "output.parquet")
>>> dskit.save(df, "output.xlsx", index=False)
"""def fix_dtypes(df: pd.DataFrame) -> pd.DataFrame:
"""Automatically detect and fix data types.
Features:
---------
- Converts string numbers to numeric
- Detects and converts datetime columns
- Optimizes integer types
- Handles mixed-type columns
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
Returns:
--------
pd.DataFrame
DataFrame with optimized data types
"""
def rename_columns_auto(df: pd.DataFrame) -> pd.DataFrame:
"""Automatically clean column names.
Cleaning Operations:
-------------------
- Convert to lowercase
- Replace spaces with underscores
- Remove special characters
- Handle duplicate names
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
Returns:
--------
pd.DataFrame
DataFrame with cleaned column names
"""
def missing_summary(df: pd.DataFrame) -> pd.DataFrame:
"""Generate comprehensive missing value summary.
Returns:
--------
pd.DataFrame
Summary with columns: Count, Percentage, Data_Type
"""
def fill_missing(df: pd.DataFrame, strategy: str = 'auto', fill_value=None) -> pd.DataFrame:
"""Intelligent missing value imputation.
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
strategy : str, default 'auto'
Imputation strategy:
- 'auto': Automatically choose best strategy
- 'mean': Mean for numeric, mode for categorical
- 'median': Median for numeric, mode for categorical
- 'mode': Most frequent value
- 'constant': Fill with fill_value
- 'ffill': Forward fill
- 'bfill': Backward fill
fill_value : any, optional
Value to use when strategy='constant'
Returns:
--------
pd.DataFrame
DataFrame with imputed missing values
Examples:
---------
>>> df = dskit.fill_missing(df, strategy='auto')
>>> df = dskit.fill_missing(df, strategy='constant', fill_value=0)
"""
def outlier_summary(df: pd.DataFrame, method: str = 'iqr', threshold: float = 1.5) -> pd.DataFrame:
"""Generate outlier detection summary.
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
method : str, default 'iqr'
Detection method: 'iqr', 'zscore', 'isolation'
threshold : float, default 1.5
Threshold for outlier detection
Returns:
--------
pd.DataFrame
Summary of outliers per column
"""
def remove_outliers(df: pd.DataFrame, method: str = 'iqr', threshold: float = 1.5) -> pd.DataFrame:
"""Remove outliers using specified method.
Methods:
--------
- 'iqr': Interquartile Range method
- 'zscore': Z-score based detection (threshold = z-score limit)
- 'isolation': Isolation Forest (threshold = contamination rate)
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
method : str, default 'iqr'
Outlier detection method
threshold : float, default 1.5
Method-specific threshold
Returns:
--------
pd.DataFrame
DataFrame with outliers removed
"""
def simple_nlp_clean(df: pd.DataFrame, text_cols: list) -> pd.DataFrame:
"""Basic text preprocessing for NLP.
Operations:
-----------
- Convert to lowercase
- Remove extra whitespace
- Remove special characters (optional)
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
text_cols : list
List of text column names
Returns:
--------
pd.DataFrame
DataFrame with cleaned text columns
"""def plot_missingness(df: pd.DataFrame, figsize: tuple = (12, 6)) -> None:
"""Plot missing data patterns as heatmap.
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
figsize : tuple, default (12, 6)
Figure size for the plot
"""
def plot_histograms(df: pd.DataFrame, bins: int = 30, figsize: tuple = (15, 10)) -> None:
"""Plot histograms for all numeric columns.
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
bins : int, default 30
Number of bins for histograms
figsize : tuple, default (15, 10)
Figure size for the plot
"""
def plot_correlation_heatmap(df: pd.DataFrame, method: str = 'pearson', figsize: tuple = (12, 8)) -> None:
"""Plot correlation heatmap.
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
method : str, default 'pearson'
Correlation method: 'pearson', 'spearman', 'kendall'
figsize : tuple, default (12, 8)
Figure size for the plot
"""
def plot_boxplots(df: pd.DataFrame, figsize: tuple = (15, 10)) -> None:
"""Plot box plots for numeric columns to show outliers.
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
figsize : tuple, default (15, 10)
Figure size for the plot
"""def auto_encode(df: pd.DataFrame, target_col: str = None, max_categories: int = 10) -> pd.DataFrame:
"""Automatically encode categorical variables.
Encoding Strategy:
------------------
- Binary columns: Keep as 0/1
- High cardinality (>max_categories): Target encoding if target provided, else drop
- Low cardinality: One-hot encoding
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
target_col : str, optional
Target column name for target encoding
max_categories : int, default 10
Maximum categories for one-hot encoding
Returns:
--------
pd.DataFrame
DataFrame with encoded categorical variables
"""
def auto_scale(df: pd.DataFrame, method: str = 'standard', columns: list = None) -> pd.DataFrame:
"""Automatically scale numeric features.
Methods:
--------
- 'standard': StandardScaler (mean=0, std=1)
- 'minmax': MinMaxScaler (0 to 1)
- 'robust': RobustScaler (median and IQR)
- 'quantile': QuantileTransformer (uniform distribution)
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
method : str, default 'standard'
Scaling method
columns : list, optional
Specific columns to scale (default: all numeric)
Returns:
--------
pd.DataFrame
DataFrame with scaled features
"""
def train_test_auto(df: pd.DataFrame, target: str, test_size: float = 0.2,
random_state: int = 42, stratify: bool = True) -> tuple:
"""Smart train-test splitting with automatic stratification.
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
target : str
Target column name
test_size : float, default 0.2
Proportion for test set
random_state : int, default 42
Random state for reproducibility
stratify : bool, default True
Whether to stratify split based on target
Returns:
--------
tuple
(X_train, X_test, y_train, y_test)
"""class QuickModel:
"""Quick model training with minimal setup.
Supported Models:
-----------------
Classification:
- 'random_forest', 'rf'
- 'xgboost', 'xgb'
- 'lightgbm', 'lgb'
- 'catboost', 'cat'
- 'logistic', 'lr'
- 'svm', 'svc'
- 'knn'
- 'naive_bayes', 'nb'
Regression:
- 'random_forest_reg', 'rf_reg'
- 'xgboost_reg', 'xgb_reg'
- 'lightgbm_reg', 'lgb_reg'
- 'catboost_reg', 'cat_reg'
- 'linear', 'linear_reg'
- 'ridge'
- 'lasso'
- 'svr'
"""
def __init__(self, model_name: str, task: str = 'auto', **kwargs):
"""Initialize QuickModel.
Parameters:
-----------
model_name : str
Name of the model algorithm
task : str, default 'auto'
'classification', 'regression', or 'auto'
**kwargs : dict
Model-specific parameters
"""
def fit(self, X, y):
"""Fit the model to training data."""
def predict(self, X):
"""Make predictions on new data."""
def predict_proba(self, X):
"""Predict probabilities (classification only)."""
def score(self, X, y):
"""Calculate model score."""
def compare_models(X, y, task: str = 'auto', cv: int = 5,
scoring: str = None, random_state: int = 42) -> pd.DataFrame:
"""Compare multiple machine learning models.
Parameters:
-----------
X : array-like
Feature matrix
y : array-like
Target vector
task : str, default 'auto'
'classification', 'regression', or 'auto'
cv : int, default 5
Cross-validation folds
scoring : str, optional
Scoring metric (auto-selected if None)
random_state : int, default 42
Random state for reproducibility
Returns:
--------
pd.DataFrame
Comparison results with mean scores and standard deviations
"""
def evaluate_model(model, X_test, y_test, task: str = 'auto',
plot: bool = True) -> dict:
"""Comprehensive model evaluation.
Returns:
--------
dict
Evaluation metrics including:
- Classification: accuracy, precision, recall, f1, auc
- Regression: mae, mse, rmse, r2, mape
"""def auto_hpo(model_name: str, X, y, method: str = 'optuna',
max_evals: int = 100, cv: int = 5, scoring: str = None,
random_state: int = 42, **kwargs):
"""Automated hyperparameter optimization.
Parameters:
-----------
model_name : str
Name of the model to optimize
X : array-like
Feature matrix
y : array-like
Target vector
method : str, default 'optuna'
Optimization method: 'optuna', 'random', 'grid', 'bayesian'
max_evals : int, default 100
Maximum number of evaluations
cv : int, default 5
Cross-validation folds
scoring : str, optional
Optimization metric
random_state : int, default 42
Random state
Returns:
--------
Optimized model instance
"""
def auto_tune_model(model_name: str, X, y, method: str = 'optuna',
max_evals: int = 100, **kwargs):
"""End-to-end automated model tuning.
Features:
---------
- Automatic task detection
- Smart parameter space definition
- Cross-validation based optimization
- Best model training and return
Returns:
--------
Tuned model ready for prediction
"""def create_polynomial_features(df: pd.DataFrame, degree: int = 2,
interaction_only: bool = False,
include_bias: bool = False,
columns: list = None) -> pd.DataFrame:
"""Create polynomial and interaction features.
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
degree : int, default 2
Degree of polynomial features
interaction_only : bool, default False
If True, only interaction features are produced
include_bias : bool, default False
If True, include bias column (all ones)
columns : list, optional
Specific columns to transform
Returns:
--------
pd.DataFrame
DataFrame with polynomial/interaction features
"""
def create_date_features(df: pd.DataFrame, date_cols: list,
drop_original: bool = False) -> pd.DataFrame:
"""Extract comprehensive date/time features.
Created Features:
-----------------
- Basic: year, month, day, hour, minute
- Derived: weekday, quarter, day_of_year, week_of_year
- Binary: is_weekend, is_month_start, is_month_end
- Cyclical: sin/cos transformations for cyclical features
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
date_cols : list
List of datetime column names
drop_original : bool, default False
Whether to drop original date columns
Returns:
--------
pd.DataFrame
DataFrame with date features
"""
def create_target_encoding(df: pd.DataFrame, cat_cols: list, target_col: str,
smoothing: float = 10.0, min_samples: int = 10) -> pd.DataFrame:
"""Create target encoding for categorical variables.
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
cat_cols : list
Categorical columns to encode
target_col : str
Target column name
smoothing : float, default 10.0
Smoothing parameter for regularization
min_samples : int, default 10
Minimum samples per category
Returns:
--------
pd.DataFrame
DataFrame with target encoded features
"""
def create_aggregation_features(df: pd.DataFrame, group_col: str,
agg_cols: list, agg_funcs: list = None) -> pd.DataFrame:
"""Create aggregation features grouped by categorical column.
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
group_col : str
Column to group by
agg_cols : list
Columns to aggregate
agg_funcs : list, optional
Aggregation functions (default: ['mean', 'std', 'min', 'max'])
Returns:
--------
pd.DataFrame
DataFrame with aggregation features
"""
def select_features_univariate(X, y, k: int = 10, score_func=None):
"""Univariate feature selection.
Parameters:
-----------
X : array-like
Feature matrix
y : array-like
Target vector
k : int, default 10
Number of features to select
score_func : callable, optional
Scoring function (auto-selected if None)
Returns:
--------
Selected features and selector object
"""
def apply_pca(df: pd.DataFrame, n_components: int = None,
variance_threshold: float = 0.95) -> tuple:
"""Apply Principal Component Analysis.
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
n_components : int, optional
Number of components (auto if None)
variance_threshold : float, default 0.95
Minimum variance to retain
Returns:
--------
tuple
(transformed_df, pca_object, explained_variance_ratio)
"""def advanced_text_clean(df: pd.DataFrame, text_cols: list,
remove_urls: bool = True, remove_emails: bool = True,
remove_phone: bool = True, expand_contractions: bool = True,
remove_punctuation: bool = False) -> pd.DataFrame:
"""Advanced text preprocessing pipeline.
Operations:
-----------
- URL removal
- Email removal
- Phone number removal
- Contraction expansion
- Case normalization
- Whitespace normalization
- Optional punctuation removal
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
text_cols : list
Text column names
remove_urls : bool, default True
Remove URLs
remove_emails : bool, default True
Remove email addresses
remove_phone : bool, default True
Remove phone numbers
expand_contractions : bool, default True
Expand contractions (can't -> cannot)
remove_punctuation : bool, default False
Remove punctuation marks
Returns:
--------
pd.DataFrame
DataFrame with cleaned text
"""
def sentiment_analysis(df: pd.DataFrame, text_cols: list) -> pd.DataFrame:
"""Perform sentiment analysis using TextBlob.
Creates New Columns:
-------------------
- {col}_sentiment: Sentiment polarity (-1 to 1)
- {col}_subjectivity: Subjectivity score (0 to 1)
- {col}_sentiment_label: 'positive', 'negative', 'neutral'
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
text_cols : list
Text column names
Returns:
--------
pd.DataFrame
DataFrame with sentiment features
"""
def extract_text_features(df: pd.DataFrame, text_cols: list) -> pd.DataFrame:
"""Extract comprehensive text statistics.
Features Created:
-----------------
- Character count
- Word count
- Sentence count
- Average word length
- Punctuation count
- Capital letter count
- Special character count
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
text_cols : list
Text column names
Returns:
--------
pd.DataFrame
DataFrame with text features
"""
def extract_keywords(df: pd.DataFrame, text_col: str, top_n: int = 10) -> list:
"""Extract top keywords using TF-IDF.
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
text_col : str
Text column name
top_n : int, default 10
Number of top keywords
Returns:
--------
list
Top keywords with TF-IDF scores
"""class TimeSeriesUtils:
"""Comprehensive time series analysis and feature engineering."""
def create_comprehensive_time_features(self, df: pd.DataFrame,
date_col: str,
drop_original: bool = False) -> pd.DataFrame:
"""Create 25+ time-based features.
Features Created:
-----------------
Basic Time Components:
- year, month, day, hour, minute, second
- weekday, day_of_year, week_of_year, quarter
Binary Indicators:
- is_weekend, is_month_start, is_month_end
- is_quarter_start, is_quarter_end, is_year_start, is_year_end
Cyclical Features:
- month_sin, month_cos, day_sin, day_cos
- hour_sin, hour_cos, weekday_sin, weekday_cos
Business Calendar:
- business_day_of_month, days_since_month_start
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
date_col : str
Date column name
drop_original : bool, default False
Whether to drop original date column
Returns:
--------
pd.DataFrame
DataFrame with comprehensive time features
"""
def detect_seasonality(self, series: pd.Series, periods: list = None) -> dict:
"""Detect seasonality patterns in time series.
Parameters:
-----------
series : pd.Series
Time series data
periods : list, optional
Periods to test (default: [7, 30, 365])
Returns:
--------
dict
Seasonality detection results with strengths
"""
def add_lag_features(self, df: pd.DataFrame, col: str,
lags: list) -> pd.DataFrame:
"""Add lag features for time series.
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
col : str
Column to create lags for
lags : list
List of lag periods
Returns:
--------
pd.DataFrame
DataFrame with lag features
"""
def add_rolling_features(self, df: pd.DataFrame, col: str,
windows: list, functions: list = None) -> pd.DataFrame:
"""Add rolling statistics features.
Parameters:
-----------
df : pd.DataFrame
Input DataFrame
col : str
Column to calculate rolling features
windows : list
Window sizes for rolling calculations
functions : list, optional
Functions to apply (default: ['mean', 'std', 'min', 'max'])
Returns:
--------
pd.DataFrame
DataFrame with rolling features
"""class AdvancedPreprocessor:
"""Advanced data preprocessing with sophisticated algorithms."""
def advanced_imputation(self, df: pd.DataFrame, method: str = 'iterative',
n_neighbors: int = 5, max_iter: int = 10) -> pd.DataFrame:
"""Advanced missing value imputation.
Methods:
--------
- 'iterative': IterativeImputer using BayesianRidge
- 'knn': KNN-based imputation
- 'mice': Multiple Imputation by Chained Equations
"""
def detect_and_handle_outliers(self, df: pd.DataFrame, method: str = 'isolation_forest',
contamination: float = 0.1) -> pd.DataFrame:
"""Advanced outlier detection and handling.
Methods:
--------
- 'isolation_forest': Isolation Forest algorithm
- 'local_outlier_factor': Local Outlier Factor
- 'one_class_svm': One-Class SVM
- 'elliptic_envelope': Robust covariance estimation
"""
def smart_feature_selection(self, X, y, method: str = 'mutual_info',
k: int = 10, task: str = 'auto') -> tuple:
"""Intelligent feature selection.
Methods:
--------
- 'mutual_info': Mutual information
- 'chi2': Chi-squared test
- 'f_classif': F-test for classification
- 'f_regression': F-test for regression
- 'rfe': Recursive Feature Elimination
"""class ModelValidator:
"""Advanced model validation and analysis."""
def comprehensive_cross_validation(self, model, X, y, cv_strategies: list = None) -> dict:
"""Multi-strategy cross-validation analysis."""
def stability_analysis(self, model, X, y, n_iterations: int = 10) -> dict:
"""Analyze model stability across different data splits."""
def learning_curve_analysis(self, model, X, y, train_sizes: list = None) -> dict:
"""Generate learning curves to analyze training efficiency."""
def bias_variance_analysis(self, model, X, y, n_iterations: int = 100) -> dict:
"""Bias-variance decomposition analysis."""
class ModelInterpreter:
"""Model interpretation and explainability."""
def permutation_importance(self, model, X, y, n_repeats: int = 10) -> dict:
"""Calculate permutation-based feature importance."""
def partial_dependence_analysis(self, model, X, features: list) -> dict:
"""Analyze partial dependence of features."""
def local_explanation(self, model, X_instance, X_train, mode: str = 'classification') -> dict:
"""LIME-based local explanation for single instance."""class DataAuditor:
"""Comprehensive data quality auditing."""
def comprehensive_audit(self, df: pd.DataFrame, target_col: str = None) -> dict:
"""Complete data quality audit.
Audit Components:
-----------------
- Data quality score (0-100)
- Missing value analysis
- Data type consistency
- Outlier detection
- Duplicate analysis
- Target correlation analysis
- Statistical summaries
"""
def print_audit_report(self):
"""Print formatted audit report."""
class DataSampler:
"""Advanced data sampling strategies."""
def stratified_sample(self, df: pd.DataFrame, target_col: str,
n_samples: int, random_state: int = 42) -> pd.DataFrame:
"""Stratified sampling maintaining target distribution."""
def time_aware_sample(self, df: pd.DataFrame, date_col: str,
n_samples: int, method: str = 'recent') -> pd.DataFrame:
"""Time-aware sampling strategies."""
def balanced_sample(self, df: pd.DataFrame, target_col: str,
method: str = 'undersample') -> pd.DataFrame:
"""Balanced sampling for imbalanced datasets."""class DatabaseConnector:
"""Universal database connectivity."""
def connect(self, db_type: str, connection_params: dict) -> str:
"""Connect to database.
Supported Databases:
-------------------
- 'sqlite': SQLite database
- 'mysql': MySQL database
- 'postgresql': PostgreSQL database
- 'oracle': Oracle database
- 'sqlserver': SQL Server database
"""
def execute_query(self, query: str, connection_name: str = 'default') -> pd.DataFrame:
"""Execute SQL query and return DataFrame."""
def insert_dataframe(self, df: pd.DataFrame, table_name: str,
connection_name: str = 'default', if_exists: str = 'append') -> bool:
"""Insert DataFrame into database table."""
class DataProfiler:
"""Advanced data profiling and analysis."""
def comprehensive_profile(self, df: pd.DataFrame) -> dict:
"""Generate comprehensive data profile."""
class DataExporter:
"""Multi-format data export utilities."""
def export_data(self, df: pd.DataFrame, filepath: str, format: str) -> bool:
"""Export data in various formats."""class Hyperplane:
"""2D and 3D hyperplane representation and visualization."""
def __init__(self, weights: Union[List, np.ndarray], bias: float):
"""Initialize hyperplane with weights and bias.
Parameters:
-----------
weights : array-like
Weight vector (2D or 3D)
bias : float
Bias term
"""
def equation(self) -> str:
"""Get mathematical equation representation."""
def predict(self, X: np.ndarray) -> np.ndarray:
"""Classify points as +1 or -1 based on hyperplane side."""
def distance(self, point: Union[List, np.ndarray]) -> float:
"""Calculate perpendicular distance from point to hyperplane."""
def plot_2d(self, x_range: Tuple[float, float] = (-5, 5),
X: Optional[np.ndarray] = None, y: Optional[np.ndarray] = None,
show_margin: bool = False) -> None:
"""Plot 2D hyperplane with optional data points and margin."""
def plot_3d(self, range_val: Tuple[float, float] = (-5, 5),
X: Optional[np.ndarray] = None, y: Optional[np.ndarray] = None) -> None:
"""Plot 3D hyperplane with optional data points."""
def plot_decision_regions(self, X: np.ndarray, y: np.ndarray,
resolution: int = 100) -> None:
"""Plot color-coded decision regions for 2D data."""
class HyperplaneExtractor:
"""Extract hyperplanes from trained ML models."""
def __init__(self, model):
"""Initialize with trained sklearn model.
Supported Models:
----------------
- LinearSVC, SVC(kernel='linear')
- LogisticRegression
- Perceptron
- LinearRegression, Ridge, Lasso
- LinearDiscriminantAnalysis
- GaussianNB (approximate)
- DecisionTree (approximate)
"""
def get_hyperplane(self) -> Hyperplane:
"""Get the extracted Hyperplane object."""
def equation(self) -> str:
"""Get hyperplane equation string."""
def analyze_model(self, X: np.ndarray, y: np.ndarray) -> dict:
"""Comprehensive hyperplane analysis.
Returns:
--------
dict
Analysis including distances, margins, weights, etc.
"""
def compare_models(self, other_extractor: 'HyperplaneExtractor') -> dict:
"""Compare two hyperplane models.
Returns:
--------
dict
Comparison metrics including angle, weight differences, etc.
"""
def plot_2d(self, X: Optional[np.ndarray] = None,
y: Optional[np.ndarray] = None, **kwargs) -> None:
"""Plot 2D hyperplane with model-specific enhancements."""
def plot_decision_regions(self, X: np.ndarray, y: np.ndarray) -> None:
"""Plot decision regions for the extracted hyperplane."""
# Algorithm-Specific Plotting Methods
def plot_svm(self, X: np.ndarray, y: np.ndarray,
show_support_vectors: bool = True, margin_style: str = 'dashed') -> None:
"""SVM-specific plotting with margins and support vector highlighting."""
def plot_logistic_regression(self, X: np.ndarray, y: np.ndarray,
show_probabilities: bool = True,
probability_contours: List[float] = [0.1, 0.3, 0.5, 0.7, 0.9]) -> None:
"""Logistic regression plotting with probability contours."""
def plot_perceptron(self, X: np.ndarray, y: np.ndarray,
show_misclassified: bool = True) -> None:
"""Perceptron plotting with misclassified points highlighted."""
def plot_lda(self, X: np.ndarray, y: np.ndarray,
show_class_centers: bool = True, show_projections: bool = False) -> None:
"""LDA plotting with class centers and projection directions."""
def plot_linear_regression(self, X: np.ndarray, y: np.ndarray,
show_residuals: bool = True, confidence_interval: bool = False) -> None:
"""Linear regression plotting with residuals (1D/2D support)."""
def plot_algorithm_comparison(self, models_dict: dict, X: np.ndarray, y: np.ndarray) -> None:
"""Compare multiple algorithm hyperplanes in subplots."""
# Convenience Functions
def extract_hyperplane(model) -> HyperplaneExtractor:
"""Extract hyperplane from any supported ML model."""
def create_hyperplane_from_points(points: List[np.ndarray]) -> Hyperplane:
"""Create hyperplane from defining points.
Parameters:
-----------
points : list of arrays
2 points for 2D line, 3 points for 3D plane
"""
# Algorithm-Specific Plotting Functions
def plot_svm_hyperplane(model, X: np.ndarray, y: np.ndarray, **kwargs) -> None:
"""Quick SVM hyperplane plotting with margins."""
def plot_logistic_hyperplane(model, X: np.ndarray, y: np.ndarray, **kwargs) -> None:
"""Quick logistic regression hyperplane plotting with probability contours."""
def plot_perceptron_hyperplane(model, X: np.ndarray, y: np.ndarray, **kwargs) -> None:
"""Quick perceptron hyperplane plotting with misclassified points."""
def plot_lda_hyperplane(model, X: np.ndarray, y: np.ndarray, **kwargs) -> None:
"""Quick LDA hyperplane plotting with class centers."""
def plot_linear_regression_hyperplane(model, X: np.ndarray, y: np.ndarray, **kwargs) -> None:
"""Quick linear regression hyperplane plotting with residuals."""
def compare_algorithm_hyperplanes(models_dict: dict, X: np.ndarray, y: np.ndarray, **kwargs) -> None:
"""Compare multiple algorithm hyperplanes in one visualization."""class ModelPersistence:
"""Model serialization and persistence."""
def save_model(self, model, filepath: str, format: str = 'joblib',
metadata: dict = None) -> bool:
"""Save model with metadata."""
def load_model(self, filepath: str, format: str = 'joblib'):
"""Load saved model."""
class ExperimentTracker:
"""ML experiment tracking and logging."""
def start_experiment(self, experiment_name: str, description: str = '') -> str:
"""Start new experiment."""
def log_parameter(self, key: str, value) -> None:
"""Log experiment parameter."""
def log_metric(self, key: str, value: float, step: int = None) -> None:
"""Log experiment metric."""
def end_experiment(self, status: str = 'completed') -> None:
"""End current experiment."""
class ConfigManager:
"""Configuration management."""
def load_config(self, config_path: str = 'config.json') -> dict:
"""Load configuration file."""
def get(self, key: str, default=None):
"""Get configuration value."""
def set(self, key: str, value) -> None:
"""Set configuration value."""
class DataVersioning:
"""Data versioning and lineage tracking."""
def create_version(self, df: pd.DataFrame, dataset_name: str,
description: str = '') -> str:
"""Create new data version."""
def load_version(self, version_id: str) -> pd.DataFrame:
"""Load specific data version."""
def list_versions(self, dataset_name: str = None) -> pd.DataFrame:
"""List all versions."""from dskit import dskit
# Fluent API with method chaining
result = (dskit.load("data.csv")
.fix_dtypes()
.rename_columns_auto()
.fill_missing(strategy='auto')
.remove_outliers(method='iqr')
.auto_encode(target_col='target')
.auto_scale(method='standard')
.train_advanced('xgboost')
.evaluate())
print(f"Model accuracy: {result.score}")import dskit
# Step-by-step functional approach
df = dskit.load("data.csv")
df = dskit.fix_dtypes(df)
df = dskit.fill_missing(df, strategy='auto')
df = dskit.remove_outliers(df, method='iqr')
# Model training
model = dskit.QuickModel('xgboost')
model.fit(X_train, y_train)
predictions = model.predict(X_test)

from dskit import AdvancedPreprocessor, ModelValidator
# Advanced preprocessing
preprocessor = AdvancedPreprocessor()
df_clean = preprocessor.advanced_imputation(df, method='iterative')
df_clean = preprocessor.detect_and_handle_outliers(df_clean, method='isolation_forest')
# Model validation
validator = ModelValidator()
cv_results = validator.comprehensive_cross_validation(model, X, y)
stability_results = validator.stability_analysis(model, X, y)

# Extract hyperplane from trained model
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Train models
svm_model = LinearSVC(random_state=42).fit(X, y)
lr_model = LogisticRegression(random_state=42).fit(X, y)
perceptron_model = Perceptron(random_state=42).fit(X, y)
lda_model = LinearDiscriminantAnalysis().fit(X, y)
# Method 1: Direct algorithm-specific plotting
dskit.plot_svm_hyperplane(svm_model, X, y) # SVM with margins
dskit.plot_logistic_hyperplane(lr_model, X, y) # With probability contours
dskit.plot_perceptron_hyperplane(perceptron_model, X, y) # Highlight misclassified
dskit.plot_lda_hyperplane(lda_model, X, y) # With class centers
# Method 2: Extractor with algorithm-specific methods
extractor = dskit.extract_hyperplane(svm_model)
extractor.plot_svm(X, y, show_support_vectors=True, margin_style='dashed')
lr_extractor = dskit.extract_hyperplane(lr_model)
lr_extractor.plot_logistic_regression(X, y,
probability_contours=[0.1, 0.3, 0.5, 0.7, 0.9],
show_probabilities=True)
perceptron_extractor = dskit.extract_hyperplane(perceptron_model)
perceptron_extractor.plot_perceptron(X, y, show_misclassified=True)
lda_extractor = dskit.extract_hyperplane(lda_model)
lda_extractor.plot_lda(X, y, show_class_centers=True, show_projections=True)
# Compare multiple algorithms
models = {'SVM': svm_model, 'Logistic': lr_model, 'Perceptron': perceptron_model, 'LDA': lda_model}
dskit.compare_algorithm_hyperplanes(models, X, y)
# Basic hyperplane operations
print(f"SVM Equation: {extractor.equation()}")
distances = [extractor.hyperplane.distance(point) for point in X]
# Advanced analysis
analysis = extractor.analyze_model(X, y)
print(f"Margin: {analysis['margin']:.3f}")
print(f"Mean distance: {analysis['mean_distance']:.3f}")
# Model comparison
comparison = extractor.compare_models(lr_extractor)
print(f"Angle between hyperplanes: {comparison['angle_between_normals']:.3f}°")
# Linear regression with residuals
from sklearn.linear_model import LinearRegression
reg_model = LinearRegression().fit(X, y_continuous)
dskit.plot_linear_regression_hyperplane(reg_model, X, y_continuous, show_residuals=True)
# Create custom hyperplane
hp = dskit.Hyperplane([1, -2], 3) # x - 2y + 3 = 0
hp.plot_2d(x_range=(-5, 5))
# Hyperplane from points
points_2d = [[0, 1], [2, 3]]
hp_from_points = dskit.create_hyperplane_from_points(points_2d)
hp_from_points.plot_2d()

from dskit.config import dskitConfig
# Global configuration
config = dskitConfig()
config.set_visualization_backend('plotly') # or 'matplotlib'
config.set_random_state(42)
config.set_n_jobs(-1)
config.set_verbosity(1)
# Get current settings
settings = config.get_all_settings()

Return Types:
-------------
- DataFrames: Most functions return pd.DataFrame
- Models: Training functions return fitted model objects
- Dictionaries: Analysis functions return structured results
- Tuples: Functions returning multiple values use tuples
- Boolean: Success/failure operations return a boolean
# Model evaluation results
{
'accuracy': 0.95,
'precision': 0.94,
'recall': 0.96,
'f1_score': 0.95,
'confusion_matrix': array([[...]]),
'classification_report': {...}
}
# Data audit results
{
'overall_score': 88.1,
'data_quality': {...},
'missing_analysis': {...},
'outlier_analysis': {...},
'duplicate_analysis': {...}
}

This API reference provides detailed documentation for all dskit functionality. For examples and tutorials, see the main documentation and the example notebooks.