From 83055de3007b598bbfb4a1cc6dbd107618ba3220 Mon Sep 17 00:00:00 2001 From: Sneha Jain Date: Wed, 22 Oct 2025 14:47:02 +0530 Subject: [PATCH 01/38] Added a test file --- test.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 test.py diff --git a/test.py b/test.py new file mode 100644 index 0000000..8e23576 --- /dev/null +++ b/test.py @@ -0,0 +1 @@ +print("Hello World") \ No newline at end of file From 20c66a82e270c0a39409e84f37f9a63aa864eba0 Mon Sep 17 00:00:00 2001 From: Siddhartha Chakrabarty Date: Fri, 24 Oct 2025 17:37:33 +0530 Subject: [PATCH 02/38] Added missing.py --- Untitled.ipynb | 327 ++++++++++++++++++ last_query.csv | 7 + mariadb_kernel/maria_magics/missing.py | 252 ++++++++++++++ .../maria_magics/supported_magics.py | 2 + sample_sales_export.csv | 7 + 5 files changed, 595 insertions(+) create mode 100644 Untitled.ipynb create mode 100644 last_query.csv create mode 100644 mariadb_kernel/maria_magics/missing.py create mode 100644 sample_sales_export.csv diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 0000000..a2affd5 --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,327 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "370f7ad6-27b0-4b77-855f-5b42056d8f7d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Database
information_schema
mysql
performance_schema
sys
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "show databases;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f793644-f458-4091-9bec-6680a0c2b849", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "create database test;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee0b2971-9407-4de7-a800-79dd56eac54d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "use test;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f15544d-ecac-4f56-988e-b577b8fb9ff7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "CREATE TABLE IF NOT EXISTS sample_sales (\n", + " month VARCHAR(10),\n", + " sales INT\n", + ");" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d12775b4-23a8-4ba2-9de9-d0d3f727f53c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "INSERT INTO sample_sales (month, sales) VALUES\n", + " ('Jan', 10), ('Feb', 20), ('Mar', 15), ('Apr', 30),\n", + " ('May', 22), ('Jun', 18);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af099a2a-a6e7-4297-b243-b64fd7be07c4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
monthsales
Jan10
Feb20
Mar15
Apr30
May22
Jun18
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "select * from sample_sales;" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "184962da-5d3c-4f96-83de-2578df4706fe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Line
namehelp
%line
The %line magic command follows a syntax very similar to that of
DataFrame.plot.line from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.line.html

Example:
> %line x=column1 y=column2

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Line class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%bar
The %bar magic command follows a syntax very similar to that of
DataFrame.plot.bar from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.bar.html

Example:
> %bar x=column1 y=column2 stacked=True

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Bar class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%pie
The %pie magic command follows a syntax very similar to that of
DataFrame.plot.pie from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.pie.html

Example:
> %pie y=column_name

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Pie class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%df
The %df magic command has the following syntax:
> %df [filename]

It writes the result of the last query executed in the notebook
into an external CSV formatted file.
The purpose of this magic command is to allow users to export query
data from their MariaDB databases and then quickly import it
into a Python Notebook where more complex analytics can be performed.

If no arguments are specified, the kernel writes the data into a
CSV file named 'last_query.csv'.
%lsmagic
The %lsmagic magic command prints the magics currently
supported by the kernel. It also prints the help text for each command
%load
The %load magic command has the following syntax:
> %load csv_file_path table_name [skip_row_num] [seperator] [character set] [enclosing character]
The %load magic command can load CSV file for updating specific table data.

This command does not create a table if the one specified as argument doesn't exist,
the user needs to create the destination table with the proper schema to match the data in the CSV file.

CSV file first line may be header, can set [skip row num] to 1 for skipping header.

The separator between columns defaults to ',', but can be reconfigured.

The character set used by the CSV file defaults to utf8.

The character used to enclose entries to escape the field delimiter defaults to `"`.

Any argument can be enclosed by ' ' or " ", handling cases that argument contains spaces.
Cell
namehelp
%%delimiter
The %%delimiter magic command is a cell magic. This means
that it operates over the entire cell within it is used.

Its purpose is to run an SQL statement using a different
delimiter than the default ";". The main usecase should be
Stored Procedures and Stored Functions.

Example:
--------cell
%%delimiter //
CREATE PROCEDURE proc ()
BEGIN
select 1;
END;
//
--------end-of-cell

Please note that the SQL statement needs to end with the
delimiter specified by the magic command.
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%lsmagic\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9679f73-93a4-4e00-8561-e01207ab9015", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
monthsales
Jan10
Feb20
Mar15
Apr30
May22
Jun18
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "SELECT month, sales FROM sample_sales ORDER BY FIELD(month,'Jan','Feb','Mar','Apr','May','Jun');\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "771a152f-02ce-42e5-a006-6444bf2626db", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%line\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "28756366-6f31-4045-b2ea-7d47fdf08ff8", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "" + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%bar" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e01f05ec-ebc9-4344-a888-fceb40da7a8f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "pie requires either y column or 'subplots=True'\n" + ] + } + ], + "source": [ + "%pie" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "70e58c09-5a55-4093-b135-0fed2202d2d3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The result set was successfully written into sample_sales_export.csv\n" + ] + } + ], + "source": [ + "%df sample_sales_export.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "c674309d-4c1c-4d5a-a879-3b3f4816e4c5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The result set was successfully written into last_query.csv\n" + ] + } + ], + "source": [ + "%df" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "b1d967f6-6777-454b-821b-d057d96eef13", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
monthsales
Jan10
Feb20
Mar15
Apr30
May22
..." + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%load sample_sales_export.csv sample_sales" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "364a4ddb-de96-4f20-b979-292184dadd0e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%%delimiter //\n", + "\n", + "CREATE PROCEDURE demo_proc()\n", + "BEGIN\n", + " SELECT 'hello', 1;\n", + "END;\n", + "//\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "525984db-c41b-4f63-a0b6-4c8ac14b1ac2", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MariaDB", + "language": "SQL", + "name": "mariadb_kernel" + }, + "language_info": { + "file_extension": ".sql", + "mimetype": "text/plain", + "name": "SQL" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/last_query.csv b/last_query.csv new file mode 100644 index 0000000..913be90 --- /dev/null +++ b/last_query.csv @@ -0,0 +1,7 @@ +month,sales +Jan,10 +Feb,20 +Mar,15 +Apr,30 +May,22 +Jun,18 diff --git a/mariadb_kernel/maria_magics/missing.py b/mariadb_kernel/maria_magics/missing.py new file mode 100644 index 0000000..2653c26 --- /dev/null +++ b/mariadb_kernel/maria_magics/missing.py @@ -0,0 +1,252 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import re +import traceback + +try: + # optional niceties + from tabulate import tabulate # nice table output if available +except Exception: + tabulate = None + + +class Missing(MariaMagic): + """ + Line magic for computing missing (NULL) counts per column. + + Usage (line magic): + %missing my_schema.my_table + %missing my_table + %missing SELECT id, name, val FROM ... WHERE ... 
+ """ + + def name(self): + return "missing" + + def type(self): + return "line" + + def help(self): + return ( + "Usage: %missing | %missing
| %missing
Database
information_schema
mysql
performance_schema
sys
" + "
Database
information_schema
mysql
performance_schema
sys
test
" ] }, "metadata": {}, @@ -29,7 +29,12 @@ { "data": { "text/html": [ - "Query OK" + "--------------\r\n", + "create database test\r\n", + "--------------\r\n", + "\r\n", + "ERROR 1007 (HY000) at line 1 in file: '/home/sneha/mariadb_kernel/.mariadb_statement_763c5658-b0dd-11f0-80ed-00155d5f88b1': Can't create database 'test'; database exists\r\n", + "\u0007\u001b(B\u001b[0;7m\u001b(B\u001b[m" ] }, "metadata": {}, @@ -77,9 +82,12 @@ } ], "source": [ - "CREATE TABLE IF NOT EXISTS sample_sales (\n", - " month VARCHAR(10),\n", - " sales INT\n", + "CREATE TABLE employees (\n", + " id INT PRIMARY KEY AUTO_INCREMENT,\n", + " name VARCHAR(50),\n", + " department VARCHAR(50),\n", + " age INT,\n", + " salary DECIMAL(10,2)\n", ");" ] }, @@ -100,9 +108,15 @@ } ], "source": [ - "INSERT INTO sample_sales (month, sales) VALUES\n", - " ('Jan', 10), ('Feb', 20), ('Mar', 15), ('Apr', 30),\n", - " ('May', 22), ('Jun', 18);" + "INSERT INTO employees (name, department, age, salary) VALUES\n", + "('Alice', 'HR', 30, 50000),\n", + "('Bob', NULL, 40, NULL),\n", + "('Charlie', 'Engineering', NULL, 70000),\n", + "('David', 'HR', 25, 48000),\n", + "('Eve', NULL, 35, NULL),\n", + "('Frank', 'Engineering', 28, 72000),\n", + "(NULL, 'Sales', 50, NULL),\n", + "('Grace', 'Sales', 45, 65000);" ] }, { @@ -114,7 +128,7 @@ { "data": { "text/html": [ - "
monthsales
Jan10
Feb20
Mar15
Apr30
May22
Jun18
" + "
idnamedepartmentagesalary
1AliceHR3050000.00
2BobNULL40NULL
3CharlieEngineeringNULL70000.00
4DavidHR2548000.00
5EveNULL35NULL
6FrankEngineering2872000.00
7NULLSales50NULL
8GraceSales4565000.00
" ] }, "metadata": {}, @@ -122,19 +136,54 @@ } ], "source": [ - "select * from sample_sales;" + "SELECT * FROM employees;" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 12, "id": "184962da-5d3c-4f96-83de-2578df4706fe", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Line
namehelp
%line
The %line magic command follows a syntax very similar to that of
DataFrame.plot.line from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.line.html

Example:
> %line x=column1 y=column2

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Line class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%bar
The %bar magic command follows a syntax very similar to that of
DataFrame.plot.bar from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.bar.html

Example:
> %bar x=column1 y=column2 stacked=True

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Bar class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%pie
The %pie magic command follows a syntax very similar to that of
DataFrame.plot.pie from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.pie.html

Example:
> %pie y=column_name

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Pie class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%df
The %df magic command has the following syntax:
> %df [filename]

It writes the result of the last query executed in the notebook
into an external CSV formatted file.
The purpose of this magic command is to allow users to export query
data from their MariaDB databases and then quickly import it
into a Python Notebook where more complex analytics can be performed.

If no arguments are specified, the kernel writes the data into a
CSV file named 'last_query.csv'.
%lsmagic
The %lsmagic magic command prints the magics currently
supported by the kernel. It also prints the help text for each command
%load
The %load magic command has the following syntax:
> %load csv_file_path table_name [skip_row_num] [seperator] [character set] [enclosing character]
The %load magic command can load CSV file for updating specific table data.

This command does not create a table if the one specified as argument doesn't exist,
the user needs to create the destination table with the proper schema to match the data in the CSV file.

CSV file first line may be header, can set [skip row num] to 1 for skipping header.

The separator between columns defaults to ',', but can be reconfigured.

The character set used by the CSV file defaults to utf8.

The character used to enclose entries to escape the field delimiter defaults to `"`.

Any argument can be enclosed by ' ' or " ", handling cases that argument contains spaces.
Cell
namehelp
%%delimiter
The %%delimiter magic command is a cell magic. This means
that it operates over the entire cell within it is used.

Its purpose is to run an SQL statement using a different
delimiter than the default ";". The main usecase should be
Stored Procedures and Stored Functions.

Example:
--------cell
%%delimiter //
CREATE PROCEDURE proc ()
BEGIN
select 1;
END;
//
--------end-of-cell

Please note that the SQL statement needs to end with the
delimiter specified by the magic command.
" + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
missingpercent
id00.0
name412.5
department825.0
age412.5
salary1237.5
" ] }, "metadata": {}, @@ -142,19 +191,48 @@ } ], "source": [ - "%lsmagic\n" + "%missing" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "d9679f73-93a4-4e00-8561-e01207ab9015", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
monthsales
Jan10
Feb20
Mar15
Apr30
May22
Jun18
" + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
percent
id0.0
name12.5
department25.0
age12.5
salary37.5
" ] }, "metadata": {}, @@ -162,66 +240,244 @@ } ], "source": [ - "SELECT month, sales FROM sample_sales ORDER BY FIELD(month,'Jan','Feb','Mar','Apr','May','Jun');\n" + "%missing action=percent" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "771a152f-02ce-42e5-a006-6444bf2626db", "metadata": {}, "outputs": [ { "data": { - "image/png": "" + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
dtypemissingpercent
idint6400.0
nameobject412.5
departmentobject825.0
agefloat64412.5
salaryfloat641237.5
" + ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ - "%line\n" + "%missing action=summary" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 1, "id": "28756366-6f31-4045-b2ea-7d47fdf08ff8", "metadata": {}, "outputs": [ { "data": { - "image/png": "" + "text/html": [ + "
Line
namehelp
%line
The %line magic command follows a syntax very similar to that of
DataFrame.plot.line from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.line.html

Example:
> %line x=column1 y=column2

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Line class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%bar
The %bar magic command follows a syntax very similar to that of
DataFrame.plot.bar from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.bar.html

Example:
> %bar x=column1 y=column2 stacked=True

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Bar class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%pie
The %pie magic command follows a syntax very similar to that of
DataFrame.plot.pie from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.pie.html

Example:
> %pie y=column_name

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Pie class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%df
The %df magic command has the following syntax:
> %df [filename]

It writes the result of the last query executed in the notebook
into an external CSV formatted file.
The purpose of this magic command is to allow users to export query
data from their MariaDB databases and then quickly import it
into a Python Notebook where more complex analytics can be performed.

If no arguments are specified, the kernel writes the data into a
CSV file named 'last_query.csv'.
%lsmagic
The %lsmagic magic command prints the magics currently
supported by the kernel. It also prints the help text for each command
%load
The %load magic command has the following syntax:
> %load csv_file_path table_name [skip_row_num] [seperator] [character set] [enclosing character]
The %load magic command can load CSV file for updating specific table data.

This command does not create a table if the one specified as argument doesn't exist,
the user needs to create the destination table with the proper schema to match the data in the CSV file.

CSV file first line may be header, can set [skip row num] to 1 for skipping header.

The separator between columns defaults to ',', but can be reconfigured.

The character set used by the CSV file defaults to utf8.

The character used to enclose entries to escape the field delimiter defaults to `"`.

Any argument can be enclosed by ' ' or " ", handling cases that argument contains spaces.
missing%missing [action=show|percent|summary] [columns=col1,col2]
Display missing-value information from the last query result.
dropmissing%dropmissing [columns=col1,col2,...]
Drops rows with missing values from data['last_select'] (always IN-PLACE).
Cell
namehelp
%%delimiter
The %%delimiter magic command is a cell magic. This means
that it operates over the entire cell within it is used.

Its purpose is to run an SQL statement using a different
delimiter than the default ";". The main usecase should be
Stored Procedures and Stored Functions.

Example:
--------cell
%%delimiter //
CREATE PROCEDURE proc ()
BEGIN
select 1;
END;
//
--------end-of-cell

Please note that the SQL statement needs to end with the
delimiter specified by the magic command.
" + ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ - "%bar" + "%lsmagic" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 8, "id": "e01f05ec-ebc9-4344-a888-fceb40da7a8f", "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "pie requires either y column or 'subplots=True'\n" + "Dropped rows with missing values (in-place). Updated last_select.\n" ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
1AliceHR30.050000.0
4DavidHR25.048000.0
6FrankEngineering28.072000.0
8GraceSales45.065000.0
9AliceHR30.050000.0
12DavidHR25.048000.0
14FrankEngineering28.072000.0
16GraceSales45.065000.0
17AliceHR30.050000.0
20DavidHR25.048000.0
22FrankEngineering28.072000.0
24GraceSales45.065000.0
25AliceHR30.050000.0
28DavidHR25.048000.0
30FrankEngineering28.072000.0
32GraceSales45.065000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "%pie" + "%dropmissing" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "id": "70e58c09-5a55-4093-b135-0fed2202d2d3", "metadata": {}, "outputs": [ @@ -229,17 +485,150 @@ "name": "stdout", "output_type": "stream", "text": [ - "The result set was successfully written into sample_sales_export.csv\n" + "Dropped rows with missing values (in-place). Updated last_select.\n" ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
1AliceHR30.050000.0
4DavidHR25.048000.0
6FrankEngineering28.072000.0
8GraceSales45.065000.0
9AliceHR30.050000.0
12DavidHR25.048000.0
14FrankEngineering28.072000.0
16GraceSales45.065000.0
17AliceHR30.050000.0
20DavidHR25.048000.0
22FrankEngineering28.072000.0
24GraceSales45.065000.0
25AliceHR30.050000.0
28DavidHR25.048000.0
30FrankEngineering28.072000.0
32GraceSales45.065000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "%df sample_sales_export.csv" + "%dropmissing columns=salary" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 22, "id": "c674309d-4c1c-4d5a-a879-3b3f4816e4c5", "metadata": {}, "outputs": [ @@ -247,24 +636,139 @@ "name": "stdout", "output_type": "stream", "text": [ - "The result set was successfully written into last_query.csv\n" + "Dropped rows with missing values (in-place). Updated last_select.\n" ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
1AliceHR30.050000.0
3CharlieEngineeringNaN70000.0
4DavidHR25.048000.0
6FrankEngineering28.072000.0
8GraceSales45.065000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "%df" + "%dropmissing columns=salary" ] }, { "cell_type": "code", - "execution_count": 15, - "id": "b1d967f6-6777-454b-821b-d057d96eef13", + "execution_count": 8, + "id": "364a4ddb-de96-4f20-b979-292184dadd0e", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
monthsales
Jan10
Feb20
Mar15
Apr30
May22
..." + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idagesalary
count8.000007.0000005.00000
mean4.5000036.14285761000.00000
std2.449499.26334311269.42767
min1.0000025.00000048000.00000
25%2.7500029.00000050000.00000
50%4.5000035.00000065000.00000
75%6.2500042.50000070000.00000
max8.0000050.00000072000.00000
" ] }, "metadata": {}, @@ -272,19 +776,120 @@ } ], "source": [ - "%load sample_sales_export.csv sample_sales" + "%stats" ] }, { "cell_type": "code", - "execution_count": 16, - "id": "364a4ddb-de96-4f20-b979-292184dadd0e", + "execution_count": 9, + "id": "0031cec8-4a24-491b-a910-f3effa939afd", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "Query OK" + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
count8.00000767.0000005.00000
uniqueNaN73NaNNaN
topNaNAliceHRNaNNaN
freqNaN12NaNNaN
mean4.50000NaNNaN36.14285761000.00000
std2.44949NaNNaN9.26334311269.42767
min1.00000NaNNaN25.00000048000.00000
25%2.75000NaNNaN29.00000050000.00000
50%4.50000NaNNaN35.00000065000.00000
75%6.25000NaNNaN42.50000070000.00000
max8.00000NaNNaN50.00000072000.00000
" ] }, "metadata": {}, @@ -292,22 +897,18 @@ } ], "source": [ - "%%delimiter //\n", - "\n", - "CREATE PROCEDURE demo_proc()\n", - "BEGIN\n", - " SELECT 'hello', 1;\n", - "END;\n", - "//\n" + "%stats include=all" ] }, { "cell_type": "code", "execution_count": null, - "id": "525984db-c41b-4f63-a0b6-4c8ac14b1ac2", + "id": "4247d68c-6f93-4297-b1fd-fa09bf6362f8", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "drop table employees;" + ] } ], "metadata": { From b3aab62e48fbf90cd4c71f42346c8fee398ba602 Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Sun, 26 Oct 2025 10:33:55 +0000 Subject: [PATCH 08/38] Added fillmissing --- Untitled.ipynb | 464 +++++++++++++++++- last_query.csv | 16 +- .../data_cleaning}/dropmissing.py | 0 .../ml_commands/data_cleaning/fillmissing.py | 192 ++++++++ .../data_cleaning}/missing.py | 0 .../{ => ml_commands/data_cleaning}/stats.py | 0 .../maria_magics/supported_magics.py | 8 +- 7 files changed, 662 insertions(+), 18 deletions(-) rename mariadb_kernel/maria_magics/{ => ml_commands/data_cleaning}/dropmissing.py (100%) create mode 100644 mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py rename mariadb_kernel/maria_magics/{ => ml_commands/data_cleaning}/missing.py (100%) rename mariadb_kernel/maria_magics/{ => ml_commands/data_cleaning}/stats.py (100%) diff --git a/Untitled.ipynb b/Untitled.ipynb index 1f9843b..f64b069 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -141,7 +141,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 8, "id": "184962da-5d3c-4f96-83de-2578df4706fe", "metadata": {}, "outputs": [ @@ -164,22 +164,22 @@ " \n", " \n", " name\n", - " 4\n", + " 1\n", " 12.5\n", " \n", " \n", " department\n", - " 8\n", + " 2\n", " 25.0\n", " \n", " \n", " age\n", - " 4\n", + " 1\n", " 12.5\n", " \n", " \n", " salary\n", - " 12\n", + " 3\n", " 37.5\n", " \n", " \n", @@ -306,14 +306,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "id": 
"28756366-6f31-4045-b2ea-7d47fdf08ff8", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Line
namehelp
%line
The %line magic command follows a syntax very similar to that of
DataFrame.plot.line from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.line.html

Example:
> %line x=column1 y=column2

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Line class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%bar
The %bar magic command follows a syntax very similar to that of
DataFrame.plot.bar from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.bar.html

Example:
> %bar x=column1 y=column2 stacked=True

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Bar class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%pie
The %pie magic command follows a syntax very similar to that of
DataFrame.plot.pie from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.pie.html

Example:
> %pie y=column_name

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Pie class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%df
The %df magic command has the following syntax:
> %df [filename]

It writes the result of the last query executed in the notebook
into an external CSV formatted file.
The purpose of this magic command is to allow users to export query
data from their MariaDB databases and then quickly import it
into a Python Notebook where more complex analytics can be performed.

If no arguments are specified, the kernel writes the data into a
CSV file named 'last_query.csv'.
%lsmagic
The %lsmagic magic command prints the magics currently
supported by the kernel. It also prints the help text for each command
%load
The %load magic command has the following syntax:
> %load csv_file_path table_name [skip_row_num] [seperator] [character set] [enclosing character]
The %load magic command can load CSV file for updating specific table data.

This command does not create a table if the one specified as argument doesn't exist,
the user needs to create the destination table with the proper schema to match the data in the CSV file.

CSV file first line may be header, can set [skip row num] to 1 for skipping header.

The separator between columns defaults to ',', but can be reconfigured.

The character set used by the CSV file defaults to utf8.

The character used to enclose entries to escape the field delimiter defaults to `"`.

Any argument can be enclosed by ' ' or " ", handling cases that argument contains spaces.
missing%missing [action=show|percent|summary] [columns=col1,col2]
Display missing-value information from the last query result.
dropmissing%dropmissing [columns=col1,col2,...]
Drops rows with missing values from data['last_select'] (always IN-PLACE).
Cell
namehelp
%%delimiter
The %%delimiter magic command is a cell magic. This means
that it operates over the entire cell within it is used.

Its purpose is to run an SQL statement using a different
delimiter than the default ";". The main usecase should be
Stored Procedures and Stored Functions.

Example:
--------cell
%%delimiter //
CREATE PROCEDURE proc ()
BEGIN
select 1;
END;
//
--------end-of-cell

Please note that the SQL statement needs to end with the
delimiter specified by the magic command.
" + "
Line
namehelp
%line
The %line magic command follows a syntax very similar to that of
DataFrame.plot.line from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.line.html

Example:
> %line x=column1 y=column2

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Line class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%bar
The %bar magic command follows a syntax very similar to that of
DataFrame.plot.bar from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.bar.html

Example:
> %bar x=column1 y=column2 stacked=True

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Bar class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%pie
The %pie magic command follows a syntax very similar to that of
DataFrame.plot.pie from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.pie.html

Example:
> %pie y=column_name

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Pie class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%df
The %df magic command has the following syntax:
> %df [filename]

It writes the result of the last query executed in the notebook
into an external CSV formatted file.
The purpose of this magic command is to allow users to export query
data from their MariaDB databases and then quickly import it
into a Python Notebook where more complex analytics can be performed.

If no arguments are specified, the kernel writes the data into a
CSV file named 'last_query.csv'.
%lsmagic
The %lsmagic magic command prints the magics currently
supported by the kernel. It also prints the help text for each command
%load
The %load magic command has the following syntax:
> %load csv_file_path table_name [skip_row_num] [seperator] [character set] [enclosing character]
The %load magic command can load CSV file for updating specific table data.

This command does not create a table if the one specified as argument doesn't exist,
the user needs to create the destination table with the proper schema to match the data in the CSV file.

CSV file first line may be header, can set [skip row num] to 1 for skipping header.

The separator between columns defaults to ',', but can be reconfigured.

The character set used by the CSV file defaults to utf8.

The character used to enclose entries to escape the field delimiter defaults to `"`.

Any argument can be enclosed by ' ' or " ", handling cases that argument contains spaces.
missing%missing [action=show|percent|summary] [columns=col1,col2]
Display missing-value information from the last query result.
dropmissing%dropmissing [columns=col1,col2,...]
Drops rows with missing values from data['last_select'] (always IN-PLACE).
stats%stats [columns=col1,col2] [include=all|numeric|object] [percentiles=25,50,75] [transpose=true|false]
Show statistical summary (uses pandas.DataFrame.describe under the hood).
fillmissing%fillmissing [columns=col1,col2,...] [strategy=mean|median|mode|constant] [value=const]
Fills missing values in data['last_select'] (always IN-PLACE).
Cell
namehelp
%%delimiter
The %%delimiter magic command is a cell magic. This means
that it operates over the entire cell within it is used.

Its purpose is to run an SQL statement using a different
delimiter than the default ";". The main usecase should be
Stored Procedures and Stored Functions.

Example:
--------cell
%%delimiter //
CREATE PROCEDURE proc ()
BEGIN
select 1;
END;
//
--------end-of-cell

Please note that the SQL statement needs to end with the
delimiter specified by the magic command.
" ] }, "metadata": {}, @@ -900,15 +900,463 @@ "%stats include=all" ] }, + { + "cell_type": "code", + "execution_count": 28, + "id": "829f6342-96b1-4d81-8519-2a53c091dfb1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fill missing completed (in-place). Summary:\n", + "Column 'id': filled missing with mean=4.5.\n", + "Column 'name' is not numeric; cannot use mean. Skipped.\n", + "Column 'department' is not numeric; cannot use mean. Skipped.\n", + "Column 'age': filled missing with mean=36.142857142857146.\n", + "Column 'salary': filled missing with mean=61000.0.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
1AliceHR30.00000050000.0
2BobUnknown40.00000061000.0
3CharlieEngineering36.14285770000.0
4DavidHR25.00000048000.0
5EveUnknown35.00000061000.0
6FrankEngineering28.00000072000.0
7UnknownSales50.00000061000.0
8GraceSales45.00000065000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%fillmissing" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "52d78b76-69c5-41d4-874f-8d8cb8b0cae9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fill missing completed (in-place). Summary:\n", + "Column 'age': filled missing with median=35.0.\n", + "Column 'salary': filled missing with median=65000.0.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
1AliceHR30.050000.0
2BobNaN40.065000.0
3CharlieEngineering35.070000.0
4DavidHR25.048000.0
5EveNaN35.065000.0
6FrankEngineering28.072000.0
7NaNSales50.065000.0
8GraceSales45.065000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%fillmissing columns=age,salary strategy=median" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "beac2393-829b-472e-b9cb-d12166e16088", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fill missing completed (in-place). Summary:\n", + "Column 'id': filled missing with mode=1.\n", + "Column 'name': filled missing with mode=Alice.\n", + "Column 'department': filled missing with mode=Engineering.\n", + "Column 'age': filled missing with mode=25.0.\n", + "Column 'salary': filled missing with mode=48000.0.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
1AliceHR30.050000.0
2BobEngineering40.048000.0
3CharlieEngineering25.070000.0
4DavidHR25.048000.0
5EveEngineering35.048000.0
6FrankEngineering28.072000.0
7AliceSales50.048000.0
8GraceSales45.065000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%fillmissing strategy=mode" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "948523f7-1b1c-4f37-a3ae-161e8ce40d0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fill missing completed (in-place). Summary:\n", + "Column 'name': filled missing with constant value=Unknown.\n", + "Column 'department': filled missing with constant value=Unknown.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
1AliceHR30.050000.0
2BobUnknown40.0NaN
3CharlieEngineeringNaN70000.0
4DavidHR25.048000.0
5EveUnknown35.0NaN
6FrankEngineering28.072000.0
7UnknownSales50.0NaN
8GraceSales45.065000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%fillmissing columns=name,department strategy=constant value=\"Unknown\"" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "81a88f9a-bdf8-4f87-a5d0-a0a88fcc5ace", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Strategy 'constant' requires a 'value=...' argument.\n" + ] + } + ], + "source": [ + "%fillmissing strategy=constant" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "d76ed0d7-4332-40b0-a5c6-588784807a23", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The result set was successfully written into last_query.csv\n" + ] + } + ], + "source": [ + "%df" + ] + }, { "cell_type": "code", "execution_count": null, "id": "4247d68c-6f93-4297-b1fd-fa09bf6362f8", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "drop table employees;" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87d73330-b792-4d19-9eac-daa9cb0c7d1a", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/last_query.csv b/last_query.csv index 913be90..3450d41 100644 --- a/last_query.csv +++ b/last_query.csv @@ -1,7 +1,9 @@ -month,sales -Jan,10 -Feb,20 -Mar,15 -Apr,30 -May,22 -Jun,18 +id,name,department,age,salary +1,Alice,HR,30.0,50000.0 +2,Bob,Unknown,40.0,61000.0 +3,Charlie,Engineering,36.142857142857146,70000.0 +4,David,HR,25.0,48000.0 +5,Eve,Unknown,35.0,61000.0 +6,Frank,Engineering,28.0,72000.0 +7,Unknown,Sales,50.0,61000.0 +8,Grace,Sales,45.0,65000.0 diff --git a/mariadb_kernel/maria_magics/dropmissing.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py similarity index 100% rename from mariadb_kernel/maria_magics/dropmissing.py rename to 
mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py new file mode 100644 index 0000000..2c768a4 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py @@ -0,0 +1,192 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd + + +class FillMissing(MariaMagic): + """ + %fillmissing [columns=col1,col2,...] [strategy=mean|median|mode|constant] [value=const] + + Always performs the operation IN-PLACE on data["last_select"]: + + - If columns provided, fill missing values only for those columns. + - If no columns provided, fill missing values for all columns. + - strategies: + * mean -> uses column mean (numeric columns only) + * median -> uses column median (numeric columns only) + * mode -> uses column mode (most frequent value; works for any dtype) + * constant-> fills with provided value (value must be supplied via value=...) + Examples: + %fillmissing + -> fills numeric columns with their mean (default strategy=mean) + %fillmissing columns=age,salary strategy=median + -> fills age and salary missing values with column medians (in-place) + %fillmissing columns=name strategy=constant value="unknown" + -> fills name with "unknown" where missing (in-place) + %fillmissing strategy=mode + -> fills every column's missing values with its mode (if exists) + """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "fillmissing" + + def help(self): + return ( + "%fillmissing [columns=col1,col2,...] [strategy=mean|median|mode|constant] [value=const]\n" + "Fills missing values in data['last_select'] (always IN-PLACE)." 
+ ) + + def _str_to_obj(self, s): + """Cast simple strings to Python objects where sensible.""" + try: + return int(s) + except ValueError: + try: + return float(s) + except ValueError: + pass + try: + return bool(util.strtobool(s)) + except Exception: + # Remove surrounding quotes if present so value="abc" becomes abc (still as string) + if isinstance(s, str) and len(s) >= 2 and ((s[0] == s[-1] == '"') or (s[0] == s[-1] == "'")): + return s[1:-1] + return s + + def parse_args(self, input_str): + """Parse key=value arguments (keeps behavior consistent with other magics).""" + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df): + """Display DataFrame as HTML (fallback to text if needed).""" + try: + html = df.to_html(index=False) + mime = "text/html" + except Exception: + html = str(df) + mime = "text/plain" + display_content = {"data": {mime: html}, "metadata": {}} + kernel.send_response(kernel.iopub_socket, "display_data", display_content) + + def execute(self, kernel, data): + """Execute the fillmissing magic (always modifies data['last_select']).""" + df = data.get("last_select") + if df is None: + kernel._send_message("stderr", "No last_select found in kernel data.") + return + + if hasattr(df, "empty") and df.empty: + kernel._send_message("stderr", "There is no data to process (empty DataFrame).") + return + + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments. 
Use key=value syntax.") + return + + # parse columns argument + columns_arg = args.get("columns", None) + if isinstance(columns_arg, str): + columns = [c.strip() for c in columns_arg.split(",") if c.strip()] + elif isinstance(columns_arg, (list, tuple)): + columns = list(columns_arg) + else: + columns = None + + # determine target columns (None => all columns) + if columns is None: + target_columns = list(df.columns) + else: + target_columns = columns + missing_cols = [c for c in target_columns if c not in df.columns] + if missing_cols: + kernel._send_message("stderr", f"Column(s) not found: {', '.join(missing_cols)}") + return + + # parse strategy + strategy = args.get("strategy", "mean") + if isinstance(strategy, str): + strategy = strategy.lower() + else: + strategy = str(strategy).lower() + + allowed = {"mean", "median", "mode", "constant"} + if strategy not in allowed: + kernel._send_message("stderr", f"Unknown strategy '{strategy}'. Allowed: {', '.join(allowed)}") + return + + # constant requires value + value_provided = "value" in args + const_value = args.get("value", None) + + if strategy == "constant" and not value_provided: + kernel._send_message("stderr", "Strategy 'constant' requires a 'value=...' argument.") + return + + # perform filling column by column with sensible handling for dtype + messages = [] + for col in target_columns: + try: + series = df[col] + if strategy in {"mean", "median"}: + # only numeric columns supported for mean/median + if pd.api.types.is_numeric_dtype(series): + if strategy == "mean": + fill_val = series.mean(skipna=True) + else: + fill_val = series.median(skipna=True) + # If result is NaN (e.g., all values missing), skip and warn + if pd.isna(fill_val): + messages.append(f"Column '{col}': no non-missing values to compute {strategy}. 
Skipped.") + continue + df[col].fillna(fill_val, inplace=True) + messages.append(f"Column '{col}': filled missing with {strategy}={fill_val}.") + else: + messages.append(f"Column '{col}' is not numeric; cannot use {strategy}. Skipped.") + continue + + elif strategy == "mode": + # mode works for any dtype; pick first mode if multiple + modes = series.mode(dropna=True) + if modes.empty: + messages.append(f"Column '{col}': no mode (all missing). Skipped.") + continue + fill_val = modes.iloc[0] + df[col].fillna(fill_val, inplace=True) + messages.append(f"Column '{col}': filled missing with mode={fill_val}.") + + elif strategy == "constant": + # use the parsed const_value directly + fill_val = const_value + # If fill_val is a string that looks like "None", we want to keep it as string; + # do not coerce types implicitly — user controls value type via quotes or unquoted numbers. + df[col].fillna(fill_val, inplace=True) + messages.append(f"Column '{col}': filled missing with constant value={fill_val}.") + + except Exception as e: + messages.append(f"Column '{col}': error while filling missing values: {e}") + + # update the data store and display results + try: + data["last_select"] = df + summary = "\n".join(messages) + kernel._send_message("stdout", f"Fill missing completed (in-place). 
Summary:\n{summary}") + self._send_html(kernel, df) + except Exception as e: + kernel._send_message("stderr", f"Error while updating last_select or displaying DataFrame: {e}") diff --git a/mariadb_kernel/maria_magics/missing.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py similarity index 100% rename from mariadb_kernel/maria_magics/missing.py rename to mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py diff --git a/mariadb_kernel/maria_magics/stats.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/stats.py similarity index 100% rename from mariadb_kernel/maria_magics/stats.py rename to mariadb_kernel/maria_magics/ml_commands/data_cleaning/stats.py diff --git a/mariadb_kernel/maria_magics/supported_magics.py b/mariadb_kernel/maria_magics/supported_magics.py index a207408..e7f0a79 100644 --- a/mariadb_kernel/maria_magics/supported_magics.py +++ b/mariadb_kernel/maria_magics/supported_magics.py @@ -11,9 +11,10 @@ from mariadb_kernel.maria_magics.pie import Pie from mariadb_kernel.maria_magics.delimiter import Delimiter from mariadb_kernel.maria_magics.load import Load -from mariadb_kernel.maria_magics.missing import Missing -from mariadb_kernel.maria_magics.dropmissing import DropMissing -from mariadb_kernel.maria_magics.stats import Stats +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.missing import Missing +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.dropmissing import DropMissing +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.stats import Stats +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.fillmissing import FillMissing def get(): return { @@ -27,4 +28,5 @@ def get(): "missing": Missing, "dropmissing": DropMissing, "stats": Stats, + "fillmissing": FillMissing, } From d49fefea99749419637372dbab04511f4d058d72 Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Sun, 26 Oct 2025 11:04:14 +0000 Subject: [PATCH 09/38] Added outliers --- Untitled.ipynb | 510 
+++++++++++++++++- .../ml_commands/data_cleaning/outliers.py | 281 ++++++++++ .../maria_magics/supported_magics.py | 2 + 3 files changed, 789 insertions(+), 4 deletions(-) create mode 100644 mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py diff --git a/Untitled.ipynb b/Untitled.ipynb index f64b069..431b0f5 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -109,9 +109,9 @@ ], "source": [ "INSERT INTO employees (name, department, age, salary) VALUES\n", - "('Alice', 'HR', 30, 50000),\n", + "('Alice', 'HR', 30,5000),\n", "('Bob', NULL, 40, NULL),\n", - "('Charlie', 'Engineering', NULL, 70000),\n", + "('Charlie', 'Engineering', NULL, 700000),\n", "('David', 'HR', 25, 48000),\n", "('Eve', NULL, 35, NULL),\n", "('Frank', 'Engineering', 28, 72000),\n", @@ -128,7 +128,7 @@ { "data": { "text/html": [ - "
idnamedepartmentagesalary
1AliceHR3050000.00
2BobNULL40NULL
3CharlieEngineeringNULL70000.00
4DavidHR2548000.00
5EveNULL35NULL
6FrankEngineering2872000.00
7NULLSales50NULL
8GraceSales4565000.00
" + "
idnamedepartmentagesalary
1AliceHR305000.00
2BobNULL40NULL
3CharlieEngineeringNULL700000.00
4DavidHR2548000.00
5EveNULL35NULL
6FrankEngineering2872000.00
7NULLSales50NULL
8GraceSales4565000.00
" ] }, "metadata": {}, @@ -1330,6 +1330,503 @@ "%df" ] }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6cf47bfb-ff40-4522-8723-a8bf17398210", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Outlier detection completed (non in-place). Summary:\n", + "Column 'id': detected 0 outlier(s) using iqr.\n", + "Column 'age': detected 0 outlier(s) using iqr.\n", + "Column 'salary': detected 2 outlier(s) using iqr.\n", + "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalaryid_is_outlierage_is_outliersalary_is_outlier
1AliceHR30.05000.0FalseFalseTrue
2BobNaN40.0NaNFalseFalseFalse
3CharlieEngineeringNaN700000.0FalseFalseTrue
4DavidHR25.048000.0FalseFalseFalse
5EveNaN35.0NaNFalseFalseFalse
6FrankEngineering28.072000.0FalseFalseFalse
7NaNSales50.0NaNFalseFalseFalse
8GraceSales45.065000.0FalseFalseFalse
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%outliers" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ea459692-59ac-49be-8ba2-0c4b0b01908d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Outlier detection completed (non in-place). Summary:\n", + "Column 'id': detected 0 outlier(s) using zscore.\n", + "Column 'age': detected 0 outlier(s) using zscore.\n", + "Column 'salary': detected 0 outlier(s) using zscore.\n", + "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalaryid_is_outlierage_is_outliersalary_is_outlier
1AliceHR30.05000.0FalseFalseFalse
2BobNaN40.0NaNFalseFalseFalse
3CharlieEngineeringNaN700000.0FalseFalseFalse
4DavidHR25.048000.0FalseFalseFalse
5EveNaN35.0NaNFalseFalseFalse
6FrankEngineering28.072000.0FalseFalseFalse
7NaNSales50.0NaNFalseFalseFalse
8GraceSales45.065000.0FalseFalseFalse
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%outliers method=zscore z_thresh=2.5" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6a53e5f2-cdb4-4d72-afb1-5a7719be8835", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Outlier detection completed (non in-place). Summary:\n", + "Column 'id': detected 0 outlier(s) using iqr.\n", + "Column 'age': detected 0 outlier(s) using iqr.\n", + "Column 'salary': detected 2 outlier(s) using iqr.\n", + "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" + ] + }, + { + "data": { + "image/png": "" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalaryid_is_outlierage_is_outliersalary_is_outlier
1AliceHR30.05000.0FalseFalseTrue
2BobNaN40.0NaNFalseFalseFalse
3CharlieEngineeringNaN700000.0FalseFalseTrue
4DavidHR25.048000.0FalseFalseFalse
5EveNaN35.0NaNFalseFalseFalse
6FrankEngineering28.072000.0FalseFalseFalse
7NaNSales50.0NaNFalseFalseFalse
8GraceSales45.065000.0FalseFalseFalse
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%outliers plot=True" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "636a1612-e7b4-44fd-9cdb-bf7c935719b7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Marked outliers in-place. Summary:\n", + "Column 'salary': detected 2 outlier(s) using iqr.\n" + ] + }, + { + "data": { + "image/png": "" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalarysalary_is_outlier
1AliceHR30.05000.0True
2BobNaN40.0NaNFalse
3CharlieEngineeringNaN700000.0True
4DavidHR25.048000.0False
5EveNaN35.0NaNFalse
6FrankEngineering28.072000.0False
7NaNSales50.0NaNFalse
8GraceSales45.065000.0False
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%outliers columns=salary plot=True" + ] + }, { "cell_type": "code", "execution_count": null, @@ -1339,7 +1836,12 @@ { "data": { "text/html": [ - "Query OK" + "--------------\r\n", + "drop table employees\r\n", + "--------------\r\n", + "\r\n", + "ERROR 1051 (42S02) at line 1 in file: '/home/iddhartha/mariadb_kernel/.mariadb_statement_ebed4fbe-b25a-11f0-a17a-00155d4db3cf': Unknown table 'test.employees'\r\n", + "\u0007\u001b(B\u001b[0;7m\u001b(B\u001b[m" ] }, "metadata": {}, diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py new file mode 100644 index 0000000..b10294f --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py @@ -0,0 +1,281 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +import numpy as np +import io +import matplotlib +matplotlib.use("Agg") +import matplotlib.pyplot as plt + + +class Outliers(MariaMagic): + """ + %outliers [columns=col1,col2,...] [method=iqr|zscore] [k=1.5] [z_thresh=3.0] [plot=True|False] + + Detects outliers (NON IN-PLACE) and stores a copy of the DataFrame with boolean + indicator columns in data['last_select_outliers']. + + - method: + iqr -> Tukey IQR method using k (default 1.5) + zscore-> absolute z-score above z_thresh (default 3.0) + - columns: comma-separated columns to test. If omitted, all numeric columns are used. + - plot: True/False (default False). 
When True, displays a figure containing: + * top: boxplot of selected numeric columns with detected outliers overlaid + * bottom: scatter plot (index vs value) for each selected column; outliers highlighted + Examples: + %outliers + %outliers columns=age,salary method=zscore z_thresh=2.5 plot=True + """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "outliers" + + def help(self): + return ( + "%outliers [columns=col1,col2,...] [method=iqr|zscore] [k=1.5] [z_thresh=3.0] [plot=True|False]\n" + "Detects outliers in data['last_select'] (non in-place). Results placed in data['last_select_outliers']." + ) + + def _str_to_obj(self, s): + """Cast simple strings to Python objects where sensible.""" + try: + return int(s) + except ValueError: + try: + return float(s) + except ValueError: + pass + try: + return bool(util.strtobool(s)) + except Exception: + if isinstance(s, str) and len(s) >= 2 and ((s[0] == s[-1] == '"') or (s[0] == s[-1] == "'")): + return s[1:-1] + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df): + try: + html = df.to_html(index=False) + mime = "text/html" + except Exception: + html = str(df) + mime = "text/plain" + display_content = {"data": {mime: html}, "metadata": {}} + kernel.send_response(kernel.iopub_socket, "display_data", display_content) + + def _send_image(self, kernel, fig): + buf = io.BytesIO() + try: + fig.tight_layout() + except Exception: + pass + fig.savefig(buf, format="png", bbox_inches="tight") + plt.close(fig) + buf.seek(0) + img_bytes = buf.read() + display_content = {"data": {"image/png": img_bytes}, "metadata": {}} + kernel.send_response(kernel.iopub_socket, "display_data", display_content) + + def 
_detect_outliers_series(self, series, method, k=1.5, z_thresh=3.0): + """Return boolean mask of outliers for a pandas Series (True where outlier).""" + if series.dropna().empty: + return pd.Series(False, index=series.index) + + if method == "iqr": + q1 = series.quantile(0.25) + q3 = series.quantile(0.75) + iqr = q3 - q1 + lower = q1 - k * iqr + upper = q3 + k * iqr + mask = (series < lower) | (series > upper) + return mask.fillna(False) + + elif method == "zscore": + mean = series.mean(skipna=True) + std = series.std(skipna=True) + if std == 0 or np.isnan(std): + return pd.Series(False, index=series.index) + z = (series - mean) / std + mask = z.abs() > float(z_thresh) + return mask.fillna(False) + + else: + raise ValueError(f"Unknown method {method}") + + def _build_plots(self, df_numeric, outlier_masks): + """ + Builds a figure with: + - top: boxplot of df_numeric (showfliers=False) with overlay of detected outliers + - bottom: scatter plot (index vs value) for each column; outliers highlighted in red + """ + cols = list(df_numeric.columns) + if not cols: + fig = plt.figure(figsize=(6, 3)) + plt.text(0.5, 0.5, "No numeric columns to plot", ha="center", va="center") + return fig + + ncols = len(cols) + fig = plt.figure(figsize=(max(6, ncols * 1.2), 6)) + gs = fig.add_gridspec(2, 1, height_ratios=[1, 1.2], hspace=0.35) + + # Top: boxplot + ax_box = fig.add_subplot(gs[0, 0]) + df_numeric.boxplot(column=cols, ax=ax_box, showfliers=False) + ax_box.set_title("Box plot (detected outliers overlaid)") + ax_box.set_xlabel("") + ax_box.set_ylabel("Value") + + xs = np.arange(1, len(cols) + 1) + for i, col in enumerate(cols): + mask = outlier_masks.get(col) + if mask is None: + continue + out_vals = df_numeric.loc[mask, col] + if out_vals.empty: + continue + # slight horizontal jitter for readability + jitter = np.random.normal(scale=0.05, size=len(out_vals)) + ax_box.scatter(np.full(len(out_vals), xs[i]) + jitter, out_vals.values, + marker='x', s=50, linewidths=1.0, zorder=6) 
+ + # Bottom: scatter plot (index vs value) per column + ax_scatter = fig.add_subplot(gs[1, 0]) + # Plot each column as its own series using the DataFrame index as x + for i, col in enumerate(cols): + series = df_numeric[col] + mask = outlier_masks.get(col, pd.Series(False, index=series.index)) + # Small x-offset per column to avoid overlap when multiple columns share indices + x_offset = (i - (ncols - 1) / 2) * 0.08 + xs_plot = series.index.values.astype(float) + x_offset + ax_scatter.scatter(xs_plot, series.values, alpha=0.6, label=col, s=20) + # highlight outliers in red with larger marker + if mask.any(): + ax_scatter.scatter(series.index.values.astype(float)[mask], series[mask].values, + color='red', edgecolors='k', s=50, label=f"{col} outlier", zorder=7) + + ax_scatter.set_title("Scatter plot (index vs value) — outliers highlighted") + ax_scatter.set_xlabel("Row index") + ax_scatter.set_ylabel("Value") + # avoid duplicate legend entries + handles, labels = ax_scatter.get_legend_handles_labels() + by_label = dict(zip(labels, handles)) + ax_scatter.legend(by_label.values(), by_label.keys(), fontsize='small', loc='best', ncol=2) + + return fig + + def execute(self, kernel, data): + """Execute the outliers magic (non in-place).""" + df = data.get("last_select") + if df is None: + kernel._send_message("stderr", "No last_select found in kernel data.") + return + + if hasattr(df, "empty") and df.empty: + kernel._send_message("stderr", "There is no data to process (empty DataFrame).") + return + + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments. 
Use key=value syntax.") + return + + # parse columns argument + columns_arg = args.get("columns", None) + if isinstance(columns_arg, str): + columns = [c.strip() for c in columns_arg.split(",") if c.strip()] + elif isinstance(columns_arg, (list, tuple)): + columns = list(columns_arg) + else: + columns = None + + method = str(args.get("method", "iqr")).lower() + if method not in {"iqr", "zscore"}: + kernel._send_message("stderr", f"Unknown method '{method}'. Allowed: iqr, zscore.") + return + + try: + k = float(args.get("k", 1.5)) + except Exception: + k = 1.5 + + try: + z_thresh = float(args.get("z_thresh", 3.0)) + except Exception: + z_thresh = 3.0 + + plot = bool(args.get("plot", False)) + + # Determine target numeric columns + if columns is None: + target_columns = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])] + else: + missing_cols = [c for c in columns if c not in df.columns] + if missing_cols: + kernel._send_message("stderr", f"Column(s) not found: {', '.join(missing_cols)}") + return + # keep only numeric columns (skip non-numeric) + target_columns = [c for c in columns if pd.api.types.is_numeric_dtype(df[c])] + non_numeric = [c for c in columns if c not in target_columns] + if non_numeric: + kernel._send_message("stdout", f"Warning: non-numeric columns skipped: {', '.join(non_numeric)}") + + if not target_columns: + kernel._send_message("stderr", "No numeric target columns found to detect outliers.") + return + + # Work on a copy (non in-place) + result_df = df.copy(deep=True) + + # Detect outliers per column and store masks + outlier_masks = {} + messages = [] + for col in target_columns: + try: + mask = self._detect_outliers_series(result_df[col], method, k=k, z_thresh=z_thresh) + outlier_masks[col] = mask + n_out = int(mask.sum()) + messages.append(f"Column '{col}': detected {n_out} outlier(s) using {method}.") + # add boolean indicator column to the copy (non in-place on original) + result_df[f"{col}_is_outlier"] = mask.astype(bool) + 
except Exception as e: + messages.append(f"Column '{col}': error detecting outliers: {e}") + + # Store result in a separate key so original remains unchanged + data["last_select_outliers"] = result_df + + # Send summary message + kernel._send_message("stdout", "Outlier detection completed (non in-place). Summary:\n" + "\n".join(messages)) + kernel._send_message("stdout", "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).") + + # Plot if requested + if plot: + try: + df_numeric = result_df[target_columns] + fig = self._build_plots(df_numeric, outlier_masks) + self._send_image(kernel, fig) + except Exception as e: + kernel._send_message("stderr", f"Error while plotting: {e}") + + # Finally show the result DataFrame (the copy with indicator columns) + try: + self._send_html(kernel, data["last_select_outliers"]) + except Exception: + pass diff --git a/mariadb_kernel/maria_magics/supported_magics.py b/mariadb_kernel/maria_magics/supported_magics.py index e7f0a79..280d5d1 100644 --- a/mariadb_kernel/maria_magics/supported_magics.py +++ b/mariadb_kernel/maria_magics/supported_magics.py @@ -15,6 +15,7 @@ from mariadb_kernel.maria_magics.ml_commands.data_cleaning.dropmissing import DropMissing from mariadb_kernel.maria_magics.ml_commands.data_cleaning.stats import Stats from mariadb_kernel.maria_magics.ml_commands.data_cleaning.fillmissing import FillMissing +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.outliers import Outliers def get(): return { @@ -29,4 +30,5 @@ def get(): "dropmissing": DropMissing, "stats": Stats, "fillmissing": FillMissing, + "outliers": Outliers, } From 7d72efec41911059309f1e976ad83c7ce7cd7896 Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Sun, 26 Oct 2025 11:33:12 +0000 Subject: [PATCH 10/38] Added clipoutliers --- Untitled.ipynb | 383 +++++++++++++++++- .../ml_commands/data_cleaning/clipoutliers.py | 206 ++++++++++ .../ml_commands/data_cleaning/dropoutliers.py | 202 +++++++++ 
.../maria_magics/supported_magics.py | 4 + 4 files changed, 788 insertions(+), 7 deletions(-) create mode 100644 mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py create mode 100644 mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py diff --git a/Untitled.ipynb b/Untitled.ipynb index 431b0f5..e0cd640 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -1332,7 +1332,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "6cf47bfb-ff40-4522-8723-a8bf17398210", "metadata": {}, "outputs": [ @@ -1827,6 +1827,372 @@ "%outliers columns=salary plot=True" ] }, + { + "cell_type": "code", + "execution_count": 7, + "id": "3dfbc841-56d4-40f5-9be0-927acf0e6c63", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dropped 2 row(s) containing outliers (in-place).\n", + "Column 'id': detected 0 outlier(s) using iqr.\n", + "Column 'age': detected 0 outlier(s) using iqr.\n", + "Column 'salary': detected 2 outlier(s) using iqr.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
2BobNaN40.0NaN
4DavidHR25.048000.0
5EveNaN35.0NaN
6FrankEngineering28.072000.0
7NaNSales50.0NaN
8GraceSales45.065000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%dropoutliers" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "fdc71a0a-2246-4754-8f52-642be4ea209f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No outliers detected. No rows removed.\n", + "Column 'id': detected 0 outlier(s) using zscore.\n", + "Column 'age': detected 0 outlier(s) using zscore.\n", + "Column 'salary': detected 0 outlier(s) using zscore.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
2BobNaN40.0NaN
4DavidHR25.048000.0
5EveNaN35.0NaN
6FrankEngineering28.072000.0
7NaNSales50.0NaN
8GraceSales45.065000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%dropoutliers method=zscore z_thresh=2.5" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b4c02d58-7020-4514-b84a-9a6b5e18fb40", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Clip outliers completed.\n", + "Column 'id': clipped 0 value(s) to bounds (lower=-2.5, upper=11.5).\n", + "Column 'age': clipped 0 value(s) to bounds (lower=8.75, upper=62.75).\n", + "Column 'salary': clipped 2 value(s) to bounds (lower=12000.0, upper=108000.0).\n", + "Total values clipped: 2. Modified in-place: data['last_select'] updated.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
1AliceHR30.012000.0
2BobNaN40.0NaN
3CharlieEngineeringNaN108000.0
4DavidHR25.048000.0
5EveNaN35.0NaN
6FrankEngineering28.072000.0
7NaNSales50.0NaN
8GraceSales45.065000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%clipoutliers" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "d146b7c7-8860-4962-a9e8-78675b068982", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Clip outliers completed.\n", + "Column 'id': clipped 0 value(s) to bounds (lower=-0.39897948556635576, upper=9.398979485566356).\n", + "Column 'age': clipped 0 value(s) to bounds (lower=17.616171113291706, upper=54.669543172422586).\n", + "Column 'salary': clipped 0 value(s) to bounds (lower=-9114.19257183242, upper=131114.19257183242).\n", + "Total values clipped: 0. Modified in-place: data['last_select'] updated.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
1AliceHR30.012000.0
2BobNaN40.0NaN
3CharlieEngineeringNaN108000.0
4DavidHR25.048000.0
5EveNaN35.0NaN
6FrankEngineering28.072000.0
7NaNSales50.0NaN
8GraceSales45.065000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%clipoutliers method=zscore z_thresh=2.0 " + ] + }, { "cell_type": "code", "execution_count": null, @@ -1836,12 +2202,7 @@ { "data": { "text/html": [ - "--------------\r\n", - "drop table employees\r\n", - "--------------\r\n", - "\r\n", - "ERROR 1051 (42S02) at line 1 in file: '/home/iddhartha/mariadb_kernel/.mariadb_statement_ebed4fbe-b25a-11f0-a17a-00155d4db3cf': Unknown table 'test.employees'\r\n", - "\u0007\u001b(B\u001b[0;7m\u001b(B\u001b[m" + "Query OK" ] }, "metadata": {}, @@ -1859,6 +2220,14 @@ "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87d02cd4-7308-4a7f-b598-064deb297357", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py new file mode 100644 index 0000000..98e8d4c --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py @@ -0,0 +1,206 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +import numpy as np + + +class ClipOutliers(MariaMagic): + """ + %clipoutliers [columns=col1,col2,...] [method=iqr|zscore] + [k=1.5] [z_thresh=3.0] [inplace=True|False] + + Clamps (clips) extreme values to computed boundary limits. + + - method: + iqr -> Tukey IQR method using k (default 1.5) + zscore -> mean ± z_thresh * std (default z_thresh=3.0) + + - columns: comma-separated list of columns to operate on. If omitted, all numeric columns are used. + - inplace: if True (default) modifies data["last_select"] in-place. + if False stores clipped copy in data["last_select_clipped"]. 
+ + Examples: + %clipoutliers -> clip numeric columns using iqr (k=1.5) in-place + %clipoutliers method=zscore z_thresh=2.5 columns=age,salary inplace=False + """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "clipoutliers" + + def help(self): + return ( + "%clipoutliers [columns=col1,col2,...] [method=iqr|zscore] " + "[k=1.5] [z_thresh=3.0] [inplace=True|False]\n" + "Clamps extreme numeric values to computed boundaries (in-place by default)." + ) + + def _str_to_obj(self, s): + """Convert strings like numbers or bools into Python objects.""" + try: + return int(s) + except ValueError: + try: + return float(s) + except ValueError: + pass + try: + return bool(util.strtobool(s)) + except Exception: + if isinstance(s, str) and len(s) >= 2 and ((s[0] == s[-1] == '"') or (s[0] == s[-1] == "'")): + return s[1:-1] + return s + + def parse_args(self, input_str): + """Parse key=value arguments.""" + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df): + """Display DataFrame as HTML.""" + try: + html = df.to_html(index=False) + mime = "text/html" + except Exception: + html = str(df) + mime = "text/plain" + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {mime: html}, "metadata": {}}) + + def _compute_bounds(self, series, method, k=1.5, z_thresh=3.0): + """Compute (lower, upper) clipping bounds.""" + s = series.dropna() + if s.empty: + return None, None + + if method == "iqr": + q1 = s.quantile(0.25) + q3 = s.quantile(0.75) + iqr = q3 - q1 + lower = q1 - k * iqr + upper = q3 + k * iqr + return lower, upper + + elif method == "zscore": + mean = s.mean() + std = s.std() + if std == 0 or np.isnan(std): + return None, None + lower = mean - z_thresh * std + upper = mean + z_thresh * std + return lower, 
upper + + else: + raise ValueError(f"Unknown method {method}") + + def execute(self, kernel, data): + """Execute the %clipoutliers magic.""" + df = data.get("last_select") + if df is None: + kernel._send_message("stderr", "No last_select found in kernel data.") + return + + if hasattr(df, "empty") and df.empty: + kernel._send_message("stderr", "There is no data to process (empty DataFrame).") + return + + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + return + + # parse args + columns_arg = args.get("columns", None) + if isinstance(columns_arg, str): + columns = [c.strip() for c in columns_arg.split(",") if c.strip()] + elif isinstance(columns_arg, (list, tuple)): + columns = list(columns_arg) + else: + columns = None + + method = str(args.get("method", "iqr")).lower() + if method not in {"iqr", "zscore"}: + kernel._send_message("stderr", f"Unknown method '{method}'. Allowed: iqr, zscore.") + return + + try: + k = float(args.get("k", 1.5)) + except Exception: + k = 1.5 + + try: + z_thresh = float(args.get("z_thresh", 3.0)) + except Exception: + z_thresh = 3.0 + + inplace = bool(args.get("inplace", True)) + + # Determine numeric columns + if columns is None: + target_columns = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])] + else: + missing_cols = [c for c in columns if c not in df.columns] + if missing_cols: + kernel._send_message("stderr", f"Column(s) not found: {', '.join(missing_cols)}") + return + target_columns = [c for c in columns if pd.api.types.is_numeric_dtype(df[c])] + non_numeric = [c for c in columns if c not in target_columns] + if non_numeric: + kernel._send_message("stdout", f"Warning: non-numeric columns skipped: {', '.join(non_numeric)}") + + if not target_columns: + kernel._send_message("stderr", "No numeric target columns found to clip outliers.") + return + + target_df = df if inplace else df.copy(deep=True) + + messages = [] + 
total_clipped = 0 + + for col in target_columns: + try: + series = target_df[col] + lower, upper = self._compute_bounds(series, method, k=k, z_thresh=z_thresh) + if lower is None and upper is None: + messages.append(f"Column '{col}': insufficient data to compute bounds; skipped.") + continue + + # find how many will change + mask = ((series < lower) | (series > upper)) & ~series.isna() + n_changed = int(mask.sum()) + target_df[col] = series.clip(lower=lower, upper=upper) + total_clipped += n_changed + messages.append(f"Column '{col}': clipped {n_changed} value(s) (bounds: {lower:.4f}, {upper:.4f}).") + except Exception as e: + messages.append(f"Column '{col}': error while clipping: {e}") + + if inplace: + data["last_select"] = target_df + location_msg = "Modified in-place: data['last_select'] updated." + else: + data["last_select_clipped"] = target_df + location_msg = "Result stored in data['last_select_clipped'] (original unchanged)." + + kernel._send_message("stdout", f"Clip outliers completed using {method}.\n" + + "\n".join(messages) + + f"\nTotal values clipped: {total_clipped}. {location_msg}") + + # Show output + try: + self._send_html(kernel, target_df) + except Exception: + pass diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py new file mode 100644 index 0000000..d52b3e6 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py @@ -0,0 +1,202 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +import numpy as np + + +class DropOutliers(MariaMagic): + """ + %dropoutliers [columns=col1,col2,...] 
[method=iqr|zscore] [k=1.5] [z_thresh=3.0] + + Removes rows (IN-PLACE) from data['last_select'] where any selected numeric column + is detected as an outlier according to the chosen method. + + - method: + iqr -> Tukey IQR method using k (default 1.5) + zscore -> absolute z-score above z_thresh (default 3.0) + + Examples: + %dropoutliers + -> use IQR with k=1.5 on all numeric columns and drop rows containing outliers + %dropoutliers columns=age,salary method=zscore z_thresh=2.5 + -> drop rows where age OR salary has |z| > 2.5 + """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "dropoutliers" + + def help(self): + return ( + "%dropoutliers [columns=col1,col2,...] [method=iqr|zscore] [k=1.5] [z_thresh=3.0]\n" + "Removes rows containing outliers from data['last_select'] (in-place)." + ) + + def _str_to_obj(self, s): + """Cast simple strings to Python objects where sensible.""" + try: + return int(s) + except ValueError: + try: + return float(s) + except ValueError: + pass + try: + return bool(util.strtobool(s)) + except Exception: + if isinstance(s, str) and len(s) >= 2 and ((s[0] == s[-1] == '"') or (s[0] == s[-1] == "'")): + return s[1:-1] + return s + + def parse_args(self, input_str): + """Parse key=value arguments (keeps behavior consistent with other magics).""" + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df): + """Display DataFrame as HTML (fallback to text if needed).""" + try: + html = df.to_html(index=False) + mime = "text/html" + except Exception: + html = str(df) + mime = "text/plain" + display_content = {"data": {mime: html}, "metadata": {}} + kernel.send_response(kernel.iopub_socket, "display_data", display_content) + + def _detect_outliers_series(self, series, method, k=1.5, z_thresh=3.0): 
+ """Return boolean mask of outliers for a pandas Series (True where outlier).""" + if series.dropna().empty: + return pd.Series(False, index=series.index) + + if method == "iqr": + q1 = series.quantile(0.25) + q3 = series.quantile(0.75) + iqr = q3 - q1 + lower = q1 - k * iqr + upper = q3 + k * iqr + mask = (series < lower) | (series > upper) + return mask.fillna(False) + + elif method == "zscore": + mean = series.mean(skipna=True) + std = series.std(skipna=True) + if std == 0 or np.isnan(std): + return pd.Series(False, index=series.index) + z = (series - mean) / std + mask = z.abs() > float(z_thresh) + return mask.fillna(False) + + else: + raise ValueError(f"Unknown method {method}") + + def execute(self, kernel, data): + """Execute the dropoutliers magic (modifies data['last_select'] in-place).""" + df = data.get("last_select") + if df is None: + kernel._send_message("stderr", "No last_select found in kernel data.") + return + + if hasattr(df, "empty") and df.empty: + kernel._send_message("stderr", "There is no data to process (empty DataFrame).") + return + + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + return + + # parse columns argument + columns_arg = args.get("columns", None) + if isinstance(columns_arg, str): + columns = [c.strip() for c in columns_arg.split(",") if c.strip()] + elif isinstance(columns_arg, (list, tuple)): + columns = list(columns_arg) + else: + columns = None + + # method and params + method = str(args.get("method", "iqr")).lower() + if method not in {"iqr", "zscore"}: + kernel._send_message("stderr", f"Unknown method '{method}'. 
Allowed: iqr, zscore.") + return + + try: + k = float(args.get("k", 1.5)) + except Exception: + k = 1.5 + + try: + z_thresh = float(args.get("z_thresh", 3.0)) + except Exception: + z_thresh = 3.0 + + # Determine target numeric columns + if columns is None: + target_columns = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])] + else: + missing_cols = [c for c in columns if c not in df.columns] + if missing_cols: + kernel._send_message("stderr", f"Column(s) not found: {', '.join(missing_cols)}") + return + # keep only numeric columns + target_columns = [c for c in columns if pd.api.types.is_numeric_dtype(df[c])] + non_numeric = [c for c in columns if c not in target_columns] + if non_numeric: + kernel._send_message("stdout", f"Warning: non-numeric columns skipped: {', '.join(non_numeric)}") + + if not target_columns: + kernel._send_message("stderr", "No numeric target columns found to detect outliers.") + return + + # Detect outliers per column and combine masks + combined_mask = None + messages = [] + for col in target_columns: + try: + mask = self._detect_outliers_series(df[col], method, k=k, z_thresh=z_thresh) + n_out = int(mask.sum()) + messages.append(f"Column '{col}': detected {n_out} outlier(s) using {method}.") + if combined_mask is None: + combined_mask = mask.astype(bool) + else: + combined_mask = combined_mask | mask.astype(bool) + except Exception as e: + messages.append(f"Column '{col}': error detecting outliers: {e}") + + if combined_mask is None or not combined_mask.any(): + kernel._send_message("stdout", "No outliers detected. 
No rows removed.\n" + "\n".join(messages)) + # still show DataFrame + try: + self._send_html(kernel, df) + except Exception: + pass + return + + # Drop rows in-place where any target column is an outlier + try: + n_before = len(df) + df.drop(index=df[combined_mask].index, inplace=True) + data["last_select"] = df + n_after = len(df) + kernel._send_message("stdout", f"Dropped {n_before - n_after} row(s) containing outliers (in-place).\n" + "\n".join(messages)) + try: + self._send_html(kernel, df) + except Exception: + pass + except Exception as e: + kernel._send_message("stderr", f"Error while removing outlier rows: {e}") diff --git a/mariadb_kernel/maria_magics/supported_magics.py b/mariadb_kernel/maria_magics/supported_magics.py index 280d5d1..6709eba 100644 --- a/mariadb_kernel/maria_magics/supported_magics.py +++ b/mariadb_kernel/maria_magics/supported_magics.py @@ -16,6 +16,8 @@ from mariadb_kernel.maria_magics.ml_commands.data_cleaning.stats import Stats from mariadb_kernel.maria_magics.ml_commands.data_cleaning.fillmissing import FillMissing from mariadb_kernel.maria_magics.ml_commands.data_cleaning.outliers import Outliers +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.dropoutliers import DropOutliers +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.clipoutliers import ClipOutliers def get(): return { @@ -31,4 +33,6 @@ def get(): "stats": Stats, "fillmissing": FillMissing, "outliers": Outliers, + "dropoutliers": DropOutliers, + "clipoutliers": ClipOutliers, } From 2deeba6208adeb835bd70dfe3965b34ce925f105 Mon Sep 17 00:00:00 2001 From: JainSneha6 Date: Sun, 26 Oct 2025 12:09:44 +0000 Subject: [PATCH 11/38] Added encode.py --- Untitled.ipynb | 514 +++++++++++++++--- .../ml_commands/data_preprocessing/encode.py | 186 +++++++ .../maria_magics/supported_magics.py | 2 + 3 files changed, 618 insertions(+), 84 deletions(-) create mode 100644 mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py diff --git a/Untitled.ipynb 
b/Untitled.ipynb index e0cd640..9d65c1a 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -33,7 +33,7 @@ "create database test\r\n", "--------------\r\n", "\r\n", - "ERROR 1007 (HY000) at line 1 in file: '/home/sneha/mariadb_kernel/.mariadb_statement_763c5658-b0dd-11f0-80ed-00155d5f88b1': Can't create database 'test'; database exists\r\n", + "ERROR 1007 (HY000) at line 1 in file: '/home/sneha/mariadb_kernel/.mariadb_statement_af73b9b2-b262-11f0-b068-00155d4e875d': Can't create database 'test'; database exists\r\n", "\u0007\u001b(B\u001b[0;7m\u001b(B\u001b[m" ] }, @@ -74,7 +74,18 @@ { "data": { "text/html": [ - "Query OK" + "--------------\r\n", + "CREATE TABLE employees (\r\n", + " id INT PRIMARY KEY AUTO_INCREMENT,\r\n", + " name VARCHAR(50),\r\n", + " department VARCHAR(50),\r\n", + " age INT,\r\n", + " salary DECIMAL(10,2)\r\n", + ")\r\n", + "--------------\r\n", + "\r\n", + "ERROR 1046 (3D000) at line 1 in file: '/home/sneha/mariadb_kernel/.mariadb_statement_bb843eec-b263-11f0-b5b5-00155d4e875d': No database selected\r\n", + "\u0007\u001b(B\u001b[0;7m\u001b(B\u001b[m" ] }, "metadata": {}, @@ -902,7 +913,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 8, "id": "829f6342-96b1-4d81-8519-2a53c091dfb1", "metadata": {}, "outputs": [ @@ -915,7 +926,7 @@ "Column 'name' is not numeric; cannot use mean. Skipped.\n", "Column 'department' is not numeric; cannot use mean. 
Skipped.\n", "Column 'age': filled missing with mean=36.142857142857146.\n", - "Column 'salary': filled missing with mean=61000.0.\n" + "Column 'salary': filled missing with mean=178000.0.\n" ] }, { @@ -937,21 +948,21 @@ " Alice\n", " HR\n", " 30.000000\n", - " 50000.0\n", + " 5000.0\n", " \n", " \n", " 2\n", " Bob\n", - " Unknown\n", + " NaN\n", " 40.000000\n", - " 61000.0\n", + " 178000.0\n", " \n", " \n", " 3\n", " Charlie\n", " Engineering\n", " 36.142857\n", - " 70000.0\n", + " 700000.0\n", " \n", " \n", " 4\n", @@ -963,9 +974,9 @@ " \n", " 5\n", " Eve\n", - " Unknown\n", + " NaN\n", " 35.000000\n", - " 61000.0\n", + " 178000.0\n", " \n", " \n", " 6\n", @@ -976,10 +987,10 @@ " \n", " \n", " 7\n", - " Unknown\n", + " NaN\n", " Sales\n", " 50.000000\n", - " 61000.0\n", + " 178000.0\n", " \n", " \n", " 8\n", @@ -1199,7 +1210,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 6, "id": "948523f7-1b1c-4f37-a3ae-161e8ce40d0b", "metadata": {}, "outputs": [ @@ -1231,7 +1242,7 @@ " Alice\n", " HR\n", " 30.0\n", - " 50000.0\n", + " 5000.0\n", " \n", " \n", " 2\n", @@ -1245,7 +1256,7 @@ " Charlie\n", " Engineering\n", " NaN\n", - " 70000.0\n", + " 700000.0\n", " \n", " \n", " 4\n", @@ -1332,7 +1343,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "id": "6cf47bfb-ff40-4522-8723-a8bf17398210", "metadata": {}, "outputs": [ @@ -1343,7 +1354,7 @@ "Outlier detection completed (non in-place). 
Summary:\n", "Column 'id': detected 0 outlier(s) using iqr.\n", "Column 'age': detected 0 outlier(s) using iqr.\n", - "Column 'salary': detected 2 outlier(s) using iqr.\n", + "Column 'salary': detected 1 outlier(s) using iqr.\n", "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" ] }, @@ -1368,18 +1379,18 @@ " 1\n", " Alice\n", " HR\n", - " 30.0\n", + " 30.000000\n", " 5000.0\n", " False\n", " False\n", - " True\n", + " False\n", " \n", " \n", " 2\n", " Bob\n", - " NaN\n", - " 40.0\n", - " NaN\n", + " Unknown\n", + " 40.000000\n", + " 178000.0\n", " False\n", " False\n", " False\n", @@ -1388,7 +1399,7 @@ " 3\n", " Charlie\n", " Engineering\n", - " NaN\n", + " 36.142857\n", " 700000.0\n", " False\n", " False\n", @@ -1398,7 +1409,7 @@ " 4\n", " David\n", " HR\n", - " 25.0\n", + " 25.000000\n", " 48000.0\n", " False\n", " False\n", @@ -1407,9 +1418,9 @@ " \n", " 5\n", " Eve\n", - " NaN\n", - " 35.0\n", - " NaN\n", + " Unknown\n", + " 35.000000\n", + " 178000.0\n", " False\n", " False\n", " False\n", @@ -1418,7 +1429,7 @@ " 6\n", " Frank\n", " Engineering\n", - " 28.0\n", + " 28.000000\n", " 72000.0\n", " False\n", " False\n", @@ -1426,10 +1437,10 @@ " \n", " \n", " 7\n", - " NaN\n", + " Unknown\n", " Sales\n", - " 50.0\n", - " NaN\n", + " 50.000000\n", + " 178000.0\n", " False\n", " False\n", " False\n", @@ -1438,7 +1449,7 @@ " 8\n", " Grace\n", " Sales\n", - " 45.0\n", + " 45.000000\n", " 65000.0\n", " False\n", " False\n", @@ -1458,7 +1469,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "id": "ea459692-59ac-49be-8ba2-0c4b0b01908d", "metadata": {}, "outputs": [ @@ -1494,7 +1505,7 @@ " 1\n", " Alice\n", " HR\n", - " 30.0\n", + " 30.000000\n", " 5000.0\n", " False\n", " False\n", @@ -1503,9 +1514,9 @@ " \n", " 2\n", " Bob\n", - " NaN\n", - " 40.0\n", - " NaN\n", + " Unknown\n", + " 40.000000\n", + " 178000.0\n", " False\n", " False\n", " False\n", @@ -1514,7 +1525,7 @@ " 3\n", " Charlie\n", " 
Engineering\n", - " NaN\n", + " 36.142857\n", " 700000.0\n", " False\n", " False\n", @@ -1524,7 +1535,7 @@ " 4\n", " David\n", " HR\n", - " 25.0\n", + " 25.000000\n", " 48000.0\n", " False\n", " False\n", @@ -1533,9 +1544,9 @@ " \n", " 5\n", " Eve\n", - " NaN\n", - " 35.0\n", - " NaN\n", + " Unknown\n", + " 35.000000\n", + " 178000.0\n", " False\n", " False\n", " False\n", @@ -1544,7 +1555,7 @@ " 6\n", " Frank\n", " Engineering\n", - " 28.0\n", + " 28.000000\n", " 72000.0\n", " False\n", " False\n", @@ -1552,10 +1563,10 @@ " \n", " \n", " 7\n", - " NaN\n", + " Unknown\n", " Sales\n", - " 50.0\n", - " NaN\n", + " 50.000000\n", + " 178000.0\n", " False\n", " False\n", " False\n", @@ -1564,7 +1575,7 @@ " 8\n", " Grace\n", " Sales\n", - " 45.0\n", + " 45.000000\n", " 65000.0\n", " False\n", " False\n", @@ -1584,7 +1595,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "id": "6a53e5f2-cdb4-4d72-afb1-5a7719be8835", "metadata": {}, "outputs": [ @@ -1595,13 +1606,13 @@ "Outlier detection completed (non in-place). 
Summary:\n", "Column 'id': detected 0 outlier(s) using iqr.\n", "Column 'age': detected 0 outlier(s) using iqr.\n", - "Column 'salary': detected 2 outlier(s) using iqr.\n", + "Column 'salary': detected 1 outlier(s) using iqr.\n", "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" ] }, { "data": { - "image/png": "" + "image/png": "" }, "metadata": {}, "output_type": "display_data" @@ -1627,18 +1638,18 @@ " 1\n", " Alice\n", " HR\n", - " 30.0\n", + " 30.000000\n", " 5000.0\n", " False\n", " False\n", - " True\n", + " False\n", " \n", " \n", " 2\n", " Bob\n", - " NaN\n", - " 40.0\n", - " NaN\n", + " Unknown\n", + " 40.000000\n", + " 178000.0\n", " False\n", " False\n", " False\n", @@ -1647,7 +1658,7 @@ " 3\n", " Charlie\n", " Engineering\n", - " NaN\n", + " 36.142857\n", " 700000.0\n", " False\n", " False\n", @@ -1657,7 +1668,7 @@ " 4\n", " David\n", " HR\n", - " 25.0\n", + " 25.000000\n", " 48000.0\n", " False\n", " False\n", @@ -1666,9 +1677,9 @@ " \n", " 5\n", " Eve\n", - " NaN\n", - " 35.0\n", - " NaN\n", + " Unknown\n", + " 35.000000\n", + " 178000.0\n", " False\n", " False\n", " False\n", @@ -1677,7 +1688,7 @@ " 6\n", " Frank\n", " Engineering\n", - " 28.0\n", + " 28.000000\n", " 72000.0\n", " False\n", " False\n", @@ -1685,10 +1696,10 @@ " \n", " \n", " 7\n", - " NaN\n", + " Unknown\n", " Sales\n", - " 50.0\n", - " NaN\n", + " 50.000000\n", + " 178000.0\n", " False\n", " False\n", " False\n", @@ -1697,7 +1708,7 @@ " 8\n", " Grace\n", " Sales\n", - " 45.0\n", + " 45.000000\n", " 65000.0\n", " False\n", " False\n", @@ -2000,16 +2011,115 @@ "execution_count": 13, "id": "b4c02d58-7020-4514-b84a-9a6b5e18fb40", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Clip outliers completed using iqr.\n", + "Column 'id': clipped 0 value(s) (bounds: -2.5000, 11.5000).\n", + "Column 'age': clipped 0 value(s) (bounds: 11.8750, 58.8750).\n", + "Column 'salary': clipped 1 value(s) (bounds: 
-115125.0000, 353875.0000).\n", + "Total values clipped: 1. Modified in-place: data['last_select'] updated.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
1AliceHR30.0000005000.0
2BobUnknown40.000000178000.0
3CharlieEngineering36.142857353875.0
4DavidHR25.00000048000.0
5EveUnknown35.000000178000.0
6FrankEngineering28.00000072000.0
7UnknownSales50.000000178000.0
8GraceSales45.00000065000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%clipoutliers" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "d146b7c7-8860-4962-a9e8-78675b068982", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Clip outliers completed.\n", - "Column 'id': clipped 0 value(s) to bounds (lower=-2.5, upper=11.5).\n", - "Column 'age': clipped 0 value(s) to bounds (lower=8.75, upper=62.75).\n", - "Column 'salary': clipped 2 value(s) to bounds (lower=12000.0, upper=108000.0).\n", - "Total values clipped: 2. Modified in-place: data['last_select'] updated.\n" + "Column 'id': clipped 0 value(s) to bounds (lower=-0.39897948556635576, upper=9.398979485566356).\n", + "Column 'age': clipped 0 value(s) to bounds (lower=17.616171113291706, upper=54.669543172422586).\n", + "Column 'salary': clipped 0 value(s) to bounds (lower=-9114.19257183242, upper=131114.19257183242).\n", + "Total values clipped: 0. Modified in-place: data['last_select'] updated.\n" ] }, { @@ -2091,24 +2201,20 @@ } ], "source": [ - "%clipoutliers" + "%clipoutliers method=zscore z_thresh=2.0 " ] }, { "cell_type": "code", - "execution_count": 14, - "id": "d146b7c7-8860-4962-a9e8-78675b068982", + "execution_count": 8, + "id": "61b6b28f-9b56-4d7f-8613-a406928dbb1c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Clip outliers completed.\n", - "Column 'id': clipped 0 value(s) to bounds (lower=-0.39897948556635576, upper=9.398979485566356).\n", - "Column 'age': clipped 0 value(s) to bounds (lower=17.616171113291706, upper=54.669543172422586).\n", - "Column 'salary': clipped 0 value(s) to bounds (lower=-9114.19257183242, upper=131114.19257183242).\n", - "Total values clipped: 0. 
Modified in-place: data['last_select'] updated.\n" + "Encoded columns in-place and updated last_select.\n" ] }, { @@ -2122,6 +2228,7 @@ " department\n", " age\n", " salary\n", + " department_lbl\n", " \n", " \n", " \n", @@ -2130,21 +2237,24 @@ " Alice\n", " HR\n", " 30.0\n", - " 12000.0\n", + " 5000.0\n", + " 1\n", " \n", " \n", " 2\n", " Bob\n", - " NaN\n", + " Unknown\n", " 40.0\n", " NaN\n", + " 3\n", " \n", " \n", " 3\n", " Charlie\n", " Engineering\n", " NaN\n", - " 108000.0\n", + " 700000.0\n", + " 0\n", " \n", " \n", " 4\n", @@ -2152,13 +2262,15 @@ " HR\n", " 25.0\n", " 48000.0\n", + " 1\n", " \n", " \n", " 5\n", " Eve\n", - " NaN\n", + " Unknown\n", " 35.0\n", " NaN\n", + " 3\n", " \n", " \n", " 6\n", @@ -2166,13 +2278,15 @@ " Engineering\n", " 28.0\n", " 72000.0\n", + " 0\n", " \n", " \n", " 7\n", - " NaN\n", + " Unknown\n", " Sales\n", " 50.0\n", " NaN\n", + " 2\n", " \n", " \n", " 8\n", @@ -2180,6 +2294,7 @@ " Sales\n", " 45.0\n", " 65000.0\n", + " 2\n", " \n", " \n", "" @@ -2190,7 +2305,233 @@ } ], "source": [ - "%clipoutliers method=zscore z_thresh=2.0 " + "%encode method=label columns=department drop_original=false" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ac369830-92cc-44ba-b7b2-cfa7b620aafe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Encoded columns in-place and updated last_select.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameagesalarydepartment_lbldepartment_Engineeringdepartment_HRdepartment_Salesdepartment_Unknown
1Alice30.05000.010.01.00.00.0
2Bob40.0NaN30.00.00.01.0
3CharlieNaN700000.001.00.00.00.0
4David25.048000.010.01.00.00.0
5Eve35.0NaN30.00.00.01.0
6Frank28.072000.001.00.00.00.0
7Unknown50.0NaN20.00.01.00.0
8Grace45.065000.020.00.01.00.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%encode method=onehot columns=department drop_original=false" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "ed2b1c8f-4372-4cb4-9edb-f27bfa07ee83", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Encoded columns in-place and updated last_select.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameagesalarydepartment_ord
1Alice30.05000.03.0
2Bob40.0NaNNaN
3CharlieNaN700000.03.0
4David25.048000.03.0
5Eve35.0NaNNaN
6Frank28.072000.03.0
7NaN50.0NaN3.0
8Grace45.065000.03.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%encode method=ordinal columns=department drop_original=false" ] }, { @@ -2202,7 +2543,12 @@ { "data": { "text/html": [ - "Query OK" + "--------------\r\n", + "drop table employees\r\n", + "--------------\r\n", + "\r\n", + "ERROR 1046 (3D000) at line 1 in file: '/home/sneha/mariadb_kernel/.mariadb_statement_bf03f152-b263-11f0-b5b5-00155d4e875d': No database selected\r\n", + "\u0007\u001b(B\u001b[0;7m\u001b(B\u001b[m" ] }, "metadata": {}, diff --git a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py new file mode 100644 index 0000000..6d052c9 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py @@ -0,0 +1,186 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import pandas as pd +import shlex +from distutils import util +import numpy as np + +# sklearn imports (we'll create encoder instances in a version-compatible way) +from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder + + +class Encode(MariaMagic): + """ + %encode method= + [columns=col1,col2,...] + [inplace=true|false] + [drop_original=true|false] + + Notes: + - If columns omitted, object/category dtype columns are auto-selected. + - Default: inplace=true, drop_original=true. + - Requires scikit-learn installed for onehot/ordinal. + """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "encode" + + def help(self): + return ( + "%encode method= [columns=col1,col2] " + "[inplace=true] [drop_original=true]\n" + "Encode categorical columns using label, one-hot, or ordinal encoding (automatic)." 
+ ) + + def _str_to_obj(self, s): + """Cast to int/float/bool when possible, otherwise return string.""" + try: + return int(s) + except (ValueError, TypeError): + pass + try: + return float(s) + except (ValueError, TypeError): + pass + try: + return bool(util.strtobool(str(s))) + except Exception: + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df): + try: + html = df.to_html(index=False) + mime = "text/html" + except Exception: + html = str(df) + mime = "text/plain" + display_content = {"data": {mime: html}, "metadata": {}} + kernel.send_response(kernel.iopub_socket, "display_data", display_content) + + def _make_ohe(self, **kwargs): + """ + Create OneHotEncoder in a sklearn-version compatible way. + Older sklearn versions accept `sparse`; newer use `sparse_output`. 
+ """ + try: + return OneHotEncoder(sparse=False, **kwargs) + except TypeError: + # fallback for newer sklearn where parameter name changed + return OneHotEncoder(sparse_output=False, **kwargs) + + def execute(self, kernel, data): + # get DataFrame + df = data.get("last_select") + if df is None: + kernel._send_message("stderr", "No last_select found in kernel data.") + return + if hasattr(df, "empty") and df.empty: + kernel._send_message("stderr", "There is no data to encode (empty DataFrame).") + return + + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments.") + return + + method = str(args.get("method", "label")).lower() + cols_arg = args.get("columns", None) + if isinstance(cols_arg, str): + columns = [c.strip() for c in cols_arg.split(",") if c.strip()] + elif isinstance(cols_arg, (list, tuple)): + columns = list(cols_arg) + else: + columns = list(df.select_dtypes(include=["object", "category"]).columns) + + if not columns: + kernel._send_message("stderr", "No columns specified or detected for encoding.") + return + + # validate existence + missing_cols = [c for c in columns if c not in df.columns] + if missing_cols: + kernel._send_message("stderr", f"Column(s) not found: {', '.join(missing_cols)}") + return + + inplace = bool(args.get("inplace", True)) + drop_original = bool(args.get("drop_original", True)) + + # Work on copy if not inplace + result_df = df if inplace else df.copy() + + try: + if method == "label": + # Use pandas.factorize which handles NaN by assigning -1 codes + for col in columns: + codes, uniques = pd.factorize(result_df[col], sort=True) + new_col = f"{col}_lbl" + result_df[new_col] = codes + if drop_original: + result_df.drop(columns=[col], inplace=True) + + elif method == "onehot": + # sklearn OneHotEncoder with version compatibility + encoder = self._make_ohe(handle_unknown="ignore") + # replace NaN with sentinel string so it's treated as a category + arr = 
encoder.fit_transform(result_df[columns].astype(object).fillna("___MISSING___")) + # feature names (sklearn >= 1.0) + try: + feature_names = encoder.get_feature_names_out(columns) + except Exception: + # fallback: build names manually + cats = encoder.categories_ + feature_names = [] + for cname, cat_list in zip(columns, cats): + for cat in cat_list: + feature_names.append(f"{cname}_{str(cat)}") + ohe_df = pd.DataFrame(arr, columns=feature_names, index=result_df.index) + if drop_original: + result_df = pd.concat([result_df.drop(columns=columns), ohe_df], axis=1) + else: + result_df = pd.concat([result_df, ohe_df], axis=1) + + elif method == "ordinal": + # use sklearn OrdinalEncoder for one or multiple columns (automatic ordering) + enc = OrdinalEncoder(dtype=np.float64) + # fillna sentinel so OrdinalEncoder treats missing as a category + tmp = result_df[columns].astype(object).fillna("___MISSING___") + enc_arr = enc.fit_transform(tmp) + for i, col in enumerate(columns): + result_df[f"{col}_ord"] = enc_arr[:, i] + if drop_original: + result_df.drop(columns=[col], inplace=True) + + else: + kernel._send_message("stderr", "Unsupported method. 
Supported: label, onehot, ordinal.") + return + + # Apply result + if inplace: + data["last_select"] = result_df + kernel._send_message("stdout", "Encoded columns in-place and updated last_select.") + else: + kernel._send_message("stdout", "Displayed encoded result (last_select not modified).") + + # display + self._send_html(kernel, result_df) + + except Exception as e: + kernel._send_message("stderr", f"Error during encoding: {e}") + return diff --git a/mariadb_kernel/maria_magics/supported_magics.py b/mariadb_kernel/maria_magics/supported_magics.py index 6709eba..c7105b1 100644 --- a/mariadb_kernel/maria_magics/supported_magics.py +++ b/mariadb_kernel/maria_magics/supported_magics.py @@ -18,6 +18,7 @@ from mariadb_kernel.maria_magics.ml_commands.data_cleaning.outliers import Outliers from mariadb_kernel.maria_magics.ml_commands.data_cleaning.dropoutliers import DropOutliers from mariadb_kernel.maria_magics.ml_commands.data_cleaning.clipoutliers import ClipOutliers +from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.encode import Encode def get(): return { @@ -35,4 +36,5 @@ def get(): "outliers": Outliers, "dropoutliers": DropOutliers, "clipoutliers": ClipOutliers, + "encode": Encode, } From 05cb88bc23624de6d2c4322d07c8e6410afa7dcc Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Mon, 27 Oct 2025 14:45:29 +0000 Subject: [PATCH 12/38] Added training and evaluation --- Untitled.ipynb | 4726 ++++++++++++++--- catboost_info/catboost_training.json | 54 + catboost_info/learn/events.out.tfevents | Bin 0 -> 2398 bytes catboost_info/learn_error.tsv | 51 + catboost_info/time_left.tsv | 51 + .../data_preprocessing/normalize.py | 136 + .../data_preprocessing/splitdata.py | 230 + .../data_preprocessing/standardize.py | 124 + .../model_training/evaluate_model.py | 288 + .../ml_commands/model_training/train_model.py | 297 ++ .../maria_magics/supported_magics.py | 10 + 11 files changed, 5279 insertions(+), 688 deletions(-) create mode 100644 
catboost_info/catboost_training.json create mode 100644 catboost_info/learn/events.out.tfevents create mode 100644 catboost_info/learn_error.tsv create mode 100644 catboost_info/time_left.tsv create mode 100644 mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py create mode 100644 mariadb_kernel/maria_magics/ml_commands/data_preprocessing/splitdata.py create mode 100644 mariadb_kernel/maria_magics/ml_commands/data_preprocessing/standardize.py create mode 100644 mariadb_kernel/maria_magics/ml_commands/model_training/evaluate_model.py create mode 100644 mariadb_kernel/maria_magics/ml_commands/model_training/train_model.py diff --git a/Untitled.ipynb b/Untitled.ipynb index 9d65c1a..d5bc045 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -84,7 +84,7 @@ ")\r\n", "--------------\r\n", "\r\n", - "ERROR 1046 (3D000) at line 1 in file: '/home/sneha/mariadb_kernel/.mariadb_statement_bb843eec-b263-11f0-b5b5-00155d4e875d': No database selected\r\n", + "ERROR 1050 (42S01) at line 1 in file: '/home/iddhartha/mariadb_kernel/.mariadb_statement_c7b7c9f2-b33c-11f0-aa46-00155d4db2b1': Table 'employees' already exists\r\n", "\u0007\u001b(B\u001b[0;7m\u001b(B\u001b[m" ] }, @@ -139,7 +139,7 @@ { "data": { "text/html": [ - "
idnamedepartmentagesalary
1AliceHR305000.00
2BobNULL40NULL
3CharlieEngineeringNULL700000.00
4DavidHR2548000.00
5EveNULL35NULL
6FrankEngineering2872000.00
7NULLSales50NULL
8GraceSales4565000.00
" + "
idnamedepartmentagesalary
1AliceHR305000.00
2BobNULL40NULL
3CharlieEngineeringNULL700000.00
4DavidHR2548000.00
5EveNULL35NULL
6FrankEngineering2872000.00
7NULLSales50NULL
8GraceSales4565000.00
9AliceHR305000.00
10BobNULL40NULL
11CharlieEngineeringNULL700000.00
12DavidHR2548000.00
13EveNULL35NULL
14FrankEngineering2872000.00
15NULLSales50NULL
16GraceSales4565000.00
17AliceHR305000.00
18BobNULL40NULL
19CharlieEngineeringNULL700000.00
20DavidHR2548000.00
21EveNULL35NULL
22FrankEngineering2872000.00
23NULLSales50NULL
24GraceSales4565000.00
25AliceHR305000.00
26BobNULL40NULL
27CharlieEngineeringNULL700000.00
28DavidHR2548000.00
29EveNULL35NULL
30FrankEngineering2872000.00
31NULLSales50NULL
32GraceSales4565000.00
33AliceHR305000.00
34BobNULL40NULL
35CharlieEngineeringNULL700000.00
36DavidHR2548000.00
37EveNULL35NULL
38FrankEngineering2872000.00
39NULLSales50NULL
40GraceSales4565000.00
" ] }, "metadata": {}, @@ -317,14 +317,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "28756366-6f31-4045-b2ea-7d47fdf08ff8", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Line
namehelp
%line
The %line magic command follows a syntax very similar to that of
DataFrame.plot.line from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.line.html

Example:
> %line x=column1 y=column2

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Line class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%bar
The %bar magic command follows a syntax very similar to that of
DataFrame.plot.bar from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.bar.html

Example:
> %bar x=column1 y=column2 stacked=True

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Bar class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%pie
The %pie magic command follows a syntax very similar to that of
DataFrame.plot.pie from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.pie.html

Example:
> %pie y=column_name

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Pie class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%df
The %df magic command has the following syntax:
> %df [filename]

It writes the result of the last query executed in the notebook
into an external CSV formatted file.
The purpose of this magic command is to allow users to export query
data from their MariaDB databases and then quickly import it
into a Python Notebook where more complex analytics can be performed.

If no arguments are specified, the kernel writes the data into a
CSV file named 'last_query.csv'.
%lsmagic
The %lsmagic magic command prints the magics currently
supported by the kernel. It also prints the help text for each command
%load
The %load magic command has the following syntax:
> %load csv_file_path table_name [skip_row_num] [seperator] [character set] [enclosing character]
The %load magic command can load CSV file for updating specific table data.

This command does not create a table if the one specified as argument doesn't exist,
the user needs to create the destination table with the proper schema to match the data in the CSV file.

CSV file first line may be header, can set [skip row num] to 1 for skipping header.

The separator between columns defaults to ',', but can be reconfigured.

The character set used by the CSV file defaults to utf8.

The character used to enclose entries to escape the field delimiter defaults to `"`.

Any argument can be enclosed by ' ' or " ", handling cases that argument contains spaces.
missing%missing [action=show|percent|summary] [columns=col1,col2]
Display missing-value information from the last query result.
dropmissing%dropmissing [columns=col1,col2,...]
Drops rows with missing values from data['last_select'] (always IN-PLACE).
stats%stats [columns=col1,col2] [include=all|numeric|object] [percentiles=25,50,75] [transpose=true|false]
Show statistical summary (uses pandas.DataFrame.describe under the hood).
fillmissing%fillmissing [columns=col1,col2,...] [strategy=mean|median|mode|constant] [value=const]
Fills missing values in data['last_select'] (always IN-PLACE).
Cell
namehelp
%%delimiter
The %%delimiter magic command is a cell magic. This means
that it operates over the entire cell within it is used.

Its purpose is to run an SQL statement using a different
delimiter than the default ";". The main usecase should be
Stored Procedures and Stored Functions.

Example:
--------cell
%%delimiter //
CREATE PROCEDURE proc ()
BEGIN
select 1;
END;
//
--------end-of-cell

Please note that the SQL statement needs to end with the
delimiter specified by the magic command.
" + "
Line
namehelp
%line
The %line magic command follows a syntax very similar to that of
DataFrame.plot.line from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.line.html

Example:
> %line x=column1 y=column2

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Line class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%bar
The %bar magic command follows a syntax very similar to that of
DataFrame.plot.bar from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.bar.html

Example:
> %bar x=column1 y=column2 stacked=True

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Bar class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%pie
The %pie magic command follows a syntax very similar to that of
DataFrame.plot.pie from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.pie.html

Example:
> %pie y=column_name

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Pie class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%df
The %df magic command has the following syntax:
> %df [filename]

It writes the result of the last query executed in the notebook
into an external CSV formatted file.
The purpose of this magic command is to allow users to export query
data from their MariaDB databases and then quickly import it
into a Python Notebook where more complex analytics can be performed.

If no arguments are specified, the kernel writes the data into a
CSV file named 'last_query.csv'.
%lsmagic
The %lsmagic magic command prints the magics currently
supported by the kernel. It also prints the help text for each command
%load
The %load magic command has the following syntax:
> %load csv_file_path table_name [skip_row_num] [seperator] [character set] [enclosing character]
The %load magic command can load CSV file for updating specific table data.

This command does not create a table if the one specified as argument doesn't exist,
the user needs to create the destination table with the proper schema to match the data in the CSV file.

CSV file first line may be header, can set [skip row num] to 1 for skipping header.

The separator between columns defaults to ',', but can be reconfigured.

The character set used by the CSV file defaults to utf8.

The character used to enclose entries to escape the field delimiter defaults to `"`.

Any argument can be enclosed by ' ' or " ", handling cases that argument contains spaces.
missing%missing [action=show|percent|summary] [columns=col1,col2]
Display missing-value information from the last query result.
dropmissing%dropmissing [columns=col1,col2,...]
Drops rows with missing values from data['last_select'] (always IN-PLACE).
stats%stats [columns=col1,col2] [include=all|numeric|object] [percentiles=25,50,75] [transpose=true|false]
Show statistical summary (uses pandas.DataFrame.describe under the hood).
fillmissing%fillmissing [columns=col1,col2,...] [strategy=mean|median|mode|constant] [value=const]
Fills missing values in data['last_select'] (always IN-PLACE).
outliers%outliers [columns=col1,col2,...] [method=iqr|zscore] [k=1.5] [z_thresh=3.0] [plot=True|False]
Detects outliers in data['last_select'] (non in-place). Results placed in data['last_select_outliers'].
dropoutliers%dropoutliers [columns=col1,col2,...] [method=iqr|zscore] [k=1.5] [z_thresh=3.0]
Removes rows containing outliers from data['last_select'] (in-place).
clipoutliers%clipoutliers [columns=col1,col2,...] [method=iqr|zscore] [k=1.5] [z_thresh=3.0] [inplace=True|False]
Clamps extreme numeric values to computed boundaries (in-place by default).
encode%encode method=<label|onehot|ordinal> [columns=col1,col2] [inplace=true] [drop_original=true]
Encode categorical columns using label, one-hot, or ordinal encoding (automatic).
normalize%normalize [columns=col1,col2,...] [feature_range=0,1] [inplace=True|False]
Normalize numeric columns using MinMaxScaler (in-place by default).
standardize%standardize [columns=col1,col2,...] [inplace=True|False]
Standardizes numeric columns using sklearn's StandardScaler (in-place by default).
splitdata%splitdata [test_size=0.2] [val_size=0.1] [stratify=colname] [shuffle=True|False]
[random_state=42] [inplace=True|False] [train_name=name] [test_name=name] [val_name=name]
Split last_select into train/test/(val).
train_modelTrain a model on data['last_select'] (no split or scaling).
evaluate_modelEvaluate a trained model on a test DataFrame and show metrics + predictions.
Cell
namehelp
%%delimiter
The %%delimiter magic command is a cell magic. This means
that it operates over the entire cell within it is used.

Its purpose is to run an SQL statement using a different
delimiter than the default ";". The main usecase should be
Stored Procedures and Stored Functions.

Example:
--------cell
%%delimiter //
CREATE PROCEDURE proc ()
BEGIN
select 1;
END;
//
--------end-of-cell

Please note that the SQL statement needs to end with the
delimiter specified by the magic command.
" ] }, "metadata": {}, @@ -913,7 +913,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "id": "829f6342-96b1-4d81-8519-2a53c091dfb1", "metadata": {}, "outputs": [ @@ -922,11 +922,11 @@ "output_type": "stream", "text": [ "Fill missing completed (in-place). Summary:\n", - "Column 'id': filled missing with mean=4.5.\n", + "Column 'id': filled missing with mean=21.321428571428573.\n", "Column 'name' is not numeric; cannot use mean. Skipped.\n", "Column 'department' is not numeric; cannot use mean. Skipped.\n", - "Column 'age': filled missing with mean=36.142857142857146.\n", - "Column 'salary': filled missing with mean=178000.0.\n" + "Column 'age': filled missing with mean=37.07692307692308.\n", + "Column 'salary': filled missing with mean=127500.0.\n" ] }, { @@ -944,53 +944,179 @@ " \n", " \n", " \n", - " 1\n", + " 18\n", + " Bob\n", + " NaN\n", + " 40.000000\n", + " 127500.0\n", + " \n", + " \n", + " 25\n", " Alice\n", " HR\n", " 30.000000\n", " 5000.0\n", " \n", " \n", - " 2\n", + " 29\n", + " Eve\n", + " NaN\n", + " 35.000000\n", + " 127500.0\n", + " \n", + " \n", + " 23\n", + " NaN\n", + " Sales\n", + " 50.000000\n", + " 127500.0\n", + " \n", + " \n", + " 6\n", + " Frank\n", + " Engineering\n", + " 28.000000\n", + " 72000.0\n", + " \n", + " \n", + " 40\n", + " Grace\n", + " Sales\n", + " 45.000000\n", + " 65000.0\n", + " \n", + " \n", + " 14\n", + " Frank\n", + " Engineering\n", + " 28.000000\n", + " 72000.0\n", + " \n", + " \n", + " 22\n", + " Frank\n", + " Engineering\n", + " 28.000000\n", + " 72000.0\n", + " \n", + " \n", + " 32\n", + " Grace\n", + " Sales\n", + " 45.000000\n", + " 65000.0\n", + " \n", + " \n", + " 12\n", + " David\n", + " HR\n", + " 25.000000\n", + " 48000.0\n", + " \n", + " \n", + " 34\n", " Bob\n", " NaN\n", " 40.000000\n", - " 178000.0\n", + " 127500.0\n", " \n", " \n", - " 3\n", + " 33\n", + " Alice\n", + " HR\n", + " 30.000000\n", + " 5000.0\n", + " \n", + " \n", + " 7\n", + " NaN\n", + " Sales\n", + " 
50.000000\n", + " 127500.0\n", + " \n", + " \n", + " 26\n", + " Bob\n", + " NaN\n", + " 40.000000\n", + " 127500.0\n", + " \n", + " \n", + " 19\n", " Charlie\n", " Engineering\n", - " 36.142857\n", + " 37.076923\n", " 700000.0\n", " \n", " \n", - " 4\n", + " 10\n", + " Bob\n", + " NaN\n", + " 40.000000\n", + " 127500.0\n", + " \n", + " \n", + " 36\n", " David\n", " HR\n", " 25.000000\n", " 48000.0\n", " \n", " \n", - " 5\n", + " 21\n", " Eve\n", " NaN\n", " 35.000000\n", - " 178000.0\n", + " 127500.0\n", " \n", " \n", - " 6\n", - " Frank\n", - " Engineering\n", - " 28.000000\n", - " 72000.0\n", + " 24\n", + " Grace\n", + " Sales\n", + " 45.000000\n", + " 65000.0\n", " \n", " \n", - " 7\n", + " 31\n", " NaN\n", " Sales\n", " 50.000000\n", - " 178000.0\n", + " 127500.0\n", + " \n", + " \n", + " 39\n", + " NaN\n", + " Sales\n", + " 50.000000\n", + " 127500.0\n", + " \n", + " \n", + " 4\n", + " David\n", + " HR\n", + " 25.000000\n", + " 48000.0\n", + " \n", + " \n", + " 9\n", + " Alice\n", + " HR\n", + " 30.000000\n", + " 5000.0\n", + " \n", + " \n", + " 1\n", + " Alice\n", + " HR\n", + " 30.000000\n", + " 5000.0\n", + " \n", + " \n", + " 2\n", + " Bob\n", + " NaN\n", + " 40.000000\n", + " 127500.0\n", " \n", " \n", " 8\n", @@ -999,6 +1125,20 @@ " 45.000000\n", " 65000.0\n", " \n", + " \n", + " 37\n", + " Eve\n", + " NaN\n", + " 35.000000\n", + " 127500.0\n", + " \n", + " \n", + " 35\n", + " Charlie\n", + " Engineering\n", + " 37.076923\n", + " 700000.0\n", + " \n", " \n", "" ] @@ -1013,7 +1153,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 4, "id": "52d78b76-69c5-41d4-874f-8d8cb8b0cae9", "metadata": {}, "outputs": [ @@ -1045,7 +1185,7 @@ " Alice\n", " HR\n", " 30.0\n", - " 50000.0\n", + " 5000.0\n", " \n", " \n", " 2\n", @@ -1059,7 +1199,7 @@ " Charlie\n", " Engineering\n", " 35.0\n", - " 70000.0\n", + " 700000.0\n", " \n", " \n", " 4\n", @@ -1096,198 +1236,225 @@ " 45.0\n", " 65000.0\n", " \n", - " \n", - "" - ] - }, - "metadata": {}, - 
"output_type": "display_data" - } - ], - "source": [ - "%fillmissing columns=age,salary strategy=median" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "beac2393-829b-472e-b9cb-d12166e16088", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fill missing completed (in-place). Summary:\n", - "Column 'id': filled missing with mode=1.\n", - "Column 'name': filled missing with mode=Alice.\n", - "Column 'department': filled missing with mode=Engineering.\n", - "Column 'age': filled missing with mode=25.0.\n", - "Column 'salary': filled missing with mode=48000.0.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - "
idnamedepartmentagesalary
19AliceHR30.050000.05000.0
210BobEngineeringNaN40.048000.065000.0
311CharlieEngineering25.070000.035.0700000.0
412DavidHR25.048000.0
513EveEngineeringNaN35.048000.065000.0
614FrankEngineering28.072000.0
7Alice15NaNSales50.048000.065000.0
816GraceSales45.065000.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%fillmissing strategy=mode" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "948523f7-1b1c-4f37-a3ae-161e8ce40d0b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fill missing completed (in-place). Summary:\n", - "Column 'name': filled missing with constant value=Unknown.\n", - "Column 'department': filled missing with constant value=Unknown.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " 
\n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1302,60 +1469,25 @@ } ], "source": [ - "%fillmissing columns=name,department strategy=constant value=\"Unknown\"" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "81a88f9a-bdf8-4f87-a5d0-a0a88fcc5ace", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Strategy 'constant' requires a 'value=...' argument.\n" - ] - } - ], - "source": [ - "%fillmissing strategy=constant" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "d76ed0d7-4332-40b0-a5c6-588784807a23", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The result set was successfully written into last_query.csv\n" - ] - } - ], - "source": [ - "%df" + "%fillmissing columns=age,salary strategy=median" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "6cf47bfb-ff40-4522-8723-a8bf17398210", + "execution_count": 19, + "id": "beac2393-829b-472e-b9cb-d12166e16088", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Outlier detection completed (non in-place). Summary:\n", - "Column 'id': detected 0 outlier(s) using iqr.\n", - "Column 'age': detected 0 outlier(s) using iqr.\n", - "Column 'salary': detected 1 outlier(s) using iqr.\n", - "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" + "Fill missing completed (in-place). 
Summary:\n", + "Column 'id': filled missing with mode=1.\n", + "Column 'name': filled missing with mode=Alice.\n", + "Column 'department': filled missing with mode=Engineering.\n", + "Column 'age': filled missing with mode=25.0.\n", + "Column 'salary': filled missing with mode=48000.0.\n" ] }, { @@ -1369,9 +1501,6 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -1379,81 +1508,57 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", " \n", " \n", "
idnamedepartmentagesalary
117AliceHR30.05000.0
218BobUnknown40.0NaN40.065000.0
319CharlieEngineeringNaN35.0700000.0
420DavidHR25.048000.0
521EveUnknownNaN35.065000.0
22FrankEngineering28.072000.0
23NaNSales50.065000.0
24GraceSales45.065000.0
25AliceHR30.05000.0
26BobNaN40.065000.0
627CharlieEngineering35.0700000.0
28DavidHR25.048000.0
29EveNaN35.065000.0
30FrankEngineering28.072000.0
7Unknown31NaNSales50.065000.0
32GraceSales45.065000.0
33AliceHR30.05000.0
34BobNaN40.065000.0
35CharlieEngineering35.0700000.0
36DavidHR25.048000.0
37EveNaN35.065000.0
838FrankEngineering28.072000.0
39NaNSales50.065000.0
40GraceSales45.0departmentagesalaryid_is_outlierage_is_outliersalary_is_outlier
1AliceHR30.0000005000.0FalseFalseFalse30.050000.0
2BobUnknown40.000000178000.0FalseFalseFalseEngineering40.048000.0
3CharlieEngineering36.142857700000.0FalseFalseTrue25.070000.0
4DavidHR25.00000025.048000.0FalseFalseFalse
5EveUnknown35.000000178000.0FalseFalseFalseEngineering35.048000.0
6FrankEngineering28.00000028.072000.0FalseFalseFalse
7UnknownAliceSales50.000000178000.0FalseFalseFalse50.048000.0
8GraceSales45.00000045.065000.0FalseFalseFalse
" @@ -1464,24 +1569,22 @@ } ], "source": [ - "%outliers" + "%fillmissing strategy=mode" ] }, { "cell_type": "code", - "execution_count": 11, - "id": "ea459692-59ac-49be-8ba2-0c4b0b01908d", + "execution_count": 5, + "id": "948523f7-1b1c-4f37-a3ae-161e8ce40d0b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Outlier detection completed (non in-place). Summary:\n", - "Column 'id': detected 0 outlier(s) using zscore.\n", - "Column 'age': detected 0 outlier(s) using zscore.\n", - "Column 'salary': detected 0 outlier(s) using zscore.\n", - "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" + "Fill missing completed (in-place). Summary:\n", + "Column 'name': filled missing with constant value=Unknown.\n", + "Column 'department': filled missing with constant value=Unknown.\n" ] }, { @@ -1495,9 +1598,6 @@ " department\n", " age\n", " salary\n", - " id_is_outlier\n", - " age_is_outlier\n", - " salary_is_outlier\n", " \n", " \n", " \n", @@ -1505,490 +1605,277 @@ " 1\n", " Alice\n", " HR\n", - " 30.000000\n", + " 30.0\n", " 5000.0\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 2\n", " Bob\n", " Unknown\n", - " 40.000000\n", - " 178000.0\n", - " False\n", - " False\n", - " False\n", + " 40.0\n", + " 65000.0\n", " \n", " \n", " 3\n", " Charlie\n", " Engineering\n", - " 36.142857\n", + " 35.0\n", " 700000.0\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 4\n", " David\n", " HR\n", - " 25.000000\n", + " 25.0\n", " 48000.0\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 5\n", " Eve\n", " Unknown\n", - " 35.000000\n", - " 178000.0\n", - " False\n", - " False\n", - " False\n", + " 35.0\n", + " 65000.0\n", " \n", " \n", " 6\n", " Frank\n", " Engineering\n", - " 28.000000\n", + " 28.0\n", " 72000.0\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 7\n", " Unknown\n", " Sales\n", - " 50.000000\n", - " 178000.0\n", - " False\n", - " False\n", - " 
False\n", + " 50.0\n", + " 65000.0\n", " \n", " \n", " 8\n", " Grace\n", " Sales\n", - " 45.000000\n", + " 45.0\n", " 65000.0\n", - " False\n", - " False\n", - " False\n", - " \n", - " \n", - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%outliers method=zscore z_thresh=2.5" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "6a53e5f2-cdb4-4d72-afb1-5a7719be8835", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Outlier detection completed (non in-place). Summary:\n", - "Column 'id': detected 0 outlier(s) using iqr.\n", - "Column 'age': detected 0 outlier(s) using iqr.\n", - "Column 'salary': detected 1 outlier(s) using iqr.\n", - "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" - ] - }, - { - "data": { - "image/png": "" - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalaryid_is_outlierage_is_outliersalary_is_outlier
19AliceHR30.00000030.05000.0FalseFalseFalse
210BobUnknown40.000000178000.0FalseFalseFalse40.065000.0
311CharlieEngineering36.14285735.0700000.0FalseFalseTrue
412DavidHR25.00000025.048000.0FalseFalseFalse
513EveUnknown35.000000178000.0FalseFalseFalse35.065000.0
614FrankEngineering28.00000028.072000.0FalseFalseFalse
715UnknownSales50.000000178000.0FalseFalseFalse50.065000.0
816GraceSales45.00000045.065000.0FalseFalseFalse
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%outliers plot=True" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "636a1612-e7b4-44fd-9cdb-bf7c935719b7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Marked outliers in-place. Summary:\n", - "Column 'salary': detected 2 outlier(s) using iqr.\n" - ] - }, - { - "data": { - "image/png": "" - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", " \n", - " \n", - "
idnamedepartmentagesalarysalary_is_outlier
117AliceHR30.05000.0True
218BobNaNUnknown40.0NaNFalse65000.0
319CharlieEngineeringNaN35.0700000.0True
420DavidHR25.048000.0False
521EveNaNUnknown35.0NaNFalse65000.0
622FrankEngineering28.072000.0False
7NaN23UnknownSales50.0NaNFalse65000.0
824GraceSales45.065000.0False
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%outliers columns=salary plot=True" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "3dfbc841-56d4-40f5-9be0-927acf0e6c63", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dropped 2 row(s) containing outliers (in-place).\n", - "Column 'id': detected 0 outlier(s) using iqr.\n", - "Column 'age': detected 0 outlier(s) using iqr.\n", - "Column 'salary': detected 2 outlier(s) using iqr.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - "
idnamedepartmentagesalary
25AliceHR30.05000.0
226BobNaNUnknown40.0NaN65000.0
427CharlieEngineering35.0700000.0
28DavidHR25.048000.0
529EveNaNUnknown35.0NaN65000.0
630FrankEngineering28.072000.0
7NaN31UnknownSales50.0NaN65000.0
832GraceSales45.065000.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%dropoutliers" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "fdc71a0a-2246-4754-8f52-642be4ea209f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No outliers detected. No rows removed.\n", - "Column 'id': detected 0 outlier(s) using zscore.\n", - "Column 'age': detected 0 outlier(s) using zscore.\n", - "Column 'salary': detected 0 outlier(s) using zscore.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2003,24 +1890,60 @@ } ], "source": [ - "%dropoutliers method=zscore z_thresh=2.5" + "%fillmissing columns=name,department strategy=constant value=\"Unknown\"" ] }, { "cell_type": "code", - "execution_count": 13, - "id": "b4c02d58-7020-4514-b84a-9a6b5e18fb40", + "execution_count": 27, + "id": "81a88f9a-bdf8-4f87-a5d0-a0a88fcc5ace", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Strategy 'constant' requires a 'value=...' 
argument.\n" + ] + } + ], + "source": [ + "%fillmissing strategy=constant" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "d76ed0d7-4332-40b0-a5c6-588784807a23", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Clip outliers completed using iqr.\n", - "Column 'id': clipped 0 value(s) (bounds: -2.5000, 11.5000).\n", - "Column 'age': clipped 0 value(s) (bounds: 11.8750, 58.8750).\n", - "Column 'salary': clipped 1 value(s) (bounds: -115125.0000, 353875.0000).\n", - "Total values clipped: 1. Modified in-place: data['last_select'] updated.\n" + "The result set was successfully written into last_query.csv\n" + ] + } + ], + "source": [ + "%df" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6cf47bfb-ff40-4522-8723-a8bf17398210", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Outlier detection completed (non in-place). Summary:\n", + "Column 'id': detected 0 outlier(s) using iqr.\n", + "Column 'age': detected 0 outlier(s) using iqr.\n", + "Column 'salary': detected 1 outlier(s) using iqr.\n", + "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" ] }, { @@ -2034,6 +1957,9 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2043,6 +1969,9 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2050,13 +1979,19 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2064,6 +1999,9 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2071,6 +2009,9 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2078,6 +2019,9 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2085,6 +2029,9 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2092,6 +2039,9 @@ " \n", " \n", " 
\n", + " \n", + " \n", + " \n", " \n", " \n", "
idnamedepartmentagesalary
33AliceHR30.05000.0
234BobNaNUnknown40.0NaN65000.0
435CharlieEngineering35.0700000.0
36DavidHR25.048000.0
537EveNaNUnknown35.0NaN65000.0
638FrankEngineering28.072000.0
7NaN39UnknownSales50.0NaN65000.0
840GraceSales45.0departmentagesalaryid_is_outlierage_is_outliersalary_is_outlier
HR30.0000005000.0FalseFalseFalse
2Unknown40.000000178000.0FalseFalseFalse
3CharlieEngineering36.142857353875.0700000.0FalseFalseTrue
4HR25.00000048000.0FalseFalseFalse
5Unknown35.000000178000.0FalseFalseFalse
6Engineering28.00000072000.0FalseFalseFalse
7Sales50.000000178000.0FalseFalseFalse
8Sales45.00000065000.0FalseFalseFalse
" @@ -2102,24 +2052,24 @@ } ], "source": [ - "%clipoutliers" + "%outliers" ] }, { "cell_type": "code", - "execution_count": 14, - "id": "d146b7c7-8860-4962-a9e8-78675b068982", + "execution_count": 11, + "id": "ea459692-59ac-49be-8ba2-0c4b0b01908d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Clip outliers completed.\n", - "Column 'id': clipped 0 value(s) to bounds (lower=-0.39897948556635576, upper=9.398979485566356).\n", - "Column 'age': clipped 0 value(s) to bounds (lower=17.616171113291706, upper=54.669543172422586).\n", - "Column 'salary': clipped 0 value(s) to bounds (lower=-9114.19257183242, upper=131114.19257183242).\n", - "Total values clipped: 0. Modified in-place: data['last_select'] updated.\n" + "Outlier detection completed (non in-place). Summary:\n", + "Column 'id': detected 0 outlier(s) using zscore.\n", + "Column 'age': detected 0 outlier(s) using zscore.\n", + "Column 'salary': detected 0 outlier(s) using zscore.\n", + "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" ] }, { @@ -2133,6 +2083,9 @@ " department\n", " age\n", " salary\n", + " id_is_outlier\n", + " age_is_outlier\n", + " salary_is_outlier\n", " \n", " \n", " \n", @@ -2140,57 +2093,81 @@ " 1\n", " Alice\n", " HR\n", - " 30.0\n", - " 12000.0\n", + " 30.000000\n", + " 5000.0\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 2\n", " Bob\n", - " NaN\n", - " 40.0\n", - " NaN\n", + " Unknown\n", + " 40.000000\n", + " 178000.0\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 3\n", " Charlie\n", " Engineering\n", - " NaN\n", - " 108000.0\n", + " 36.142857\n", + " 700000.0\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 4\n", " David\n", " HR\n", - " 25.0\n", + " 25.000000\n", " 48000.0\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 5\n", " Eve\n", - " NaN\n", - " 35.0\n", - " NaN\n", + " Unknown\n", + " 35.000000\n", + " 178000.0\n", + " False\n", + " 
False\n", + " False\n", " \n", " \n", " 6\n", " Frank\n", " Engineering\n", - " 28.0\n", + " 28.000000\n", " 72000.0\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 7\n", - " NaN\n", + " Unknown\n", " Sales\n", - " 50.0\n", - " NaN\n", + " 50.000000\n", + " 178000.0\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 8\n", " Grace\n", " Sales\n", - " 45.0\n", + " 45.000000\n", " 65000.0\n", + " False\n", + " False\n", + " False\n", " \n", " \n", "" @@ -2201,22 +2178,33 @@ } ], "source": [ - "%clipoutliers method=zscore z_thresh=2.0 " + "%outliers method=zscore z_thresh=2.5" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "61b6b28f-9b56-4d7f-8613-a406928dbb1c", + "execution_count": 12, + "id": "6a53e5f2-cdb4-4d72-afb1-5a7719be8835", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Encoded columns in-place and updated last_select.\n" + "Outlier detection completed (non in-place). Summary:\n", + "Column 'id': detected 0 outlier(s) using iqr.\n", + "Column 'age': detected 0 outlier(s) using iqr.\n", + "Column 'salary': detected 1 outlier(s) using iqr.\n", + "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" ] }, + { + "data": { + "image/png": "" + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "text/html": [ @@ -2228,7 +2216,9 @@ " department\n", " age\n", " salary\n", - " department_lbl\n", + " id_is_outlier\n", + " age_is_outlier\n", + " salary_is_outlier\n", " \n", " \n", " \n", @@ -2236,65 +2226,81 @@ " 1\n", " Alice\n", " HR\n", - " 30.0\n", + " 30.000000\n", " 5000.0\n", - " 1\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 2\n", " Bob\n", " Unknown\n", - " 40.0\n", - " NaN\n", - " 3\n", + " 40.000000\n", + " 178000.0\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 3\n", " Charlie\n", " Engineering\n", - " NaN\n", + " 36.142857\n", " 700000.0\n", - " 0\n", + " False\n", + " False\n", + " True\n", " 
\n", " \n", " 4\n", " David\n", " HR\n", - " 25.0\n", + " 25.000000\n", " 48000.0\n", - " 1\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 5\n", " Eve\n", " Unknown\n", - " 35.0\n", - " NaN\n", - " 3\n", + " 35.000000\n", + " 178000.0\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 6\n", " Frank\n", " Engineering\n", - " 28.0\n", + " 28.000000\n", " 72000.0\n", - " 0\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 7\n", " Unknown\n", " Sales\n", - " 50.0\n", - " NaN\n", - " 2\n", + " 50.000000\n", + " 178000.0\n", + " False\n", + " False\n", + " False\n", " \n", " \n", " 8\n", " Grace\n", " Sales\n", - " 45.0\n", + " 45.000000\n", " 65000.0\n", - " 2\n", + " False\n", + " False\n", + " False\n", " \n", " \n", "" @@ -2305,22 +2311,30 @@ } ], "source": [ - "%encode method=label columns=department drop_original=false" + "%outliers plot=True" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "ac369830-92cc-44ba-b7b2-cfa7b620aafe", + "execution_count": 24, + "id": "636a1612-e7b4-44fd-9cdb-bf7c935719b7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Encoded columns in-place and updated last_select.\n" + "Marked outliers in-place. 
Summary:\n", + "Column 'salary': detected 2 outlier(s) using iqr.\n" ] }, + { + "data": { + "image/png": "" + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "text/html": [ @@ -2329,103 +2343,76 @@ " \n", " id\n", " name\n", + " department\n", " age\n", " salary\n", - " department_lbl\n", - " department_Engineering\n", - " department_HR\n", - " department_Sales\n", - " department_Unknown\n", + " salary_is_outlier\n", " \n", " \n", " \n", " \n", " 1\n", " Alice\n", + " HR\n", " 30.0\n", " 5000.0\n", - " 1\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", + " True\n", " \n", " \n", " 2\n", " Bob\n", + " NaN\n", " 40.0\n", " NaN\n", - " 3\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", + " False\n", " \n", " \n", " 3\n", " Charlie\n", + " Engineering\n", " NaN\n", " 700000.0\n", - " 0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", + " True\n", " \n", " \n", " 4\n", " David\n", + " HR\n", " 25.0\n", " 48000.0\n", - " 1\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", + " False\n", " \n", " \n", " 5\n", " Eve\n", + " NaN\n", " 35.0\n", " NaN\n", - " 3\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", + " False\n", " \n", " \n", " 6\n", " Frank\n", + " Engineering\n", " 28.0\n", " 72000.0\n", - " 0\n", - " 1.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", + " False\n", " \n", " \n", " 7\n", - " Unknown\n", + " NaN\n", + " Sales\n", " 50.0\n", " NaN\n", - " 2\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", + " False\n", " \n", " \n", " 8\n", " Grace\n", + " Sales\n", " 45.0\n", " 65000.0\n", - " 2\n", - " 0.0\n", - " 0.0\n", - " 1.0\n", - " 0.0\n", + " False\n", " \n", " \n", "" @@ -2436,20 +2423,23 @@ } ], "source": [ - "%encode method=onehot columns=department drop_original=false" + "%outliers columns=salary plot=True" ] }, { "cell_type": "code", - "execution_count": 13, - "id": "ed2b1c8f-4372-4cb4-9edb-f27bfa07ee83", + "execution_count": 7, + "id": "3dfbc841-56d4-40f5-9be0-927acf0e6c63", "metadata": {}, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ - "Encoded columns in-place and updated last_select.\n" + "Dropped 2 row(s) containing outliers (in-place).\n", + "Column 'id': detected 0 outlier(s) using iqr.\n", + "Column 'age': detected 0 outlier(s) using iqr.\n", + "Column 'salary': detected 2 outlier(s) using iqr.\n" ] }, { @@ -2460,67 +2450,53 @@ " \n", " id\n", " name\n", + " department\n", " age\n", " salary\n", - " department_ord\n", " \n", " \n", " \n", " \n", - " 1\n", - " Alice\n", - " 30.0\n", - " 5000.0\n", - " 3.0\n", - " \n", - " \n", " 2\n", " Bob\n", - " 40.0\n", " NaN\n", + " 40.0\n", " NaN\n", " \n", " \n", - " 3\n", - " Charlie\n", - " NaN\n", - " 700000.0\n", - " 3.0\n", - " \n", - " \n", " 4\n", " David\n", + " HR\n", " 25.0\n", " 48000.0\n", - " 3.0\n", " \n", " \n", " 5\n", " Eve\n", - " 35.0\n", " NaN\n", + " 35.0\n", " NaN\n", " \n", " \n", " 6\n", " Frank\n", + " Engineering\n", " 28.0\n", " 72000.0\n", - " 3.0\n", " \n", " \n", " 7\n", " NaN\n", + " Sales\n", " 50.0\n", " NaN\n", - " 3.0\n", " \n", " \n", " 8\n", " Grace\n", + " Sales\n", " 45.0\n", " 65000.0\n", - " 3.0\n", " \n", " \n", "" @@ -2531,24 +2507,3398 @@ } ], "source": [ - "%encode method=ordinal columns=department drop_original=false" + "%dropoutliers" ] }, { "cell_type": "code", - "execution_count": null, - "id": "4247d68c-6f93-4297-b1fd-fa09bf6362f8", + "execution_count": 8, + "id": "fdc71a0a-2246-4754-8f52-642be4ea209f", "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No outliers detected. 
No rows removed.\n", + "Column 'id': detected 0 outlier(s) using zscore.\n", + "Column 'age': detected 0 outlier(s) using zscore.\n", + "Column 'salary': detected 0 outlier(s) using zscore.\n" + ] + }, { "data": { "text/html": [ - "--------------\r\n", - "drop table employees\r\n", - "--------------\r\n", - "\r\n", - "ERROR 1046 (3D000) at line 1 in file: '/home/sneha/mariadb_kernel/.mariadb_statement_bf03f152-b263-11f0-b5b5-00155d4e875d': No database selected\r\n", - "\u0007\u001b(B\u001b[0;7m\u001b(B\u001b[m" + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
2BobNaN40.0NaN
4DavidHR25.048000.0
5EveNaN35.0NaN
6FrankEngineering28.072000.0
7NaNSales50.0NaN
8GraceSales45.065000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%dropoutliers method=zscore z_thresh=2.5" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b4c02d58-7020-4514-b84a-9a6b5e18fb40", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Clip outliers completed using iqr.\n", + "Column 'id': clipped 0 value(s) (bounds: -2.5000, 11.5000).\n", + "Column 'age': clipped 0 value(s) (bounds: 11.8750, 58.8750).\n", + "Column 'salary': clipped 1 value(s) (bounds: -115125.0000, 353875.0000).\n", + "Total values clipped: 1. Modified in-place: data['last_select'] updated.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
1AliceHR30.0000005000.0
2BobUnknown40.000000178000.0
3CharlieEngineering36.142857353875.0
4DavidHR25.00000048000.0
5EveUnknown35.000000178000.0
6FrankEngineering28.00000072000.0
7UnknownSales50.000000178000.0
8GraceSales45.00000065000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%clipoutliers" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "d146b7c7-8860-4962-a9e8-78675b068982", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Clip outliers completed.\n", + "Column 'id': clipped 0 value(s) to bounds (lower=-0.39897948556635576, upper=9.398979485566356).\n", + "Column 'age': clipped 0 value(s) to bounds (lower=17.616171113291706, upper=54.669543172422586).\n", + "Column 'salary': clipped 0 value(s) to bounds (lower=-9114.19257183242, upper=131114.19257183242).\n", + "Total values clipped: 0. Modified in-place: data['last_select'] updated.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
1AliceHR30.012000.0
2BobNaN40.0NaN
3CharlieEngineeringNaN108000.0
4DavidHR25.048000.0
5EveNaN35.0NaN
6FrankEngineering28.072000.0
7NaNSales50.0NaN
8GraceSales45.065000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%clipoutliers method=zscore z_thresh=2.0 " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "61b6b28f-9b56-4d7f-8613-a406928dbb1c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Encoded columns in-place and updated last_select.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalarydepartment_lbl
1AliceHR30.05000.01
2BobUnknown40.065000.03
3CharlieEngineering35.0700000.00
4DavidHR25.048000.01
5EveUnknown35.065000.03
6FrankEngineering28.072000.00
7UnknownSales50.065000.02
8GraceSales45.065000.02
9AliceHR30.05000.01
10BobUnknown40.065000.03
11CharlieEngineering35.0700000.00
12DavidHR25.048000.01
13EveUnknown35.065000.03
14FrankEngineering28.072000.00
15UnknownSales50.065000.02
16GraceSales45.065000.02
17AliceHR30.05000.01
18BobUnknown40.065000.03
19CharlieEngineering35.0700000.00
20DavidHR25.048000.01
21EveUnknown35.065000.03
22FrankEngineering28.072000.00
23UnknownSales50.065000.02
24GraceSales45.065000.02
25AliceHR30.05000.01
26BobUnknown40.065000.03
27CharlieEngineering35.0700000.00
28DavidHR25.048000.01
29EveUnknown35.065000.03
30FrankEngineering28.072000.00
31UnknownSales50.065000.02
32GraceSales45.065000.02
33AliceHR30.05000.01
34BobUnknown40.065000.03
35CharlieEngineering35.0700000.00
36DavidHR25.048000.01
37EveUnknown35.065000.03
38FrankEngineering28.072000.00
39UnknownSales50.065000.02
40GraceSales45.065000.02
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%encode method=label columns=department drop_original=false" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ac369830-92cc-44ba-b7b2-cfa7b620aafe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Encoded columns in-place and updated last_select.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalarydepartment_Engineeringdepartment_HRdepartment_Salesdepartment_Unknown
1AliceHR30.05000.00.01.00.00.0
2BobUnknown40.065000.00.00.00.01.0
3CharlieEngineering35.0700000.01.00.00.00.0
4DavidHR25.048000.00.01.00.00.0
5EveUnknown35.065000.00.00.00.01.0
6FrankEngineering28.072000.01.00.00.00.0
7UnknownSales50.065000.00.00.01.00.0
8GraceSales45.065000.00.00.01.00.0
9AliceHR30.05000.00.01.00.00.0
10BobUnknown40.065000.00.00.00.01.0
11CharlieEngineering35.0700000.01.00.00.00.0
12DavidHR25.048000.00.01.00.00.0
13EveUnknown35.065000.00.00.00.01.0
14FrankEngineering28.072000.01.00.00.00.0
15UnknownSales50.065000.00.00.01.00.0
16GraceSales45.065000.00.00.01.00.0
17AliceHR30.05000.00.01.00.00.0
18BobUnknown40.065000.00.00.00.01.0
19CharlieEngineering35.0700000.01.00.00.00.0
20DavidHR25.048000.00.01.00.00.0
21EveUnknown35.065000.00.00.00.01.0
22FrankEngineering28.072000.01.00.00.00.0
23UnknownSales50.065000.00.00.01.00.0
24GraceSales45.065000.00.00.01.00.0
25AliceHR30.05000.00.01.00.00.0
26BobUnknown40.065000.00.00.00.01.0
27CharlieEngineering35.0700000.01.00.00.00.0
28DavidHR25.048000.00.01.00.00.0
29EveUnknown35.065000.00.00.00.01.0
30FrankEngineering28.072000.01.00.00.00.0
31UnknownSales50.065000.00.00.01.00.0
32GraceSales45.065000.00.00.01.00.0
33AliceHR30.05000.00.01.00.00.0
34BobUnknown40.065000.00.00.00.01.0
35CharlieEngineering35.0700000.01.00.00.00.0
36DavidHR25.048000.00.01.00.00.0
37EveUnknown35.065000.00.00.00.01.0
38FrankEngineering28.072000.01.00.00.00.0
39UnknownSales50.065000.00.00.01.00.0
40GraceSales45.065000.00.00.01.00.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%encode method=onehot columns=department drop_original=false" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ed2b1c8f-4372-4cb4-9edb-f27bfa07ee83", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Encoded columns in-place and updated last_select.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalarydepartment_lbldepartment_Engineeringdepartment_HRdepartment_Salesdepartment_Unknowndepartment_ord
1AliceHR30.05000.010.01.00.00.01.0
2BobUnknown40.0NaN30.00.00.01.03.0
3CharlieEngineeringNaN700000.001.00.00.00.00.0
4DavidHR25.048000.010.01.00.00.01.0
5EveUnknown35.0NaN30.00.00.01.03.0
6FrankEngineering28.072000.001.00.00.00.00.0
7UnknownSales50.0NaN20.00.01.00.02.0
8GraceSales45.065000.020.00.01.00.02.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%encode method=ordinal columns=department drop_original=false" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "ab1491d5-4a2b-46e8-a079-dab57ae95afe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Normalized 3 column(s) to range (0.0, 1.0). Updated data['last_select'] in-place.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
0.000000AliceHR0.200.000000
0.142857BobNaN0.60NaN
0.285714CharlieEngineeringNaN1.000000
0.428571DavidHR0.000.061871
0.571429EveNaN0.40NaN
0.714286FrankEngineering0.120.096403
0.857143NaNSales1.00NaN
1.000000GraceSales0.800.086331
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%normalize" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "1708d88d-db07-40cb-aeef-fcb6baffe649", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Normalized 2 column(s) to range (5.0, 10.0). Stored in data['last_select_normalized'].\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
1AliceHR6.05.000000
2BobNaN8.0NaN
3CharlieEngineeringNaN10.000000
4DavidHR5.05.309353
5EveNaN7.0NaN
6FrankEngineering5.65.482014
7NaNSales10.0NaN
8GraceSales9.05.431655
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%normalize columns=age,salary feature_range=5,10 inplace=False" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "5cc8a8b5-011e-4285-89f7-a7ee13b0da22", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Standardized 3 column(s) (mean=0, std=1). Stored in data['last_select_standardized'].\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
-1.527525AliceHR-0.716269-0.660211
-1.091089BobNaN0.449750NaN
-0.654654CharlieEngineeringNaN1.992082
-0.218218DavidHR-1.299278-0.496112
0.218218EveNaN-0.133259NaN
0.654654FrankEngineering-0.949473-0.404522
1.091089NaNSales1.615769NaN
1.527525GraceSales1.032760-0.431236
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%standardize inplace=False" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "92f0fb87-521e-43dc-8604-9f4342d446e2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Standardized 2 column(s) (mean=0, std=1). Stored in data['last_select_standardized'].\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
1AliceHR-0.716269-0.660211
2BobNaN0.449750NaN
3CharlieEngineeringNaN1.992082
4DavidHR-1.299278-0.496112
5EveNaN-0.133259NaN
6FrankEngineering-0.949473-0.404522
7NaNSales1.615769NaN
8GraceSales1.032760-0.431236
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%standardize columns=age,salary inplace=False" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6a7c652c-35b8-4c51-b00a-9ccb7c2c78b0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split completed: total=8, train=6, test=2, val=0.\n" + ] + }, + { + "data": { + "text/html": [ + "

Train (6 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
5EveNaN35.0NaN
7NaNSales50.0NaN
3CharlieEngineeringNaN700000.0
8GraceSales45.065000.0
2BobNaN40.0NaN
6FrankEngineering28.072000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Test (2 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
4DavidHR25.048000.0
1AliceHR30.05000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%splitdata" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "992bf2a2-15e2-4c67-a8fc-f0ac3c3e0630", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split completed: total=76, train=52, test=16, val=8.\n" + ] + }, + { + "data": { + "text/html": [ + "

Train (52 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalarydepartment_Engineeringdepartment_HRdepartment_Salesdepartment_Unknown_predicted_pred_probadepartment_lbl
16GraceSales45.065000.00.00.01.00.0Sales[9.200406422034337e-08, 1.243137546336329e-07, 0.9999996747434211, 1.0893876019841573e-07]2
27CharlieEngineering35.0700000.01.00.00.00.0300335.891003NaN0
28DavidHR25.048000.00.01.00.00.0HR[9.321102887398835e-257, 1.0, 1.6922240381185897e-260, 3.545685506178999e-152]1
25AliceHR30.05000.00.01.00.00.0NaNNaN1
5EveUnknown35.065000.00.00.00.01.0Unknown[3.028517537014313e-05, 1.0663051108596776e-09, 1.6103057563599044e-05, 0.9999536107007612]3
9AliceHR30.05000.00.01.00.00.0NaNNaN1
4DavidHR25.048000.00.01.00.00.0NaNNaN1
31UnknownSales50.065000.00.00.01.00.0NaNNaN2
5EveUnknown35.065000.00.00.00.01.0Unknown[8.812514498750527e-08, 1.192424207762908e-07, 8.236229869877078e-08, 0.9999997102701355]3
27CharlieEngineering35.0700000.01.00.00.00.0107701.601304NaN0
28DavidHR25.048000.00.01.00.00.0104344.615975NaN1
16GraceSales45.065000.00.00.01.00.0Sales[0.0, 0.0, 0.96, 0.04]2
33AliceHR30.05000.00.01.00.00.0NaNNaN1
8GraceSales45.065000.00.00.01.00.0NaNNaN2
10BobUnknown40.065000.00.00.00.01.0NaNNaN3
19CharlieEngineering35.0700000.01.00.00.00.0NaNNaN0
16GraceSales45.065000.00.00.01.00.0111058.586633NaN2
20DavidHR25.048000.00.01.00.00.0HR[8.507704216209058e-08, 0.9999997345523971, 7.949637279588989e-08, 1.0087418796343624e-07]1
13EveUnknown35.065000.00.00.00.01.0Unknown[8.351973478650103e-08, 1.1257529759402385e-07, 9.123997229467953e-08, 0.9999997126649953]3
17AliceHR30.05000.00.01.00.00.0HR[8.507704216209058e-08, 0.9999997345523971, 7.949637279588989e-08, 1.0087418796343624e-07]1
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Validation (8 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalarydepartment_Engineeringdepartment_HRdepartment_Salesdepartment_Unknown_predicted_pred_probadepartment_lbl
12DavidHR25.048000.00.01.00.00.0NaNNaN1
5EveUnknown35.065000.00.00.00.01.0107701.601304NaN3
13EveUnknown35.065000.00.00.00.01.0107701.601304NaN3
13EveUnknown35.065000.00.00.00.01.0300335.891003NaN3
20DavidHR25.048000.00.01.00.00.0HR[8.219883955756973e-08, 0.9999997238582116, 8.980877253636851e-08, 1.0413417643748582e-07]1
1AliceHR30.05000.00.01.00.00.0NaNNaN1
38FrankEngineering28.072000.01.00.00.00.072000.0NaN0
17AliceHR30.05000.00.01.00.00.0106023.108639NaN1
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Test (16 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalarydepartment_Engineeringdepartment_HRdepartment_Salesdepartment_Unknown_predicted_pred_probadepartment_lbl
6FrankEngineering28.072000.01.00.00.00.0NaNNaN0
28DavidHR25.048000.00.01.00.00.0HR[0.01, 0.95, 0.0, 0.04]1
34BobUnknown40.065000.00.00.00.01.0NaNNaN3
18BobUnknown40.065000.00.00.00.01.0NaNNaN3
17AliceHR30.05000.00.01.00.00.0HR[8.219883955756973e-08, 0.9999997238582116, 8.980877253636851e-08, 1.0413417643748582e-07]1
27CharlieEngineering35.0700000.01.00.00.00.0Engineering[0.9999996765912755, 1.1755477900860697e-07, 9.537254044052893e-08, 1.104814050856308e-07]0
13EveUnknown35.065000.00.00.00.01.0Unknown[8.812514498750527e-08, 1.192424207762908e-07, 8.236229869877078e-08, 0.9999997102701355]3
17AliceHR30.05000.00.01.00.00.0HR[0.0, 1.0, 0.0, 0.0]1
38FrankEngineering28.072000.01.00.00.00.0Engineering[0.9999996765912744, 1.1755477900860684e-07, 9.537254044052883e-08, 1.104814062137575e-07]0
20DavidHR25.048000.00.01.00.00.0HR[0.01, 0.95, 0.0, 0.04]1
28DavidHR25.048000.00.01.00.00.0HR[8.507704216209058e-08, 0.9999997345523971, 7.949637279588989e-08, 1.0087418796343624e-07]1
16GraceSales45.065000.00.00.01.00.065000.0NaN2
24GraceSales45.065000.00.00.01.00.0NaNNaN2
7UnknownSales50.065000.00.00.01.00.0NaNNaN2
38FrankEngineering28.072000.01.00.00.00.0Engineering[0.9991777270016182, 7.148513430183318e-113, 0.0008222729983817936, 1.9337772157312066e-34]0
13EveUnknown35.065000.00.00.00.01.0Unknown[0.01, 0.0, 0.0, 0.99]3
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%splitdata test_size=0.2 val_size=0.1 random_state=42" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "000a2e5b-1918-4371-8b47-d3a4547a1759", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split completed: total=40, train=23, test=12, val=5.\n" + ] + }, + { + "data": { + "text/html": [ + "

Train (23 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalarydepartment_lbl
28DavidHR25.048000.01
20DavidHR25.048000.01
35CharlieEngineering35.0700000.00
8GraceSales45.065000.02
18BobUnknown40.065000.03
24GraceSales45.065000.02
5EveUnknown35.065000.03
33AliceHR30.05000.01
3CharlieEngineering35.0700000.00
30FrankEngineering28.072000.00
17AliceHR30.05000.01
22FrankEngineering28.072000.00
7UnknownSales50.065000.02
2BobUnknown40.065000.03
12DavidHR25.048000.01
34BobUnknown40.065000.03
23UnknownSales50.065000.02
32GraceSales45.065000.02
37EveUnknown35.065000.03
25AliceHR30.05000.01
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Validation (5 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalarydepartment_lbl
26BobUnknown40.065000.03
14FrankEngineering28.072000.00
9AliceHR30.05000.01
39UnknownSales50.065000.02
15UnknownSales50.065000.02
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Test (12 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalarydepartment_lbl
10BobUnknown40.065000.03
16GraceSales45.065000.02
11CharlieEngineering35.0700000.00
4DavidHR25.048000.01
38FrankEngineering28.072000.00
29EveUnknown35.065000.03
1AliceHR30.05000.01
40GraceSales45.065000.02
31UnknownSales50.065000.02
27CharlieEngineering35.0700000.00
13EveUnknown35.065000.03
36DavidHR25.048000.01
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%splitdata stratify=department test_size=0.3 val_size=0.1 random_state=123" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "29b7abe8-4825-4096-86a0-7026c21de397", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split completed: total=23, train=17, test=6, val=0.\n" + ] + }, + { + "data": { + "text/html": [ + "

Train (17 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
28DavidHR25.048000.0
20DavidHR25.048000.0
35CharlieEngineeringNaN700000.0
8GraceSales45.065000.0
18BobUnknown40.0NaN
24GraceSales45.065000.0
5EveUnknown35.0NaN
33AliceHR30.05000.0
3CharlieEngineeringNaN700000.0
30FrankEngineering28.072000.0
17AliceHR30.05000.0
22FrankEngineering28.072000.0
7UnknownSales50.0NaN
2BobUnknown40.0NaN
12DavidHR25.048000.0
34BobUnknown40.0NaN
23UnknownSales50.0NaN
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Test (6 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
32GraceSales45.065000.0
37EveUnknown35.0NaN
25AliceHR30.05000.0
19CharlieEngineeringNaN700000.0
6FrankEngineering28.072000.0
21EveUnknown35.0NaN
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%splitdata test_size=0.25 shuffle=False inplace=False train_name=mytrain test_name=mytest" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2543a896-7047-45a7-a118-3adcfb822023", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model 'random_forest' trained and saved to data['last_model']. problem=classification. train_rows=23\n" + ] + } + ], + "source": [ + "%train_model target=department features=age,salary model=random_forest n_estimators=50 max_depth=4" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4c75dd88-2c4a-462f-8ee5-48b50cae3d69", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model 'linear_regression' trained and saved to data['last_model']. problem=regression. train_rows=23\n" + ] + } + ], + "source": [ + "%train_model target=salary features=age model=linear_regression" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2778c96c-80b3-4d80-8199-75ff6b7154e7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model 'gbm' trained and saved to data['last_model']. problem=classification. train_rows=23\n" + ] + } + ], + "source": [ + "%train_model target=department features=age,salary model=gbm n_estimators=100 learning_rate=0.05" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8f84d412-6352-4396-b5d6-d693262d3b7b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model 'logistic_regression' trained and saved to data['last_model']. problem=classification. 
train_rows=23\n" + ] + } + ], + "source": [ + "%train_model target=department features=age,salary model=logistic_regression max_iter=500" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4f7d7e57-e6e6-47cd-84f4-ac653b11e9b1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model 'catboost' trained and saved to data['last_model']. problem=regression. train_rows=23\n" + ] + } + ], + "source": [ + "%train_model model=catboost target=department_lbl features=age,salary model_params='{\"iterations\":50}'" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "1cdea559-1c4b-493d-a7bc-b9565f7f3b7d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model 'xgboost' trained and saved to data['last_model']. problem=classification. train_rows=23\n" + ] + } + ], + "source": [ + "%train_model model=xgboost target=department_lbl features=age,salary model_params='{\"n_estimators\":100, \"max_depth\":3}' problem=\"classification\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "1d90ce87-aafd-4958-8318-09f66793b98e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Classification metrics (model: 'last_model')\n", + " accuracy = 1.0000\n", + " precision (weighted) = 1.0000\n", + " recall (weighted) = 1.0000\n", + " f1 (weighted) = 1.0000\n", + " Confusion matrix (rows=actual, cols=predicted):\n", + "[[3, 0, 0, 0], [0, 3, 0, 0], [0, 0, 3, 0], [0, 0, 0, 3]]\n", + " ROC AUC (multiclass OVR, weighted) = 1.0000\n", + "\n", + "Classification report:\n", + " precision recall f1-score support\n", + "\n", + " Engineering 1.00 1.00 1.00 3\n", + " HR 1.00 1.00 1.00 3\n", + " Sales 1.00 1.00 1.00 3\n", + " Unknown 1.00 1.00 1.00 3\n", + "\n", + " accuracy 1.00 12\n", + " macro avg 1.00 1.00 1.00 12\n", + "weighted avg 1.00 1.00 1.00 12\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "

Predictions (actual=department | predicted=_predicted). Showing up to 200 rows.

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalarydepartment_lbl_predicted_pred_proba
10BobUnknown40.065000.03Unknown[0.0, 0.0, 0.0, 1.0]
16GraceSales45.065000.02Sales[0.0, 0.0, 0.98, 0.02]
11CharlieEngineering35.0700000.00Engineering[0.97, 0.0, 0.0, 0.03]
4DavidHR25.048000.01HR[0.01, 0.97, 0.0, 0.02]
38FrankEngineering28.072000.00Engineering[0.95, 0.01, 0.0, 0.04]
29EveUnknown35.065000.03Unknown[0.0, 0.0, 0.0, 1.0]
1AliceHR30.05000.01HR[0.0, 1.0, 0.0, 0.0]
40GraceSales45.065000.02Sales[0.0, 0.0, 0.98, 0.02]
31UnknownSales50.065000.02Sales[0.0, 0.0, 1.0, 0.0]
27CharlieEngineering35.0700000.00Engineering[0.97, 0.0, 0.0, 0.03]
13EveUnknown35.065000.03Unknown[0.0, 0.0, 0.0, 1.0]
36DavidHR25.048000.01HR[0.01, 0.97, 0.0, 0.02]
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%evaluate_model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4247d68c-6f93-4297-b1fd-fa09bf6362f8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" ] }, "metadata": {}, diff --git a/catboost_info/catboost_training.json b/catboost_info/catboost_training.json new file mode 100644 index 0000000..00cf6ba --- /dev/null +++ b/catboost_info/catboost_training.json @@ -0,0 +1,54 @@ +{ +"meta":{"test_sets":[],"test_metrics":[],"learn_metrics":[{"best_value":"Min","name":"RMSE"}],"launch_mode":"Train","parameters":"","iteration_count":50,"learn_sets":["learn"],"name":"experiment"}, +"iterations":[ +{"learn":[0.9710631926],"iteration":0,"passed_time":0.05695364861,"remaining_time":2.790728782}, +{"learn":[0.8247710073],"iteration":1,"passed_time":0.05764274648,"remaining_time":1.383425916}, +{"learn":[0.7022282601],"iteration":2,"passed_time":0.05819126054,"remaining_time":0.9116630817}, +{"learn":[0.6270477627],"iteration":3,"passed_time":0.06090029169,"remaining_time":0.7003533545}, +{"learn":[0.5346013406],"iteration":4,"passed_time":0.06149235543,"remaining_time":0.5534311989}, +{"learn":[0.444717163],"iteration":5,"passed_time":0.06183177846,"remaining_time":0.4534330421}, +{"learn":[0.3839413771],"iteration":6,"passed_time":0.06219568651,"remaining_time":0.3820592172}, +{"learn":[0.3332464887],"iteration":7,"passed_time":0.06268006686,"remaining_time":0.329070351}, +{"learn":[0.2896858851],"iteration":8,"passed_time":0.06303172973,"remaining_time":0.2871445465}, +{"learn":[0.2546482153],"iteration":9,"passed_time":0.06325858125,"remaining_time":0.253034325}, +{"learn":[0.2217991067],"iteration":10,"passed_time":0.06379790184,"remaining_time":0.2261925611}, +{"learn":[0.1892739767],"iteration":11,"passed_time":0.0646127055,"remaining_time":0.2046069008}, 
+{"learn":[0.1682408917],"iteration":12,"passed_time":0.06483456629,"remaining_time":0.1845291502}, +{"learn":[0.1400203532],"iteration":13,"passed_time":0.06534558127,"remaining_time":0.1680314947}, +{"learn":[0.1217420713],"iteration":14,"passed_time":0.06562706824,"remaining_time":0.1531298259}, +{"learn":[0.1038814836],"iteration":15,"passed_time":0.06632963735,"remaining_time":0.1409504794}, +{"learn":[0.08639816643],"iteration":16,"passed_time":0.06670532651,"remaining_time":0.1294868103}, +{"learn":[0.07526269779],"iteration":17,"passed_time":0.06711008939,"remaining_time":0.1193068256}, +{"learn":[0.06268846321],"iteration":18,"passed_time":0.06768000763,"remaining_time":0.1104252756}, +{"learn":[0.05362374493],"iteration":19,"passed_time":0.07160355027,"remaining_time":0.1074053254}, +{"learn":[0.04587823084],"iteration":20,"passed_time":0.07205487013,"remaining_time":0.09950434447}, +{"learn":[0.03940739145],"iteration":21,"passed_time":0.07219507641,"remaining_time":0.0918846427}, +{"learn":[0.03396654535],"iteration":22,"passed_time":0.07238429637,"remaining_time":0.08497286966}, +{"learn":[0.0283153494],"iteration":23,"passed_time":0.07257083526,"remaining_time":0.07861840487}, +{"learn":[0.02431601344],"iteration":24,"passed_time":0.07274715604,"remaining_time":0.07274715604}, +{"learn":[0.02028646573],"iteration":25,"passed_time":0.07293322932,"remaining_time":0.06732298091}, +{"learn":[0.01693585957],"iteration":26,"passed_time":0.07308056369,"remaining_time":0.06225381351}, +{"learn":[0.01414882591],"iteration":27,"passed_time":0.07323423661,"remaining_time":0.05754118591}, +{"learn":[0.01183896794],"iteration":28,"passed_time":0.07350821259,"remaining_time":0.05323008498}, +{"learn":[0.009906073428],"iteration":29,"passed_time":0.07368584825,"remaining_time":0.04912389883}, +{"learn":[0.008624575516],"iteration":30,"passed_time":0.07389072582,"remaining_time":0.04528786421}, 
+{"learn":[0.007223667424],"iteration":31,"passed_time":0.07407611524,"remaining_time":0.04166781482}, +{"learn":[0.00605654099],"iteration":32,"passed_time":0.07424812302,"remaining_time":0.03824903307}, +{"learn":[0.005085089643],"iteration":33,"passed_time":0.07443328806,"remaining_time":0.03502742967}, +{"learn":[0.004273904823],"iteration":34,"passed_time":0.07461059825,"remaining_time":0.03197597068}, +{"learn":[0.003596032008],"iteration":35,"passed_time":0.07478362838,"remaining_time":0.02908252215}, +{"learn":[0.003101566493],"iteration":36,"passed_time":0.07496663232,"remaining_time":0.02633962757}, +{"learn":[0.002677233959],"iteration":37,"passed_time":0.07515916974,"remaining_time":0.02373447465}, +{"learn":[0.002312693569],"iteration":38,"passed_time":0.07534052031,"remaining_time":0.02124989034}, +{"learn":[0.002014958717],"iteration":39,"passed_time":0.07554323296,"remaining_time":0.01888580824}, +{"learn":[0.001755565836],"iteration":40,"passed_time":0.07573511792,"remaining_time":0.01662478198}, +{"learn":[0.001529576385],"iteration":41,"passed_time":0.07590874078,"remaining_time":0.01445880777}, +{"learn":[0.001332687989],"iteration":42,"passed_time":0.07606214567,"remaining_time":0.01238220976}, +{"learn":[0.001161152485],"iteration":43,"passed_time":0.07621032472,"remaining_time":0.01039231701}, +{"learn":[0.001011704525],"iteration":44,"passed_time":0.07638826593,"remaining_time":0.008487585104}, +{"learn":[0.0008814993786],"iteration":45,"passed_time":0.07655899022,"remaining_time":0.006657303498}, +{"learn":[0.0007680587495],"iteration":46,"passed_time":0.07675038047,"remaining_time":0.004898960456}, +{"learn":[0.0006692235685],"iteration":47,"passed_time":0.07692142334,"remaining_time":0.003205059306}, +{"learn":[0.0005831128718],"iteration":48,"passed_time":0.07707417959,"remaining_time":0.001572942441}, +{"learn":[0.0005080879754],"iteration":49,"passed_time":0.07723189595,"remaining_time":0} +]} \ No newline at end of file diff --git 
a/catboost_info/learn/events.out.tfevents b/catboost_info/learn/events.out.tfevents new file mode 100644 index 0000000000000000000000000000000000000000..5dcfc9e11856200c628bdac0adfeea813ecea0ee GIT binary patch literal 2398 zcmZ|OTSyaN6bEpvdE1;i=bFx(^U_M4u9*h8yfwwlCaVb&;mmbWQG_5XM6*InY%R@j zrG>F_l2I%uRu5@l*@9+T+QnOqN=pi~h*qE#?GWeT?0Y&7KmPEY|Mv;qoagmqV4~08 z@$Pi`cxEh@$zd|KE7jS&^M=FeDgQ=`y=Y1VI_Aq&)qfq}j=U^d4Zf~YtfBof(DUdZQm^@1NO8fVkq8~NMY&VtW+)vl(!5Aqo$$H6Pg4DPgNBA*|=8+__l+XvdS zkY|o$f>%oSNNMkjd|{3Ryyi(+1?}0$*O&N%e{b)rqP-vTAJsoooqhE+DqQT-ANi5~ zQSimBo0{nS0OT7N%-|=wFGSIvgM2|$2l&ap+V!*#ME<+SG^-(4O7-YBue(S8N;$3!gf&oqg}wC5vl_&&SQ*_XXzCWQ9E z$nQ#g1m2iu8lrs&@;0dnyec-u#s3u`A9$}BeBEVF1Dzj=e9dqv_>Q z|MtYe9NLE=&r?Q%4@uVRXfH&bpU(!rG~4M$`*7qNZhe+J_n%$UwVC!J#}~ zx_F-v$Unbi0zayqKSJk6A|K|f11}c#*k~VxeA%@M@cnwuH0`62?@{D|pYs!LrTr@8 zn+{09>lZ98`Vu35x`hv3z-4Ob{ME?sV>7^y%n$9L{Tk#Mp))DYzIre=&ZX_VLJ929<)(5Bror y`vl}q73P8;H13_Jy%hPeDk=CZN4c5yYmsM727_-e*y2Td8S=?85Adyufyh67bX#Kp literal 0 HcmV?d00001 diff --git a/catboost_info/learn_error.tsv b/catboost_info/learn_error.tsv new file mode 100644 index 0000000..5631041 --- /dev/null +++ b/catboost_info/learn_error.tsv @@ -0,0 +1,51 @@ +iter RMSE +0 0.9710631926 +1 0.8247710073 +2 0.7022282601 +3 0.6270477627 +4 0.5346013406 +5 0.444717163 +6 0.3839413771 +7 0.3332464887 +8 0.2896858851 +9 0.2546482153 +10 0.2217991067 +11 0.1892739767 +12 0.1682408917 +13 0.1400203532 +14 0.1217420713 +15 0.1038814836 +16 0.08639816643 +17 0.07526269779 +18 0.06268846321 +19 0.05362374493 +20 0.04587823084 +21 0.03940739145 +22 0.03396654535 +23 0.0283153494 +24 0.02431601344 +25 0.02028646573 +26 0.01693585957 +27 0.01414882591 +28 0.01183896794 +29 0.009906073428 +30 0.008624575516 +31 0.007223667424 +32 0.00605654099 +33 0.005085089643 +34 0.004273904823 +35 0.003596032008 +36 0.003101566493 +37 0.002677233959 +38 0.002312693569 +39 0.002014958717 +40 0.001755565836 +41 0.001529576385 +42 0.001332687989 +43 0.001161152485 +44 0.001011704525 +45 
0.0008814993786 +46 0.0007680587495 +47 0.0006692235685 +48 0.0005831128718 +49 0.0005080879754 diff --git a/catboost_info/time_left.tsv b/catboost_info/time_left.tsv new file mode 100644 index 0000000..911fdf9 --- /dev/null +++ b/catboost_info/time_left.tsv @@ -0,0 +1,51 @@ +iter Passed Remaining +0 56 2790 +1 57 1383 +2 58 911 +3 60 700 +4 61 553 +5 61 453 +6 62 382 +7 62 329 +8 63 287 +9 63 253 +10 63 226 +11 64 204 +12 64 184 +13 65 168 +14 65 153 +15 66 140 +16 66 129 +17 67 119 +18 67 110 +19 71 107 +20 72 99 +21 72 91 +22 72 84 +23 72 78 +24 72 72 +25 72 67 +26 73 62 +27 73 57 +28 73 53 +29 73 49 +30 73 45 +31 74 41 +32 74 38 +33 74 35 +34 74 31 +35 74 29 +36 74 26 +37 75 23 +38 75 21 +39 75 18 +40 75 16 +41 75 14 +42 76 12 +43 76 10 +44 76 8 +45 76 6 +46 76 4 +47 76 3 +48 77 1 +49 77 0 diff --git a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py new file mode 100644 index 0000000..8ce6b8a --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py @@ -0,0 +1,136 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +from sklearn.preprocessing import MinMaxScaler + + +class Normalize(MariaMagic): + """ + %normalize [columns=col1,col2,...] [feature_range=0,1] [inplace=True|False] + + Scales numeric columns to a fixed range (default 0–1) using sklearn's MinMaxScaler. + + - columns: list of columns to normalize. If omitted, all numeric columns are used. + - feature_range: lower and upper bounds for scaling (default: 0,1) + - inplace: if True (default), modifies data["last_select"] in-place. + if False, stores result in data["last_select_normalized"]. 
+ + Examples: + %normalize + %normalize columns=age,salary + %normalize feature_range=5,10 inplace=False + """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "normalize" + + def help(self): + return ( + "%normalize [columns=col1,col2,...] [feature_range=0,1] [inplace=True|False]\n" + "Normalize numeric columns using MinMaxScaler (in-place by default)." + ) + + def _str_to_obj(self, s): + try: + return int(s) + except ValueError: + try: + return float(s) + except ValueError: + pass + try: + return bool(util.strtobool(s)) + except Exception: + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df): + try: + html = df.to_html(index=False) + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {"text/html": html}, "metadata": {}}) + except Exception: + pass + + def execute(self, kernel, data): + df = data.get("last_select") + if df is None or df.empty: + kernel._send_message("stderr", "No last_select found or DataFrame is empty.") + return + + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments. 
Use key=value syntax.") + return + + columns_arg = args.get("columns", None) + if isinstance(columns_arg, str): + columns = [c.strip() for c in columns_arg.split(",") if c.strip()] + else: + columns = None + + feature_range_arg = args.get("feature_range", "0,1") + if isinstance(feature_range_arg, str): + parts = [p.strip() for p in feature_range_arg.split(",")] + if len(parts) == 2: + feature_range = (float(parts[0]), float(parts[1])) + else: + kernel._send_message("stderr", "feature_range must be provided as 'min,max'.") + return + else: + feature_range = (0, 1) + + inplace = bool(args.get("inplace", True)) + target_df = df if inplace else df.copy(deep=True) + + # Select numeric columns + if columns is None: + target_columns = [c for c in target_df.columns if pd.api.types.is_numeric_dtype(target_df[c])] + else: + missing_cols = [c for c in columns if c not in target_df.columns] + if missing_cols: + kernel._send_message("stderr", f"Missing columns: {', '.join(missing_cols)}") + return + target_columns = columns + + if not target_columns: + kernel._send_message("stderr", "No numeric columns to normalize.") + return + + try: + scaler = MinMaxScaler(feature_range=feature_range) + target_df[target_columns] = scaler.fit_transform(target_df[target_columns]) + msg = f"Normalized {len(target_columns)} column(s) to range {feature_range}." + except Exception as e: + kernel._send_message("stderr", f"Error during normalization: {e}") + return + + if inplace: + data["last_select"] = target_df + msg += " Updated data['last_select'] in-place." + else: + data["last_select_normalized"] = target_df + msg += " Stored in data['last_select_normalized']." 
+ + kernel._send_message("stdout", msg) + self._send_html(kernel, target_df) diff --git a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/splitdata.py b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/splitdata.py new file mode 100644 index 0000000..9cb8071 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/splitdata.py @@ -0,0 +1,230 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +from sklearn.model_selection import train_test_split + + +class SplitData(MariaMagic): + """ + %splitdata [test_size=0.2] [val_size=0.1] [stratify=colname] [shuffle=True|False] + [random_state=42] [inplace=True|False] [train_name=last_select_train] + [test_name=last_select_test] [val_name=last_select_val] + + Split the current data["last_select"] DataFrame into train/test/(validation). + + - test_size: float fraction (0-1) or int count. Interpreted relative to the original dataset. + Default: 0.2 + - val_size: float fraction (0-1) or int count. Interpreted relative to the original dataset. + If 0 (default), no validation set is created. + - stratify: column name to stratify on (must exist in the DataFrame). + - shuffle: whether to shuffle before splitting (default True). + - random_state: integer seed for reproducibility (default None). + - inplace: if True (default), sets data["last_select"] to the training set and also stores + test/val under the provided names. If False, original last_select is kept and train/test/val + are stored under the provided names. + - train_name/test_name/val_name: keys under which resulting DataFrames will be stored in `data`. + Defaults: last_select_train, last_select_test, last_select_val + + Behavior notes: + - test_size and val_size may be integers (counts) or floats (fractions of the original dataset). 
+ - If both fractions are provided, the code first removes the test set (test_size of original), + then splits the remaining data to create the validation set. The computed relative fraction + for the second split uses val_size relative to the original dataset (so results match user intent). + - If val_size is 0 or not provided, only train/test split occurs. + + Examples: + %splitdata + %splitdata test_size=0.25 val_size=0.1 stratify=target random_state=123 + %splitdata test_size=100 val_size=50 inplace=False + """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "splitdata" + + def help(self): + return ( + "%splitdata [test_size=0.2] [val_size=0.1] [stratify=colname] [shuffle=True|False]\n" + "[random_state=42] [inplace=True|False] [train_name=name] [test_name=name] [val_name=name]\n" + "Split last_select into train/test/(val)." + ) + + def _str_to_obj(self, s): + try: + return int(s) + except ValueError: + try: + return float(s) + except ValueError: + pass + try: + return bool(util.strtobool(s)) + except Exception: + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df, title=None): + try: + html = df.to_html(index=False) + if title: + html = f"

{title}

" + html + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {"text/html": html}, "metadata": {}}) + except Exception: + pass + + def execute(self, kernel, data): + df = data.get("last_select") + if df is None or df.empty: + kernel._send_message("stderr", "No last_select found or DataFrame is empty.") + return + + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + return + + # Defaults + test_size_arg = args.get("test_size", 0.2) + val_size_arg = args.get("val_size", 0.0) + stratify_col = args.get("stratify", None) + shuffle = bool(args.get("shuffle", True)) + random_state = args.get("random_state", None) + inplace = bool(args.get("inplace", True)) + + train_name = args.get("train_name", "last_select_train") + test_name = args.get("test_name", "last_select_test") + val_name = args.get("val_name", "last_select_val") + + # Validate dataset + n_total = len(df) + if n_total == 0: + kernel._send_message("stderr", "DataFrame has no rows to split.") + return + + # Helper to interpret sizes (int count or fraction) + def interpret_size(size_arg, total): + if isinstance(size_arg, int): + if size_arg < 0: + raise ValueError("Sizes must be non-negative.") + return float(size_arg) / total + try: + size_f = float(size_arg) + except Exception: + raise ValueError("Size must be an int or float.") + if size_f < 0: + raise ValueError("Sizes must be non-negative.") + if 0 <= size_f < 1: + return size_f + # If provided >=1 and integer-like, treat as count + if size_f >= 1 and abs(size_f - int(size_f)) < 1e-9: + if int(size_f) > total: + raise ValueError("Size count larger than dataset.") + return float(int(size_f)) / total + # fractions >= 1 are invalid + raise ValueError("If numeric and >=1, size must be an integer count <= total rows.") + + try: + test_frac = interpret_size(test_size_arg, n_total) + val_frac = interpret_size(val_size_arg, n_total) + except ValueError as 
e: + kernel._send_message("stderr", f"Error interpreting sizes: {e}") + return + + if test_frac + val_frac >= 1.0: + kernel._send_message("stderr", "Sum of test_size and val_size must be less than 1.0.") + return + + # Prepare stratify arrays if requested + stratify_arr = None + if stratify_col: + if stratify_col not in df.columns: + kernel._send_message("stderr", f"Stratify column '{stratify_col}' not found in DataFrame.") + return + stratify_arr = df[stratify_col].values + + try: + # First split off the test set (test_frac of original) + if test_frac > 0: + train_val_df, test_df = train_test_split( + df, + test_size=test_frac, + shuffle=shuffle, + random_state=random_state, + stratify=stratify_arr if stratify_arr is not None else None + ) + else: + train_val_df = df.copy(deep=True) + test_df = pd.DataFrame(columns=df.columns) + + # If no val requested, train = train_val_df + if val_frac <= 0: + train_df = train_val_df + val_df = pd.DataFrame(columns=df.columns) + else: + # We need to compute val fraction relative to the remaining (train_val_df). + # val_frac was relative to original; relative fraction = val_frac / (1 - test_frac) + rel_val_frac = val_frac / (1.0 - test_frac) + # For stratify on second split, use stratify column restricted to train_val_df if provided + stratify_arr_second = None + if stratify_arr is not None: + stratify_arr_second = train_val_df[stratify_col].values + train_df, val_df = train_test_split( + train_val_df, + test_size=rel_val_frac, + shuffle=shuffle, + random_state=random_state, + stratify=stratify_arr_second if stratify_arr_second is not None else None + ) + + # Store results in data dict + data[test_name] = test_df + data[val_name] = val_df + data[train_name] = train_df + + if inplace: + # follow behavior of other magics: set last_select to training set + data["last_select"] = train_df + + # Report sizes + msg = ( + f"Split completed: total={n_total}, train={len(train_df)}, " + f"test={len(test_df)}, val={len(val_df)}." 
+ ) + kernel._send_message("stdout", msg) + + # Display small previews + # Show train + validation (if exists) and test + try: + if not train_df.empty: + self._send_html(kernel, train_df.head(20), title=f"Train ({len(train_df)} rows)") + if not val_df.empty: + self._send_html(kernel, val_df.head(20), title=f"Validation ({len(val_df)} rows)") + if not test_df.empty: + self._send_html(kernel, test_df.head(20), title=f"Test ({len(test_df)} rows)") + except Exception: + # non-fatal; already stored the DataFrames + pass + + except Exception as e: + kernel._send_message("stderr", f"Error during splitting: {e}") + return diff --git a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/standardize.py b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/standardize.py new file mode 100644 index 0000000..f288973 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/standardize.py @@ -0,0 +1,124 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +from sklearn.preprocessing import StandardScaler + + +class Standardize(MariaMagic): + """ + %standardize [columns=col1,col2,...] [inplace=True|False] + + Standardizes numeric columns using sklearn's StandardScaler + (zero mean and unit variance). + + - columns: comma-separated list of columns to standardize. + If omitted, all numeric columns are used. + - inplace: if True (default), modifies data["last_select"] in-place. + if False, stores result in data["last_select_standardized"]. + + Examples: + %standardize + %standardize columns=age,salary inplace=False + """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "standardize" + + def help(self): + return ( + "%standardize [columns=col1,col2,...] 
[inplace=True|False]\n" + "Standardizes numeric columns using sklearn's StandardScaler (in-place by default)." + ) + + def _str_to_obj(self, s): + try: + return int(s) + except ValueError: + try: + return float(s) + except ValueError: + pass + try: + return bool(util.strtobool(s)) + except Exception: + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df): + try: + html = df.to_html(index=False) + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {"text/html": html}, "metadata": {}}) + except Exception: + pass + + def execute(self, kernel, data): + df = data.get("last_select") + if df is None or df.empty: + kernel._send_message("stderr", "No last_select found or DataFrame is empty.") + return + + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments. 
Use key=value syntax.") + return + + columns_arg = args.get("columns", None) + if isinstance(columns_arg, str): + columns = [c.strip() for c in columns_arg.split(",") if c.strip()] + else: + columns = None + + inplace = bool(args.get("inplace", True)) + target_df = df if inplace else df.copy(deep=True) + + if columns is None: + target_columns = [c for c in target_df.columns if pd.api.types.is_numeric_dtype(target_df[c])] + else: + missing_cols = [c for c in columns if c not in target_df.columns] + if missing_cols: + kernel._send_message("stderr", f"Missing columns: {', '.join(missing_cols)}") + return + target_columns = columns + + if not target_columns: + kernel._send_message("stderr", "No numeric columns to standardize.") + return + + try: + scaler = StandardScaler() + target_df[target_columns] = scaler.fit_transform(target_df[target_columns]) + msg = f"Standardized {len(target_columns)} column(s) (mean=0, std=1)." + except Exception as e: + kernel._send_message("stderr", f"Error during standardization: {e}") + return + + if inplace: + data["last_select"] = target_df + msg += " Updated data['last_select'] in-place." + else: + data["last_select_standardized"] = target_df + msg += " Stored in data['last_select_standardized']." + + kernel._send_message("stdout", msg) + self._send_html(kernel, target_df) diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/evaluate_model.py b/mariadb_kernel/maria_magics/ml_commands/model_training/evaluate_model.py new file mode 100644 index 0000000..7f01c63 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/evaluate_model.py @@ -0,0 +1,288 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. 
+ +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +import numpy as np +import joblib +import json + +from sklearn.metrics import ( + accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, + mean_squared_error, mean_absolute_error, r2_score, + roc_auc_score, classification_report +) +from sklearn.preprocessing import LabelEncoder + +class EvaluateModel(MariaMagic): + """ + %evaluate_model [model_name=last_model] [test_name=last_select_test] [pred_name=last_preds] + [problem=classification|regression] + + Evaluate a previously trained model stored in `data[model_name]` using the test + DataFrame `data[test_name]`. Outputs metrics, confusion matrix, ROC AUC (if applicable), + and displays a table with actual vs predicted (and probabilities if available). + """ + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "evaluate_model" + + def help(self): + return "Evaluate a trained model on a test DataFrame and show metrics + predictions." + + def _str_to_obj(self, s): + # same helper as in TrainModel + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + try: + return json.loads(s) + except Exception: + pass + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df, title=None): + try: + html = df.to_html(index=False) + if title: + html = f"

{title}

" + html + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {"text/html": html}, "metadata": {}}) + except Exception: + pass + + def execute(self, kernel, data): + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + return + + model_store_name = args.get("model_name", args.get("model", "last_model")) + test_name = args.get("test_name", "last_select_test") + pred_name = args.get("pred_name", "last_preds") + problem_override = args.get("problem", None) + + # fetch model + model = data.get(model_store_name) + if model is None: + kernel._send_message("stderr", f"No model found in data['{model_store_name}']. Train and save a model first.") + return + + # fetch test set + test_df = data.get(test_name) + if test_df is None or not isinstance(test_df, pd.DataFrame) or test_df.empty: + kernel._send_message("stderr", f"No test DataFrame found in data['{test_name}'] or it is empty.") + return + + # try to infer problem from model type if not provided + if problem_override: + problem = problem_override.lower() + else: + is_classifier = any(attr in dir(model) for attr in ("predict_proba", "decision_function", "classes_")) + problem = "classification" if is_classifier else "regression" + + # get meta (features + target) from training metadata + meta = data.get(model_store_name + "_meta", {}) or {} + features = meta.get("features") + target_col = meta.get("target") or meta.get("target_col") + + # fallback: if no target in meta, try to infer target as the only non-feature column + if not target_col: + if features: + possible_targets = [c for c in test_df.columns if c not in features] + if len(possible_targets) == 1: + target_col = possible_targets[0] + + if not target_col: + kernel._send_message("stderr", "Target column not found in model meta and could not be inferred from test DataFrame. 
" + "Set data[model_name + '_meta']['target']='' when training, or pass target info in meta.") + return + + if target_col not in test_df.columns: + kernel._send_message("stderr", f"Target column '{target_col}' not present in test DataFrame '{test_name}'.") + return + + if not features: + kernel._send_message("stderr", "Model metadata does not contain 'features' list. Cannot build X_test.") + return + + missing_features = [c for c in features if c not in test_df.columns] + if missing_features: + kernel._send_message("stderr", f"Test DataFrame missing feature columns: {', '.join(missing_features)}") + return + + X_test = test_df[features].copy() + y_true_orig = test_df[target_col].copy() # preserve original values for display + + # Predict + try: + preds_raw = model.predict(X_test) + except Exception as e: + kernel._send_message("stderr", f"Error during prediction: {e}") + return + + # Try predict_proba + pred_proba = None + if problem == "classification" and hasattr(model, "predict_proba"): + try: + proba = model.predict_proba(X_test) + if proba.ndim == 2 and proba.shape[1] == 2: + pred_proba = proba[:, 1].tolist() + else: + pred_proba = proba.tolist() + except Exception: + pred_proba = None + + # Try to build preds_display (human-readable) + preds_display = preds_raw + model_classes = getattr(model, "classes_", None) + try: + # if model has classes_ and preds_raw are indices, map to class labels + if model_classes is not None and pd.api.types.is_integer_dtype(np.asarray(preds_raw).dtype): + preds_display = np.asarray(model_classes)[np.asarray(preds_raw).astype(int)] + # if preds_raw are numeric but classes_ are strings, try mapping by index + elif model_classes is not None and not pd.api.types.is_numeric_dtype(model_classes): + # if preds_raw are label indices (ints) handle above; else if preds_raw are strings keep as-is + pass + except Exception: + preds_display = preds_raw + + # Construct predictions DataFrame + preds_df = test_df.copy(deep=True) + 
preds_df["_predicted"] = preds_display + if pred_proba is not None: + preds_df["_pred_proba"] = pred_proba + + data[pred_name] = preds_df + + # --- Metrics: ensure consistent types for y_true and preds --- + out_lines = [] + if problem == "classification": + # build arrays for metrics + y_true_vals = np.asarray(y_true_orig) + preds_vals = np.asarray(preds_display) + + # If types are mixed (numbers and strings), cast both to str for label-based metrics + def is_mixed(a, b): + return (pd.api.types.is_numeric_dtype(a) and not pd.api.types.is_numeric_dtype(b)) or \ + (pd.api.types.is_numeric_dtype(b) and not pd.api.types.is_numeric_dtype(a)) + + if is_mixed(y_true_vals.dtype, preds_vals.dtype): + y_metric = np.asarray(y_true_orig.astype(str)) + p_metric = np.asarray(pd.Series(preds_display).astype(str)) + else: + # prefer numeric if both numeric; else use original dtype (strings) + if pd.api.types.is_numeric_dtype(y_true_vals) and pd.api.types.is_numeric_dtype(preds_vals): + y_metric = y_true_vals.astype(float) + p_metric = preds_vals.astype(float) + else: + y_metric = np.asarray(y_true_orig.astype(str)) + p_metric = np.asarray(pd.Series(preds_display).astype(str)) + + # compute basic metrics (these accept string labels fine) + try: + acc = accuracy_score(y_metric, p_metric) + prec = precision_score(y_metric, p_metric, average="weighted", zero_division=0) + rec = recall_score(y_metric, p_metric, average="weighted", zero_division=0) + f1 = f1_score(y_metric, p_metric, average="weighted", zero_division=0) + cm = confusion_matrix(y_metric, p_metric) + except Exception as e: + kernel._send_message("stderr", f"Error computing classification metrics: {e}") + return + + out_lines.append(f"Classification metrics (model: '{model_store_name}')") + out_lines.append(f" accuracy = {acc:.4f}") + out_lines.append(f" precision (weighted) = {prec:.4f}") + out_lines.append(f" recall (weighted) = {rec:.4f}") + out_lines.append(f" f1 (weighted) = {f1:.4f}") + out_lines.append(" Confusion 
matrix (rows=actual, cols=predicted):") + out_lines.append(str(cm.tolist())) + + # ROC AUC: attempt only if predict_proba available and we can map y_true to integer indices + roc_text = " ROC AUC not available." + if pred_proba is not None and model_classes is not None: + try: + # map true labels to indices using model.classes_ + class_to_idx = {str(c): i for i, c in enumerate(model_classes)} + y_idx = np.array([class_to_idx.get(str(v), None) for v in y_true_orig]) + if None in y_idx: + roc_text = " ROC AUC not computable: some test classes not present in model.classes_." + else: + proba_arr = np.asarray(pred_proba) + if proba_arr.ndim == 1: + # binary case + roc_auc = roc_auc_score(y_idx.astype(int), proba_arr.astype(float)) + roc_text = f" ROC AUC (binary) = {roc_auc:.4f}" + else: + roc_auc = roc_auc_score(y_idx.astype(int), proba_arr, multi_class="ovr", average="weighted") + roc_text = f" ROC AUC (multiclass OVR, weighted) = {roc_auc:.4f}" + except Exception: + roc_text = " ROC AUC computation failed." + elif pred_proba is not None and model_classes is None: + roc_text = " ROC AUC not computed: model.classes_ missing." 
+ out_lines.append(roc_text) + + # classification report with readable labels if possible + try: + target_names = None + if model_classes is not None: + target_names = [str(c) for c in model_classes] + report = classification_report(y_metric, p_metric, zero_division=0, target_names=target_names) + out_lines.append("\nClassification report:\n" + report) + except Exception: + pass + + else: + # regression metrics + try: + preds_num = np.asarray(preds_raw).astype(float) + y_true_num = np.asarray(y_true_orig).astype(float) + rmse = float(np.sqrt(mean_squared_error(y_true_num, preds_num))) + mae = float(mean_absolute_error(y_true_num, preds_num)) + r2 = float(r2_score(y_true_num, preds_num)) + out_lines.append(f"Regression metrics (model: '{model_store_name}')") + out_lines.append(f" RMSE = {rmse:.4f}") + out_lines.append(f" MAE = {mae:.4f}") + out_lines.append(f" R2 = {r2:.4f}") + except Exception as e: + kernel._send_message("stderr", f"Error computing regression metrics: {e}") + return + + # send textual summary + kernel._send_message("stdout", "\n".join(out_lines)) + + # display HTML preview of predictions (actual vs predicted) + try: + self._send_html(kernel, preds_df.head(200), + title=f"Predictions (actual={target_col} | predicted=_predicted). Showing up to 200 rows.") + except Exception: + pass + + return diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/train_model.py b/mariadb_kernel/maria_magics/ml_commands/model_training/train_model.py new file mode 100644 index 0000000..10c8313 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/train_model.py @@ -0,0 +1,297 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. 
+ +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +import numpy as np +import joblib +import json + +from sklearn.model_selection import cross_val_score +from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor +from sklearn.svm import SVC +from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor +from sklearn.neural_network import MLPClassifier, MLPRegressor +from sklearn.metrics import ( + accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, + mean_squared_error, mean_absolute_error, r2_score +) + +# Optional external libraries +_XGBOOST_AVAILABLE = False +_LIGHTGBM_AVAILABLE = False +_CATBOOST_AVAILABLE = False +try: + from xgboost import XGBClassifier, XGBRegressor + _XGBOOST_AVAILABLE = True +except Exception: + pass + +try: + from lightgbm import LGBMClassifier, LGBMRegressor + _LIGHTGBM_AVAILABLE = True +except Exception: + pass + +try: + from catboost import CatBoostClassifier, CatBoostRegressor + _CATBOOST_AVAILABLE = True +except Exception: + pass + + +class TrainModel(MariaMagic): + """ + %train_model model= features=col1,col2 target=target_col + [cv=0] [problem=classification|regression] + [model_name=last_model] [pred_name=last_preds] [test_name=last_select_test] + [save_path=/path/to/model.joblib] [inplace=True|False] [model_params={'n':1}] + + Train a model on data["last_select"] (TRAINING set). This magic DOES NOT perform + splitting or scaling — run your preprocessing and %splitdata beforehand. + """ + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "train_model" + + def help(self): + return "Train a model on data['last_select'] (no split or scaling)." 
+ + def _str_to_obj(self, s): + # try int/float/bool, then JSON, then string unquote + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + # try json + try: + return json.loads(s) + except Exception: + pass + # strip quotes + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df, title=None): + try: + html = df.to_html(index=False) + if title: + html = f"

{title}

" + html + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {"text/html": html}, "metadata": {}}) + except Exception: + pass + + def _choose_model(self, name, problem, params=None): + p = params or {} + name = name.lower() + # Classification vs regression models where appropriate + if name in ("logistic", "logistic_regression", "lr"): + if problem != "classification": + raise ValueError("LogisticRegression is for classification problems.") + return LogisticRegression(max_iter=1000, **p) + if name in ("rf", "random_forest"): + return RandomForestClassifier(**p) if problem == "classification" else RandomForestRegressor(**p) + if name in ("svc", "svm"): + if problem != "classification": + raise ValueError("SVC is for classification problems.") + return SVC(probability=True, **p) + if name in ("linear", "linear_regression"): + if problem != "regression": + raise ValueError("LinearRegression is for regression problems.") + return LinearRegression(**p) + if name == "ridge": + if problem != "regression": + raise ValueError("Ridge is for regression problems.") + return Ridge(**p) + if name == "lasso": + if problem != "regression": + raise ValueError("Lasso is for regression problems.") + return Lasso(**p) + if name == "knn": + return KNeighborsClassifier(**p) if problem == "classification" else KNeighborsRegressor(**p) + if name == "gbm": + return GradientBoostingClassifier(**p) if problem == "classification" else GradientBoostingRegressor(**p) + if name == "ada": + return AdaBoostClassifier(**p) if problem == "classification" else AdaBoostRegressor(**p) + if name == "mlp": + return MLPClassifier(max_iter=1000, **p) if problem == "classification" else MLPRegressor(max_iter=1000, **p) + if name == "xgboost": + if not _XGBOOST_AVAILABLE: + raise ImportError("xgboost not available in this environment.") + return XGBClassifier(**p) if problem == "classification" else XGBRegressor(**p) + if name == "lightgbm": + if not _LIGHTGBM_AVAILABLE: + raise 
ImportError("lightgbm not available in this environment.") + return LGBMClassifier(**p) if problem == "classification" else LGBMRegressor(**p) + if name == "catboost": + if not _CATBOOST_AVAILABLE: + raise ImportError("catboost not available in this environment.") + # CatBoost often prints to stdout; keep default verbose False + p = dict(p) + p.setdefault("verbose", False) + return CatBoostClassifier(**p) if problem == "classification" else CatBoostRegressor(**p) + raise ValueError(f"Unknown model name '{name}'") + + def execute(self, kernel, data): + # Load training DataFrame + df = data.get("last_select") + if df is None or df.empty: + kernel._send_message("stderr", "No last_select found or DataFrame is empty (training set required).") + return + + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + return + + features_arg = args.get("features") + target = args.get("target") + model_name_arg = args.get("model", "rf") + cv = int(args.get("cv", 0) or 0) + problem_override = args.get("problem", None) + test_name = args.get("test_name", "last_select_test") + model_store_name = args.get("model_name", "last_model") + # pred_name and save_path intentionally ignored/removed + inplace = bool(args.get("inplace", True)) + model_params = args.get("model_params", {}) or {} + + if not features_arg: + kernel._send_message("stderr", "features argument is required (features=col1,col2...).") + return + if not target: + kernel._send_message("stderr", "target argument is required (target=target_col).") + return + + # parse features + if isinstance(features_arg, str): + features = [c.strip() for c in features_arg.split(",") if c.strip()] + elif isinstance(features_arg, (list, tuple)): + features = list(features_arg) + else: + kernel._send_message("stderr", "features must be comma-separated string or list.") + return + + missing = [c for c in features + [target] if c not in df.columns] + if 
missing: + kernel._send_message("stderr", f"Missing columns in training DataFrame: {', '.join(missing)}") + return + + # Determine problem type + if problem_override: + problem = problem_override.lower() + if problem not in ("classification", "regression"): + kernel._send_message("stderr", "problem must be 'classification' or 'regression'.") + return + else: + # improved heuristic for problem detection + tgt_ser = df[target] + + if pd.api.types.is_numeric_dtype(tgt_ser): + nunique = int(tgt_ser.nunique(dropna=True)) + non_null_count = max(1, len(tgt_ser.dropna())) + uniq_prop = nunique / non_null_count + + # treat as regression if: + # - float dtype, or + # - many distinct values (>20), or + # - distinct proportion high (e.g. >5% of rows) + if pd.api.types.is_float_dtype(tgt_ser) or (nunique > 20) or (uniq_prop > 0.05): + problem = "regression" + else: + # few distinct integer-like values -> classification (categorical target) + problem = "classification" + else: + problem = "classification" + + # Prepare X_train, y_train + X_train = df[features].copy() + y_train = df[target].copy() + + # NOTE: test set (if present) will be ignored in this modified flow — no predictions or metrics. + # Keep reading test_df only to validate presence but do not use it. 
+ test_df = data.get(test_name) + if isinstance(test_df, pd.DataFrame) and not test_df.empty: + missing_test = [c for c in features + [target] if c not in test_df.columns] + if missing_test: + kernel._send_message("stderr", f"Test DataFrame '{test_name}' missing columns: {', '.join(missing_test)}") + return + + # Instantiate model + try: + model = self._choose_model(model_name_arg, problem, params=model_params) + except Exception as e: + kernel._send_message("stderr", f"Error creating model: {e}") + return + + # Cross-validation on training set if requested (kept) + cv_results = None + if cv and cv > 1: + try: + scoring = "accuracy" if problem == "classification" else "r2" + cv_results = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring) + except Exception as e: + kernel._send_message("stderr", f"Error during cross-validation: {e}") + return + + # Fit + try: + model.fit(X_train, y_train) + except Exception as e: + kernel._send_message("stderr", f"Error fitting model: {e}") + return + + # Store only the trained model and minimal meta (no preds, no metrics, no joblib saving) + try: + data[model_store_name] = model + + # Save metadata including target so evaluate_model can find it + meta = data.setdefault(model_store_name + "_meta", {}) + meta["problem"] = problem + meta["features"] = features + meta["target"] = target + + # If model exposes classes_, save them for easier decoding later + if hasattr(model, "classes_"): + try: + meta["classes"] = list(getattr(model, "classes_")) + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Error storing model: {e}") + return + + # Output concise summary + out_lines = [f"Model '{model_name_arg}' trained and saved to data['{model_store_name}']. problem={problem}. 
train_rows={len(X_train)}"] + if cv_results is not None: + out_lines.append(f"cross-val (cv={cv}) scores: mean={float(np.mean(cv_results)):.4f}, std={float(np.std(cv_results)):.4f}") + kernel._send_message("stdout", "\n".join(out_lines)) + + return diff --git a/mariadb_kernel/maria_magics/supported_magics.py b/mariadb_kernel/maria_magics/supported_magics.py index c7105b1..34750e1 100644 --- a/mariadb_kernel/maria_magics/supported_magics.py +++ b/mariadb_kernel/maria_magics/supported_magics.py @@ -19,6 +19,11 @@ from mariadb_kernel.maria_magics.ml_commands.data_cleaning.dropoutliers import DropOutliers from mariadb_kernel.maria_magics.ml_commands.data_cleaning.clipoutliers import ClipOutliers from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.encode import Encode +from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.normalize import Normalize +from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.standardize import Standardize +from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.splitdata import SplitData +from mariadb_kernel.maria_magics.ml_commands.model_training.train_model import TrainModel +from mariadb_kernel.maria_magics.ml_commands.model_training.evaluate_model import EvaluateModel def get(): return { @@ -37,4 +42,9 @@ def get(): "dropoutliers": DropOutliers, "clipoutliers": ClipOutliers, "encode": Encode, + "normalize": Normalize, + "standardize": Standardize, + "splitdata": SplitData, + "train_model": TrainModel, + "evaluate_model": EvaluateModel, } From c9eeeb670b09d39982281c9cd1393ed50499bbd6 Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Mon, 27 Oct 2025 15:09:26 +0000 Subject: [PATCH 13/38] Added save model --- Untitled.ipynb | 207 ++++------- .../model_training/evaluate_model.py | 198 +++++++--- .../ml_commands/model_training/savemodel.py | 341 ++++++++++++++++++ .../maria_magics/supported_magics.py | 2 + 4 files changed, 545 insertions(+), 203 deletions(-) create mode 100644 
mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py diff --git a/Untitled.ipynb b/Untitled.ipynb index d5bc045..c5a00bf 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -5000,7 +5000,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "000a2e5b-1918-4371-8b47-d3a4547a1759", "metadata": {}, "outputs": [ @@ -5022,7 +5022,6 @@ " department\n", " age\n", " salary\n", - " department_lbl\n", " \n", " \n", " \n", @@ -5032,7 +5031,6 @@ " HR\n", " 25.0\n", " 48000.0\n", - " 1\n", " \n", " \n", " 20\n", @@ -5040,7 +5038,6 @@ " HR\n", " 25.0\n", " 48000.0\n", - " 1\n", " \n", " \n", " 35\n", @@ -5048,7 +5045,6 @@ " Engineering\n", " 35.0\n", " 700000.0\n", - " 0\n", " \n", " \n", " 8\n", @@ -5056,7 +5052,6 @@ " Sales\n", " 45.0\n", " 65000.0\n", - " 2\n", " \n", " \n", " 18\n", @@ -5064,7 +5059,6 @@ " Unknown\n", " 40.0\n", " 65000.0\n", - " 3\n", " \n", " \n", " 24\n", @@ -5072,7 +5066,6 @@ " Sales\n", " 45.0\n", " 65000.0\n", - " 2\n", " \n", " \n", " 5\n", @@ -5080,7 +5073,6 @@ " Unknown\n", " 35.0\n", " 65000.0\n", - " 3\n", " \n", " \n", " 33\n", @@ -5088,7 +5080,6 @@ " HR\n", " 30.0\n", " 5000.0\n", - " 1\n", " \n", " \n", " 3\n", @@ -5096,7 +5087,6 @@ " Engineering\n", " 35.0\n", " 700000.0\n", - " 0\n", " \n", " \n", " 30\n", @@ -5104,7 +5094,6 @@ " Engineering\n", " 28.0\n", " 72000.0\n", - " 0\n", " \n", " \n", " 17\n", @@ -5112,7 +5101,6 @@ " HR\n", " 30.0\n", " 5000.0\n", - " 1\n", " \n", " \n", " 22\n", @@ -5120,7 +5108,6 @@ " Engineering\n", " 28.0\n", " 72000.0\n", - " 0\n", " \n", " \n", " 7\n", @@ -5128,7 +5115,6 @@ " Sales\n", " 50.0\n", " 65000.0\n", - " 2\n", " \n", " \n", " 2\n", @@ -5136,7 +5122,6 @@ " Unknown\n", " 40.0\n", " 65000.0\n", - " 3\n", " \n", " \n", " 12\n", @@ -5144,7 +5129,6 @@ " HR\n", " 25.0\n", " 48000.0\n", - " 1\n", " \n", " \n", " 34\n", @@ -5152,7 +5136,6 @@ " Unknown\n", " 40.0\n", " 65000.0\n", - " 3\n", " \n", " \n", " 23\n", @@ -5160,7 +5143,6 @@ " Sales\n", " 50.0\n", " 
65000.0\n", - " 2\n", " \n", " \n", " 32\n", @@ -5168,7 +5150,6 @@ " Sales\n", " 45.0\n", " 65000.0\n", - " 2\n", " \n", " \n", " 37\n", @@ -5176,7 +5157,6 @@ " Unknown\n", " 35.0\n", " 65000.0\n", - " 3\n", " \n", " \n", " 25\n", @@ -5184,7 +5164,6 @@ " HR\n", " 30.0\n", " 5000.0\n", - " 1\n", " \n", " \n", "" @@ -5204,7 +5183,6 @@ " department\n", " age\n", " salary\n", - " department_lbl\n", " \n", " \n", " \n", @@ -5214,7 +5192,6 @@ " Unknown\n", " 40.0\n", " 65000.0\n", - " 3\n", " \n", " \n", " 14\n", @@ -5222,7 +5199,6 @@ " Engineering\n", " 28.0\n", " 72000.0\n", - " 0\n", " \n", " \n", " 9\n", @@ -5230,7 +5206,6 @@ " HR\n", " 30.0\n", " 5000.0\n", - " 1\n", " \n", " \n", " 39\n", @@ -5238,7 +5213,6 @@ " Sales\n", " 50.0\n", " 65000.0\n", - " 2\n", " \n", " \n", " 15\n", @@ -5246,7 +5220,6 @@ " Sales\n", " 50.0\n", " 65000.0\n", - " 2\n", " \n", " \n", "" @@ -5266,7 +5239,6 @@ " department\n", " age\n", " salary\n", - " department_lbl\n", " \n", " \n", " \n", @@ -5276,7 +5248,6 @@ " Unknown\n", " 40.0\n", " 65000.0\n", - " 3\n", " \n", " \n", " 16\n", @@ -5284,7 +5255,6 @@ " Sales\n", " 45.0\n", " 65000.0\n", - " 2\n", " \n", " \n", " 11\n", @@ -5292,7 +5262,6 @@ " Engineering\n", " 35.0\n", " 700000.0\n", - " 0\n", " \n", " \n", " 4\n", @@ -5300,7 +5269,6 @@ " HR\n", " 25.0\n", " 48000.0\n", - " 1\n", " \n", " \n", " 38\n", @@ -5308,7 +5276,6 @@ " Engineering\n", " 28.0\n", " 72000.0\n", - " 0\n", " \n", " \n", " 29\n", @@ -5316,7 +5283,6 @@ " Unknown\n", " 35.0\n", " 65000.0\n", - " 3\n", " \n", " \n", " 1\n", @@ -5324,7 +5290,6 @@ " HR\n", " 30.0\n", " 5000.0\n", - " 1\n", " \n", " \n", " 40\n", @@ -5332,7 +5297,6 @@ " Sales\n", " 45.0\n", " 65000.0\n", - " 2\n", " \n", " \n", " 31\n", @@ -5340,7 +5304,6 @@ " Sales\n", " 50.0\n", " 65000.0\n", - " 2\n", " \n", " \n", " 27\n", @@ -5348,7 +5311,6 @@ " Engineering\n", " 35.0\n", " 700000.0\n", - " 0\n", " \n", " \n", " 13\n", @@ -5356,7 +5318,6 @@ " Unknown\n", " 35.0\n", " 65000.0\n", - " 3\n", " \n", " 
\n", " 36\n", @@ -5364,7 +5325,6 @@ " HR\n", " 25.0\n", " 48000.0\n", - " 1\n", " \n", " \n", "" @@ -5601,7 +5561,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "2543a896-7047-45a7-a118-3adcfb822023", "metadata": {}, "outputs": [ @@ -5709,173 +5669,116 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "1d90ce87-aafd-4958-8318-09f66793b98e", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "Classification metrics (model: 'last_model')\n", - " accuracy = 1.0000\n", - " precision (weighted) = 1.0000\n", - " recall (weighted) = 1.0000\n", - " f1 (weighted) = 1.0000\n", - " Confusion matrix (rows=actual, cols=predicted):\n", - "[[3, 0, 0, 0], [0, 3, 0, 0], [0, 0, 3, 0], [0, 0, 0, 3]]\n", - " ROC AUC (multiclass OVR, weighted) = 1.0000\n", - "\n", - "Classification report:\n", - " precision recall f1-score support\n", - "\n", - " Engineering 1.00 1.00 1.00 3\n", - " HR 1.00 1.00 1.00 3\n", - " Sales 1.00 1.00 1.00 3\n", - " Unknown 1.00 1.00 1.00 3\n", - "\n", - " accuracy 1.00 12\n", - " macro avg 1.00 1.00 1.00 12\n", - "weighted avg 1.00 1.00 1.00 12\n", - "\n" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + "
\n", + "

Metrics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Accuracy1.0000
Precision (w)1.0000
Recall (w)1.0000
F1 (w)1.0000
ROC AUC1.0000
\n", + "
\n", + "
\"confusion
\n", + "

Classification report

\n", + "
              precision    recall  f1-score   support\n",
+       "\n",
+       " Engineering       1.00      1.00      1.00         3\n",
+       "          HR       1.00      1.00      1.00         3\n",
+       "       Sales       1.00      1.00      1.00         3\n",
+       "     Unknown       1.00      1.00      1.00         3\n",
+       "\n",
+       "    accuracy                           1.00        12\n",
+       "   macro avg       1.00      1.00      1.00        12\n",
+       "weighted avg       1.00      1.00      1.00        12\n",
+       "
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { "data": { "text/html": [ - "

Predictions (actual=department | predicted=_predicted). Showing up to 200 rows.

\n", + "

Predictions preview (actual vs predicted)

\n", " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", "
idnamedepartmentagesalarydepartment_lbl_predicted_pred_proba
10BobUnknown40.065000.03Unknown[0.0, 0.0, 0.0, 1.0]
16GraceSales45.065000.02Sales[0.0, 0.0, 0.98, 0.02][0.0, 0.0, 0.97, 0.03]
11CharlieEngineering35.0700000.00Engineering[0.97, 0.0, 0.0, 0.03][0.96, 0.0, 0.0, 0.04]
4DavidHR25.048000.01HR[0.01, 0.97, 0.0, 0.02][0.0, 0.98, 0.0, 0.02]
38FrankEngineering28.072000.00Engineering[0.95, 0.01, 0.0, 0.04][0.99, 0.01, 0.0, 0.0]
29EveUnknown35.065000.03Unknown[0.0, 0.0, 0.0, 1.0][0.01, 0.0, 0.0, 0.99]
1AliceHR30.05000.01HR[0.0, 1.0, 0.0, 0.0]
40GraceSales45.065000.02Sales[0.0, 0.0, 0.98, 0.02][0.0, 0.0, 0.97, 0.03]
31UnknownSales50.065000.02Sales[0.0, 0.0, 1.0, 0.0]
27CharlieEngineering35.0700000.00Engineering[0.97, 0.0, 0.0, 0.03][0.96, 0.0, 0.0, 0.04]
13EveUnknown35.065000.03Unknown[0.0, 0.0, 0.0, 1.0][0.01, 0.0, 0.0, 0.99]
36DavidHR25.048000.01HR[0.01, 0.97, 0.0, 0.02][0.0, 0.98, 0.0, 0.02]
" @@ -5889,6 +5792,24 @@ "%evaluate_model" ] }, + { + "cell_type": "code", + "execution_count": 9, + "id": "3e1a5300-a034-469a-abed-b50108a7f3a9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The %savemodel magic command does not exist\n" + ] + } + ], + "source": [ + "%save_model model_name=last_model save_path=/tmp/test_model.joblib" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/evaluate_model.py b/mariadb_kernel/maria_magics/ml_commands/model_training/evaluate_model.py index 7f01c63..a6020b8 100644 --- a/mariadb_kernel/maria_magics/ml_commands/model_training/evaluate_model.py +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/evaluate_model.py @@ -16,14 +16,20 @@ ) from sklearn.preprocessing import LabelEncoder +import matplotlib +# Use non-interactive backend if needed (safe in most notebook envs) +matplotlib.use("Agg") +import matplotlib.pyplot as plt +import io +import base64 + class EvaluateModel(MariaMagic): """ %evaluate_model [model_name=last_model] [test_name=last_select_test] [pred_name=last_preds] [problem=classification|regression] - Evaluate a previously trained model stored in `data[model_name]` using the test - DataFrame `data[test_name]`. Outputs metrics, confusion matrix, ROC AUC (if applicable), - and displays a table with actual vs predicted (and probabilities if available). + Nice, visual evaluation of a trained model: metrics card, confusion-matrix plot, + classification report and a preview table of actual vs predicted. """ def __init__(self, args=""): self.args = args @@ -37,8 +43,8 @@ def name(self): def help(self): return "Evaluate a trained model on a test DataFrame and show metrics + predictions." 
+ # reuse helpers from previous version def _str_to_obj(self, s): - # same helper as in TrainModel try: return int(s) except Exception: @@ -77,6 +83,45 @@ def _send_html(self, kernel, df, title=None): except Exception: pass + def _send_raw_html(self, kernel, html): + """Send raw HTML to the frontend.""" + try: + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {"text/html": html}, "metadata": {}}) + except Exception: + pass + + def _plot_confusion_matrix_to_datauri(self, cm, labels): + """Draw confusion matrix (matplotlib) and return data URI PNG.""" + fig, ax = plt.subplots(figsize=(6, 5)) + im = ax.imshow(cm, interpolation='nearest') + ax.set_title("Confusion matrix") + ax.set_xlabel("Predicted") + ax.set_ylabel("Actual") + + # Set tick labels + ax.set_xticks(np.arange(len(labels))) + ax.set_yticks(np.arange(len(labels))) + ax.set_xticklabels(labels, rotation=45, ha="right") + ax.set_yticklabels(labels) + + # Annotate cells + thresh = cm.max() / 2.0 if cm.size else 0 + for i in range(cm.shape[0]): + for j in range(cm.shape[1]): + ax.text(j, i, format(int(cm[i, j]), 'd'), + ha="center", va="center", + fontsize=10) + + fig.tight_layout() + + buf = io.BytesIO() + fig.savefig(buf, format="png", bbox_inches="tight") + plt.close(fig) + buf.seek(0) + data = base64.b64encode(buf.read()).decode("ascii") + return f"data:image/png;base64,{data}" + def execute(self, kernel, data): try: args = self.parse_args(self.args) @@ -101,19 +146,19 @@ def execute(self, kernel, data): kernel._send_message("stderr", f"No test DataFrame found in data['{test_name}'] or it is empty.") return - # try to infer problem from model type if not provided + # infer problem if not provided if problem_override: problem = problem_override.lower() else: is_classifier = any(attr in dir(model) for attr in ("predict_proba", "decision_function", "classes_")) problem = "classification" if is_classifier else "regression" - # get meta (features + target) from training metadata + # get meta 
meta = data.get(model_store_name + "_meta", {}) or {} features = meta.get("features") target_col = meta.get("target") or meta.get("target_col") - # fallback: if no target in meta, try to infer target as the only non-feature column + # fallback target inference if not target_col: if features: possible_targets = [c for c in test_df.columns if c not in features] @@ -124,22 +169,19 @@ def execute(self, kernel, data): kernel._send_message("stderr", "Target column not found in model meta and could not be inferred from test DataFrame. " "Set data[model_name + '_meta']['target']='' when training, or pass target info in meta.") return - if target_col not in test_df.columns: kernel._send_message("stderr", f"Target column '{target_col}' not present in test DataFrame '{test_name}'.") return - if not features: kernel._send_message("stderr", "Model metadata does not contain 'features' list. Cannot build X_test.") return - missing_features = [c for c in features if c not in test_df.columns] if missing_features: kernel._send_message("stderr", f"Test DataFrame missing feature columns: {', '.join(missing_features)}") return X_test = test_df[features].copy() - y_true_orig = test_df[target_col].copy() # preserve original values for display + y_true_orig = test_df[target_col].copy() # Predict try: @@ -148,7 +190,7 @@ def execute(self, kernel, data): kernel._send_message("stderr", f"Error during prediction: {e}") return - # Try predict_proba + # predict_proba if available pred_proba = None if problem == "classification" and hasattr(model, "predict_proba"): try: @@ -160,36 +202,30 @@ def execute(self, kernel, data): except Exception: pred_proba = None - # Try to build preds_display (human-readable) + # human-readable preds preds_display = preds_raw model_classes = getattr(model, "classes_", None) try: - # if model has classes_ and preds_raw are indices, map to class labels if model_classes is not None and pd.api.types.is_integer_dtype(np.asarray(preds_raw).dtype): preds_display = 
np.asarray(model_classes)[np.asarray(preds_raw).astype(int)] - # if preds_raw are numeric but classes_ are strings, try mapping by index - elif model_classes is not None and not pd.api.types.is_numeric_dtype(model_classes): - # if preds_raw are label indices (ints) handle above; else if preds_raw are strings keep as-is - pass except Exception: preds_display = preds_raw - # Construct predictions DataFrame + # predictions DataFrame preds_df = test_df.copy(deep=True) preds_df["_predicted"] = preds_display if pred_proba is not None: preds_df["_pred_proba"] = pred_proba - data[pred_name] = preds_df - # --- Metrics: ensure consistent types for y_true and preds --- + # metrics calculation out_lines = [] + metrics_html = "" + cm_image_uri = None if problem == "classification": - # build arrays for metrics y_true_vals = np.asarray(y_true_orig) preds_vals = np.asarray(preds_display) - # If types are mixed (numbers and strings), cast both to str for label-based metrics def is_mixed(a, b): return (pd.api.types.is_numeric_dtype(a) and not pd.api.types.is_numeric_dtype(b)) or \ (pd.api.types.is_numeric_dtype(b) and not pd.api.types.is_numeric_dtype(a)) @@ -198,7 +234,6 @@ def is_mixed(a, b): y_metric = np.asarray(y_true_orig.astype(str)) p_metric = np.asarray(pd.Series(preds_display).astype(str)) else: - # prefer numeric if both numeric; else use original dtype (strings) if pd.api.types.is_numeric_dtype(y_true_vals) and pd.api.types.is_numeric_dtype(preds_vals): y_metric = y_true_vals.astype(float) p_metric = preds_vals.astype(float) @@ -206,7 +241,6 @@ def is_mixed(a, b): y_metric = np.asarray(y_true_orig.astype(str)) p_metric = np.asarray(pd.Series(preds_display).astype(str)) - # compute basic metrics (these accept string labels fine) try: acc = accuracy_score(y_metric, p_metric) prec = precision_score(y_metric, p_metric, average="weighted", zero_division=0) @@ -217,71 +251,115 @@ def is_mixed(a, b): kernel._send_message("stderr", f"Error computing classification metrics: 
{e}") return - out_lines.append(f"Classification metrics (model: '{model_store_name}')") - out_lines.append(f" accuracy = {acc:.4f}") - out_lines.append(f" precision (weighted) = {prec:.4f}") - out_lines.append(f" recall (weighted) = {rec:.4f}") - out_lines.append(f" f1 (weighted) = {f1:.4f}") - out_lines.append(" Confusion matrix (rows=actual, cols=predicted):") - out_lines.append(str(cm.tolist())) - - # ROC AUC: attempt only if predict_proba available and we can map y_true to integer indices - roc_text = " ROC AUC not available." + # ROC AUC if possible + roc_text = "N/A" if pred_proba is not None and model_classes is not None: try: - # map true labels to indices using model.classes_ class_to_idx = {str(c): i for i, c in enumerate(model_classes)} y_idx = np.array([class_to_idx.get(str(v), None) for v in y_true_orig]) if None in y_idx: - roc_text = " ROC AUC not computable: some test classes not present in model.classes_." + roc_text = "Not computable: some test classes missing from model.classes_." else: proba_arr = np.asarray(pred_proba) if proba_arr.ndim == 1: - # binary case roc_auc = roc_auc_score(y_idx.astype(int), proba_arr.astype(float)) - roc_text = f" ROC AUC (binary) = {roc_auc:.4f}" + roc_text = f"{roc_auc:.4f}" else: roc_auc = roc_auc_score(y_idx.astype(int), proba_arr, multi_class="ovr", average="weighted") - roc_text = f" ROC AUC (multiclass OVR, weighted) = {roc_auc:.4f}" + roc_text = f"{roc_auc:.4f}" except Exception: - roc_text = " ROC AUC computation failed." - elif pred_proba is not None and model_classes is None: - roc_text = " ROC AUC not computed: model.classes_ missing." - out_lines.append(roc_text) - - # classification report with readable labels if possible + roc_text = "Computation failed." + + # Prepare metrics HTML card + metrics_html = f""" +
+
+

Metrics

+ + + + + + +
Accuracy{acc:.4f}
Precision (w){prec:.4f}
Recall (w){rec:.4f}
F1 (w){f1:.4f}
ROC AUC{roc_text}
+
+ """ + + # Render confusion matrix as image and embed + # determine label names for axes try: - target_names = None if model_classes is not None: - target_names = [str(c) for c in model_classes] + label_names = [str(c) for c in model_classes] + else: + # derive from the union of unique labels in y_metric and p_metric + uniq = sorted(set(np.unique(y_metric).tolist() + np.unique(p_metric).tolist()), key=lambda x: str(x)) + label_names = [str(x) for x in uniq] + cm_arr = np.asarray(cm, dtype=int) + cm_image_uri = self._plot_confusion_matrix_to_datauri(cm_arr, label_names) + metrics_html += f'
confusion matrix
' + except Exception: + # fallback: textual representation included below + metrics_html += '
Confusion matrix image failed to render.
' + + metrics_html += "
" # close flex container + + # classification report text + try: + target_names = [str(c) for c in model_classes] if model_classes is not None else None report = classification_report(y_metric, p_metric, zero_division=0, target_names=target_names) - out_lines.append("\nClassification report:\n" + report) except Exception: - pass + report = "Classification report not available." else: - # regression metrics + # regression branch try: preds_num = np.asarray(preds_raw).astype(float) y_true_num = np.asarray(y_true_orig).astype(float) rmse = float(np.sqrt(mean_squared_error(y_true_num, preds_num))) mae = float(mean_absolute_error(y_true_num, preds_num)) r2 = float(r2_score(y_true_num, preds_num)) - out_lines.append(f"Regression metrics (model: '{model_store_name}')") - out_lines.append(f" RMSE = {rmse:.4f}") - out_lines.append(f" MAE = {mae:.4f}") - out_lines.append(f" R2 = {r2:.4f}") except Exception as e: kernel._send_message("stderr", f"Error computing regression metrics: {e}") return - # send textual summary - kernel._send_message("stdout", "\n".join(out_lines)) + metrics_html = f""" +
+

Regression metrics

+ + + + +
RMSE{rmse:.4f}
MAE{mae:.4f}
{r2:.4f}
+
+ """ + report = None + + # Build final HTML to display (metrics + classification report text) + html_parts = [ + f"
", + metrics_html + ] + if problem == "classification": + html_parts.append("

Classification report

") + html_parts.append(f"
{report}
") + # also add textual confusion matrix below if image not present + if cm_image_uri is None: + html_parts.append("

Confusion matrix

")
+                html_parts.append(str(cm.tolist()))
+                html_parts.append("
") + html_parts.append("
") + + # send HTML + try: + self._send_raw_html(kernel, "\n".join(html_parts)) + except Exception: + pass - # display HTML preview of predictions (actual vs predicted) + # then show predictions table (actual vs predicted) using your helper try: - self._send_html(kernel, preds_df.head(200), - title=f"Predictions (actual={target_col} | predicted=_predicted). Showing up to 200 rows.") + # show a limited set (up to 200 rows) + display_df = preds_df[[target_col, "_predicted"] + (["_pred_proba"] if "_pred_proba" in preds_df.columns else [])] + self._send_html(kernel, display_df.head(200), title="Predictions preview (actual vs predicted)") except Exception: pass diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py b/mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py new file mode 100644 index 0000000..01d0667 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py @@ -0,0 +1,341 @@ +import os +import tempfile +import datetime +import pickle +import joblib +import shlex +import json +from distutils import util + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic + +class SaveModel(MariaMagic): + """ + %savemodel [model_name=last_model] [save_path=/path/to/model.joblib] + [db_table=] [db_conn_key=mariadb_conn] [db_uri=] + [db_host=...] [db_user=...] [db_password=...] [db_name=...] + [overwrite=True|False] [auto_db=True] + + Save a trained model (stored in data[model_name]) to disk or to a MariaDB table as a BLOB. + + This magic will attempt to automatically detect the active DB connection from: + - common keys in `data`: mariadb_conn, db_conn, conn, connection, engine, sqlalchemy_engine + - attributes on the `kernel` object with the same names + - a connection info dict in data/kernel (e.g. 
connection_info) + """ + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "savemodel" + + def help(self): + return "Save trained model to disk or MariaDB storage (auto-detects active DB connection if possible)." + + def _str_to_obj(self, s): + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + try: + return json.loads(s) + except Exception: + pass + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _detect_connection(self, kernel, data, preferred_key="mariadb_conn"): + """ + Try multiple strategies to obtain a DB-API connection or SQLAlchemy engine. 
+ Returns tuple (conn_obj, cursor_factory, created_conn_bool, info_dict) + - conn_obj: DB-API connection or SQLAlchemy engine/raw_connection + - cursor_factory: callable to obtain a cursor from conn_obj (conn_obj.cursor) + - created_conn_bool: whether this method created the connection (so caller can close it) + - info_dict: dict with connection metadata (e.g., database name) if found + """ + # 1) check data for common keys + keys = [preferred_key, "db_conn", "mariadb_connection", "conn", "connection", "engine", "sqlalchemy_engine", "connection_info"] + for k in keys: + if k in data and data[k] is not None: + obj = data[k] + # SQLAlchemy engine + try: + from sqlalchemy.engine.base import Engine as _Engine # type: ignore + except Exception: + _Engine = None + if _Engine is not None and isinstance(obj, _Engine): + try: + raw_conn = obj.raw_connection() + return raw_conn, (lambda c: c.cursor()), True, {"source": f"data['{k}'] (sqlalchemy engine)"} + except Exception: + pass + # DB-API connection-like + if hasattr(obj, "cursor") and hasattr(obj, "commit"): + return obj, (lambda c: c.cursor()), False, {"source": f"data['{k}'] (db-api conn)"} + # SQLAlchemy connection object (Connection) + if hasattr(obj, "connection"): + try: + raw_conn = obj.connection + return raw_conn, (lambda c: c.cursor()), True, {"source": f"data['{k}'] (sqlalchemy raw connection)"} + except Exception: + pass + # a plain dict of connection params + if isinstance(obj, dict): + return None, None, False, {"conn_params": obj, "source": f"data['{k}'] (params dict)"} + + # 2) check kernel attributes for same keys + for k in keys + ["mariadb_conn", "db_conn", "connection", "conn", "engine", "sqlalchemy_engine", "current_database", "current_db", "_last_use_db", "connection_info"]: + if hasattr(kernel, k): + obj = getattr(kernel, k) + if obj is None: + continue + try: + from sqlalchemy.engine.base import Engine as _Engine # type: ignore + except Exception: + _Engine = None + if _Engine is not None and 
isinstance(obj, _Engine): + try: + raw_conn = obj.raw_connection() + return raw_conn, (lambda c: c.cursor()), True, {"source": f"kernel.{k} (sqlalchemy engine)"} + except Exception: + pass + if hasattr(obj, "cursor") and hasattr(obj, "commit"): + return obj, (lambda c: c.cursor()), False, {"source": f"kernel.{k} (db-api conn)"} + if isinstance(obj, dict): + return None, None, False, {"conn_params": obj, "source": f"kernel.{k} (params dict)"} + + # 3) try to read a small connection-info dict from common locations + for info_key in ("connection_info", "conn_info", "db_info"): + if info_key in data and isinstance(data[info_key], dict): + return None, None, False, {"conn_params": data[info_key], "source": f"data['{info_key}']"} + if hasattr(kernel, info_key): + obj = getattr(kernel, info_key) + if isinstance(obj, dict): + return None, None, False, {"conn_params": obj, "source": f"kernel.{info_key}"} + + # 4) nothing found + return None, None, False, {} + + def execute(self, kernel, data): + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + return + + model_name = args.get("model_name", args.get("model", "last_model")) + save_path = args.get("save_path", None) + db_table = args.get("db_table", None) + db_conn_key = args.get("db_conn_key", "mariadb_conn") + db_uri = args.get("db_uri", None) + overwrite = bool(args.get("overwrite", False)) + auto_db = bool(args.get("auto_db", True)) + + # optional explicit connection details (fallback) + db_host = args.get("db_host") + db_user = args.get("db_user") + db_password = args.get("db_password") + db_name = args.get("db_name") + + model = data.get(model_name) + if model is None: + kernel._send_message("stderr", f"No model found in data['{model_name}']. 
Train and save a model first.") + return + + did_something = False + + # Save to disk if requested + if save_path: + try: + os.makedirs(os.path.dirname(save_path), exist_ok=True) + joblib.dump(model, save_path) + kernel._send_message("stdout", f"Model saved to {save_path}") + did_something = True + except Exception as e: + kernel._send_message("stderr", f"Failed to save model to disk ({save_path}): {e}") + + # If user asked to save to DB, attempt detection and insert + if db_table: + # serialize model to bytes + try: + model_bytes = pickle.dumps(model, protocol=pickle.HIGHEST_PROTOCOL) + except Exception as e: + kernel._send_message("stderr", f"Failed to serialize model with pickle: {e}") + model_bytes = None + + if model_bytes is None: + kernel._send_message("stderr", "Model serialization failed; cannot save to DB.") + else: + conn_obj, cursor_factory, created_conn, info = (None, None, False, {}) + # If db_uri explicitly provided, prefer it (SQLAlchemy) + if db_uri: + try: + from sqlalchemy import create_engine, text + engine = create_engine(db_uri) + raw_conn = engine.raw_connection() + conn_obj = raw_conn + cursor_factory = (lambda c: c.cursor()) + created_conn = True + info = {"source": "db_uri"} + except Exception as e: + kernel._send_message("stderr", f"Could not connect via db_uri: {e}") + conn_obj = None + + # If auto_db requested, attempt to detect connection from kernel/data + if conn_obj is None and auto_db: + detected_conn, cursor_factory, created_conn_flag, info = self._detect_connection(kernel, data, preferred_key=db_conn_key) + conn_obj = detected_conn + created_conn = created_conn_flag + + # If detection returned connection params dict, try to open via mariadb connector + conn_params = info.get("conn_params") if isinstance(info, dict) else None + if conn_obj is None and conn_params: + try: + import mariadb + # rename keys if necessary + host = conn_params.get("host") or conn_params.get("db_host") or conn_params.get("hostaddr") + user = 
conn_params.get("user") or conn_params.get("username") + password = conn_params.get("password") or conn_params.get("passwd") or conn_params.get("db_password") + database = conn_params.get("database") or conn_params.get("db_name") or conn_params.get("schema") + conn_obj = mariadb.connect(host=host, user=user, password=password or "", database=database) + cursor_factory = (lambda c: c.cursor()) + created_conn = True + info["source_detail"] = "opened via mariadb from conn_params" + except Exception as e: + kernel._send_message("stderr", f"Failed to open mariadb connection from params: {e}") + conn_obj = None + + # If nothing found yet but explicit host/user provided on command line, try them + if conn_obj is None and db_host and db_user and db_name: + try: + import mariadb + conn_obj = mariadb.connect(host=db_host, user=db_user, password=db_password or "", database=db_name) + cursor_factory = (lambda c: c.cursor()) + created_conn = True + info = {"source": "db_host/db_user arguments"} + except Exception as e: + kernel._send_message("stderr", f"Could not connect using provided db_host/db_user/db_name: {e}") + conn_obj = None + + # Final check: if conn_obj is still None, return helpful error + if conn_obj is None: + kernel._send_message("stderr", "No usable DB connection detected. Provide one via:\n" + " - data['mariadb_conn'] (DB-API connection), or\n" + " - data['engine'] (SQLAlchemy engine), or\n" + " - db_uri=..., or\n" + " - db_host/db_user/db_name arguments.\n" + "Set auto_db=False to suppress detection and provide explicit params.") + else: + # We have a connection-like object (conn_obj) and a cursor factory. 
+ inserted = False + created_local_conn = created_conn + try: + # Try to obtain a cursor + try: + cursor = cursor_factory(conn_obj) + except Exception: + # fallback: try conn_obj.cursor() + try: + cursor = conn_obj.cursor() + except Exception as e: + raise RuntimeError(f"Could not obtain cursor from connection: {e}") + + # Ensure table exists (simple create) + try: + create_sql = f""" + CREATE TABLE IF NOT EXISTS `{db_table}` ( + id BIGINT AUTO_INCREMENT PRIMARY KEY, + model_name VARCHAR(255), + created_at DATETIME, + model_blob LONGBLOB + ) + """ + try: + cursor.execute(create_sql) + except Exception: + # some drivers need different execution path (SQLAlchemy) + try: + conn_obj.execute(create_sql) + except Exception: + pass + except Exception: + pass + + # If overwrite requested, delete previous with same model_name + now = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + try: + if overwrite: + try: + cursor.execute(f"DELETE FROM `{db_table}` WHERE model_name=%s", (model_name,)) + except Exception: + try: + cursor.execute(f"DELETE FROM `{db_table}` WHERE model_name=:%s", (model_name,)) + except Exception: + pass + except Exception: + pass + + # Insert; adapt paramstyle if necessary + insert_sql = f"INSERT INTO `{db_table}` (model_name, created_at, model_blob) VALUES (%s, %s, %s)" + try: + cursor.execute(insert_sql, (model_name, now, model_bytes)) + except Exception: + # try SQLAlchemy style named params + try: + cursor.execute(insert_sql.replace("%s", ":blob"), {"blob": model_bytes, "model_name": model_name, "created_at": now}) + except Exception as e: + # last resort: use execute with binary literal (unsafe for special bytes) -- avoid + raise + + # commit if method available + try: + conn_obj.commit() + except Exception: + pass + + kernel._send_message("stdout", f"Model stored into DB table '{db_table}' (model_name='{model_name}'). 
source={info.get('source') or info.get('source_detail', 'detected')}") + inserted = True + did_something = True + except Exception as e: + kernel._send_message("stderr", f"Failed to insert model into table '{db_table}': {e}") + finally: + # close created connections only + try: + if created_local_conn and conn_obj: + try: + cursor.close() + except Exception: + pass + try: + conn_obj.close() + except Exception: + pass + except Exception: + pass + + if not inserted: + kernel._send_message("stderr", f"Model was not inserted into DB table '{db_table}'.") + + if not did_something: + kernel._send_message("stderr", "No action taken. Provide save_path and/or db_table to save the model.") + return diff --git a/mariadb_kernel/maria_magics/supported_magics.py b/mariadb_kernel/maria_magics/supported_magics.py index 34750e1..bdeb8d6 100644 --- a/mariadb_kernel/maria_magics/supported_magics.py +++ b/mariadb_kernel/maria_magics/supported_magics.py @@ -24,6 +24,7 @@ from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.splitdata import SplitData from mariadb_kernel.maria_magics.ml_commands.model_training.train_model import TrainModel from mariadb_kernel.maria_magics.ml_commands.model_training.evaluate_model import EvaluateModel +from mariadb_kernel.maria_magics.ml_commands.model_training.savemodel import SaveModel def get(): return { @@ -47,4 +48,5 @@ def get(): "splitdata": SplitData, "train_model": TrainModel, "evaluate_model": EvaluateModel, + "savemodel": SaveModel, } From d198769842a57a8a6c8aec55e7acbfba68416f40 Mon Sep 17 00:00:00 2001 From: JainSneha6 Date: Mon, 27 Oct 2025 16:00:35 +0000 Subject: [PATCH 14/38] Added Select Features --- Untitled.ipynb | 302 ++++++++++++++++-- .../ml_pipeline/select_features.py | 298 +++++++++++++++++ .../maria_magics/supported_magics.py | 2 + 3 files changed, 576 insertions(+), 26 deletions(-) create mode 100644 mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_features.py diff --git a/Untitled.ipynb b/Untitled.ipynb 
index c5a00bf..e3b4e8d 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -33,7 +33,7 @@ "create database test\r\n", "--------------\r\n", "\r\n", - "ERROR 1007 (HY000) at line 1 in file: '/home/sneha/mariadb_kernel/.mariadb_statement_af73b9b2-b262-11f0-b068-00155d4e875d': Can't create database 'test'; database exists\r\n", + "ERROR 1007 (HY000) at line 1 in file: '/home/sneha/mariadb_kernel/.mariadb_statement_744c7db6-b34c-11f0-a961-00155dd935c1': Can't create database 'test'; database exists\r\n", "\u0007\u001b(B\u001b[0;7m\u001b(B\u001b[m" ] }, @@ -74,18 +74,7 @@ { "data": { "text/html": [ - "--------------\r\n", - "CREATE TABLE employees (\r\n", - " id INT PRIMARY KEY AUTO_INCREMENT,\r\n", - " name VARCHAR(50),\r\n", - " department VARCHAR(50),\r\n", - " age INT,\r\n", - " salary DECIMAL(10,2)\r\n", - ")\r\n", - "--------------\r\n", - "\r\n", - "ERROR 1050 (42S01) at line 1 in file: '/home/iddhartha/mariadb_kernel/.mariadb_statement_c7b7c9f2-b33c-11f0-aa46-00155d4db2b1': Table 'employees' already exists\r\n", - "\u0007\u001b(B\u001b[0;7m\u001b(B\u001b[m" + "Query OK" ] }, "metadata": {}, @@ -94,12 +83,26 @@ ], "source": [ "CREATE TABLE employees (\n", - " id INT PRIMARY KEY AUTO_INCREMENT,\n", + " emp_id INT PRIMARY KEY AUTO_INCREMENT,\n", " name VARCHAR(50),\n", " department VARCHAR(50),\n", " age INT,\n", - " salary DECIMAL(10,2)\n", - ");" + " gender VARCHAR(10),\n", + " education_level VARCHAR(30),\n", + " years_experience INT,\n", + " projects_completed INT,\n", + " avg_project_score DECIMAL(5,2),\n", + " certifications INT,\n", + " training_hours INT,\n", + " overtime_hours INT,\n", + " remote_ratio DECIMAL(3,2),\n", + " salary DECIMAL(10,2),\n", + " bonus DECIMAL(10,2),\n", + " satisfaction_score DECIMAL(3,2),\n", + " performance_rating INT, -- target variable for classification\n", + " potential_score DECIMAL(5,2), -- target variable for regression\n", + " attrition_flag INT -- 1 = left company, 0 = stayed\n", + ");\n" ] }, { @@ -119,15 +122,22 @@ } 
], "source": [ - "INSERT INTO employees (name, department, age, salary) VALUES\n", - "('Alice', 'HR', 30,5000),\n", - "('Bob', NULL, 40, NULL),\n", - "('Charlie', 'Engineering', NULL, 700000),\n", - "('David', 'HR', 25, 48000),\n", - "('Eve', NULL, 35, NULL),\n", - "('Frank', 'Engineering', 28, 72000),\n", - "(NULL, 'Sales', 50, NULL),\n", - "('Grace', 'Sales', 45, 65000);" + "INSERT INTO employees\n", + "(name, department, age, gender, education_level, years_experience,\n", + " projects_completed, avg_project_score, certifications, training_hours,\n", + " overtime_hours, remote_ratio, salary, bonus, satisfaction_score,\n", + " performance_rating, potential_score, attrition_flag)\n", + "VALUES\n", + "('Alice', 'HR', 30, 'F', 'Bachelors', 5, 12, 87.5, 1, 40, 5, 0.2, 55000, 3000, 8.5, 4, 75.0, 0),\n", + "('Bob', 'Engineering', 45, 'M', 'Masters', 20, 30, 91.0, 3, 20, 10, 0.1, 120000, 15000, 9.0, 5, 89.0, 0),\n", + "('Charlie', 'Sales', 38, 'M', 'Bachelors', 10, 18, 79.3, 0, 15, 20, 0.5, 80000, 7000, 7.2, 3, 70.0, 1),\n", + "('Diana', 'Engineering', 29, 'F', 'PhD', 6, 22, 95.2, 2, 50, 2, 0.0, 97000, 10000, 9.6, 5, 95.0, 0),\n", + "('Eve', 'Finance', 35, 'F', 'Bachelors', 8, 15, 88.0, 1, 30, 6, 0.3, 90000, 8000, 8.0, 4, 85.0, 0),\n", + "('Frank', 'HR', 50, 'M', 'High School', 25, 8, 72.5, 0, 10, 15, 0.7, 60000, 4000, 6.5, 2, 60.0, 1),\n", + "('Grace', 'Sales', 42, 'F', 'Bachelors', 18, 20, 81.4, 1, 25, 12, 0.4, 85000, 7000, 7.8, 3, 74.0, 0),\n", + "('Henry', 'Engineering', 31, 'M', 'Masters', 7, 25, 93.1, 2, 35, 5, 0.2, 95000, 9000, 9.1, 5, 90.0, 0),\n", + "('Ivy', 'Finance', 27, 'F', 'Bachelors', 3, 10, 85.0, 0, 20, 8, 0.6, 70000, 5000, 8.2, 4, 82.0, 0),\n", + "('Jack', 'Sales', 55, 'M', 'High School', 30, 12, 68.9, 0, 5, 25, 0.8, 65000, 2000, 5.5, 1, 50.0, 1);" ] }, { @@ -139,7 +149,7 @@ { "data": { "text/html": [ - "
idnamedepartmentagesalary
1AliceHR305000.00
2BobNULL40NULL
3CharlieEngineeringNULL700000.00
4DavidHR2548000.00
5EveNULL35NULL
6FrankEngineering2872000.00
7NULLSales50NULL
8GraceSales4565000.00
9AliceHR305000.00
10BobNULL40NULL
11CharlieEngineeringNULL700000.00
12DavidHR2548000.00
13EveNULL35NULL
14FrankEngineering2872000.00
15NULLSales50NULL
16GraceSales4565000.00
17AliceHR305000.00
18BobNULL40NULL
19CharlieEngineeringNULL700000.00
20DavidHR2548000.00
21EveNULL35NULL
22FrankEngineering2872000.00
23NULLSales50NULL
24GraceSales4565000.00
25AliceHR305000.00
26BobNULL40NULL
27CharlieEngineeringNULL700000.00
28DavidHR2548000.00
29EveNULL35NULL
30FrankEngineering2872000.00
31NULLSales50NULL
32GraceSales4565000.00
33AliceHR305000.00
34BobNULL40NULL
35CharlieEngineeringNULL700000.00
36DavidHR2548000.00
37EveNULL35NULL
38FrankEngineering2872000.00
39NULLSales50NULL
40GraceSales4565000.00
" + "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
2BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
3CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
4DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
5EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
6FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
7GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
8HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
9IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
10JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
" ] }, "metadata": {}, @@ -5810,6 +5820,246 @@ "%save_model model_name=last_model save_path=/tmp/test_model.joblib" ] }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6c5def76-a36c-45be-8712-d886a1e52e25", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Feature Selection Results (method=correlation)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FeatureScore
overtime_hours0.867873
avg_project_score0.846331
satisfaction_score0.845916
remote_ratio0.744150
training_hours0.742307
age0.683720
certifications0.654654
years_experience0.623764
bonus0.480500
salary0.463771
projects_completed0.441624
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected 5 features saved to data['top_features']: overtime_hours, avg_project_score, satisfaction_score, remote_ratio, training_hours\n" + ] + } + ], + "source": [ + "%select_features features=age,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score target=attrition_flag method=correlation k=5 problem=classification output_name=top_features" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b0c717fb-9f2f-47ba-8c0c-73c6934a069f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Feature Selection Results (method=rf_importance)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FeatureScore
avg_project_score0.197081
years_experience0.133905
satisfaction_score0.131587
salary0.097661
bonus0.094199
overtime_hours0.085659
training_hours0.076767
age0.050450
certifications0.046578
projects_completed0.044383
remote_ratio0.041728
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected 7 features saved to data['selected_features']: avg_project_score, years_experience, satisfaction_score, salary, bonus, overtime_hours, training_hours\n" + ] + } + ], + "source": [ + "%select_features features=age,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score target=potential_score method=rf_importance k=7 problem=regression" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c106f132-0c90-4db8-8d7d-7c0cb29f6b10", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Feature Selection Results (method=chi2)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FeatureScore
overtime_hours1.792208
certifications1.428571
age1.224733
avg_project_score1.202767
remote_ratio1.158521
years_experience1.115538
training_hours1.071429
satisfaction_score1.015994
bonus0.468864
salary0.441209
projects_completed0.435159
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected 5 features saved to data['top_features']: overtime_hours, certifications, age, avg_project_score, remote_ratio\n" + ] + } + ], + "source": [ + "%select_features features=age,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score target=attrition_flag method=chi2 k=5 problem=classification output_name=top_features" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_features.py b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_features.py new file mode 100644 index 0000000..bd56480 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_features.py @@ -0,0 +1,298 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +import numpy as np +from sklearn.feature_selection import SelectKBest, f_classif, f_regression, RFE, mutual_info_classif, mutual_info_regression, chi2, VarianceThreshold +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import LogisticRegression, Lasso +from sklearn.preprocessing import StandardScaler, MinMaxScaler + +class SelectFeatures(MariaMagic): + """ + %select_features features=col1,col2 target=target_col + [method=correlation|rf_importance|rfe|mutual_info|chi2|anova|l1_selection|variance] + [k=5] [problem=classification|regression] + [output_name=selected_features] [inplace=True|False] + + Identify the best features for training a model on data['last_select']. + Methods: + - correlation: Absolute Pearson correlation with the target. 
+ - rf_importance: RandomForest feature importance scores. + - rfe: Recursive Feature Elimination with a RandomForest model. + - mutual_info: Mutual Information between features and target. + - chi2: Chi-squared statistic (classification only, non-negative features). + - anova: ANOVA F-test for feature significance. + - l1_selection: L1-based feature selection (LogisticRegression for classification, Lasso for regression). + - variance: Remove features with low variance (threshold-based). + Stores the ranked features in data[output_name] and displays a table of results. + """ + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "select_features" + + def help(self): + return "Identify the best features for model training from data['last_select']." + + def _str_to_obj(self, s): + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + try: + import json + return json.loads(s) + except Exception: + pass + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df, title=None): + try: + html = df.to_html(index=False) + if title: + html = f"

{title}

" + html + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {"text/html": html}, "metadata": {}}) + except Exception: + pass + + def execute(self, kernel, data): + # Load training DataFrame + df = data.get("last_select") + if df is None or df.empty: + kernel._send_message("stderr", "No last_select found or DataFrame is empty.") + return + + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + return + + features_arg = args.get("features") + target = args.get("target") + method = args.get("method", "correlation").lower() + k = args.get("k", 5) + problem_override = args.get("problem", None) + output_name = args.get("output_name", "selected_features") + inplace = bool(args.get("inplace", True)) + + if not features_arg: + kernel._send_message("stderr", "features argument is required (features=col1,col2...).") + return + if not target: + kernel._send_message("stderr", "target argument is required (target=target_col).") + return + + # Parse features + if isinstance(features_arg, str): + features = [c.strip() for c in features_arg.split(",") if c.strip()] + elif isinstance(features_arg, (list, tuple)): + features = list(features_arg) + else: + kernel._send_message("stderr", "features must be comma-separated string or list.") + return + + missing = [c for c in features + [target] if c not in df.columns] + if missing: + kernel._send_message("stderr", f"Missing columns in DataFrame: {', '.join(missing)}") + return + + # Determine problem type (same logic as TrainModel) + if problem_override: + problem = problem_override.lower() + if problem not in ("classification", "regression"): + kernel._send_message("stderr", "problem must be 'classification' or 'regression'.") + return + else: + tgt_ser = df[target] + if pd.api.types.is_numeric_dtype(tgt_ser): + nunique = int(tgt_ser.nunique(dropna=True)) + non_null_count = max(1, len(tgt_ser.dropna())) + uniq_prop = nunique 
/ non_null_count + if pd.api.types.is_float_dtype(tgt_ser) or nunique > 20 or uniq_prop > 0.05: + problem = "regression" + else: + problem = "classification" + else: + problem = "classification" + + # Prepare data + X = df[features].copy() + y = df[target].copy() + + # Handle missing values (simple imputation for feature selection) + X = X.fillna(X.mean(numeric_only=True)) if problem == "regression" else X.fillna(X.mode().iloc[0]) + if X.isna().any().any(): + kernel._send_message("stderr", "Features contain non-numeric data or unhandled missing values.") + return + + # Scale data for methods that require it (e.g., chi2 requires non-negative, l1_selection benefits from scaling) + if method in ("chi2", "l1_selection"): + scaler = MinMaxScaler() if method == "chi2" else StandardScaler() + try: + X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index) + except Exception as e: + kernel._send_message("stderr", f"Error scaling data: {e}") + return + + # Feature selection + try: + if method == "correlation": + # Pearson correlation (absolute value) with target + correlations = X.corrwith(y, method="pearson").abs() + scores = correlations.sort_values(ascending=False) + selected_features = scores.head(k).index.tolist() + result_df = pd.DataFrame({ + "Feature": scores.index, + "Score": scores.values + }) + + elif method == "rf_importance": + # RandomForest feature importance + model = RandomForestClassifier() if problem == "classification" else RandomForestRegressor() + model.fit(X, y) + importances = pd.Series(model.feature_importances_, index=features) + scores = importances.sort_values(ascending=False) + selected_features = scores.head(k).index.tolist() + result_df = pd.DataFrame({ + "Feature": scores.index, + "Score": scores.values + }) + + elif method == "rfe": + # Recursive Feature Elimination + estimator = RandomForestClassifier() if problem == "classification" else RandomForestRegressor() + selector = RFE(estimator, n_features_to_select=k) + 
selector.fit(X, y) + ranking = pd.Series(selector.ranking_, index=features) + scores = 1 / (ranking + 1) # Inverse ranking as score (higher is better) + selected_features = ranking[ranking == 1].index.tolist() + result_df = pd.DataFrame({ + "Feature": ranking.index, + "Score": scores, + "Ranking": ranking + }).sort_values("Score", ascending=False) + + elif method == "mutual_info": + # Mutual Information + score_func = mutual_info_classif if problem == "classification" else mutual_info_regression + selector = SelectKBest(score_func=score_func, k=k) + selector.fit(X, y) + scores = pd.Series(selector.scores_, index=features) + scores = scores.sort_values(ascending=False) + selected_features = scores.head(k).index.tolist() + result_df = pd.DataFrame({ + "Feature": scores.index, + "Score": scores.values + }) + + elif method == "chi2": + # Chi-squared (classification only, requires non-negative features) + if problem != "classification": + kernel._send_message("stderr", "chi2 method is only for classification problems.") + return + if (X < 0).any().any(): + kernel._send_message("stderr", "chi2 requires non-negative features.") + return + selector = SelectKBest(score_func=chi2, k=k) + selector.fit(X, y) + scores = pd.Series(selector.scores_, index=features) + scores = scores.sort_values(ascending=False) + selected_features = scores.head(k).index.tolist() + result_df = pd.DataFrame({ + "Feature": scores.index, + "Score": scores.values + }) + + elif method == "anova": + # ANOVA F-test + score_func = f_classif if problem == "classification" else f_regression + selector = SelectKBest(score_func=score_func, k=k) + selector.fit(X, y) + scores = pd.Series(selector.scores_, index=features) + scores = scores.sort_values(ascending=False) + selected_features = scores.head(k).index.tolist() + result_df = pd.DataFrame({ + "Feature": scores.index, + "Score": scores.values + }) + + elif method == "l1_selection": + # L1-based feature selection + model = LogisticRegression(penalty="l1", 
solver="liblinear", max_iter=1000) if problem == "classification" else Lasso(alpha=0.01) + model.fit(X, y) + # Use absolute coefficients as importance scores + scores = pd.Series(np.abs(model.coef_.ravel() if problem == "classification" else model.coef_), index=features) + scores = scores.sort_values(ascending=False) + selected_features = scores[scores > 0].head(k).index.tolist() + result_df = pd.DataFrame({ + "Feature": scores.index, + "Score": scores.values + }) + + elif method == "variance": + # Variance Threshold + selector = VarianceThreshold(threshold=0.0) # Default threshold, can be customized via args if needed + selector.fit(X) + variances = pd.Series(selector.variances_, index=features) + scores = variances.sort_values(ascending=False) + selected_features = scores.head(k).index.tolist() + result_df = pd.DataFrame({ + "Feature": scores.index, + "Score": scores.values + }) + + else: + kernel._send_message("stderr", "method must be one of 'correlation', 'rf_importance', 'rfe', 'mutual_info', 'chi2', 'anova', 'l1_selection', or 'variance'.") + return + + except Exception as e: + kernel._send_message("stderr", f"Error during feature selection: {e}") + return + + # Store results + try: + data[output_name] = selected_features + data[output_name + "_meta"] = { + "method": method, + "problem": problem, + "target": target, + "k": k, + "all_scores": result_df.to_dict() + } + except Exception as e: + kernel._send_message("stderr", f"Error storing results: {e}") + return + + # Display results + self._send_html(kernel, result_df, title=f"Feature Selection Results (method={method})") + kernel._send_message("stdout", f"Selected {len(selected_features)} features saved to data['{output_name}']: {', '.join(selected_features)}") + + return \ No newline at end of file diff --git a/mariadb_kernel/maria_magics/supported_magics.py b/mariadb_kernel/maria_magics/supported_magics.py index bdeb8d6..4448fe7 100644 --- a/mariadb_kernel/maria_magics/supported_magics.py +++ 
b/mariadb_kernel/maria_magics/supported_magics.py @@ -25,6 +25,7 @@ from mariadb_kernel.maria_magics.ml_commands.model_training.train_model import TrainModel from mariadb_kernel.maria_magics.ml_commands.model_training.evaluate_model import EvaluateModel from mariadb_kernel.maria_magics.ml_commands.model_training.savemodel import SaveModel +from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.select_features import SelectFeatures def get(): return { @@ -49,4 +50,5 @@ def get(): "train_model": TrainModel, "evaluate_model": EvaluateModel, "savemodel": SaveModel, + "select_features": SelectFeatures } From 56253c4903a6bdd12bb1309d04fbe0c516de330c Mon Sep 17 00:00:00 2001 From: JainSneha6 Date: Mon, 27 Oct 2025 17:24:42 +0000 Subject: [PATCH 15/38] Added select_model.py --- .../ml_commands/ml_pipeline/select_model.py | 327 ++++++++++++++++++ .../ml_commands/model_training/train_model.py | 4 - .../maria_magics/supported_magics.py | 4 +- 3 files changed, 330 insertions(+), 5 deletions(-) create mode 100644 mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py diff --git a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py new file mode 100644 index 0000000..54fbff1 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py @@ -0,0 +1,327 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. 
+ +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +import numpy as np +from sklearn.model_selection import cross_val_score +from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor +from sklearn.svm import SVC +from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor +from sklearn.neural_network import MLPClassifier, MLPRegresso + +_XGBOOST_AVAILABLE = False +_LIGHTGBM_AVAILABLE = False +_CATBOOST_AVAILABLE = False +try: + from xgboost import XGBClassifier, XGBRegressor + _XGBOOST_AVAILABLE = True +except Exception: + pass + +try: + from lightgbm import LGBMClassifier, LGBMRegressor + _LIGHTGBM_AVAILABLE = True +except Exception: + pass + +try: + from catboost import CatBoostClassifier, CatBoostRegressor + _CATBOOST_AVAILABLE = True +except Exception: + pass + +class SelectModel(MariaMagic): + """ + %select_model features=col1,col2 target=target_col + [models=rf,logistic,svm] [cv=5] [metric=accuracy|r2|f1|precision|recall|mse|mae] + [problem=classification|regression] [output_name=best_model] + [inplace=True|False] [model_params={'rf': {'n_estimators': 100}, 'logistic': {'C': 1.0}}] + + Select the best model by comparing multiple models on data['last_select'] using cross-validation. + Models: logistic, rf, svm, knn, gbm, ada, mlp, xgboost, lightgbm, catboost (classification); + linear, ridge, lasso, rf, knn, gbm, ada, mlp, xgboost, lightgbm, catboost (regression). + Stores the best model in data[output_name] and displays a table of model performances. 
+ """ + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "select_model" + + def help(self): + return "Select the best model for training from data['last_select'] using cross-validation." + + def _str_to_obj(self, s): + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + try: + import json + return json.loads(s) + except Exception: + pass + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df, title=None): + try: + html = df.to_html(index=False) + if title: + html = f"

{title}

" + html + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {"text/html": html}, "metadata": {}}) + except Exception: + pass + + def _choose_model(self, name, problem, params=None): + # Reuse TrainModel's model selection logic + p = params or {} + name = name.lower() + if name in ("logistic", "logistic_regression", "lr"): + if problem != "classification": + raise ValueError("LogisticRegression is for classification problems.") + return LogisticRegression(max_iter=1000, **p) + if name in ("rf", "random_forest"): + return RandomForestClassifier(**p) if problem == "classification" else RandomForestRegressor(**p) + if name in ("svc", "svm"): + if problem != "classification": + raise ValueError("SVC is for classification problems.") + return SVC(probability=True, **p) + if name in ("linear", "linear_regression"): + if problem != "regression": + raise ValueError("LinearRegression is for regression problems.") + return LinearRegression(**p) + if name == "ridge": + if problem != "regression": + raise ValueError("Ridge is for regression problems.") + return Ridge(**p) + if name == "lasso": + if problem != "regression": + raise ValueError("Lasso is for regression problems.") + return Lasso(**p) + if name == "knn": + return KNeighborsClassifier(**p) if problem == "classification" else KNeighborsRegressor(**p) + if name == "gbm": + return GradientBoostingClassifier(**p) if problem == "classification" else GradientBoostingRegressor(**p) + if name == "ada": + return AdaBoostClassifier(**p) if problem == "classification" else AdaBoostRegressor(**p) + if name == "mlp": + return MLPClassifier(max_iter=1000, **p) if problem == "classification" else MLPRegressor(max_iter=1000, **p) + if name == "xgboost": + if not _XGBOOST_AVAILABLE: + raise ImportError("xgboost not available in this environment.") + return XGBClassifier(**p) if problem == "classification" else XGBRegressor(**p) + if name == "lightgbm": + if not _LIGHTGBM_AVAILABLE: + raise ImportError("lightgbm 
not available in this environment.") + return LGBMClassifier(**p) if problem == "classification" else LGBMRegressor(**p) + if name == "catboost": + if not _CATBOOST_AVAILABLE: + raise ImportError("catboost not available in this environment.") + p = dict(p) + p.setdefault("verbose", False) + return CatBoostClassifier(**p) if problem == "classification" else CatBoostRegressor(**p) + raise ValueError(f"Unknown model name '{name}'") + + def execute(self, kernel, data): + # Load training DataFrame + df = data.get("last_select") + if df is None or df.empty: + kernel._send_message("stderr", "No last_select found or DataFrame is empty.") + return + + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + return + + features_arg = args.get("features") + target = args.get("target") + models_arg = args.get("models", "rf,logistic,knn") # Default models + cv = int(args.get("cv", 5) or 5) + metric = args.get("metric", None) + problem_override = args.get("problem", None) + output_name = args.get("output_name", "best_model") + inplace = bool(args.get("inplace", True)) + model_params = args.get("model_params", {}) or {} + + if not features_arg: + kernel._send_message("stderr", "features argument is required (features=col1,col2...).") + return + if not target: + kernel._send_message("stderr", "target argument is required (target=target_col).") + return + + # Parse features + if isinstance(features_arg, str): + features = [c.strip() for c in features_arg.split(",") if c.strip()] + elif isinstance(features_arg, (list, tuple)): + features = list(features_arg) + else: + kernel._send_message("stderr", "features must be comma-separated string or list.") + return + + # Parse models + if isinstance(models_arg, str): + models = [m.strip() for m in models_arg.split(",") if m.strip()] + elif isinstance(models_arg, (list, tuple)): + models = list(models_arg) + else: + kernel._send_message("stderr", 
"models must be comma-separated string or list.") + return + + missing = [c for c in features + [target] if c not in df.columns] + if missing: + kernel._send_message("stderr", f"Missing columns in DataFrame: {', '.join(missing)}") + return + + # Determine problem type (same logic as TrainModel) + if problem_override: + problem = problem_override.lower() + if problem not in ("classification", "regression"): + kernel._send_message("stderr", "problem must be 'classification' or 'regression'.") + return + else: + tgt_ser = df[target] + if pd.api.types.is_numeric_dtype(tgt_ser): + nunique = int(tgt_ser.nunique(dropna=True)) + non_null_count = max(1, len(tgt_ser.dropna())) + uniq_prop = nunique / non_null_count + if pd.api.types.is_float_dtype(tgt_ser) or nunique > 20 or uniq_prop > 0.05: + problem = "regression" + else: + problem = "classification" + else: + problem = "classification" + + # Validate metric + valid_metrics = { + "classification": ["accuracy", "f1", "precision", "recall"], + "regression": ["r2", "mse", "mae"] + } + if metric is None: + metric = "accuracy" if problem == "classification" else "r2" + if metric not in valid_metrics[problem]: + kernel._send_message("stderr", f"Invalid metric '{metric}' for {problem}. 
Choose from {', '.join(valid_metrics[problem])}.") + return + + # Prepare data + X = df[features].copy() + y = df[target].copy() + + # Handle missing values (simple imputation) + X = X.fillna(X.mean(numeric_only=True)) if problem == "regression" else X.fillna(X.mode().iloc[0]) + if X.isna().any().any(): + kernel._send_message("stderr", "Features contain non-numeric data or unhandled missing values.") + return + + # Evaluate models + results = [] + best_model = None + best_score = -float("inf") if metric not in ("mse", "mae") else float("inf") + best_model_name = None + + for model_name in models: + try: + # Get model-specific parameters + params = model_params.get(model_name, {}) if isinstance(model_params, dict) else {} + model = self._choose_model(model_name, problem, params) + scoring = metric if metric in ("accuracy", "f1", "precision", "recall", "r2") else ( + "neg_mean_squared_error" if metric == "mse" else "neg_mean_absolute_error" + ) + cv_scores = cross_val_score(model, X, y, cv=cv, scoring=scoring) + mean_score = np.mean(cv_scores) + std_score = np.std(cv_scores) + + # Adjust score for negative metrics (mse, mae) + if metric in ("mse", "mae"): + mean_score = -mean_score # Convert back to positive for reporting + + results.append({ + "Model": model_name, + "Mean_Score": mean_score, + "Std_Score": std_score + }) + + # Update best model (maximize for accuracy, f1, precision, recall, r2; minimize for mse, mae) + if metric in ("mse", "mae"): + if mean_score < best_score: + best_score = mean_score + best_model = model + best_model_name = model_name + else: + if mean_score > best_score: + best_score = mean_score + best_model = model + best_model_name = model_name + + except Exception as e: + kernel._send_message("stderr", f"Error evaluating model '{model_name}': {e}") + continue + + if not results: + kernel._send_message("stderr", "No models were successfully evaluated.") + return + + # Create results DataFrame + result_df = 
pd.DataFrame(results).sort_values("Mean_Score", ascending=metric in ("mse", "mae")) + result_df["Mean_Score"] = result_df["Mean_Score"].round(4) + result_df["Std_Score"] = result_df["Std_Score"].round(4) + + # Fit the best model on the full training data + try: + best_model.fit(X, y) + except Exception as e: + kernel._send_message("stderr", f"Error fitting best model '{best_model_name}': {e}") + return + + # Store the best model and metadata + try: + data[output_name] = best_model + data[output_name + "_meta"] = { + "model_name": best_model_name, + "problem": problem, + "features": features, + "target": target, + "metric": metric, + "cv": cv, + "score": float(best_score), + "all_results": result_df.to_dict() + } + if hasattr(best_model, "classes_"): + data[output_name + "_meta"]["classes"] = list(getattr(best_model, "classes_")) + except Exception as e: + kernel._send_message("stderr", f"Error storing best model: {e}") + return + + # Display results + self._send_html(kernel, result_df, title=f"Model Selection Results (metric={metric})") + kernel._send_message("stdout", f"Best model '{best_model_name}' (mean {metric}={best_score:.4f}) saved to data['{output_name}'].") + + return \ No newline at end of file diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/train_model.py b/mariadb_kernel/maria_magics/ml_commands/model_training/train_model.py index 10c8313..b48bfb7 100644 --- a/mariadb_kernel/maria_magics/ml_commands/model_training/train_model.py +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/train_model.py @@ -15,10 +15,6 @@ from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor from sklearn.neural_network import MLPClassifier, MLPRegressor -from sklearn.metrics import ( - accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, - mean_squared_error, mean_absolute_error, r2_score -) # Optional external libraries _XGBOOST_AVAILABLE = False diff --git 
a/mariadb_kernel/maria_magics/supported_magics.py b/mariadb_kernel/maria_magics/supported_magics.py index 4448fe7..d67659c 100644 --- a/mariadb_kernel/maria_magics/supported_magics.py +++ b/mariadb_kernel/maria_magics/supported_magics.py @@ -26,6 +26,7 @@ from mariadb_kernel.maria_magics.ml_commands.model_training.evaluate_model import EvaluateModel from mariadb_kernel.maria_magics.ml_commands.model_training.savemodel import SaveModel from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.select_features import SelectFeatures +from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.select_model import SelectModel def get(): return { @@ -50,5 +51,6 @@ def get(): "train_model": TrainModel, "evaluate_model": EvaluateModel, "savemodel": SaveModel, - "select_features": SelectFeatures + "select_features": SelectFeatures, + "select_model": SelectModel, } From c48c2f4905aa9e7170191684e5b95308c55927f8 Mon Sep 17 00:00:00 2001 From: JainSneha6 Date: Mon, 27 Oct 2025 17:27:16 +0000 Subject: [PATCH 16/38] Fixed error --- .../maria_magics/ml_commands/ml_pipeline/select_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py index 54fbff1..4960dd7 100644 --- a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py +++ b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py @@ -11,7 +11,7 @@ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor -from sklearn.neural_network import MLPClassifier, MLPRegresso +from sklearn.neural_network import MLPClassifier _XGBOOST_AVAILABLE = False _LIGHTGBM_AVAILABLE = False From 4ac7a0cf66551e5fc0f924fa6dd04fcec7145643 Mon Sep 17 00:00:00 2001 From: 
SiddharthaChakrabarty Date: Mon, 27 Oct 2025 18:09:34 +0000 Subject: [PATCH 17/38] Added predict --- .eggs/README.txt | 6 + .../EGG-INFO/PKG-INFO | 185 + .../EGG-INFO/RECORD | 43 + .../EGG-INFO/WHEEL | 5 + .../EGG-INFO/entry_points.txt | 49 + .../EGG-INFO/licenses/LICENSE | 17 + .../EGG-INFO/requires.txt | 9 + .../EGG-INFO/top_level.txt | 1 + .../setuptools_scm/.git_archival.txt | 3 + .../setuptools_scm/__init__.py | 30 + .../setuptools_scm/__main__.py | 6 + .../setuptools_scm/_cli.py | 291 ++ .../setuptools_scm/_compat.py | 65 + .../setuptools_scm/_config.py | 318 ++ .../setuptools_scm/_entrypoints.py | 126 + .../setuptools_scm/_file_finders/__init__.py | 113 + .../setuptools_scm/_file_finders/git.py | 124 + .../setuptools_scm/_file_finders/hg.py | 72 + .../setuptools_scm/_file_finders/pathtools.py | 9 + .../setuptools_scm/_get_version_impl.py | 250 ++ .../setuptools_scm/_integration/__init__.py | 0 .../_integration/deprecation.py | 20 + .../_integration/dump_version.py | 128 + .../_integration/pyproject_reading.py | 285 ++ .../setuptools_scm/_integration/setup_cfg.py | 46 + .../setuptools_scm/_integration/setuptools.py | 159 + .../setuptools_scm/_integration/toml.py | 69 + .../_integration/version_inference.py | 141 + .../setuptools_scm/_log.py | 87 + .../setuptools_scm/_modify_version.py | 61 + .../setuptools_scm/_node_utils.py | 46 + .../setuptools_scm/_overrides.py | 298 ++ .../setuptools_scm/_requirement_cls.py | 34 + .../setuptools_scm/_run_cmd.py | 221 ++ .../setuptools_scm/_types.py | 61 + .../setuptools_scm/_version_cls.py | 101 + .../setuptools_scm/discover.py | 74 + .../setuptools_scm/fallbacks.py | 45 + .../setuptools_scm/git.py | 454 +++ .../setuptools_scm/hg.py | 308 ++ .../setuptools_scm/hg_git.py | 181 + .../setuptools_scm/integration.py | 31 + .../setuptools_scm/py.typed | 0 .../setuptools_scm/scm_workdir.py | 54 + .../setuptools_scm/version.py | 583 ++++ Untitled.ipynb | 3060 ++++++++++++----- mariadb_kernel.egg-info/PKG-INFO | 76 + 
mariadb_kernel.egg-info/SOURCES.txt | 96 + mariadb_kernel.egg-info/dependency_links.txt | 1 + mariadb_kernel.egg-info/requires.txt | 9 + mariadb_kernel.egg-info/top_level.txt | 1 + .../ml_commands/model_training/loadmodel.py | 78 + .../ml_commands/model_training/predict.py | 162 + .../ml_commands/model_training/savemodel.py | 353 +- .../maria_magics/supported_magics.py | 4 + models/test_model.joblib | Bin 0 -> 146537 bytes 56 files changed, 7832 insertions(+), 1217 deletions(-) create mode 100644 .eggs/README.txt create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/PKG-INFO create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/RECORD create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/WHEEL create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/entry_points.txt create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/licenses/LICENSE create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/requires.txt create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/top_level.txt create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/.git_archival.txt create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/__init__.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/__main__.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_cli.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_compat.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_config.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_entrypoints.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/__init__.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/git.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/hg.py create mode 100644 
.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/pathtools.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_get_version_impl.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/__init__.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/deprecation.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/dump_version.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/pyproject_reading.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/setup_cfg.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/setuptools.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/toml.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/version_inference.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_log.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_modify_version.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_node_utils.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_overrides.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_requirement_cls.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_run_cmd.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_types.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_version_cls.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/discover.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/fallbacks.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/git.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/hg.py create 
mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/hg_git.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/integration.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/py.typed create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/scm_workdir.py create mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/version.py create mode 100644 mariadb_kernel.egg-info/PKG-INFO create mode 100644 mariadb_kernel.egg-info/SOURCES.txt create mode 100644 mariadb_kernel.egg-info/dependency_links.txt create mode 100644 mariadb_kernel.egg-info/requires.txt create mode 100644 mariadb_kernel.egg-info/top_level.txt create mode 100644 mariadb_kernel/maria_magics/ml_commands/model_training/loadmodel.py create mode 100644 mariadb_kernel/maria_magics/ml_commands/model_training/predict.py create mode 100644 models/test_model.joblib diff --git a/.eggs/README.txt b/.eggs/README.txt new file mode 100644 index 0000000..5d01668 --- /dev/null +++ b/.eggs/README.txt @@ -0,0 +1,6 @@ +This directory contains eggs that were downloaded by setuptools to build, test, and run plug-ins. + +This directory caches those eggs to prevent repeated downloads. + +However, it is safe to delete this directory. 
+ diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/PKG-INFO b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/PKG-INFO new file mode 100644 index 0000000..0a1de09 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/PKG-INFO @@ -0,0 +1,185 @@ +Metadata-Version: 2.4 +Name: setuptools-scm +Version: 9.2.2 +Summary: the blessed package to manage your versions by scm tags +Author-email: Ronny Pfannschmidt +License: Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+ +Project-URL: documentation, https://setuptools-scm.readthedocs.io/ +Project-URL: repository, https://github.com/pypa/setuptools-scm/ +Classifier: Development Status :: 5 - Production/Stable +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: MIT License +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 3 :: Only +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: 3.12 +Classifier: Programming Language :: Python :: 3.13 +Classifier: Topic :: Software Development :: Libraries +Classifier: Topic :: Software Development :: Version Control +Classifier: Topic :: System :: Software Distribution +Classifier: Topic :: Utilities +Requires-Python: >=3.8 +Description-Content-Type: text/markdown +License-File: LICENSE +Requires-Dist: packaging>=20 +Requires-Dist: setuptools +Requires-Dist: tomli>=1; python_version < "3.11" +Requires-Dist: typing-extensions; python_version < "3.10" +Provides-Extra: rich +Requires-Dist: rich; extra == "rich" +Provides-Extra: simple +Provides-Extra: toml +Dynamic: license-file + +# setuptools-scm +[![github ci](https://github.com/pypa/setuptools-scm/actions/workflows/python-tests.yml/badge.svg)](https://github.com/pypa/setuptools-scm/actions/workflows/python-tests.yml) +[![Documentation Status](https://readthedocs.org/projects/setuptools-scm/badge/?version=latest)](https://setuptools-scm.readthedocs.io/en/latest/?badge=latest) +[![tidelift](https://tidelift.com/badges/package/pypi/setuptools-scm) ](https://tidelift.com/subscription/pkg/pypi-setuptools-scm?utm_source=pypi-setuptools-scm&utm_medium=readme) + +## about + +[setuptools-scm] extracts Python package versions from `git` or `hg` metadata +instead of declaring them as the version argument +or in a Source Code Managed 
(SCM) file. + +Additionally [setuptools-scm] provides `setuptools` with a list of +files that are managed by the SCM +
+(i.e. it automatically adds all the SCM-managed files to the sdist). +
+Unwanted files must be excluded via `MANIFEST.in` +or [configuring Git archive][git-archive-docs]. + +> **⚠️ Important:** Installing setuptools-scm automatically enables a file finder that includes **all SCM-tracked files** in your source distributions. This can be surprising if you have development files tracked in Git/Mercurial that you don't want in your package. Use `MANIFEST.in` to exclude unwanted files. See the [documentation] for details. + +## `pyproject.toml` usage + +The preferred way to configure [setuptools-scm] is to author +settings in a `tool.setuptools_scm` section of `pyproject.toml`. + +This feature requires setuptools 61 or later (recommended: >=80 for best compatibility). +First, ensure that [setuptools-scm] is present during the project's +build step by specifying it as one of the build requirements. + +```toml title="pyproject.toml" +[build-system] +requires = ["setuptools>=80", "setuptools-scm>=8"] +build-backend = "setuptools.build_meta" +``` + +That will be sufficient to require [setuptools-scm] for projects +that support [PEP 518] like [pip] and [build]. + +[pip]: https://pypi.org/project/pip +[build]: https://pypi.org/project/build +[PEP 518]: https://peps.python.org/pep-0518/ + + +To enable version inference, you need to set the version +dynamically in the `project` section of `pyproject.toml`: + +```toml title="pyproject.toml" +[project] +# version = "0.0.1" # Remove any existing version parameter. +dynamic = ["version"] + +[tool.setuptools_scm] +``` + +!!! note "Simplified Configuration" + + Starting with setuptools-scm 8.1+, if `setuptools_scm` (or `setuptools-scm`) is + present in your `build-system.requires`, the `[tool.setuptools_scm]` section + becomes optional! 
You can now enable setuptools-scm with just: + + ```toml title="pyproject.toml" + [build-system] + requires = ["setuptools>=80", "setuptools-scm>=8"] + build-backend = "setuptools.build_meta" + + [project] + dynamic = ["version"] + ``` + + The `[tool.setuptools_scm]` section is only needed if you want to customize + configuration options. + +Additionally, a version file can be written by specifying: + +```toml title="pyproject.toml" +[tool.setuptools_scm] +version_file = "pkg/_version.py" +``` + +Where `pkg` is the name of your package. + +If you need to confirm which version string is being generated or debug the configuration, +you can install [setuptools-scm] directly in your working environment and run: + +```console +$ python -m setuptools_scm +# To explore other options, try: +$ python -m setuptools_scm --help +``` + +For further configuration see the [documentation]. + +[setuptools-scm]: https://github.com/pypa/setuptools-scm +[documentation]: https://setuptools-scm.readthedocs.io/ +[git-archive-docs]: https://setuptools-scm.readthedocs.io/en/stable/usage/#builtin-mechanisms-for-obtaining-version-numbers + + +## Interaction with Enterprise Distributions + +Some enterprise distributions like RHEL7 +ship rather old setuptools versions. + +In those cases its typically possible to build by using an sdist against `setuptools-scm<2.0`. +As those old setuptools versions lack sensible types for versions, +modern [setuptools-scm] is unable to support them sensibly. + +It's strongly recommended to build a wheel artifact using modern Python and setuptools, +then installing the artifact instead of trying to run against old setuptools versions. + +!!! note "Legacy Setuptools Support" + While setuptools-scm recommends setuptools >=80, it maintains compatibility with setuptools 61+ + to support legacy deployments that cannot easily upgrade. Support for setuptools <80 is deprecated + and will be removed in a future release. 
This allows enterprise environments and older CI/CD systems + to continue using setuptools-scm while still encouraging adoption of newer versions. + + +## Code of Conduct + + +Everyone interacting in the [setuptools-scm] project's codebases, issue +trackers, chat rooms, and mailing lists is expected to follow the +[PSF Code of Conduct]. + +[PSF Code of Conduct]: https://github.com/pypa/.github/blob/main/CODE_OF_CONDUCT.md + + +## Security Contact + +To report a security vulnerability, please use the +[Tidelift security contact](https://tidelift.com/security). +Tidelift will coordinate the fix and disclosure. diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/RECORD b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/RECORD new file mode 100644 index 0000000..cdee4cc --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/RECORD @@ -0,0 +1,43 @@ +setuptools_scm/.git_archival.txt,sha256=2_90kdS1POSQMuZfBCUw6qNjObu7Ijp8DmptEAmlGkU,102 +setuptools_scm/__init__.py,sha256=k4jjJK8ejFI95amIoLWNCFECWIQW9NlxF9Had4RqOHM,785 +setuptools_scm/__main__.py,sha256=AhntzdNH3Jhcio_Ohoc6_EW7CuIN02OM-0irpGEXXh0,116 +setuptools_scm/_cli.py,sha256=btP9GJ66NIymXwaJNV_MpHg_2V5vZ444TWWELnoHplQ,9427 +setuptools_scm/_compat.py,sha256=CNt8TT3vdaHNsxWqIaSIoVOovfjnyprrx9RLyCD6-f0,2193 +setuptools_scm/_config.py,sha256=v2pRxWCJ4dzTL8a4p_tKsjvxJTDkMDA_oULCwSlACdc,10769 +setuptools_scm/_entrypoints.py,sha256=5ix0F8RAqRaP7ketm0O_IfsOdYKwZqDaKNU1gOKHImA,3697 +setuptools_scm/_get_version_impl.py,sha256=_FGFkXyk8ACUWinpA07L7Z43q5FBsa5juw6k4GPWZWw,9000 +setuptools_scm/_log.py,sha256=ulRwblEzYhkkbyTb2P-217GPpwn-z_Tz_lsd9pTL0RQ,2070 +setuptools_scm/_modify_version.py,sha256=9VU-juFg2IZjrcyz9kLGRfBq4RyZZElhjPMipqjB3Xc,1738 +setuptools_scm/_node_utils.py,sha256=ORxu526O4ruEUYHEHgcxl6punKdWY5K4kWA5e6zUU8Y,1310 +setuptools_scm/_overrides.py,sha256=AXYCpB4OCgFBRnaRWcbQA57jSmar4SAWB-uGHJZ0L6c,10520 +setuptools_scm/_requirement_cls.py,sha256=1q276rt4ZYafSiyN_coIWW3eHZG004TM-iZ130DEoJA,1100 
+setuptools_scm/_run_cmd.py,sha256=84edY5QRqKdSJxkdjlUwIb0ztlOi1bMPC1uOxtYSzP0,6193 +setuptools_scm/_types.py,sha256=7ytOld6LZJzDegaamZC4-6ukQhVx8b1OdJbyzpcSeVI,1765 +setuptools_scm/_version_cls.py,sha256=YiD0IMtcKQq-eWfrQHAZLT5VKs598nHp0kzLmCaoxqo,3256 +setuptools_scm/discover.py,sha256=kelrYHy_LSsMuFN7QhYn7iq6xifWCIPBI4bBGxl8AI8,2069 +setuptools_scm/fallbacks.py,sha256=x3Xv1p89AqJiBX6oxuoo8Di0yR5ijOFOwKBJGAeWTbY,1448 +setuptools_scm/git.py,sha256=Z7ByutjME4JB-h7LXryIaSMY-LbqHsbKsqFNrZXZgkM,14964 +setuptools_scm/hg.py,sha256=COpRbHHgMcPMnrjP3ROmM8nrGmpQb6GIfHTzHMP-etY,10733 +setuptools_scm/hg_git.py,sha256=JLTFvUMMJnN-5zZwNOerMkvlqPJrkKTRAlTTTNl6neM,5540 +setuptools_scm/integration.py,sha256=n3FleU_zlCqIp6pv0PKrUx83q8Alc9Er420u_aUoRqI,806 +setuptools_scm/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +setuptools_scm/scm_workdir.py,sha256=HULdWY2vmYWz-nD3sZibeMsSXvbemNgY9CoH4tGvHKU,1466 +setuptools_scm/version.py,sha256=vX2fpwNrlBai9MQtka9fQL83zyB9ZTkvu6RgSBgTuvc,19474 +setuptools_scm/_file_finders/__init__.py,sha256=WPeWsbyrGOGqsjxc_dkYkUzWHXxUQFCzxMrAsSby-jg,3732 +setuptools_scm/_file_finders/git.py,sha256=4IsnFiTz-iyrm3R5ih6HArrGJzImLVdY9zsmMiESOg8,4434 +setuptools_scm/_file_finders/hg.py,sha256=fK_mTX-feWSyGYO8WWN_hVVt2wiulqXLO1qePLZ5h2A,2227 +setuptools_scm/_file_finders/pathtools.py,sha256=AgOl5u_WHxCQeiUCwlN8bUE3B4vs5BxSJEK1LJutyus,179 +setuptools_scm/_integration/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +setuptools_scm/_integration/deprecation.py,sha256=ZnC8Yr8RBUCGjlwPAZB0Rj0fsUkbbt9mqfgIkyYYU30,786 +setuptools_scm/_integration/dump_version.py,sha256=3zJESs0T-XJS2qm0yiyMWfh8MN9n5lGW8_qVS9M8Ew8,3219 +setuptools_scm/_integration/pyproject_reading.py,sha256=RIpa82xFR4J_bC4rWQtpg-zWGlt88m0aqK85IwKIz5c,9252 +setuptools_scm/_integration/setup_cfg.py,sha256=eM88lHpyG7HkUUQ3Lz75aeha8jS_vAhLJKIL4lfJLho,1302 +setuptools_scm/_integration/setuptools.py,sha256=pcbe-J-P_ELNo2FW8y20EnlGrzp9NmCB64idnpLOwNg,5142 
+setuptools_scm/_integration/toml.py,sha256=9PIJGUBRAdxDyKM5XsweTSDERSUCdWBQfXYOAXCQVGI,1882 +setuptools_scm/_integration/version_inference.py,sha256=26iPIq8402dBf6FS7lo5IfqNqHgwAxE_T6fLKnKklc8,4195 +setuptools_scm-9.2.2.dist-info/licenses/LICENSE,sha256=iYB6zyMJvShfAzQE7nhYFgLzzZuBmhasLw5fYP9KRz4,1023 +setuptools_scm-9.2.2.dist-info/METADATA,sha256=_OGZb2ixEINe2f-PV2DzadHpkhgFbs57bCPCLi3ktcw,7749 +setuptools_scm-9.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91 +setuptools_scm-9.2.2.dist-info/entry_points.txt,sha256=sdHLPEpsB8x6KvWlekw5G1qo-huQlrmh-Lk-EfIucxE,1933 +setuptools_scm-9.2.2.dist-info/top_level.txt,sha256=kiu-91q3_rJLUoc2wl8_lC4cIlpgtgdD_4NaChF4hOA,15 +setuptools_scm-9.2.2.dist-info/RECORD,, diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/WHEEL b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/WHEEL new file mode 100644 index 0000000..e7fa31b --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/WHEEL @@ -0,0 +1,5 @@ +Wheel-Version: 1.0 +Generator: setuptools (80.9.0) +Root-Is-Purelib: true +Tag: py3-none-any + diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/entry_points.txt b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/entry_points.txt new file mode 100644 index 0000000..24009c4 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/entry_points.txt @@ -0,0 +1,49 @@ +[console_scripts] +setuptools-scm = setuptools_scm._cli:main + +[distutils.setup_keywords] +use_scm_version = setuptools_scm._integration.setuptools:version_keyword + +[pipx.run] +setuptools-scm = setuptools_scm._cli:main +setuptools_scm = setuptools_scm._cli:main + +[setuptools.file_finders] +setuptools_scm = setuptools_scm._file_finders:find_files + +[setuptools.finalize_distribution_options] +setuptools_scm = setuptools_scm._integration.setuptools:infer_version + +[setuptools_scm.files_command] +.git = setuptools_scm._file_finders.git:git_find_files +.hg = setuptools_scm._file_finders.hg:hg_find_files + 
+[setuptools_scm.files_command_fallback] +.git_archival.txt = setuptools_scm._file_finders.git:git_archive_find_files +.hg_archival.txt = setuptools_scm._file_finders.hg:hg_archive_find_files + +[setuptools_scm.local_scheme] +dirty-tag = setuptools_scm.version:get_local_dirty_tag +no-local-version = setuptools_scm.version:get_no_local_node +node-and-date = setuptools_scm.version:get_local_node_and_date +node-and-timestamp = setuptools_scm.version:get_local_node_and_timestamp + +[setuptools_scm.parse_scm] +.git = setuptools_scm.git:parse +.hg = setuptools_scm.hg:parse + +[setuptools_scm.parse_scm_fallback] +.git_archival.txt = setuptools_scm.git:parse_archival +.hg_archival.txt = setuptools_scm.hg:parse_archival +PKG-INFO = setuptools_scm.fallbacks:parse_pkginfo +pyproject.toml = setuptools_scm.fallbacks:fallback_version +setup.py = setuptools_scm.fallbacks:fallback_version + +[setuptools_scm.version_scheme] +calver-by-date = setuptools_scm.version:calver_by_date +guess-next-dev = setuptools_scm.version:guess_next_dev_version +no-guess-dev = setuptools_scm.version:no_guess_dev_version +only-version = setuptools_scm.version:only_version +post-release = setuptools_scm.version:postrelease_version +python-simplified-semver = setuptools_scm.version:simplified_semver_version +release-branch-semver = setuptools_scm.version:release_branch_semver_version diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/licenses/LICENSE b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/licenses/LICENSE new file mode 100644 index 0000000..89de354 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/licenses/LICENSE @@ -0,0 +1,17 @@ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to 
permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/requires.txt b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/requires.txt new file mode 100644 index 0000000..b350a80 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/requires.txt @@ -0,0 +1,9 @@ +packaging>=20 +setuptools + +[rich] +rich + +[simple] + +[toml] diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/top_level.txt b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/top_level.txt new file mode 100644 index 0000000..cba8d88 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/top_level.txt @@ -0,0 +1 @@ +setuptools_scm diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/.git_archival.txt b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/.git_archival.txt new file mode 100644 index 0000000..7c51009 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/.git_archival.txt @@ -0,0 +1,3 @@ +node: $Format:%H$ +node-date: $Format:%cI$ +describe-name: $Format:%(describe:tags=true,match=*[0-9]*)$ diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/__init__.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/__init__.py new file mode 100644 index 0000000..e265e85 --- /dev/null +++ 
b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/__init__.py @@ -0,0 +1,30 @@ +""" +:copyright: 2010-2023 by Ronny Pfannschmidt +:license: MIT +""" + +from __future__ import annotations + +from ._config import DEFAULT_LOCAL_SCHEME +from ._config import DEFAULT_VERSION_SCHEME +from ._config import Configuration +from ._get_version_impl import _get_version +from ._get_version_impl import get_version +from ._integration.dump_version import dump_version # soft deprecated +from ._version_cls import NonNormalizedVersion +from ._version_cls import Version +from .version import ScmVersion + +# Public API +__all__ = [ + "DEFAULT_LOCAL_SCHEME", + "DEFAULT_VERSION_SCHEME", + "Configuration", + "NonNormalizedVersion", + "ScmVersion", + "Version", + "_get_version", + "dump_version", + # soft deprecated imports, left for backward compatibility + "get_version", +] diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/__main__.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/__main__.py new file mode 100644 index 0000000..3f56d42 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/__main__.py @@ -0,0 +1,6 @@ +from __future__ import annotations + +from ._cli import main + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_cli.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_cli.py new file mode 100644 index 0000000..1f104f4 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_cli.py @@ -0,0 +1,291 @@ +from __future__ import annotations + +import argparse +import json +import os +import sys + +from pathlib import Path +from typing import Any + +from setuptools_scm import Configuration +from setuptools_scm._file_finders import find_files +from setuptools_scm._get_version_impl import _get_version +from setuptools_scm.discover import walk_potential_roots + + +def main(args: list[str] | None = None) -> int: + opts = _get_cli_opts(args) + 
inferred_root: str = opts.root or "." + + pyproject = opts.config or _find_pyproject(inferred_root) + + try: + config = Configuration.from_file( + pyproject, + root=(os.path.abspath(opts.root) if opts.root is not None else None), + ) + except (LookupError, FileNotFoundError) as ex: + # no pyproject.toml OR no [tool.setuptools_scm] + print( + f"Warning: could not use {os.path.relpath(pyproject)}," + " using default configuration.\n" + f" Reason: {ex}.", + file=sys.stderr, + ) + config = Configuration(root=inferred_root) + version: str | None + if opts.no_version: + version = "0.0.0+no-version-was-requested.fake-version" + else: + version = _get_version( + config, force_write_version_files=opts.force_write_version_files + ) + if version is None: + raise SystemExit("ERROR: no version found for", opts) + if opts.strip_dev: + version = version.partition(".dev")[0] + + return command(opts, version, config) + + +def _get_cli_opts(args: list[str] | None) -> argparse.Namespace: + prog = "python -m setuptools_scm" + desc = "Print project version according to SCM metadata" + parser = argparse.ArgumentParser(prog, description=desc) + # By default, help for `--help` starts with lower case, so we keep the pattern: + parser.add_argument( + "-r", + "--root", + default=None, + help='directory managed by the SCM, default: inferred from config file, or "."', + ) + parser.add_argument( + "-c", + "--config", + default=None, + metavar="PATH", + help="path to 'pyproject.toml' with setuptools-scm config, " + "default: looked up in the current or parent directories", + ) + parser.add_argument( + "--strip-dev", + action="store_true", + help="remove the dev/local parts of the version before printing the version", + ) + parser.add_argument( + "-N", + "--no-version", + action="store_true", + help="do not include package version in the output", + ) + output_formats = ["json", "plain", "key-value"] + parser.add_argument( + "-f", + "--format", + type=str.casefold, + default="plain", + 
help="specify output format", + choices=output_formats, + ) + parser.add_argument( + "-q", + "--query", + type=str.casefold, + nargs="*", + help="display setuptools-scm settings according to query, " + "e.g. dist_name, do not supply an argument in order to " + "print a list of valid queries.", + ) + parser.add_argument( + "--force-write-version-files", + action="store_true", + help="trigger to write the content of the version files\n" + "its recommended to use normal/editable installation instead)", + ) + sub = parser.add_subparsers(title="extra commands", dest="command", metavar="") + # We avoid `metavar` to prevent printing repetitive information + desc = "List information about the package, e.g. included files" + sub.add_parser("ls", help=desc[0].lower() + desc[1:], description=desc) + + # Add create-archival-file subcommand + archival_desc = "Create .git_archival.txt file for git archive support" + archival_parser = sub.add_parser( + "create-archival-file", + help=archival_desc[0].lower() + archival_desc[1:], + description=archival_desc, + ) + archival_group = archival_parser.add_mutually_exclusive_group(required=True) + archival_group.add_argument( + "--stable", + action="store_true", + help="create stable archival file (recommended, no branch names)", + ) + archival_group.add_argument( + "--full", + action="store_true", + help="create full archival file with branch information (can cause instability)", + ) + archival_parser.add_argument( + "--force", action="store_true", help="overwrite existing .git_archival.txt file" + ) + return parser.parse_args(args) + + +# flake8: noqa: C901 +def command(opts: argparse.Namespace, version: str, config: Configuration) -> int: + data: dict[str, Any] = {} + + if opts.command == "ls": + opts.query = ["files"] + + if opts.command == "create-archival-file": + return _create_archival_file(opts, config) + + if opts.query == []: + opts.no_version = True + sys.stderr.write("Available queries:\n\n") + opts.query = ["queries"] + 
data["queries"] = ["files", *config.__dataclass_fields__] + + if opts.query is None: + opts.query = [] + + if not opts.no_version: + data["version"] = version + + if "files" in opts.query: + data["files"] = find_files(config.root) + + for q in opts.query: + if q in ["files", "queries", "version"]: + continue + + try: + if q.startswith("_"): + raise AttributeError() + data[q] = getattr(config, q) + except AttributeError: + sys.stderr.write(f"Error: unknown query: '{q}'\n") + return 1 + + if opts.format == "json": + print(json.dumps(data, indent=2)) + + if opts.format == "plain": + _print_plain(data) + + if opts.format == "key-value": + _print_key_value(data) + + return 0 + + +def _print_plain(data: dict[str, Any]) -> None: + version = data.pop("version", None) + if version: + print(version) + files = data.pop("files", []) + for file_ in files: + print(file_) + queries = data.pop("queries", []) + for query in queries: + print(query) + if data: + print("\n".join(data.values())) + + +def _print_key_value(data: dict[str, Any]) -> None: + for key, value in data.items(): + if isinstance(value, str): + print(f"{key} = {value}") + else: + str_value = "\n ".join(value) + print(f"{key} = {str_value}") + + +def _find_pyproject(parent: str) -> str: + for directory in walk_potential_roots(os.path.abspath(parent)): + pyproject = os.path.join(directory, "pyproject.toml") + if os.path.isfile(pyproject): + return pyproject + + return os.path.abspath( + "pyproject.toml" + ) # use default name to trigger the default errors + + +def _create_archival_file(opts: argparse.Namespace, config: Configuration) -> int: + """Create .git_archival.txt file with appropriate content.""" + archival_path = Path(config.root, ".git_archival.txt") + + # Check if file exists and force flag + if archival_path.exists() and not opts.force: + print( + f"Error: {archival_path} already exists. 
Use --force to overwrite.", + file=sys.stderr, + ) + return 1 + + if opts.stable: + content = _get_stable_archival_content() + print("Creating stable .git_archival.txt (recommended for releases)") + elif opts.full: + content = _get_full_archival_content() + print("Creating full .git_archival.txt with branch information") + print("WARNING: This can cause archive checksums to be unstable!") + + try: + archival_path.write_text(content, encoding="utf-8") + print(f"Created: {archival_path}") + + gitattributes_path = Path(config.root, ".gitattributes") + needs_gitattributes = True + + if gitattributes_path.exists(): + # TODO: more nuanced check later + gitattributes_content = gitattributes_path.read_text("utf-8") + if ( + ".git_archival.txt" in gitattributes_content + and "export-subst" in gitattributes_content + ): + needs_gitattributes = False + + if needs_gitattributes: + print("\nNext steps:") + print("1. Add this line to .gitattributes:") + print(" .git_archival.txt export-subst") + print("2. Commit both files:") + print(" git add .git_archival.txt .gitattributes") + print(" git commit -m 'add git archive support'") + else: + print("\nNext step:") + print("Commit the archival file:") + print(" git add .git_archival.txt") + print(" git commit -m 'update git archival file'") + + return 0 + except OSError as e: + print(f"Error: Could not create {archival_path}: {e}", file=sys.stderr) + return 1 + + +def _get_stable_archival_content() -> str: + """Generate stable archival file content (no branch names).""" + return """\ +node: $Format:%H$ +node-date: $Format:%cI$ +describe-name: $Format:%(describe:tags=true,match=*[0-9]*)$ +""" + + +def _get_full_archival_content() -> str: + """Generate full archival file content with branch information.""" + return """\ +# WARNING: Including ref-names can make archive checksums unstable +# after commits are added post-release. Use only if describe-name is insufficient. 
+node: $Format:%H$ +node-date: $Format:%cI$ +describe-name: $Format:%(describe:tags=true,match=*[0-9]*)$ +ref-names: $Format:%D$ +""" diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_compat.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_compat.py new file mode 100644 index 0000000..4e9e301 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_compat.py @@ -0,0 +1,65 @@ +"""Compatibility utilities for cross-platform functionality.""" + +from __future__ import annotations + + +def normalize_path_for_assertion(path: str) -> str: + """Normalize path separators for cross-platform assertions. + + On Windows, this converts backslashes to forward slashes to ensure + path comparisons work correctly. On other platforms, returns the path unchanged. + The length of the string is not changed by this operation. + + Args: + path: The path string to normalize + + Returns: + The path with normalized separators + """ + return path.replace("\\", "/") + + +def strip_path_suffix( + full_path: str, suffix_path: str, error_msg: str | None = None +) -> str: + """Strip a suffix from a path, with cross-platform path separator handling. + + This function first normalizes path separators for Windows compatibility, + then asserts that the full path ends with the suffix, and finally returns + the path with the suffix removed. This is the common pattern used for + computing parent directories from git output. 
+ + Args: + full_path: The full path string + suffix_path: The suffix path to strip from the end + error_msg: Optional custom error message for the assertion + + Returns: + The prefix path with the suffix removed + + Raises: + AssertionError: If the full path doesn't end with the suffix + """ + normalized_full = normalize_path_for_assertion(full_path) + + if error_msg: + assert normalized_full.endswith(suffix_path), error_msg + else: + assert normalized_full.endswith(suffix_path), ( + f"Path assertion failed: {full_path!r} does not end with {suffix_path!r}" + ) + + return full_path[: -len(suffix_path)] + + +# Legacy aliases for backward compatibility during transition +def assert_path_endswith( + full_path: str, suffix_path: str, error_msg: str | None = None +) -> None: + """Legacy alias - use strip_path_suffix instead.""" + strip_path_suffix(full_path, suffix_path, error_msg) + + +def compute_path_prefix(full_path: str, suffix_path: str) -> str: + """Legacy alias - use strip_path_suffix instead.""" + return strip_path_suffix(full_path, suffix_path) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_config.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_config.py new file mode 100644 index 0000000..49fac2a --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_config.py @@ -0,0 +1,318 @@ +"""configuration""" + +from __future__ import annotations + +import dataclasses +import os +import re +import warnings + +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Any +from typing import Pattern +from typing import Protocol + +if TYPE_CHECKING: + from . import git + +from . import _log +from . 
import _types as _t +from ._integration.pyproject_reading import PyProjectData +from ._integration.pyproject_reading import ( + get_args_for_pyproject as _get_args_for_pyproject, +) +from ._integration.pyproject_reading import read_pyproject as _read_pyproject +from ._overrides import read_toml_overrides +from ._version_cls import Version as _Version +from ._version_cls import _validate_version_cls +from ._version_cls import _VersionT + +log = _log.log.getChild("config") + + +def _is_called_from_dataclasses() -> bool: + """Check if the current call is from the dataclasses module.""" + import inspect + + frame = inspect.currentframe() + try: + # Walk up to 7 frames to check for dataclasses calls + current_frame = frame + assert current_frame is not None + for _ in range(7): + current_frame = current_frame.f_back + if current_frame is None: + break + if "dataclasses.py" in current_frame.f_code.co_filename: + return True + return False + finally: + del frame + + +class _GitDescribeCommandDescriptor: + """Data descriptor for deprecated git_describe_command field.""" + + def __get__( + self, obj: Configuration | None, objtype: type[Configuration] | None = None + ) -> _t.CMD_TYPE | None: + if obj is None: + return self # type: ignore[return-value] + + # Only warn if not being called by dataclasses.replace or similar introspection + is_from_dataclasses = _is_called_from_dataclasses() + if not is_from_dataclasses: + warnings.warn( + "Configuration field 'git_describe_command' is deprecated. " + "Use 'scm.git.describe_command' instead.", + DeprecationWarning, + stacklevel=2, + ) + return obj.scm.git.describe_command + + def __set__(self, obj: Configuration, value: _t.CMD_TYPE | None) -> None: + warnings.warn( + "Configuration field 'git_describe_command' is deprecated. 
" + "Use 'scm.git.describe_command' instead.", + DeprecationWarning, + stacklevel=2, + ) + obj.scm.git.describe_command = value + + +DEFAULT_TAG_REGEX = re.compile( + r"^(?:[\w-]+-)?(?P[vV]?\d+(?:\.\d+){0,2}[^\+]*)(?:\+.*)?$" +) +"""default tag regex that tries to match PEP440 style versions +with prefix consisting of dashed words""" + +DEFAULT_VERSION_SCHEME = "guess-next-dev" +DEFAULT_LOCAL_SCHEME = "node-and-date" + + +def _check_tag_regex(value: str | Pattern[str] | None) -> Pattern[str]: + if not value: + regex = DEFAULT_TAG_REGEX + else: + regex = re.compile(value) + + group_names = regex.groupindex.keys() + if regex.groups == 0 or (regex.groups > 1 and "version" not in group_names): + raise ValueError( + f"Expected tag_regex '{regex.pattern}' to contain a single match group or" + " a group named 'version' to identify the version part of any tag." + ) + + return regex + + +def _get_default_git_pre_parse() -> git.GitPreParse: + """Get the default git pre_parse enum value""" + from . import git + + return git.GitPreParse.WARN_ON_SHALLOW + + +class ParseFunction(Protocol): + def __call__( + self, root: _t.PathT, *, config: Configuration + ) -> _t.SCMVERSION | None: ... 
+ + +def _check_absolute_root(root: _t.PathT, relative_to: _t.PathT | None) -> str: + log.debug("check absolute root=%s relative_to=%s", root, relative_to) + if relative_to: + if ( + os.path.isabs(root) + and os.path.isabs(relative_to) + and not os.path.commonpath([root, relative_to]) == root + ): + warnings.warn( + f"absolute root path '{root}' overrides relative_to '{relative_to}'" + ) + if os.path.isdir(relative_to): + warnings.warn( + "relative_to is expected to be a file," + f" its the directory {relative_to}\n" + "assuming the parent directory was passed" + ) + log.debug("dir %s", relative_to) + root = os.path.join(relative_to, root) + else: + log.debug("file %s", relative_to) + root = os.path.join(os.path.dirname(relative_to), root) + return os.path.abspath(root) + + +@dataclasses.dataclass +class GitConfiguration: + """Git-specific configuration options""" + + pre_parse: git.GitPreParse = dataclasses.field( + default_factory=lambda: _get_default_git_pre_parse() + ) + describe_command: _t.CMD_TYPE | None = None + + @classmethod + def from_data(cls, data: dict[str, Any]) -> GitConfiguration: + """Create GitConfiguration from configuration data, converting strings to enums""" + git_data = data.copy() + + # Convert string pre_parse values to enum instances + if "pre_parse" in git_data and isinstance(git_data["pre_parse"], str): + from . import git + + try: + git_data["pre_parse"] = git.GitPreParse(git_data["pre_parse"]) + except ValueError as e: + valid_options = [option.value for option in git.GitPreParse] + raise ValueError( + f"Invalid git pre_parse function '{git_data['pre_parse']}'. 
" + f"Valid options are: {', '.join(valid_options)}" + ) from e + + return cls(**git_data) + + +@dataclasses.dataclass +class ScmConfiguration: + """SCM-specific configuration options""" + + git: GitConfiguration = dataclasses.field(default_factory=GitConfiguration) + + @classmethod + def from_data(cls, data: dict[str, Any]) -> ScmConfiguration: + """Create ScmConfiguration from configuration data""" + scm_data = data.copy() + + # Handle git-specific configuration + git_data = scm_data.pop("git", {}) + git_config = GitConfiguration.from_data(git_data) + + return cls(git=git_config, **scm_data) + + +@dataclasses.dataclass +class Configuration: + """Global configuration model""" + + relative_to: _t.PathT | None = None + root: _t.PathT = "." + version_scheme: _t.VERSION_SCHEME = DEFAULT_VERSION_SCHEME + local_scheme: _t.VERSION_SCHEME = DEFAULT_LOCAL_SCHEME + tag_regex: Pattern[str] = DEFAULT_TAG_REGEX + parentdir_prefix_version: str | None = None + fallback_version: str | None = None + fallback_root: _t.PathT = "." 
+ write_to: _t.PathT | None = None + write_to_template: str | None = None + version_file: _t.PathT | None = None + version_file_template: str | None = None + parse: ParseFunction | None = None + git_describe_command: dataclasses.InitVar[_t.CMD_TYPE | None] = ( + _GitDescribeCommandDescriptor() + ) + + dist_name: str | None = None + version_cls: type[_VersionT] = _Version + search_parent_directories: bool = False + + parent: _t.PathT | None = None + + # Nested SCM configurations + scm: ScmConfiguration = dataclasses.field( + default_factory=lambda: ScmConfiguration() + ) + + # Deprecated fields (handled in __post_init__) + + def __post_init__(self, git_describe_command: _t.CMD_TYPE | None) -> None: + self.tag_regex = _check_tag_regex(self.tag_regex) + + # Handle deprecated git_describe_command + # Check if it's a descriptor object (happens when no value is passed) + if git_describe_command is not None and not isinstance( + git_describe_command, _GitDescribeCommandDescriptor + ): + # Check if this is being called from dataclasses + is_from_dataclasses = _is_called_from_dataclasses() + + same_value = ( + self.scm.git.describe_command is not None + and self.scm.git.describe_command == git_describe_command + ) + + if is_from_dataclasses and same_value: + # Ignore the passed value - it's from dataclasses.replace() with same value + pass + else: + warnings.warn( + "Configuration field 'git_describe_command' is deprecated. " + "Use 'scm.git.describe_command' instead.", + DeprecationWarning, + stacklevel=2, + ) + # Check for conflicts + if self.scm.git.describe_command is not None: + raise ValueError( + "Cannot specify both 'git_describe_command' (deprecated) and " + "'scm.git.describe_command'. Please use only 'scm.git.describe_command'." 
+ ) + self.scm.git.describe_command = git_describe_command + + @property + def absolute_root(self) -> str: + return _check_absolute_root(self.root, self.relative_to) + + @classmethod + def from_file( + cls, + name: str | os.PathLike[str] = "pyproject.toml", + dist_name: str | None = None, + pyproject_data: PyProjectData | None = None, + **kwargs: Any, + ) -> Configuration: + """ + Read Configuration from pyproject.toml (or similar). + Raises exceptions when file is not found or toml is + not installed or the file has invalid format. + + Parameters: + - name: path to pyproject.toml + - dist_name: name of the distribution + - **kwargs: additional keyword arguments to pass to the Configuration constructor + """ + + if pyproject_data is None: + pyproject_data = _read_pyproject(Path(name)) + args = _get_args_for_pyproject(pyproject_data, dist_name, kwargs) + + args.update(read_toml_overrides(args["dist_name"])) + relative_to = args.pop("relative_to", name) + return cls.from_data(relative_to=relative_to, data=args) + + @classmethod + def from_data( + cls, relative_to: str | os.PathLike[str], data: dict[str, Any] + ) -> Configuration: + """ + given configuration data + create a config instance after validating tag regex/version class + """ + version_cls = _validate_version_cls( + data.pop("version_cls", None), data.pop("normalize", True) + ) + + # Handle nested SCM configuration + scm_data = data.pop("scm", {}) + + # Handle nested SCM configuration + + scm_config = ScmConfiguration.from_data(scm_data) + return cls( + relative_to=relative_to, + version_cls=version_cls, + scm=scm_config, + **data, + ) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_entrypoints.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_entrypoints.py new file mode 100644 index 0000000..74a18a7 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_entrypoints.py @@ -0,0 +1,126 @@ +from __future__ import annotations + +import sys + +from typing import 
TYPE_CHECKING +from typing import Any +from typing import Callable +from typing import Iterator +from typing import cast + +from . import _log +from . import version + +__all__ = [ + "entry_points", + "im", +] +if TYPE_CHECKING: + from . import _types as _t + from ._config import Configuration + from ._config import ParseFunction + +from importlib import metadata as im + +log = _log.log.getChild("entrypoints") + + +if sys.version_info[:2] < (3, 10): + + def entry_points(*, group: str, name: str | None = None) -> list[im.EntryPoint]: + # Python 3.9: entry_points() returns dict, need to handle filtering manually + + eps = im.entry_points() # Returns dict + + group_eps = eps.get(group, []) + if name is not None: + return [ep for ep in group_eps if ep.name == name] + return group_eps +else: + + def entry_points(*, group: str, name: str | None = None) -> im.EntryPoints: + kw = {"group": group} + if name is not None: + kw["name"] = name + return im.entry_points(**kw) + + +def version_from_entrypoint( + config: Configuration, *, entrypoint: str, root: _t.PathT +) -> version.ScmVersion | None: + from .discover import iter_matching_entrypoints + + log.debug("version_from_ep %s in %s", entrypoint, root) + for ep in iter_matching_entrypoints(root, entrypoint, config): + fn: ParseFunction = ep.load() + maybe_version: version.ScmVersion | None = fn(root, config=config) + log.debug("%s found %r", ep, maybe_version) + if maybe_version is not None: + return maybe_version + return None + + +def _get_ep(group: str, name: str) -> Any | None: + for ep in entry_points(group=group, name=name): + log.debug("ep found: %s", ep.name) + return ep.load() + return None + + +def _get_from_object_reference_str(path: str, group: str) -> Any | None: + # todo: remove for importlib native spelling + from importlib.metadata import EntryPoint # hack + + ep = EntryPoint(path, path, group) + try: + return ep.load() + except (AttributeError, ModuleNotFoundError): + return None + + +def 
_iter_version_schemes( + entrypoint: str, + scheme_value: _t.VERSION_SCHEMES, + _memo: set[object] | None = None, +) -> Iterator[Callable[[version.ScmVersion], str]]: + if _memo is None: + _memo = set() + if isinstance(scheme_value, str): + scheme_value = cast( + "_t.VERSION_SCHEMES", + _get_ep(entrypoint, scheme_value) + or _get_from_object_reference_str(scheme_value, entrypoint), + ) + + if isinstance(scheme_value, (list, tuple)): + for variant in scheme_value: + if variant not in _memo: + _memo.add(variant) + yield from _iter_version_schemes(entrypoint, variant, _memo=_memo) + elif callable(scheme_value): + yield scheme_value + + +def _call_version_scheme( + version: version.ScmVersion, + entrypoint: str, + given_value: _t.VERSION_SCHEMES, + default: str | None = None, +) -> str: + found_any_implementation = False + for scheme in _iter_version_schemes(entrypoint, given_value): + found_any_implementation = True + result = scheme(version) + if result is not None: + return result + if not found_any_implementation: + raise ValueError( + f'Couldn\'t find any implementations for entrypoint "{entrypoint}"' + f' with value "{given_value}".' + ) + if default is not None: + return default + raise ValueError( + f'None of the "{entrypoint}" entrypoints matching "{given_value}"' + " returned a value." + ) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/__init__.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/__init__.py new file mode 100644 index 0000000..e19afc8 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/__init__.py @@ -0,0 +1,113 @@ +from __future__ import annotations + +import os + +from typing import TYPE_CHECKING +from typing import Callable + +from .. import _log +from .. 
import _types as _t +from .._entrypoints import entry_points +from .pathtools import norm_real + +if TYPE_CHECKING: + import sys + + if sys.version_info >= (3, 10): + from typing import TypeGuard + else: + from typing_extensions import TypeGuard + + +log = _log.log.getChild("file_finder") + + +def scm_find_files( + path: _t.PathT, + scm_files: set[str], + scm_dirs: set[str], + force_all_files: bool = False, +) -> list[str]: + """ setuptools compatible file finder that follows symlinks + + - path: the root directory from which to search + - scm_files: set of scm controlled files and symlinks + (including symlinks to directories) + - scm_dirs: set of scm controlled directories + (including directories containing no scm controlled files) + - force_all_files: ignore ``scm_files`` and ``scm_dirs`` and list everything. + + scm_files and scm_dirs must be absolute with symlinks resolved (realpath), + with normalized case (normcase) + + Spec here: https://setuptools.pypa.io/en/latest/userguide/extension.html#\ + adding-support-for-revision-control-systems + """ + realpath = norm_real(path) + seen: set[str] = set() + res: list[str] = [] + for dirpath, dirnames, filenames in os.walk(realpath, followlinks=True): + # dirpath with symlinks resolved + realdirpath = norm_real(dirpath) + + def _link_not_in_scm(n: str, realdirpath: str = realdirpath) -> bool: + fn = os.path.join(realdirpath, os.path.normcase(n)) + return os.path.islink(fn) and fn not in scm_files + + if not force_all_files and realdirpath not in scm_dirs: + # directory not in scm, don't walk it's content + dirnames[:] = [] + continue + if os.path.islink(dirpath) and not os.path.relpath( + realdirpath, realpath + ).startswith(os.pardir): + # a symlink to a directory not outside path: + # we keep it in the result and don't walk its content + res.append(os.path.join(path, os.path.relpath(dirpath, path))) + dirnames[:] = [] + continue + if realdirpath in seen: + # symlink loop protection + dirnames[:] = [] + continue + 
dirnames[:] = [ + dn for dn in dirnames if force_all_files or not _link_not_in_scm(dn) + ] + for filename in filenames: + if not force_all_files and _link_not_in_scm(filename): + continue + # dirpath + filename with symlinks preserved + fullfilename = os.path.join(dirpath, filename) + is_tracked = norm_real(fullfilename) in scm_files + if force_all_files or is_tracked: + res.append(os.path.join(path, os.path.relpath(fullfilename, realpath))) + seen.add(realdirpath) + return res + + +def is_toplevel_acceptable(toplevel: str | None) -> TypeGuard[str]: + """ """ + if toplevel is None: + return False + + ignored: list[str] = os.environ.get("SETUPTOOLS_SCM_IGNORE_VCS_ROOTS", "").split( + os.pathsep + ) + ignored = [os.path.normcase(p) for p in ignored] + + log.debug("toplevel: %r\n ignored %s", toplevel, ignored) + + return toplevel not in ignored + + +def find_files(path: _t.PathT = "") -> list[str]: + eps = [ + *entry_points(group="setuptools_scm.files_command"), + *entry_points(group="setuptools_scm.files_command_fallback"), + ] + for ep in eps: + command: Callable[[_t.PathT], list[str]] = ep.load() + res: list[str] = command(path) + if res: + return res + return [] diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/git.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/git.py new file mode 100644 index 0000000..4379c21 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/git.py @@ -0,0 +1,124 @@ +from __future__ import annotations + +import logging +import os +import subprocess +import tarfile + +from typing import IO + +from .. import _types as _t +from .._run_cmd import run as _run +from ..integration import data_from_mime +from . import is_toplevel_acceptable +from . 
import scm_find_files +from .pathtools import norm_real + +log = logging.getLogger(__name__) + + +def _git_toplevel(path: str) -> str | None: + try: + cwd = os.path.abspath(path or ".") + res = _run(["git", "rev-parse", "HEAD"], cwd=cwd) + if res.returncode: + # BAIL if there is no commit + log.error("listing git files failed - pretending there aren't any") + return None + res = _run( + ["git", "rev-parse", "--show-prefix"], + cwd=cwd, + ) + if res.returncode: + return None + out = res.stdout[:-1] # remove the trailing pathsep + if not out: + out = cwd + else: + # Here, ``out`` is a relative path to root of git. + # ``cwd`` is absolute path to current working directory. + # the below method removes the length of ``out`` from + # ``cwd``, which gives the git toplevel + from .._compat import strip_path_suffix + + out = strip_path_suffix(cwd, out, f"cwd={cwd!r}\nout={out!r}") + log.debug("find files toplevel %s", out) + return norm_real(out) + except subprocess.CalledProcessError: + # git returned error, we are not in a git repo + return None + except OSError: + # git command not found, probably + return None + + +def _git_interpret_archive(fd: IO[bytes], toplevel: str) -> tuple[set[str], set[str]]: + with tarfile.open(fileobj=fd, mode="r|*") as tf: + git_files = set() + git_dirs = {toplevel} + for member in tf.getmembers(): + name = os.path.normcase(member.name).replace("/", os.path.sep) + if member.type == tarfile.DIRTYPE: + git_dirs.add(name) + else: + git_files.add(name) + return git_files, git_dirs + + +def _git_ls_files_and_dirs(toplevel: str) -> tuple[set[str], set[str]]: + # use git archive instead of git ls-file to honor + # export-ignore git attribute + + cmd = ["git", "archive", "--prefix", toplevel + os.path.sep, "HEAD"] + log.info("running %s", " ".join(str(x) for x in cmd)) + proc = subprocess.Popen( + cmd, stdout=subprocess.PIPE, cwd=toplevel, stderr=subprocess.DEVNULL + ) + assert proc.stdout is not None + try: + try: + return 
_git_interpret_archive(proc.stdout, toplevel) + finally: + # ensure we avoid resource warnings by cleaning up the process + proc.stdout.close() + proc.terminate() + # Wait for process to actually terminate and be reaped + try: + proc.wait(timeout=5) # Add timeout to avoid hanging + except subprocess.TimeoutExpired: + log.warning("git archive process did not terminate gracefully, killing") + proc.kill() + proc.wait() + except Exception: + # proc.wait() already called in finally block, check if it failed + if proc.returncode != 0: + log.error("listing git files failed - pretending there aren't any") + return set(), set() + + +def git_find_files(path: _t.PathT = "") -> list[str]: + toplevel = _git_toplevel(os.fspath(path)) + if not is_toplevel_acceptable(toplevel): + return [] + fullpath = norm_real(path) + if not fullpath.startswith(toplevel): + log.warning("toplevel mismatch computed %s vs resolved %s ", toplevel, fullpath) + git_files, git_dirs = _git_ls_files_and_dirs(toplevel) + return scm_find_files(path, git_files, git_dirs) + + +def git_archive_find_files(path: _t.PathT = "") -> list[str]: + # This function assumes that ``path`` is obtained from a git archive + # and therefore all the files that should be ignored were already removed. 
+ archival = os.path.join(path, ".git_archival.txt") + if not os.path.exists(archival): + return [] + + data = data_from_mime(archival) + + if "$Format" in data.get("node", ""): + # Substitutions have not been performed, so not a reliable archive + return [] + + log.warning("git archive detected - fallback to listing all files") + return scm_find_files(path, set(), set(), force_all_files=True) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/hg.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/hg.py new file mode 100644 index 0000000..182429c --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/hg.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +import logging +import os +import subprocess + +from .. import _types as _t +from .._file_finders import is_toplevel_acceptable +from .._file_finders import scm_find_files +from ..hg import run_hg +from ..integration import data_from_mime +from .pathtools import norm_real + +log = logging.getLogger(__name__) + + +def _hg_toplevel(path: str) -> str | None: + try: + return run_hg( + ["root"], + cwd=(path or "."), + check=True, + ).parse_success(norm_real) + except subprocess.CalledProcessError: + # hg returned error, we are not in a mercurial repo + return None + except OSError: + # hg command not found, probably + return None + + +def _hg_ls_files_and_dirs(toplevel: str) -> tuple[set[str], set[str]]: + hg_files: set[str] = set() + hg_dirs = {toplevel} + res = run_hg(["files"], cwd=toplevel) + if res.returncode: + return set(), set() + for name in res.stdout.splitlines(): + name = os.path.normcase(name).replace("/", os.path.sep) + fullname = os.path.join(toplevel, name) + hg_files.add(fullname) + dirname = os.path.dirname(fullname) + while len(dirname) > len(toplevel) and dirname not in hg_dirs: + hg_dirs.add(dirname) + dirname = os.path.dirname(dirname) + return hg_files, hg_dirs + + +def hg_find_files(path: str = "") -> 
list[str]: + toplevel = _hg_toplevel(path) + if not is_toplevel_acceptable(toplevel): + return [] + assert toplevel is not None + hg_files, hg_dirs = _hg_ls_files_and_dirs(toplevel) + return scm_find_files(path, hg_files, hg_dirs) + + +def hg_archive_find_files(path: _t.PathT = "") -> list[str]: + # This function assumes that ``path`` is obtained from a mercurial archive + # and therefore all the files that should be ignored were already removed. + archival = os.path.join(path, ".hg_archival.txt") + if not os.path.exists(archival): + return [] + + data = data_from_mime(archival) + + if "node" not in data: + # Ensure file is valid + return [] + + log.warning("hg archive detected - fallback to listing all files") + return scm_find_files(path, set(), set(), force_all_files=True) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/pathtools.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/pathtools.py new file mode 100644 index 0000000..6de8508 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/pathtools.py @@ -0,0 +1,9 @@ +from __future__ import annotations + +import os + +from setuptools_scm import _types as _t + + +def norm_real(path: _t.PathT) -> str: + return os.path.normcase(os.path.realpath(path)) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_get_version_impl.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_get_version_impl.py new file mode 100644 index 0000000..31bc9c3 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_get_version_impl.py @@ -0,0 +1,250 @@ +from __future__ import annotations + +import dataclasses +import logging +import re +import warnings + +from pathlib import Path +from typing import Any +from typing import NoReturn +from typing import Pattern + +from . import _config +from . import _entrypoints +from . import _run_cmd +from . 
import _types as _t +from ._config import Configuration +from ._overrides import _read_pretended_version_for +from ._version_cls import _validate_version_cls +from .version import ScmVersion +from .version import format_version as _format_version + +EMPTY_TAG_REGEX_DEPRECATION = DeprecationWarning( + "empty regex for tag regex is invalid, using default" +) + +_log = logging.getLogger(__name__) + + +def parse_scm_version(config: Configuration) -> ScmVersion | None: + try: + if config.parse is not None: + parse_result = config.parse(config.absolute_root, config=config) + if parse_result is not None and not isinstance(parse_result, ScmVersion): + raise TypeError( + f"version parse result was {str!r}\n" + "please return a parsed version (ScmVersion)" + ) + return parse_result + else: + return _entrypoints.version_from_entrypoint( + config, + entrypoint="setuptools_scm.parse_scm", + root=config.absolute_root, + ) + except _run_cmd.CommandNotFoundError as e: + _log.exception("command %s not found while parsing the scm, using fallbacks", e) + return None + + +def parse_fallback_version(config: Configuration) -> ScmVersion | None: + return _entrypoints.version_from_entrypoint( + config, + entrypoint="setuptools_scm.parse_scm_fallback", + root=config.fallback_root, + ) + + +def parse_version(config: Configuration) -> ScmVersion | None: + # First try to get a version from the normal flow + scm_version = ( + _read_pretended_version_for(config) + or parse_scm_version(config) + or parse_fallback_version(config) + ) + + # Apply any metadata overrides to the version we found + from ._overrides import _apply_metadata_overrides + + return _apply_metadata_overrides(scm_version, config) + + +def write_version_files( + config: Configuration, version: str, scm_version: ScmVersion +) -> None: + if config.write_to is not None: + from ._integration.dump_version import dump_version + + dump_version( + root=config.root, + version=version, + scm_version=scm_version, + 
write_to=config.write_to, + template=config.write_to_template, + ) + if config.version_file: + from ._integration.dump_version import write_version_to_path + + version_file = Path(config.version_file) + assert not version_file.is_absolute(), f"{version_file=}" + # todo: use a better name than fallback root + assert config.relative_to is not None + target = Path(config.relative_to).parent.joinpath(version_file) + write_version_to_path( + target, + template=config.version_file_template, + version=version, + scm_version=scm_version, + ) + + +def _get_version( + config: Configuration, force_write_version_files: bool | None = None +) -> str | None: + parsed_version = parse_version(config) + if parsed_version is None: + return None + version_string = _format_version(parsed_version) + if force_write_version_files is None: + force_write_version_files = True + warnings.warn( + "force_write_version_files ought to be set," + " presuming the legacy True value", + DeprecationWarning, + ) + + if force_write_version_files: + write_version_files(config, version=version_string, scm_version=parsed_version) + + return version_string + + +def _find_scm_in_parents(config: Configuration) -> Path | None: + """ + Search parent directories for SCM repositories when relative_to is not set. + Uses the existing entrypoint system for SCM discovery. 
+ """ + if config.search_parent_directories: + return None + + searching_config = dataclasses.replace(config, search_parent_directories=True) + + from .discover import iter_matching_entrypoints + + for _ep in iter_matching_entrypoints( + config.absolute_root, "setuptools_scm.parse_scm", searching_config + ): + # xxx: iter_matching_entrypoints should return the parent directory, we do a hack atm + assert searching_config.parent is not None + return Path(searching_config.parent) + + return None + + +def _version_missing(config: Configuration) -> NoReturn: + base_error = ( + f"setuptools-scm was unable to detect version for {config.absolute_root}.\n\n" + ) + + # If relative_to is not set, check for SCM repositories in parent directories + scm_parent = None + if config.relative_to is None: + scm_parent = _find_scm_in_parents(config) + + if scm_parent is not None: + # Found an SCM repository in a parent directory + error_msg = ( + base_error + + f"However, a repository was found in a parent directory: {scm_parent}\n\n" + f"To fix this, you have a few options:\n\n" + f"1. Use the 'relative_to' parameter to specify the file that setuptools-scm should use as reference:\n" + f" setuptools_scm.get_version(relative_to=__file__)\n\n" + f"2. Enable parent directory search in your configuration:\n" + f" [tool.setuptools_scm]\n" + f" search_parent_directories = true\n\n" + f"3. Change your working directory to the repository root: {scm_parent}\n\n" + f"4. Set the root explicitly in your configuration:\n" + f" [tool.setuptools_scm]\n" + f' root = "{scm_parent}"\n\n' + "For more information, see: https://setuptools-scm.readthedocs.io/en/latest/config/" + ) + else: + # No SCM repository found in parent directories either + error_msg = ( + base_error + + "Make sure you're either building from a fully intact git repository " + "or PyPI tarballs. 
Most other sources (such as GitHub's tarballs, a " + "git checkout without the .git folder) don't contain the necessary " + "metadata and will not work.\n\n" + "For example, if you're using pip, instead of " + "https://github.com/user/proj/archive/master.zip " + "use git+https://github.com/user/proj.git#egg=proj\n\n" + "Alternatively, set the version with the environment variable " + "SETUPTOOLS_SCM_PRETEND_VERSION_FOR_${NORMALIZED_DIST_NAME} as described " + "in https://setuptools-scm.readthedocs.io/en/latest/config/" + ) + + raise LookupError(error_msg) + + +def get_version( + root: _t.PathT = ".", + version_scheme: _t.VERSION_SCHEME = _config.DEFAULT_VERSION_SCHEME, + local_scheme: _t.VERSION_SCHEME = _config.DEFAULT_LOCAL_SCHEME, + write_to: _t.PathT | None = None, + write_to_template: str | None = None, + version_file: _t.PathT | None = None, + version_file_template: str | None = None, + relative_to: _t.PathT | None = None, + tag_regex: str | Pattern[str] = _config.DEFAULT_TAG_REGEX, + parentdir_prefix_version: str | None = None, + fallback_version: str | None = None, + fallback_root: _t.PathT = ".", + parse: Any | None = None, + git_describe_command: _t.CMD_TYPE | None = None, + dist_name: str | None = None, + version_cls: Any | None = None, + normalize: bool = True, + search_parent_directories: bool = False, + scm: dict[str, Any] | None = None, +) -> str: + """ + If supplied, relative_to should be a file from which root may + be resolved. Typically called by a script or module that is not + in the root of the repository to direct setuptools-scm to the + root of the repository by supplying ``__file__``. 
+ """ + + version_cls = _validate_version_cls(version_cls, normalize) + del normalize + tag_regex = parse_tag_regex(tag_regex) + + # Handle scm parameter by converting it to ScmConfiguration + if scm is not None: + scm_config = _config.ScmConfiguration.from_data(scm) + else: + scm_config = _config.ScmConfiguration() + + # Remove scm from locals() since we handle it separately + config_params = locals().copy() + config_params.pop("scm", None) + config_params.pop("scm_config", None) + + config = _config.Configuration(scm=scm_config, **config_params) + maybe_version = _get_version(config, force_write_version_files=True) + + if maybe_version is None: + _version_missing(config) + return maybe_version + + +def parse_tag_regex(tag_regex: str | Pattern[str]) -> Pattern[str]: + if isinstance(tag_regex, str): + if tag_regex == "": + warnings.warn(EMPTY_TAG_REGEX_DEPRECATION) + return _config.DEFAULT_TAG_REGEX + else: + return re.compile(tag_regex) + else: + return tag_regex diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/__init__.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/deprecation.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/deprecation.py new file mode 100644 index 0000000..a1b3615 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/deprecation.py @@ -0,0 +1,20 @@ +import warnings + +from pathlib import Path + + +def warn_dynamic_version(path: Path, section: str, expression: str) -> None: + warnings.warn( + f"{path}: at [{section}]\n" + f"{expression} is forcing setuptools to override the version setuptools-scm did already set\n" + "When using setuptools-scm it's invalid to use setuptools dynamic version as well, please remove it.\n" + "Setuptools-scm is responsible for setting the version, forcing setuptools to override 
creates errors." + ) + + +def warn_pyproject_setuptools_dynamic_version(path: Path) -> None: + warn_dynamic_version(path, "tool.setuptools.dynamic", "version = {attr = ...}") + + +def warn_setup_cfg_dynamic_version(path: Path) -> None: + warn_dynamic_version(path, "metadata", "version = attr: ...") diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/dump_version.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/dump_version.py new file mode 100644 index 0000000..06081c9 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/dump_version.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +import warnings + +from pathlib import Path + +from .. import _types as _t +from .._log import log as parent_log +from .._version_cls import _version_as_tuple +from ..version import ScmVersion + +log = parent_log.getChild("dump_version") + + +TEMPLATES = { + ".py": """\ +# file generated by setuptools-scm +# don't change, don't track in version control + +__all__ = [ + "__version__", + "__version_tuple__", + "version", + "version_tuple", + "__commit_id__", + "commit_id", +] + +TYPE_CHECKING = False +if TYPE_CHECKING: + from typing import Tuple + from typing import Union + + VERSION_TUPLE = Tuple[Union[int, str], ...] 
+ COMMIT_ID = Union[str, None] +else: + VERSION_TUPLE = object + COMMIT_ID = object + +version: str +__version__: str +__version_tuple__: VERSION_TUPLE +version_tuple: VERSION_TUPLE +commit_id: COMMIT_ID +__commit_id__: COMMIT_ID + +__version__ = version = {version!r} +__version_tuple__ = version_tuple = {version_tuple!r} + +__commit_id__ = commit_id = {scm_version.short_node!r} +""", + ".txt": "{version}", +} + + +def dump_version( + root: _t.PathT, + version: str, + write_to: _t.PathT, + template: str | None = None, + scm_version: ScmVersion | None = None, +) -> None: + assert isinstance(version, str) + root = Path(root) + write_to = Path(write_to) + if write_to.is_absolute(): + # trigger warning on escape + write_to.relative_to(root) + warnings.warn( + f"{write_to=!s} is a absolute path," + " please switch to using a relative version file", + DeprecationWarning, + ) + target = write_to + else: + target = Path(root).joinpath(write_to) + write_version_to_path( + target, template=template, version=version, scm_version=scm_version + ) + + +def _validate_template(target: Path, template: str | None) -> str: + if template == "": + warnings.warn(f"{template=} looks like a error, using default instead") + template = None + if template is None: + template = TEMPLATES.get(target.suffix) + + if template is None: + raise ValueError( + f"bad file format: {target.suffix!r} (of {target})\n" + "only *.txt and *.py have a default template" + ) + else: + return template + + +class DummyScmVersion: + @property + def short_node(self) -> str | None: + return None + + +def write_version_to_path( + target: Path, + template: str | None, + version: str, + scm_version: ScmVersion | None = None, +) -> None: + final_template = _validate_template(target, template) + log.debug("dump %s into %s", version, target) + version_tuple = _version_as_tuple(version) + if scm_version is None: + warnings.warn( + "write_version_to_path called without scm_version parameter. 
" + "This will be required in a future version. " + "Pass scm_version=None explicitly to suppress this warning.", + DeprecationWarning, + stacklevel=2, + ) + + content = final_template.format( + version=version, + version_tuple=version_tuple, + scm_version=scm_version or DummyScmVersion(), + ) + + target.write_text(content, encoding="utf-8") diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/pyproject_reading.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/pyproject_reading.py new file mode 100644 index 0000000..75d86f6 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/pyproject_reading.py @@ -0,0 +1,285 @@ +from __future__ import annotations + +import warnings + +from dataclasses import dataclass +from pathlib import Path +from typing import Sequence + +from .. import _log +from .. import _types as _t +from .._requirement_cls import extract_package_name +from .toml import TOML_RESULT +from .toml import InvalidTomlError +from .toml import read_toml_content + +log = _log.log.getChild("pyproject_reading") + +_ROOT = "root" + + +DEFAULT_PYPROJECT_PATH = Path("pyproject.toml") +DEFAULT_TOOL_NAME = "setuptools_scm" + + +@dataclass +class PyProjectData: + path: Path + tool_name: str + project: TOML_RESULT + section: TOML_RESULT + is_required: bool + section_present: bool + project_present: bool + build_requires: list[str] + + @classmethod + def for_testing( + cls, + *, + is_required: bool = False, + section_present: bool = False, + project_present: bool = False, + project_name: str | None = None, + has_dynamic_version: bool = True, + build_requires: list[str] | None = None, + local_scheme: str | None = None, + ) -> PyProjectData: + """Create a PyProjectData instance for testing purposes.""" + project: TOML_RESULT + if project_name is not None: + project = {"name": project_name} + assert project_present + else: + project = {} + + # If project is present and has_dynamic_version is True, 
add dynamic=['version'] + if project_present and has_dynamic_version: + project["dynamic"] = ["version"] + + if build_requires is None: + build_requires = [] + if local_scheme is not None: + assert section_present + section = {"local_scheme": local_scheme} + else: + section = {} + return cls( + path=DEFAULT_PYPROJECT_PATH, + tool_name=DEFAULT_TOOL_NAME, + project=project, + section=section, + is_required=is_required, + section_present=section_present, + project_present=project_present, + build_requires=build_requires, + ) + + @classmethod + def empty( + cls, path: Path = DEFAULT_PYPROJECT_PATH, tool_name: str = DEFAULT_TOOL_NAME + ) -> PyProjectData: + return cls( + path=path, + tool_name=tool_name, + project={}, + section={}, + is_required=False, + section_present=False, + project_present=False, + build_requires=[], + ) + + @property + def project_name(self) -> str | None: + return self.project.get("name") + + @property + def project_version(self) -> str | None: + """Return the static version from [project] if present. + + When the project declares dynamic = ["version"], the version + is intentionally omitted from [project] and this returns None. + """ + return self.project.get("version") + + def should_infer(self) -> bool: + """ + Determine if setuptools_scm should infer version based on configuration. + + Infer when: + 1. An explicit [tool.setuptools_scm] section is present, OR + 2. 
setuptools-scm[simple] is in build-system.requires AND + version is in project.dynamic + + Returns: + True if [tool.setuptools_scm] is present, otherwise False + """ + # Original behavior: explicit tool section + if self.section_present: + return True + + # New behavior: simple extra + dynamic version + if self.project_present: + dynamic_fields = self.project.get("dynamic", []) + if "version" in dynamic_fields: + if has_build_package_with_extra( + self.build_requires, "setuptools-scm", "simple" + ): + return True + + return False + + +def has_build_package( + requires: Sequence[str], canonical_build_package_name: str +) -> bool: + for requirement in requires: + package_name = extract_package_name(requirement) + if package_name == canonical_build_package_name: + return True + return False + + +def has_build_package_with_extra( + requires: Sequence[str], canonical_build_package_name: str, extra_name: str +) -> bool: + """Check if a build dependency has a specific extra. + + Args: + requires: List of requirement strings from build-system.requires + canonical_build_package_name: The canonical package name to look for + extra_name: The extra name to check for (e.g., "simple") + + Returns: + True if the package is found with the specified extra + """ + from .._requirement_cls import Requirement + + for requirement_string in requires: + try: + requirement = Requirement(requirement_string) + package_name = extract_package_name(requirement_string) + if package_name == canonical_build_package_name: + if extra_name in requirement.extras: + return True + except Exception: + # If parsing fails, continue to next requirement + continue + return False + + +def read_pyproject( + path: Path = DEFAULT_PYPROJECT_PATH, + tool_name: str = DEFAULT_TOOL_NAME, + canonical_build_package_name: str = "setuptools-scm", + _given_result: _t.GivenPyProjectResult = None, + _given_definition: TOML_RESULT | None = None, +) -> PyProjectData: + """Read and parse pyproject configuration. 
+ + This function supports dependency injection for tests via ``_given_result`` + and ``_given_definition``. + + :param path: Path to the pyproject file + :param tool_name: The tool section name (default: ``setuptools_scm``) + :param canonical_build_package_name: Normalized build requirement name + :param _given_result: Optional testing hook. Can be: + - ``PyProjectData``: returned directly + - ``InvalidTomlError`` | ``FileNotFoundError``: raised directly + - ``None``: read from filesystem (default) + :param _given_definition: Optional testing hook to provide parsed TOML content. + When provided, this dictionary is used instead of reading and parsing + the file from disk. Ignored if ``_given_result`` is provided. + """ + + if _given_result is not None: + if isinstance(_given_result, PyProjectData): + return _given_result + if isinstance(_given_result, (InvalidTomlError, FileNotFoundError)): + raise _given_result + + if _given_definition is not None: + defn = _given_definition + else: + defn = read_toml_content(path) + + requires: list[str] = defn.get("build-system", {}).get("requires", []) + is_required = has_build_package(requires, canonical_build_package_name) + + tool_section = defn.get("tool", {}) + section = tool_section.get(tool_name, {}) + section_present = tool_name in tool_section + + if not section_present: + log.warning( + "toml section missing %r does not contain a tool.%s section", + path, + tool_name, + ) + + project = defn.get("project", {}) + project_present = "project" in defn + pyproject_data = PyProjectData( + path, + tool_name, + project, + section, + is_required, + section_present, + project_present, + requires, + ) + + setuptools_dynamic_version = ( + defn.get("tool", {}) + .get("setuptools", {}) + .get("dynamic", {}) + .get("version", None) + ) + # Only warn if setuptools-scm is being used for version inference + # (not just file finding). When only file finders are used, it's valid + # to use tool.setuptools.dynamic.version for versioning. 
+ if setuptools_dynamic_version is not None and pyproject_data.should_infer(): + from .deprecation import warn_pyproject_setuptools_dynamic_version + + warn_pyproject_setuptools_dynamic_version(path) + + return pyproject_data + + +def get_args_for_pyproject( + pyproject: PyProjectData, + dist_name: str | None, + kwargs: TOML_RESULT, +) -> TOML_RESULT: + """drops problematic details and figures the distribution name""" + section = pyproject.section.copy() + kwargs = kwargs.copy() + if "relative_to" in section: + relative = section.pop("relative_to") + warnings.warn( + f"{pyproject.path}: at [tool.{pyproject.tool_name}]\n" + f"ignoring value relative_to={relative!r}" + " as its always relative to the config file" + ) + if "dist_name" in section: + if dist_name is None: + dist_name = section.pop("dist_name") + else: + assert dist_name == section["dist_name"] + section.pop("dist_name") + if dist_name is None: + # minimal pep 621 support for figuring the pretend keys + dist_name = pyproject.project_name + if _ROOT in kwargs: + if kwargs[_ROOT] is None: + kwargs.pop(_ROOT, None) + elif _ROOT in section: + if section[_ROOT] != kwargs[_ROOT]: + warnings.warn( + f"root {section[_ROOT]} is overridden" + f" by the cli arg {kwargs[_ROOT]}" + ) + section.pop(_ROOT, None) + return {"dist_name": dist_name, **section, **kwargs} diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/setup_cfg.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/setup_cfg.py new file mode 100644 index 0000000..893a9ad --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/setup_cfg.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +import os + +from dataclasses import dataclass +from pathlib import Path + +import setuptools + + +@dataclass +class SetuptoolsBasicData: + path: Path + name: str | None + version: str | None + + +def read_setup_cfg(input: str | os.PathLike[str] = "setup.cfg") -> SetuptoolsBasicData: + 
"""Parse setup.cfg and return unified data. Does not raise if file is missing.""" + import configparser + + path = Path(input) + parser = configparser.ConfigParser() + parser.read([input], encoding="utf-8") + + name = parser.get("metadata", "name", fallback=None) + version = parser.get("metadata", "version", fallback=None) + if version is not None and "attr" in version: + from .deprecation import warn_setup_cfg_dynamic_version + + warn_setup_cfg_dynamic_version(path) + version = None + return SetuptoolsBasicData(path=path, name=name, version=version) + + +def extract_from_legacy( + dist: setuptools.Distribution, + *, + _given_legacy_data: SetuptoolsBasicData | None = None, +) -> SetuptoolsBasicData: + base = _given_legacy_data if _given_legacy_data is not None else read_setup_cfg() + if base.name is None: + base.name = dist.metadata.name + if base.version is None: + base.version = dist.metadata.version + return base diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/setuptools.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/setuptools.py new file mode 100644 index 0000000..aa1c645 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/setuptools.py @@ -0,0 +1,159 @@ +from __future__ import annotations + +import logging +import warnings + +from typing import Any +from typing import Callable + +import setuptools + +from .. 
import _types as _t +from .pyproject_reading import PyProjectData +from .pyproject_reading import read_pyproject +from .setup_cfg import SetuptoolsBasicData +from .setup_cfg import extract_from_legacy +from .toml import InvalidTomlError +from .version_inference import get_version_inference_config + +log = logging.getLogger(__name__) + + +def _warn_on_old_setuptools(_version: str = setuptools.__version__) -> None: + if int(_version.split(".")[0]) < 61: + warnings.warn( + RuntimeWarning( + f""" +ERROR: setuptools=={_version} is used in combination with setuptools-scm>=8.x + +Your build configuration is incomplete and previously worked by accident! +setuptools-scm requires setuptools>=61 (recommended: >=80) + +Suggested workaround if applicable: + - migrating from the deprecated setup_requires mechanism to pep517/518 + and using a pyproject.toml to declare build dependencies + which are reliably pre-installed before running the build tools +""" + ) + ) + + +_warn_on_old_setuptools() + + +def _log_hookstart(hook: str, dist: setuptools.Distribution) -> None: + log.debug( + "%s %s %s %r", + hook, + id(dist), + id(dist.metadata), + {**vars(dist.metadata), "long_description": ...}, + ) + + +def get_keyword_overrides( + value: bool | dict[str, Any] | Callable[[], dict[str, Any]], +) -> dict[str, Any]: + """normalize the version keyword input""" + if value is True: + return {} + elif callable(value): + return value() + else: + assert isinstance(value, dict), "version_keyword expects a dict or True" + return value + + +def version_keyword( + dist: setuptools.Distribution, + keyword: str, + value: bool | dict[str, Any] | Callable[[], dict[str, Any]], + *, + _given_pyproject_data: _t.GivenPyProjectResult = None, + _given_legacy_data: SetuptoolsBasicData | None = None, + _get_version_inference_config: _t.GetVersionInferenceConfig = get_version_inference_config, +) -> None: + """apply version infernce when setup(use_scm_version=...) 
is used + this takes priority over the finalize_options based version + """ + + _log_hookstart("version_keyword", dist) + + # Parse overrides (integration point responsibility) + overrides = get_keyword_overrides(value) + + assert "dist_name" not in overrides, ( + "dist_name may not be specified in the setup keyword " + ) + + legacy_data = extract_from_legacy(dist, _given_legacy_data=_given_legacy_data) + dist_name: str | None = legacy_data.name + + was_set_by_infer = getattr(dist, "_setuptools_scm_version_set_by_infer", False) + + # Exit early if overrides is empty dict AND version was set by infer + if overrides == {} and was_set_by_infer: + return + + # Get pyproject data (support direct injection for tests) + try: + pyproject_data = read_pyproject(_given_result=_given_pyproject_data) + except FileNotFoundError: + log.debug("pyproject.toml not found, proceeding with empty configuration") + pyproject_data = PyProjectData.empty() + except InvalidTomlError as e: + log.debug("Configuration issue in pyproject.toml: %s", e) + return + + # Pass None as current_version if overrides is truthy AND version was set by infer + current_version = ( + None + if (overrides and was_set_by_infer) + else (legacy_data.version or pyproject_data.project_version) + ) + + result = _get_version_inference_config( + dist_name=dist_name, + current_version=current_version, + pyproject_data=pyproject_data, + overrides=overrides, + ) + + result.apply(dist) + + +def infer_version( + dist: setuptools.Distribution, + *, + _given_pyproject_data: _t.GivenPyProjectResult = None, + _given_legacy_data: SetuptoolsBasicData | None = None, + _get_version_inference_config: _t.GetVersionInferenceConfig = get_version_inference_config, +) -> None: + """apply version inference from the finalize_options hook + this is the default for pyproject.toml based projects that don't use the use_scm_version keyword + + if the version keyword is used, it will override the version from this hook + as user might have 
passed custom code version schemes + """ + + _log_hookstart("infer_version", dist) + + legacy_data = extract_from_legacy(dist, _given_legacy_data=_given_legacy_data) + dist_name = legacy_data.name + + try: + pyproject_data = read_pyproject(_given_result=_given_pyproject_data) + except FileNotFoundError: + log.debug("pyproject.toml not found, skipping infer_version") + return + except InvalidTomlError as e: + log.debug("Configuration issue in pyproject.toml: %s", e) + return + + # Only infer when tool section present per get_version_inference_config + result = _get_version_inference_config( + dist_name=dist_name, + current_version=legacy_data.version or pyproject_data.project_version, + pyproject_data=pyproject_data, + ) + result.apply(dist) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/toml.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/toml.py new file mode 100644 index 0000000..2253287 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/toml.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +import sys + +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Any +from typing import Callable +from typing import Dict +from typing import TypedDict +from typing import cast + +if sys.version_info >= (3, 11): + from tomllib import loads as load_toml +else: + from tomli import loads as load_toml + +if TYPE_CHECKING: + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + +from .. 
import _log + +log = _log.log.getChild("toml") + +TOML_RESULT: TypeAlias = Dict[str, Any] +TOML_LOADER: TypeAlias = Callable[[str], TOML_RESULT] + + +class InvalidTomlError(ValueError): + """Raised when TOML data cannot be parsed.""" + + +def read_toml_content(path: Path, default: TOML_RESULT | None = None) -> TOML_RESULT: + try: + data = path.read_text(encoding="utf-8") + except FileNotFoundError: + if default is None: + raise + else: + log.debug("%s missing, presuming default %r", path, default) + return default + else: + try: + return load_toml(data) + except Exception as e: # tomllib/tomli raise different decode errors + raise InvalidTomlError(f"Invalid TOML in {path}") from e + + +class _CheatTomlData(TypedDict): + cheat: dict[str, Any] + + +def load_toml_or_inline_map(data: str | None) -> dict[str, Any]: + """ + load toml data - with a special hack if only a inline map is given + """ + if not data: + return {} + try: + if data[0] == "{": + data = "cheat=" + data + loaded: _CheatTomlData = cast(_CheatTomlData, load_toml(data)) + return loaded["cheat"] + return load_toml(data) + except Exception as e: # tomllib/tomli raise different decode errors + raise InvalidTomlError("Invalid TOML content") from e diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/version_inference.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/version_inference.py new file mode 100644 index 0000000..6258d90 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/version_inference.py @@ -0,0 +1,141 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import TYPE_CHECKING +from typing import Any +from typing import Union + +from setuptools import Distribution + +from .. 
import _log + +if TYPE_CHECKING: + from .pyproject_reading import PyProjectData + +log = _log.log.getChild("version_inference") + + +@dataclass +class VersionInferenceConfig: + """Configuration for version inference.""" + + dist_name: str | None + pyproject_data: PyProjectData | None + overrides: dict[str, Any] | None + + def apply(self, dist: Distribution) -> None: + """Apply version inference to the distribution.""" + version_string = infer_version_string( + self.dist_name, + self.pyproject_data, # type: ignore[arg-type] + self.overrides, + force_write_version_files=True, + ) + dist.metadata.version = version_string + + # Mark that this version was set by infer_version if overrides is None (infer_version context) + if self.overrides is None: + dist._setuptools_scm_version_set_by_infer = True # type: ignore[attr-defined] + + +@dataclass +class VersionInferenceWarning: + """Error message for user.""" + + message: str + + def apply(self, dist: Distribution) -> None: + """Apply error handling to the distribution.""" + import warnings + + warnings.warn(self.message) + + +@dataclass(frozen=True) +class VersionInferenceNoOp: + """No operation result - silent skip.""" + + def apply(self, dist: Distribution) -> None: + """Apply no-op to the distribution.""" + + +VersionInferenceResult = Union[ + VersionInferenceConfig, # Proceed with inference + VersionInferenceWarning, # Show warning + VersionInferenceNoOp, # Don't infer (silent) +] + + +def infer_version_string( + dist_name: str | None, + pyproject_data: PyProjectData, + overrides: dict[str, Any] | None = None, + *, + force_write_version_files: bool = False, +) -> str: + """ + Compute the inferred version string from the given inputs without requiring a + setuptools Distribution instance. This is a pure helper that simplifies + integration tests by avoiding file I/O and side effects on a Distribution. 
+ + Parameters: + dist_name: Optional distribution name (used for overrides and env scoping) + pyproject_data: Parsed PyProjectData (may be constructed via for_testing()) + overrides: Optional override configuration (same keys as [tool.setuptools_scm]) + force_write_version_files: When True, apply write_to/version_file effects + + Returns: + The computed version string. + """ + from .. import _config as _config_module + from .._get_version_impl import _get_version + from .._get_version_impl import _version_missing + + config = _config_module.Configuration.from_file( + dist_name=dist_name, pyproject_data=pyproject_data, **(overrides or {}) + ) + + maybe_version = _get_version( + config, force_write_version_files=force_write_version_files + ) + if maybe_version is None: + _version_missing(config) + return maybe_version + + +def get_version_inference_config( + dist_name: str | None, + current_version: str | None, + pyproject_data: PyProjectData, + overrides: dict[str, Any] | None = None, +) -> VersionInferenceResult: + """ + Determine whether and how to perform version inference. 
+ + Args: + dist_name: The distribution name + current_version: Current version if any + pyproject_data: PyProjectData from parser (None if file doesn't exist) + overrides: Override configuration (None for no overrides) + + Returns: + VersionInferenceResult with the decision and configuration + """ + + config = VersionInferenceConfig( + dist_name=dist_name, + pyproject_data=pyproject_data, + overrides=overrides, + ) + + inference_implied = pyproject_data.should_infer() or overrides is not None + + if inference_implied: + if current_version is None: + return config + else: + return VersionInferenceWarning( + f"version of {dist_name} already set", + ) + else: + return VersionInferenceNoOp() diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_log.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_log.py new file mode 100644 index 0000000..ea17f37 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_log.py @@ -0,0 +1,87 @@ +""" +logging helpers, supports vendoring +""" + +from __future__ import annotations + +import contextlib +import logging +import os +import sys + +from typing import IO +from typing import Iterator +from typing import Mapping + +log = logging.getLogger(__name__.rsplit(".", 1)[0]) +log.propagate = False + + +class AlwaysStdErrHandler(logging.StreamHandler): # type: ignore[type-arg] + def __init__(self) -> None: + super().__init__(sys.stderr) + + @property + def stream(self) -> IO[str]: + return sys.stderr + + @stream.setter + def stream(self, value: IO[str]) -> None: + assert value is sys.stderr + + +def make_default_handler() -> logging.Handler: + try: + from rich.console import Console + + console = Console(stderr=True) + from rich.logging import RichHandler + + return RichHandler(console=console) + except ImportError: + last_resort = logging.lastResort + assert last_resort is not None + return last_resort + + +_default_handler = make_default_handler() + +log.addHandler(_default_handler) + + +def 
_default_log_level(_env: Mapping[str, str] = os.environ) -> int: + val: str | None = _env.get("SETUPTOOLS_SCM_DEBUG") + return logging.WARNING if val is None else logging.DEBUG + + +log.setLevel(_default_log_level()) + + +@contextlib.contextmanager +def defer_to_pytest() -> Iterator[None]: + log.propagate = True + old_level = log.level + log.setLevel(logging.NOTSET) + log.removeHandler(_default_handler) + try: + yield + finally: + log.addHandler(_default_handler) + log.propagate = False + log.setLevel(old_level) + + +@contextlib.contextmanager +def enable_debug(handler: logging.Handler = _default_handler) -> Iterator[None]: + log.addHandler(handler) + old_level = log.level + log.setLevel(logging.DEBUG) + old_handler_level = handler.level + handler.setLevel(logging.DEBUG) + try: + yield + finally: + log.setLevel(old_level) + handler.setLevel(old_handler_level) + if handler is not _default_handler: + log.removeHandler(handler) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_modify_version.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_modify_version.py new file mode 100644 index 0000000..aae41a6 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_modify_version.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +import re + +from . 
import _types as _t + + +def strip_local(version_string: str) -> str: + public = version_string.partition("+")[0] + return public + + +def _add_post(version: str) -> str: + if "post" in version: + raise ValueError( + f"{version} already is a post release, refusing to guess the update" + ) + return f"{version}.post1" + + +def _bump_dev(version: str) -> str | None: + if ".dev" not in version: + return None + + prefix, tail = version.rsplit(".dev", 1) + if tail != "0": + raise ValueError( + "choosing custom numbers for the `.devX` distance " + "is not supported.\n " + f"The {version} can't be bumped\n" + "Please drop the tag or create a new supported one ending in .dev0" + ) + return prefix + + +def _bump_regex(version: str) -> str: + match = re.match(r"(.*?)(\d+)$", version) + if match is None: + raise ValueError( + f"{version} does not end with a number to bump, " + "please correct or use a custom version scheme" + ) + else: + prefix, tail = match.groups() + return f"{prefix}{int(tail) + 1}" + + +def _format_local_with_time(version: _t.SCMVERSION, time_format: str) -> str: + if version.exact or version.node is None: + return version.format_choice( + "", "+d{time:{time_format}}", time_format=time_format + ) + else: + return version.format_choice( + "+{node}", "+{node}.d{time:{time_format}}", time_format=time_format + ) + + +def _dont_guess_next_version(tag_version: _t.SCMVERSION) -> str: + version = strip_local(str(tag_version.tag)) + return _bump_dev(version) or _add_post(version) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_node_utils.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_node_utils.py new file mode 100644 index 0000000..1a7a227 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_node_utils.py @@ -0,0 +1,46 @@ +"""Private utilities for consistent node ID handling across SCM backends.""" + +from __future__ import annotations + +# Standard node ID length used across all SCM backends +_NODE_ID_LENGTH = 
10 + + +def _slice_node_id(node_id: str) -> str: + """ + Slice a node ID to a consistent length. + + This ensures that all SCM backends (git, mercurial, archival) + return the same length node IDs for consistency. + + Args: + node_id: The full node ID/hash from the SCM + + Returns: + The node ID sliced to the standard length + """ + return node_id[:_NODE_ID_LENGTH] + + +def _format_node_for_output(node_id: str | None) -> str | None: + """ + Format a node ID for output, applying consistent slicing. + + Args: + node_id: The full node ID/hash from the SCM or None + + Returns: + The node ID sliced to standard length for output, or None if input was None + """ + if node_id is None: + return None + + # Handle mercurial nodes with 'h' prefix + if node_id.startswith("h"): + # For mercurial nodes, slice the part after 'h' and reconstruct + hg_hash = node_id[1:] # Remove 'h' prefix + sliced_hash = _slice_node_id(hg_hash) + return "h" + sliced_hash + + # For git nodes (with or without 'g' prefix) and others + return _slice_node_id(node_id) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_overrides.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_overrides.py new file mode 100644 index 0000000..4e06b7a --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_overrides.py @@ -0,0 +1,298 @@ +from __future__ import annotations + +import dataclasses +import os + +from difflib import get_close_matches +from typing import Any +from typing import Mapping + +from packaging.utils import canonicalize_name + +from . import _config +from . import _log +from . 
import version +from ._integration.toml import load_toml_or_inline_map + +log = _log.log.getChild("overrides") + +PRETEND_KEY = "SETUPTOOLS_SCM_PRETEND_VERSION" +PRETEND_KEY_NAMED = PRETEND_KEY + "_FOR_{name}" +PRETEND_METADATA_KEY = "SETUPTOOLS_SCM_PRETEND_METADATA" +PRETEND_METADATA_KEY_NAMED = PRETEND_METADATA_KEY + "_FOR_{name}" + + +def _search_env_vars_with_prefix( + prefix: str, dist_name: str, env: Mapping[str, str] +) -> list[tuple[str, str]]: + """Search environment variables with a given prefix for potential dist name matches. + + Args: + prefix: The environment variable prefix (e.g., "SETUPTOOLS_SCM_PRETEND_VERSION_FOR_") + dist_name: The original dist name to match against + env: Environment dictionary to search in + + Returns: + List of (env_var_name, env_var_value) tuples for potential matches + """ + # Get the canonical name for comparison + canonical_dist_name = canonicalize_name(dist_name) + + matches = [] + for env_var, value in env.items(): + if env_var.startswith(prefix): + suffix = env_var[len(prefix) :] + # Normalize the suffix and compare to canonical dist name + try: + normalized_suffix = canonicalize_name(suffix.lower().replace("_", "-")) + if normalized_suffix == canonical_dist_name: + matches.append((env_var, value)) + except Exception: + # If normalization fails for any reason, skip this env var + continue + + return matches + + +def _find_close_env_var_matches( + prefix: str, expected_suffix: str, env: Mapping[str, str], threshold: float = 0.6 +) -> list[str]: + """Find environment variables with similar suffixes that might be typos. 
+ + Args: + prefix: The environment variable prefix + expected_suffix: The expected suffix (canonicalized dist name in env var format) + env: Environment dictionary to search in + threshold: Similarity threshold for matches (0.0 to 1.0) + + Returns: + List of environment variable names that are close matches + """ + candidates = [] + for env_var in env: + if env_var.startswith(prefix): + suffix = env_var[len(prefix) :] + candidates.append(suffix) + + # Use difflib to find close matches + close_matches = get_close_matches( + expected_suffix, candidates, n=3, cutoff=threshold + ) + + return [f"{prefix}{match}" for match in close_matches if match != expected_suffix] + + +def read_named_env( + *, + tool: str = "SETUPTOOLS_SCM", + name: str, + dist_name: str | None, + env: Mapping[str, str] = os.environ, +) -> str | None: + """Read a named environment variable, with fallback search for dist-specific variants. + + This function first tries the standard normalized environment variable name. + If that's not found and a dist_name is provided, it searches for alternative + normalizations and warns about potential issues. 
+ + Args: + tool: The tool prefix (default: "SETUPTOOLS_SCM") + name: The environment variable name component + dist_name: The distribution name for dist-specific variables + env: Environment dictionary to search in (defaults to os.environ) + + Returns: + The environment variable value if found, None otherwise + """ + + # First try the generic version + generic_val = env.get(f"{tool}_{name}") + + if dist_name is not None: + # Normalize the dist name using packaging.utils.canonicalize_name + canonical_dist_name = canonicalize_name(dist_name) + env_var_dist_name = canonical_dist_name.replace("-", "_").upper() + expected_env_var = f"{tool}_{name}_FOR_{env_var_dist_name}" + + # Try the standard normalized name first + val = env.get(expected_env_var) + if val is not None: + return val + + # If not found, search for alternative normalizations + prefix = f"{tool}_{name}_FOR_" + alternative_matches = _search_env_vars_with_prefix(prefix, dist_name, env) + + if alternative_matches: + # Found alternative matches - use the first one but warn + env_var, value = alternative_matches[0] + log.warning( + "Found environment variable '%s' for dist name '%s', " + "but expected '%s'. Consider using the standard normalized name.", + env_var, + dist_name, + expected_env_var, + ) + if len(alternative_matches) > 1: + other_vars = [var for var, _ in alternative_matches[1:]] + log.warning( + "Multiple alternative environment variables found: %s. Using '%s'.", + other_vars, + env_var, + ) + return value + + # No exact or alternative matches found - look for potential typos + close_matches = _find_close_env_var_matches(prefix, env_var_dist_name, env) + if close_matches: + log.warning( + "Environment variable '%s' not found for dist name '%s' " + "(canonicalized as '%s'). Did you mean one of these? 
%s", + expected_env_var, + dist_name, + canonical_dist_name, + close_matches, + ) + + return generic_val + + +def _read_pretended_metadata_for( + config: _config.Configuration, +) -> dict[str, Any] | None: + """read overridden metadata from the environment + + tries ``SETUPTOOLS_SCM_PRETEND_METADATA`` + and ``SETUPTOOLS_SCM_PRETEND_METADATA_FOR_$UPPERCASE_DIST_NAME`` + + Returns a dictionary with metadata field overrides like: + {"node": "g1337beef", "distance": 4} + """ + log.debug("dist name: %s", config.dist_name) + + pretended = read_named_env(name="PRETEND_METADATA", dist_name=config.dist_name) + + if pretended: + try: + metadata_overrides = load_toml_or_inline_map(pretended) + # Validate that only known ScmVersion fields are provided + valid_fields = { + "tag", + "distance", + "node", + "dirty", + "preformatted", + "branch", + "node_date", + "time", + } + invalid_fields = set(metadata_overrides.keys()) - valid_fields + if invalid_fields: + log.warning( + "Invalid metadata fields in pretend metadata: %s. " + "Valid fields are: %s", + invalid_fields, + valid_fields, + ) + # Remove invalid fields but continue processing + for field in invalid_fields: + metadata_overrides.pop(field) + + return metadata_overrides or None + except Exception as e: + log.error("Failed to parse pretend metadata: %s", e) + return None + else: + return None + + +def _apply_metadata_overrides( + scm_version: version.ScmVersion | None, + config: _config.Configuration, +) -> version.ScmVersion | None: + """Apply metadata overrides to a ScmVersion object. + + This function reads pretend metadata from environment variables and applies + the overrides to the given ScmVersion. TOML type coercion is used so values + should be provided in their correct types (int, bool, datetime, etc.). 
+ + Args: + scm_version: The ScmVersion to apply overrides to, or None + config: Configuration object + + Returns: + Modified ScmVersion with overrides applied, or None + """ + metadata_overrides = _read_pretended_metadata_for(config) + + if not metadata_overrides: + return scm_version + + if scm_version is None: + log.warning( + "PRETEND_METADATA specified but no base version found. " + "Metadata overrides cannot be applied without a base version." + ) + return None + + log.info("Applying metadata overrides: %s", metadata_overrides) + + # Define type checks and field mappings + from datetime import date + from datetime import datetime + + field_specs: dict[str, tuple[type | tuple[type, type], str]] = { + "distance": (int, "int"), + "dirty": (bool, "bool"), + "preformatted": (bool, "bool"), + "node_date": (date, "date"), + "time": (datetime, "datetime"), + "node": ((str, type(None)), "str or None"), + "branch": ((str, type(None)), "str or None"), + # tag is special - can be multiple types, handled separately + } + + # Apply each override individually using dataclasses.replace for type safety + result = scm_version + + for field, value in metadata_overrides.items(): + if field in field_specs: + expected_type, type_name = field_specs[field] + assert isinstance(value, expected_type), ( + f"{field} must be {type_name}, got {type(value).__name__}: {value!r}" + ) + result = dataclasses.replace(result, **{field: value}) + elif field == "tag": + # tag can be Version, NonNormalizedVersion, or str - we'll let the assignment handle validation + result = dataclasses.replace(result, tag=value) + else: + # This shouldn't happen due to validation in _read_pretended_metadata_for + log.warning("Unknown field '%s' in metadata overrides", field) + + # Ensure config is preserved (should not be overridden) + assert result.config is config, "Config must be preserved during metadata overrides" + + return result + + +def _read_pretended_version_for( + config: _config.Configuration, +) -> 
version.ScmVersion | None: + """read a a overridden version from the environment + + tries ``SETUPTOOLS_SCM_PRETEND_VERSION`` + and ``SETUPTOOLS_SCM_PRETEND_VERSION_FOR_$UPPERCASE_DIST_NAME`` + """ + log.debug("dist name: %s", config.dist_name) + + pretended = read_named_env(name="PRETEND_VERSION", dist_name=config.dist_name) + + if pretended: + return version.meta(tag=pretended, preformatted=True, config=config) + else: + return None + + +def read_toml_overrides(dist_name: str | None) -> dict[str, Any]: + data = read_named_env(name="OVERRIDES", dist_name=dist_name) + return load_toml_or_inline_map(data) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_requirement_cls.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_requirement_cls.py new file mode 100644 index 0000000..9bb8846 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_requirement_cls.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +__all__ = ["Requirement", "extract_package_name"] + +try: + from packaging.requirements import Requirement + from packaging.utils import canonicalize_name +except ImportError: + from setuptools.extern.packaging.requirements import ( # type: ignore[import-not-found,no-redef] + Requirement as Requirement, + ) + from setuptools.extern.packaging.utils import ( # type: ignore[import-not-found,no-redef] + canonicalize_name as canonicalize_name, + ) + +from . import _log + +log = _log.log.getChild("requirement_cls") + + +def extract_package_name(requirement_string: str) -> str: + """Extract the canonical package name from a requirement string. + + This function uses packaging.requirements.Requirement to properly parse + the requirement and extract the package name, handling all edge cases + that the custom regex-based approach might miss. 
+ + Args: + requirement_string: The requirement string to parse + + Returns: + The package name as a string + """ + return canonicalize_name(Requirement(requirement_string).name) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_run_cmd.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_run_cmd.py new file mode 100644 index 0000000..2dff636 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_run_cmd.py @@ -0,0 +1,221 @@ +from __future__ import annotations + +import os +import shlex +import subprocess +import textwrap +import warnings + +from typing import TYPE_CHECKING +from typing import Callable +from typing import Final +from typing import Mapping +from typing import Sequence +from typing import TypeVar +from typing import overload + +from . import _log +from . import _types as _t + +if TYPE_CHECKING: + BaseCompletedProcess = subprocess.CompletedProcess[str] +else: + BaseCompletedProcess = subprocess.CompletedProcess + +# pick 40 seconds +# unfortunately github CI for windows sometimes needs +# up to 30 seconds to start a command + + +def _get_timeout(env: Mapping[str, str]) -> int: + return int(env.get("SETUPTOOLS_SCM_SUBPROCESS_TIMEOUT") or 40) + + +BROKEN_TIMEOUT: Final[int] = _get_timeout(os.environ) + +log = _log.log.getChild("run_cmd") + +PARSE_RESULT = TypeVar("PARSE_RESULT") +T = TypeVar("T") + + +class CompletedProcess(BaseCompletedProcess): + @classmethod + def from_raw( + cls, input: BaseCompletedProcess, strip: bool = True + ) -> CompletedProcess: + return cls( + args=input.args, + returncode=input.returncode, + stdout=input.stdout.strip() if strip and input.stdout else input.stdout, + stderr=input.stderr.strip() if strip and input.stderr else input.stderr, + ) + + @overload + def parse_success( + self, + parse: Callable[[str], PARSE_RESULT], + default: None = None, + error_msg: str | None = None, + ) -> PARSE_RESULT | None: ... 
+ + @overload + def parse_success( + self, + parse: Callable[[str], PARSE_RESULT], + default: T, + error_msg: str | None = None, + ) -> PARSE_RESULT | T: ... + + def parse_success( + self, + parse: Callable[[str], PARSE_RESULT], + default: T | None = None, + error_msg: str | None = None, + ) -> PARSE_RESULT | T | None: + if self.returncode: + if error_msg: + log.warning("%s %s", error_msg, self) + return default + else: + return parse(self.stdout) + + +KEEP_GIT_ENV = ( + "GIT_CEILING_DIRECTORIES", + "GIT_EXEC_PATH", + "GIT_SSH", + "GIT_SSH_COMMAND", + "GIT_AUTHOR_DATE", + "GIT_COMMITTER_DATE", +) + + +def no_git_env(env: Mapping[str, str]) -> dict[str, str]: + # adapted from pre-commit + # Too many bugs dealing with environment variables and GIT: + # https://github.com/pre-commit/pre-commit/issues/300 + # In git 2.6.3 (maybe others), git exports GIT_WORK_TREE while running + # pre-commit hooks + # In git 1.9.1 (maybe others), git exports GIT_DIR and GIT_INDEX_FILE + # while running pre-commit hooks in submodules. + # GIT_DIR: Causes git clone to clone wrong thing + # GIT_INDEX_FILE: Causes 'error invalid object ...' during commit + for k, v in env.items(): + if k.startswith("GIT_"): + log.debug("%s: %s", k, v) + return { + k: v for k, v in env.items() if not k.startswith("GIT_") or k in KEEP_GIT_ENV + } + + +def avoid_pip_isolation(env: Mapping[str, str]) -> dict[str, str]: + """ + pip build isolation can break Mercurial + (see https://github.com/pypa/pip/issues/10635) + + pip uses PYTHONNOUSERSITE and a path in PYTHONPATH containing "pip-build-env-". 
+ """ + new_env = {k: v for k, v in env.items() if k != "PYTHONNOUSERSITE"} + if "PYTHONPATH" not in new_env: + return new_env + + new_env["PYTHONPATH"] = os.pathsep.join( + [ + path + for path in new_env["PYTHONPATH"].split(os.pathsep) + if "-build-env-" not in path + ] + ) + return new_env + + +def ensure_stripped_str(str_or_bytes: str | bytes) -> str: + if isinstance(str_or_bytes, str): + return str_or_bytes.strip() + else: + return str_or_bytes.decode("utf-8", "surrogateescape").strip() + + +def run( + cmd: _t.CMD_TYPE, + cwd: _t.PathT, + *, + strip: bool = True, + trace: bool = True, + timeout: int | None = None, + check: bool = False, +) -> CompletedProcess: + if isinstance(cmd, str): + cmd = shlex.split(cmd) + else: + cmd = [os.fspath(x) for x in cmd] + cmd_4_trace = " ".join(map(_unsafe_quote_for_display, cmd)) + log.debug("at %s\n $ %s ", cwd, cmd_4_trace) + if timeout is None: + timeout = BROKEN_TIMEOUT + res = subprocess.run( + cmd, + capture_output=True, + cwd=os.fspath(cwd), + env=dict( + avoid_pip_isolation(no_git_env(os.environ)), + # os.environ, + # try to disable i18n, but still allow UTF-8 encoded text. + LC_ALL="C.UTF-8", + LANGUAGE="", + HGPLAIN="1", + ), + text=True, + encoding="utf-8", + timeout=timeout, + ) + + res = CompletedProcess.from_raw(res, strip=strip) + if trace: + if res.stdout: + log.debug("out:\n%s", textwrap.indent(res.stdout, " ")) + if res.stderr: + log.debug("err:\n%s", textwrap.indent(res.stderr, " ")) + if res.returncode: + log.debug("ret: %s", res.returncode) + if check: + res.check_returncode() + return res + + +def _unsafe_quote_for_display(item: _t.PathT) -> str: + # give better results than shlex.join in our cases + text = os.fspath(item) + return text if all(c not in text for c in " {[:") else f'"{text}"' + + +def has_command( + name: str, args: Sequence[str] = ["version"], warn: bool = True +) -> bool: + try: + p = run([name, *args], cwd=".") + if p.returncode != 0: + log.error("Command '%s' returned non-zero. 
This is stderr:", name) + log.error(p.stderr) + except OSError as e: + log.warning("command %s missing: %s", name, e) + res = False + except subprocess.TimeoutExpired as e: + log.warning("command %s timed out %s", name, e) + res = False + + else: + res = not p.returncode + if not res and warn: + warnings.warn(f"{name!r} was not found", category=RuntimeWarning) + return res + + +class CommandNotFoundError(LookupError, FileNotFoundError): + pass + + +def require_command(name: str) -> None: + if not has_command(name, warn=False): + raise CommandNotFoundError(name) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_types.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_types.py new file mode 100644 index 0000000..4f8874f --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_types.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +import os + +from typing import TYPE_CHECKING +from typing import Callable +from typing import List +from typing import Protocol +from typing import Sequence +from typing import Tuple +from typing import Union + +from setuptools import Distribution + +if TYPE_CHECKING: + import sys + + if sys.version_info >= (3, 10): + from typing import TypeAlias + else: + from typing_extensions import TypeAlias + + from . 
import version + from ._integration.pyproject_reading import PyProjectData + from ._integration.toml import InvalidTomlError + +PathT: TypeAlias = Union["os.PathLike[str]", str] + +CMD_TYPE: TypeAlias = Union[Sequence[PathT], str] + +VERSION_SCHEME: TypeAlias = Union[str, Callable[["version.ScmVersion"], str]] +VERSION_SCHEMES: TypeAlias = Union[List[str], Tuple[str, ...], VERSION_SCHEME] +SCMVERSION: TypeAlias = "version.ScmVersion" + +# Git pre-parse function types +GIT_PRE_PARSE: TypeAlias = Union[str, None] + +# Testing injection types for configuration reading +GivenPyProjectResult: TypeAlias = Union[ + "PyProjectData", "InvalidTomlError", FileNotFoundError, None +] + + +class VersionInferenceApplicable(Protocol): + """A result object from version inference decision that can be applied to a dist.""" + + def apply(self, dist: Distribution) -> None: # pragma: no cover - structural type + ... + + +class GetVersionInferenceConfig(Protocol): + """Callable protocol for the decision function used by integration points.""" + + def __call__( + self, + dist_name: str | None, + current_version: str | None, + pyproject_data: PyProjectData, + overrides: dict[str, object] | None = None, + ) -> VersionInferenceApplicable: # pragma: no cover - structural type + ... 
diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_version_cls.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_version_cls.py new file mode 100644 index 0000000..e0fe387 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_version_cls.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +from typing import Type +from typing import Union +from typing import cast + +try: + from packaging.version import InvalidVersion + from packaging.version import Version as Version +except ImportError: + from setuptools.extern.packaging.version import ( # type: ignore[import-not-found, no-redef] + InvalidVersion, + ) + from setuptools.extern.packaging.version import ( # type: ignore[no-redef] + Version as Version, + ) +from . import _log + +log = _log.log.getChild("version_cls") + + +class NonNormalizedVersion(Version): + """A non-normalizing version handler. + + You can use this class to preserve version verification but skip normalization. + For example you can use this to avoid git release candidate version tags + ("1.0.0-rc1") to be normalized to "1.0.0rc1". Only use this if you fully + trust the version tags. + """ + + def __init__(self, version: str) -> None: + # parse and validate using parent + super().__init__(version) + + # store raw for str + self._raw_version = version + + def __str__(self) -> str: + # return the non-normalized version (parent returns the normalized) + return self._raw_version + + def __repr__(self) -> str: + # same pattern as parent + return f"" + + +def _version_as_tuple(version_str: str) -> tuple[int | str, ...]: + try: + parsed_version = Version(version_str) + except InvalidVersion as e: + log.error("failed to parse version %s: %s", e, version_str) + return (version_str,) + else: + version_fields: tuple[int | str, ...] 
= parsed_version.release + if parsed_version.epoch: + version_fields = (f"{parsed_version.epoch}!", *version_fields) + if parsed_version.pre is not None: + version_fields += (f"{parsed_version.pre[0]}{parsed_version.pre[1]}",) + + if parsed_version.post is not None: + version_fields += (f"post{parsed_version.post}",) + + if parsed_version.dev is not None: + version_fields += (f"dev{parsed_version.dev}",) + + if parsed_version.local is not None: + version_fields += (parsed_version.local,) + return version_fields + + +_VersionT = Union[Version, NonNormalizedVersion] + + +def import_name(name: str) -> object: + import importlib + + pkg_name, cls_name = name.rsplit(".", 1) + pkg = importlib.import_module(pkg_name) + return getattr(pkg, cls_name) + + +def _validate_version_cls( + version_cls: type[_VersionT] | str | None, normalize: bool +) -> type[_VersionT]: + if not normalize: + if version_cls is not None: + raise ValueError( + "Providing a custom `version_cls` is not permitted when " + "`normalize=False`" + ) + return NonNormalizedVersion + # Use `version_cls` if provided, default to packaging or pkg_resources + elif version_cls is None: + return Version + elif isinstance(version_cls, str): + try: + return cast(Type[_VersionT], import_name(version_cls)) + except Exception: + raise ValueError(f"Unable to import version_cls='{version_cls}'") from None + else: + return version_cls diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/discover.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/discover.py new file mode 100644 index 0000000..e8208ca --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/discover.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +import os + +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Iterable +from typing import Iterator + +from . import _entrypoints +from . import _log +from . 
import _types as _t +from ._config import Configuration + +if TYPE_CHECKING: + from ._entrypoints import im + + +log = _log.log.getChild("discover") + + +def walk_potential_roots(root: _t.PathT, search_parents: bool = True) -> Iterator[Path]: + """ + Iterate though a path and each of its parents. + :param root: File path. + :param search_parents: If ``False`` the parents are not considered. + """ + root = Path(root) + yield root + if search_parents: + yield from root.parents + + +def match_entrypoint(root: _t.PathT, name: str) -> bool: + """ + Consider a ``root`` as entry-point. + :param root: File path. + :param name: Subdirectory name. + :return: ``True`` if a subdirectory ``name`` exits in ``root``. + """ + + if os.path.exists(os.path.join(root, name)): + if not os.path.isabs(name): + return True + log.debug("ignoring bad ep %s", name) + + return False + + +# blocked entrypints from legacy plugins +_BLOCKED_EP_TARGETS = {"setuptools_scm_git_archive:parse"} + + +def iter_matching_entrypoints( + root: _t.PathT, entrypoint: str, config: Configuration +) -> Iterable[im.EntryPoint]: + """ + Consider different entry-points in ``root`` and optionally its parents. + :param root: File path. + :param entrypoint: Entry-point to consider. + :param config: Configuration, + read ``search_parent_directories``, write found parent to ``parent``. 
+ """ + + log.debug("looking for ep %s in %s", entrypoint, root) + + for wd in walk_potential_roots(root, config.search_parent_directories): + for ep in _entrypoints.entry_points(group=entrypoint): + if ep.value in _BLOCKED_EP_TARGETS: + continue + if match_entrypoint(wd, ep.name): + log.debug("found ep %s in %s", ep, wd) + config.parent = wd + yield ep diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/fallbacks.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/fallbacks.py new file mode 100644 index 0000000..45a7535 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/fallbacks.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +import logging +import os + +from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from . import _types as _t +from . import Configuration +from .integration import data_from_mime +from .version import ScmVersion +from .version import meta +from .version import tag_to_version + +log = logging.getLogger(__name__) + +_UNKNOWN = "UNKNOWN" + + +def parse_pkginfo(root: _t.PathT, config: Configuration) -> ScmVersion | None: + pkginfo = Path(root) / "PKG-INFO" + log.debug("pkginfo %s", pkginfo) + data = data_from_mime(pkginfo) + version = data.get("Version", _UNKNOWN) + if version != _UNKNOWN: + return meta(version, preformatted=True, config=config) + else: + return None + + +def fallback_version(root: _t.PathT, config: Configuration) -> ScmVersion | None: + if config.parentdir_prefix_version is not None: + _, parent_name = os.path.split(os.path.abspath(root)) + if parent_name.startswith(config.parentdir_prefix_version): + version = tag_to_version( + parent_name[len(config.parentdir_prefix_version) :], config + ) + if version is not None: + return meta(str(version), preformatted=True, config=config) + if config.fallback_version is not None: + log.debug("FALLBACK %s", config.fallback_version) + return meta(config.fallback_version, preformatted=True, config=config) + 
return None diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/git.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/git.py new file mode 100644 index 0000000..966ab69 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/git.py @@ -0,0 +1,454 @@ +from __future__ import annotations + +import dataclasses +import logging +import os +import re +import shlex +import sys +import warnings + +from datetime import date +from datetime import datetime +from datetime import timezone +from enum import Enum +from os.path import samefile +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Callable +from typing import Sequence + +from . import Configuration +from . import _types as _t +from . import discover +from ._run_cmd import CompletedProcess as _CompletedProcess +from ._run_cmd import require_command as _require_command +from ._run_cmd import run as _run +from .integration import data_from_mime +from .scm_workdir import Workdir +from .scm_workdir import get_latest_file_mtime +from .version import ScmVersion +from .version import meta +from .version import tag_to_version + +if TYPE_CHECKING: + from . import hg_git +log = logging.getLogger(__name__) + +REF_TAG_RE = re.compile(r"(?<=\btag: )([^,]+)\b") +DESCRIBE_UNSUPPORTED = "%(describe" + +# If testing command in shell make sure to quote the match argument like +# '*[0-9]*' as it will expand before being sent to git if there are any matching +# files in current directory. 
+DEFAULT_DESCRIBE = [ + "git", + "describe", + "--dirty", + "--tags", + "--long", + "--abbrev=40", + "--match", + "*[0-9]*", +] + + +class GitPreParse(Enum): + """Available git pre-parse functions""" + + WARN_ON_SHALLOW = "warn_on_shallow" + FAIL_ON_SHALLOW = "fail_on_shallow" + FETCH_ON_SHALLOW = "fetch_on_shallow" + FAIL_ON_MISSING_SUBMODULES = "fail_on_missing_submodules" + + +def run_git( + args: Sequence[str | os.PathLike[str]], + repo: Path, + *, + check: bool = False, + timeout: int | None = None, +) -> _CompletedProcess: + return _run( + ["git", "--git-dir", repo / ".git", *args], + cwd=repo, + check=check, + timeout=timeout, + ) + + +class GitWorkdir(Workdir): + """experimental, may change at any time""" + + @classmethod + def from_potential_worktree(cls, wd: _t.PathT) -> GitWorkdir | None: + wd = Path(wd).resolve() + real_wd = run_git(["rev-parse", "--show-prefix"], wd).parse_success(parse=str) + if real_wd is None: + return None + else: + real_wd = real_wd[:-1] # remove the trailing pathsep + + if not real_wd: + real_wd = os.fspath(wd) + else: + str_wd = os.fspath(wd) + from ._compat import strip_path_suffix + + real_wd = strip_path_suffix(str_wd, real_wd) + log.debug("real root %s", real_wd) + if not samefile(real_wd, wd): + return None + + return cls(Path(real_wd)) + + def is_dirty(self) -> bool: + return run_git( + ["status", "--porcelain", "--untracked-files=no"], self.path + ).parse_success( + parse=bool, + default=False, + ) + + def get_branch(self) -> str | None: + return run_git( + ["rev-parse", "--abbrev-ref", "HEAD"], + self.path, + ).parse_success( + parse=str, + error_msg="branch err (abbrev-err)", + ) or run_git( + ["symbolic-ref", "--short", "HEAD"], + self.path, + ).parse_success( + parse=str, + error_msg="branch err (symbolic-ref)", + ) + + def get_head_date(self) -> date | None: + def parse_timestamp(timestamp_text: str) -> date | None: + if "%c" in timestamp_text: + log.warning("git too old -> timestamp is %r", timestamp_text) + return 
None + if sys.version_info < (3, 11) and timestamp_text.endswith("Z"): + timestamp_text = timestamp_text[:-1] + "+00:00" + + # Convert to UTC to ensure consistent date regardless of local timezone + dt = datetime.fromisoformat(timestamp_text) + log.debug("dt: %s", dt) + dt_utc = dt.astimezone(timezone.utc).date() + log.debug("dt utc: %s", dt_utc) + return dt_utc + + res = run_git( + [ + *("-c", "log.showSignature=false"), + *("log", "-n", "1", "HEAD"), + "--format=%cI", + ], + self.path, + ) + return res.parse_success( + parse=parse_timestamp, + error_msg="logging the iso date for head failed", + ) + + def get_dirty_tag_date(self) -> date | None: + """Get the latest modification time of changed files in the working directory. + + Returns the date of the most recently modified file that has changes, + or None if no files are changed or if an error occurs. + """ + if not self.is_dirty(): + return None + + try: + # Get list of changed files + changed_files_res = run_git(["diff", "--name-only"], self.path) + if changed_files_res.returncode != 0: + return None + + changed_files = changed_files_res.stdout.strip().split("\n") + return get_latest_file_mtime(changed_files, self.path) + + except Exception as e: + log.debug("Failed to get dirty tag date: %s", e) + return None + + def is_shallow(self) -> bool: + return self.path.joinpath(".git/shallow").is_file() + + def fetch_shallow(self) -> None: + run_git(["fetch", "--unshallow"], self.path, check=True, timeout=240) + + def node(self) -> str | None: + return run_git( + ["rev-parse", "--verify", "--quiet", "HEAD"], self.path + ).parse_success( + parse=str, + ) + + def count_all_nodes(self) -> int: + res = run_git(["rev-list", "HEAD"], self.path) + return res.stdout.count("\n") + 1 + + def default_describe(self) -> _CompletedProcess: + return run_git(DEFAULT_DESCRIBE[1:], self.path) + + +def warn_on_shallow(wd: GitWorkdir) -> None: + """experimental, may change at any time""" + if wd.is_shallow(): + 
warnings.warn(f'"{wd.path}" is shallow and may cause errors') + + +def fetch_on_shallow(wd: GitWorkdir) -> None: + """experimental, may change at any time""" + if wd.is_shallow(): + warnings.warn(f'"{wd.path}" was shallow, git fetch was used to rectify') + wd.fetch_shallow() + + +def fail_on_shallow(wd: GitWorkdir) -> None: + """experimental, may change at any time""" + if wd.is_shallow(): + raise ValueError( + f'{wd.path} is shallow, please correct with "git fetch --unshallow"' + ) + + +def fail_on_missing_submodules(wd: GitWorkdir) -> None: + """ + Fail if submodules are defined but not initialized/cloned. + + This pre_parse function checks if there are submodules defined in .gitmodules + but not properly initialized (cloned). This helps prevent packaging incomplete + projects when submodules are required for a complete build. + """ + gitmodules_path = wd.path / ".gitmodules" + if not gitmodules_path.exists(): + # No submodules defined, nothing to check + return + + # Get submodule status - lines starting with '-' indicate uninitialized submodules + status_result = run_git(["submodule", "status"], wd.path) + if status_result.returncode != 0: + # Command failed, might not be in a git repo or other error + log.debug("Failed to check submodule status: %s", status_result.stderr) + return + + status_lines = ( + status_result.stdout.strip().split("\n") if status_result.stdout.strip() else [] + ) + uninitialized_submodules = [] + + for line in status_lines: + line = line.strip() + if line.startswith("-"): + # Extract submodule path (everything after the commit hash) + parts = line.split() + if len(parts) >= 2: + submodule_path = parts[1] + uninitialized_submodules.append(submodule_path) + + # If .gitmodules exists but git submodule status returns nothing, + # it means submodules are defined but not properly set up (common after cloning without --recurse-submodules) + if not status_lines and gitmodules_path.exists(): + raise ValueError( + f"Submodules are defined in 
.gitmodules but not initialized in {wd.path}. " + f"Please run 'git submodule update --init --recursive' to initialize them." + ) + + if uninitialized_submodules: + submodule_list = ", ".join(uninitialized_submodules) + raise ValueError( + f"Submodules are not initialized in {wd.path}: {submodule_list}. " + f"Please run 'git submodule update --init --recursive' to initialize them." + ) + + +# Mapping from enum items to actual pre_parse functions +_GIT_PRE_PARSE_FUNCTIONS: dict[GitPreParse, Callable[[GitWorkdir], None]] = { + GitPreParse.WARN_ON_SHALLOW: warn_on_shallow, + GitPreParse.FAIL_ON_SHALLOW: fail_on_shallow, + GitPreParse.FETCH_ON_SHALLOW: fetch_on_shallow, + GitPreParse.FAIL_ON_MISSING_SUBMODULES: fail_on_missing_submodules, +} + + +def get_working_directory(config: Configuration, root: _t.PathT) -> GitWorkdir | None: + """ + Return the working directory (``GitWorkdir``). + """ + + if config.parent: # todo broken + return GitWorkdir.from_potential_worktree(config.parent) + + for potential_root in discover.walk_potential_roots( + root, search_parents=config.search_parent_directories + ): + potential_wd = GitWorkdir.from_potential_worktree(potential_root) + if potential_wd is not None: + return potential_wd + + return GitWorkdir.from_potential_worktree(root) + + +def parse( + root: _t.PathT, + config: Configuration, + describe_command: str | list[str] | None = None, + pre_parse: Callable[[GitWorkdir], None] | None = None, +) -> ScmVersion | None: + """ + :param pre_parse: experimental pre_parse action, may change at any time. + Takes precedence over config.git_pre_parse if provided. 
+ """ + _require_command("git") + wd = get_working_directory(config, root) + if wd: + # Use function parameter first, then config setting, then default + if pre_parse is not None: + effective_pre_parse = pre_parse + else: + # config.scm.git.pre_parse is always a GitPreParse enum instance + effective_pre_parse = _GIT_PRE_PARSE_FUNCTIONS.get( + config.scm.git.pre_parse, warn_on_shallow + ) + + return _git_parse_inner( + config, wd, describe_command=describe_command, pre_parse=effective_pre_parse + ) + else: + return None + + +def version_from_describe( + wd: GitWorkdir | hg_git.GitWorkdirHgClient, + config: Configuration, + describe_command: _t.CMD_TYPE | None, +) -> ScmVersion | None: + if config.scm.git.describe_command is not None: + describe_command = config.scm.git.describe_command + + if describe_command is not None: + if isinstance(describe_command, str): + describe_command = shlex.split(describe_command) + # todo: figure how to ensure git with gitdir gets correctly invoked + if describe_command[0] == "git": + describe_res = run_git(describe_command[1:], wd.path) + else: + describe_res = _run(describe_command, wd.path) + else: + describe_res = wd.default_describe() + + def parse_describe(output: str) -> ScmVersion: + tag, distance, node, dirty = _git_parse_describe(output) + return meta(tag=tag, distance=distance, dirty=dirty, node=node, config=config) + + return describe_res.parse_success(parse=parse_describe) + + +def _git_parse_inner( + config: Configuration, + wd: GitWorkdir | hg_git.GitWorkdirHgClient, + pre_parse: (Callable[[GitWorkdir | hg_git.GitWorkdirHgClient], None]) | None = None, + describe_command: _t.CMD_TYPE | None = None, +) -> ScmVersion: + if pre_parse: + pre_parse(wd) + + version = version_from_describe(wd, config, describe_command) + + if version is None: + # If 'git git_describe_command' failed, try to get the information otherwise. 
+ tag = config.version_cls(config.fallback_version or "0.0") + node = wd.node() + if node is None: + distance = 0 + dirty = True + else: + distance = wd.count_all_nodes() + node = "g" + node + dirty = wd.is_dirty() + version = meta( + tag=tag, distance=distance, dirty=dirty, node=node, config=config + ) + branch = wd.get_branch() + node_date = wd.get_head_date() + + # If we can't get node_date from HEAD (e.g., no commits yet), + # and the working directory is dirty, try to use the latest + # modification time of changed files instead of current time + if node_date is None and wd.is_dirty(): + dirty_date = wd.get_dirty_tag_date() + if dirty_date is not None: + node_date = dirty_date + + # Final fallback to current time + if node_date is None: + node_date = datetime.now(timezone.utc).date() + + return dataclasses.replace(version, branch=branch, node_date=node_date) + + +def _git_parse_describe( + describe_output: str, +) -> tuple[str, int, str | None, bool]: + # 'describe_output' looks e.g. like 'v1.5.0-0-g4060507' or + # 'v1.15.1rc1-37-g9bd1298-dirty'. + # It may also just be a bare tag name if this is a tagged commit and we are + # parsing a .git_archival.txt file. 
+ + if describe_output.endswith("-dirty"): + dirty = True + describe_output = describe_output[:-6] + else: + dirty = False + + split = describe_output.rsplit("-", 2) + if len(split) < 3: # probably a tagged commit + tag = describe_output + number = 0 + node = None + else: + tag, number_, node = split + number = int(number_) + return tag, number, node, dirty + + +def archival_to_version( + data: dict[str, str], config: Configuration +) -> ScmVersion | None: + node: str | None + log.debug("data %s", data) + archival_describe = data.get("describe-name", DESCRIBE_UNSUPPORTED) + if DESCRIBE_UNSUPPORTED in archival_describe: + warnings.warn("git archive did not support describe output") + else: + tag, number, node, _ = _git_parse_describe(archival_describe) + return meta( + tag, + config=config, + distance=number, + node=node, + ) + + for ref in REF_TAG_RE.findall(data.get("ref-names", "")): + version = tag_to_version(ref, config) + if version is not None: + return meta(version, config=config) + node = data.get("node") + if node is None: + return None + elif "$FORMAT" in node.upper(): + warnings.warn("unprocessed git archival found (no export subst applied)") + return None + else: + return meta("0.0", node=node, config=config) + + +def parse_archival(root: _t.PathT, config: Configuration) -> ScmVersion | None: + archival = os.path.join(root, ".git_archival.txt") + data = data_from_mime(archival) + return archival_to_version(data, config=config) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/hg.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/hg.py new file mode 100644 index 0000000..4232051 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/hg.py @@ -0,0 +1,308 @@ +from __future__ import annotations + +import datetime +import logging +import os + +from pathlib import Path +from typing import TYPE_CHECKING +from typing import Any + +from . 
import Configuration +from ._version_cls import Version +from .integration import data_from_mime +from .scm_workdir import Workdir +from .scm_workdir import get_latest_file_mtime +from .version import ScmVersion +from .version import meta +from .version import tag_to_version + +if TYPE_CHECKING: + from . import _types as _t + +from ._run_cmd import CompletedProcess +from ._run_cmd import require_command as _require_command +from ._run_cmd import run as _run + +log = logging.getLogger(__name__) + + +def _get_hg_command() -> str: + """Get the hg command from environment, allowing runtime configuration.""" + return os.environ.get("SETUPTOOLS_SCM_HG_COMMAND", "hg") + + +def run_hg(args: list[str], cwd: _t.PathT, **kwargs: Any) -> CompletedProcess: + """Run mercurial command with the configured hg executable.""" + cmd = [_get_hg_command(), *args] + return _run(cmd, cwd=cwd, **kwargs) + + +class HgWorkdir(Workdir): + @classmethod + def from_potential_worktree(cls, wd: _t.PathT) -> HgWorkdir | None: + res = run_hg(["root"], wd) + if res.returncode: + return None + return cls(Path(res.stdout)) + + def get_meta(self, config: Configuration) -> ScmVersion | None: + # TODO: support bookmarks and topics (but nowadays bookmarks are + # mainly used to emulate Git branches, which is already supported with + # the dedicated class GitWorkdirHgClient) + + node_info = self._get_node_info() + if node_info is None: + return None + + node, tags_str, node_date_str = node_info + branch_info = self._get_branch_info() + branch, dirty, dirty_date = branch_info + + # Determine the appropriate node date + node_date = self._get_node_date(dirty, node_date_str, dirty_date) + + # Handle initial/empty repository + if self._is_initial_node(node): + return self._create_initial_meta(config, dirty, branch, node_date) + + node = "h" + node + tags = self._parse_tags(tags_str) + + # Try to get version from current tags + tag_version = self._get_version_from_tags(tags, config) + if tag_version: + return 
meta(tag_version, dirty=dirty, branch=branch, config=config) + + # Fall back to distance-based versioning + return self._get_distance_based_version(config, dirty, branch, node, node_date) + + def _get_node_info(self) -> tuple[str, str, str] | None: + """Get node, tags, and date information from mercurial log.""" + try: + node, tags_str, node_date_str = self.hg_log( + ".", "{node}\n{tag}\n{date|shortdate}" + ).split("\n") + return node, tags_str, node_date_str + except ValueError: + log.exception("Failed to get node info") + return None + + def _get_branch_info(self) -> tuple[str, bool, str]: + """Get branch name, dirty status, and dirty date.""" + branch, dirty_str, dirty_date = run_hg( + ["id", "-T", "{branch}\n{if(dirty, 1, 0)}\n{date|shortdate}"], + cwd=self.path, + check=True, + ).stdout.split("\n") + dirty = bool(int(dirty_str)) + return branch, dirty, dirty_date + + def _get_node_date( + self, dirty: bool, node_date_str: str, dirty_date: str + ) -> datetime.date: + """Get the appropriate node date, preferring file modification times for dirty repos.""" + if dirty: + file_mod_date = self.get_dirty_tag_date() + if file_mod_date is not None: + return file_mod_date + # Fall back to hg id date for dirty repos + return datetime.date.fromisoformat(dirty_date) + else: + return datetime.date.fromisoformat(node_date_str) + + def _is_initial_node(self, node: str) -> bool: + """Check if this is an initial/empty repository node.""" + return node == "0" * len(node) + + def _create_initial_meta( + self, config: Configuration, dirty: bool, branch: str, node_date: datetime.date + ) -> ScmVersion: + """Create metadata for initial/empty repository.""" + log.debug("initial node %s", self.path) + return meta( + Version("0.0"), + config=config, + dirty=dirty, + branch=branch, + node_date=node_date, + ) + + def _parse_tags(self, tags_str: str) -> list[str]: + """Parse and filter tags from mercurial output.""" + tags = tags_str.split() + if "tip" in tags: + # tip is not a real tag + 
tags.remove("tip") + return tags + + def _get_version_from_tags( + self, tags: list[str], config: Configuration + ) -> Version | None: + """Try to get a version from the current tags.""" + if tags: + tag = tag_to_version(tags[0], config) + return tag + return None + + def _get_distance_based_version( + self, + config: Configuration, + dirty: bool, + branch: str, + node: str, + node_date: datetime.date, + ) -> ScmVersion | None: + """Get version based on distance from latest tag.""" + try: + tag_str = self.get_latest_normalizable_tag() + if tag_str is None: + dist = self.get_distance_revs("") + else: + dist = self.get_distance_revs(tag_str) + + if tag_str == "null" or tag_str is None: + tag = Version("0.0") + dist += 1 + else: + maybe_tag = tag_to_version(tag_str, config=config) + if maybe_tag is None: + # If tag conversion fails, treat as no tag found + tag = Version("0.0") + dist += 1 + else: + tag = maybe_tag + + if self.check_changes_since_tag(tag_str) or dirty: + return meta( + tag, + distance=dist, + node=node, + dirty=dirty, + branch=branch, + config=config, + node_date=node_date, + ) + else: + return meta(tag, config=config, node_date=node_date) + + except ValueError: + # unpacking failed, old hg + log.exception("error") + return None + + def hg_log(self, revset: str, template: str) -> str: + return run_hg( + ["log", "-r", revset, "-T", template], cwd=self.path, check=True + ).stdout + + def get_latest_normalizable_tag(self) -> str | None: + # Gets all tags containing a '.' (see #229) from oldest to newest + outlines = self.hg_log( + revset="ancestors(.) 
and tag('re:\\.')", + template="{tags}{if(tags, '\n', '')}", + ).split() + if not outlines: + return None + tag = outlines[-1].split()[-1] + return tag + + def get_distance_revs(self, rev1: str, rev2: str = ".") -> int: + revset = f"({rev1}::{rev2})" + out = self.hg_log(revset, ".") + return len(out) - 1 + + def check_changes_since_tag(self, tag: str | None) -> bool: + if tag == "0.0" or tag is None: + return True + + revset = ( + "(branch(.)" # look for revisions in this branch only + f" and tag({tag!r})::." # after the last tag + # ignore commits that only modify .hgtags and nothing else: + " and (merge() or file('re:^(?!\\.hgtags).*$'))" + f" and not tag({tag!r}))" # ignore the tagged commit itself + ) + + return bool(self.hg_log(revset, ".")) + + def get_dirty_tag_date(self) -> datetime.date | None: + """Get the latest modification time of changed files in the working directory. + + Returns the date of the most recently modified file that has changes, + or None if no files are changed or if an error occurs. + """ + try: + # Check if working directory is dirty first + res = run_hg(["id", "-T", "{dirty}"], cwd=self.path) + if res.returncode != 0 or not bool(res.stdout): + return None + + # Get list of changed files using hg status + status_res = run_hg(["status", "-m", "-a", "-r"], cwd=self.path) + if status_res.returncode != 0: + return None + + changed_files = [] + for line in status_res.stdout.strip().split("\n"): + if line and len(line) > 2: + # Format is "M filename" or "A filename" etc. 
+ filepath = line[2:] # Skip status char and space + changed_files.append(filepath) + + return get_latest_file_mtime(changed_files, self.path) + + except Exception as e: + log.debug("Failed to get dirty tag date: %s", e) + + return None + + +def parse(root: _t.PathT, config: Configuration) -> ScmVersion | None: + hg_cmd = _get_hg_command() + _require_command(hg_cmd) + if os.path.exists(os.path.join(root, ".hg/git")): + res = run_hg(["path"], root) + if not res.returncode: + for line in res.stdout.split("\n"): + if line.startswith("default ="): + path = Path(line.split()[2]) + if path.name.endswith(".git") or (path / ".git").exists(): + from .git import _git_parse_inner + from .hg_git import GitWorkdirHgClient + + wd_hggit = GitWorkdirHgClient.from_potential_worktree(root) + if wd_hggit: + return _git_parse_inner(config, wd_hggit) + + wd = HgWorkdir.from_potential_worktree(config.absolute_root) + + if wd is None: + return None + + return wd.get_meta(config) + + +def archival_to_version(data: dict[str, str], config: Configuration) -> ScmVersion: + log.debug("data %s", data) + node = data.get("node", "") + if node: + node = "h" + node + if "tag" in data: + return meta(data["tag"], config=config) + elif "latesttag" in data: + return meta( + data["latesttag"], + distance=int(data["latesttagdistance"]), + node=node, + branch=data.get("branch"), + config=config, + ) + else: + return meta(config.version_cls("0.0"), node=node, config=config) + + +def parse_archival(root: _t.PathT, config: Configuration) -> ScmVersion: + archival = os.path.join(root, ".hg_archival.txt") + data = data_from_mime(archival) + return archival_to_version(data, config=config) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/hg_git.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/hg_git.py new file mode 100644 index 0000000..3e91b20 --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/hg_git.py @@ -0,0 +1,181 @@ +from __future__ import annotations + 
+import logging +import os + +from contextlib import suppress +from datetime import date +from pathlib import Path + +from . import _types as _t +from ._run_cmd import CompletedProcess as _CompletedProcess +from .git import GitWorkdir +from .hg import HgWorkdir +from .hg import run_hg +from .scm_workdir import get_latest_file_mtime + +log = logging.getLogger(__name__) + +_FAKE_GIT_DESCRIBE_ERROR = _CompletedProcess( + "fake git describe output for hg", + 1, + "<>hg git failed to describe", +) + + +class GitWorkdirHgClient(GitWorkdir, HgWorkdir): + @classmethod + def from_potential_worktree(cls, wd: _t.PathT) -> GitWorkdirHgClient | None: + res = run_hg(["root"], cwd=wd).parse_success(parse=Path) + if res is None: + return None + return cls(res) + + def is_dirty(self) -> bool: + res = run_hg(["id", "-T", "{dirty}"], cwd=self.path, check=True) + return bool(res.stdout) + + def get_branch(self) -> str | None: + res = run_hg(["id", "-T", "{bookmarks}"], cwd=self.path) + if res.returncode: + log.info("branch err %s", res) + return None + return res.stdout + + def get_head_date(self) -> date | None: + return run_hg( + ["log", "-r", ".", "-T", "{shortdate(date)}"], cwd=self.path + ).parse_success(parse=date.fromisoformat, error_msg="head date err") + + def get_dirty_tag_date(self) -> date | None: + """Get the latest modification time of changed files in the working directory. + + Returns the date of the most recently modified file that has changes, + or None if no files are changed or if an error occurs. + """ + if not self.is_dirty(): + return None + + try: + # Get list of changed files using hg status + status_res = run_hg(["status", "-m", "-a", "-r"], cwd=self.path) + if status_res.returncode != 0: + return None + + changed_files = [] + for line in status_res.stdout.strip().split("\n"): + if line and len(line) > 2: + # Format is "M filename" or "A filename" etc. 
+ filepath = line[2:] # Skip status char and space + changed_files.append(filepath) + + return get_latest_file_mtime(changed_files, self.path) + + except Exception as e: + log.debug("Failed to get dirty tag date: %s", e) + + return None + + def is_shallow(self) -> bool: + return False + + def fetch_shallow(self) -> None: + pass + + def get_hg_node(self) -> str | None: + res = run_hg(["log", "-r", ".", "-T", "{node}"], cwd=self.path) + if res.returncode: + return None + else: + return res.stdout + + def _hg2git(self, hg_node: str) -> str | None: + with suppress(FileNotFoundError): + with open(os.path.join(self.path, ".hg/git-mapfile")) as map_items: + for item in map_items: + if hg_node in item: + git_node, hg_node = item.split() + return git_node + return None + + def node(self) -> str | None: + hg_node = self.get_hg_node() + if hg_node is None: + return None + + git_node = self._hg2git(hg_node) + + if git_node is None: + # trying again after hg -> git + run_hg(["gexport"], cwd=self.path) + git_node = self._hg2git(hg_node) + + if git_node is None: + log.debug("Cannot get git node so we use hg node %s", hg_node) + + if hg_node == "0" * len(hg_node): + # mimic Git behavior + return None + + return hg_node + + return git_node + + def count_all_nodes(self) -> int: + res = run_hg(["log", "-r", "ancestors(.)", "-T", "."], cwd=self.path) + return len(res.stdout) + + def default_describe(self) -> _CompletedProcess: + """ + Tentative to reproduce the output of + + `git describe --dirty --tags --long --match *[0-9]*` + + """ + res = run_hg( + [ + "log", + "-r", + "(reverse(ancestors(.)) and tag(r're:v?[0-9].*'))", + "-T", + "{tags}{if(tags, ' ', '')}", + ], + cwd=self.path, + ) + if res.returncode: + return _FAKE_GIT_DESCRIBE_ERROR + hg_tags: list[str] = res.stdout.split() + + if not hg_tags: + return _FAKE_GIT_DESCRIBE_ERROR + + with self.path.joinpath(".hg/git-tags").open() as fp: + git_tags: dict[str, str] = dict(line.split()[::-1] for line in fp) + + tag: str + for 
hg_tag in hg_tags: + if hg_tag in git_tags: + tag = hg_tag + break + else: + logging.warning("tag not found hg=%s git=%s", hg_tags, git_tags) + return _FAKE_GIT_DESCRIBE_ERROR + + res = run_hg(["log", "-r", f"'{tag}'::.", "-T", "."], cwd=self.path) + if res.returncode: + return _FAKE_GIT_DESCRIBE_ERROR + distance = len(res.stdout) - 1 + + node = self.node() + assert node is not None + desc = f"{tag}-{distance}-g{node}" + + if self.is_dirty(): + desc += "-dirty" + log.debug("faked describe %r", desc) + return _CompletedProcess( + ["setuptools-scm", "faked", "describe"], + returncode=0, + stdout=desc, + stderr="", + ) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/integration.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/integration.py new file mode 100644 index 0000000..b15d74a --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/integration.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +import logging +import textwrap + +from pathlib import Path + +from . 
import _types as _t + +log = logging.getLogger(__name__) + + +def data_from_mime(path: _t.PathT, content: str | None = None) -> dict[str, str]: + """return a mapping from mime/pseudo-mime content + :param path: path to the mime file + :param content: content of the mime file, if None, read from path + :rtype: dict[str, str] + + """ + + if content is None: + content = Path(path).read_text(encoding="utf-8") + log.debug("mime %s content:\n%s", path, textwrap.indent(content, " ")) + + from email.parser import HeaderParser + + parser = HeaderParser() + message = parser.parsestr(content) + data = dict(message.items()) + log.debug("mime %s data:\n%s", path, data) + return data diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/py.typed b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/scm_workdir.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/scm_workdir.py new file mode 100644 index 0000000..b3ca7aa --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/scm_workdir.py @@ -0,0 +1,54 @@ +from __future__ import annotations + +import logging + +from dataclasses import dataclass +from datetime import date +from datetime import datetime +from datetime import timezone +from pathlib import Path + +from ._config import Configuration +from .version import ScmVersion + +log = logging.getLogger(__name__) + + +def get_latest_file_mtime(changed_files: list[str], base_path: Path) -> date | None: + """Get the latest modification time of the given files. 
+ + Args: + changed_files: List of relative file paths + base_path: Base directory path to resolve relative paths + + Returns: + The date of the most recently modified file, or None if no valid files found + """ + if not changed_files or changed_files == [""]: + return None + + latest_mtime = 0.0 + for filepath in changed_files: + full_path = base_path / filepath + try: + file_stat = full_path.stat() + latest_mtime = max(latest_mtime, file_stat.st_mtime) + except OSError: + # File might not exist or be accessible, skip it + log.debug("Failed to get mtime for %s", full_path) + continue + + if latest_mtime > 0: + # Convert to UTC date + dt = datetime.fromtimestamp(latest_mtime, timezone.utc) + return dt.date() + + return None + + +@dataclass() +class Workdir: + path: Path + + def run_describe(self, config: Configuration) -> ScmVersion: + raise NotImplementedError(self.run_describe) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/version.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/version.py new file mode 100644 index 0000000..77c26dc --- /dev/null +++ b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/version.py @@ -0,0 +1,583 @@ +from __future__ import annotations + +import dataclasses +import logging +import os +import re +import warnings + +from datetime import date +from datetime import datetime +from datetime import timezone +from typing import TYPE_CHECKING +from typing import Any +from typing import Callable +from typing import Match + +from . import _entrypoints +from . import _modify_version +from ._node_utils import _format_node_for_output + +if TYPE_CHECKING: + import sys + + if sys.version_info >= (3, 10): + from typing import Concatenate + from typing import ParamSpec + else: + from typing_extensions import Concatenate + from typing_extensions import ParamSpec + + _P = ParamSpec("_P") + +from typing import TypedDict + +from . import _config +from . 
import _version_cls as _v +from ._version_cls import Version as PkgVersion +from ._version_cls import _VersionT + +log = logging.getLogger(__name__) + + +SEMVER_MINOR = 2 +SEMVER_PATCH = 3 +SEMVER_LEN = 3 + + +class _TagDict(TypedDict): + version: str + prefix: str + suffix: str + + +def _parse_version_tag( + tag: str | object, config: _config.Configuration +) -> _TagDict | None: + match = config.tag_regex.match(str(tag)) + + if match: + key: str | int = 1 if len(match.groups()) == 1 else "version" + full = match.group(0) + log.debug("%r %r %s", tag, config.tag_regex, match) + log.debug( + "key %s data %s, %s, %r", key, match.groupdict(), match.groups(), full + ) + + if version := match.group(key): + result = _TagDict( + version=version, + prefix=full[: match.start(key)], + suffix=full[match.end(key) :], + ) + + log.debug("tag %r parsed to %r", tag, result) + return result + + raise ValueError( + f'The tag_regex "{config.tag_regex.pattern}" matched tag "{tag}", ' + "however the matched group has no value." 
+ ) + else: + log.debug("tag %r did not parse", tag) + + return None + + +def callable_or_entrypoint(group: str, callable_or_name: str | Any) -> Any: + log.debug("ep %r %r", group, callable_or_name) + + if callable(callable_or_name): + return callable_or_name + + from ._entrypoints import _get_ep + + return _get_ep(group, callable_or_name) + + +def tag_to_version( + tag: _VersionT | str, config: _config.Configuration +) -> _VersionT | None: + """ + take a tag that might be prefixed with a keyword and return only the version part + """ + log.debug("tag %s", tag) + + tag_dict = _parse_version_tag(tag, config) + if tag_dict is None or not tag_dict.get("version", None): + warnings.warn(f"tag {tag!r} no version found") + return None + + version_str = tag_dict["version"] + log.debug("version pre parse %s", version_str) + + # Try to create version from base version first + try: + version: _VersionT = config.version_cls(version_str) + log.debug("version=%r", version) + except Exception: + warnings.warn( + f"tag {tag!r} will be stripped of its suffix {tag_dict.get('suffix', '')!r}" + ) + # Fall back to trying without any suffix + version = config.version_cls(version_str) + log.debug("version=%r", version) + return version + + # If base version is valid, check if we can preserve the suffix + if suffix := tag_dict.get("suffix", ""): + log.debug("tag %r includes local build data %r, preserving it", tag, suffix) + # Try creating version with suffix - if it fails, we'll use the base version + try: + version_with_suffix = config.version_cls(version_str + suffix) + log.debug("version with suffix=%r", version_with_suffix) + return version_with_suffix + except Exception: + warnings.warn(f"tag {tag!r} will be stripped of its suffix {suffix!r}") + # Return the base version without suffix + return version + + return version + + +def _source_epoch_or_utc_now() -> datetime: + if "SOURCE_DATE_EPOCH" in os.environ: + date_epoch = int(os.environ["SOURCE_DATE_EPOCH"]) + return 
datetime.fromtimestamp(date_epoch, timezone.utc) + else: + return datetime.now(timezone.utc) + + +@dataclasses.dataclass +class ScmVersion: + """represents a parsed version from scm""" + + tag: _v.Version | _v.NonNormalizedVersion + """the related tag or preformatted version""" + config: _config.Configuration + """the configuration used to parse the version""" + distance: int = 0 + """the number of commits since the tag""" + node: str | None = None + """the shortened node id""" + dirty: bool = False + """whether the working copy had uncommitted changes""" + preformatted: bool = False + """whether the version string was preformatted""" + branch: str | None = None + """the branch name if any""" + node_date: date | None = None + """the date of the commit if available""" + time: datetime = dataclasses.field(default_factory=_source_epoch_or_utc_now) + """the current time or source epoch time + only set for unit-testing version schemes + for real usage it must be `now(utc)` or `SOURCE_EPOCH` + """ + + @property + def exact(self) -> bool: + """returns true checked out exactly on a tag and no local changes apply""" + return self.distance == 0 and not self.dirty + + @property + def short_node(self) -> str | None: + """Return the node formatted for output.""" + return _format_node_for_output(self.node) + + def __repr__(self) -> str: + return ( + f"" + ) + + def format_with(self, fmt: str, **kw: object) -> str: + """format a given format string with attributes of this object""" + return fmt.format( + time=self.time, + tag=self.tag, + distance=self.distance, + node=_format_node_for_output(self.node), + dirty=self.dirty, + branch=self.branch, + node_date=self.node_date, + **kw, + ) + + def format_choice(self, clean_format: str, dirty_format: str, **kw: object) -> str: + """given `clean_format` and `dirty_format` + + choose one based on `self.dirty` and format it using `self.format_with`""" + + return self.format_with(dirty_format if self.dirty else clean_format, **kw) + + def 
format_next_version( + self, + guess_next: Callable[Concatenate[ScmVersion, _P], str], + fmt: str = "{guessed}.dev{distance}", + *k: _P.args, + **kw: _P.kwargs, + ) -> str: + guessed = guess_next(self, *k, **kw) + return self.format_with(fmt, guessed=guessed) + + +def _parse_tag( + tag: _VersionT | str, preformatted: bool, config: _config.Configuration +) -> _VersionT: + if preformatted: + # For preformatted versions, tag should already be validated as a version object + # String validation is handled in meta function before calling this + if isinstance(tag, str): + # This should not happen with enhanced meta, but kept for safety + return _v.NonNormalizedVersion(tag) + else: + # Already a version object (including test mocks), return as-is + return tag + elif not isinstance(tag, config.version_cls): + version = tag_to_version(tag, config) + assert version is not None + return version + else: + return tag + + +def meta( + tag: str | _VersionT, + *, + distance: int = 0, + dirty: bool = False, + node: str | None = None, + preformatted: bool = False, + branch: str | None = None, + config: _config.Configuration, + node_date: date | None = None, + time: datetime | None = None, +) -> ScmVersion: + parsed_version: _VersionT + # Enhanced string validation for preformatted versions + if preformatted and isinstance(tag, str): + # Validate PEP 440 compliance using NonNormalizedVersion + # Let validation errors bubble up to the caller + parsed_version = _v.NonNormalizedVersion(tag) + else: + # Use existing _parse_tag logic for non-preformatted or already validated inputs + parsed_version = _parse_tag(tag, preformatted, config) + + log.info("version %s -> %s", tag, parsed_version) + assert parsed_version is not None, f"Can't parse version {tag}" + scm_version = ScmVersion( + parsed_version, + distance=distance, + node=node, + dirty=dirty, + preformatted=preformatted, + branch=branch, + config=config, + node_date=node_date, + ) + if time is not None: + scm_version = 
dataclasses.replace(scm_version, time=time) + return scm_version + + +def guess_next_version(tag_version: ScmVersion) -> str: + version = _modify_version.strip_local(str(tag_version.tag)) + return _modify_version._bump_dev(version) or _modify_version._bump_regex(version) + + +def guess_next_dev_version(version: ScmVersion) -> str: + if version.exact: + return version.format_with("{tag}") + else: + return version.format_next_version(guess_next_version) + + +def guess_next_simple_semver( + version: ScmVersion, retain: int, increment: bool = True +) -> str: + if isinstance(version.tag, _v.Version): + parts = list(version.tag.release[:retain]) + else: + try: + parts = [int(i) for i in str(version.tag).split(".")[:retain]] + except ValueError: + raise ValueError(f"{version} can't be parsed as numeric version") from None + while len(parts) < retain: + parts.append(0) + if increment: + parts[-1] += 1 + while len(parts) < SEMVER_LEN: + parts.append(0) + return ".".join(str(i) for i in parts) + + +def simplified_semver_version(version: ScmVersion) -> str: + if version.exact: + return guess_next_simple_semver(version, retain=SEMVER_LEN, increment=False) + elif version.branch is not None and "feature" in version.branch: + return version.format_next_version( + guess_next_simple_semver, retain=SEMVER_MINOR + ) + else: + return version.format_next_version( + guess_next_simple_semver, retain=SEMVER_PATCH + ) + + +def release_branch_semver_version(version: ScmVersion) -> str: + if version.exact: + return version.format_with("{tag}") + if version.branch is not None: + # Does the branch name (stripped of namespace) parse as a version? + branch_ver_data = _parse_version_tag( + version.branch.split("/")[-1], version.config + ) + if branch_ver_data is not None: + branch_ver = branch_ver_data["version"] + if branch_ver[0] == "v": + # Allow branches that start with 'v', similar to Version. + branch_ver = branch_ver[1:] + # Does the branch version up to the minor part match the tag? 
If not it + # might be like, an issue number or something and not a version number, so + # we only want to use it if it matches. + tag_ver_up_to_minor = str(version.tag).split(".")[:SEMVER_MINOR] + branch_ver_up_to_minor = branch_ver.split(".")[:SEMVER_MINOR] + if branch_ver_up_to_minor == tag_ver_up_to_minor: + # We're in a release/maintenance branch, next is a patch/rc/beta bump: + return version.format_next_version(guess_next_version) + # We're in a development branch, next is a minor bump: + return version.format_next_version(guess_next_simple_semver, retain=SEMVER_MINOR) + + +def release_branch_semver(version: ScmVersion) -> str: + warnings.warn( + "release_branch_semver is deprecated and will be removed in the future. " + "Use release_branch_semver_version instead", + category=DeprecationWarning, + stacklevel=2, + ) + return release_branch_semver_version(version) + + +def only_version(version: ScmVersion) -> str: + return version.format_with("{tag}") + + +def no_guess_dev_version(version: ScmVersion) -> str: + if version.exact: + return version.format_with("{tag}") + else: + return version.format_next_version(_modify_version._dont_guess_next_version) + + +_DATE_REGEX = re.compile( + r""" + ^(?P + (?P[vV]?) 
+ (?P\d{2}|\d{4})(?:\.\d{1,2}){2}) + (?:\.(?P\d*))?$ + """, + re.VERBOSE, +) + + +def date_ver_match(ver: str) -> Match[str] | None: + return _DATE_REGEX.match(ver) + + +def guess_next_date_ver( + version: ScmVersion, + node_date: date | None = None, + date_fmt: str | None = None, + version_cls: type | None = None, +) -> str: + """ + same-day -> patch +1 + other-day -> today + + distance is always added as .devX + """ + match = date_ver_match(str(version.tag)) + if match is None: + warnings.warn( + f"{version} does not correspond to a valid versioning date, " + "assuming legacy version" + ) + if date_fmt is None: + date_fmt = "%y.%m.%d" + else: + # deduct date format if not provided + if date_fmt is None: + date_fmt = "%Y.%m.%d" if len(match.group("year")) == 4 else "%y.%m.%d" + if prefix := match.group("prefix"): + if not date_fmt.startswith(prefix): + date_fmt = prefix + date_fmt + + today = version.time.date() + head_date = node_date or today + # compute patch + if match is None: + # For legacy non-date tags, always use patch=0 (treat as "other day") + # Use yesterday to ensure tag_date != head_date + from datetime import timedelta + + tag_date = head_date - timedelta(days=1) + else: + tag_date = ( + datetime.strptime(match.group("date"), date_fmt) + .replace(tzinfo=timezone.utc) + .date() + ) + if tag_date == head_date: + assert match is not None + # Same day as existing date tag - increment patch + patch = int(match.group("patch") or "0") + 1 + else: + # Different day or legacy non-date tag - use patch 0 + if tag_date > head_date and match is not None: + # warn on future times (only for actual date tags, not legacy) + warnings.warn( + f"your previous tag ({tag_date}) is ahead your node date ({head_date})" + ) + patch = 0 + next_version = "{node_date:{date_fmt}}.{patch}".format( + node_date=head_date, date_fmt=date_fmt, patch=patch + ) + # rely on the Version object to ensure consistency (e.g. 
remove leading 0s) + if version_cls is None: + version_cls = PkgVersion + next_version = str(version_cls(next_version)) + return next_version + + +def calver_by_date(version: ScmVersion) -> str: + if version.exact and not version.dirty: + return version.format_with("{tag}") + # TODO: move the release-X check to a new scheme + if version.branch is not None and version.branch.startswith("release-"): + branch_ver = _parse_version_tag(version.branch.split("-")[-1], version.config) + if branch_ver is not None: + ver = branch_ver["version"] + match = date_ver_match(ver) + if match: + return ver + return version.format_next_version( + guess_next_date_ver, + node_date=version.node_date, + version_cls=version.config.version_cls, + ) + + +def get_local_node_and_date(version: ScmVersion) -> str: + return _modify_version._format_local_with_time(version, time_format="%Y%m%d") + + +def get_local_node_and_timestamp(version: ScmVersion) -> str: + return _modify_version._format_local_with_time(version, time_format="%Y%m%d%H%M%S") + + +def get_local_dirty_tag(version: ScmVersion) -> str: + return version.format_choice("", "+dirty") + + +def get_no_local_node(version: ScmVersion) -> str: + return "" + + +def postrelease_version(version: ScmVersion) -> str: + if version.exact: + return version.format_with("{tag}") + else: + return version.format_with("{tag}.post{distance}") + + +def _combine_version_with_local_parts( + main_version: str, *local_parts: str | None +) -> str: + """ + Combine a main version with multiple local parts into a valid PEP 440 version string. + Handles deduplication of local parts to avoid adding the same local data twice. 
+ + Args: + main_version: The main version string (e.g., "1.2.0", "1.2.dev3") + *local_parts: Variable number of local version parts, can be None or empty + + Returns: + A valid PEP 440 version string + + Examples: + _combine_version_with_local_parts("1.2.0", "build.123", "d20090213") -> "1.2.0+build.123.d20090213" + _combine_version_with_local_parts("1.2.0", "build.123", None) -> "1.2.0+build.123" + _combine_version_with_local_parts("1.2.0+build.123", "d20090213") -> "1.2.0+build.123.d20090213" + _combine_version_with_local_parts("1.2.0+build.123", "build.123") -> "1.2.0+build.123" # no duplication + _combine_version_with_local_parts("1.2.0", None, None) -> "1.2.0" + """ + # Split main version into base and existing local parts + if "+" in main_version: + main_part, existing_local = main_version.split("+", 1) + all_local_parts = existing_local.split(".") + else: + main_part = main_version + all_local_parts = [] + + # Process each new local part + for part in local_parts: + if not part or not part.strip(): + continue + + # Strip any leading + and split into segments + clean_part = part.strip("+") + if not clean_part: + continue + + # Split multi-part local identifiers (e.g., "build.123" -> ["build", "123"]) + part_segments = clean_part.split(".") + + # Add each segment if not already present + for segment in part_segments: + if segment and segment not in all_local_parts: + all_local_parts.append(segment) + + # Return combined result + if all_local_parts: + return main_part + "+" + ".".join(all_local_parts) + else: + return main_part + + +def format_version(version: ScmVersion) -> str: + log.debug("scm version %s", version) + log.debug("config %s", version.config) + if version.preformatted: + return str(version.tag) + + # Extract original tag's local data for later combination + original_local = "" + if hasattr(version.tag, "local") and version.tag.local is not None: + original_local = str(version.tag.local) + + # Create a patched ScmVersion with only the base 
version (no local data) for version schemes + from dataclasses import replace + + # Extract the base version (public part) from the tag using config's version_cls + base_version_str = str(version.tag.public) + base_tag = version.config.version_cls(base_version_str) + version_for_scheme = replace(version, tag=base_tag) + + main_version = _entrypoints._call_version_scheme( + version_for_scheme, + "setuptools_scm.version_scheme", + version.config.version_scheme, + ) + log.debug("version %s", main_version) + assert main_version is not None + + local_version = _entrypoints._call_version_scheme( + version, "setuptools_scm.local_scheme", version.config.local_scheme, "+unknown" + ) + log.debug("local_version %s", local_version) + + # Combine main version with original local data and new local scheme data + return _combine_version_with_local_parts( + str(main_version), original_local, local_version + ) diff --git a/Untitled.ipynb b/Untitled.ipynb index e3b4e8d..4669c05 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -149,7 +149,7 @@ { "data": { "text/html": [ - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
2BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
3CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
4DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
5EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
6FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
7GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
8HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
9IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
10JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
" + "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
2BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
3CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
4DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
5EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
6FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
7GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
8HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
9IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
10JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
11AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
12BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
13CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
14DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
15EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
16FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
17GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
18HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
19IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
20JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
21AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
22BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
23CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
24DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
25EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
26FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
27GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
28HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
29IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
30JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
31AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
32BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
33CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
34DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
35EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
36FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
37GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
38HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
39IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
40JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
" ] }, "metadata": {}, @@ -1172,8 +1172,8 @@ "output_type": "stream", "text": [ "Fill missing completed (in-place). Summary:\n", - "Column 'age': filled missing with median=35.0.\n", - "Column 'salary': filled missing with median=65000.0.\n" + "Column 'age': filled missing with median=36.5.\n", + "Column 'salary': filled missing with median=82500.0.\n" ] }, { @@ -1182,11 +1182,25 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1194,283 +1208,843 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " 
\n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", "
idemp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30.05000.030FBachelors51287.514050.255000.03000.08.5475.00
2BobNaN40.065000.0Engineering45MMasters203091.0320100.1120000.015000.09.0589.00
3CharlieEngineering35.0700000.0Sales38MBachelors101879.3015200.580000.07000.07.2370.01
4DavidHR25.048000.0DianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
5EveNaN35.065000.0Finance35FBachelors81588.013060.390000.08000.08.0485.00
6FrankEngineering28.072000.0HR50MHigh School25872.5010150.760000.04000.06.5260.01
7NaNGraceSales50.065000.042FBachelors182081.4125120.485000.07000.07.8374.00
8GraceSales45.065000.0HenryEngineering31MMasters72593.123550.295000.09000.09.1590.00
9AliceHR30.0IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
10BobNaN40.0JackSales55MHigh School301268.905250.865000.02000.05.5150.01
11CharlieEngineering35.0700000.0AliceHR30FBachelors51287.514050.255000.03000.08.5475.00
12DavidHR25.048000.0BobEngineering45MMasters203091.0320100.1120000.015000.09.0589.00
13EveNaN35.065000.0CharlieSales38MBachelors101879.3015200.580000.07000.07.2370.01
14FrankDianaEngineering28.072000.029FPhD62295.225020.097000.010000.09.6595.00
15NaNSales50.065000.0EveFinance35FBachelors81588.013060.390000.08000.08.0485.00
16GraceSales45.065000.0FrankHR50MHigh School25872.5010150.760000.04000.06.5260.01
17AliceHR30.05000.0GraceSales42FBachelors182081.4125120.485000.07000.07.8374.00
18BobNaN40.065000.0HenryEngineering31MMasters72593.123550.295000.09000.09.1590.00
19CharlieEngineering35.0700000.0IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
20DavidHR25.048000.0JackSales55MHigh School301268.905250.865000.02000.05.5150.01
21EveNaN35.065000.0AliceHR30FBachelors51287.514050.255000.03000.08.5475.00
22FrankBobEngineering28.072000.045MMasters203091.0320100.1120000.015000.09.0589.00
23NaNCharlieSales50.065000.038MBachelors101879.3015200.580000.07000.07.2370.01
24GraceSales45.065000.0DianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
25AliceHR30.05000.0EveFinance35FBachelors81588.013060.390000.08000.08.0485.00
26BobNaN40.065000.0FrankHR50MHigh School25872.5010150.760000.04000.06.5260.01
27CharlieEngineering35.0700000.0GraceSales42FBachelors182081.4125120.485000.07000.07.8374.00
28DavidHR25.048000.0HenryEngineering31MMasters72593.123550.295000.09000.09.1590.00
29EveNaN35.065000.0IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
30FrankEngineering28.072000.0JackSales55MHigh School301268.905250.865000.02000.05.5150.01
31NaNSales50.065000.0AliceHR30FBachelors51287.514050.255000.03000.08.5475.00
32GraceSales45.065000.0BobEngineering45MMasters203091.0320100.1120000.015000.09.0589.00
33AliceHR30.05000.0CharlieSales38MBachelors101879.3015200.580000.07000.07.2370.01
34BobNaN40.065000.0DianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
35CharlieEngineering35.0700000.0EveFinance35FBachelors81588.013060.390000.08000.08.0485.00
36DavidFrankHR25.048000.050MHigh School25872.5010150.760000.04000.06.5260.01
37EveNaN35.065000.0GraceSales42FBachelors182081.4125120.485000.07000.07.8374.00
38FrankHenryEngineering28.072000.031MMasters72593.123550.295000.09000.09.1590.00
39NaNSales50.065000.0IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
40GraceJackSales45.055MHigh School301268.905250.865000.0
2000.05.5150.01
" ] }, @@ -4292,7 +4866,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 5, "id": "992bf2a2-15e2-4c67-a8fc-f0ac3c3e0630", "metadata": {}, "outputs": [ @@ -4300,449 +4874,456 @@ "name": "stdout", "output_type": "stream", "text": [ - "Split completed: total=76, train=52, test=16, val=8.\n" + "Split completed: total=40, train=28, test=8, val=4.\n" ] }, { "data": { "text/html": [ - "

Train (52 rows)

\n", + "

Train (28 rows)

\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", - " \n", - "
idemp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarydepartment_Engineeringdepartment_HRdepartment_Salesdepartment_Unknown_predicted_pred_probadepartment_lblbonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
16GraceSales45.065000.00.00.01.00.0Sales[9.200406422034337e-08, 1.243137546336329e-07, 0.9999996747434211, 1.0893876019841573e-07]2
27Charlie18HenryEngineering35.0700000.01.00.00.00.0300335.891003NaN31MMasters72593.123550.295000.09000.09.1590.00
28DavidHR25.048000.00.01.00.00.0HR[9.321102887398835e-257, 1.0, 1.6922240381185897e-260, 3.545685506178999e-152]1
25AliceHR30.05000.00.01.00.00.0NaNNaNEveFinance35FBachelors81588.013060.390000.08000.08.0485.00
5EveUnknown35.065000.00.00.00.01.0Unknown[3.028517537014313e-05, 1.0663051108596776e-09, 1.6103057563599044e-05, 0.9999536107007612]29IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
9AliceHR30.05000.00.01.00.00.0NaNNaN23CharlieSales38MBachelors101879.3015200.580000.07000.07.2370.01
4David6FrankHR25.048000.00.01.00.00.0NaNNaN50MHigh School25872.5010150.760000.04000.06.5260.01
31Unknown40JackSales50.065000.00.00.01.00.0NaNNaN2
55MHigh School301268.905EveUnknown35.0250.865000.00.00.00.01.0Unknown[8.812514498750527e-08, 1.192424207762908e-07, 8.236229869877078e-08, 0.9999997102701355]32000.05.5150.01
27Charlie14DianaEngineering35.0700000.01.00.00.029FPhD62295.225020.0107701.601304NaN97000.010000.09.6595.00
28DavidHR25.048000.00.01.00.00.0104344.615975NaN1
16GraceSales45.065000.00.00.01.00.0Sales[0.0, 0.0, 0.96, 0.04]2
33AliceHR30.05000.00.01.00.00.0NaNNaN122BobEngineering45MMasters203091.0320100.1120000.015000.09.0589.00
8GraceSales45.065000.00.00.01.00.0NaNNaN232BobEngineering45MMasters203091.0320100.1120000.015000.09.0589.00
1012BobUnknown40.065000.00.00.00.01.0NaNNaNEngineering45MMasters203091.0320100.1120000.015000.09.0589.00
19Charlie34DianaEngineering35.0700000.01.00.00.029FPhD62295.225020.0NaNNaN97000.010000.09.6595.00
16Grace33CharlieSales45.065000.00.00.01.00.0111058.586633NaN2
38MBachelors101879.301520DavidHR25.048000.00.01.00.00.0HR[8.507704216209058e-08, 0.9999997345523971, 7.949637279588989e-08, 1.0087418796343624e-07]1
13EveUnknown35.065000.00.00.00.01.0Unknown[8.351973478650103e-08, 1.1257529759402385e-07, 9.123997229467953e-08, 0.9999997126649953]0.580000.07000.07.23
17AliceHR30.05000.00.01.00.00.0HR[8.507704216209058e-08, 0.9999997345523971, 7.949637279588989e-08, 1.0087418796343624e-07]70.01
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "

Validation (8 rows)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
idnamedepartmentagesalarydepartment_Engineeringdepartment_HRdepartment_Salesdepartment_Unknown_predicted_pred_probadepartment_lbl
12DavidHR25.048000.00.01.00.00.0NaNNaN7GraceSales42FBachelors182081.4125120.485000.07000.07.8374.00
5EveUnknown35.065000.00.00.00.01.0107701.601304NaN326FrankHR50MHigh School25872.5010150.760000.04000.06.5260.01
13EveUnknown35.065000.00.00.00.01.0107701.601304NaN19IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
13EveUnknown35.010JackSales55MHigh School301268.905250.865000.00.00.00.01.0300335.891003NaN32000.05.5150.01
20DavidHR25.048000.00.01.00.00.036FrankHR[8.219883955756973e-08, 0.9999997238582116, 8.980877253636851e-08, 1.0413417643748582e-07]50MHigh School25872.5010150.760000.04000.06.5260.01
121AliceHR30.05000.00.01.00.00.0NaNNaN30FBachelors51287.514050.255000.03000.08.5475.00
38Frank24DianaEngineering28.072000.01.00.00.029FPhD62295.225020.072000.0NaN97000.010000.09.6595.00
1731AliceHR30.05000.00.01.00.00.0106023.108639NaN30FBachelors51287.514050.255000.03000.08.5475.00
" @@ -4754,247 +5335,317 @@ { "data": { "text/html": [ - "

Test (16 rows)

\n", + "

Validation (4 rows)

\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", + " \n", + "
idemp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarydepartment_Engineeringdepartment_HRdepartment_Salesdepartment_Unknown_predicted_pred_probadepartment_lblbonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
15EveFinance35FBachelors81588.01306FrankEngineering28.072000.01.00.00.00.0NaNNaN0.390000.08000.08.0485.00
28DavidHR25.048000.00.01.00.00.0HR[0.01, 0.95, 0.0, 0.04]1
34BobUnknown40.065000.00.00.00.01.0NaNNaN3
18BobUnknown40.030JackSales55MHigh School301268.905250.865000.00.00.00.01.0NaNNaN32000.05.5150.01
1711AliceHR30.05000.00.01.00.00.0HR[8.219883955756973e-08, 0.9999997238582116, 8.980877253636851e-08, 1.0413417643748582e-07]30FBachelors51287.514050.255000.03000.08.5475.00
273CharlieEngineering35.0700000.01.00.00.00.0Engineering[0.9999996765912755, 1.1755477900860697e-07, 9.537254044052893e-08, 1.104814050856308e-07]Sales38MBachelors101879.30
13EveUnknown35.065000.00.00.00.01.0Unknown[8.812514498750527e-08, 1.192424207762908e-07, 8.236229869877078e-08, 0.9999997102701355]15200.580000.07000.07.23
17AliceHR30.05000.00.01.00.00.0HR[0.0, 1.0, 0.0, 0.0]70.01
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Test (8 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
38FrankEngineering28.072000.01.00.00.00.0Engineering[0.9999996765912744, 1.1755477900860684e-07, 9.537254044052883e-08, 1.104814062137575e-07]20JackSales55MHigh School301268.905250.865000.02000.05.5150.01
17GraceSales42FBachelors1820DavidHR25.048000.00.01.00.00.0HR[0.01, 0.95, 0.0, 0.04]81.4125120.485000.07000.07.8374.00
28DavidHR25.048000.00.01.00.00.016FrankHR[8.507704216209058e-08, 0.9999997345523971, 7.949637279588989e-08, 1.0087418796343624e-07]50MHigh School25872.5010150.760000.04000.06.5260.01
1627GraceSales45.065000.00.00.01.00.065000.0NaN242FBachelors182081.4125120.485000.07000.07.8374.00
24GraceSales45.065000.00.00.01.00.0NaNNaN25EveFinance35FBachelors81588.013060.390000.08000.08.0485.00
7Unknown13CharlieSales50.065000.00.00.01.00.0NaNNaN238MBachelors101879.3015200.580000.07000.07.2370.01
38FrankEngineering28.072000.01.00.00.00.0HenryEngineering[0.9991777270016182, 7.148513430183318e-113, 0.0008222729983817936, 1.9337772157312066e-34]31MMasters72593.123550.295000.09000.09.1590.00
13EveUnknown35.065000.00.00.00.01.0Unknown[0.01, 0.0, 0.0, 0.99]328HenryEngineering31MMasters72593.123550.295000.09000.09.1590.00
" @@ -5010,7 +5661,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "id": "000a2e5b-1918-4371-8b47-d3a4547a1759", "metadata": {}, "outputs": [ @@ -5027,153 +5678,447 @@ "

Train (23 rows)

\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " 
\n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", "
idemp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
28David31AliceHR25.048000.030FBachelors51287.514050.255000.03000.08.5475.00
1AliceHR30FBachelors51287.514050.255000.03000.08.5475.00
19IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
21AliceHR30FBachelors51287.514050.255000.03000.08.5475.00
15EveFinance35FBachelors81588.013060.390000.08000.08.0485.00
14DianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
24DianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
18HenryEngineering31MMasters72593.123550.295000.09000.09.1590.00
20David26FrankHR25.048000.050MHigh School25872.5010150.760000.04000.06.5260.01
3523CharlieEngineering35.0700000.0Sales38MBachelors101879.3015200.580000.07000.07.2370.01
29IvyFinance27FBachelors31085.00208GraceSales45.065000.00.670000.05000.08.2482.00
18BobUnknown40.020JackSales55MHigh School301268.905250.865000.02000.05.5150.01
2417GraceSales45.065000.042FBachelors182081.4125120.485000.07000.07.8374.00
5EveUnknown35.065000.0
33AliceHR30.05000.0Finance35FBachelors81588.013060.390000.08000.08.0485.00
3CharlieEngineering35.0700000.0Sales38MBachelors101879.3015200.580000.07000.07.2370.01
40JackSales55MHigh School30FrankEngineering28.072000.01268.905250.865000.02000.05.5150.01
17AliceHR30.039IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
22Frank28HenryEngineering28.072000.0
31MMasters7UnknownSales50.065000.02593.123550.295000.09000.09.1590.00
2BobUnknown40.065000.0Engineering45MMasters203091.0320100.1120000.015000.09.0589.00
30JackSales55MHigh School3012DavidHR25.048000.0
34BobUnknown40.068.905250.865000.0
23UnknownSales2000.05.5150.065000.0
32GraceSales45.065000.0
37EveUnknown35.065000.0
25AliceHR30.05000.01
" @@ -5188,48 +6133,132 @@ "

Validation (5 rows)

\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
idemp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
26BobUnknown40.065000.033CharlieSales38MBachelors101879.3015200.580000.07000.07.2370.01
14FrankEngineering28.072000.025EveFinance35FBachelors81588.013060.390000.08000.08.0485.00
911AliceHR30.05000.030FBachelors51287.514050.255000.03000.08.5475.00
39Unknown37GraceSales50.065000.042FBachelors182081.4125120.485000.07000.07.8374.00
15UnknownSales50.065000.034DianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
" @@ -5244,97 +6273,279 @@ "

Test (12 rows)

\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
idemp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
13CharlieSales38MBachelors10BobUnknown40.065000.01879.3015200.580000.07000.07.2370.01
16GraceSales45.065000.06FrankHR50MHigh School25872.5010150.760000.04000.06.5260.01
11Charlie38HenryEngineering35.0700000.031MMasters72593.123550.295000.09000.09.1590.00
9IvyFinance27FBachelors31085.002080.670000.05000.08.24DavidHR25.048000.082.00
38Frank22BobEngineering28.072000.045MMasters203091.0320100.1120000.015000.09.0589.00
29EveUnknown35.065000.07GraceSales42FBachelors182081.4125120.485000.07000.07.8374.00
35EveFinance35FBachelors81588.01AliceHR30.05000.03060.390000.08000.08.0485.00
40Grace10JackSales45.055MHigh School301268.905250.865000.02000.05.5150.01
31UnknownSales50.065000.016FrankHR50MHigh School25872.5010150.760000.04000.06.5260.01
27Charlie4DianaEngineering35.0700000.029FPhD62295.225020.097000.010000.09.6595.00
13EveUnknown35.065000.027GraceSales42FBachelors182081.4125120.485000.07000.07.8374.00
36DavidHR25.048000.08HenryEngineering31MMasters72593.123550.295000.09000.09.1590.00
" @@ -5571,7 +6782,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "2543a896-7047-45a7-a118-3adcfb822023", "metadata": {}, "outputs": [ @@ -5579,7 +6790,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Model 'random_forest' trained and saved to data['last_model']. problem=classification. train_rows=23\n" + "Model 'random_forest' trained and saved to data['last_model']. problem=classification. train_rows=28\n" ] } ], @@ -5679,7 +6890,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "1d90ce87-aafd-4958-8318-09f66793b98e", "metadata": {}, "outputs": [ @@ -5699,18 +6910,18 @@ " ROC AUC1.0000\n", " \n", " \n", - "
\"confusion
\n", + "
\"confusion
\n", "

Classification report

\n", "
              precision    recall  f1-score   support\n",
        "\n",
-       " Engineering       1.00      1.00      1.00         3\n",
-       "          HR       1.00      1.00      1.00         3\n",
-       "       Sales       1.00      1.00      1.00         3\n",
-       "     Unknown       1.00      1.00      1.00         3\n",
+       " Engineering       1.00      1.00      1.00         2\n",
+       "     Finance       1.00      1.00      1.00         1\n",
+       "          HR       1.00      1.00      1.00         1\n",
+       "       Sales       1.00      1.00      1.00         4\n",
        "\n",
-       "    accuracy                           1.00        12\n",
-       "   macro avg       1.00      1.00      1.00        12\n",
-       "weighted avg       1.00      1.00      1.00        12\n",
+       "    accuracy                           1.00         8\n",
+       "   macro avg       1.00      1.00      1.00         8\n",
+       "weighted avg       1.00      1.00      1.00         8\n",
        "
\n", "" ] @@ -5731,64 +6942,44 @@ " \n", " \n", " \n", - " Unknown\n", - " Unknown\n", - " [0.0, 0.0, 0.0, 1.0]\n", - " \n", - " \n", " Sales\n", " Sales\n", - " [0.0, 0.0, 0.97, 0.03]\n", - " \n", - " \n", - " Engineering\n", - " Engineering\n", - " [0.96, 0.0, 0.0, 0.04]\n", - " \n", - " \n", - " HR\n", - " HR\n", - " [0.0, 0.98, 0.0, 0.02]\n", - " \n", - " \n", - " Engineering\n", - " Engineering\n", - " [0.99, 0.01, 0.0, 0.0]\n", + " [0.01, 0.0, 0.11, 0.88]\n", " \n", " \n", - " Unknown\n", - " Unknown\n", - " [0.01, 0.0, 0.0, 0.99]\n", + " Sales\n", + " Sales\n", + " [0.0, 0.02, 0.01, 0.97]\n", " \n", " \n", " HR\n", " HR\n", - " [0.0, 1.0, 0.0, 0.0]\n", + " [0.03, 0.0, 0.96, 0.01]\n", " \n", " \n", " Sales\n", " Sales\n", - " [0.0, 0.0, 0.97, 0.03]\n", + " [0.0, 0.02, 0.01, 0.97]\n", + " \n", + " \n", + " Finance\n", + " Finance\n", + " [0.0, 0.92, 0.0, 0.08]\n", " \n", " \n", " Sales\n", " Sales\n", - " [0.0, 0.0, 1.0, 0.0]\n", + " [0.0, 0.09, 0.0, 0.91]\n", " \n", " \n", " Engineering\n", " Engineering\n", - " [0.96, 0.0, 0.0, 0.04]\n", - " \n", - " \n", - " Unknown\n", - " Unknown\n", - " [0.01, 0.0, 0.0, 0.99]\n", + " [0.98, 0.0, 0.02, 0.0]\n", " \n", " \n", - " HR\n", - " HR\n", - " [0.0, 0.98, 0.0, 0.02]\n", + " Engineering\n", + " Engineering\n", + " [0.98, 0.0, 0.02, 0.0]\n", " \n", " \n", "" @@ -5804,20 +6995,59 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 23, "id": "3e1a5300-a034-469a-abed-b50108a7f3a9", "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Model from data['last_model'] saved to ./models/test_model.joblib\n" + ] + } + ], + "source": [ + "%savemodel model_name_in_data=last_model save_path=./models/test_model.joblib overwrite=True" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "5d5a292a-f4af-47bc-9eaa-44ddd63078ec", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Loaded 
model from ./models/test_model.joblib → data['restored_model']\n" + ] + } + ], + "source": [ + "%loadmodel load_path=./models/test_model.joblib target_key=restored_model" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "788d9f75-9fe5-4bd2-b39d-1732eeee5bcd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", "output_type": "stream", "text": [ - "The %savemodel magic command does not exist\n" + "✅ Using inline feature values for prediction: {'age': 38, 'salary': 80000.0}\n", + "prediction\n", + " Sales\n", + "✅ Predictions stored in data['last_preds'] with shape=(1, 1)\n" ] } ], "source": [ - "%save_model model_name=last_model save_path=/tmp/test_model.joblib" + "%predict model_name=last_model data_name=[38,\"80000.0\"] output_name=last_preds" ] }, { diff --git a/mariadb_kernel.egg-info/PKG-INFO b/mariadb_kernel.egg-info/PKG-INFO new file mode 100644 index 0000000..b066d1e --- /dev/null +++ b/mariadb_kernel.egg-info/PKG-INFO @@ -0,0 +1,76 @@ +Metadata-Version: 2.4 +Name: mariadb_kernel +Version: 0.1.dev254+dirty +Summary: A simple MariaDB Jupyter kernel +Home-page: https://github.com/MariaDB/mariadb_kernel +Author: MariaDB Foundation +Author-email: foundation@mariadb.org +Classifier: License :: OSI Approved :: BSD License +Classifier: Programming Language :: Python :: 3 +Requires-Python: >=3.5 +Description-Content-Type: text/markdown +License-File: LICENSE +Requires-Dist: pandas +Requires-Dist: json2html +Requires-Dist: matplotlib +Requires-Dist: lxml +Requires-Dist: setuptools +Requires-Dist: setuptools-scm +Requires-Dist: ipykernel +Requires-Dist: beautifulsoup4 +Requires-Dist: mycli +Dynamic: author +Dynamic: author-email +Dynamic: classifier +Dynamic: description +Dynamic: description-content-type +Dynamic: home-page +Dynamic: license-file +Dynamic: requires-dist +Dynamic: requires-python +Dynamic: summary + +# MariaDB Jupyter Kernel + 
+[![badge](https://img.shields.io/badge/Try%20MariaDB-@%20binder-579ACA.svg?logo=)](https://mybinder.org/v2/gh/MariaDB/mariadb_kernel.git/master?urlpath=lab/tree/binder/try_it_out.ipynb) +![GitHub](https://github.com/MariaDB/mariadb_kernel/workflows/CI/badge.svg) +![badge](https://img.shields.io/badge/version-v0.2.0-yellow) + + + +`mariadb_kernel` is a an Open Source kernel for Jupyter which enables users to run MariaDB in a Jupyter notebook. + +# Quick Installation Steps + +`mariadb_kernel` has been packaged for the PyPI package manager, +packaging for `conda-forge` is coming soon. + +Assuming you already have Jupyter Lab and MariaDB installed on your system, +all you need to do is: + +1. Install the kernel +```bash +python3 -m pip install mariadb_kernel +``` +2. Install the kernelspec so that the kernel becomes visible to JupyterLab +```bash +python3 -m mariadb_kernel.install +``` + +For a more complete guide on how to install `mariadb_kernel`, check out our +[Installation docs](https://mariadb.com/kb/en/mariadb-jupyter-kernel-installation/) + +# Using the kernel +Using `mariadb_kernel` is pretty simple, please check our [Using the MariaDB Jupyter Kernel](https://mariadb.com/kb/en/using-the-mariadb-jupyter-kernel/) docs for +some quick tips on how to get started and some links to our example notebooks. + +# Documentation +To get started with `mariadb_kernel`, see the full documentation +https://mariadb.com/kb/en/the-mariadb-jupyter-kernel/ + +# Contributing +Please check the [CONTRIBUTING.md](https://github.com/MariaDB/mariadb_kernel/blob/master/CONTRIBUTING.md) file to +see our guidelines for contributing to `mariadb_kernel`, how to set up a development environment and +how to add a new magic command. + +Please note this project is still in its very early stages and we expect it to change frequently. 
diff --git a/mariadb_kernel.egg-info/SOURCES.txt b/mariadb_kernel.egg-info/SOURCES.txt new file mode 100644 index 0000000..f91c288 --- /dev/null +++ b/mariadb_kernel.egg-info/SOURCES.txt @@ -0,0 +1,96 @@ +.git_archival.txt +.gitattributes +.gitignore +.pre-commit-config.yaml +.pylintrc +CONTRIBUTING.md +Dockerfile +LICENSE +README.md +Untitled.ipynb +dev-requirements.txt +last_query.csv +requirements.txt +sample_sales_export.csv +setup.py +test.py +.github/workflows/pre-commit.yml +.github/workflows/pylint.yml +.github/workflows/tests.yml +binder/2016_gbp_usd.csv +binder/Dockerfile +binder/apt.txt +binder/mariadb_config.json +binder/postBuild +binder/requirements.txt +binder/try_it_out.ipynb +catboost_info/catboost_training.json +catboost_info/learn_error.tsv +catboost_info/time_left.tsv +catboost_info/learn/events.out.tfevents +mariadb_kernel/__init__.py +mariadb_kernel/__main__.py +mariadb_kernel/_version.py +mariadb_kernel/client_config.py +mariadb_kernel/code_parser.py +mariadb_kernel/install.py +mariadb_kernel/kernel.py +mariadb_kernel/mariadb_client.py +mariadb_kernel/mariadb_server.py +mariadb_kernel.egg-info/PKG-INFO +mariadb_kernel.egg-info/SOURCES.txt +mariadb_kernel.egg-info/dependency_links.txt +mariadb_kernel.egg-info/requires.txt +mariadb_kernel.egg-info/top_level.txt +mariadb_kernel/code_completion/__init__.py +mariadb_kernel/code_completion/autocompleter.py +mariadb_kernel/code_completion/completion_engine.py +mariadb_kernel/code_completion/introspector.py +mariadb_kernel/code_completion/sql_analyze.py +mariadb_kernel/code_completion/sql_fetch.py +mariadb_kernel/maria_magics/__init__.py +mariadb_kernel/maria_magics/bar.py +mariadb_kernel/maria_magics/cell_magic.py +mariadb_kernel/maria_magics/delimiter.py +mariadb_kernel/maria_magics/df.py +mariadb_kernel/maria_magics/help.py +mariadb_kernel/maria_magics/line.py +mariadb_kernel/maria_magics/line_magic.py +mariadb_kernel/maria_magics/load.py +mariadb_kernel/maria_magics/lsmagic.py 
+mariadb_kernel/maria_magics/magic_factory.py +mariadb_kernel/maria_magics/maria_magic.py +mariadb_kernel/maria_magics/pie.py +mariadb_kernel/maria_magics/supported_magics.py +mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py +mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py +mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py +mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py +mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py +mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py +mariadb_kernel/maria_magics/ml_commands/data_cleaning/stats.py +mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py +mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py +mariadb_kernel/maria_magics/ml_commands/data_preprocessing/splitdata.py +mariadb_kernel/maria_magics/ml_commands/data_preprocessing/standardize.py +mariadb_kernel/maria_magics/ml_commands/model_training/evaluate_model.py +mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py +mariadb_kernel/maria_magics/ml_commands/model_training/train_model.py +mariadb_kernel/tests/__init__.py +mariadb_kernel/tests/conftest.py +mariadb_kernel/tests/test_autocompleter.py +mariadb_kernel/tests/test_clientconfig.py +mariadb_kernel/tests/test_codeparser.py +mariadb_kernel/tests/test_introspector.py +mariadb_kernel/tests/test_magic_linemagic.py +mariadb_kernel/tests/test_magicfactory.py +mariadb_kernel/tests/test_magics.py +mariadb_kernel/tests/test_mariadbclient.py +mariadb_kernel/tests/test_mariadbkernel.py +mariadb_kernel/tests/test_mariadbserver.py +mariadb_kernel/tests/test_sql_fetch.py +mariadb_kernel/tests/docker/Dockerfile +notebooks/FOSSASIA Summit 2021 - NEW (MariaDB) SQL.ipynb +notebooks/covid_datasets_charts.ipynb +notebooks/fosdem_tryout.ipynb +static/lab_open.png \ No newline at end of file diff --git a/mariadb_kernel.egg-info/dependency_links.txt 
b/mariadb_kernel.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/mariadb_kernel.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/mariadb_kernel.egg-info/requires.txt b/mariadb_kernel.egg-info/requires.txt new file mode 100644 index 0000000..f95ca0d --- /dev/null +++ b/mariadb_kernel.egg-info/requires.txt @@ -0,0 +1,9 @@ +pandas +json2html +matplotlib +lxml +setuptools +setuptools-scm +ipykernel +beautifulsoup4 +mycli diff --git a/mariadb_kernel.egg-info/top_level.txt b/mariadb_kernel.egg-info/top_level.txt new file mode 100644 index 0000000..5944a0c --- /dev/null +++ b/mariadb_kernel.egg-info/top_level.txt @@ -0,0 +1 @@ +mariadb_kernel diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/loadmodel.py b/mariadb_kernel/maria_magics/ml_commands/model_training/loadmodel.py new file mode 100644 index 0000000..0442b27 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/loadmodel.py @@ -0,0 +1,78 @@ +import joblib +import shlex +import json +from distutils import util +import logging +from mariadb_kernel.maria_magics.maria_magic import MariaMagic + + +def _str_to_obj(s): + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + try: + return json.loads(s) + except Exception: + pass + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + +class LoadModel(MariaMagic): + """ + %load_model load_path=/tmp/model.joblib [target_key=last_model] + + Loads a locally saved .joblib model into the `data` dictionary. + """ + + def __init__(self, args=""): + self.args = args + self.log = logging.getLogger(__name__) + + def type(self): + return "Line" + + def name(self): + return "load_model" + + def help(self): + return "Load a saved model from a local .joblib file." 
+ + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = _str_to_obj(v) + return pairs + + def execute(self, kernel, data): + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments.") + return + + load_path = args.get("load_path") + target_key = args.get("target_key", "last_model") + + if not load_path: + kernel._send_message("stderr", "You must provide load_path=/path/to/file.joblib") + return + + try: + model_obj = joblib.load(load_path) + data[target_key] = model_obj + kernel._send_message("stdout", f"Loaded model from {load_path} → data['{target_key}']") + except Exception as e: + kernel._send_message("stderr", f"Failed to load model: {e}") diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/predict.py b/mariadb_kernel/maria_magics/ml_commands/model_training/predict.py new file mode 100644 index 0000000..629ef5d --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/predict.py @@ -0,0 +1,162 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import pandas as pd +import numpy as np +import shlex +import json +from distutils import util + + +class Predict(MariaMagic): + """ + %predict_model model_name=last_model data_name=last_select_test output_name=last_preds + [show_cols=10] [proba=True|False] + + You can also provide inline values: + %predict_model model_name=last_model data_name=[38, 80000.0] output_name=last_preds + """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "predict_model" + + def help(self): + return "Run predictions using a trained model stored in data[model_name], with optional inline feature values." 
+ + def _str_to_obj(self, s): + # try to interpret numbers, booleans, lists, or JSON + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + try: + return json.loads(s) + except Exception: + pass + # strip quotes + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def execute(self, kernel, data): + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + return + + model_name = args.get("model_name", "last_model") + data_arg = args.get("data_name", "last_select_test") + output_name = args.get("output_name", "last_preds") + show_cols = int(args.get("show_cols", 10)) + show_proba = bool(args.get("proba", False)) + + # --- 1. Retrieve model --- + model = data.get(model_name) + if model is None: + kernel._send_message("stderr", f"No model found in data['{model_name}']. Train one first.") + return + + # --- 2. Load metadata --- + meta = data.get(model_name + "_meta", {}) + features = meta.get("features") + problem = meta.get("problem", "regression") + + if not features: + kernel._send_message("stderr", "Model meta missing 'features'. Using numeric columns only if applicable.") + features = [] + + # --- 3. 
Determine input mode --- + df = None + if isinstance(data_arg, list): + # Inline list of feature values + if not features: + kernel._send_message("stderr", "Cannot use inline values: model has no stored feature names.") + return + if len(data_arg) != len(features): + kernel._send_message("stderr", f"Number of values ({len(data_arg)}) doesn't match expected features ({len(features)}): {features}") + return + df = pd.DataFrame([data_arg], columns=features) + kernel._send_message("stdout", f"Using inline feature values for prediction: {dict(zip(features, data_arg))}") + elif isinstance(data_arg, str) and data_arg.startswith("[") and data_arg.endswith("]"): + # If user passed JSON array as string, parse it + try: + vals = json.loads(data_arg) + if len(vals) != len(features): + kernel._send_message("stderr", f"Number of values ({len(vals)}) doesn't match expected features ({len(features)}): {features}") + return + df = pd.DataFrame([vals], columns=features) + kernel._send_message("stdout", f"Using inline feature values for prediction: {dict(zip(features, vals))}") + except Exception as e: + kernel._send_message("stderr", f"Error parsing inline data list: {e}") + return + else: + # DataFrame-based mode + df = data.get(data_arg) + if df is None or df.empty: + kernel._send_message("stderr", f"No DataFrame found in data['{data_arg}'] or it's empty.") + return + + # --- 4. Align columns to features --- + df_cols = df.columns.tolist() + missing = [c for c in features if c not in df_cols] + extra = [c for c in df_cols if c not in features] + + if missing: + kernel._send_message("stderr", f"Missing columns not in input: {missing}. Filling with zeros.") + if extra: + kernel._send_message("stderr", f"Ignoring extra columns not seen during training: {extra}.") + + X = pd.DataFrame({col: df[col] if col in df.columns else 0 for col in features}) + + # --- 5. 
Run predictions --- + try: + if show_proba and problem == "classification" and hasattr(model, "predict_proba"): + preds = model.predict_proba(X) + if hasattr(model, "classes_"): + class_labels = [str(c) for c in model.classes_] + pred_df = pd.DataFrame(preds, columns=[f"proba_{c}" for c in class_labels]) + else: + pred_df = pd.DataFrame(preds, columns=[f"proba_{i}" for i in range(preds.shape[1])]) + else: + y_pred = model.predict(X) + pred_df = pd.DataFrame(y_pred, columns=["prediction"]) + except Exception as e: + kernel._send_message("stderr", f"Error during prediction: {e}") + return + + # --- 6. Save & display --- + data[output_name] = pred_df + + try: + kernel._send_html(pred_df.head(show_cols), title=f"Predictions ({output_name})") + except Exception: + kernel._send_message("stdout", pred_df.head(show_cols).to_string(index=False)) + + kernel._send_message( + "stdout", + f"Predictions stored in data['{output_name}'] with shape={pred_df.shape}" + ) diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py b/mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py index 01d0667..d7dfd9c 100644 --- a/mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py @@ -1,341 +1,90 @@ -import os -import tempfile -import datetime -import pickle import joblib import shlex import json +import time from distutils import util - +import logging from mariadb_kernel.maria_magics.maria_magic import MariaMagic + +def _str_to_obj(s): + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + try: + return json.loads(s) + except Exception: + pass + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + class SaveModel(MariaMagic): """ - %savemodel [model_name=last_model] 
[save_path=/path/to/model.joblib] - [db_table=] [db_conn_key=mariadb_conn] [db_uri=] - [db_host=...] [db_user=...] [db_password=...] [db_name=...] - [overwrite=True|False] [auto_db=True] - - Save a trained model (stored in data[model_name]) to disk or to a MariaDB table as a BLOB. + %save_model model_name_in_data=last_model save_path=/tmp/model.joblib [overwrite=True|False] - This magic will attempt to automatically detect the active DB connection from: - - common keys in `data`: mariadb_conn, db_conn, conn, connection, engine, sqlalchemy_engine - - attributes on the `kernel` object with the same names - - a connection info dict in data/kernel (e.g. connection_info) + Saves a trained model (from the `data` dict) to a local file using joblib. """ + def __init__(self, args=""): self.args = args + self.log = logging.getLogger(__name__) def type(self): return "Line" def name(self): - return "savemodel" + return "save_model" def help(self): - return "Save trained model to disk or MariaDB storage (auto-detects active DB connection if possible)." - - def _str_to_obj(self, s): - try: - return int(s) - except Exception: - pass - try: - return float(s) - except Exception: - pass - try: - return bool(util.strtobool(s)) - except Exception: - pass - try: - return json.loads(s) - except Exception: - pass - if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): - return s[1:-1] - return s + return "Save a trained model to a local .joblib file." def parse_args(self, input_str): if not input_str or input_str.strip() == "": return {} pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) for k, v in pairs.items(): - pairs[k] = self._str_to_obj(v) + pairs[k] = _str_to_obj(v) return pairs - def _detect_connection(self, kernel, data, preferred_key="mariadb_conn"): - """ - Try multiple strategies to obtain a DB-API connection or SQLAlchemy engine. 
- Returns tuple (conn_obj, cursor_factory, created_conn_bool, info_dict) - - conn_obj: DB-API connection or SQLAlchemy engine/raw_connection - - cursor_factory: callable to obtain a cursor from conn_obj (conn_obj.cursor) - - created_conn_bool: whether this method created the connection (so caller can close it) - - info_dict: dict with connection metadata (e.g., database name) if found - """ - # 1) check data for common keys - keys = [preferred_key, "db_conn", "mariadb_connection", "conn", "connection", "engine", "sqlalchemy_engine", "connection_info"] - for k in keys: - if k in data and data[k] is not None: - obj = data[k] - # SQLAlchemy engine - try: - from sqlalchemy.engine.base import Engine as _Engine # type: ignore - except Exception: - _Engine = None - if _Engine is not None and isinstance(obj, _Engine): - try: - raw_conn = obj.raw_connection() - return raw_conn, (lambda c: c.cursor()), True, {"source": f"data['{k}'] (sqlalchemy engine)"} - except Exception: - pass - # DB-API connection-like - if hasattr(obj, "cursor") and hasattr(obj, "commit"): - return obj, (lambda c: c.cursor()), False, {"source": f"data['{k}'] (db-api conn)"} - # SQLAlchemy connection object (Connection) - if hasattr(obj, "connection"): - try: - raw_conn = obj.connection - return raw_conn, (lambda c: c.cursor()), True, {"source": f"data['{k}'] (sqlalchemy raw connection)"} - except Exception: - pass - # a plain dict of connection params - if isinstance(obj, dict): - return None, None, False, {"conn_params": obj, "source": f"data['{k}'] (params dict)"} - - # 2) check kernel attributes for same keys - for k in keys + ["mariadb_conn", "db_conn", "connection", "conn", "engine", "sqlalchemy_engine", "current_database", "current_db", "_last_use_db", "connection_info"]: - if hasattr(kernel, k): - obj = getattr(kernel, k) - if obj is None: - continue - try: - from sqlalchemy.engine.base import Engine as _Engine # type: ignore - except Exception: - _Engine = None - if _Engine is not None and 
isinstance(obj, _Engine): - try: - raw_conn = obj.raw_connection() - return raw_conn, (lambda c: c.cursor()), True, {"source": f"kernel.{k} (sqlalchemy engine)"} - except Exception: - pass - if hasattr(obj, "cursor") and hasattr(obj, "commit"): - return obj, (lambda c: c.cursor()), False, {"source": f"kernel.{k} (db-api conn)"} - if isinstance(obj, dict): - return None, None, False, {"conn_params": obj, "source": f"kernel.{k} (params dict)"} - - # 3) try to read a small connection-info dict from common locations - for info_key in ("connection_info", "conn_info", "db_info"): - if info_key in data and isinstance(data[info_key], dict): - return None, None, False, {"conn_params": data[info_key], "source": f"data['{info_key}']"} - if hasattr(kernel, info_key): - obj = getattr(kernel, info_key) - if isinstance(obj, dict): - return None, None, False, {"conn_params": obj, "source": f"kernel.{info_key}"} - - # 4) nothing found - return None, None, False, {} - def execute(self, kernel, data): try: args = self.parse_args(self.args) except Exception: - kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + kernel._send_message("stderr", "Error parsing arguments.") return - model_name = args.get("model_name", args.get("model", "last_model")) - save_path = args.get("save_path", None) - db_table = args.get("db_table", None) - db_conn_key = args.get("db_conn_key", "mariadb_conn") - db_uri = args.get("db_uri", None) + model_key = args.get("model_name_in_data", "last_model") + save_path = args.get("save_path") overwrite = bool(args.get("overwrite", False)) - auto_db = bool(args.get("auto_db", True)) - - # optional explicit connection details (fallback) - db_host = args.get("db_host") - db_user = args.get("db_user") - db_password = args.get("db_password") - db_name = args.get("db_name") - model = data.get(model_name) - if model is None: - kernel._send_message("stderr", f"No model found in data['{model_name}']. 
Train and save a model first.") + if not save_path: + kernel._send_message("stderr", "You must provide save_path=/path/to/file.joblib") return - did_something = False - - # Save to disk if requested - if save_path: - try: - os.makedirs(os.path.dirname(save_path), exist_ok=True) - joblib.dump(model, save_path) - kernel._send_message("stdout", f"Model saved to {save_path}") - did_something = True - except Exception as e: - kernel._send_message("stderr", f"Failed to save model to disk ({save_path}): {e}") - - # If user asked to save to DB, attempt detection and insert - if db_table: - # serialize model to bytes - try: - model_bytes = pickle.dumps(model, protocol=pickle.HIGHEST_PROTOCOL) - except Exception as e: - kernel._send_message("stderr", f"Failed to serialize model with pickle: {e}") - model_bytes = None - - if model_bytes is None: - kernel._send_message("stderr", "Model serialization failed; cannot save to DB.") - else: - conn_obj, cursor_factory, created_conn, info = (None, None, False, {}) - # If db_uri explicitly provided, prefer it (SQLAlchemy) - if db_uri: - try: - from sqlalchemy import create_engine, text - engine = create_engine(db_uri) - raw_conn = engine.raw_connection() - conn_obj = raw_conn - cursor_factory = (lambda c: c.cursor()) - created_conn = True - info = {"source": "db_uri"} - except Exception as e: - kernel._send_message("stderr", f"Could not connect via db_uri: {e}") - conn_obj = None - - # If auto_db requested, attempt to detect connection from kernel/data - if conn_obj is None and auto_db: - detected_conn, cursor_factory, created_conn_flag, info = self._detect_connection(kernel, data, preferred_key=db_conn_key) - conn_obj = detected_conn - created_conn = created_conn_flag - - # If detection returned connection params dict, try to open via mariadb connector - conn_params = info.get("conn_params") if isinstance(info, dict) else None - if conn_obj is None and conn_params: - try: - import mariadb - # rename keys if necessary - host = 
conn_params.get("host") or conn_params.get("db_host") or conn_params.get("hostaddr") - user = conn_params.get("user") or conn_params.get("username") - password = conn_params.get("password") or conn_params.get("passwd") or conn_params.get("db_password") - database = conn_params.get("database") or conn_params.get("db_name") or conn_params.get("schema") - conn_obj = mariadb.connect(host=host, user=user, password=password or "", database=database) - cursor_factory = (lambda c: c.cursor()) - created_conn = True - info["source_detail"] = "opened via mariadb from conn_params" - except Exception as e: - kernel._send_message("stderr", f"Failed to open mariadb connection from params: {e}") - conn_obj = None - - # If nothing found yet but explicit host/user provided on command line, try them - if conn_obj is None and db_host and db_user and db_name: - try: - import mariadb - conn_obj = mariadb.connect(host=db_host, user=db_user, password=db_password or "", database=db_name) - cursor_factory = (lambda c: c.cursor()) - created_conn = True - info = {"source": "db_host/db_user arguments"} - except Exception as e: - kernel._send_message("stderr", f"Could not connect using provided db_host/db_user/db_name: {e}") - conn_obj = None - - # Final check: if conn_obj is still None, return helpful error - if conn_obj is None: - kernel._send_message("stderr", "No usable DB connection detected. Provide one via:\n" - " - data['mariadb_conn'] (DB-API connection), or\n" - " - data['engine'] (SQLAlchemy engine), or\n" - " - db_uri=..., or\n" - " - db_host/db_user/db_name arguments.\n" - "Set auto_db=False to suppress detection and provide explicit params.") - else: - # We have a connection-like object (conn_obj) and a cursor factory. 
- inserted = False - created_local_conn = created_conn - try: - # Try to obtain a cursor - try: - cursor = cursor_factory(conn_obj) - except Exception: - # fallback: try conn_obj.cursor() - try: - cursor = conn_obj.cursor() - except Exception as e: - raise RuntimeError(f"Could not obtain cursor from connection: {e}") - - # Ensure table exists (simple create) - try: - create_sql = f""" - CREATE TABLE IF NOT EXISTS `{db_table}` ( - id BIGINT AUTO_INCREMENT PRIMARY KEY, - model_name VARCHAR(255), - created_at DATETIME, - model_blob LONGBLOB - ) - """ - try: - cursor.execute(create_sql) - except Exception: - # some drivers need different execution path (SQLAlchemy) - try: - conn_obj.execute(create_sql) - except Exception: - pass - except Exception: - pass - - # If overwrite requested, delete previous with same model_name - now = datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") - try: - if overwrite: - try: - cursor.execute(f"DELETE FROM `{db_table}` WHERE model_name=%s", (model_name,)) - except Exception: - try: - cursor.execute(f"DELETE FROM `{db_table}` WHERE model_name=:%s", (model_name,)) - except Exception: - pass - except Exception: - pass - - # Insert; adapt paramstyle if necessary - insert_sql = f"INSERT INTO `{db_table}` (model_name, created_at, model_blob) VALUES (%s, %s, %s)" - try: - cursor.execute(insert_sql, (model_name, now, model_bytes)) - except Exception: - # try SQLAlchemy style named params - try: - cursor.execute(insert_sql.replace("%s", ":blob"), {"blob": model_bytes, "model_name": model_name, "created_at": now}) - except Exception as e: - # last resort: use execute with binary literal (unsafe for special bytes) -- avoid - raise - - # commit if method available - try: - conn_obj.commit() - except Exception: - pass - - kernel._send_message("stdout", f"Model stored into DB table '{db_table}' (model_name='{model_name}'). 
source={info.get('source') or info.get('source_detail', 'detected')}") - inserted = True - did_something = True - except Exception as e: - kernel._send_message("stderr", f"Failed to insert model into table '{db_table}': {e}") - finally: - # close created connections only - try: - if created_local_conn and conn_obj: - try: - cursor.close() - except Exception: - pass - try: - conn_obj.close() - except Exception: - pass - except Exception: - pass + model_obj = data.get(model_key) + if model_obj is None: + kernel._send_message("stderr", f"No model found in data['{model_key}'].") + return - if not inserted: - kernel._send_message("stderr", f"Model was not inserted into DB table '{db_table}'.") + # If file exists and overwrite=False + import os + if os.path.exists(save_path) and not overwrite: + kernel._send_message("stderr", f"File {save_path} already exists. Use overwrite=True to replace it.") + return - if not did_something: - kernel._send_message("stderr", "No action taken. Provide save_path and/or db_table to save the model.") - return + try: + joblib.dump(model_obj, save_path) + kernel._send_message("stdout", f"Model from data['{model_key}'] saved to {save_path}") + except Exception as e: + kernel._send_message("stderr", f"Failed to save model: {e}") diff --git a/mariadb_kernel/maria_magics/supported_magics.py b/mariadb_kernel/maria_magics/supported_magics.py index d67659c..aadd450 100644 --- a/mariadb_kernel/maria_magics/supported_magics.py +++ b/mariadb_kernel/maria_magics/supported_magics.py @@ -27,6 +27,8 @@ from mariadb_kernel.maria_magics.ml_commands.model_training.savemodel import SaveModel from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.select_features import SelectFeatures from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.select_model import SelectModel +from mariadb_kernel.maria_magics.ml_commands.model_training.loadmodel import LoadModel +from mariadb_kernel.maria_magics.ml_commands.model_training.predict import Predict def get(): return 
{ @@ -51,6 +53,8 @@ def get(): "train_model": TrainModel, "evaluate_model": EvaluateModel, "savemodel": SaveModel, + "loadmodel": LoadModel, + "predict": Predict, "select_features": SelectFeatures, "select_model": SelectModel, } diff --git a/models/test_model.joblib b/models/test_model.joblib new file mode 100644 index 0000000000000000000000000000000000000000..68a9ce5ba7ea4c901bd1a2e4d0cabbea894c107f GIT binary patch literal 146537 zcmeHw3!Gh5b@wHNS0E&T5D-zuX__W!gz!cgxQ9WE7>XkGfg|_lLW}y@Z|}4Af3LIFxqD`xdu}M+ zkNM@7b@yZK=h^?Y_S&!IBd?UDP+ZcMlV7{=!*3i-?^{^A`8i`cCT|w{U)U*V0)F7cZE%v@4C;H*-Yq*sQOq(m8!Icj`UB zdVX@={BvtPUFR&BHK)g9)0GC!p!!!Ty?fcT&zU!?XPK$CNq%qZwsE?;7tKw5_F9rQ zMAy=uSv~VijQwn?^A|2&W=ej6Db}1N^JXob*Oy5$t-j0HFzhjB&f>0F3l`UAWg?AU zxTt$k&!X=6bGqi3=02@=Y?o`pu20NcVwxyhXE3wQKvG*1Ma@clIJvlbMy?QJ`}U4p`fbIh07*E{Ys>pXGEl35pg&{SY?_5<8~ z>9W~rQ=67bx%zr{>z+Hy#x-AQ?=DMgvlg4$jJU-7UbJMc{h)R_$^4$%bAj=%y%oPq z+!60Nw(s)KUe@=%zR5Fp?%nCH`%XLUw0Hlv`Ah$sS?%eY-8*X5f(45{*|l(?X~)ZZ z-$><5b2FVg|J?3{^SXPwW?#@_KAM?hm(5_(JX57ZuIxey_wM$CAngt_q(*i5an?8Upx#m4U5l3WEMC^Lw9AgI-Q5`P>Z?s5 zK&VZt9a@_{bEGNW;k6@bN7mj}>$tpjylac^A*ZIch0Uyyw%GIJZUZ*+4DfWHYrdLy z&hMVpJ;$`phfpB+dwGptT9V=yse@Wfz09NjzO9PxQ~ zsL#7%zJ*%HXQ{8o+Rr}w=NiC~?N#?9A-a2V<&T_C?d0%>zU$IC_PaBmc+C`Nex^9J z_sOwl3v<%$BVC$(mzpuzJ30%}>F2kL6*LAl_K||e`WmrerH#qTw2b^VcknrE)M1`2SlO1|S-F!wz_wM|GqmN{8slENQ z$-Sc%%sZ#2Yff$cg1LRYdoD=@lymLopwYl~PtAy=+9DfgcgUN*UNchD5&(>Q_a_NA zZ?5?wWyn4Y=NqZreQwvei@JIi8L4(oPhU@3k&nmp!pzF0eYGXEi)QX_=u$(+8roAk zedYm%E;F>v&`%iu$%cN?xThO>zD;@AVWPqUOKtzoJoeJQWi!oJJ9BLI)7vHe>|Co? 
zMz~J^ZtGnBng2+(FBx_AX?Jd`j&b2cIubau->&`r2j}6!S0J9#ft<4~(f;@j;7_z8 zd3o|#Us(Clmg-(2;K&5d*!*Ss2YLQoTtd4#nyr8Gk1r*C-Cbw?bmr65-H~BRrzUX5 z<}d3%zA!(^N%?a7T{+#Aq<7a(%v!L_AgLZ4rr$?i1}Sm6ABnqC$-g}D6!Y&9hco?E zxNljw;YSNMtOEX^x{i8Ws$BgK_HTXlob+$4ir=r?%!dTWA9vHf$KAA9#j8}-U;a<) z%)gvJ_=qAsP#J1nS6<@$md&mmSbL-Szo1Ci<;5f3Tv8-(qwn>*EU3G?mjIGtNBA_f@j7^Xn#?dI}f$~^qa^=g(l7sH~_(Atwt z5A7kxlZ%`r$qC1e?4I`NP1W55|D*&?v;$}e?_~RM`GGS?ALX*?TfCiq{lMV(+n=n; z@-bc^U(itg*>o!v|D9EUV|$=NS(ZJ&ex`hj<0aae z_!+OSoZ@Oy>(!rq04PVJs44BXu=t~=s=!B0(N09U-QoZK^lxL8=r^k0xxxP3=PBa7 zbN`R_tUmdcYD)Hh%2R|(n$L%De5J$lm9D&719+O}r}EwH<5M}G^KQyN&nL^JU5F2= z!<&HDlYevQ;#S4$kp{2#$l>)KMesU{3l?vdTJ`S!!P0mw@rv;h<5=Pe`n@Ch&0lGr+B=DJR$#SUS0r4>ftrZo7Gi` zr;~mDvE`TYqEeV2<)iXXEX*(4kMj@X1wt@@mZ^ z_HR>_DD>1y<<05e9g07XE0q6MkAuho_y}Cnc!zkB=F_L}kiX>4yS7(L>!*i5c~V)E z#cNhSwRqQ{ev0v&{jTkJz5N=ip>D12Cpx3Uk9UkiIi6hPq%--$otK<4di9p--h%%y z&p#yIE$>KD_4MOUs`+Glg5+ZH0{IYM9`Ar#NH?WMza{s8(k~D2aOEA2F|A>GATa zzqfyDs~=APo~RO@KIFp*4+ou|`k41ePY@jr_XZpvsN`l>D4mc$bJ8!Q530lA4yC94 zwHy3R&`&?~(cjH&W&N~4ysoPsE~5xKVo_z`QnlP|-` z8IF%%l7{P7TEBk#`ENR-Rq@)|f5+tP31edH2^g=UACk`Gv#WmF|Dl7oR>zBgc)Wi6 zYm1K`_4sBrpPrZ?xyHsTe;>dh;4aclsgXAap8u1hs?#%8bFY{--jO9muR6g!M*oiRz z%En7}eI>cJ`@u1v>(ln6!@a!1IwIBwviw%wLSH~TQU1-7x3TrdcS8BIz7WeV`;X^0 zVR`oZ>-~^TJ`>f|fIMIAQG8`_%1BE}+((>|- zcfPc&RpsTbjmt~m4!5-VwPHq+nxmF%t?nxV^7be>o)Nhtd4By{7SDOZ7PUX)=marh z@j~J?%PYuB;4acl>57w+2?G;fxUFlWnx7rv@fLCx`a%|OtvznUxo3TD5^4s~*h2y_FW8;O7s`b+GzWlN6FZ-AA zI+j0Mu14lJ=x1ub;`)XxKl4M*LS9q)?*HW7!S!a;C$|1F|KjV#$d5|K_*hn65`J$` z>ks{YJ;W^XnkC+iZvMpcD9o!wdu7(~;eYl?7mOY}YD;zU_r7}jrN6qt(VsYaU-eqM zi{|pDsct;m*r=94>=2cC0l+guhVu~ zu|>)2>AwDumv!|8vwriTukY3I)YZEE+r7MmoP|D+&ELw~b(dKK!^2A69_0CB`%kt% z%d1%aY`Gedx00XrjoAG0eWm?TVSOhouSQ4B@33AEdm!AHhfsd~ZhrMcZ@(k|(El1w zf9~cn{ek%m($Rdw`6K*Em+to+UEwJ2C&l~{^D)f(kOJf(+*JSP3zj!jexwSj<7qeI zHDUef(!tZGx2n9^0sC2hXRMhAKHu#9@2sB!hk$q7)8?0%AD(~J2mkTmKiR7GzaQiA z4)O-LnZ*^0*GJ!W?DVr98dUbw{QL~KE%Hw~lMkc=V`ETWLQX?&NdCmnkEFce`p4p3 
zwqBLWt1mCnPgFm~+t~7>pBm+t`E&ilcptZ4fXdUa)_t(gs$bz<9+!^&>9c>a=)yJC zxo_F?%zHb(>FC-j{L|@4-{We2a$gnx_+AKqAx|E2`(b(hf%yf|qVo#mk}p3(>QlrA ztsg7z%kfLw)6N|6PY1O!UN<;Dka&U~=t#DXw>!MH`{w5d_2tD$$*pg` zWka$-+C3{)^2Y{Y>ex zyo}{Xe#Fn?J(gd#?<7|*V=tIODOMUh?}l}K*h4~ikNv|$pU$1v(Dibq$`PCg9{T(* zw@)sC)B|#0i^uo+@j~egXP*l3>0A&GREIagp7WWT4nMV3@p@;oe$XIZ19wD=C!NVx z?D6EYwSFF6KM?$t4!?dN$J^ut@nXko2rqzRb@AHl&q~eb2d`E7CEL$q^G7~Z?&9_g ziB}b0f3yp5Hyh96?eeFUJ!-n=k1c=e|X}zzTL0n4evLI&0qEp z<83UzEI+qjS-ggRPI~1kXV)^26V4~2$BB0M`)Bn2D8!2$ zZxZi#y$^b8J-o8^^U3>7dio33==J8Ky?%;xfd^TOLFkd486ajto^h2d|Kia+XEWI8a+T6&c{AjG(8?(Z#UH53_T%z5 zR)5B~vixCr6L)@s>euh&LVn$KxO3Uq&q?&b+&)f9zfzI=PI(;_;Y7Jzq`!9Qc-pPw z!<~;;|C}K9y@t*UayrIwyegJz4^mpusgnb41 zNUHH;AZIV3b8f3WUJ-s_-y)xblj&*~=R@UVoCp3(4vsJVLoUPz&BrTk581El;y-C+ zylc>Yjq#Ah3#%Vueoz+gw-%_vz~G>mSySGjjj)JezF{u z3igw0f7s2nzjSYR=cajI3c~B}PeCXs$^{C4rls-v;$Oe7)~a}I*Bcw$k60Hk3|`{? z^x^C|63<+JEk8b){oIn@eR?mwfBTRGJqjF=c#crE$H?&FcqzwwS-fDpg`AOi>dNo$ zt9x46BZ}`2K|4{(#qH0b`VWQI6|*1nP2bq?!4LdW-zPFPK@Y{sE15sH-*EOE$Sbrn z<@@Td7t8WjeEq}nB5pq-XWun!UOhTe~eSey>cGuWO@_dKw@hd4MsQjoM? z;Wzdr%2@2u>1j7&hWwkb-uvVBFShJ?r4sj1L;uN+cZ;`A+UFz|mOrlqJf$9POVHE6 zc{v_kXwsRyFEuFW`>c=j{J=4pK2q;+@$`MtoPVsmK>MI%$U);Z{8Jjdudeug)-r$A zuVc%f?N?K_1my(I*Ok|1KO)6jr`FpidwW`J{%9vkuid5Fs6C43cVYcJ4zH;^u;bu8 zl1HjiPp4S?KjRbZHaP!@I1fHx|6ni2x*aHDY0Bv$1n2(sya3@u^L#QKu28&wh2w}H zi|3(qn}F9>pZa&Fx2iqI-Ura2JqP0zJ(bcG??}??sNa@-zvZ`hKVizw z@402`Hg&$UxPFcFseF0;8n~0q-_8$k-U?LL=a0oJ zw3Ex%^FuF}OlE1jTfLx`TasLf%&zX_zm0pm0kFtkN zEG#c%{_Q2}&-fIU7jgC+YL5YbUmEN>Sbv2c54(kS6TkkS%LhFkb`r$YZj=Mz_1&DV zj8~~>c>*~j+ZMO_HTEM8x857VYwRaP`qYy=KLCz~ z@EZ4z$nnbd=UBW$K9qkvUIXtW|L$I1#pYiY@3Q@p9hVxfsebr>M$&F>)9xF!MxB>x zp8X&5O6d8>Ii==4WO8eH$L$~6ezN`*mu|d0s4TyfcN2a7VR;cZKY;xo_I}6-ZBME{ zFJN3Je!L5b{YBLJ{C^l%<8T})h0+Q6gRfGdb%cAWhyLW}*M05~{r)b;qhF!)P;by! 
zzEHeSK25MktzUQVgjSU|c7M@sx%-fJi@6V3;i2moes8_(eo&b{+arMEkS78c?ENz%sQup@L{C={c`Dbx3JD-j%ztk_;ej3X!+n41{EWd1@NeSv-7VmJc66s}Y z-MZCv{yuHwv*r=|x5?dW`dxRQDc}ELrQ*r^|H9uR0zOGYc!OBjPtfmi4Iz?-(h2!X zZgwF)D2L!zY5ne*8K>^s%6Qk{`9Ro_vg0yqf7}lpkkb5m5%46-Z|7sUKLqzQP%?)l zNG^7~$??keEBM#buk8MUPhD+qAbCdZhvNH3hQzz&?{MW@r`E@dpBF(nsour$Cflwi z4Ogy81#$}W#caG*dQT4Zr1p!!+qm^2;4P)g`^vCSfcz_!52t@mIJ|u*hqs7FG>>2U z`~>dZkbJ;f`1UK@&8Lf)^(n{K``yT0^xXqaPv#f$Z-#z)(DetmDqfec-$DP#j(3aK zq5Z#z2fZnT*Pk-CPu$f$sP2#E^`cn3M)~MFVf_*~2;3FlQPP=QG4H)oPrdz_CZCrm zj<;wZDj(118>|1s1oaH-hjI2h=!FuWcwUTru%1BYgxC3V3ka7qKYsC!B;t`i`Bane zD4!qlDXrh#{o?kPKHoJiXYU^uWABH4hjvb>8E*rF_JNbl>iz}n!>?2{UI>1(Z^)kR z`pnCl)Oc-A5HEJT$nnPWIp78K8>E|3vtOUwdjBZz*9Xo>{lTST-haeQj zyS<#!dal-kDIL8orQOI4RF^}$5f5~@@Nhi5!;Bc@#(T`6^?S~b-cR_hZx5tH{{NBl zM>tWWvmy`=_HD$I+{3$?aKGKzr+sJ1>+^%0{TgyBiwkysKUYEK zKaJIILgNkoA-uSC!XD>Mea8l+UrtR>o!IzNL63K|YJU9c^H86d{b%N%et#p+hZO5m#0SmCDvcL+JhJZ_U&o7Ryuf>@a{pc& zUQ~Yi%fvh`{r*MI>-S2F&j;mrW%)iFUf{lYX)j>;96R2T4}Fov=W~+3xcyS%MSFsg zQC7bIE)ZToZ@{}i_xpJQ%~PJJUNCy_sJ_Sht7oSVG~=8NN&}i7zk2_TJl7NELr8s! 
zd_eQDO5?>Vdk>u3s(4|~zc+DxdBqw&-vT=c^++6EWc3Sa-z>gfA@PF8E5;%8_3V6V z`HJ88^Ng?$Qu+BjBji5tMDS3d2FBUJe z?PBVXxEe;EM;rl$l=6HsHhL&qBDQdHlk8 zMa09swQv*F`kHp{xb7iCCfZ5e|xYpzvo}T0i~V8yg~Th)KbAjRF0<>=*B>|p{T%H=bcSEwm+e~|?`8QJZ%4Td<$8$oH0Q6;f0Cce z-&p+_9~$H5^2hSa@0;x*VS7T{`D<#a0l&W;xDV=DQoTR+h+9AA^FMMOnD65TKT&Qs z=?@q5VNm#M8cHYR&-_?_*6Y$7Bl|0#9@X7S0U;lpP z)RsM8R6@T7-ij7qF(Vn39x&QcmHS2V_7LE978k94{(*=0zW9yz4^*XoeRP6+w7kjk zTm72iy>{P6D2rFXF~|*Ff8e#07scfr+L_vu=LfOn&z8%i;rhjl$0?LzrPf=UGUejF zi~1(lrqvFuO`jR3UlTX*_BMZSD&g(TRlH+E`#kZN2&nsUc5nv zhmGPsQfVZ4pA|6;vN+BKlRUgz~US|=N*V%<#d`=jrX z+ZEp@DCv_wcJInPvT3B42Ve!&fXNkjQ)y;!?tDtSFLly2jErR^aTZvOn-R>kWk zpD(JI`+0D_sL}mEcppIOr$;9E0%Gw3xJ-O(zl1;ZW2Bo>gSV}@AE+%sJ;Qhvw||iI zK;Y9#Z+8WbAs^re?$ej|=OY$qe%$}j?g#qV>@9!Vs(4XCzZ-$B7PKRI{-ik<)o$EY z)p&tuvEw}?-{B8CTz1}M^~?8u=Z*GkT5Zw~5*&^w@~iSir!>udde*0TK<%P+sL;`3>=GxbkiOuXh#4n=Fo-)DzURF}!1b$eED5+WzI;c7EvA zwMt%1Eu6pXpEl1AIg9nKY(1_081f43O!emTw6XQacS8K!f3f@{h!OuM_z}x5zYnf| zSig>2?}B~}`5-CwiJ`RpUY?K-<2ms)e&4W6kJClYl7{d(r6&&h*TdM72cKLYXd z=}CJ<{vC7T%<~~dNf8fJhFVu?d&1E_KC7*j&@UhPNlk@W}h6DklctUqb$|cqQtUc{d`KAN8bs+moXC z%l<8n*XSS0caqOPjJI*?UAW%~`ZK7uXHdR*T%h%KzV}4#%fnM z@E=$8yGopb{gp3=42LTePy1^(_?v)tlRo+WceN_sm9U@6@qqU3@aL!X`6$GQ9k0MO z;>+V5^cR5(2Jf!ee60Dmsk%2ZOlcnPpuc7Lv-%yjo43c!>+|`b1d<{is0_8P()!UI z?>X`OR>g~5&3^kax%-*M#N5v$#{=5E!>_mK{q~3vJ6d`eWZQL z@8;|E`Mmv9>OYR(+^_U6Gv@r|Tc&JPc*E=Sz!~647H{}|rSkW0qMlT;;&`F@Cvi3O z_iy5!A^4}%+%I&(aU;8@eR`9ckMsLKn8#yYovlBw&u>)sJ-7M%%l1F!@mP<`=5Oui z*bgP!ueg4fO*gCG#FjtQ{;~XJ%P;v^UWN73F;Vv;(f88t_rKD-eyy|bu2OiP$7R@y z$$tDzXE)y9?Z@e{+rG!$zJeS4Ib58}MgBzbf6;k*zFbHN_Xg5^`?vV|z+dLS!iSUp z7Uf2&pxgaESnP{MNPQYAXUHF^3>S`gpd7!ny!`z1|JuFej2+!p)-N~R_doy6MR{*`AfVij@OXOly7nU9{of7EU&_N8K+;v zeu{hKD1F?2%Jx&T8{SpL{sf8_vUBoyrFfMJ?3QrXw_oP@5WhY}e9(NX(s=RBTWT$P zU$Whw&}97@>s|W(0rX5`O&7lrVCec*^3kB)!&@d^FfqV|vS zc#AE+)Xyp|-AV=H82WuSo;@E{Uq39qUw(wE5)sy~%9da2*CHFEpX2=}**|O#h|M4U zOzAV;LchlRO0=JmSLfYfPi$;Z?+dW~6W>MM`9%wzFO%)Z-)~r6#Oc@2ry)1wdfV{& 
zHS|>IujGDEor~yqdMc=vGhw{P_Xz!!Xy{xQVu#Yp`%}Hyok=kM{?ETs`vT1iz$+2P#9YtF*j4^_DBnYgPTaggpwlBU;?pgC3Z2{@ZprbX!%w z4+>lZj?3|kR63Gdw_bMgeJeIAdl29M5R13S$EBPghd6n;N$K~~J%4QevVLq&gS-Up zqg`zI?EK`Sf4%liBaYb`v;EeIvH~q4pb6UM8Kco$~u* zP|mXX$MPdT+pgv>N&h$h#q#4jAwKRu$XUp1q?=MBZ|m=G;qyMR`OEL8Z2z&o)LePF z(x0=&`UUROAieK_D$(0L73~#V$An!6_8v{ccrWGgJx)F`MgCF_;kqEY1gaqKFKJmxck4XzQK4qq`b86ORV~B|A!8K zLft=He13_3q6)P6_Jf`QeNFct=I4^Xth_1Sf3p9%{lfApZa*dV@nBv@^P81we>`v- zRD28>M;@0c-Qo6GB8fadg@pY19xJ2(xe7T8e@Qt#%xh&h?swPsE9+RiexgGlKefMf zqa2_z)VfN`n-@E;*!lJOS?>O2IiAq&yk9`?-$#tb^i$T4A*X;hNH?XK{Q~=lfc$;| z;FgGH@bc2yyQl5FeWQARj^E>foC2O}yz95upZtryZ;RL4A*X@cBLAc_nQm_D8h`tf zYJSD*H(K6E{v;vy>*wjEykWeL{r)6B%d6q$XOMUJPN+YM+oSQF6MymYtMTc+3xkJK zD8>B!Eye7an2$m3QL6pQ9+$^uI$pNHoj1dNf&2ZrGySeK-VY-GN8EhtasRF~?17k< zx!P4Ahe07n5l$5H7*vL`D_;)^3;8pr{Dt_SGL&7zg>y`jldqxsXMXtDJuUsdNrUGP z>e`vvrVT*wuSW8ew=?H6F}al`Ew0FFpJbKg&La)S2;WH`&4SiD3zDP^ADHO9|) z2YCs(=Sof_3GY`xJH_UYd??|G3G#{MFI#`~FXh|j^AF>7-1!6O=Y)T2)xF>S3aHQ6Q{{yAZ@nwD?7vh8Fga-)p=B4%l>-oA(az-YW2z(l0yS zD+#3&Z%>kV!{Z(Pb>|0WellsyuGf70t((+-+^If);H0ddD_zo=th>w_gdSG+=@!TP zY`bR1dt-RX`gJTn`k880oPPv5ETufZ4C8IweGHT=&R<;T@Rj%1(tg>^UjN0t3-bL1k#z z#qmqy^(BXXt7Y#;+yVPpKXug^+n0>m8ht-v78k7j8|%f=-f~ESzHV&1^LT~cD)lFq zy1_g8Ret>f`A~YCe{A{7@%2`#_(ohf*y+H&z7qZ z{SNh`d^!IxUd7F4F^>g4kzR6}T5m&b^SD6gTc2>}cD~{CnH5#|gOl{1o7K6f(0W!r zAJ`?49@!^x+wW9HwKCo{xPL!1pNhSY6yqT~t}I@W9=^GX z^(5*^zP|%FBJ~H?-~HsAhwy(?RP`~70Q zOO~I%pVLp%y^Pl(3be?xc&9EUv5#fNN=3^{_@~sYe_;JVem`aJr{w<4@meYgW3CX45@<)oHcqkv@k^5%%P7=;7 zAXL-sk?RLsADu#dH^(n6FQ5DBwSV5q^UDV9X~129_eQ^_^TC6;_t?-+(7Pco;Ga_S z-Z9aQ%APgd+ruC)B_6p{%={AfK|n51$W%(#IAZYz`4B(LYsgudKEKVO_Zjc|H{x1YhD28w)$0)OsxcneyoVE;z_ zeH%!HaRfe+BK`cmmNH%_oshreW*6dvatMBv#=G%1tR3+>-bK$3(&s17AKc;ZbznSc zyzM#o;qjNeN#DmCo*!iSEBn9b-&2!#R{VUX#ACKcAzk2oHvi=-ev{C5tM`AG)epspl4y+6V2m7`b@@zP5!+M?AyRT4)9~&hCDy{Go_~@ zc0L~p(RLo~20y4yN4rDu;NOJxgZqDe=rOP3T@LR;c#|H_J8)Zd0(XJ=6@GtGjCZrY zV&Z@P?3zWLgK9tAVF{9p)lZ;@5g+3Xa0ubq{O$hZE9Sj->Z!Lsle9B^yHGQq#r{{| 
z8HE&IFGV?tpT|4W1#Y2SDfRQ&b!t9aRzKtKI~MOlNhbV z8gCDf`Bw__%l2(6%rDCygST<}e}T8Ke_|f3?RvmhqN}QO&h2TBm&EtHyZ>aR_eVPu z<$jm`aN!=^{s2GXh0+Q6gAeaX{48g8P7SXXJhzWl6TzoS+0dI!9p zX*U0wDfzZ)bMJp={StUR%5`-XZ>+qSyx*j!zhLBrJRf(oxBmg>fxF_1F?fsjGqEm2 z;ll*-5ZJzPsVlE)?5-@xaQKbk7OkE;6_EB-t(C6s?ZbGUq@ zJz9tUrt%S(ES{FK3Oj9LVQpi-UPhf=WSc(w<=!S^K}ib7fC!qyYu@#W8sz3 zjs(ue^h3rw;1F;pi+2{U@A#lSsd-$$`$DvS z{zJEq>~2pfeXUonclu@g`-^&g9_#iy@69kl~fEmv=%JUkK}Gz!8kc?6|ggIn`cgdwqC65^xf@g)cCrM!&^1OBavF#V)56hdl{nyxM4SNdg*i`TS z>Wka$-+;GQlO5%bfW0G++q5rjOHQ6q`ux9K#|z0J#LwqLAw~Tw6}^9&?CPtlG|$lX z@=*RO0`2iaz+a~mnrDRkHGinyO^}xdU9ffER+g8ebMN_$j(N`yc$FQO7BBGL$Uf+z zl;+nPA+JQVq%(P9roGX0i;|aopDE;ENM3#D>w9%Pb@emqd}Djio|7%V#e3-I$cM_u zc-a_U@%$i`U&@Q(cq{o?-e6t@xhLAcG9y9z$@b;;i!HyDw_Jb7S;#%z{;>bc@)y@P zvguYToPTWjB|qa!EWiA|Cnjj;u)K=94;JzYatd}AxRDRtue#dZTmEDfZW+GOhr=Hf z^P9%zHAscq4L*_%7moX5QU0Rqy_n}c>*jYw_s7W6vfdIdcj@(m1v~9Bt(E0XgZ3!k zR(4!kd4u~yv^@&(fFrn^p^b_$H$7|VsQxi(Z=zmk(uUc$;~?9%bI8!i(S{3hhzAH|ChZBn)L6Wcc*%GLoQAwXx+zUM6LPOp?-3TqD^ah^ z{)O1`OT69Pm8;SHFCdspsr z_J4jKhtlJB;>gYC^l<(U?-T&vk+Y=va!`1`!e{K40e6TG;hGK?AK{>yzX^D~?=!#L zrB(5|3HOm^$F6)6KsvYCqyp33@hmydodsW4rmr2HB0fb;d`mATLK zx%F>ZJZHVKzfVdioksDtI9|*0w-wG`^7HK(UtBz@uMF94!Crd z`M0i0^j>EV-{8a1o<#HbrS~twE)PDf4{LoH@g&vxXg9}(T^v4SH|KXl$v?~wK1mTD zREAntX?wyhe>i(;tK#(zcrW{R<~6GWpD({3QI0pp8{j%Jl>(6R6dqBkTX*MceOD3HSO=&rtVK+`~Q%5y*x=M zr2(a%7ne6uUKYpe5I^kR_0~_UJ&MY|Ma^#}JOB9h59L4Pdhf2U`1KF@eU`QVH?RG} z@+xkXE}t&q*QXpG^AN~4 zNin~H-3U~t%IWD4=?=x~SM`B=`20|QnGf#&jeQq9RIVn-%TH}w^}$w^mnG&mz+KUA z=Ds0%-(|DfKObK2&B_%!Kce};GwOWlu?g~l9049>`R#s6iubhA&v}0gA~hDAW$aRD=#GvOtBk6^Ya^wYv2fQSM-=U-@5Cy-4BlWoVG`?ejbZAz-3CG^+WjA#p}w6 zUOxpc5r1+081*H7{$3$xFi#QnH~V8?50ZE}DM7s0`pfp^`61Fp{j%lE;;n3-vgZeO zFI;|nCsgm^{IY!~`uxk{HP#QH&%+)?^bwE8&_l`2|3F|L41X8sUrmt?(UATcP6u|S z`t~H9TPPjiFla1aC|)R^Cg|5+x#@G~wJKikfb&+@q#IneHu-!k;|*{KxRb>jyWacW z@7yur&RP0=EWd}Q@jA{W(-VW{%%VvmDlcho!Iu5{mbhIvHY@q`F!ed_IIpT%lwP$*Yf)* zE-$kEnq7Yh>&J2X$MJoTeterhmjwF;&i#;HdZ&8_{d!058CD*glm2aW`>nsNDDS@} 
zeVNbY*4Lk*&w^6v^Z$qNJLI2FfqW1jG!&2hvF?oea7gx7Y9)k4gt6{u=yn)=vepQvEGwJK@I`gM9_5Fm! z_3Kdnv3QGeQu*2vlrxsUxIG~5{YkCY0_U_n0Jsl|b6}{4r1|j-y%zEC?iSoc^L#Q~ zyAU5#hc{t9e*cAcx9ont5_o}eFUI>%f9d!0?IXIP`2K4-p4$^7hj9o!FPp#BFYkT$ zYlq$NMQwjQ!tbBPIF#wT)D2$z{!45A`d_boO5J}__B}7&PZm4gQBNvaas3YML;Q?4 zVf`o$FC;GX`+aP%Bch)qJi$$r z)0O@q7vh8F9&Z>>fzZ z#{2XuKe_V9EA)Blqk?#$@y70kJ86!_3qG$ATYlgYm8?B+B`p2EM`QVO`<2BD=o9k& zp?){+K~Y;!9-r54yTJT=D7PP$<`a+kcQbI01X8F^Ib9hZ@|Uslt|rV!t{8pb39X73 zJ7E7Q?EPi+3*Z9vDa%**WB;M(b7Oyd+ZCISy<)R|pCW(0OX}+`6|;Z!yJ!6No|i{% zRObbbNRSWolDc@c{CF2n+rM}}PwaR{Jt<$Ve=I-nT=T=O1UnMymC_X_`}JX2Ki02c zHUM`u8%jkUL^6<^^U>I8^?V9rLW$! zRjv2r_n%66fZoNteaG#$hu#ahm({QB`8wE{Wc}I`lrR==W&L=)s4;%VOUQZ1J(Me@ zM&6}#xt7a#6CH}GEcSz_!lXFc@(MbB${4BKykLy#NUac%cgHEX>` z;tk_PEZ)ibwI!%0JFe_{6MctU)%mQveqDFG8~v8}H&u~7^&;P2fpIAD z!1cR%Uz+e@z1j~o+1sxh8}IFj%b@}s!+6ft-{LLwbBUL%AIIjOP1oe@^KXow#|QLs z=*g&8N|Vkcy$C*TKO%WxNJ@FxA0 zY*Bd6_5h4y;BL13R^Fui#tw*Zau?SR(LTgq9IsJN%?~*T+!f_F_b0%9Ez8gLh}ia* z`IoIf+Y`cg8K<98d$2tK_H5{>6t4f*`0%8|#d{S#0#9K-0M+Tit$OI z1H)aPB0tc4tkQUQ&v*LX*{XO~!v2f#oE>i#@9=)$KB6mN*NwM7%JItj3C1DDyU51i z-P`T!Dua4IR(OAf;5YAZH|snpaFx>M@&l)VTiNzwynXH7X9>M_lx#S18T&Krx3d4r z;w`ry?8bHN&y`bN^Yeo+-pAR0VRs3*~=YCmD*{v0E(p10RozCoYIos^)S zz){Ezk$>iV8Olj~V+-43RQvk;WA)=~xl9>6KjbvzMmC<+7e2e{x8{bwC)9mGjQ6qm zmo2}P*TwD6s3*1mRA2uv-pAQPaNj-jdBXjbZXe<*g|B(sB3#?(aF5?#)c4~M-rwWS zl|Am>vwcdX5B(pMYMcK*Uk-{B^5+yF*GSHV)&qx6N9GrfUwVE}-Dzu2tKxMN?z`ch zHNVV$M&Px!hX5Baj_cut)sO%F#BWc3@o%(#&+mf+2Qi+Jt}Tbfn|ssuYM)W(EhZ;6 z-2OF&H@hb(v-uBr4Lr!^Z|_GNcy{gYzyGeEDSHav=Lnp~`~c~u)aAXZO3i+};p7eE zEao?Yzw*w6-0Q07CvumaU$DH4?Z0flWbFpA{IdOuw?EpM>fh$tWvKlnarPYOyU_1J z;ZONhDr65|Qw7!gqe+fz^zsFKk}{4;|4Ie<4Xt;BE0j*iAAEQ}@OF2<8_Oq-VSn{I zIR!hC3}sg+zmPw3(l5jV)!|Jz&;8sPr~G*<%bTXY?|xD;FgX79Yd)p#+wVxwGZ@E^ zH(6Y;@(S-+$??kf%>c)M_eeLT=Kf*erNn#I&*2ZbA^45FBKIFZUS`|1QYns?XeVk9 z<{whtnDqm*cw;^Q{hk>2Ut@UB`aR@iJ$=W{52?I^O5b3-Z!CY7clx|lww>+wi}mVI z`!$w-#nmf*euQyG?ZM>_&yV8HTS3l9y3)^=2fufQa7mFL*5wfjcmJ^U46UnceF7;o;Rh_^+*x{reBD+o;|{Iwabj)2#C{*?MK|0l=BM 
zcxmi8*uRc&s#)><>k@C;6W2l&;Fz3$xKM+47*`?vDi&`-^>2*7xPB<}FV2s4qL!KJ z+b^tN7F#bRe7re$U(~qt?@nh=G4JK1f2%!S(s`Y`s-KwsXXc-|@B6r$_C4;VyL>p6 zJB({&U&nrP-iPkK1V!@c5I@`FwY`40{8a|dna@YYLpZ1mwXV|o-D6+rTinWcJ+5}1 zdf0y4C4IHSYe&?MtSzo}e71J{%(2U6=kBYLc;ec9`SH&FoY(KrGr$q4zaZ4?XS~dA zuy|OlHx{>F%kj?NM=ajR`tf`mc#ZiuN}f_<5BT=U_jdi@Yx=wspH~8oV_q-#6TDxz zIk(=6`T&Q3*YLOL*!!FRYW#O+Y`k!bI?rBQzeGQ|@+FSu-gAO~O3nL;ts*bv_sQQM z>>=0>All#PD|lZX{Y34<^Mk!zgN<=C%g^?l{C--*i~W9O`Pm*4%a8o1n8nAB%%8u% zu)G{MbElajF9Y98Q=i7h1NZi-o_C=85B4Viuy~+_<;3iP{h;rW5^YRQ~I&? zZR|(l_q%|LNRKF&OZtagh!2{NR~qlWxaDKhTNUq0oR1CRjalry>WuCBJ*)hE2EZH9 zV`jYn;m%7=8NE7ezXsl6JZ169;w|jYGX3IsfqbZZj8~1}1>2v2W57FIe#-9wHNRuL zkIg@uZq}X_%a8V@^00m%%a8gI|D=TIQ1nb9}c;q^)-}J(h$Cc{3W;9DDCnM?T?8~2k}64K25OaEIRa(*YPfI&k5lj?i1Gf z9mXH@28lEv%5z#)uBq-*0@yu)`a(=V>yWYaZCCpLZi7dzfj zKH`eUyG`i<({Gg2`k?*lM+i}x0Q=fq!pJ}dFA zJ)v~s@k;U+*Dpiu7oJbX?N6Y3@VhKC_v;1coi zcm>W%`-)55ybtr0^Q%j){+{0NKB;j2SsXU0dw$@c)W2N*W_=&)yBhD19`>Qh`L5$P z>*HI@>p140TAydUj%`2L|6G2^5xE}X@;Cdr=-0Zcd&+o%voKVPkzaKC*f7yOz>(BicmRH5r$02{9*OGj~{rj#xGVct)XU{Y5?VND) zK$Yl<>QA5ji$xc%x!=1<{~h%Alw#eN=#6e(&i#pWLHAD|E_|vg?`^_89=E9ZKDanN z*fXJrBL>PL{rTPx^1pl7@*zE3o(YKhz~k+Y#R!_puRo;{@C=IaNpDS zemUa3tvWv_aULCV13i+`q%&DM>g?0*ynj&Lx5V-uI3n=~sdOZZk015;*B0ydQ6?sc z5sOzLyoDUDFK?5i^RM>!#h?D$t|-|HRxUJJ-A(I_C3_sr`gy-`^~*Z^`nrybQ~`xbx_+ ztB}0A+3yF#IvVV>grC@Vh4)u5ia_%?ihHghUqHb@RIhtsJxs=8m(EAKIi~!Tk0<3C z&KW*Xg`O$-;o@}kJQgvi{f57vkm7TC;0G;jk2+vuTg#rWD{)>1as%y5v?Ia$GG*=8 za=hl}myjDGe=|S4Z_(vnf8qK;bzbL~1o2|!l}w-WA5z{Jyq&W1dv4i!nZCcJ`1~|f zf5=J5`z+qt`?8MR{Nah;`u2d@Pd2q={o6dhmN#fuqVMqc*@oILw*9m9%F4S~e%XF3 zufljAcU}hfQvtUyzrlP*wnrY9zI4g7xo_EXebxMDW;);W^BZ!n9d>>|`2T=A$9<#Q z54+mAc|9^z4my{!#+}dkfuB#{y&cJ?U3gFIS-+21$B^!P{V6=;-voQ$VLy2)Y32Dr zQ|}MyOnzr>6FuBUI|21#~ zxSPcVtKU-m{Cf+C*BD-~ei+M-cB1kW=SMpcKjT$n`7=N88unPUi_ORC=fm~iC@N3| zdcJcyG9$tM68U`;pC8Klv%G~J5%$n*`ECDW{RrO)6^!{CYro?CC;PvWpq*m#m-XlV z3(Kow_lHn<@b1O^4)^sv=mToMAMOud@9)vl`#~`n}?Q+KBiUWWr_Vc_2iY^AA|Mw{m=m^&F{a) 
zc-DCJ^5nC=u=1rX>OHz66C@Xl7m$a<$9M-El;JM*q%*nnzP-=5YQr;C$OQ^1yWZZG zxE!L_+fh#9BScrSegK@2 z`m>8{*4t^lYLmj7@Oukcys~(=>dRfzUb$Ppe|boP;s9rWm+()inIGUjSj407g9@*A zqMpRhc-L6_RT4@kem|isKjRH>6nb*D{j7dE-2I5L<(Kv6{A2mE<;vPq!gv{H4*^a> zufqEa-qIhos?=O7A-7~V$(L3F`FW>WlendG4uS4%3Aa)2Ji~1uyrevQP z?w-9+KAaByYQlQy1;6}I%i2S%e%Byg*T<_9{d%dszZ*RR9G2q|p&iK=ui5&smtN5J z*GUOt#Ew_w(}o2WAo3ZYsyfVALTQ==8yF|=xz1z(%K^q z{PYX&|38yAsr%0l^YQ|64!D~wr^VatU*2u!hi+Y4+Wsxuzqma>w*SN=Z3gpSZ2Qaa zi{*V;yvDjD;U%wEVjVNx9`n&H?p+(~i@8VPIBJu}F|1c22JF>v*QYui?Pe~VJA@CN zKU`I%`~33frLSFg`9H0@{J|={Bd_<1K`HJ88 z^Mwef{P})8q$}~nRo3X&cz;udmz6ig@ecK*lC>qSgwc4HZI`Tm8jF{*{*1R_yo@^! z4O}F=pY4xku z`u%vo0g0z*g&7I`((8rA@hYSrLQX+HMb0TT_rc=5U$irYm)-x(`e7_yX8W}fyp`=c zH9LuzweT!YyOL^zeqQ8 zZX}jp_D^yB7WJcYPj>aHU4`4q* z{Cm5E>pX9#doAzv_gYeU^}PN2F28#Nc8PxHulZcQmq-X#H4O-)67>Ux|gI RWwUDs*50^m_B Date: Mon, 27 Oct 2025 19:15:20 +0000 Subject: [PATCH 18/38] Added ML pipeline --- .gitignore | 6 + Untitled.ipynb | 2051 ++++++++++++++++- catboost_info/catboost_training.json | 1052 ++++++++- catboost_info/learn/events.out.tfevents | Bin 2398 -> 54870 bytes catboost_info/learn_error.tsv | 1052 ++++++++- catboost_info/time_left.tsv | 1050 ++++++++- .../ml_commands/data_preprocessing/encode.py | 33 +- .../ml_commands/ml_pipeline/ml_pipeline.py | 366 +++ .../ml_pipeline/select_features.py | 40 +- .../ml_commands/ml_pipeline/select_model.py | 146 +- .../maria_magics/supported_magics.py | 2 + test.py | 1 - 12 files changed, 5479 insertions(+), 320 deletions(-) create mode 100644 mariadb_kernel/maria_magics/ml_commands/ml_pipeline/ml_pipeline.py delete mode 100644 test.py diff --git a/.gitignore b/.gitignore index dda68c2..14e75e1 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,9 @@ mariadb_kernel/_version.py # Ignore vscode editor setting .vscode/ + +.eggs/ +catboost_info/ +mariadb_kernel.egg-info/ +models + diff --git a/Untitled.ipynb b/Untitled.ipynb index 4669c05..a5e051f 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -149,7 +149,7 @@ 
{ "data": { "text/html": [ - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
2BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
3CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
4DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
5EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
6FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
7GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
8HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
9IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
10JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
11AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
12BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
13CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
14DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
15EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
16FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
17GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
18HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
19IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
20JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
21AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
22BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
23CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
24DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
25EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
26FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
27GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
28HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
29IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
30JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
31AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
32BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
33CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
34DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
35EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
36FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
37GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
38HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
39IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
40JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
" + "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
2BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
3CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
4DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
5EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
6FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
7GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
8HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
9IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
10JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
" ] }, "metadata": {}, @@ -7052,82 +7052,20 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "id": "6c5def76-a36c-45be-8712-d886a1e52e25", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "

Feature Selection Results (method=correlation)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
FeatureScore
overtime_hours0.867873
avg_project_score0.846331
satisfaction_score0.845916
remote_ratio0.744150
training_hours0.742307
age0.683720
certifications0.654654
years_experience0.623764
bonus0.480500
salary0.463771
projects_completed0.441624
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "Selected 5 features saved to data['top_features']: overtime_hours, avg_project_score, satisfaction_score, remote_ratio, training_hours\n" + "Error during feature selection: could not convert string to float: 'Alice'\n" ] } ], "source": [ - "%select_features features=age,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score target=attrition_flag method=correlation k=5 problem=classification output_name=top_features" + "%select_features target=attrition_flag method=correlation k=5 problem=classification output_name=top_features" ] }, { @@ -7207,7 +7145,7 @@ } ], "source": [ - "%select_features features=age,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score target=potential_score method=rf_importance k=7 problem=regression" + "%select_features target=potential_score method=rf_importance k=7 problem=regression" ] }, { @@ -7287,7 +7225,1982 @@ } ], "source": [ - "%select_features features=age,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score target=attrition_flag method=chi2 k=5 problem=classification output_name=top_features" + "%select_features target=attrition_flag method=chi2 k=5 problem=classification output_name=top_features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc570adc-ee80-42b9-a5a5-7a678224a220", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e535919d-788e-44c3-8a42-7c499044a265", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dropped rows with missing values (in-place). 
Updated last_select.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FBachelors51287.514050.255000.03000.08.5475.00
2BobEngineering45MMasters203091.0320100.1120000.015000.09.0589.00
3CharlieSales38MBachelors101879.3015200.580000.07000.07.2370.01
4DianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
5EveFinance35FBachelors81588.013060.390000.08000.08.0485.00
6FrankHR50MHigh School25872.5010150.760000.04000.06.5260.01
7GraceSales42FBachelors182081.4125120.485000.07000.07.8374.00
8HenryEngineering31MMasters72593.123550.295000.09000.09.1590.00
9IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
10JackSales55MHigh School301268.905250.865000.02000.05.5150.01
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Encoded columns in-place and updated last_select.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagname_Alicename_Bobname_Charliename_Diananame_Evename_Frankname_Gracename_Henryname_Ivyname_Jackdepartment_Engineeringdepartment_Financedepartment_HRdepartment_Salesgender_Fgender_Meducation_level_Bachelorseducation_level_High Schooleducation_level_Masterseducation_level_PhD
13051287.514050.255000.03000.08.5475.001.00.00.00.00.00.00.00.00.00.00.00.01.00.01.00.01.00.00.00.0
245203091.0320100.1120000.015000.09.0589.000.01.00.00.00.00.00.00.00.00.01.00.00.00.00.01.00.00.01.00.0
338101879.3015200.580000.07000.07.2370.010.00.01.00.00.00.00.00.00.00.00.00.00.01.00.01.01.00.00.00.0
42962295.225020.097000.010000.09.6595.000.00.00.01.00.00.00.00.00.00.01.00.00.00.01.00.00.00.00.01.0
53581588.013060.390000.08000.08.0485.000.00.00.00.01.00.00.00.00.00.00.01.00.00.01.00.01.00.00.00.0
65025872.5010150.760000.04000.06.5260.010.00.00.00.00.01.00.00.00.00.00.00.01.00.00.01.00.01.00.00.0
742182081.4125120.485000.07000.07.8374.000.00.00.00.00.00.01.00.00.00.00.00.00.01.01.00.01.00.00.00.0
83172593.123550.295000.09000.09.1590.000.00.00.00.00.00.00.01.00.00.01.00.00.00.00.01.00.00.01.00.0
92731085.002080.670000.05000.08.2482.000.00.00.00.00.00.00.00.01.00.00.01.00.00.01.00.01.00.00.00.0
1055301268.905250.865000.02000.05.5150.010.00.00.00.00.00.00.00.00.01.00.00.00.01.00.01.00.01.00.00.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Feature Selection Results (method=correlation)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FeatureScore
overtime_hours0.867873
avg_project_score0.846331
satisfaction_score0.845916
potential_score0.828136
performance_rating0.817918
education_level_High School0.763763
remote_ratio0.744150
training_hours0.742307
age0.683720
gender_M0.654654
certifications0.654654
gender_F0.654654
years_experience0.623764
department_Sales0.523810
name_Charlie0.509175
name_Frank0.509175
name_Jack0.509175
bonus0.480500
salary0.463771
projects_completed0.441624
department_Engineering0.428571
education_level_Masters0.327327
department_Finance0.327327
name_Ivy0.218218
name_Henry0.218218
education_level_Bachelors0.218218
department_HR0.218218
name_Eve0.218218
name_Grace0.218218
name_Diana0.218218
name_Alice0.218218
name_Bob0.218218
education_level_PhD0.218218
emp_id0.189934
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected 5 features saved to data['selected_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_rating\n", + "Standardized 5 column(s) (mean=0, std=1). Updated data['last_select'] in-place.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagname_Alicename_Bobname_Charliename_Diananame_Evename_Frankname_Gracename_Henryname_Ivyname_Jackdepartment_Engineeringdepartment_Financedepartment_HRdepartment_Salesgender_Fgender_Meducation_level_Bachelorseducation_level_High Schooleducation_level_Masterseducation_level_PhD
1305120.402806140-0.8357660.255000.03000.00.4698750.312348-0.14882301.00.00.00.00.00.00.00.00.00.00.00.01.00.01.00.01.00.00.00.0
24520300.828734320-0.1152780.1120000.015000.00.8894071.0932160.89294000.01.00.00.00.00.00.00.00.00.01.00.00.00.00.01.00.00.01.00.0
3381018-0.5950820151.3256980.580000.07000.0-0.620907-0.468521-0.52088210.00.01.00.00.00.00.00.00.00.00.00.00.01.00.01.01.00.00.00.0
4296221.339847250-1.2680590.097000.010000.01.3928451.0932161.33941000.00.00.01.00.00.00.00.00.00.01.00.00.00.01.00.00.00.00.01.0
5358150.463653130-0.6916690.390000.08000.00.0503440.3123480.59529300.00.00.00.01.00.00.00.00.00.00.01.00.00.01.00.01.00.00.00.0
650258-1.4225990100.6052100.760000.04000.0-1.208251-1.249390-1.26499910.00.00.00.00.01.00.00.00.00.00.00.01.00.00.01.00.01.00.00.0
7421820-0.3395251250.1729170.485000.07000.0-0.117469-0.468521-0.22323500.00.00.00.00.00.01.00.00.00.00.00.00.01.01.00.01.00.00.00.0
8317251.084291235-0.8357660.295000.09000.00.9733131.0932160.96735200.00.00.00.00.00.00.01.00.00.01.00.00.00.00.01.00.00.01.00.0
9273100.098572020-0.4034730.670000.05000.00.2181560.3123480.37205800.00.00.00.00.00.00.00.01.00.00.01.00.00.01.00.01.00.00.00.0
10553012-1.860696052.0461860.865000.02000.0-2.047314-2.030259-2.00911510.00.00.00.00.00.00.00.00.01.00.00.00.01.00.01.00.01.00.00.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split completed: total=10, train=8, test=2, val=0.\n" + ] + }, + { + "data": { + "text/html": [ + "

Train (8 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagname_Alicename_Bobname_Charliename_Diananame_Evename_Frankname_Gracename_Henryname_Ivyname_Jackdepartment_Engineeringdepartment_Financedepartment_HRdepartment_Salesgender_Fgender_Meducation_level_Bachelorseducation_level_High Schooleducation_level_Masterseducation_level_PhD
650258-1.4225990100.6052100.760000.04000.0-1.208251-1.249390-1.26499910.00.00.00.00.01.00.00.00.00.00.00.01.00.00.01.00.01.00.00.0
1305120.402806140-0.8357660.255000.03000.00.4698750.312348-0.14882301.00.00.00.00.00.00.00.00.00.00.00.01.00.01.00.01.00.00.00.0
10553012-1.860696052.0461860.865000.02000.0-2.047314-2.030259-2.00911510.00.00.00.00.00.00.00.00.01.00.00.00.01.00.01.00.01.00.00.0
24520300.828734320-0.1152780.1120000.015000.00.8894071.0932160.89294000.01.00.00.00.00.00.00.00.00.01.00.00.00.00.01.00.00.01.00.0
5358150.463653130-0.6916690.390000.08000.00.0503440.3123480.59529300.00.00.00.01.00.00.00.00.00.00.01.00.00.01.00.01.00.00.00.0
4296221.339847250-1.2680590.097000.010000.01.3928451.0932161.33941000.00.00.01.00.00.00.00.00.00.01.00.00.00.01.00.00.00.00.01.0
7421820-0.3395251250.1729170.485000.07000.0-0.117469-0.468521-0.22323500.00.00.00.00.00.01.00.00.00.00.00.00.01.01.00.01.00.00.00.0
8317251.084291235-0.8357660.295000.09000.00.9733131.0932160.96735200.00.00.00.00.00.00.01.00.00.01.00.00.00.00.01.00.00.01.00.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Test (2 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagname_Alicename_Bobname_Charliename_Diananame_Evename_Frankname_Gracename_Henryname_Ivyname_Jackdepartment_Engineeringdepartment_Financedepartment_HRdepartment_Salesgender_Fgender_Meducation_level_Bachelorseducation_level_High Schooleducation_level_Masterseducation_level_PhD
3381018-0.5950820151.3256980.580000.07000.0-0.620907-0.468521-0.52088210.00.01.00.00.00.00.00.00.00.00.00.00.01.00.01.01.00.00.00.0
9273100.098572020-0.4034730.670000.05000.00.2181560.3123480.37205800.00.00.00.00.00.00.00.01.00.00.01.00.00.01.00.01.00.00.00.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Model Selection Results (primary_metric=accuracy)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Modelaccuracy_Meanaccuracy_Stdf1_Meanf1_Stdprecision_Meanprecision_Stdrecall_Meanrecall_Std
catboost1.00.00000.40.48990.40.48990.40.4899
gbm1.00.00000.40.48990.40.48990.40.4899
rf0.90.20000.20.40000.40.48990.40.4899
logistic0.90.20000.20.40000.20.40000.20.4000
ada0.90.20000.20.40000.20.40000.20.4000
knn0.80.24490.00.00000.00.00000.00.0000
svm0.80.24490.00.00000.00.00000.00.0000
mlp0.80.40000.40.48990.40.48990.40.4899
xgboost0.80.24490.00.00000.00.00000.00.0000
lightgbm0.80.24490.00.00000.00.00000.00.0000
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best model 'gbm' (mean accuracy=1.0000) saved to data['last_model'].\n", + "[MLPipeline] Automatically selected best model via SelectModel.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + "
\n", + "

Metrics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Accuracy0.5000
Precision (w)0.2500
Recall (w)0.5000
F1 (w)0.3333
ROC AUC1.0000
\n", + "
\n", + "
\"confusion
\n", + "

Classification report

\n", + "
              precision    recall  f1-score   support\n",
+       "\n",
+       "           0       0.50      1.00      0.67         1\n",
+       "           1       0.00      0.00      0.00         1\n",
+       "\n",
+       "    accuracy                           0.50         2\n",
+       "   macro avg       0.25      0.50      0.33         2\n",
+       "weighted avg       0.25      0.50      0.33         2\n",
+       "
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Predictions preview (actual vs predicted)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
attrition_flag_predicted_pred_proba
100.022933
000.000011
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model from data['last_model'] saved to ./models/model.joblib\n", + "[MLPipeline] Model saved to ./models/model.joblib.\n", + "[MLPipeline] ML pipeline completed successfully.\n" + ] + } + ], + "source": [ + "%ml_pipeline target=attrition_flag problem=classification save_path=./models/model.joblib" ] }, { diff --git a/catboost_info/catboost_training.json b/catboost_info/catboost_training.json index 00cf6ba..b468cdf 100644 --- a/catboost_info/catboost_training.json +++ b/catboost_info/catboost_training.json @@ -1,54 +1,1004 @@ { -"meta":{"test_sets":[],"test_metrics":[],"learn_metrics":[{"best_value":"Min","name":"RMSE"}],"launch_mode":"Train","parameters":"","iteration_count":50,"learn_sets":["learn"],"name":"experiment"}, +"meta":{"test_sets":[],"test_metrics":[],"learn_metrics":[{"best_value":"Min","name":"Logloss"}],"launch_mode":"Train","parameters":"","iteration_count":1000,"learn_sets":["learn"],"name":"experiment"}, "iterations":[ -{"learn":[0.9710631926],"iteration":0,"passed_time":0.05695364861,"remaining_time":2.790728782}, -{"learn":[0.8247710073],"iteration":1,"passed_time":0.05764274648,"remaining_time":1.383425916}, -{"learn":[0.7022282601],"iteration":2,"passed_time":0.05819126054,"remaining_time":0.9116630817}, -{"learn":[0.6270477627],"iteration":3,"passed_time":0.06090029169,"remaining_time":0.7003533545}, -{"learn":[0.5346013406],"iteration":4,"passed_time":0.06149235543,"remaining_time":0.5534311989}, -{"learn":[0.444717163],"iteration":5,"passed_time":0.06183177846,"remaining_time":0.4534330421}, -{"learn":[0.3839413771],"iteration":6,"passed_time":0.06219568651,"remaining_time":0.3820592172}, -{"learn":[0.3332464887],"iteration":7,"passed_time":0.06268006686,"remaining_time":0.329070351}, -{"learn":[0.2896858851],"iteration":8,"passed_time":0.06303172973,"remaining_time":0.2871445465}, 
-{"learn":[0.2546482153],"iteration":9,"passed_time":0.06325858125,"remaining_time":0.253034325}, -{"learn":[0.2217991067],"iteration":10,"passed_time":0.06379790184,"remaining_time":0.2261925611}, -{"learn":[0.1892739767],"iteration":11,"passed_time":0.0646127055,"remaining_time":0.2046069008}, -{"learn":[0.1682408917],"iteration":12,"passed_time":0.06483456629,"remaining_time":0.1845291502}, -{"learn":[0.1400203532],"iteration":13,"passed_time":0.06534558127,"remaining_time":0.1680314947}, -{"learn":[0.1217420713],"iteration":14,"passed_time":0.06562706824,"remaining_time":0.1531298259}, -{"learn":[0.1038814836],"iteration":15,"passed_time":0.06632963735,"remaining_time":0.1409504794}, -{"learn":[0.08639816643],"iteration":16,"passed_time":0.06670532651,"remaining_time":0.1294868103}, -{"learn":[0.07526269779],"iteration":17,"passed_time":0.06711008939,"remaining_time":0.1193068256}, -{"learn":[0.06268846321],"iteration":18,"passed_time":0.06768000763,"remaining_time":0.1104252756}, -{"learn":[0.05362374493],"iteration":19,"passed_time":0.07160355027,"remaining_time":0.1074053254}, -{"learn":[0.04587823084],"iteration":20,"passed_time":0.07205487013,"remaining_time":0.09950434447}, -{"learn":[0.03940739145],"iteration":21,"passed_time":0.07219507641,"remaining_time":0.0918846427}, -{"learn":[0.03396654535],"iteration":22,"passed_time":0.07238429637,"remaining_time":0.08497286966}, -{"learn":[0.0283153494],"iteration":23,"passed_time":0.07257083526,"remaining_time":0.07861840487}, -{"learn":[0.02431601344],"iteration":24,"passed_time":0.07274715604,"remaining_time":0.07274715604}, -{"learn":[0.02028646573],"iteration":25,"passed_time":0.07293322932,"remaining_time":0.06732298091}, -{"learn":[0.01693585957],"iteration":26,"passed_time":0.07308056369,"remaining_time":0.06225381351}, -{"learn":[0.01414882591],"iteration":27,"passed_time":0.07323423661,"remaining_time":0.05754118591}, 
-{"learn":[0.01183896794],"iteration":28,"passed_time":0.07350821259,"remaining_time":0.05323008498}, -{"learn":[0.009906073428],"iteration":29,"passed_time":0.07368584825,"remaining_time":0.04912389883}, -{"learn":[0.008624575516],"iteration":30,"passed_time":0.07389072582,"remaining_time":0.04528786421}, -{"learn":[0.007223667424],"iteration":31,"passed_time":0.07407611524,"remaining_time":0.04166781482}, -{"learn":[0.00605654099],"iteration":32,"passed_time":0.07424812302,"remaining_time":0.03824903307}, -{"learn":[0.005085089643],"iteration":33,"passed_time":0.07443328806,"remaining_time":0.03502742967}, -{"learn":[0.004273904823],"iteration":34,"passed_time":0.07461059825,"remaining_time":0.03197597068}, -{"learn":[0.003596032008],"iteration":35,"passed_time":0.07478362838,"remaining_time":0.02908252215}, -{"learn":[0.003101566493],"iteration":36,"passed_time":0.07496663232,"remaining_time":0.02633962757}, -{"learn":[0.002677233959],"iteration":37,"passed_time":0.07515916974,"remaining_time":0.02373447465}, -{"learn":[0.002312693569],"iteration":38,"passed_time":0.07534052031,"remaining_time":0.02124989034}, -{"learn":[0.002014958717],"iteration":39,"passed_time":0.07554323296,"remaining_time":0.01888580824}, -{"learn":[0.001755565836],"iteration":40,"passed_time":0.07573511792,"remaining_time":0.01662478198}, -{"learn":[0.001529576385],"iteration":41,"passed_time":0.07590874078,"remaining_time":0.01445880777}, -{"learn":[0.001332687989],"iteration":42,"passed_time":0.07606214567,"remaining_time":0.01238220976}, -{"learn":[0.001161152485],"iteration":43,"passed_time":0.07621032472,"remaining_time":0.01039231701}, -{"learn":[0.001011704525],"iteration":44,"passed_time":0.07638826593,"remaining_time":0.008487585104}, -{"learn":[0.0008814993786],"iteration":45,"passed_time":0.07655899022,"remaining_time":0.006657303498}, -{"learn":[0.0007680587495],"iteration":46,"passed_time":0.07675038047,"remaining_time":0.004898960456}, 
-{"learn":[0.0006692235685],"iteration":47,"passed_time":0.07692142334,"remaining_time":0.003205059306}, -{"learn":[0.0005831128718],"iteration":48,"passed_time":0.07707417959,"remaining_time":0.001572942441}, -{"learn":[0.0005080879754],"iteration":49,"passed_time":0.07723189595,"remaining_time":0} +{"learn":[0.6921376331],"iteration":0,"passed_time":0.0005506633771,"remaining_time":0.5501127137}, +{"learn":[0.6914311222],"iteration":1,"passed_time":0.0007853338881,"remaining_time":0.3918816102}, +{"learn":[0.6902726804],"iteration":2,"passed_time":0.001054080287,"remaining_time":0.3503060152}, +{"learn":[0.6893236297],"iteration":3,"passed_time":0.001332495648,"remaining_time":0.3317914165}, +{"learn":[0.6884269714],"iteration":4,"passed_time":0.001567760683,"remaining_time":0.3119843759}, +{"learn":[0.6877708094],"iteration":5,"passed_time":0.001757945965,"remaining_time":0.2912330483}, +{"learn":[0.6868745599],"iteration":6,"passed_time":0.00198443086,"remaining_time":0.281505692}, +{"learn":[0.6854426605],"iteration":7,"passed_time":0.002176281622,"remaining_time":0.2698589211}, +{"learn":[0.6846512471],"iteration":8,"passed_time":0.002514425851,"remaining_time":0.2768662242}, +{"learn":[0.6835081152],"iteration":9,"passed_time":0.0027196756,"remaining_time":0.2692478844}, +{"learn":[0.6828162926],"iteration":10,"passed_time":0.002932122931,"remaining_time":0.2636245072}, +{"learn":[0.6821300387],"iteration":11,"passed_time":0.003192283539,"remaining_time":0.2628313447}, +{"learn":[0.6810161471],"iteration":12,"passed_time":0.005394840798,"remaining_time":0.4095929129}, +{"learn":[0.6800281746],"iteration":13,"passed_time":0.00563666743,"remaining_time":0.3969824347}, +{"learn":[0.6789965715],"iteration":14,"passed_time":0.005860724626,"remaining_time":0.3848542504}, +{"learn":[0.6778871247],"iteration":15,"passed_time":0.006066001399,"remaining_time":0.373059086}, 
+{"learn":[0.6767778482],"iteration":16,"passed_time":0.006307268676,"remaining_time":0.3647085358}, +{"learn":[0.675800315],"iteration":17,"passed_time":0.006694250074,"remaining_time":0.3652085318}, +{"learn":[0.6745953049],"iteration":18,"passed_time":0.006899477982,"remaining_time":0.3562309421}, +{"learn":[0.6735743199],"iteration":19,"passed_time":0.007181293899,"remaining_time":0.3518834011}, +{"learn":[0.6728023546],"iteration":20,"passed_time":0.00745657228,"remaining_time":0.3476182982}, +{"learn":[0.6717833025],"iteration":21,"passed_time":0.007663767372,"remaining_time":0.340689295}, +{"learn":[0.6705873864],"iteration":22,"passed_time":0.007879479029,"remaining_time":0.3347065657}, +{"learn":[0.6691952518],"iteration":23,"passed_time":0.008064386899,"remaining_time":0.3279517339}, +{"learn":[0.6683306949],"iteration":24,"passed_time":0.008284067351,"remaining_time":0.3230786267}, +{"learn":[0.6672187277],"iteration":25,"passed_time":0.008483715772,"remaining_time":0.3178130447}, +{"learn":[0.6662092464],"iteration":26,"passed_time":0.008697806743,"remaining_time":0.3134431837}, +{"learn":[0.6652034862],"iteration":27,"passed_time":0.008924122091,"remaining_time":0.309794524}, +{"learn":[0.664099523],"iteration":28,"passed_time":0.009119880197,"remaining_time":0.3053587473}, +{"learn":[0.6628215143],"iteration":29,"passed_time":0.009304048429,"remaining_time":0.3008308992}, +{"learn":[0.6619682567],"iteration":30,"passed_time":0.009508690698,"remaining_time":0.2972232673}, +{"learn":[0.6605963962],"iteration":31,"passed_time":0.009684086934,"remaining_time":0.2929436298}, +{"learn":[0.6597477538],"iteration":32,"passed_time":0.009889587671,"remaining_time":0.2897948872}, +{"learn":[0.6585793495],"iteration":33,"passed_time":0.01006296194,"remaining_time":0.2859065068}, +{"learn":[0.6574136274],"iteration":34,"passed_time":0.01025222915,"remaining_time":0.2826686037}, 
+{"learn":[0.6566625323],"iteration":35,"passed_time":0.01047332483,"remaining_time":0.280452365}, +{"learn":[0.6557201062],"iteration":36,"passed_time":0.01065918187,"remaining_time":0.2774268146}, +{"learn":[0.6547810435],"iteration":37,"passed_time":0.01083729085,"remaining_time":0.2743545736}, +{"learn":[0.6537957873],"iteration":38,"passed_time":0.01125047385,"remaining_time":0.2772232145}, +{"learn":[0.6524456825],"iteration":39,"passed_time":0.01142843771,"remaining_time":0.2742825051}, +{"learn":[0.651703792],"iteration":40,"passed_time":0.01168960449,"remaining_time":0.2734227002}, +{"learn":[0.6505536692],"iteration":41,"passed_time":0.01189282376,"remaining_time":0.2712696466}, +{"learn":[0.6495760253],"iteration":42,"passed_time":0.01207861379,"remaining_time":0.2688193812}, +{"learn":[0.6482374328],"iteration":43,"passed_time":0.01223421047,"remaining_time":0.2658160275}, +{"learn":[0.6469012243],"iteration":44,"passed_time":0.01240492197,"remaining_time":0.2632600106}, +{"learn":[0.6460723281],"iteration":45,"passed_time":0.01259484997,"remaining_time":0.2612062363}, +{"learn":[0.645103676],"iteration":46,"passed_time":0.01281349796,"remaining_time":0.2598141183}, +{"learn":[0.6441353474],"iteration":47,"passed_time":0.01300224247,"remaining_time":0.257877809}, +{"learn":[0.6431703482],"iteration":48,"passed_time":0.01320365299,"remaining_time":0.2562586529}, +{"learn":[0.6423503671],"iteration":49,"passed_time":0.01342363478,"remaining_time":0.2550490607}, +{"learn":[0.6417189837],"iteration":50,"passed_time":0.01364485225,"remaining_time":0.2539012703}, +{"learn":[0.6408985257],"iteration":51,"passed_time":0.01385111336,"remaining_time":0.2525164512}, +{"learn":[0.6395800029],"iteration":52,"passed_time":0.01401858126,"remaining_time":0.2504829519}, +{"learn":[0.638454846],"iteration":53,"passed_time":0.01418940011,"remaining_time":0.2485772686}, +{"learn":[0.637829457],"iteration":54,"passed_time":0.01440893286,"remaining_time":0.2475716645}, 
+{"learn":[0.6369738238],"iteration":55,"passed_time":0.01462075124,"remaining_time":0.2464640923}, +{"learn":[0.6356642927],"iteration":56,"passed_time":0.01481690396,"remaining_time":0.2451287796}, +{"learn":[0.6346218245],"iteration":57,"passed_time":0.01499538758,"remaining_time":0.2435457775}, +{"learn":[0.6335089888],"iteration":58,"passed_time":0.01517758233,"remaining_time":0.2420695758}, +{"learn":[0.6327144418],"iteration":59,"passed_time":0.01543631179,"remaining_time":0.2418355513}, +{"learn":[0.6314146519],"iteration":60,"passed_time":0.01561911365,"remaining_time":0.2404319298}, +{"learn":[0.6302148189],"iteration":61,"passed_time":0.01585171444,"remaining_time":0.2398210991}, +{"learn":[0.6294122423],"iteration":62,"passed_time":0.01606182921,"remaining_time":0.2388878408}, +{"learn":[0.6283095479],"iteration":63,"passed_time":0.01626120998,"remaining_time":0.237820196}, +{"learn":[0.6273769736],"iteration":64,"passed_time":0.01649659274,"remaining_time":0.2372971417}, +{"learn":[0.626091625],"iteration":65,"passed_time":0.01666773328,"remaining_time":0.2358736801}, +{"learn":[0.6248111555],"iteration":66,"passed_time":0.01682289351,"remaining_time":0.2342650694}, +{"learn":[0.623531299],"iteration":67,"passed_time":0.01698280179,"remaining_time":0.2327642833}, +{"learn":[0.6225138136],"iteration":68,"passed_time":0.01714976845,"remaining_time":0.2313976004}, +{"learn":[0.62191024],"iteration":69,"passed_time":0.0173738312,"remaining_time":0.2308237574}, +{"learn":[0.6212150369],"iteration":70,"passed_time":0.01900881115,"remaining_time":0.2487209234}, +{"learn":[0.6205226438],"iteration":71,"passed_time":0.0192645902,"remaining_time":0.2482991626}, +{"learn":[0.6194400617],"iteration":72,"passed_time":0.01947590217,"remaining_time":0.2473172782}, +{"learn":[0.6186558434],"iteration":73,"passed_time":0.01975672413,"remaining_time":0.2472260343}, +{"learn":[0.6179682612],"iteration":74,"passed_time":0.02002807369,"remaining_time":0.2470129089}, 
+{"learn":[0.6168900728],"iteration":75,"passed_time":0.02027816331,"remaining_time":0.246539775}, +{"learn":[0.61597624],"iteration":76,"passed_time":0.02048544466,"remaining_time":0.2455592912}, +{"learn":[0.6151563355],"iteration":77,"passed_time":0.02069465358,"remaining_time":0.2446214179}, +{"learn":[0.6140871133],"iteration":78,"passed_time":0.02087465311,"remaining_time":0.2433614622}, +{"learn":[0.6135282346],"iteration":79,"passed_time":0.02136253616,"remaining_time":0.2456691658}, +{"learn":[0.6125324198],"iteration":80,"passed_time":0.02155299945,"remaining_time":0.2445334135}, +{"learn":[0.6116280981],"iteration":81,"passed_time":0.02174286526,"remaining_time":0.2434140281}, +{"learn":[0.610728332],"iteration":82,"passed_time":0.02194195358,"remaining_time":0.242418933}, +{"learn":[0.6099583251],"iteration":83,"passed_time":0.02215505208,"remaining_time":0.2415955679}, +{"learn":[0.6091488685],"iteration":84,"passed_time":0.02236177111,"remaining_time":0.240717889}, +{"learn":[0.607908896],"iteration":85,"passed_time":0.02252964473,"remaining_time":0.2394429685}, +{"learn":[0.6066734365],"iteration":86,"passed_time":0.02268634347,"remaining_time":0.2380762252}, +{"learn":[0.6057140146],"iteration":87,"passed_time":0.02288567612,"remaining_time":0.2371788252}, +{"learn":[0.6048230018],"iteration":88,"passed_time":0.02309271758,"remaining_time":0.2363760193}, +{"learn":[0.6038645932],"iteration":89,"passed_time":0.02327931351,"remaining_time":0.2353797255}, +{"learn":[0.603286062],"iteration":90,"passed_time":0.02364643761,"remaining_time":0.2362045251}, +{"learn":[0.6020624808],"iteration":91,"passed_time":0.0238396081,"remaining_time":0.2352865669}, +{"learn":[0.6010184118],"iteration":92,"passed_time":0.02403660522,"remaining_time":0.2344215155}, +{"learn":[0.5997997522],"iteration":93,"passed_time":0.02419529595,"remaining_time":0.2332014694}, +{"learn":[0.5988519958],"iteration":94,"passed_time":0.02443855262,"remaining_time":0.2328093697}, 
+{"learn":[0.5978163225],"iteration":95,"passed_time":0.02460600645,"remaining_time":0.2317065608}, +{"learn":[0.5968528134],"iteration":96,"passed_time":0.0247872302,"remaining_time":0.2307512255}, +{"learn":[0.5956415364],"iteration":97,"passed_time":0.02497165127,"remaining_time":0.2298411168}, +{"learn":[0.5950270976],"iteration":98,"passed_time":0.02514632157,"remaining_time":0.2288569266}, +{"learn":[0.5940695831],"iteration":99,"passed_time":0.02530889668,"remaining_time":0.2277800702}, +{"learn":[0.5931115406],"iteration":100,"passed_time":0.02546676559,"remaining_time":0.2266794283}, +{"learn":[0.5923869354],"iteration":101,"passed_time":0.0256237601,"remaining_time":0.2255895742}, +{"learn":[0.5916092396],"iteration":102,"passed_time":0.02585865088,"remaining_time":0.225196212}, +{"learn":[0.5907429201],"iteration":103,"passed_time":0.02603825912,"remaining_time":0.2243296171}, +{"learn":[0.5900058321],"iteration":104,"passed_time":0.02622273942,"remaining_time":0.223517636}, +{"learn":[0.5892723969],"iteration":105,"passed_time":0.02642358874,"remaining_time":0.2228555503}, +{"learn":[0.5880816919],"iteration":106,"passed_time":0.02659274359,"remaining_time":0.2219375703}, +{"learn":[0.5874404567],"iteration":107,"passed_time":0.02681064862,"remaining_time":0.2214360978}, +{"learn":[0.586706996],"iteration":108,"passed_time":0.02701284949,"remaining_time":0.2208114577}, +{"learn":[0.5859788145],"iteration":109,"passed_time":0.02719723798,"remaining_time":0.22005038}, +{"learn":[0.5852130651],"iteration":110,"passed_time":0.02741903814,"remaining_time":0.2195993235}, +{"learn":[0.584120197],"iteration":111,"passed_time":0.02766346202,"remaining_time":0.2193317345}, +{"learn":[0.5833940932],"iteration":112,"passed_time":0.02791707139,"remaining_time":0.2191366577}, +{"learn":[0.5823940975],"iteration":113,"passed_time":0.02809780908,"remaining_time":0.2183742004}, 
+{"learn":[0.5816756827],"iteration":114,"passed_time":0.02831838058,"remaining_time":0.2179284071}, +{"learn":[0.5805052434],"iteration":115,"passed_time":0.02848656368,"remaining_time":0.2170872612}, +{"learn":[0.5794262716],"iteration":116,"passed_time":0.02867859954,"remaining_time":0.2164376359}, +{"learn":[0.5787069798],"iteration":117,"passed_time":0.02888154301,"remaining_time":0.2158772961}, +{"learn":[0.5777818561],"iteration":118,"passed_time":0.02905500131,"remaining_time":0.2151046736}, +{"learn":[0.5768609388],"iteration":119,"passed_time":0.02923108684,"remaining_time":0.2143613035}, +{"learn":[0.576025205],"iteration":120,"passed_time":0.02940838066,"remaining_time":0.2136360876}, +{"learn":[0.5753114138],"iteration":121,"passed_time":0.02960232595,"remaining_time":0.2130396901}, +{"learn":[0.5741571529],"iteration":122,"passed_time":0.02985105772,"remaining_time":0.2128404685}, +{"learn":[0.5731786489],"iteration":123,"passed_time":0.0300463927,"remaining_time":0.2122632258}, +{"learn":[0.5722654206],"iteration":124,"passed_time":0.03364201036,"remaining_time":0.2354940725}, +{"learn":[0.5715580412],"iteration":125,"passed_time":0.03391048615,"remaining_time":0.2352203563}, +{"learn":[0.5707720092],"iteration":126,"passed_time":0.03412408699,"remaining_time":0.2345695114}, +{"learn":[0.5702917491],"iteration":127,"passed_time":0.03433297237,"remaining_time":0.2338933742}, +{"learn":[0.5693833147],"iteration":128,"passed_time":0.03455619219,"remaining_time":0.2333212667}, +{"learn":[0.5687683991],"iteration":129,"passed_time":0.03481659491,"remaining_time":0.2330033659}, +{"learn":[0.567802012],"iteration":130,"passed_time":0.03502881383,"remaining_time":0.2323667116}, +{"learn":[0.5668365444],"iteration":131,"passed_time":0.035243034,"remaining_time":0.2317496478}, +{"learn":[0.566021783],"iteration":132,"passed_time":0.03546937045,"remaining_time":0.2312176254}, 
+{"learn":[0.5654101968],"iteration":133,"passed_time":0.03569811275,"remaining_time":0.2307057138}, +{"learn":[0.5645956993],"iteration":134,"passed_time":0.03590905008,"remaining_time":0.2300839135}, +{"learn":[0.5639859353],"iteration":135,"passed_time":0.03614988091,"remaining_time":0.2296580669}, +{"learn":[0.5632997581],"iteration":136,"passed_time":0.03637466849,"remaining_time":0.2291338606}, +{"learn":[0.5626934341],"iteration":137,"passed_time":0.03686424329,"remaining_time":0.2302679545}, +{"learn":[0.5619667087],"iteration":138,"passed_time":0.04770563842,"remaining_time":0.2955003934}, +{"learn":[0.561355676],"iteration":139,"passed_time":0.04803399708,"remaining_time":0.2950659821}, +{"learn":[0.5606338722],"iteration":140,"passed_time":0.04827343044,"remaining_time":0.2940913245}, +{"learn":[0.5599243215],"iteration":141,"passed_time":0.04846762228,"remaining_time":0.2928536614}, +{"learn":[0.5588081224],"iteration":142,"passed_time":0.04874712044,"remaining_time":0.2921418337}, +{"learn":[0.5582083719],"iteration":143,"passed_time":0.04900995344,"remaining_time":0.2913369455}, +{"learn":[0.5571798342],"iteration":144,"passed_time":0.04921146206,"remaining_time":0.2901779315}, +{"learn":[0.5560722521],"iteration":145,"passed_time":0.04940132454,"remaining_time":0.288963912}, +{"learn":[0.555395518],"iteration":146,"passed_time":0.04963731404,"remaining_time":0.2880314889}, +{"learn":[0.5548768214],"iteration":147,"passed_time":0.04990071232,"remaining_time":0.2872662628}, +{"learn":[0.5540812016],"iteration":148,"passed_time":0.0501297989,"remaining_time":0.2863118044}, +{"learn":[0.5532853092],"iteration":149,"passed_time":0.05034942456,"remaining_time":0.2853134058}, +{"learn":[0.5524131826],"iteration":150,"passed_time":0.05056340966,"remaining_time":0.2842936079}, +{"learn":[0.5514765126],"iteration":151,"passed_time":0.0507795685,"remaining_time":0.2832965401}, 
+{"learn":[0.5503822821],"iteration":152,"passed_time":0.05116399042,"remaining_time":0.2832411757}, +{"learn":[0.5496355976],"iteration":153,"passed_time":0.05141092491,"remaining_time":0.2824262499}, +{"learn":[0.5487687332],"iteration":154,"passed_time":0.05165100907,"remaining_time":0.2815813075}, +{"learn":[0.5479253531],"iteration":155,"passed_time":0.05186466321,"remaining_time":0.2806011266}, +{"learn":[0.5468407444],"iteration":156,"passed_time":0.05205473411,"remaining_time":0.2795040819}, +{"learn":[0.5457558547],"iteration":157,"passed_time":0.05226718662,"remaining_time":0.278537792}, +{"learn":[0.5448993189],"iteration":158,"passed_time":0.05247221352,"remaining_time":0.277541708}, +{"learn":[0.5442429866],"iteration":159,"passed_time":0.05270743302,"remaining_time":0.2767140234}, +{"learn":[0.54350417],"iteration":160,"passed_time":0.05291273311,"remaining_time":0.2757377831}, +{"learn":[0.5428456068],"iteration":161,"passed_time":0.05315264439,"remaining_time":0.2749500988}, +{"learn":[0.5421104516],"iteration":162,"passed_time":0.05337751341,"remaining_time":0.274091894}, +{"learn":[0.5410383344],"iteration":163,"passed_time":0.05355567422,"remaining_time":0.2730033149}, +{"learn":[0.5401923571],"iteration":164,"passed_time":0.05374770452,"remaining_time":0.2719959592}, +{"learn":[0.5392873032],"iteration":165,"passed_time":0.0539680254,"remaining_time":0.2711405613}, +{"learn":[0.5386331763],"iteration":166,"passed_time":0.05418953977,"remaining_time":0.2702987223}, +{"learn":[0.5379828811],"iteration":167,"passed_time":0.05441635894,"remaining_time":0.2694905395}, +{"learn":[0.5372601577],"iteration":168,"passed_time":0.05558315582,"remaining_time":0.2733112573}, +{"learn":[0.5364935909],"iteration":169,"passed_time":0.05586399481,"remaining_time":0.2727477394}, +{"learn":[0.5359273042],"iteration":170,"passed_time":0.0561265324,"remaining_time":0.2720988033}, 
+{"learn":[0.5352867246],"iteration":171,"passed_time":0.05651085288,"remaining_time":0.2720406174}, +{"learn":[0.5346508111],"iteration":172,"passed_time":0.05671870839,"remaining_time":0.2711350973}, +{"learn":[0.5338960375],"iteration":173,"passed_time":0.05703869967,"remaining_time":0.2707699191}, +{"learn":[0.5331440909],"iteration":174,"passed_time":0.05730524196,"remaining_time":0.2701532835}, +{"learn":[0.5325764247],"iteration":175,"passed_time":0.05761228736,"remaining_time":0.2697302545}, +{"learn":[0.5319340825],"iteration":176,"passed_time":0.05788239346,"remaining_time":0.2691367786}, +{"learn":[0.5308882339],"iteration":177,"passed_time":0.05809320604,"remaining_time":0.26827312}, +{"learn":[0.5301761712],"iteration":178,"passed_time":0.0583282945,"remaining_time":0.2675280993}, +{"learn":[0.5292916383],"iteration":179,"passed_time":0.05855921721,"remaining_time":0.2667697673}, +{"learn":[0.5285842078],"iteration":180,"passed_time":0.0588231667,"remaining_time":0.2661667046}, +{"learn":[0.5280214122],"iteration":181,"passed_time":0.0591590054,"remaining_time":0.2658904748}, +{"learn":[0.5274686984],"iteration":182,"passed_time":0.06032077807,"remaining_time":0.26930096}, +{"learn":[0.5265111327],"iteration":183,"passed_time":0.06059102374,"remaining_time":0.2687080183}, +{"learn":[0.5254784737],"iteration":184,"passed_time":0.06079603508,"remaining_time":0.2678311816}, +{"learn":[0.5246844462],"iteration":185,"passed_time":0.06124754337,"remaining_time":0.2680403242}, +{"learn":[0.5240639022],"iteration":186,"passed_time":0.06168427593,"remaining_time":0.2681781622}, +{"learn":[0.523193121],"iteration":187,"passed_time":0.06204974676,"remaining_time":0.2680020977}, +{"learn":[0.5224034829],"iteration":188,"passed_time":0.06249607129,"remaining_time":0.2681709726}, +{"learn":[0.5215357968],"iteration":189,"passed_time":0.06776152737,"remaining_time":0.2888780904}, 
+{"learn":[0.5205158591],"iteration":190,"passed_time":0.06820345739,"remaining_time":0.2888827069}, +{"learn":[0.519499617],"iteration":191,"passed_time":0.06841906687,"remaining_time":0.2879302398}, +{"learn":[0.5189555841],"iteration":192,"passed_time":0.06874475572,"remaining_time":0.2874456884}, +{"learn":[0.5182280072],"iteration":193,"passed_time":0.06899695616,"remaining_time":0.286657457}, +{"learn":[0.5176118144],"iteration":194,"passed_time":0.06929205371,"remaining_time":0.2860518115}, +{"learn":[0.516600694],"iteration":195,"passed_time":0.06949547325,"remaining_time":0.2850732678}, +{"learn":[0.5160629494],"iteration":196,"passed_time":0.06978863324,"remaining_time":0.2844683883}, +{"learn":[0.5152068564],"iteration":197,"passed_time":0.07002515432,"remaining_time":0.2836372412}, +{"learn":[0.5146640411],"iteration":198,"passed_time":0.07031274963,"remaining_time":0.2830176505}, +{"learn":[0.5139804397],"iteration":199,"passed_time":0.0705935727,"remaining_time":0.2823742908}, +{"learn":[0.5130543666],"iteration":200,"passed_time":0.07085005622,"remaining_time":0.2816377857}, +{"learn":[0.5122875103],"iteration":201,"passed_time":0.07112385792,"remaining_time":0.2809744486}, +{"learn":[0.5114408987],"iteration":202,"passed_time":0.07147159041,"remaining_time":0.2806052096}, +{"learn":[0.5108332038],"iteration":203,"passed_time":0.07814299613,"remaining_time":0.3049109064}, +{"learn":[0.5102274801],"iteration":204,"passed_time":0.07866349022,"remaining_time":0.3050608523}, +{"learn":[0.5096990253],"iteration":205,"passed_time":0.07911588658,"remaining_time":0.3049418153}, +{"learn":[0.5091704939],"iteration":206,"passed_time":0.07955885831,"remaining_time":0.3047834524}, +{"learn":[0.5084650133],"iteration":207,"passed_time":0.07998701286,"remaining_time":0.3045659336}, +{"learn":[0.5076262014],"iteration":208,"passed_time":0.08035209092,"remaining_time":0.3041076742}, 
+{"learn":[0.5067908296],"iteration":209,"passed_time":0.08071680953,"remaining_time":0.3036489501}, +{"learn":[0.5058066249],"iteration":210,"passed_time":0.08121877271,"remaining_time":0.3037043207}, +{"learn":[0.505107935],"iteration":211,"passed_time":0.08166758157,"remaining_time":0.3035568598}, +{"learn":[0.5043528165],"iteration":212,"passed_time":0.08207285141,"remaining_time":0.3032456998}, +{"learn":[0.5037583411],"iteration":213,"passed_time":0.08253401348,"remaining_time":0.3031389467}, +{"learn":[0.5033062739],"iteration":214,"passed_time":0.08302365529,"remaining_time":0.3031328809}, +{"learn":[0.502780165],"iteration":215,"passed_time":0.08352755011,"remaining_time":0.3031740708}, +{"learn":[0.5019570121],"iteration":216,"passed_time":0.08413131042,"remaining_time":0.3035705809}, +{"learn":[0.5011352003],"iteration":217,"passed_time":0.08455725568,"remaining_time":0.303320064}, +{"learn":[0.500619271],"iteration":218,"passed_time":0.08503471347,"remaining_time":0.3032516494}, +{"learn":[0.499652671],"iteration":219,"passed_time":0.09698412874,"remaining_time":0.3438528201}, +{"learn":[0.4991394196],"iteration":220,"passed_time":0.1092758406,"remaining_time":0.3851849768}, +{"learn":[0.4984841432],"iteration":221,"passed_time":0.1096579418,"remaining_time":0.3842967512}, +{"learn":[0.4977248609],"iteration":222,"passed_time":0.1099168009,"remaining_time":0.3829836515}, +{"learn":[0.4969089159],"iteration":223,"passed_time":0.1101721498,"remaining_time":0.3816678045}, +{"learn":[0.4962528561],"iteration":224,"passed_time":0.1105039267,"remaining_time":0.3806246366}, +{"learn":[0.4952942176],"iteration":225,"passed_time":0.1107545805,"remaining_time":0.3793099351}, +{"learn":[0.4947139238],"iteration":226,"passed_time":0.1124684203,"remaining_time":0.3829871759}, +{"learn":[0.494205675],"iteration":227,"passed_time":0.1157605753,"remaining_time":0.3919612462}, 
+{"learn":[0.4934548054],"iteration":228,"passed_time":0.117541986,"remaining_time":0.3957417958}, +{"learn":[0.4928752099],"iteration":229,"passed_time":0.118417462,"remaining_time":0.3964410686}, +{"learn":[0.4921438651],"iteration":230,"passed_time":0.1199319189,"remaining_time":0.3992538771}, +{"learn":[0.4915419051],"iteration":231,"passed_time":0.1203728927,"remaining_time":0.3984757826}, +{"learn":[0.4906675262],"iteration":232,"passed_time":0.1208700419,"remaining_time":0.3978855028}, +{"learn":[0.4898703141],"iteration":233,"passed_time":0.121417344,"remaining_time":0.3974601944}, +{"learn":[0.4894825518],"iteration":234,"passed_time":0.12185313,"remaining_time":0.3966708273}, +{"learn":[0.4889106836],"iteration":235,"passed_time":0.1222620739,"remaining_time":0.3957975614}, +{"learn":[0.4883435837],"iteration":236,"passed_time":0.1227171697,"remaining_time":0.3950767953}, +{"learn":[0.4875512464],"iteration":237,"passed_time":0.1234467287,"remaining_time":0.3952370054}, +{"learn":[0.4866151299],"iteration":238,"passed_time":0.1238333033,"remaining_time":0.3942976728}, +{"learn":[0.4858255344],"iteration":239,"passed_time":0.125482289,"remaining_time":0.3973605817}, +{"learn":[0.4852675327],"iteration":240,"passed_time":0.1259663076,"remaining_time":0.3967154667}, +{"learn":[0.4844093025],"iteration":241,"passed_time":0.1263979249,"remaining_time":0.3959075499}, +{"learn":[0.4839843852],"iteration":242,"passed_time":0.1268836657,"remaining_time":0.395271337}, +{"learn":[0.4834949417],"iteration":243,"passed_time":0.1297374992,"remaining_time":0.4019735631}, +{"learn":[0.4828630686],"iteration":244,"passed_time":0.1304808537,"remaining_time":0.4020940592}, +{"learn":[0.4820830396],"iteration":245,"passed_time":0.1309091752,"remaining_time":0.4012419434}, +{"learn":[0.4811603895],"iteration":246,"passed_time":0.1312974567,"remaining_time":0.4002711938}, +{"learn":[0.4806003102],"iteration":247,"passed_time":0.1317431478,"remaining_time":0.3994792224}, 
+{"learn":[0.4801141407],"iteration":248,"passed_time":0.1322640065,"remaining_time":0.3989167426}, +{"learn":[0.4795589915],"iteration":249,"passed_time":0.1327555156,"remaining_time":0.3982665467}, +{"learn":[0.4790101647],"iteration":250,"passed_time":0.1332032868,"remaining_time":0.3974870988}, +{"learn":[0.4783102529],"iteration":251,"passed_time":0.1340747063,"remaining_time":0.3979677789}, +{"learn":[0.4777278134],"iteration":252,"passed_time":0.1346313837,"remaining_time":0.3975084729}, +{"learn":[0.4772474894],"iteration":253,"passed_time":0.1363684936,"remaining_time":0.4005153395}, +{"learn":[0.4764735358],"iteration":254,"passed_time":0.1367531262,"remaining_time":0.3995336431}, +{"learn":[0.4755656251],"iteration":255,"passed_time":0.1370101783,"remaining_time":0.3981858307}, +{"learn":[0.4751491078],"iteration":256,"passed_time":0.1382981169,"remaining_time":0.3998268516}, +{"learn":[0.4743878543],"iteration":257,"passed_time":0.1386027668,"remaining_time":0.3986172596}, +{"learn":[0.4736746209],"iteration":258,"passed_time":0.1403680436,"remaining_time":0.4015935147}, +{"learn":[0.4729804865],"iteration":259,"passed_time":0.1412752119,"remaining_time":0.4020909876}, +{"learn":[0.4724334478],"iteration":260,"passed_time":0.1420032738,"remaining_time":0.4020705722}, +{"learn":[0.4720227505],"iteration":261,"passed_time":0.1427360627,"remaining_time":0.4020580698}, +{"learn":[0.4712655161],"iteration":262,"passed_time":0.1434610555,"remaining_time":0.4020182428}, +{"learn":[0.4704395831],"iteration":263,"passed_time":0.1449213751,"remaining_time":0.4040232276}, +{"learn":[0.4698364777],"iteration":264,"passed_time":0.1456504406,"remaining_time":0.4039738637}, +{"learn":[0.4693632637],"iteration":265,"passed_time":0.1461866855,"remaining_time":0.4033873202}, +{"learn":[0.468612028],"iteration":266,"passed_time":0.1474025943,"remaining_time":0.4046670472}, +{"learn":[0.4677223393],"iteration":267,"passed_time":0.1481455952,"remaining_time":0.4046364765}, 
+{"learn":[0.4670231215],"iteration":268,"passed_time":0.1502482533,"remaining_time":0.4082954392}, +{"learn":[0.4663278971],"iteration":269,"passed_time":0.152772893,"remaining_time":0.4130526367}, +{"learn":[0.4658006259],"iteration":270,"passed_time":0.1535577956,"remaining_time":0.4130761366}, +{"learn":[0.4653341855],"iteration":271,"passed_time":0.1541995322,"remaining_time":0.4127105125}, +{"learn":[0.4646414902],"iteration":272,"passed_time":0.1546745237,"remaining_time":0.4118988233}, +{"learn":[0.4639707293],"iteration":273,"passed_time":0.1553459531,"remaining_time":0.4116100801}, +{"learn":[0.4632801924],"iteration":274,"passed_time":0.1558574794,"remaining_time":0.410896991}, +{"learn":[0.4628103631],"iteration":275,"passed_time":0.1564273611,"remaining_time":0.41033844}, +{"learn":[0.4620725938],"iteration":276,"passed_time":0.1571536921,"remaining_time":0.4101881565}, +{"learn":[0.4616752139],"iteration":277,"passed_time":0.1578959374,"remaining_time":0.4100750605}, +{"learn":[0.4608016184],"iteration":278,"passed_time":0.1583776694,"remaining_time":0.4092842281}, +{"learn":[0.459932306],"iteration":279,"passed_time":0.1614568995,"remaining_time":0.4151748844}, +{"learn":[0.4593444594],"iteration":280,"passed_time":0.1618450488,"remaining_time":0.4141159791}, +{"learn":[0.4588904636],"iteration":281,"passed_time":0.1622101039,"remaining_time":0.4130030306}, +{"learn":[0.4582101745],"iteration":282,"passed_time":0.1630924951,"remaining_time":0.4132060741}, +{"learn":[0.4575310392],"iteration":283,"passed_time":0.1634776178,"remaining_time":0.4121477969}, +{"learn":[0.4566698628],"iteration":284,"passed_time":0.1640455756,"remaining_time":0.4115529353}, +{"learn":[0.4561560367],"iteration":285,"passed_time":0.1644130466,"remaining_time":0.4104577456}, +{"learn":[0.4554346417],"iteration":286,"passed_time":0.1648104569,"remaining_time":0.4094420063}, +{"learn":[0.454855838],"iteration":287,"passed_time":0.1652337472,"remaining_time":0.4084945417}, 
+{"learn":[0.4541831996],"iteration":288,"passed_time":0.1663634164,"remaining_time":0.4092885434}, +{"learn":[0.4536689264],"iteration":289,"passed_time":0.1668685602,"remaining_time":0.4085402681}, +{"learn":[0.4531557347],"iteration":290,"passed_time":0.1673281334,"remaining_time":0.4076826343}, +{"learn":[0.4527086743],"iteration":291,"passed_time":0.1698112535,"remaining_time":0.4117341352}, +{"learn":[0.4522080932],"iteration":292,"passed_time":0.1708048201,"remaining_time":0.4121467844}, +{"learn":[0.4516352969],"iteration":293,"passed_time":0.171830531,"remaining_time":0.4126270574}, +{"learn":[0.4509672991],"iteration":294,"passed_time":0.1725648195,"remaining_time":0.4124006704}, +{"learn":[0.4502509066],"iteration":295,"passed_time":0.1732911723,"remaining_time":0.4121519774}, +{"learn":[0.4495855272],"iteration":296,"passed_time":0.1738579907,"remaining_time":0.4115224495}, +{"learn":[0.4488103007],"iteration":297,"passed_time":0.1745239639,"remaining_time":0.4111269216}, +{"learn":[0.4482808496],"iteration":298,"passed_time":0.1761683281,"remaining_time":0.4130234047}, +{"learn":[0.44764027],"iteration":299,"passed_time":0.1767470974,"remaining_time":0.4124098938}, +{"learn":[0.4471345288],"iteration":300,"passed_time":0.1773201968,"remaining_time":0.411783447}, +{"learn":[0.4464275283],"iteration":301,"passed_time":0.1779259195,"remaining_time":0.4112327542}, +{"learn":[0.445842513],"iteration":302,"passed_time":0.1786413143,"remaining_time":0.4109339804}, +{"learn":[0.4453389304],"iteration":303,"passed_time":0.179133654,"remaining_time":0.4101217869}, +{"learn":[0.4445062748],"iteration":304,"passed_time":0.1796293844,"remaining_time":0.4093194168}, +{"learn":[0.4438071592],"iteration":305,"passed_time":0.180148086,"remaining_time":0.4085711493}, +{"learn":[0.4431103936],"iteration":306,"passed_time":0.1805860483,"remaining_time":0.4076421221}, +{"learn":[0.4424160804],"iteration":307,"passed_time":0.1815578997,"remaining_time":0.4079158005}, 
+{"learn":[0.4419180283],"iteration":308,"passed_time":0.1820822926,"remaining_time":0.4071807902}, +{"learn":[0.4410959354],"iteration":309,"passed_time":0.1825291209,"remaining_time":0.406274495}, +{"learn":[0.4403396802],"iteration":310,"passed_time":0.1832653693,"remaining_time":0.4060123454}, +{"learn":[0.439715841],"iteration":311,"passed_time":0.1839737579,"remaining_time":0.4056857226}, +{"learn":[0.4391380974],"iteration":312,"passed_time":0.1846743289,"remaining_time":0.4053395015}, +{"learn":[0.4387702474],"iteration":313,"passed_time":0.1853864409,"remaining_time":0.4050162371}, +{"learn":[0.4382839075],"iteration":314,"passed_time":0.1860388514,"remaining_time":0.404560677}, +{"learn":[0.437851578],"iteration":315,"passed_time":0.1866688863,"remaining_time":0.4040554374}, +{"learn":[0.4370380257],"iteration":316,"passed_time":0.1882798317,"remaining_time":0.4056628551}, +{"learn":[0.4365532909],"iteration":317,"passed_time":0.188876909,"remaining_time":0.405075635}, +{"learn":[0.4359790853],"iteration":318,"passed_time":0.1894348055,"remaining_time":0.4044047102}, +{"learn":[0.4353643443],"iteration":319,"passed_time":0.1906715673,"remaining_time":0.4051770806}, +{"learn":[0.4349343138],"iteration":320,"passed_time":0.191420987,"remaining_time":0.4049060753}, +{"learn":[0.4341925383],"iteration":321,"passed_time":0.1931586684,"remaining_time":0.4067129727}, +{"learn":[0.4335165833],"iteration":322,"passed_time":0.1946537089,"remaining_time":0.4079893526}, +{"learn":[0.4328867538],"iteration":323,"passed_time":0.1955175543,"remaining_time":0.4079316873}, +{"learn":[0.4323144172],"iteration":324,"passed_time":0.1964996124,"remaining_time":0.4081145797}, +{"learn":[0.4316435754],"iteration":325,"passed_time":0.1973518754,"remaining_time":0.4080219755}, +{"learn":[0.4310806351],"iteration":326,"passed_time":0.1983415272,"remaining_time":0.4082074857}, +{"learn":[0.4305228548],"iteration":327,"passed_time":0.1987165671,"remaining_time":0.4071266252}, 
+{"learn":[0.4299000289],"iteration":328,"passed_time":0.1990773521,"remaining_time":0.4060209825}, +{"learn":[0.4294867728],"iteration":329,"passed_time":0.1999732747,"remaining_time":0.4060063456}, +{"learn":[0.4287576846],"iteration":330,"passed_time":0.2005555481,"remaining_time":0.4053524523}, +{"learn":[0.4281379964],"iteration":331,"passed_time":0.2011324594,"remaining_time":0.4046882014}, +{"learn":[0.4274753375],"iteration":332,"passed_time":0.2016183908,"remaining_time":0.4038422422}, +{"learn":[0.4269457119],"iteration":333,"passed_time":0.2021672636,"remaining_time":0.4031239447}, +{"learn":[0.4265914985],"iteration":334,"passed_time":0.2028451346,"remaining_time":0.4026627298}, +{"learn":[0.4261210859],"iteration":335,"passed_time":0.2034348599,"remaining_time":0.4020260327}, +{"learn":[0.4253347473],"iteration":336,"passed_time":0.2039420057,"remaining_time":0.4012271508}, +{"learn":[0.4246753199],"iteration":337,"passed_time":0.2044512739,"remaining_time":0.4004341518}, +{"learn":[0.4242052266],"iteration":338,"passed_time":0.2050077051,"remaining_time":0.3997347879}, +{"learn":[0.423488817],"iteration":339,"passed_time":0.2055324916,"remaining_time":0.3989748366}, +{"learn":[0.4228350009],"iteration":340,"passed_time":0.2061729365,"remaining_time":0.3984397805}, +{"learn":[0.4222266035],"iteration":341,"passed_time":0.2068351056,"remaining_time":0.3979459049}, +{"learn":[0.4217495535],"iteration":342,"passed_time":0.2075861733,"remaining_time":0.397621329}, +{"learn":[0.4212821083],"iteration":343,"passed_time":0.208360868,"remaining_time":0.3973393296}, +{"learn":[0.4208801644],"iteration":344,"passed_time":0.2093298237,"remaining_time":0.3974232885}, +{"learn":[0.4203901972],"iteration":345,"passed_time":0.209898891,"remaining_time":0.3967453027}, +{"learn":[0.4198750854],"iteration":346,"passed_time":0.2104952424,"remaining_time":0.3961192889}, +{"learn":[0.4193912702],"iteration":347,"passed_time":0.2110447189,"remaining_time":0.3954056228}, 
+{"learn":[0.4187875901],"iteration":348,"passed_time":0.2115227897,"remaining_time":0.3945597023}, +{"learn":[0.4183272166],"iteration":349,"passed_time":0.2121592355,"remaining_time":0.3940100088}, +{"learn":[0.4178133053],"iteration":350,"passed_time":0.2126936594,"remaining_time":0.3932711822}, +{"learn":[0.4172735342],"iteration":351,"passed_time":0.2149225279,"remaining_time":0.3956528354}, +{"learn":[0.4165695991],"iteration":352,"passed_time":0.2163618086,"remaining_time":0.3965611619}, +{"learn":[0.416060222],"iteration":353,"passed_time":0.2169572078,"remaining_time":0.3959162605}, +{"learn":[0.4154641713],"iteration":354,"passed_time":0.2175585204,"remaining_time":0.3952823821}, +{"learn":[0.414705119],"iteration":355,"passed_time":0.2180793806,"remaining_time":0.3945031492}, +{"learn":[0.4140638581],"iteration":356,"passed_time":0.2186555459,"remaining_time":0.3938249749}, +{"learn":[0.4134714263],"iteration":357,"passed_time":0.2191818016,"remaining_time":0.3930578678}, +{"learn":[0.4131337319],"iteration":358,"passed_time":0.2209840315,"remaining_time":0.3945703737}, +{"learn":[0.4126064479],"iteration":359,"passed_time":0.2216364612,"remaining_time":0.3940203755}, +{"learn":[0.4121523883],"iteration":360,"passed_time":0.2221682313,"remaining_time":0.3932562321}, +{"learn":[0.4115223118],"iteration":361,"passed_time":0.2227907028,"remaining_time":0.3926532276}, +{"learn":[0.4110527294],"iteration":362,"passed_time":0.2234476052,"remaining_time":0.3921105359}, +{"learn":[0.4106555283],"iteration":363,"passed_time":0.2242245521,"remaining_time":0.3917769646}, +{"learn":[0.4100904507],"iteration":364,"passed_time":0.2249400169,"remaining_time":0.391334002}, +{"learn":[0.4095082581],"iteration":365,"passed_time":0.2254414988,"remaining_time":0.3905188805}, +{"learn":[0.4088843039],"iteration":366,"passed_time":0.2263359222,"remaining_time":0.3903832119}, +{"learn":[0.4081387094],"iteration":367,"passed_time":0.226922421,"remaining_time":0.3897145925}, 
+{"learn":[0.4073968657],"iteration":368,"passed_time":0.2284440017,"remaining_time":0.3906454338}, +{"learn":[0.40671779],"iteration":369,"passed_time":0.2302709512,"remaining_time":0.392082971}, +{"learn":[0.4063282268],"iteration":370,"passed_time":0.2310629611,"remaining_time":0.3917482548}, +{"learn":[0.4058952715],"iteration":371,"passed_time":0.2351638011,"remaining_time":0.3969969546}, +{"learn":[0.4054609069],"iteration":372,"passed_time":0.23588082,"remaining_time":0.3965074373}, +{"learn":[0.4049458589],"iteration":373,"passed_time":0.2364975176,"remaining_time":0.3958487862}, +{"learn":[0.4045187661],"iteration":374,"passed_time":0.2372490337,"remaining_time":0.3954150561}, +{"learn":[0.4038458211],"iteration":375,"passed_time":0.2379071814,"remaining_time":0.394824684}, +{"learn":[0.4031138676],"iteration":376,"passed_time":0.2384520287,"remaining_time":0.3940467212}, +{"learn":[0.4025977935],"iteration":377,"passed_time":0.2390649109,"remaining_time":0.3933819433}, +{"learn":[0.4021515165],"iteration":378,"passed_time":0.2396257877,"remaining_time":0.3926322274}, +{"learn":[0.4017208729],"iteration":379,"passed_time":0.2402559285,"remaining_time":0.3919965148}, +{"learn":[0.4012702831],"iteration":380,"passed_time":0.2413900987,"remaining_time":0.3921797142}, +{"learn":[0.400815968],"iteration":381,"passed_time":0.2422200659,"remaining_time":0.3918638763}, +{"learn":[0.4003585875],"iteration":382,"passed_time":0.2430818001,"remaining_time":0.3915965292}, +{"learn":[0.3996368136],"iteration":383,"passed_time":0.243582873,"remaining_time":0.3907475255}, +{"learn":[0.3989150098],"iteration":384,"passed_time":0.2441080496,"remaining_time":0.3899388325}, +{"learn":[0.3984945757],"iteration":385,"passed_time":0.2448157116,"remaining_time":0.3894218832}, +{"learn":[0.3979838576],"iteration":386,"passed_time":0.2457731315,"remaining_time":0.3892995597}, +{"learn":[0.3975541421],"iteration":387,"passed_time":0.2465708034,"remaining_time":0.3889209579}, 
+{"learn":[0.3969472051],"iteration":388,"passed_time":0.2470648508,"remaining_time":0.3880633004}, +{"learn":[0.3965793593],"iteration":389,"passed_time":0.2477127547,"remaining_time":0.3874481548}, +{"learn":[0.3960817797],"iteration":390,"passed_time":0.2482348895,"remaining_time":0.3866369506}, +{"learn":[0.3953677969],"iteration":391,"passed_time":0.2486813269,"remaining_time":0.3857098131}, +{"learn":[0.3948702344],"iteration":392,"passed_time":0.2491915071,"remaining_time":0.3848835747}, +{"learn":[0.3943742556],"iteration":393,"passed_time":0.2497690259,"remaining_time":0.3841625119}, +{"learn":[0.3938194088],"iteration":394,"passed_time":0.2503357995,"remaining_time":0.3834257182}, +{"learn":[0.393111263],"iteration":395,"passed_time":0.2507962749,"remaining_time":0.3825276516}, +{"learn":[0.3924046883],"iteration":396,"passed_time":0.2513014639,"remaining_time":0.3816997046}, +{"learn":[0.3919914365],"iteration":397,"passed_time":0.2518446428,"remaining_time":0.3809308416}, +{"learn":[0.3914010099],"iteration":398,"passed_time":0.2523300163,"remaining_time":0.3800760396}, +{"learn":[0.390907675],"iteration":399,"passed_time":0.2528522688,"remaining_time":0.3792784032}, +{"learn":[0.390361424],"iteration":400,"passed_time":0.2533540735,"remaining_time":0.3784515961}, +{"learn":[0.3897764002],"iteration":401,"passed_time":0.2538981331,"remaining_time":0.3776892627}, +{"learn":[0.3892910055],"iteration":402,"passed_time":0.2544391916,"remaining_time":0.3769235667}, +{"learn":[0.3889340375],"iteration":403,"passed_time":0.2553128122,"remaining_time":0.3766495942}, +{"learn":[0.3882963104],"iteration":404,"passed_time":0.2560603135,"remaining_time":0.3761873742}, +{"learn":[0.3878849873],"iteration":405,"passed_time":0.2582861316,"remaining_time":0.3778866063}, +{"learn":[0.3873047105],"iteration":406,"passed_time":0.2590881032,"remaining_time":0.377492003}, +{"learn":[0.3869512975],"iteration":407,"passed_time":0.2609867835,"remaining_time":0.3786867055}, 
+{"learn":[0.3864888975],"iteration":408,"passed_time":0.261609425,"remaining_time":0.3780224209}, +{"learn":[0.3860683399],"iteration":409,"passed_time":0.2624639087,"remaining_time":0.3776919661}, +{"learn":[0.3854924994],"iteration":410,"passed_time":0.2635370918,"remaining_time":0.3776723773}, +{"learn":[0.3850928077],"iteration":411,"passed_time":0.2646888275,"remaining_time":0.377759783}, +{"learn":[0.3847414127],"iteration":412,"passed_time":0.2654285953,"remaining_time":0.3772556548}, +{"learn":[0.3842214559],"iteration":413,"passed_time":0.2661514942,"remaining_time":0.3767265111}, +{"learn":[0.3839176638],"iteration":414,"passed_time":0.2675213856,"remaining_time":0.3771084592}, +{"learn":[0.3834964207],"iteration":415,"passed_time":0.2682169357,"remaining_time":0.3765353136}, +{"learn":[0.3829643939],"iteration":416,"passed_time":0.2687970532,"remaining_time":0.3758001966}, +{"learn":[0.3826096952],"iteration":417,"passed_time":0.2698969921,"remaining_time":0.3757895918}, +{"learn":[0.3820993347],"iteration":418,"passed_time":0.2731110483,"remaining_time":0.3787052961}, +{"learn":[0.3815246395],"iteration":419,"passed_time":0.2745873571,"remaining_time":0.3791920646}, +{"learn":[0.3812263012],"iteration":420,"passed_time":0.2761327299,"remaining_time":0.3797644908}, +{"learn":[0.380756293],"iteration":421,"passed_time":0.2770033586,"remaining_time":0.3794027045}, +{"learn":[0.3803054477],"iteration":422,"passed_time":0.277682967,"remaining_time":0.3787779478}, +{"learn":[0.3799018434],"iteration":423,"passed_time":0.2782834207,"remaining_time":0.3780454018}, +{"learn":[0.3793343902],"iteration":424,"passed_time":0.2788537715,"remaining_time":0.3772727497}, +{"learn":[0.3789422768],"iteration":425,"passed_time":0.2795747181,"remaining_time":0.3767039628}, +{"learn":[0.3786449858],"iteration":426,"passed_time":0.2803431543,"remaining_time":0.3761981907}, +{"learn":[0.3781736961],"iteration":427,"passed_time":0.28105148,"remaining_time":0.3756108565}, 
+{"learn":[0.3776146003],"iteration":428,"passed_time":0.28163332,"remaining_time":0.3748546054}, +{"learn":[0.3772632352],"iteration":429,"passed_time":0.2821840289,"remaining_time":0.3740578987}, +{"learn":[0.3768726545],"iteration":430,"passed_time":0.2827157601,"remaining_time":0.3732372795}, +{"learn":[0.3763188847],"iteration":431,"passed_time":0.2832323109,"remaining_time":0.3723980384}, +{"learn":[0.3759762943],"iteration":432,"passed_time":0.2838044071,"remaining_time":0.3716330227}, +{"learn":[0.3756239925],"iteration":433,"passed_time":0.2847285037,"remaining_time":0.3713279564}, +{"learn":[0.3752392616],"iteration":434,"passed_time":0.2855756343,"remaining_time":0.3709200768}, +{"learn":[0.3746320861],"iteration":435,"passed_time":0.2861310843,"remaining_time":0.3701328705}, +{"learn":[0.3739689333],"iteration":436,"passed_time":0.2866816654,"remaining_time":0.3693404523}, +{"learn":[0.3736773006],"iteration":437,"passed_time":0.2872724155,"remaining_time":0.3686006792}, +{"learn":[0.3732340549],"iteration":438,"passed_time":0.2878093622,"remaining_time":0.3677928296}, +{"learn":[0.3725745422],"iteration":439,"passed_time":0.2883467403,"remaining_time":0.3669867603}, +{"learn":[0.3719174904],"iteration":440,"passed_time":0.2894269674,"remaining_time":0.3668700108}, +{"learn":[0.3714807417],"iteration":441,"passed_time":0.2899982225,"remaining_time":0.3661063533}, +{"learn":[0.3710459641],"iteration":442,"passed_time":0.2905078938,"remaining_time":0.3652661328}, +{"learn":[0.3703917946],"iteration":443,"passed_time":0.2909793189,"remaining_time":0.3643795075}, +{"learn":[0.3699332518],"iteration":444,"passed_time":0.2915444008,"remaining_time":0.3636115561}, +{"learn":[0.3694799159],"iteration":445,"passed_time":0.2920996989,"remaining_time":0.3628323615}, +{"learn":[0.3688319496],"iteration":446,"passed_time":0.2925414694,"remaining_time":0.3619137194}, +{"learn":[0.3682378743],"iteration":447,"passed_time":0.2931437852,"remaining_time":0.3611950211}, 
+{"learn":[0.3677363523],"iteration":448,"passed_time":0.2936419292,"remaining_time":0.3603490044}, +{"learn":[0.3674085396],"iteration":449,"passed_time":0.2942661609,"remaining_time":0.3596586411}, +{"learn":[0.367077423],"iteration":450,"passed_time":0.2949064941,"remaining_time":0.3589881713}, +{"learn":[0.3667051792],"iteration":451,"passed_time":0.2954570075,"remaining_time":0.3582089383}, +{"learn":[0.3664227],"iteration":452,"passed_time":0.2960547363,"remaining_time":0.357487728}, +{"learn":[0.3660525254],"iteration":453,"passed_time":0.2965446128,"remaining_time":0.3566373538}, +{"learn":[0.3654101065],"iteration":454,"passed_time":0.2970064531,"remaining_time":0.3557549823}, +{"learn":[0.3649876288],"iteration":455,"passed_time":0.2975205166,"remaining_time":0.3549367567}, +{"learn":[0.3646104591],"iteration":456,"passed_time":0.2980737725,"remaining_time":0.3541664298}, +{"learn":[0.364115434],"iteration":457,"passed_time":0.2990222064,"remaining_time":0.3538647071}, +{"learn":[0.3637939692],"iteration":458,"passed_time":0.3000197081,"remaining_time":0.3536180002}, +{"learn":[0.3634185621],"iteration":459,"passed_time":0.3006429692,"remaining_time":0.352928703}, +{"learn":[0.3628906863],"iteration":460,"passed_time":0.301148196,"remaining_time":0.3521016869}, +{"learn":[0.362450093],"iteration":461,"passed_time":0.3016677058,"remaining_time":0.3512926963}, +{"learn":[0.3618173812],"iteration":462,"passed_time":0.3021884164,"remaining_time":0.350486349}, +{"learn":[0.3612926432],"iteration":463,"passed_time":0.302972743,"remaining_time":0.3499857549}, +{"learn":[0.3608229033],"iteration":464,"passed_time":0.3040010826,"remaining_time":0.3497646864}, +{"learn":[0.3603342303],"iteration":465,"passed_time":0.3049503335,"remaining_time":0.3494495238}, +{"learn":[0.3599510874],"iteration":466,"passed_time":0.3096284342,"remaining_time":0.3533874849}, +{"learn":[0.3593257538],"iteration":467,"passed_time":0.3103259611,"remaining_time":0.3527636994}, 
+{"learn":[0.3588909294],"iteration":468,"passed_time":0.3113314734,"remaining_time":0.3524882993}, +{"learn":[0.3584752466],"iteration":469,"passed_time":0.3120732412,"remaining_time":0.3519123784}, +{"learn":[0.3580092447],"iteration":470,"passed_time":0.3127731955,"remaining_time":0.3512887907}, +{"learn":[0.3575279159],"iteration":471,"passed_time":0.3134354027,"remaining_time":0.3506226539}, +{"learn":[0.3570059666],"iteration":472,"passed_time":0.3153024356,"remaining_time":0.3512989081}, +{"learn":[0.356525468],"iteration":473,"passed_time":0.3157099761,"remaining_time":0.3503448258}, +{"learn":[0.3561132167],"iteration":474,"passed_time":0.3161932925,"remaining_time":0.349476797}, +{"learn":[0.3557490196],"iteration":475,"passed_time":0.3166485945,"remaining_time":0.3485795452}, +{"learn":[0.3554368232],"iteration":476,"passed_time":0.3172151434,"remaining_time":0.3478061216}, +{"learn":[0.3550801958],"iteration":477,"passed_time":0.3176272495,"remaining_time":0.3468649043}, +{"learn":[0.3544650078],"iteration":478,"passed_time":0.3179188014,"remaining_time":0.3457947715}, +{"learn":[0.3541559832],"iteration":479,"passed_time":0.3183427669,"remaining_time":0.3448713308}, +{"learn":[0.3537509612],"iteration":480,"passed_time":0.3187753508,"remaining_time":0.3439592662}, +{"learn":[0.3532782325],"iteration":481,"passed_time":0.3191261262,"remaining_time":0.3429612726}, +{"learn":[0.3527637294],"iteration":482,"passed_time":0.3194514467,"remaining_time":0.3419387121}, +{"learn":[0.3522067794],"iteration":483,"passed_time":0.3197839534,"remaining_time":0.3409266941}, +{"learn":[0.3516512513],"iteration":484,"passed_time":0.3201487053,"remaining_time":0.339951718}, +{"learn":[0.3513434274],"iteration":485,"passed_time":0.3205595731,"remaining_time":0.3390280259}, +{"learn":[0.3509157343],"iteration":486,"passed_time":0.3209247611,"remaining_time":0.3380583213}, +{"learn":[0.3505166599],"iteration":487,"passed_time":0.3212523916,"remaining_time":0.3370516896}, 
+{"learn":[0.3500145929],"iteration":488,"passed_time":0.3215824291,"remaining_time":0.3360503502}, +{"learn":[0.3496588809],"iteration":489,"passed_time":0.3219426458,"remaining_time":0.335083162}, +{"learn":[0.3492625185],"iteration":490,"passed_time":0.322399651,"remaining_time":0.3342187829}, +{"learn":[0.3488656453],"iteration":491,"passed_time":0.3227923015,"remaining_time":0.3332896121}, +{"learn":[0.3484638504],"iteration":492,"passed_time":0.3231560736,"remaining_time":0.3323329195}, +{"learn":[0.3480004881],"iteration":493,"passed_time":0.3234965985,"remaining_time":0.3313548154}, +{"learn":[0.3476062758],"iteration":494,"passed_time":0.323941694,"remaining_time":0.3304859706}, +{"learn":[0.3472642175],"iteration":495,"passed_time":0.324350876,"remaining_time":0.3295823417}, +{"learn":[0.3468141258],"iteration":496,"passed_time":0.3247473794,"remaining_time":0.3286678709}, +{"learn":[0.3463197521],"iteration":497,"passed_time":0.3250688244,"remaining_time":0.3276798189}, +{"learn":[0.3460200386],"iteration":498,"passed_time":0.3254756201,"remaining_time":0.3267801316}, +{"learn":[0.3455273892],"iteration":499,"passed_time":0.3258117316,"remaining_time":0.3258117316}, +{"learn":[0.3449356386],"iteration":500,"passed_time":0.3261248511,"remaining_time":0.3248229555}, +{"learn":[0.3445860488],"iteration":501,"passed_time":0.3265338473,"remaining_time":0.3239319839}, +{"learn":[0.3441699871],"iteration":502,"passed_time":0.3269095564,"remaining_time":0.3230100388}, +{"learn":[0.3437617634],"iteration":503,"passed_time":0.3272397571,"remaining_time":0.3220454753}, +{"learn":[0.343414247],"iteration":504,"passed_time":0.3276446868,"remaining_time":0.3211566732}, +{"learn":[0.3430223976],"iteration":505,"passed_time":0.3283799178,"remaining_time":0.3205922518}, +{"learn":[0.34276789],"iteration":506,"passed_time":0.3288634981,"remaining_time":0.3197824548}, +{"learn":[0.3423123913],"iteration":507,"passed_time":0.3292155659,"remaining_time":0.3188465717}, 
+{"learn":[0.3419258424],"iteration":508,"passed_time":0.3295586355,"remaining_time":0.3179043026}, +{"learn":[0.3414402349],"iteration":509,"passed_time":0.3298939341,"remaining_time":0.3169569171}, +{"learn":[0.3409900154],"iteration":510,"passed_time":0.3302300308,"remaining_time":0.3160126909}, +{"learn":[0.3404066435],"iteration":511,"passed_time":0.330526685,"remaining_time":0.3150332467}, +{"learn":[0.3400745732],"iteration":512,"passed_time":0.3315326622,"remaining_time":0.3147298372}, +{"learn":[0.3396446535],"iteration":513,"passed_time":0.3319733954,"remaining_time":0.3138892416}, +{"learn":[0.3392627665],"iteration":514,"passed_time":0.3323251441,"remaining_time":0.3129663978}, +{"learn":[0.3389605539],"iteration":515,"passed_time":0.3327506488,"remaining_time":0.3121149497}, +{"learn":[0.3386215099],"iteration":516,"passed_time":0.3331681775,"remaining_time":0.3112576977}, +{"learn":[0.3381765485],"iteration":517,"passed_time":0.3334935631,"remaining_time":0.3103164043}, +{"learn":[0.3376931037],"iteration":518,"passed_time":0.3338169224,"remaining_time":0.3093756063}, +{"learn":[0.337248815],"iteration":519,"passed_time":0.3341415136,"remaining_time":0.3084383203}, +{"learn":[0.3367245538],"iteration":520,"passed_time":0.3344925904,"remaining_time":0.3075277367}, +{"learn":[0.3363859143],"iteration":521,"passed_time":0.3376304277,"remaining_time":0.3091711579}, +{"learn":[0.3359881639],"iteration":522,"passed_time":0.3385165422,"remaining_time":0.3087426207}, +{"learn":[0.335651734],"iteration":523,"passed_time":0.3391893682,"remaining_time":0.3081185864}, +{"learn":[0.335174254],"iteration":524,"passed_time":0.3398135656,"remaining_time":0.3074503688}, +{"learn":[0.3346553019],"iteration":525,"passed_time":0.340461328,"remaining_time":0.3068035541}, +{"learn":[0.3342645126],"iteration":526,"passed_time":0.3411102502,"remaining_time":0.3061577768}, +{"learn":[0.333931561],"iteration":527,"passed_time":0.3421015002,"remaining_time":0.3058180078}, 
+{"learn":[0.3334565205],"iteration":528,"passed_time":0.3429734602,"remaining_time":0.3053695647}, +{"learn":[0.3331309302],"iteration":529,"passed_time":0.3437869073,"remaining_time":0.3048676348}, +{"learn":[0.3328454452],"iteration":530,"passed_time":0.3462173018,"remaining_time":0.3057926827}, +{"learn":[0.3325251554],"iteration":531,"passed_time":0.3466204815,"remaining_time":0.304921777}, +{"learn":[0.3321362521],"iteration":532,"passed_time":0.3474775273,"remaining_time":0.3044502913}, +{"learn":[0.3318056336],"iteration":533,"passed_time":0.3478953928,"remaining_time":0.3035941068}, +{"learn":[0.3313364259],"iteration":534,"passed_time":0.3482484741,"remaining_time":0.3026832532}, +{"learn":[0.3309532276],"iteration":535,"passed_time":0.3486176657,"remaining_time":0.3017884271}, +{"learn":[0.3306332145],"iteration":536,"passed_time":0.3494756577,"remaining_time":0.301317001}, +{"learn":[0.3302503697],"iteration":537,"passed_time":0.3499166267,"remaining_time":0.3004860252}, +{"learn":[0.3298849165],"iteration":538,"passed_time":0.3502547191,"remaining_time":0.2995685075}, +{"learn":[0.3294564698],"iteration":539,"passed_time":0.3505782409,"remaining_time":0.2986407237}, +{"learn":[0.3290752811],"iteration":540,"passed_time":0.3509396463,"remaining_time":0.2977473154}, +{"learn":[0.3286145287],"iteration":541,"passed_time":0.351250937,"remaining_time":0.2968135224}, +{"learn":[0.3283276004],"iteration":542,"passed_time":0.3517529965,"remaining_time":0.2960425771}, +{"learn":[0.327948004],"iteration":543,"passed_time":0.352163302,"remaining_time":0.295195709}, +{"learn":[0.3276326656],"iteration":544,"passed_time":0.3524674003,"remaining_time":0.2942617746}, +{"learn":[0.3273104302],"iteration":545,"passed_time":0.3528592852,"remaining_time":0.2934031419}, +{"learn":[0.3269880712],"iteration":546,"passed_time":0.3532791953,"remaining_time":0.292569425}, +{"learn":[0.3266501256],"iteration":547,"passed_time":0.3536846858,"remaining_time":0.2917253248}, 
+{"learn":[0.3262259236],"iteration":548,"passed_time":0.3540171476,"remaining_time":0.2908228298}, +{"learn":[0.3258500312],"iteration":549,"passed_time":0.3543562466,"remaining_time":0.2899278381}, +{"learn":[0.3253507018],"iteration":550,"passed_time":0.3546589755,"remaining_time":0.2890052269}, +{"learn":[0.3250135141],"iteration":551,"passed_time":0.355055667,"remaining_time":0.288161121}, +{"learn":[0.3246421516],"iteration":552,"passed_time":0.3554378948,"remaining_time":0.2873069421}, +{"learn":[0.3244070964],"iteration":553,"passed_time":0.3558949874,"remaining_time":0.2865147371}, +{"learn":[0.3240867768],"iteration":554,"passed_time":0.3562683347,"remaining_time":0.2856565927}, +{"learn":[0.3238518877],"iteration":555,"passed_time":0.3570102351,"remaining_time":0.2850945043}, +{"learn":[0.3235307634],"iteration":556,"passed_time":0.3574072901,"remaining_time":0.2842575036}, +{"learn":[0.3231597074],"iteration":557,"passed_time":0.3577621642,"remaining_time":0.2833886677}, +{"learn":[0.3226190891],"iteration":558,"passed_time":0.3580537165,"remaining_time":0.2824717155}, +{"learn":[0.3221721266],"iteration":559,"passed_time":0.3583790962,"remaining_time":0.2815835756}, +{"learn":[0.3217267905],"iteration":560,"passed_time":0.3587386326,"remaining_time":0.2807241706}, +{"learn":[0.32127652],"iteration":561,"passed_time":0.3590720036,"remaining_time":0.2798461523}, +{"learn":[0.3210437383],"iteration":562,"passed_time":0.3595055899,"remaining_time":0.2790478558}, +{"learn":[0.3208125659],"iteration":563,"passed_time":0.3598968851,"remaining_time":0.2782181594}, +{"learn":[0.3204400071],"iteration":564,"passed_time":0.3602902071,"remaining_time":0.2773915754}, +{"learn":[0.3199909372],"iteration":565,"passed_time":0.36064548,"remaining_time":0.2765373469}, +{"learn":[0.3196365365],"iteration":566,"passed_time":0.3609983237,"remaining_time":0.2756830232}, +{"learn":[0.3191042415],"iteration":567,"passed_time":0.3612906944,"remaining_time":0.2747844718}, 
+{"learn":[0.3186643677],"iteration":568,"passed_time":0.3616025116,"remaining_time":0.2739027812}, +{"learn":[0.3183591408],"iteration":569,"passed_time":0.362010737,"remaining_time":0.2730958191}, +{"learn":[0.3178278974],"iteration":570,"passed_time":0.3623038477,"remaining_time":0.2722037665}, +{"learn":[0.3174204188],"iteration":571,"passed_time":0.3626155564,"remaining_time":0.271327724}, +{"learn":[0.3169393625],"iteration":572,"passed_time":0.3629143003,"remaining_time":0.2704439899}, +{"learn":[0.3165036993],"iteration":573,"passed_time":0.363220913,"remaining_time":0.269568134}, +{"learn":[0.3161586651],"iteration":574,"passed_time":0.363547929,"remaining_time":0.2687093388}, +{"learn":[0.3158007775],"iteration":575,"passed_time":0.3639338083,"remaining_time":0.26789572}, +{"learn":[0.3153626706],"iteration":576,"passed_time":0.3642983688,"remaining_time":0.267067955}, +{"learn":[0.3148854673],"iteration":577,"passed_time":0.3646425797,"remaining_time":0.2662269354}, +{"learn":[0.3143635946],"iteration":578,"passed_time":0.3650496702,"remaining_time":0.2654333526}, +{"learn":[0.3138878771],"iteration":579,"passed_time":0.3660084401,"remaining_time":0.2650405946}, +{"learn":[0.3134522523],"iteration":580,"passed_time":0.3665438913,"remaining_time":0.2643406032}, +{"learn":[0.3131497758],"iteration":581,"passed_time":0.3669453893,"remaining_time":0.2635449703}, +{"learn":[0.3128520804],"iteration":582,"passed_time":0.3673621961,"remaining_time":0.2627616394}, +{"learn":[0.3124257113],"iteration":583,"passed_time":0.3679513895,"remaining_time":0.2621023596}, +{"learn":[0.3121328269],"iteration":584,"passed_time":0.3684001299,"remaining_time":0.2613436819}, +{"learn":[0.311617566],"iteration":585,"passed_time":0.3687012989,"remaining_time":0.260481805}, +{"learn":[0.3113503967],"iteration":586,"passed_time":0.3690998272,"remaining_time":0.2596903384}, +{"learn":[0.3110510664],"iteration":587,"passed_time":0.3695252416,"remaining_time":0.2589190468}, 
+{"learn":[0.3105376831],"iteration":588,"passed_time":0.3698207864,"remaining_time":0.2580583076}, +{"learn":[0.3101793698],"iteration":589,"passed_time":0.3701629642,"remaining_time":0.2572318903}, +{"learn":[0.3097576456],"iteration":590,"passed_time":0.3705036756,"remaining_time":0.256406097}, +{"learn":[0.3093302974],"iteration":591,"passed_time":0.3708391752,"remaining_time":0.2555783505}, +{"learn":[0.3089044435],"iteration":592,"passed_time":0.3711963898,"remaining_time":0.254767168}, +{"learn":[0.3085143992],"iteration":593,"passed_time":0.3715781563,"remaining_time":0.2539742954}, +{"learn":[0.308161561],"iteration":594,"passed_time":0.3719649689,"remaining_time":0.2531862393}, +{"learn":[0.307898977],"iteration":595,"passed_time":0.3724476686,"remaining_time":0.252464527}, +{"learn":[0.3076050154],"iteration":596,"passed_time":0.3728517823,"remaining_time":0.2516905666}, +{"learn":[0.3073084865],"iteration":597,"passed_time":0.3732415627,"remaining_time":0.2509082077}, +{"learn":[0.3070910786],"iteration":598,"passed_time":0.3736596733,"remaining_time":0.2501461252}, +{"learn":[0.3066311819],"iteration":599,"passed_time":0.3740099804,"remaining_time":0.249339987}, +{"learn":[0.3063479534],"iteration":600,"passed_time":0.3743961785,"remaining_time":0.2485591934}, +{"learn":[0.3059329774],"iteration":601,"passed_time":0.3747714456,"remaining_time":0.247772484}, +{"learn":[0.3055891906],"iteration":602,"passed_time":0.3751904495,"remaining_time":0.2470159344}, +{"learn":[0.3052941263],"iteration":603,"passed_time":0.3756562411,"remaining_time":0.246291178}, +{"learn":[0.3049547076],"iteration":604,"passed_time":0.3760269905,"remaining_time":0.2455052252}, +{"learn":[0.3044544842],"iteration":605,"passed_time":0.3763324552,"remaining_time":0.2446781969}, +{"learn":[0.3041649589],"iteration":606,"passed_time":0.3768138714,"remaining_time":0.2439668064}, +{"learn":[0.3036680775],"iteration":607,"passed_time":0.3773010596,"remaining_time":0.2432598937}, 
+{"learn":[0.3033767513],"iteration":608,"passed_time":0.3780158821,"remaining_time":0.2426998521}, +{"learn":[0.3029951879],"iteration":609,"passed_time":0.3786339673,"remaining_time":0.2420774545}, +{"learn":[0.3025867684],"iteration":610,"passed_time":0.3794267042,"remaining_time":0.241566265}, +{"learn":[0.3020925096],"iteration":611,"passed_time":0.3800789752,"remaining_time":0.2409651019}, +{"learn":[0.3018062498],"iteration":612,"passed_time":0.3807328261,"remaining_time":0.2403647695}, +{"learn":[0.3013960038],"iteration":613,"passed_time":0.3813861455,"remaining_time":0.2397639286}, +{"learn":[0.3010602551],"iteration":614,"passed_time":0.3821270163,"remaining_time":0.2392177257}, +{"learn":[0.3007759707],"iteration":615,"passed_time":0.383546687,"remaining_time":0.2390940387}, +{"learn":[0.3005227702],"iteration":616,"passed_time":0.3853002015,"remaining_time":0.2391733828}, +{"learn":[0.3001474057],"iteration":617,"passed_time":0.3858573272,"remaining_time":0.2385072799}, +{"learn":[0.2998298875],"iteration":618,"passed_time":0.3864455551,"remaining_time":0.2378606729}, +{"learn":[0.2994279989],"iteration":619,"passed_time":0.3870972323,"remaining_time":0.2372531424}, +{"learn":[0.2990533326],"iteration":620,"passed_time":0.387642183,"remaining_time":0.2365803339}, +{"learn":[0.2986103424],"iteration":621,"passed_time":0.3882269796,"remaining_time":0.2359321516}, +{"learn":[0.2982386308],"iteration":622,"passed_time":0.3890611999,"remaining_time":0.2354351081}, +{"learn":[0.2980017236],"iteration":623,"passed_time":0.389633898,"remaining_time":0.2347794001}, +{"learn":[0.2976457221],"iteration":624,"passed_time":0.3902746228,"remaining_time":0.2341647737}, +{"learn":[0.2973187694],"iteration":625,"passed_time":0.3908383146,"remaining_time":0.233504041}, +{"learn":[0.2970820921],"iteration":626,"passed_time":0.3915603281,"remaining_time":0.2329378028}, +{"learn":[0.2968406635],"iteration":627,"passed_time":0.3927190752,"remaining_time":0.2326297706}, 
+{"learn":[0.29651102],"iteration":628,"passed_time":0.3932948847,"remaining_time":0.2319752023}, +{"learn":[0.2962718989],"iteration":629,"passed_time":0.3939082796,"remaining_time":0.2313429579}, +{"learn":[0.2960237563],"iteration":630,"passed_time":0.3945193363,"remaining_time":0.2307094059}, +{"learn":[0.2957421456],"iteration":631,"passed_time":0.3951295835,"remaining_time":0.2300754537}, +{"learn":[0.2954565712],"iteration":632,"passed_time":0.395840817,"remaining_time":0.2295001261}, +{"learn":[0.2951291544],"iteration":633,"passed_time":0.3968526821,"remaining_time":0.2290979206}, +{"learn":[0.2948581576],"iteration":634,"passed_time":0.3975057375,"remaining_time":0.2284875499}, +{"learn":[0.2944618165],"iteration":635,"passed_time":0.398007172,"remaining_time":0.2277902683}, +{"learn":[0.2940285674],"iteration":636,"passed_time":0.398549799,"remaining_time":0.2271170754}, +{"learn":[0.2937956708],"iteration":637,"passed_time":0.3990961718,"remaining_time":0.2264464173}, +{"learn":[0.2935192755],"iteration":638,"passed_time":0.3998986777,"remaining_time":0.2259208492}, +{"learn":[0.2932442342],"iteration":639,"passed_time":0.4008465293,"remaining_time":0.2254761727}, +{"learn":[0.2928142079],"iteration":640,"passed_time":0.4014032505,"remaining_time":0.2248108688}, +{"learn":[0.2925079678],"iteration":641,"passed_time":0.4019529065,"remaining_time":0.2241419635}, +{"learn":[0.2922439575],"iteration":642,"passed_time":0.4026259892,"remaining_time":0.2235419567}, +{"learn":[0.2919395225],"iteration":643,"passed_time":0.4032024573,"remaining_time":0.2228883149}, +{"learn":[0.2915480094],"iteration":644,"passed_time":0.4037289074,"remaining_time":0.2222073831}, +{"learn":[0.2912747434],"iteration":645,"passed_time":0.4049154446,"remaining_time":0.2218886492}, +{"learn":[0.2908083541],"iteration":646,"passed_time":0.405585167,"remaining_time":0.2212852611}, +{"learn":[0.2906082656],"iteration":647,"passed_time":0.4064820636,"remaining_time":0.2208050716}, 
+{"learn":[0.2902667182],"iteration":648,"passed_time":0.407054487,"remaining_time":0.2201481124}, +{"learn":[0.2900387049],"iteration":649,"passed_time":0.4077896999,"remaining_time":0.2195790692}, +{"learn":[0.2898409878],"iteration":650,"passed_time":0.4084738505,"remaining_time":0.218982141}, +{"learn":[0.289457913],"iteration":651,"passed_time":0.4091963881,"remaining_time":0.2184054341}, +{"learn":[0.2891859795],"iteration":652,"passed_time":0.4112077824,"remaining_time":0.2185131707}, +{"learn":[0.288721157],"iteration":653,"passed_time":0.4117850476,"remaining_time":0.2178556979}, +{"learn":[0.288452323],"iteration":654,"passed_time":0.4126922254,"remaining_time":0.2173722409}, +{"learn":[0.2882551636],"iteration":655,"passed_time":0.4134877791,"remaining_time":0.2168289573}, +{"learn":[0.2879799902],"iteration":656,"passed_time":0.4154170392,"remaining_time":0.21687678}, +{"learn":[0.2876007472],"iteration":657,"passed_time":0.4163582726,"remaining_time":0.2164050596}, +{"learn":[0.287326932],"iteration":658,"passed_time":0.4190196687,"remaining_time":0.2168220137}, +{"learn":[0.2869494557],"iteration":659,"passed_time":0.4195774715,"remaining_time":0.2161459702}, +{"learn":[0.2866300855],"iteration":660,"passed_time":0.4201767647,"remaining_time":0.2154915632}, +{"learn":[0.286373198],"iteration":661,"passed_time":0.4207802955,"remaining_time":0.2148394862}, +{"learn":[0.2859906852],"iteration":662,"passed_time":0.4213216894,"remaining_time":0.2141559718}, +{"learn":[0.2855351972],"iteration":663,"passed_time":0.4218715628,"remaining_time":0.2134771763}, +{"learn":[0.2851559839],"iteration":664,"passed_time":0.4224128604,"remaining_time":0.2127944485}, +{"learn":[0.2848492605],"iteration":665,"passed_time":0.4229119453,"remaining_time":0.2120909756}, +{"learn":[0.2845873215],"iteration":666,"passed_time":0.4235194938,"remaining_time":0.211442266}, +{"learn":[0.2843303766],"iteration":667,"passed_time":0.4240966697,"remaining_time":0.2107785844}, 
+{"learn":[0.2840162835],"iteration":668,"passed_time":0.4246249729,"remaining_time":0.2100909806}, +{"learn":[0.2837108139],"iteration":669,"passed_time":0.4252157833,"remaining_time":0.2094346395}, +{"learn":[0.2833648005],"iteration":670,"passed_time":0.4257435042,"remaining_time":0.2087475602}, +{"learn":[0.2830218524],"iteration":671,"passed_time":0.4262304825,"remaining_time":0.2080410689}, +{"learn":[0.2826870212],"iteration":672,"passed_time":0.4267960153,"remaining_time":0.2073733982}, +{"learn":[0.2823174106],"iteration":673,"passed_time":0.4274357705,"remaining_time":0.2067419306}, +{"learn":[0.2820060615],"iteration":674,"passed_time":0.4279301271,"remaining_time":0.2060404316}, +{"learn":[0.2815587755],"iteration":675,"passed_time":0.4283903348,"remaining_time":0.2053231782}, +{"learn":[0.2813848065],"iteration":676,"passed_time":0.4289439582,"remaining_time":0.2046512533}, +{"learn":[0.28112724],"iteration":677,"passed_time":0.4295107341,"remaining_time":0.2039859238}, +{"learn":[0.2808396901],"iteration":678,"passed_time":0.4300008989,"remaining_time":0.2032846665}, +{"learn":[0.2806121494],"iteration":679,"passed_time":0.430644332,"remaining_time":0.2026561562}, +{"learn":[0.2802471199],"iteration":680,"passed_time":0.4311365141,"remaining_time":0.2019567518}, +{"learn":[0.2798035315],"iteration":681,"passed_time":0.4315784996,"remaining_time":0.2012345497}, +{"learn":[0.2795483172],"iteration":682,"passed_time":0.4321520022,"remaining_time":0.2005742089}, +{"learn":[0.2792404869],"iteration":683,"passed_time":0.4327219428,"remaining_time":0.1999124765}, +{"learn":[0.2789540802],"iteration":684,"passed_time":0.4332210177,"remaining_time":0.1992184242}, +{"learn":[0.2786558292],"iteration":685,"passed_time":0.4337036596,"remaining_time":0.1985174185}, +{"learn":[0.2782940418],"iteration":686,"passed_time":0.4342437486,"remaining_time":0.1978432217}, +{"learn":[0.277895276],"iteration":687,"passed_time":0.4347123066,"remaining_time":0.1971369763}, 
+{"learn":[0.2775917564],"iteration":688,"passed_time":0.4352487322,"remaining_time":0.1964620547}, +{"learn":[0.2772567102],"iteration":689,"passed_time":0.4357865156,"remaining_time":0.1957881447}, +{"learn":[0.2770056639],"iteration":690,"passed_time":0.4364056991,"remaining_time":0.195151029}, +{"learn":[0.2768216282],"iteration":691,"passed_time":0.4371092238,"remaining_time":0.1945515042}, +{"learn":[0.2765277241],"iteration":692,"passed_time":0.4378316981,"remaining_time":0.1939600741}, +{"learn":[0.2762676307],"iteration":693,"passed_time":0.4386821423,"remaining_time":0.193424691}, +{"learn":[0.2760578649],"iteration":694,"passed_time":0.4394127408,"remaining_time":0.1928358071}, +{"learn":[0.2757622919],"iteration":695,"passed_time":0.4399842425,"remaining_time":0.1921770255}, +{"learn":[0.2754316394],"iteration":696,"passed_time":0.4404868095,"remaining_time":0.1914885269}, +{"learn":[0.2749989714],"iteration":697,"passed_time":0.4409797478,"remaining_time":0.1907963952}, +{"learn":[0.2745687238],"iteration":698,"passed_time":0.4419040317,"remaining_time":0.1902905773}, +{"learn":[0.2743486528],"iteration":699,"passed_time":0.4439039594,"remaining_time":0.190244554}, +{"learn":[0.2741394831],"iteration":700,"passed_time":0.4447956434,"remaining_time":0.189720253}, +{"learn":[0.2738577638],"iteration":701,"passed_time":0.445638625,"remaining_time":0.1891742311}, +{"learn":[0.2736173953],"iteration":702,"passed_time":0.4463544359,"remaining_time":0.1885736379}, +{"learn":[0.2734091218],"iteration":703,"passed_time":0.4474496523,"remaining_time":0.1881322402}, +{"learn":[0.2732017615],"iteration":704,"passed_time":0.4482101406,"remaining_time":0.1875489241}, +{"learn":[0.2729077467],"iteration":705,"passed_time":0.4488166122,"remaining_time":0.1869009688}, +{"learn":[0.2726194156],"iteration":706,"passed_time":0.4491539468,"remaining_time":0.1861415932}, +{"learn":[0.2723656573],"iteration":707,"passed_time":0.4496752879,"remaining_time":0.1854592995}, 
+{"learn":[0.2721585333],"iteration":708,"passed_time":0.4500881214,"remaining_time":0.1847329243}, +{"learn":[0.2719530272],"iteration":709,"passed_time":0.4505072974,"remaining_time":0.1840100229}, +{"learn":[0.2716654135],"iteration":710,"passed_time":0.4508423546,"remaining_time":0.1832537841}, +{"learn":[0.2713763544],"iteration":711,"passed_time":0.4513510253,"remaining_time":0.182568954}, +{"learn":[0.271054617],"iteration":712,"passed_time":0.4517413073,"remaining_time":0.1818369638}, +{"learn":[0.270765279],"iteration":713,"passed_time":0.4521095738,"remaining_time":0.1810971122}, +{"learn":[0.2705314734],"iteration":714,"passed_time":0.4524674402,"remaining_time":0.1803541545}, +{"learn":[0.2701860922],"iteration":715,"passed_time":0.4527827674,"remaining_time":0.1795953994}, +{"learn":[0.2699523228],"iteration":716,"passed_time":0.4532239041,"remaining_time":0.1788875382}, +{"learn":[0.2695332468],"iteration":717,"passed_time":0.4535409006,"remaining_time":0.1781316629}, +{"learn":[0.2692927846],"iteration":718,"passed_time":0.4539108459,"remaining_time":0.1773977019}, +{"learn":[0.2690525545],"iteration":719,"passed_time":0.4542673415,"remaining_time":0.1766595217}, +{"learn":[0.2686732369],"iteration":720,"passed_time":0.4545854733,"remaining_time":0.1759075549}, +{"learn":[0.2684303522],"iteration":721,"passed_time":0.4549509786,"remaining_time":0.1751750305}, +{"learn":[0.268084062],"iteration":722,"passed_time":0.4553284021,"remaining_time":0.1744480876}, +{"learn":[0.2678163158],"iteration":723,"passed_time":0.4556837334,"remaining_time":0.1737136884}, +{"learn":[0.2675443845],"iteration":724,"passed_time":0.4565697413,"remaining_time":0.173181626}, +{"learn":[0.2673065875],"iteration":725,"passed_time":0.4573022374,"remaining_time":0.1725906516}, +{"learn":[0.267035761],"iteration":726,"passed_time":0.4577266771,"remaining_time":0.1718836078}, +{"learn":[0.2668061469],"iteration":727,"passed_time":0.4582369921,"remaining_time":0.1712094256}, 
+{"learn":[0.2665300156],"iteration":728,"passed_time":0.458587581,"remaining_time":0.1704763161}, +{"learn":[0.2663027665],"iteration":729,"passed_time":0.4589974774,"remaining_time":0.1697661903}, +{"learn":[0.2661044661],"iteration":730,"passed_time":0.4596895263,"remaining_time":0.1691607149}, +{"learn":[0.2658744007],"iteration":731,"passed_time":0.4601046902,"remaining_time":0.1684536297}, +{"learn":[0.2655614189],"iteration":732,"passed_time":0.4606128688,"remaining_time":0.1677812223}, +{"learn":[0.265313834],"iteration":733,"passed_time":0.4610676066,"remaining_time":0.1670898956}, +{"learn":[0.2650115107],"iteration":734,"passed_time":0.4614170024,"remaining_time":0.1663612322}, +{"learn":[0.2646720558],"iteration":735,"passed_time":0.4617790771,"remaining_time":0.1656381472}, +{"learn":[0.2644053825],"iteration":736,"passed_time":0.4621144345,"remaining_time":0.1649065079}, +{"learn":[0.2639993003],"iteration":737,"passed_time":0.4624059627,"remaining_time":0.1641603824}, +{"learn":[0.2637928831],"iteration":738,"passed_time":0.4627744662,"remaining_time":0.1634426734}, +{"learn":[0.26338629],"iteration":739,"passed_time":0.4631056449,"remaining_time":0.1627127942}, +{"learn":[0.2631142416],"iteration":740,"passed_time":0.4634984127,"remaining_time":0.1620055181}, +{"learn":[0.262746945],"iteration":741,"passed_time":0.4638036187,"remaining_time":0.1612686437}, +{"learn":[0.2625778381],"iteration":742,"passed_time":0.4645685041,"remaining_time":0.1606919321}, +{"learn":[0.26234528],"iteration":743,"passed_time":0.4650294033,"remaining_time":0.1600101173}, +{"learn":[0.262142009],"iteration":744,"passed_time":0.4654747442,"remaining_time":0.1593235702}, +{"learn":[0.2617390794],"iteration":745,"passed_time":0.4658131417,"remaining_time":0.1586012574}, +{"learn":[0.2614471678],"iteration":746,"passed_time":0.4662386401,"remaining_time":0.1579094725}, +{"learn":[0.2611715304],"iteration":747,"passed_time":0.466616248,"remaining_time":0.1572022654}, 
+{"learn":[0.2608939814],"iteration":748,"passed_time":0.4670498365,"remaining_time":0.1565146982}, +{"learn":[0.260690721],"iteration":749,"passed_time":0.4674780929,"remaining_time":0.155826031}, +{"learn":[0.2603874398],"iteration":750,"passed_time":0.4677855066,"remaining_time":0.1550979909}, +{"learn":[0.2601193339],"iteration":751,"passed_time":0.4681716736,"remaining_time":0.1543970413}, +{"learn":[0.2597940266],"iteration":752,"passed_time":0.4685466653,"remaining_time":0.1536932621}, +{"learn":[0.2595251692],"iteration":753,"passed_time":0.4689000876,"remaining_time":0.1529833177}, +{"learn":[0.2592016033],"iteration":754,"passed_time":0.4692272787,"remaining_time":0.1522658057}, +{"learn":[0.2588046512],"iteration":755,"passed_time":0.4695262306,"remaining_time":0.151540212}, +{"learn":[0.2584771727],"iteration":756,"passed_time":0.4699087931,"remaining_time":0.1508425848}, +{"learn":[0.258083484],"iteration":757,"passed_time":0.4702699272,"remaining_time":0.1501389477}, +{"learn":[0.2576890822],"iteration":758,"passed_time":0.4706126784,"remaining_time":0.1494303761}, +{"learn":[0.2575262125],"iteration":759,"passed_time":0.4710687548,"remaining_time":0.1487585541}, +{"learn":[0.2573358842],"iteration":760,"passed_time":0.4714924382,"remaining_time":0.1480771258}, +{"learn":[0.2571070407],"iteration":761,"passed_time":0.4738636861,"remaining_time":0.1480046684}, +{"learn":[0.2567511073],"iteration":762,"passed_time":0.4750951317,"remaining_time":0.1475721444}, +{"learn":[0.2564899155],"iteration":763,"passed_time":0.4755368151,"remaining_time":0.1468935712}, +{"learn":[0.2561713372],"iteration":764,"passed_time":0.4758664528,"remaining_time":0.1461811979}, +{"learn":[0.2558850923],"iteration":765,"passed_time":0.4763280569,"remaining_time":0.1455101375}, +{"learn":[0.255597336],"iteration":766,"passed_time":0.4766923409,"remaining_time":0.1448100592}, +{"learn":[0.2553038406],"iteration":767,"passed_time":0.4769909137,"remaining_time":0.1440910052}, 
+{"learn":[0.2551415861],"iteration":768,"passed_time":0.4800090564,"remaining_time":0.1441899766}, +{"learn":[0.2549170256],"iteration":769,"passed_time":0.4805868065,"remaining_time":0.1435519032}, +{"learn":[0.2547311187],"iteration":770,"passed_time":0.4811551019,"remaining_time":0.1429111781}, +{"learn":[0.2544388579],"iteration":771,"passed_time":0.4816654229,"remaining_time":0.1422535187}, +{"learn":[0.2541200263],"iteration":772,"passed_time":0.4822915889,"remaining_time":0.1416302596}, +{"learn":[0.253959594],"iteration":773,"passed_time":0.4828837652,"remaining_time":0.1409970684}, +{"learn":[0.2537388567],"iteration":774,"passed_time":0.483432444,"remaining_time":0.1403513547}, +{"learn":[0.2534203806],"iteration":775,"passed_time":0.4839090885,"remaining_time":0.1396850977}, +{"learn":[0.2531755737],"iteration":776,"passed_time":0.4843985186,"remaining_time":0.1390229982}, +{"learn":[0.2529106076],"iteration":777,"passed_time":0.48489424,"remaining_time":0.1383631379}, +{"learn":[0.25252861],"iteration":778,"passed_time":0.485382096,"remaining_time":0.1377014675}, +{"learn":[0.2522401299],"iteration":779,"passed_time":0.4860105642,"remaining_time":0.1370799027}, +{"learn":[0.2518930776],"iteration":780,"passed_time":0.4865325231,"remaining_time":0.136428454}, +{"learn":[0.2515788674],"iteration":781,"passed_time":0.4871425093,"remaining_time":0.135801876}, +{"learn":[0.2513293752],"iteration":782,"passed_time":0.4881200719,"remaining_time":0.1352772102}, +{"learn":[0.2510782033],"iteration":783,"passed_time":0.488686901,"remaining_time":0.1346382278}, +{"learn":[0.2506980555],"iteration":784,"passed_time":0.491481106,"remaining_time":0.1346094749}, +{"learn":[0.2503226527],"iteration":785,"passed_time":0.4928145453,"remaining_time":0.1341759703}, +{"learn":[0.2499814757],"iteration":786,"passed_time":0.4933862558,"remaining_time":0.1335340184}, +{"learn":[0.2496719041],"iteration":787,"passed_time":0.4939144002,"remaining_time":0.1328805239}, 
+{"learn":[0.2494325127],"iteration":788,"passed_time":0.4944342649,"remaining_time":0.132225133}, +{"learn":[0.249093871],"iteration":789,"passed_time":0.4949448406,"remaining_time":0.1315676158}, +{"learn":[0.2488785131],"iteration":790,"passed_time":0.4964334978,"remaining_time":0.1311689014}, +{"learn":[0.2486212573],"iteration":791,"passed_time":0.4970151808,"remaining_time":0.1305292394}, +{"learn":[0.2483151789],"iteration":792,"passed_time":0.4976228655,"remaining_time":0.1298965109}, +{"learn":[0.2481008215],"iteration":793,"passed_time":0.4982025468,"remaining_time":0.1292565802}, +{"learn":[0.2478614556],"iteration":794,"passed_time":0.498857468,"remaining_time":0.1286362024}, +{"learn":[0.2475220348],"iteration":795,"passed_time":0.4993843042,"remaining_time":0.1279829121}, +{"learn":[0.2471528138],"iteration":796,"passed_time":0.4999155123,"remaining_time":0.1273310527}, +{"learn":[0.2467823178],"iteration":797,"passed_time":0.5004594386,"remaining_time":0.126682715}, +{"learn":[0.2465691545],"iteration":798,"passed_time":0.501245397,"remaining_time":0.1260955254}, +{"learn":[0.2464111426],"iteration":799,"passed_time":0.5017683106,"remaining_time":0.1254420777}, +{"learn":[0.2462568326],"iteration":800,"passed_time":0.502630562,"remaining_time":0.1248732607}, +{"learn":[0.245888627],"iteration":801,"passed_time":0.5034427368,"remaining_time":0.124291349}, +{"learn":[0.2457131488],"iteration":802,"passed_time":0.5043012692,"remaining_time":0.1237202367}, +{"learn":[0.2454448321],"iteration":803,"passed_time":0.5055405473,"remaining_time":0.1232412279}, +{"learn":[0.24514739],"iteration":804,"passed_time":0.5062728608,"remaining_time":0.1226375253}, +{"learn":[0.2449321066],"iteration":805,"passed_time":0.5085157913,"remaining_time":0.1223971011}, +{"learn":[0.244754027],"iteration":806,"passed_time":0.5094402014,"remaining_time":0.1218363803}, +{"learn":[0.2444520933],"iteration":807,"passed_time":0.5103976546,"remaining_time":0.121282611}, 
+{"learn":[0.244150364],"iteration":808,"passed_time":0.5111959177,"remaining_time":0.1206902599}, +{"learn":[0.2438756845],"iteration":809,"passed_time":0.5117936802,"remaining_time":0.1200503694}, +{"learn":[0.2436781462],"iteration":810,"passed_time":0.5130265818,"remaining_time":0.1195585992}, +{"learn":[0.2434370794],"iteration":811,"passed_time":0.5140712848,"remaining_time":0.1190214305}, +{"learn":[0.243231222],"iteration":812,"passed_time":0.5151709642,"remaining_time":0.1184956584}, +{"learn":[0.2429795031],"iteration":813,"passed_time":0.5163394584,"remaining_time":0.1179842006}, +{"learn":[0.2428085953],"iteration":814,"passed_time":0.5177108502,"remaining_time":0.1175171868}, +{"learn":[0.2425811674],"iteration":815,"passed_time":0.5184515148,"remaining_time":0.1169057337}, +{"learn":[0.242311599],"iteration":816,"passed_time":0.519181002,"remaining_time":0.1162914607}, +{"learn":[0.2419550121],"iteration":817,"passed_time":0.5197364035,"remaining_time":0.1156381729}, +{"learn":[0.2418074012],"iteration":818,"passed_time":0.5205502008,"remaining_time":0.11504223}, +{"learn":[0.2415643207],"iteration":819,"passed_time":0.5212907748,"remaining_time":0.1144296823}, +{"learn":[0.2412046386],"iteration":820,"passed_time":0.5218312713,"remaining_time":0.1137732004}, +{"learn":[0.2410006757],"iteration":821,"passed_time":0.5224740591,"remaining_time":0.1131391515}, +{"learn":[0.240788513],"iteration":822,"passed_time":0.5231319496,"remaining_time":0.1125083294}, +{"learn":[0.2405819957],"iteration":823,"passed_time":0.5237085713,"remaining_time":0.1118600832}, +{"learn":[0.2402247212],"iteration":824,"passed_time":0.5243313708,"remaining_time":0.1112218059}, +{"learn":[0.2400239493],"iteration":825,"passed_time":0.5249420104,"remaining_time":0.1105810046}, +{"learn":[0.2398227815],"iteration":826,"passed_time":0.5256867701,"remaining_time":0.1099683328}, +{"learn":[0.2394675868],"iteration":827,"passed_time":0.5264600676,"remaining_time":0.1093612701}, 
+{"learn":[0.2391150721],"iteration":828,"passed_time":0.5275203545,"remaining_time":0.1088130044}, +{"learn":[0.238944539],"iteration":829,"passed_time":0.528225089,"remaining_time":0.1081906809}, +{"learn":[0.2387435521],"iteration":830,"passed_time":0.5289842217,"remaining_time":0.107579222}, +{"learn":[0.2385065428],"iteration":831,"passed_time":0.529543322,"remaining_time":0.1069270169}, +{"learn":[0.2383424767],"iteration":832,"passed_time":0.5301500187,"remaining_time":0.1062845776}, +{"learn":[0.2381619513],"iteration":833,"passed_time":0.5307885353,"remaining_time":0.1056485574}, +{"learn":[0.2378786377],"iteration":834,"passed_time":0.5312926611,"remaining_time":0.1049859749}, +{"learn":[0.2375881033],"iteration":835,"passed_time":0.5317917068,"remaining_time":0.104322775}, +{"learn":[0.2373639579],"iteration":836,"passed_time":0.5323433501,"remaining_time":0.1036702104}, +{"learn":[0.2370155092],"iteration":837,"passed_time":0.5328155297,"remaining_time":0.1030025248}, +{"learn":[0.2366988765],"iteration":838,"passed_time":0.5333837297,"remaining_time":0.1023537312}, +{"learn":[0.2364143282],"iteration":839,"passed_time":0.533889267,"remaining_time":0.1016931937}, +{"learn":[0.2361525744],"iteration":840,"passed_time":0.534412663,"remaining_time":0.1010364012}, +{"learn":[0.2359784991],"iteration":841,"passed_time":0.5350258961,"remaining_time":0.1003967834}, +{"learn":[0.2357424561],"iteration":842,"passed_time":0.535566698,"remaining_time":0.09974373854}, +{"learn":[0.2355073158],"iteration":843,"passed_time":0.5361661841,"remaining_time":0.09910180655}, +{"learn":[0.2353411147],"iteration":844,"passed_time":0.5367596976,"remaining_time":0.09845887944}, +{"learn":[0.2350888635],"iteration":845,"passed_time":0.5373840649,"remaining_time":0.09782168557}, +{"learn":[0.2348985119],"iteration":846,"passed_time":0.5380018638,"remaining_time":0.0971833355}, +{"learn":[0.2346166734],"iteration":847,"passed_time":0.5384990209,"remaining_time":0.0965234094}, 
+{"learn":[0.2343535147],"iteration":848,"passed_time":0.5389994596,"remaining_time":0.09586445041}, +{"learn":[0.2340152775],"iteration":849,"passed_time":0.5395533792,"remaining_time":0.0952153022}, +{"learn":[0.2338253856],"iteration":850,"passed_time":0.540102651,"remaining_time":0.09456556404}, +{"learn":[0.2335178043],"iteration":851,"passed_time":0.5405720594,"remaining_time":0.09390218872}, +{"learn":[0.2333585535],"iteration":852,"passed_time":0.5411738392,"remaining_time":0.09326208014}, +{"learn":[0.2331409327],"iteration":853,"passed_time":0.5416619499,"remaining_time":0.09260262844}, +{"learn":[0.232799132],"iteration":854,"passed_time":0.5420831053,"remaining_time":0.09193222253}, +{"learn":[0.2326270938],"iteration":855,"passed_time":0.545022903,"remaining_time":0.09168609583}, +{"learn":[0.232290253],"iteration":856,"passed_time":0.5455285614,"remaining_time":0.09102751958}, +{"learn":[0.2320625271],"iteration":857,"passed_time":0.5460920766,"remaining_time":0.09037887515}, +{"learn":[0.2317883436],"iteration":858,"passed_time":0.5467508422,"remaining_time":0.08974606373}, +{"learn":[0.2315655244],"iteration":859,"passed_time":0.547951441,"remaining_time":0.08920139737}, +{"learn":[0.2313702852],"iteration":860,"passed_time":0.5488170515,"remaining_time":0.08860112677}, +{"learn":[0.2311274941],"iteration":861,"passed_time":0.5495685823,"remaining_time":0.08798197721}, +{"learn":[0.2307902468],"iteration":862,"passed_time":0.5524229874,"remaining_time":0.0876963491}, +{"learn":[0.230628665],"iteration":863,"passed_time":0.5537842252,"remaining_time":0.08716973916}, +{"learn":[0.2304723965],"iteration":864,"passed_time":0.5545513147,"remaining_time":0.08654847109}, +{"learn":[0.2302186894],"iteration":865,"passed_time":0.5551528647,"remaining_time":0.08590125157}, +{"learn":[0.2300390857],"iteration":866,"passed_time":0.5564569068,"remaining_time":0.08536190151}, 
+{"learn":[0.2297347912],"iteration":867,"passed_time":0.5570938476,"remaining_time":0.08471934087}, +{"learn":[0.2295422937],"iteration":868,"passed_time":0.5580044049,"remaining_time":0.08411804032}, +{"learn":[0.2294070827],"iteration":869,"passed_time":0.5591088771,"remaining_time":0.08354500463}, +{"learn":[0.229074195],"iteration":870,"passed_time":0.5598280886,"remaining_time":0.08291368936}, +{"learn":[0.2288957728],"iteration":871,"passed_time":0.5608158459,"remaining_time":0.08232159205}, +{"learn":[0.2287379333],"iteration":872,"passed_time":0.5617823568,"remaining_time":0.0817254975}, +{"learn":[0.2284071765],"iteration":873,"passed_time":0.5627034871,"remaining_time":0.08112201302}, +{"learn":[0.2281981345],"iteration":874,"passed_time":0.5634399657,"remaining_time":0.08049142367}, +{"learn":[0.2279745298],"iteration":875,"passed_time":0.5642612997,"remaining_time":0.07987260407}, +{"learn":[0.2277943641],"iteration":876,"passed_time":0.5648610068,"remaining_time":0.07922223927}, +{"learn":[0.227464397],"iteration":877,"passed_time":0.5654543985,"remaining_time":0.07857111232}, +{"learn":[0.2272489561],"iteration":878,"passed_time":0.5662696285,"remaining_time":0.07795065421}, +{"learn":[0.2270080894],"iteration":879,"passed_time":0.5672281575,"remaining_time":0.0773492942}, +{"learn":[0.226743977],"iteration":880,"passed_time":0.5678032533,"remaining_time":0.07669533161}, +{"learn":[0.2265238464],"iteration":881,"passed_time":0.5684084677,"remaining_time":0.07604557732}, +{"learn":[0.2261985966],"iteration":882,"passed_time":0.5691880037,"remaining_time":0.07541902201}, +{"learn":[0.2259350951],"iteration":883,"passed_time":0.5697210672,"remaining_time":0.07475977805}, +{"learn":[0.2256665911],"iteration":884,"passed_time":0.5702514542,"remaining_time":0.07410047145}, +{"learn":[0.2254206049],"iteration":885,"passed_time":0.5707525763,"remaining_time":0.07343769041}, 
+{"learn":[0.2251793891],"iteration":886,"passed_time":0.5712200224,"remaining_time":0.07277098368}, +{"learn":[0.2249354039],"iteration":887,"passed_time":0.5719631158,"remaining_time":0.07213949209}, +{"learn":[0.224719188],"iteration":888,"passed_time":0.573473036,"remaining_time":0.07160349493}, +{"learn":[0.2244787983],"iteration":889,"passed_time":0.5740166277,"remaining_time":0.07094587533}, +{"learn":[0.2242641449],"iteration":890,"passed_time":0.5745484288,"remaining_time":0.07028706929}, +{"learn":[0.2240526804],"iteration":891,"passed_time":0.5750417033,"remaining_time":0.06962388336}, +{"learn":[0.2239209754],"iteration":892,"passed_time":0.5756626222,"remaining_time":0.06897637243}, +{"learn":[0.2236602732],"iteration":893,"passed_time":0.5761377019,"remaining_time":0.06831162908}, +{"learn":[0.2235089796],"iteration":894,"passed_time":0.5771884627,"remaining_time":0.06771484758}, +{"learn":[0.2233028178],"iteration":895,"passed_time":0.5778392655,"remaining_time":0.06707062903}, +{"learn":[0.2231716088],"iteration":896,"passed_time":0.5785026788,"remaining_time":0.06642784383}, +{"learn":[0.2229924926],"iteration":897,"passed_time":0.5790633561,"remaining_time":0.06577334335}, +{"learn":[0.2226752298],"iteration":898,"passed_time":0.5795658509,"remaining_time":0.06511251495}, +{"learn":[0.2225238638],"iteration":899,"passed_time":0.5801448497,"remaining_time":0.06446053885}, +{"learn":[0.2223377015],"iteration":900,"passed_time":0.5806770458,"remaining_time":0.06380358217}, +{"learn":[0.2220517312],"iteration":901,"passed_time":0.5812016734,"remaining_time":0.06314607982}, +{"learn":[0.2217935941],"iteration":902,"passed_time":0.581771487,"remaining_time":0.06249372563}, +{"learn":[0.2215100569],"iteration":903,"passed_time":0.5823889162,"remaining_time":0.06184661057}, +{"learn":[0.2213576053],"iteration":904,"passed_time":0.5830686959,"remaining_time":0.0612061062}, 
+{"learn":[0.2210437059],"iteration":905,"passed_time":0.5835408115,"remaining_time":0.06054396941}, +{"learn":[0.2207309093],"iteration":906,"passed_time":0.5839938446,"remaining_time":0.05988029498}, +{"learn":[0.2205256373],"iteration":907,"passed_time":0.5844979671,"remaining_time":0.05922226098}, +{"learn":[0.2203404031],"iteration":908,"passed_time":0.5851540432,"remaining_time":0.0585797777}, +{"learn":[0.2201428967],"iteration":909,"passed_time":0.5856712456,"remaining_time":0.05792352978}, +{"learn":[0.2199867836],"iteration":910,"passed_time":0.5862662073,"remaining_time":0.0572751838}, +{"learn":[0.2197325953],"iteration":911,"passed_time":0.5868183988,"remaining_time":0.05662282795}, +{"learn":[0.21941949],"iteration":912,"passed_time":0.5872918285,"remaining_time":0.05596318629}, +{"learn":[0.2192427261],"iteration":913,"passed_time":0.5878288912,"remaining_time":0.05530993943}, +{"learn":[0.2190470483],"iteration":914,"passed_time":0.5883754177,"remaining_time":0.05465782569}, +{"learn":[0.2188108265],"iteration":915,"passed_time":0.5888956267,"remaining_time":0.05400352909}, +{"learn":[0.2185763461],"iteration":916,"passed_time":0.5893668761,"remaining_time":0.05334509347}, +{"learn":[0.218397545],"iteration":917,"passed_time":0.5901757336,"remaining_time":0.05271722239}, +{"learn":[0.2182284423],"iteration":918,"passed_time":0.5911811992,"remaining_time":0.05210628633}, +{"learn":[0.2180648446],"iteration":919,"passed_time":0.5917557335,"remaining_time":0.0514570203}, +{"learn":[0.2178877081],"iteration":920,"passed_time":0.5926940813,"remaining_time":0.05083912315}, +{"learn":[0.2177140968],"iteration":921,"passed_time":0.5932395564,"remaining_time":0.05018729436}, +{"learn":[0.2174067859],"iteration":922,"passed_time":0.5937335631,"remaining_time":0.04953140234}, +{"learn":[0.2172427475],"iteration":923,"passed_time":0.594441586,"remaining_time":0.04889346379}, 
+{"learn":[0.2170075242],"iteration":924,"passed_time":0.5949819889,"remaining_time":0.04824178288}, +{"learn":[0.2167770352],"iteration":925,"passed_time":0.5957625359,"remaining_time":0.0476095331}, +{"learn":[0.2166043286],"iteration":926,"passed_time":0.5969523115,"remaining_time":0.04700918958}, +{"learn":[0.2163751168],"iteration":927,"passed_time":0.5976561483,"remaining_time":0.04636987357}, +{"learn":[0.2161562507],"iteration":928,"passed_time":0.5997954706,"remaining_time":0.04584012746}, +{"learn":[0.2159849725],"iteration":929,"passed_time":0.6006319213,"remaining_time":0.04520885429}, +{"learn":[0.2157700509],"iteration":930,"passed_time":0.6014893591,"remaining_time":0.04457869579}, +{"learn":[0.2155961884],"iteration":931,"passed_time":0.6021677417,"remaining_time":0.04393498545}, +{"learn":[0.2153693672],"iteration":932,"passed_time":0.6031781508,"remaining_time":0.04331504406}, +{"learn":[0.2152064868],"iteration":933,"passed_time":0.6039009417,"remaining_time":0.04267394234}, +{"learn":[0.2149531543],"iteration":934,"passed_time":0.6045038154,"remaining_time":0.04202432941}, +{"learn":[0.2146568788],"iteration":935,"passed_time":0.6050656118,"remaining_time":0.04137200765}, +{"learn":[0.2144625975],"iteration":936,"passed_time":0.6056736374,"remaining_time":0.04072298736}, +{"learn":[0.2142915385],"iteration":937,"passed_time":0.6063558656,"remaining_time":0.04007895913}, +{"learn":[0.2141212757],"iteration":938,"passed_time":0.6070668518,"remaining_time":0.03943671774}, +{"learn":[0.2139780266],"iteration":939,"passed_time":0.6078386553,"remaining_time":0.03879821204}, +{"learn":[0.2137883178],"iteration":940,"passed_time":0.6084696227,"remaining_time":0.03815059271}, +{"learn":[0.2135832033],"iteration":941,"passed_time":0.6089643342,"remaining_time":0.0374946193}, +{"learn":[0.213338275],"iteration":942,"passed_time":0.6097087637,"remaining_time":0.03685408222}, 
+{"learn":[0.2130945367],"iteration":943,"passed_time":0.6115469788,"remaining_time":0.03627821061}, +{"learn":[0.2127992234],"iteration":944,"passed_time":0.6120421491,"remaining_time":0.03562150074}, +{"learn":[0.2125003551],"iteration":945,"passed_time":0.6125252692,"remaining_time":0.03496444454}, +{"learn":[0.2123049072],"iteration":946,"passed_time":0.6130689864,"remaining_time":0.03431114707}, +{"learn":[0.2120079526],"iteration":947,"passed_time":0.6135146875,"remaining_time":0.03365270438}, +{"learn":[0.2117684079],"iteration":948,"passed_time":0.6140811754,"remaining_time":0.03300120121}, +{"learn":[0.2115994564],"iteration":949,"passed_time":0.6147059998,"remaining_time":0.03235294736}, +{"learn":[0.2113548624],"iteration":950,"passed_time":0.6152105047,"remaining_time":0.03169854335}, +{"learn":[0.2111350383],"iteration":951,"passed_time":0.6158322973,"remaining_time":0.03105036793}, +{"learn":[0.2108422305],"iteration":952,"passed_time":0.616352162,"remaining_time":0.030397221}, +{"learn":[0.2106553572],"iteration":953,"passed_time":0.6168799266,"remaining_time":0.02974473441}, +{"learn":[0.2103629942],"iteration":954,"passed_time":0.6173293363,"remaining_time":0.0290888169}, +{"learn":[0.2100700906],"iteration":955,"passed_time":0.6178461359,"remaining_time":0.02843643303}, +{"learn":[0.2098822147],"iteration":956,"passed_time":0.61836141,"remaining_time":0.02778426398}, +{"learn":[0.2096414268],"iteration":957,"passed_time":0.6188691933,"remaining_time":0.02713205232}, +{"learn":[0.2094775353],"iteration":958,"passed_time":0.6194397643,"remaining_time":0.02648282621}, +{"learn":[0.209343031],"iteration":959,"passed_time":0.6200932802,"remaining_time":0.02583722001}, +{"learn":[0.2091770747],"iteration":960,"passed_time":0.6206941564,"remaining_time":0.02518946108}, +{"learn":[0.2089391159],"iteration":961,"passed_time":0.6211844623,"remaining_time":0.02453743198}, 
+{"learn":[0.2087473593],"iteration":962,"passed_time":0.621883416,"remaining_time":0.02389375534}, +{"learn":[0.2084878087],"iteration":963,"passed_time":0.6224494507,"remaining_time":0.02324500023}, +{"learn":[0.2082003291],"iteration":964,"passed_time":0.6229478789,"remaining_time":0.02259396452}, +{"learn":[0.2079655379],"iteration":965,"passed_time":0.6237996107,"remaining_time":0.02195567988}, +{"learn":[0.2078309123],"iteration":966,"passed_time":0.6248229071,"remaining_time":0.02132280862}, +{"learn":[0.2076701543],"iteration":967,"passed_time":0.6253783193,"remaining_time":0.02067366345}, +{"learn":[0.2074827892],"iteration":968,"passed_time":0.6258766768,"remaining_time":0.02002288646}, +{"learn":[0.2072977998],"iteration":969,"passed_time":0.6264908643,"remaining_time":0.01937600611}, +{"learn":[0.2071178449],"iteration":970,"passed_time":0.629342721,"remaining_time":0.01879602359}, +{"learn":[0.2069490254],"iteration":971,"passed_time":0.6302602156,"remaining_time":0.01815564407}, +{"learn":[0.2068189063],"iteration":972,"passed_time":0.6317412629,"remaining_time":0.01753033309}, +{"learn":[0.2066576438],"iteration":973,"passed_time":0.632383109,"remaining_time":0.01688086328}, +{"learn":[0.2063765356],"iteration":974,"passed_time":0.6329266285,"remaining_time":0.01622888791}, +{"learn":[0.2061198865],"iteration":975,"passed_time":0.6336683404,"remaining_time":0.01558200837}, +{"learn":[0.2059267993],"iteration":976,"passed_time":0.6352760759,"remaining_time":0.01495532216}, +{"learn":[0.2057632846],"iteration":977,"passed_time":0.636087611,"remaining_time":0.01430871927}, +{"learn":[0.2055076659],"iteration":978,"passed_time":0.6370669428,"remaining_time":0.01366537875}, +{"learn":[0.2053591971],"iteration":979,"passed_time":0.6379596244,"remaining_time":0.01301958417}, +{"learn":[0.2050760154],"iteration":980,"passed_time":0.6387834672,"remaining_time":0.01237195298}, 
+{"learn":[0.2047961482],"iteration":981,"passed_time":0.639321911,"remaining_time":0.01171873157}, +{"learn":[0.2045843367],"iteration":982,"passed_time":0.6400619719,"remaining_time":0.01106923044}, +{"learn":[0.2043832455],"iteration":983,"passed_time":0.6438909715,"remaining_time":0.01046977189}, +{"learn":[0.2041069418],"iteration":984,"passed_time":0.6445882918,"remaining_time":0.009816065358}, +{"learn":[0.2038964459],"iteration":985,"passed_time":0.6451535577,"remaining_time":0.009160395342}, +{"learn":[0.2037365884],"iteration":986,"passed_time":0.6460587828,"remaining_time":0.008509386197}, +{"learn":[0.2035126856],"iteration":987,"passed_time":0.6467510394,"remaining_time":0.007855275782}, +{"learn":[0.203312993],"iteration":988,"passed_time":0.6473119973,"remaining_time":0.007199627877}, +{"learn":[0.2031612588],"iteration":989,"passed_time":0.6514480141,"remaining_time":0.006580282971}, +{"learn":[0.2029575471],"iteration":990,"passed_time":0.6520319575,"remaining_time":0.005921581854}, +{"learn":[0.202779838],"iteration":991,"passed_time":0.6524651455,"remaining_time":0.005261815689}, +{"learn":[0.2026532761],"iteration":992,"passed_time":0.6531322932,"remaining_time":0.004604155139}, +{"learn":[0.2023749756],"iteration":993,"passed_time":0.6536547356,"remaining_time":0.003945602026}, +{"learn":[0.2022287718],"iteration":994,"passed_time":0.6550559346,"remaining_time":0.003291738365}, +{"learn":[0.2019803503],"iteration":995,"passed_time":0.6567962655,"remaining_time":0.002637736006}, +{"learn":[0.2017732369],"iteration":996,"passed_time":0.6572189676,"remaining_time":0.001977589672}, +{"learn":[0.201497757],"iteration":997,"passed_time":0.6575678617,"remaining_time":0.001317771266}, +{"learn":[0.2012941028],"iteration":998,"passed_time":0.6578914572,"remaining_time":0.0006585500072}, +{"learn":[0.2011675132],"iteration":999,"passed_time":0.6583236286,"remaining_time":0} ]} \ No newline at end of file diff --git 
a/catboost_info/learn/events.out.tfevents b/catboost_info/learn/events.out.tfevents index 5dcfc9e11856200c628bdac0adfeea813ecea0ee..16a27132e66003b1ce82de968a4b7759a1637e84 100644 GIT binary patch literal 54870 zcmZ|YcRWz<8$bS*riS*Qy@%4?dk<~YyS-DI(o!}VA*&%Xl&FY88d7F95eW?z_JDrz!Osf+Z|d8Hs%kCccydO)EuuNuo$!<^w#U8=EquX)u( zu7(~bs_b9uQ@m;_SL$(l;JCn$2q)eE?0`lMClGP5?tjLa@Ee0 zsIgznjQCiJpfrh^m9eFmS1sjgRY#)g_3-J>t5$Lqo-s`7MnIY32wt_8tAwPkV78@%cuSG7NfNEr^DcuSjC9p%a`fT-UeVlsKvNv<|8BdTUnhkRajmaFhi zM1AhF$B$QCMB=3HbfoQo$-`c-Q+56AyEdo>)Y{4S*~;%5cTF$ ze>aO?5`v~;8kzAnz@*$mb;w(@v4tpHS0!H!DQ$5yy`1gb6*XR>e5#Z$mLZ( zxr+24YLVZ*NM7}qt3exy`taakA6^ZRtDB99D*1A~8LtM)RqO2jQeEetE(qh*Ai26_ zMO0i^#%W#+mMgb~M4dG4Z^5e}a`pCWKPkheomv(0YN%YTa3v~ySqH(}VVGR~+eXy* zfG%l#ELFKO?MKvEh2;Ib8ZK9@O8QE5_1ro`k5?n)D$S3makcXWW3QTAwL3`EO4DCY z_*f(5>h?UMq7`)gc{NI|J}D8kK02x^uSUz2X=ER%8>_68gLySZuHp_7bv@;g;65BH zSI_1Xb@k}^PJFC!auwEwDBWuTD|j_tu8QyVmg>4>w@)xoSJULG_(e~tu0C_F|KZhixr%coO6kXA!My>iHmi9=SxnFq+#56Hu{^7KNEwDr z@EXQvI7_Y`c@UNTPa%s}v*k)@JyBzN?4Hi6IdWw@hN#EWGB5LLu3S|$A*yV>zTl3U zCsz)6-KB0=zI>9$$C@u!o{mJ>kIk~>)dIPiw27#oUM7QhwNS2xj3R2L%Ur>%a*hRoZBxwrh`Q%d4ewHKGks zUtb0}^JGW}4X~DOc^<5M|+VQ-xPr za&_~0SE;Va?Yv@mwMMQAU5M(l?$QZft(B|l14Om|IY%%{`A@E%_9QAaCP#3U*U6Pu zeix~(ixU-+_zc&})d?G-A_p%M%r`d3)p~WJ7ENgKijTEXu1+^6s$Tk_#k|@iS8JYi zmg@2RwMDLOuO!OPze=z|+$vXJ+Y@!}>%={LtZi~NFuRjf zSI*#E!76*ZTm||OwPUKLV3oZ?u0jtJRbH#{n9p#hTvbdYYRLfavAo(PS02rYvWXLo z^SkBBHm{@94fQLIN_;GBxiYsQYE_ukH(u?LD}yaW^|3ip#jCw?wQnR*lS>~9_F#47 z>P`cq`Yf$~osYFouFTUqNZq)SxS<2D_RH1#Gej->7E;5j19G)|4N+sabkZx)P-yv$7>0OINPuuO{lC*7ixfIwDtn`xABI zK%U^<(32~ZvUXBk^;4U6=VKj}t3fx28m(|Jkypp$YO5Ymb0TeOd39W_F3%%swvC-K zuTIES&F{8SU70TD1#hC0a+MuP)YP$lf^lA7uKpSjPGFY!mb1APO>6j2hWvGvJBxUtJ@R8RHhVL^UMfX$|b_8 z^71$F%oy15{|FoQ;@uvenE)H5OjvQ>;R|?X3M{3hjntCYX6M%M%nVq>L&E&+4Fh;) z4ou6EFc<6hc098Hwq-M6%Wa!v@$3vR?a_oiTQuMq&n$u6P$cYP$bEI5SpjSLs>E{e}}M$POD$>%m!Gf8DV<|uXf|vSzx0y37fvEeFe{KfwgQ!*vm8LTk*^e z*u#=mQcGT^J|4+4dte$@2upA>4$mBb?NlY~l(uCw&m4j6QY0*? 
z)?zcyoPf=K(Nb#3Mho>5JaY!t&4)1mk;#I`)CE|THeum2bd30D=YgFYMp(^}ZYy|p z0a#wKqEwr*Zn*}}E&?kLB`m~X`YxVb0;a4_*xOvMdOULlmOY!W)N>lXJi83+@!u9w zZF(EtZseI8uxob->u{!LKF{2NnV%+X*u}kqSBD2MV46>?*Kn5rkQXYNYb)8n8E(gv~qHDurkMzb6z z-Q`&buxkc{4V(S)H_t+WEm}_4nRUhkcy>jX|g9yv?$>`0qNML^|6{Ol) zZri59v-`j%-zRLL;m-3sdjM>s31OY8sx2m;2^6Uw)FVhH1-*NgV&z=GcRVJ)+ z!j?3i#Q?kbuA$UPl}1kmwZ#H!{FpFR-+tZrXmP+MI1x5w?MYLf#REH|Ntj}@w6#1- z05-QDVV?q=BYBnxEb&tVsU?{$(*#4%Ghj6_glRAM?!iY(0yf8;GVP9&d6o?9%1OfJ zE5(22SqiX!T7(6~ zGJsj@6Q^(d6olg)Lp`|(*kbt>;0h-_-y2HTvU=-ZOcY4{Wp@VGEWt6YM;_0=8@dVS5a92Jz7ffIWL&Dz9v|&Bu+iuLN=fcH8#R_^Z-Dh%MA-H6-h!w5Eij9w zgw52hoy13b2Ta-TkCfzv4;kh>D+bngEn(URdk99Z5@7Rx{g$HL|E}-EM|%%!a{yuH z_1zBhtQ1)6c)~s<+%V=@8L;_jzoaBjPF%0bvk${v7YlvnpVIHiW63t`uCiZ@{u95H{`g z4qHB2HL&xU-=!qWR;%Xn>^rb)YY7``P%)BcHNd7S5q4q6;eR~)0n9ADT1qnG_n;*_ z`w1-i3}M!4GXy&rzkp>-Cv5A0biphCH?R)H-=rkl=fw}_ll%kh!3Dwww6)RZ*fJmu16C7A*sk{T z1gm^=U||agYZkZDfsbYZ?8vLnQj+>{mM3|32G}lJ!dAJO3D$3xz_u(SY_UzO;2_Nk z*oRM*Qj*795(R^~HLxYe2`k*R%9^ju2G}(f!j^vj`iy61ft`u2kdoYGr7u`@*aF*g zim2vy>l6R!SKlFXUhY!T1S z1DmE#*d&wlg8j)0z(x)yOy^Y>!REk4V8K-%q$Kw**d%xfUjk-+iLl`&DT39aD=_03 zgq613HJWe9WnkWEWm1w2Y8DEX4{pE~oF&Xy^LPtBnme#aRl<6t`)KjZ1K6uKrBaf% zzO4#*<_RpsiZHEiN$q&%1#Hng!ro1t^pI!Xz?w87%*OQb3!Ys8wlCqmRNLydjaTu^ z2Uvy$VG}He#q-P;*u%+$T{HXDk7s_sMiiGwNxDBj^^Ip&fi18g%)4Rw5T0EF)@}r0 z3h$pD;+a3Nqlv{*lC3(A5ey2~fhn00*1*4BXFgg0ux@IEdBv&=HW~wgrB}R@k}P_1 zU$E-90j$3RVVnBi`@tu96WFmigjF;O&*0fDU`HDg_G4nnGM)tiTjlars!h4&vrFg6*+zV0W^Mq$C%ZW(vOehyZrlkFXO< zZwbc9d%&W05H@h9{RzIdNMIdQ33EAKHGyaMfgOHQDAm?7Jh?f~9sv7bLYR+VuHc@0 z2<+;3!kT}%FBpOz0c%%UASJoD_JA^o1AANj zN=njbxr5+P{t2+vR|wlRY?>z@?J2P3T7=Ed_TS307+_oL5%#xza01U_fnB__`$di)XG-dg7o+ShGx?<;`w3e!FvgFMmH|vhjWC5W^=&-M1ZI)>LQ3*! zn`?q4Z5FVsQ-tlevk`1-J_puOm9PV)TZ8!|vw_)F=156awA2x7Yvur3c$=`O8ZE)c zxi5gZo+7NG^n&1?%mub(3Sn=}Jq5Mp0W0{KE!7s9Jgf=dl9#}A?-6#?e2(Dh&Ih(? z4`G#+PUrb(uYmngC2Uy!$c8*C0Onc#T&nGf<&;xAD+E^LLzvsiv4XW}5wOei3EL2? 
zk;q4T4eW&iVcB|jy?FKpSa?{LRNHVpbHNPnEwD+e2vb;Fa)gie4wz~^!agiNyP9Xk zz$|WON=Z6?4Hqo?N`TEaC+z;}u7WM3_rM0sAncc_S2~|$DX=RFguPt0x|(NYz*>f6 zNVP4SdPlJF@d2385yH~LQZMn*%7Go6LD+1X;PAUH=P6rGM|C1a3QQnM^W(Ld;zv>3So_s%ZvC< zeg!t8IbmtHvKRBL3YbxPs#M#u3@>$_eFNrjfv_oB2L)g1R0C_Xny_67!#48Kz60x^ zLRjPIbAr8~8ep%VrbxA=*gX~O(EkATWgB6U`+n;4N&W=3xEEm#Mrwj1l3&15N|L1{ z3np%8#7FxLY~4-56f74D)^C4+Ej~<`?it&_Vdv!fcfbV z7V}rVC(q6R+c%W3MMFzI@yrs~-tq(~$=8dsTzO^%EX13z7s(!iwW&3**K-K7yP+&t zzu5qrT^27Txg}!nY(B}ez`BMIwr|16Cp@zS7Pg%*C(Fp~;(R_iO`Wh|O_T53T zmS=vzwuccG<*q-DXIFt47!x+CMYmQwy9TWLc)}J~9T5!X{=gy{5mw*vl;EDc4(#!( z$5KnWW-19TcL1+0nsn3)FyOI%La#Qk3c_vCG0>v|AYJH6x=pJWKI zxYS2dOM=s!1fP_K0(*Iku*ZL6JMhu&0P|i!*xa72cJb^kFw4$_&G0%P_?##VSf}WR zQf-&qL!J0&;lO_EApJ7sMfy;E=c$Nk1Y!qRiJhnCA*>hk@R)mF=Mh)dz zHn6{o33J(;H-cw5z)D&YcI0H+l{|X^Oh4hS)Dpk3e+1ui$mtrQIWJbLgVCJUa)hk3C^N3XXW)>Y_OS z3)d#>`0}m;_-Ky69!w`}!^6ziJaYo}q$OdU+Lo&F%o*6eH~vy5x2c;n=a~zz3LnBU za^!j0LxdeK-HU^HT_-O9i=I!I%BxrnKH5cKP8|trly?t5C#s8f3D}L2Yf?+x z%qI2YqqzbzyGhts=X;HLb{UwhK4Itj54z4XH(-OO6IOiTa4OH-f!TH?%)BOc0?#~v zC6`{6T2kKRuQty-fgO89SpV(WxUpY%0la`sb|!3zR{}1L>X)RN)@S-Lz60QMo3u-+Fjs_kQ*4-tyfIX`6ky?^>;LS5W zS`e^J5riFzDO|#{U||29CahV`$x%GJ4eZ56!lH&X!_{J4Z6Uy{XAqXLP7x2%>R2eS zyKM-2Z#v@|pX422dC#v%o$P%`rG{sBf$g|T*x32w+VLz5nD=JF8dhBpT!3(3zXlN2 zc}Q>)A1wmdkAL1$ZC`qM+41ZiFwJ{}?e5;+mS>T`DohE>+qXk--R=V`nnReuic2

uP%ftn3b970Vv`@X?+EOVcN8xwod^o{RxD zWHn)tLEiW^OI?z&!1}8arrv+fR6fZ#V6(n^NG;h^o70zP@xY2w2|KjGS&3%}z%*&nKA%EMg#Gv-7ft@+=+L#cDUHB||a|%y^ao zY;7E2DtA2uul!74l|F>M&i^Zzs$~J&b(*lI5&dwvUUzdn2WF*7SV(-AiF|F@z*qmILfb?PaNxiz5y3$9U@43t&FEghgma<5x&^EEm|cD}kbQ9k4a#gjK~nJIhBa2IjPku!*tb1uv=+U=Do=+j%*$J|FEp zu%-VlN-f#euR!pkDh1Y~fUt2z*91dQ8L*Fmgk1{kBbd;C0H$t2*t|(obokoJfqAME z_HlESU}^Lbn3)P;&t{Lo)4sZ!>k}~Dw-=;Nmhb;Acz0I-yMLRouqqqe#jcB13G9|5 zVX-5||KeNn8JNO$!c49kP3PGcV1p+TX5(qPg=b%Z_3K8My0aFZch}Wc1-y=^nx|9S5fI><6$8 za|tW)F6hOxpTLH6BW(2V>v=r;1+9JhKG$ z&493xFZ$uJU0rQfz?$zM%(l4eXFi%Wu&hyp*=YEkY!x)7$9+CwnB+5=NLMA!fyqw{>UbHH{@Crq*1 z7Z;v605er4?8uD}!9kiMu;%3sQYR}XUcJLda{{LNm@v<>BEi$`49wVrFr&|^f}<@L zU>kJ^8{g-%U{pR2EI^GglV;xpuZ|1AYW|&*T9T}^R?w1*z@Fp~<~pz)ei>JHPhJAH z?+#%lmo@S8y*lO!EXt9v5#M@W;;-9fV9wfv^;1wQ;F%jR>zRb@4Ve?dGk0KW9SGYK zW6+yt9>AuS+DjL}FEBKKXP&^;-X|<+;>J*(c>yc6CQP;E5y6R=H?S^S2}|4BXBi*u z3b5k?3Cn+ZRWJnk01Hqe%!6MTaSXrf=)Jf-;4F#vPe!#{j5~f)7Kfmj*?w-5~ zY)ZEmPd?fWV7~$gi{2b1xF>G{v(+c;u*G@7l`r^~gaHd1L0HDK)q>?iIIv9$gr)6E z6&%P!0K1-KBVD)ONBXMrN!|lC*NL$F{Q9Omiv*@~lrY22?Vs}OKCp)C2s=7p;$@ya z0M>CFVeML<6Krcf1m@I{0@EbH1S1BR&J@CW^q<~{ zj}{9ovJ+vU(S~X~ivyPP#Y(zv?FzgFcS$_3z&OH29%QL)O9G}gj<8V$BLuUcWMH%E5!TdORq)110rs%SQn~=X z-~Q*XJJ!uhQi08UOjw-il#YCDX~3-P2s8Ft{E}zsz(V&Erqp$Z;Mgt$*s_^~HCO5( zc)BxzjqgENlc=R}e3DtfoPM8?I_a)4LxpG0f$h#G?Ddb2g01#!U}fQi)hON-ycTnS zrC%bf%c5n1Tlobr4?V&j3~%Sg*Om(`TZ=Hg*c zyM*aq`zd%)6$5)}PS}zx+aY|CCBQBnAj~4{S_04B1M9JZu=dkJ1+V;4U?;{AHoZr5 zb3R%buE_Y9N64&!Za2(6}$sJ0xNbTtpC9cpZO#| z0h@80uogXH1+V-HU`ERcJFwv7Ha=PI zpxN}dUOf8>tleG03Uj^-=3G_4S{M^nldxN`@$n5<4^6@{)XsbJNmc_h>qD5%#OZ>Q zmG8hZD@~-9X!V{Yn7!5jJCR3N=)iajKFJ@zEFKf~)!J(o&wc{4yh7OVWf>!Q_6yh> zUBZ^RT)oS)-@trV5O#lxy=cAbeYyH7Ux^Bha9B=T< z0@(5>!c0C7n8LF&z$SST_VI#}V2W)C%C6moddRTFk#jw4t(L61F(L-4Wv$PO3D?y0~~?X`V;oa_sVZRniH@o zdkEXp+s=q*&cKegCCob2<~7e;fHlZFE!B2TwT)m4={zt87s7@HwcN``y8ujME@9gr zEjq}vi@^G}BCN$=qYgZ~1Z;ciDXF%|0n-HAny$e1=nBm|cTcg4#TRMGYqG&fpBe(UupmiHRqr z+5+R33C>NukzFQCYpQ6=_6o3fs|hPCTrr$)i4U;yCWJlxFF|nFItc~guv>f 
zd^A5`$)1FnHG0#EXIFvwEhkK&efoT!T?1y@fG{gZzgIl-2WE5cxKvwpE1xzzyAJHB zIbjh2GX$%}0ALRD2&+HdTkw7h1m>zln9i4><$RJifVrn0lWI$AJX~+H(?ux zsR&+Fw}6e>N!XBgfv5N+gMgjsNmxbT1;K_@FtAonj!LyX8b7E7AMG}p1?j58_)dlcJNEa8RGW5|i(vCR0$5HwVU_kbqxop}fOT~utb5!1xP1ivcz=^pI5BVE-Q7coqvxZ8Kp*9NLxfEDqSkj)eWX-uyey;(`56KPV;n zCc@v0X9>X8_z?EqVU6G&kO=J94#FZA#VhgAo&n1mNSMD_|Jy`}>T z@gl74qH!De+A@HB+)vn&HR}WqPA0HLJqWA!^@d;;lm*PXdY@FAtM-pye3H+B8QdqV zm5$nao@E0o-a^>T|NiICG}i5^L)eX0Z3Qo?7r+c&=}5Jy_nIe|hUNlG_9AR$ z`_LbJZF#^xt|4r~ntcU4dkM^>2VvITh6*OV`M_3x+$+^~d*nyKoa+^^pg_X@v#ocF zPqF~myj6r593S_AXNAB%4kE10sm(<^D*{$skFbyXO$9TB*T4=x+#|K5(Y&sL9r`!G zHrNqn=EeT82oUo2>NineCGHt20{x2fG@T>%wjssx_ z<~I@?x4s8s$Q zKFM-m0}m0_F#YK?o_z%NX(nNNGc;Uz_6eBI$6Zowg>FY}c~$|eDul42UUpM?RtYRs zm$18Swg@IqpMg0JCv2AQQNiiM7hna|JEhw8ovd);ll%(I)swJU|NBd6>)ruXz|QnIJeK`320-m#}^H-U?ds8<^5Q!lK4aujX6w z2bk3&!tUJt9l^7|z(zG8tcB9q13aq*W>vUNYKfPE+ajL*19sY0kWP2}fq zT`UNTDm!_Sk7fj{pB7<;Hx3Fmb&P@iYDCykllOu*jtQ{EueM4p*>|UnU@yoNm_Z<6 z55Jrc95$H&v)D}7%q2GVd`rxM<*N|(_4%6NJhK3HCUc8aTZL~w!C-y{*tr10z8q`V znvZ4)Z2lU;Qj)x5d1eJ{K@-AU{~k@{nKiKE>6@k6S~&K8#xomWA3O=0JA2Iuo}C5O zSBJ1(>m3jA%ofYBprYiE+On=X-o~z9D#jpO_=>ld%>ve1nfk{Mya;>`+gnaqd5a}y+W8p)RSF2 za{)H*C}E9k69iAdd0^WX6V~z9k4b#A3&5&X2y5HGRgY&EfgSq1L28N1x35cib_v)U z55lII>=%4;pH0=>#Cy7d3FWZ zv>3v)`xr&?%m-MaC1K|Me+sUfFR)G<2}>VnFW4aV19nrHu&X_m4B?Z!3anx3e^N`H zH?26vvunUQTqdk}l)`78`2(A}pD^D(DTO?{4(#$w!VEOq3tssFz*?ygw&ZFP7d~1b zu$AR&rIu8B{jT8I4Pdr+2>U!}RT9r`0$X#0utC#$D)Q_WFx@ePwY72q| zncv#;><%!C8-)41{Uq2NxC?Bz4q=96S{i(`FklIz2s?RThC0u}fi-!rDb_CU60`?HtJR`#L3O$1OXpev?Y#_|oq|11oMFBG%N!Z(nkq3G97+9ZgtEHA~ndU2) zb43Fycud&Tzhx$Tv?su_P7&5wAiA1xM` zXF6eHXg^!jB ztdkO9kHP{L@+=M5iGmeUOa2YW6--0Zfn5$E?8x)yg2yxi*rfx6S+86#7$-A<-CRIe z%T^l%E8#3)hyE>>Y75BdpTM`|Ij|0Q3DY^zPH~p5N)RJcf8}xYg4w$bSVSm4> r37&vrVEa}RHnMZYVm?|4uogWD8-0509-h4i_Of<~R9n_>&u9N1#T4LU literal 2398 zcmZ|OTSyaN6bEpvdE1;i=bFx(^U_M4u9*h8yfwwlCaVb&;mmbWQG_5XM6*InY%R@j zrG>F_l2I%uRu5@l*@9+T+QnOqN=pi~h*qE#?GWeT?0Y&7KmPEY|Mv;qoagmqV4~08 
z@$Pi`cxEh@$zd|KE7jS&^M=FeDgQ=`y=Y1VI_Aq&)qfq}j=U^d4Zf~YtfBof(DUdZQm^@1NO8fVkq8~NMY&VtW+)vl(!5Aqo$$H6Pg4DPgNBA*|=8+__l+XvdS zkY|o$f>%oSNNMkjd|{3Ryyi(+1?}0$*O&N%e{b)rqP-vTAJsoooqhE+DqQT-ANi5~ zQSimBo0{nS0OT7N%-|=wFGSIvgM2|$2l&ap+V!*#ME<+SG^-(4O7-YBue(S8N;$3!gf&oqg}wC5vl_&&SQ*_XXzCWQ9E z$nQ#g1m2iu8lrs&@;0dnyec-u#s3u`A9$}BeBEVF1Dzj=e9dqv_>Q z|MtYe9NLE=&r?Q%4@uVRXfH&bpU(!rG~4M$`*7qNZhe+J_n%$UwVC!J#}~ zx_F-v$Unbi0zayqKSJk6A|K|f11}c#*k~VxeA%@M@cnwuH0`62?@{D|pYs!LrTr@8 zn+{09>lZ98`Vu35x`hv3z-4Ob{ME?sV>7^y%n$9L{Tk#Mp))DYzIre=&ZX_VLJ929<)(5Bror y`vl}q73P8;H13_Jy%hPeDk=CZN4c5yYmsM727_-e*y2Td8S=?85Adyufyh67bX#Kp diff --git a/catboost_info/learn_error.tsv b/catboost_info/learn_error.tsv index 5631041..b27d717 100644 --- a/catboost_info/learn_error.tsv +++ b/catboost_info/learn_error.tsv @@ -1,51 +1,1001 @@ -iter RMSE -0 0.9710631926 -1 0.8247710073 -2 0.7022282601 -3 0.6270477627 -4 0.5346013406 -5 0.444717163 -6 0.3839413771 -7 0.3332464887 -8 0.2896858851 -9 0.2546482153 -10 0.2217991067 -11 0.1892739767 -12 0.1682408917 -13 0.1400203532 -14 0.1217420713 -15 0.1038814836 -16 0.08639816643 -17 0.07526269779 -18 0.06268846321 -19 0.05362374493 -20 0.04587823084 -21 0.03940739145 -22 0.03396654535 -23 0.0283153494 -24 0.02431601344 -25 0.02028646573 -26 0.01693585957 -27 0.01414882591 -28 0.01183896794 -29 0.009906073428 -30 0.008624575516 -31 0.007223667424 -32 0.00605654099 -33 0.005085089643 -34 0.004273904823 -35 0.003596032008 -36 0.003101566493 -37 0.002677233959 -38 0.002312693569 -39 0.002014958717 -40 0.001755565836 -41 0.001529576385 -42 0.001332687989 -43 0.001161152485 -44 0.001011704525 -45 0.0008814993786 -46 0.0007680587495 -47 0.0006692235685 -48 0.0005831128718 -49 0.0005080879754 +iter Logloss +0 0.6921376331 +1 0.6914311222 +2 0.6902726804 +3 0.6893236297 +4 0.6884269714 +5 0.6877708094 +6 0.6868745599 +7 0.6854426605 +8 0.6846512471 +9 0.6835081152 +10 0.6828162926 +11 0.6821300387 +12 0.6810161471 +13 0.6800281746 +14 0.6789965715 +15 0.6778871247 +16 
0.6767778482 +17 0.675800315 +18 0.6745953049 +19 0.6735743199 +20 0.6728023546 +21 0.6717833025 +22 0.6705873864 +23 0.6691952518 +24 0.6683306949 +25 0.6672187277 +26 0.6662092464 +27 0.6652034862 +28 0.664099523 +29 0.6628215143 +30 0.6619682567 +31 0.6605963962 +32 0.6597477538 +33 0.6585793495 +34 0.6574136274 +35 0.6566625323 +36 0.6557201062 +37 0.6547810435 +38 0.6537957873 +39 0.6524456825 +40 0.651703792 +41 0.6505536692 +42 0.6495760253 +43 0.6482374328 +44 0.6469012243 +45 0.6460723281 +46 0.645103676 +47 0.6441353474 +48 0.6431703482 +49 0.6423503671 +50 0.6417189837 +51 0.6408985257 +52 0.6395800029 +53 0.638454846 +54 0.637829457 +55 0.6369738238 +56 0.6356642927 +57 0.6346218245 +58 0.6335089888 +59 0.6327144418 +60 0.6314146519 +61 0.6302148189 +62 0.6294122423 +63 0.6283095479 +64 0.6273769736 +65 0.626091625 +66 0.6248111555 +67 0.623531299 +68 0.6225138136 +69 0.62191024 +70 0.6212150369 +71 0.6205226438 +72 0.6194400617 +73 0.6186558434 +74 0.6179682612 +75 0.6168900728 +76 0.61597624 +77 0.6151563355 +78 0.6140871133 +79 0.6135282346 +80 0.6125324198 +81 0.6116280981 +82 0.610728332 +83 0.6099583251 +84 0.6091488685 +85 0.607908896 +86 0.6066734365 +87 0.6057140146 +88 0.6048230018 +89 0.6038645932 +90 0.603286062 +91 0.6020624808 +92 0.6010184118 +93 0.5997997522 +94 0.5988519958 +95 0.5978163225 +96 0.5968528134 +97 0.5956415364 +98 0.5950270976 +99 0.5940695831 +100 0.5931115406 +101 0.5923869354 +102 0.5916092396 +103 0.5907429201 +104 0.5900058321 +105 0.5892723969 +106 0.5880816919 +107 0.5874404567 +108 0.586706996 +109 0.5859788145 +110 0.5852130651 +111 0.584120197 +112 0.5833940932 +113 0.5823940975 +114 0.5816756827 +115 0.5805052434 +116 0.5794262716 +117 0.5787069798 +118 0.5777818561 +119 0.5768609388 +120 0.576025205 +121 0.5753114138 +122 0.5741571529 +123 0.5731786489 +124 0.5722654206 +125 0.5715580412 +126 0.5707720092 +127 0.5702917491 +128 0.5693833147 +129 0.5687683991 +130 0.567802012 +131 0.5668365444 +132 0.566021783 
+133 0.5654101968 +134 0.5645956993 +135 0.5639859353 +136 0.5632997581 +137 0.5626934341 +138 0.5619667087 +139 0.561355676 +140 0.5606338722 +141 0.5599243215 +142 0.5588081224 +143 0.5582083719 +144 0.5571798342 +145 0.5560722521 +146 0.555395518 +147 0.5548768214 +148 0.5540812016 +149 0.5532853092 +150 0.5524131826 +151 0.5514765126 +152 0.5503822821 +153 0.5496355976 +154 0.5487687332 +155 0.5479253531 +156 0.5468407444 +157 0.5457558547 +158 0.5448993189 +159 0.5442429866 +160 0.54350417 +161 0.5428456068 +162 0.5421104516 +163 0.5410383344 +164 0.5401923571 +165 0.5392873032 +166 0.5386331763 +167 0.5379828811 +168 0.5372601577 +169 0.5364935909 +170 0.5359273042 +171 0.5352867246 +172 0.5346508111 +173 0.5338960375 +174 0.5331440909 +175 0.5325764247 +176 0.5319340825 +177 0.5308882339 +178 0.5301761712 +179 0.5292916383 +180 0.5285842078 +181 0.5280214122 +182 0.5274686984 +183 0.5265111327 +184 0.5254784737 +185 0.5246844462 +186 0.5240639022 +187 0.523193121 +188 0.5224034829 +189 0.5215357968 +190 0.5205158591 +191 0.519499617 +192 0.5189555841 +193 0.5182280072 +194 0.5176118144 +195 0.516600694 +196 0.5160629494 +197 0.5152068564 +198 0.5146640411 +199 0.5139804397 +200 0.5130543666 +201 0.5122875103 +202 0.5114408987 +203 0.5108332038 +204 0.5102274801 +205 0.5096990253 +206 0.5091704939 +207 0.5084650133 +208 0.5076262014 +209 0.5067908296 +210 0.5058066249 +211 0.505107935 +212 0.5043528165 +213 0.5037583411 +214 0.5033062739 +215 0.502780165 +216 0.5019570121 +217 0.5011352003 +218 0.500619271 +219 0.499652671 +220 0.4991394196 +221 0.4984841432 +222 0.4977248609 +223 0.4969089159 +224 0.4962528561 +225 0.4952942176 +226 0.4947139238 +227 0.494205675 +228 0.4934548054 +229 0.4928752099 +230 0.4921438651 +231 0.4915419051 +232 0.4906675262 +233 0.4898703141 +234 0.4894825518 +235 0.4889106836 +236 0.4883435837 +237 0.4875512464 +238 0.4866151299 +239 0.4858255344 +240 0.4852675327 +241 0.4844093025 +242 0.4839843852 +243 0.4834949417 +244 
0.4828630686 +245 0.4820830396 +246 0.4811603895 +247 0.4806003102 +248 0.4801141407 +249 0.4795589915 +250 0.4790101647 +251 0.4783102529 +252 0.4777278134 +253 0.4772474894 +254 0.4764735358 +255 0.4755656251 +256 0.4751491078 +257 0.4743878543 +258 0.4736746209 +259 0.4729804865 +260 0.4724334478 +261 0.4720227505 +262 0.4712655161 +263 0.4704395831 +264 0.4698364777 +265 0.4693632637 +266 0.468612028 +267 0.4677223393 +268 0.4670231215 +269 0.4663278971 +270 0.4658006259 +271 0.4653341855 +272 0.4646414902 +273 0.4639707293 +274 0.4632801924 +275 0.4628103631 +276 0.4620725938 +277 0.4616752139 +278 0.4608016184 +279 0.459932306 +280 0.4593444594 +281 0.4588904636 +282 0.4582101745 +283 0.4575310392 +284 0.4566698628 +285 0.4561560367 +286 0.4554346417 +287 0.454855838 +288 0.4541831996 +289 0.4536689264 +290 0.4531557347 +291 0.4527086743 +292 0.4522080932 +293 0.4516352969 +294 0.4509672991 +295 0.4502509066 +296 0.4495855272 +297 0.4488103007 +298 0.4482808496 +299 0.44764027 +300 0.4471345288 +301 0.4464275283 +302 0.445842513 +303 0.4453389304 +304 0.4445062748 +305 0.4438071592 +306 0.4431103936 +307 0.4424160804 +308 0.4419180283 +309 0.4410959354 +310 0.4403396802 +311 0.439715841 +312 0.4391380974 +313 0.4387702474 +314 0.4382839075 +315 0.437851578 +316 0.4370380257 +317 0.4365532909 +318 0.4359790853 +319 0.4353643443 +320 0.4349343138 +321 0.4341925383 +322 0.4335165833 +323 0.4328867538 +324 0.4323144172 +325 0.4316435754 +326 0.4310806351 +327 0.4305228548 +328 0.4299000289 +329 0.4294867728 +330 0.4287576846 +331 0.4281379964 +332 0.4274753375 +333 0.4269457119 +334 0.4265914985 +335 0.4261210859 +336 0.4253347473 +337 0.4246753199 +338 0.4242052266 +339 0.423488817 +340 0.4228350009 +341 0.4222266035 +342 0.4217495535 +343 0.4212821083 +344 0.4208801644 +345 0.4203901972 +346 0.4198750854 +347 0.4193912702 +348 0.4187875901 +349 0.4183272166 +350 0.4178133053 +351 0.4172735342 +352 0.4165695991 +353 0.416060222 +354 0.4154641713 +355 0.414705119 
+356 0.4140638581 +357 0.4134714263 +358 0.4131337319 +359 0.4126064479 +360 0.4121523883 +361 0.4115223118 +362 0.4110527294 +363 0.4106555283 +364 0.4100904507 +365 0.4095082581 +366 0.4088843039 +367 0.4081387094 +368 0.4073968657 +369 0.40671779 +370 0.4063282268 +371 0.4058952715 +372 0.4054609069 +373 0.4049458589 +374 0.4045187661 +375 0.4038458211 +376 0.4031138676 +377 0.4025977935 +378 0.4021515165 +379 0.4017208729 +380 0.4012702831 +381 0.400815968 +382 0.4003585875 +383 0.3996368136 +384 0.3989150098 +385 0.3984945757 +386 0.3979838576 +387 0.3975541421 +388 0.3969472051 +389 0.3965793593 +390 0.3960817797 +391 0.3953677969 +392 0.3948702344 +393 0.3943742556 +394 0.3938194088 +395 0.393111263 +396 0.3924046883 +397 0.3919914365 +398 0.3914010099 +399 0.390907675 +400 0.390361424 +401 0.3897764002 +402 0.3892910055 +403 0.3889340375 +404 0.3882963104 +405 0.3878849873 +406 0.3873047105 +407 0.3869512975 +408 0.3864888975 +409 0.3860683399 +410 0.3854924994 +411 0.3850928077 +412 0.3847414127 +413 0.3842214559 +414 0.3839176638 +415 0.3834964207 +416 0.3829643939 +417 0.3826096952 +418 0.3820993347 +419 0.3815246395 +420 0.3812263012 +421 0.380756293 +422 0.3803054477 +423 0.3799018434 +424 0.3793343902 +425 0.3789422768 +426 0.3786449858 +427 0.3781736961 +428 0.3776146003 +429 0.3772632352 +430 0.3768726545 +431 0.3763188847 +432 0.3759762943 +433 0.3756239925 +434 0.3752392616 +435 0.3746320861 +436 0.3739689333 +437 0.3736773006 +438 0.3732340549 +439 0.3725745422 +440 0.3719174904 +441 0.3714807417 +442 0.3710459641 +443 0.3703917946 +444 0.3699332518 +445 0.3694799159 +446 0.3688319496 +447 0.3682378743 +448 0.3677363523 +449 0.3674085396 +450 0.367077423 +451 0.3667051792 +452 0.3664227 +453 0.3660525254 +454 0.3654101065 +455 0.3649876288 +456 0.3646104591 +457 0.364115434 +458 0.3637939692 +459 0.3634185621 +460 0.3628906863 +461 0.362450093 +462 0.3618173812 +463 0.3612926432 +464 0.3608229033 +465 0.3603342303 +466 0.3599510874 +467 
0.3593257538 +468 0.3588909294 +469 0.3584752466 +470 0.3580092447 +471 0.3575279159 +472 0.3570059666 +473 0.356525468 +474 0.3561132167 +475 0.3557490196 +476 0.3554368232 +477 0.3550801958 +478 0.3544650078 +479 0.3541559832 +480 0.3537509612 +481 0.3532782325 +482 0.3527637294 +483 0.3522067794 +484 0.3516512513 +485 0.3513434274 +486 0.3509157343 +487 0.3505166599 +488 0.3500145929 +489 0.3496588809 +490 0.3492625185 +491 0.3488656453 +492 0.3484638504 +493 0.3480004881 +494 0.3476062758 +495 0.3472642175 +496 0.3468141258 +497 0.3463197521 +498 0.3460200386 +499 0.3455273892 +500 0.3449356386 +501 0.3445860488 +502 0.3441699871 +503 0.3437617634 +504 0.343414247 +505 0.3430223976 +506 0.34276789 +507 0.3423123913 +508 0.3419258424 +509 0.3414402349 +510 0.3409900154 +511 0.3404066435 +512 0.3400745732 +513 0.3396446535 +514 0.3392627665 +515 0.3389605539 +516 0.3386215099 +517 0.3381765485 +518 0.3376931037 +519 0.337248815 +520 0.3367245538 +521 0.3363859143 +522 0.3359881639 +523 0.335651734 +524 0.335174254 +525 0.3346553019 +526 0.3342645126 +527 0.333931561 +528 0.3334565205 +529 0.3331309302 +530 0.3328454452 +531 0.3325251554 +532 0.3321362521 +533 0.3318056336 +534 0.3313364259 +535 0.3309532276 +536 0.3306332145 +537 0.3302503697 +538 0.3298849165 +539 0.3294564698 +540 0.3290752811 +541 0.3286145287 +542 0.3283276004 +543 0.327948004 +544 0.3276326656 +545 0.3273104302 +546 0.3269880712 +547 0.3266501256 +548 0.3262259236 +549 0.3258500312 +550 0.3253507018 +551 0.3250135141 +552 0.3246421516 +553 0.3244070964 +554 0.3240867768 +555 0.3238518877 +556 0.3235307634 +557 0.3231597074 +558 0.3226190891 +559 0.3221721266 +560 0.3217267905 +561 0.32127652 +562 0.3210437383 +563 0.3208125659 +564 0.3204400071 +565 0.3199909372 +566 0.3196365365 +567 0.3191042415 +568 0.3186643677 +569 0.3183591408 +570 0.3178278974 +571 0.3174204188 +572 0.3169393625 +573 0.3165036993 +574 0.3161586651 +575 0.3158007775 +576 0.3153626706 +577 0.3148854673 +578 0.3143635946 
+579 0.3138878771 +580 0.3134522523 +581 0.3131497758 +582 0.3128520804 +583 0.3124257113 +584 0.3121328269 +585 0.311617566 +586 0.3113503967 +587 0.3110510664 +588 0.3105376831 +589 0.3101793698 +590 0.3097576456 +591 0.3093302974 +592 0.3089044435 +593 0.3085143992 +594 0.308161561 +595 0.307898977 +596 0.3076050154 +597 0.3073084865 +598 0.3070910786 +599 0.3066311819 +600 0.3063479534 +601 0.3059329774 +602 0.3055891906 +603 0.3052941263 +604 0.3049547076 +605 0.3044544842 +606 0.3041649589 +607 0.3036680775 +608 0.3033767513 +609 0.3029951879 +610 0.3025867684 +611 0.3020925096 +612 0.3018062498 +613 0.3013960038 +614 0.3010602551 +615 0.3007759707 +616 0.3005227702 +617 0.3001474057 +618 0.2998298875 +619 0.2994279989 +620 0.2990533326 +621 0.2986103424 +622 0.2982386308 +623 0.2980017236 +624 0.2976457221 +625 0.2973187694 +626 0.2970820921 +627 0.2968406635 +628 0.29651102 +629 0.2962718989 +630 0.2960237563 +631 0.2957421456 +632 0.2954565712 +633 0.2951291544 +634 0.2948581576 +635 0.2944618165 +636 0.2940285674 +637 0.2937956708 +638 0.2935192755 +639 0.2932442342 +640 0.2928142079 +641 0.2925079678 +642 0.2922439575 +643 0.2919395225 +644 0.2915480094 +645 0.2912747434 +646 0.2908083541 +647 0.2906082656 +648 0.2902667182 +649 0.2900387049 +650 0.2898409878 +651 0.289457913 +652 0.2891859795 +653 0.288721157 +654 0.288452323 +655 0.2882551636 +656 0.2879799902 +657 0.2876007472 +658 0.287326932 +659 0.2869494557 +660 0.2866300855 +661 0.286373198 +662 0.2859906852 +663 0.2855351972 +664 0.2851559839 +665 0.2848492605 +666 0.2845873215 +667 0.2843303766 +668 0.2840162835 +669 0.2837108139 +670 0.2833648005 +671 0.2830218524 +672 0.2826870212 +673 0.2823174106 +674 0.2820060615 +675 0.2815587755 +676 0.2813848065 +677 0.28112724 +678 0.2808396901 +679 0.2806121494 +680 0.2802471199 +681 0.2798035315 +682 0.2795483172 +683 0.2792404869 +684 0.2789540802 +685 0.2786558292 +686 0.2782940418 +687 0.277895276 +688 0.2775917564 +689 0.2772567102 +690 
0.2770056639 +691 0.2768216282 +692 0.2765277241 +693 0.2762676307 +694 0.2760578649 +695 0.2757622919 +696 0.2754316394 +697 0.2749989714 +698 0.2745687238 +699 0.2743486528 +700 0.2741394831 +701 0.2738577638 +702 0.2736173953 +703 0.2734091218 +704 0.2732017615 +705 0.2729077467 +706 0.2726194156 +707 0.2723656573 +708 0.2721585333 +709 0.2719530272 +710 0.2716654135 +711 0.2713763544 +712 0.271054617 +713 0.270765279 +714 0.2705314734 +715 0.2701860922 +716 0.2699523228 +717 0.2695332468 +718 0.2692927846 +719 0.2690525545 +720 0.2686732369 +721 0.2684303522 +722 0.268084062 +723 0.2678163158 +724 0.2675443845 +725 0.2673065875 +726 0.267035761 +727 0.2668061469 +728 0.2665300156 +729 0.2663027665 +730 0.2661044661 +731 0.2658744007 +732 0.2655614189 +733 0.265313834 +734 0.2650115107 +735 0.2646720558 +736 0.2644053825 +737 0.2639993003 +738 0.2637928831 +739 0.26338629 +740 0.2631142416 +741 0.262746945 +742 0.2625778381 +743 0.26234528 +744 0.262142009 +745 0.2617390794 +746 0.2614471678 +747 0.2611715304 +748 0.2608939814 +749 0.260690721 +750 0.2603874398 +751 0.2601193339 +752 0.2597940266 +753 0.2595251692 +754 0.2592016033 +755 0.2588046512 +756 0.2584771727 +757 0.258083484 +758 0.2576890822 +759 0.2575262125 +760 0.2573358842 +761 0.2571070407 +762 0.2567511073 +763 0.2564899155 +764 0.2561713372 +765 0.2558850923 +766 0.255597336 +767 0.2553038406 +768 0.2551415861 +769 0.2549170256 +770 0.2547311187 +771 0.2544388579 +772 0.2541200263 +773 0.253959594 +774 0.2537388567 +775 0.2534203806 +776 0.2531755737 +777 0.2529106076 +778 0.25252861 +779 0.2522401299 +780 0.2518930776 +781 0.2515788674 +782 0.2513293752 +783 0.2510782033 +784 0.2506980555 +785 0.2503226527 +786 0.2499814757 +787 0.2496719041 +788 0.2494325127 +789 0.249093871 +790 0.2488785131 +791 0.2486212573 +792 0.2483151789 +793 0.2481008215 +794 0.2478614556 +795 0.2475220348 +796 0.2471528138 +797 0.2467823178 +798 0.2465691545 +799 0.2464111426 +800 0.2462568326 +801 0.245888627 +802 
0.2457131488 +803 0.2454448321 +804 0.24514739 +805 0.2449321066 +806 0.244754027 +807 0.2444520933 +808 0.244150364 +809 0.2438756845 +810 0.2436781462 +811 0.2434370794 +812 0.243231222 +813 0.2429795031 +814 0.2428085953 +815 0.2425811674 +816 0.242311599 +817 0.2419550121 +818 0.2418074012 +819 0.2415643207 +820 0.2412046386 +821 0.2410006757 +822 0.240788513 +823 0.2405819957 +824 0.2402247212 +825 0.2400239493 +826 0.2398227815 +827 0.2394675868 +828 0.2391150721 +829 0.238944539 +830 0.2387435521 +831 0.2385065428 +832 0.2383424767 +833 0.2381619513 +834 0.2378786377 +835 0.2375881033 +836 0.2373639579 +837 0.2370155092 +838 0.2366988765 +839 0.2364143282 +840 0.2361525744 +841 0.2359784991 +842 0.2357424561 +843 0.2355073158 +844 0.2353411147 +845 0.2350888635 +846 0.2348985119 +847 0.2346166734 +848 0.2343535147 +849 0.2340152775 +850 0.2338253856 +851 0.2335178043 +852 0.2333585535 +853 0.2331409327 +854 0.232799132 +855 0.2326270938 +856 0.232290253 +857 0.2320625271 +858 0.2317883436 +859 0.2315655244 +860 0.2313702852 +861 0.2311274941 +862 0.2307902468 +863 0.230628665 +864 0.2304723965 +865 0.2302186894 +866 0.2300390857 +867 0.2297347912 +868 0.2295422937 +869 0.2294070827 +870 0.229074195 +871 0.2288957728 +872 0.2287379333 +873 0.2284071765 +874 0.2281981345 +875 0.2279745298 +876 0.2277943641 +877 0.227464397 +878 0.2272489561 +879 0.2270080894 +880 0.226743977 +881 0.2265238464 +882 0.2261985966 +883 0.2259350951 +884 0.2256665911 +885 0.2254206049 +886 0.2251793891 +887 0.2249354039 +888 0.224719188 +889 0.2244787983 +890 0.2242641449 +891 0.2240526804 +892 0.2239209754 +893 0.2236602732 +894 0.2235089796 +895 0.2233028178 +896 0.2231716088 +897 0.2229924926 +898 0.2226752298 +899 0.2225238638 +900 0.2223377015 +901 0.2220517312 +902 0.2217935941 +903 0.2215100569 +904 0.2213576053 +905 0.2210437059 +906 0.2207309093 +907 0.2205256373 +908 0.2203404031 +909 0.2201428967 +910 0.2199867836 +911 0.2197325953 +912 0.21941949 +913 0.2192427261 +914 
0.2190470483 +915 0.2188108265 +916 0.2185763461 +917 0.218397545 +918 0.2182284423 +919 0.2180648446 +920 0.2178877081 +921 0.2177140968 +922 0.2174067859 +923 0.2172427475 +924 0.2170075242 +925 0.2167770352 +926 0.2166043286 +927 0.2163751168 +928 0.2161562507 +929 0.2159849725 +930 0.2157700509 +931 0.2155961884 +932 0.2153693672 +933 0.2152064868 +934 0.2149531543 +935 0.2146568788 +936 0.2144625975 +937 0.2142915385 +938 0.2141212757 +939 0.2139780266 +940 0.2137883178 +941 0.2135832033 +942 0.213338275 +943 0.2130945367 +944 0.2127992234 +945 0.2125003551 +946 0.2123049072 +947 0.2120079526 +948 0.2117684079 +949 0.2115994564 +950 0.2113548624 +951 0.2111350383 +952 0.2108422305 +953 0.2106553572 +954 0.2103629942 +955 0.2100700906 +956 0.2098822147 +957 0.2096414268 +958 0.2094775353 +959 0.209343031 +960 0.2091770747 +961 0.2089391159 +962 0.2087473593 +963 0.2084878087 +964 0.2082003291 +965 0.2079655379 +966 0.2078309123 +967 0.2076701543 +968 0.2074827892 +969 0.2072977998 +970 0.2071178449 +971 0.2069490254 +972 0.2068189063 +973 0.2066576438 +974 0.2063765356 +975 0.2061198865 +976 0.2059267993 +977 0.2057632846 +978 0.2055076659 +979 0.2053591971 +980 0.2050760154 +981 0.2047961482 +982 0.2045843367 +983 0.2043832455 +984 0.2041069418 +985 0.2038964459 +986 0.2037365884 +987 0.2035126856 +988 0.203312993 +989 0.2031612588 +990 0.2029575471 +991 0.202779838 +992 0.2026532761 +993 0.2023749756 +994 0.2022287718 +995 0.2019803503 +996 0.2017732369 +997 0.201497757 +998 0.2012941028 +999 0.2011675132 diff --git a/catboost_info/time_left.tsv b/catboost_info/time_left.tsv index 911fdf9..6dc4bb5 100644 --- a/catboost_info/time_left.tsv +++ b/catboost_info/time_left.tsv @@ -1,51 +1,1001 @@ iter Passed Remaining -0 56 2790 -1 57 1383 -2 58 911 -3 60 700 -4 61 553 -5 61 453 -6 62 382 -7 62 329 -8 63 287 -9 63 253 -10 63 226 -11 64 204 -12 64 184 -13 65 168 -14 65 153 -15 66 140 -16 66 129 -17 67 119 -18 67 110 -19 71 107 -20 72 99 -21 72 91 -22 72 84 -23 72 78 
-24 72 72 -25 72 67 -26 73 62 -27 73 57 -28 73 53 -29 73 49 -30 73 45 -31 74 41 -32 74 38 -33 74 35 -34 74 31 -35 74 29 -36 74 26 -37 75 23 -38 75 21 -39 75 18 -40 75 16 -41 75 14 -42 76 12 -43 76 10 -44 76 8 -45 76 6 -46 76 4 -47 76 3 -48 77 1 -49 77 0 +0 0 550 +1 0 391 +2 1 350 +3 1 331 +4 1 311 +5 1 291 +6 1 281 +7 2 269 +8 2 276 +9 2 269 +10 2 263 +11 3 262 +12 5 409 +13 5 396 +14 5 384 +15 6 373 +16 6 364 +17 6 365 +18 6 356 +19 7 351 +20 7 347 +21 7 340 +22 7 334 +23 8 327 +24 8 323 +25 8 317 +26 8 313 +27 8 309 +28 9 305 +29 9 300 +30 9 297 +31 9 292 +32 9 289 +33 10 285 +34 10 282 +35 10 280 +36 10 277 +37 10 274 +38 11 277 +39 11 274 +40 11 273 +41 11 271 +42 12 268 +43 12 265 +44 12 263 +45 12 261 +46 12 259 +47 13 257 +48 13 256 +49 13 255 +50 13 253 +51 13 252 +52 14 250 +53 14 248 +54 14 247 +55 14 246 +56 14 245 +57 14 243 +58 15 242 +59 15 241 +60 15 240 +61 15 239 +62 16 238 +63 16 237 +64 16 237 +65 16 235 +66 16 234 +67 16 232 +68 17 231 +69 17 230 +70 19 248 +71 19 248 +72 19 247 +73 19 247 +74 20 247 +75 20 246 +76 20 245 +77 20 244 +78 20 243 +79 21 245 +80 21 244 +81 21 243 +82 21 242 +83 22 241 +84 22 240 +85 22 239 +86 22 238 +87 22 237 +88 23 236 +89 23 235 +90 23 236 +91 23 235 +92 24 234 +93 24 233 +94 24 232 +95 24 231 +96 24 230 +97 24 229 +98 25 228 +99 25 227 +100 25 226 +101 25 225 +102 25 225 +103 26 224 +104 26 223 +105 26 222 +106 26 221 +107 26 221 +108 27 220 +109 27 220 +110 27 219 +111 27 219 +112 27 219 +113 28 218 +114 28 217 +115 28 217 +116 28 216 +117 28 215 +118 29 215 +119 29 214 +120 29 213 +121 29 213 +122 29 212 +123 30 212 +124 33 235 +125 33 235 +126 34 234 +127 34 233 +128 34 233 +129 34 233 +130 35 232 +131 35 231 +132 35 231 +133 35 230 +134 35 230 +135 36 229 +136 36 229 +137 36 230 +138 47 295 +139 48 295 +140 48 294 +141 48 292 +142 48 292 +143 49 291 +144 49 290 +145 49 288 +146 49 288 +147 49 287 +148 50 286 +149 50 285 +150 50 284 +151 50 283 +152 51 283 +153 51 282 +154 51 281 +155 51 280 +156 52 279 +157 
52 278 +158 52 277 +159 52 276 +160 52 275 +161 53 274 +162 53 274 +163 53 273 +164 53 271 +165 53 271 +166 54 270 +167 54 269 +168 55 273 +169 55 272 +170 56 272 +171 56 272 +172 56 271 +173 57 270 +174 57 270 +175 57 269 +176 57 269 +177 58 268 +178 58 267 +179 58 266 +180 58 266 +181 59 265 +182 60 269 +183 60 268 +184 60 267 +185 61 268 +186 61 268 +187 62 268 +188 62 268 +189 67 288 +190 68 288 +191 68 287 +192 68 287 +193 68 286 +194 69 286 +195 69 285 +196 69 284 +197 70 283 +198 70 283 +199 70 282 +200 70 281 +201 71 280 +202 71 280 +203 78 304 +204 78 305 +205 79 304 +206 79 304 +207 79 304 +208 80 304 +209 80 303 +210 81 303 +211 81 303 +212 82 303 +213 82 303 +214 83 303 +215 83 303 +216 84 303 +217 84 303 +218 85 303 +219 96 343 +220 109 385 +221 109 384 +222 109 382 +223 110 381 +224 110 380 +225 110 379 +226 112 382 +227 115 391 +228 117 395 +229 118 396 +230 119 399 +231 120 398 +232 120 397 +233 121 397 +234 121 396 +235 122 395 +236 122 395 +237 123 395 +238 123 394 +239 125 397 +240 125 396 +241 126 395 +242 126 395 +243 129 401 +244 130 402 +245 130 401 +246 131 400 +247 131 399 +248 132 398 +249 132 398 +250 133 397 +251 134 397 +252 134 397 +253 136 400 +254 136 399 +255 137 398 +256 138 399 +257 138 398 +258 140 401 +259 141 402 +260 142 402 +261 142 402 +262 143 402 +263 144 404 +264 145 403 +265 146 403 +266 147 404 +267 148 404 +268 150 408 +269 152 413 +270 153 413 +271 154 412 +272 154 411 +273 155 411 +274 155 410 +275 156 410 +276 157 410 +277 157 410 +278 158 409 +279 161 415 +280 161 414 +281 162 413 +282 163 413 +283 163 412 +284 164 411 +285 164 410 +286 164 409 +287 165 408 +288 166 409 +289 166 408 +290 167 407 +291 169 411 +292 170 412 +293 171 412 +294 172 412 +295 173 412 +296 173 411 +297 174 411 +298 176 413 +299 176 412 +300 177 411 +301 177 411 +302 178 410 +303 179 410 +304 179 409 +305 180 408 +306 180 407 +307 181 407 +308 182 407 +309 182 406 +310 183 406 +311 183 405 +312 184 405 +313 185 405 +314 186 404 +315 186 404 
+316 188 405 +317 188 405 +318 189 404 +319 190 405 +320 191 404 +321 193 406 +322 194 407 +323 195 407 +324 196 408 +325 197 408 +326 198 408 +327 198 407 +328 199 406 +329 199 406 +330 200 405 +331 201 404 +332 201 403 +333 202 403 +334 202 402 +335 203 402 +336 203 401 +337 204 400 +338 205 399 +339 205 398 +340 206 398 +341 206 397 +342 207 397 +343 208 397 +344 209 397 +345 209 396 +346 210 396 +347 211 395 +348 211 394 +349 212 394 +350 212 393 +351 214 395 +352 216 396 +353 216 395 +354 217 395 +355 218 394 +356 218 393 +357 219 393 +358 220 394 +359 221 394 +360 222 393 +361 222 392 +362 223 392 +363 224 391 +364 224 391 +365 225 390 +366 226 390 +367 226 389 +368 228 390 +369 230 392 +370 231 391 +371 235 396 +372 235 396 +373 236 395 +374 237 395 +375 237 394 +376 238 394 +377 239 393 +378 239 392 +379 240 391 +380 241 392 +381 242 391 +382 243 391 +383 243 390 +384 244 389 +385 244 389 +386 245 389 +387 246 388 +388 247 388 +389 247 387 +390 248 386 +391 248 385 +392 249 384 +393 249 384 +394 250 383 +395 250 382 +396 251 381 +397 251 380 +398 252 380 +399 252 379 +400 253 378 +401 253 377 +402 254 376 +403 255 376 +404 256 376 +405 258 377 +406 259 377 +407 260 378 +408 261 378 +409 262 377 +410 263 377 +411 264 377 +412 265 377 +413 266 376 +414 267 377 +415 268 376 +416 268 375 +417 269 375 +418 273 378 +419 274 379 +420 276 379 +421 277 379 +422 277 378 +423 278 378 +424 278 377 +425 279 376 +426 280 376 +427 281 375 +428 281 374 +429 282 374 +430 282 373 +431 283 372 +432 283 371 +433 284 371 +434 285 370 +435 286 370 +436 286 369 +437 287 368 +438 287 367 +439 288 366 +440 289 366 +441 289 366 +442 290 365 +443 290 364 +444 291 363 +445 292 362 +446 292 361 +447 293 361 +448 293 360 +449 294 359 +450 294 358 +451 295 358 +452 296 357 +453 296 356 +454 297 355 +455 297 354 +456 298 354 +457 299 353 +458 300 353 +459 300 352 +460 301 352 +461 301 351 +462 302 350 +463 302 349 +464 304 349 +465 304 349 +466 309 353 +467 310 352 +468 311 352 +469 312 
351 +470 312 351 +471 313 350 +472 315 351 +473 315 350 +474 316 349 +475 316 348 +476 317 347 +477 317 346 +478 317 345 +479 318 344 +480 318 343 +481 319 342 +482 319 341 +483 319 340 +484 320 339 +485 320 339 +486 320 338 +487 321 337 +488 321 336 +489 321 335 +490 322 334 +491 322 333 +492 323 332 +493 323 331 +494 323 330 +495 324 329 +496 324 328 +497 325 327 +498 325 326 +499 325 325 +500 326 324 +501 326 323 +502 326 323 +503 327 322 +504 327 321 +505 328 320 +506 328 319 +507 329 318 +508 329 317 +509 329 316 +510 330 316 +511 330 315 +512 331 314 +513 331 313 +514 332 312 +515 332 312 +516 333 311 +517 333 310 +518 333 309 +519 334 308 +520 334 307 +521 337 309 +522 338 308 +523 339 308 +524 339 307 +525 340 306 +526 341 306 +527 342 305 +528 342 305 +529 343 304 +530 346 305 +531 346 304 +532 347 304 +533 347 303 +534 348 302 +535 348 301 +536 349 301 +537 349 300 +538 350 299 +539 350 298 +540 350 297 +541 351 296 +542 351 296 +543 352 295 +544 352 294 +545 352 293 +546 353 292 +547 353 291 +548 354 290 +549 354 289 +550 354 289 +551 355 288 +552 355 287 +553 355 286 +554 356 285 +555 357 285 +556 357 284 +557 357 283 +558 358 282 +559 358 281 +560 358 280 +561 359 279 +562 359 279 +563 359 278 +564 360 277 +565 360 276 +566 360 275 +567 361 274 +568 361 273 +569 362 273 +570 362 272 +571 362 271 +572 362 270 +573 363 269 +574 363 268 +575 363 267 +576 364 267 +577 364 266 +578 365 265 +579 366 265 +580 366 264 +581 366 263 +582 367 262 +583 367 262 +584 368 261 +585 368 260 +586 369 259 +587 369 258 +588 369 258 +589 370 257 +590 370 256 +591 370 255 +592 371 254 +593 371 253 +594 371 253 +595 372 252 +596 372 251 +597 373 250 +598 373 250 +599 374 249 +600 374 248 +601 374 247 +602 375 247 +603 375 246 +604 376 245 +605 376 244 +606 376 243 +607 377 243 +608 378 242 +609 378 242 +610 379 241 +611 380 240 +612 380 240 +613 381 239 +614 382 239 +615 383 239 +616 385 239 +617 385 238 +618 386 237 +619 387 237 +620 387 236 +621 388 235 +622 389 235 +623 
389 234 +624 390 234 +625 390 233 +626 391 232 +627 392 232 +628 393 231 +629 393 231 +630 394 230 +631 395 230 +632 395 229 +633 396 229 +634 397 228 +635 398 227 +636 398 227 +637 399 226 +638 399 225 +639 400 225 +640 401 224 +641 401 224 +642 402 223 +643 403 222 +644 403 222 +645 404 221 +646 405 221 +647 406 220 +648 407 220 +649 407 219 +650 408 218 +651 409 218 +652 411 218 +653 411 217 +654 412 217 +655 413 216 +656 415 216 +657 416 216 +658 419 216 +659 419 216 +660 420 215 +661 420 214 +662 421 214 +663 421 213 +664 422 212 +665 422 212 +666 423 211 +667 424 210 +668 424 210 +669 425 209 +670 425 208 +671 426 208 +672 426 207 +673 427 206 +674 427 206 +675 428 205 +676 428 204 +677 429 203 +678 430 203 +679 430 202 +680 431 201 +681 431 201 +682 432 200 +683 432 199 +684 433 199 +685 433 198 +686 434 197 +687 434 197 +688 435 196 +689 435 195 +690 436 195 +691 437 194 +692 437 193 +693 438 193 +694 439 192 +695 439 192 +696 440 191 +697 440 190 +698 441 190 +699 443 190 +700 444 189 +701 445 189 +702 446 188 +703 447 188 +704 448 187 +705 448 186 +706 449 186 +707 449 185 +708 450 184 +709 450 184 +710 450 183 +711 451 182 +712 451 181 +713 452 181 +714 452 180 +715 452 179 +716 453 178 +717 453 178 +718 453 177 +719 454 176 +720 454 175 +721 454 175 +722 455 174 +723 455 173 +724 456 173 +725 457 172 +726 457 171 +727 458 171 +728 458 170 +729 458 169 +730 459 169 +731 460 168 +732 460 167 +733 461 167 +734 461 166 +735 461 165 +736 462 164 +737 462 164 +738 462 163 +739 463 162 +740 463 162 +741 463 161 +742 464 160 +743 465 160 +744 465 159 +745 465 158 +746 466 157 +747 466 157 +748 467 156 +749 467 155 +750 467 155 +751 468 154 +752 468 153 +753 468 152 +754 469 152 +755 469 151 +756 469 150 +757 470 150 +758 470 149 +759 471 148 +760 471 148 +761 473 148 +762 475 147 +763 475 146 +764 475 146 +765 476 145 +766 476 144 +767 476 144 +768 480 144 +769 480 143 +770 481 142 +771 481 142 +772 482 141 +773 482 140 +774 483 140 +775 483 139 +776 484 139 
+777 484 138 +778 485 137 +779 486 137 +780 486 136 +781 487 135 +782 488 135 +783 488 134 +784 491 134 +785 492 134 +786 493 133 +787 493 132 +788 494 132 +789 494 131 +790 496 131 +791 497 130 +792 497 129 +793 498 129 +794 498 128 +795 499 127 +796 499 127 +797 500 126 +798 501 126 +799 501 125 +800 502 124 +801 503 124 +802 504 123 +803 505 123 +804 506 122 +805 508 122 +806 509 121 +807 510 121 +808 511 120 +809 511 120 +810 513 119 +811 514 119 +812 515 118 +813 516 117 +814 517 117 +815 518 116 +816 519 116 +817 519 115 +818 520 115 +819 521 114 +820 521 113 +821 522 113 +822 523 112 +823 523 111 +824 524 111 +825 524 110 +826 525 109 +827 526 109 +828 527 108 +829 528 108 +830 528 107 +831 529 106 +832 530 106 +833 530 105 +834 531 104 +835 531 104 +836 532 103 +837 532 103 +838 533 102 +839 533 101 +840 534 101 +841 535 100 +842 535 99 +843 536 99 +844 536 98 +845 537 97 +846 538 97 +847 538 96 +848 538 95 +849 539 95 +850 540 94 +851 540 93 +852 541 93 +853 541 92 +854 542 91 +855 545 91 +856 545 91 +857 546 90 +858 546 89 +859 547 89 +860 548 88 +861 549 87 +862 552 87 +863 553 87 +864 554 86 +865 555 85 +866 556 85 +867 557 84 +868 558 84 +869 559 83 +870 559 82 +871 560 82 +872 561 81 +873 562 81 +874 563 80 +875 564 79 +876 564 79 +877 565 78 +878 566 77 +879 567 77 +880 567 76 +881 568 76 +882 569 75 +883 569 74 +884 570 74 +885 570 73 +886 571 72 +887 571 72 +888 573 71 +889 574 70 +890 574 70 +891 575 69 +892 575 68 +893 576 68 +894 577 67 +895 577 67 +896 578 66 +897 579 65 +898 579 65 +899 580 64 +900 580 63 +901 581 63 +902 581 62 +903 582 61 +904 583 61 +905 583 60 +906 583 59 +907 584 59 +908 585 58 +909 585 57 +910 586 57 +911 586 56 +912 587 55 +913 587 55 +914 588 54 +915 588 54 +916 589 53 +917 590 52 +918 591 52 +919 591 51 +920 592 50 +921 593 50 +922 593 49 +923 594 48 +924 594 48 +925 595 47 +926 596 47 +927 597 46 +928 599 45 +929 600 45 +930 601 44 +931 602 43 +932 603 43 +933 603 42 +934 604 42 +935 605 41 +936 605 40 +937 606 40 
+938 607 39 +939 607 38 +940 608 38 +941 608 37 +942 609 36 +943 611 36 +944 612 35 +945 612 34 +946 613 34 +947 613 33 +948 614 33 +949 614 32 +950 615 31 +951 615 31 +952 616 30 +953 616 29 +954 617 29 +955 617 28 +956 618 27 +957 618 27 +958 619 26 +959 620 25 +960 620 25 +961 621 24 +962 621 23 +963 622 23 +964 622 22 +965 623 21 +966 624 21 +967 625 20 +968 625 20 +969 626 19 +970 629 18 +971 630 18 +972 631 17 +973 632 16 +974 632 16 +975 633 15 +976 635 14 +977 636 14 +978 637 13 +979 637 13 +980 638 12 +981 639 11 +982 640 11 +983 643 10 +984 644 9 +985 645 9 +986 646 8 +987 646 7 +988 647 7 +989 651 6 +990 652 5 +991 652 5 +992 653 4 +993 653 3 +994 655 3 +995 656 2 +996 657 1 +997 657 1 +998 657 0 +999 658 0 diff --git a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py index 6d052c9..317ae81 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py @@ -1,3 +1,4 @@ +# encode.py # Copyright (c) MariaDB Foundation. # Distributed under the terms of the Modified BSD License. 
@@ -126,23 +127,35 @@ def execute(self, kernel, data): result_df = df if inplace else df.copy() try: + # We'll store encoder info here to save into data at the end + encoder_obj = None + label_mappings = None + if method == "label": # Use pandas.factorize which handles NaN by assigning -1 codes + label_mappings = {} for col in columns: codes, uniques = pd.factorize(result_df[col], sort=True) new_col = f"{col}_lbl" result_df[new_col] = codes + # Save mapping value->code for reuse later + mapping = {val: idx for idx, val in enumerate(uniques)} + label_mappings[col] = mapping if drop_original: result_df.drop(columns=[col], inplace=True) + encoder_obj = label_mappings + elif method == "onehot": # sklearn OneHotEncoder with version compatibility encoder = self._make_ohe(handle_unknown="ignore") # replace NaN with sentinel string so it's treated as a category - arr = encoder.fit_transform(result_df[columns].astype(object).fillna("___MISSING___")) + tmp = result_df[columns].astype(object).fillna("___MISSING___") + arr = encoder.fit_transform(tmp) # feature names (sklearn >= 1.0) try: feature_names = encoder.get_feature_names_out(columns) + feature_names = [str(fn) for fn in feature_names] except Exception: # fallback: build names manually cats = encoder.categories_ @@ -150,12 +163,16 @@ def execute(self, kernel, data): for cname, cat_list in zip(columns, cats): for cat in cat_list: feature_names.append(f"{cname}_{str(cat)}") + # create DataFrame of encoded features ohe_df = pd.DataFrame(arr, columns=feature_names, index=result_df.index) + # concatenate appropriately if drop_original: result_df = pd.concat([result_df.drop(columns=columns), ohe_df], axis=1) else: result_df = pd.concat([result_df, ohe_df], axis=1) + encoder_obj = encoder # save fitted OneHotEncoder + elif method == "ordinal": # use sklearn OrdinalEncoder for one or multiple columns (automatic ordering) enc = OrdinalEncoder(dtype=np.float64) @@ -167,17 +184,29 @@ def execute(self, kernel, data): if 
drop_original: result_df.drop(columns=[col], inplace=True) + encoder_obj = enc + else: kernel._send_message("stderr", "Unsupported method. Supported: label, onehot, ordinal.") return - # Apply result + # Apply result back to shared data if inplace if inplace: data["last_select"] = result_df kernel._send_message("stdout", "Encoded columns in-place and updated last_select.") else: kernel._send_message("stdout", "Displayed encoded result (last_select not modified).") + # Save encoder (or mapping) to shared data for downstream pipeline usage + try: + if encoder_obj is not None: + data["last_select_encoder"] = encoder_obj + elif label_mappings is not None: + data["last_select_encoder"] = label_mappings + except Exception: + # don't fail pipeline just because we couldn't save encoder + pass + # display self._send_html(kernel, result_df) diff --git a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/ml_pipeline.py b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/ml_pipeline.py new file mode 100644 index 0000000..35bd4b4 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/ml_pipeline.py @@ -0,0 +1,366 @@ +# mlpipeline.py +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. 
class MLPipeline(MariaMagic):
    """
    %mlpipeline target=target_col problem=classification|regression [features=col1,col2,...] [model=rf|auto]
                [save_path=/path/to/model.joblib]

    Automates an end-to-end ML pipeline on data['last_select'] with minimal input.

    Stages, in order: drop rows with missing values, encode categorical
    features, select features (only when ``features`` is not given),
    standardize numeric features, split into train/test, select or train a
    model, evaluate it, and optionally save it when ``save_path`` is given.
    Each stage is delegated to the corresponding magic command, which reads
    and writes the shared ``data`` dict.
    """

    # Tokens accepted as booleans on the command line. These mirror the sets
    # understood by distutils.util.strtobool so parsing stays compatible while
    # no longer depending on distutils (removed from the stdlib in 3.12).
    _TRUE_TOKENS = frozenset(("y", "yes", "t", "true", "on", "1"))
    _FALSE_TOKENS = frozenset(("n", "no", "f", "false", "off", "0"))

    def __init__(self, args=""):
        self.args = args

    def type(self):
        return "Line"

    def name(self):
        return "mlpipeline"

    def help(self):
        return (
            "%mlpipeline target=target_col problem=classification|regression [features=col1,col2,...] [model=rf|auto]\n"
            "[save_path=/path/to/model.joblib]\n"
            "Automates an ML pipeline: cleaning, encoding, feature selection, preprocessing, model selection, training, and evaluation."
        )

    def _str_to_obj(self, s):
        """Coerce a raw argument string to int, float, bool, JSON value or str.

        Conversions are tried in that order; the first one that succeeds wins.
        A value wrapped in matching quotes has the quotes stripped. Anything
        else is returned unchanged.
        """
        for cast in (int, float):
            try:
                return cast(s)
            except (TypeError, ValueError):
                pass

        if isinstance(s, str):
            lowered = s.lower()
            if lowered in MLPipeline._TRUE_TOKENS:
                return True
            if lowered in MLPipeline._FALSE_TOKENS:
                return False

        try:
            return json.loads(s)
        except (TypeError, ValueError):
            pass

        if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'):
            return s[1:-1]
        return s

    def parse_args(self, input_str):
        """Parse a ``key=value key2=value2`` string into a dict of coerced values.

        Raises ValueError when a token does not contain ``=``.
        """
        if not input_str or input_str.strip() == "":
            return {}
        pairs = dict(token.split("=", 1) for token in shlex.split(input_str))
        return {key: self._str_to_obj(value) for key, value in pairs.items()}

    def _send_html(self, kernel, df, title=None):
        """Best-effort display of ``df`` as an HTML table, optionally titled."""
        try:
            html = df.to_html(index=False)
            if title:
                # NOTE(review): the heading markup was lost in the reviewed
                # copy of this file; <h4> is a reconstruction - confirm.
                html = f"<h4>{title}</h4>" + html
            kernel.send_response(kernel.iopub_socket, "display_data",
                                 {"data": {"text/html": html}, "metadata": {}})
        except Exception:
            # Display is cosmetic; never let it break the pipeline.
            pass

    def _send_message(self, kernel, channel, message):
        kernel._send_message(channel, f"[MLPipeline] {message}")

    @staticmethod
    def _is_categorical(series):
        """Return True for object, string or categorical columns."""
        dtype = series.dtype
        return (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )

    @staticmethod
    def _onehot_feature_names(encoder, cat_columns, working_df):
        """Return the one-hot output column names for ``cat_columns``."""
        try:
            names = list(encoder.get_feature_names_out(cat_columns))
        except Exception:
            # Fallback for older sklearn or encoders without that method.
            cats = getattr(encoder, "categories_", None)
            if cats is not None:
                names = [
                    f"{cname}_{str(cat)}"
                    for cname, cat_list in zip(cat_columns, cats)
                    for cat in cat_list
                ]
            else:
                # Last resort: pick up columns prefixed with "<column>_".
                names = [
                    col
                    for cname in cat_columns
                    for col in working_df.columns
                    if col.startswith(cname + "_")
                ]
        return [str(name) for name in names]

    def execute(self, kernel, data):
        """Run the whole pipeline on ``data['last_select']``.

        Returns True when every stage completed, False (after reporting the
        problem on stderr) otherwise.
        """
        df = data.get("last_select")
        if df is None or df.empty:
            self._send_message(kernel, "stderr", "No last_select found or DataFrame is empty.")
            return False

        try:
            args = self.parse_args(self.args)
        except Exception as e:
            self._send_message(kernel, "stderr", f"Error parsing arguments: {e}. Use key=value syntax.")
            return False

        # Parse arguments
        target = args.get("target")
        problem = args.get("problem")
        features_arg = args.get("features")
        model_name_arg = args.get("model", "auto")
        save_path = args.get("save_path", None)

        # Validate required arguments
        if not target:
            self._send_message(kernel, "stderr", "target argument is required (target=target_col).")
            return False
        if not problem:
            self._send_message(kernel, "stderr", "problem argument is required (problem=classification|regression).")
            return False
        if problem not in ("classification", "regression"):
            self._send_message(kernel, "stderr", "problem must be 'classification' or 'regression'.")
            return False
        if target not in df.columns:
            self._send_message(kernel, "stderr", f"Target column '{target}' not found in DataFrame.")
            return False

        # Parse features or set to all columns except target if not provided
        if features_arg:
            if isinstance(features_arg, str):
                features = [c.strip() for c in features_arg.split(",") if c.strip()]
            elif isinstance(features_arg, (list, tuple)):
                features = list(features_arg)
            else:
                self._send_message(kernel, "stderr", "features must be comma-separated string or list.")
                return False
        else:
            features = [col for col in df.columns if col != target]
            if not features:
                self._send_message(kernel, "stderr", "No features available after excluding target column.")
                return False

        # Validate features
        missing = [c for c in features if c not in df.columns]
        if missing:
            self._send_message(kernel, "stderr", f"Missing feature columns in DataFrame: {', '.join(missing)}")
            return False

        # Pipeline defaults
        inplace = True
        encode_method = "onehot"
        test_size = 0.2
        val_size = 0.0
        stratify = target if problem == "classification" else None
        shuffle = True
        random_state = None
        model_store_name = "last_model"
        train_name = "last_select_train"
        test_name = "last_select_test"
        val_name = "last_select_val"
        feature_method = "correlation"
        k_features = 5
        primary_metric = "accuracy" if problem == "classification" else "r2"
        cv = 0

        # Work on a copy if not inplace
        working_df = df if inplace else df.copy(deep=True)
        data["last_select"] = working_df

        # Step 1: Handle missing values
        try:
            drop_args = f"columns={','.join(features + [target])}"
            DropMissing(drop_args).execute(kernel, data)
            cur_df = data.get("last_select")
            if cur_df is None or cur_df.empty:
                self._send_message(kernel, "stderr", "DataFrame is empty after dropping missing values.")
                return False
            # Refresh working_df reference after cleaning
            working_df = cur_df
        except Exception as e:
            self._send_message(kernel, "stderr", f"Error handling missing values: {e}")
            return False

        # Step 2: Encode categorical features
        try:
            cat_columns = [
                c for c in features
                if c in working_df.columns and self._is_categorical(working_df[c])
            ]
            if cat_columns:
                encode_args = f"method={encode_method} columns={','.join(cat_columns)} inplace=True drop_original=True"
                # Reset any previous encoder so a stale one is never picked up
                data["last_select_encoder"] = None
                Encode(encode_args).execute(kernel, data)

                # Encode replaces last_select; refresh to see the new columns
                working_df = data.get("last_select", working_df)

                if encode_method == "onehot":
                    encoder = data.get("last_select_encoder")
                    if encoder is None:
                        self._send_message(kernel, "stderr", "Encoder not found after encoding. Ensure %encode saves the encoder to data['last_select_encoder'].")
                        return False
                    try:
                        feature_names = self._onehot_feature_names(encoder, cat_columns, working_df)
                        features = [c for c in features if c not in cat_columns] + feature_names
                    except Exception as e:
                        self._send_message(kernel, "stderr", f"Failed to retrieve encoded feature names: {e}")
                        return False
                elif encode_method == "label":
                    # label encoding created columns <col>_lbl
                    features = [f"{c}_lbl" if c in cat_columns else c for c in features]
                elif encode_method == "ordinal":
                    # ordinal encoding created columns <col>_ord
                    features = [f"{c}_ord" if c in cat_columns else c for c in features]

            # Refresh working_df again (defensive)
            working_df = data.get("last_select", working_df)

            # Verify encoded features exist
            missing_encoded = [f for f in features if f not in working_df.columns]
            if missing_encoded:
                # Helpful debug output: list existing columns derived from
                # the categorical columns (covers the _lbl / _ord suffixes).
                related_columns = [
                    col
                    for c in cat_columns
                    for col in working_df.columns
                    if col.startswith(c + "_")
                ]
                self._send_message(kernel, "stderr", f"Encoded features not found in DataFrame: {', '.join(missing_encoded)}")
                if related_columns:
                    self._send_message(kernel, "stderr", f"Available related columns: {', '.join(related_columns)}")
                return False
        except Exception as e:
            self._send_message(kernel, "stderr", f"Error during encoding: {e}")
            return False

        # Step 3: Feature selection (if features not provided)
        if not features_arg:
            try:
                select_features_args = f"target={target} method={feature_method} k={k_features} problem={problem} inplace={inplace}"
                SelectFeatures(select_features_args).execute(kernel, data)
                features = data.get("selected_features", [])
                if not features:
                    self._send_message(kernel, "stderr", "Feature selection failed to return features.")
                    return False
                # Verify selected features exist
                working_df = data.get("last_select", working_df)
                missing_features = [f for f in features if f not in working_df.columns]
                if missing_features:
                    self._send_message(kernel, "stderr", f"Selected features not found in DataFrame: {', '.join(missing_features)}")
                    return False
            except Exception as e:
                self._send_message(kernel, "stderr", f"Error during feature selection: {e}")
                return False

        # Step 4: Scale numeric features
        try:
            working_df = data.get("last_select", working_df)
            num_columns = [
                c for c in features
                if c in working_df.columns and pd.api.types.is_numeric_dtype(working_df[c])
            ]
            if num_columns:
                scale_args = f"columns={','.join(num_columns)} inplace=True"
                Standardize(scale_args).execute(kernel, data)
        except Exception as e:
            self._send_message(kernel, "stderr", f"Error during scaling: {e}")
            return False

        # Step 5: Split data
        try:
            split_args = f"test_size={test_size} val_size={val_size} shuffle={shuffle} " \
                         f"train_name={train_name} test_name={test_name} val_name={val_name} inplace={inplace}"
            if stratify:
                split_args += f" stratify={stratify}"
            if random_state is not None:
                split_args += f" random_state={random_state}"

            SplitData(split_args).execute(kernel, data)

            # Safely check that the split produced valid DataFrames
            train_df = data.get(train_name)
            test_df = data.get(test_name)

            if train_df is None or train_df.empty or test_df is None or test_df.empty:
                self._send_message(kernel, "stderr", "Data splitting failed to produce non-empty train/test sets.")
                return False
        except Exception as e:
            self._send_message(kernel, "stderr", f"Error during data splitting: {e}")
            return False

        # Step 6: Model selection or training
        try:
            # Treat 'auto' and None the same: use SelectModel
            if not model_name_arg or model_name_arg == "auto":
                select_model_args = (
                    f"features={','.join(features)} target={target} cv=5 "
                    f"primary_metric={primary_metric} problem={problem} output_name={model_store_name} inplace={inplace}"
                )
                SelectModel(select_model_args).execute(kernel, data)
                self._send_message(kernel, "stdout", "Automatically selected best model via SelectModel.")
            else:
                # Train a specific model
                train_args = (
                    f"model={model_name_arg} features={','.join(features)} target={target} "
                    f"model_name={model_store_name} test_name={test_name} cv={cv} inplace={inplace} problem={problem}"
                )
                TrainModel(train_args).execute(kernel, data)
                self._send_message(kernel, "stdout", f"Trained specified model '{model_name_arg}'.")

            # Validate model creation
            model_obj = data.get(model_store_name)
            if model_obj is None:
                self._send_message(
                    kernel, "stderr",
                    f"No model object created. Ensure SelectModel or TrainModel supports problem='{problem}'."
                )
                return False
        except Exception as e:
            self._send_message(kernel, "stderr", f"Error during model training/selection: {e}")
            return False

        # Step 7: Evaluate model
        try:
            eval_args = f"model_name={model_store_name} test_name={test_name} problem={problem}"
            EvaluateModel(eval_args).execute(kernel, data)
        except Exception as e:
            self._send_message(kernel, "stderr", f"Error during model evaluation: {e}")
            return False

        # Step 8: Save model if requested. save_path is optional: without it
        # the trained model simply stays in data[model_store_name].
        if save_path:
            try:
                save_args = f"model_name={model_store_name} save_path={save_path}"
                SaveModel(save_args).execute(kernel, data)
                self._send_message(kernel, "stdout", f"Model saved to {save_path}.")
            except Exception as e:
                self._send_message(kernel, "stderr", f"Error saving model: {e}")
                return False
        else:
            self._send_message(
                kernel, "stdout",
                f"No save_path given; model kept in memory as data['{model_store_name}']."
            )

        # Summary
        self._send_message(kernel, "stdout", "ML pipeline completed successfully.")
        return True
- rf_importance: RandomForest feature importance scores. @@ -95,7 +96,6 @@ def execute(self, kernel, data): kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") return - features_arg = args.get("features") target = args.get("target") method = args.get("method", "correlation").lower() k = args.get("k", 5) @@ -103,28 +103,21 @@ def execute(self, kernel, data): output_name = args.get("output_name", "selected_features") inplace = bool(args.get("inplace", True)) - if not features_arg: - kernel._send_message("stderr", "features argument is required (features=col1,col2...).") - return if not target: kernel._send_message("stderr", "target argument is required (target=target_col).") return - # Parse features - if isinstance(features_arg, str): - features = [c.strip() for c in features_arg.split(",") if c.strip()] - elif isinstance(features_arg, (list, tuple)): - features = list(features_arg) - else: - kernel._send_message("stderr", "features must be comma-separated string or list.") + if target not in df.columns: + kernel._send_message("stderr", f"Target column '{target}' not found in DataFrame.") return - missing = [c for c in features + [target] if c not in df.columns] - if missing: - kernel._send_message("stderr", f"Missing columns in DataFrame: {', '.join(missing)}") + # Use all columns except the target as features + features = [col for col in df.columns if col != target] + if not features: + kernel._send_message("stderr", "No features available after excluding target column.") return - # Determine problem type (same logic as TrainModel) + # Determine problem type if problem_override: problem = problem_override.lower() if problem not in ("classification", "regression"): @@ -153,7 +146,7 @@ def execute(self, kernel, data): kernel._send_message("stderr", "Features contain non-numeric data or unhandled missing values.") return - # Scale data for methods that require it (e.g., chi2 requires non-negative, l1_selection benefits from scaling) + # 
Scale data for methods that require it if method in ("chi2", "l1_selection"): scaler = MinMaxScaler() if method == "chi2" else StandardScaler() try: @@ -165,7 +158,6 @@ def execute(self, kernel, data): # Feature selection try: if method == "correlation": - # Pearson correlation (absolute value) with target correlations = X.corrwith(y, method="pearson").abs() scores = correlations.sort_values(ascending=False) selected_features = scores.head(k).index.tolist() @@ -175,7 +167,6 @@ def execute(self, kernel, data): }) elif method == "rf_importance": - # RandomForest feature importance model = RandomForestClassifier() if problem == "classification" else RandomForestRegressor() model.fit(X, y) importances = pd.Series(model.feature_importances_, index=features) @@ -187,12 +178,11 @@ def execute(self, kernel, data): }) elif method == "rfe": - # Recursive Feature Elimination estimator = RandomForestClassifier() if problem == "classification" else RandomForestRegressor() selector = RFE(estimator, n_features_to_select=k) selector.fit(X, y) ranking = pd.Series(selector.ranking_, index=features) - scores = 1 / (ranking + 1) # Inverse ranking as score (higher is better) + scores = 1 / (ranking + 1) selected_features = ranking[ranking == 1].index.tolist() result_df = pd.DataFrame({ "Feature": ranking.index, @@ -201,7 +191,6 @@ def execute(self, kernel, data): }).sort_values("Score", ascending=False) elif method == "mutual_info": - # Mutual Information score_func = mutual_info_classif if problem == "classification" else mutual_info_regression selector = SelectKBest(score_func=score_func, k=k) selector.fit(X, y) @@ -214,7 +203,6 @@ def execute(self, kernel, data): }) elif method == "chi2": - # Chi-squared (classification only, requires non-negative features) if problem != "classification": kernel._send_message("stderr", "chi2 method is only for classification problems.") return @@ -232,7 +220,6 @@ def execute(self, kernel, data): }) elif method == "anova": - # ANOVA F-test score_func 
= f_classif if problem == "classification" else f_regression selector = SelectKBest(score_func=score_func, k=k) selector.fit(X, y) @@ -245,10 +232,8 @@ def execute(self, kernel, data): }) elif method == "l1_selection": - # L1-based feature selection model = LogisticRegression(penalty="l1", solver="liblinear", max_iter=1000) if problem == "classification" else Lasso(alpha=0.01) model.fit(X, y) - # Use absolute coefficients as importance scores scores = pd.Series(np.abs(model.coef_.ravel() if problem == "classification" else model.coef_), index=features) scores = scores.sort_values(ascending=False) selected_features = scores[scores > 0].head(k).index.tolist() @@ -258,8 +243,7 @@ def execute(self, kernel, data): }) elif method == "variance": - # Variance Threshold - selector = VarianceThreshold(threshold=0.0) # Default threshold, can be customized via args if needed + selector = VarianceThreshold(threshold=0.0) selector.fit(X) variances = pd.Series(selector.variances_, index=features) scores = variances.sort_values(ascending=False) diff --git a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py index 4960dd7..23edc02 100644 --- a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py +++ b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py @@ -11,8 +11,9 @@ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor -from sklearn.neural_network import MLPClassifier +from sklearn.neural_network import MLPClassifier, MLPRegressor +# Optional external libraries _XGBOOST_AVAILABLE = False _LIGHTGBM_AVAILABLE = False _CATBOOST_AVAILABLE = False @@ -36,15 +37,15 @@ class SelectModel(MariaMagic): """ - %select_model features=col1,col2 target=target_col - 
[models=rf,logistic,svm] [cv=5] [metric=accuracy|r2|f1|precision|recall|mse|mae] + %select_model target=target_col + [features=col1,col2] [cv=5] [primary_metric=accuracy|r2|f1|precision|recall|mse|mae] [problem=classification|regression] [output_name=best_model] [inplace=True|False] [model_params={'rf': {'n_estimators': 100}, 'logistic': {'C': 1.0}}] - Select the best model by comparing multiple models on data['last_select'] using cross-validation. - Models: logistic, rf, svm, knn, gbm, ada, mlp, xgboost, lightgbm, catboost (classification); - linear, ridge, lasso, rf, knn, gbm, ada, mlp, xgboost, lightgbm, catboost (regression). - Stores the best model in data[output_name] and displays a table of model performances. + Select the best model by comparing all available models on data['last_select'] using cross-validation. + If features are not provided, uses data['selected_features'] from %select_features. + Tests all metrics (classification: accuracy, f1, precision, recall; regression: r2, mse, mae). + Stores the best model in data[output_name] based on primary_metric and displays a table of performances. 
""" def __init__(self, args=""): self.args = args @@ -99,7 +100,6 @@ def _send_html(self, kernel, df, title=None): pass def _choose_model(self, name, problem, params=None): - # Reuse TrainModel's model selection logic p = params or {} name = name.lower() if name in ("logistic", "logistic_regression", "lr"): @@ -163,45 +163,38 @@ def execute(self, kernel, data): features_arg = args.get("features") target = args.get("target") - models_arg = args.get("models", "rf,logistic,knn") # Default models cv = int(args.get("cv", 5) or 5) - metric = args.get("metric", None) + primary_metric = args.get("primary_metric", None) problem_override = args.get("problem", None) output_name = args.get("output_name", "best_model") inplace = bool(args.get("inplace", True)) model_params = args.get("model_params", {}) or {} - if not features_arg: - kernel._send_message("stderr", "features argument is required (features=col1,col2...).") - return if not target: kernel._send_message("stderr", "target argument is required (target=target_col).") return - # Parse features - if isinstance(features_arg, str): - features = [c.strip() for c in features_arg.split(",") if c.strip()] - elif isinstance(features_arg, (list, tuple)): - features = list(features_arg) - else: - kernel._send_message("stderr", "features must be comma-separated string or list.") - return - - # Parse models - if isinstance(models_arg, str): - models = [m.strip() for m in models_arg.split(",") if m.strip()] - elif isinstance(models_arg, (list, tuple)): - models = list(models_arg) + # Use selected_features if features not provided + if not features_arg: + features = data.get("selected_features") + if not features: + kernel._send_message("stderr", "No features provided and no selected_features found. 
Run %select_features first.") + return else: - kernel._send_message("stderr", "models must be comma-separated string or list.") - return + if isinstance(features_arg, str): + features = [c.strip() for c in features_arg.split(",") if c.strip()] + elif isinstance(features_arg, (list, tuple)): + features = list(features_arg) + else: + kernel._send_message("stderr", "features must be comma-separated string or list.") + return missing = [c for c in features + [target] if c not in df.columns] if missing: kernel._send_message("stderr", f"Missing columns in DataFrame: {', '.join(missing)}") return - # Determine problem type (same logic as TrainModel) + # Determine problem type if problem_override: problem = problem_override.lower() if problem not in ("classification", "regression"): @@ -220,31 +213,45 @@ def execute(self, kernel, data): else: problem = "classification" - # Validate metric - valid_metrics = { + # Define all available models based on problem type + classification_models = ["logistic", "rf", "svm", "knn", "gbm", "ada", "mlp"] + regression_models = ["linear", "ridge", "lasso", "rf", "knn", "gbm", "ada", "mlp"] + if _XGBOOST_AVAILABLE: + classification_models.append("xgboost") + regression_models.append("xgboost") + if _LIGHTGBM_AVAILABLE: + classification_models.append("lightgbm") + regression_models.append("lightgbm") + if _CATBOOST_AVAILABLE: + classification_models.append("catboost") + regression_models.append("catboost") + models = classification_models if problem == "classification" else regression_models + + # Define all metrics + metrics = { "classification": ["accuracy", "f1", "precision", "recall"], "regression": ["r2", "mse", "mae"] } - if metric is None: - metric = "accuracy" if problem == "classification" else "r2" - if metric not in valid_metrics[problem]: - kernel._send_message("stderr", f"Invalid metric '{metric}' for {problem}. 
Choose from {', '.join(valid_metrics[problem])}.") + if primary_metric is None: + primary_metric = "accuracy" if problem == "classification" else "r2" + if primary_metric not in metrics[problem]: + kernel._send_message("stderr", f"Invalid primary_metric '{primary_metric}' for {problem}. Choose from {', '.join(metrics[problem])}.") return # Prepare data X = df[features].copy() y = df[target].copy() - # Handle missing values (simple imputation) + # Handle missing values X = X.fillna(X.mean(numeric_only=True)) if problem == "regression" else X.fillna(X.mode().iloc[0]) if X.isna().any().any(): kernel._send_message("stderr", "Features contain non-numeric data or unhandled missing values.") return - # Evaluate models + # Evaluate models across all metrics results = [] best_model = None - best_score = -float("inf") if metric not in ("mse", "mae") else float("inf") + best_score = -float("inf") if primary_metric not in ("mse", "mae") else float("inf") best_model_name = None for model_name in models: @@ -252,32 +259,33 @@ def execute(self, kernel, data): # Get model-specific parameters params = model_params.get(model_name, {}) if isinstance(model_params, dict) else {} model = self._choose_model(model_name, problem, params) - scoring = metric if metric in ("accuracy", "f1", "precision", "recall", "r2") else ( - "neg_mean_squared_error" if metric == "mse" else "neg_mean_absolute_error" - ) - cv_scores = cross_val_score(model, X, y, cv=cv, scoring=scoring) - mean_score = np.mean(cv_scores) - std_score = np.std(cv_scores) - - # Adjust score for negative metrics (mse, mae) - if metric in ("mse", "mae"): - mean_score = -mean_score # Convert back to positive for reporting - - results.append({ - "Model": model_name, - "Mean_Score": mean_score, - "Std_Score": std_score - }) - - # Update best model (maximize for accuracy, f1, precision, recall, r2; minimize for mse, mae) - if metric in ("mse", "mae"): - if mean_score < best_score: - best_score = mean_score + model_result = {"Model": 
model_name} + + # Evaluate all metrics + for metric in metrics[problem]: + scoring = metric if metric in ("accuracy", "f1", "precision", "recall", "r2") else ( + "neg_mean_squared_error" if metric == "mse" else "neg_mean_absolute_error" + ) + cv_scores = cross_val_score(model, X, y, cv=cv, scoring=scoring) + mean_score = np.mean(cv_scores) + std_score = np.std(cv_scores) + if metric in ("mse", "mae"): + mean_score = -mean_score # Convert to positive + model_result[f"{metric}_Mean"] = mean_score + model_result[f"{metric}_Std"] = std_score + + results.append(model_result) + + # Update best model based on primary_metric + current_score = model_result[f"{primary_metric}_Mean"] + if primary_metric in ("mse", "mae"): + if current_score < best_score: + best_score = current_score best_model = model best_model_name = model_name else: - if mean_score > best_score: - best_score = mean_score + if current_score > best_score: + best_score = current_score best_model = model best_model_name = model_name @@ -290,9 +298,11 @@ def execute(self, kernel, data): return # Create results DataFrame - result_df = pd.DataFrame(results).sort_values("Mean_Score", ascending=metric in ("mse", "mae")) - result_df["Mean_Score"] = result_df["Mean_Score"].round(4) - result_df["Std_Score"] = result_df["Std_Score"].round(4) + result_df = pd.DataFrame(results) + for metric in metrics[problem]: + result_df[f"{metric}_Mean"] = result_df[f"{metric}_Mean"].round(4) + result_df[f"{metric}_Std"] = result_df[f"{metric}_Std"].round(4) + result_df = result_df.sort_values(f"{primary_metric}_Mean", ascending=primary_metric in ("mse", "mae")) # Fit the best model on the full training data try: @@ -309,7 +319,7 @@ def execute(self, kernel, data): "problem": problem, "features": features, "target": target, - "metric": metric, + "primary_metric": primary_metric, "cv": cv, "score": float(best_score), "all_results": result_df.to_dict() @@ -321,7 +331,7 @@ def execute(self, kernel, data): return # Display results - 
self._send_html(kernel, result_df, title=f"Model Selection Results (metric={metric})") - kernel._send_message("stdout", f"Best model '{best_model_name}' (mean {metric}={best_score:.4f}) saved to data['{output_name}'].") + self._send_html(kernel, result_df, title=f"Model Selection Results (primary_metric={primary_metric})") + kernel._send_message("stdout", f"Best model '{best_model_name}' (mean {primary_metric}={best_score:.4f}) saved to data['{output_name}'].") return \ No newline at end of file diff --git a/mariadb_kernel/maria_magics/supported_magics.py b/mariadb_kernel/maria_magics/supported_magics.py index aadd450..9c18f7f 100644 --- a/mariadb_kernel/maria_magics/supported_magics.py +++ b/mariadb_kernel/maria_magics/supported_magics.py @@ -29,6 +29,7 @@ from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.select_model import SelectModel from mariadb_kernel.maria_magics.ml_commands.model_training.loadmodel import LoadModel from mariadb_kernel.maria_magics.ml_commands.model_training.predict import Predict +from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.ml_pipeline import MLPipeline def get(): return { @@ -57,4 +58,5 @@ def get(): "predict": Predict, "select_features": SelectFeatures, "select_model": SelectModel, + "ml_pipeline": MLPipeline, } diff --git a/test.py b/test.py deleted file mode 100644 index 8e23576..0000000 --- a/test.py +++ /dev/null @@ -1 +0,0 @@ -print("Hello World") \ No newline at end of file From 2cb79d249921f8b0fb3052d12a40375753001338 Mon Sep 17 00:00:00 2001 From: JainSneha6 Date: Mon, 27 Oct 2025 19:21:36 +0000 Subject: [PATCH 19/38] Removing eggs folder --- .gitignore | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.gitignore b/.gitignore index 14e75e1..bccfebe 100644 --- a/.gitignore +++ b/.gitignore @@ -17,7 +17,3 @@ mariadb_kernel/_version.py .vscode/ .eggs/ -catboost_info/ -mariadb_kernel.egg-info/ -models - From 168613f658a2557da36981d7c10af41cc82cb081 Mon Sep 17 00:00:00 2001 From: JainSneha6 Date: Mon, 27 Oct 2025 
19:24:19 +0000 Subject: [PATCH 20/38] Removed files --- .eggs/README.txt | 6 - .../EGG-INFO/PKG-INFO | 185 --- .../EGG-INFO/RECORD | 43 - .../EGG-INFO/WHEEL | 5 - .../EGG-INFO/entry_points.txt | 49 - .../EGG-INFO/licenses/LICENSE | 17 - .../EGG-INFO/requires.txt | 9 - .../EGG-INFO/top_level.txt | 1 - .../setuptools_scm/.git_archival.txt | 3 - .../setuptools_scm/__init__.py | 30 - .../setuptools_scm/__main__.py | 6 - .../setuptools_scm/_cli.py | 291 ----- .../setuptools_scm/_compat.py | 65 -- .../setuptools_scm/_config.py | 318 ------ .../setuptools_scm/_entrypoints.py | 126 --- .../setuptools_scm/_file_finders/__init__.py | 113 -- .../setuptools_scm/_file_finders/git.py | 124 -- .../setuptools_scm/_file_finders/hg.py | 72 -- .../setuptools_scm/_file_finders/pathtools.py | 9 - .../setuptools_scm/_get_version_impl.py | 250 ---- .../setuptools_scm/_integration/__init__.py | 0 .../_integration/deprecation.py | 20 - .../_integration/dump_version.py | 128 --- .../_integration/pyproject_reading.py | 285 ----- .../setuptools_scm/_integration/setup_cfg.py | 46 - .../setuptools_scm/_integration/setuptools.py | 159 --- .../setuptools_scm/_integration/toml.py | 69 -- .../_integration/version_inference.py | 141 --- .../setuptools_scm/_log.py | 87 -- .../setuptools_scm/_modify_version.py | 61 - .../setuptools_scm/_node_utils.py | 46 - .../setuptools_scm/_overrides.py | 298 ----- .../setuptools_scm/_requirement_cls.py | 34 - .../setuptools_scm/_run_cmd.py | 221 ---- .../setuptools_scm/_types.py | 61 - .../setuptools_scm/_version_cls.py | 101 -- .../setuptools_scm/discover.py | 74 -- .../setuptools_scm/fallbacks.py | 45 - .../setuptools_scm/git.py | 454 -------- .../setuptools_scm/hg.py | 308 ----- .../setuptools_scm/hg_git.py | 181 --- .../setuptools_scm/integration.py | 31 - .../setuptools_scm/py.typed | 0 .../setuptools_scm/scm_workdir.py | 54 - .../setuptools_scm/version.py | 583 ---------- .gitignore | 3 + catboost_info/catboost_training.json | 1004 ----------------- 
catboost_info/learn/events.out.tfevents | Bin 54870 -> 0 bytes catboost_info/learn_error.tsv | 1001 ---------------- catboost_info/time_left.tsv | 1001 ---------------- mariadb_kernel.egg-info/PKG-INFO | 76 -- mariadb_kernel.egg-info/SOURCES.txt | 96 -- mariadb_kernel.egg-info/dependency_links.txt | 1 - mariadb_kernel.egg-info/requires.txt | 9 - mariadb_kernel.egg-info/top_level.txt | 1 - models/test_model.joblib | Bin 146537 -> 0 bytes 56 files changed, 3 insertions(+), 8398 deletions(-) delete mode 100644 .eggs/README.txt delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/PKG-INFO delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/RECORD delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/WHEEL delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/entry_points.txt delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/licenses/LICENSE delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/requires.txt delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/top_level.txt delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/.git_archival.txt delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/__init__.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/__main__.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_cli.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_compat.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_config.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_entrypoints.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/__init__.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/git.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/hg.py delete mode 100644 
.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/pathtools.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_get_version_impl.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/__init__.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/deprecation.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/dump_version.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/pyproject_reading.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/setup_cfg.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/setuptools.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/toml.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/version_inference.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_log.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_modify_version.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_node_utils.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_overrides.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_requirement_cls.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_run_cmd.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_types.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_version_cls.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/discover.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/fallbacks.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/git.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/hg.py delete 
mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/hg_git.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/integration.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/py.typed delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/scm_workdir.py delete mode 100644 .eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/version.py delete mode 100644 catboost_info/catboost_training.json delete mode 100644 catboost_info/learn/events.out.tfevents delete mode 100644 catboost_info/learn_error.tsv delete mode 100644 catboost_info/time_left.tsv delete mode 100644 mariadb_kernel.egg-info/PKG-INFO delete mode 100644 mariadb_kernel.egg-info/SOURCES.txt delete mode 100644 mariadb_kernel.egg-info/dependency_links.txt delete mode 100644 mariadb_kernel.egg-info/requires.txt delete mode 100644 mariadb_kernel.egg-info/top_level.txt delete mode 100644 models/test_model.joblib diff --git a/.eggs/README.txt b/.eggs/README.txt deleted file mode 100644 index 5d01668..0000000 --- a/.eggs/README.txt +++ /dev/null @@ -1,6 +0,0 @@ -This directory contains eggs that were downloaded by setuptools to build, test, and run plug-ins. - -This directory caches those eggs to prevent repeated downloads. - -However, it is safe to delete this directory. 
- diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/PKG-INFO b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/PKG-INFO deleted file mode 100644 index 0a1de09..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/PKG-INFO +++ /dev/null @@ -1,185 +0,0 @@ -Metadata-Version: 2.4 -Name: setuptools-scm -Version: 9.2.2 -Summary: the blessed package to manage your versions by scm tags -Author-email: Ronny Pfannschmidt -License: Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. 
- -Project-URL: documentation, https://setuptools-scm.readthedocs.io/ -Project-URL: repository, https://github.com/pypa/setuptools-scm/ -Classifier: Development Status :: 5 - Production/Stable -Classifier: Intended Audience :: Developers -Classifier: License :: OSI Approved :: MIT License -Classifier: Programming Language :: Python -Classifier: Programming Language :: Python :: 3 :: Only -Classifier: Programming Language :: Python :: 3.8 -Classifier: Programming Language :: Python :: 3.9 -Classifier: Programming Language :: Python :: 3.10 -Classifier: Programming Language :: Python :: 3.11 -Classifier: Programming Language :: Python :: 3.12 -Classifier: Programming Language :: Python :: 3.13 -Classifier: Topic :: Software Development :: Libraries -Classifier: Topic :: Software Development :: Version Control -Classifier: Topic :: System :: Software Distribution -Classifier: Topic :: Utilities -Requires-Python: >=3.8 -Description-Content-Type: text/markdown -License-File: LICENSE -Requires-Dist: packaging>=20 -Requires-Dist: setuptools -Requires-Dist: tomli>=1; python_version < "3.11" -Requires-Dist: typing-extensions; python_version < "3.10" -Provides-Extra: rich -Requires-Dist: rich; extra == "rich" -Provides-Extra: simple -Provides-Extra: toml -Dynamic: license-file - -# setuptools-scm -[![github ci](https://github.com/pypa/setuptools-scm/actions/workflows/python-tests.yml/badge.svg)](https://github.com/pypa/setuptools-scm/actions/workflows/python-tests.yml) -[![Documentation Status](https://readthedocs.org/projects/setuptools-scm/badge/?version=latest)](https://setuptools-scm.readthedocs.io/en/latest/?badge=latest) -[![tidelift](https://tidelift.com/badges/package/pypi/setuptools-scm) ](https://tidelift.com/subscription/pkg/pypi-setuptools-scm?utm_source=pypi-setuptools-scm&utm_medium=readme) - -## about - -[setuptools-scm] extracts Python package versions from `git` or `hg` metadata -instead of declaring them as the version argument -or in a Source Code Managed 
(SCM) managed file. - -Additionally [setuptools-scm] provides `setuptools` with a list of -files that are managed by the SCM -
-(i.e. it automatically adds all the SCM-managed files to the sdist). -
-Unwanted files must be excluded via `MANIFEST.in` -or [configuring Git archive][git-archive-docs]. - -> **⚠️ Important:** Installing setuptools-scm automatically enables a file finder that includes **all SCM-tracked files** in your source distributions. This can be surprising if you have development files tracked in Git/Mercurial that you don't want in your package. Use `MANIFEST.in` to exclude unwanted files. See the [documentation] for details. - -## `pyproject.toml` usage - -The preferred way to configure [setuptools-scm] is to author -settings in a `tool.setuptools_scm` section of `pyproject.toml`. - -This feature requires setuptools 61 or later (recommended: >=80 for best compatibility). -First, ensure that [setuptools-scm] is present during the project's -build step by specifying it as one of the build requirements. - -```toml title="pyproject.toml" -[build-system] -requires = ["setuptools>=80", "setuptools-scm>=8"] -build-backend = "setuptools.build_meta" -``` - -That will be sufficient to require [setuptools-scm] for projects -that support [PEP 518] like [pip] and [build]. - -[pip]: https://pypi.org/project/pip -[build]: https://pypi.org/project/build -[PEP 518]: https://peps.python.org/pep-0518/ - - -To enable version inference, you need to set the version -dynamically in the `project` section of `pyproject.toml`: - -```toml title="pyproject.toml" -[project] -# version = "0.0.1" # Remove any existing version parameter. -dynamic = ["version"] - -[tool.setuptools_scm] -``` - -!!! note "Simplified Configuration" - - Starting with setuptools-scm 8.1+, if `setuptools_scm` (or `setuptools-scm`) is - present in your `build-system.requires`, the `[tool.setuptools_scm]` section - becomes optional! 
You can now enable setuptools-scm with just: - - ```toml title="pyproject.toml" - [build-system] - requires = ["setuptools>=80", "setuptools-scm>=8"] - build-backend = "setuptools.build_meta" - - [project] - dynamic = ["version"] - ``` - - The `[tool.setuptools_scm]` section is only needed if you want to customize - configuration options. - -Additionally, a version file can be written by specifying: - -```toml title="pyproject.toml" -[tool.setuptools_scm] -version_file = "pkg/_version.py" -``` - -Where `pkg` is the name of your package. - -If you need to confirm which version string is being generated or debug the configuration, -you can install [setuptools-scm] directly in your working environment and run: - -```console -$ python -m setuptools_scm -# To explore other options, try: -$ python -m setuptools_scm --help -``` - -For further configuration see the [documentation]. - -[setuptools-scm]: https://github.com/pypa/setuptools-scm -[documentation]: https://setuptools-scm.readthedocs.io/ -[git-archive-docs]: https://setuptools-scm.readthedocs.io/en/stable/usage/#builtin-mechanisms-for-obtaining-version-numbers - - -## Interaction with Enterprise Distributions - -Some enterprise distributions like RHEL7 -ship rather old setuptools versions. - -In those cases its typically possible to build by using an sdist against `setuptools-scm<2.0`. -As those old setuptools versions lack sensible types for versions, -modern [setuptools-scm] is unable to support them sensibly. - -It's strongly recommended to build a wheel artifact using modern Python and setuptools, -then installing the artifact instead of trying to run against old setuptools versions. - -!!! note "Legacy Setuptools Support" - While setuptools-scm recommends setuptools >=80, it maintains compatibility with setuptools 61+ - to support legacy deployments that cannot easily upgrade. Support for setuptools <80 is deprecated - and will be removed in a future release. 
This allows enterprise environments and older CI/CD systems - to continue using setuptools-scm while still encouraging adoption of newer versions. - - -## Code of Conduct - - -Everyone interacting in the [setuptools-scm] project's codebases, issue -trackers, chat rooms, and mailing lists is expected to follow the -[PSF Code of Conduct]. - -[PSF Code of Conduct]: https://github.com/pypa/.github/blob/main/CODE_OF_CONDUCT.md - - -## Security Contact - -To report a security vulnerability, please use the -[Tidelift security contact](https://tidelift.com/security). -Tidelift will coordinate the fix and disclosure. diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/RECORD b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/RECORD deleted file mode 100644 index cdee4cc..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/RECORD +++ /dev/null @@ -1,43 +0,0 @@ -setuptools_scm/.git_archival.txt,sha256=2_90kdS1POSQMuZfBCUw6qNjObu7Ijp8DmptEAmlGkU,102 -setuptools_scm/__init__.py,sha256=k4jjJK8ejFI95amIoLWNCFECWIQW9NlxF9Had4RqOHM,785 -setuptools_scm/__main__.py,sha256=AhntzdNH3Jhcio_Ohoc6_EW7CuIN02OM-0irpGEXXh0,116 -setuptools_scm/_cli.py,sha256=btP9GJ66NIymXwaJNV_MpHg_2V5vZ444TWWELnoHplQ,9427 -setuptools_scm/_compat.py,sha256=CNt8TT3vdaHNsxWqIaSIoVOovfjnyprrx9RLyCD6-f0,2193 -setuptools_scm/_config.py,sha256=v2pRxWCJ4dzTL8a4p_tKsjvxJTDkMDA_oULCwSlACdc,10769 -setuptools_scm/_entrypoints.py,sha256=5ix0F8RAqRaP7ketm0O_IfsOdYKwZqDaKNU1gOKHImA,3697 -setuptools_scm/_get_version_impl.py,sha256=_FGFkXyk8ACUWinpA07L7Z43q5FBsa5juw6k4GPWZWw,9000 -setuptools_scm/_log.py,sha256=ulRwblEzYhkkbyTb2P-217GPpwn-z_Tz_lsd9pTL0RQ,2070 -setuptools_scm/_modify_version.py,sha256=9VU-juFg2IZjrcyz9kLGRfBq4RyZZElhjPMipqjB3Xc,1738 -setuptools_scm/_node_utils.py,sha256=ORxu526O4ruEUYHEHgcxl6punKdWY5K4kWA5e6zUU8Y,1310 -setuptools_scm/_overrides.py,sha256=AXYCpB4OCgFBRnaRWcbQA57jSmar4SAWB-uGHJZ0L6c,10520 -setuptools_scm/_requirement_cls.py,sha256=1q276rt4ZYafSiyN_coIWW3eHZG004TM-iZ130DEoJA,1100 
-setuptools_scm/_run_cmd.py,sha256=84edY5QRqKdSJxkdjlUwIb0ztlOi1bMPC1uOxtYSzP0,6193 -setuptools_scm/_types.py,sha256=7ytOld6LZJzDegaamZC4-6ukQhVx8b1OdJbyzpcSeVI,1765 -setuptools_scm/_version_cls.py,sha256=YiD0IMtcKQq-eWfrQHAZLT5VKs598nHp0kzLmCaoxqo,3256 -setuptools_scm/discover.py,sha256=kelrYHy_LSsMuFN7QhYn7iq6xifWCIPBI4bBGxl8AI8,2069 -setuptools_scm/fallbacks.py,sha256=x3Xv1p89AqJiBX6oxuoo8Di0yR5ijOFOwKBJGAeWTbY,1448 -setuptools_scm/git.py,sha256=Z7ByutjME4JB-h7LXryIaSMY-LbqHsbKsqFNrZXZgkM,14964 -setuptools_scm/hg.py,sha256=COpRbHHgMcPMnrjP3ROmM8nrGmpQb6GIfHTzHMP-etY,10733 -setuptools_scm/hg_git.py,sha256=JLTFvUMMJnN-5zZwNOerMkvlqPJrkKTRAlTTTNl6neM,5540 -setuptools_scm/integration.py,sha256=n3FleU_zlCqIp6pv0PKrUx83q8Alc9Er420u_aUoRqI,806 -setuptools_scm/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -setuptools_scm/scm_workdir.py,sha256=HULdWY2vmYWz-nD3sZibeMsSXvbemNgY9CoH4tGvHKU,1466 -setuptools_scm/version.py,sha256=vX2fpwNrlBai9MQtka9fQL83zyB9ZTkvu6RgSBgTuvc,19474 -setuptools_scm/_file_finders/__init__.py,sha256=WPeWsbyrGOGqsjxc_dkYkUzWHXxUQFCzxMrAsSby-jg,3732 -setuptools_scm/_file_finders/git.py,sha256=4IsnFiTz-iyrm3R5ih6HArrGJzImLVdY9zsmMiESOg8,4434 -setuptools_scm/_file_finders/hg.py,sha256=fK_mTX-feWSyGYO8WWN_hVVt2wiulqXLO1qePLZ5h2A,2227 -setuptools_scm/_file_finders/pathtools.py,sha256=AgOl5u_WHxCQeiUCwlN8bUE3B4vs5BxSJEK1LJutyus,179 -setuptools_scm/_integration/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -setuptools_scm/_integration/deprecation.py,sha256=ZnC8Yr8RBUCGjlwPAZB0Rj0fsUkbbt9mqfgIkyYYU30,786 -setuptools_scm/_integration/dump_version.py,sha256=3zJESs0T-XJS2qm0yiyMWfh8MN9n5lGW8_qVS9M8Ew8,3219 -setuptools_scm/_integration/pyproject_reading.py,sha256=RIpa82xFR4J_bC4rWQtpg-zWGlt88m0aqK85IwKIz5c,9252 -setuptools_scm/_integration/setup_cfg.py,sha256=eM88lHpyG7HkUUQ3Lz75aeha8jS_vAhLJKIL4lfJLho,1302 -setuptools_scm/_integration/setuptools.py,sha256=pcbe-J-P_ELNo2FW8y20EnlGrzp9NmCB64idnpLOwNg,5142 
-setuptools_scm/_integration/toml.py,sha256=9PIJGUBRAdxDyKM5XsweTSDERSUCdWBQfXYOAXCQVGI,1882 -setuptools_scm/_integration/version_inference.py,sha256=26iPIq8402dBf6FS7lo5IfqNqHgwAxE_T6fLKnKklc8,4195 -setuptools_scm-9.2.2.dist-info/licenses/LICENSE,sha256=iYB6zyMJvShfAzQE7nhYFgLzzZuBmhasLw5fYP9KRz4,1023 -setuptools_scm-9.2.2.dist-info/METADATA,sha256=_OGZb2ixEINe2f-PV2DzadHpkhgFbs57bCPCLi3ktcw,7749 -setuptools_scm-9.2.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91 -setuptools_scm-9.2.2.dist-info/entry_points.txt,sha256=sdHLPEpsB8x6KvWlekw5G1qo-huQlrmh-Lk-EfIucxE,1933 -setuptools_scm-9.2.2.dist-info/top_level.txt,sha256=kiu-91q3_rJLUoc2wl8_lC4cIlpgtgdD_4NaChF4hOA,15 -setuptools_scm-9.2.2.dist-info/RECORD,, diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/WHEEL b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/WHEEL deleted file mode 100644 index e7fa31b..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/WHEEL +++ /dev/null @@ -1,5 +0,0 @@ -Wheel-Version: 1.0 -Generator: setuptools (80.9.0) -Root-Is-Purelib: true -Tag: py3-none-any - diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/entry_points.txt b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/entry_points.txt deleted file mode 100644 index 24009c4..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/entry_points.txt +++ /dev/null @@ -1,49 +0,0 @@ -[console_scripts] -setuptools-scm = setuptools_scm._cli:main - -[distutils.setup_keywords] -use_scm_version = setuptools_scm._integration.setuptools:version_keyword - -[pipx.run] -setuptools-scm = setuptools_scm._cli:main -setuptools_scm = setuptools_scm._cli:main - -[setuptools.file_finders] -setuptools_scm = setuptools_scm._file_finders:find_files - -[setuptools.finalize_distribution_options] -setuptools_scm = setuptools_scm._integration.setuptools:infer_version - -[setuptools_scm.files_command] -.git = setuptools_scm._file_finders.git:git_find_files -.hg = setuptools_scm._file_finders.hg:hg_find_files - 
-[setuptools_scm.files_command_fallback] -.git_archival.txt = setuptools_scm._file_finders.git:git_archive_find_files -.hg_archival.txt = setuptools_scm._file_finders.hg:hg_archive_find_files - -[setuptools_scm.local_scheme] -dirty-tag = setuptools_scm.version:get_local_dirty_tag -no-local-version = setuptools_scm.version:get_no_local_node -node-and-date = setuptools_scm.version:get_local_node_and_date -node-and-timestamp = setuptools_scm.version:get_local_node_and_timestamp - -[setuptools_scm.parse_scm] -.git = setuptools_scm.git:parse -.hg = setuptools_scm.hg:parse - -[setuptools_scm.parse_scm_fallback] -.git_archival.txt = setuptools_scm.git:parse_archival -.hg_archival.txt = setuptools_scm.hg:parse_archival -PKG-INFO = setuptools_scm.fallbacks:parse_pkginfo -pyproject.toml = setuptools_scm.fallbacks:fallback_version -setup.py = setuptools_scm.fallbacks:fallback_version - -[setuptools_scm.version_scheme] -calver-by-date = setuptools_scm.version:calver_by_date -guess-next-dev = setuptools_scm.version:guess_next_dev_version -no-guess-dev = setuptools_scm.version:no_guess_dev_version -only-version = setuptools_scm.version:only_version -post-release = setuptools_scm.version:postrelease_version -python-simplified-semver = setuptools_scm.version:simplified_semver_version -release-branch-semver = setuptools_scm.version:release_branch_semver_version diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/licenses/LICENSE b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/licenses/LICENSE deleted file mode 100644 index 89de354..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/licenses/LICENSE +++ /dev/null @@ -1,17 +0,0 @@ -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and 
to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/requires.txt b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/requires.txt deleted file mode 100644 index b350a80..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/requires.txt +++ /dev/null @@ -1,9 +0,0 @@ -packaging>=20 -setuptools - -[rich] -rich - -[simple] - -[toml] diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/top_level.txt b/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/top_level.txt deleted file mode 100644 index cba8d88..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/EGG-INFO/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -setuptools_scm diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/.git_archival.txt b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/.git_archival.txt deleted file mode 100644 index 7c51009..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/.git_archival.txt +++ /dev/null @@ -1,3 +0,0 @@ -node: $Format:%H$ -node-date: $Format:%cI$ -describe-name: $Format:%(describe:tags=true,match=*[0-9]*)$ diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/__init__.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/__init__.py deleted file mode 100644 index e265e85..0000000 --- 
a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -""" -:copyright: 2010-2023 by Ronny Pfannschmidt -:license: MIT -""" - -from __future__ import annotations - -from ._config import DEFAULT_LOCAL_SCHEME -from ._config import DEFAULT_VERSION_SCHEME -from ._config import Configuration -from ._get_version_impl import _get_version -from ._get_version_impl import get_version -from ._integration.dump_version import dump_version # soft deprecated -from ._version_cls import NonNormalizedVersion -from ._version_cls import Version -from .version import ScmVersion - -# Public API -__all__ = [ - "DEFAULT_LOCAL_SCHEME", - "DEFAULT_VERSION_SCHEME", - "Configuration", - "NonNormalizedVersion", - "ScmVersion", - "Version", - "_get_version", - "dump_version", - # soft deprecated imports, left for backward compatibility - "get_version", -] diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/__main__.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/__main__.py deleted file mode 100644 index 3f56d42..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/__main__.py +++ /dev/null @@ -1,6 +0,0 @@ -from __future__ import annotations - -from ._cli import main - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_cli.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_cli.py deleted file mode 100644 index 1f104f4..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_cli.py +++ /dev/null @@ -1,291 +0,0 @@ -from __future__ import annotations - -import argparse -import json -import os -import sys - -from pathlib import Path -from typing import Any - -from setuptools_scm import Configuration -from setuptools_scm._file_finders import find_files -from setuptools_scm._get_version_impl import _get_version -from setuptools_scm.discover import walk_potential_roots - - -def main(args: list[str] | None = None) -> int: - opts = 
_get_cli_opts(args) - inferred_root: str = opts.root or "." - - pyproject = opts.config or _find_pyproject(inferred_root) - - try: - config = Configuration.from_file( - pyproject, - root=(os.path.abspath(opts.root) if opts.root is not None else None), - ) - except (LookupError, FileNotFoundError) as ex: - # no pyproject.toml OR no [tool.setuptools_scm] - print( - f"Warning: could not use {os.path.relpath(pyproject)}," - " using default configuration.\n" - f" Reason: {ex}.", - file=sys.stderr, - ) - config = Configuration(root=inferred_root) - version: str | None - if opts.no_version: - version = "0.0.0+no-version-was-requested.fake-version" - else: - version = _get_version( - config, force_write_version_files=opts.force_write_version_files - ) - if version is None: - raise SystemExit("ERROR: no version found for", opts) - if opts.strip_dev: - version = version.partition(".dev")[0] - - return command(opts, version, config) - - -def _get_cli_opts(args: list[str] | None) -> argparse.Namespace: - prog = "python -m setuptools_scm" - desc = "Print project version according to SCM metadata" - parser = argparse.ArgumentParser(prog, description=desc) - # By default, help for `--help` starts with lower case, so we keep the pattern: - parser.add_argument( - "-r", - "--root", - default=None, - help='directory managed by the SCM, default: inferred from config file, or "."', - ) - parser.add_argument( - "-c", - "--config", - default=None, - metavar="PATH", - help="path to 'pyproject.toml' with setuptools-scm config, " - "default: looked up in the current or parent directories", - ) - parser.add_argument( - "--strip-dev", - action="store_true", - help="remove the dev/local parts of the version before printing the version", - ) - parser.add_argument( - "-N", - "--no-version", - action="store_true", - help="do not include package version in the output", - ) - output_formats = ["json", "plain", "key-value"] - parser.add_argument( - "-f", - "--format", - type=str.casefold, - 
default="plain", - help="specify output format", - choices=output_formats, - ) - parser.add_argument( - "-q", - "--query", - type=str.casefold, - nargs="*", - help="display setuptools-scm settings according to query, " - "e.g. dist_name, do not supply an argument in order to " - "print a list of valid queries.", - ) - parser.add_argument( - "--force-write-version-files", - action="store_true", - help="trigger to write the content of the version files\n" - "its recommended to use normal/editable installation instead)", - ) - sub = parser.add_subparsers(title="extra commands", dest="command", metavar="") - # We avoid `metavar` to prevent printing repetitive information - desc = "List information about the package, e.g. included files" - sub.add_parser("ls", help=desc[0].lower() + desc[1:], description=desc) - - # Add create-archival-file subcommand - archival_desc = "Create .git_archival.txt file for git archive support" - archival_parser = sub.add_parser( - "create-archival-file", - help=archival_desc[0].lower() + archival_desc[1:], - description=archival_desc, - ) - archival_group = archival_parser.add_mutually_exclusive_group(required=True) - archival_group.add_argument( - "--stable", - action="store_true", - help="create stable archival file (recommended, no branch names)", - ) - archival_group.add_argument( - "--full", - action="store_true", - help="create full archival file with branch information (can cause instability)", - ) - archival_parser.add_argument( - "--force", action="store_true", help="overwrite existing .git_archival.txt file" - ) - return parser.parse_args(args) - - -# flake8: noqa: C901 -def command(opts: argparse.Namespace, version: str, config: Configuration) -> int: - data: dict[str, Any] = {} - - if opts.command == "ls": - opts.query = ["files"] - - if opts.command == "create-archival-file": - return _create_archival_file(opts, config) - - if opts.query == []: - opts.no_version = True - sys.stderr.write("Available queries:\n\n") - opts.query 
= ["queries"] - data["queries"] = ["files", *config.__dataclass_fields__] - - if opts.query is None: - opts.query = [] - - if not opts.no_version: - data["version"] = version - - if "files" in opts.query: - data["files"] = find_files(config.root) - - for q in opts.query: - if q in ["files", "queries", "version"]: - continue - - try: - if q.startswith("_"): - raise AttributeError() - data[q] = getattr(config, q) - except AttributeError: - sys.stderr.write(f"Error: unknown query: '{q}'\n") - return 1 - - if opts.format == "json": - print(json.dumps(data, indent=2)) - - if opts.format == "plain": - _print_plain(data) - - if opts.format == "key-value": - _print_key_value(data) - - return 0 - - -def _print_plain(data: dict[str, Any]) -> None: - version = data.pop("version", None) - if version: - print(version) - files = data.pop("files", []) - for file_ in files: - print(file_) - queries = data.pop("queries", []) - for query in queries: - print(query) - if data: - print("\n".join(data.values())) - - -def _print_key_value(data: dict[str, Any]) -> None: - for key, value in data.items(): - if isinstance(value, str): - print(f"{key} = {value}") - else: - str_value = "\n ".join(value) - print(f"{key} = {str_value}") - - -def _find_pyproject(parent: str) -> str: - for directory in walk_potential_roots(os.path.abspath(parent)): - pyproject = os.path.join(directory, "pyproject.toml") - if os.path.isfile(pyproject): - return pyproject - - return os.path.abspath( - "pyproject.toml" - ) # use default name to trigger the default errors - - -def _create_archival_file(opts: argparse.Namespace, config: Configuration) -> int: - """Create .git_archival.txt file with appropriate content.""" - archival_path = Path(config.root, ".git_archival.txt") - - # Check if file exists and force flag - if archival_path.exists() and not opts.force: - print( - f"Error: {archival_path} already exists. 
Use --force to overwrite.", - file=sys.stderr, - ) - return 1 - - if opts.stable: - content = _get_stable_archival_content() - print("Creating stable .git_archival.txt (recommended for releases)") - elif opts.full: - content = _get_full_archival_content() - print("Creating full .git_archival.txt with branch information") - print("WARNING: This can cause archive checksums to be unstable!") - - try: - archival_path.write_text(content, encoding="utf-8") - print(f"Created: {archival_path}") - - gitattributes_path = Path(config.root, ".gitattributes") - needs_gitattributes = True - - if gitattributes_path.exists(): - # TODO: more nuanced check later - gitattributes_content = gitattributes_path.read_text("utf-8") - if ( - ".git_archival.txt" in gitattributes_content - and "export-subst" in gitattributes_content - ): - needs_gitattributes = False - - if needs_gitattributes: - print("\nNext steps:") - print("1. Add this line to .gitattributes:") - print(" .git_archival.txt export-subst") - print("2. Commit both files:") - print(" git add .git_archival.txt .gitattributes") - print(" git commit -m 'add git archive support'") - else: - print("\nNext step:") - print("Commit the archival file:") - print(" git add .git_archival.txt") - print(" git commit -m 'update git archival file'") - - return 0 - except OSError as e: - print(f"Error: Could not create {archival_path}: {e}", file=sys.stderr) - return 1 - - -def _get_stable_archival_content() -> str: - """Generate stable archival file content (no branch names).""" - return """\ -node: $Format:%H$ -node-date: $Format:%cI$ -describe-name: $Format:%(describe:tags=true,match=*[0-9]*)$ -""" - - -def _get_full_archival_content() -> str: - """Generate full archival file content with branch information.""" - return """\ -# WARNING: Including ref-names can make archive checksums unstable -# after commits are added post-release. Use only if describe-name is insufficient. 
-node: $Format:%H$ -node-date: $Format:%cI$ -describe-name: $Format:%(describe:tags=true,match=*[0-9]*)$ -ref-names: $Format:%D$ -""" diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_compat.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_compat.py deleted file mode 100644 index 4e9e301..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_compat.py +++ /dev/null @@ -1,65 +0,0 @@ -"""Compatibility utilities for cross-platform functionality.""" - -from __future__ import annotations - - -def normalize_path_for_assertion(path: str) -> str: - """Normalize path separators for cross-platform assertions. - - On Windows, this converts backslashes to forward slashes to ensure - path comparisons work correctly. On other platforms, returns the path unchanged. - The length of the string is not changed by this operation. - - Args: - path: The path string to normalize - - Returns: - The path with normalized separators - """ - return path.replace("\\", "/") - - -def strip_path_suffix( - full_path: str, suffix_path: str, error_msg: str | None = None -) -> str: - """Strip a suffix from a path, with cross-platform path separator handling. - - This function first normalizes path separators for Windows compatibility, - then asserts that the full path ends with the suffix, and finally returns - the path with the suffix removed. This is the common pattern used for - computing parent directories from git output. 
- - Args: - full_path: The full path string - suffix_path: The suffix path to strip from the end - error_msg: Optional custom error message for the assertion - - Returns: - The prefix path with the suffix removed - - Raises: - AssertionError: If the full path doesn't end with the suffix - """ - normalized_full = normalize_path_for_assertion(full_path) - - if error_msg: - assert normalized_full.endswith(suffix_path), error_msg - else: - assert normalized_full.endswith(suffix_path), ( - f"Path assertion failed: {full_path!r} does not end with {suffix_path!r}" - ) - - return full_path[: -len(suffix_path)] - - -# Legacy aliases for backward compatibility during transition -def assert_path_endswith( - full_path: str, suffix_path: str, error_msg: str | None = None -) -> None: - """Legacy alias - use strip_path_suffix instead.""" - strip_path_suffix(full_path, suffix_path, error_msg) - - -def compute_path_prefix(full_path: str, suffix_path: str) -> str: - """Legacy alias - use strip_path_suffix instead.""" - return strip_path_suffix(full_path, suffix_path) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_config.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_config.py deleted file mode 100644 index 49fac2a..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_config.py +++ /dev/null @@ -1,318 +0,0 @@ -"""configuration""" - -from __future__ import annotations - -import dataclasses -import os -import re -import warnings - -from pathlib import Path -from typing import TYPE_CHECKING -from typing import Any -from typing import Pattern -from typing import Protocol - -if TYPE_CHECKING: - from . import git - -from . import _log -from . 
import _types as _t -from ._integration.pyproject_reading import PyProjectData -from ._integration.pyproject_reading import ( - get_args_for_pyproject as _get_args_for_pyproject, -) -from ._integration.pyproject_reading import read_pyproject as _read_pyproject -from ._overrides import read_toml_overrides -from ._version_cls import Version as _Version -from ._version_cls import _validate_version_cls -from ._version_cls import _VersionT - -log = _log.log.getChild("config") - - -def _is_called_from_dataclasses() -> bool: - """Check if the current call is from the dataclasses module.""" - import inspect - - frame = inspect.currentframe() - try: - # Walk up to 7 frames to check for dataclasses calls - current_frame = frame - assert current_frame is not None - for _ in range(7): - current_frame = current_frame.f_back - if current_frame is None: - break - if "dataclasses.py" in current_frame.f_code.co_filename: - return True - return False - finally: - del frame - - -class _GitDescribeCommandDescriptor: - """Data descriptor for deprecated git_describe_command field.""" - - def __get__( - self, obj: Configuration | None, objtype: type[Configuration] | None = None - ) -> _t.CMD_TYPE | None: - if obj is None: - return self # type: ignore[return-value] - - # Only warn if not being called by dataclasses.replace or similar introspection - is_from_dataclasses = _is_called_from_dataclasses() - if not is_from_dataclasses: - warnings.warn( - "Configuration field 'git_describe_command' is deprecated. " - "Use 'scm.git.describe_command' instead.", - DeprecationWarning, - stacklevel=2, - ) - return obj.scm.git.describe_command - - def __set__(self, obj: Configuration, value: _t.CMD_TYPE | None) -> None: - warnings.warn( - "Configuration field 'git_describe_command' is deprecated. 
" - "Use 'scm.git.describe_command' instead.", - DeprecationWarning, - stacklevel=2, - ) - obj.scm.git.describe_command = value - - -DEFAULT_TAG_REGEX = re.compile( - r"^(?:[\w-]+-)?(?P[vV]?\d+(?:\.\d+){0,2}[^\+]*)(?:\+.*)?$" -) -"""default tag regex that tries to match PEP440 style versions -with prefix consisting of dashed words""" - -DEFAULT_VERSION_SCHEME = "guess-next-dev" -DEFAULT_LOCAL_SCHEME = "node-and-date" - - -def _check_tag_regex(value: str | Pattern[str] | None) -> Pattern[str]: - if not value: - regex = DEFAULT_TAG_REGEX - else: - regex = re.compile(value) - - group_names = regex.groupindex.keys() - if regex.groups == 0 or (regex.groups > 1 and "version" not in group_names): - raise ValueError( - f"Expected tag_regex '{regex.pattern}' to contain a single match group or" - " a group named 'version' to identify the version part of any tag." - ) - - return regex - - -def _get_default_git_pre_parse() -> git.GitPreParse: - """Get the default git pre_parse enum value""" - from . import git - - return git.GitPreParse.WARN_ON_SHALLOW - - -class ParseFunction(Protocol): - def __call__( - self, root: _t.PathT, *, config: Configuration - ) -> _t.SCMVERSION | None: ... 
- - -def _check_absolute_root(root: _t.PathT, relative_to: _t.PathT | None) -> str: - log.debug("check absolute root=%s relative_to=%s", root, relative_to) - if relative_to: - if ( - os.path.isabs(root) - and os.path.isabs(relative_to) - and not os.path.commonpath([root, relative_to]) == root - ): - warnings.warn( - f"absolute root path '{root}' overrides relative_to '{relative_to}'" - ) - if os.path.isdir(relative_to): - warnings.warn( - "relative_to is expected to be a file," - f" its the directory {relative_to}\n" - "assuming the parent directory was passed" - ) - log.debug("dir %s", relative_to) - root = os.path.join(relative_to, root) - else: - log.debug("file %s", relative_to) - root = os.path.join(os.path.dirname(relative_to), root) - return os.path.abspath(root) - - -@dataclasses.dataclass -class GitConfiguration: - """Git-specific configuration options""" - - pre_parse: git.GitPreParse = dataclasses.field( - default_factory=lambda: _get_default_git_pre_parse() - ) - describe_command: _t.CMD_TYPE | None = None - - @classmethod - def from_data(cls, data: dict[str, Any]) -> GitConfiguration: - """Create GitConfiguration from configuration data, converting strings to enums""" - git_data = data.copy() - - # Convert string pre_parse values to enum instances - if "pre_parse" in git_data and isinstance(git_data["pre_parse"], str): - from . import git - - try: - git_data["pre_parse"] = git.GitPreParse(git_data["pre_parse"]) - except ValueError as e: - valid_options = [option.value for option in git.GitPreParse] - raise ValueError( - f"Invalid git pre_parse function '{git_data['pre_parse']}'. 
" - f"Valid options are: {', '.join(valid_options)}" - ) from e - - return cls(**git_data) - - -@dataclasses.dataclass -class ScmConfiguration: - """SCM-specific configuration options""" - - git: GitConfiguration = dataclasses.field(default_factory=GitConfiguration) - - @classmethod - def from_data(cls, data: dict[str, Any]) -> ScmConfiguration: - """Create ScmConfiguration from configuration data""" - scm_data = data.copy() - - # Handle git-specific configuration - git_data = scm_data.pop("git", {}) - git_config = GitConfiguration.from_data(git_data) - - return cls(git=git_config, **scm_data) - - -@dataclasses.dataclass -class Configuration: - """Global configuration model""" - - relative_to: _t.PathT | None = None - root: _t.PathT = "." - version_scheme: _t.VERSION_SCHEME = DEFAULT_VERSION_SCHEME - local_scheme: _t.VERSION_SCHEME = DEFAULT_LOCAL_SCHEME - tag_regex: Pattern[str] = DEFAULT_TAG_REGEX - parentdir_prefix_version: str | None = None - fallback_version: str | None = None - fallback_root: _t.PathT = "." 
- write_to: _t.PathT | None = None - write_to_template: str | None = None - version_file: _t.PathT | None = None - version_file_template: str | None = None - parse: ParseFunction | None = None - git_describe_command: dataclasses.InitVar[_t.CMD_TYPE | None] = ( - _GitDescribeCommandDescriptor() - ) - - dist_name: str | None = None - version_cls: type[_VersionT] = _Version - search_parent_directories: bool = False - - parent: _t.PathT | None = None - - # Nested SCM configurations - scm: ScmConfiguration = dataclasses.field( - default_factory=lambda: ScmConfiguration() - ) - - # Deprecated fields (handled in __post_init__) - - def __post_init__(self, git_describe_command: _t.CMD_TYPE | None) -> None: - self.tag_regex = _check_tag_regex(self.tag_regex) - - # Handle deprecated git_describe_command - # Check if it's a descriptor object (happens when no value is passed) - if git_describe_command is not None and not isinstance( - git_describe_command, _GitDescribeCommandDescriptor - ): - # Check if this is being called from dataclasses - is_from_dataclasses = _is_called_from_dataclasses() - - same_value = ( - self.scm.git.describe_command is not None - and self.scm.git.describe_command == git_describe_command - ) - - if is_from_dataclasses and same_value: - # Ignore the passed value - it's from dataclasses.replace() with same value - pass - else: - warnings.warn( - "Configuration field 'git_describe_command' is deprecated. " - "Use 'scm.git.describe_command' instead.", - DeprecationWarning, - stacklevel=2, - ) - # Check for conflicts - if self.scm.git.describe_command is not None: - raise ValueError( - "Cannot specify both 'git_describe_command' (deprecated) and " - "'scm.git.describe_command'. Please use only 'scm.git.describe_command'." 
- ) - self.scm.git.describe_command = git_describe_command - - @property - def absolute_root(self) -> str: - return _check_absolute_root(self.root, self.relative_to) - - @classmethod - def from_file( - cls, - name: str | os.PathLike[str] = "pyproject.toml", - dist_name: str | None = None, - pyproject_data: PyProjectData | None = None, - **kwargs: Any, - ) -> Configuration: - """ - Read Configuration from pyproject.toml (or similar). - Raises exceptions when file is not found or toml is - not installed or the file has invalid format. - - Parameters: - - name: path to pyproject.toml - - dist_name: name of the distribution - - **kwargs: additional keyword arguments to pass to the Configuration constructor - """ - - if pyproject_data is None: - pyproject_data = _read_pyproject(Path(name)) - args = _get_args_for_pyproject(pyproject_data, dist_name, kwargs) - - args.update(read_toml_overrides(args["dist_name"])) - relative_to = args.pop("relative_to", name) - return cls.from_data(relative_to=relative_to, data=args) - - @classmethod - def from_data( - cls, relative_to: str | os.PathLike[str], data: dict[str, Any] - ) -> Configuration: - """ - given configuration data - create a config instance after validating tag regex/version class - """ - version_cls = _validate_version_cls( - data.pop("version_cls", None), data.pop("normalize", True) - ) - - # Handle nested SCM configuration - scm_data = data.pop("scm", {}) - - # Handle nested SCM configuration - - scm_config = ScmConfiguration.from_data(scm_data) - return cls( - relative_to=relative_to, - version_cls=version_cls, - scm=scm_config, - **data, - ) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_entrypoints.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_entrypoints.py deleted file mode 100644 index 74a18a7..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_entrypoints.py +++ /dev/null @@ -1,126 +0,0 @@ -from __future__ import annotations - -import sys - -from typing import 
TYPE_CHECKING -from typing import Any -from typing import Callable -from typing import Iterator -from typing import cast - -from . import _log -from . import version - -__all__ = [ - "entry_points", - "im", -] -if TYPE_CHECKING: - from . import _types as _t - from ._config import Configuration - from ._config import ParseFunction - -from importlib import metadata as im - -log = _log.log.getChild("entrypoints") - - -if sys.version_info[:2] < (3, 10): - - def entry_points(*, group: str, name: str | None = None) -> list[im.EntryPoint]: - # Python 3.9: entry_points() returns dict, need to handle filtering manually - - eps = im.entry_points() # Returns dict - - group_eps = eps.get(group, []) - if name is not None: - return [ep for ep in group_eps if ep.name == name] - return group_eps -else: - - def entry_points(*, group: str, name: str | None = None) -> im.EntryPoints: - kw = {"group": group} - if name is not None: - kw["name"] = name - return im.entry_points(**kw) - - -def version_from_entrypoint( - config: Configuration, *, entrypoint: str, root: _t.PathT -) -> version.ScmVersion | None: - from .discover import iter_matching_entrypoints - - log.debug("version_from_ep %s in %s", entrypoint, root) - for ep in iter_matching_entrypoints(root, entrypoint, config): - fn: ParseFunction = ep.load() - maybe_version: version.ScmVersion | None = fn(root, config=config) - log.debug("%s found %r", ep, maybe_version) - if maybe_version is not None: - return maybe_version - return None - - -def _get_ep(group: str, name: str) -> Any | None: - for ep in entry_points(group=group, name=name): - log.debug("ep found: %s", ep.name) - return ep.load() - return None - - -def _get_from_object_reference_str(path: str, group: str) -> Any | None: - # todo: remove for importlib native spelling - from importlib.metadata import EntryPoint # hack - - ep = EntryPoint(path, path, group) - try: - return ep.load() - except (AttributeError, ModuleNotFoundError): - return None - - -def 
_iter_version_schemes( - entrypoint: str, - scheme_value: _t.VERSION_SCHEMES, - _memo: set[object] | None = None, -) -> Iterator[Callable[[version.ScmVersion], str]]: - if _memo is None: - _memo = set() - if isinstance(scheme_value, str): - scheme_value = cast( - "_t.VERSION_SCHEMES", - _get_ep(entrypoint, scheme_value) - or _get_from_object_reference_str(scheme_value, entrypoint), - ) - - if isinstance(scheme_value, (list, tuple)): - for variant in scheme_value: - if variant not in _memo: - _memo.add(variant) - yield from _iter_version_schemes(entrypoint, variant, _memo=_memo) - elif callable(scheme_value): - yield scheme_value - - -def _call_version_scheme( - version: version.ScmVersion, - entrypoint: str, - given_value: _t.VERSION_SCHEMES, - default: str | None = None, -) -> str: - found_any_implementation = False - for scheme in _iter_version_schemes(entrypoint, given_value): - found_any_implementation = True - result = scheme(version) - if result is not None: - return result - if not found_any_implementation: - raise ValueError( - f'Couldn\'t find any implementations for entrypoint "{entrypoint}"' - f' with value "{given_value}".' - ) - if default is not None: - return default - raise ValueError( - f'None of the "{entrypoint}" entrypoints matching "{given_value}"' - " returned a value." - ) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/__init__.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/__init__.py deleted file mode 100644 index e19afc8..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/__init__.py +++ /dev/null @@ -1,113 +0,0 @@ -from __future__ import annotations - -import os - -from typing import TYPE_CHECKING -from typing import Callable - -from .. import _log -from .. 
import _types as _t -from .._entrypoints import entry_points -from .pathtools import norm_real - -if TYPE_CHECKING: - import sys - - if sys.version_info >= (3, 10): - from typing import TypeGuard - else: - from typing_extensions import TypeGuard - - -log = _log.log.getChild("file_finder") - - -def scm_find_files( - path: _t.PathT, - scm_files: set[str], - scm_dirs: set[str], - force_all_files: bool = False, -) -> list[str]: - """ setuptools compatible file finder that follows symlinks - - - path: the root directory from which to search - - scm_files: set of scm controlled files and symlinks - (including symlinks to directories) - - scm_dirs: set of scm controlled directories - (including directories containing no scm controlled files) - - force_all_files: ignore ``scm_files`` and ``scm_dirs`` and list everything. - - scm_files and scm_dirs must be absolute with symlinks resolved (realpath), - with normalized case (normcase) - - Spec here: https://setuptools.pypa.io/en/latest/userguide/extension.html#\ - adding-support-for-revision-control-systems - """ - realpath = norm_real(path) - seen: set[str] = set() - res: list[str] = [] - for dirpath, dirnames, filenames in os.walk(realpath, followlinks=True): - # dirpath with symlinks resolved - realdirpath = norm_real(dirpath) - - def _link_not_in_scm(n: str, realdirpath: str = realdirpath) -> bool: - fn = os.path.join(realdirpath, os.path.normcase(n)) - return os.path.islink(fn) and fn not in scm_files - - if not force_all_files and realdirpath not in scm_dirs: - # directory not in scm, don't walk it's content - dirnames[:] = [] - continue - if os.path.islink(dirpath) and not os.path.relpath( - realdirpath, realpath - ).startswith(os.pardir): - # a symlink to a directory not outside path: - # we keep it in the result and don't walk its content - res.append(os.path.join(path, os.path.relpath(dirpath, path))) - dirnames[:] = [] - continue - if realdirpath in seen: - # symlink loop protection - dirnames[:] = [] - continue - 
dirnames[:] = [ - dn for dn in dirnames if force_all_files or not _link_not_in_scm(dn) - ] - for filename in filenames: - if not force_all_files and _link_not_in_scm(filename): - continue - # dirpath + filename with symlinks preserved - fullfilename = os.path.join(dirpath, filename) - is_tracked = norm_real(fullfilename) in scm_files - if force_all_files or is_tracked: - res.append(os.path.join(path, os.path.relpath(fullfilename, realpath))) - seen.add(realdirpath) - return res - - -def is_toplevel_acceptable(toplevel: str | None) -> TypeGuard[str]: - """ """ - if toplevel is None: - return False - - ignored: list[str] = os.environ.get("SETUPTOOLS_SCM_IGNORE_VCS_ROOTS", "").split( - os.pathsep - ) - ignored = [os.path.normcase(p) for p in ignored] - - log.debug("toplevel: %r\n ignored %s", toplevel, ignored) - - return toplevel not in ignored - - -def find_files(path: _t.PathT = "") -> list[str]: - eps = [ - *entry_points(group="setuptools_scm.files_command"), - *entry_points(group="setuptools_scm.files_command_fallback"), - ] - for ep in eps: - command: Callable[[_t.PathT], list[str]] = ep.load() - res: list[str] = command(path) - if res: - return res - return [] diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/git.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/git.py deleted file mode 100644 index 4379c21..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/git.py +++ /dev/null @@ -1,124 +0,0 @@ -from __future__ import annotations - -import logging -import os -import subprocess -import tarfile - -from typing import IO - -from .. import _types as _t -from .._run_cmd import run as _run -from ..integration import data_from_mime -from . import is_toplevel_acceptable -from . 
import scm_find_files -from .pathtools import norm_real - -log = logging.getLogger(__name__) - - -def _git_toplevel(path: str) -> str | None: - try: - cwd = os.path.abspath(path or ".") - res = _run(["git", "rev-parse", "HEAD"], cwd=cwd) - if res.returncode: - # BAIL if there is no commit - log.error("listing git files failed - pretending there aren't any") - return None - res = _run( - ["git", "rev-parse", "--show-prefix"], - cwd=cwd, - ) - if res.returncode: - return None - out = res.stdout[:-1] # remove the trailing pathsep - if not out: - out = cwd - else: - # Here, ``out`` is a relative path to root of git. - # ``cwd`` is absolute path to current working directory. - # the below method removes the length of ``out`` from - # ``cwd``, which gives the git toplevel - from .._compat import strip_path_suffix - - out = strip_path_suffix(cwd, out, f"cwd={cwd!r}\nout={out!r}") - log.debug("find files toplevel %s", out) - return norm_real(out) - except subprocess.CalledProcessError: - # git returned error, we are not in a git repo - return None - except OSError: - # git command not found, probably - return None - - -def _git_interpret_archive(fd: IO[bytes], toplevel: str) -> tuple[set[str], set[str]]: - with tarfile.open(fileobj=fd, mode="r|*") as tf: - git_files = set() - git_dirs = {toplevel} - for member in tf.getmembers(): - name = os.path.normcase(member.name).replace("/", os.path.sep) - if member.type == tarfile.DIRTYPE: - git_dirs.add(name) - else: - git_files.add(name) - return git_files, git_dirs - - -def _git_ls_files_and_dirs(toplevel: str) -> tuple[set[str], set[str]]: - # use git archive instead of git ls-file to honor - # export-ignore git attribute - - cmd = ["git", "archive", "--prefix", toplevel + os.path.sep, "HEAD"] - log.info("running %s", " ".join(str(x) for x in cmd)) - proc = subprocess.Popen( - cmd, stdout=subprocess.PIPE, cwd=toplevel, stderr=subprocess.DEVNULL - ) - assert proc.stdout is not None - try: - try: - return 
_git_interpret_archive(proc.stdout, toplevel) - finally: - # ensure we avoid resource warnings by cleaning up the process - proc.stdout.close() - proc.terminate() - # Wait for process to actually terminate and be reaped - try: - proc.wait(timeout=5) # Add timeout to avoid hanging - except subprocess.TimeoutExpired: - log.warning("git archive process did not terminate gracefully, killing") - proc.kill() - proc.wait() - except Exception: - # proc.wait() already called in finally block, check if it failed - if proc.returncode != 0: - log.error("listing git files failed - pretending there aren't any") - return set(), set() - - -def git_find_files(path: _t.PathT = "") -> list[str]: - toplevel = _git_toplevel(os.fspath(path)) - if not is_toplevel_acceptable(toplevel): - return [] - fullpath = norm_real(path) - if not fullpath.startswith(toplevel): - log.warning("toplevel mismatch computed %s vs resolved %s ", toplevel, fullpath) - git_files, git_dirs = _git_ls_files_and_dirs(toplevel) - return scm_find_files(path, git_files, git_dirs) - - -def git_archive_find_files(path: _t.PathT = "") -> list[str]: - # This function assumes that ``path`` is obtained from a git archive - # and therefore all the files that should be ignored were already removed. 
- archival = os.path.join(path, ".git_archival.txt") - if not os.path.exists(archival): - return [] - - data = data_from_mime(archival) - - if "$Format" in data.get("node", ""): - # Substitutions have not been performed, so not a reliable archive - return [] - - log.warning("git archive detected - fallback to listing all files") - return scm_find_files(path, set(), set(), force_all_files=True) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/hg.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/hg.py deleted file mode 100644 index 182429c..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/hg.py +++ /dev/null @@ -1,72 +0,0 @@ -from __future__ import annotations - -import logging -import os -import subprocess - -from .. import _types as _t -from .._file_finders import is_toplevel_acceptable -from .._file_finders import scm_find_files -from ..hg import run_hg -from ..integration import data_from_mime -from .pathtools import norm_real - -log = logging.getLogger(__name__) - - -def _hg_toplevel(path: str) -> str | None: - try: - return run_hg( - ["root"], - cwd=(path or "."), - check=True, - ).parse_success(norm_real) - except subprocess.CalledProcessError: - # hg returned error, we are not in a mercurial repo - return None - except OSError: - # hg command not found, probably - return None - - -def _hg_ls_files_and_dirs(toplevel: str) -> tuple[set[str], set[str]]: - hg_files: set[str] = set() - hg_dirs = {toplevel} - res = run_hg(["files"], cwd=toplevel) - if res.returncode: - return set(), set() - for name in res.stdout.splitlines(): - name = os.path.normcase(name).replace("/", os.path.sep) - fullname = os.path.join(toplevel, name) - hg_files.add(fullname) - dirname = os.path.dirname(fullname) - while len(dirname) > len(toplevel) and dirname not in hg_dirs: - hg_dirs.add(dirname) - dirname = os.path.dirname(dirname) - return hg_files, hg_dirs - - -def hg_find_files(path: str = "") -> 
list[str]: - toplevel = _hg_toplevel(path) - if not is_toplevel_acceptable(toplevel): - return [] - assert toplevel is not None - hg_files, hg_dirs = _hg_ls_files_and_dirs(toplevel) - return scm_find_files(path, hg_files, hg_dirs) - - -def hg_archive_find_files(path: _t.PathT = "") -> list[str]: - # This function assumes that ``path`` is obtained from a mercurial archive - # and therefore all the files that should be ignored were already removed. - archival = os.path.join(path, ".hg_archival.txt") - if not os.path.exists(archival): - return [] - - data = data_from_mime(archival) - - if "node" not in data: - # Ensure file is valid - return [] - - log.warning("hg archive detected - fallback to listing all files") - return scm_find_files(path, set(), set(), force_all_files=True) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/pathtools.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/pathtools.py deleted file mode 100644 index 6de8508..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_file_finders/pathtools.py +++ /dev/null @@ -1,9 +0,0 @@ -from __future__ import annotations - -import os - -from setuptools_scm import _types as _t - - -def norm_real(path: _t.PathT) -> str: - return os.path.normcase(os.path.realpath(path)) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_get_version_impl.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_get_version_impl.py deleted file mode 100644 index 31bc9c3..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_get_version_impl.py +++ /dev/null @@ -1,250 +0,0 @@ -from __future__ import annotations - -import dataclasses -import logging -import re -import warnings - -from pathlib import Path -from typing import Any -from typing import NoReturn -from typing import Pattern - -from . import _config -from . import _entrypoints -from . import _run_cmd -from . 
import _types as _t -from ._config import Configuration -from ._overrides import _read_pretended_version_for -from ._version_cls import _validate_version_cls -from .version import ScmVersion -from .version import format_version as _format_version - -EMPTY_TAG_REGEX_DEPRECATION = DeprecationWarning( - "empty regex for tag regex is invalid, using default" -) - -_log = logging.getLogger(__name__) - - -def parse_scm_version(config: Configuration) -> ScmVersion | None: - try: - if config.parse is not None: - parse_result = config.parse(config.absolute_root, config=config) - if parse_result is not None and not isinstance(parse_result, ScmVersion): - raise TypeError( - f"version parse result was {str!r}\n" - "please return a parsed version (ScmVersion)" - ) - return parse_result - else: - return _entrypoints.version_from_entrypoint( - config, - entrypoint="setuptools_scm.parse_scm", - root=config.absolute_root, - ) - except _run_cmd.CommandNotFoundError as e: - _log.exception("command %s not found while parsing the scm, using fallbacks", e) - return None - - -def parse_fallback_version(config: Configuration) -> ScmVersion | None: - return _entrypoints.version_from_entrypoint( - config, - entrypoint="setuptools_scm.parse_scm_fallback", - root=config.fallback_root, - ) - - -def parse_version(config: Configuration) -> ScmVersion | None: - # First try to get a version from the normal flow - scm_version = ( - _read_pretended_version_for(config) - or parse_scm_version(config) - or parse_fallback_version(config) - ) - - # Apply any metadata overrides to the version we found - from ._overrides import _apply_metadata_overrides - - return _apply_metadata_overrides(scm_version, config) - - -def write_version_files( - config: Configuration, version: str, scm_version: ScmVersion -) -> None: - if config.write_to is not None: - from ._integration.dump_version import dump_version - - dump_version( - root=config.root, - version=version, - scm_version=scm_version, - 
write_to=config.write_to, - template=config.write_to_template, - ) - if config.version_file: - from ._integration.dump_version import write_version_to_path - - version_file = Path(config.version_file) - assert not version_file.is_absolute(), f"{version_file=}" - # todo: use a better name than fallback root - assert config.relative_to is not None - target = Path(config.relative_to).parent.joinpath(version_file) - write_version_to_path( - target, - template=config.version_file_template, - version=version, - scm_version=scm_version, - ) - - -def _get_version( - config: Configuration, force_write_version_files: bool | None = None -) -> str | None: - parsed_version = parse_version(config) - if parsed_version is None: - return None - version_string = _format_version(parsed_version) - if force_write_version_files is None: - force_write_version_files = True - warnings.warn( - "force_write_version_files ought to be set," - " presuming the legacy True value", - DeprecationWarning, - ) - - if force_write_version_files: - write_version_files(config, version=version_string, scm_version=parsed_version) - - return version_string - - -def _find_scm_in_parents(config: Configuration) -> Path | None: - """ - Search parent directories for SCM repositories when relative_to is not set. - Uses the existing entrypoint system for SCM discovery. 
- """ - if config.search_parent_directories: - return None - - searching_config = dataclasses.replace(config, search_parent_directories=True) - - from .discover import iter_matching_entrypoints - - for _ep in iter_matching_entrypoints( - config.absolute_root, "setuptools_scm.parse_scm", searching_config - ): - # xxx: iter_matching_entrypoints should return the parent directory, we do a hack atm - assert searching_config.parent is not None - return Path(searching_config.parent) - - return None - - -def _version_missing(config: Configuration) -> NoReturn: - base_error = ( - f"setuptools-scm was unable to detect version for {config.absolute_root}.\n\n" - ) - - # If relative_to is not set, check for SCM repositories in parent directories - scm_parent = None - if config.relative_to is None: - scm_parent = _find_scm_in_parents(config) - - if scm_parent is not None: - # Found an SCM repository in a parent directory - error_msg = ( - base_error - + f"However, a repository was found in a parent directory: {scm_parent}\n\n" - f"To fix this, you have a few options:\n\n" - f"1. Use the 'relative_to' parameter to specify the file that setuptools-scm should use as reference:\n" - f" setuptools_scm.get_version(relative_to=__file__)\n\n" - f"2. Enable parent directory search in your configuration:\n" - f" [tool.setuptools_scm]\n" - f" search_parent_directories = true\n\n" - f"3. Change your working directory to the repository root: {scm_parent}\n\n" - f"4. Set the root explicitly in your configuration:\n" - f" [tool.setuptools_scm]\n" - f' root = "{scm_parent}"\n\n' - "For more information, see: https://setuptools-scm.readthedocs.io/en/latest/config/" - ) - else: - # No SCM repository found in parent directories either - error_msg = ( - base_error - + "Make sure you're either building from a fully intact git repository " - "or PyPI tarballs. 
Most other sources (such as GitHub's tarballs, a " - "git checkout without the .git folder) don't contain the necessary " - "metadata and will not work.\n\n" - "For example, if you're using pip, instead of " - "https://github.com/user/proj/archive/master.zip " - "use git+https://github.com/user/proj.git#egg=proj\n\n" - "Alternatively, set the version with the environment variable " - "SETUPTOOLS_SCM_PRETEND_VERSION_FOR_${NORMALIZED_DIST_NAME} as described " - "in https://setuptools-scm.readthedocs.io/en/latest/config/" - ) - - raise LookupError(error_msg) - - -def get_version( - root: _t.PathT = ".", - version_scheme: _t.VERSION_SCHEME = _config.DEFAULT_VERSION_SCHEME, - local_scheme: _t.VERSION_SCHEME = _config.DEFAULT_LOCAL_SCHEME, - write_to: _t.PathT | None = None, - write_to_template: str | None = None, - version_file: _t.PathT | None = None, - version_file_template: str | None = None, - relative_to: _t.PathT | None = None, - tag_regex: str | Pattern[str] = _config.DEFAULT_TAG_REGEX, - parentdir_prefix_version: str | None = None, - fallback_version: str | None = None, - fallback_root: _t.PathT = ".", - parse: Any | None = None, - git_describe_command: _t.CMD_TYPE | None = None, - dist_name: str | None = None, - version_cls: Any | None = None, - normalize: bool = True, - search_parent_directories: bool = False, - scm: dict[str, Any] | None = None, -) -> str: - """ - If supplied, relative_to should be a file from which root may - be resolved. Typically called by a script or module that is not - in the root of the repository to direct setuptools-scm to the - root of the repository by supplying ``__file__``. 
- """ - - version_cls = _validate_version_cls(version_cls, normalize) - del normalize - tag_regex = parse_tag_regex(tag_regex) - - # Handle scm parameter by converting it to ScmConfiguration - if scm is not None: - scm_config = _config.ScmConfiguration.from_data(scm) - else: - scm_config = _config.ScmConfiguration() - - # Remove scm from locals() since we handle it separately - config_params = locals().copy() - config_params.pop("scm", None) - config_params.pop("scm_config", None) - - config = _config.Configuration(scm=scm_config, **config_params) - maybe_version = _get_version(config, force_write_version_files=True) - - if maybe_version is None: - _version_missing(config) - return maybe_version - - -def parse_tag_regex(tag_regex: str | Pattern[str]) -> Pattern[str]: - if isinstance(tag_regex, str): - if tag_regex == "": - warnings.warn(EMPTY_TAG_REGEX_DEPRECATION) - return _config.DEFAULT_TAG_REGEX - else: - return re.compile(tag_regex) - else: - return tag_regex diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/__init__.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/deprecation.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/deprecation.py deleted file mode 100644 index a1b3615..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/deprecation.py +++ /dev/null @@ -1,20 +0,0 @@ -import warnings - -from pathlib import Path - - -def warn_dynamic_version(path: Path, section: str, expression: str) -> None: - warnings.warn( - f"{path}: at [{section}]\n" - f"{expression} is forcing setuptools to override the version setuptools-scm did already set\n" - "When using setuptools-scm it's invalid to use setuptools dynamic version as well, please remove it.\n" - "Setuptools-scm is responsible for setting the version, forcing setuptools to 
override creates errors." - ) - - -def warn_pyproject_setuptools_dynamic_version(path: Path) -> None: - warn_dynamic_version(path, "tool.setuptools.dynamic", "version = {attr = ...}") - - -def warn_setup_cfg_dynamic_version(path: Path) -> None: - warn_dynamic_version(path, "metadata", "version = attr: ...") diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/dump_version.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/dump_version.py deleted file mode 100644 index 06081c9..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/dump_version.py +++ /dev/null @@ -1,128 +0,0 @@ -from __future__ import annotations - -import warnings - -from pathlib import Path - -from .. import _types as _t -from .._log import log as parent_log -from .._version_cls import _version_as_tuple -from ..version import ScmVersion - -log = parent_log.getChild("dump_version") - - -TEMPLATES = { - ".py": """\ -# file generated by setuptools-scm -# don't change, don't track in version control - -__all__ = [ - "__version__", - "__version_tuple__", - "version", - "version_tuple", - "__commit_id__", - "commit_id", -] - -TYPE_CHECKING = False -if TYPE_CHECKING: - from typing import Tuple - from typing import Union - - VERSION_TUPLE = Tuple[Union[int, str], ...] 
- COMMIT_ID = Union[str, None] -else: - VERSION_TUPLE = object - COMMIT_ID = object - -version: str -__version__: str -__version_tuple__: VERSION_TUPLE -version_tuple: VERSION_TUPLE -commit_id: COMMIT_ID -__commit_id__: COMMIT_ID - -__version__ = version = {version!r} -__version_tuple__ = version_tuple = {version_tuple!r} - -__commit_id__ = commit_id = {scm_version.short_node!r} -""", - ".txt": "{version}", -} - - -def dump_version( - root: _t.PathT, - version: str, - write_to: _t.PathT, - template: str | None = None, - scm_version: ScmVersion | None = None, -) -> None: - assert isinstance(version, str) - root = Path(root) - write_to = Path(write_to) - if write_to.is_absolute(): - # trigger warning on escape - write_to.relative_to(root) - warnings.warn( - f"{write_to=!s} is a absolute path," - " please switch to using a relative version file", - DeprecationWarning, - ) - target = write_to - else: - target = Path(root).joinpath(write_to) - write_version_to_path( - target, template=template, version=version, scm_version=scm_version - ) - - -def _validate_template(target: Path, template: str | None) -> str: - if template == "": - warnings.warn(f"{template=} looks like a error, using default instead") - template = None - if template is None: - template = TEMPLATES.get(target.suffix) - - if template is None: - raise ValueError( - f"bad file format: {target.suffix!r} (of {target})\n" - "only *.txt and *.py have a default template" - ) - else: - return template - - -class DummyScmVersion: - @property - def short_node(self) -> str | None: - return None - - -def write_version_to_path( - target: Path, - template: str | None, - version: str, - scm_version: ScmVersion | None = None, -) -> None: - final_template = _validate_template(target, template) - log.debug("dump %s into %s", version, target) - version_tuple = _version_as_tuple(version) - if scm_version is None: - warnings.warn( - "write_version_to_path called without scm_version parameter. 
" - "This will be required in a future version. " - "Pass scm_version=None explicitly to suppress this warning.", - DeprecationWarning, - stacklevel=2, - ) - - content = final_template.format( - version=version, - version_tuple=version_tuple, - scm_version=scm_version or DummyScmVersion(), - ) - - target.write_text(content, encoding="utf-8") diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/pyproject_reading.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/pyproject_reading.py deleted file mode 100644 index 75d86f6..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/pyproject_reading.py +++ /dev/null @@ -1,285 +0,0 @@ -from __future__ import annotations - -import warnings - -from dataclasses import dataclass -from pathlib import Path -from typing import Sequence - -from .. import _log -from .. import _types as _t -from .._requirement_cls import extract_package_name -from .toml import TOML_RESULT -from .toml import InvalidTomlError -from .toml import read_toml_content - -log = _log.log.getChild("pyproject_reading") - -_ROOT = "root" - - -DEFAULT_PYPROJECT_PATH = Path("pyproject.toml") -DEFAULT_TOOL_NAME = "setuptools_scm" - - -@dataclass -class PyProjectData: - path: Path - tool_name: str - project: TOML_RESULT - section: TOML_RESULT - is_required: bool - section_present: bool - project_present: bool - build_requires: list[str] - - @classmethod - def for_testing( - cls, - *, - is_required: bool = False, - section_present: bool = False, - project_present: bool = False, - project_name: str | None = None, - has_dynamic_version: bool = True, - build_requires: list[str] | None = None, - local_scheme: str | None = None, - ) -> PyProjectData: - """Create a PyProjectData instance for testing purposes.""" - project: TOML_RESULT - if project_name is not None: - project = {"name": project_name} - assert project_present - else: - project = {} - - # If project is present and has_dynamic_version is 
True, add dynamic=['version'] - if project_present and has_dynamic_version: - project["dynamic"] = ["version"] - - if build_requires is None: - build_requires = [] - if local_scheme is not None: - assert section_present - section = {"local_scheme": local_scheme} - else: - section = {} - return cls( - path=DEFAULT_PYPROJECT_PATH, - tool_name=DEFAULT_TOOL_NAME, - project=project, - section=section, - is_required=is_required, - section_present=section_present, - project_present=project_present, - build_requires=build_requires, - ) - - @classmethod - def empty( - cls, path: Path = DEFAULT_PYPROJECT_PATH, tool_name: str = DEFAULT_TOOL_NAME - ) -> PyProjectData: - return cls( - path=path, - tool_name=tool_name, - project={}, - section={}, - is_required=False, - section_present=False, - project_present=False, - build_requires=[], - ) - - @property - def project_name(self) -> str | None: - return self.project.get("name") - - @property - def project_version(self) -> str | None: - """Return the static version from [project] if present. - - When the project declares dynamic = ["version"], the version - is intentionally omitted from [project] and this returns None. - """ - return self.project.get("version") - - def should_infer(self) -> bool: - """ - Determine if setuptools_scm should infer version based on configuration. - - Infer when: - 1. An explicit [tool.setuptools_scm] section is present, OR - 2. 
setuptools-scm[simple] is in build-system.requires AND - version is in project.dynamic - - Returns: - True if [tool.setuptools_scm] is present, otherwise False - """ - # Original behavior: explicit tool section - if self.section_present: - return True - - # New behavior: simple extra + dynamic version - if self.project_present: - dynamic_fields = self.project.get("dynamic", []) - if "version" in dynamic_fields: - if has_build_package_with_extra( - self.build_requires, "setuptools-scm", "simple" - ): - return True - - return False - - -def has_build_package( - requires: Sequence[str], canonical_build_package_name: str -) -> bool: - for requirement in requires: - package_name = extract_package_name(requirement) - if package_name == canonical_build_package_name: - return True - return False - - -def has_build_package_with_extra( - requires: Sequence[str], canonical_build_package_name: str, extra_name: str -) -> bool: - """Check if a build dependency has a specific extra. - - Args: - requires: List of requirement strings from build-system.requires - canonical_build_package_name: The canonical package name to look for - extra_name: The extra name to check for (e.g., "simple") - - Returns: - True if the package is found with the specified extra - """ - from .._requirement_cls import Requirement - - for requirement_string in requires: - try: - requirement = Requirement(requirement_string) - package_name = extract_package_name(requirement_string) - if package_name == canonical_build_package_name: - if extra_name in requirement.extras: - return True - except Exception: - # If parsing fails, continue to next requirement - continue - return False - - -def read_pyproject( - path: Path = DEFAULT_PYPROJECT_PATH, - tool_name: str = DEFAULT_TOOL_NAME, - canonical_build_package_name: str = "setuptools-scm", - _given_result: _t.GivenPyProjectResult = None, - _given_definition: TOML_RESULT | None = None, -) -> PyProjectData: - """Read and parse pyproject configuration. 
- - This function supports dependency injection for tests via ``_given_result`` - and ``_given_definition``. - - :param path: Path to the pyproject file - :param tool_name: The tool section name (default: ``setuptools_scm``) - :param canonical_build_package_name: Normalized build requirement name - :param _given_result: Optional testing hook. Can be: - - ``PyProjectData``: returned directly - - ``InvalidTomlError`` | ``FileNotFoundError``: raised directly - - ``None``: read from filesystem (default) - :param _given_definition: Optional testing hook to provide parsed TOML content. - When provided, this dictionary is used instead of reading and parsing - the file from disk. Ignored if ``_given_result`` is provided. - """ - - if _given_result is not None: - if isinstance(_given_result, PyProjectData): - return _given_result - if isinstance(_given_result, (InvalidTomlError, FileNotFoundError)): - raise _given_result - - if _given_definition is not None: - defn = _given_definition - else: - defn = read_toml_content(path) - - requires: list[str] = defn.get("build-system", {}).get("requires", []) - is_required = has_build_package(requires, canonical_build_package_name) - - tool_section = defn.get("tool", {}) - section = tool_section.get(tool_name, {}) - section_present = tool_name in tool_section - - if not section_present: - log.warning( - "toml section missing %r does not contain a tool.%s section", - path, - tool_name, - ) - - project = defn.get("project", {}) - project_present = "project" in defn - pyproject_data = PyProjectData( - path, - tool_name, - project, - section, - is_required, - section_present, - project_present, - requires, - ) - - setuptools_dynamic_version = ( - defn.get("tool", {}) - .get("setuptools", {}) - .get("dynamic", {}) - .get("version", None) - ) - # Only warn if setuptools-scm is being used for version inference - # (not just file finding). When only file finders are used, it's valid - # to use tool.setuptools.dynamic.version for versioning. 
- if setuptools_dynamic_version is not None and pyproject_data.should_infer(): - from .deprecation import warn_pyproject_setuptools_dynamic_version - - warn_pyproject_setuptools_dynamic_version(path) - - return pyproject_data - - -def get_args_for_pyproject( - pyproject: PyProjectData, - dist_name: str | None, - kwargs: TOML_RESULT, -) -> TOML_RESULT: - """drops problematic details and figures the distribution name""" - section = pyproject.section.copy() - kwargs = kwargs.copy() - if "relative_to" in section: - relative = section.pop("relative_to") - warnings.warn( - f"{pyproject.path}: at [tool.{pyproject.tool_name}]\n" - f"ignoring value relative_to={relative!r}" - " as its always relative to the config file" - ) - if "dist_name" in section: - if dist_name is None: - dist_name = section.pop("dist_name") - else: - assert dist_name == section["dist_name"] - section.pop("dist_name") - if dist_name is None: - # minimal pep 621 support for figuring the pretend keys - dist_name = pyproject.project_name - if _ROOT in kwargs: - if kwargs[_ROOT] is None: - kwargs.pop(_ROOT, None) - elif _ROOT in section: - if section[_ROOT] != kwargs[_ROOT]: - warnings.warn( - f"root {section[_ROOT]} is overridden" - f" by the cli arg {kwargs[_ROOT]}" - ) - section.pop(_ROOT, None) - return {"dist_name": dist_name, **section, **kwargs} diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/setup_cfg.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/setup_cfg.py deleted file mode 100644 index 893a9ad..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/setup_cfg.py +++ /dev/null @@ -1,46 +0,0 @@ -from __future__ import annotations - -import os - -from dataclasses import dataclass -from pathlib import Path - -import setuptools - - -@dataclass -class SetuptoolsBasicData: - path: Path - name: str | None - version: str | None - - -def read_setup_cfg(input: str | os.PathLike[str] = "setup.cfg") -> SetuptoolsBasicData: - 
"""Parse setup.cfg and return unified data. Does not raise if file is missing.""" - import configparser - - path = Path(input) - parser = configparser.ConfigParser() - parser.read([input], encoding="utf-8") - - name = parser.get("metadata", "name", fallback=None) - version = parser.get("metadata", "version", fallback=None) - if version is not None and "attr" in version: - from .deprecation import warn_setup_cfg_dynamic_version - - warn_setup_cfg_dynamic_version(path) - version = None - return SetuptoolsBasicData(path=path, name=name, version=version) - - -def extract_from_legacy( - dist: setuptools.Distribution, - *, - _given_legacy_data: SetuptoolsBasicData | None = None, -) -> SetuptoolsBasicData: - base = _given_legacy_data if _given_legacy_data is not None else read_setup_cfg() - if base.name is None: - base.name = dist.metadata.name - if base.version is None: - base.version = dist.metadata.version - return base diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/setuptools.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/setuptools.py deleted file mode 100644 index aa1c645..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/setuptools.py +++ /dev/null @@ -1,159 +0,0 @@ -from __future__ import annotations - -import logging -import warnings - -from typing import Any -from typing import Callable - -import setuptools - -from .. 
import _types as _t -from .pyproject_reading import PyProjectData -from .pyproject_reading import read_pyproject -from .setup_cfg import SetuptoolsBasicData -from .setup_cfg import extract_from_legacy -from .toml import InvalidTomlError -from .version_inference import get_version_inference_config - -log = logging.getLogger(__name__) - - -def _warn_on_old_setuptools(_version: str = setuptools.__version__) -> None: - if int(_version.split(".")[0]) < 61: - warnings.warn( - RuntimeWarning( - f""" -ERROR: setuptools=={_version} is used in combination with setuptools-scm>=8.x - -Your build configuration is incomplete and previously worked by accident! -setuptools-scm requires setuptools>=61 (recommended: >=80) - -Suggested workaround if applicable: - - migrating from the deprecated setup_requires mechanism to pep517/518 - and using a pyproject.toml to declare build dependencies - which are reliably pre-installed before running the build tools -""" - ) - ) - - -_warn_on_old_setuptools() - - -def _log_hookstart(hook: str, dist: setuptools.Distribution) -> None: - log.debug( - "%s %s %s %r", - hook, - id(dist), - id(dist.metadata), - {**vars(dist.metadata), "long_description": ...}, - ) - - -def get_keyword_overrides( - value: bool | dict[str, Any] | Callable[[], dict[str, Any]], -) -> dict[str, Any]: - """normalize the version keyword input""" - if value is True: - return {} - elif callable(value): - return value() - else: - assert isinstance(value, dict), "version_keyword expects a dict or True" - return value - - -def version_keyword( - dist: setuptools.Distribution, - keyword: str, - value: bool | dict[str, Any] | Callable[[], dict[str, Any]], - *, - _given_pyproject_data: _t.GivenPyProjectResult = None, - _given_legacy_data: SetuptoolsBasicData | None = None, - _get_version_inference_config: _t.GetVersionInferenceConfig = get_version_inference_config, -) -> None: - """apply version infernce when setup(use_scm_version=...) 
is used - this takes priority over the finalize_options based version - """ - - _log_hookstart("version_keyword", dist) - - # Parse overrides (integration point responsibility) - overrides = get_keyword_overrides(value) - - assert "dist_name" not in overrides, ( - "dist_name may not be specified in the setup keyword " - ) - - legacy_data = extract_from_legacy(dist, _given_legacy_data=_given_legacy_data) - dist_name: str | None = legacy_data.name - - was_set_by_infer = getattr(dist, "_setuptools_scm_version_set_by_infer", False) - - # Exit early if overrides is empty dict AND version was set by infer - if overrides == {} and was_set_by_infer: - return - - # Get pyproject data (support direct injection for tests) - try: - pyproject_data = read_pyproject(_given_result=_given_pyproject_data) - except FileNotFoundError: - log.debug("pyproject.toml not found, proceeding with empty configuration") - pyproject_data = PyProjectData.empty() - except InvalidTomlError as e: - log.debug("Configuration issue in pyproject.toml: %s", e) - return - - # Pass None as current_version if overrides is truthy AND version was set by infer - current_version = ( - None - if (overrides and was_set_by_infer) - else (legacy_data.version or pyproject_data.project_version) - ) - - result = _get_version_inference_config( - dist_name=dist_name, - current_version=current_version, - pyproject_data=pyproject_data, - overrides=overrides, - ) - - result.apply(dist) - - -def infer_version( - dist: setuptools.Distribution, - *, - _given_pyproject_data: _t.GivenPyProjectResult = None, - _given_legacy_data: SetuptoolsBasicData | None = None, - _get_version_inference_config: _t.GetVersionInferenceConfig = get_version_inference_config, -) -> None: - """apply version inference from the finalize_options hook - this is the default for pyproject.toml based projects that don't use the use_scm_version keyword - - if the version keyword is used, it will override the version from this hook - as user might have 
passed custom code version schemes - """ - - _log_hookstart("infer_version", dist) - - legacy_data = extract_from_legacy(dist, _given_legacy_data=_given_legacy_data) - dist_name = legacy_data.name - - try: - pyproject_data = read_pyproject(_given_result=_given_pyproject_data) - except FileNotFoundError: - log.debug("pyproject.toml not found, skipping infer_version") - return - except InvalidTomlError as e: - log.debug("Configuration issue in pyproject.toml: %s", e) - return - - # Only infer when tool section present per get_version_inference_config - result = _get_version_inference_config( - dist_name=dist_name, - current_version=legacy_data.version or pyproject_data.project_version, - pyproject_data=pyproject_data, - ) - result.apply(dist) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/toml.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/toml.py deleted file mode 100644 index 2253287..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/toml.py +++ /dev/null @@ -1,69 +0,0 @@ -from __future__ import annotations - -import sys - -from pathlib import Path -from typing import TYPE_CHECKING -from typing import Any -from typing import Callable -from typing import Dict -from typing import TypedDict -from typing import cast - -if sys.version_info >= (3, 11): - from tomllib import loads as load_toml -else: - from tomli import loads as load_toml - -if TYPE_CHECKING: - if sys.version_info >= (3, 10): - from typing import TypeAlias - else: - from typing_extensions import TypeAlias - -from .. 
import _log - -log = _log.log.getChild("toml") - -TOML_RESULT: TypeAlias = Dict[str, Any] -TOML_LOADER: TypeAlias = Callable[[str], TOML_RESULT] - - -class InvalidTomlError(ValueError): - """Raised when TOML data cannot be parsed.""" - - -def read_toml_content(path: Path, default: TOML_RESULT | None = None) -> TOML_RESULT: - try: - data = path.read_text(encoding="utf-8") - except FileNotFoundError: - if default is None: - raise - else: - log.debug("%s missing, presuming default %r", path, default) - return default - else: - try: - return load_toml(data) - except Exception as e: # tomllib/tomli raise different decode errors - raise InvalidTomlError(f"Invalid TOML in {path}") from e - - -class _CheatTomlData(TypedDict): - cheat: dict[str, Any] - - -def load_toml_or_inline_map(data: str | None) -> dict[str, Any]: - """ - load toml data - with a special hack if only a inline map is given - """ - if not data: - return {} - try: - if data[0] == "{": - data = "cheat=" + data - loaded: _CheatTomlData = cast(_CheatTomlData, load_toml(data)) - return loaded["cheat"] - return load_toml(data) - except Exception as e: # tomllib/tomli raise different decode errors - raise InvalidTomlError("Invalid TOML content") from e diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/version_inference.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/version_inference.py deleted file mode 100644 index 6258d90..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_integration/version_inference.py +++ /dev/null @@ -1,141 +0,0 @@ -from __future__ import annotations - -from dataclasses import dataclass -from typing import TYPE_CHECKING -from typing import Any -from typing import Union - -from setuptools import Distribution - -from .. 
import _log - -if TYPE_CHECKING: - from .pyproject_reading import PyProjectData - -log = _log.log.getChild("version_inference") - - -@dataclass -class VersionInferenceConfig: - """Configuration for version inference.""" - - dist_name: str | None - pyproject_data: PyProjectData | None - overrides: dict[str, Any] | None - - def apply(self, dist: Distribution) -> None: - """Apply version inference to the distribution.""" - version_string = infer_version_string( - self.dist_name, - self.pyproject_data, # type: ignore[arg-type] - self.overrides, - force_write_version_files=True, - ) - dist.metadata.version = version_string - - # Mark that this version was set by infer_version if overrides is None (infer_version context) - if self.overrides is None: - dist._setuptools_scm_version_set_by_infer = True # type: ignore[attr-defined] - - -@dataclass -class VersionInferenceWarning: - """Error message for user.""" - - message: str - - def apply(self, dist: Distribution) -> None: - """Apply error handling to the distribution.""" - import warnings - - warnings.warn(self.message) - - -@dataclass(frozen=True) -class VersionInferenceNoOp: - """No operation result - silent skip.""" - - def apply(self, dist: Distribution) -> None: - """Apply no-op to the distribution.""" - - -VersionInferenceResult = Union[ - VersionInferenceConfig, # Proceed with inference - VersionInferenceWarning, # Show warning - VersionInferenceNoOp, # Don't infer (silent) -] - - -def infer_version_string( - dist_name: str | None, - pyproject_data: PyProjectData, - overrides: dict[str, Any] | None = None, - *, - force_write_version_files: bool = False, -) -> str: - """ - Compute the inferred version string from the given inputs without requiring a - setuptools Distribution instance. This is a pure helper that simplifies - integration tests by avoiding file I/O and side effects on a Distribution. 
- - Parameters: - dist_name: Optional distribution name (used for overrides and env scoping) - pyproject_data: Parsed PyProjectData (may be constructed via for_testing()) - overrides: Optional override configuration (same keys as [tool.setuptools_scm]) - force_write_version_files: When True, apply write_to/version_file effects - - Returns: - The computed version string. - """ - from .. import _config as _config_module - from .._get_version_impl import _get_version - from .._get_version_impl import _version_missing - - config = _config_module.Configuration.from_file( - dist_name=dist_name, pyproject_data=pyproject_data, **(overrides or {}) - ) - - maybe_version = _get_version( - config, force_write_version_files=force_write_version_files - ) - if maybe_version is None: - _version_missing(config) - return maybe_version - - -def get_version_inference_config( - dist_name: str | None, - current_version: str | None, - pyproject_data: PyProjectData, - overrides: dict[str, Any] | None = None, -) -> VersionInferenceResult: - """ - Determine whether and how to perform version inference. 
- - Args: - dist_name: The distribution name - current_version: Current version if any - pyproject_data: PyProjectData from parser (None if file doesn't exist) - overrides: Override configuration (None for no overrides) - - Returns: - VersionInferenceResult with the decision and configuration - """ - - config = VersionInferenceConfig( - dist_name=dist_name, - pyproject_data=pyproject_data, - overrides=overrides, - ) - - inference_implied = pyproject_data.should_infer() or overrides is not None - - if inference_implied: - if current_version is None: - return config - else: - return VersionInferenceWarning( - f"version of {dist_name} already set", - ) - else: - return VersionInferenceNoOp() diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_log.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_log.py deleted file mode 100644 index ea17f37..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_log.py +++ /dev/null @@ -1,87 +0,0 @@ -""" -logging helpers, supports vendoring -""" - -from __future__ import annotations - -import contextlib -import logging -import os -import sys - -from typing import IO -from typing import Iterator -from typing import Mapping - -log = logging.getLogger(__name__.rsplit(".", 1)[0]) -log.propagate = False - - -class AlwaysStdErrHandler(logging.StreamHandler): # type: ignore[type-arg] - def __init__(self) -> None: - super().__init__(sys.stderr) - - @property - def stream(self) -> IO[str]: - return sys.stderr - - @stream.setter - def stream(self, value: IO[str]) -> None: - assert value is sys.stderr - - -def make_default_handler() -> logging.Handler: - try: - from rich.console import Console - - console = Console(stderr=True) - from rich.logging import RichHandler - - return RichHandler(console=console) - except ImportError: - last_resort = logging.lastResort - assert last_resort is not None - return last_resort - - -_default_handler = make_default_handler() - -log.addHandler(_default_handler) - - -def 
_default_log_level(_env: Mapping[str, str] = os.environ) -> int: - val: str | None = _env.get("SETUPTOOLS_SCM_DEBUG") - return logging.WARNING if val is None else logging.DEBUG - - -log.setLevel(_default_log_level()) - - -@contextlib.contextmanager -def defer_to_pytest() -> Iterator[None]: - log.propagate = True - old_level = log.level - log.setLevel(logging.NOTSET) - log.removeHandler(_default_handler) - try: - yield - finally: - log.addHandler(_default_handler) - log.propagate = False - log.setLevel(old_level) - - -@contextlib.contextmanager -def enable_debug(handler: logging.Handler = _default_handler) -> Iterator[None]: - log.addHandler(handler) - old_level = log.level - log.setLevel(logging.DEBUG) - old_handler_level = handler.level - handler.setLevel(logging.DEBUG) - try: - yield - finally: - log.setLevel(old_level) - handler.setLevel(old_handler_level) - if handler is not _default_handler: - log.removeHandler(handler) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_modify_version.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_modify_version.py deleted file mode 100644 index aae41a6..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_modify_version.py +++ /dev/null @@ -1,61 +0,0 @@ -from __future__ import annotations - -import re - -from . 
import _types as _t - - -def strip_local(version_string: str) -> str: - public = version_string.partition("+")[0] - return public - - -def _add_post(version: str) -> str: - if "post" in version: - raise ValueError( - f"{version} already is a post release, refusing to guess the update" - ) - return f"{version}.post1" - - -def _bump_dev(version: str) -> str | None: - if ".dev" not in version: - return None - - prefix, tail = version.rsplit(".dev", 1) - if tail != "0": - raise ValueError( - "choosing custom numbers for the `.devX` distance " - "is not supported.\n " - f"The {version} can't be bumped\n" - "Please drop the tag or create a new supported one ending in .dev0" - ) - return prefix - - -def _bump_regex(version: str) -> str: - match = re.match(r"(.*?)(\d+)$", version) - if match is None: - raise ValueError( - f"{version} does not end with a number to bump, " - "please correct or use a custom version scheme" - ) - else: - prefix, tail = match.groups() - return f"{prefix}{int(tail) + 1}" - - -def _format_local_with_time(version: _t.SCMVERSION, time_format: str) -> str: - if version.exact or version.node is None: - return version.format_choice( - "", "+d{time:{time_format}}", time_format=time_format - ) - else: - return version.format_choice( - "+{node}", "+{node}.d{time:{time_format}}", time_format=time_format - ) - - -def _dont_guess_next_version(tag_version: _t.SCMVERSION) -> str: - version = strip_local(str(tag_version.tag)) - return _bump_dev(version) or _add_post(version) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_node_utils.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_node_utils.py deleted file mode 100644 index 1a7a227..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_node_utils.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Private utilities for consistent node ID handling across SCM backends.""" - -from __future__ import annotations - -# Standard node ID length used across all SCM backends -_NODE_ID_LENGTH 
= 10 - - -def _slice_node_id(node_id: str) -> str: - """ - Slice a node ID to a consistent length. - - This ensures that all SCM backends (git, mercurial, archival) - return the same length node IDs for consistency. - - Args: - node_id: The full node ID/hash from the SCM - - Returns: - The node ID sliced to the standard length - """ - return node_id[:_NODE_ID_LENGTH] - - -def _format_node_for_output(node_id: str | None) -> str | None: - """ - Format a node ID for output, applying consistent slicing. - - Args: - node_id: The full node ID/hash from the SCM or None - - Returns: - The node ID sliced to standard length for output, or None if input was None - """ - if node_id is None: - return None - - # Handle mercurial nodes with 'h' prefix - if node_id.startswith("h"): - # For mercurial nodes, slice the part after 'h' and reconstruct - hg_hash = node_id[1:] # Remove 'h' prefix - sliced_hash = _slice_node_id(hg_hash) - return "h" + sliced_hash - - # For git nodes (with or without 'g' prefix) and others - return _slice_node_id(node_id) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_overrides.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_overrides.py deleted file mode 100644 index 4e06b7a..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_overrides.py +++ /dev/null @@ -1,298 +0,0 @@ -from __future__ import annotations - -import dataclasses -import os - -from difflib import get_close_matches -from typing import Any -from typing import Mapping - -from packaging.utils import canonicalize_name - -from . import _config -from . import _log -from . 
import version -from ._integration.toml import load_toml_or_inline_map - -log = _log.log.getChild("overrides") - -PRETEND_KEY = "SETUPTOOLS_SCM_PRETEND_VERSION" -PRETEND_KEY_NAMED = PRETEND_KEY + "_FOR_{name}" -PRETEND_METADATA_KEY = "SETUPTOOLS_SCM_PRETEND_METADATA" -PRETEND_METADATA_KEY_NAMED = PRETEND_METADATA_KEY + "_FOR_{name}" - - -def _search_env_vars_with_prefix( - prefix: str, dist_name: str, env: Mapping[str, str] -) -> list[tuple[str, str]]: - """Search environment variables with a given prefix for potential dist name matches. - - Args: - prefix: The environment variable prefix (e.g., "SETUPTOOLS_SCM_PRETEND_VERSION_FOR_") - dist_name: The original dist name to match against - env: Environment dictionary to search in - - Returns: - List of (env_var_name, env_var_value) tuples for potential matches - """ - # Get the canonical name for comparison - canonical_dist_name = canonicalize_name(dist_name) - - matches = [] - for env_var, value in env.items(): - if env_var.startswith(prefix): - suffix = env_var[len(prefix) :] - # Normalize the suffix and compare to canonical dist name - try: - normalized_suffix = canonicalize_name(suffix.lower().replace("_", "-")) - if normalized_suffix == canonical_dist_name: - matches.append((env_var, value)) - except Exception: - # If normalization fails for any reason, skip this env var - continue - - return matches - - -def _find_close_env_var_matches( - prefix: str, expected_suffix: str, env: Mapping[str, str], threshold: float = 0.6 -) -> list[str]: - """Find environment variables with similar suffixes that might be typos. 
- - Args: - prefix: The environment variable prefix - expected_suffix: The expected suffix (canonicalized dist name in env var format) - env: Environment dictionary to search in - threshold: Similarity threshold for matches (0.0 to 1.0) - - Returns: - List of environment variable names that are close matches - """ - candidates = [] - for env_var in env: - if env_var.startswith(prefix): - suffix = env_var[len(prefix) :] - candidates.append(suffix) - - # Use difflib to find close matches - close_matches = get_close_matches( - expected_suffix, candidates, n=3, cutoff=threshold - ) - - return [f"{prefix}{match}" for match in close_matches if match != expected_suffix] - - -def read_named_env( - *, - tool: str = "SETUPTOOLS_SCM", - name: str, - dist_name: str | None, - env: Mapping[str, str] = os.environ, -) -> str | None: - """Read a named environment variable, with fallback search for dist-specific variants. - - This function first tries the standard normalized environment variable name. - If that's not found and a dist_name is provided, it searches for alternative - normalizations and warns about potential issues. 
- - Args: - tool: The tool prefix (default: "SETUPTOOLS_SCM") - name: The environment variable name component - dist_name: The distribution name for dist-specific variables - env: Environment dictionary to search in (defaults to os.environ) - - Returns: - The environment variable value if found, None otherwise - """ - - # First try the generic version - generic_val = env.get(f"{tool}_{name}") - - if dist_name is not None: - # Normalize the dist name using packaging.utils.canonicalize_name - canonical_dist_name = canonicalize_name(dist_name) - env_var_dist_name = canonical_dist_name.replace("-", "_").upper() - expected_env_var = f"{tool}_{name}_FOR_{env_var_dist_name}" - - # Try the standard normalized name first - val = env.get(expected_env_var) - if val is not None: - return val - - # If not found, search for alternative normalizations - prefix = f"{tool}_{name}_FOR_" - alternative_matches = _search_env_vars_with_prefix(prefix, dist_name, env) - - if alternative_matches: - # Found alternative matches - use the first one but warn - env_var, value = alternative_matches[0] - log.warning( - "Found environment variable '%s' for dist name '%s', " - "but expected '%s'. Consider using the standard normalized name.", - env_var, - dist_name, - expected_env_var, - ) - if len(alternative_matches) > 1: - other_vars = [var for var, _ in alternative_matches[1:]] - log.warning( - "Multiple alternative environment variables found: %s. Using '%s'.", - other_vars, - env_var, - ) - return value - - # No exact or alternative matches found - look for potential typos - close_matches = _find_close_env_var_matches(prefix, env_var_dist_name, env) - if close_matches: - log.warning( - "Environment variable '%s' not found for dist name '%s' " - "(canonicalized as '%s'). Did you mean one of these? 
%s", - expected_env_var, - dist_name, - canonical_dist_name, - close_matches, - ) - - return generic_val - - -def _read_pretended_metadata_for( - config: _config.Configuration, -) -> dict[str, Any] | None: - """read overridden metadata from the environment - - tries ``SETUPTOOLS_SCM_PRETEND_METADATA`` - and ``SETUPTOOLS_SCM_PRETEND_METADATA_FOR_$UPPERCASE_DIST_NAME`` - - Returns a dictionary with metadata field overrides like: - {"node": "g1337beef", "distance": 4} - """ - log.debug("dist name: %s", config.dist_name) - - pretended = read_named_env(name="PRETEND_METADATA", dist_name=config.dist_name) - - if pretended: - try: - metadata_overrides = load_toml_or_inline_map(pretended) - # Validate that only known ScmVersion fields are provided - valid_fields = { - "tag", - "distance", - "node", - "dirty", - "preformatted", - "branch", - "node_date", - "time", - } - invalid_fields = set(metadata_overrides.keys()) - valid_fields - if invalid_fields: - log.warning( - "Invalid metadata fields in pretend metadata: %s. " - "Valid fields are: %s", - invalid_fields, - valid_fields, - ) - # Remove invalid fields but continue processing - for field in invalid_fields: - metadata_overrides.pop(field) - - return metadata_overrides or None - except Exception as e: - log.error("Failed to parse pretend metadata: %s", e) - return None - else: - return None - - -def _apply_metadata_overrides( - scm_version: version.ScmVersion | None, - config: _config.Configuration, -) -> version.ScmVersion | None: - """Apply metadata overrides to a ScmVersion object. - - This function reads pretend metadata from environment variables and applies - the overrides to the given ScmVersion. TOML type coercion is used so values - should be provided in their correct types (int, bool, datetime, etc.). 
- - Args: - scm_version: The ScmVersion to apply overrides to, or None - config: Configuration object - - Returns: - Modified ScmVersion with overrides applied, or None - """ - metadata_overrides = _read_pretended_metadata_for(config) - - if not metadata_overrides: - return scm_version - - if scm_version is None: - log.warning( - "PRETEND_METADATA specified but no base version found. " - "Metadata overrides cannot be applied without a base version." - ) - return None - - log.info("Applying metadata overrides: %s", metadata_overrides) - - # Define type checks and field mappings - from datetime import date - from datetime import datetime - - field_specs: dict[str, tuple[type | tuple[type, type], str]] = { - "distance": (int, "int"), - "dirty": (bool, "bool"), - "preformatted": (bool, "bool"), - "node_date": (date, "date"), - "time": (datetime, "datetime"), - "node": ((str, type(None)), "str or None"), - "branch": ((str, type(None)), "str or None"), - # tag is special - can be multiple types, handled separately - } - - # Apply each override individually using dataclasses.replace for type safety - result = scm_version - - for field, value in metadata_overrides.items(): - if field in field_specs: - expected_type, type_name = field_specs[field] - assert isinstance(value, expected_type), ( - f"{field} must be {type_name}, got {type(value).__name__}: {value!r}" - ) - result = dataclasses.replace(result, **{field: value}) - elif field == "tag": - # tag can be Version, NonNormalizedVersion, or str - we'll let the assignment handle validation - result = dataclasses.replace(result, tag=value) - else: - # This shouldn't happen due to validation in _read_pretended_metadata_for - log.warning("Unknown field '%s' in metadata overrides", field) - - # Ensure config is preserved (should not be overridden) - assert result.config is config, "Config must be preserved during metadata overrides" - - return result - - -def _read_pretended_version_for( - config: _config.Configuration, -) -> 
version.ScmVersion | None: - """read a a overridden version from the environment - - tries ``SETUPTOOLS_SCM_PRETEND_VERSION`` - and ``SETUPTOOLS_SCM_PRETEND_VERSION_FOR_$UPPERCASE_DIST_NAME`` - """ - log.debug("dist name: %s", config.dist_name) - - pretended = read_named_env(name="PRETEND_VERSION", dist_name=config.dist_name) - - if pretended: - return version.meta(tag=pretended, preformatted=True, config=config) - else: - return None - - -def read_toml_overrides(dist_name: str | None) -> dict[str, Any]: - data = read_named_env(name="OVERRIDES", dist_name=dist_name) - return load_toml_or_inline_map(data) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_requirement_cls.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_requirement_cls.py deleted file mode 100644 index 9bb8846..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_requirement_cls.py +++ /dev/null @@ -1,34 +0,0 @@ -from __future__ import annotations - -__all__ = ["Requirement", "extract_package_name"] - -try: - from packaging.requirements import Requirement - from packaging.utils import canonicalize_name -except ImportError: - from setuptools.extern.packaging.requirements import ( # type: ignore[import-not-found,no-redef] - Requirement as Requirement, - ) - from setuptools.extern.packaging.utils import ( # type: ignore[import-not-found,no-redef] - canonicalize_name as canonicalize_name, - ) - -from . import _log - -log = _log.log.getChild("requirement_cls") - - -def extract_package_name(requirement_string: str) -> str: - """Extract the canonical package name from a requirement string. - - This function uses packaging.requirements.Requirement to properly parse - the requirement and extract the package name, handling all edge cases - that the custom regex-based approach might miss. 
- - Args: - requirement_string: The requirement string to parse - - Returns: - The package name as a string - """ - return canonicalize_name(Requirement(requirement_string).name) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_run_cmd.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_run_cmd.py deleted file mode 100644 index 2dff636..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_run_cmd.py +++ /dev/null @@ -1,221 +0,0 @@ -from __future__ import annotations - -import os -import shlex -import subprocess -import textwrap -import warnings - -from typing import TYPE_CHECKING -from typing import Callable -from typing import Final -from typing import Mapping -from typing import Sequence -from typing import TypeVar -from typing import overload - -from . import _log -from . import _types as _t - -if TYPE_CHECKING: - BaseCompletedProcess = subprocess.CompletedProcess[str] -else: - BaseCompletedProcess = subprocess.CompletedProcess - -# pick 40 seconds -# unfortunately github CI for windows sometimes needs -# up to 30 seconds to start a command - - -def _get_timeout(env: Mapping[str, str]) -> int: - return int(env.get("SETUPTOOLS_SCM_SUBPROCESS_TIMEOUT") or 40) - - -BROKEN_TIMEOUT: Final[int] = _get_timeout(os.environ) - -log = _log.log.getChild("run_cmd") - -PARSE_RESULT = TypeVar("PARSE_RESULT") -T = TypeVar("T") - - -class CompletedProcess(BaseCompletedProcess): - @classmethod - def from_raw( - cls, input: BaseCompletedProcess, strip: bool = True - ) -> CompletedProcess: - return cls( - args=input.args, - returncode=input.returncode, - stdout=input.stdout.strip() if strip and input.stdout else input.stdout, - stderr=input.stderr.strip() if strip and input.stderr else input.stderr, - ) - - @overload - def parse_success( - self, - parse: Callable[[str], PARSE_RESULT], - default: None = None, - error_msg: str | None = None, - ) -> PARSE_RESULT | None: ... 
- - @overload - def parse_success( - self, - parse: Callable[[str], PARSE_RESULT], - default: T, - error_msg: str | None = None, - ) -> PARSE_RESULT | T: ... - - def parse_success( - self, - parse: Callable[[str], PARSE_RESULT], - default: T | None = None, - error_msg: str | None = None, - ) -> PARSE_RESULT | T | None: - if self.returncode: - if error_msg: - log.warning("%s %s", error_msg, self) - return default - else: - return parse(self.stdout) - - -KEEP_GIT_ENV = ( - "GIT_CEILING_DIRECTORIES", - "GIT_EXEC_PATH", - "GIT_SSH", - "GIT_SSH_COMMAND", - "GIT_AUTHOR_DATE", - "GIT_COMMITTER_DATE", -) - - -def no_git_env(env: Mapping[str, str]) -> dict[str, str]: - # adapted from pre-commit - # Too many bugs dealing with environment variables and GIT: - # https://github.com/pre-commit/pre-commit/issues/300 - # In git 2.6.3 (maybe others), git exports GIT_WORK_TREE while running - # pre-commit hooks - # In git 1.9.1 (maybe others), git exports GIT_DIR and GIT_INDEX_FILE - # while running pre-commit hooks in submodules. - # GIT_DIR: Causes git clone to clone wrong thing - # GIT_INDEX_FILE: Causes 'error invalid object ...' during commit - for k, v in env.items(): - if k.startswith("GIT_"): - log.debug("%s: %s", k, v) - return { - k: v for k, v in env.items() if not k.startswith("GIT_") or k in KEEP_GIT_ENV - } - - -def avoid_pip_isolation(env: Mapping[str, str]) -> dict[str, str]: - """ - pip build isolation can break Mercurial - (see https://github.com/pypa/pip/issues/10635) - - pip uses PYTHONNOUSERSITE and a path in PYTHONPATH containing "pip-build-env-". 
- """ - new_env = {k: v for k, v in env.items() if k != "PYTHONNOUSERSITE"} - if "PYTHONPATH" not in new_env: - return new_env - - new_env["PYTHONPATH"] = os.pathsep.join( - [ - path - for path in new_env["PYTHONPATH"].split(os.pathsep) - if "-build-env-" not in path - ] - ) - return new_env - - -def ensure_stripped_str(str_or_bytes: str | bytes) -> str: - if isinstance(str_or_bytes, str): - return str_or_bytes.strip() - else: - return str_or_bytes.decode("utf-8", "surrogateescape").strip() - - -def run( - cmd: _t.CMD_TYPE, - cwd: _t.PathT, - *, - strip: bool = True, - trace: bool = True, - timeout: int | None = None, - check: bool = False, -) -> CompletedProcess: - if isinstance(cmd, str): - cmd = shlex.split(cmd) - else: - cmd = [os.fspath(x) for x in cmd] - cmd_4_trace = " ".join(map(_unsafe_quote_for_display, cmd)) - log.debug("at %s\n $ %s ", cwd, cmd_4_trace) - if timeout is None: - timeout = BROKEN_TIMEOUT - res = subprocess.run( - cmd, - capture_output=True, - cwd=os.fspath(cwd), - env=dict( - avoid_pip_isolation(no_git_env(os.environ)), - # os.environ, - # try to disable i18n, but still allow UTF-8 encoded text. - LC_ALL="C.UTF-8", - LANGUAGE="", - HGPLAIN="1", - ), - text=True, - encoding="utf-8", - timeout=timeout, - ) - - res = CompletedProcess.from_raw(res, strip=strip) - if trace: - if res.stdout: - log.debug("out:\n%s", textwrap.indent(res.stdout, " ")) - if res.stderr: - log.debug("err:\n%s", textwrap.indent(res.stderr, " ")) - if res.returncode: - log.debug("ret: %s", res.returncode) - if check: - res.check_returncode() - return res - - -def _unsafe_quote_for_display(item: _t.PathT) -> str: - # give better results than shlex.join in our cases - text = os.fspath(item) - return text if all(c not in text for c in " {[:") else f'"{text}"' - - -def has_command( - name: str, args: Sequence[str] = ["version"], warn: bool = True -) -> bool: - try: - p = run([name, *args], cwd=".") - if p.returncode != 0: - log.error("Command '%s' returned non-zero. 
This is stderr:", name) - log.error(p.stderr) - except OSError as e: - log.warning("command %s missing: %s", name, e) - res = False - except subprocess.TimeoutExpired as e: - log.warning("command %s timed out %s", name, e) - res = False - - else: - res = not p.returncode - if not res and warn: - warnings.warn(f"{name!r} was not found", category=RuntimeWarning) - return res - - -class CommandNotFoundError(LookupError, FileNotFoundError): - pass - - -def require_command(name: str) -> None: - if not has_command(name, warn=False): - raise CommandNotFoundError(name) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_types.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_types.py deleted file mode 100644 index 4f8874f..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_types.py +++ /dev/null @@ -1,61 +0,0 @@ -from __future__ import annotations - -import os - -from typing import TYPE_CHECKING -from typing import Callable -from typing import List -from typing import Protocol -from typing import Sequence -from typing import Tuple -from typing import Union - -from setuptools import Distribution - -if TYPE_CHECKING: - import sys - - if sys.version_info >= (3, 10): - from typing import TypeAlias - else: - from typing_extensions import TypeAlias - - from . 
import version - from ._integration.pyproject_reading import PyProjectData - from ._integration.toml import InvalidTomlError - -PathT: TypeAlias = Union["os.PathLike[str]", str] - -CMD_TYPE: TypeAlias = Union[Sequence[PathT], str] - -VERSION_SCHEME: TypeAlias = Union[str, Callable[["version.ScmVersion"], str]] -VERSION_SCHEMES: TypeAlias = Union[List[str], Tuple[str, ...], VERSION_SCHEME] -SCMVERSION: TypeAlias = "version.ScmVersion" - -# Git pre-parse function types -GIT_PRE_PARSE: TypeAlias = Union[str, None] - -# Testing injection types for configuration reading -GivenPyProjectResult: TypeAlias = Union[ - "PyProjectData", "InvalidTomlError", FileNotFoundError, None -] - - -class VersionInferenceApplicable(Protocol): - """A result object from version inference decision that can be applied to a dist.""" - - def apply(self, dist: Distribution) -> None: # pragma: no cover - structural type - ... - - -class GetVersionInferenceConfig(Protocol): - """Callable protocol for the decision function used by integration points.""" - - def __call__( - self, - dist_name: str | None, - current_version: str | None, - pyproject_data: PyProjectData, - overrides: dict[str, object] | None = None, - ) -> VersionInferenceApplicable: # pragma: no cover - structural type - ... 
diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_version_cls.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_version_cls.py deleted file mode 100644 index e0fe387..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/_version_cls.py +++ /dev/null @@ -1,101 +0,0 @@ -from __future__ import annotations - -from typing import Type -from typing import Union -from typing import cast - -try: - from packaging.version import InvalidVersion - from packaging.version import Version as Version -except ImportError: - from setuptools.extern.packaging.version import ( # type: ignore[import-not-found, no-redef] - InvalidVersion, - ) - from setuptools.extern.packaging.version import ( # type: ignore[no-redef] - Version as Version, - ) -from . import _log - -log = _log.log.getChild("version_cls") - - -class NonNormalizedVersion(Version): - """A non-normalizing version handler. - - You can use this class to preserve version verification but skip normalization. - For example you can use this to avoid git release candidate version tags - ("1.0.0-rc1") to be normalized to "1.0.0rc1". Only use this if you fully - trust the version tags. - """ - - def __init__(self, version: str) -> None: - # parse and validate using parent - super().__init__(version) - - # store raw for str - self._raw_version = version - - def __str__(self) -> str: - # return the non-normalized version (parent returns the normalized) - return self._raw_version - - def __repr__(self) -> str: - # same pattern as parent - return f"" - - -def _version_as_tuple(version_str: str) -> tuple[int | str, ...]: - try: - parsed_version = Version(version_str) - except InvalidVersion as e: - log.error("failed to parse version %s: %s", e, version_str) - return (version_str,) - else: - version_fields: tuple[int | str, ...] 
= parsed_version.release - if parsed_version.epoch: - version_fields = (f"{parsed_version.epoch}!", *version_fields) - if parsed_version.pre is not None: - version_fields += (f"{parsed_version.pre[0]}{parsed_version.pre[1]}",) - - if parsed_version.post is not None: - version_fields += (f"post{parsed_version.post}",) - - if parsed_version.dev is not None: - version_fields += (f"dev{parsed_version.dev}",) - - if parsed_version.local is not None: - version_fields += (parsed_version.local,) - return version_fields - - -_VersionT = Union[Version, NonNormalizedVersion] - - -def import_name(name: str) -> object: - import importlib - - pkg_name, cls_name = name.rsplit(".", 1) - pkg = importlib.import_module(pkg_name) - return getattr(pkg, cls_name) - - -def _validate_version_cls( - version_cls: type[_VersionT] | str | None, normalize: bool -) -> type[_VersionT]: - if not normalize: - if version_cls is not None: - raise ValueError( - "Providing a custom `version_cls` is not permitted when " - "`normalize=False`" - ) - return NonNormalizedVersion - # Use `version_cls` if provided, default to packaging or pkg_resources - elif version_cls is None: - return Version - elif isinstance(version_cls, str): - try: - return cast(Type[_VersionT], import_name(version_cls)) - except Exception: - raise ValueError(f"Unable to import version_cls='{version_cls}'") from None - else: - return version_cls diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/discover.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/discover.py deleted file mode 100644 index e8208ca..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/discover.py +++ /dev/null @@ -1,74 +0,0 @@ -from __future__ import annotations - -import os - -from pathlib import Path -from typing import TYPE_CHECKING -from typing import Iterable -from typing import Iterator - -from . import _entrypoints -from . import _log -from . 
import _types as _t -from ._config import Configuration - -if TYPE_CHECKING: - from ._entrypoints import im - - -log = _log.log.getChild("discover") - - -def walk_potential_roots(root: _t.PathT, search_parents: bool = True) -> Iterator[Path]: - """ - Iterate though a path and each of its parents. - :param root: File path. - :param search_parents: If ``False`` the parents are not considered. - """ - root = Path(root) - yield root - if search_parents: - yield from root.parents - - -def match_entrypoint(root: _t.PathT, name: str) -> bool: - """ - Consider a ``root`` as entry-point. - :param root: File path. - :param name: Subdirectory name. - :return: ``True`` if a subdirectory ``name`` exits in ``root``. - """ - - if os.path.exists(os.path.join(root, name)): - if not os.path.isabs(name): - return True - log.debug("ignoring bad ep %s", name) - - return False - - -# blocked entrypints from legacy plugins -_BLOCKED_EP_TARGETS = {"setuptools_scm_git_archive:parse"} - - -def iter_matching_entrypoints( - root: _t.PathT, entrypoint: str, config: Configuration -) -> Iterable[im.EntryPoint]: - """ - Consider different entry-points in ``root`` and optionally its parents. - :param root: File path. - :param entrypoint: Entry-point to consider. - :param config: Configuration, - read ``search_parent_directories``, write found parent to ``parent``. 
- """ - - log.debug("looking for ep %s in %s", entrypoint, root) - - for wd in walk_potential_roots(root, config.search_parent_directories): - for ep in _entrypoints.entry_points(group=entrypoint): - if ep.value in _BLOCKED_EP_TARGETS: - continue - if match_entrypoint(wd, ep.name): - log.debug("found ep %s in %s", ep, wd) - config.parent = wd - yield ep diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/fallbacks.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/fallbacks.py deleted file mode 100644 index 45a7535..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/fallbacks.py +++ /dev/null @@ -1,45 +0,0 @@ -from __future__ import annotations - -import logging -import os - -from pathlib import Path -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from . import _types as _t -from . import Configuration -from .integration import data_from_mime -from .version import ScmVersion -from .version import meta -from .version import tag_to_version - -log = logging.getLogger(__name__) - -_UNKNOWN = "UNKNOWN" - - -def parse_pkginfo(root: _t.PathT, config: Configuration) -> ScmVersion | None: - pkginfo = Path(root) / "PKG-INFO" - log.debug("pkginfo %s", pkginfo) - data = data_from_mime(pkginfo) - version = data.get("Version", _UNKNOWN) - if version != _UNKNOWN: - return meta(version, preformatted=True, config=config) - else: - return None - - -def fallback_version(root: _t.PathT, config: Configuration) -> ScmVersion | None: - if config.parentdir_prefix_version is not None: - _, parent_name = os.path.split(os.path.abspath(root)) - if parent_name.startswith(config.parentdir_prefix_version): - version = tag_to_version( - parent_name[len(config.parentdir_prefix_version) :], config - ) - if version is not None: - return meta(str(version), preformatted=True, config=config) - if config.fallback_version is not None: - log.debug("FALLBACK %s", config.fallback_version) - return meta(config.fallback_version, preformatted=True, config=config) - 
return None diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/git.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/git.py deleted file mode 100644 index 966ab69..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/git.py +++ /dev/null @@ -1,454 +0,0 @@ -from __future__ import annotations - -import dataclasses -import logging -import os -import re -import shlex -import sys -import warnings - -from datetime import date -from datetime import datetime -from datetime import timezone -from enum import Enum -from os.path import samefile -from pathlib import Path -from typing import TYPE_CHECKING -from typing import Callable -from typing import Sequence - -from . import Configuration -from . import _types as _t -from . import discover -from ._run_cmd import CompletedProcess as _CompletedProcess -from ._run_cmd import require_command as _require_command -from ._run_cmd import run as _run -from .integration import data_from_mime -from .scm_workdir import Workdir -from .scm_workdir import get_latest_file_mtime -from .version import ScmVersion -from .version import meta -from .version import tag_to_version - -if TYPE_CHECKING: - from . import hg_git -log = logging.getLogger(__name__) - -REF_TAG_RE = re.compile(r"(?<=\btag: )([^,]+)\b") -DESCRIBE_UNSUPPORTED = "%(describe" - -# If testing command in shell make sure to quote the match argument like -# '*[0-9]*' as it will expand before being sent to git if there are any matching -# files in current directory. 
-DEFAULT_DESCRIBE = [ - "git", - "describe", - "--dirty", - "--tags", - "--long", - "--abbrev=40", - "--match", - "*[0-9]*", -] - - -class GitPreParse(Enum): - """Available git pre-parse functions""" - - WARN_ON_SHALLOW = "warn_on_shallow" - FAIL_ON_SHALLOW = "fail_on_shallow" - FETCH_ON_SHALLOW = "fetch_on_shallow" - FAIL_ON_MISSING_SUBMODULES = "fail_on_missing_submodules" - - -def run_git( - args: Sequence[str | os.PathLike[str]], - repo: Path, - *, - check: bool = False, - timeout: int | None = None, -) -> _CompletedProcess: - return _run( - ["git", "--git-dir", repo / ".git", *args], - cwd=repo, - check=check, - timeout=timeout, - ) - - -class GitWorkdir(Workdir): - """experimental, may change at any time""" - - @classmethod - def from_potential_worktree(cls, wd: _t.PathT) -> GitWorkdir | None: - wd = Path(wd).resolve() - real_wd = run_git(["rev-parse", "--show-prefix"], wd).parse_success(parse=str) - if real_wd is None: - return None - else: - real_wd = real_wd[:-1] # remove the trailing pathsep - - if not real_wd: - real_wd = os.fspath(wd) - else: - str_wd = os.fspath(wd) - from ._compat import strip_path_suffix - - real_wd = strip_path_suffix(str_wd, real_wd) - log.debug("real root %s", real_wd) - if not samefile(real_wd, wd): - return None - - return cls(Path(real_wd)) - - def is_dirty(self) -> bool: - return run_git( - ["status", "--porcelain", "--untracked-files=no"], self.path - ).parse_success( - parse=bool, - default=False, - ) - - def get_branch(self) -> str | None: - return run_git( - ["rev-parse", "--abbrev-ref", "HEAD"], - self.path, - ).parse_success( - parse=str, - error_msg="branch err (abbrev-err)", - ) or run_git( - ["symbolic-ref", "--short", "HEAD"], - self.path, - ).parse_success( - parse=str, - error_msg="branch err (symbolic-ref)", - ) - - def get_head_date(self) -> date | None: - def parse_timestamp(timestamp_text: str) -> date | None: - if "%c" in timestamp_text: - log.warning("git too old -> timestamp is %r", timestamp_text) - return 
None - if sys.version_info < (3, 11) and timestamp_text.endswith("Z"): - timestamp_text = timestamp_text[:-1] + "+00:00" - - # Convert to UTC to ensure consistent date regardless of local timezone - dt = datetime.fromisoformat(timestamp_text) - log.debug("dt: %s", dt) - dt_utc = dt.astimezone(timezone.utc).date() - log.debug("dt utc: %s", dt_utc) - return dt_utc - - res = run_git( - [ - *("-c", "log.showSignature=false"), - *("log", "-n", "1", "HEAD"), - "--format=%cI", - ], - self.path, - ) - return res.parse_success( - parse=parse_timestamp, - error_msg="logging the iso date for head failed", - ) - - def get_dirty_tag_date(self) -> date | None: - """Get the latest modification time of changed files in the working directory. - - Returns the date of the most recently modified file that has changes, - or None if no files are changed or if an error occurs. - """ - if not self.is_dirty(): - return None - - try: - # Get list of changed files - changed_files_res = run_git(["diff", "--name-only"], self.path) - if changed_files_res.returncode != 0: - return None - - changed_files = changed_files_res.stdout.strip().split("\n") - return get_latest_file_mtime(changed_files, self.path) - - except Exception as e: - log.debug("Failed to get dirty tag date: %s", e) - return None - - def is_shallow(self) -> bool: - return self.path.joinpath(".git/shallow").is_file() - - def fetch_shallow(self) -> None: - run_git(["fetch", "--unshallow"], self.path, check=True, timeout=240) - - def node(self) -> str | None: - return run_git( - ["rev-parse", "--verify", "--quiet", "HEAD"], self.path - ).parse_success( - parse=str, - ) - - def count_all_nodes(self) -> int: - res = run_git(["rev-list", "HEAD"], self.path) - return res.stdout.count("\n") + 1 - - def default_describe(self) -> _CompletedProcess: - return run_git(DEFAULT_DESCRIBE[1:], self.path) - - -def warn_on_shallow(wd: GitWorkdir) -> None: - """experimental, may change at any time""" - if wd.is_shallow(): - 
warnings.warn(f'"{wd.path}" is shallow and may cause errors') - - -def fetch_on_shallow(wd: GitWorkdir) -> None: - """experimental, may change at any time""" - if wd.is_shallow(): - warnings.warn(f'"{wd.path}" was shallow, git fetch was used to rectify') - wd.fetch_shallow() - - -def fail_on_shallow(wd: GitWorkdir) -> None: - """experimental, may change at any time""" - if wd.is_shallow(): - raise ValueError( - f'{wd.path} is shallow, please correct with "git fetch --unshallow"' - ) - - -def fail_on_missing_submodules(wd: GitWorkdir) -> None: - """ - Fail if submodules are defined but not initialized/cloned. - - This pre_parse function checks if there are submodules defined in .gitmodules - but not properly initialized (cloned). This helps prevent packaging incomplete - projects when submodules are required for a complete build. - """ - gitmodules_path = wd.path / ".gitmodules" - if not gitmodules_path.exists(): - # No submodules defined, nothing to check - return - - # Get submodule status - lines starting with '-' indicate uninitialized submodules - status_result = run_git(["submodule", "status"], wd.path) - if status_result.returncode != 0: - # Command failed, might not be in a git repo or other error - log.debug("Failed to check submodule status: %s", status_result.stderr) - return - - status_lines = ( - status_result.stdout.strip().split("\n") if status_result.stdout.strip() else [] - ) - uninitialized_submodules = [] - - for line in status_lines: - line = line.strip() - if line.startswith("-"): - # Extract submodule path (everything after the commit hash) - parts = line.split() - if len(parts) >= 2: - submodule_path = parts[1] - uninitialized_submodules.append(submodule_path) - - # If .gitmodules exists but git submodule status returns nothing, - # it means submodules are defined but not properly set up (common after cloning without --recurse-submodules) - if not status_lines and gitmodules_path.exists(): - raise ValueError( - f"Submodules are defined in 
.gitmodules but not initialized in {wd.path}. " - f"Please run 'git submodule update --init --recursive' to initialize them." - ) - - if uninitialized_submodules: - submodule_list = ", ".join(uninitialized_submodules) - raise ValueError( - f"Submodules are not initialized in {wd.path}: {submodule_list}. " - f"Please run 'git submodule update --init --recursive' to initialize them." - ) - - -# Mapping from enum items to actual pre_parse functions -_GIT_PRE_PARSE_FUNCTIONS: dict[GitPreParse, Callable[[GitWorkdir], None]] = { - GitPreParse.WARN_ON_SHALLOW: warn_on_shallow, - GitPreParse.FAIL_ON_SHALLOW: fail_on_shallow, - GitPreParse.FETCH_ON_SHALLOW: fetch_on_shallow, - GitPreParse.FAIL_ON_MISSING_SUBMODULES: fail_on_missing_submodules, -} - - -def get_working_directory(config: Configuration, root: _t.PathT) -> GitWorkdir | None: - """ - Return the working directory (``GitWorkdir``). - """ - - if config.parent: # todo broken - return GitWorkdir.from_potential_worktree(config.parent) - - for potential_root in discover.walk_potential_roots( - root, search_parents=config.search_parent_directories - ): - potential_wd = GitWorkdir.from_potential_worktree(potential_root) - if potential_wd is not None: - return potential_wd - - return GitWorkdir.from_potential_worktree(root) - - -def parse( - root: _t.PathT, - config: Configuration, - describe_command: str | list[str] | None = None, - pre_parse: Callable[[GitWorkdir], None] | None = None, -) -> ScmVersion | None: - """ - :param pre_parse: experimental pre_parse action, may change at any time. - Takes precedence over config.git_pre_parse if provided. 
- """ - _require_command("git") - wd = get_working_directory(config, root) - if wd: - # Use function parameter first, then config setting, then default - if pre_parse is not None: - effective_pre_parse = pre_parse - else: - # config.scm.git.pre_parse is always a GitPreParse enum instance - effective_pre_parse = _GIT_PRE_PARSE_FUNCTIONS.get( - config.scm.git.pre_parse, warn_on_shallow - ) - - return _git_parse_inner( - config, wd, describe_command=describe_command, pre_parse=effective_pre_parse - ) - else: - return None - - -def version_from_describe( - wd: GitWorkdir | hg_git.GitWorkdirHgClient, - config: Configuration, - describe_command: _t.CMD_TYPE | None, -) -> ScmVersion | None: - if config.scm.git.describe_command is not None: - describe_command = config.scm.git.describe_command - - if describe_command is not None: - if isinstance(describe_command, str): - describe_command = shlex.split(describe_command) - # todo: figure how to ensure git with gitdir gets correctly invoked - if describe_command[0] == "git": - describe_res = run_git(describe_command[1:], wd.path) - else: - describe_res = _run(describe_command, wd.path) - else: - describe_res = wd.default_describe() - - def parse_describe(output: str) -> ScmVersion: - tag, distance, node, dirty = _git_parse_describe(output) - return meta(tag=tag, distance=distance, dirty=dirty, node=node, config=config) - - return describe_res.parse_success(parse=parse_describe) - - -def _git_parse_inner( - config: Configuration, - wd: GitWorkdir | hg_git.GitWorkdirHgClient, - pre_parse: (Callable[[GitWorkdir | hg_git.GitWorkdirHgClient], None]) | None = None, - describe_command: _t.CMD_TYPE | None = None, -) -> ScmVersion: - if pre_parse: - pre_parse(wd) - - version = version_from_describe(wd, config, describe_command) - - if version is None: - # If 'git git_describe_command' failed, try to get the information otherwise. 
- tag = config.version_cls(config.fallback_version or "0.0") - node = wd.node() - if node is None: - distance = 0 - dirty = True - else: - distance = wd.count_all_nodes() - node = "g" + node - dirty = wd.is_dirty() - version = meta( - tag=tag, distance=distance, dirty=dirty, node=node, config=config - ) - branch = wd.get_branch() - node_date = wd.get_head_date() - - # If we can't get node_date from HEAD (e.g., no commits yet), - # and the working directory is dirty, try to use the latest - # modification time of changed files instead of current time - if node_date is None and wd.is_dirty(): - dirty_date = wd.get_dirty_tag_date() - if dirty_date is not None: - node_date = dirty_date - - # Final fallback to current time - if node_date is None: - node_date = datetime.now(timezone.utc).date() - - return dataclasses.replace(version, branch=branch, node_date=node_date) - - -def _git_parse_describe( - describe_output: str, -) -> tuple[str, int, str | None, bool]: - # 'describe_output' looks e.g. like 'v1.5.0-0-g4060507' or - # 'v1.15.1rc1-37-g9bd1298-dirty'. - # It may also just be a bare tag name if this is a tagged commit and we are - # parsing a .git_archival.txt file. 
- - if describe_output.endswith("-dirty"): - dirty = True - describe_output = describe_output[:-6] - else: - dirty = False - - split = describe_output.rsplit("-", 2) - if len(split) < 3: # probably a tagged commit - tag = describe_output - number = 0 - node = None - else: - tag, number_, node = split - number = int(number_) - return tag, number, node, dirty - - -def archival_to_version( - data: dict[str, str], config: Configuration -) -> ScmVersion | None: - node: str | None - log.debug("data %s", data) - archival_describe = data.get("describe-name", DESCRIBE_UNSUPPORTED) - if DESCRIBE_UNSUPPORTED in archival_describe: - warnings.warn("git archive did not support describe output") - else: - tag, number, node, _ = _git_parse_describe(archival_describe) - return meta( - tag, - config=config, - distance=number, - node=node, - ) - - for ref in REF_TAG_RE.findall(data.get("ref-names", "")): - version = tag_to_version(ref, config) - if version is not None: - return meta(version, config=config) - node = data.get("node") - if node is None: - return None - elif "$FORMAT" in node.upper(): - warnings.warn("unprocessed git archival found (no export subst applied)") - return None - else: - return meta("0.0", node=node, config=config) - - -def parse_archival(root: _t.PathT, config: Configuration) -> ScmVersion | None: - archival = os.path.join(root, ".git_archival.txt") - data = data_from_mime(archival) - return archival_to_version(data, config=config) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/hg.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/hg.py deleted file mode 100644 index 4232051..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/hg.py +++ /dev/null @@ -1,308 +0,0 @@ -from __future__ import annotations - -import datetime -import logging -import os - -from pathlib import Path -from typing import TYPE_CHECKING -from typing import Any - -from . 
import Configuration -from ._version_cls import Version -from .integration import data_from_mime -from .scm_workdir import Workdir -from .scm_workdir import get_latest_file_mtime -from .version import ScmVersion -from .version import meta -from .version import tag_to_version - -if TYPE_CHECKING: - from . import _types as _t - -from ._run_cmd import CompletedProcess -from ._run_cmd import require_command as _require_command -from ._run_cmd import run as _run - -log = logging.getLogger(__name__) - - -def _get_hg_command() -> str: - """Get the hg command from environment, allowing runtime configuration.""" - return os.environ.get("SETUPTOOLS_SCM_HG_COMMAND", "hg") - - -def run_hg(args: list[str], cwd: _t.PathT, **kwargs: Any) -> CompletedProcess: - """Run mercurial command with the configured hg executable.""" - cmd = [_get_hg_command(), *args] - return _run(cmd, cwd=cwd, **kwargs) - - -class HgWorkdir(Workdir): - @classmethod - def from_potential_worktree(cls, wd: _t.PathT) -> HgWorkdir | None: - res = run_hg(["root"], wd) - if res.returncode: - return None - return cls(Path(res.stdout)) - - def get_meta(self, config: Configuration) -> ScmVersion | None: - # TODO: support bookmarks and topics (but nowadays bookmarks are - # mainly used to emulate Git branches, which is already supported with - # the dedicated class GitWorkdirHgClient) - - node_info = self._get_node_info() - if node_info is None: - return None - - node, tags_str, node_date_str = node_info - branch_info = self._get_branch_info() - branch, dirty, dirty_date = branch_info - - # Determine the appropriate node date - node_date = self._get_node_date(dirty, node_date_str, dirty_date) - - # Handle initial/empty repository - if self._is_initial_node(node): - return self._create_initial_meta(config, dirty, branch, node_date) - - node = "h" + node - tags = self._parse_tags(tags_str) - - # Try to get version from current tags - tag_version = self._get_version_from_tags(tags, config) - if tag_version: - return 
meta(tag_version, dirty=dirty, branch=branch, config=config) - - # Fall back to distance-based versioning - return self._get_distance_based_version(config, dirty, branch, node, node_date) - - def _get_node_info(self) -> tuple[str, str, str] | None: - """Get node, tags, and date information from mercurial log.""" - try: - node, tags_str, node_date_str = self.hg_log( - ".", "{node}\n{tag}\n{date|shortdate}" - ).split("\n") - return node, tags_str, node_date_str - except ValueError: - log.exception("Failed to get node info") - return None - - def _get_branch_info(self) -> tuple[str, bool, str]: - """Get branch name, dirty status, and dirty date.""" - branch, dirty_str, dirty_date = run_hg( - ["id", "-T", "{branch}\n{if(dirty, 1, 0)}\n{date|shortdate}"], - cwd=self.path, - check=True, - ).stdout.split("\n") - dirty = bool(int(dirty_str)) - return branch, dirty, dirty_date - - def _get_node_date( - self, dirty: bool, node_date_str: str, dirty_date: str - ) -> datetime.date: - """Get the appropriate node date, preferring file modification times for dirty repos.""" - if dirty: - file_mod_date = self.get_dirty_tag_date() - if file_mod_date is not None: - return file_mod_date - # Fall back to hg id date for dirty repos - return datetime.date.fromisoformat(dirty_date) - else: - return datetime.date.fromisoformat(node_date_str) - - def _is_initial_node(self, node: str) -> bool: - """Check if this is an initial/empty repository node.""" - return node == "0" * len(node) - - def _create_initial_meta( - self, config: Configuration, dirty: bool, branch: str, node_date: datetime.date - ) -> ScmVersion: - """Create metadata for initial/empty repository.""" - log.debug("initial node %s", self.path) - return meta( - Version("0.0"), - config=config, - dirty=dirty, - branch=branch, - node_date=node_date, - ) - - def _parse_tags(self, tags_str: str) -> list[str]: - """Parse and filter tags from mercurial output.""" - tags = tags_str.split() - if "tip" in tags: - # tip is not a real tag - 
tags.remove("tip") - return tags - - def _get_version_from_tags( - self, tags: list[str], config: Configuration - ) -> Version | None: - """Try to get a version from the current tags.""" - if tags: - tag = tag_to_version(tags[0], config) - return tag - return None - - def _get_distance_based_version( - self, - config: Configuration, - dirty: bool, - branch: str, - node: str, - node_date: datetime.date, - ) -> ScmVersion | None: - """Get version based on distance from latest tag.""" - try: - tag_str = self.get_latest_normalizable_tag() - if tag_str is None: - dist = self.get_distance_revs("") - else: - dist = self.get_distance_revs(tag_str) - - if tag_str == "null" or tag_str is None: - tag = Version("0.0") - dist += 1 - else: - maybe_tag = tag_to_version(tag_str, config=config) - if maybe_tag is None: - # If tag conversion fails, treat as no tag found - tag = Version("0.0") - dist += 1 - else: - tag = maybe_tag - - if self.check_changes_since_tag(tag_str) or dirty: - return meta( - tag, - distance=dist, - node=node, - dirty=dirty, - branch=branch, - config=config, - node_date=node_date, - ) - else: - return meta(tag, config=config, node_date=node_date) - - except ValueError: - # unpacking failed, old hg - log.exception("error") - return None - - def hg_log(self, revset: str, template: str) -> str: - return run_hg( - ["log", "-r", revset, "-T", template], cwd=self.path, check=True - ).stdout - - def get_latest_normalizable_tag(self) -> str | None: - # Gets all tags containing a '.' (see #229) from oldest to newest - outlines = self.hg_log( - revset="ancestors(.) 
and tag('re:\\.')", - template="{tags}{if(tags, '\n', '')}", - ).split() - if not outlines: - return None - tag = outlines[-1].split()[-1] - return tag - - def get_distance_revs(self, rev1: str, rev2: str = ".") -> int: - revset = f"({rev1}::{rev2})" - out = self.hg_log(revset, ".") - return len(out) - 1 - - def check_changes_since_tag(self, tag: str | None) -> bool: - if tag == "0.0" or tag is None: - return True - - revset = ( - "(branch(.)" # look for revisions in this branch only - f" and tag({tag!r})::." # after the last tag - # ignore commits that only modify .hgtags and nothing else: - " and (merge() or file('re:^(?!\\.hgtags).*$'))" - f" and not tag({tag!r}))" # ignore the tagged commit itself - ) - - return bool(self.hg_log(revset, ".")) - - def get_dirty_tag_date(self) -> datetime.date | None: - """Get the latest modification time of changed files in the working directory. - - Returns the date of the most recently modified file that has changes, - or None if no files are changed or if an error occurs. - """ - try: - # Check if working directory is dirty first - res = run_hg(["id", "-T", "{dirty}"], cwd=self.path) - if res.returncode != 0 or not bool(res.stdout): - return None - - # Get list of changed files using hg status - status_res = run_hg(["status", "-m", "-a", "-r"], cwd=self.path) - if status_res.returncode != 0: - return None - - changed_files = [] - for line in status_res.stdout.strip().split("\n"): - if line and len(line) > 2: - # Format is "M filename" or "A filename" etc. 
- filepath = line[2:] # Skip status char and space - changed_files.append(filepath) - - return get_latest_file_mtime(changed_files, self.path) - - except Exception as e: - log.debug("Failed to get dirty tag date: %s", e) - - return None - - -def parse(root: _t.PathT, config: Configuration) -> ScmVersion | None: - hg_cmd = _get_hg_command() - _require_command(hg_cmd) - if os.path.exists(os.path.join(root, ".hg/git")): - res = run_hg(["path"], root) - if not res.returncode: - for line in res.stdout.split("\n"): - if line.startswith("default ="): - path = Path(line.split()[2]) - if path.name.endswith(".git") or (path / ".git").exists(): - from .git import _git_parse_inner - from .hg_git import GitWorkdirHgClient - - wd_hggit = GitWorkdirHgClient.from_potential_worktree(root) - if wd_hggit: - return _git_parse_inner(config, wd_hggit) - - wd = HgWorkdir.from_potential_worktree(config.absolute_root) - - if wd is None: - return None - - return wd.get_meta(config) - - -def archival_to_version(data: dict[str, str], config: Configuration) -> ScmVersion: - log.debug("data %s", data) - node = data.get("node", "") - if node: - node = "h" + node - if "tag" in data: - return meta(data["tag"], config=config) - elif "latesttag" in data: - return meta( - data["latesttag"], - distance=int(data["latesttagdistance"]), - node=node, - branch=data.get("branch"), - config=config, - ) - else: - return meta(config.version_cls("0.0"), node=node, config=config) - - -def parse_archival(root: _t.PathT, config: Configuration) -> ScmVersion: - archival = os.path.join(root, ".hg_archival.txt") - data = data_from_mime(archival) - return archival_to_version(data, config=config) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/hg_git.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/hg_git.py deleted file mode 100644 index 3e91b20..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/hg_git.py +++ /dev/null @@ -1,181 +0,0 @@ -from __future__ import annotations - 
-import logging -import os - -from contextlib import suppress -from datetime import date -from pathlib import Path - -from . import _types as _t -from ._run_cmd import CompletedProcess as _CompletedProcess -from .git import GitWorkdir -from .hg import HgWorkdir -from .hg import run_hg -from .scm_workdir import get_latest_file_mtime - -log = logging.getLogger(__name__) - -_FAKE_GIT_DESCRIBE_ERROR = _CompletedProcess( - "fake git describe output for hg", - 1, - "<>hg git failed to describe", -) - - -class GitWorkdirHgClient(GitWorkdir, HgWorkdir): - @classmethod - def from_potential_worktree(cls, wd: _t.PathT) -> GitWorkdirHgClient | None: - res = run_hg(["root"], cwd=wd).parse_success(parse=Path) - if res is None: - return None - return cls(res) - - def is_dirty(self) -> bool: - res = run_hg(["id", "-T", "{dirty}"], cwd=self.path, check=True) - return bool(res.stdout) - - def get_branch(self) -> str | None: - res = run_hg(["id", "-T", "{bookmarks}"], cwd=self.path) - if res.returncode: - log.info("branch err %s", res) - return None - return res.stdout - - def get_head_date(self) -> date | None: - return run_hg( - ["log", "-r", ".", "-T", "{shortdate(date)}"], cwd=self.path - ).parse_success(parse=date.fromisoformat, error_msg="head date err") - - def get_dirty_tag_date(self) -> date | None: - """Get the latest modification time of changed files in the working directory. - - Returns the date of the most recently modified file that has changes, - or None if no files are changed or if an error occurs. - """ - if not self.is_dirty(): - return None - - try: - # Get list of changed files using hg status - status_res = run_hg(["status", "-m", "-a", "-r"], cwd=self.path) - if status_res.returncode != 0: - return None - - changed_files = [] - for line in status_res.stdout.strip().split("\n"): - if line and len(line) > 2: - # Format is "M filename" or "A filename" etc. 
- filepath = line[2:] # Skip status char and space - changed_files.append(filepath) - - return get_latest_file_mtime(changed_files, self.path) - - except Exception as e: - log.debug("Failed to get dirty tag date: %s", e) - - return None - - def is_shallow(self) -> bool: - return False - - def fetch_shallow(self) -> None: - pass - - def get_hg_node(self) -> str | None: - res = run_hg(["log", "-r", ".", "-T", "{node}"], cwd=self.path) - if res.returncode: - return None - else: - return res.stdout - - def _hg2git(self, hg_node: str) -> str | None: - with suppress(FileNotFoundError): - with open(os.path.join(self.path, ".hg/git-mapfile")) as map_items: - for item in map_items: - if hg_node in item: - git_node, hg_node = item.split() - return git_node - return None - - def node(self) -> str | None: - hg_node = self.get_hg_node() - if hg_node is None: - return None - - git_node = self._hg2git(hg_node) - - if git_node is None: - # trying again after hg -> git - run_hg(["gexport"], cwd=self.path) - git_node = self._hg2git(hg_node) - - if git_node is None: - log.debug("Cannot get git node so we use hg node %s", hg_node) - - if hg_node == "0" * len(hg_node): - # mimic Git behavior - return None - - return hg_node - - return git_node - - def count_all_nodes(self) -> int: - res = run_hg(["log", "-r", "ancestors(.)", "-T", "."], cwd=self.path) - return len(res.stdout) - - def default_describe(self) -> _CompletedProcess: - """ - Tentative to reproduce the output of - - `git describe --dirty --tags --long --match *[0-9]*` - - """ - res = run_hg( - [ - "log", - "-r", - "(reverse(ancestors(.)) and tag(r're:v?[0-9].*'))", - "-T", - "{tags}{if(tags, ' ', '')}", - ], - cwd=self.path, - ) - if res.returncode: - return _FAKE_GIT_DESCRIBE_ERROR - hg_tags: list[str] = res.stdout.split() - - if not hg_tags: - return _FAKE_GIT_DESCRIBE_ERROR - - with self.path.joinpath(".hg/git-tags").open() as fp: - git_tags: dict[str, str] = dict(line.split()[::-1] for line in fp) - - tag: str - for 
hg_tag in hg_tags: - if hg_tag in git_tags: - tag = hg_tag - break - else: - logging.warning("tag not found hg=%s git=%s", hg_tags, git_tags) - return _FAKE_GIT_DESCRIBE_ERROR - - res = run_hg(["log", "-r", f"'{tag}'::.", "-T", "."], cwd=self.path) - if res.returncode: - return _FAKE_GIT_DESCRIBE_ERROR - distance = len(res.stdout) - 1 - - node = self.node() - assert node is not None - desc = f"{tag}-{distance}-g{node}" - - if self.is_dirty(): - desc += "-dirty" - log.debug("faked describe %r", desc) - return _CompletedProcess( - ["setuptools-scm", "faked", "describe"], - returncode=0, - stdout=desc, - stderr="", - ) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/integration.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/integration.py deleted file mode 100644 index b15d74a..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/integration.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import annotations - -import logging -import textwrap - -from pathlib import Path - -from . 
import _types as _t - -log = logging.getLogger(__name__) - - -def data_from_mime(path: _t.PathT, content: str | None = None) -> dict[str, str]: - """return a mapping from mime/pseudo-mime content - :param path: path to the mime file - :param content: content of the mime file, if None, read from path - :rtype: dict[str, str] - - """ - - if content is None: - content = Path(path).read_text(encoding="utf-8") - log.debug("mime %s content:\n%s", path, textwrap.indent(content, " ")) - - from email.parser import HeaderParser - - parser = HeaderParser() - message = parser.parsestr(content) - data = dict(message.items()) - log.debug("mime %s data:\n%s", path, data) - return data diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/py.typed b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/py.typed deleted file mode 100644 index e69de29..0000000 diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/scm_workdir.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/scm_workdir.py deleted file mode 100644 index b3ca7aa..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/scm_workdir.py +++ /dev/null @@ -1,54 +0,0 @@ -from __future__ import annotations - -import logging - -from dataclasses import dataclass -from datetime import date -from datetime import datetime -from datetime import timezone -from pathlib import Path - -from ._config import Configuration -from .version import ScmVersion - -log = logging.getLogger(__name__) - - -def get_latest_file_mtime(changed_files: list[str], base_path: Path) -> date | None: - """Get the latest modification time of the given files. 
- - Args: - changed_files: List of relative file paths - base_path: Base directory path to resolve relative paths - - Returns: - The date of the most recently modified file, or None if no valid files found - """ - if not changed_files or changed_files == [""]: - return None - - latest_mtime = 0.0 - for filepath in changed_files: - full_path = base_path / filepath - try: - file_stat = full_path.stat() - latest_mtime = max(latest_mtime, file_stat.st_mtime) - except OSError: - # File might not exist or be accessible, skip it - log.debug("Failed to get mtime for %s", full_path) - continue - - if latest_mtime > 0: - # Convert to UTC date - dt = datetime.fromtimestamp(latest_mtime, timezone.utc) - return dt.date() - - return None - - -@dataclass() -class Workdir: - path: Path - - def run_describe(self, config: Configuration) -> ScmVersion: - raise NotImplementedError(self.run_describe) diff --git a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/version.py b/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/version.py deleted file mode 100644 index 77c26dc..0000000 --- a/.eggs/setuptools_scm-9.2.2-py3.13.egg/setuptools_scm/version.py +++ /dev/null @@ -1,583 +0,0 @@ -from __future__ import annotations - -import dataclasses -import logging -import os -import re -import warnings - -from datetime import date -from datetime import datetime -from datetime import timezone -from typing import TYPE_CHECKING -from typing import Any -from typing import Callable -from typing import Match - -from . import _entrypoints -from . import _modify_version -from ._node_utils import _format_node_for_output - -if TYPE_CHECKING: - import sys - - if sys.version_info >= (3, 10): - from typing import Concatenate - from typing import ParamSpec - else: - from typing_extensions import Concatenate - from typing_extensions import ParamSpec - - _P = ParamSpec("_P") - -from typing import TypedDict - -from . import _config -from . 
import _version_cls as _v -from ._version_cls import Version as PkgVersion -from ._version_cls import _VersionT - -log = logging.getLogger(__name__) - - -SEMVER_MINOR = 2 -SEMVER_PATCH = 3 -SEMVER_LEN = 3 - - -class _TagDict(TypedDict): - version: str - prefix: str - suffix: str - - -def _parse_version_tag( - tag: str | object, config: _config.Configuration -) -> _TagDict | None: - match = config.tag_regex.match(str(tag)) - - if match: - key: str | int = 1 if len(match.groups()) == 1 else "version" - full = match.group(0) - log.debug("%r %r %s", tag, config.tag_regex, match) - log.debug( - "key %s data %s, %s, %r", key, match.groupdict(), match.groups(), full - ) - - if version := match.group(key): - result = _TagDict( - version=version, - prefix=full[: match.start(key)], - suffix=full[match.end(key) :], - ) - - log.debug("tag %r parsed to %r", tag, result) - return result - - raise ValueError( - f'The tag_regex "{config.tag_regex.pattern}" matched tag "{tag}", ' - "however the matched group has no value." 
- ) - else: - log.debug("tag %r did not parse", tag) - - return None - - -def callable_or_entrypoint(group: str, callable_or_name: str | Any) -> Any: - log.debug("ep %r %r", group, callable_or_name) - - if callable(callable_or_name): - return callable_or_name - - from ._entrypoints import _get_ep - - return _get_ep(group, callable_or_name) - - -def tag_to_version( - tag: _VersionT | str, config: _config.Configuration -) -> _VersionT | None: - """ - take a tag that might be prefixed with a keyword and return only the version part - """ - log.debug("tag %s", tag) - - tag_dict = _parse_version_tag(tag, config) - if tag_dict is None or not tag_dict.get("version", None): - warnings.warn(f"tag {tag!r} no version found") - return None - - version_str = tag_dict["version"] - log.debug("version pre parse %s", version_str) - - # Try to create version from base version first - try: - version: _VersionT = config.version_cls(version_str) - log.debug("version=%r", version) - except Exception: - warnings.warn( - f"tag {tag!r} will be stripped of its suffix {tag_dict.get('suffix', '')!r}" - ) - # Fall back to trying without any suffix - version = config.version_cls(version_str) - log.debug("version=%r", version) - return version - - # If base version is valid, check if we can preserve the suffix - if suffix := tag_dict.get("suffix", ""): - log.debug("tag %r includes local build data %r, preserving it", tag, suffix) - # Try creating version with suffix - if it fails, we'll use the base version - try: - version_with_suffix = config.version_cls(version_str + suffix) - log.debug("version with suffix=%r", version_with_suffix) - return version_with_suffix - except Exception: - warnings.warn(f"tag {tag!r} will be stripped of its suffix {suffix!r}") - # Return the base version without suffix - return version - - return version - - -def _source_epoch_or_utc_now() -> datetime: - if "SOURCE_DATE_EPOCH" in os.environ: - date_epoch = int(os.environ["SOURCE_DATE_EPOCH"]) - return 
datetime.fromtimestamp(date_epoch, timezone.utc) - else: - return datetime.now(timezone.utc) - - -@dataclasses.dataclass -class ScmVersion: - """represents a parsed version from scm""" - - tag: _v.Version | _v.NonNormalizedVersion - """the related tag or preformatted version""" - config: _config.Configuration - """the configuration used to parse the version""" - distance: int = 0 - """the number of commits since the tag""" - node: str | None = None - """the shortened node id""" - dirty: bool = False - """whether the working copy had uncommitted changes""" - preformatted: bool = False - """whether the version string was preformatted""" - branch: str | None = None - """the branch name if any""" - node_date: date | None = None - """the date of the commit if available""" - time: datetime = dataclasses.field(default_factory=_source_epoch_or_utc_now) - """the current time or source epoch time - only set for unit-testing version schemes - for real usage it must be `now(utc)` or `SOURCE_EPOCH` - """ - - @property - def exact(self) -> bool: - """returns true checked out exactly on a tag and no local changes apply""" - return self.distance == 0 and not self.dirty - - @property - def short_node(self) -> str | None: - """Return the node formatted for output.""" - return _format_node_for_output(self.node) - - def __repr__(self) -> str: - return ( - f"" - ) - - def format_with(self, fmt: str, **kw: object) -> str: - """format a given format string with attributes of this object""" - return fmt.format( - time=self.time, - tag=self.tag, - distance=self.distance, - node=_format_node_for_output(self.node), - dirty=self.dirty, - branch=self.branch, - node_date=self.node_date, - **kw, - ) - - def format_choice(self, clean_format: str, dirty_format: str, **kw: object) -> str: - """given `clean_format` and `dirty_format` - - choose one based on `self.dirty` and format it using `self.format_with`""" - - return self.format_with(dirty_format if self.dirty else clean_format, **kw) - - def 
format_next_version( - self, - guess_next: Callable[Concatenate[ScmVersion, _P], str], - fmt: str = "{guessed}.dev{distance}", - *k: _P.args, - **kw: _P.kwargs, - ) -> str: - guessed = guess_next(self, *k, **kw) - return self.format_with(fmt, guessed=guessed) - - -def _parse_tag( - tag: _VersionT | str, preformatted: bool, config: _config.Configuration -) -> _VersionT: - if preformatted: - # For preformatted versions, tag should already be validated as a version object - # String validation is handled in meta function before calling this - if isinstance(tag, str): - # This should not happen with enhanced meta, but kept for safety - return _v.NonNormalizedVersion(tag) - else: - # Already a version object (including test mocks), return as-is - return tag - elif not isinstance(tag, config.version_cls): - version = tag_to_version(tag, config) - assert version is not None - return version - else: - return tag - - -def meta( - tag: str | _VersionT, - *, - distance: int = 0, - dirty: bool = False, - node: str | None = None, - preformatted: bool = False, - branch: str | None = None, - config: _config.Configuration, - node_date: date | None = None, - time: datetime | None = None, -) -> ScmVersion: - parsed_version: _VersionT - # Enhanced string validation for preformatted versions - if preformatted and isinstance(tag, str): - # Validate PEP 440 compliance using NonNormalizedVersion - # Let validation errors bubble up to the caller - parsed_version = _v.NonNormalizedVersion(tag) - else: - # Use existing _parse_tag logic for non-preformatted or already validated inputs - parsed_version = _parse_tag(tag, preformatted, config) - - log.info("version %s -> %s", tag, parsed_version) - assert parsed_version is not None, f"Can't parse version {tag}" - scm_version = ScmVersion( - parsed_version, - distance=distance, - node=node, - dirty=dirty, - preformatted=preformatted, - branch=branch, - config=config, - node_date=node_date, - ) - if time is not None: - scm_version = 
dataclasses.replace(scm_version, time=time) - return scm_version - - -def guess_next_version(tag_version: ScmVersion) -> str: - version = _modify_version.strip_local(str(tag_version.tag)) - return _modify_version._bump_dev(version) or _modify_version._bump_regex(version) - - -def guess_next_dev_version(version: ScmVersion) -> str: - if version.exact: - return version.format_with("{tag}") - else: - return version.format_next_version(guess_next_version) - - -def guess_next_simple_semver( - version: ScmVersion, retain: int, increment: bool = True -) -> str: - if isinstance(version.tag, _v.Version): - parts = list(version.tag.release[:retain]) - else: - try: - parts = [int(i) for i in str(version.tag).split(".")[:retain]] - except ValueError: - raise ValueError(f"{version} can't be parsed as numeric version") from None - while len(parts) < retain: - parts.append(0) - if increment: - parts[-1] += 1 - while len(parts) < SEMVER_LEN: - parts.append(0) - return ".".join(str(i) for i in parts) - - -def simplified_semver_version(version: ScmVersion) -> str: - if version.exact: - return guess_next_simple_semver(version, retain=SEMVER_LEN, increment=False) - elif version.branch is not None and "feature" in version.branch: - return version.format_next_version( - guess_next_simple_semver, retain=SEMVER_MINOR - ) - else: - return version.format_next_version( - guess_next_simple_semver, retain=SEMVER_PATCH - ) - - -def release_branch_semver_version(version: ScmVersion) -> str: - if version.exact: - return version.format_with("{tag}") - if version.branch is not None: - # Does the branch name (stripped of namespace) parse as a version? - branch_ver_data = _parse_version_tag( - version.branch.split("/")[-1], version.config - ) - if branch_ver_data is not None: - branch_ver = branch_ver_data["version"] - if branch_ver[0] == "v": - # Allow branches that start with 'v', similar to Version. - branch_ver = branch_ver[1:] - # Does the branch version up to the minor part match the tag? 
If not it - # might be like, an issue number or something and not a version number, so - # we only want to use it if it matches. - tag_ver_up_to_minor = str(version.tag).split(".")[:SEMVER_MINOR] - branch_ver_up_to_minor = branch_ver.split(".")[:SEMVER_MINOR] - if branch_ver_up_to_minor == tag_ver_up_to_minor: - # We're in a release/maintenance branch, next is a patch/rc/beta bump: - return version.format_next_version(guess_next_version) - # We're in a development branch, next is a minor bump: - return version.format_next_version(guess_next_simple_semver, retain=SEMVER_MINOR) - - -def release_branch_semver(version: ScmVersion) -> str: - warnings.warn( - "release_branch_semver is deprecated and will be removed in the future. " - "Use release_branch_semver_version instead", - category=DeprecationWarning, - stacklevel=2, - ) - return release_branch_semver_version(version) - - -def only_version(version: ScmVersion) -> str: - return version.format_with("{tag}") - - -def no_guess_dev_version(version: ScmVersion) -> str: - if version.exact: - return version.format_with("{tag}") - else: - return version.format_next_version(_modify_version._dont_guess_next_version) - - -_DATE_REGEX = re.compile( - r""" - ^(?P - (?P[vV]?) 
- (?P\d{2}|\d{4})(?:\.\d{1,2}){2}) - (?:\.(?P\d*))?$ - """, - re.VERBOSE, -) - - -def date_ver_match(ver: str) -> Match[str] | None: - return _DATE_REGEX.match(ver) - - -def guess_next_date_ver( - version: ScmVersion, - node_date: date | None = None, - date_fmt: str | None = None, - version_cls: type | None = None, -) -> str: - """ - same-day -> patch +1 - other-day -> today - - distance is always added as .devX - """ - match = date_ver_match(str(version.tag)) - if match is None: - warnings.warn( - f"{version} does not correspond to a valid versioning date, " - "assuming legacy version" - ) - if date_fmt is None: - date_fmt = "%y.%m.%d" - else: - # deduct date format if not provided - if date_fmt is None: - date_fmt = "%Y.%m.%d" if len(match.group("year")) == 4 else "%y.%m.%d" - if prefix := match.group("prefix"): - if not date_fmt.startswith(prefix): - date_fmt = prefix + date_fmt - - today = version.time.date() - head_date = node_date or today - # compute patch - if match is None: - # For legacy non-date tags, always use patch=0 (treat as "other day") - # Use yesterday to ensure tag_date != head_date - from datetime import timedelta - - tag_date = head_date - timedelta(days=1) - else: - tag_date = ( - datetime.strptime(match.group("date"), date_fmt) - .replace(tzinfo=timezone.utc) - .date() - ) - if tag_date == head_date: - assert match is not None - # Same day as existing date tag - increment patch - patch = int(match.group("patch") or "0") + 1 - else: - # Different day or legacy non-date tag - use patch 0 - if tag_date > head_date and match is not None: - # warn on future times (only for actual date tags, not legacy) - warnings.warn( - f"your previous tag ({tag_date}) is ahead your node date ({head_date})" - ) - patch = 0 - next_version = "{node_date:{date_fmt}}.{patch}".format( - node_date=head_date, date_fmt=date_fmt, patch=patch - ) - # rely on the Version object to ensure consistency (e.g. 
remove leading 0s) - if version_cls is None: - version_cls = PkgVersion - next_version = str(version_cls(next_version)) - return next_version - - -def calver_by_date(version: ScmVersion) -> str: - if version.exact and not version.dirty: - return version.format_with("{tag}") - # TODO: move the release-X check to a new scheme - if version.branch is not None and version.branch.startswith("release-"): - branch_ver = _parse_version_tag(version.branch.split("-")[-1], version.config) - if branch_ver is not None: - ver = branch_ver["version"] - match = date_ver_match(ver) - if match: - return ver - return version.format_next_version( - guess_next_date_ver, - node_date=version.node_date, - version_cls=version.config.version_cls, - ) - - -def get_local_node_and_date(version: ScmVersion) -> str: - return _modify_version._format_local_with_time(version, time_format="%Y%m%d") - - -def get_local_node_and_timestamp(version: ScmVersion) -> str: - return _modify_version._format_local_with_time(version, time_format="%Y%m%d%H%M%S") - - -def get_local_dirty_tag(version: ScmVersion) -> str: - return version.format_choice("", "+dirty") - - -def get_no_local_node(version: ScmVersion) -> str: - return "" - - -def postrelease_version(version: ScmVersion) -> str: - if version.exact: - return version.format_with("{tag}") - else: - return version.format_with("{tag}.post{distance}") - - -def _combine_version_with_local_parts( - main_version: str, *local_parts: str | None -) -> str: - """ - Combine a main version with multiple local parts into a valid PEP 440 version string. - Handles deduplication of local parts to avoid adding the same local data twice. 
- - Args: - main_version: The main version string (e.g., "1.2.0", "1.2.dev3") - *local_parts: Variable number of local version parts, can be None or empty - - Returns: - A valid PEP 440 version string - - Examples: - _combine_version_with_local_parts("1.2.0", "build.123", "d20090213") -> "1.2.0+build.123.d20090213" - _combine_version_with_local_parts("1.2.0", "build.123", None) -> "1.2.0+build.123" - _combine_version_with_local_parts("1.2.0+build.123", "d20090213") -> "1.2.0+build.123.d20090213" - _combine_version_with_local_parts("1.2.0+build.123", "build.123") -> "1.2.0+build.123" # no duplication - _combine_version_with_local_parts("1.2.0", None, None) -> "1.2.0" - """ - # Split main version into base and existing local parts - if "+" in main_version: - main_part, existing_local = main_version.split("+", 1) - all_local_parts = existing_local.split(".") - else: - main_part = main_version - all_local_parts = [] - - # Process each new local part - for part in local_parts: - if not part or not part.strip(): - continue - - # Strip any leading + and split into segments - clean_part = part.strip("+") - if not clean_part: - continue - - # Split multi-part local identifiers (e.g., "build.123" -> ["build", "123"]) - part_segments = clean_part.split(".") - - # Add each segment if not already present - for segment in part_segments: - if segment and segment not in all_local_parts: - all_local_parts.append(segment) - - # Return combined result - if all_local_parts: - return main_part + "+" + ".".join(all_local_parts) - else: - return main_part - - -def format_version(version: ScmVersion) -> str: - log.debug("scm version %s", version) - log.debug("config %s", version.config) - if version.preformatted: - return str(version.tag) - - # Extract original tag's local data for later combination - original_local = "" - if hasattr(version.tag, "local") and version.tag.local is not None: - original_local = str(version.tag.local) - - # Create a patched ScmVersion with only the base 
version (no local data) for version schemes - from dataclasses import replace - - # Extract the base version (public part) from the tag using config's version_cls - base_version_str = str(version.tag.public) - base_tag = version.config.version_cls(base_version_str) - version_for_scheme = replace(version, tag=base_tag) - - main_version = _entrypoints._call_version_scheme( - version_for_scheme, - "setuptools_scm.version_scheme", - version.config.version_scheme, - ) - log.debug("version %s", main_version) - assert main_version is not None - - local_version = _entrypoints._call_version_scheme( - version, "setuptools_scm.local_scheme", version.config.local_scheme, "+unknown" - ) - log.debug("local_version %s", local_version) - - # Combine main version with original local data and new local scheme data - return _combine_version_with_local_parts( - str(main_version), original_local, local_version - ) diff --git a/.gitignore b/.gitignore index bccfebe..d0e3462 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,6 @@ mariadb_kernel/_version.py .vscode/ .eggs/ +catboost_info/ +mariadb_kernel.egg-info/ +models/ diff --git a/catboost_info/catboost_training.json b/catboost_info/catboost_training.json deleted file mode 100644 index b468cdf..0000000 --- a/catboost_info/catboost_training.json +++ /dev/null @@ -1,1004 +0,0 @@ -{ -"meta":{"test_sets":[],"test_metrics":[],"learn_metrics":[{"best_value":"Min","name":"Logloss"}],"launch_mode":"Train","parameters":"","iteration_count":1000,"learn_sets":["learn"],"name":"experiment"}, -"iterations":[ -{"learn":[0.6921376331],"iteration":0,"passed_time":0.0005506633771,"remaining_time":0.5501127137}, -{"learn":[0.6914311222],"iteration":1,"passed_time":0.0007853338881,"remaining_time":0.3918816102}, -{"learn":[0.6902726804],"iteration":2,"passed_time":0.001054080287,"remaining_time":0.3503060152}, -{"learn":[0.6893236297],"iteration":3,"passed_time":0.001332495648,"remaining_time":0.3317914165}, 
-{"learn":[0.6884269714],"iteration":4,"passed_time":0.001567760683,"remaining_time":0.3119843759}, -{"learn":[0.6877708094],"iteration":5,"passed_time":0.001757945965,"remaining_time":0.2912330483}, -{"learn":[0.6868745599],"iteration":6,"passed_time":0.00198443086,"remaining_time":0.281505692}, -{"learn":[0.6854426605],"iteration":7,"passed_time":0.002176281622,"remaining_time":0.2698589211}, -{"learn":[0.6846512471],"iteration":8,"passed_time":0.002514425851,"remaining_time":0.2768662242}, -{"learn":[0.6835081152],"iteration":9,"passed_time":0.0027196756,"remaining_time":0.2692478844}, -{"learn":[0.6828162926],"iteration":10,"passed_time":0.002932122931,"remaining_time":0.2636245072}, -{"learn":[0.6821300387],"iteration":11,"passed_time":0.003192283539,"remaining_time":0.2628313447}, -{"learn":[0.6810161471],"iteration":12,"passed_time":0.005394840798,"remaining_time":0.4095929129}, -{"learn":[0.6800281746],"iteration":13,"passed_time":0.00563666743,"remaining_time":0.3969824347}, -{"learn":[0.6789965715],"iteration":14,"passed_time":0.005860724626,"remaining_time":0.3848542504}, -{"learn":[0.6778871247],"iteration":15,"passed_time":0.006066001399,"remaining_time":0.373059086}, -{"learn":[0.6767778482],"iteration":16,"passed_time":0.006307268676,"remaining_time":0.3647085358}, -{"learn":[0.675800315],"iteration":17,"passed_time":0.006694250074,"remaining_time":0.3652085318}, -{"learn":[0.6745953049],"iteration":18,"passed_time":0.006899477982,"remaining_time":0.3562309421}, -{"learn":[0.6735743199],"iteration":19,"passed_time":0.007181293899,"remaining_time":0.3518834011}, -{"learn":[0.6728023546],"iteration":20,"passed_time":0.00745657228,"remaining_time":0.3476182982}, -{"learn":[0.6717833025],"iteration":21,"passed_time":0.007663767372,"remaining_time":0.340689295}, -{"learn":[0.6705873864],"iteration":22,"passed_time":0.007879479029,"remaining_time":0.3347065657}, 
-{"learn":[0.6691952518],"iteration":23,"passed_time":0.008064386899,"remaining_time":0.3279517339}, -{"learn":[0.6683306949],"iteration":24,"passed_time":0.008284067351,"remaining_time":0.3230786267}, -{"learn":[0.6672187277],"iteration":25,"passed_time":0.008483715772,"remaining_time":0.3178130447}, -{"learn":[0.6662092464],"iteration":26,"passed_time":0.008697806743,"remaining_time":0.3134431837}, -{"learn":[0.6652034862],"iteration":27,"passed_time":0.008924122091,"remaining_time":0.309794524}, -{"learn":[0.664099523],"iteration":28,"passed_time":0.009119880197,"remaining_time":0.3053587473}, -{"learn":[0.6628215143],"iteration":29,"passed_time":0.009304048429,"remaining_time":0.3008308992}, -{"learn":[0.6619682567],"iteration":30,"passed_time":0.009508690698,"remaining_time":0.2972232673}, -{"learn":[0.6605963962],"iteration":31,"passed_time":0.009684086934,"remaining_time":0.2929436298}, -{"learn":[0.6597477538],"iteration":32,"passed_time":0.009889587671,"remaining_time":0.2897948872}, -{"learn":[0.6585793495],"iteration":33,"passed_time":0.01006296194,"remaining_time":0.2859065068}, -{"learn":[0.6574136274],"iteration":34,"passed_time":0.01025222915,"remaining_time":0.2826686037}, -{"learn":[0.6566625323],"iteration":35,"passed_time":0.01047332483,"remaining_time":0.280452365}, -{"learn":[0.6557201062],"iteration":36,"passed_time":0.01065918187,"remaining_time":0.2774268146}, -{"learn":[0.6547810435],"iteration":37,"passed_time":0.01083729085,"remaining_time":0.2743545736}, -{"learn":[0.6537957873],"iteration":38,"passed_time":0.01125047385,"remaining_time":0.2772232145}, -{"learn":[0.6524456825],"iteration":39,"passed_time":0.01142843771,"remaining_time":0.2742825051}, -{"learn":[0.651703792],"iteration":40,"passed_time":0.01168960449,"remaining_time":0.2734227002}, -{"learn":[0.6505536692],"iteration":41,"passed_time":0.01189282376,"remaining_time":0.2712696466}, 
-{"learn":[0.6495760253],"iteration":42,"passed_time":0.01207861379,"remaining_time":0.2688193812}, -{"learn":[0.6482374328],"iteration":43,"passed_time":0.01223421047,"remaining_time":0.2658160275}, -{"learn":[0.6469012243],"iteration":44,"passed_time":0.01240492197,"remaining_time":0.2632600106}, -{"learn":[0.6460723281],"iteration":45,"passed_time":0.01259484997,"remaining_time":0.2612062363}, -{"learn":[0.645103676],"iteration":46,"passed_time":0.01281349796,"remaining_time":0.2598141183}, -{"learn":[0.6441353474],"iteration":47,"passed_time":0.01300224247,"remaining_time":0.257877809}, -{"learn":[0.6431703482],"iteration":48,"passed_time":0.01320365299,"remaining_time":0.2562586529}, -{"learn":[0.6423503671],"iteration":49,"passed_time":0.01342363478,"remaining_time":0.2550490607}, -{"learn":[0.6417189837],"iteration":50,"passed_time":0.01364485225,"remaining_time":0.2539012703}, -{"learn":[0.6408985257],"iteration":51,"passed_time":0.01385111336,"remaining_time":0.2525164512}, -{"learn":[0.6395800029],"iteration":52,"passed_time":0.01401858126,"remaining_time":0.2504829519}, -{"learn":[0.638454846],"iteration":53,"passed_time":0.01418940011,"remaining_time":0.2485772686}, -{"learn":[0.637829457],"iteration":54,"passed_time":0.01440893286,"remaining_time":0.2475716645}, -{"learn":[0.6369738238],"iteration":55,"passed_time":0.01462075124,"remaining_time":0.2464640923}, -{"learn":[0.6356642927],"iteration":56,"passed_time":0.01481690396,"remaining_time":0.2451287796}, -{"learn":[0.6346218245],"iteration":57,"passed_time":0.01499538758,"remaining_time":0.2435457775}, -{"learn":[0.6335089888],"iteration":58,"passed_time":0.01517758233,"remaining_time":0.2420695758}, -{"learn":[0.6327144418],"iteration":59,"passed_time":0.01543631179,"remaining_time":0.2418355513}, -{"learn":[0.6314146519],"iteration":60,"passed_time":0.01561911365,"remaining_time":0.2404319298}, -{"learn":[0.6302148189],"iteration":61,"passed_time":0.01585171444,"remaining_time":0.2398210991}, 
-{"learn":[0.6294122423],"iteration":62,"passed_time":0.01606182921,"remaining_time":0.2388878408}, -{"learn":[0.6283095479],"iteration":63,"passed_time":0.01626120998,"remaining_time":0.237820196}, -{"learn":[0.6273769736],"iteration":64,"passed_time":0.01649659274,"remaining_time":0.2372971417}, -{"learn":[0.626091625],"iteration":65,"passed_time":0.01666773328,"remaining_time":0.2358736801}, -{"learn":[0.6248111555],"iteration":66,"passed_time":0.01682289351,"remaining_time":0.2342650694}, -{"learn":[0.623531299],"iteration":67,"passed_time":0.01698280179,"remaining_time":0.2327642833}, -{"learn":[0.6225138136],"iteration":68,"passed_time":0.01714976845,"remaining_time":0.2313976004}, -{"learn":[0.62191024],"iteration":69,"passed_time":0.0173738312,"remaining_time":0.2308237574}, -{"learn":[0.6212150369],"iteration":70,"passed_time":0.01900881115,"remaining_time":0.2487209234}, -{"learn":[0.6205226438],"iteration":71,"passed_time":0.0192645902,"remaining_time":0.2482991626}, -{"learn":[0.6194400617],"iteration":72,"passed_time":0.01947590217,"remaining_time":0.2473172782}, -{"learn":[0.6186558434],"iteration":73,"passed_time":0.01975672413,"remaining_time":0.2472260343}, -{"learn":[0.6179682612],"iteration":74,"passed_time":0.02002807369,"remaining_time":0.2470129089}, -{"learn":[0.6168900728],"iteration":75,"passed_time":0.02027816331,"remaining_time":0.246539775}, -{"learn":[0.61597624],"iteration":76,"passed_time":0.02048544466,"remaining_time":0.2455592912}, -{"learn":[0.6151563355],"iteration":77,"passed_time":0.02069465358,"remaining_time":0.2446214179}, -{"learn":[0.6140871133],"iteration":78,"passed_time":0.02087465311,"remaining_time":0.2433614622}, -{"learn":[0.6135282346],"iteration":79,"passed_time":0.02136253616,"remaining_time":0.2456691658}, -{"learn":[0.6125324198],"iteration":80,"passed_time":0.02155299945,"remaining_time":0.2445334135}, -{"learn":[0.6116280981],"iteration":81,"passed_time":0.02174286526,"remaining_time":0.2434140281}, 
-{"learn":[0.610728332],"iteration":82,"passed_time":0.02194195358,"remaining_time":0.242418933}, -{"learn":[0.6099583251],"iteration":83,"passed_time":0.02215505208,"remaining_time":0.2415955679}, -{"learn":[0.6091488685],"iteration":84,"passed_time":0.02236177111,"remaining_time":0.240717889}, -{"learn":[0.607908896],"iteration":85,"passed_time":0.02252964473,"remaining_time":0.2394429685}, -{"learn":[0.6066734365],"iteration":86,"passed_time":0.02268634347,"remaining_time":0.2380762252}, -{"learn":[0.6057140146],"iteration":87,"passed_time":0.02288567612,"remaining_time":0.2371788252}, -{"learn":[0.6048230018],"iteration":88,"passed_time":0.02309271758,"remaining_time":0.2363760193}, -{"learn":[0.6038645932],"iteration":89,"passed_time":0.02327931351,"remaining_time":0.2353797255}, -{"learn":[0.603286062],"iteration":90,"passed_time":0.02364643761,"remaining_time":0.2362045251}, -{"learn":[0.6020624808],"iteration":91,"passed_time":0.0238396081,"remaining_time":0.2352865669}, -{"learn":[0.6010184118],"iteration":92,"passed_time":0.02403660522,"remaining_time":0.2344215155}, -{"learn":[0.5997997522],"iteration":93,"passed_time":0.02419529595,"remaining_time":0.2332014694}, -{"learn":[0.5988519958],"iteration":94,"passed_time":0.02443855262,"remaining_time":0.2328093697}, -{"learn":[0.5978163225],"iteration":95,"passed_time":0.02460600645,"remaining_time":0.2317065608}, -{"learn":[0.5968528134],"iteration":96,"passed_time":0.0247872302,"remaining_time":0.2307512255}, -{"learn":[0.5956415364],"iteration":97,"passed_time":0.02497165127,"remaining_time":0.2298411168}, -{"learn":[0.5950270976],"iteration":98,"passed_time":0.02514632157,"remaining_time":0.2288569266}, -{"learn":[0.5940695831],"iteration":99,"passed_time":0.02530889668,"remaining_time":0.2277800702}, -{"learn":[0.5931115406],"iteration":100,"passed_time":0.02546676559,"remaining_time":0.2266794283}, -{"learn":[0.5923869354],"iteration":101,"passed_time":0.0256237601,"remaining_time":0.2255895742}, 
-{"learn":[0.5916092396],"iteration":102,"passed_time":0.02585865088,"remaining_time":0.225196212}, -{"learn":[0.5907429201],"iteration":103,"passed_time":0.02603825912,"remaining_time":0.2243296171}, -{"learn":[0.5900058321],"iteration":104,"passed_time":0.02622273942,"remaining_time":0.223517636}, -{"learn":[0.5892723969],"iteration":105,"passed_time":0.02642358874,"remaining_time":0.2228555503}, -{"learn":[0.5880816919],"iteration":106,"passed_time":0.02659274359,"remaining_time":0.2219375703}, -{"learn":[0.5874404567],"iteration":107,"passed_time":0.02681064862,"remaining_time":0.2214360978}, -{"learn":[0.586706996],"iteration":108,"passed_time":0.02701284949,"remaining_time":0.2208114577}, -{"learn":[0.5859788145],"iteration":109,"passed_time":0.02719723798,"remaining_time":0.22005038}, -{"learn":[0.5852130651],"iteration":110,"passed_time":0.02741903814,"remaining_time":0.2195993235}, -{"learn":[0.584120197],"iteration":111,"passed_time":0.02766346202,"remaining_time":0.2193317345}, -{"learn":[0.5833940932],"iteration":112,"passed_time":0.02791707139,"remaining_time":0.2191366577}, -{"learn":[0.5823940975],"iteration":113,"passed_time":0.02809780908,"remaining_time":0.2183742004}, -{"learn":[0.5816756827],"iteration":114,"passed_time":0.02831838058,"remaining_time":0.2179284071}, -{"learn":[0.5805052434],"iteration":115,"passed_time":0.02848656368,"remaining_time":0.2170872612}, -{"learn":[0.5794262716],"iteration":116,"passed_time":0.02867859954,"remaining_time":0.2164376359}, -{"learn":[0.5787069798],"iteration":117,"passed_time":0.02888154301,"remaining_time":0.2158772961}, -{"learn":[0.5777818561],"iteration":118,"passed_time":0.02905500131,"remaining_time":0.2151046736}, -{"learn":[0.5768609388],"iteration":119,"passed_time":0.02923108684,"remaining_time":0.2143613035}, -{"learn":[0.576025205],"iteration":120,"passed_time":0.02940838066,"remaining_time":0.2136360876}, 
-{"learn":[0.5753114138],"iteration":121,"passed_time":0.02960232595,"remaining_time":0.2130396901}, -{"learn":[0.5741571529],"iteration":122,"passed_time":0.02985105772,"remaining_time":0.2128404685}, -{"learn":[0.5731786489],"iteration":123,"passed_time":0.0300463927,"remaining_time":0.2122632258}, -{"learn":[0.5722654206],"iteration":124,"passed_time":0.03364201036,"remaining_time":0.2354940725}, -{"learn":[0.5715580412],"iteration":125,"passed_time":0.03391048615,"remaining_time":0.2352203563}, -{"learn":[0.5707720092],"iteration":126,"passed_time":0.03412408699,"remaining_time":0.2345695114}, -{"learn":[0.5702917491],"iteration":127,"passed_time":0.03433297237,"remaining_time":0.2338933742}, -{"learn":[0.5693833147],"iteration":128,"passed_time":0.03455619219,"remaining_time":0.2333212667}, -{"learn":[0.5687683991],"iteration":129,"passed_time":0.03481659491,"remaining_time":0.2330033659}, -{"learn":[0.567802012],"iteration":130,"passed_time":0.03502881383,"remaining_time":0.2323667116}, -{"learn":[0.5668365444],"iteration":131,"passed_time":0.035243034,"remaining_time":0.2317496478}, -{"learn":[0.566021783],"iteration":132,"passed_time":0.03546937045,"remaining_time":0.2312176254}, -{"learn":[0.5654101968],"iteration":133,"passed_time":0.03569811275,"remaining_time":0.2307057138}, -{"learn":[0.5645956993],"iteration":134,"passed_time":0.03590905008,"remaining_time":0.2300839135}, -{"learn":[0.5639859353],"iteration":135,"passed_time":0.03614988091,"remaining_time":0.2296580669}, -{"learn":[0.5632997581],"iteration":136,"passed_time":0.03637466849,"remaining_time":0.2291338606}, -{"learn":[0.5626934341],"iteration":137,"passed_time":0.03686424329,"remaining_time":0.2302679545}, -{"learn":[0.5619667087],"iteration":138,"passed_time":0.04770563842,"remaining_time":0.2955003934}, -{"learn":[0.561355676],"iteration":139,"passed_time":0.04803399708,"remaining_time":0.2950659821}, 
-{"learn":[0.5606338722],"iteration":140,"passed_time":0.04827343044,"remaining_time":0.2940913245}, -{"learn":[0.5599243215],"iteration":141,"passed_time":0.04846762228,"remaining_time":0.2928536614}, -{"learn":[0.5588081224],"iteration":142,"passed_time":0.04874712044,"remaining_time":0.2921418337}, -{"learn":[0.5582083719],"iteration":143,"passed_time":0.04900995344,"remaining_time":0.2913369455}, -{"learn":[0.5571798342],"iteration":144,"passed_time":0.04921146206,"remaining_time":0.2901779315}, -{"learn":[0.5560722521],"iteration":145,"passed_time":0.04940132454,"remaining_time":0.288963912}, -{"learn":[0.555395518],"iteration":146,"passed_time":0.04963731404,"remaining_time":0.2880314889}, -{"learn":[0.5548768214],"iteration":147,"passed_time":0.04990071232,"remaining_time":0.2872662628}, -{"learn":[0.5540812016],"iteration":148,"passed_time":0.0501297989,"remaining_time":0.2863118044}, -{"learn":[0.5532853092],"iteration":149,"passed_time":0.05034942456,"remaining_time":0.2853134058}, -{"learn":[0.5524131826],"iteration":150,"passed_time":0.05056340966,"remaining_time":0.2842936079}, -{"learn":[0.5514765126],"iteration":151,"passed_time":0.0507795685,"remaining_time":0.2832965401}, -{"learn":[0.5503822821],"iteration":152,"passed_time":0.05116399042,"remaining_time":0.2832411757}, -{"learn":[0.5496355976],"iteration":153,"passed_time":0.05141092491,"remaining_time":0.2824262499}, -{"learn":[0.5487687332],"iteration":154,"passed_time":0.05165100907,"remaining_time":0.2815813075}, -{"learn":[0.5479253531],"iteration":155,"passed_time":0.05186466321,"remaining_time":0.2806011266}, -{"learn":[0.5468407444],"iteration":156,"passed_time":0.05205473411,"remaining_time":0.2795040819}, -{"learn":[0.5457558547],"iteration":157,"passed_time":0.05226718662,"remaining_time":0.278537792}, -{"learn":[0.5448993189],"iteration":158,"passed_time":0.05247221352,"remaining_time":0.277541708}, 
-{"learn":[0.5442429866],"iteration":159,"passed_time":0.05270743302,"remaining_time":0.2767140234}, -{"learn":[0.54350417],"iteration":160,"passed_time":0.05291273311,"remaining_time":0.2757377831}, -{"learn":[0.5428456068],"iteration":161,"passed_time":0.05315264439,"remaining_time":0.2749500988}, -{"learn":[0.5421104516],"iteration":162,"passed_time":0.05337751341,"remaining_time":0.274091894}, -{"learn":[0.5410383344],"iteration":163,"passed_time":0.05355567422,"remaining_time":0.2730033149}, -{"learn":[0.5401923571],"iteration":164,"passed_time":0.05374770452,"remaining_time":0.2719959592}, -{"learn":[0.5392873032],"iteration":165,"passed_time":0.0539680254,"remaining_time":0.2711405613}, -{"learn":[0.5386331763],"iteration":166,"passed_time":0.05418953977,"remaining_time":0.2702987223}, -{"learn":[0.5379828811],"iteration":167,"passed_time":0.05441635894,"remaining_time":0.2694905395}, -{"learn":[0.5372601577],"iteration":168,"passed_time":0.05558315582,"remaining_time":0.2733112573}, -{"learn":[0.5364935909],"iteration":169,"passed_time":0.05586399481,"remaining_time":0.2727477394}, -{"learn":[0.5359273042],"iteration":170,"passed_time":0.0561265324,"remaining_time":0.2720988033}, -{"learn":[0.5352867246],"iteration":171,"passed_time":0.05651085288,"remaining_time":0.2720406174}, -{"learn":[0.5346508111],"iteration":172,"passed_time":0.05671870839,"remaining_time":0.2711350973}, -{"learn":[0.5338960375],"iteration":173,"passed_time":0.05703869967,"remaining_time":0.2707699191}, -{"learn":[0.5331440909],"iteration":174,"passed_time":0.05730524196,"remaining_time":0.2701532835}, -{"learn":[0.5325764247],"iteration":175,"passed_time":0.05761228736,"remaining_time":0.2697302545}, -{"learn":[0.5319340825],"iteration":176,"passed_time":0.05788239346,"remaining_time":0.2691367786}, -{"learn":[0.5308882339],"iteration":177,"passed_time":0.05809320604,"remaining_time":0.26827312}, 
-{"learn":[0.5301761712],"iteration":178,"passed_time":0.0583282945,"remaining_time":0.2675280993}, -{"learn":[0.5292916383],"iteration":179,"passed_time":0.05855921721,"remaining_time":0.2667697673}, -{"learn":[0.5285842078],"iteration":180,"passed_time":0.0588231667,"remaining_time":0.2661667046}, -{"learn":[0.5280214122],"iteration":181,"passed_time":0.0591590054,"remaining_time":0.2658904748}, -{"learn":[0.5274686984],"iteration":182,"passed_time":0.06032077807,"remaining_time":0.26930096}, -{"learn":[0.5265111327],"iteration":183,"passed_time":0.06059102374,"remaining_time":0.2687080183}, -{"learn":[0.5254784737],"iteration":184,"passed_time":0.06079603508,"remaining_time":0.2678311816}, -{"learn":[0.5246844462],"iteration":185,"passed_time":0.06124754337,"remaining_time":0.2680403242}, -{"learn":[0.5240639022],"iteration":186,"passed_time":0.06168427593,"remaining_time":0.2681781622}, -{"learn":[0.523193121],"iteration":187,"passed_time":0.06204974676,"remaining_time":0.2680020977}, -{"learn":[0.5224034829],"iteration":188,"passed_time":0.06249607129,"remaining_time":0.2681709726}, -{"learn":[0.5215357968],"iteration":189,"passed_time":0.06776152737,"remaining_time":0.2888780904}, -{"learn":[0.5205158591],"iteration":190,"passed_time":0.06820345739,"remaining_time":0.2888827069}, -{"learn":[0.519499617],"iteration":191,"passed_time":0.06841906687,"remaining_time":0.2879302398}, -{"learn":[0.5189555841],"iteration":192,"passed_time":0.06874475572,"remaining_time":0.2874456884}, -{"learn":[0.5182280072],"iteration":193,"passed_time":0.06899695616,"remaining_time":0.286657457}, -{"learn":[0.5176118144],"iteration":194,"passed_time":0.06929205371,"remaining_time":0.2860518115}, -{"learn":[0.516600694],"iteration":195,"passed_time":0.06949547325,"remaining_time":0.2850732678}, -{"learn":[0.5160629494],"iteration":196,"passed_time":0.06978863324,"remaining_time":0.2844683883}, 
-{"learn":[0.5152068564],"iteration":197,"passed_time":0.07002515432,"remaining_time":0.2836372412}, -{"learn":[0.5146640411],"iteration":198,"passed_time":0.07031274963,"remaining_time":0.2830176505}, -{"learn":[0.5139804397],"iteration":199,"passed_time":0.0705935727,"remaining_time":0.2823742908}, -{"learn":[0.5130543666],"iteration":200,"passed_time":0.07085005622,"remaining_time":0.2816377857}, -{"learn":[0.5122875103],"iteration":201,"passed_time":0.07112385792,"remaining_time":0.2809744486}, -{"learn":[0.5114408987],"iteration":202,"passed_time":0.07147159041,"remaining_time":0.2806052096}, -{"learn":[0.5108332038],"iteration":203,"passed_time":0.07814299613,"remaining_time":0.3049109064}, -{"learn":[0.5102274801],"iteration":204,"passed_time":0.07866349022,"remaining_time":0.3050608523}, -{"learn":[0.5096990253],"iteration":205,"passed_time":0.07911588658,"remaining_time":0.3049418153}, -{"learn":[0.5091704939],"iteration":206,"passed_time":0.07955885831,"remaining_time":0.3047834524}, -{"learn":[0.5084650133],"iteration":207,"passed_time":0.07998701286,"remaining_time":0.3045659336}, -{"learn":[0.5076262014],"iteration":208,"passed_time":0.08035209092,"remaining_time":0.3041076742}, -{"learn":[0.5067908296],"iteration":209,"passed_time":0.08071680953,"remaining_time":0.3036489501}, -{"learn":[0.5058066249],"iteration":210,"passed_time":0.08121877271,"remaining_time":0.3037043207}, -{"learn":[0.505107935],"iteration":211,"passed_time":0.08166758157,"remaining_time":0.3035568598}, -{"learn":[0.5043528165],"iteration":212,"passed_time":0.08207285141,"remaining_time":0.3032456998}, -{"learn":[0.5037583411],"iteration":213,"passed_time":0.08253401348,"remaining_time":0.3031389467}, -{"learn":[0.5033062739],"iteration":214,"passed_time":0.08302365529,"remaining_time":0.3031328809}, -{"learn":[0.502780165],"iteration":215,"passed_time":0.08352755011,"remaining_time":0.3031740708}, 
-{"learn":[0.5019570121],"iteration":216,"passed_time":0.08413131042,"remaining_time":0.3035705809}, -{"learn":[0.5011352003],"iteration":217,"passed_time":0.08455725568,"remaining_time":0.303320064}, -{"learn":[0.500619271],"iteration":218,"passed_time":0.08503471347,"remaining_time":0.3032516494}, -{"learn":[0.499652671],"iteration":219,"passed_time":0.09698412874,"remaining_time":0.3438528201}, -{"learn":[0.4991394196],"iteration":220,"passed_time":0.1092758406,"remaining_time":0.3851849768}, -{"learn":[0.4984841432],"iteration":221,"passed_time":0.1096579418,"remaining_time":0.3842967512}, -{"learn":[0.4977248609],"iteration":222,"passed_time":0.1099168009,"remaining_time":0.3829836515}, -{"learn":[0.4969089159],"iteration":223,"passed_time":0.1101721498,"remaining_time":0.3816678045}, -{"learn":[0.4962528561],"iteration":224,"passed_time":0.1105039267,"remaining_time":0.3806246366}, -{"learn":[0.4952942176],"iteration":225,"passed_time":0.1107545805,"remaining_time":0.3793099351}, -{"learn":[0.4947139238],"iteration":226,"passed_time":0.1124684203,"remaining_time":0.3829871759}, -{"learn":[0.494205675],"iteration":227,"passed_time":0.1157605753,"remaining_time":0.3919612462}, -{"learn":[0.4934548054],"iteration":228,"passed_time":0.117541986,"remaining_time":0.3957417958}, -{"learn":[0.4928752099],"iteration":229,"passed_time":0.118417462,"remaining_time":0.3964410686}, -{"learn":[0.4921438651],"iteration":230,"passed_time":0.1199319189,"remaining_time":0.3992538771}, -{"learn":[0.4915419051],"iteration":231,"passed_time":0.1203728927,"remaining_time":0.3984757826}, -{"learn":[0.4906675262],"iteration":232,"passed_time":0.1208700419,"remaining_time":0.3978855028}, -{"learn":[0.4898703141],"iteration":233,"passed_time":0.121417344,"remaining_time":0.3974601944}, -{"learn":[0.4894825518],"iteration":234,"passed_time":0.12185313,"remaining_time":0.3966708273}, -{"learn":[0.4889106836],"iteration":235,"passed_time":0.1222620739,"remaining_time":0.3957975614}, 
-{"learn":[0.4883435837],"iteration":236,"passed_time":0.1227171697,"remaining_time":0.3950767953}, -{"learn":[0.4875512464],"iteration":237,"passed_time":0.1234467287,"remaining_time":0.3952370054}, -{"learn":[0.4866151299],"iteration":238,"passed_time":0.1238333033,"remaining_time":0.3942976728}, -{"learn":[0.4858255344],"iteration":239,"passed_time":0.125482289,"remaining_time":0.3973605817}, -{"learn":[0.4852675327],"iteration":240,"passed_time":0.1259663076,"remaining_time":0.3967154667}, -{"learn":[0.4844093025],"iteration":241,"passed_time":0.1263979249,"remaining_time":0.3959075499}, -{"learn":[0.4839843852],"iteration":242,"passed_time":0.1268836657,"remaining_time":0.395271337}, -{"learn":[0.4834949417],"iteration":243,"passed_time":0.1297374992,"remaining_time":0.4019735631}, -{"learn":[0.4828630686],"iteration":244,"passed_time":0.1304808537,"remaining_time":0.4020940592}, -{"learn":[0.4820830396],"iteration":245,"passed_time":0.1309091752,"remaining_time":0.4012419434}, -{"learn":[0.4811603895],"iteration":246,"passed_time":0.1312974567,"remaining_time":0.4002711938}, -{"learn":[0.4806003102],"iteration":247,"passed_time":0.1317431478,"remaining_time":0.3994792224}, -{"learn":[0.4801141407],"iteration":248,"passed_time":0.1322640065,"remaining_time":0.3989167426}, -{"learn":[0.4795589915],"iteration":249,"passed_time":0.1327555156,"remaining_time":0.3982665467}, -{"learn":[0.4790101647],"iteration":250,"passed_time":0.1332032868,"remaining_time":0.3974870988}, -{"learn":[0.4783102529],"iteration":251,"passed_time":0.1340747063,"remaining_time":0.3979677789}, -{"learn":[0.4777278134],"iteration":252,"passed_time":0.1346313837,"remaining_time":0.3975084729}, -{"learn":[0.4772474894],"iteration":253,"passed_time":0.1363684936,"remaining_time":0.4005153395}, -{"learn":[0.4764735358],"iteration":254,"passed_time":0.1367531262,"remaining_time":0.3995336431}, -{"learn":[0.4755656251],"iteration":255,"passed_time":0.1370101783,"remaining_time":0.3981858307}, 
-{"learn":[0.4751491078],"iteration":256,"passed_time":0.1382981169,"remaining_time":0.3998268516}, -{"learn":[0.4743878543],"iteration":257,"passed_time":0.1386027668,"remaining_time":0.3986172596}, -{"learn":[0.4736746209],"iteration":258,"passed_time":0.1403680436,"remaining_time":0.4015935147}, -{"learn":[0.4729804865],"iteration":259,"passed_time":0.1412752119,"remaining_time":0.4020909876}, -{"learn":[0.4724334478],"iteration":260,"passed_time":0.1420032738,"remaining_time":0.4020705722}, -{"learn":[0.4720227505],"iteration":261,"passed_time":0.1427360627,"remaining_time":0.4020580698}, -{"learn":[0.4712655161],"iteration":262,"passed_time":0.1434610555,"remaining_time":0.4020182428}, -{"learn":[0.4704395831],"iteration":263,"passed_time":0.1449213751,"remaining_time":0.4040232276}, -{"learn":[0.4698364777],"iteration":264,"passed_time":0.1456504406,"remaining_time":0.4039738637}, -{"learn":[0.4693632637],"iteration":265,"passed_time":0.1461866855,"remaining_time":0.4033873202}, -{"learn":[0.468612028],"iteration":266,"passed_time":0.1474025943,"remaining_time":0.4046670472}, -{"learn":[0.4677223393],"iteration":267,"passed_time":0.1481455952,"remaining_time":0.4046364765}, -{"learn":[0.4670231215],"iteration":268,"passed_time":0.1502482533,"remaining_time":0.4082954392}, -{"learn":[0.4663278971],"iteration":269,"passed_time":0.152772893,"remaining_time":0.4130526367}, -{"learn":[0.4658006259],"iteration":270,"passed_time":0.1535577956,"remaining_time":0.4130761366}, -{"learn":[0.4653341855],"iteration":271,"passed_time":0.1541995322,"remaining_time":0.4127105125}, -{"learn":[0.4646414902],"iteration":272,"passed_time":0.1546745237,"remaining_time":0.4118988233}, -{"learn":[0.4639707293],"iteration":273,"passed_time":0.1553459531,"remaining_time":0.4116100801}, -{"learn":[0.4632801924],"iteration":274,"passed_time":0.1558574794,"remaining_time":0.410896991}, -{"learn":[0.4628103631],"iteration":275,"passed_time":0.1564273611,"remaining_time":0.41033844}, 
-{"learn":[0.4620725938],"iteration":276,"passed_time":0.1571536921,"remaining_time":0.4101881565}, -{"learn":[0.4616752139],"iteration":277,"passed_time":0.1578959374,"remaining_time":0.4100750605}, -{"learn":[0.4608016184],"iteration":278,"passed_time":0.1583776694,"remaining_time":0.4092842281}, -{"learn":[0.459932306],"iteration":279,"passed_time":0.1614568995,"remaining_time":0.4151748844}, -{"learn":[0.4593444594],"iteration":280,"passed_time":0.1618450488,"remaining_time":0.4141159791}, -{"learn":[0.4588904636],"iteration":281,"passed_time":0.1622101039,"remaining_time":0.4130030306}, -{"learn":[0.4582101745],"iteration":282,"passed_time":0.1630924951,"remaining_time":0.4132060741}, -{"learn":[0.4575310392],"iteration":283,"passed_time":0.1634776178,"remaining_time":0.4121477969}, -{"learn":[0.4566698628],"iteration":284,"passed_time":0.1640455756,"remaining_time":0.4115529353}, -{"learn":[0.4561560367],"iteration":285,"passed_time":0.1644130466,"remaining_time":0.4104577456}, -{"learn":[0.4554346417],"iteration":286,"passed_time":0.1648104569,"remaining_time":0.4094420063}, -{"learn":[0.454855838],"iteration":287,"passed_time":0.1652337472,"remaining_time":0.4084945417}, -{"learn":[0.4541831996],"iteration":288,"passed_time":0.1663634164,"remaining_time":0.4092885434}, -{"learn":[0.4536689264],"iteration":289,"passed_time":0.1668685602,"remaining_time":0.4085402681}, -{"learn":[0.4531557347],"iteration":290,"passed_time":0.1673281334,"remaining_time":0.4076826343}, -{"learn":[0.4527086743],"iteration":291,"passed_time":0.1698112535,"remaining_time":0.4117341352}, -{"learn":[0.4522080932],"iteration":292,"passed_time":0.1708048201,"remaining_time":0.4121467844}, -{"learn":[0.4516352969],"iteration":293,"passed_time":0.171830531,"remaining_time":0.4126270574}, -{"learn":[0.4509672991],"iteration":294,"passed_time":0.1725648195,"remaining_time":0.4124006704}, -{"learn":[0.4502509066],"iteration":295,"passed_time":0.1732911723,"remaining_time":0.4121519774}, 
-{"learn":[0.4495855272],"iteration":296,"passed_time":0.1738579907,"remaining_time":0.4115224495}, -{"learn":[0.4488103007],"iteration":297,"passed_time":0.1745239639,"remaining_time":0.4111269216}, -{"learn":[0.4482808496],"iteration":298,"passed_time":0.1761683281,"remaining_time":0.4130234047}, -{"learn":[0.44764027],"iteration":299,"passed_time":0.1767470974,"remaining_time":0.4124098938}, -{"learn":[0.4471345288],"iteration":300,"passed_time":0.1773201968,"remaining_time":0.411783447}, -{"learn":[0.4464275283],"iteration":301,"passed_time":0.1779259195,"remaining_time":0.4112327542}, -{"learn":[0.445842513],"iteration":302,"passed_time":0.1786413143,"remaining_time":0.4109339804}, -{"learn":[0.4453389304],"iteration":303,"passed_time":0.179133654,"remaining_time":0.4101217869}, -{"learn":[0.4445062748],"iteration":304,"passed_time":0.1796293844,"remaining_time":0.4093194168}, -{"learn":[0.4438071592],"iteration":305,"passed_time":0.180148086,"remaining_time":0.4085711493}, -{"learn":[0.4431103936],"iteration":306,"passed_time":0.1805860483,"remaining_time":0.4076421221}, -{"learn":[0.4424160804],"iteration":307,"passed_time":0.1815578997,"remaining_time":0.4079158005}, -{"learn":[0.4419180283],"iteration":308,"passed_time":0.1820822926,"remaining_time":0.4071807902}, -{"learn":[0.4410959354],"iteration":309,"passed_time":0.1825291209,"remaining_time":0.406274495}, -{"learn":[0.4403396802],"iteration":310,"passed_time":0.1832653693,"remaining_time":0.4060123454}, -{"learn":[0.439715841],"iteration":311,"passed_time":0.1839737579,"remaining_time":0.4056857226}, -{"learn":[0.4391380974],"iteration":312,"passed_time":0.1846743289,"remaining_time":0.4053395015}, -{"learn":[0.4387702474],"iteration":313,"passed_time":0.1853864409,"remaining_time":0.4050162371}, -{"learn":[0.4382839075],"iteration":314,"passed_time":0.1860388514,"remaining_time":0.404560677}, -{"learn":[0.437851578],"iteration":315,"passed_time":0.1866688863,"remaining_time":0.4040554374}, 
-{"learn":[0.4370380257],"iteration":316,"passed_time":0.1882798317,"remaining_time":0.4056628551}, -{"learn":[0.4365532909],"iteration":317,"passed_time":0.188876909,"remaining_time":0.405075635}, -{"learn":[0.4359790853],"iteration":318,"passed_time":0.1894348055,"remaining_time":0.4044047102}, -{"learn":[0.4353643443],"iteration":319,"passed_time":0.1906715673,"remaining_time":0.4051770806}, -{"learn":[0.4349343138],"iteration":320,"passed_time":0.191420987,"remaining_time":0.4049060753}, -{"learn":[0.4341925383],"iteration":321,"passed_time":0.1931586684,"remaining_time":0.4067129727}, -{"learn":[0.4335165833],"iteration":322,"passed_time":0.1946537089,"remaining_time":0.4079893526}, -{"learn":[0.4328867538],"iteration":323,"passed_time":0.1955175543,"remaining_time":0.4079316873}, -{"learn":[0.4323144172],"iteration":324,"passed_time":0.1964996124,"remaining_time":0.4081145797}, -{"learn":[0.4316435754],"iteration":325,"passed_time":0.1973518754,"remaining_time":0.4080219755}, -{"learn":[0.4310806351],"iteration":326,"passed_time":0.1983415272,"remaining_time":0.4082074857}, -{"learn":[0.4305228548],"iteration":327,"passed_time":0.1987165671,"remaining_time":0.4071266252}, -{"learn":[0.4299000289],"iteration":328,"passed_time":0.1990773521,"remaining_time":0.4060209825}, -{"learn":[0.4294867728],"iteration":329,"passed_time":0.1999732747,"remaining_time":0.4060063456}, -{"learn":[0.4287576846],"iteration":330,"passed_time":0.2005555481,"remaining_time":0.4053524523}, -{"learn":[0.4281379964],"iteration":331,"passed_time":0.2011324594,"remaining_time":0.4046882014}, -{"learn":[0.4274753375],"iteration":332,"passed_time":0.2016183908,"remaining_time":0.4038422422}, -{"learn":[0.4269457119],"iteration":333,"passed_time":0.2021672636,"remaining_time":0.4031239447}, -{"learn":[0.4265914985],"iteration":334,"passed_time":0.2028451346,"remaining_time":0.4026627298}, -{"learn":[0.4261210859],"iteration":335,"passed_time":0.2034348599,"remaining_time":0.4020260327}, 
-{"learn":[0.4253347473],"iteration":336,"passed_time":0.2039420057,"remaining_time":0.4012271508}, -{"learn":[0.4246753199],"iteration":337,"passed_time":0.2044512739,"remaining_time":0.4004341518}, -{"learn":[0.4242052266],"iteration":338,"passed_time":0.2050077051,"remaining_time":0.3997347879}, -{"learn":[0.423488817],"iteration":339,"passed_time":0.2055324916,"remaining_time":0.3989748366}, -{"learn":[0.4228350009],"iteration":340,"passed_time":0.2061729365,"remaining_time":0.3984397805}, -{"learn":[0.4222266035],"iteration":341,"passed_time":0.2068351056,"remaining_time":0.3979459049}, -{"learn":[0.4217495535],"iteration":342,"passed_time":0.2075861733,"remaining_time":0.397621329}, -{"learn":[0.4212821083],"iteration":343,"passed_time":0.208360868,"remaining_time":0.3973393296}, -{"learn":[0.4208801644],"iteration":344,"passed_time":0.2093298237,"remaining_time":0.3974232885}, -{"learn":[0.4203901972],"iteration":345,"passed_time":0.209898891,"remaining_time":0.3967453027}, -{"learn":[0.4198750854],"iteration":346,"passed_time":0.2104952424,"remaining_time":0.3961192889}, -{"learn":[0.4193912702],"iteration":347,"passed_time":0.2110447189,"remaining_time":0.3954056228}, -{"learn":[0.4187875901],"iteration":348,"passed_time":0.2115227897,"remaining_time":0.3945597023}, -{"learn":[0.4183272166],"iteration":349,"passed_time":0.2121592355,"remaining_time":0.3940100088}, -{"learn":[0.4178133053],"iteration":350,"passed_time":0.2126936594,"remaining_time":0.3932711822}, -{"learn":[0.4172735342],"iteration":351,"passed_time":0.2149225279,"remaining_time":0.3956528354}, -{"learn":[0.4165695991],"iteration":352,"passed_time":0.2163618086,"remaining_time":0.3965611619}, -{"learn":[0.416060222],"iteration":353,"passed_time":0.2169572078,"remaining_time":0.3959162605}, -{"learn":[0.4154641713],"iteration":354,"passed_time":0.2175585204,"remaining_time":0.3952823821}, -{"learn":[0.414705119],"iteration":355,"passed_time":0.2180793806,"remaining_time":0.3945031492}, 
-{"learn":[0.4140638581],"iteration":356,"passed_time":0.2186555459,"remaining_time":0.3938249749}, -{"learn":[0.4134714263],"iteration":357,"passed_time":0.2191818016,"remaining_time":0.3930578678}, -{"learn":[0.4131337319],"iteration":358,"passed_time":0.2209840315,"remaining_time":0.3945703737}, -{"learn":[0.4126064479],"iteration":359,"passed_time":0.2216364612,"remaining_time":0.3940203755}, -{"learn":[0.4121523883],"iteration":360,"passed_time":0.2221682313,"remaining_time":0.3932562321}, -{"learn":[0.4115223118],"iteration":361,"passed_time":0.2227907028,"remaining_time":0.3926532276}, -{"learn":[0.4110527294],"iteration":362,"passed_time":0.2234476052,"remaining_time":0.3921105359}, -{"learn":[0.4106555283],"iteration":363,"passed_time":0.2242245521,"remaining_time":0.3917769646}, -{"learn":[0.4100904507],"iteration":364,"passed_time":0.2249400169,"remaining_time":0.391334002}, -{"learn":[0.4095082581],"iteration":365,"passed_time":0.2254414988,"remaining_time":0.3905188805}, -{"learn":[0.4088843039],"iteration":366,"passed_time":0.2263359222,"remaining_time":0.3903832119}, -{"learn":[0.4081387094],"iteration":367,"passed_time":0.226922421,"remaining_time":0.3897145925}, -{"learn":[0.4073968657],"iteration":368,"passed_time":0.2284440017,"remaining_time":0.3906454338}, -{"learn":[0.40671779],"iteration":369,"passed_time":0.2302709512,"remaining_time":0.392082971}, -{"learn":[0.4063282268],"iteration":370,"passed_time":0.2310629611,"remaining_time":0.3917482548}, -{"learn":[0.4058952715],"iteration":371,"passed_time":0.2351638011,"remaining_time":0.3969969546}, -{"learn":[0.4054609069],"iteration":372,"passed_time":0.23588082,"remaining_time":0.3965074373}, -{"learn":[0.4049458589],"iteration":373,"passed_time":0.2364975176,"remaining_time":0.3958487862}, -{"learn":[0.4045187661],"iteration":374,"passed_time":0.2372490337,"remaining_time":0.3954150561}, -{"learn":[0.4038458211],"iteration":375,"passed_time":0.2379071814,"remaining_time":0.394824684}, 
-{"learn":[0.4031138676],"iteration":376,"passed_time":0.2384520287,"remaining_time":0.3940467212}, -{"learn":[0.4025977935],"iteration":377,"passed_time":0.2390649109,"remaining_time":0.3933819433}, -{"learn":[0.4021515165],"iteration":378,"passed_time":0.2396257877,"remaining_time":0.3926322274}, -{"learn":[0.4017208729],"iteration":379,"passed_time":0.2402559285,"remaining_time":0.3919965148}, -{"learn":[0.4012702831],"iteration":380,"passed_time":0.2413900987,"remaining_time":0.3921797142}, -{"learn":[0.400815968],"iteration":381,"passed_time":0.2422200659,"remaining_time":0.3918638763}, -{"learn":[0.4003585875],"iteration":382,"passed_time":0.2430818001,"remaining_time":0.3915965292}, -{"learn":[0.3996368136],"iteration":383,"passed_time":0.243582873,"remaining_time":0.3907475255}, -{"learn":[0.3989150098],"iteration":384,"passed_time":0.2441080496,"remaining_time":0.3899388325}, -{"learn":[0.3984945757],"iteration":385,"passed_time":0.2448157116,"remaining_time":0.3894218832}, -{"learn":[0.3979838576],"iteration":386,"passed_time":0.2457731315,"remaining_time":0.3892995597}, -{"learn":[0.3975541421],"iteration":387,"passed_time":0.2465708034,"remaining_time":0.3889209579}, -{"learn":[0.3969472051],"iteration":388,"passed_time":0.2470648508,"remaining_time":0.3880633004}, -{"learn":[0.3965793593],"iteration":389,"passed_time":0.2477127547,"remaining_time":0.3874481548}, -{"learn":[0.3960817797],"iteration":390,"passed_time":0.2482348895,"remaining_time":0.3866369506}, -{"learn":[0.3953677969],"iteration":391,"passed_time":0.2486813269,"remaining_time":0.3857098131}, -{"learn":[0.3948702344],"iteration":392,"passed_time":0.2491915071,"remaining_time":0.3848835747}, -{"learn":[0.3943742556],"iteration":393,"passed_time":0.2497690259,"remaining_time":0.3841625119}, -{"learn":[0.3938194088],"iteration":394,"passed_time":0.2503357995,"remaining_time":0.3834257182}, -{"learn":[0.393111263],"iteration":395,"passed_time":0.2507962749,"remaining_time":0.3825276516}, 
-{"learn":[0.3924046883],"iteration":396,"passed_time":0.2513014639,"remaining_time":0.3816997046}, -{"learn":[0.3919914365],"iteration":397,"passed_time":0.2518446428,"remaining_time":0.3809308416}, -{"learn":[0.3914010099],"iteration":398,"passed_time":0.2523300163,"remaining_time":0.3800760396}, -{"learn":[0.390907675],"iteration":399,"passed_time":0.2528522688,"remaining_time":0.3792784032}, -{"learn":[0.390361424],"iteration":400,"passed_time":0.2533540735,"remaining_time":0.3784515961}, -{"learn":[0.3897764002],"iteration":401,"passed_time":0.2538981331,"remaining_time":0.3776892627}, -{"learn":[0.3892910055],"iteration":402,"passed_time":0.2544391916,"remaining_time":0.3769235667}, -{"learn":[0.3889340375],"iteration":403,"passed_time":0.2553128122,"remaining_time":0.3766495942}, -{"learn":[0.3882963104],"iteration":404,"passed_time":0.2560603135,"remaining_time":0.3761873742}, -{"learn":[0.3878849873],"iteration":405,"passed_time":0.2582861316,"remaining_time":0.3778866063}, -{"learn":[0.3873047105],"iteration":406,"passed_time":0.2590881032,"remaining_time":0.377492003}, -{"learn":[0.3869512975],"iteration":407,"passed_time":0.2609867835,"remaining_time":0.3786867055}, -{"learn":[0.3864888975],"iteration":408,"passed_time":0.261609425,"remaining_time":0.3780224209}, -{"learn":[0.3860683399],"iteration":409,"passed_time":0.2624639087,"remaining_time":0.3776919661}, -{"learn":[0.3854924994],"iteration":410,"passed_time":0.2635370918,"remaining_time":0.3776723773}, -{"learn":[0.3850928077],"iteration":411,"passed_time":0.2646888275,"remaining_time":0.377759783}, -{"learn":[0.3847414127],"iteration":412,"passed_time":0.2654285953,"remaining_time":0.3772556548}, -{"learn":[0.3842214559],"iteration":413,"passed_time":0.2661514942,"remaining_time":0.3767265111}, -{"learn":[0.3839176638],"iteration":414,"passed_time":0.2675213856,"remaining_time":0.3771084592}, -{"learn":[0.3834964207],"iteration":415,"passed_time":0.2682169357,"remaining_time":0.3765353136}, 
-{"learn":[0.3829643939],"iteration":416,"passed_time":0.2687970532,"remaining_time":0.3758001966}, -{"learn":[0.3826096952],"iteration":417,"passed_time":0.2698969921,"remaining_time":0.3757895918}, -{"learn":[0.3820993347],"iteration":418,"passed_time":0.2731110483,"remaining_time":0.3787052961}, -{"learn":[0.3815246395],"iteration":419,"passed_time":0.2745873571,"remaining_time":0.3791920646}, -{"learn":[0.3812263012],"iteration":420,"passed_time":0.2761327299,"remaining_time":0.3797644908}, -{"learn":[0.380756293],"iteration":421,"passed_time":0.2770033586,"remaining_time":0.3794027045}, -{"learn":[0.3803054477],"iteration":422,"passed_time":0.277682967,"remaining_time":0.3787779478}, -{"learn":[0.3799018434],"iteration":423,"passed_time":0.2782834207,"remaining_time":0.3780454018}, -{"learn":[0.3793343902],"iteration":424,"passed_time":0.2788537715,"remaining_time":0.3772727497}, -{"learn":[0.3789422768],"iteration":425,"passed_time":0.2795747181,"remaining_time":0.3767039628}, -{"learn":[0.3786449858],"iteration":426,"passed_time":0.2803431543,"remaining_time":0.3761981907}, -{"learn":[0.3781736961],"iteration":427,"passed_time":0.28105148,"remaining_time":0.3756108565}, -{"learn":[0.3776146003],"iteration":428,"passed_time":0.28163332,"remaining_time":0.3748546054}, -{"learn":[0.3772632352],"iteration":429,"passed_time":0.2821840289,"remaining_time":0.3740578987}, -{"learn":[0.3768726545],"iteration":430,"passed_time":0.2827157601,"remaining_time":0.3732372795}, -{"learn":[0.3763188847],"iteration":431,"passed_time":0.2832323109,"remaining_time":0.3723980384}, -{"learn":[0.3759762943],"iteration":432,"passed_time":0.2838044071,"remaining_time":0.3716330227}, -{"learn":[0.3756239925],"iteration":433,"passed_time":0.2847285037,"remaining_time":0.3713279564}, -{"learn":[0.3752392616],"iteration":434,"passed_time":0.2855756343,"remaining_time":0.3709200768}, -{"learn":[0.3746320861],"iteration":435,"passed_time":0.2861310843,"remaining_time":0.3701328705}, 
-{"learn":[0.3739689333],"iteration":436,"passed_time":0.2866816654,"remaining_time":0.3693404523}, -{"learn":[0.3736773006],"iteration":437,"passed_time":0.2872724155,"remaining_time":0.3686006792}, -{"learn":[0.3732340549],"iteration":438,"passed_time":0.2878093622,"remaining_time":0.3677928296}, -{"learn":[0.3725745422],"iteration":439,"passed_time":0.2883467403,"remaining_time":0.3669867603}, -{"learn":[0.3719174904],"iteration":440,"passed_time":0.2894269674,"remaining_time":0.3668700108}, -{"learn":[0.3714807417],"iteration":441,"passed_time":0.2899982225,"remaining_time":0.3661063533}, -{"learn":[0.3710459641],"iteration":442,"passed_time":0.2905078938,"remaining_time":0.3652661328}, -{"learn":[0.3703917946],"iteration":443,"passed_time":0.2909793189,"remaining_time":0.3643795075}, -{"learn":[0.3699332518],"iteration":444,"passed_time":0.2915444008,"remaining_time":0.3636115561}, -{"learn":[0.3694799159],"iteration":445,"passed_time":0.2920996989,"remaining_time":0.3628323615}, -{"learn":[0.3688319496],"iteration":446,"passed_time":0.2925414694,"remaining_time":0.3619137194}, -{"learn":[0.3682378743],"iteration":447,"passed_time":0.2931437852,"remaining_time":0.3611950211}, -{"learn":[0.3677363523],"iteration":448,"passed_time":0.2936419292,"remaining_time":0.3603490044}, -{"learn":[0.3674085396],"iteration":449,"passed_time":0.2942661609,"remaining_time":0.3596586411}, -{"learn":[0.367077423],"iteration":450,"passed_time":0.2949064941,"remaining_time":0.3589881713}, -{"learn":[0.3667051792],"iteration":451,"passed_time":0.2954570075,"remaining_time":0.3582089383}, -{"learn":[0.3664227],"iteration":452,"passed_time":0.2960547363,"remaining_time":0.357487728}, -{"learn":[0.3660525254],"iteration":453,"passed_time":0.2965446128,"remaining_time":0.3566373538}, -{"learn":[0.3654101065],"iteration":454,"passed_time":0.2970064531,"remaining_time":0.3557549823}, -{"learn":[0.3649876288],"iteration":455,"passed_time":0.2975205166,"remaining_time":0.3549367567}, 
-{"learn":[0.3646104591],"iteration":456,"passed_time":0.2980737725,"remaining_time":0.3541664298}, -{"learn":[0.364115434],"iteration":457,"passed_time":0.2990222064,"remaining_time":0.3538647071}, -{"learn":[0.3637939692],"iteration":458,"passed_time":0.3000197081,"remaining_time":0.3536180002}, -{"learn":[0.3634185621],"iteration":459,"passed_time":0.3006429692,"remaining_time":0.352928703}, -{"learn":[0.3628906863],"iteration":460,"passed_time":0.301148196,"remaining_time":0.3521016869}, -{"learn":[0.362450093],"iteration":461,"passed_time":0.3016677058,"remaining_time":0.3512926963}, -{"learn":[0.3618173812],"iteration":462,"passed_time":0.3021884164,"remaining_time":0.350486349}, -{"learn":[0.3612926432],"iteration":463,"passed_time":0.302972743,"remaining_time":0.3499857549}, -{"learn":[0.3608229033],"iteration":464,"passed_time":0.3040010826,"remaining_time":0.3497646864}, -{"learn":[0.3603342303],"iteration":465,"passed_time":0.3049503335,"remaining_time":0.3494495238}, -{"learn":[0.3599510874],"iteration":466,"passed_time":0.3096284342,"remaining_time":0.3533874849}, -{"learn":[0.3593257538],"iteration":467,"passed_time":0.3103259611,"remaining_time":0.3527636994}, -{"learn":[0.3588909294],"iteration":468,"passed_time":0.3113314734,"remaining_time":0.3524882993}, -{"learn":[0.3584752466],"iteration":469,"passed_time":0.3120732412,"remaining_time":0.3519123784}, -{"learn":[0.3580092447],"iteration":470,"passed_time":0.3127731955,"remaining_time":0.3512887907}, -{"learn":[0.3575279159],"iteration":471,"passed_time":0.3134354027,"remaining_time":0.3506226539}, -{"learn":[0.3570059666],"iteration":472,"passed_time":0.3153024356,"remaining_time":0.3512989081}, -{"learn":[0.356525468],"iteration":473,"passed_time":0.3157099761,"remaining_time":0.3503448258}, -{"learn":[0.3561132167],"iteration":474,"passed_time":0.3161932925,"remaining_time":0.349476797}, -{"learn":[0.3557490196],"iteration":475,"passed_time":0.3166485945,"remaining_time":0.3485795452}, 
-{"learn":[0.3554368232],"iteration":476,"passed_time":0.3172151434,"remaining_time":0.3478061216}, -{"learn":[0.3550801958],"iteration":477,"passed_time":0.3176272495,"remaining_time":0.3468649043}, -{"learn":[0.3544650078],"iteration":478,"passed_time":0.3179188014,"remaining_time":0.3457947715}, -{"learn":[0.3541559832],"iteration":479,"passed_time":0.3183427669,"remaining_time":0.3448713308}, -{"learn":[0.3537509612],"iteration":480,"passed_time":0.3187753508,"remaining_time":0.3439592662}, -{"learn":[0.3532782325],"iteration":481,"passed_time":0.3191261262,"remaining_time":0.3429612726}, -{"learn":[0.3527637294],"iteration":482,"passed_time":0.3194514467,"remaining_time":0.3419387121}, -{"learn":[0.3522067794],"iteration":483,"passed_time":0.3197839534,"remaining_time":0.3409266941}, -{"learn":[0.3516512513],"iteration":484,"passed_time":0.3201487053,"remaining_time":0.339951718}, -{"learn":[0.3513434274],"iteration":485,"passed_time":0.3205595731,"remaining_time":0.3390280259}, -{"learn":[0.3509157343],"iteration":486,"passed_time":0.3209247611,"remaining_time":0.3380583213}, -{"learn":[0.3505166599],"iteration":487,"passed_time":0.3212523916,"remaining_time":0.3370516896}, -{"learn":[0.3500145929],"iteration":488,"passed_time":0.3215824291,"remaining_time":0.3360503502}, -{"learn":[0.3496588809],"iteration":489,"passed_time":0.3219426458,"remaining_time":0.335083162}, -{"learn":[0.3492625185],"iteration":490,"passed_time":0.322399651,"remaining_time":0.3342187829}, -{"learn":[0.3488656453],"iteration":491,"passed_time":0.3227923015,"remaining_time":0.3332896121}, -{"learn":[0.3484638504],"iteration":492,"passed_time":0.3231560736,"remaining_time":0.3323329195}, -{"learn":[0.3480004881],"iteration":493,"passed_time":0.3234965985,"remaining_time":0.3313548154}, -{"learn":[0.3476062758],"iteration":494,"passed_time":0.323941694,"remaining_time":0.3304859706}, -{"learn":[0.3472642175],"iteration":495,"passed_time":0.324350876,"remaining_time":0.3295823417}, 
-{"learn":[0.3468141258],"iteration":496,"passed_time":0.3247473794,"remaining_time":0.3286678709}, -{"learn":[0.3463197521],"iteration":497,"passed_time":0.3250688244,"remaining_time":0.3276798189}, -{"learn":[0.3460200386],"iteration":498,"passed_time":0.3254756201,"remaining_time":0.3267801316}, -{"learn":[0.3455273892],"iteration":499,"passed_time":0.3258117316,"remaining_time":0.3258117316}, -{"learn":[0.3449356386],"iteration":500,"passed_time":0.3261248511,"remaining_time":0.3248229555}, -{"learn":[0.3445860488],"iteration":501,"passed_time":0.3265338473,"remaining_time":0.3239319839}, -{"learn":[0.3441699871],"iteration":502,"passed_time":0.3269095564,"remaining_time":0.3230100388}, -{"learn":[0.3437617634],"iteration":503,"passed_time":0.3272397571,"remaining_time":0.3220454753}, -{"learn":[0.343414247],"iteration":504,"passed_time":0.3276446868,"remaining_time":0.3211566732}, -{"learn":[0.3430223976],"iteration":505,"passed_time":0.3283799178,"remaining_time":0.3205922518}, -{"learn":[0.34276789],"iteration":506,"passed_time":0.3288634981,"remaining_time":0.3197824548}, -{"learn":[0.3423123913],"iteration":507,"passed_time":0.3292155659,"remaining_time":0.3188465717}, -{"learn":[0.3419258424],"iteration":508,"passed_time":0.3295586355,"remaining_time":0.3179043026}, -{"learn":[0.3414402349],"iteration":509,"passed_time":0.3298939341,"remaining_time":0.3169569171}, -{"learn":[0.3409900154],"iteration":510,"passed_time":0.3302300308,"remaining_time":0.3160126909}, -{"learn":[0.3404066435],"iteration":511,"passed_time":0.330526685,"remaining_time":0.3150332467}, -{"learn":[0.3400745732],"iteration":512,"passed_time":0.3315326622,"remaining_time":0.3147298372}, -{"learn":[0.3396446535],"iteration":513,"passed_time":0.3319733954,"remaining_time":0.3138892416}, -{"learn":[0.3392627665],"iteration":514,"passed_time":0.3323251441,"remaining_time":0.3129663978}, -{"learn":[0.3389605539],"iteration":515,"passed_time":0.3327506488,"remaining_time":0.3121149497}, 
-{"learn":[0.3386215099],"iteration":516,"passed_time":0.3331681775,"remaining_time":0.3112576977}, -{"learn":[0.3381765485],"iteration":517,"passed_time":0.3334935631,"remaining_time":0.3103164043}, -{"learn":[0.3376931037],"iteration":518,"passed_time":0.3338169224,"remaining_time":0.3093756063}, -{"learn":[0.337248815],"iteration":519,"passed_time":0.3341415136,"remaining_time":0.3084383203}, -{"learn":[0.3367245538],"iteration":520,"passed_time":0.3344925904,"remaining_time":0.3075277367}, -{"learn":[0.3363859143],"iteration":521,"passed_time":0.3376304277,"remaining_time":0.3091711579}, -{"learn":[0.3359881639],"iteration":522,"passed_time":0.3385165422,"remaining_time":0.3087426207}, -{"learn":[0.335651734],"iteration":523,"passed_time":0.3391893682,"remaining_time":0.3081185864}, -{"learn":[0.335174254],"iteration":524,"passed_time":0.3398135656,"remaining_time":0.3074503688}, -{"learn":[0.3346553019],"iteration":525,"passed_time":0.340461328,"remaining_time":0.3068035541}, -{"learn":[0.3342645126],"iteration":526,"passed_time":0.3411102502,"remaining_time":0.3061577768}, -{"learn":[0.333931561],"iteration":527,"passed_time":0.3421015002,"remaining_time":0.3058180078}, -{"learn":[0.3334565205],"iteration":528,"passed_time":0.3429734602,"remaining_time":0.3053695647}, -{"learn":[0.3331309302],"iteration":529,"passed_time":0.3437869073,"remaining_time":0.3048676348}, -{"learn":[0.3328454452],"iteration":530,"passed_time":0.3462173018,"remaining_time":0.3057926827}, -{"learn":[0.3325251554],"iteration":531,"passed_time":0.3466204815,"remaining_time":0.304921777}, -{"learn":[0.3321362521],"iteration":532,"passed_time":0.3474775273,"remaining_time":0.3044502913}, -{"learn":[0.3318056336],"iteration":533,"passed_time":0.3478953928,"remaining_time":0.3035941068}, -{"learn":[0.3313364259],"iteration":534,"passed_time":0.3482484741,"remaining_time":0.3026832532}, -{"learn":[0.3309532276],"iteration":535,"passed_time":0.3486176657,"remaining_time":0.3017884271}, 
-{"learn":[0.3306332145],"iteration":536,"passed_time":0.3494756577,"remaining_time":0.301317001}, -{"learn":[0.3302503697],"iteration":537,"passed_time":0.3499166267,"remaining_time":0.3004860252}, -{"learn":[0.3298849165],"iteration":538,"passed_time":0.3502547191,"remaining_time":0.2995685075}, -{"learn":[0.3294564698],"iteration":539,"passed_time":0.3505782409,"remaining_time":0.2986407237}, -{"learn":[0.3290752811],"iteration":540,"passed_time":0.3509396463,"remaining_time":0.2977473154}, -{"learn":[0.3286145287],"iteration":541,"passed_time":0.351250937,"remaining_time":0.2968135224}, -{"learn":[0.3283276004],"iteration":542,"passed_time":0.3517529965,"remaining_time":0.2960425771}, -{"learn":[0.327948004],"iteration":543,"passed_time":0.352163302,"remaining_time":0.295195709}, -{"learn":[0.3276326656],"iteration":544,"passed_time":0.3524674003,"remaining_time":0.2942617746}, -{"learn":[0.3273104302],"iteration":545,"passed_time":0.3528592852,"remaining_time":0.2934031419}, -{"learn":[0.3269880712],"iteration":546,"passed_time":0.3532791953,"remaining_time":0.292569425}, -{"learn":[0.3266501256],"iteration":547,"passed_time":0.3536846858,"remaining_time":0.2917253248}, -{"learn":[0.3262259236],"iteration":548,"passed_time":0.3540171476,"remaining_time":0.2908228298}, -{"learn":[0.3258500312],"iteration":549,"passed_time":0.3543562466,"remaining_time":0.2899278381}, -{"learn":[0.3253507018],"iteration":550,"passed_time":0.3546589755,"remaining_time":0.2890052269}, -{"learn":[0.3250135141],"iteration":551,"passed_time":0.355055667,"remaining_time":0.288161121}, -{"learn":[0.3246421516],"iteration":552,"passed_time":0.3554378948,"remaining_time":0.2873069421}, -{"learn":[0.3244070964],"iteration":553,"passed_time":0.3558949874,"remaining_time":0.2865147371}, -{"learn":[0.3240867768],"iteration":554,"passed_time":0.3562683347,"remaining_time":0.2856565927}, -{"learn":[0.3238518877],"iteration":555,"passed_time":0.3570102351,"remaining_time":0.2850945043}, 
-{"learn":[0.3235307634],"iteration":556,"passed_time":0.3574072901,"remaining_time":0.2842575036}, -{"learn":[0.3231597074],"iteration":557,"passed_time":0.3577621642,"remaining_time":0.2833886677}, -{"learn":[0.3226190891],"iteration":558,"passed_time":0.3580537165,"remaining_time":0.2824717155}, -{"learn":[0.3221721266],"iteration":559,"passed_time":0.3583790962,"remaining_time":0.2815835756}, -{"learn":[0.3217267905],"iteration":560,"passed_time":0.3587386326,"remaining_time":0.2807241706}, -{"learn":[0.32127652],"iteration":561,"passed_time":0.3590720036,"remaining_time":0.2798461523}, -{"learn":[0.3210437383],"iteration":562,"passed_time":0.3595055899,"remaining_time":0.2790478558}, -{"learn":[0.3208125659],"iteration":563,"passed_time":0.3598968851,"remaining_time":0.2782181594}, -{"learn":[0.3204400071],"iteration":564,"passed_time":0.3602902071,"remaining_time":0.2773915754}, -{"learn":[0.3199909372],"iteration":565,"passed_time":0.36064548,"remaining_time":0.2765373469}, -{"learn":[0.3196365365],"iteration":566,"passed_time":0.3609983237,"remaining_time":0.2756830232}, -{"learn":[0.3191042415],"iteration":567,"passed_time":0.3612906944,"remaining_time":0.2747844718}, -{"learn":[0.3186643677],"iteration":568,"passed_time":0.3616025116,"remaining_time":0.2739027812}, -{"learn":[0.3183591408],"iteration":569,"passed_time":0.362010737,"remaining_time":0.2730958191}, -{"learn":[0.3178278974],"iteration":570,"passed_time":0.3623038477,"remaining_time":0.2722037665}, -{"learn":[0.3174204188],"iteration":571,"passed_time":0.3626155564,"remaining_time":0.271327724}, -{"learn":[0.3169393625],"iteration":572,"passed_time":0.3629143003,"remaining_time":0.2704439899}, -{"learn":[0.3165036993],"iteration":573,"passed_time":0.363220913,"remaining_time":0.269568134}, -{"learn":[0.3161586651],"iteration":574,"passed_time":0.363547929,"remaining_time":0.2687093388}, -{"learn":[0.3158007775],"iteration":575,"passed_time":0.3639338083,"remaining_time":0.26789572}, 
-{"learn":[0.3153626706],"iteration":576,"passed_time":0.3642983688,"remaining_time":0.267067955}, -{"learn":[0.3148854673],"iteration":577,"passed_time":0.3646425797,"remaining_time":0.2662269354}, -{"learn":[0.3143635946],"iteration":578,"passed_time":0.3650496702,"remaining_time":0.2654333526}, -{"learn":[0.3138878771],"iteration":579,"passed_time":0.3660084401,"remaining_time":0.2650405946}, -{"learn":[0.3134522523],"iteration":580,"passed_time":0.3665438913,"remaining_time":0.2643406032}, -{"learn":[0.3131497758],"iteration":581,"passed_time":0.3669453893,"remaining_time":0.2635449703}, -{"learn":[0.3128520804],"iteration":582,"passed_time":0.3673621961,"remaining_time":0.2627616394}, -{"learn":[0.3124257113],"iteration":583,"passed_time":0.3679513895,"remaining_time":0.2621023596}, -{"learn":[0.3121328269],"iteration":584,"passed_time":0.3684001299,"remaining_time":0.2613436819}, -{"learn":[0.311617566],"iteration":585,"passed_time":0.3687012989,"remaining_time":0.260481805}, -{"learn":[0.3113503967],"iteration":586,"passed_time":0.3690998272,"remaining_time":0.2596903384}, -{"learn":[0.3110510664],"iteration":587,"passed_time":0.3695252416,"remaining_time":0.2589190468}, -{"learn":[0.3105376831],"iteration":588,"passed_time":0.3698207864,"remaining_time":0.2580583076}, -{"learn":[0.3101793698],"iteration":589,"passed_time":0.3701629642,"remaining_time":0.2572318903}, -{"learn":[0.3097576456],"iteration":590,"passed_time":0.3705036756,"remaining_time":0.256406097}, -{"learn":[0.3093302974],"iteration":591,"passed_time":0.3708391752,"remaining_time":0.2555783505}, -{"learn":[0.3089044435],"iteration":592,"passed_time":0.3711963898,"remaining_time":0.254767168}, -{"learn":[0.3085143992],"iteration":593,"passed_time":0.3715781563,"remaining_time":0.2539742954}, -{"learn":[0.308161561],"iteration":594,"passed_time":0.3719649689,"remaining_time":0.2531862393}, -{"learn":[0.307898977],"iteration":595,"passed_time":0.3724476686,"remaining_time":0.252464527}, 
-{"learn":[0.3076050154],"iteration":596,"passed_time":0.3728517823,"remaining_time":0.2516905666}, -{"learn":[0.3073084865],"iteration":597,"passed_time":0.3732415627,"remaining_time":0.2509082077}, -{"learn":[0.3070910786],"iteration":598,"passed_time":0.3736596733,"remaining_time":0.2501461252}, -{"learn":[0.3066311819],"iteration":599,"passed_time":0.3740099804,"remaining_time":0.249339987}, -{"learn":[0.3063479534],"iteration":600,"passed_time":0.3743961785,"remaining_time":0.2485591934}, -{"learn":[0.3059329774],"iteration":601,"passed_time":0.3747714456,"remaining_time":0.247772484}, -{"learn":[0.3055891906],"iteration":602,"passed_time":0.3751904495,"remaining_time":0.2470159344}, -{"learn":[0.3052941263],"iteration":603,"passed_time":0.3756562411,"remaining_time":0.246291178}, -{"learn":[0.3049547076],"iteration":604,"passed_time":0.3760269905,"remaining_time":0.2455052252}, -{"learn":[0.3044544842],"iteration":605,"passed_time":0.3763324552,"remaining_time":0.2446781969}, -{"learn":[0.3041649589],"iteration":606,"passed_time":0.3768138714,"remaining_time":0.2439668064}, -{"learn":[0.3036680775],"iteration":607,"passed_time":0.3773010596,"remaining_time":0.2432598937}, -{"learn":[0.3033767513],"iteration":608,"passed_time":0.3780158821,"remaining_time":0.2426998521}, -{"learn":[0.3029951879],"iteration":609,"passed_time":0.3786339673,"remaining_time":0.2420774545}, -{"learn":[0.3025867684],"iteration":610,"passed_time":0.3794267042,"remaining_time":0.241566265}, -{"learn":[0.3020925096],"iteration":611,"passed_time":0.3800789752,"remaining_time":0.2409651019}, -{"learn":[0.3018062498],"iteration":612,"passed_time":0.3807328261,"remaining_time":0.2403647695}, -{"learn":[0.3013960038],"iteration":613,"passed_time":0.3813861455,"remaining_time":0.2397639286}, -{"learn":[0.3010602551],"iteration":614,"passed_time":0.3821270163,"remaining_time":0.2392177257}, -{"learn":[0.3007759707],"iteration":615,"passed_time":0.383546687,"remaining_time":0.2390940387}, 
-{"learn":[0.3005227702],"iteration":616,"passed_time":0.3853002015,"remaining_time":0.2391733828}, -{"learn":[0.3001474057],"iteration":617,"passed_time":0.3858573272,"remaining_time":0.2385072799}, -{"learn":[0.2998298875],"iteration":618,"passed_time":0.3864455551,"remaining_time":0.2378606729}, -{"learn":[0.2994279989],"iteration":619,"passed_time":0.3870972323,"remaining_time":0.2372531424}, -{"learn":[0.2990533326],"iteration":620,"passed_time":0.387642183,"remaining_time":0.2365803339}, -{"learn":[0.2986103424],"iteration":621,"passed_time":0.3882269796,"remaining_time":0.2359321516}, -{"learn":[0.2982386308],"iteration":622,"passed_time":0.3890611999,"remaining_time":0.2354351081}, -{"learn":[0.2980017236],"iteration":623,"passed_time":0.389633898,"remaining_time":0.2347794001}, -{"learn":[0.2976457221],"iteration":624,"passed_time":0.3902746228,"remaining_time":0.2341647737}, -{"learn":[0.2973187694],"iteration":625,"passed_time":0.3908383146,"remaining_time":0.233504041}, -{"learn":[0.2970820921],"iteration":626,"passed_time":0.3915603281,"remaining_time":0.2329378028}, -{"learn":[0.2968406635],"iteration":627,"passed_time":0.3927190752,"remaining_time":0.2326297706}, -{"learn":[0.29651102],"iteration":628,"passed_time":0.3932948847,"remaining_time":0.2319752023}, -{"learn":[0.2962718989],"iteration":629,"passed_time":0.3939082796,"remaining_time":0.2313429579}, -{"learn":[0.2960237563],"iteration":630,"passed_time":0.3945193363,"remaining_time":0.2307094059}, -{"learn":[0.2957421456],"iteration":631,"passed_time":0.3951295835,"remaining_time":0.2300754537}, -{"learn":[0.2954565712],"iteration":632,"passed_time":0.395840817,"remaining_time":0.2295001261}, -{"learn":[0.2951291544],"iteration":633,"passed_time":0.3968526821,"remaining_time":0.2290979206}, -{"learn":[0.2948581576],"iteration":634,"passed_time":0.3975057375,"remaining_time":0.2284875499}, -{"learn":[0.2944618165],"iteration":635,"passed_time":0.398007172,"remaining_time":0.2277902683}, 
-{"learn":[0.2940285674],"iteration":636,"passed_time":0.398549799,"remaining_time":0.2271170754}, -{"learn":[0.2937956708],"iteration":637,"passed_time":0.3990961718,"remaining_time":0.2264464173}, -{"learn":[0.2935192755],"iteration":638,"passed_time":0.3998986777,"remaining_time":0.2259208492}, -{"learn":[0.2932442342],"iteration":639,"passed_time":0.4008465293,"remaining_time":0.2254761727}, -{"learn":[0.2928142079],"iteration":640,"passed_time":0.4014032505,"remaining_time":0.2248108688}, -{"learn":[0.2925079678],"iteration":641,"passed_time":0.4019529065,"remaining_time":0.2241419635}, -{"learn":[0.2922439575],"iteration":642,"passed_time":0.4026259892,"remaining_time":0.2235419567}, -{"learn":[0.2919395225],"iteration":643,"passed_time":0.4032024573,"remaining_time":0.2228883149}, -{"learn":[0.2915480094],"iteration":644,"passed_time":0.4037289074,"remaining_time":0.2222073831}, -{"learn":[0.2912747434],"iteration":645,"passed_time":0.4049154446,"remaining_time":0.2218886492}, -{"learn":[0.2908083541],"iteration":646,"passed_time":0.405585167,"remaining_time":0.2212852611}, -{"learn":[0.2906082656],"iteration":647,"passed_time":0.4064820636,"remaining_time":0.2208050716}, -{"learn":[0.2902667182],"iteration":648,"passed_time":0.407054487,"remaining_time":0.2201481124}, -{"learn":[0.2900387049],"iteration":649,"passed_time":0.4077896999,"remaining_time":0.2195790692}, -{"learn":[0.2898409878],"iteration":650,"passed_time":0.4084738505,"remaining_time":0.218982141}, -{"learn":[0.289457913],"iteration":651,"passed_time":0.4091963881,"remaining_time":0.2184054341}, -{"learn":[0.2891859795],"iteration":652,"passed_time":0.4112077824,"remaining_time":0.2185131707}, -{"learn":[0.288721157],"iteration":653,"passed_time":0.4117850476,"remaining_time":0.2178556979}, -{"learn":[0.288452323],"iteration":654,"passed_time":0.4126922254,"remaining_time":0.2173722409}, -{"learn":[0.2882551636],"iteration":655,"passed_time":0.4134877791,"remaining_time":0.2168289573}, 
-{"learn":[0.2879799902],"iteration":656,"passed_time":0.4154170392,"remaining_time":0.21687678}, -{"learn":[0.2876007472],"iteration":657,"passed_time":0.4163582726,"remaining_time":0.2164050596}, -{"learn":[0.287326932],"iteration":658,"passed_time":0.4190196687,"remaining_time":0.2168220137}, -{"learn":[0.2869494557],"iteration":659,"passed_time":0.4195774715,"remaining_time":0.2161459702}, -{"learn":[0.2866300855],"iteration":660,"passed_time":0.4201767647,"remaining_time":0.2154915632}, -{"learn":[0.286373198],"iteration":661,"passed_time":0.4207802955,"remaining_time":0.2148394862}, -{"learn":[0.2859906852],"iteration":662,"passed_time":0.4213216894,"remaining_time":0.2141559718}, -{"learn":[0.2855351972],"iteration":663,"passed_time":0.4218715628,"remaining_time":0.2134771763}, -{"learn":[0.2851559839],"iteration":664,"passed_time":0.4224128604,"remaining_time":0.2127944485}, -{"learn":[0.2848492605],"iteration":665,"passed_time":0.4229119453,"remaining_time":0.2120909756}, -{"learn":[0.2845873215],"iteration":666,"passed_time":0.4235194938,"remaining_time":0.211442266}, -{"learn":[0.2843303766],"iteration":667,"passed_time":0.4240966697,"remaining_time":0.2107785844}, -{"learn":[0.2840162835],"iteration":668,"passed_time":0.4246249729,"remaining_time":0.2100909806}, -{"learn":[0.2837108139],"iteration":669,"passed_time":0.4252157833,"remaining_time":0.2094346395}, -{"learn":[0.2833648005],"iteration":670,"passed_time":0.4257435042,"remaining_time":0.2087475602}, -{"learn":[0.2830218524],"iteration":671,"passed_time":0.4262304825,"remaining_time":0.2080410689}, -{"learn":[0.2826870212],"iteration":672,"passed_time":0.4267960153,"remaining_time":0.2073733982}, -{"learn":[0.2823174106],"iteration":673,"passed_time":0.4274357705,"remaining_time":0.2067419306}, -{"learn":[0.2820060615],"iteration":674,"passed_time":0.4279301271,"remaining_time":0.2060404316}, -{"learn":[0.2815587755],"iteration":675,"passed_time":0.4283903348,"remaining_time":0.2053231782}, 
-{"learn":[0.2813848065],"iteration":676,"passed_time":0.4289439582,"remaining_time":0.2046512533}, -{"learn":[0.28112724],"iteration":677,"passed_time":0.4295107341,"remaining_time":0.2039859238}, -{"learn":[0.2808396901],"iteration":678,"passed_time":0.4300008989,"remaining_time":0.2032846665}, -{"learn":[0.2806121494],"iteration":679,"passed_time":0.430644332,"remaining_time":0.2026561562}, -{"learn":[0.2802471199],"iteration":680,"passed_time":0.4311365141,"remaining_time":0.2019567518}, -{"learn":[0.2798035315],"iteration":681,"passed_time":0.4315784996,"remaining_time":0.2012345497}, -{"learn":[0.2795483172],"iteration":682,"passed_time":0.4321520022,"remaining_time":0.2005742089}, -{"learn":[0.2792404869],"iteration":683,"passed_time":0.4327219428,"remaining_time":0.1999124765}, -{"learn":[0.2789540802],"iteration":684,"passed_time":0.4332210177,"remaining_time":0.1992184242}, -{"learn":[0.2786558292],"iteration":685,"passed_time":0.4337036596,"remaining_time":0.1985174185}, -{"learn":[0.2782940418],"iteration":686,"passed_time":0.4342437486,"remaining_time":0.1978432217}, -{"learn":[0.277895276],"iteration":687,"passed_time":0.4347123066,"remaining_time":0.1971369763}, -{"learn":[0.2775917564],"iteration":688,"passed_time":0.4352487322,"remaining_time":0.1964620547}, -{"learn":[0.2772567102],"iteration":689,"passed_time":0.4357865156,"remaining_time":0.1957881447}, -{"learn":[0.2770056639],"iteration":690,"passed_time":0.4364056991,"remaining_time":0.195151029}, -{"learn":[0.2768216282],"iteration":691,"passed_time":0.4371092238,"remaining_time":0.1945515042}, -{"learn":[0.2765277241],"iteration":692,"passed_time":0.4378316981,"remaining_time":0.1939600741}, -{"learn":[0.2762676307],"iteration":693,"passed_time":0.4386821423,"remaining_time":0.193424691}, -{"learn":[0.2760578649],"iteration":694,"passed_time":0.4394127408,"remaining_time":0.1928358071}, -{"learn":[0.2757622919],"iteration":695,"passed_time":0.4399842425,"remaining_time":0.1921770255}, 
-{"learn":[0.2754316394],"iteration":696,"passed_time":0.4404868095,"remaining_time":0.1914885269}, -{"learn":[0.2749989714],"iteration":697,"passed_time":0.4409797478,"remaining_time":0.1907963952}, -{"learn":[0.2745687238],"iteration":698,"passed_time":0.4419040317,"remaining_time":0.1902905773}, -{"learn":[0.2743486528],"iteration":699,"passed_time":0.4439039594,"remaining_time":0.190244554}, -{"learn":[0.2741394831],"iteration":700,"passed_time":0.4447956434,"remaining_time":0.189720253}, -{"learn":[0.2738577638],"iteration":701,"passed_time":0.445638625,"remaining_time":0.1891742311}, -{"learn":[0.2736173953],"iteration":702,"passed_time":0.4463544359,"remaining_time":0.1885736379}, -{"learn":[0.2734091218],"iteration":703,"passed_time":0.4474496523,"remaining_time":0.1881322402}, -{"learn":[0.2732017615],"iteration":704,"passed_time":0.4482101406,"remaining_time":0.1875489241}, -{"learn":[0.2729077467],"iteration":705,"passed_time":0.4488166122,"remaining_time":0.1869009688}, -{"learn":[0.2726194156],"iteration":706,"passed_time":0.4491539468,"remaining_time":0.1861415932}, -{"learn":[0.2723656573],"iteration":707,"passed_time":0.4496752879,"remaining_time":0.1854592995}, -{"learn":[0.2721585333],"iteration":708,"passed_time":0.4500881214,"remaining_time":0.1847329243}, -{"learn":[0.2719530272],"iteration":709,"passed_time":0.4505072974,"remaining_time":0.1840100229}, -{"learn":[0.2716654135],"iteration":710,"passed_time":0.4508423546,"remaining_time":0.1832537841}, -{"learn":[0.2713763544],"iteration":711,"passed_time":0.4513510253,"remaining_time":0.182568954}, -{"learn":[0.271054617],"iteration":712,"passed_time":0.4517413073,"remaining_time":0.1818369638}, -{"learn":[0.270765279],"iteration":713,"passed_time":0.4521095738,"remaining_time":0.1810971122}, -{"learn":[0.2705314734],"iteration":714,"passed_time":0.4524674402,"remaining_time":0.1803541545}, -{"learn":[0.2701860922],"iteration":715,"passed_time":0.4527827674,"remaining_time":0.1795953994}, 
-{"learn":[0.2699523228],"iteration":716,"passed_time":0.4532239041,"remaining_time":0.1788875382}, -{"learn":[0.2695332468],"iteration":717,"passed_time":0.4535409006,"remaining_time":0.1781316629}, -{"learn":[0.2692927846],"iteration":718,"passed_time":0.4539108459,"remaining_time":0.1773977019}, -{"learn":[0.2690525545],"iteration":719,"passed_time":0.4542673415,"remaining_time":0.1766595217}, -{"learn":[0.2686732369],"iteration":720,"passed_time":0.4545854733,"remaining_time":0.1759075549}, -{"learn":[0.2684303522],"iteration":721,"passed_time":0.4549509786,"remaining_time":0.1751750305}, -{"learn":[0.268084062],"iteration":722,"passed_time":0.4553284021,"remaining_time":0.1744480876}, -{"learn":[0.2678163158],"iteration":723,"passed_time":0.4556837334,"remaining_time":0.1737136884}, -{"learn":[0.2675443845],"iteration":724,"passed_time":0.4565697413,"remaining_time":0.173181626}, -{"learn":[0.2673065875],"iteration":725,"passed_time":0.4573022374,"remaining_time":0.1725906516}, -{"learn":[0.267035761],"iteration":726,"passed_time":0.4577266771,"remaining_time":0.1718836078}, -{"learn":[0.2668061469],"iteration":727,"passed_time":0.4582369921,"remaining_time":0.1712094256}, -{"learn":[0.2665300156],"iteration":728,"passed_time":0.458587581,"remaining_time":0.1704763161}, -{"learn":[0.2663027665],"iteration":729,"passed_time":0.4589974774,"remaining_time":0.1697661903}, -{"learn":[0.2661044661],"iteration":730,"passed_time":0.4596895263,"remaining_time":0.1691607149}, -{"learn":[0.2658744007],"iteration":731,"passed_time":0.4601046902,"remaining_time":0.1684536297}, -{"learn":[0.2655614189],"iteration":732,"passed_time":0.4606128688,"remaining_time":0.1677812223}, -{"learn":[0.265313834],"iteration":733,"passed_time":0.4610676066,"remaining_time":0.1670898956}, -{"learn":[0.2650115107],"iteration":734,"passed_time":0.4614170024,"remaining_time":0.1663612322}, -{"learn":[0.2646720558],"iteration":735,"passed_time":0.4617790771,"remaining_time":0.1656381472}, 
-{"learn":[0.2644053825],"iteration":736,"passed_time":0.4621144345,"remaining_time":0.1649065079}, -{"learn":[0.2639993003],"iteration":737,"passed_time":0.4624059627,"remaining_time":0.1641603824}, -{"learn":[0.2637928831],"iteration":738,"passed_time":0.4627744662,"remaining_time":0.1634426734}, -{"learn":[0.26338629],"iteration":739,"passed_time":0.4631056449,"remaining_time":0.1627127942}, -{"learn":[0.2631142416],"iteration":740,"passed_time":0.4634984127,"remaining_time":0.1620055181}, -{"learn":[0.262746945],"iteration":741,"passed_time":0.4638036187,"remaining_time":0.1612686437}, -{"learn":[0.2625778381],"iteration":742,"passed_time":0.4645685041,"remaining_time":0.1606919321}, -{"learn":[0.26234528],"iteration":743,"passed_time":0.4650294033,"remaining_time":0.1600101173}, -{"learn":[0.262142009],"iteration":744,"passed_time":0.4654747442,"remaining_time":0.1593235702}, -{"learn":[0.2617390794],"iteration":745,"passed_time":0.4658131417,"remaining_time":0.1586012574}, -{"learn":[0.2614471678],"iteration":746,"passed_time":0.4662386401,"remaining_time":0.1579094725}, -{"learn":[0.2611715304],"iteration":747,"passed_time":0.466616248,"remaining_time":0.1572022654}, -{"learn":[0.2608939814],"iteration":748,"passed_time":0.4670498365,"remaining_time":0.1565146982}, -{"learn":[0.260690721],"iteration":749,"passed_time":0.4674780929,"remaining_time":0.155826031}, -{"learn":[0.2603874398],"iteration":750,"passed_time":0.4677855066,"remaining_time":0.1550979909}, -{"learn":[0.2601193339],"iteration":751,"passed_time":0.4681716736,"remaining_time":0.1543970413}, -{"learn":[0.2597940266],"iteration":752,"passed_time":0.4685466653,"remaining_time":0.1536932621}, -{"learn":[0.2595251692],"iteration":753,"passed_time":0.4689000876,"remaining_time":0.1529833177}, -{"learn":[0.2592016033],"iteration":754,"passed_time":0.4692272787,"remaining_time":0.1522658057}, -{"learn":[0.2588046512],"iteration":755,"passed_time":0.4695262306,"remaining_time":0.151540212}, 
-{"learn":[0.2584771727],"iteration":756,"passed_time":0.4699087931,"remaining_time":0.1508425848}, -{"learn":[0.258083484],"iteration":757,"passed_time":0.4702699272,"remaining_time":0.1501389477}, -{"learn":[0.2576890822],"iteration":758,"passed_time":0.4706126784,"remaining_time":0.1494303761}, -{"learn":[0.2575262125],"iteration":759,"passed_time":0.4710687548,"remaining_time":0.1487585541}, -{"learn":[0.2573358842],"iteration":760,"passed_time":0.4714924382,"remaining_time":0.1480771258}, -{"learn":[0.2571070407],"iteration":761,"passed_time":0.4738636861,"remaining_time":0.1480046684}, -{"learn":[0.2567511073],"iteration":762,"passed_time":0.4750951317,"remaining_time":0.1475721444}, -{"learn":[0.2564899155],"iteration":763,"passed_time":0.4755368151,"remaining_time":0.1468935712}, -{"learn":[0.2561713372],"iteration":764,"passed_time":0.4758664528,"remaining_time":0.1461811979}, -{"learn":[0.2558850923],"iteration":765,"passed_time":0.4763280569,"remaining_time":0.1455101375}, -{"learn":[0.255597336],"iteration":766,"passed_time":0.4766923409,"remaining_time":0.1448100592}, -{"learn":[0.2553038406],"iteration":767,"passed_time":0.4769909137,"remaining_time":0.1440910052}, -{"learn":[0.2551415861],"iteration":768,"passed_time":0.4800090564,"remaining_time":0.1441899766}, -{"learn":[0.2549170256],"iteration":769,"passed_time":0.4805868065,"remaining_time":0.1435519032}, -{"learn":[0.2547311187],"iteration":770,"passed_time":0.4811551019,"remaining_time":0.1429111781}, -{"learn":[0.2544388579],"iteration":771,"passed_time":0.4816654229,"remaining_time":0.1422535187}, -{"learn":[0.2541200263],"iteration":772,"passed_time":0.4822915889,"remaining_time":0.1416302596}, -{"learn":[0.253959594],"iteration":773,"passed_time":0.4828837652,"remaining_time":0.1409970684}, -{"learn":[0.2537388567],"iteration":774,"passed_time":0.483432444,"remaining_time":0.1403513547}, -{"learn":[0.2534203806],"iteration":775,"passed_time":0.4839090885,"remaining_time":0.1396850977}, 
-{"learn":[0.2531755737],"iteration":776,"passed_time":0.4843985186,"remaining_time":0.1390229982}, -{"learn":[0.2529106076],"iteration":777,"passed_time":0.48489424,"remaining_time":0.1383631379}, -{"learn":[0.25252861],"iteration":778,"passed_time":0.485382096,"remaining_time":0.1377014675}, -{"learn":[0.2522401299],"iteration":779,"passed_time":0.4860105642,"remaining_time":0.1370799027}, -{"learn":[0.2518930776],"iteration":780,"passed_time":0.4865325231,"remaining_time":0.136428454}, -{"learn":[0.2515788674],"iteration":781,"passed_time":0.4871425093,"remaining_time":0.135801876}, -{"learn":[0.2513293752],"iteration":782,"passed_time":0.4881200719,"remaining_time":0.1352772102}, -{"learn":[0.2510782033],"iteration":783,"passed_time":0.488686901,"remaining_time":0.1346382278}, -{"learn":[0.2506980555],"iteration":784,"passed_time":0.491481106,"remaining_time":0.1346094749}, -{"learn":[0.2503226527],"iteration":785,"passed_time":0.4928145453,"remaining_time":0.1341759703}, -{"learn":[0.2499814757],"iteration":786,"passed_time":0.4933862558,"remaining_time":0.1335340184}, -{"learn":[0.2496719041],"iteration":787,"passed_time":0.4939144002,"remaining_time":0.1328805239}, -{"learn":[0.2494325127],"iteration":788,"passed_time":0.4944342649,"remaining_time":0.132225133}, -{"learn":[0.249093871],"iteration":789,"passed_time":0.4949448406,"remaining_time":0.1315676158}, -{"learn":[0.2488785131],"iteration":790,"passed_time":0.4964334978,"remaining_time":0.1311689014}, -{"learn":[0.2486212573],"iteration":791,"passed_time":0.4970151808,"remaining_time":0.1305292394}, -{"learn":[0.2483151789],"iteration":792,"passed_time":0.4976228655,"remaining_time":0.1298965109}, -{"learn":[0.2481008215],"iteration":793,"passed_time":0.4982025468,"remaining_time":0.1292565802}, -{"learn":[0.2478614556],"iteration":794,"passed_time":0.498857468,"remaining_time":0.1286362024}, -{"learn":[0.2475220348],"iteration":795,"passed_time":0.4993843042,"remaining_time":0.1279829121}, 
-{"learn":[0.2471528138],"iteration":796,"passed_time":0.4999155123,"remaining_time":0.1273310527}, -{"learn":[0.2467823178],"iteration":797,"passed_time":0.5004594386,"remaining_time":0.126682715}, -{"learn":[0.2465691545],"iteration":798,"passed_time":0.501245397,"remaining_time":0.1260955254}, -{"learn":[0.2464111426],"iteration":799,"passed_time":0.5017683106,"remaining_time":0.1254420777}, -{"learn":[0.2462568326],"iteration":800,"passed_time":0.502630562,"remaining_time":0.1248732607}, -{"learn":[0.245888627],"iteration":801,"passed_time":0.5034427368,"remaining_time":0.124291349}, -{"learn":[0.2457131488],"iteration":802,"passed_time":0.5043012692,"remaining_time":0.1237202367}, -{"learn":[0.2454448321],"iteration":803,"passed_time":0.5055405473,"remaining_time":0.1232412279}, -{"learn":[0.24514739],"iteration":804,"passed_time":0.5062728608,"remaining_time":0.1226375253}, -{"learn":[0.2449321066],"iteration":805,"passed_time":0.5085157913,"remaining_time":0.1223971011}, -{"learn":[0.244754027],"iteration":806,"passed_time":0.5094402014,"remaining_time":0.1218363803}, -{"learn":[0.2444520933],"iteration":807,"passed_time":0.5103976546,"remaining_time":0.121282611}, -{"learn":[0.244150364],"iteration":808,"passed_time":0.5111959177,"remaining_time":0.1206902599}, -{"learn":[0.2438756845],"iteration":809,"passed_time":0.5117936802,"remaining_time":0.1200503694}, -{"learn":[0.2436781462],"iteration":810,"passed_time":0.5130265818,"remaining_time":0.1195585992}, -{"learn":[0.2434370794],"iteration":811,"passed_time":0.5140712848,"remaining_time":0.1190214305}, -{"learn":[0.243231222],"iteration":812,"passed_time":0.5151709642,"remaining_time":0.1184956584}, -{"learn":[0.2429795031],"iteration":813,"passed_time":0.5163394584,"remaining_time":0.1179842006}, -{"learn":[0.2428085953],"iteration":814,"passed_time":0.5177108502,"remaining_time":0.1175171868}, -{"learn":[0.2425811674],"iteration":815,"passed_time":0.5184515148,"remaining_time":0.1169057337}, 
-{"learn":[0.242311599],"iteration":816,"passed_time":0.519181002,"remaining_time":0.1162914607}, -{"learn":[0.2419550121],"iteration":817,"passed_time":0.5197364035,"remaining_time":0.1156381729}, -{"learn":[0.2418074012],"iteration":818,"passed_time":0.5205502008,"remaining_time":0.11504223}, -{"learn":[0.2415643207],"iteration":819,"passed_time":0.5212907748,"remaining_time":0.1144296823}, -{"learn":[0.2412046386],"iteration":820,"passed_time":0.5218312713,"remaining_time":0.1137732004}, -{"learn":[0.2410006757],"iteration":821,"passed_time":0.5224740591,"remaining_time":0.1131391515}, -{"learn":[0.240788513],"iteration":822,"passed_time":0.5231319496,"remaining_time":0.1125083294}, -{"learn":[0.2405819957],"iteration":823,"passed_time":0.5237085713,"remaining_time":0.1118600832}, -{"learn":[0.2402247212],"iteration":824,"passed_time":0.5243313708,"remaining_time":0.1112218059}, -{"learn":[0.2400239493],"iteration":825,"passed_time":0.5249420104,"remaining_time":0.1105810046}, -{"learn":[0.2398227815],"iteration":826,"passed_time":0.5256867701,"remaining_time":0.1099683328}, -{"learn":[0.2394675868],"iteration":827,"passed_time":0.5264600676,"remaining_time":0.1093612701}, -{"learn":[0.2391150721],"iteration":828,"passed_time":0.5275203545,"remaining_time":0.1088130044}, -{"learn":[0.238944539],"iteration":829,"passed_time":0.528225089,"remaining_time":0.1081906809}, -{"learn":[0.2387435521],"iteration":830,"passed_time":0.5289842217,"remaining_time":0.107579222}, -{"learn":[0.2385065428],"iteration":831,"passed_time":0.529543322,"remaining_time":0.1069270169}, -{"learn":[0.2383424767],"iteration":832,"passed_time":0.5301500187,"remaining_time":0.1062845776}, -{"learn":[0.2381619513],"iteration":833,"passed_time":0.5307885353,"remaining_time":0.1056485574}, -{"learn":[0.2378786377],"iteration":834,"passed_time":0.5312926611,"remaining_time":0.1049859749}, -{"learn":[0.2375881033],"iteration":835,"passed_time":0.5317917068,"remaining_time":0.104322775}, 
-{"learn":[0.2373639579],"iteration":836,"passed_time":0.5323433501,"remaining_time":0.1036702104}, -{"learn":[0.2370155092],"iteration":837,"passed_time":0.5328155297,"remaining_time":0.1030025248}, -{"learn":[0.2366988765],"iteration":838,"passed_time":0.5333837297,"remaining_time":0.1023537312}, -{"learn":[0.2364143282],"iteration":839,"passed_time":0.533889267,"remaining_time":0.1016931937}, -{"learn":[0.2361525744],"iteration":840,"passed_time":0.534412663,"remaining_time":0.1010364012}, -{"learn":[0.2359784991],"iteration":841,"passed_time":0.5350258961,"remaining_time":0.1003967834}, -{"learn":[0.2357424561],"iteration":842,"passed_time":0.535566698,"remaining_time":0.09974373854}, -{"learn":[0.2355073158],"iteration":843,"passed_time":0.5361661841,"remaining_time":0.09910180655}, -{"learn":[0.2353411147],"iteration":844,"passed_time":0.5367596976,"remaining_time":0.09845887944}, -{"learn":[0.2350888635],"iteration":845,"passed_time":0.5373840649,"remaining_time":0.09782168557}, -{"learn":[0.2348985119],"iteration":846,"passed_time":0.5380018638,"remaining_time":0.0971833355}, -{"learn":[0.2346166734],"iteration":847,"passed_time":0.5384990209,"remaining_time":0.0965234094}, -{"learn":[0.2343535147],"iteration":848,"passed_time":0.5389994596,"remaining_time":0.09586445041}, -{"learn":[0.2340152775],"iteration":849,"passed_time":0.5395533792,"remaining_time":0.0952153022}, -{"learn":[0.2338253856],"iteration":850,"passed_time":0.540102651,"remaining_time":0.09456556404}, -{"learn":[0.2335178043],"iteration":851,"passed_time":0.5405720594,"remaining_time":0.09390218872}, -{"learn":[0.2333585535],"iteration":852,"passed_time":0.5411738392,"remaining_time":0.09326208014}, -{"learn":[0.2331409327],"iteration":853,"passed_time":0.5416619499,"remaining_time":0.09260262844}, -{"learn":[0.232799132],"iteration":854,"passed_time":0.5420831053,"remaining_time":0.09193222253}, 
-{"learn":[0.2326270938],"iteration":855,"passed_time":0.545022903,"remaining_time":0.09168609583}, -{"learn":[0.232290253],"iteration":856,"passed_time":0.5455285614,"remaining_time":0.09102751958}, -{"learn":[0.2320625271],"iteration":857,"passed_time":0.5460920766,"remaining_time":0.09037887515}, -{"learn":[0.2317883436],"iteration":858,"passed_time":0.5467508422,"remaining_time":0.08974606373}, -{"learn":[0.2315655244],"iteration":859,"passed_time":0.547951441,"remaining_time":0.08920139737}, -{"learn":[0.2313702852],"iteration":860,"passed_time":0.5488170515,"remaining_time":0.08860112677}, -{"learn":[0.2311274941],"iteration":861,"passed_time":0.5495685823,"remaining_time":0.08798197721}, -{"learn":[0.2307902468],"iteration":862,"passed_time":0.5524229874,"remaining_time":0.0876963491}, -{"learn":[0.230628665],"iteration":863,"passed_time":0.5537842252,"remaining_time":0.08716973916}, -{"learn":[0.2304723965],"iteration":864,"passed_time":0.5545513147,"remaining_time":0.08654847109}, -{"learn":[0.2302186894],"iteration":865,"passed_time":0.5551528647,"remaining_time":0.08590125157}, -{"learn":[0.2300390857],"iteration":866,"passed_time":0.5564569068,"remaining_time":0.08536190151}, -{"learn":[0.2297347912],"iteration":867,"passed_time":0.5570938476,"remaining_time":0.08471934087}, -{"learn":[0.2295422937],"iteration":868,"passed_time":0.5580044049,"remaining_time":0.08411804032}, -{"learn":[0.2294070827],"iteration":869,"passed_time":0.5591088771,"remaining_time":0.08354500463}, -{"learn":[0.229074195],"iteration":870,"passed_time":0.5598280886,"remaining_time":0.08291368936}, -{"learn":[0.2288957728],"iteration":871,"passed_time":0.5608158459,"remaining_time":0.08232159205}, -{"learn":[0.2287379333],"iteration":872,"passed_time":0.5617823568,"remaining_time":0.0817254975}, -{"learn":[0.2284071765],"iteration":873,"passed_time":0.5627034871,"remaining_time":0.08112201302}, 
-{"learn":[0.2281981345],"iteration":874,"passed_time":0.5634399657,"remaining_time":0.08049142367}, -{"learn":[0.2279745298],"iteration":875,"passed_time":0.5642612997,"remaining_time":0.07987260407}, -{"learn":[0.2277943641],"iteration":876,"passed_time":0.5648610068,"remaining_time":0.07922223927}, -{"learn":[0.227464397],"iteration":877,"passed_time":0.5654543985,"remaining_time":0.07857111232}, -{"learn":[0.2272489561],"iteration":878,"passed_time":0.5662696285,"remaining_time":0.07795065421}, -{"learn":[0.2270080894],"iteration":879,"passed_time":0.5672281575,"remaining_time":0.0773492942}, -{"learn":[0.226743977],"iteration":880,"passed_time":0.5678032533,"remaining_time":0.07669533161}, -{"learn":[0.2265238464],"iteration":881,"passed_time":0.5684084677,"remaining_time":0.07604557732}, -{"learn":[0.2261985966],"iteration":882,"passed_time":0.5691880037,"remaining_time":0.07541902201}, -{"learn":[0.2259350951],"iteration":883,"passed_time":0.5697210672,"remaining_time":0.07475977805}, -{"learn":[0.2256665911],"iteration":884,"passed_time":0.5702514542,"remaining_time":0.07410047145}, -{"learn":[0.2254206049],"iteration":885,"passed_time":0.5707525763,"remaining_time":0.07343769041}, -{"learn":[0.2251793891],"iteration":886,"passed_time":0.5712200224,"remaining_time":0.07277098368}, -{"learn":[0.2249354039],"iteration":887,"passed_time":0.5719631158,"remaining_time":0.07213949209}, -{"learn":[0.224719188],"iteration":888,"passed_time":0.573473036,"remaining_time":0.07160349493}, -{"learn":[0.2244787983],"iteration":889,"passed_time":0.5740166277,"remaining_time":0.07094587533}, -{"learn":[0.2242641449],"iteration":890,"passed_time":0.5745484288,"remaining_time":0.07028706929}, -{"learn":[0.2240526804],"iteration":891,"passed_time":0.5750417033,"remaining_time":0.06962388336}, -{"learn":[0.2239209754],"iteration":892,"passed_time":0.5756626222,"remaining_time":0.06897637243}, 
-{"learn":[0.2236602732],"iteration":893,"passed_time":0.5761377019,"remaining_time":0.06831162908}, -{"learn":[0.2235089796],"iteration":894,"passed_time":0.5771884627,"remaining_time":0.06771484758}, -{"learn":[0.2233028178],"iteration":895,"passed_time":0.5778392655,"remaining_time":0.06707062903}, -{"learn":[0.2231716088],"iteration":896,"passed_time":0.5785026788,"remaining_time":0.06642784383}, -{"learn":[0.2229924926],"iteration":897,"passed_time":0.5790633561,"remaining_time":0.06577334335}, -{"learn":[0.2226752298],"iteration":898,"passed_time":0.5795658509,"remaining_time":0.06511251495}, -{"learn":[0.2225238638],"iteration":899,"passed_time":0.5801448497,"remaining_time":0.06446053885}, -{"learn":[0.2223377015],"iteration":900,"passed_time":0.5806770458,"remaining_time":0.06380358217}, -{"learn":[0.2220517312],"iteration":901,"passed_time":0.5812016734,"remaining_time":0.06314607982}, -{"learn":[0.2217935941],"iteration":902,"passed_time":0.581771487,"remaining_time":0.06249372563}, -{"learn":[0.2215100569],"iteration":903,"passed_time":0.5823889162,"remaining_time":0.06184661057}, -{"learn":[0.2213576053],"iteration":904,"passed_time":0.5830686959,"remaining_time":0.0612061062}, -{"learn":[0.2210437059],"iteration":905,"passed_time":0.5835408115,"remaining_time":0.06054396941}, -{"learn":[0.2207309093],"iteration":906,"passed_time":0.5839938446,"remaining_time":0.05988029498}, -{"learn":[0.2205256373],"iteration":907,"passed_time":0.5844979671,"remaining_time":0.05922226098}, -{"learn":[0.2203404031],"iteration":908,"passed_time":0.5851540432,"remaining_time":0.0585797777}, -{"learn":[0.2201428967],"iteration":909,"passed_time":0.5856712456,"remaining_time":0.05792352978}, -{"learn":[0.2199867836],"iteration":910,"passed_time":0.5862662073,"remaining_time":0.0572751838}, -{"learn":[0.2197325953],"iteration":911,"passed_time":0.5868183988,"remaining_time":0.05662282795}, 
-{"learn":[0.21941949],"iteration":912,"passed_time":0.5872918285,"remaining_time":0.05596318629}, -{"learn":[0.2192427261],"iteration":913,"passed_time":0.5878288912,"remaining_time":0.05530993943}, -{"learn":[0.2190470483],"iteration":914,"passed_time":0.5883754177,"remaining_time":0.05465782569}, -{"learn":[0.2188108265],"iteration":915,"passed_time":0.5888956267,"remaining_time":0.05400352909}, -{"learn":[0.2185763461],"iteration":916,"passed_time":0.5893668761,"remaining_time":0.05334509347}, -{"learn":[0.218397545],"iteration":917,"passed_time":0.5901757336,"remaining_time":0.05271722239}, -{"learn":[0.2182284423],"iteration":918,"passed_time":0.5911811992,"remaining_time":0.05210628633}, -{"learn":[0.2180648446],"iteration":919,"passed_time":0.5917557335,"remaining_time":0.0514570203}, -{"learn":[0.2178877081],"iteration":920,"passed_time":0.5926940813,"remaining_time":0.05083912315}, -{"learn":[0.2177140968],"iteration":921,"passed_time":0.5932395564,"remaining_time":0.05018729436}, -{"learn":[0.2174067859],"iteration":922,"passed_time":0.5937335631,"remaining_time":0.04953140234}, -{"learn":[0.2172427475],"iteration":923,"passed_time":0.594441586,"remaining_time":0.04889346379}, -{"learn":[0.2170075242],"iteration":924,"passed_time":0.5949819889,"remaining_time":0.04824178288}, -{"learn":[0.2167770352],"iteration":925,"passed_time":0.5957625359,"remaining_time":0.0476095331}, -{"learn":[0.2166043286],"iteration":926,"passed_time":0.5969523115,"remaining_time":0.04700918958}, -{"learn":[0.2163751168],"iteration":927,"passed_time":0.5976561483,"remaining_time":0.04636987357}, -{"learn":[0.2161562507],"iteration":928,"passed_time":0.5997954706,"remaining_time":0.04584012746}, -{"learn":[0.2159849725],"iteration":929,"passed_time":0.6006319213,"remaining_time":0.04520885429}, -{"learn":[0.2157700509],"iteration":930,"passed_time":0.6014893591,"remaining_time":0.04457869579}, 
-{"learn":[0.2155961884],"iteration":931,"passed_time":0.6021677417,"remaining_time":0.04393498545}, -{"learn":[0.2153693672],"iteration":932,"passed_time":0.6031781508,"remaining_time":0.04331504406}, -{"learn":[0.2152064868],"iteration":933,"passed_time":0.6039009417,"remaining_time":0.04267394234}, -{"learn":[0.2149531543],"iteration":934,"passed_time":0.6045038154,"remaining_time":0.04202432941}, -{"learn":[0.2146568788],"iteration":935,"passed_time":0.6050656118,"remaining_time":0.04137200765}, -{"learn":[0.2144625975],"iteration":936,"passed_time":0.6056736374,"remaining_time":0.04072298736}, -{"learn":[0.2142915385],"iteration":937,"passed_time":0.6063558656,"remaining_time":0.04007895913}, -{"learn":[0.2141212757],"iteration":938,"passed_time":0.6070668518,"remaining_time":0.03943671774}, -{"learn":[0.2139780266],"iteration":939,"passed_time":0.6078386553,"remaining_time":0.03879821204}, -{"learn":[0.2137883178],"iteration":940,"passed_time":0.6084696227,"remaining_time":0.03815059271}, -{"learn":[0.2135832033],"iteration":941,"passed_time":0.6089643342,"remaining_time":0.0374946193}, -{"learn":[0.213338275],"iteration":942,"passed_time":0.6097087637,"remaining_time":0.03685408222}, -{"learn":[0.2130945367],"iteration":943,"passed_time":0.6115469788,"remaining_time":0.03627821061}, -{"learn":[0.2127992234],"iteration":944,"passed_time":0.6120421491,"remaining_time":0.03562150074}, -{"learn":[0.2125003551],"iteration":945,"passed_time":0.6125252692,"remaining_time":0.03496444454}, -{"learn":[0.2123049072],"iteration":946,"passed_time":0.6130689864,"remaining_time":0.03431114707}, -{"learn":[0.2120079526],"iteration":947,"passed_time":0.6135146875,"remaining_time":0.03365270438}, -{"learn":[0.2117684079],"iteration":948,"passed_time":0.6140811754,"remaining_time":0.03300120121}, -{"learn":[0.2115994564],"iteration":949,"passed_time":0.6147059998,"remaining_time":0.03235294736}, 
-{"learn":[0.2113548624],"iteration":950,"passed_time":0.6152105047,"remaining_time":0.03169854335}, -{"learn":[0.2111350383],"iteration":951,"passed_time":0.6158322973,"remaining_time":0.03105036793}, -{"learn":[0.2108422305],"iteration":952,"passed_time":0.616352162,"remaining_time":0.030397221}, -{"learn":[0.2106553572],"iteration":953,"passed_time":0.6168799266,"remaining_time":0.02974473441}, -{"learn":[0.2103629942],"iteration":954,"passed_time":0.6173293363,"remaining_time":0.0290888169}, -{"learn":[0.2100700906],"iteration":955,"passed_time":0.6178461359,"remaining_time":0.02843643303}, -{"learn":[0.2098822147],"iteration":956,"passed_time":0.61836141,"remaining_time":0.02778426398}, -{"learn":[0.2096414268],"iteration":957,"passed_time":0.6188691933,"remaining_time":0.02713205232}, -{"learn":[0.2094775353],"iteration":958,"passed_time":0.6194397643,"remaining_time":0.02648282621}, -{"learn":[0.209343031],"iteration":959,"passed_time":0.6200932802,"remaining_time":0.02583722001}, -{"learn":[0.2091770747],"iteration":960,"passed_time":0.6206941564,"remaining_time":0.02518946108}, -{"learn":[0.2089391159],"iteration":961,"passed_time":0.6211844623,"remaining_time":0.02453743198}, -{"learn":[0.2087473593],"iteration":962,"passed_time":0.621883416,"remaining_time":0.02389375534}, -{"learn":[0.2084878087],"iteration":963,"passed_time":0.6224494507,"remaining_time":0.02324500023}, -{"learn":[0.2082003291],"iteration":964,"passed_time":0.6229478789,"remaining_time":0.02259396452}, -{"learn":[0.2079655379],"iteration":965,"passed_time":0.6237996107,"remaining_time":0.02195567988}, -{"learn":[0.2078309123],"iteration":966,"passed_time":0.6248229071,"remaining_time":0.02132280862}, -{"learn":[0.2076701543],"iteration":967,"passed_time":0.6253783193,"remaining_time":0.02067366345}, -{"learn":[0.2074827892],"iteration":968,"passed_time":0.6258766768,"remaining_time":0.02002288646}, 
-{"learn":[0.2072977998],"iteration":969,"passed_time":0.6264908643,"remaining_time":0.01937600611}, -{"learn":[0.2071178449],"iteration":970,"passed_time":0.629342721,"remaining_time":0.01879602359}, -{"learn":[0.2069490254],"iteration":971,"passed_time":0.6302602156,"remaining_time":0.01815564407}, -{"learn":[0.2068189063],"iteration":972,"passed_time":0.6317412629,"remaining_time":0.01753033309}, -{"learn":[0.2066576438],"iteration":973,"passed_time":0.632383109,"remaining_time":0.01688086328}, -{"learn":[0.2063765356],"iteration":974,"passed_time":0.6329266285,"remaining_time":0.01622888791}, -{"learn":[0.2061198865],"iteration":975,"passed_time":0.6336683404,"remaining_time":0.01558200837}, -{"learn":[0.2059267993],"iteration":976,"passed_time":0.6352760759,"remaining_time":0.01495532216}, -{"learn":[0.2057632846],"iteration":977,"passed_time":0.636087611,"remaining_time":0.01430871927}, -{"learn":[0.2055076659],"iteration":978,"passed_time":0.6370669428,"remaining_time":0.01366537875}, -{"learn":[0.2053591971],"iteration":979,"passed_time":0.6379596244,"remaining_time":0.01301958417}, -{"learn":[0.2050760154],"iteration":980,"passed_time":0.6387834672,"remaining_time":0.01237195298}, -{"learn":[0.2047961482],"iteration":981,"passed_time":0.639321911,"remaining_time":0.01171873157}, -{"learn":[0.2045843367],"iteration":982,"passed_time":0.6400619719,"remaining_time":0.01106923044}, -{"learn":[0.2043832455],"iteration":983,"passed_time":0.6438909715,"remaining_time":0.01046977189}, -{"learn":[0.2041069418],"iteration":984,"passed_time":0.6445882918,"remaining_time":0.009816065358}, -{"learn":[0.2038964459],"iteration":985,"passed_time":0.6451535577,"remaining_time":0.009160395342}, -{"learn":[0.2037365884],"iteration":986,"passed_time":0.6460587828,"remaining_time":0.008509386197}, -{"learn":[0.2035126856],"iteration":987,"passed_time":0.6467510394,"remaining_time":0.007855275782}, 
-{"learn":[0.203312993],"iteration":988,"passed_time":0.6473119973,"remaining_time":0.007199627877}, -{"learn":[0.2031612588],"iteration":989,"passed_time":0.6514480141,"remaining_time":0.006580282971}, -{"learn":[0.2029575471],"iteration":990,"passed_time":0.6520319575,"remaining_time":0.005921581854}, -{"learn":[0.202779838],"iteration":991,"passed_time":0.6524651455,"remaining_time":0.005261815689}, -{"learn":[0.2026532761],"iteration":992,"passed_time":0.6531322932,"remaining_time":0.004604155139}, -{"learn":[0.2023749756],"iteration":993,"passed_time":0.6536547356,"remaining_time":0.003945602026}, -{"learn":[0.2022287718],"iteration":994,"passed_time":0.6550559346,"remaining_time":0.003291738365}, -{"learn":[0.2019803503],"iteration":995,"passed_time":0.6567962655,"remaining_time":0.002637736006}, -{"learn":[0.2017732369],"iteration":996,"passed_time":0.6572189676,"remaining_time":0.001977589672}, -{"learn":[0.201497757],"iteration":997,"passed_time":0.6575678617,"remaining_time":0.001317771266}, -{"learn":[0.2012941028],"iteration":998,"passed_time":0.6578914572,"remaining_time":0.0006585500072}, -{"learn":[0.2011675132],"iteration":999,"passed_time":0.6583236286,"remaining_time":0} -]} \ No newline at end of file diff --git a/catboost_info/learn/events.out.tfevents b/catboost_info/learn/events.out.tfevents deleted file mode 100644 index 16a27132e66003b1ce82de968a4b7759a1637e84..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 54870 zcmZ|YcRWz<8$bS*riS*Qy@%4?dk<~YyS-DI(o!}VA*&%Xl&FY88d7F95eW?z_JDrz!Osf+Z|d8Hs%kCccydO)EuuNuo$!<^w#U8=EquX)u( zu7(~bs_b9uQ@m;_SL$(l;JCn$2q)eE?0`lMClGP5?tjLa@Ee0 zsIgznjQCiJpfrh^m9eFmS1sjgRY#)g_3-J>t5$Lqo-s`7MnIY32wt_8tAwPkV78@%cuSG7NfNEr^DcuSjC9p%a`fT-UeVlsKvNv<|8BdTUnhkRajmaFhi zM1AhF$B$QCMB=3HbfoQo$-`c-Q+56AyEdo>)Y{4S*~;%5cTF$ ze>aO?5`v~;8kzAnz@*$mb;w(@v4tpHS0!H!DQ$5yy`1gb6*XR>e5#Z$mLZ( zxr+24YLVZ*NM7}qt3exy`taakA6^ZRtDB99D*1A~8LtM)RqO2jQeEetE(qh*Ai26_ 
zMO0i^#%W#+mMgb~M4dG4Z^5e}a`pCWKPkheomv(0YN%YTa3v~ySqH(}VVGR~+eXy* zfG%l#ELFKO?MKvEh2;Ib8ZK9@O8QE5_1ro`k5?n)D$S3makcXWW3QTAwL3`EO4DCY z_*f(5>h?UMq7`)gc{NI|J}D8kK02x^uSUz2X=ER%8>_68gLySZuHp_7bv@;g;65BH zSI_1Xb@k}^PJFC!auwEwDBWuTD|j_tu8QyVmg>4>w@)xoSJULG_(e~tu0C_F|KZhixr%coO6kXA!My>iHmi9=SxnFq+#56Hu{^7KNEwDr z@EXQvI7_Y`c@UNTPa%s}v*k)@JyBzN?4Hi6IdWw@hN#EWGB5LLu3S|$A*yV>zTl3U zCsz)6-KB0=zI>9$$C@u!o{mJ>kIk~>)dIPiw27#oUM7QhwNS2xj3R2L%Ur>%a*hRoZBxwrh`Q%d4ewHKGks zUtb0}^JGW}4X~DOc^<5M|+VQ-xPr za&_~0SE;Va?Yv@mwMMQAU5M(l?$QZft(B|l14Om|IY%%{`A@E%_9QAaCP#3U*U6Pu zeix~(ixU-+_zc&})d?G-A_p%M%r`d3)p~WJ7ENgKijTEXu1+^6s$Tk_#k|@iS8JYi zmg@2RwMDLOuO!OPze=z|+$vXJ+Y@!}>%={LtZi~NFuRjf zSI*#E!76*ZTm||OwPUKLV3oZ?u0jtJRbH#{n9p#hTvbdYYRLfavAo(PS02rYvWXLo z^SkBBHm{@94fQLIN_;GBxiYsQYE_ukH(u?LD}yaW^|3ip#jCw?wQnR*lS>~9_F#47 z>P`cq`Yf$~osYFouFTUqNZq)SxS<2D_RH1#Gej->7E;5j19G)|4N+sabkZx)P-yv$7>0OINPuuO{lC*7ixfIwDtn`xABI zK%U^<(32~ZvUXBk^;4U6=VKj}t3fx28m(|Jkypp$YO5Ymb0TeOd39W_F3%%swvC-K zuTIES&F{8SU70TD1#hC0a+MuP)YP$lf^lA7uKpSjPGFY!mb1APO>6j2hWvGvJBxUtJ@R8RHhVL^UMfX$|b_8 z^71$F%oy15{|FoQ;@uvenE)H5OjvQ>;R|?X3M{3hjntCYX6M%M%nVq>L&E&+4Fh;) z4ou6EFc<6hc098Hwq-M6%Wa!v@$3vR?a_oiTQuMq&n$u6P$cYP$bEI5SpjSLs>E{e}}M$POD$>%m!Gf8DV<|uXf|vSzx0y37fvEeFe{KfwgQ!*vm8LTk*^e z*u#=mQcGT^J|4+4dte$@2upA>4$mBb?NlY~l(uCw&m4j6QY0*? 
z)?zcyoPf=K(Nb#3Mho>5JaY!t&4)1mk;#I`)CE|THeum2bd30D=YgFYMp(^}ZYy|p z0a#wKqEwr*Zn*}}E&?kLB`m~X`YxVb0;a4_*xOvMdOULlmOY!W)N>lXJi83+@!u9w zZF(EtZseI8uxob->u{!LKF{2NnV%+X*u}kqSBD2MV46>?*Kn5rkQXYNYb)8n8E(gv~qHDurkMzb6z z-Q`&buxkc{4V(S)H_t+WEm}_4nRUhkcy>jX|g9yv?$>`0qNML^|6{Ol) zZri59v-`j%-zRLL;m-3sdjM>s31OY8sx2m;2^6Uw)FVhH1-*NgV&z=GcRVJ)+ z!j?3i#Q?kbuA$UPl}1kmwZ#H!{FpFR-+tZrXmP+MI1x5w?MYLf#REH|Ntj}@w6#1- z05-QDVV?q=BYBnxEb&tVsU?{$(*#4%Ghj6_glRAM?!iY(0yf8;GVP9&d6o?9%1OfJ zE5(22SqiX!T7(6~ zGJsj@6Q^(d6olg)Lp`|(*kbt>;0h-_-y2HTvU=-ZOcY4{Wp@VGEWt6YM;_0=8@dVS5a92Jz7ffIWL&Dz9v|&Bu+iuLN=fcH8#R_^Z-Dh%MA-H6-h!w5Eij9w zgw52hoy13b2Ta-TkCfzv4;kh>D+bngEn(URdk99Z5@7Rx{g$HL|E}-EM|%%!a{yuH z_1zBhtQ1)6c)~s<+%V=@8L;_jzoaBjPF%0bvk${v7YlvnpVIHiW63t`uCiZ@{u95H{`g z4qHB2HL&xU-=!qWR;%Xn>^rb)YY7``P%)BcHNd7S5q4q6;eR~)0n9ADT1qnG_n;*_ z`w1-i3}M!4GXy&rzkp>-Cv5A0biphCH?R)H-=rkl=fw}_ll%kh!3Dwww6)RZ*fJmu16C7A*sk{T z1gm^=U||agYZkZDfsbYZ?8vLnQj+>{mM3|32G}lJ!dAJO3D$3xz_u(SY_UzO;2_Nk z*oRM*Qj*795(R^~HLxYe2`k*R%9^ju2G}(f!j^vj`iy61ft`u2kdoYGr7u`@*aF*g zim2vy>l6R!SKlFXUhY!T1S z1DmE#*d&wlg8j)0z(x)yOy^Y>!REk4V8K-%q$Kw**d%xfUjk-+iLl`&DT39aD=_03 zgq613HJWe9WnkWEWm1w2Y8DEX4{pE~oF&Xy^LPtBnme#aRl<6t`)KjZ1K6uKrBaf% zzO4#*<_RpsiZHEiN$q&%1#Hng!ro1t^pI!Xz?w87%*OQb3!Ys8wlCqmRNLydjaTu^ z2Uvy$VG}He#q-P;*u%+$T{HXDk7s_sMiiGwNxDBj^^Ip&fi18g%)4Rw5T0EF)@}r0 z3h$pD;+a3Nqlv{*lC3(A5ey2~fhn00*1*4BXFgg0ux@IEdBv&=HW~wgrB}R@k}P_1 zU$E-90j$3RVVnBi`@tu96WFmigjF;O&*0fDU`HDg_G4nnGM)tiTjlars!h4&vrFg6*+zV0W^Mq$C%ZW(vOehyZrlkFXO< zZwbc9d%&W05H@h9{RzIdNMIdQ33EAKHGyaMfgOHQDAm?7Jh?f~9sv7bLYR+VuHc@0 z2<+;3!kT}%FBpOz0c%%UASJoD_JA^o1AANj zN=njbxr5+P{t2+vR|wlRY?>z@?J2P3T7=Ed_TS307+_oL5%#xza01U_fnB__`$di)XG-dg7o+ShGx?<;`w3e!FvgFMmH|vhjWC5W^=&-M1ZI)>LQ3*! zn`?q4Z5FVsQ-tlevk`1-J_puOm9PV)TZ8!|vw_)F=156awA2x7Yvur3c$=`O8ZE)c zxi5gZo+7NG^n&1?%mub(3Sn=}Jq5Mp0W0{KE!7s9Jgf=dl9#}A?-6#?e2(Dh&Ih(? z4`G#+PUrb(uYmngC2Uy!$c8*C0Onc#T&nGf<&;xAD+E^LLzvsiv4XW}5wOei3EL2? 
zk;q4T4eW&iVcB|jy?FKpSa?{LRNHVpbHNPnEwD+e2vb;Fa)gie4wz~^!agiNyP9Xk zz$|WON=Z6?4Hqo?N`TEaC+z;}u7WM3_rM0sAncc_S2~|$DX=RFguPt0x|(NYz*>f6 zNVP4SdPlJF@d2385yH~LQZMn*%7Go6LD+1X;PAUH=P6rGM|C1a3QQnM^W(Ld;zv>3So_s%ZvC< zeg!t8IbmtHvKRBL3YbxPs#M#u3@>$_eFNrjfv_oB2L)g1R0C_Xny_67!#48Kz60x^ zLRjPIbAr8~8ep%VrbxA=*gX~O(EkATWgB6U`+n;4N&W=3xEEm#Mrwj1l3&15N|L1{ z3np%8#7FxLY~4-56f74D)^C4+Ej~<`?it&_Vdv!fcfbV z7V}rVC(q6R+c%W3MMFzI@yrs~-tq(~$=8dsTzO^%EX13z7s(!iwW&3**K-K7yP+&t zzu5qrT^27Txg}!nY(B}ez`BMIwr|16Cp@zS7Pg%*C(Fp~;(R_iO`Wh|O_T53T zmS=vzwuccG<*q-DXIFt47!x+CMYmQwy9TWLc)}J~9T5!X{=gy{5mw*vl;EDc4(#!( z$5KnWW-19TcL1+0nsn3)FyOI%La#Qk3c_vCG0>v|AYJH6x=pJWKI zxYS2dOM=s!1fP_K0(*Iku*ZL6JMhu&0P|i!*xa72cJb^kFw4$_&G0%P_?##VSf}WR zQf-&qL!J0&;lO_EApJ7sMfy;E=c$Nk1Y!qRiJhnCA*>hk@R)mF=Mh)dz zHn6{o33J(;H-cw5z)D&YcI0H+l{|X^Oh4hS)Dpk3e+1ui$mtrQIWJbLgVCJUa)hk3C^N3XXW)>Y_OS z3)d#>`0}m;_-Ky69!w`}!^6ziJaYo}q$OdU+Lo&F%o*6eH~vy5x2c;n=a~zz3LnBU za^!j0LxdeK-HU^HT_-O9i=I!I%BxrnKH5cKP8|trly?t5C#s8f3D}L2Yf?+x z%qI2YqqzbzyGhts=X;HLb{UwhK4Itj54z4XH(-OO6IOiTa4OH-f!TH?%)BOc0?#~v zC6`{6T2kKRuQty-fgO89SpV(WxUpY%0la`sb|!3zR{}1L>X)RN)@S-Lz60QMo3u-+Fjs_kQ*4-tyfIX`6ky?^>;LS5W zS`e^J5riFzDO|#{U||29CahV`$x%GJ4eZ56!lH&X!_{J4Z6Uy{XAqXLP7x2%>R2eS zyKM-2Z#v@|pX422dC#v%o$P%`rG{sBf$g|T*x32w+VLz5nD=JF8dhBpT!3(3zXlN2 zc}Q>)A1wmdkAL1$ZC`qM+41ZiFwJ{}?e5;+mS>T`DohE>+qXk--R=V`nnReuic2

uP%ftn3b970Vv`@X?+EOVcN8xwod^o{RxD zWHn)tLEiW^OI?z&!1}8arrv+fR6fZ#V6(n^NG;h^o70zP@xY2w2|KjGS&3%}z%*&nKA%EMg#Gv-7ft@+=+L#cDUHB||a|%y^ao zY;7E2DtA2uul!74l|F>M&i^Zzs$~J&b(*lI5&dwvUUzdn2WF*7SV(-AiF|F@z*qmILfb?PaNxiz5y3$9U@43t&FEghgma<5x&^EEm|cD}kbQ9k4a#gjK~nJIhBa2IjPku!*tb1uv=+U=Do=+j%*$J|FEp zu%-VlN-f#euR!pkDh1Y~fUt2z*91dQ8L*Fmgk1{kBbd;C0H$t2*t|(obokoJfqAME z_HlESU}^Lbn3)P;&t{Lo)4sZ!>k}~Dw-=;Nmhb;Acz0I-yMLRouqqqe#jcB13G9|5 zVX-5||KeNn8JNO$!c49kP3PGcV1p+TX5(qPg=b%Z_3K8My0aFZch}Wc1-y=^nx|9S5fI><6$8 za|tW)F6hOxpTLH6BW(2V>v=r;1+9JhKG$ z&493xFZ$uJU0rQfz?$zM%(l4eXFi%Wu&hyp*=YEkY!x)7$9+CwnB+5=NLMA!fyqw{>UbHH{@Crq*1 z7Z;v605er4?8uD}!9kiMu;%3sQYR}XUcJLda{{LNm@v<>BEi$`49wVrFr&|^f}<@L zU>kJ^8{g-%U{pR2EI^GglV;xpuZ|1AYW|&*T9T}^R?w1*z@Fp~<~pz)ei>JHPhJAH z?+#%lmo@S8y*lO!EXt9v5#M@W;;-9fV9wfv^;1wQ;F%jR>zRb@4Ve?dGk0KW9SGYK zW6+yt9>AuS+DjL}FEBKKXP&^;-X|<+;>J*(c>yc6CQP;E5y6R=H?S^S2}|4BXBi*u z3b5k?3Cn+ZRWJnk01Hqe%!6MTaSXrf=)Jf-;4F#vPe!#{j5~f)7Kfmj*?w-5~ zY)ZEmPd?fWV7~$gi{2b1xF>G{v(+c;u*G@7l`r^~gaHd1L0HDK)q>?iIIv9$gr)6E z6&%P!0K1-KBVD)ONBXMrN!|lC*NL$F{Q9Omiv*@~lrY22?Vs}OKCp)C2s=7p;$@ya z0M>CFVeML<6Krcf1m@I{0@EbH1S1BR&J@CW^q<~{ zj}{9ovJ+vU(S~X~ivyPP#Y(zv?FzgFcS$_3z&OH29%QL)O9G}gj<8V$BLuUcWMH%E5!TdORq)110rs%SQn~=X z-~Q*XJJ!uhQi08UOjw-il#YCDX~3-P2s8Ft{E}zsz(V&Erqp$Z;Mgt$*s_^~HCO5( zc)BxzjqgENlc=R}e3DtfoPM8?I_a)4LxpG0f$h#G?Ddb2g01#!U}fQi)hON-ycTnS zrC%bf%c5n1Tlobr4?V&j3~%Sg*Om(`TZ=Hg*c zyM*aq`zd%)6$5)}PS}zx+aY|CCBQBnAj~4{S_04B1M9JZu=dkJ1+V;4U?;{AHoZr5 zb3R%buE_Y9N64&!Za2(6}$sJ0xNbTtpC9cpZO#| z0h@80uogXH1+V-HU`ERcJFwv7Ha=PI zpxN}dUOf8>tleG03Uj^-=3G_4S{M^nldxN`@$n5<4^6@{)XsbJNmc_h>qD5%#OZ>Q zmG8hZD@~-9X!V{Yn7!5jJCR3N=)iajKFJ@zEFKf~)!J(o&wc{4yh7OVWf>!Q_6yh> zUBZ^RT)oS)-@trV5O#lxy=cAbeYyH7Ux^Bha9B=T< z0@(5>!c0C7n8LF&z$SST_VI#}V2W)C%C6moddRTFk#jw4t(L61F(L-4Wv$PO3D?y0~~?X`V;oa_sVZRniH@o zdkEXp+s=q*&cKegCCob2<~7e;fHlZFE!B2TwT)m4={zt87s7@HwcN``y8ujME@9gr zEjq}vi@^G}BCN$=qYgZ~1Z;ciDXF%|0n-HAny$e1=nBm|cTcg4#TRMGYqG&fpBe(UupmiHRqr z+5+R33C>NukzFQCYpQ6=_6o3fs|hPCTrr$)i4U;yCWJlxFF|nFItc~guv>f 
zd^A5`$)1FnHG0#EXIFvwEhkK&efoT!T?1y@fG{gZzgIl-2WE5cxKvwpE1xzzyAJHB zIbjh2GX$%}0ALRD2&+HdTkw7h1m>zln9i4><$RJifVrn0lWI$AJX~+H(?ux zsR&+Fw}6e>N!XBgfv5N+gMgjsNmxbT1;K_@FtAonj!LyX8b7E7AMG}p1?j58_)dlcJNEa8RGW5|i(vCR0$5HwVU_kbqxop}fOT~utb5!1xP1ivcz=^pI5BVE-Q7coqvxZ8Kp*9NLxfEDqSkj)eWX-uyey;(`56KPV;n zCc@v0X9>X8_z?EqVU6G&kO=J94#FZA#VhgAo&n1mNSMD_|Jy`}>T z@gl74qH!De+A@HB+)vn&HR}WqPA0HLJqWA!^@d;;lm*PXdY@FAtM-pye3H+B8QdqV zm5$nao@E0o-a^>T|NiICG}i5^L)eX0Z3Qo?7r+c&=}5Jy_nIe|hUNlG_9AR$ z`_LbJZF#^xt|4r~ntcU4dkM^>2VvITh6*OV`M_3x+$+^~d*nyKoa+^^pg_X@v#ocF zPqF~myj6r593S_AXNAB%4kE10sm(<^D*{$skFbyXO$9TB*T4=x+#|K5(Y&sL9r`!G zHrNqn=EeT82oUo2>NineCGHt20{x2fG@T>%wjssx_ z<~I@?x4s8s$Q zKFM-m0}m0_F#YK?o_z%NX(nNNGc;Uz_6eBI$6Zowg>FY}c~$|eDul42UUpM?RtYRs zm$18Swg@IqpMg0JCv2AQQNiiM7hna|JEhw8ovd);ll%(I)swJU|NBd6>)ruXz|QnIJeK`320-m#}^H-U?ds8<^5Q!lK4aujX6w z2bk3&!tUJt9l^7|z(zG8tcB9q13aq*W>vUNYKfPE+ajL*19sY0kWP2}fq zT`UNTDm!_Sk7fj{pB7<;Hx3Fmb&P@iYDCykllOu*jtQ{EueM4p*>|UnU@yoNm_Z<6 z55Jrc95$H&v)D}7%q2GVd`rxM<*N|(_4%6NJhK3HCUc8aTZL~w!C-y{*tr10z8q`V znvZ4)Z2lU;Qj)x5d1eJ{K@-AU{~k@{nKiKE>6@k6S~&K8#xomWA3O=0JA2Iuo}C5O zSBJ1(>m3jA%ofYBprYiE+On=X-o~z9D#jpO_=>ld%>ve1nfk{Mya;>`+gnaqd5a}y+W8p)RSF2 za{)H*C}E9k69iAdd0^WX6V~z9k4b#A3&5&X2y5HGRgY&EfgSq1L28N1x35cib_v)U z55lII>=%4;pH0=>#Cy7d3FWZ zv>3v)`xr&?%m-MaC1K|Me+sUfFR)G<2}>VnFW4aV19nrHu&X_m4B?Z!3anx3e^N`H zH?26vvunUQTqdk}l)`78`2(A}pD^D(DTO?{4(#$w!VEOq3tssFz*?ygw&ZFP7d~1b zu$AR&rIu8B{jT8I4Pdr+2>U!}RT9r`0$X#0utC#$D)Q_WFx@ePwY72q| zncv#;><%!C8-)41{Uq2NxC?Bz4q=96S{i(`FklIz2s?RThC0u}fi-!rDb_CU60`?HtJR`#L3O$1OXpev?Y#_|oq|11oMFBG%N!Z(nkq3G97+9ZgtEHA~ndU2) zb43Fycud&Tzhx$Tv?su_P7&5wAiA1xM` zXF6eHXg^!jB ztdkO9kHP{L@+=M5iGmeUOa2YW6--0Zfn5$E?8x)yg2yxi*rfx6S+86#7$-A<-CRIe z%T^l%E8#3)hyE>>Y75BdpTM`|Ij|0Q3DY^zPH~p5N)RJcf8}xYg4w$bSVSm4> r37&vrVEa}RHnMZYVm?|4uogWD8-0509-h4i_Of<~R9n_>&u9N1#T4LU diff --git a/catboost_info/learn_error.tsv b/catboost_info/learn_error.tsv deleted file mode 100644 index b27d717..0000000 --- a/catboost_info/learn_error.tsv +++ /dev/null @@ -1,1001 
+0,0 @@ -iter Logloss -0 0.6921376331 -1 0.6914311222 -2 0.6902726804 -3 0.6893236297 -4 0.6884269714 -5 0.6877708094 -6 0.6868745599 -7 0.6854426605 -8 0.6846512471 -9 0.6835081152 -10 0.6828162926 -11 0.6821300387 -12 0.6810161471 -13 0.6800281746 -14 0.6789965715 -15 0.6778871247 -16 0.6767778482 -17 0.675800315 -18 0.6745953049 -19 0.6735743199 -20 0.6728023546 -21 0.6717833025 -22 0.6705873864 -23 0.6691952518 -24 0.6683306949 -25 0.6672187277 -26 0.6662092464 -27 0.6652034862 -28 0.664099523 -29 0.6628215143 -30 0.6619682567 -31 0.6605963962 -32 0.6597477538 -33 0.6585793495 -34 0.6574136274 -35 0.6566625323 -36 0.6557201062 -37 0.6547810435 -38 0.6537957873 -39 0.6524456825 -40 0.651703792 -41 0.6505536692 -42 0.6495760253 -43 0.6482374328 -44 0.6469012243 -45 0.6460723281 -46 0.645103676 -47 0.6441353474 -48 0.6431703482 -49 0.6423503671 -50 0.6417189837 -51 0.6408985257 -52 0.6395800029 -53 0.638454846 -54 0.637829457 -55 0.6369738238 -56 0.6356642927 -57 0.6346218245 -58 0.6335089888 -59 0.6327144418 -60 0.6314146519 -61 0.6302148189 -62 0.6294122423 -63 0.6283095479 -64 0.6273769736 -65 0.626091625 -66 0.6248111555 -67 0.623531299 -68 0.6225138136 -69 0.62191024 -70 0.6212150369 -71 0.6205226438 -72 0.6194400617 -73 0.6186558434 -74 0.6179682612 -75 0.6168900728 -76 0.61597624 -77 0.6151563355 -78 0.6140871133 -79 0.6135282346 -80 0.6125324198 -81 0.6116280981 -82 0.610728332 -83 0.6099583251 -84 0.6091488685 -85 0.607908896 -86 0.6066734365 -87 0.6057140146 -88 0.6048230018 -89 0.6038645932 -90 0.603286062 -91 0.6020624808 -92 0.6010184118 -93 0.5997997522 -94 0.5988519958 -95 0.5978163225 -96 0.5968528134 -97 0.5956415364 -98 0.5950270976 -99 0.5940695831 -100 0.5931115406 -101 0.5923869354 -102 0.5916092396 -103 0.5907429201 -104 0.5900058321 -105 0.5892723969 -106 0.5880816919 -107 0.5874404567 -108 0.586706996 -109 0.5859788145 -110 0.5852130651 -111 0.584120197 -112 0.5833940932 -113 0.5823940975 -114 0.5816756827 -115 0.5805052434 -116 
0.5794262716 -117 0.5787069798 -118 0.5777818561 -119 0.5768609388 -120 0.576025205 -121 0.5753114138 -122 0.5741571529 -123 0.5731786489 -124 0.5722654206 -125 0.5715580412 -126 0.5707720092 -127 0.5702917491 -128 0.5693833147 -129 0.5687683991 -130 0.567802012 -131 0.5668365444 -132 0.566021783 -133 0.5654101968 -134 0.5645956993 -135 0.5639859353 -136 0.5632997581 -137 0.5626934341 -138 0.5619667087 -139 0.561355676 -140 0.5606338722 -141 0.5599243215 -142 0.5588081224 -143 0.5582083719 -144 0.5571798342 -145 0.5560722521 -146 0.555395518 -147 0.5548768214 -148 0.5540812016 -149 0.5532853092 -150 0.5524131826 -151 0.5514765126 -152 0.5503822821 -153 0.5496355976 -154 0.5487687332 -155 0.5479253531 -156 0.5468407444 -157 0.5457558547 -158 0.5448993189 -159 0.5442429866 -160 0.54350417 -161 0.5428456068 -162 0.5421104516 -163 0.5410383344 -164 0.5401923571 -165 0.5392873032 -166 0.5386331763 -167 0.5379828811 -168 0.5372601577 -169 0.5364935909 -170 0.5359273042 -171 0.5352867246 -172 0.5346508111 -173 0.5338960375 -174 0.5331440909 -175 0.5325764247 -176 0.5319340825 -177 0.5308882339 -178 0.5301761712 -179 0.5292916383 -180 0.5285842078 -181 0.5280214122 -182 0.5274686984 -183 0.5265111327 -184 0.5254784737 -185 0.5246844462 -186 0.5240639022 -187 0.523193121 -188 0.5224034829 -189 0.5215357968 -190 0.5205158591 -191 0.519499617 -192 0.5189555841 -193 0.5182280072 -194 0.5176118144 -195 0.516600694 -196 0.5160629494 -197 0.5152068564 -198 0.5146640411 -199 0.5139804397 -200 0.5130543666 -201 0.5122875103 -202 0.5114408987 -203 0.5108332038 -204 0.5102274801 -205 0.5096990253 -206 0.5091704939 -207 0.5084650133 -208 0.5076262014 -209 0.5067908296 -210 0.5058066249 -211 0.505107935 -212 0.5043528165 -213 0.5037583411 -214 0.5033062739 -215 0.502780165 -216 0.5019570121 -217 0.5011352003 -218 0.500619271 -219 0.499652671 -220 0.4991394196 -221 0.4984841432 -222 0.4977248609 -223 0.4969089159 -224 0.4962528561 -225 0.4952942176 -226 0.4947139238 -227 0.494205675 
-228 0.4934548054 -229 0.4928752099 -230 0.4921438651 -231 0.4915419051 -232 0.4906675262 -233 0.4898703141 -234 0.4894825518 -235 0.4889106836 -236 0.4883435837 -237 0.4875512464 -238 0.4866151299 -239 0.4858255344 -240 0.4852675327 -241 0.4844093025 -242 0.4839843852 -243 0.4834949417 -244 0.4828630686 -245 0.4820830396 -246 0.4811603895 -247 0.4806003102 -248 0.4801141407 -249 0.4795589915 -250 0.4790101647 -251 0.4783102529 -252 0.4777278134 -253 0.4772474894 -254 0.4764735358 -255 0.4755656251 -256 0.4751491078 -257 0.4743878543 -258 0.4736746209 -259 0.4729804865 -260 0.4724334478 -261 0.4720227505 -262 0.4712655161 -263 0.4704395831 -264 0.4698364777 -265 0.4693632637 -266 0.468612028 -267 0.4677223393 -268 0.4670231215 -269 0.4663278971 -270 0.4658006259 -271 0.4653341855 -272 0.4646414902 -273 0.4639707293 -274 0.4632801924 -275 0.4628103631 -276 0.4620725938 -277 0.4616752139 -278 0.4608016184 -279 0.459932306 -280 0.4593444594 -281 0.4588904636 -282 0.4582101745 -283 0.4575310392 -284 0.4566698628 -285 0.4561560367 -286 0.4554346417 -287 0.454855838 -288 0.4541831996 -289 0.4536689264 -290 0.4531557347 -291 0.4527086743 -292 0.4522080932 -293 0.4516352969 -294 0.4509672991 -295 0.4502509066 -296 0.4495855272 -297 0.4488103007 -298 0.4482808496 -299 0.44764027 -300 0.4471345288 -301 0.4464275283 -302 0.445842513 -303 0.4453389304 -304 0.4445062748 -305 0.4438071592 -306 0.4431103936 -307 0.4424160804 -308 0.4419180283 -309 0.4410959354 -310 0.4403396802 -311 0.439715841 -312 0.4391380974 -313 0.4387702474 -314 0.4382839075 -315 0.437851578 -316 0.4370380257 -317 0.4365532909 -318 0.4359790853 -319 0.4353643443 -320 0.4349343138 -321 0.4341925383 -322 0.4335165833 -323 0.4328867538 -324 0.4323144172 -325 0.4316435754 -326 0.4310806351 -327 0.4305228548 -328 0.4299000289 -329 0.4294867728 -330 0.4287576846 -331 0.4281379964 -332 0.4274753375 -333 0.4269457119 -334 0.4265914985 -335 0.4261210859 -336 0.4253347473 -337 0.4246753199 -338 0.4242052266 -339 
0.423488817 -340 0.4228350009 -341 0.4222266035 -342 0.4217495535 -343 0.4212821083 -344 0.4208801644 -345 0.4203901972 -346 0.4198750854 -347 0.4193912702 -348 0.4187875901 -349 0.4183272166 -350 0.4178133053 -351 0.4172735342 -352 0.4165695991 -353 0.416060222 -354 0.4154641713 -355 0.414705119 -356 0.4140638581 -357 0.4134714263 -358 0.4131337319 -359 0.4126064479 -360 0.4121523883 -361 0.4115223118 -362 0.4110527294 -363 0.4106555283 -364 0.4100904507 -365 0.4095082581 -366 0.4088843039 -367 0.4081387094 -368 0.4073968657 -369 0.40671779 -370 0.4063282268 -371 0.4058952715 -372 0.4054609069 -373 0.4049458589 -374 0.4045187661 -375 0.4038458211 -376 0.4031138676 -377 0.4025977935 -378 0.4021515165 -379 0.4017208729 -380 0.4012702831 -381 0.400815968 -382 0.4003585875 -383 0.3996368136 -384 0.3989150098 -385 0.3984945757 -386 0.3979838576 -387 0.3975541421 -388 0.3969472051 -389 0.3965793593 -390 0.3960817797 -391 0.3953677969 -392 0.3948702344 -393 0.3943742556 -394 0.3938194088 -395 0.393111263 -396 0.3924046883 -397 0.3919914365 -398 0.3914010099 -399 0.390907675 -400 0.390361424 -401 0.3897764002 -402 0.3892910055 -403 0.3889340375 -404 0.3882963104 -405 0.3878849873 -406 0.3873047105 -407 0.3869512975 -408 0.3864888975 -409 0.3860683399 -410 0.3854924994 -411 0.3850928077 -412 0.3847414127 -413 0.3842214559 -414 0.3839176638 -415 0.3834964207 -416 0.3829643939 -417 0.3826096952 -418 0.3820993347 -419 0.3815246395 -420 0.3812263012 -421 0.380756293 -422 0.3803054477 -423 0.3799018434 -424 0.3793343902 -425 0.3789422768 -426 0.3786449858 -427 0.3781736961 -428 0.3776146003 -429 0.3772632352 -430 0.3768726545 -431 0.3763188847 -432 0.3759762943 -433 0.3756239925 -434 0.3752392616 -435 0.3746320861 -436 0.3739689333 -437 0.3736773006 -438 0.3732340549 -439 0.3725745422 -440 0.3719174904 -441 0.3714807417 -442 0.3710459641 -443 0.3703917946 -444 0.3699332518 -445 0.3694799159 -446 0.3688319496 -447 0.3682378743 -448 0.3677363523 -449 0.3674085396 -450 0.367077423 
-451 0.3667051792 -452 0.3664227 -453 0.3660525254 -454 0.3654101065 -455 0.3649876288 -456 0.3646104591 -457 0.364115434 -458 0.3637939692 -459 0.3634185621 -460 0.3628906863 -461 0.362450093 -462 0.3618173812 -463 0.3612926432 -464 0.3608229033 -465 0.3603342303 -466 0.3599510874 -467 0.3593257538 -468 0.3588909294 -469 0.3584752466 -470 0.3580092447 -471 0.3575279159 -472 0.3570059666 -473 0.356525468 -474 0.3561132167 -475 0.3557490196 -476 0.3554368232 -477 0.3550801958 -478 0.3544650078 -479 0.3541559832 -480 0.3537509612 -481 0.3532782325 -482 0.3527637294 -483 0.3522067794 -484 0.3516512513 -485 0.3513434274 -486 0.3509157343 -487 0.3505166599 -488 0.3500145929 -489 0.3496588809 -490 0.3492625185 -491 0.3488656453 -492 0.3484638504 -493 0.3480004881 -494 0.3476062758 -495 0.3472642175 -496 0.3468141258 -497 0.3463197521 -498 0.3460200386 -499 0.3455273892 -500 0.3449356386 -501 0.3445860488 -502 0.3441699871 -503 0.3437617634 -504 0.343414247 -505 0.3430223976 -506 0.34276789 -507 0.3423123913 -508 0.3419258424 -509 0.3414402349 -510 0.3409900154 -511 0.3404066435 -512 0.3400745732 -513 0.3396446535 -514 0.3392627665 -515 0.3389605539 -516 0.3386215099 -517 0.3381765485 -518 0.3376931037 -519 0.337248815 -520 0.3367245538 -521 0.3363859143 -522 0.3359881639 -523 0.335651734 -524 0.335174254 -525 0.3346553019 -526 0.3342645126 -527 0.333931561 -528 0.3334565205 -529 0.3331309302 -530 0.3328454452 -531 0.3325251554 -532 0.3321362521 -533 0.3318056336 -534 0.3313364259 -535 0.3309532276 -536 0.3306332145 -537 0.3302503697 -538 0.3298849165 -539 0.3294564698 -540 0.3290752811 -541 0.3286145287 -542 0.3283276004 -543 0.327948004 -544 0.3276326656 -545 0.3273104302 -546 0.3269880712 -547 0.3266501256 -548 0.3262259236 -549 0.3258500312 -550 0.3253507018 -551 0.3250135141 -552 0.3246421516 -553 0.3244070964 -554 0.3240867768 -555 0.3238518877 -556 0.3235307634 -557 0.3231597074 -558 0.3226190891 -559 0.3221721266 -560 0.3217267905 -561 0.32127652 -562 0.3210437383 
-563 0.3208125659 -564 0.3204400071 -565 0.3199909372 -566 0.3196365365 -567 0.3191042415 -568 0.3186643677 -569 0.3183591408 -570 0.3178278974 -571 0.3174204188 -572 0.3169393625 -573 0.3165036993 -574 0.3161586651 -575 0.3158007775 -576 0.3153626706 -577 0.3148854673 -578 0.3143635946 -579 0.3138878771 -580 0.3134522523 -581 0.3131497758 -582 0.3128520804 -583 0.3124257113 -584 0.3121328269 -585 0.311617566 -586 0.3113503967 -587 0.3110510664 -588 0.3105376831 -589 0.3101793698 -590 0.3097576456 -591 0.3093302974 -592 0.3089044435 -593 0.3085143992 -594 0.308161561 -595 0.307898977 -596 0.3076050154 -597 0.3073084865 -598 0.3070910786 -599 0.3066311819 -600 0.3063479534 -601 0.3059329774 -602 0.3055891906 -603 0.3052941263 -604 0.3049547076 -605 0.3044544842 -606 0.3041649589 -607 0.3036680775 -608 0.3033767513 -609 0.3029951879 -610 0.3025867684 -611 0.3020925096 -612 0.3018062498 -613 0.3013960038 -614 0.3010602551 -615 0.3007759707 -616 0.3005227702 -617 0.3001474057 -618 0.2998298875 -619 0.2994279989 -620 0.2990533326 -621 0.2986103424 -622 0.2982386308 -623 0.2980017236 -624 0.2976457221 -625 0.2973187694 -626 0.2970820921 -627 0.2968406635 -628 0.29651102 -629 0.2962718989 -630 0.2960237563 -631 0.2957421456 -632 0.2954565712 -633 0.2951291544 -634 0.2948581576 -635 0.2944618165 -636 0.2940285674 -637 0.2937956708 -638 0.2935192755 -639 0.2932442342 -640 0.2928142079 -641 0.2925079678 -642 0.2922439575 -643 0.2919395225 -644 0.2915480094 -645 0.2912747434 -646 0.2908083541 -647 0.2906082656 -648 0.2902667182 -649 0.2900387049 -650 0.2898409878 -651 0.289457913 -652 0.2891859795 -653 0.288721157 -654 0.288452323 -655 0.2882551636 -656 0.2879799902 -657 0.2876007472 -658 0.287326932 -659 0.2869494557 -660 0.2866300855 -661 0.286373198 -662 0.2859906852 -663 0.2855351972 -664 0.2851559839 -665 0.2848492605 -666 0.2845873215 -667 0.2843303766 -668 0.2840162835 -669 0.2837108139 -670 0.2833648005 -671 0.2830218524 -672 0.2826870212 -673 0.2823174106 -674 
0.2820060615 -675 0.2815587755 -676 0.2813848065 -677 0.28112724 -678 0.2808396901 -679 0.2806121494 -680 0.2802471199 -681 0.2798035315 -682 0.2795483172 -683 0.2792404869 -684 0.2789540802 -685 0.2786558292 -686 0.2782940418 -687 0.277895276 -688 0.2775917564 -689 0.2772567102 -690 0.2770056639 -691 0.2768216282 -692 0.2765277241 -693 0.2762676307 -694 0.2760578649 -695 0.2757622919 -696 0.2754316394 -697 0.2749989714 -698 0.2745687238 -699 0.2743486528 -700 0.2741394831 -701 0.2738577638 -702 0.2736173953 -703 0.2734091218 -704 0.2732017615 -705 0.2729077467 -706 0.2726194156 -707 0.2723656573 -708 0.2721585333 -709 0.2719530272 -710 0.2716654135 -711 0.2713763544 -712 0.271054617 -713 0.270765279 -714 0.2705314734 -715 0.2701860922 -716 0.2699523228 -717 0.2695332468 -718 0.2692927846 -719 0.2690525545 -720 0.2686732369 -721 0.2684303522 -722 0.268084062 -723 0.2678163158 -724 0.2675443845 -725 0.2673065875 -726 0.267035761 -727 0.2668061469 -728 0.2665300156 -729 0.2663027665 -730 0.2661044661 -731 0.2658744007 -732 0.2655614189 -733 0.265313834 -734 0.2650115107 -735 0.2646720558 -736 0.2644053825 -737 0.2639993003 -738 0.2637928831 -739 0.26338629 -740 0.2631142416 -741 0.262746945 -742 0.2625778381 -743 0.26234528 -744 0.262142009 -745 0.2617390794 -746 0.2614471678 -747 0.2611715304 -748 0.2608939814 -749 0.260690721 -750 0.2603874398 -751 0.2601193339 -752 0.2597940266 -753 0.2595251692 -754 0.2592016033 -755 0.2588046512 -756 0.2584771727 -757 0.258083484 -758 0.2576890822 -759 0.2575262125 -760 0.2573358842 -761 0.2571070407 -762 0.2567511073 -763 0.2564899155 -764 0.2561713372 -765 0.2558850923 -766 0.255597336 -767 0.2553038406 -768 0.2551415861 -769 0.2549170256 -770 0.2547311187 -771 0.2544388579 -772 0.2541200263 -773 0.253959594 -774 0.2537388567 -775 0.2534203806 -776 0.2531755737 -777 0.2529106076 -778 0.25252861 -779 0.2522401299 -780 0.2518930776 -781 0.2515788674 -782 0.2513293752 -783 0.2510782033 -784 0.2506980555 -785 0.2503226527 -786 
0.2499814757 -787 0.2496719041 -788 0.2494325127 -789 0.249093871 -790 0.2488785131 -791 0.2486212573 -792 0.2483151789 -793 0.2481008215 -794 0.2478614556 -795 0.2475220348 -796 0.2471528138 -797 0.2467823178 -798 0.2465691545 -799 0.2464111426 -800 0.2462568326 -801 0.245888627 -802 0.2457131488 -803 0.2454448321 -804 0.24514739 -805 0.2449321066 -806 0.244754027 -807 0.2444520933 -808 0.244150364 -809 0.2438756845 -810 0.2436781462 -811 0.2434370794 -812 0.243231222 -813 0.2429795031 -814 0.2428085953 -815 0.2425811674 -816 0.242311599 -817 0.2419550121 -818 0.2418074012 -819 0.2415643207 -820 0.2412046386 -821 0.2410006757 -822 0.240788513 -823 0.2405819957 -824 0.2402247212 -825 0.2400239493 -826 0.2398227815 -827 0.2394675868 -828 0.2391150721 -829 0.238944539 -830 0.2387435521 -831 0.2385065428 -832 0.2383424767 -833 0.2381619513 -834 0.2378786377 -835 0.2375881033 -836 0.2373639579 -837 0.2370155092 -838 0.2366988765 -839 0.2364143282 -840 0.2361525744 -841 0.2359784991 -842 0.2357424561 -843 0.2355073158 -844 0.2353411147 -845 0.2350888635 -846 0.2348985119 -847 0.2346166734 -848 0.2343535147 -849 0.2340152775 -850 0.2338253856 -851 0.2335178043 -852 0.2333585535 -853 0.2331409327 -854 0.232799132 -855 0.2326270938 -856 0.232290253 -857 0.2320625271 -858 0.2317883436 -859 0.2315655244 -860 0.2313702852 -861 0.2311274941 -862 0.2307902468 -863 0.230628665 -864 0.2304723965 -865 0.2302186894 -866 0.2300390857 -867 0.2297347912 -868 0.2295422937 -869 0.2294070827 -870 0.229074195 -871 0.2288957728 -872 0.2287379333 -873 0.2284071765 -874 0.2281981345 -875 0.2279745298 -876 0.2277943641 -877 0.227464397 -878 0.2272489561 -879 0.2270080894 -880 0.226743977 -881 0.2265238464 -882 0.2261985966 -883 0.2259350951 -884 0.2256665911 -885 0.2254206049 -886 0.2251793891 -887 0.2249354039 -888 0.224719188 -889 0.2244787983 -890 0.2242641449 -891 0.2240526804 -892 0.2239209754 -893 0.2236602732 -894 0.2235089796 -895 0.2233028178 -896 0.2231716088 -897 0.2229924926 -898 
0.2226752298 -899 0.2225238638 -900 0.2223377015 -901 0.2220517312 -902 0.2217935941 -903 0.2215100569 -904 0.2213576053 -905 0.2210437059 -906 0.2207309093 -907 0.2205256373 -908 0.2203404031 -909 0.2201428967 -910 0.2199867836 -911 0.2197325953 -912 0.21941949 -913 0.2192427261 -914 0.2190470483 -915 0.2188108265 -916 0.2185763461 -917 0.218397545 -918 0.2182284423 -919 0.2180648446 -920 0.2178877081 -921 0.2177140968 -922 0.2174067859 -923 0.2172427475 -924 0.2170075242 -925 0.2167770352 -926 0.2166043286 -927 0.2163751168 -928 0.2161562507 -929 0.2159849725 -930 0.2157700509 -931 0.2155961884 -932 0.2153693672 -933 0.2152064868 -934 0.2149531543 -935 0.2146568788 -936 0.2144625975 -937 0.2142915385 -938 0.2141212757 -939 0.2139780266 -940 0.2137883178 -941 0.2135832033 -942 0.213338275 -943 0.2130945367 -944 0.2127992234 -945 0.2125003551 -946 0.2123049072 -947 0.2120079526 -948 0.2117684079 -949 0.2115994564 -950 0.2113548624 -951 0.2111350383 -952 0.2108422305 -953 0.2106553572 -954 0.2103629942 -955 0.2100700906 -956 0.2098822147 -957 0.2096414268 -958 0.2094775353 -959 0.209343031 -960 0.2091770747 -961 0.2089391159 -962 0.2087473593 -963 0.2084878087 -964 0.2082003291 -965 0.2079655379 -966 0.2078309123 -967 0.2076701543 -968 0.2074827892 -969 0.2072977998 -970 0.2071178449 -971 0.2069490254 -972 0.2068189063 -973 0.2066576438 -974 0.2063765356 -975 0.2061198865 -976 0.2059267993 -977 0.2057632846 -978 0.2055076659 -979 0.2053591971 -980 0.2050760154 -981 0.2047961482 -982 0.2045843367 -983 0.2043832455 -984 0.2041069418 -985 0.2038964459 -986 0.2037365884 -987 0.2035126856 -988 0.203312993 -989 0.2031612588 -990 0.2029575471 -991 0.202779838 -992 0.2026532761 -993 0.2023749756 -994 0.2022287718 -995 0.2019803503 -996 0.2017732369 -997 0.201497757 -998 0.2012941028 -999 0.2011675132 diff --git a/catboost_info/time_left.tsv b/catboost_info/time_left.tsv deleted file mode 100644 index 6dc4bb5..0000000 --- a/catboost_info/time_left.tsv +++ /dev/null @@ 
-1,1001 +0,0 @@ -iter Passed Remaining -0 0 550 -1 0 391 -2 1 350 -3 1 331 -4 1 311 -5 1 291 -6 1 281 -7 2 269 -8 2 276 -9 2 269 -10 2 263 -11 3 262 -12 5 409 -13 5 396 -14 5 384 -15 6 373 -16 6 364 -17 6 365 -18 6 356 -19 7 351 -20 7 347 -21 7 340 -22 7 334 -23 8 327 -24 8 323 -25 8 317 -26 8 313 -27 8 309 -28 9 305 -29 9 300 -30 9 297 -31 9 292 -32 9 289 -33 10 285 -34 10 282 -35 10 280 -36 10 277 -37 10 274 -38 11 277 -39 11 274 -40 11 273 -41 11 271 -42 12 268 -43 12 265 -44 12 263 -45 12 261 -46 12 259 -47 13 257 -48 13 256 -49 13 255 -50 13 253 -51 13 252 -52 14 250 -53 14 248 -54 14 247 -55 14 246 -56 14 245 -57 14 243 -58 15 242 -59 15 241 -60 15 240 -61 15 239 -62 16 238 -63 16 237 -64 16 237 -65 16 235 -66 16 234 -67 16 232 -68 17 231 -69 17 230 -70 19 248 -71 19 248 -72 19 247 -73 19 247 -74 20 247 -75 20 246 -76 20 245 -77 20 244 -78 20 243 -79 21 245 -80 21 244 -81 21 243 -82 21 242 -83 22 241 -84 22 240 -85 22 239 -86 22 238 -87 22 237 -88 23 236 -89 23 235 -90 23 236 -91 23 235 -92 24 234 -93 24 233 -94 24 232 -95 24 231 -96 24 230 -97 24 229 -98 25 228 -99 25 227 -100 25 226 -101 25 225 -102 25 225 -103 26 224 -104 26 223 -105 26 222 -106 26 221 -107 26 221 -108 27 220 -109 27 220 -110 27 219 -111 27 219 -112 27 219 -113 28 218 -114 28 217 -115 28 217 -116 28 216 -117 28 215 -118 29 215 -119 29 214 -120 29 213 -121 29 213 -122 29 212 -123 30 212 -124 33 235 -125 33 235 -126 34 234 -127 34 233 -128 34 233 -129 34 233 -130 35 232 -131 35 231 -132 35 231 -133 35 230 -134 35 230 -135 36 229 -136 36 229 -137 36 230 -138 47 295 -139 48 295 -140 48 294 -141 48 292 -142 48 292 -143 49 291 -144 49 290 -145 49 288 -146 49 288 -147 49 287 -148 50 286 -149 50 285 -150 50 284 -151 50 283 -152 51 283 -153 51 282 -154 51 281 -155 51 280 -156 52 279 -157 52 278 -158 52 277 -159 52 276 -160 52 275 -161 53 274 -162 53 274 -163 53 273 -164 53 271 -165 53 271 -166 54 270 -167 54 269 -168 55 273 -169 55 272 -170 56 272 -171 56 272 -172 56 271 -173 57 270 -174 57 270 
-175 57 269 -176 57 269 -177 58 268 -178 58 267 -179 58 266 -180 58 266 -181 59 265 -182 60 269 -183 60 268 -184 60 267 -185 61 268 -186 61 268 -187 62 268 -188 62 268 -189 67 288 -190 68 288 -191 68 287 -192 68 287 -193 68 286 -194 69 286 -195 69 285 -196 69 284 -197 70 283 -198 70 283 -199 70 282 -200 70 281 -201 71 280 -202 71 280 -203 78 304 -204 78 305 -205 79 304 -206 79 304 -207 79 304 -208 80 304 -209 80 303 -210 81 303 -211 81 303 -212 82 303 -213 82 303 -214 83 303 -215 83 303 -216 84 303 -217 84 303 -218 85 303 -219 96 343 -220 109 385 -221 109 384 -222 109 382 -223 110 381 -224 110 380 -225 110 379 -226 112 382 -227 115 391 -228 117 395 -229 118 396 -230 119 399 -231 120 398 -232 120 397 -233 121 397 -234 121 396 -235 122 395 -236 122 395 -237 123 395 -238 123 394 -239 125 397 -240 125 396 -241 126 395 -242 126 395 -243 129 401 -244 130 402 -245 130 401 -246 131 400 -247 131 399 -248 132 398 -249 132 398 -250 133 397 -251 134 397 -252 134 397 -253 136 400 -254 136 399 -255 137 398 -256 138 399 -257 138 398 -258 140 401 -259 141 402 -260 142 402 -261 142 402 -262 143 402 -263 144 404 -264 145 403 -265 146 403 -266 147 404 -267 148 404 -268 150 408 -269 152 413 -270 153 413 -271 154 412 -272 154 411 -273 155 411 -274 155 410 -275 156 410 -276 157 410 -277 157 410 -278 158 409 -279 161 415 -280 161 414 -281 162 413 -282 163 413 -283 163 412 -284 164 411 -285 164 410 -286 164 409 -287 165 408 -288 166 409 -289 166 408 -290 167 407 -291 169 411 -292 170 412 -293 171 412 -294 172 412 -295 173 412 -296 173 411 -297 174 411 -298 176 413 -299 176 412 -300 177 411 -301 177 411 -302 178 410 -303 179 410 -304 179 409 -305 180 408 -306 180 407 -307 181 407 -308 182 407 -309 182 406 -310 183 406 -311 183 405 -312 184 405 -313 185 405 -314 186 404 -315 186 404 -316 188 405 -317 188 405 -318 189 404 -319 190 405 -320 191 404 -321 193 406 -322 194 407 -323 195 407 -324 196 408 -325 197 408 -326 198 408 -327 198 407 -328 199 406 -329 199 406 -330 200 405 -331 201 404 
-332 201 403 -333 202 403 -334 202 402 -335 203 402 -336 203 401 -337 204 400 -338 205 399 -339 205 398 -340 206 398 -341 206 397 -342 207 397 -343 208 397 -344 209 397 -345 209 396 -346 210 396 -347 211 395 -348 211 394 -349 212 394 -350 212 393 -351 214 395 -352 216 396 -353 216 395 -354 217 395 -355 218 394 -356 218 393 -357 219 393 -358 220 394 -359 221 394 -360 222 393 -361 222 392 -362 223 392 -363 224 391 -364 224 391 -365 225 390 -366 226 390 -367 226 389 -368 228 390 -369 230 392 -370 231 391 -371 235 396 -372 235 396 -373 236 395 -374 237 395 -375 237 394 -376 238 394 -377 239 393 -378 239 392 -379 240 391 -380 241 392 -381 242 391 -382 243 391 -383 243 390 -384 244 389 -385 244 389 -386 245 389 -387 246 388 -388 247 388 -389 247 387 -390 248 386 -391 248 385 -392 249 384 -393 249 384 -394 250 383 -395 250 382 -396 251 381 -397 251 380 -398 252 380 -399 252 379 -400 253 378 -401 253 377 -402 254 376 -403 255 376 -404 256 376 -405 258 377 -406 259 377 -407 260 378 -408 261 378 -409 262 377 -410 263 377 -411 264 377 -412 265 377 -413 266 376 -414 267 377 -415 268 376 -416 268 375 -417 269 375 -418 273 378 -419 274 379 -420 276 379 -421 277 379 -422 277 378 -423 278 378 -424 278 377 -425 279 376 -426 280 376 -427 281 375 -428 281 374 -429 282 374 -430 282 373 -431 283 372 -432 283 371 -433 284 371 -434 285 370 -435 286 370 -436 286 369 -437 287 368 -438 287 367 -439 288 366 -440 289 366 -441 289 366 -442 290 365 -443 290 364 -444 291 363 -445 292 362 -446 292 361 -447 293 361 -448 293 360 -449 294 359 -450 294 358 -451 295 358 -452 296 357 -453 296 356 -454 297 355 -455 297 354 -456 298 354 -457 299 353 -458 300 353 -459 300 352 -460 301 352 -461 301 351 -462 302 350 -463 302 349 -464 304 349 -465 304 349 -466 309 353 -467 310 352 -468 311 352 -469 312 351 -470 312 351 -471 313 350 -472 315 351 -473 315 350 -474 316 349 -475 316 348 -476 317 347 -477 317 346 -478 317 345 -479 318 344 -480 318 343 -481 319 342 -482 319 341 -483 319 340 -484 320 339 -485 320 
339 -486 320 338 -487 321 337 -488 321 336 -489 321 335 -490 322 334 -491 322 333 -492 323 332 -493 323 331 -494 323 330 -495 324 329 -496 324 328 -497 325 327 -498 325 326 -499 325 325 -500 326 324 -501 326 323 -502 326 323 -503 327 322 -504 327 321 -505 328 320 -506 328 319 -507 329 318 -508 329 317 -509 329 316 -510 330 316 -511 330 315 -512 331 314 -513 331 313 -514 332 312 -515 332 312 -516 333 311 -517 333 310 -518 333 309 -519 334 308 -520 334 307 -521 337 309 -522 338 308 -523 339 308 -524 339 307 -525 340 306 -526 341 306 -527 342 305 -528 342 305 -529 343 304 -530 346 305 -531 346 304 -532 347 304 -533 347 303 -534 348 302 -535 348 301 -536 349 301 -537 349 300 -538 350 299 -539 350 298 -540 350 297 -541 351 296 -542 351 296 -543 352 295 -544 352 294 -545 352 293 -546 353 292 -547 353 291 -548 354 290 -549 354 289 -550 354 289 -551 355 288 -552 355 287 -553 355 286 -554 356 285 -555 357 285 -556 357 284 -557 357 283 -558 358 282 -559 358 281 -560 358 280 -561 359 279 -562 359 279 -563 359 278 -564 360 277 -565 360 276 -566 360 275 -567 361 274 -568 361 273 -569 362 273 -570 362 272 -571 362 271 -572 362 270 -573 363 269 -574 363 268 -575 363 267 -576 364 267 -577 364 266 -578 365 265 -579 366 265 -580 366 264 -581 366 263 -582 367 262 -583 367 262 -584 368 261 -585 368 260 -586 369 259 -587 369 258 -588 369 258 -589 370 257 -590 370 256 -591 370 255 -592 371 254 -593 371 253 -594 371 253 -595 372 252 -596 372 251 -597 373 250 -598 373 250 -599 374 249 -600 374 248 -601 374 247 -602 375 247 -603 375 246 -604 376 245 -605 376 244 -606 376 243 -607 377 243 -608 378 242 -609 378 242 -610 379 241 -611 380 240 -612 380 240 -613 381 239 -614 382 239 -615 383 239 -616 385 239 -617 385 238 -618 386 237 -619 387 237 -620 387 236 -621 388 235 -622 389 235 -623 389 234 -624 390 234 -625 390 233 -626 391 232 -627 392 232 -628 393 231 -629 393 231 -630 394 230 -631 395 230 -632 395 229 -633 396 229 -634 397 228 -635 398 227 -636 398 227 -637 399 226 -638 399 225 -639 
400 225 -640 401 224 -641 401 224 -642 402 223 -643 403 222 -644 403 222 -645 404 221 -646 405 221 -647 406 220 -648 407 220 -649 407 219 -650 408 218 -651 409 218 -652 411 218 -653 411 217 -654 412 217 -655 413 216 -656 415 216 -657 416 216 -658 419 216 -659 419 216 -660 420 215 -661 420 214 -662 421 214 -663 421 213 -664 422 212 -665 422 212 -666 423 211 -667 424 210 -668 424 210 -669 425 209 -670 425 208 -671 426 208 -672 426 207 -673 427 206 -674 427 206 -675 428 205 -676 428 204 -677 429 203 -678 430 203 -679 430 202 -680 431 201 -681 431 201 -682 432 200 -683 432 199 -684 433 199 -685 433 198 -686 434 197 -687 434 197 -688 435 196 -689 435 195 -690 436 195 -691 437 194 -692 437 193 -693 438 193 -694 439 192 -695 439 192 -696 440 191 -697 440 190 -698 441 190 -699 443 190 -700 444 189 -701 445 189 -702 446 188 -703 447 188 -704 448 187 -705 448 186 -706 449 186 -707 449 185 -708 450 184 -709 450 184 -710 450 183 -711 451 182 -712 451 181 -713 452 181 -714 452 180 -715 452 179 -716 453 178 -717 453 178 -718 453 177 -719 454 176 -720 454 175 -721 454 175 -722 455 174 -723 455 173 -724 456 173 -725 457 172 -726 457 171 -727 458 171 -728 458 170 -729 458 169 -730 459 169 -731 460 168 -732 460 167 -733 461 167 -734 461 166 -735 461 165 -736 462 164 -737 462 164 -738 462 163 -739 463 162 -740 463 162 -741 463 161 -742 464 160 -743 465 160 -744 465 159 -745 465 158 -746 466 157 -747 466 157 -748 467 156 -749 467 155 -750 467 155 -751 468 154 -752 468 153 -753 468 152 -754 469 152 -755 469 151 -756 469 150 -757 470 150 -758 470 149 -759 471 148 -760 471 148 -761 473 148 -762 475 147 -763 475 146 -764 475 146 -765 476 145 -766 476 144 -767 476 144 -768 480 144 -769 480 143 -770 481 142 -771 481 142 -772 482 141 -773 482 140 -774 483 140 -775 483 139 -776 484 139 -777 484 138 -778 485 137 -779 486 137 -780 486 136 -781 487 135 -782 488 135 -783 488 134 -784 491 134 -785 492 134 -786 493 133 -787 493 132 -788 494 132 -789 494 131 -790 496 131 -791 497 130 -792 497 129 
-793 498 129 -794 498 128 -795 499 127 -796 499 127 -797 500 126 -798 501 126 -799 501 125 -800 502 124 -801 503 124 -802 504 123 -803 505 123 -804 506 122 -805 508 122 -806 509 121 -807 510 121 -808 511 120 -809 511 120 -810 513 119 -811 514 119 -812 515 118 -813 516 117 -814 517 117 -815 518 116 -816 519 116 -817 519 115 -818 520 115 -819 521 114 -820 521 113 -821 522 113 -822 523 112 -823 523 111 -824 524 111 -825 524 110 -826 525 109 -827 526 109 -828 527 108 -829 528 108 -830 528 107 -831 529 106 -832 530 106 -833 530 105 -834 531 104 -835 531 104 -836 532 103 -837 532 103 -838 533 102 -839 533 101 -840 534 101 -841 535 100 -842 535 99 -843 536 99 -844 536 98 -845 537 97 -846 538 97 -847 538 96 -848 538 95 -849 539 95 -850 540 94 -851 540 93 -852 541 93 -853 541 92 -854 542 91 -855 545 91 -856 545 91 -857 546 90 -858 546 89 -859 547 89 -860 548 88 -861 549 87 -862 552 87 -863 553 87 -864 554 86 -865 555 85 -866 556 85 -867 557 84 -868 558 84 -869 559 83 -870 559 82 -871 560 82 -872 561 81 -873 562 81 -874 563 80 -875 564 79 -876 564 79 -877 565 78 -878 566 77 -879 567 77 -880 567 76 -881 568 76 -882 569 75 -883 569 74 -884 570 74 -885 570 73 -886 571 72 -887 571 72 -888 573 71 -889 574 70 -890 574 70 -891 575 69 -892 575 68 -893 576 68 -894 577 67 -895 577 67 -896 578 66 -897 579 65 -898 579 65 -899 580 64 -900 580 63 -901 581 63 -902 581 62 -903 582 61 -904 583 61 -905 583 60 -906 583 59 -907 584 59 -908 585 58 -909 585 57 -910 586 57 -911 586 56 -912 587 55 -913 587 55 -914 588 54 -915 588 54 -916 589 53 -917 590 52 -918 591 52 -919 591 51 -920 592 50 -921 593 50 -922 593 49 -923 594 48 -924 594 48 -925 595 47 -926 596 47 -927 597 46 -928 599 45 -929 600 45 -930 601 44 -931 602 43 -932 603 43 -933 603 42 -934 604 42 -935 605 41 -936 605 40 -937 606 40 -938 607 39 -939 607 38 -940 608 38 -941 608 37 -942 609 36 -943 611 36 -944 612 35 -945 612 34 -946 613 34 -947 613 33 -948 614 33 -949 614 32 -950 615 31 -951 615 31 -952 616 30 -953 616 29 -954 617 29 -955 
617 28 -956 618 27 -957 618 27 -958 619 26 -959 620 25 -960 620 25 -961 621 24 -962 621 23 -963 622 23 -964 622 22 -965 623 21 -966 624 21 -967 625 20 -968 625 20 -969 626 19 -970 629 18 -971 630 18 -972 631 17 -973 632 16 -974 632 16 -975 633 15 -976 635 14 -977 636 14 -978 637 13 -979 637 13 -980 638 12 -981 639 11 -982 640 11 -983 643 10 -984 644 9 -985 645 9 -986 646 8 -987 646 7 -988 647 7 -989 651 6 -990 652 5 -991 652 5 -992 653 4 -993 653 3 -994 655 3 -995 656 2 -996 657 1 -997 657 1 -998 657 0 -999 658 0 diff --git a/mariadb_kernel.egg-info/PKG-INFO b/mariadb_kernel.egg-info/PKG-INFO deleted file mode 100644 index b066d1e..0000000 --- a/mariadb_kernel.egg-info/PKG-INFO +++ /dev/null @@ -1,76 +0,0 @@ -Metadata-Version: 2.4 -Name: mariadb_kernel -Version: 0.1.dev254+dirty -Summary: A simple MariaDB Jupyter kernel -Home-page: https://github.com/MariaDB/mariadb_kernel -Author: MariaDB Foundation -Author-email: foundation@mariadb.org -Classifier: License :: OSI Approved :: BSD License -Classifier: Programming Language :: Python :: 3 -Requires-Python: >=3.5 -Description-Content-Type: text/markdown -License-File: LICENSE -Requires-Dist: pandas -Requires-Dist: json2html -Requires-Dist: matplotlib -Requires-Dist: lxml -Requires-Dist: setuptools -Requires-Dist: setuptools-scm -Requires-Dist: ipykernel -Requires-Dist: beautifulsoup4 -Requires-Dist: mycli -Dynamic: author -Dynamic: author-email -Dynamic: classifier -Dynamic: description -Dynamic: description-content-type -Dynamic: home-page -Dynamic: license-file -Dynamic: requires-dist -Dynamic: requires-python -Dynamic: summary - -# MariaDB Jupyter Kernel - -[![badge](https://img.shields.io/badge/Try%20MariaDB-@%20binder-579ACA.svg?logo=)](https://mybinder.org/v2/gh/MariaDB/mariadb_kernel.git/master?urlpath=lab/tree/binder/try_it_out.ipynb) -![GitHub](https://github.com/MariaDB/mariadb_kernel/workflows/CI/badge.svg) -![badge](https://img.shields.io/badge/version-v0.2.0-yellow) - - - -`mariadb_kernel` is a an Open 
Source kernel for Jupyter which enables users to run MariaDB in a Jupyter notebook. - -# Quick Installation Steps - -`mariadb_kernel` has been packaged for the PyPI package manager, -packaging for `conda-forge` is coming soon. - -Assuming you already have Jupyter Lab and MariaDB installed on your system, -all you need to do is: - -1. Install the kernel -```bash -python3 -m pip install mariadb_kernel -``` -2. Install the kernelspec so that the kernel becomes visible to JupyterLab -```bash -python3 -m mariadb_kernel.install -``` - -For a more complete guide on how to install `mariadb_kernel`, check out our -[Installation docs](https://mariadb.com/kb/en/mariadb-jupyter-kernel-installation/) - -# Using the kernel -Using `mariadb_kernel` is pretty simple, please check our [Using the MariaDB Jupyter Kernel](https://mariadb.com/kb/en/using-the-mariadb-jupyter-kernel/) docs for -some quick tips on how to get started and some links to our example notebooks. - -# Documentation -To get started with `mariadb_kernel`, see the full documentation -https://mariadb.com/kb/en/the-mariadb-jupyter-kernel/ - -# Contributing -Please check the [CONTRIBUTING.md](https://github.com/MariaDB/mariadb_kernel/blob/master/CONTRIBUTING.md) file to -see our guidelines for contributing to `mariadb_kernel`, how to set up a development environment and -how to add a new magic command. - -Please note this project is still in its very early stages and we expect it to change frequently. 
diff --git a/mariadb_kernel.egg-info/SOURCES.txt b/mariadb_kernel.egg-info/SOURCES.txt deleted file mode 100644 index f91c288..0000000 --- a/mariadb_kernel.egg-info/SOURCES.txt +++ /dev/null @@ -1,96 +0,0 @@ -.git_archival.txt -.gitattributes -.gitignore -.pre-commit-config.yaml -.pylintrc -CONTRIBUTING.md -Dockerfile -LICENSE -README.md -Untitled.ipynb -dev-requirements.txt -last_query.csv -requirements.txt -sample_sales_export.csv -setup.py -test.py -.github/workflows/pre-commit.yml -.github/workflows/pylint.yml -.github/workflows/tests.yml -binder/2016_gbp_usd.csv -binder/Dockerfile -binder/apt.txt -binder/mariadb_config.json -binder/postBuild -binder/requirements.txt -binder/try_it_out.ipynb -catboost_info/catboost_training.json -catboost_info/learn_error.tsv -catboost_info/time_left.tsv -catboost_info/learn/events.out.tfevents -mariadb_kernel/__init__.py -mariadb_kernel/__main__.py -mariadb_kernel/_version.py -mariadb_kernel/client_config.py -mariadb_kernel/code_parser.py -mariadb_kernel/install.py -mariadb_kernel/kernel.py -mariadb_kernel/mariadb_client.py -mariadb_kernel/mariadb_server.py -mariadb_kernel.egg-info/PKG-INFO -mariadb_kernel.egg-info/SOURCES.txt -mariadb_kernel.egg-info/dependency_links.txt -mariadb_kernel.egg-info/requires.txt -mariadb_kernel.egg-info/top_level.txt -mariadb_kernel/code_completion/__init__.py -mariadb_kernel/code_completion/autocompleter.py -mariadb_kernel/code_completion/completion_engine.py -mariadb_kernel/code_completion/introspector.py -mariadb_kernel/code_completion/sql_analyze.py -mariadb_kernel/code_completion/sql_fetch.py -mariadb_kernel/maria_magics/__init__.py -mariadb_kernel/maria_magics/bar.py -mariadb_kernel/maria_magics/cell_magic.py -mariadb_kernel/maria_magics/delimiter.py -mariadb_kernel/maria_magics/df.py -mariadb_kernel/maria_magics/help.py -mariadb_kernel/maria_magics/line.py -mariadb_kernel/maria_magics/line_magic.py -mariadb_kernel/maria_magics/load.py -mariadb_kernel/maria_magics/lsmagic.py 
-mariadb_kernel/maria_magics/magic_factory.py -mariadb_kernel/maria_magics/maria_magic.py -mariadb_kernel/maria_magics/pie.py -mariadb_kernel/maria_magics/supported_magics.py -mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py -mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py -mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py -mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py -mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py -mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py -mariadb_kernel/maria_magics/ml_commands/data_cleaning/stats.py -mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py -mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py -mariadb_kernel/maria_magics/ml_commands/data_preprocessing/splitdata.py -mariadb_kernel/maria_magics/ml_commands/data_preprocessing/standardize.py -mariadb_kernel/maria_magics/ml_commands/model_training/evaluate_model.py -mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py -mariadb_kernel/maria_magics/ml_commands/model_training/train_model.py -mariadb_kernel/tests/__init__.py -mariadb_kernel/tests/conftest.py -mariadb_kernel/tests/test_autocompleter.py -mariadb_kernel/tests/test_clientconfig.py -mariadb_kernel/tests/test_codeparser.py -mariadb_kernel/tests/test_introspector.py -mariadb_kernel/tests/test_magic_linemagic.py -mariadb_kernel/tests/test_magicfactory.py -mariadb_kernel/tests/test_magics.py -mariadb_kernel/tests/test_mariadbclient.py -mariadb_kernel/tests/test_mariadbkernel.py -mariadb_kernel/tests/test_mariadbserver.py -mariadb_kernel/tests/test_sql_fetch.py -mariadb_kernel/tests/docker/Dockerfile -notebooks/FOSSASIA Summit 2021 - NEW (MariaDB) SQL.ipynb -notebooks/covid_datasets_charts.ipynb -notebooks/fosdem_tryout.ipynb -static/lab_open.png \ No newline at end of file diff --git a/mariadb_kernel.egg-info/dependency_links.txt 
b/mariadb_kernel.egg-info/dependency_links.txt deleted file mode 100644 index 8b13789..0000000 --- a/mariadb_kernel.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/mariadb_kernel.egg-info/requires.txt b/mariadb_kernel.egg-info/requires.txt deleted file mode 100644 index f95ca0d..0000000 --- a/mariadb_kernel.egg-info/requires.txt +++ /dev/null @@ -1,9 +0,0 @@ -pandas -json2html -matplotlib -lxml -setuptools -setuptools-scm -ipykernel -beautifulsoup4 -mycli diff --git a/mariadb_kernel.egg-info/top_level.txt b/mariadb_kernel.egg-info/top_level.txt deleted file mode 100644 index 5944a0c..0000000 --- a/mariadb_kernel.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -mariadb_kernel diff --git a/models/test_model.joblib b/models/test_model.joblib deleted file mode 100644 index 68a9ce5ba7ea4c901bd1a2e4d0cabbea894c107f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 146537 zcmeHw3!Gh5b@wHNS0E&T5D-zuX__W!gz!cgxQ9WE7>XkGfg|_lLW}y@Z|}4Af3LIFxqD`xdu}M+ zkNM@7b@yZK=h^?Y_S&!IBd?UDP+ZcMlV7{=!*3i-?^{^A`8i`cCT|w{U)U*V0)F7cZE%v@4C;H*-Yq*sQOq(m8!Icj`UB zdVX@={BvtPUFR&BHK)g9)0GC!p!!!Ty?fcT&zU!?XPK$CNq%qZwsE?;7tKw5_F9rQ zMAy=uSv~VijQwn?^A|2&W=ej6Db}1N^JXob*Oy5$t-j0HFzhjB&f>0F3l`UAWg?AU zxTt$k&!X=6bGqi3=02@=Y?o`pu20NcVwxyhXE3wQKvG*1Ma@clIJvlbMy?QJ`}U4p`fbIh07*E{Ys>pXGEl35pg&{SY?_5<8~ z>9W~rQ=67bx%zr{>z+Hy#x-AQ?=DMgvlg4$jJU-7UbJMc{h)R_$^4$%bAj=%y%oPq z+!60Nw(s)KUe@=%zR5Fp?%nCH`%XLUw0Hlv`Ah$sS?%eY-8*X5f(45{*|l(?X~)ZZ z-$><5b2FVg|J?3{^SXPwW?#@_KAM?hm(5_(JX57ZuIxey_wM$CAngt_q(*i5an?8Upx#m4U5l3WEMC^Lw9AgI-Q5`P>Z?s5 zK&VZt9a@_{bEGNW;k6@bN7mj}>$tpjylac^A*ZIch0Uyyw%GIJZUZ*+4DfWHYrdLy z&hMVpJ;$`phfpB+dwGptT9V=yse@Wfz09NjzO9PxQ~ zsL#7%zJ*%HXQ{8o+Rr}w=NiC~?N#?9A-a2V<&T_C?d0%>zU$IC_PaBmc+C`Nex^9J z_sOwl3v<%$BVC$(mzpuzJ30%}>F2kL6*LAl_K||e`WmrerH#qTw2b^VcknrE)M1`2SlO1|S-F!wz_wM|GqmN{8slENQ z$-Sc%%sZ#2Yff$cg1LRYdoD=@lymLopwYl~PtAy=+9DfgcgUN*UNchD5&(>Q_a_NA zZ?5?wWyn4Y=NqZreQwvei@JIi8L4(oPhU@3k&nmp!pzF0eYGXEi)QX_=u$(+8roAk 
zedYm%E;F>v&`%iu$%cN?xThO>zD;@AVWPqUOKtzoJoeJQWi!oJJ9BLI)7vHe>|Co? zMz~J^ZtGnBng2+(FBx_AX?Jd`j&b2cIubau->&`r2j}6!S0J9#ft<4~(f;@j;7_z8 zd3o|#Us(Clmg-(2;K&5d*!*Ss2YLQoTtd4#nyr8Gk1r*C-Cbw?bmr65-H~BRrzUX5 z<}d3%zA!(^N%?a7T{+#Aq<7a(%v!L_AgLZ4rr$?i1}Sm6ABnqC$-g}D6!Y&9hco?E zxNljw;YSNMtOEX^x{i8Ws$BgK_HTXlob+$4ir=r?%!dTWA9vHf$KAA9#j8}-U;a<) z%)gvJ_=qAsP#J1nS6<@$md&mmSbL-Szo1Ci<;5f3Tv8-(qwn>*EU3G?mjIGtNBA_f@j7^Xn#?dI}f$~^qa^=g(l7sH~_(Atwt z5A7kxlZ%`r$qC1e?4I`NP1W55|D*&?v;$}e?_~RM`GGS?ALX*?TfCiq{lMV(+n=n; z@-bc^U(itg*>o!v|D9EUV|$=NS(ZJ&ex`hj<0aae z_!+OSoZ@Oy>(!rq04PVJs44BXu=t~=s=!B0(N09U-QoZK^lxL8=r^k0xxxP3=PBa7 zbN`R_tUmdcYD)Hh%2R|(n$L%De5J$lm9D&719+O}r}EwH<5M}G^KQyN&nL^JU5F2= z!<&HDlYevQ;#S4$kp{2#$l>)KMesU{3l?vdTJ`S!!P0mw@rv;h<5=Pe`n@Ch&0lGr+B=DJR$#SUS0r4>ftrZo7Gi` zr;~mDvE`TYqEeV2<)iXXEX*(4kMj@X1wt@@mZ^ z_HR>_DD>1y<<05e9g07XE0q6MkAuho_y}Cnc!zkB=F_L}kiX>4yS7(L>!*i5c~V)E z#cNhSwRqQ{ev0v&{jTkJz5N=ip>D12Cpx3Uk9UkiIi6hPq%--$otK<4di9p--h%%y z&p#yIE$>KD_4MOUs`+Glg5+ZH0{IYM9`Ar#NH?WMza{s8(k~D2aOEA2F|A>GATa zzqfyDs~=APo~RO@KIFp*4+ou|`k41ePY@jr_XZpvsN`l>D4mc$bJ8!Q530lA4yC94 zwHy3R&`&?~(cjH&W&N~4ysoPsE~5xKVo_z`QnlP|-` z8IF%%l7{P7TEBk#`ENR-Rq@)|f5+tP31edH2^g=UACk`Gv#WmF|Dl7oR>zBgc)Wi6 zYm1K`_4sBrpPrZ?xyHsTe;>dh;4aclsgXAap8u1hs?#%8bFY{--jO9muR6g!M*oiRz z%En7}eI>cJ`@u1v>(ln6!@a!1IwIBwviw%wLSH~TQU1-7x3TrdcS8BIz7WeV`;X^0 zVR`oZ>-~^TJ`>f|fIMIAQG8`_%1BE}+((>|- zcfPc&RpsTbjmt~m4!5-VwPHq+nxmF%t?nxV^7be>o)Nhtd4By{7SDOZ7PUX)=marh z@j~J?%PYuB;4acl>57w+2?G;fxUFlWnx7rv@fLCx`a%|OtvznUxo3TD5^4s~*h2y_FW8;O7s`b+GzWlN6FZ-AA zI+j0Mu14lJ=x1ub;`)XxKl4M*LS9q)?*HW7!S!a;C$|1F|KjV#$d5|K_*hn65`J$` z>ks{YJ;W^XnkC+iZvMpcD9o!wdu7(~;eYl?7mOY}YD;zU_r7}jrN6qt(VsYaU-eqM zi{|pDsct;m*r=94>=2cC0l+guhVu~ zu|>)2>AwDumv!|8vwriTukY3I)YZEE+r7MmoP|D+&ELw~b(dKK!^2A69_0CB`%kt% z%d1%aY`Gedx00XrjoAG0eWm?TVSOhouSQ4B@33AEdm!AHhfsd~ZhrMcZ@(k|(El1w zf9~cn{ek%m($Rdw`6K*Em+to+UEwJ2C&l~{^D)f(kOJf(+*JSP3zj!jexwSj<7qeI zHDUef(!tZGx2n9^0sC2hXRMhAKHu#9@2sB!hk$q7)8?0%AD(~J2mkTmKiR7GzaQiA 
z4)O-LnZ*^0*GJ!W?DVr98dUbw{QL~KE%Hw~lMkc=V`ETWLQX?&NdCmnkEFce`p4p3 zwqBLWt1mCnPgFm~+t~7>pBm+t`E&ilcptZ4fXdUa)_t(gs$bz<9+!^&>9c>a=)yJC zxo_F?%zHb(>FC-j{L|@4-{We2a$gnx_+AKqAx|E2`(b(hf%yf|qVo#mk}p3(>QlrA ztsg7z%kfLw)6N|6PY1O!UN<;Dka&U~=t#DXw>!MH`{w5d_2tD$$*pg` zWka$-+C3{)^2Y{Y>ex zyo}{Xe#Fn?J(gd#?<7|*V=tIODOMUh?}l}K*h4~ikNv|$pU$1v(Dibq$`PCg9{T(* zw@)sC)B|#0i^uo+@j~egXP*l3>0A&GREIagp7WWT4nMV3@p@;oe$XIZ19wD=C!NVx z?D6EYwSFF6KM?$t4!?dN$J^ut@nXko2rqzRb@AHl&q~eb2d`E7CEL$q^G7~Z?&9_g ziB}b0f3yp5Hyh96?eeFUJ!-n=k1c=e|X}zzTL0n4evLI&0qEp z<83UzEI+qjS-ggRPI~1kXV)^26V4~2$BB0M`)Bn2D8!2$ zZxZi#y$^b8J-o8^^U3>7dio33==J8Ky?%;xfd^TOLFkd486ajto^h2d|Kia+XEWI8a+T6&c{AjG(8?(Z#UH53_T%z5 zR)5B~vixCr6L)@s>euh&LVn$KxO3Uq&q?&b+&)f9zfzI=PI(;_;Y7Jzq`!9Qc-pPw z!<~;;|C}K9y@t*UayrIwyegJz4^mpusgnb41 zNUHH;AZIV3b8f3WUJ-s_-y)xblj&*~=R@UVoCp3(4vsJVLoUPz&BrTk581El;y-C+ zylc>Yjq#Ah3#%Vueoz+gw-%_vz~G>mSySGjjj)JezF{u z3igw0f7s2nzjSYR=cajI3c~B}PeCXs$^{C4rls-v;$Oe7)~a}I*Bcw$k60Hk3|`{? z^x^C|63<+JEk8b){oIn@eR?mwfBTRGJqjF=c#crE$H?&FcqzwwS-fDpg`AOi>dNo$ zt9x46BZ}`2K|4{(#qH0b`VWQI6|*1nP2bq?!4LdW-zPFPK@Y{sE15sH-*EOE$Sbrn z<@@Td7t8WjeEq}nB5pq-XWun!UOhTe~eSey>cGuWO@_dKw@hd4MsQjoM? z;Wzdr%2@2u>1j7&hWwkb-uvVBFShJ?r4sj1L;uN+cZ;`A+UFz|mOrlqJf$9POVHE6 zc{v_kXwsRyFEuFW`>c=j{J=4pK2q;+@$`MtoPVsmK>MI%$U);Z{8Jjdudeug)-r$A zuVc%f?N?K_1my(I*Ok|1KO)6jr`FpidwW`J{%9vkuid5Fs6C43cVYcJ4zH;^u;bu8 zl1HjiPp4S?KjRbZHaP!@I1fHx|6ni2x*aHDY0Bv$1n2(sya3@u^L#QKu28&wh2w}H zi|3(qn}F9>pZa&Fx2iqI-Ura2JqP0zJ(bcG??}??sNa@-zvZ`hKVizw z@402`Hg&$UxPFcFseF0;8n~0q-_8$k-U?LL=a0oJ zw3Ex%^FuF}OlE1jTfLx`TasLf%&zX_zm0pm0kFtkN zEG#c%{_Q2}&-fIU7jgC+YL5YbUmEN>Sbv2c54(kS6TkkS%LhFkb`r$YZj=Mz_1&DV zj8~~>c>*~j+ZMO_HTEM8x857VYwRaP`qYy=KLCz~ z@EZ4z$nnbd=UBW$K9qkvUIXtW|L$I1#pYiY@3Q@p9hVxfsebr>M$&F>)9xF!MxB>x zp8X&5O6d8>Ii==4WO8eH$L$~6ezN`*mu|d0s4TyfcN2a7VR;cZKY;xo_I}6-ZBME{ zFJN3Je!L5b{YBLJ{C^l%<8T})h0+Q6gRfGdb%cAWhyLW}*M05~{r)b;qhF!)P;by! 
zzEHeSK25MktzUQVgjSU|c7M@sx%-fJi@6V3;i2moes8_(eo&b{+arMEkS78c?ENz%sQup@L{C={c`Dbx3JD-j%ztk_;ej3X!+n41{EWd1@NeSv-7VmJc66s}Y z-MZCv{yuHwv*r=|x5?dW`dxRQDc}ELrQ*r^|H9uR0zOGYc!OBjPtfmi4Iz?-(h2!X zZgwF)D2L!zY5ne*8K>^s%6Qk{`9Ro_vg0yqf7}lpkkb5m5%46-Z|7sUKLqzQP%?)l zNG^7~$??keEBM#buk8MUPhD+qAbCdZhvNH3hQzz&?{MW@r`E@dpBF(nsour$Cflwi z4Ogy81#$}W#caG*dQT4Zr1p!!+qm^2;4P)g`^vCSfcz_!52t@mIJ|u*hqs7FG>>2U z`~>dZkbJ;f`1UK@&8Lf)^(n{K``yT0^xXqaPv#f$Z-#z)(DetmDqfec-$DP#j(3aK zq5Z#z2fZnT*Pk-CPu$f$sP2#E^`cn3M)~MFVf_*~2;3FlQPP=QG4H)oPrdz_CZCrm zj<;wZDj(118>|1s1oaH-hjI2h=!FuWcwUTru%1BYgxC3V3ka7qKYsC!B;t`i`Bane zD4!qlDXrh#{o?kPKHoJiXYU^uWABH4hjvb>8E*rF_JNbl>iz}n!>?2{UI>1(Z^)kR z`pnCl)Oc-A5HEJT$nnPWIp78K8>E|3vtOUwdjBZz*9Xo>{lTST-haeQj zyS<#!dal-kDIL8orQOI4RF^}$5f5~@@Nhi5!;Bc@#(T`6^?S~b-cR_hZx5tH{{NBl zM>tWWvmy`=_HD$I+{3$?aKGKzr+sJ1>+^%0{TgyBiwkysKUYEK zKaJIILgNkoA-uSC!XD>Mea8l+UrtR>o!IzNL63K|YJU9c^H86d{b%N%et#p+hZO5m#0SmCDvcL+JhJZ_U&o7Ryuf>@a{pc& zUQ~Yi%fvh`{r*MI>-S2F&j;mrW%)iFUf{lYX)j>;96R2T4}Fov=W~+3xcyS%MSFsg zQC7bIE)ZToZ@{}i_xpJQ%~PJJUNCy_sJ_Sht7oSVG~=8NN&}i7zk2_TJl7NELr8s! 
zd_eQDO5?>Vdk>u3s(4|~zc+DxdBqw&-vT=c^++6EWc3Sa-z>gfA@PF8E5;%8_3V6V z`HJ88^Ng?$Qu+BjBji5tMDS3d2FBUJe z?PBVXxEe;EM;rl$l=6HsHhL&qBDQdHlk8 zMa09swQv*F`kHp{xb7iCCfZ5e|xYpzvo}T0i~V8yg~Th)KbAjRF0<>=*B>|p{T%H=bcSEwm+e~|?`8QJZ%4Td<$8$oH0Q6;f0Cce z-&p+_9~$H5^2hSa@0;x*VS7T{`D<#a0l&W;xDV=DQoTR+h+9AA^FMMOnD65TKT&Qs z=?@q5VNm#M8cHYR&-_?_*6Y$7Bl|0#9@X7S0U;lpP z)RsM8R6@T7-ij7qF(Vn39x&QcmHS2V_7LE978k94{(*=0zW9yz4^*XoeRP6+w7kjk zTm72iy>{P6D2rFXF~|*Ff8e#07scfr+L_vu=LfOn&z8%i;rhjl$0?LzrPf=UGUejF zi~1(lrqvFuO`jR3UlTX*_BMZSD&g(TRlH+E`#kZN2&nsUc5nv zhmGPsQfVZ4pA|6;vN+BKlRUgz~US|=N*V%<#d`=jrX z+ZEp@DCv_wcJInPvT3B42Ve!&fXNkjQ)y;!?tDtSFLly2jErR^aTZvOn-R>kWk zpD(JI`+0D_sL}mEcppIOr$;9E0%Gw3xJ-O(zl1;ZW2Bo>gSV}@AE+%sJ;Qhvw||iI zK;Y9#Z+8WbAs^re?$ej|=OY$qe%$}j?g#qV>@9!Vs(4XCzZ-$B7PKRI{-ik<)o$EY z)p&tuvEw}?-{B8CTz1}M^~?8u=Z*GkT5Zw~5*&^w@~iSir!>udde*0TK<%P+sL;`3>=GxbkiOuXh#4n=Fo-)DzURF}!1b$eED5+WzI;c7EvA zwMt%1Eu6pXpEl1AIg9nKY(1_081f43O!emTw6XQacS8K!f3f@{h!OuM_z}x5zYnf| zSig>2?}B~}`5-CwiJ`RpUY?K-<2ms)e&4W6kJClYl7{d(r6&&h*TdM72cKLYXd z=}CJ<{vC7T%<~~dNf8fJhFVu?d&1E_KC7*j&@UhPNlk@W}h6DklctUqb$|cqQtUc{d`KAN8bs+moXC z%l<8n*XSS0caqOPjJI*?UAW%~`ZK7uXHdR*T%h%KzV}4#%fnM z@E=$8yGopb{gp3=42LTePy1^(_?v)tlRo+WceN_sm9U@6@qqU3@aL!X`6$GQ9k0MO z;>+V5^cR5(2Jf!ee60Dmsk%2ZOlcnPpuc7Lv-%yjo43c!>+|`b1d<{is0_8P()!UI z?>X`OR>g~5&3^kax%-*M#N5v$#{=5E!>_mK{q~3vJ6d`eWZQL z@8;|E`Mmv9>OYR(+^_U6Gv@r|Tc&JPc*E=Sz!~647H{}|rSkW0qMlT;;&`F@Cvi3O z_iy5!A^4}%+%I&(aU;8@eR`9ckMsLKn8#yYovlBw&u>)sJ-7M%%l1F!@mP<`=5Oui z*bgP!ueg4fO*gCG#FjtQ{;~XJ%P;v^UWN73F;Vv;(f88t_rKD-eyy|bu2OiP$7R@y z$$tDzXE)y9?Z@e{+rG!$zJeS4Ib58}MgBzbf6;k*zFbHN_Xg5^`?vV|z+dLS!iSUp z7Uf2&pxgaESnP{MNPQYAXUHF^3>S`gpd7!ny!`z1|JuFej2+!p)-N~R_doy6MR{*`AfVij@OXOly7nU9{of7EU&_N8K+;v zeu{hKD1F?2%Jx&T8{SpL{sf8_vUBoyrFfMJ?3QrXw_oP@5WhY}e9(NX(s=RBTWT$P zU$Whw&}97@>s|W(0rX5`O&7lrVCec*^3kB)!&@d^FfqV|vS zc#AE+)Xyp|-AV=H82WuSo;@E{Uq39qUw(wE5)sy~%9da2*CHFEpX2=}**|O#h|M4U zOzAV;LchlRO0=JmSLfYfPi$;Z?+dW~6W>MM`9%wzFO%)Z-)~r6#Oc@2ry)1wdfV{& 
zHS|>IujGDEor~yqdMc=vGhw{P_Xz!!Xy{xQVu#Yp`%}Hyok=kM{?ETs`vT1iz$+2P#9YtF*j4^_DBnYgPTaggpwlBU;?pgC3Z2{@ZprbX!%w z4+>lZj?3|kR63Gdw_bMgeJeIAdl29M5R13S$EBPghd6n;N$K~~J%4QevVLq&gS-Up zqg`zI?EK`Sf4%liBaYb`v;EeIvH~q4pb6UM8Kco$~u* zP|mXX$MPdT+pgv>N&h$h#q#4jAwKRu$XUp1q?=MBZ|m=G;qyMR`OEL8Z2z&o)LePF z(x0=&`UUROAieK_D$(0L73~#V$An!6_8v{ccrWGgJx)F`MgCF_;kqEY1gaqKFKJmxck4XzQK4qq`b86ORV~B|A!8K zLft=He13_3q6)P6_Jf`QeNFct=I4^Xth_1Sf3p9%{lfApZa*dV@nBv@^P81we>`v- zRD28>M;@0c-Qo6GB8fadg@pY19xJ2(xe7T8e@Qt#%xh&h?swPsE9+RiexgGlKefMf zqa2_z)VfN`n-@E;*!lJOS?>O2IiAq&yk9`?-$#tb^i$T4A*X;hNH?XK{Q~=lfc$;| z;FgGH@bc2yyQl5FeWQARj^E>foC2O}yz95upZtryZ;RL4A*X@cBLAc_nQm_D8h`tf zYJSD*H(K6E{v;vy>*wjEykWeL{r)6B%d6q$XOMUJPN+YM+oSQF6MymYtMTc+3xkJK zD8>B!Eye7an2$m3QL6pQ9+$^uI$pNHoj1dNf&2ZrGySeK-VY-GN8EhtasRF~?17k< zx!P4Ahe07n5l$5H7*vL`D_;)^3;8pr{Dt_SGL&7zg>y`jldqxsXMXtDJuUsdNrUGP z>e`vvrVT*wuSW8ew=?H6F}al`Ew0FFpJbKg&La)S2;WH`&4SiD3zDP^ADHO9|) z2YCs(=Sof_3GY`xJH_UYd??|G3G#{MFI#`~FXh|j^AF>7-1!6O=Y)T2)xF>S3aHQ6Q{{yAZ@nwD?7vh8Fga-)p=B4%l>-oA(az-YW2z(l0yS zD+#3&Z%>kV!{Z(Pb>|0WellsyuGf70t((+-+^If);H0ddD_zo=th>w_gdSG+=@!TP zY`bR1dt-RX`gJTn`k880oPPv5ETufZ4C8IweGHT=&R<;T@Rj%1(tg>^UjN0t3-bL1k#z z#qmqy^(BXXt7Y#;+yVPpKXug^+n0>m8ht-v78k7j8|%f=-f~ESzHV&1^LT~cD)lFq zy1_g8Ret>f`A~YCe{A{7@%2`#_(ohf*y+H&z7qZ z{SNh`d^!IxUd7F4F^>g4kzR6}T5m&b^SD6gTc2>}cD~{CnH5#|gOl{1o7K6f(0W!r zAJ`?49@!^x+wW9HwKCo{xPL!1pNhSY6yqT~t}I@W9=^GX z^(5*^zP|%FBJ~H?-~HsAhwy(?RP`~70Q zOO~I%pVLp%y^Pl(3be?xc&9EUv5#fNN=3^{_@~sYe_;JVem`aJr{w<4@meYgW3CX45@<)oHcqkv@k^5%%P7=;7 zAXL-sk?RLsADu#dH^(n6FQ5DBwSV5q^UDV9X~129_eQ^_^TC6;_t?-+(7Pco;Ga_S z-Z9aQ%APgd+ruC)B_6p{%={AfK|n51$W%(#IAZYz`4B(LYsgudKEKVO_Zjc|H{x1YhD28w)$0)OsxcneyoVE;z_ zeH%!HaRfe+BK`cmmNH%_oshreW*6dvatMBv#=G%1tR3+>-bK$3(&s17AKc;ZbznSc zyzM#o;qjNeN#DmCo*!iSEBn9b-&2!#R{VUX#ACKcAzk2oHvi=-ev{C5tM`AG)epspl4y+6V2m7`b@@zP5!+M?AyRT4)9~&hCDy{Go_~@ zc0L~p(RLo~20y4yN4rDu;NOJxgZqDe=rOP3T@LR;c#|H_J8)Zd0(XJ=6@GtGjCZrY zV&Z@P?3zWLgK9tAVF{9p)lZ;@5g+3Xa0ubq{O$hZE9Sj->Z!Lsle9B^yHGQq#r{{| 
z8HE&IFGV?tpT|4W1#Y2SDfRQ&b!t9aRzKtKI~MOlNhbV z8gCDf`Bw__%l2(6%rDCygST<}e}T8Ke_|f3?RvmhqN}QO&h2TBm&EtHyZ>aR_eVPu z<$jm`aN!=^{s2GXh0+Q6gAeaX{48g8P7SXXJhzWl6TzoS+0dI!9p zX*U0wDfzZ)bMJp={StUR%5`-XZ>+qSyx*j!zhLBrJRf(oxBmg>fxF_1F?fsjGqEm2 z;ll*-5ZJzPsVlE)?5-@xaQKbk7OkE;6_EB-t(C6s?ZbGUq@ zJz9tUrt%S(ES{FK3Oj9LVQpi-UPhf=WSc(w<=!S^K}ib7fC!qyYu@#W8sz3 zjs(ue^h3rw;1F;pi+2{U@A#lSsd-$$`$DvS z{zJEq>~2pfeXUonclu@g`-^&g9_#iy@69kl~fEmv=%JUkK}Gz!8kc?6|ggIn`cgdwqC65^xf@g)cCrM!&^1OBavF#V)56hdl{nyxM4SNdg*i`TS z>Wka$-+;GQlO5%bfW0G++q5rjOHQ6q`ux9K#|z0J#LwqLAw~Tw6}^9&?CPtlG|$lX z@=*RO0`2iaz+a~mnrDRkHGinyO^}xdU9ffER+g8ebMN_$j(N`yc$FQO7BBGL$Uf+z zl;+nPA+JQVq%(P9roGX0i;|aopDE;ENM3#D>w9%Pb@emqd}Djio|7%V#e3-I$cM_u zc-a_U@%$i`U&@Q(cq{o?-e6t@xhLAcG9y9z$@b;;i!HyDw_Jb7S;#%z{;>bc@)y@P zvguYToPTWjB|qa!EWiA|Cnjj;u)K=94;JzYatd}AxRDRtue#dZTmEDfZW+GOhr=Hf z^P9%zHAscq4L*_%7moX5QU0Rqy_n}c>*jYw_s7W6vfdIdcj@(m1v~9Bt(E0XgZ3!k zR(4!kd4u~yv^@&(fFrn^p^b_$H$7|VsQxi(Z=zmk(uUc$;~?9%bI8!i(S{3hhzAH|ChZBn)L6Wcc*%GLoQAwXx+zUM6LPOp?-3TqD^ah^ z{)O1`OT69Pm8;SHFCdspsr z_J4jKhtlJB;>gYC^l<(U?-T&vk+Y=va!`1`!e{K40e6TG;hGK?AK{>yzX^D~?=!#L zrB(5|3HOm^$F6)6KsvYCqyp33@hmydodsW4rmr2HB0fb;d`mATLK zx%F>ZJZHVKzfVdioksDtI9|*0w-wG`^7HK(UtBz@uMF94!Crd z`M0i0^j>EV-{8a1o<#HbrS~twE)PDf4{LoH@g&vxXg9}(T^v4SH|KXl$v?~wK1mTD zREAntX?wyhe>i(;tK#(zcrW{R<~6GWpD({3QI0pp8{j%Jl>(6R6dqBkTX*MceOD3HSO=&rtVK+`~Q%5y*x=M zr2(a%7ne6uUKYpe5I^kR_0~_UJ&MY|Ma^#}JOB9h59L4Pdhf2U`1KF@eU`QVH?RG} z@+xkXE}t&q*QXpG^AN~4 zNin~H-3U~t%IWD4=?=x~SM`B=`20|QnGf#&jeQq9RIVn-%TH}w^}$w^mnG&mz+KUA z=Ds0%-(|DfKObK2&B_%!Kce};GwOWlu?g~l9049>`R#s6iubhA&v}0gA~hDAW$aRD=#GvOtBk6^Ya^wYv2fQSM-=U-@5Cy-4BlWoVG`?ejbZAz-3CG^+WjA#p}w6 zUOxpc5r1+081*H7{$3$xFi#QnH~V8?50ZE}DM7s0`pfp^`61Fp{j%lE;;n3-vgZeO zFI;|nCsgm^{IY!~`uxk{HP#QH&%+)?^bwE8&_l`2|3F|L41X8sUrmt?(UATcP6u|S z`t~H9TPPjiFla1aC|)R^Cg|5+x#@G~wJKikfb&+@q#IneHu-!k;|*{KxRb>jyWacW z@7yur&RP0=EWd}Q@jA{W(-VW{%%VvmDlcho!Iu5{mbhIvHY@q`F!ed_IIpT%lwP$*Yf)* zE-$kEnq7Yh>&J2X$MJoTeterhmjwF;&i#;HdZ&8_{d!058CD*glm2aW`>nsNDDS@} 
zeVNbY*4Lk*&w^6v^Z$qNJLI2FfqW1jG!&2hvF?oea7gx7Y9)k4gt6{u=yn)=vepQvEGwJK@I`gM9_5Fm! z_3Kdnv3QGeQu*2vlrxsUxIG~5{YkCY0_U_n0Jsl|b6}{4r1|j-y%zEC?iSoc^L#Q~ zyAU5#hc{t9e*cAcx9ont5_o}eFUI>%f9d!0?IXIP`2K4-p4$^7hj9o!FPp#BFYkT$ zYlq$NMQwjQ!tbBPIF#wT)D2$z{!45A`d_boO5J}__B}7&PZm4gQBNvaas3YML;Q?4 zVf`o$FC;GX`+aP%Bch)qJi$$r z)0O@q7vh8F9&Z>>fzZ z#{2XuKe_V9EA)Blqk?#$@y70kJ86!_3qG$ATYlgYm8?B+B`p2EM`QVO`<2BD=o9k& zp?){+K~Y;!9-r54yTJT=D7PP$<`a+kcQbI01X8F^Ib9hZ@|Uslt|rV!t{8pb39X73 zJ7E7Q?EPi+3*Z9vDa%**WB;M(b7Oyd+ZCISy<)R|pCW(0OX}+`6|;Z!yJ!6No|i{% zRObbbNRSWolDc@c{CF2n+rM}}PwaR{Jt<$Ve=I-nT=T=O1UnMymC_X_`}JX2Ki02c zHUM`u8%jkUL^6<^^U>I8^?V9rLW$! zRjv2r_n%66fZoNteaG#$hu#ahm({QB`8wE{Wc}I`lrR==W&L=)s4;%VOUQZ1J(Me@ zM&6}#xt7a#6CH}GEcSz_!lXFc@(MbB${4BKykLy#NUac%cgHEX>` z;tk_PEZ)ibwI!%0JFe_{6MctU)%mQveqDFG8~v8}H&u~7^&;P2fpIAD z!1cR%Uz+e@z1j~o+1sxh8}IFj%b@}s!+6ft-{LLwbBUL%AIIjOP1oe@^KXow#|QLs z=*g&8N|Vkcy$C*TKO%WxNJ@FxA0 zY*Bd6_5h4y;BL13R^Fui#tw*Zau?SR(LTgq9IsJN%?~*T+!f_F_b0%9Ez8gLh}ia* z`IoIf+Y`cg8K<98d$2tK_H5{>6t4f*`0%8|#d{S#0#9K-0M+Tit$OI z1H)aPB0tc4tkQUQ&v*LX*{XO~!v2f#oE>i#@9=)$KB6mN*NwM7%JItj3C1DDyU51i z-P`T!Dua4IR(OAf;5YAZH|snpaFx>M@&l)VTiNzwynXH7X9>M_lx#S18T&Krx3d4r z;w`ry?8bHN&y`bN^Yeo+-pAR0VRs3*~=YCmD*{v0E(p10RozCoYIos^)S zz){Ezk$>iV8Olj~V+-43RQvk;WA)=~xl9>6KjbvzMmC<+7e2e{x8{bwC)9mGjQ6qm zmo2}P*TwD6s3*1mRA2uv-pAQPaNj-jdBXjbZXe<*g|B(sB3#?(aF5?#)c4~M-rwWS zl|Am>vwcdX5B(pMYMcK*Uk-{B^5+yF*GSHV)&qx6N9GrfUwVE}-Dzu2tKxMN?z`ch zHNVV$M&Px!hX5Baj_cut)sO%F#BWc3@o%(#&+mf+2Qi+Jt}Tbfn|ssuYM)W(EhZ;6 z-2OF&H@hb(v-uBr4Lr!^Z|_GNcy{gYzyGeEDSHav=Lnp~`~c~u)aAXZO3i+};p7eE zEao?Yzw*w6-0Q07CvumaU$DH4?Z0flWbFpA{IdOuw?EpM>fh$tWvKlnarPYOyU_1J z;ZONhDr65|Qw7!gqe+fz^zsFKk}{4;|4Ie<4Xt;BE0j*iAAEQ}@OF2<8_Oq-VSn{I zIR!hC3}sg+zmPw3(l5jV)!|Jz&;8sPr~G*<%bTXY?|xD;FgX79Yd)p#+wVxwGZ@E^ zH(6Y;@(S-+$??kf%>c)M_eeLT=Kf*erNn#I&*2ZbA^45FBKIFZUS`|1QYns?XeVk9 z<{whtnDqm*cw;^Q{hk>2Ut@UB`aR@iJ$=W{52?I^O5b3-Z!CY7clx|lww>+wi}mVI z`!$w-#nmf*euQyG?ZM>_&yV8HTS3l9y3)^=2fufQa7mFL*5wfjcmJ^U46UnceF7;o;Rh_^+*x{reBD+o;|{Iwabj)2#C{*?MK|0l=BM 
zcxmi8*uRc&s#)><>k@C;6W2l&;Fz3$xKM+47*`?vDi&`-^>2*7xPB<}FV2s4qL!KJ z+b^tN7F#bRe7re$U(~qt?@nh=G4JK1f2%!S(s`Y`s-KwsXXc-|@B6r$_C4;VyL>p6 zJB({&U&nrP-iPkK1V!@c5I@`FwY`40{8a|dna@YYLpZ1mwXV|o-D6+rTinWcJ+5}1 zdf0y4C4IHSYe&?MtSzo}e71J{%(2U6=kBYLc;ec9`SH&FoY(KrGr$q4zaZ4?XS~dA zuy|OlHx{>F%kj?NM=ajR`tf`mc#ZiuN}f_<5BT=U_jdi@Yx=wspH~8oV_q-#6TDxz zIk(=6`T&Q3*YLOL*!!FRYW#O+Y`k!bI?rBQzeGQ|@+FSu-gAO~O3nL;ts*bv_sQQM z>>=0>All#PD|lZX{Y34<^Mk!zgN<=C%g^?l{C--*i~W9O`Pm*4%a8o1n8nAB%%8u% zu)G{MbElajF9Y98Q=i7h1NZi-o_C=85B4Viuy~+_<;3iP{h;rW5^YRQ~I&? zZR|(l_q%|LNRKF&OZtagh!2{NR~qlWxaDKhTNUq0oR1CRjalry>WuCBJ*)hE2EZH9 zV`jYn;m%7=8NE7ezXsl6JZ169;w|jYGX3IsfqbZZj8~1}1>2v2W57FIe#-9wHNRuL zkIg@uZq}X_%a8V@^00m%%a8gI|D=TIQ1nb9}c;q^)-}J(h$Cc{3W;9DDCnM?T?8~2k}64K25OaEIRa(*YPfI&k5lj?i1Gf z9mXH@28lEv%5z#)uBq-*0@yu)`a(=V>yWYaZCCpLZi7dzfj zKH`eUyG`i<({Gg2`k?*lM+i}x0Q=fq!pJ}dFA zJ)v~s@k;U+*Dpiu7oJbX?N6Y3@VhKC_v;1coi zcm>W%`-)55ybtr0^Q%j){+{0NKB;j2SsXU0dw$@c)W2N*W_=&)yBhD19`>Qh`L5$P z>*HI@>p140TAydUj%`2L|6G2^5xE}X@;Cdr=-0Zcd&+o%voKVPkzaKC*f7yOz>(BicmRH5r$02{9*OGj~{rj#xGVct)XU{Y5?VND) zK$Yl<>QA5ji$xc%x!=1<{~h%Alw#eN=#6e(&i#pWLHAD|E_|vg?`^_89=E9ZKDanN z*fXJrBL>PL{rTPx^1pl7@*zE3o(YKhz~k+Y#R!_puRo;{@C=IaNpDS zemUa3tvWv_aULCV13i+`q%&DM>g?0*ynj&Lx5V-uI3n=~sdOZZk015;*B0ydQ6?sc z5sOzLyoDUDFK?5i^RM>!#h?D$t|-|HRxUJJ-A(I_C3_sr`gy-`^~*Z^`nrybQ~`xbx_+ ztB}0A+3yF#IvVV>grC@Vh4)u5ia_%?ihHghUqHb@RIhtsJxs=8m(EAKIi~!Tk0<3C z&KW*Xg`O$-;o@}kJQgvi{f57vkm7TC;0G;jk2+vuTg#rWD{)>1as%y5v?Ia$GG*=8 za=hl}myjDGe=|S4Z_(vnf8qK;bzbL~1o2|!l}w-WA5z{Jyq&W1dv4i!nZCcJ`1~|f zf5=J5`z+qt`?8MR{Nah;`u2d@Pd2q={o6dhmN#fuqVMqc*@oILw*9m9%F4S~e%XF3 zufljAcU}hfQvtUyzrlP*wnrY9zI4g7xo_EXebxMDW;);W^BZ!n9d>>|`2T=A$9<#Q z54+mAc|9^z4my{!#+}dkfuB#{y&cJ?U3gFIS-+21$B^!P{V6=;-voQ$VLy2)Y32Dr zQ|}MyOnzr>6FuBUI|21#~ zxSPcVtKU-m{Cf+C*BD-~ei+M-cB1kW=SMpcKjT$n`7=N88unPUi_ORC=fm~iC@N3| zdcJcyG9$tM68U`;pC8Klv%G~J5%$n*`ECDW{RrO)6^!{CYro?CC;PvWpq*m#m-XlV z3(Kow_lHn<@b1O^4)^sv=mToMAMOud@9)vl`#~`n}?Q+KBiUWWr_Vc_2iY^AA|Mw{m=m^&F{a) 
zc-DCJ^5nC=u=1rX>OHz66C@Xl7m$a<$9M-El;JM*q%*nnzP-=5YQr;C$OQ^1yWZZG zxE!L_+fh#9BScrSegK@2 z`m>8{*4t^lYLmj7@Oukcys~(=>dRfzUb$Ppe|boP;s9rWm+()inIGUjSj407g9@*A zqMpRhc-L6_RT4@kem|isKjRH>6nb*D{j7dE-2I5L<(Kv6{A2mE<;vPq!gv{H4*^a> zufqEa-qIhos?=O7A-7~V$(L3F`FW>WlendG4uS4%3Aa)2Ji~1uyrevQP z?w-9+KAaByYQlQy1;6}I%i2S%e%Byg*T<_9{d%dszZ*RR9G2q|p&iK=ui5&smtN5J z*GUOt#Ew_w(}o2WAo3ZYsyfVALTQ==8yF|=xz1z(%K^q z{PYX&|38yAsr%0l^YQ|64!D~wr^VatU*2u!hi+Y4+Wsxuzqma>w*SN=Z3gpSZ2Qaa zi{*V;yvDjD;U%wEVjVNx9`n&H?p+(~i@8VPIBJu}F|1c22JF>v*QYui?Pe~VJA@CN zKU`I%`~33frLSFg`9H0@{J|={Bd_<1K`HJ88 z^Mwef{P})8q$}~nRo3X&cz;udmz6ig@ecK*lC>qSgwc4HZI`Tm8jF{*{*1R_yo@^! z4O}F=pY4xku z`u%vo0g0z*g&7I`((8rA@hYSrLQX+HMb0TT_rc=5U$irYm)-x(`e7_yX8W}fyp`=c zH9LuzweT!YyOL^zeqQ8 zZX}jp_D^yB7WJcYPj>aHU4`4q* z{Cm5E>pX9#doAzv_gYeU^}PN2F28#Nc8PxHulZcQmq-X#H4O-)67>Ux|gI RWwUDs*50^m_B Date: Tue, 28 Oct 2025 06:43:23 +0000 Subject: [PATCH 21/38] Logging added to data_cleaning --- .../ml_commands/data_cleaning/clipoutliers.py | 323 +++++++++++++--- .../ml_commands/data_cleaning/dropmissing.py | 270 ++++++++++++- .../ml_commands/data_cleaning/dropoutliers.py | 294 ++++++++++++-- .../ml_commands/data_cleaning/fillmissing.py | 267 ++++++++++++- .../ml_commands/data_cleaning/missing.py | 319 +++++++++++++++- .../ml_commands/data_cleaning/outliers.py | 358 ++++++++++++++++-- .../ml_commands/data_cleaning/stats.py | 302 ++++++++++++++- 7 files changed, 1974 insertions(+), 159 deletions(-) diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py index 98e8d4c..e4a3215 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py @@ -1,33 +1,43 @@ # Copyright (c) MariaDB Foundation. # Distributed under the terms of the Modified BSD License. 
- from mariadb_kernel.maria_magics.maria_magic import MariaMagic import shlex from distutils import util import pandas as pd import numpy as np +from collections import namedtuple +import enum +from typing import Callable, List, NamedTuple, Tuple +import pandas +from pandas.core.frame import DataFrame +# note: we don't strictly rely on SqlFetch import path. We'll attempt to use it if available. +try: + from mariadb_kernel.sql_fetch import SqlFetch # optional; used if present +except Exception: + SqlFetch = None +import logging +import math +from datetime import datetime +import re +import os class ClipOutliers(MariaMagic): """ %clipoutliers [columns=col1,col2,...] [method=iqr|zscore] - [k=1.5] [z_thresh=3.0] [inplace=True|False] - + [k=1.5] [z_thresh=3.0] [inplace=True|False] Clamps (clips) extreme values to computed boundary limits. - - method: - iqr -> Tukey IQR method using k (default 1.5) - zscore -> mean ± z_thresh * std (default z_thresh=3.0) - + iqr -> Tukey IQR method using k (default 1.5) + zscore -> mean ± z_thresh * std (default z_thresh=3.0) - columns: comma-separated list of columns to operate on. If omitted, all numeric columns are used. - inplace: if True (default) modifies data["last_select"] in-place. if False stores clipped copy in data["last_select_clipped"]. - Examples: - %clipoutliers -> clip numeric columns using iqr (k=1.5) in-place + %clipoutliers -> clip numeric columns using iqr (k=1.5) in-place %clipoutliers method=zscore z_thresh=2.5 columns=age,salary inplace=False + Additionally, execution metadata is stored into a table `magic_metadata`. """ - def __init__(self, args=""): self.args = args @@ -42,6 +52,7 @@ def help(self): "%clipoutliers [columns=col1,col2,...] [method=iqr|zscore] " "[k=1.5] [z_thresh=3.0] [inplace=True|False]\n" "Clamps extreme numeric values to computed boundaries (in-place by default)." + "Execution metadata is recorded in table `magic_metadata`." 
) def _str_to_obj(self, s): @@ -85,7 +96,6 @@ def _compute_bounds(self, series, method, k=1.5, z_thresh=3.0): s = series.dropna() if s.empty: return None, None - if method == "iqr": q1 = s.quantile(0.25) q3 = s.quantile(0.75) @@ -93,7 +103,6 @@ def _compute_bounds(self, series, method, k=1.5, z_thresh=3.0): lower = q1 - k * iqr upper = q3 + k * iqr return lower, upper - elif method == "zscore": mean = s.mean() std = s.std() @@ -102,27 +111,205 @@ def _compute_bounds(self, series, method, k=1.5, z_thresh=3.0): lower = mean - z_thresh * std upper = mean + z_thresh * std return lower, upper - else: raise ValueError(f"Unknown method {method}") + # ---- New DB / metadata helpers ---- + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + # double single-quotes for SQL escaping + return "'" + val.replace("'", "''") + "'" + + def _get_mariadb_client(self, kernel): + """Return mariadb_client if present on kernel, else None""" + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + """Return a logger on kernel if present, else create a temporary logger""" + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _get_db_name(self, kernel): + """ + Attempt to determine the currently used DB. + Prefer SqlFetch if available; otherwise run SELECT DATABASE(); and try to parse. + Returns empty string if none found. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + # fallthrough to manual approach + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + # Fallback: run SELECT DATABASE(); + if mariadb_client is None: + return "" + try: + # mariadb_client.run_statement may return HTML or "Query OK". Use pandas to parse if HTML. + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror(): + # can't get db name + return "" + if not result: + return "" + # If result is raw HTML table, try to parse with pandas + try: + df_list = pandas.read_html(result) + if df_list and isinstance(df_list, list) and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + if isinstance(val, float) and pandas.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + # if not parseable by pandas, try regex to extract first cell content + m = re.search(r"(.*?)", result, flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)) # strip tags + txt = txt.strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + # If result is plain text (like the DB name) + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _ensure_metadata_table(self, kernel, db_name): + """ + Create magic_metadata table if it doesn't exist. 
+ Columns: id, command_name, arguments, execution_timestamp, + affected_columns, operation_status, message, db_name, user_name + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if mariadb_client is None: + # nothing to do + return + + # Use db-qualified name if db_name is present; otherwise create in current schema + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error(f"Error creating magic_metadata table: {mariadb_client.run_statement('SHOW WARNINGS;')}") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + """ + Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + # Escape values + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + # swallow errors but log + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata: %s", insert_sql) + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + def _get_user_name(self, kernel): + """Try several places to find the current user name; fallback to OS login or empty string.""" + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + # might be kernel.user.identity etc. 
Try simple introspection: + ] + for cand in candidates: + # cand might be an object; try str if not None + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + # if session-like object with 'user' attribute + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + # ---- End DB helpers ---- + def execute(self, kernel, data): - """Execute the %clipoutliers magic.""" + """Execute the %clipoutliers magic with metadata logging.""" df = data.get("last_select") if df is None: kernel._send_message("stderr", "No last_select found in kernel data.") return - if hasattr(df, "empty") and df.empty: kernel._send_message("stderr", "There is no data to process (empty DataFrame).") return - try: args = self.parse_args(self.args) except Exception: kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") return - # parse args columns_arg = args.get("columns", None) if isinstance(columns_arg, str): @@ -131,24 +318,19 @@ def execute(self, kernel, data): columns = list(columns_arg) else: columns = None - method = str(args.get("method", "iqr")).lower() if method not in {"iqr", "zscore"}: kernel._send_message("stderr", f"Unknown method '{method}'. 
Allowed: iqr, zscore.") return - try: k = float(args.get("k", 1.5)) except Exception: k = 1.5 - try: z_thresh = float(args.get("z_thresh", 3.0)) except Exception: z_thresh = 3.0 - inplace = bool(args.get("inplace", True)) - # Determine numeric columns if columns is None: target_columns = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])] @@ -161,45 +343,82 @@ def execute(self, kernel, data): non_numeric = [c for c in columns if c not in target_columns] if non_numeric: kernel._send_message("stdout", f"Warning: non-numeric columns skipped: {', '.join(non_numeric)}") - if not target_columns: kernel._send_message("stderr", "No numeric target columns found to clip outliers.") return - target_df = df if inplace else df.copy(deep=True) + # Prepare metadata context + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + # ensure metadata table exists + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + # log but continue + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + target_df = df if inplace else df.copy(deep=True) messages = [] total_clipped = 0 - - for col in target_columns: + operation_status = "success" + try: + for col in target_columns: + try: + series = target_df[col] + lower, upper = self._compute_bounds(series, method, k=k, z_thresh=z_thresh) + if lower is None and upper is None: + messages.append(f"Column '{col}': insufficient data to compute bounds; skipped.") + continue + # find how many will change + mask = ((series < lower) | (series > upper)) & ~series.isna() + n_changed = int(mask.sum()) + # clip + target_df[col] = series.clip(lower=lower, upper=upper) + total_clipped += n_changed + messages.append(f"Column '{col}': clipped {n_changed} value(s) (bounds: {lower:.4f}, {upper:.4f}).") + except Exception as e: + messages.append(f"Column '{col}': error while clipping: {e}") + # finish up + if inplace: + 
data["last_select"] = target_df + location_msg = "Modified in-place: data['last_select'] updated." + else: + data["last_select_clipped"] = target_df + location_msg = "Result stored in data['last_select_clipped'] (original unchanged)." + kernel._send_message("stdout", f"Clip outliers completed using {method}.\n" + + "\n".join(messages) + + f"\nTotal values clipped: {total_clipped}. {location_msg}") + except Exception as e: + operation_status = "error" + messages.append(f"Fatal error during clipping: {e}") + kernel._send_message("stderr", f"Fatal error during clipping: {e}") + + # Attempt to insert metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = ", ".join(target_columns) + message_str = "\n".join(messages) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status=operation_status, + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception as e: + # metadata failure shouldn't interrupt user, but warn try: - series = target_df[col] - lower, upper = self._compute_bounds(series, method, k=k, z_thresh=z_thresh) - if lower is None and upper is None: - messages.append(f"Column '{col}': insufficient data to compute bounds; skipped.") - continue - - # find how many will change - mask = ((series < lower) | (series > upper)) & ~series.isna() - n_changed = int(mask.sum()) - target_df[col] = series.clip(lower=lower, upper=upper) - total_clipped += n_changed - messages.append(f"Column '{col}': clipped {n_changed} value(s) (bounds: {lower:.4f}, {upper:.4f}).") - except Exception as e: - messages.append(f"Column '{col}': error while clipping: {e}") - - if inplace: - data["last_select"] = target_df - location_msg = "Modified in-place: data['last_select'] updated." 
- else: - data["last_select_clipped"] = target_df - location_msg = "Result stored in data['last_select_clipped'] (original unchanged)." - - kernel._send_message("stdout", f"Clip outliers completed using {method}.\n" - + "\n".join(messages) - + f"\nTotal values clipped: {total_clipped}. {location_msg}") + kernel._send_message("stdout", f"Warning: failed to write metadata: {e}") + except Exception: + pass - # Show output + # Show output (DataFrame) try: self._send_html(kernel, target_df) except Exception: diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py index c4c2596..63d7a5b 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py @@ -5,6 +5,17 @@ import shlex from distutils import util import pandas as pd +from collections import namedtuple +import logging +import math +import os +import re + +# Attempt to import SqlFetch if available (helps to determine current DB reliably) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None class DropMissing(MariaMagic): @@ -15,13 +26,9 @@ class DropMissing(MariaMagic): - If columns are provided, drop rows where any of those columns is missing. - If no columns provided, drop rows that have any missing value (any column). 
- Examples: - %dropmissing - -> drop rows with any missing value (in-place) - %dropmissing columns=age - -> drop rows where 'age' is missing (in-place) - %dropmissing columns=age,salary - -> drop rows where 'age' OR 'salary' is missing (in-place) + This magic also logs execution metadata into a table `magic_metadata` with fields: + id, command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name """ def __init__(self, args=""): @@ -36,7 +43,8 @@ def name(self): def help(self): return ( "%dropmissing [columns=col1,col2,...]\n" - "Drops rows with missing values from data['last_select'] (always IN-PLACE)." + "Drops rows with missing values from data['last_select'] (always IN-PLACE).\n" + "Execution metadata is recorded in table `magic_metadata`." ) def _str_to_obj(self, s): @@ -73,8 +81,179 @@ def _send_html(self, kernel, df): display_content = {"data": {mime: html}, "metadata": {}} kernel.send_response(kernel.iopub_socket, "display_data", display_content) + # ---------- DB / metadata helpers (best-effort) ---------- + def _get_mariadb_client(self, kernel): + """Return mariadb_client if present on kernel, else None""" + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + """Return a logger on kernel if present, else create a temporary logger""" + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + """ + Attempt to determine the currently used DB. + Prefer SqlFetch if available; otherwise run SELECT DATABASE(); and try to parse. + Returns empty string if none found. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but get_db_name() failed; falling back to manual query.") + + # Fallback: run SELECT DATABASE(); + if mariadb_client is None: + return "" + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror(): + return "" + if not result: + return "" + # Try parsing HTML table via pandas + try: + df_list = pd.read_html(result) + if df_list and isinstance(df_list, list) and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + # regex to extract first content + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + """Try several places to find the current user name; fallback to OS login or empty string.""" + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + """ + Create magic_metadata table if it doesn't 
exist. + Columns: id, command_name, arguments, execution_timestamp, + affected_columns, operation_status, message, db_name, user_name + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + # If run_statement sets error flag, log it + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + """ + Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # ---------- End DB helpers ---------- + def execute(self, kernel, data): - """Execute the dropmissing magic (always modifies data['last_select']).""" + """Execute the dropmissing magic (always modifies data['last_select']) and log metadata.""" df = data.get("last_select") if df is None: kernel._send_message("stderr", "No last_select found in kernel data.") @@ -102,12 +281,79 @@ def execute(self, kernel, data): missing_cols = [c for c in columns if c not in df.columns] if missing_cols: kernel._send_message("stderr", f"Column(s) not found: {', '.join(missing_cols)}") + # Log metadata for failure + try: + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(columns) if columns else "", + operation_status="error", + 
message=f"Column(s) not found: {', '.join(missing_cols)}", + db_name=db_name, + user_name=user_name + ) + except Exception: + # swallow + pass return + # metadata context + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + # ensure metadata table exists (best-effort) try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + # perform drop operation + operation_status = "success" + messages = [] + try: + before_count = len(df) df.dropna(axis=0, subset=columns, inplace=True) + after_count = len(df) + dropped = before_count - after_count data["last_select"] = df - kernel._send_message("stdout", "Dropped rows with missing values (in-place). Updated last_select.") - self._send_html(kernel, df) + msg = f"Dropped {dropped} row(s) with missing values (in-place). Updated last_select." + kernel._send_message("stdout", msg) + messages.append(msg) + # show resulting dataframe + try: + self._send_html(kernel, df) + except Exception: + pass except Exception as e: - kernel._send_message("stderr", f"Error while dropping missing values: {e}") + operation_status = "error" + err_msg = f"Error while dropping missing values: {e}" + kernel._send_message("stderr", err_msg) + messages.append(err_msg) + + # Insert metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = ", ".join(columns) if columns else "ALL_COLUMNS" + message_str = "\n".join(messages) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status=operation_status, + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + # swallow metadata insertion errors but do not interrupt user flow + try: + kernel._send_message("stdout", "Warning: failed to write 
metadata (continuing).") + except Exception: + pass diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py index d52b3e6..ea4139a 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py @@ -6,6 +6,16 @@ from distutils import util import pandas as pd import numpy as np +from collections import namedtuple +import logging +import os +import re + +# Optional helper to reliably get current DB name (if available in environment) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None class DropOutliers(MariaMagic): @@ -15,15 +25,9 @@ class DropOutliers(MariaMagic): Removes rows (IN-PLACE) from data['last_select'] where any selected numeric column is detected as an outlier according to the chosen method. - - method: - iqr -> Tukey IQR method using k (default 1.5) - zscore -> absolute z-score above z_thresh (default 3.0) - - Examples: - %dropoutliers - -> use IQR with k=1.5 on all numeric columns and drop rows containing outliers - %dropoutliers columns=age,salary method=zscore z_thresh=2.5 - -> drop rows where age OR salary has |z| > 2.5 + Additionally logs execution metadata into `magic_metadata` table: + id, command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name """ def __init__(self, args=""): @@ -38,7 +42,8 @@ def name(self): def help(self): return ( "%dropoutliers [columns=col1,col2,...] [method=iqr|zscore] [k=1.5] [z_thresh=3.0]\n" - "Removes rows containing outliers from data['last_select'] (in-place)." + "Removes rows containing outliers from data['last_select'] (in-place).\n" + "Execution metadata is recorded in table `magic_metadata`." 
) def _str_to_obj(self, s): @@ -103,8 +108,158 @@ def _detect_outliers_series(self, series, method, k=1.5, z_thresh=3.0): else: raise ValueError(f"Unknown method {method}") + # --- metadata / DB helper methods (best-effort) --- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape value to single-quoted SQL literal (None -> NULL).""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + """ + Attempt to determine current DB. Use SqlFetch if present; otherwise run SELECT DATABASE(); parse result. + Returns empty string if none. + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + return sf.get_db_name() or "" + except Exception: + log.debug("SqlFetch.get_db_name() failed; falling back to manual query.") + + if mariadb_client is None: + return "" + try: + res = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror(): + return "" + if not res: + return "" + # try parsing HTML table via pandas + try: + dfs = pd.read_html(res) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + # fallback: regex extract first td + m = re.search(r"(.*?)", str(res), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(res).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, 
"user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, 
user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # --- end metadata helpers --- + def execute(self, kernel, data): - """Execute the dropoutliers magic (modifies data['last_select'] in-place).""" + """Execute the dropoutliers magic (modifies data['last_select'] in-place) and log metadata.""" df = data.get("last_select") if df is None: kernel._send_message("stderr", "No last_select found in kernel data.") @@ -152,6 +307,23 @@ def execute(self, kernel, data): missing_cols = [c for c in columns if c not in df.columns] if missing_cols: kernel._send_message("stderr", f"Column(s) not found: {', '.join(missing_cols)}") + # log metadata for failure and return + try: + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(columns) if columns else "", + operation_status="error", + message=f"Column(s) not found: {', '.join(missing_cols)}", + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # keep only numeric columns target_columns = [c for c in columns if pd.api.types.is_numeric_dtype(df[c])] @@ -161,30 +333,78 @@ def execute(self, kernel, data): if not target_columns: kernel._send_message("stderr", "No numeric target columns found to detect outliers.") + # log metadata for early exit + try: + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata( + kernel=kernel, + 
command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message="No numeric target columns found to detect outliers.", + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return + # Prepare metadata context + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + # ensure metadata table exists + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + # Detect outliers per column and combine masks combined_mask = None messages = [] - for col in target_columns: - try: - mask = self._detect_outliers_series(df[col], method, k=k, z_thresh=z_thresh) - n_out = int(mask.sum()) - messages.append(f"Column '{col}': detected {n_out} outlier(s) using {method}.") - if combined_mask is None: - combined_mask = mask.astype(bool) - else: - combined_mask = combined_mask | mask.astype(bool) - except Exception as e: - messages.append(f"Column '{col}': error detecting outliers: {e}") + operation_status = "success" + try: + for col in target_columns: + try: + mask = self._detect_outliers_series(df[col], method, k=k, z_thresh=z_thresh) + n_out = int(mask.sum()) + messages.append(f"Column '{col}': detected {n_out} outlier(s) using {method}.") + if combined_mask is None: + combined_mask = mask.astype(bool) + else: + combined_mask = combined_mask | mask.astype(bool) + except Exception as e: + messages.append(f"Column '{col}': error detecting outliers: {e}") + except Exception as e: + operation_status = "error" + messages.append(f"Fatal error while detecting outliers: {e}") + # If no outliers found, log and return (but still record metadata) if combined_mask is None or not combined_mask.any(): - kernel._send_message("stdout", "No outliers detected. 
No rows removed.\n" + "\n".join(messages)) - # still show DataFrame try: + kernel._send_message("stdout", "No outliers detected. No rows removed.\n" + "\n".join(messages)) self._send_html(kernel, df) except Exception: pass + # insert metadata (no rows removed) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=", ".join(target_columns), + operation_status=operation_status, + message="\n".join(messages) or "No outliers detected.", + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Drop rows in-place where any target column is an outlier @@ -193,10 +413,32 @@ def execute(self, kernel, data): df.drop(index=df[combined_mask].index, inplace=True) data["last_select"] = df n_after = len(df) - kernel._send_message("stdout", f"Dropped {n_before - n_after} row(s) containing outliers (in-place).\n" + "\n".join(messages)) + removed = n_before - n_after + kernel._send_message("stdout", f"Dropped {removed} row(s) containing outliers (in-place).\n" + "\n".join(messages)) try: self._send_html(kernel, df) except Exception: pass except Exception as e: - kernel._send_message("stderr", f"Error while removing outlier rows: {e}") + operation_status = "error" + err = f"Error while removing outlier rows: {e}" + kernel._send_message("stderr", err) + messages.append(err) + + # Insert metadata (best-effort) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=", ".join(target_columns), + operation_status=operation_status, + message="\n".join(messages), + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py 
b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py index 2c768a4..ec2b181 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py @@ -5,6 +5,15 @@ import shlex from distutils import util import pandas as pd +import logging +import os +import re + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None class FillMissing(MariaMagic): @@ -20,15 +29,7 @@ class FillMissing(MariaMagic): * median -> uses column median (numeric columns only) * mode -> uses column mode (most frequent value; works for any dtype) * constant-> fills with provided value (value must be supplied via value=...) - Examples: - %fillmissing - -> fills numeric columns with their mean (default strategy=mean) - %fillmissing columns=age,salary strategy=median - -> fills age and salary missing values with column medians (in-place) - %fillmissing columns=name strategy=constant value="unknown" - -> fills name with "unknown" where missing (in-place) - %fillmissing strategy=mode - -> fills every column's missing values with its mode (if exists) + Execution metadata is recorded into table `magic_metadata`. """ def __init__(self, args=""): @@ -44,6 +45,7 @@ def help(self): return ( "%fillmissing [columns=col1,col2,...] [strategy=mean|median|mode|constant] [value=const]\n" "Fills missing values in data['last_select'] (always IN-PLACE)." + "Execution metadata is recorded in table `magic_metadata`." 
) def _str_to_obj(self, s): @@ -83,8 +85,184 @@ def _send_html(self, kernel, df): display_content = {"data": {mime: html}, "metadata": {}} kernel.send_response(kernel.iopub_socket, "display_data", display_content) + # -------------------- metadata / DB helpers (best-effort) -------------------- + def _get_mariadb_client(self, kernel): + """Return mariadb_client if present on kernel, else None""" + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + """Return a logger on kernel if present, else create a temporary logger""" + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + # double single-quotes for SQL escaping + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + """ + Attempt to determine the currently used DB. + Prefer SqlFetch if available; otherwise run SELECT DATABASE(); and try to parse. + Returns empty string if none found. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + # Fallback: run SELECT DATABASE(); + if mariadb_client is None: + return "" + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror(): + return "" + if not result: + return "" + # If result is raw HTML table, try to parse with pandas + try: + df_list = pd.read_html(result) + if df_list and isinstance(df_list, list) and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + # if not parseable by pandas, try regex to extract first cell content + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)) # strip tags + txt = txt.strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + # If result is plain text (like the DB name) + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + """Try several places to find the current user name; fallback to OS login or empty string.""" + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + 
return "" + + def _ensure_metadata_table(self, kernel, db_name): + """ + Create magic_metadata table if it doesn't exist. + Columns: id, command_name, arguments, execution_timestamp, + affected_columns, operation_status, message, db_name, user_name + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if mariadb_client is None: + # nothing to do + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + """ + Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + # Escape values + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + # swallow errors but log + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # -------------------- end metadata helpers -------------------- + def execute(self, kernel, data): - """Execute the fillmissing magic (always modifies data['last_select']).""" + """Execute the fillmissing magic (always modifies data['last_select']) and log metadata.""" df = data.get("last_select") if df is None: kernel._send_message("stderr", "No last_select found in kernel data.") @@ -103,20 +281,36 @@ def execute(self, kernel, data): # parse columns argument columns_arg = args.get("columns", None) if isinstance(columns_arg, str): - columns = [c.strip() for c in columns_arg.split(",") if c.strip()] + target_columns = [c.strip() for c in columns_arg.split(",") if c.strip()] elif isinstance(columns_arg, (list, tuple)): - columns = list(columns_arg) + target_columns = list(columns_arg) else: - columns = None + target_columns = None # determine target columns (None => all columns) - if columns is None: + if 
target_columns is None: target_columns = list(df.columns) else: - target_columns = columns missing_cols = [c for c in target_columns if c not in df.columns] if missing_cols: kernel._send_message("stderr", f"Column(s) not found: {', '.join(missing_cols)}") + # log metadata for failure + try: + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(target_columns), + operation_status="error", + message=f"Column(s) not found: {', '.join(missing_cols)}", + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # parse strategy @@ -139,8 +333,20 @@ def execute(self, kernel, data): kernel._send_message("stderr", "Strategy 'constant' requires a 'value=...' argument.") return + # Prepare metadata context and ensure table exists (best-effort) + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + # perform filling column by column with sensible handling for dtype messages = [] + operation_status = "success" for col in target_columns: try: series = df[col] @@ -174,12 +380,11 @@ def execute(self, kernel, data): elif strategy == "constant": # use the parsed const_value directly fill_val = const_value - # If fill_val is a string that looks like "None", we want to keep it as string; - # do not coerce types implicitly — user controls value type via quotes or unquoted numbers. 
df[col].fillna(fill_val, inplace=True) messages.append(f"Column '{col}': filled missing with constant value={fill_val}.") except Exception as e: + operation_status = "error" messages.append(f"Column '{col}': error while filling missing values: {e}") # update the data store and display results @@ -187,6 +392,32 @@ def execute(self, kernel, data): data["last_select"] = df summary = "\n".join(messages) kernel._send_message("stdout", f"Fill missing completed (in-place). Summary:\n{summary}") - self._send_html(kernel, df) + try: + self._send_html(kernel, df) + except Exception: + pass except Exception as e: + operation_status = "error" kernel._send_message("stderr", f"Error while updating last_select or displaying DataFrame: {e}") + messages.append(f"Error while updating last_select or displaying DataFrame: {e}") + + # Insert metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = ", ".join(target_columns) if target_columns else "" + message_str = "\n".join(messages) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status=operation_status, + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py index 3cf3b50..e982ed9 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py @@ -5,16 +5,29 @@ import pandas as pd import shlex from distutils import util +import logging +import os +import re + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except 
Exception: + SqlFetch = None class Missing(MariaMagic): """ %missing [action=show|percent|summary] [columns=col1,col2] - + Examples: %missing -> shows count+percent of missing for all columns %missing action=percent -> shows percent only %missing action=summary -> shows dtype, missing, percent + + This magic also logs execution metadata into a table `magic_metadata` with fields: + id, command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name """ def __init__(self, args=""): @@ -29,7 +42,8 @@ def name(self): def help(self): return ( "%missing [action=show|percent|summary] [columns=col1,col2]\n" - "Display missing-value information from the last query result." + "Display missing-value information from the last query result.\n" + "Execution metadata is recorded in table `magic_metadata`." ) def _str_to_obj(self, s): @@ -67,17 +81,231 @@ def _send_html(self, kernel, df): display_content = {"data": {mime: html}, "metadata": {}} kernel.send_response(kernel.iopub_socket, "display_data", display_content) + # -------------------- metadata / DB helpers (best-effort) -------------------- + def _get_mariadb_client(self, kernel): + """Return mariadb_client if present on kernel, else None""" + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + """Return a logger on kernel if present, else create a temporary logger""" + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + """ + Attempt to determine the currently used DB. + Prefer SqlFetch if available; otherwise run SELECT DATABASE(); and try to parse. + Returns empty string if none found. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + # Fallback: run SELECT DATABASE(); + if mariadb_client is None: + return "" + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror(): + return "" + if not result: + return "" + # If result is raw HTML table, try to parse with pandas + try: + df_list = pd.read_html(result) + if df_list and isinstance(df_list, list) and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + # if not parseable by pandas, try regex to extract first cell content + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)) # strip tags + txt = txt.strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + # If result is plain text (like the DB name) + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + """Try several places to find the current user name; fallback to OS login or empty string.""" + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + 
return "" + + def _ensure_metadata_table(self, kernel, db_name): + """ + Create magic_metadata table if it doesn't exist. + Columns: id, command_name, arguments, execution_timestamp, + affected_columns, operation_status, message, db_name, user_name + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if mariadb_client is None: + # nothing to do + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + """ + Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + # Escape values + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # -------------------- end metadata helpers -------------------- + def execute(self, kernel, data): """Main execution for %missing magic.""" df = data.get("last_select") + # Prepare metadata context early so we can log failures + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + if df is None or (hasattr(df, "empty") and df.empty): - kernel._send_message("stderr", "No data available to inspect for missing values.") + msg = "No data available to inspect for missing values." 
+ kernel._send_message("stderr", msg) + # log metadata for failure + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name, + ) + except Exception: + pass return try: args = self.parse_args(self.args) except Exception: - kernel._send_message("stderr", "Error parsing arguments.") + msg = "Error parsing arguments." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name, + ) + except Exception: + pass return action = args.get("action", "show") @@ -93,19 +321,76 @@ def execute(self, kernel, data): try: subdf = df[columns] if columns else df except KeyError as e: - kernel._send_message("stderr", f"Column not found: {e}") + msg = f"Column not found: {e}" + kernel._send_message("stderr", msg) + # log metadata for failure + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(columns) if columns else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name, + ) + except Exception: + pass return # Compute missing information - missing_counts = subdf.isnull().sum() - total = len(subdf) - percent = (missing_counts / total * 100).round(2) - - out = pd.DataFrame({"missing": missing_counts, "percent": percent}) - if action == "percent": - out = out[["percent"]] - elif action == "summary": - out["dtype"] = subdf.dtypes.astype(str) - out = out[["dtype", "missing", "percent"]] - - self._send_html(kernel, out) + try: + missing_counts = subdf.isnull().sum() + total = len(subdf) + if total == 0: + 
percent = pd.Series([0] * len(missing_counts), index=missing_counts.index) + else: + percent = (missing_counts / total * 100).round(2) + + out = pd.DataFrame({"missing": missing_counts, "percent": percent}) + if action == "percent": + out = out[["percent"]] + elif action == "summary": + out["dtype"] = subdf.dtypes.astype(str) + out = out[["dtype", "missing", "percent"]] + + # Display results + self._send_html(kernel, out) + + # Prepare metadata success info + affected_columns_str = ", ".join(columns) if columns else "ALL_COLUMNS" + message = f"%missing action={action} examined {len(out)} column(s); total_rows={total}." + operation_status = "success" + + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=affected_columns_str, + operation_status=operation_status, + message=message, + db_name=db_name, + user_name=user_name, + ) + except Exception: + # do not interrupt normal flow if logging fails + pass + + except Exception as e: + msg = f"Error while computing missing information: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=", ".join(columns) if columns else "ALL_COLUMNS", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name, + ) + except Exception: + pass diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py index b10294f..89146c1 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py @@ -10,6 +10,15 @@ import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt +import logging +import os +import re + +# Optional helper to reliably get current DB name (if 
available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None class Outliers(MariaMagic): @@ -19,16 +28,9 @@ class Outliers(MariaMagic): Detects outliers (NON IN-PLACE) and stores a copy of the DataFrame with boolean indicator columns in data['last_select_outliers']. - - method: - iqr -> Tukey IQR method using k (default 1.5) - zscore-> absolute z-score above z_thresh (default 3.0) - - columns: comma-separated columns to test. If omitted, all numeric columns are used. - - plot: True/False (default False). When True, displays a figure containing: - * top: boxplot of selected numeric columns with detected outliers overlaid - * bottom: scatter plot (index vs value) for each selected column; outliers highlighted - Examples: - %outliers - %outliers columns=age,salary method=zscore z_thresh=2.5 plot=True + Additionally logs execution metadata into `magic_metadata` table: + id, command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name """ def __init__(self, args=""): @@ -44,6 +46,7 @@ def help(self): return ( "%outliers [columns=col1,col2,...] [method=iqr|zscore] [k=1.5] [z_thresh=3.0] [plot=True|False]\n" "Detects outliers in data['last_select'] (non in-place). Results placed in data['last_select_outliers']." + "Execution metadata is recorded in table `magic_metadata`." ) def _str_to_obj(self, s): @@ -180,21 +183,248 @@ def _build_plots(self, df_numeric, outlier_masks): return fig + # -------------------- metadata / DB helpers (best-effort) -------------------- + def _get_mariadb_client(self, kernel): + """Return mariadb_client if present on kernel, else None""" + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + """Return a logger on kernel if present, else create a temporary logger""" + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. 
None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + """ + Attempt to determine the currently used DB. + Prefer SqlFetch if available; otherwise run SELECT DATABASE(); and try to parse. + Returns empty string if none found. + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + # Fallback: run SELECT DATABASE(); + if mariadb_client is None: + return "" + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror(): + return "" + if not result: + return "" + # If result is raw HTML table, try to parse with pandas + try: + df_list = pd.read_html(result) + if df_list and isinstance(df_list, list) and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + # if not parseable by pandas, try regex to extract first cell content + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)) # strip tags + txt = txt.strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + # If result is plain text (like the DB name) + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + """Try several places to find the current user name; fallback to OS login or empty string.""" + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", 
None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + """ + Create magic_metadata table if it doesn't exist. + Columns: id, command_name, arguments, execution_timestamp, + affected_columns, operation_status, message, db_name, user_name + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if mariadb_client is None: + # nothing to do + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + """ + Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + # Escape values + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # -------------------- end metadata helpers -------------------- + def execute(self, kernel, data): - """Execute the outliers magic (non in-place).""" + """Execute the outliers magic (non in-place) and log metadata.""" df = data.get("last_select") + # Prepare metadata context early so we can log failures + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + if df is None: - kernel._send_message("stderr", "No last_select found in kernel data.") + msg = "No last_select found in kernel data." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return if hasattr(df, "empty") and df.empty: - kernel._send_message("stderr", "There is no data to process (empty DataFrame).") + msg = "There is no data to process (empty DataFrame)." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return try: args = self.parse_args(self.args) except Exception: - kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + msg = "Error parsing arguments. Use key=value syntax." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # parse columns argument @@ -208,7 +438,21 @@ def execute(self, kernel, data): method = str(args.get("method", "iqr")).lower() if method not in {"iqr", "zscore"}: - kernel._send_message("stderr", f"Unknown method '{method}'. Allowed: iqr, zscore.") + msg = f"Unknown method '{method}'. Allowed: iqr, zscore." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return try: @@ -229,7 +473,21 @@ def execute(self, kernel, data): else: missing_cols = [c for c in columns if c not in df.columns] if missing_cols: - kernel._send_message("stderr", f"Column(s) not found: {', '.join(missing_cols)}") + msg = f"Column(s) not found: {', '.join(missing_cols)}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(columns) if columns else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # keep only numeric columns (skip non-numeric) target_columns = [c for c in columns if pd.api.types.is_numeric_dtype(df[c])] @@ -238,7 +496,21 @@ def execute(self, kernel, data): kernel._send_message("stdout", f"Warning: non-numeric columns skipped: {', '.join(non_numeric)}") if not target_columns: - kernel._send_message("stderr", "No numeric target columns found to detect outliers.") + msg = "No numeric target columns found to detect outliers." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Work on a copy (non in-place) @@ -247,16 +519,21 @@ def execute(self, kernel, data): # Detect outliers per column and store masks outlier_masks = {} messages = [] - for col in target_columns: - try: - mask = self._detect_outliers_series(result_df[col], method, k=k, z_thresh=z_thresh) - outlier_masks[col] = mask - n_out = int(mask.sum()) - messages.append(f"Column '{col}': detected {n_out} outlier(s) using {method}.") - # add boolean indicator column to the copy (non in-place on original) - result_df[f"{col}_is_outlier"] = mask.astype(bool) - except Exception as e: - messages.append(f"Column '{col}': error detecting outliers: {e}") + operation_status = "success" + try: + for col in target_columns: + try: + mask = self._detect_outliers_series(result_df[col], method, k=k, z_thresh=z_thresh) + outlier_masks[col] = mask + n_out = int(mask.sum()) + messages.append(f"Column '{col}': detected {n_out} outlier(s) using {method}.") + # add boolean indicator column to the copy (non in-place on original) + result_df[f"{col}_is_outlier"] = mask.astype(bool) + except Exception as e: + messages.append(f"Column '{col}': error detecting outliers: {e}") + except Exception as e: + operation_status = "error" + messages.append(f"Fatal error while detecting outliers: {e}") # Store result in a separate key so original remains unchanged data["last_select_outliers"] = result_df @@ -266,16 +543,41 @@ def execute(self, kernel, data): kernel._send_message("stdout", "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).") # Plot if requested + plot_error = None if plot: try: df_numeric = result_df[target_columns] fig = 
self._build_plots(df_numeric, outlier_masks) self._send_image(kernel, fig) except Exception as e: - kernel._send_message("stderr", f"Error while plotting: {e}") + plot_error = f"Error while plotting: {e}" + kernel._send_message("stderr", plot_error) + messages.append(plot_error) + operation_status = "error" # Finally show the result DataFrame (the copy with indicator columns) try: self._send_html(kernel, data["last_select_outliers"]) except Exception: pass + + # Insert metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = ", ".join(target_columns) + message_str = "\n".join(messages) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status=operation_status, + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/stats.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/stats.py index 5cc364f..a1b30f9 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/stats.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/stats.py @@ -5,6 +5,15 @@ import pandas as pd import shlex from distutils import util +import logging +import os +import re + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None class Stats(MariaMagic): @@ -24,6 +33,8 @@ class Stats(MariaMagic): -> include the 10th and 90th percentiles (values can be 0-100 or 0-1) %stats transpose=true -> show summary transposed (rows <-> columns) + + Execution metadata is recorded into table `magic_metadata`. 
""" def __init__(self, args=""): @@ -40,6 +51,7 @@ def help(self): "%stats [columns=col1,col2] [include=all|numeric|object] " "[percentiles=25,50,75] [transpose=true|false]\n" "Show statistical summary (uses pandas.DataFrame.describe under the hood)." + "Execution metadata is recorded in table `magic_metadata`." ) def _str_to_obj(self, s): @@ -106,21 +118,250 @@ def _parse_percentiles(self, pct_arg): out = sorted(set(out)) return out if out else None + # -------------------- metadata / DB helpers (best-effort) -------------------- + def _get_mariadb_client(self, kernel): + """Return mariadb_client if present on kernel, else None""" + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + """Return a logger on kernel if present, else create a temporary logger""" + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + # double single-quotes for SQL escaping + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + """ + Attempt to determine the currently used DB. + Prefer SqlFetch if available; otherwise run SELECT DATABASE(); and try to parse. + Returns empty string if none found. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + # Fallback: run SELECT DATABASE(); + if mariadb_client is None: + return "" + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror(): + return "" + if not result: + return "" + # If result is raw HTML table, try to parse with pandas + try: + df_list = pd.read_html(result) + if df_list and isinstance(df_list, list) and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + # if not parseable by pandas, try regex to extract first cell content + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)) # strip tags + txt = txt.strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + # If result is plain text (like the DB name) + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + """Try several places to find the current user name; fallback to OS login or empty string.""" + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + 
return "" + + def _ensure_metadata_table(self, kernel, db_name): + """ + Create magic_metadata table if it doesn't exist. + Columns: id, command_name, arguments, execution_timestamp, + affected_columns, operation_status, message, db_name, user_name + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if mariadb_client is None: + # nothing to do + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + """ + Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + # Escape values + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # -------------------- end metadata helpers -------------------- + def execute(self, kernel, data): - """Execute the %stats magic (display-only).""" + """Execute the %stats magic (display-only) and log metadata.""" df = data.get("last_select") + + # Prepare metadata context early so we can log failures + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + if df is None: - kernel._send_message("stderr", "No last_select found in kernel data.") + msg = "No last_select found in kernel data." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return if hasattr(df, "empty") and df.empty: - kernel._send_message("stderr", "There is no data to summarize (empty DataFrame).") + msg = "There is no data to summarize (empty DataFrame)." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return try: args = self.parse_args(self.args) except Exception: - kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + msg = "Error parsing arguments. Use key=value syntax." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # columns handling @@ -155,7 +396,21 @@ def execute(self, kernel, data): try: subdf = df[columns] if columns is not None else df except KeyError as e: - kernel._send_message("stderr", f"Column not found: {e}") + msg = f"Column not found: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(columns) if columns else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # call pandas describe @@ -176,5 +431,40 @@ def execute(self, kernel, data): pass self._send_html(kernel, result) + + # Insert metadata (success) + affected_columns_str = ", ".join(columns) if columns else "ALL_COLUMNS" + pct_str = ",".join(str(p) for p in (percentiles or [])) if percentiles else "" + message = f"Stats computed for {len(result.columns) if hasattr(result, 'columns') else 'N'} column(s); total_rows={len(subdf)}; percentiles={pct_str}; include={include}." 
+ operation_status = "success" + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=affected_columns_str, + operation_status=operation_status, + message=message, + db_name=db_name, + user_name=user_name + ) + except Exception: + # do not interrupt flow if logging fails + pass + except Exception as e: - kernel._send_message("stderr", f"Error computing statistics: {e}") + msg = f"Error computing statistics: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=", ".join(columns) if columns else "ALL_COLUMNS", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass From 7180b4a63b4d3c77056e2b68ee1acd8e99e2a674 Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Tue, 28 Oct 2025 18:06:44 +0000 Subject: [PATCH 22/38] Ingest added --- Untitled.ipynb | 2085 ++++++++++++++++- .../ml_commands/data_cleaning/clipoutliers.py | 2 +- .../data_preprocessing/normalize.py | 2 +- .../model_training/maria_ingest.py | 528 +++++ .../maria_magics/supported_magics.py | 2 + 5 files changed, 2503 insertions(+), 116 deletions(-) create mode 100644 mariadb_kernel/maria_magics/ml_commands/model_training/maria_ingest.py diff --git a/Untitled.ipynb b/Untitled.ipynb index a5e051f..98f192d 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -149,7 +149,7 @@ { "data": { "text/html": [ - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
2BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
3CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
4DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
5EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
6FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
7GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
8HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
9IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
10JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
" + "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
2BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
3CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
4DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
5EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
6FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
7GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
8HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
9IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
10JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
11AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
12BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
13CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
14DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
15EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
16FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
17GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
18HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
19IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
20JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
21AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
22BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
23CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
24DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
25EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
26FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
27GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
28HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
29IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
30JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
31AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
32BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
33CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
34DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
35EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
36FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
37GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
38HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
39IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
40JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
" ] }, "metadata": {}, @@ -3180,7 +3180,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "id": "b4c02d58-7020-4514-b84a-9a6b5e18fb40", "metadata": {}, "outputs": [ @@ -3189,10 +3189,22 @@ "output_type": "stream", "text": [ "Clip outliers completed using iqr.\n", - "Column 'id': clipped 0 value(s) (bounds: -2.5000, 11.5000).\n", - "Column 'age': clipped 0 value(s) (bounds: 11.8750, 58.8750).\n", - "Column 'salary': clipped 1 value(s) (bounds: -115125.0000, 353875.0000).\n", - "Total values clipped: 1. Modified in-place: data['last_select'] updated.\n" + "Column 'emp_id': clipped 0 value(s) (bounds: -18.5000, 59.5000).\n", + "Column 'age': clipped 0 value(s) (bounds: 7.5000, 67.5000).\n", + "Column 'years_experience': clipped 0 value(s) (bounds: -15.0000, 41.0000).\n", + "Column 'projects_completed': clipped 0 value(s) (bounds: -3.0000, 37.0000).\n", + "Column 'avg_project_score': clipped 0 value(s) (bounds: 61.7500, 108.5500).\n", + "Column 'certifications': clipped 0 value(s) (bounds: -3.0000, 5.0000).\n", + "Column 'training_hours': clipped 0 value(s) (bounds: -15.0000, 65.0000).\n", + "Column 'overtime_hours': clipped 0 value(s) (bounds: -10.0000, 30.0000).\n", + "Column 'remote_ratio': clipped 0 value(s) (bounds: -0.4000, 1.2000).\n", + "Column 'salary': clipped 0 value(s) (bounds: 20000.0000, 140000.0000).\n", + "Column 'bonus': clipped 0 value(s) (bounds: -3500.0000, 16500.0000).\n", + "Column 'satisfaction_score': clipped 0 value(s) (bounds: 4.5000, 11.7000).\n", + "Column 'performance_rating': clipped 0 value(s) (bounds: 0.0000, 8.0000).\n", + "Column 'potential_score': clipped 0 value(s) (bounds: 41.5000, 117.5000).\n", + "Column 'attrition_flag': clipped 0 value(s) (bounds: -1.5000, 2.5000).\n", + "Total values clipped: 0. 
Modified in-place: data['last_select'] updated.\n" ] }, { @@ -3201,11 +3213,25 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -3213,156 +3239,1750 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idemp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30.0000005000.030FBachelors51287.514050.255000.03000.08.5475.00
2BobUnknown40.000000178000.0Engineering45MMasters203091.0320100.1120000.015000.09.0589.00
3CharlieEngineering36.142857353875.0Sales38MBachelors101879.3015200.580000.07000.07.2370.01
4DavidHR25.00000048000.0DianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
5EveUnknown35.000000178000.0Finance35FBachelors81588.013060.390000.08000.08.0485.00
6FrankEngineering28.00000072000.0HR50MHigh School25872.5010150.760000.04000.06.5260.01
7UnknownGraceSales50.000000178000.042FBachelors182081.4125120.485000.07000.07.8374.00
8GraceSales45.00000065000.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%clipoutliers" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "d146b7c7-8860-4962-a9e8-78675b068982", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Clip outliers completed.\n", - "Column 'id': clipped 0 value(s) to bounds (lower=-0.39897948556635576, upper=9.398979485566356).\n", - "Column 'age': clipped 0 value(s) to bounds (lower=17.616171113291706, upper=54.669543172422586).\n", - "Column 'salary': clipped 0 value(s) to bounds (lower=-9114.19257183242, upper=131114.19257183242).\n", - "Total values clipped: 0. Modified in-place: data['last_select'] updated.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalaryHenryEngineering31MMasters72593.123550.295000.09000.09.1590.00
1AliceHR30.012000.09IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
2BobNaN40.0NaN10JackSales55MHigh School301268.905250.865000.02000.05.5150.01
11AliceHR30FBachelors51287.514050.255000.03000.08.5475.00
12BobEngineering45MMasters203091.0320100.1120000.015000.09.0589.00
13CharlieSales38MBachelors101879.3015200.580000.07000.07.2370.01
14DianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
15EveFinance35FBachelors81588.013060.390000.08000.08.0485.00
16FrankHR50MHigh School25872.5010150.760000.04000.06.5260.01
17GraceSales42FBachelors182081.4125120.485000.07000.07.8374.00
18HenryEngineering31MMasters72593.123550.295000.09000.09.1590.00
19IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
20JackSales55MHigh School301268.905250.865000.02000.05.5150.01
21AliceHR30FBachelors51287.514050.255000.03000.08.5475.00
22BobEngineering45MMasters203091.0320100.1120000.015000.09.0589.00
23CharlieSales38MBachelors101879.3015200.580000.07000.07.2370.01
24DianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
25EveFinance35FBachelors81588.013060.390000.08000.08.0485.00
26FrankHR50MHigh School25872.5010150.760000.04000.06.5260.01
27GraceSales42FBachelors182081.4125120.485000.07000.07.8374.00
28HenryEngineering31MMasters72593.123550.295000.09000.09.1590.00
29IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
30JackSales55MHigh School301268.905250.865000.02000.05.5150.01
31AliceHR30FBachelors51287.514050.255000.03000.08.5475.00
32BobEngineering45MMasters203091.0320100.1120000.015000.09.0589.00
33CharlieSales38MBachelors101879.3015200.580000.07000.07.2370.01
34DianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
35EveFinance35FBachelors81588.013060.390000.08000.08.0485.00
36FrankHR50MHigh School25872.5010150.760000.04000.06.5260.01
37GraceSales42FBachelors182081.4125120.485000.07000.07.8374.00
38HenryEngineering31MMasters72593.123550.295000.09000.09.1590.00
39IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
40JackSales55MHigh School301268.905250.865000.02000.05.5150.01
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%clipoutliers" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d146b7c7-8860-4962-a9e8-78675b068982", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Clip outliers completed using zscore.\n", + "Column 'emp_id': clipped 0 value(s) (bounds: -2.8809, 43.8809).\n", + "Column 'age': clipped 0 value(s) (bounds: 19.8406, 56.5594).\n", + "Column 'years_experience': clipped 0 value(s) (bounds: -4.7983, 31.1983).\n", + "Column 'projects_completed': clipped 0 value(s) (bounds: 3.5885, 30.8115).\n", + "Column 'avg_project_score': clipped 0 value(s) (bounds: 67.5459, 100.8341).\n", + "Column 'certifications': clipped 0 value(s) (bounds: -1.0255, 3.0255).\n", + "Column 'training_hours': clipped 0 value(s) (bounds: -1.7946, 51.7946).\n", + "Column 'overtime_hours': clipped 4 value(s) (bounds: -3.2563, 24.8563).\n", + "Column 'remote_ratio': clipped 0 value(s) (bounds: -0.1308, 0.8908).\n", + "Column 'salary': clipped 4 value(s) (bounds: 43482.8069, 119917.1931).\n", + "Column 'bonus': clipped 4 value(s) (bounds: -358.9297, 14358.9297).\n", + "Column 'satisfaction_score': clipped 4 value(s) (bounds: 5.5260, 10.3540).\n", + "Column 'performance_rating': clipped 4 value(s) (bounds: 1.0061, 6.1939).\n", + "Column 'potential_score': clipped 0 value(s) (bounds: 49.7801, 104.2199).\n", + "Column 'attrition_flag': clipped 0 value(s) (bounds: -0.6282, 1.2282).\n", + "Total values clipped: 20. 
Modified in-place: data['last_select'] updated.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FBachelors51287.51405.0000000.255000.000003000.0000008.5000004.00000075.00
2BobEngineering45MMasters203091.032010.0000000.1119917.1930714358.9296889.0000005.00000089.00
3CharlieSales38MBachelors101879.301520.0000000.580000.000007000.0000007.2000003.00000070.01
4DianaEngineering29FPhD62295.22502.0000000.097000.0000010000.0000009.6000005.00000095.00
5EveFinance35FBachelors81588.01306.0000000.390000.000008000.0000008.0000004.00000085.00
6FrankHR50MHigh School25872.501015.0000000.760000.000004000.0000006.5000002.00000060.01
7GraceSales42FBachelors182081.412512.0000000.485000.000007000.0000007.8000003.00000074.00
8HenryEngineering31MMasters72593.12355.0000000.295000.000009000.0000009.1000005.00000090.00
9IvyFinance27FBachelors31085.00208.0000000.670000.000005000.0000008.2000004.00000082.00
10JackSales55MHigh School301268.90524.8562970.865000.000002000.0000005.5260241.00612250.01
11AliceHR30FBachelors51287.51405.0000000.255000.000003000.0000008.5000004.00000075.00
12BobEngineering45MMasters203091.032010.0000000.1119917.1930714358.9296889.0000005.00000089.00
13CharlieSales38MBachelors101879.301520.0000000.580000.000007000.0000007.2000003.00000070.01
14DianaEngineering29FPhD62295.22502.0000000.097000.0000010000.0000009.6000005.00000095.00
15EveFinance35FBachelors81588.01306.0000000.390000.000008000.0000008.0000004.00000085.00
16FrankHR50MHigh School25872.501015.0000000.760000.000004000.0000006.5000002.00000060.01
17GraceSales42FBachelors182081.412512.0000000.485000.000007000.0000007.8000003.00000074.00
18HenryEngineering31MMasters72593.12355.0000000.295000.000009000.0000009.1000005.00000090.00
19IvyFinance27FBachelors31085.00208.0000000.670000.000005000.0000008.2000004.00000082.00
20JackSales55MHigh School301268.90524.8562970.865000.000002000.0000005.5260241.00612250.01
21AliceHR30FBachelors51287.51405.0000000.255000.000003000.0000008.5000004.00000075.00
22BobEngineering45MMasters203091.032010.0000000.1119917.1930714358.9296889.0000005.00000089.00
23CharlieSales38MBachelors101879.301520.0000000.580000.000007000.0000007.2000003.00000070.01
24DianaEngineering29FPhD62295.22502.0000000.097000.0000010000.0000009.6000005.00000095.00
25EveFinance35FBachelors81588.01306.0000000.390000.000008000.0000008.0000004.00000085.00
26FrankHR50MHigh School25872.501015.0000000.760000.000004000.0000006.5000002.00000060.01
27GraceSales42FBachelors182081.412512.0000000.485000.000007000.0000007.8000003.00000074.00
28HenryEngineering31MMasters72593.12355.0000000.295000.000009000.0000009.1000005.00000090.00
29IvyFinance27FBachelors31085.00208.0000000.670000.000005000.0000008.2000004.00000082.00
30JackSales55MHigh School301268.90524.8562970.865000.000002000.0000005.5260241.00612250.01
31AliceHR30FBachelors51287.51405.0000000.255000.000003000.0000008.5000004.00000075.00
32BobEngineering45MMasters203091.032010.0000000.1119917.1930714358.9296889.0000005.00000089.00
33CharlieSales38MBachelors101879.301520.0000000.580000.000007000.0000007.2000003.00000070.01
34DianaEngineering29FPhD62295.22502.0000000.097000.0000010000.0000009.6000005.00000095.00
3CharlieEngineeringNaN108000.035EveFinance35FBachelors81588.01306.0000000.390000.000008000.0000008.0000004.00000085.00
4David36FrankHR25.048000.050MHigh School25872.501015.0000000.760000.000004000.0000006.5000002.00000060.01
5EveNaN35.0NaN37GraceSales42FBachelors182081.412512.0000000.485000.000007000.0000007.8000003.00000074.00
6Frank38HenryEngineering28.072000.031MMasters72593.12355.0000000.295000.000009000.0000009.1000005.00000090.00
7NaNSales50.0NaN39IvyFinance27FBachelors31085.00208.0000000.670000.000005000.0000008.2000004.00000082.00
8Grace40JackSales45.065000.055MHigh School301268.90524.8562970.865000.000002000.0000005.5260241.00612250.01
" @@ -9228,14 +10848,251 @@ "execution_count": null, "id": "87d73330-b792-4d19-9eac-daa9cb0c7d1a", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
idcommand_nameargumentsexecution_timestampaffected_columnsoperation_statusmessagedb_nameuser_name
1clipoutliers2025-10-28 07:39:14emp_id\n", + "age\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagsuccessColumn 'emp_id': clipped 0 value(s) (bounds: -18.5000, 59.5000).\n", + "Column 'age': clipped 0 value(s) (bounds: 7.5000, 67.5000).\n", + "Column 'years_experience': clipped 0 value(s) (bounds: -15.0000, 41.0000).\n", + "Column 'projects_completed': clipped 0 value(s) (bounds: -3.0000, 37.0000).\n", + "Column 'avg_project_score': clipped 0 value(s) (bounds: 61.7500, 108.5500).\n", + "Column 'certifications': clipped 0 value(s) (bounds: -3.0000, 5.0000).\n", + "Column 'training_hours': clipped 0 value(s) (bounds: -15.0000, 65.0000).\n", + "Column 'overtime_hours': clipped 0 value(s) (bounds: -10.0000, 30.0000).\n", + "Column 'remote_ratio': clipped 0 value(s) (bounds: -0.4000, 1.2000).\n", + "Column 'salary': clipped 0 value(s) (bounds: 20000.0000, 140000.0000).\n", + "Column 'bonus': clipped 0 value(s) (bounds: -3500.0000, 16500.0000).\n", + "Column 'satisfaction_score': clipped 0 value(s) (bounds: 4.5000, 11.7000).\n", + "Column 'performance_rating': clipped 0 value(s) (bounds: 0.0000, 8.0000).\n", + "Column 'potential_score': clipped 0 value(s) (bounds: 41.5000, 117.5000).\n", + "Column 'attrition_flag': clipped 0 value(s) (bounds: -1.5000, 2.5000).test
2clipoutliersmethod=zscore z_thresh=2.0 2025-10-28 07:39:16emp_id\n", + "age\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagsuccessColumn 'emp_id': clipped 0 value(s) (bounds: -2.8809, 43.8809).\n", + "Column 'age': clipped 0 value(s) (bounds: 19.8406, 56.5594).\n", + "Column 'years_experience': clipped 0 value(s) (bounds: -4.7983, 31.1983).\n", + "Column 'projects_completed': clipped 0 value(s) (bounds: 3.5885, 30.8115).\n", + "Column 'avg_project_score': clipped 0 value(s) (bounds: 67.5459, 100.8341).\n", + "Column 'certifications': clipped 0 value(s) (bounds: -1.0255, 3.0255).\n", + "Column 'training_hours': clipped 0 value(s) (bounds: -1.7946, 51.7946).\n", + "Column 'overtime_hours': clipped 4 value(s) (bounds: -3.2563, 24.8563).\n", + "Column 'remote_ratio': clipped 0 value(s) (bounds: -0.1308, 0.8908).\n", + "Column 'salary': clipped 4 value(s) (bounds: 43482.8069, 119917.1931).\n", + "Column 'bonus': clipped 4 value(s) (bounds: -358.9297, 14358.9297).\n", + "Column 'satisfaction_score': clipped 4 value(s) (bounds: 5.5260, 10.3540).\n", + "Column 'performance_rating': clipped 4 value(s) (bounds: 1.0061, 6.1939).\n", + "Column 'potential_score': clipped 0 value(s) (bounds: 49.7801, 104.2199).\n", + "Column 'attrition_flag': clipped 0 value(s) (bounds: -0.6282, 1.2282).test
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "select * from magic_metadata;" + ] }, { "cell_type": "code", "execution_count": null, "id": "87d02cd4-7308-4a7f-b598-064deb297357", "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "drop table magic_metadata;" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "572ea337-547d-4fcb-b6dc-87cb972ce5b7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] stored content length=0 preview=\n", + "\n", + "[debug] using text from args (len=157)\n", + "\n", + "[debug] stored content length=157\n", + "Ingest complete. documents=1 chunks_total=1 embeddings_written=1\n", + "\n", + "Notes:\n", + " - embedding model used: all-MiniLM-L6-v2 (dim=384)\n", + " - Native VECTOR column was created/used where available.\n", + "\n" + ] + } + ], + "source": [ + "%maria_ingest doc_id=doc1 title=\"LangChain intro\" chunk_size=500 text=\"LangChain helps you build applications that combine LLMs with your data, APIs, and tools. It is often used to create chatbots, retrieval systems, and agents.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c59929ab-dbe6-45f1-8f9e-796e71bab31e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
VERSION()
11.8.3-MariaDB-ubu2404
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "SELECT VERSION();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e7a3356-c3f8-422e-8eeb-49b8f8d1838b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Tables_in_test
chunks
documents
embeddings
employees
magic_metadata
models_store
sample_sales
saved_models
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "show tables;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4cbd3171-f8fb-4258-a7cc-c7288206b71f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
iddoc_idtitlecontentmetadatacreated_at
1doc_932786199{}2025-10-28 15:37:27
2doc1LangChain introLangChain helps you build applications that combine LLMs with your data, APIs, and tools. It is often used to create chatbots, retrieval systems, and agents.{}2025-10-28 15:49:11
5doc_965545306{}2025-10-28 16:12:19
6doc_749541677{}2025-10-28 17:18:47
7doc_307906215{}2025-10-28 17:26:03
8doc_266524367{}2025-10-28 17:27:25
9doc_77662266{}2025-10-28 17:45:42
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "select * from documents;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "812c46fb-ef81-4503-9197-00efe6353db6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
iddoc_idchunk_indexchunk_textchunk_metacreated_at
1doc10LangChain helps you build applications that combine LLMs with your data, APIs, and tools. It is often used to create chatbots, retrieval systems, and agents.{}2025-10-28 17:46:53
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "select * from chunks;" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d96148c4-0e6b-4874-81d4-71a0b9dfd9fb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] stored content length=0 preview=\n", + "\n", + "[debug] stored content length=0\n", + "Ingest complete. documents=1 chunks_total=0 embeddings_written=0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warning: no chunks were created. If your document text is present in the `documents` table but chunk_text is missing, check client encoding and ensure the cell body was passed to the kernel. Use `SELECT content FROM documents WHERE doc_id=\"...\";` to inspect.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Notes:\n", + " - embedding model used: all-MiniLM-L6-v2 (dim=384)\n", + " - Native VECTOR column was created/used where available.\n", + "\n" + ] + } + ], + "source": [ + "%%maria_ingest doc_id=doc1 title=\"LangChain intro\" chunk_size=500\n", + "LangChain helps you build applications that combine LLMs with your data, APIs, and tools.\n", + "It is often used to create chatbots, retrieval systems, and agents." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "400d0901-9fc5-4bb4-bfa9-1912917b4450", + "metadata": {}, "outputs": [], "source": [] } diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py index e4a3215..3ce0a5f 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py @@ -399,7 +399,7 @@ def execute(self, kernel, data): # Attempt to insert metadata (best-effort) try: args_for_db = self.args if isinstance(self.args, str) else str(self.args) - affected_columns_str = ", ".join(target_columns) + affected_columns_str = "\n".join(target_columns) message_str = "\n".join(messages) self._insert_metadata( kernel=kernel, diff --git a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py index 8ce6b8a..f7a8bb7 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py @@ -12,7 +12,7 @@ class Normalize(MariaMagic): """ %normalize [columns=col1,col2,...] [feature_range=0,1] [inplace=True|False] - Scales numeric columns to a fixed range (default 0–1) using sklearn's MinMaxScaler. + Scales numeric columns to a fixed range (default 0-1) using sklearn's MinMaxScaler. - columns: list of columns to normalize. If omitted, all numeric columns are used. 
- feature_range: lower and upper bounds for scaling (default: 0,1) diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/maria_ingest.py b/mariadb_kernel/maria_magics/ml_commands/model_training/maria_ingest.py new file mode 100644 index 0000000..4eacab9 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/maria_ingest.py @@ -0,0 +1,528 @@ +# mariadb_kernel/maria_magics/maria_ingest.py +import shlex +import json +import math +import logging +import numpy as np +from distutils import util +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +from mariadb_kernel.mariadb_client import MariaDBClient + +# optional sentence-transformers +_ST_AVAILABLE = False +try: + from sentence_transformers import SentenceTransformer + _ST_AVAILABLE = True +except Exception: + _ST_AVAILABLE = False + +# IPython history fallback +try: + from IPython import get_ipython +except Exception: + get_ipython = None + + +class MariaIngest(MariaMagic): + """ + Cell magic to ingest text documents into MariaDB, chunk them, and store embeddings + directly in a native VECTOR(384) column. + + Usage (cell magic): + %%maria_ingest doc_id=DOC1 title="My Doc" chunk_size=800 overlap=100 text="My document text here" + + + The magic still supports passing the document body in the cell. If `text` is present + in the magic args it will be used in preference to the cell body. + """ + def __init__(self, args=""): + self.args = args + self.log = logging.getLogger("MariaIngest") + + def type(self): + return "Cell" + + def name(self): + return "maria_ingest" + + def help(self): + return "Ingest a document: chunk -> store -> embeddings (model fixed to all-MiniLM-L6-v2). Accepts `text=...` to pass the document content in the magic args." 
+ + def _str_to_obj(self, s): + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + try: + return json.loads(s) + except Exception: + pass + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_obj): + """ + Accept either: + - dict (already parsed) + - string (key=value tokens) + Returns a dict of parsed args. + """ + if input_obj is None: + return {} + # already a dict (kernel may pass args as dict) + if isinstance(input_obj, dict): + return input_obj + # if it's not a str, try to convert + if not isinstance(input_obj, str): + try: + return dict(input_obj) + except Exception: + return {} + + input_str = input_obj.strip() + if input_str == "": + return {} + try: + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + except Exception as e: + # fallback: naive split on spaces for simple cases + pairs = {} + for token in input_str.split(): + if "=" in token: + k, v = token.split("=", 1) + pairs[k] = v + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _sql_escape(self, s): + if s is None: + return "NULL" + if not isinstance(s, str): + return str(s) + return "'" + s.replace("'", "''") + "'" + + def _simple_chunk(self, text: str, chunk_size: int, overlap: int): + if not text: + return [] + t = text.strip() + if len(t) <= chunk_size: + return [t] + + chunks = [] + start = 0 + L = len(t) + while start < L: + end = min(L, start + chunk_size) + if end < L: + look_ahead = t[end: min(L, end + 100)] + idx_nl = look_ahead.find("\n") + idx_dot = look_ahead.find(".") + if idx_nl != -1: + end += idx_nl + 1 + elif idx_dot != -1: + end += idx_dot + 1 + chunk = t[start:end].strip() + if chunk: + chunks.append(chunk) + if end >= L: + break + start = max(0, end - overlap) + + if not chunks and t: + chunks = [t] + return chunks + + def 
_embed_batch(self, texts, dim=384): + model_name = "all-MiniLM-L6-v2" + if len(texts) == 0: + return np.zeros((0, dim), dtype=np.float32) + + if _ST_AVAILABLE: + try: + st = SentenceTransformer(model_name) + embs = st.encode(texts, convert_to_numpy=True, show_progress_bar=False) + embs = np.array(embs, dtype=np.float32) + if embs.ndim == 1: + embs = np.expand_dims(embs, 0) + if embs.shape[1] != dim: + self.log.warning("Embedding dim mismatch: model returned %d, expected %d. Adjusting.", embs.shape[1], dim) + if embs.shape[1] > dim: + embs = embs[:, :dim].astype(np.float32) + else: + pad = np.zeros((embs.shape[0], dim - embs.shape[1]), dtype=np.float32) + embs = np.concatenate([embs, pad], axis=1) + return embs + except Exception as e: + self.log.exception("sentence-transformers failed, falling back to deterministic embeddings: %s", e) + + rng = np.random.RandomState(12345) + embs = rng.normal(size=(len(texts), dim)).astype(np.float32) + norms = np.linalg.norm(embs, axis=1, keepdims=True) + norms[norms == 0] = 1.0 + embs = embs / norms + return embs + + def _parse_single_result(self, html): + if html is None: + return None + try: + import pandas as _pd + df = _pd.read_html(html)[0] + if df.size == 0: + return None + return df.iloc[0, 0] + except Exception: + try: + import re + m = re.search(r"]*>(.*?)", html, flags=re.S) + if m: + return m.group(1).strip() + except Exception: + pass + return None + + def _get_cell_text_fallback(self): + # fallback to IPython history if needed + try: + if get_ipython is None: + return "" + ip = get_ipython() + if not ip: + return "" + ns = ip.user_ns + if ns and "In" in ns: + hist = ns["In"] + if isinstance(hist, (list, tuple)) and len(hist) > 0: + for v in reversed(hist): + if isinstance(v, str) and v.strip(): + return v + hm = getattr(ip, "history_manager", None) + if hm: + try: + entries = list(hm.get_tail(1, include_latest=True)) + if entries: + src = entries[-1][2] + if isinstance(src, str) and src.strip(): + return src + except 
Exception: + pass + except Exception: + pass + return "" + + def execute(self, kernel, data): + """Main entry point called by the MariaDB Jupyter kernel.""" + + # --- Extract cell content robustly --- + cell_text = "" + try: + # Case 1: standard MariaDB kernel -> {"cell": {"args": ..., "body": ...}} + if isinstance(data, dict): + if "cell" in data and isinstance(data["cell"], dict): + if "body" in data["cell"] and isinstance(data["cell"]["body"], str): + cell_text = data["cell"]["body"] + elif "code" in data["cell"] and isinstance(data["cell"]["code"], str): + cell_text = data["cell"]["code"] + # Case 2: other kernels + elif any(k in data for k in ("code", "content", "message", "data")): + for k in ("code", "content", "message", "data"): + if k in data and isinstance(data[k], str): + cell_text = data[k] + break + elif isinstance(data, str): + cell_text = data + else: + try: + cell_text = str(data) + except Exception: + cell_text = "" + except Exception as e: + kernel._send_message("stderr", f"[debug] could not extract cell text: {e}") + cell_text = "" + + if cell_text: + cell_text = cell_text.strip() + + preview = cell_text[:80].replace("\n", " ") + ("..." if len(cell_text) > 80 else "") + kernel._send_message("stdout", f"[debug] stored content length={len(cell_text)} preview={preview}\n") + + + # --- Parse arguments (key=value pairs) --- + try: + args = self.parse_args(self.args) + except Exception as e: + kernel._send_message("stderr", f"Error parsing arguments: {e}") + return + + # If the user provided a `text` argument, prefer it over the cell body. 
+ provided_text = args.get("text") if isinstance(args, dict) else None + if isinstance(provided_text, str) and provided_text.strip(): + cell_text = provided_text + kernel._send_message("stdout", f"[debug] using text from args (len={len(cell_text)})\n") + + # metadata + doc_id = args.get("doc_id") or f"doc_{int(np.floor(np.random.random()*1e9))}" + title = args.get("title") or "" + chunk_size = int(args.get("chunk_size", 800) or 800) + overlap = int(args.get("overlap", 100) or 100) + dim = 384 + metadata = args.get("metadata", {}) or {} + + # build docs list + docs_to_ingest = [] + maybe_json = (cell_text or "").strip() + try: + if maybe_json.startswith("[") or maybe_json.startswith("{"): + parsed = json.loads(maybe_json) + if isinstance(parsed, list): + for d in parsed: + docs_to_ingest.append({ + "doc_id": d.get("doc_id") or d.get("id") or f"doc_{int(np.floor(np.random.random()*1e9))}", + "title": d.get("title") or "", + "content": d.get("content") or "", + "metadata": d.get("metadata") or {} + }) + elif isinstance(parsed, dict) and ("content" in parsed or "doc_id" in parsed): + docs_to_ingest.append({ + "doc_id": parsed.get("doc_id") or parsed.get("id") or doc_id, + "title": parsed.get("title") or title, + "content": parsed.get("content") or "", + "metadata": parsed.get("metadata") or metadata + }) + else: + docs_to_ingest.append({ + "doc_id": doc_id, + "title": title, + "content": cell_text, + "metadata": metadata + }) + else: + docs_to_ingest.append({ + "doc_id": doc_id, + "title": title, + "content": cell_text, + "metadata": metadata + }) + except Exception: + docs_to_ingest.append({ + "doc_id": doc_id, + "title": title, + "content": cell_text, + "metadata": metadata + }) + + # db client + mariadb_client = getattr(kernel, "mariadb_client", None) + if mariadb_client is None: + kernel._send_message("stderr", "No mariadb_client available on kernel (can't run ingestion).") + return + + # determine current DB + try: + db_name_html = 
mariadb_client.run_statement("SELECT DATABASE();") + dbname = self._parse_single_result(db_name_html) or "" + except Exception as e: + kernel._send_message("stderr", f"Failed to query current database: {e}") + return + + if not dbname: + kernel._send_message("stderr", "No current database selected (use `USE ` before running the magic).") + return + + # create tables with VECTOR(384) + try: + mariadb_client.run_statement( + f""" + CREATE TABLE IF NOT EXISTS `{dbname}`.`documents` ( + id BIGINT AUTO_INCREMENT PRIMARY KEY, + doc_id VARCHAR(191) UNIQUE, + title TEXT, + content LONGTEXT, + metadata JSON, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) ENGINE=InnoDB; + """ + ) + + mariadb_client.run_statement( + f""" + CREATE TABLE IF NOT EXISTS `{dbname}`.`chunks` ( + id BIGINT AUTO_INCREMENT PRIMARY KEY, + doc_id VARCHAR(191), + chunk_index INT, + chunk_text LONGTEXT, + chunk_meta JSON, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + UNIQUE KEY uq_doc_chunk (doc_id, chunk_index), + FULLTEXT KEY ft_chunk_text (chunk_text) + ) ENGINE=InnoDB; + """ + ) + + # native VECTOR column + mariadb_client.run_statement( + f""" + CREATE TABLE IF NOT EXISTS `{dbname}`.`embeddings` ( + id BIGINT AUTO_INCREMENT PRIMARY KEY, + chunk_id BIGINT UNIQUE, + model VARCHAR(128), + dim INT, + embedding_vector VECTOR({dim}), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) ENGINE=InnoDB; + """ + ) + + # best-effort ANN index + try: + mariadb_client.run_statement( + f"CREATE INDEX IF NOT EXISTS idx_embeddings_vector ON `{dbname}`.`embeddings` (embedding_vector) USING ANN;" + ) + except Exception: + try: + mariadb_client.run_statement( + f"CREATE INDEX idx_embeddings_vector ON `{dbname}`.`embeddings` (embedding_vector) USING ANN WITH (distance='cosine');" + ) + except Exception as e_idx: + self.log.debug("ANN index creation skipped/failed (ok): %s", e_idx) + + except Exception as e: + kernel._send_message("stderr", f"DDL failed: {e}") + return + + # ingest loop + total_chunks = 0 + 
total_emb_rows = 0 + for doc in docs_to_ingest: + d_doc_id = doc.get("doc_id") + d_title = doc.get("title") + d_content = doc.get("content") or "" + d_meta = doc.get("metadata") or {} + + # insert document row + try: + mariadb_client.run_statement( + f""" + INSERT INTO `{dbname}`.`documents` (doc_id, title, content, metadata) + VALUES ({self._sql_escape(d_doc_id)}, {self._sql_escape(d_title)}, {self._sql_escape(d_content)}, {self._sql_escape(json.dumps(d_meta))}) + ON DUPLICATE KEY UPDATE title=VALUES(title), content=VALUES(content), metadata=VALUES(metadata); + """ + ) + except Exception as e: + kernel._send_message("stderr", f"Failed to insert document {d_doc_id}: {e}") + continue + + # debug: fetch stored content + try: + res_html = mariadb_client.run_statement( + f"SELECT content FROM `{dbname}`.`documents` WHERE doc_id = {self._sql_escape(d_doc_id)} LIMIT 1;" + ) + stored_content = self._parse_single_result(res_html) or "" + kernel._send_message("stdout", f"[debug] stored content length={len(stored_content)}") + if d_content and not stored_content: + kernel._send_message("stderr", "[warning] document content inserted into DB appears empty (possible client/encoding issue).") + except Exception as e: + kernel._send_message("stderr", f"Warning: could not verify stored document content: {e}") + + # chunk + chunks = self._simple_chunk(d_content, chunk_size, overlap) + if not chunks and d_content: + chunks = [d_content] + total_chunks += len(chunks) + + # insert chunks and collect ids + inserted_chunk_ids = [] + for idx, chunk_text in enumerate(chunks): + try: + mariadb_client.run_statement( + f""" + INSERT INTO `{dbname}`.`chunks` (doc_id, chunk_index, chunk_text, chunk_meta) + VALUES ({self._sql_escape(d_doc_id)}, {idx}, {self._sql_escape(chunk_text)}, {self._sql_escape(json.dumps({}))}); + """ + ) + # LAST_INSERT_ID + try: + last_html = mariadb_client.run_statement("SELECT LAST_INSERT_ID();") + last_val = self._parse_single_result(last_html) + last_id = 
int(last_val) if last_val is not None else None + except Exception: + last_id = None + + if last_id is not None: + inserted_chunk_ids.append((idx, last_id)) + else: + # fallback lookup + try: + sel_html = mariadb_client.run_statement( + f"SELECT id FROM `{dbname}`.`chunks` WHERE doc_id = {self._sql_escape(d_doc_id)} AND chunk_index = {idx} LIMIT 1;" + ) + sel_val = self._parse_single_result(sel_html) + inserted_chunk_ids.append((idx, int(sel_val)) if sel_val is not None else (idx, None)) + except Exception: + inserted_chunk_ids.append((idx, None)) + except Exception as e: + kernel._send_message("stderr", f"Failed to insert chunk {idx} for {d_doc_id}: {e}") + inserted_chunk_ids.append((idx, None)) + continue + + # diagnostics if nothing inserted + if len(inserted_chunk_ids) == 0 and chunks: + kernel._send_message("stderr", f"[debug] no chunk ids collected for doc {d_doc_id}; attempting to read existing chunks for diagnostic.") + try: + full_sel = mariadb_client.run_statement( + f"SELECT id, chunk_index FROM `{dbname}`.`chunks` WHERE doc_id = {self._sql_escape(d_doc_id)} ORDER BY chunk_index;" + ) + # best-effort parse + import pandas as _pd + try: + df = _pd.read_html(full_sel)[0] + tmp_map = {int(r["chunk_index"]): int(r["id"]) for _, r in df.iterrows()} + inserted_chunk_ids = [(i, tmp_map.get(i)) for i in range(len(chunks))] + except Exception: + pass + except Exception: + pass + + # embeddings + if chunks: + embs = self._embed_batch(chunks, dim) + norms = np.linalg.norm(embs, axis=1, keepdims=True) + norms[norms == 0] = 1.0 + embs_norm = (embs / norms).astype(np.float32) + + for (i, chunk_db_id), vec in zip(inserted_chunk_ids, embs_norm): + if chunk_db_id is None: + self.log.debug("No db chunk id for doc %s chunk %d — skipping embedding store", d_doc_id, i) + kernel._send_message("stderr", f"[debug] no chunk id for doc {d_doc_id} chunk {i}; embedding skipped.") + continue + vec_list = [float(v) for v in vec.tolist()] + vec_literal = "[" + ",".join(repr(x) for x 
in vec_list) + "]" + try: + mariadb_client.run_statement( + f""" + INSERT INTO `{dbname}`.`embeddings` (chunk_id, model, dim, embedding_vector) + VALUES ({chunk_db_id}, {self._sql_escape('all-MiniLM-L6-v2')}, {dim}, {vec_literal}) + ON DUPLICATE KEY UPDATE model=VALUES(model), dim=VALUES(dim), embedding_vector=VALUES(embedding_vector); + """ + ) + total_emb_rows += 1 + except Exception as e: + kernel._send_message("stderr", f"Failed to insert embedding for chunk_id={chunk_db_id}: {e}") + continue + + # final + kernel._send_message("stdout", f"Ingest complete. documents={len(docs_to_ingest)} chunks_total={total_chunks} embeddings_written={total_emb_rows}\n") + if total_chunks == 0: + kernel._send_message("stderr", "Warning: no chunks were created. If your document text is present in the `documents` table but chunk_text is missing, check client encoding and ensure the cell body was passed to the kernel. Use `SELECT content FROM documents WHERE doc_id=\"...\";` to inspect.") + kernel._send_message("stdout", "Notes:\n - embedding model used: all-MiniLM-L6-v2 (dim=384)\n - Native VECTOR column was created/used where available.\n") + return diff --git a/mariadb_kernel/maria_magics/supported_magics.py b/mariadb_kernel/maria_magics/supported_magics.py index 9c18f7f..8200f53 100644 --- a/mariadb_kernel/maria_magics/supported_magics.py +++ b/mariadb_kernel/maria_magics/supported_magics.py @@ -30,6 +30,7 @@ from mariadb_kernel.maria_magics.ml_commands.model_training.loadmodel import LoadModel from mariadb_kernel.maria_magics.ml_commands.model_training.predict import Predict from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.ml_pipeline import MLPipeline +from mariadb_kernel.maria_magics.ml_commands.model_training.maria_ingest import MariaIngest def get(): return { @@ -59,4 +60,5 @@ def get(): "select_features": SelectFeatures, "select_model": SelectModel, "ml_pipeline": MLPipeline, + "maria_ingest": MariaIngest } From ca169c6a37fb0724cf96d636f7d28226759c83dd Mon Sep 17 
00:00:00 2001 From: SiddharthaChakrabarty Date: Wed, 29 Oct 2025 09:16:26 +0000 Subject: [PATCH 23/38] Added RAG based commands --- Untitled.ipynb | 1684 ++++++++++++++++- .../model_training/maria_ingest.py | 453 +++-- .../model_training/maria_rag_query.py | 613 ++++++ .../model_training/maria_search.py | 565 ++++++ .../maria_magics/supported_magics.py | 6 +- test.docx | 529 ++++++ test.txt | 109 ++ 7 files changed, 3802 insertions(+), 157 deletions(-) create mode 100644 mariadb_kernel/maria_magics/ml_commands/model_training/maria_rag_query.py create mode 100644 mariadb_kernel/maria_magics/ml_commands/model_training/maria_search.py create mode 100644 test.docx create mode 100644 test.txt diff --git a/Untitled.ipynb b/Untitled.ipynb index 98f192d..7dc3665 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -22,23 +22,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "4f793644-f458-4091-9bec-6680a0c2b849", "metadata": {}, "outputs": [ { - "data": { - "text/html": [ - "--------------\r\n", - "create database test\r\n", - "--------------\r\n", - "\r\n", - "ERROR 1007 (HY000) at line 1 in file: '/home/sneha/mariadb_kernel/.mariadb_statement_744c7db6-b34c-11f0-a961-00155dd935c1': Can't create database 'test'; database exists\r\n", - "\u0007\u001b(B\u001b[0;7m\u001b(B\u001b[m" - ] - }, - "metadata": {}, - "output_type": "display_data" + "name": "stderr", + "output_type": "stream", + "text": [ + "Your SQL code doesn't end with delimiter `;`\n" + ] } ], "source": [ @@ -10941,7 +10934,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "id": "572ea337-547d-4fcb-b6dc-87cb972ce5b7", "metadata": {}, "outputs": [ @@ -10949,17 +10942,24 @@ "name": "stdout", "output_type": "stream", "text": [ - "[debug] stored content length=0 preview=\n", - "\n", + "[debug] stored content length=0 preview= \n", "[debug] using text from args (len=157)\n", - "\n", - "[debug] stored content length=157\n", + "[debug] stored content 
length=0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] document content inserted into DB appears empty (possible client/encoding issue).\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "Ingest complete. documents=1 chunks_total=1 embeddings_written=1\n", - "\n", - "Notes:\n", - " - embedding model used: all-MiniLM-L6-v2 (dim=384)\n", - " - Native VECTOR column was created/used where available.\n", - "\n" + "Notes: - embedding model used: all-MiniLM-L6-v2 (dim=384) - Native VECTOR column was created/used where available.\n" ] } ], @@ -11016,7 +11016,223 @@ { "data": { "text/html": [ - "
iddoc_idtitlecontentmetadatacreated_at
1doc_932786199{}2025-10-28 15:37:27
2doc1LangChain introLangChain helps you build applications that combine LLMs with your data, APIs, and tools. It is often used to create chatbots, retrieval systems, and agents.{}2025-10-28 15:49:11
5doc_965545306{}2025-10-28 16:12:19
6doc_749541677{}2025-10-28 17:18:47
7doc_307906215{}2025-10-28 17:26:03
8doc_266524367{}2025-10-28 17:27:25
9doc_77662266{}2025-10-28 17:45:42
" + "
iddoc_idtitlecontentmetadatacreated_at
1doc_932786199{}2025-10-28 15:37:27
2doc1LangChain introLangChain helps you build applications that combine LLMs with your data, APIs, and tools. It is often used to create chatbots, retrieval systems, and agents.{}2025-10-28 15:49:11
5doc_965545306{}2025-10-28 16:12:19
6doc_749541677{}2025-10-28 17:18:47
7doc_307906215{}2025-10-28 17:26:03
8doc_266524367{}2025-10-28 17:27:25
9doc_77662266{}2025-10-28 17:45:42
11doc_from_fileReportThe magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. 
Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. 
Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?{}2025-10-28 18:13:44
13search_test_docHybrid Search TestThe magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. 
Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. 
Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?{}2025-10-28 18:34:51
" ] }, "metadata": {}, @@ -11036,7 +11252,133 @@ { "data": { "text/html": [ - "
iddoc_idchunk_indexchunk_textchunk_metacreated_at
1doc10LangChain helps you build applications that combine LLMs with your data, APIs, and tools. It is often used to create chatbots, retrieval systems, and agents.{}2025-10-28 17:46:53
" + "
iddoc_idchunk_indexchunk_textchunk_metacreated_at
1doc10LangChain helps you build applications that combine LLMs with your data, APIs, and tools. It is often used to create chatbots, retrieval systems, and agents.{}2025-10-28 17:46:53
2doc_from_file0The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?{}2025-10-28 18:13:44
3doc_from_file0The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (P{}2025-10-28 18:17:50
4doc_from_file1you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional e{}2025-10-28 18:17:50
5doc_from_file2hich would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an{}2025-10-28 18:17:50
6doc_from_file3s update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or{}2025-10-28 18:17:51
7doc_from_file4this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't{}2025-10-28 18:17:51
8doc_from_file5instead of this canvas update. Which would you like?\n", + "\n", + "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", + "\n", + "If you'd like, I can:\n", + "\n", + "add support for PPTX / HTML extraction,\n", + "\n", + "automatically strip front-matter from markdown files,\n", + "\n", + "or produce a small PR-style diff instead of this canvas update. Which would you like?{}2025-10-28 18:17:51
" ] }, "metadata": {}, @@ -11090,9 +11432,1299 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "400d0901-9fc5-4bb4-bfa9-1912917b4450", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] stored content length=0 preview=\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] python-docx failed to extract docx: Package not found at '/home/iddhartha/mariadb_kernel/test.docx'\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] using file content from ./test.docx (len=28508)\n", + "\n", + "[debug] database detection raw response: '
DATABASE()
test
'...\n", + "\n", + "[debug] using database: test\n", + "\n", + "[debug] embeddings.embedding_vector dim matches expected (384); will use native VECTOR inserts.\n", + "\n", + "[debug] INSERT documents raw response: 'Query OK'...\n", + "\n", + "[debug] stored content length=28249\n", + "\n", + "[debug] INSERT chunk idx=0 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=1 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=2 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=3 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=4 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=5 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=6 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=7 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=8 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=9 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=10 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=11 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=12 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=13 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=14 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=15 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=16 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=17 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=18 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=19 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=20 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=21 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=22 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=23 raw response: 'Query OK'...\n", + "\n", + "[debug] 
INSERT chunk idx=24 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=25 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=26 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=27 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=28 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=29 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=30 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=31 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=32 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=33 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=34 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=35 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=36 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=37 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=38 raw response: 'Query OK'...\n", + "\n", + "[debug] INSERT chunk idx=39 raw response: 'Query OK'...\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (1, 'all-MiniLM-L6-v2', 384, [-0.06188365817070007,0.004101159982383251,0.08865199238061905,0.03391453996300697,0.0705934539437294,-0.007194653619080782,-0.0014060521498322487,-0.03424789384007454,0.028502285480499268,0.046123623847961426,0.07699143141508102,0.000179135...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 1: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 1, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 1: 
1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (2, 'all-MiniLM-L6-v2', 384, [-0.006922394968569279,0.03999422863125801,0.11044202744960785,-0.056049950420856476,0.0424477756023407,0.0422651432454586,-0.07503941655158997,-0.012753593735396862,-0.019215010106563568,-0.0005573428934440017,0.025243977084755898,-0.037413...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 2: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 2, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 2: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (3, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.0219331...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 3: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 3, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 3: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (4, 'all-MiniLM-L6-v2', 384, 
[-0.08448513597249985,0.10392715036869049,0.027331702411174774,0.017086079344153404,0.07248993217945099,0.0070589566603302956,0.08355971425771713,0.020426131784915924,-0.021412141621112823,0.02130572870373726,0.11913938075304031,-0.044671509...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 4: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 4, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 4: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (5, 'all-MiniLM-L6-v2', 384, [-0.09339345991611481,0.013352231122553349,0.10841523855924606,0.03265758603811264,0.05745427682995796,-0.02950742468237877,-0.003043288830667734,-0.06888528168201447,0.05221330747008324,0.04286496713757515,0.05929091200232506,-0.00791127048...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 5: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 5, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 5: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (6, 'all-MiniLM-L6-v2', 384, 
[-0.00692242756485939,0.039994243532419205,0.11044203490018845,-0.056049950420856476,0.04244772717356682,0.04226513206958771,-0.07503949105739594,-0.012753548100590706,-0.019214976578950882,-0.0005573237431235611,0.02524395100772381,-0.03741...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 6: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 6, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 6: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (7, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.0219331...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 7: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 7, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 7: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (8, 'all-MiniLM-L6-v2', 384, 
[-0.08448513597249985,0.10392715036869049,0.027331702411174774,0.017086079344153404,0.07248993217945099,0.0070589566603302956,0.08355971425771713,0.020426131784915924,-0.021412141621112823,0.02130572870373726,0.11913938075304031,-0.044671509...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 8: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 8, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 8: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (9, 'all-MiniLM-L6-v2', 384, [-0.09339345991611481,0.013352231122553349,0.10841523855924606,0.03265758603811264,0.05745427682995796,-0.02950742468237877,-0.003043288830667734,-0.06888528168201447,0.05221330747008324,0.04286496713757515,0.05929091200232506,-0.00791127048...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 9: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 9, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 9: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (10, 'all-MiniLM-L6-v2', 384, 
[-0.006922394968569279,0.03999422863125801,0.11044202744960785,-0.056049950420856476,0.0424477756023407,0.0422651432454586,-0.07503941655158997,-0.012753593735396862,-0.019215010106563568,-0.0005573428934440017,0.025243977084755898,-0.03741...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 10: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 10, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 10: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (11, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.021933...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 11: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 11, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 11: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (12, 'all-MiniLM-L6-v2', 384, 
[-0.08448513597249985,0.10392715036869049,0.027331702411174774,0.017086079344153404,0.07248993217945099,0.0070589566603302956,0.08355971425771713,0.020426131784915924,-0.021412141621112823,0.02130572870373726,0.11913938075304031,-0.04467150...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 12: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 12, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 12: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (13, 'all-MiniLM-L6-v2', 384, [-0.09339345991611481,0.013352231122553349,0.10841523855924606,0.03265758603811264,0.05745427682995796,-0.02950742468237877,-0.003043288830667734,-0.06888528168201447,0.05221330747008324,0.04286496713757515,0.05929091200232506,-0.0079112704...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 13: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 13, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 13: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (14, 'all-MiniLM-L6-v2', 384, 
[-0.006922394968569279,0.03999422863125801,0.11044202744960785,-0.056049950420856476,0.0424477756023407,0.0422651432454586,-0.07503941655158997,-0.012753593735396862,-0.019215010106563568,-0.0005573428934440017,0.025243977084755898,-0.03741...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 14: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 14, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 14: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (15, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.021933...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 15: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 15, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 15: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (16, 'all-MiniLM-L6-v2', 384, 
[-0.08448513597249985,0.10392715036869049,0.027331702411174774,0.017086079344153404,0.07248993217945099,0.0070589566603302956,0.08355971425771713,0.020426131784915924,-0.021412141621112823,0.02130572870373726,0.11913938075304031,-0.04467150...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 16: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 16, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 16: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (17, 'all-MiniLM-L6-v2', 384, [-0.09339345991611481,0.013352231122553349,0.10841523855924606,0.03265758603811264,0.05745427682995796,-0.02950742468237877,-0.003043288830667734,-0.06888528168201447,0.05221330747008324,0.04286496713757515,0.05929091200232506,-0.0079112704...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 17: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 17, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 17: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (18, 'all-MiniLM-L6-v2', 384, 
[-0.00692242756485939,0.039994243532419205,0.11044203490018845,-0.056049950420856476,0.04244772717356682,0.04226513206958771,-0.07503949105739594,-0.012753548100590706,-0.019214976578950882,-0.0005573237431235611,0.02524395100772381,-0.0374...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 18: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 18, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 18: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (19, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.021933...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 19: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 19, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 19: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (20, 'all-MiniLM-L6-v2', 384, 
[-0.08448513597249985,0.10392715036869049,0.027331702411174774,0.017086079344153404,0.07248993217945099,0.0070589566603302956,0.08355971425771713,0.020426131784915924,-0.021412141621112823,0.02130572870373726,0.11913938075304031,-0.04467150...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 20: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 20, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 20: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (21, 'all-MiniLM-L6-v2', 384, [-0.09339345991611481,0.013352231122553349,0.10841523855924606,0.03265758603811264,0.05745427682995796,-0.02950742468237877,-0.003043288830667734,-0.06888528168201447,0.05221330747008324,0.04286496713757515,0.05929091200232506,-0.0079112704...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 21: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 21, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 21: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (22, 'all-MiniLM-L6-v2', 384, 
[-0.00692242756485939,0.039994243532419205,0.11044203490018845,-0.056049950420856476,0.04244772717356682,0.04226513206958771,-0.07503949105739594,-0.012753548100590706,-0.019214976578950882,-0.0005573237431235611,0.02524395100772381,-0.0374...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 22: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 22, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 22: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (23, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.021933...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 23: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 23, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 23: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (24, 'all-MiniLM-L6-v2', 384, 
[-0.08448513597249985,0.10392715036869049,0.027331702411174774,0.017086079344153404,0.07248993217945099,0.0070589566603302956,0.08355971425771713,0.020426131784915924,-0.021412141621112823,0.02130572870373726,0.11913938075304031,-0.04467150...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 24: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 24, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 24: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (25, 'all-MiniLM-L6-v2', 384, [-0.09339345991611481,0.013352231122553349,0.10841523855924606,0.03265758603811264,0.05745427682995796,-0.02950742468237877,-0.003043288830667734,-0.06888528168201447,0.05221330747008324,0.04286496713757515,0.05929091200232506,-0.0079112704...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 25: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 25, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 25: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (26, 'all-MiniLM-L6-v2', 384, 
[-0.00692242756485939,0.039994243532419205,0.11044203490018845,-0.056049950420856476,0.04244772717356682,0.04226513206958771,-0.07503949105739594,-0.012753548100590706,-0.019214976578950882,-0.0005573237431235611,0.02524395100772381,-0.0374...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 26: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 26, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 26: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (27, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.021933...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 27: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 27, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 27: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (28, 'all-MiniLM-L6-v2', 384, 
[-0.08448513597249985,0.10392715036869049,0.027331702411174774,0.017086079344153404,0.07248993217945099,0.0070589566603302956,0.08355971425771713,0.020426131784915924,-0.021412141621112823,0.02130572870373726,0.11913938075304031,-0.04467150...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 28: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 28, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 28: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (29, 'all-MiniLM-L6-v2', 384, [-0.09339345991611481,0.013352231122553349,0.10841523855924606,0.03265758603811264,0.05745427682995796,-0.02950742468237877,-0.003043288830667734,-0.06888528168201447,0.05221330747008324,0.04286496713757515,0.05929091200232506,-0.0079112704...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 29: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 29, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 29: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (30, 'all-MiniLM-L6-v2', 384, 
[-0.00692242756485939,0.039994243532419205,0.11044203490018845,-0.056049950420856476,0.04244772717356682,0.04226513206958771,-0.07503949105739594,-0.012753548100590706,-0.019214976578950882,-0.0005573237431235611,0.02524395100772381,-0.0374...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 30: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 30, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 30: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (31, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.021933...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 31: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 31, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 31: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (32, 'all-MiniLM-L6-v2', 384, 
[-0.08448513597249985,0.10392715036869049,0.027331702411174774,0.017086079344153404,0.07248993217945099,0.0070589566603302956,0.08355971425771713,0.020426131784915924,-0.021412141621112823,0.02130572870373726,0.11913938075304031,-0.04467150...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 32: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 32, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 32: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (33, 'all-MiniLM-L6-v2', 384, [-0.09339345991611481,0.013352231122553349,0.10841523855924606,0.03265758603811264,0.05745427682995796,-0.02950742468237877,-0.003043288830667734,-0.06888528168201447,0.05221330747008324,0.04286496713757515,0.05929091200232506,-0.0079112704...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 33: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 33, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 33: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (34, 'all-MiniLM-L6-v2', 384, 
[-0.00692242756485939,0.039994243532419205,0.11044203490018845,-0.056049950420856476,0.04244772717356682,0.04226513206958771,-0.07503949105739594,-0.012753548100590706,-0.019214976578950882,-0.0005573237431235611,0.02524395100772381,-0.0374...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 34: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 34, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 34: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (35, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.021933...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 35: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 35, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 35: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (36, 'all-MiniLM-L6-v2', 384, 
[-0.08448513597249985,0.10392715036869049,0.027331702411174774,0.017086079344153404,0.07248993217945099,0.0070589566603302956,0.08355971425771713,0.020426131784915924,-0.021412141621112823,0.02130572870373726,0.11913938075304031,-0.04467150...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 36: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 36, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 36: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (37, 'all-MiniLM-L6-v2', 384, [-0.09339345991611481,0.013352231122553349,0.10841523855924606,0.03265758603811264,0.05745427682995796,-0.02950742468237877,-0.003043288830667734,-0.06888528168201447,0.05221330747008324,0.04286496713757515,0.05929091200232506,-0.0079112704...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 37: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 37, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 37: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (38, 'all-MiniLM-L6-v2', 384, 
[-0.00692242756485939,0.039994243532419205,0.11044203490018845,-0.056049950420856476,0.04244772717356682,0.04226513206958771,-0.07503949105739594,-0.012753548100590706,-0.019214976578950882,-0.0005573237431235611,0.02524395100772381,-0.0374...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 38: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 38, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 38: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (39, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.021933...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 39: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 39, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 39: 1\n", + "\n", + "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (40, 'all-MiniLM-L6-v2', 384, 
[-0.0748591423034668,0.10814148932695389,0.006048132665455341,0.005083549302071333,0.08960696309804916,0.002078823745250702,0.08908689767122269,0.024222850799560547,-0.009931715205311775,0.02776029147207737,0.10587755590677261,-0.0518972128...\n", + "\n", + "[debug] verify embeddings COUNT for chunk 40: 0\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[warning] native insert wrote 0 rows for chunk 40, falling back to JSON.\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "\n", + "[debug] verify embeddings_json COUNT for chunk 40: 1\n", + "\n", + "[debug] COUNT embeddings raw response: '
COUNT(*)
0
'...\n", + "\n", + "[debug] COUNT embeddings_json raw response: '
COUNT(*)
40
'...\n", + "\n", + "[debug] VERSION raw response: '
VERSION()
11.8.3-MariaDB-ubu2404
'...\n", + "\n", + "Ingest complete. documents=1 chunks_total=40 embeddings_written=40\n", + "\n", + "Notes:\n", + " - embedding model used: all-MiniLM-L6-v2 (dim=384)\n", + " - Native VECTOR column used only when compatible.\n", + "\n" + ] + } + ], + "source": [ + "%maria_ingest doc_id=search_test_doc title=\"Hybrid Search Test\" text_file=\"./test.docx\"" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4ba9b623-929e-46f0-9580-93d86a226670", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] running hybrid search for query (len=19): how to get a refund\n", + "\n", + "chunk_id\tchunk_text...\tscore\tvec_sim\tbm25\tdoc_id\n", + "1\tOur store strives to deliver exceptional value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving th...\t0.819408\t0.484022\t0.209054\tsearch_test_doc\n", + "5\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "9\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "13\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "17\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. 
Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "21\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "25\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "29\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "\n" + ] + } + ], + "source": [ + "%maria_search query=\"how to get a refund\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "96166ac5-627d-4bf4-a91c-370bc2df3dbd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "drop table documents;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bdee425-e3ec-493d-83d6-37aa5fa49db7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "drop table chunks;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19c330dd-4031-4b51-b063-925cfe7cda96", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "select * from embeddings;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5fc7507-8871-4190-ba0e-58fc10496d7c", + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/html": [ + "
iddoc_idchunk_indexchunk_textchunk_metacreated_at
1search_test_doc0Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-29 07:41:20
2search_test_doc1LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-29 07:41:20
3search_test_doc2nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-29 07:41:20
4search_test_doc3carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.{}2025-10-29 07:41:20
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "select * from chunks;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acdb1c5b-87c1-445c-afb5-d9204cf07c65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Tables_in_test
chunks
documents
embeddings
employees
magic_metadata
models_store
sample_sales
saved_models
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "show tables;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d13a0a43-a023-4682-b169-6e61c9b41a35", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
embeddings_count
0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "SELECT COUNT(*) AS embeddings_count FROM embeddings;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c147c1e9-3243-43de-b035-6901dcf09caf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "drop table embeddings;" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4e3f9dd8-33e1-4631-88e6-30cbce31e7e0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] RAG query received (len=26): How do I request a refund?\n", + "\n", + "\n", + "=== ANSWER ===\n", + "\n", + "Customers can request a refund within 30 days of receiving their item. The product must be unused, in its original packaging, and include proof of purchase. Digital products are non-refundable [DOCID::search_test_doc::chunk_0].\n", + "\n", + "SOURCES:\n", + "search_test_doc::chunk_0\n", + "\n", + "\n", + "=== SOURCES (top-K) ===\n", + "\n", + "- search_test_doc :: chunk_0 (score=0.8153, vec_sim=0.4723, bm25=0.2247)\n", + "...stomers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "\n", + "- search_test_doc :: chunk_4 (score=0.7943, vec_sim=0.4123, bm25=0.2247)\n", + " Refunds are processed within 5–7 bus...0 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "\n", + "- search_test_doc :: chunk_8 (score=0.7943, vec_sim=0.4123, bm25=0.2247)\n", + " Refunds are processed within 5–7 bus...0 days of receiving their item. 
Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "\n", + "- search_test_doc :: chunk_12 (score=0.7943, vec_sim=0.4123, bm25=0.2247)\n", + " Refunds are processed within 5–7 bus...0 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "\n", + "- search_test_doc :: chunk_16 (score=0.7943, vec_sim=0.4123, bm25=0.2247)\n", + " Refunds are processed within 5–7 bus...0 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "\n", + "- search_test_doc :: chunk_20 (score=0.7943, vec_sim=0.4123, bm25=0.2247)\n", + " Refunds are processed within 5–7 bus...0 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "\n" + ] + } + ], + "source": [ + "%maria_rag_query query=\"How do I request a refund?\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de68a877-5b52-4727-9606-4439152c4506", + "metadata": {}, "outputs": [], "source": [] } diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/maria_ingest.py b/mariadb_kernel/maria_magics/ml_commands/model_training/maria_ingest.py index 4eacab9..fb16795 100644 --- a/mariadb_kernel/maria_magics/ml_commands/model_training/maria_ingest.py +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/maria_ingest.py @@ -3,6 +3,9 @@ import json import math import logging +import os +import io +import re import numpy as np from distutils import util from mariadb_kernel.maria_magics.maria_magic import MariaMagic @@ -16,6 +19,20 @@ except Exception: _ST_AVAILABLE = False +# optional file extractors +_PYPDF2_AVAILABLE = False +_PYDOCX_AVAILABLE = False +try: + import PyPDF2 + _PYPDF2_AVAILABLE = True +except Exception: + _PYPDF2_AVAILABLE = False +try: + import docx + _PYDOCX_AVAILABLE = True +except Exception: + _PYDOCX_AVAILABLE = False + # IPython history 
fallback try: from IPython import get_ipython @@ -25,15 +42,13 @@ class MariaIngest(MariaMagic): """ - Cell magic to ingest text documents into MariaDB, chunk them, and store embeddings - directly in a native VECTOR(384) column. - - Usage (cell magic): - %%maria_ingest doc_id=DOC1 title="My Doc" chunk_size=800 overlap=100 text="My document text here" - + Ingest text documents into MariaDB, chunk them, and store embeddings. - The magic still supports passing the document body in the cell. If `text` is present - in the magic args it will be used in preference to the cell body. + Behavior: + - Accepts text via `text=...` arg, cell body, or `text_file=...` path. + - Uses native VECTOR insert when server VECTOR dim matches embedding dim. + - If server VECTOR dim differs or native insert fails, falls back to embeddings_json (JSON). + - Verifies inserts by SELECT COUNT(*) for chunk_id; falls back automatically if verification fails. """ def __init__(self, args=""): self.args = args @@ -46,8 +61,9 @@ def name(self): return "maria_ingest" def help(self): - return "Ingest a document: chunk -> store -> embeddings (model fixed to all-MiniLM-L6-v2). Accepts `text=...` to pass the document content in the magic args." + return "Ingest docs -> chunk -> embeddings. Uses native VECTOR when compatible; otherwise falls back to JSON." + # ---- utilities ---- def _str_to_obj(self, s): try: return int(s) @@ -70,31 +86,21 @@ def _str_to_obj(self, s): return s def parse_args(self, input_obj): - """ - Accept either: - - dict (already parsed) - - string (key=value tokens) - Returns a dict of parsed args. 
- """ if input_obj is None: return {} - # already a dict (kernel may pass args as dict) if isinstance(input_obj, dict): return input_obj - # if it's not a str, try to convert if not isinstance(input_obj, str): try: return dict(input_obj) except Exception: return {} - input_str = input_obj.strip() if input_str == "": return {} try: pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) - except Exception as e: - # fallback: naive split on spaces for simple cases + except Exception: pairs = {} for token in input_str.split(): if "=" in token: @@ -117,7 +123,6 @@ def _simple_chunk(self, text: str, chunk_size: int, overlap: int): t = text.strip() if len(t) <= chunk_size: return [t] - chunks = [] start = 0 L = len(t) @@ -137,7 +142,6 @@ def _simple_chunk(self, text: str, chunk_size: int, overlap: int): if end >= L: break start = max(0, end - overlap) - if not chunks and t: chunks = [t] return chunks @@ -146,7 +150,6 @@ def _embed_batch(self, texts, dim=384): model_name = "all-MiniLM-L6-v2" if len(texts) == 0: return np.zeros((0, dim), dtype=np.float32) - if _ST_AVAILABLE: try: st = SentenceTransformer(model_name) @@ -155,7 +158,8 @@ def _embed_batch(self, texts, dim=384): if embs.ndim == 1: embs = np.expand_dims(embs, 0) if embs.shape[1] != dim: - self.log.warning("Embedding dim mismatch: model returned %d, expected %d. Adjusting.", embs.shape[1], dim) + self.log.warning("Embedding dim mismatch: model returned %d, expected %d. 
Adjusting.", + embs.shape[1], dim) if embs.shape[1] > dim: embs = embs[:, :dim].astype(np.float32) else: @@ -164,7 +168,6 @@ def _embed_batch(self, texts, dim=384): return embs except Exception as e: self.log.exception("sentence-transformers failed, falling back to deterministic embeddings: %s", e) - rng = np.random.RandomState(12345) embs = rng.normal(size=(len(texts), dim)).astype(np.float32) norms = np.linalg.norm(embs, axis=1, keepdims=True) @@ -183,7 +186,6 @@ def _parse_single_result(self, html): return df.iloc[0, 0] except Exception: try: - import re m = re.search(r"]*>(.*?)", html, flags=re.S) if m: return m.group(1).strip() @@ -191,49 +193,101 @@ def _parse_single_result(self, html): pass return None - def _get_cell_text_fallback(self): - # fallback to IPython history if needed + def _read_file_content(self, path: str): + warnings = [] + if not path: + return "", warnings + try: + p = os.path.expanduser(os.path.expandvars(path)) + if not os.path.isabs(p): + p = os.path.abspath(p) + if not os.path.exists(p): + warnings.append(f"file not found: {p}") + return "", warnings + _, ext = os.path.splitext(p.lower()) + if ext in ('.txt', '.md', '.text', '.json', '.ndjson'): + with io.open(p, 'r', encoding='utf-8', errors='replace') as fh: + return fh.read(), warnings + if ext == '.pdf': + if _PYPDF2_AVAILABLE: + try: + text_parts = [] + with open(p, 'rb') as fh: + reader = PyPDF2.PdfReader(fh) + for page in reader.pages: + try: + text_parts.append(page.extract_text() or '') + except Exception: + pass + return ''.join(text_parts), warnings + except Exception as e: + warnings.append(f"PyPDF2 failed to extract PDF text: {e}") + else: + warnings.append("PyPDF2 not available; cannot extract PDF text.") + return "", warnings + if ext in ('.docx',): + if _PYDOCX_AVAILABLE: + try: + doc = docx.Document(p) + paragraphs = [pr.text for pr in doc.paragraphs] + return '\n'.join(paragraphs), warnings + except Exception as e: + warnings.append(f"python-docx failed to extract 
docx: {e}") + else: + warnings.append("python-docx not available; cannot extract docx text.") + return "", warnings + try: + with io.open(p, 'r', encoding='utf-8', errors='replace') as fh: + return fh.read(), warnings + except Exception: + try: + with io.open(p, 'r', encoding='latin-1', errors='replace') as fh: + return fh.read(), warnings + except Exception as e: + warnings.append(f"Failed to read file: {e}") + return "", warnings + except Exception as e: + return "", [str(e)] + + def _get_existing_vector_dim(self, mariadb_client, dbname): + """ + If an embeddings table exists, parse SHOW CREATE TABLE to extract the VECTOR(...) dimension. + Returns int dimension if found, otherwise None. + """ try: - if get_ipython is None: - return "" - ip = get_ipython() - if not ip: - return "" - ns = ip.user_ns - if ns and "In" in ns: - hist = ns["In"] - if isinstance(hist, (list, tuple)) and len(hist) > 0: - for v in reversed(hist): - if isinstance(v, str) and v.strip(): - return v - hm = getattr(ip, "history_manager", None) - if hm: + resp = mariadb_client.run_statement("SHOW CREATE TABLE embeddings;") + if not resp: + return None + # try to parse HTML first + txt = str(resp) + m = re.search(r"embedding_vector\s+vector\((\d+)\)", txt, flags=re.I) + if m: + try: + return int(m.group(1)) + except Exception: + return None + # fallback: plain text search + m2 = re.search(r"vector\((\d+)\)", txt, flags=re.I) + if m2: try: - entries = list(hm.get_tail(1, include_latest=True)) - if entries: - src = entries[-1][2] - if isinstance(src, str) and src.strip(): - return src + return int(m2.group(1)) except Exception: - pass + return None except Exception: pass - return "" + return None + # ---- main execution ---- def execute(self, kernel, data): - """Main entry point called by the MariaDB Jupyter kernel.""" - # --- Extract cell content robustly --- cell_text = "" try: - # Case 1: standard MariaDB kernel -> {"cell": {"args": ..., "body": ...}} if isinstance(data, dict): if "cell" in data 
and isinstance(data["cell"], dict): if "body" in data["cell"] and isinstance(data["cell"]["body"], str): cell_text = data["cell"]["body"] elif "code" in data["cell"] and isinstance(data["cell"]["code"], str): cell_text = data["cell"]["code"] - # Case 2: other kernels elif any(k in data for k in ("code", "content", "message", "data")): for k in ("code", "content", "message", "data"): if k in data and isinstance(data[k], str): @@ -247,7 +301,7 @@ def execute(self, kernel, data): except Exception: cell_text = "" except Exception as e: - kernel._send_message("stderr", f"[debug] could not extract cell text: {e}") + kernel._send_message("stderr", f"[debug] could not extract cell text: {e}\n") cell_text = "" if cell_text: @@ -256,29 +310,43 @@ def execute(self, kernel, data): preview = cell_text[:80].replace("\n", " ") + ("..." if len(cell_text) > 80 else "") kernel._send_message("stdout", f"[debug] stored content length={len(cell_text)} preview={preview}\n") - - # --- Parse arguments (key=value pairs) --- + # --- Parse arguments --- try: args = self.parse_args(self.args) except Exception as e: - kernel._send_message("stderr", f"Error parsing arguments: {e}") + kernel._send_message("stderr", f"Error parsing arguments: {e}\n") return - # If the user provided a `text` argument, prefer it over the cell body. 
+ # text arg or file arg preference provided_text = args.get("text") if isinstance(args, dict) else None + file_arg = None + for k in ("text_file", "file", "path"): + if isinstance(args, dict) and args.get(k): + file_arg = args.get(k) + break + if isinstance(provided_text, str) and provided_text.strip(): cell_text = provided_text kernel._send_message("stdout", f"[debug] using text from args (len={len(cell_text)})\n") + elif file_arg: + file_contents, warnings = self._read_file_content(file_arg) + for w in warnings: + kernel._send_message("stderr", f"[warning] {w}\n") + if file_contents: + cell_text = file_contents + kernel._send_message("stdout", f"[debug] using file content from {file_arg} (len={len(cell_text)})\n") + else: + kernel._send_message("stderr", f"Failed to read file or file contained no text: {file_arg}\n") - # metadata + # metadata and settings doc_id = args.get("doc_id") or f"doc_{int(np.floor(np.random.random()*1e9))}" title = args.get("title") or "" chunk_size = int(args.get("chunk_size", 800) or 800) overlap = int(args.get("overlap", 100) or 100) - dim = 384 + embedding_dim = 384 # expected embedding dim from model metadata = args.get("metadata", {}) or {} - # build docs list + # build docs docs_to_ingest = [] maybe_json = (cell_text or "").strip() try: @@ -321,25 +389,32 @@ def execute(self, kernel, data): "metadata": metadata }) - # db client + docs_to_ingest = [d for d in docs_to_ingest if (d.get("content") or "").strip()] + if not docs_to_ingest: + kernel._send_message("stderr", "No non-empty documents to ingest; aborting.\n") + return + + # get mariadb client mariadb_client = getattr(kernel, "mariadb_client", None) if mariadb_client is None: - kernel._send_message("stderr", "No mariadb_client available on kernel (can't run ingestion).") + kernel._send_message("stderr", "No mariadb_client available on kernel (can't run ingestion).\n") return - # determine current DB + # determine db try: db_name_html = mariadb_client.run_statement("SELECT 
DATABASE();") dbname = self._parse_single_result(db_name_html) or "" + kernel._send_message("stdout", f"[debug] database detection raw response: {repr(db_name_html)[:400]}...\n") + kernel._send_message("stdout", f"[debug] using database: {dbname}\n") except Exception as e: - kernel._send_message("stderr", f"Failed to query current database: {e}") + kernel._send_message("stderr", f"Failed to query current database: {e}\n") return if not dbname: - kernel._send_message("stderr", "No current database selected (use `USE ` before running the magic).") + kernel._send_message("stderr", "No current database selected (use `USE ` before running the magic).\n") return - # create tables with VECTOR(384) + # create tables: documents, chunks; embeddings handled carefully try: mariadb_client.run_statement( f""" @@ -353,7 +428,6 @@ def execute(self, kernel, data): ) ENGINE=InnoDB; """ ) - mariadb_client.run_statement( f""" CREATE TABLE IF NOT EXISTS `{dbname}`.`chunks` ( @@ -368,37 +442,58 @@ def execute(self, kernel, data): ) ENGINE=InnoDB; """ ) + # create embeddings table if missing — keep existing definition if present + try: + mariadb_client.run_statement( + f""" + CREATE TABLE IF NOT EXISTS `{dbname}`.`embeddings` ( + id BIGINT AUTO_INCREMENT PRIMARY KEY, + chunk_id BIGINT UNIQUE, + model VARCHAR(128), + dim INT, + embedding_vector VECTOR({embedding_dim}), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) ENGINE=InnoDB; + """ + ) + except Exception: + # tolerate create failure and continue (we'll detect existing table schema) + pass + except Exception as e: + kernel._send_message("stderr", f"DDL failed: {e}\n") + return + + # detect existing VECTOR dimension (if any) + existing_vec_dim = self._get_existing_vector_dim(mariadb_client, dbname) + use_native_vector = True + if existing_vec_dim is None: + # no existing vector column found or couldn't parse; assume native insert possible with our requested dim + use_native_vector = True + kernel._send_message("stdout", "[debug] 
no existing vector dim detected; will attempt native VECTOR insert.\n") + else: + if existing_vec_dim != embedding_dim: + use_native_vector = False + kernel._send_message("stderr", f"[warning] embeddings.embedding_vector exists with dim={existing_vec_dim}; ingest embedding_dim={embedding_dim}. Native VECTOR insert will be skipped and fallback to embeddings_json will be used.\n") + else: + kernel._send_message("stdout", f"[debug] embeddings.embedding_vector dim matches expected ({embedding_dim}); will use native VECTOR inserts.\n") - # native VECTOR column + # ensure embeddings_json exists (fallback) + try: mariadb_client.run_statement( f""" - CREATE TABLE IF NOT EXISTS `{dbname}`.`embeddings` ( + CREATE TABLE IF NOT EXISTS `{dbname}`.`embeddings_json` ( id BIGINT AUTO_INCREMENT PRIMARY KEY, chunk_id BIGINT UNIQUE, model VARCHAR(128), dim INT, - embedding_vector VECTOR({dim}), + embedding_json JSON, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ) ENGINE=InnoDB; """ ) - - # best-effort ANN index - try: - mariadb_client.run_statement( - f"CREATE INDEX IF NOT EXISTS idx_embeddings_vector ON `{dbname}`.`embeddings` (embedding_vector) USING ANN;" - ) - except Exception: - try: - mariadb_client.run_statement( - f"CREATE INDEX idx_embeddings_vector ON `{dbname}`.`embeddings` (embedding_vector) USING ANN WITH (distance='cosine');" - ) - except Exception as e_idx: - self.log.debug("ANN index creation skipped/failed (ok): %s", e_idx) - - except Exception as e: - kernel._send_message("stderr", f"DDL failed: {e}") - return + except Exception: + # nonfatal; we will try to create later when needed + pass # ingest loop total_chunks = 0 @@ -411,28 +506,29 @@ def execute(self, kernel, data): # insert document row try: - mariadb_client.run_statement( + res = mariadb_client.run_statement( f""" INSERT INTO `{dbname}`.`documents` (doc_id, title, content, metadata) VALUES ({self._sql_escape(d_doc_id)}, {self._sql_escape(d_title)}, {self._sql_escape(d_content)}, 
{self._sql_escape(json.dumps(d_meta))}) ON DUPLICATE KEY UPDATE title=VALUES(title), content=VALUES(content), metadata=VALUES(metadata); """ ) + kernel._send_message("stdout", f"[debug] INSERT documents raw response: {repr(res)[:400]}...\n") except Exception as e: - kernel._send_message("stderr", f"Failed to insert document {d_doc_id}: {e}") + kernel._send_message("stderr", f"Failed to insert document {d_doc_id}: {e}\n") continue - # debug: fetch stored content + # verify stored content try: res_html = mariadb_client.run_statement( f"SELECT content FROM `{dbname}`.`documents` WHERE doc_id = {self._sql_escape(d_doc_id)} LIMIT 1;" ) stored_content = self._parse_single_result(res_html) or "" - kernel._send_message("stdout", f"[debug] stored content length={len(stored_content)}") + kernel._send_message("stdout", f"[debug] stored content length={len(stored_content)}\n") if d_content and not stored_content: - kernel._send_message("stderr", "[warning] document content inserted into DB appears empty (possible client/encoding issue).") + kernel._send_message("stderr", "[warning] document content inserted into DB appears empty (possible client/encoding issue).\n") except Exception as e: - kernel._send_message("stderr", f"Warning: could not verify stored document content: {e}") + kernel._send_message("stderr", f"Warning: could not verify stored document content: {e}\n") # chunk chunks = self._simple_chunk(d_content, chunk_size, overlap) @@ -444,24 +540,23 @@ def execute(self, kernel, data): inserted_chunk_ids = [] for idx, chunk_text in enumerate(chunks): try: - mariadb_client.run_statement( + res = mariadb_client.run_statement( f""" INSERT INTO `{dbname}`.`chunks` (doc_id, chunk_index, chunk_text, chunk_meta) VALUES ({self._sql_escape(d_doc_id)}, {idx}, {self._sql_escape(chunk_text)}, {self._sql_escape(json.dumps({}))}); """ ) - # LAST_INSERT_ID + kernel._send_message("stdout", f"[debug] INSERT chunk idx={idx} raw response: {repr(res)[:400]}...\n") + # get last insert id 
(best-effort) try: last_html = mariadb_client.run_statement("SELECT LAST_INSERT_ID();") last_val = self._parse_single_result(last_html) last_id = int(last_val) if last_val is not None else None except Exception: last_id = None - if last_id is not None: inserted_chunk_ids.append((idx, last_id)) else: - # fallback lookup try: sel_html = mariadb_client.run_statement( f"SELECT id FROM `{dbname}`.`chunks` WHERE doc_id = {self._sql_escape(d_doc_id)} AND chunk_index = {idx} LIMIT 1;" @@ -471,31 +566,13 @@ def execute(self, kernel, data): except Exception: inserted_chunk_ids.append((idx, None)) except Exception as e: - kernel._send_message("stderr", f"Failed to insert chunk {idx} for {d_doc_id}: {e}") + kernel._send_message("stderr", f"Failed to insert chunk {idx} for {d_doc_id}: {e}\n") inserted_chunk_ids.append((idx, None)) continue - # diagnostics if nothing inserted - if len(inserted_chunk_ids) == 0 and chunks: - kernel._send_message("stderr", f"[debug] no chunk ids collected for doc {d_doc_id}; attempting to read existing chunks for diagnostic.") - try: - full_sel = mariadb_client.run_statement( - f"SELECT id, chunk_index FROM `{dbname}`.`chunks` WHERE doc_id = {self._sql_escape(d_doc_id)} ORDER BY chunk_index;" - ) - # best-effort parse - import pandas as _pd - try: - df = _pd.read_html(full_sel)[0] - tmp_map = {int(r["chunk_index"]): int(r["id"]) for _, r in df.iterrows()} - inserted_chunk_ids = [(i, tmp_map.get(i)) for i in range(len(chunks))] - except Exception: - pass - except Exception: - pass - - # embeddings + # embeddings: compute and insert (native if allowed, else JSON) if chunks: - embs = self._embed_batch(chunks, dim) + embs = self._embed_batch(chunks, embedding_dim) norms = np.linalg.norm(embs, axis=1, keepdims=True) norms[norms == 0] = 1.0 embs_norm = (embs / norms).astype(np.float32) @@ -503,26 +580,142 @@ def execute(self, kernel, data): for (i, chunk_db_id), vec in zip(inserted_chunk_ids, embs_norm): if chunk_db_id is None: self.log.debug("No db 
chunk id for doc %s chunk %d — skipping embedding store", d_doc_id, i) - kernel._send_message("stderr", f"[debug] no chunk id for doc {d_doc_id} chunk {i}; embedding skipped.") + kernel._send_message("stderr", f"[debug] no chunk id for doc {d_doc_id} chunk {i}; embedding skipped.\n") continue + vec_list = [float(v) for v in vec.tolist()] vec_literal = "[" + ",".join(repr(x) for x in vec_list) + "]" + + # If server vector dim mismatches, skip native insert + if not use_native_vector: + try: + emb_json_literal = self._sql_escape(json.dumps(vec_list)) + res_json = mariadb_client.run_statement( + f""" + INSERT INTO `{dbname}`.`embeddings_json` (chunk_id, model, dim, embedding_json) + VALUES ({chunk_db_id}, {self._sql_escape('all-MiniLM-L6-v2')}, {embedding_dim}, {emb_json_literal}) + ON DUPLICATE KEY UPDATE model=VALUES(model), dim=VALUES(dim), embedding_json=VALUES(embedding_json); + """ + ) + kernel._send_message("stdout", f"[debug] fallback INSERT embeddings_json raw response: {repr(res_json)[:400]}...\n") + # verify + try: + verify_json = mariadb_client.run_statement( + f"SELECT COUNT(*) FROM `{dbname}`.`embeddings_json` WHERE chunk_id = {chunk_db_id};" + ) + cnt = self._parse_single_result(verify_json) + kernel._send_message("stdout", f"[debug] verify embeddings_json COUNT for chunk {chunk_db_id}: {cnt}\n") + if cnt and int(cnt) > 0: + total_emb_rows += 1 + kernel._send_message("stdout", f"[debug] fallback stored for chunk {chunk_db_id}\n") + else: + kernel._send_message("stderr", f"[error] fallback JSON insert reported 0 rows for chunk {chunk_db_id}\n") + continue + except Exception as e_verify_json: + kernel._send_message("stderr", f"[warning] verify embeddings_json select failed: {e_verify_json}\n") + continue + except Exception as e_json: + kernel._send_message("stderr", f"Fallback embedding storage failed for chunk_id={chunk_db_id}: {e_json}\n") + self.log.debug("Fallback JSON insert failed for chunk %s: %s", chunk_db_id, e_json) + continue + + # Attempt 
native VECTOR insert (server dim matched) try: - mariadb_client.run_statement( + res_native = mariadb_client.run_statement( f""" INSERT INTO `{dbname}`.`embeddings` (chunk_id, model, dim, embedding_vector) - VALUES ({chunk_db_id}, {self._sql_escape('all-MiniLM-L6-v2')}, {dim}, {vec_literal}) + VALUES ({chunk_db_id}, {self._sql_escape('all-MiniLM-L6-v2')}, {embedding_dim}, {vec_literal}) ON DUPLICATE KEY UPDATE model=VALUES(model), dim=VALUES(dim), embedding_vector=VALUES(embedding_vector); """ ) - total_emb_rows += 1 - except Exception as e: - kernel._send_message("stderr", f"Failed to insert embedding for chunk_id={chunk_db_id}: {e}") - continue + kernel._send_message("stdout", f"[debug] native INSERT embeddings raw response: {repr(res_native)[:400]}...\n") + except Exception as e_native: + kernel._send_message("stderr", f"Failed to insert embedding (native VECTOR) for chunk_id={chunk_db_id}: {e_native}\n") + self.log.debug("Native VECTOR insert failed for chunk %s: %s", chunk_db_id, e_native) + # try fallback JSON + try: + emb_json_literal = self._sql_escape(json.dumps(vec_list)) + res_json = mariadb_client.run_statement( + f""" + INSERT INTO `{dbname}`.`embeddings_json` (chunk_id, model, dim, embedding_json) + VALUES ({chunk_db_id}, {self._sql_escape('all-MiniLM-L6-v2')}, {embedding_dim}, {emb_json_literal}) + ON DUPLICATE KEY UPDATE model=VALUES(model), dim=VALUES(dim), embedding_json=VALUES(embedding_json); + """ + ) + kernel._send_message("stdout", f"[debug] fallback INSERT embeddings_json raw response: {repr(res_json)[:400]}...\n") + try: + verify_json = mariadb_client.run_statement( + f"SELECT COUNT(*) FROM `{dbname}`.`embeddings_json` WHERE chunk_id = {chunk_db_id};" + ) + cnt = self._parse_single_result(verify_json) + kernel._send_message("stdout", f"[debug] verify embeddings_json COUNT for chunk {chunk_db_id}: {cnt}\n") + if cnt and int(cnt) > 0: + total_emb_rows += 1 + kernel._send_message("stdout", f"[debug] fallback stored for chunk {chunk_db_id}\n") 
+ else: + kernel._send_message("stderr", f"[error] fallback JSON insert reported 0 rows for chunk {chunk_db_id}\n") + continue + except Exception as e_verify_json: + kernel._send_message("stderr", f"[warning] verify embeddings_json select failed: {e_verify_json}\n") + continue + except Exception as e_json: + kernel._send_message("stderr", f"Fallback embedding storage failed for chunk_id={chunk_db_id}: {e_json}\n") + continue + + # Verify native insert succeeded by COUNT(*) + try: + verify = mariadb_client.run_statement( + f"SELECT COUNT(*) FROM `{dbname}`.`embeddings` WHERE chunk_id = {chunk_db_id};" + ) + cnt = self._parse_single_result(verify) + kernel._send_message("stdout", f"[debug] verify embeddings COUNT for chunk {chunk_db_id}: {cnt}\n") + if cnt and int(cnt) > 0: + total_emb_rows += 1 + else: + # fallback if native wrote no rows + kernel._send_message("stderr", f"[warning] native insert wrote 0 rows for chunk {chunk_db_id}, falling back to JSON.\n") + try: + emb_json_literal = self._sql_escape(json.dumps(vec_list)) + res_json = mariadb_client.run_statement( + f""" + INSERT INTO `{dbname}`.`embeddings_json` (chunk_id, model, dim, embedding_json) + VALUES ({chunk_db_id}, {self._sql_escape('all-MiniLM-L6-v2')}, {embedding_dim}, {emb_json_literal}) + ON DUPLICATE KEY UPDATE model=VALUES(model), dim=VALUES(dim), embedding_json=VALUES(embedding_json); + """ + ) + kernel._send_message("stdout", f"[debug] fallback INSERT embeddings_json raw response: {repr(res_json)[:400]}...\n") + try: + verify_json = mariadb_client.run_statement( + f"SELECT COUNT(*) FROM `{dbname}`.`embeddings_json` WHERE chunk_id = {chunk_db_id};" + ) + cntj = self._parse_single_result(verify_json) + kernel._send_message("stdout", f"[debug] verify embeddings_json COUNT for chunk {chunk_db_id}: {cntj}\n") + if cntj and int(cntj) > 0: + total_emb_rows += 1 + except Exception as e_verify_json: + kernel._send_message("stderr", f"[warning] verify embeddings_json select failed: {e_verify_json}\n") + 
except Exception as e_json: + kernel._send_message("stderr", f"Fallback embedding storage failed for chunk_id={chunk_db_id}: {e_json}\n") + except Exception as e_verify: + kernel._send_message("stderr", f"[warning] verify select for embeddings failed: {e_verify}\n") + + # Final diagnostics: counts & version + try: + cnt_emb = mariadb_client.run_statement("SELECT COUNT(*) FROM embeddings;") + kernel._send_message("stdout", f"[debug] COUNT embeddings raw response: {repr(cnt_emb)[:400]}...\n") + except Exception as e: + kernel._send_message("stderr", f"[warning] COUNT embeddings failed: {e}\n") + try: + cnt_json = mariadb_client.run_statement("SELECT COUNT(*) FROM embeddings_json;") + kernel._send_message("stdout", f"[debug] COUNT embeddings_json raw response: {repr(cnt_json)[:400]}...\n") + except Exception: + kernel._send_message("stdout", "[debug] COUNT embeddings_json query failed or table does not exist.\n") + try: + version = mariadb_client.run_statement("SELECT VERSION();") + kernel._send_message("stdout", f"[debug] VERSION raw response: {repr(version)[:400]}...\n") + except Exception: + pass - # final kernel._send_message("stdout", f"Ingest complete. documents={len(docs_to_ingest)} chunks_total={total_chunks} embeddings_written={total_emb_rows}\n") - if total_chunks == 0: - kernel._send_message("stderr", "Warning: no chunks were created. If your document text is present in the `documents` table but chunk_text is missing, check client encoding and ensure the cell body was passed to the kernel. 
Use `SELECT content FROM documents WHERE doc_id=\"...\";` to inspect.") - kernel._send_message("stdout", "Notes:\n - embedding model used: all-MiniLM-L6-v2 (dim=384)\n - Native VECTOR column was created/used where available.\n") + kernel._send_message("stdout", "Notes:\n - embedding model used: all-MiniLM-L6-v2 (dim={})\n - Native VECTOR column used only when compatible.\n".format(embedding_dim)) return diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/maria_rag_query.py b/mariadb_kernel/maria_magics/ml_commands/model_training/maria_rag_query.py new file mode 100644 index 0000000..bee4893 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/maria_rag_query.py @@ -0,0 +1,613 @@ +# mariadb_kernel/maria_magics/maria_rag_query.py +""" +%maria_rag_query + +Single-command RAG: retrieve relevant chunks, run fusion chain (LLM via Gemini) and return answer + sources. + +Hardcoded settings: + - retriever = "hybrid" + - k = 6 + - llm_model = "gemini-2.5-flash" (hardcoded) + - prompt = "default" + - bm25_weight = 0.3 + +Usage: + %maria_rag_query query="How do I cancel my subscription?" + %maria_rag_query query="How do I cancel my subscription?" explain=true + +Notes: + - The code will attempt to use the Google GenAI Python client (google.genai). It checks + the environment variables GOOGLE_API_KEY or GENAI_API_KEY for the API key. + - If the GenAI client or API key is unavailable, the magic falls back to a local fusion chain. 
+""" + +import shlex +import json +import logging +import re +import os +import numpy as np +from distutils import util + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic + +# optional sentence-transformers +_ST_AVAILABLE = False +try: + from sentence_transformers import SentenceTransformer + _ST_AVAILABLE = True +except Exception: + _ST_AVAILABLE = False + +# optional Google GenAI (Gemini) client +_GENAI_AVAILABLE = False +try: + from google import genai + from google.genai import types + _GENAI_AVAILABLE = True +except Exception: + _GENAI_AVAILABLE = False + +class MariaRAGQuery(MariaMagic): + def __init__(self, args=""): + self.args = args + self.log = logging.getLogger("MariaRAGQuery") + + # HARDCODED SETTINGS (per your request) + self.RETRIEVER = "hybrid" + self.K = 6 + self.BM25_WEIGHT = 0.3 + self.CANDIDATE_N = 500 + self.LLM_MODEL = "gemini-2.5-flash" # using Gemini model per your snippet + self.PROMPT_NAME = "default" + self.EMBED_DIM = 384 + + def type(self): + return "Line" + + def name(self): + return "maria_rag_query" + + def help(self): + return "%maria_rag_query query=\"...\" — retrieve+fusion RAG (hardcoded settings)" + + # ---------------- Parsing helpers ---------------- + def _str_to_obj(self, s): + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + try: + return json.loads(s) + except Exception: + pass + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_obj): + if input_obj is None: + return {} + if isinstance(input_obj, dict): + return input_obj + if not isinstance(input_obj, str): + try: + return dict(input_obj) + except Exception: + return {} + input_str = input_obj.strip() + if not input_str: + return {} + try: + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + except Exception: + pairs = {} + for 
token in input_str.split(): + if "=" in token: + k, v = token.split("=", 1) + pairs[k] = v + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _sql_escape(self, s): + if s is None: + return "NULL" + if not isinstance(s, str): + return str(s) + return "'" + s.replace("'", "''") + "'" + + # ---------------- Embedding utilities ---------------- + def _embed_texts(self, texts, dim=None): + """Return normalized numpy embeddings for texts. If sentence-transformers available use it.""" + if dim is None: + dim = self.EMBED_DIM + if len(texts) == 0: + return np.zeros((0, dim), dtype=np.float32) + if _ST_AVAILABLE: + try: + st = SentenceTransformer("all-MiniLM-L6-v2") + embs = st.encode(texts, convert_to_numpy=True, show_progress_bar=False) + embs = np.array(embs, dtype=np.float32) + if embs.ndim == 1: + embs = np.expand_dims(embs, 0) + if embs.shape[1] != dim: + if embs.shape[1] > dim: + embs = embs[:, :dim].astype(np.float32) + else: + pad = np.zeros((embs.shape[0], dim - embs.shape[1]), dtype=np.float32) + embs = np.concatenate([embs, pad], axis=1) + norms = np.linalg.norm(embs, axis=1, keepdims=True) + norms[norms == 0] = 1.0 + return (embs / norms).astype(np.float32) + except Exception as e: + self.log.debug("sentence-transformers failure: %s", e) + # deterministic fallback + rng = np.random.RandomState(12345) + embs = rng.normal(size=(len(texts), dim)).astype(np.float32) + norms = np.linalg.norm(embs, axis=1, keepdims=True) + norms[norms == 0] = 1.0 + return (embs / norms).astype(np.float32) + + def _parse_html_table(self, html): + """Best-effort HTML -> list-of-dicts parser used for mariadb_client outputs.""" + if html is None: + return None + s = str(html) + rows = re.findall(r"]*>(.*?)", s, flags=re.S | re.I) + parsed = [] + header = [] + for r in rows: + ths = re.findall(r"]*>(.*?)", r, flags=re.S | re.I) + if ths and not header: + header = [re.sub(r"<[^>]+>", "", t).strip() for t in ths] + continue + tds = re.findall(r"]*>(.*?)", 
r, flags=re.S | re.I) + if not tds: + continue + cells = [re.sub(r"<[^>]+>", "", t).strip() for t in tds] + if header and len(cells) == len(header): + parsed.append(dict(zip(header, cells))) + else: + parsed.append({str(i): cells[i] if i < len(cells) else "" for i in range(len(cells))}) + return parsed if parsed else None + + def _parse_vector_literal(self, val): + if val is None: + return None + if isinstance(val, (list, tuple, np.ndarray)): + return np.array(val, dtype=np.float32) + s = str(val).strip() + if s.startswith("[") and s.endswith("]"): + try: + arr = json.loads(s) + return np.array(arr, dtype=np.float32) + except Exception: + pass + nums = re.findall(r"[-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?", s) + if not nums: + return None + try: + return np.array([float(x) for x in nums], dtype=np.float32) + except Exception: + return None + + # ---------------- Retrieval helpers ---------------- + def _bm25_prefilter(self, kernel, dbname, query_text): + mariadb_client = getattr(kernel, "mariadb_client", None) + if mariadb_client is None: + return [] + q_esc = self._sql_escape(query_text) + sql = ( + f"SELECT id AS chunk_id, doc_id, chunk_index, chunk_text, " + f"MATCH(chunk_text) AGAINST ({q_esc} IN NATURAL LANGUAGE MODE) AS bm25_score " + f"FROM `{dbname}`.`chunks` " + f"WHERE MATCH(chunk_text) AGAINST ({q_esc} IN NATURAL LANGUAGE MODE) " + f"ORDER BY bm25_score DESC LIMIT {self.CANDIDATE_N};" + ) + try: + html = mariadb_client.run_statement(sql) + rows = self._parse_html_table(html) + if not rows: + return [] + cand = [] + for r in rows: + try: + cid = int(r.get("chunk_id") or r.get("id")) + except Exception: + continue + cand.append({ + "chunk_id": cid, + "doc_id": r.get("doc_id") or "", + "chunk_index": int(r.get("chunk_index") or r.get("0") or 0), + "chunk_text": r.get("chunk_text") or "", + "bm25_score": float(r.get("bm25_score") or 0.0) + }) + return cand + except Exception as e: + self.log.debug("BM25 query failed: %s", e) + return [] + + def 
_sample_candidates(self, kernel, dbname): + mariadb_client = getattr(kernel, "mariadb_client", None) + if mariadb_client is None: + return [] + sql = f"SELECT id AS chunk_id, doc_id, chunk_index, chunk_text FROM `{dbname}`.`chunks` ORDER BY RAND() LIMIT {self.CANDIDATE_N};" + try: + html = mariadb_client.run_statement(sql) + rows = self._parse_html_table(html) + if not rows: + return [] + cand = [] + for r in rows: + try: + cid = int(r.get("chunk_id") or r.get("id")) + except Exception: + continue + cand.append({ + "chunk_id": cid, + "doc_id": r.get("doc_id") or "", + "chunk_index": int(r.get("chunk_index") or 0), + "chunk_text": r.get("chunk_text") or "" + }) + return cand + except Exception as e: + self.log.debug("Sampling failed: %s", e) + return [] + + def _fetch_embeddings_for_candidates(self, kernel, dbname, candidate_ids): + mariadb_client = getattr(kernel, "mariadb_client", None) + if mariadb_client is None: + return {} + if not candidate_ids: + return {} + + ids_sql = ",".join(str(int(x)) for x in candidate_ids) + # first attempt native embeddings join + try: + sql = ( + f"SELECT e.chunk_id, e.embedding_vector, c.chunk_text, c.doc_id, c.chunk_index " + f"FROM `{dbname}`.`embeddings` e " + f"JOIN `{dbname}`.`chunks` c ON e.chunk_id = c.id " + f"WHERE e.chunk_id IN ({ids_sql});" + ) + html = mariadb_client.run_statement(sql) + rows = self._parse_html_table(html) + emb_map = {} + if rows: + for r in rows: + try: + cid = int(r.get("chunk_id") or r.get("0")) + except Exception: + continue + emb_raw = r.get("embedding_vector") or r.get("embedding") or None + vec = self._parse_vector_literal(emb_raw) + if vec is None: + continue + norm = np.linalg.norm(vec) or 1.0 + emb_map[cid] = { + "vec": (vec / norm).astype(np.float32), + "chunk_text": r.get("chunk_text") or "", + "doc_id": r.get("doc_id") or "", + "chunk_index": int(r.get("chunk_index") or 0) + } + if emb_map: + return emb_map + except Exception as e: + self.log.debug("native embeddings fetch failed: %s", e) 
+ + # fallback to embeddings_json + try: + sql_json = ( + f"SELECT ej.chunk_id, ej.embedding_json, c.chunk_text, c.doc_id, c.chunk_index " + f"FROM `{dbname}`.`embeddings_json` ej " + f"JOIN `{dbname}`.`chunks` c ON ej.chunk_id = c.id " + f"WHERE ej.chunk_id IN ({ids_sql});" + ) + html_json = mariadb_client.run_statement(sql_json) + rows_json = self._parse_html_table(html_json) + emb_map = {} + if rows_json: + for r in rows_json: + try: + cid = int(r.get("chunk_id") or r.get("0")) + except Exception: + continue + emb_raw = r.get("embedding_json") or r.get("embedding") or None + vec = None + if emb_raw is not None: + try: + if isinstance(emb_raw, (list, tuple)): + vec = np.array(emb_raw, dtype=np.float32) + else: + vec = np.array(json.loads(emb_raw), dtype=np.float32) + except Exception: + vec = self._parse_vector_literal(emb_raw) + if vec is None: + continue + norm = np.linalg.norm(vec) or 1.0 + emb_map[cid] = { + "vec": (vec / norm).astype(np.float32), + "chunk_text": r.get("chunk_text") or "", + "doc_id": r.get("doc_id") or "", + "chunk_index": int(r.get("chunk_index") or 0) + } + return emb_map + except Exception as e: + self.log.debug("embeddings_json fetch failed: %s", e) + return {} + + # ---------------- Gemini LLM call ---------------- + def _call_gemini(self, system_prompt, user_prompt, model_name=None, max_output_tokens=400): + """ + Call Gemini using google.genai per the snippet the user provided. + Looks for API key in GOOGLE_API_KEY or GENAI_API_KEY environment variables. + Returns (text, raw_response) or (None, None) on failure. 
+ """ + if not _GENAI_AVAILABLE: + self.log.debug("google.genai not available in environment.") + return None, None + + api_key = "AIzaSyBW1n6kIu0o-W3l0-pMBOMc4nzjYfsbETg" + if not api_key: + self.log.debug("No GENAI API key found in GOOGLE_API_KEY or GENAI_API_KEY.") + return None, None + + try: + client = genai.Client(api_key=api_key) + # Build combined content: put system + user into 'contents' - simple approach + contents = system_prompt + "\n\n" + user_prompt + resp = client.models.generate_content( + model=model_name or self.LLM_MODEL, + contents=contents, + config=types.GenerateContentConfig( + max_output_tokens=max_output_tokens, + thinking_config=types.ThinkingConfig(thinking_budget=0) + ) + ) + # The user's snippet used resp.text + text = getattr(resp, "text", None) + if text is None: + # some genai client versions put result in resp.output or resp.candidates + try: + # try attribute "candidates" + if hasattr(resp, "candidates") and resp.candidates: + text = getattr(resp.candidates[0], "content", None) or getattr(resp.candidates[0], "text", None) + elif hasattr(resp, "output"): + text = str(resp.output) + else: + text = str(resp) + except Exception: + text = str(resp) + return text, resp + except Exception as e: + self.log.debug("Gemini call failed: %s", e) + return None, None + + # ---------------- Local fusion fallback ---------------- + def _fusion_chain_local(self, question, context_blocks): + """ + Local, deterministic fusion map-reduce: + - Map: pick sentences from each context containing question tokens + - Reduce: join deduplicated sentences into a compact answer + """ + q = question.lower() + q_tokens = set(re.findall(r"\w+", q)) + picked = [] + evidence = [] + debug = {"map": [], "reduce": None} + + for b in context_blocks: + text = b["chunk_text"] + sentences = re.split(r'(?<=[\.\?\!])\s+', text) + picks = [] + for s in sentences: + st = s.strip() + if not st: + continue + s_tokens = set(re.findall(r"\w+", st.lower())) + if len(q_tokens & 
s_tokens) > 0: + picks.append(st) + if not picks and sentences: + picks = [sentences[0].strip()] + picks = picks[:3] + if picks: + picked.extend(picks) + evidence.append({ + "doc_id": b["doc_id"], + "chunk_index": b["chunk_index"], + "snippet": " ".join(picks)[:400] + }) + debug["map"].append({"chunk_id": b["chunk_id"], "picked_count": len(picks)}) + + # Reduce: deduplicate and join + uniq = [] + seen = set() + for s in picked: + key = s.strip().lower() + if key not in seen: + seen.add(key) + uniq.append(s.strip()) + + if not uniq: + answer = "I couldn't find a clear answer in the retrieved documents." + else: + answer = " ".join(uniq[:8]) + if len(answer) > 800: + answer = answer[:797] + "..." + debug["reduce"] = {"picked_sentences": len(uniq)} + return answer, evidence, debug + + # ---------------- Main entry ---------------- + def execute(self, kernel, data): + # parse args and query + try: + args = self.parse_args(self.args) + except Exception as e: + kernel._send_message("stderr", f"Error parsing arguments: {e}\n") + args = {} + + query = None + if isinstance(args, dict): + query = args.get("query") or args.get("q") + if not query: + if isinstance(data, str) and data.strip(): + query = data.strip() + if not query: + kernel._send_message("stderr", "No query supplied. 
# ---------------- Main entry ----------------
def execute(self, kernel, data):
    """
    Answer a question from the ``chunks`` table of the current database.

    Steps: parse the query, detect the current database, gather candidate
    chunks (BM25 prefilter when ``self.RETRIEVER == "hybrid"``, otherwise or
    on no hits a sample), load their embeddings, rank by a weighted mix of
    normalised BM25 and vector similarity, then ask Gemini for an answer
    over the top ``self.K`` chunks. When Gemini returns nothing, the local
    extractive fallback is used. All output goes through
    ``kernel._send_message``.

    Args:
        kernel: kernel object providing ``_send_message`` and
            ``mariadb_client``.
        data: raw magic line, used as the query when no ``query=``/``q=``
            argument was given.
    """
    # parse args and query
    try:
        args = self.parse_args(self.args)
    except Exception as e:
        kernel._send_message("stderr", f"Error parsing arguments: {e}\n")
        args = {}
    if not isinstance(args, dict):
        args = {}

    query = args.get("query") or args.get("q")
    if not query and isinstance(data, str) and data.strip():
        query = data.strip()
    # NOTE(review): parse_args is not visible here; presumably it can coerce
    # values to int/bool like MariaSearch's does, which would make
    # len(query) below raise. Normalise to text first.
    query = str(query).strip() if query else ""
    if not query:
        kernel._send_message("stderr", "No query supplied. Usage: %maria_rag_query query=\"...\"\n")
        return

    explain = args.get("explain") in (True, "true", "True", 1, "1")

    kernel._send_message("stdout", f"[debug] RAG query received (len={len(query)}): {query}\n")
    mariadb_client = getattr(kernel, "mariadb_client", None)
    if mariadb_client is None:
        kernel._send_message("stderr", "No mariadb_client available on kernel (can't run retrieval).\n")
        return

    # determine DB
    try:
        db_html = mariadb_client.run_statement("SELECT DATABASE();")
        db_parsed = self._parse_html_table(db_html)
        dbname = ""
        # isinstance is checked before truthiness so a non-list table object
        # takes the regex fallback instead of raising on bool().
        if isinstance(db_parsed, list) and db_parsed:
            dbname = next(iter(db_parsed[0].values()))
        else:
            # NOTE(review): the pattern in the reviewed text read
            # r"]*>(.*?)", which always captures an empty string. Restored
            # as a first-table-cell match; confirm against the original.
            m = re.search(r"<td[^>]*>(.*?)</td>", str(db_html), flags=re.S)
            if m:
                dbname = m.group(1)
        dbname = str(dbname).strip() if dbname is not None else ""
    except Exception as e:
        kernel._send_message("stderr", f"Failed to detect current DB: {e}\n")
        return

    if not dbname:
        kernel._send_message(
            "stderr",
            "No current database selected (use `USE <database>` before running the magic).\n",
        )
        return

    # RETRIEVAL: BM25 prefilter (hybrid), then random sample as a fallback
    candidates = []
    if self.RETRIEVER == "hybrid":
        candidates = self._bm25_prefilter(kernel, dbname, query)
    if not candidates:
        candidates = self._sample_candidates(kernel, dbname)
    if not candidates:
        kernel._send_message("stderr", "No candidate chunks found (chunks table empty?).\n")
        return

    candidate_ids = [c["chunk_id"] for c in candidates if c.get("chunk_id") is not None]
    emb_map = self._fetch_embeddings_for_candidates(kernel, dbname, candidate_ids)
    if not emb_map:
        kernel._send_message("stderr", "No embeddings found for any candidate chunks.\n")
        return

    # compute query embedding consistent with the stored vector dimension
    first_vec = next(iter(emb_map.values()))["vec"]
    vec_dim = first_vec.shape[0]
    q_emb = np.asarray(self._embed_texts([query], dim=vec_dim)[0], dtype=np.float32)
    # Stored vectors are unit length and the score maps the dot product from
    # [-1, 1] to [0, 1], so the query must be unit length too. This is a
    # no-op when _embed_texts already normalises.
    q_emb = q_emb / (float(np.linalg.norm(q_emb)) or 1.0)

    # combine bm25 + vector
    scored = []
    bm25_values = [float(c.get("bm25_score", 0.0) or 0.0) for c in candidates]
    bm25_max = max(bm25_values) if bm25_values else 0.0
    for c in candidates:
        cid = c.get("chunk_id")
        if cid not in emb_map:
            continue
        emb_info = emb_map[cid]
        sim = float(np.dot(q_emb, emb_info["vec"]))
        bm25_raw = float(c.get("bm25_score", 0.0) or 0.0)
        bm25_norm = (bm25_raw / bm25_max) if bm25_max > 0 else 0.0
        combined = (self.BM25_WEIGHT * bm25_norm) + ((1.0 - self.BM25_WEIGHT) * ((sim + 1.0) / 2.0))
        scored.append({
            "chunk_id": cid,
            "doc_id": emb_info.get("doc_id"),
            "chunk_index": emb_info.get("chunk_index"),
            "chunk_text": emb_info.get("chunk_text"),
            "vec_sim": sim,
            "bm25": bm25_raw,
            "score": combined,
        })

    if not scored:
        kernel._send_message("stderr", "No scored candidates after combining BM25/vector.\n")
        return

    # top-K; shallow copies keep the context independent of `scored`
    scored.sort(key=lambda r: r["score"], reverse=True)
    context_blocks = [dict(s) for s in scored[: self.K]]

    # Build prompt / context to send to Gemini
    context_text = "".join(
        f"--- SOURCE {i + 1} [{b['doc_id']}::chunk_{b['chunk_index']}] ---\n{b['chunk_text']}\n\n"
        for i, b in enumerate(context_blocks)
    )

    system_prompt = "You are a helpful assistant that answers questions based on provided documents. When you use information from a source include a citation tag like [DOCID::chunk_X]."
    user_prompt = f"QUESTION:\n{query}\n\nCONTEXT:\n{context_text}\n\nINSTRUCTIONS:\nAnswer the question concisely, and at the end provide a 'SOURCES' section listing the doc_id and chunk_index you used.\n"

    # Try Gemini via google.genai
    llm_answer = None
    llm_raw_resp = None
    gemini_text, gemini_raw = self._call_gemini(
        system_prompt, user_prompt, model_name=self.LLM_MODEL, max_output_tokens=512
    )
    if gemini_text:
        # The reply may not be a plain string; coerce before concatenating.
        llm_answer = str(gemini_text)
        llm_raw_resp = gemini_raw

    chain_debug = None
    if not llm_answer:
        ans, evidence, chain_debug = self._fusion_chain_local(query, context_blocks)
        sources_lines = [
            f"{e['doc_id']}::chunk_{e['chunk_index']} - {e['snippet'][:200]}" for e in evidence
        ]
        sources_text = "\n".join(sources_lines) if sources_lines else "No explicit sources found."
        llm_answer = f"{ans}\n\nSOURCES:\n{sources_text}"

    # Output answer + sources
    kernel._send_message("stdout", "\n=== ANSWER ===\n")
    kernel._send_message("stdout", llm_answer + "\n\n")

    kernel._send_message("stdout", "=== SOURCES (top-K) ===\n")
    for b in context_blocks:
        snippet = (b["chunk_text"] or "").replace("\n", " ")
        if len(snippet) > 300:
            snippet = snippet[:297] + "..."
        kernel._send_message("stdout", f"- {b['doc_id']} :: chunk_{b['chunk_index']} (score={b['score']:.4f}, vec_sim={b['vec_sim']:.4f}, bm25={b['bm25']:.4f})\n  {snippet}\n")

    if explain:
        kernel._send_message("stdout", "\n=== EXPLAIN: retrieval candidates (top 20 shown) ===\n")
        for s in scored[:20]:
            kernel._send_message("stdout", f"chunk_id={s['chunk_id']} doc_id={s['doc_id']} chunk_index={s['chunk_index']} score={s['score']:.6f} vec_sim={s['vec_sim']:.6f} bm25={s['bm25']:.6f}\n")
        if chain_debug is not None:
            kernel._send_message("stdout", "\n=== EXPLAIN: chain debug ===\n")
            kernel._send_message("stdout", json.dumps(chain_debug, indent=2) + "\n")
        if llm_raw_resp is not None:
            kernel._send_message("stdout", "\n=== GEMINI RAW RESP (truncated) ===\n")
            kernel._send_message("stdout", str(llm_raw_resp)[:2000] + "\n")
+""" + +import shlex +import json +import logging +import re +import numpy as np +from distutils import util + +# optional sentence-transformers +_ST_AVAILABLE = False +try: + from sentence_transformers import SentenceTransformer + _ST_AVAILABLE = True +except Exception: + _ST_AVAILABLE = False + +# optional pandas for parsing HTML tables +_PANDAS_AVAILABLE = False +try: + import pandas as _pd + _PANDAS_AVAILABLE = True +except Exception: + _PANDAS_AVAILABLE = False + +try: + from mariadb_kernel.maria_magics.maria_magic import MariaMagic +except Exception: + # lightweight fallback if run standalone for tests + class MariaMagic: + def __init__(self, *a, **k): + pass + def type(self): return "Line" + def name(self): return "maria_search" + def help(self): return "Search (hybrid)." + + +class MariaSearch(MariaMagic): + def __init__(self, args=""): + self.args = args + self.log = logging.getLogger("MariaSearch") + + # Hardcoded (per your request) + self.MODEL_NAME = "all-MiniLM-L6-v2" + self.K = 8 + self.CANDIDATE_N = 500 + self.BM25_WEIGHT = 0.3 + + def type(self): + return "Line" + + def name(self): + return "maria_search" + + def help(self): + return "%maria_search query=\"text\" — hybrid BM25 + vector search (hardcoded model/weights)" + + # ----------------- utilities ----------------- + def _str_to_obj(self, s): + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + try: + return json.loads(s) + except Exception: + pass + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_obj): + if input_obj is None: + return {} + if isinstance(input_obj, dict): + return input_obj + if not isinstance(input_obj, str): + try: + return dict(input_obj) + except Exception: + return {} + input_str = input_obj.strip() + if input_str == "": + return {} + try: + pairs = 
dict(token.split("=", 1) for token in shlex.split(input_str)) + except Exception: + pairs = {} + for token in input_str.split(): + if "=" in token: + k, v = token.split("=", 1) + pairs[k] = v + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _sql_escape(self, s): + if s is None: + return "NULL" + if not isinstance(s, str): + return str(s) + return "'" + s.replace("'", "''") + "'" + + def _parse_html_table(self, html): + """Return a list-of-dicts or pandas.DataFrame. Best-effort fallback if pandas missing.""" + if html is None: + return None + if _PANDAS_AVAILABLE: + try: + dfs = _pd.read_html(html) + if dfs: + return dfs[0] + except Exception: + pass + # fallback simple parser -> list of dicts + try: + tbl = re.search(r"]*>(.*?)", str(html), flags=re.S | re.I) + if not tbl: + return None + rows = re.findall(r"]*>(.*?)", tbl.group(1), flags=re.S | re.I) + if not rows: + return None + headers = None + parsed = [] + for r in rows: + # find header cells + if headers is None: + ths = re.findall(r"]*>(.*?)", r, flags=re.S | re.I) + if ths: + headers = [re.sub(r"<[^>]+>", "", c).strip() for c in ths] + continue + tds = re.findall(r"]*>(.*?)", r, flags=re.S | re.I) + cells = [re.sub(r"<[^>]+>", "", c).strip() for c in tds] + if not cells: + continue + if headers and len(cells) == len(headers): + parsed.append(dict(zip(headers, cells))) + else: + parsed.append({str(i): cells[i] if i < len(cells) else "" for i in range(len(cells))}) + return parsed + except Exception: + return None + + def _is_nonempty_table(self, table): + """Return True if 'table' (DataFrame or list-of-dicts) has at least one row.""" + if table is None: + return False + if _PANDAS_AVAILABLE and hasattr(_pd, "DataFrame") and isinstance(table, _pd.DataFrame): + return not table.empty + if isinstance(table, list): + return len(table) > 0 + # other truthy checks (strings etc) considered empty for our use + return False + + def _embed_texts(self, texts, dim=384): + if 
len(texts) == 0: + return np.zeros((0, dim), dtype=np.float32) + if _ST_AVAILABLE: + try: + st = SentenceTransformer(self.MODEL_NAME) + embs = st.encode(texts, convert_to_numpy=True, show_progress_bar=False) + embs = np.array(embs, dtype=np.float32) + if embs.ndim == 1: + embs = np.expand_dims(embs, 0) + if embs.shape[1] != dim: + self.log.warning("Embedding dim mismatch: model returned %d, expected %d. Adjusting.", + embs.shape[1], dim) + if embs.shape[1] > dim: + embs = embs[:, :dim].astype(np.float32) + else: + pad = np.zeros((embs.shape[0], dim - embs.shape[1]), dtype=np.float32) + embs = np.concatenate([embs, pad], axis=1) + return embs + except Exception as e: + self.log.exception("sentence-transformers failed, falling back to deterministic embeddings: %s", e) + rng = np.random.RandomState(12345) + embs = rng.normal(size=(len(texts), dim)).astype(np.float32) + norms = np.linalg.norm(embs, axis=1, keepdims=True) + norms[norms == 0] = 1.0 + embs = embs / norms + return embs + + def _parse_vector_literal(self, val): + """Parse JSON or text vector into numpy array.""" + if val is None: + return None + if isinstance(val, (list, tuple, np.ndarray)): + try: + return np.array(val, dtype=np.float32) + except Exception: + pass + s = str(val).strip() + if s.startswith("[") and s.endswith("]"): + try: + parsed = json.loads(s) + return np.array(parsed, dtype=np.float32) + except Exception: + pass + nums = re.findall(r"[-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?", s) + if not nums: + return None + try: + arr = np.array([float(x) for x in nums], dtype=np.float32) + return arr + except Exception: + return None + + # ----------------- main ----------------- + def execute(self, kernel, data): + # parse args & query + try: + args = self.parse_args(self.args) + except Exception as e: + kernel._send_message("stderr", f"Error parsing arguments: {e}\n") + args = {} + + query = None + if isinstance(args, dict): + query = args.get("query") or args.get("q") + if not query: + if 
# ----------------- main -----------------
def execute(self, kernel, data):
    """
    Run a hybrid full-text + vector search and print the top hits.

    Steps: resolve the query (``query=``/``q=`` argument, else the raw
    line, else ``"testquery"``), detect the current database, collect up to
    ``self.CANDIDATE_N`` chunks via a full-text MATCH (random sample when
    that finds nothing), load their embeddings (``embeddings`` table, then
    ``embeddings_json``), rank by
    ``BM25_WEIGHT * bm25/bm25_max + (1 - BM25_WEIGHT) * (cos + 1) / 2`` and
    write the best ``self.K`` rows as a tab-separated table to stdout.

    Args:
        kernel: kernel object providing ``_send_message`` and
            ``mariadb_client``.
        data: raw magic line, used as the query when no argument names one.
    """

    def is_missing(value):
        # pandas renders empty/NULL cells as NaN, which is truthy.
        return value is None or (isinstance(value, float) and value != value)

    def records(table):
        # Normalise either parser output (DataFrame or list) to row dicts.
        if table is None:
            return []
        if _PANDAS_AVAILABLE and isinstance(table, _pd.DataFrame):
            return table.to_dict(orient="records")
        if isinstance(table, list):
            return [row for row in table if isinstance(row, dict)]
        return []

    def pick(row, *keys, default=""):
        # First present, non-blank value among `keys`, else `default`.
        for key in keys:
            value = row.get(key)
            if is_missing(value) or (isinstance(value, str) and not value.strip()):
                continue
            return value
        return default

    def as_int(value):
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return None

    def as_float(value):
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0.0

    # parse args & query
    try:
        args = self.parse_args(self.args)
    except Exception as e:
        kernel._send_message("stderr", f"Error parsing arguments: {e}\n")
        args = {}

    query = None
    if isinstance(args, dict):
        query = args.get("query") or args.get("q")
    if not query and isinstance(data, str) and data.strip():
        query = data.strip()
    if not query:
        query = "testquery"
    # parse_args may have converted the value (e.g. query=42 -> int).
    query = str(query).strip()
    if not query:
        kernel._send_message("stderr", "Empty query; nothing to search.\n")
        return

    kernel._send_message("stdout", f"[debug] running hybrid search for query (len={len(query)}): {query}\n")

    mariadb_client = getattr(kernel, "mariadb_client", None)
    if mariadb_client is None:
        kernel._send_message("stderr", "No mariadb_client available on kernel (can't run search).\n")
        return

    # determine DB
    try:
        db_html = mariadb_client.run_statement("SELECT DATABASE();")
        db_rows = records(self._parse_html_table(db_html))
        if db_rows:
            first_value = next(iter(db_rows[0].values()), None)
            dbname = "" if is_missing(first_value) else str(first_value).strip()
        else:
            # NOTE(review): this pattern read r"]*>(.*?)" in the reviewed
            # text, which always captures an empty string. Restored as a
            # first-table-cell match; confirm against the original.
            m = re.search(r"<td[^>]*>(.*?)</td>", str(db_html), flags=re.S)
            dbname = m.group(1).strip() if m else ""
        # NOTE(review): presumably no selected database is rendered as the
        # text NULL; pandas already maps that to NaN, so the regex path is
        # made to agree. Confirm against the kernel's output.
        if dbname.upper() == "NULL":
            dbname = ""
    except Exception as e:
        kernel._send_message("stderr", f"Failed to query current database: {e}\n")
        return

    if not dbname:
        kernel._send_message(
            "stderr",
            "No current database selected (use `USE <database>` before running the magic).\n",
        )
        return

    # Backticks inside an identifier are escaped by doubling them.
    db_ident = dbname.replace("`", "``")

    def fetch_candidates(sql, with_score):
        found = []
        for row in records(self._parse_html_table(mariadb_client.run_statement(sql))):
            entry = {
                # "0"/0 covers header-less tables, where the id is column 0.
                "chunk_id": as_int(pick(row, "chunk_id", "id", "0", 0, default=None)),
                "chunk_text": str(pick(row, "chunk_text")),
                "doc_id": pick(row, "doc_id"),
            }
            if with_score:
                entry["bm25_score"] = as_float(pick(row, "bm25_score", default=0.0))
            found.append(entry)
        return found

    # --- BM25 prefilter if requested ---
    candidates = []
    if self.BM25_WEIGHT > 0:
        try:
            q_esc = self._sql_escape(query)
            candidates = fetch_candidates(
                f"SELECT id AS chunk_id, doc_id, chunk_index, chunk_text, "
                f"MATCH(chunk_text) AGAINST ({q_esc} IN NATURAL LANGUAGE MODE) AS bm25_score "
                f"FROM `{db_ident}`.`chunks` "
                f"WHERE MATCH(chunk_text) AGAINST ({q_esc} IN NATURAL LANGUAGE MODE) "
                f"ORDER BY bm25_score DESC LIMIT {self.CANDIDATE_N};",
                with_score=True,
            )
        except Exception as e:
            kernel._send_message("stderr", f"BM25 prefilter failed: {e}\n")

    # if no candidates from BM25, fall back to a random sample
    if not candidates:
        try:
            candidates = fetch_candidates(
                f"SELECT id AS chunk_id, doc_id, chunk_index, chunk_text "
                f"FROM `{db_ident}`.`chunks` "
                f"ORDER BY RAND() LIMIT {self.CANDIDATE_N};",
                with_score=False,
            )
        except Exception as e:
            kernel._send_message("stderr", f"Candidate sampling failed: {e}\n")

    if not candidates:
        kernel._send_message("stderr", "No candidate chunks found (empty chunks table?).\n")
        return

    candidate_ids = [c["chunk_id"] for c in candidates if c.get("chunk_id") is not None]
    if not candidate_ids:
        kernel._send_message("stderr", "No valid candidate chunk ids.\n")
        return

    id_list_sql = ",".join(str(int(x)) for x in candidate_ids)

    def fetch_embeddings(table_name, alias, column):
        sql = (
            f"SELECT {alias}.chunk_id, {alias}.{column}, c.chunk_text, c.doc_id, c.chunk_meta "
            f"FROM `{db_ident}`.`{table_name}` {alias} "
            f"JOIN `{db_ident}`.`chunks` c ON {alias}.chunk_id = c.id "
            f"WHERE {alias}.chunk_id IN ({id_list_sql});"
        )
        found = {}
        for row in records(self._parse_html_table(mariadb_client.run_statement(sql))):
            cid = as_int(pick(row, "chunk_id", "0", 0, default=None))
            if cid is None:
                continue
            vec = self._parse_vector_literal(pick(row, column, "embedding", default=None))
            if vec is None or vec.ndim != 1 or vec.size == 0:
                continue
            norm = float(np.linalg.norm(vec)) or 1.0
            found[cid] = {
                "vec": vec.astype(np.float32) / norm,
                "chunk_text": str(pick(row, "chunk_text")),
                "doc_id": pick(row, "doc_id"),
                "chunk_meta": pick(row, "chunk_meta"),
            }
        return found

    # --- fetch embeddings: native table first, JSON table as fallback ---
    emb_map = {}
    for table_name, alias, column in (
        ("embeddings", "e", "embedding_vector"),
        ("embeddings_json", "ej", "embedding_json"),
    ):
        try:
            emb_map = fetch_embeddings(table_name, alias, column)
        except Exception as e:
            # Best effort: a missing table just means "try the next source".
            self.log.debug("%s fetch failed: %s", table_name, e)
            emb_map = {}
        if emb_map:
            break

    if not emb_map:
        kernel._send_message("stderr", "No embeddings found for candidate chunks (neither native nor JSON fallback).\n")
        return

    # compute query embedding (dim inferred from first vector)
    vec_dim = next(iter(emb_map.values()))["vec"].shape[0]
    try:
        q_emb = self._embed_texts([query], dim=vec_dim)[0]
        q_norm = np.linalg.norm(q_emb)
        if q_norm == 0:
            q_norm = 1.0
        q_emb = q_emb.astype(np.float32) / q_norm
    except Exception as e:
        kernel._send_message("stderr", f"Failed to compute query embedding: {e}\n")
        return

    # combine scores and rank
    results = []
    bm25_scores = [float(c.get("bm25_score", 0.0) or 0.0) for c in candidates]
    bm25_max = max(bm25_scores) if bm25_scores else 0.0
    for c in candidates:
        cid = c.get("chunk_id")
        emb_info = emb_map.get(cid)
        # Vectors of a different length cannot be compared with the query.
        if emb_info is None or emb_info["vec"].shape[0] != vec_dim:
            continue
        sim = float(np.dot(q_emb, emb_info["vec"]))
        bm25_raw = float(c.get("bm25_score", 0.0) or 0.0)
        bm25_norm = (bm25_raw / bm25_max) if bm25_max > 0 else 0.0
        combined = (self.BM25_WEIGHT * bm25_norm) + ((1.0 - self.BM25_WEIGHT) * ((sim + 1.0) / 2.0))
        results.append({
            "chunk_id": cid,
            "chunk_text": emb_info.get("chunk_text") or c.get("chunk_text", ""),
            "doc_id": emb_info.get("doc_id") or c.get("doc_id", ""),
            "chunk_meta": emb_info.get("chunk_meta") or c.get("chunk_meta", ""),
            "vec_sim": sim,
            "bm25": bm25_raw,
            "score": combined,
        })

    if not results:
        kernel._send_message("stderr", "No scored results to return after filtering.\n")
        return

    results.sort(key=lambda r: r["score"], reverse=True)

    # output table
    lines = ["\t".join(["chunk_id", "chunk_text...", "score", "vec_sim", "bm25", "doc_id"])]
    for r in results[: self.K]:
        text_preview = str(r["chunk_text"] or "").replace("\n", " ")
        if len(text_preview) > 200:
            text_preview = text_preview[:197] + "..."
        lines.append("\t".join([
            str(r["chunk_id"]),
            text_preview,
            f"{r['score']:.6f}",
            f"{r['vec_sim']:.6f}",
            f"{r['bm25']:.6f}",
            str(r.get("doc_id", "")),
        ]))

    kernel._send_message("stdout", "\n".join(lines) + "\n")
b/test.docx @@ -0,0 +1,529 @@ +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. + +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. +For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. 
+Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. + +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. + +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. + +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. 
+Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. +For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. + +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. + +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. 
+ +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. +For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. 
+ +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. + +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. + +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. 
+For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. + +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. + +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. + +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. 
Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. +For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. 
+ +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. + +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. + +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. 
+For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. + +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. + +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. + +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. 
Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. +For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. 
+ +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. + +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. + +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. 
+For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. + +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. + +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. + +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. 
Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. +For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. 
+ +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. + +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. + +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. 
+For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. + +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. 
\ No newline at end of file diff --git a/test.txt b/test.txt new file mode 100644 index 0000000..8843993 --- /dev/null +++ b/test.txt @@ -0,0 +1,109 @@ +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? 
+ +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. 
+ +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? \ No newline at end of file From adc4a8b5c3abfa01713a4dadbc04fc82535728c1 Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Wed, 29 Oct 2025 12:22:26 +0000 Subject: [PATCH 24/38] Vector and RAG ops completed --- Untitled.ipynb | 1003 +---------------- .../model_training/maria_ingest.py | 170 ++- .../model_training/maria_rag_query.py | 22 +- 3 files changed, 97 insertions(+), 1098 deletions(-) diff --git a/Untitled.ipynb b/Untitled.ipynb index 7dc3665..435a273 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -11432,7 +11432,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 6, "id": "400d0901-9fc5-4bb4-bfa9-1912917b4450", "metadata": {}, "outputs": [ @@ -11440,359 +11440,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "[debug] stored content length=0 preview=\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] python-docx failed to extract docx: Package not found at '/home/iddhartha/mariadb_kernel/test.docx'\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] using file content from ./test.docx (len=28508)\n", - "\n", - "[debug] database detection raw response: '
DATABASE()
test
'...\n", - "\n", - "[debug] using database: test\n", - "\n", - "[debug] embeddings.embedding_vector dim matches expected (384); will use native VECTOR inserts.\n", - "\n", - "[debug] INSERT documents raw response: 'Query OK'...\n", - "\n", - "[debug] stored content length=28249\n", - "\n", - "[debug] INSERT chunk idx=0 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=1 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=2 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=3 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=4 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=5 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=6 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=7 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=8 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=9 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=10 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=11 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=12 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=13 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=14 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=15 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=16 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=17 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=18 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=19 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=20 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=21 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=22 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=23 raw response: 'Query OK'...\n", - "\n", - "[debug] 
INSERT chunk idx=24 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=25 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=26 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=27 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=28 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=29 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=30 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=31 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=32 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=33 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=34 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=35 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=36 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=37 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=38 raw response: 'Query OK'...\n", - "\n", - "[debug] INSERT chunk idx=39 raw response: 'Query OK'...\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (1, 'all-MiniLM-L6-v2', 384, [-0.06188365817070007,0.004101159982383251,0.08865199238061905,0.03391453996300697,0.0705934539437294,-0.007194653619080782,-0.0014060521498322487,-0.03424789384007454,0.028502285480499268,0.046123623847961426,0.07699143141508102,0.000179135...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 1: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 1, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 1: 
1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (2, 'all-MiniLM-L6-v2', 384, [-0.006922394968569279,0.03999422863125801,0.11044202744960785,-0.056049950420856476,0.0424477756023407,0.0422651432454586,-0.07503941655158997,-0.012753593735396862,-0.019215010106563568,-0.0005573428934440017,0.025243977084755898,-0.037413...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 2: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 2, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 2: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (3, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.0219331...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 3: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 3, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 3: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (4, 'all-MiniLM-L6-v2', 384, 
[-0.08448513597249985,0.10392715036869049,0.027331702411174774,0.017086079344153404,0.07248993217945099,0.0070589566603302956,0.08355971425771713,0.020426131784915924,-0.021412141621112823,0.02130572870373726,0.11913938075304031,-0.044671509...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 4: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 4, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 4: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (5, 'all-MiniLM-L6-v2', 384, [-0.09339345991611481,0.013352231122553349,0.10841523855924606,0.03265758603811264,0.05745427682995796,-0.02950742468237877,-0.003043288830667734,-0.06888528168201447,0.05221330747008324,0.04286496713757515,0.05929091200232506,-0.00791127048...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 5: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 5, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 5: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (6, 'all-MiniLM-L6-v2', 384, 
[-0.00692242756485939,0.039994243532419205,0.11044203490018845,-0.056049950420856476,0.04244772717356682,0.04226513206958771,-0.07503949105739594,-0.012753548100590706,-0.019214976578950882,-0.0005573237431235611,0.02524395100772381,-0.03741...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 6: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 6, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 6: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (7, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.0219331...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 7: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 7, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 7: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (8, 'all-MiniLM-L6-v2', 384, 
[-0.08448513597249985,0.10392715036869049,0.027331702411174774,0.017086079344153404,0.07248993217945099,0.0070589566603302956,0.08355971425771713,0.020426131784915924,-0.021412141621112823,0.02130572870373726,0.11913938075304031,-0.044671509...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 8: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 8, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 8: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (9, 'all-MiniLM-L6-v2', 384, [-0.09339345991611481,0.013352231122553349,0.10841523855924606,0.03265758603811264,0.05745427682995796,-0.02950742468237877,-0.003043288830667734,-0.06888528168201447,0.05221330747008324,0.04286496713757515,0.05929091200232506,-0.00791127048...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 9: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 9, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 9: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (10, 'all-MiniLM-L6-v2', 384, 
[-0.006922394968569279,0.03999422863125801,0.11044202744960785,-0.056049950420856476,0.0424477756023407,0.0422651432454586,-0.07503941655158997,-0.012753593735396862,-0.019215010106563568,-0.0005573428934440017,0.025243977084755898,-0.03741...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 10: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 10, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 10: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (11, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.021933...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 11: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 11, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "Using file content from ./test.docx (len=28508)\n", "\n", - "[debug] verify embeddings_json COUNT for chunk 11: 1\n", + "Using database: test\n", "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (12, 'all-MiniLM-L6-v2', 384, 
[-0.08448513597249985,0.10392715036869049,0.027331702411174774,0.017086079344153404,0.07248993217945099,0.0070589566603302956,0.08355971425771713,0.020426131784915924,-0.021412141621112823,0.02130572870373726,0.11913938075304031,-0.04467150...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 12: 0\n", + "Ingest complete.\n", + " documents=1\n", + " chunks_total=40\n", + " embeddings_written=40\n", + " native_attempts=40 native_successes=0 native_failures=40\n", + " fallback_json_successes=40 fallback_json_failures=0\n", + " DB counts: embeddings=0 embeddings_json=40\n", + " Server version: 11.8.3-MariaDB-ubu2404\n", "\n" ] }, @@ -11800,645 +11459,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "[warning] native insert wrote 0 rows for chunk 12, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", + "Warnings/notes:\n", "\n", - "[debug] verify embeddings_json COUNT for chunk 12: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (13, 'all-MiniLM-L6-v2', 384, [-0.09339345991611481,0.013352231122553349,0.10841523855924606,0.03265758603811264,0.05745427682995796,-0.02950742468237877,-0.003043288830667734,-0.06888528168201447,0.05221330747008324,0.04286496713757515,0.05929091200232506,-0.0079112704...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 13: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 13, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 13: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: 
\"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (14, 'all-MiniLM-L6-v2', 384, [-0.006922394968569279,0.03999422863125801,0.11044202744960785,-0.056049950420856476,0.0424477756023407,0.0422651432454586,-0.07503941655158997,-0.012753593735396862,-0.019215010106563568,-0.0005573428934440017,0.025243977084755898,-0.03741...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 14: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 14, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 14: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (15, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.021933...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 15: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 15, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 15: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (16, 'all-MiniLM-L6-v2', 384, 
[-0.08448513597249985,0.10392715036869049,0.027331702411174774,0.017086079344153404,0.07248993217945099,0.0070589566603302956,0.08355971425771713,0.020426131784915924,-0.021412141621112823,0.02130572870373726,0.11913938075304031,-0.04467150...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 16: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 16, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 16: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (17, 'all-MiniLM-L6-v2', 384, [-0.09339345991611481,0.013352231122553349,0.10841523855924606,0.03265758603811264,0.05745427682995796,-0.02950742468237877,-0.003043288830667734,-0.06888528168201447,0.05221330747008324,0.04286496713757515,0.05929091200232506,-0.0079112704...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 17: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 17, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 17: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (18, 'all-MiniLM-L6-v2', 384, 
[-0.00692242756485939,0.039994243532419205,0.11044203490018845,-0.056049950420856476,0.04244772717356682,0.04226513206958771,-0.07503949105739594,-0.012753548100590706,-0.019214976578950882,-0.0005573237431235611,0.02524395100772381,-0.0374...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 18: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 18, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 18: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (19, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.021933...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 19: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 19, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 19: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (20, 'all-MiniLM-L6-v2', 384, 
[-0.08448513597249985,0.10392715036869049,0.027331702411174774,0.017086079344153404,0.07248993217945099,0.0070589566603302956,0.08355971425771713,0.020426131784915924,-0.021412141621112823,0.02130572870373726,0.11913938075304031,-0.04467150...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 20: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 20, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 20: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (21, 'all-MiniLM-L6-v2', 384, [-0.09339345991611481,0.013352231122553349,0.10841523855924606,0.03265758603811264,0.05745427682995796,-0.02950742468237877,-0.003043288830667734,-0.06888528168201447,0.05221330747008324,0.04286496713757515,0.05929091200232506,-0.0079112704...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 21: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 21, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 21: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (22, 'all-MiniLM-L6-v2', 384, 
[-0.00692242756485939,0.039994243532419205,0.11044203490018845,-0.056049950420856476,0.04244772717356682,0.04226513206958771,-0.07503949105739594,-0.012753548100590706,-0.019214976578950882,-0.0005573237431235611,0.02524395100772381,-0.0374...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 22: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 22, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 22: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (23, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.021933...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 23: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 23, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 23: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (24, 'all-MiniLM-L6-v2', 384, 
[-0.08448513597249985,0.10392715036869049,0.027331702411174774,0.017086079344153404,0.07248993217945099,0.0070589566603302956,0.08355971425771713,0.020426131784915924,-0.021412141621112823,0.02130572870373726,0.11913938075304031,-0.04467150...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 24: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 24, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 24: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (25, 'all-MiniLM-L6-v2', 384, [-0.09339345991611481,0.013352231122553349,0.10841523855924606,0.03265758603811264,0.05745427682995796,-0.02950742468237877,-0.003043288830667734,-0.06888528168201447,0.05221330747008324,0.04286496713757515,0.05929091200232506,-0.0079112704...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 25: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 25, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 25: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (26, 'all-MiniLM-L6-v2', 384, 
[-0.00692242756485939,0.039994243532419205,0.11044203490018845,-0.056049950420856476,0.04244772717356682,0.04226513206958771,-0.07503949105739594,-0.012753548100590706,-0.019214976578950882,-0.0005573237431235611,0.02524395100772381,-0.0374...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 26: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 26, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 26: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (27, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.021933...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 27: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 27, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 27: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (28, 'all-MiniLM-L6-v2', 384, 
[-0.08448513597249985,0.10392715036869049,0.027331702411174774,0.017086079344153404,0.07248993217945099,0.0070589566603302956,0.08355971425771713,0.020426131784915924,-0.021412141621112823,0.02130572870373726,0.11913938075304031,-0.04467150...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 28: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 28, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 28: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (29, 'all-MiniLM-L6-v2', 384, [-0.09339345991611481,0.013352231122553349,0.10841523855924606,0.03265758603811264,0.05745427682995796,-0.02950742468237877,-0.003043288830667734,-0.06888528168201447,0.05221330747008324,0.04286496713757515,0.05929091200232506,-0.0079112704...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 29: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 29, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 29: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (30, 'all-MiniLM-L6-v2', 384, 
[-0.00692242756485939,0.039994243532419205,0.11044203490018845,-0.056049950420856476,0.04244772717356682,0.04226513206958771,-0.07503949105739594,-0.012753548100590706,-0.019214976578950882,-0.0005573237431235611,0.02524395100772381,-0.0374...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 30: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 30, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 30: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (31, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.021933...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 31: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 31, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 31: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (32, 'all-MiniLM-L6-v2', 384, 
[-0.08448513597249985,0.10392715036869049,0.027331702411174774,0.017086079344153404,0.07248993217945099,0.0070589566603302956,0.08355971425771713,0.020426131784915924,-0.021412141621112823,0.02130572870373726,0.11913938075304031,-0.04467150...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 32: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 32, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 32: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (33, 'all-MiniLM-L6-v2', 384, [-0.09339345991611481,0.013352231122553349,0.10841523855924606,0.03265758603811264,0.05745427682995796,-0.02950742468237877,-0.003043288830667734,-0.06888528168201447,0.05221330747008324,0.04286496713757515,0.05929091200232506,-0.0079112704...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 33: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 33, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 33: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (34, 'all-MiniLM-L6-v2', 384, 
[-0.00692242756485939,0.039994243532419205,0.11044203490018845,-0.056049950420856476,0.04244772717356682,0.04226513206958771,-0.07503949105739594,-0.012753548100590706,-0.019214976578950882,-0.0005573237431235611,0.02524395100772381,-0.0374...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 34: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 34, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 34: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (35, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.021933...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 35: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 35, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 35: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (36, 'all-MiniLM-L6-v2', 384, 
[-0.08448513597249985,0.10392715036869049,0.027331702411174774,0.017086079344153404,0.07248993217945099,0.0070589566603302956,0.08355971425771713,0.020426131784915924,-0.021412141621112823,0.02130572870373726,0.11913938075304031,-0.04467150...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 36: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 36, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 36: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (37, 'all-MiniLM-L6-v2', 384, [-0.09339345991611481,0.013352231122553349,0.10841523855924606,0.03265758603811264,0.05745427682995796,-0.02950742468237877,-0.003043288830667734,-0.06888528168201447,0.05221330747008324,0.04286496713757515,0.05929091200232506,-0.0079112704...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 37: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 37, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 37: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (38, 'all-MiniLM-L6-v2', 384, 
[-0.00692242756485939,0.039994243532419205,0.11044203490018845,-0.056049950420856476,0.04244772717356682,0.04226513206958771,-0.07503949105739594,-0.012753548100590706,-0.019214976578950882,-0.0005573237431235611,0.02524395100772381,-0.0374...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 38: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 38, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 38: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (39, 'all-MiniLM-L6-v2', 384, [-0.1709047257900238,0.09758006781339645,0.06417810171842575,-0.0012691386509686708,0.09969516098499298,-0.03180953487753868,0.0032477325294166803,0.052585847675800323,-0.044948142021894455,0.022099163383245468,0.07059156149625778,-0.021933...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 39: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 39, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 39: 1\n", - "\n", - "[debug] native INSERT embeddings raw response: \"--------------\\r\\nINSERT INTO `test`.`embeddings` (chunk_id, model, dim, embedding_vector)\\r\\n VALUES (40, 'all-MiniLM-L6-v2', 384, 
[-0.0748591423034668,0.10814148932695389,0.006048132665455341,0.005083549302071333,0.08960696309804916,0.002078823745250702,0.08908689767122269,0.024222850799560547,-0.009931715205311775,0.02776029147207737,0.10587755590677261,-0.0518972128...\n", - "\n", - "[debug] verify embeddings COUNT for chunk 40: 0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] native insert wrote 0 rows for chunk 40, falling back to JSON.\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] fallback INSERT embeddings_json raw response: 'Query OK'...\n", - "\n", - "[debug] verify embeddings_json COUNT for chunk 40: 1\n", - "\n", - "[debug] COUNT embeddings raw response: '
COUNT(*)
0
'...\n", - "\n", - "[debug] COUNT embeddings_json raw response: '
COUNT(*)
40
'...\n", - "\n", - "[debug] VERSION raw response: '
VERSION()
11.8.3-MariaDB-ubu2404
'...\n", - "\n", - "Ingest complete. documents=1 chunks_total=40 embeddings_written=40\n", - "\n", - "Notes:\n", - " - embedding model used: all-MiniLM-L6-v2 (dim=384)\n", - " - Native VECTOR column used only when compatible.\n", + " - python-docx failed to extract docx: Package not found at '/home/iddhartha/mariadb_kernel/test.docx'\n", "\n" ] } diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/maria_ingest.py b/mariadb_kernel/maria_magics/ml_commands/model_training/maria_ingest.py index fb16795..a3ba58c 100644 --- a/mariadb_kernel/maria_magics/ml_commands/model_training/maria_ingest.py +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/maria_ingest.py @@ -44,11 +44,7 @@ class MariaIngest(MariaMagic): """ Ingest text documents into MariaDB, chunk them, and store embeddings. - Behavior: - - Accepts text via `text=...` arg, cell body, or `text_file=...` path. - - Uses native VECTOR insert when server VECTOR dim matches embedding dim. - - If server VECTOR dim differs or native insert fails, falls back to embeddings_json (JSON). - - Verifies inserts by SELECT COUNT(*) for chunk_id; falls back automatically if verification fails. + This variant reduces noisy logging and prints only important status/warnings. """ def __init__(self, args=""): self.args = args @@ -61,7 +57,7 @@ def name(self): return "maria_ingest" def help(self): - return "Ingest docs -> chunk -> embeddings. Uses native VECTOR when compatible; otherwise falls back to JSON." + return "Ingest docs -> chunk -> embeddings. Uses native VECTOR when compatible; otherwise falls back to JSON. 
(cleaned logs)" # ---- utilities ---- def _str_to_obj(self, s): @@ -258,7 +254,6 @@ def _get_existing_vector_dim(self, mariadb_client, dbname): resp = mariadb_client.run_statement("SHOW CREATE TABLE embeddings;") if not resp: return None - # try to parse HTML first txt = str(resp) m = re.search(r"embedding_vector\s+vector\((\d+)\)", txt, flags=re.I) if m: @@ -266,7 +261,6 @@ def _get_existing_vector_dim(self, mariadb_client, dbname): return int(m.group(1)) except Exception: return None - # fallback: plain text search m2 = re.search(r"vector\((\d+)\)", txt, flags=re.I) if m2: try: @@ -279,6 +273,9 @@ def _get_existing_vector_dim(self, mariadb_client, dbname): # ---- main execution ---- def execute(self, kernel, data): + # collect user-facing warnings/errors to print concisely at the end + user_warnings = [] + # --- Extract cell content robustly --- cell_text = "" try: @@ -304,11 +301,7 @@ def execute(self, kernel, data): kernel._send_message("stderr", f"[debug] could not extract cell text: {e}\n") cell_text = "" - if cell_text: - cell_text = cell_text.strip() - - preview = cell_text[:80].replace("\n", " ") + ("..." 
if len(cell_text) > 80 else "") - kernel._send_message("stdout", f"[debug] stored content length={len(cell_text)} preview={preview}\n") + cell_text = cell_text.strip() if cell_text else "" # --- Parse arguments --- try: @@ -327,14 +320,14 @@ def execute(self, kernel, data): if isinstance(provided_text, str) and provided_text.strip(): cell_text = provided_text - kernel._send_message("stdout", f"[debug] using text from args (len={len(cell_text)})\n") + kernel._send_message("stdout", f"Using text from args (len={len(cell_text)})\n") elif file_arg: file_contents, warnings = self._read_file_content(file_arg) for w in warnings: - kernel._send_message("stderr", f"[warning] {w}\n") + user_warnings.append(w) if file_contents: cell_text = file_contents - kernel._send_message("stdout", f"[debug] using file content from {file_arg} (len={len(cell_text)})\n") + kernel._send_message("stdout", f"Using file content from {file_arg} (len={len(cell_text)})\n") else: kernel._send_message("stderr", f"Failed to read file or file contained no text: {file_arg}\n") @@ -404,8 +397,7 @@ def execute(self, kernel, data): try: db_name_html = mariadb_client.run_statement("SELECT DATABASE();") dbname = self._parse_single_result(db_name_html) or "" - kernel._send_message("stdout", f"[debug] database detection raw response: {repr(db_name_html)[:400]}...\n") - kernel._send_message("stdout", f"[debug] using database: {dbname}\n") + kernel._send_message("stdout", f"Using database: {dbname}\n") except Exception as e: kernel._send_message("stderr", f"Failed to query current database: {e}\n") return @@ -467,15 +459,11 @@ def execute(self, kernel, data): existing_vec_dim = self._get_existing_vector_dim(mariadb_client, dbname) use_native_vector = True if existing_vec_dim is None: - # no existing vector column found or couldn't parse; assume native insert possible with our requested dim use_native_vector = True - kernel._send_message("stdout", "[debug] no existing vector dim detected; will attempt native 
VECTOR insert.\n") else: if existing_vec_dim != embedding_dim: use_native_vector = False - kernel._send_message("stderr", f"[warning] embeddings.embedding_vector exists with dim={existing_vec_dim}; ingest embedding_dim={embedding_dim}. Native VECTOR insert will be skipped and fallback to embeddings_json will be used.\n") - else: - kernel._send_message("stdout", f"[debug] embeddings.embedding_vector dim matches expected ({embedding_dim}); will use native VECTOR inserts.\n") + user_warnings.append(f"embeddings.embedding_vector exists with dim={existing_vec_dim}; ingest dim={embedding_dim}. Will use JSON fallback.") # ensure embeddings_json exists (fallback) try: @@ -492,12 +480,17 @@ def execute(self, kernel, data): """ ) except Exception: - # nonfatal; we will try to create later when needed pass - # ingest loop + # ingest loop with concise counters total_chunks = 0 total_emb_rows = 0 + native_attempts = 0 + native_successes = 0 + native_failures = 0 + fallback_successes = 0 + fallback_failures = 0 + for doc in docs_to_ingest: d_doc_id = doc.get("doc_id") d_title = doc.get("title") @@ -506,30 +499,17 @@ def execute(self, kernel, data): # insert document row try: - res = mariadb_client.run_statement( + mariadb_client.run_statement( f""" INSERT INTO `{dbname}`.`documents` (doc_id, title, content, metadata) VALUES ({self._sql_escape(d_doc_id)}, {self._sql_escape(d_title)}, {self._sql_escape(d_content)}, {self._sql_escape(json.dumps(d_meta))}) ON DUPLICATE KEY UPDATE title=VALUES(title), content=VALUES(content), metadata=VALUES(metadata); """ ) - kernel._send_message("stdout", f"[debug] INSERT documents raw response: {repr(res)[:400]}...\n") except Exception as e: - kernel._send_message("stderr", f"Failed to insert document {d_doc_id}: {e}\n") + user_warnings.append(f"Failed to insert document {d_doc_id}: {e}") continue - # verify stored content - try: - res_html = mariadb_client.run_statement( - f"SELECT content FROM `{dbname}`.`documents` WHERE doc_id = 
{self._sql_escape(d_doc_id)} LIMIT 1;" - ) - stored_content = self._parse_single_result(res_html) or "" - kernel._send_message("stdout", f"[debug] stored content length={len(stored_content)}\n") - if d_content and not stored_content: - kernel._send_message("stderr", "[warning] document content inserted into DB appears empty (possible client/encoding issue).\n") - except Exception as e: - kernel._send_message("stderr", f"Warning: could not verify stored document content: {e}\n") - # chunk chunks = self._simple_chunk(d_content, chunk_size, overlap) if not chunks and d_content: @@ -540,13 +520,12 @@ def execute(self, kernel, data): inserted_chunk_ids = [] for idx, chunk_text in enumerate(chunks): try: - res = mariadb_client.run_statement( + mariadb_client.run_statement( f""" INSERT INTO `{dbname}`.`chunks` (doc_id, chunk_index, chunk_text, chunk_meta) VALUES ({self._sql_escape(d_doc_id)}, {idx}, {self._sql_escape(chunk_text)}, {self._sql_escape(json.dumps({}))}); """ ) - kernel._send_message("stdout", f"[debug] INSERT chunk idx={idx} raw response: {repr(res)[:400]}...\n") # get last insert id (best-effort) try: last_html = mariadb_client.run_statement("SELECT LAST_INSERT_ID();") @@ -566,7 +545,7 @@ def execute(self, kernel, data): except Exception: inserted_chunk_ids.append((idx, None)) except Exception as e: - kernel._send_message("stderr", f"Failed to insert chunk {idx} for {d_doc_id}: {e}\n") + user_warnings.append(f"Failed to insert chunk {idx} for {d_doc_id}: {e}") inserted_chunk_ids.append((idx, None)) continue @@ -579,88 +558,79 @@ def execute(self, kernel, data): for (i, chunk_db_id), vec in zip(inserted_chunk_ids, embs_norm): if chunk_db_id is None: - self.log.debug("No db chunk id for doc %s chunk %d — skipping embedding store", d_doc_id, i) - kernel._send_message("stderr", f"[debug] no chunk id for doc {d_doc_id} chunk {i}; embedding skipped.\n") + user_warnings.append(f"No chunk id for doc {d_doc_id} chunk {i}; embedding skipped.") continue vec_list = 
[float(v) for v in vec.tolist()] vec_literal = "[" + ",".join(repr(x) for x in vec_list) + "]" - # If server vector dim mismatches, skip native insert if not use_native_vector: + # always use JSON fallback try: emb_json_literal = self._sql_escape(json.dumps(vec_list)) - res_json = mariadb_client.run_statement( + mariadb_client.run_statement( f""" INSERT INTO `{dbname}`.`embeddings_json` (chunk_id, model, dim, embedding_json) VALUES ({chunk_db_id}, {self._sql_escape('all-MiniLM-L6-v2')}, {embedding_dim}, {emb_json_literal}) ON DUPLICATE KEY UPDATE model=VALUES(model), dim=VALUES(dim), embedding_json=VALUES(embedding_json); """ ) - kernel._send_message("stdout", f"[debug] fallback INSERT embeddings_json raw response: {repr(res_json)[:400]}...\n") # verify try: verify_json = mariadb_client.run_statement( f"SELECT COUNT(*) FROM `{dbname}`.`embeddings_json` WHERE chunk_id = {chunk_db_id};" ) cnt = self._parse_single_result(verify_json) - kernel._send_message("stdout", f"[debug] verify embeddings_json COUNT for chunk {chunk_db_id}: {cnt}\n") if cnt and int(cnt) > 0: + fallback_successes += 1 total_emb_rows += 1 - kernel._send_message("stdout", f"[debug] fallback stored for chunk {chunk_db_id}\n") else: - kernel._send_message("stderr", f"[error] fallback JSON insert reported 0 rows for chunk {chunk_db_id}\n") - continue - except Exception as e_verify_json: - kernel._send_message("stderr", f"[warning] verify embeddings_json select failed: {e_verify_json}\n") - continue + fallback_failures += 1 + except Exception: + fallback_failures += 1 except Exception as e_json: - kernel._send_message("stderr", f"Fallback embedding storage failed for chunk_id={chunk_db_id}: {e_json}\n") - self.log.debug("Fallback JSON insert failed for chunk %s: %s", chunk_db_id, e_json) - continue + user_warnings.append(f"Fallback embedding storage failed for chunk_id={chunk_db_id}: {e_json}") + fallback_failures += 1 + continue - # Attempt native VECTOR insert (server dim matched) + # Attempt native 
VECTOR insert + native_attempts += 1 try: - res_native = mariadb_client.run_statement( + mariadb_client.run_statement( f""" INSERT INTO `{dbname}`.`embeddings` (chunk_id, model, dim, embedding_vector) VALUES ({chunk_db_id}, {self._sql_escape('all-MiniLM-L6-v2')}, {embedding_dim}, {vec_literal}) ON DUPLICATE KEY UPDATE model=VALUES(model), dim=VALUES(dim), embedding_vector=VALUES(embedding_vector); """ ) - kernel._send_message("stdout", f"[debug] native INSERT embeddings raw response: {repr(res_native)[:400]}...\n") except Exception as e_native: - kernel._send_message("stderr", f"Failed to insert embedding (native VECTOR) for chunk_id={chunk_db_id}: {e_native}\n") - self.log.debug("Native VECTOR insert failed for chunk %s: %s", chunk_db_id, e_native) + native_failures += 1 # try fallback JSON try: emb_json_literal = self._sql_escape(json.dumps(vec_list)) - res_json = mariadb_client.run_statement( + mariadb_client.run_statement( f""" INSERT INTO `{dbname}`.`embeddings_json` (chunk_id, model, dim, embedding_json) VALUES ({chunk_db_id}, {self._sql_escape('all-MiniLM-L6-v2')}, {embedding_dim}, {emb_json_literal}) ON DUPLICATE KEY UPDATE model=VALUES(model), dim=VALUES(dim), embedding_json=VALUES(embedding_json); """ ) - kernel._send_message("stdout", f"[debug] fallback INSERT embeddings_json raw response: {repr(res_json)[:400]}...\n") try: verify_json = mariadb_client.run_statement( f"SELECT COUNT(*) FROM `{dbname}`.`embeddings_json` WHERE chunk_id = {chunk_db_id};" ) cnt = self._parse_single_result(verify_json) - kernel._send_message("stdout", f"[debug] verify embeddings_json COUNT for chunk {chunk_db_id}: {cnt}\n") if cnt and int(cnt) > 0: + fallback_successes += 1 total_emb_rows += 1 - kernel._send_message("stdout", f"[debug] fallback stored for chunk {chunk_db_id}\n") else: - kernel._send_message("stderr", f"[error] fallback JSON insert reported 0 rows for chunk {chunk_db_id}\n") - continue - except Exception as e_verify_json: - kernel._send_message("stderr", 
f"[warning] verify embeddings_json select failed: {e_verify_json}\n") - continue + fallback_failures += 1 + except Exception: + fallback_failures += 1 except Exception as e_json: - kernel._send_message("stderr", f"Fallback embedding storage failed for chunk_id={chunk_db_id}: {e_json}\n") - continue + user_warnings.append(f"Fallback embedding storage failed for chunk_id={chunk_db_id}: {e_json}") + fallback_failures += 1 + continue # Verify native insert succeeded by COUNT(*) try: @@ -668,54 +638,68 @@ def execute(self, kernel, data): f"SELECT COUNT(*) FROM `{dbname}`.`embeddings` WHERE chunk_id = {chunk_db_id};" ) cnt = self._parse_single_result(verify) - kernel._send_message("stdout", f"[debug] verify embeddings COUNT for chunk {chunk_db_id}: {cnt}\n") if cnt and int(cnt) > 0: + native_successes += 1 total_emb_rows += 1 else: - # fallback if native wrote no rows - kernel._send_message("stderr", f"[warning] native insert wrote 0 rows for chunk {chunk_db_id}, falling back to JSON.\n") + # fallback to JSON + native_failures += 1 try: emb_json_literal = self._sql_escape(json.dumps(vec_list)) - res_json = mariadb_client.run_statement( + mariadb_client.run_statement( f""" INSERT INTO `{dbname}`.`embeddings_json` (chunk_id, model, dim, embedding_json) VALUES ({chunk_db_id}, {self._sql_escape('all-MiniLM-L6-v2')}, {embedding_dim}, {emb_json_literal}) ON DUPLICATE KEY UPDATE model=VALUES(model), dim=VALUES(dim), embedding_json=VALUES(embedding_json); """ ) - kernel._send_message("stdout", f"[debug] fallback INSERT embeddings_json raw response: {repr(res_json)[:400]}...\n") try: verify_json = mariadb_client.run_statement( f"SELECT COUNT(*) FROM `{dbname}`.`embeddings_json` WHERE chunk_id = {chunk_db_id};" ) cntj = self._parse_single_result(verify_json) - kernel._send_message("stdout", f"[debug] verify embeddings_json COUNT for chunk {chunk_db_id}: {cntj}\n") if cntj and int(cntj) > 0: + fallback_successes += 1 total_emb_rows += 1 - except Exception as e_verify_json: - 
kernel._send_message("stderr", f"[warning] verify embeddings_json select failed: {e_verify_json}\n") + else: + fallback_failures += 1 + except Exception: + fallback_failures += 1 except Exception as e_json: - kernel._send_message("stderr", f"Fallback embedding storage failed for chunk_id={chunk_db_id}: {e_json}\n") + user_warnings.append(f"Fallback embedding storage failed for chunk_id={chunk_db_id}: {e_json}") + fallback_failures += 1 except Exception as e_verify: - kernel._send_message("stderr", f"[warning] verify select for embeddings failed: {e_verify}\n") + user_warnings.append(f"Verify select for embeddings failed: {e_verify}") # Final diagnostics: counts & version try: cnt_emb = mariadb_client.run_statement("SELECT COUNT(*) FROM embeddings;") - kernel._send_message("stdout", f"[debug] COUNT embeddings raw response: {repr(cnt_emb)[:400]}...\n") - except Exception as e: - kernel._send_message("stderr", f"[warning] COUNT embeddings failed: {e}\n") + cnt_emb_val = self._parse_single_result(cnt_emb) or "0" + except Exception: + cnt_emb_val = "N/A" try: cnt_json = mariadb_client.run_statement("SELECT COUNT(*) FROM embeddings_json;") - kernel._send_message("stdout", f"[debug] COUNT embeddings_json raw response: {repr(cnt_json)[:400]}...\n") + cnt_json_val = self._parse_single_result(cnt_json) or "0" except Exception: - kernel._send_message("stdout", "[debug] COUNT embeddings_json query failed or table does not exist.\n") + cnt_json_val = "N/A" try: version = mariadb_client.run_statement("SELECT VERSION();") - kernel._send_message("stdout", f"[debug] VERSION raw response: {repr(version)[:400]}...\n") + version_val = self._parse_single_result(version) or "" except Exception: - pass + version_val = "" + + # concise output + kernel._send_message("stdout", ( + "Ingest complete.\n" + f" documents={len(docs_to_ingest)}\n" + f" chunks_total={total_chunks}\n" + f" embeddings_written={total_emb_rows}\n" + f" Server version: {version_val}\n" + )) + + # if user_warnings: + # 
kernel._send_message("stderr", "Warnings/notes:\n") + # for w in user_warnings: + # kernel._send_message("stderr", f" - {w}\n") - kernel._send_message("stdout", f"Ingest complete. documents={len(docs_to_ingest)} chunks_total={total_chunks} embeddings_written={total_emb_rows}\n") - kernel._send_message("stdout", "Notes:\n - embedding model used: all-MiniLM-L6-v2 (dim={})\n - Native VECTOR column used only when compatible.\n".format(embedding_dim)) return diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/maria_rag_query.py b/mariadb_kernel/maria_magics/ml_commands/model_training/maria_rag_query.py index bee4893..83197e9 100644 --- a/mariadb_kernel/maria_magics/ml_commands/model_training/maria_rag_query.py +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/maria_rag_query.py @@ -2,7 +2,7 @@ """ %maria_rag_query -Single-command RAG: retrieve relevant chunks, run fusion chain (LLM via Gemini) and return answer + sources. +Single-command RAG: retrieve relevant chunks, run fusion chain (LLM via Gemini) and return answer. Hardcoded settings: - retriever = "hybrid" @@ -548,7 +548,7 @@ def execute(self, kernel, data): scored.sort(key=lambda r: r["score"], reverse=True) topk = scored[: self.K] - # assemble context blocks with citations + # assemble context blocks with citations (internal only) context_blocks = [] for s in topk: context_blocks.append({ @@ -568,7 +568,7 @@ def execute(self, kernel, data): context_text += f"--- SOURCE {i+1} {citation} ---\n{b['chunk_text']}\n\n" system_prompt = "You are a helpful assistant that answers questions based on provided documents. When you use information from a source include a citation tag like [DOCID::chunk_X]." 
- user_prompt = f"QUESTION:\n{query}\n\nCONTEXT:\n{context_text}\n\nINSTRUCTIONS:\nAnswer the question concisely, and at the end provide a 'SOURCES' section listing the doc_id and chunk_index you used.\n" + user_prompt = f"QUESTION:\n{query}\n\nCONTEXT:\n{context_text}\n\nINSTRUCTIONS:\nAnswer the question concisely.\n" # Try Gemini via google.genai llm_answer = None @@ -582,22 +582,14 @@ def execute(self, kernel, data): if not llm_answer: ans, evidence, debug = self._fusion_chain_local(query, context_blocks) chain_debug = debug - sources_lines = [] - for e in evidence: - sources_lines.append(f"{e['doc_id']}::chunk_{e['chunk_index']} - {e['snippet'][:200]}") - sources_text = "\n".join(sources_lines) if sources_lines else "No explicit sources found." - llm_answer = f"{ans}\n\nSOURCES:\n{sources_text}" + # **DO NOT** append sources to the answer (per request) + llm_answer = ans - # Output answer + sources + # Output answer only (no sources printed) kernel._send_message("stdout", "\n=== ANSWER ===\n") kernel._send_message("stdout", llm_answer + "\n\n") - kernel._send_message("stdout", "=== SOURCES (top-K) ===\n") - for b in context_blocks: - snippet = (b["chunk_text"] or "").replace("\n", " ") - if len(snippet) > 300: - snippet = snippet[:297] + "..." - kernel._send_message("stdout", f"- {b['doc_id']} :: chunk_{b['chunk_index']} (score={b['score']:.4f}, vec_sim={b['vec_sim']:.4f}, bm25={b['bm25']:.4f})\n {snippet}\n") + # NOTE: sources are intentionally NOT printed here. 
if explain: kernel._send_message("stdout", "\n=== EXPLAIN: retrieval candidates (top 20 shown) ===\n") From 4bc119e3b2c0b6adbe145b10070f9e93bd6489ec Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Wed, 29 Oct 2025 12:33:29 +0000 Subject: [PATCH 25/38] Logging completed for data_cleaning --- Untitled.ipynb | 42 ++----------------- .../ml_commands/data_cleaning/dropmissing.py | 2 +- .../ml_commands/data_cleaning/dropoutliers.py | 6 +-- .../ml_commands/data_cleaning/fillmissing.py | 4 +- .../ml_commands/data_cleaning/missing.py | 6 +-- .../ml_commands/data_cleaning/outliers.py | 4 +- .../ml_commands/data_cleaning/stats.py | 6 +-- 7 files changed, 17 insertions(+), 53 deletions(-) diff --git a/Untitled.ipynb b/Untitled.ipynb index 435a273..24f2f32 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -11448,22 +11448,9 @@ " documents=1\n", " chunks_total=40\n", " embeddings_written=40\n", - " native_attempts=40 native_successes=0 native_failures=40\n", - " fallback_json_successes=40 fallback_json_failures=0\n", - " DB counts: embeddings=0 embeddings_json=40\n", " Server version: 11.8.3-MariaDB-ubu2404\n", "\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Warnings/notes:\n", - "\n", - " - python-docx failed to extract docx: Package not found at '/home/iddhartha/mariadb_kernel/test.docx'\n", - "\n" - ] } ], "source": [ @@ -11472,7 +11459,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 7, "id": "4ba9b623-929e-46f0-9580-93d86a226670", "metadata": {}, "outputs": [ @@ -11697,7 +11684,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "id": "4e3f9dd8-33e1-4631-88e6-30cbce31e7e0", "metadata": {}, "outputs": [ @@ -11710,31 +11697,8 @@ "\n", "=== ANSWER ===\n", "\n", - "Customers can request a refund within 30 days of receiving their item. The product must be unused, in its original packaging, and include proof of purchase. 
Digital products are non-refundable [DOCID::search_test_doc::chunk_0].\n", - "\n", - "SOURCES:\n", - "search_test_doc::chunk_0\n", - "\n", - "\n", - "=== SOURCES (top-K) ===\n", - "\n", - "- search_test_doc :: chunk_0 (score=0.8153, vec_sim=0.4723, bm25=0.2247)\n", - "...stomers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", - "\n", - "- search_test_doc :: chunk_4 (score=0.7943, vec_sim=0.4123, bm25=0.2247)\n", - " Refunds are processed within 5–7 bus...0 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", - "\n", - "- search_test_doc :: chunk_8 (score=0.7943, vec_sim=0.4123, bm25=0.2247)\n", - " Refunds are processed within 5–7 bus...0 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", - "\n", - "- search_test_doc :: chunk_12 (score=0.7943, vec_sim=0.4123, bm25=0.2247)\n", - " Refunds are processed within 5–7 bus...0 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", - "\n", - "- search_test_doc :: chunk_16 (score=0.7943, vec_sim=0.4123, bm25=0.2247)\n", - " Refunds are processed within 5–7 bus...0 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "You can request a refund within 30 days of receiving your item. The product must be unused, in its original packaging, and include proof of purchase [DOCID::chunk_0]. Digital products like downloadable content or gift cards are not eligible for refunds [DOCID::chunk_0]. Refunds are processed within 5-7 business days after the returned item is inspected [DOCID::chunk_0].\n", "\n", - "- search_test_doc :: chunk_20 (score=0.7943, vec_sim=0.4123, bm25=0.2247)\n", - " Refunds are processed within 5–7 bus...0 days of receiving their item. 
Products must be unused, in original packaging, and accompanied by proof of purchase.\n", "\n" ] } diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py index 63d7a5b..c024cbe 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py @@ -339,7 +339,7 @@ def execute(self, kernel, data): # Insert metadata (best-effort) try: args_for_db = self.args if isinstance(self.args, str) else str(self.args) - affected_columns_str = ", ".join(columns) if columns else "ALL_COLUMNS" + affected_columns_str = "\n".join(columns) if columns else "ALL_COLUMNS" message_str = "\n".join(messages) self._insert_metadata( kernel=kernel, diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py index ea4139a..1d6bcab 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py @@ -316,7 +316,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(columns) if columns else "", + affected_columns="\n".join(columns) if columns else "", operation_status="error", message=f"Column(s) not found: {', '.join(missing_cols)}", db_name=db_name, @@ -397,7 +397,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=", ".join(target_columns), + affected_columns="\n".join(target_columns), operation_status=operation_status, message="\n".join(messages) or "No outliers detected.", db_name=db_name, @@ -431,7 +431,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), 
arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=", ".join(target_columns), + affected_columns="\n".join(target_columns), operation_status=operation_status, message="\n".join(messages), db_name=db_name, diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py index ec2b181..4e3b393 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py @@ -303,7 +303,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(target_columns), + affected_columns="\n".join(target_columns), operation_status="error", message=f"Column(s) not found: {', '.join(missing_cols)}", db_name=db_name, @@ -404,7 +404,7 @@ def execute(self, kernel, data): # Insert metadata (best-effort) try: args_for_db = self.args if isinstance(self.args, str) else str(self.args) - affected_columns_str = ", ".join(target_columns) if target_columns else "" + affected_columns_str = "\n".join(target_columns) if target_columns else "" message_str = "\n".join(messages) self._insert_metadata( kernel=kernel, diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py index e982ed9..cfdf393 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py @@ -329,7 +329,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(columns) if columns else "", + affected_columns="\n".join(columns) if columns else "", operation_status="error", message=msg, db_name=db_name, @@ -359,7 +359,7 @@ 
def execute(self, kernel, data): self._send_html(kernel, out) # Prepare metadata success info - affected_columns_str = ", ".join(columns) if columns else "ALL_COLUMNS" + affected_columns_str = "\n".join(columns) if columns else "ALL_COLUMNS" message = f"%missing action={action} examined {len(out)} column(s); total_rows={total}." operation_status = "success" @@ -386,7 +386,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=", ".join(columns) if columns else "ALL_COLUMNS", + affected_columns="\n".join(columns) if columns else "ALL_COLUMNS", operation_status="error", message=msg, db_name=db_name, diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py index 89146c1..25f1498 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py @@ -480,7 +480,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(columns) if columns else "", + affected_columns="\n".join(columns) if columns else "", operation_status="error", message=msg, db_name=db_name, @@ -564,7 +564,7 @@ def execute(self, kernel, data): # Insert metadata (best-effort) try: args_for_db = self.args if isinstance(self.args, str) else str(self.args) - affected_columns_str = ", ".join(target_columns) + affected_columns_str = "\n".join(target_columns) message_str = "\n".join(messages) self._insert_metadata( kernel=kernel, diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/stats.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/stats.py index a1b30f9..700e7ef 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/stats.py +++ 
b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/stats.py @@ -403,7 +403,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(columns) if columns else "", + affected_columns="\n".join(columns) if columns else "", operation_status="error", message=msg, db_name=db_name, @@ -433,7 +433,7 @@ def execute(self, kernel, data): self._send_html(kernel, result) # Insert metadata (success) - affected_columns_str = ", ".join(columns) if columns else "ALL_COLUMNS" + affected_columns_str = "\n".join(columns) if columns else "ALL_COLUMNS" pct_str = ",".join(str(p) for p in (percentiles or [])) if percentiles else "" message = f"Stats computed for {len(result.columns) if hasattr(result, 'columns') else 'N'} column(s); total_rows={len(subdf)}; percentiles={pct_str}; include={include}." operation_status = "success" @@ -460,7 +460,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=", ".join(columns) if columns else "ALL_COLUMNS", + affected_columns="\n".join(columns) if columns else "ALL_COLUMNS", operation_status="error", message=msg, db_name=db_name, From d356e61431dd16091431c13a02f32c569e4a44f5 Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Wed, 29 Oct 2025 13:06:38 +0000 Subject: [PATCH 26/38] Logging added to data_preprocessing --- Untitled.ipynb | 3087 +++++++++++++++-- .../ml_commands/data_preprocessing/encode.py | 287 +- .../data_preprocessing/normalize.py | 363 +- .../data_preprocessing/splitdata.py | 340 +- .../data_preprocessing/standardize.py | 299 +- 5 files changed, 3963 insertions(+), 413 deletions(-) diff --git a/Untitled.ipynb b/Untitled.ipynb index 24f2f32..d52c114 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -4991,7 +4991,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, 
"id": "61b6b28f-9b56-4d7f-8613-a406928dbb1c", "metadata": {}, "outputs": [ @@ -5008,11 +5008,25 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -5021,321 +5035,881 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", 
" \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
idemp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagdepartment_lbl
1AliceHR30.05000.030FBachelors51287.514050.255000.03000.08.5475.002
2BobUnknown40.065000.0Engineering45MMasters203091.0320100.1120000.015000.09.0589.000
3CharlieEngineering35.0700000.0Sales38MBachelors101879.3015200.580000.07000.07.2370.013
4DavidHR25.048000.01DianaEngineering29FPhD62295.225020.097000.010000.09.6595.000
5EveUnknown35.065000.03Finance35FBachelors81588.013060.390000.08000.08.0485.001
6FrankEngineering28.072000.0HR50MHigh School25872.5010150.760000.04000.06.5260.012
7UnknownGraceSales50.065000.0242FBachelors182081.4125120.485000.07000.07.8374.003
8GraceSales45.065000.0HenryEngineering31MMasters72593.123550.295000.09000.09.1590.000
9AliceHR30.0IvyFinance27FBachelors31085.002080.670000.05000.08.2482.001
10BobUnknown40.0JackSales55MHigh School301268.905250.865000.02000.05.5150.013
11CharlieEngineering35.0700000.00
12DavidAliceHR25.048000.030FBachelors51287.514050.255000.03000.08.5475.002
13EveUnknown35.065000.012BobEngineering45MMasters203091.0320100.1120000.015000.09.0589.000
13CharlieSales38MBachelors101879.3015200.580000.07000.07.2370.013
14FrankDianaEngineering28.072000.029FPhD62295.225020.097000.010000.09.6595.000
15UnknownSales50.065000.02EveFinance35FBachelors81588.013060.390000.08000.08.0485.001
16GraceSales45.065000.0FrankHR50MHigh School25872.5010150.760000.04000.06.5260.012
17AliceHR30.05000.0GraceSales42FBachelors182081.4125120.485000.07000.07.8374.003
18BobUnknown40.065000.03HenryEngineering31MMasters72593.123550.295000.09000.09.1590.000
19CharlieEngineering35.0700000.0IvyFinance27FBachelors31085.002080.670000.05000.08.2482.001
20DavidHR25.048000.0JackSales55MHigh School301268.905250.865000.02000.05.5150.013
21EveUnknown35.065000.03AliceHR30FBachelors51287.514050.255000.03000.08.5475.002
22FrankBobEngineering28.072000.045MMasters203091.0320100.1120000.015000.09.0589.000
23UnknownSales50.065000.02
24GraceCharlieSales45.065000.038MBachelors101879.3015200.580000.07000.07.2370.013
24DianaEngineering29FPhD62295.225020.097000.010000.09.6595.000
25AliceHR30.05000.0EveFinance35FBachelors81588.013060.390000.08000.08.0485.001
26BobUnknown40.065000.03FrankHR50MHigh School25872.5010150.760000.04000.06.5260.012
27CharlieEngineering35.0700000.0GraceSales42FBachelors182081.4125120.485000.07000.07.8374.003
28DavidHR25.048000.01HenryEngineering31MMasters72593.123550.295000.09000.09.1590.000
29EveUnknown35.065000.0IvyFinance27FBachelors3
30FrankEngineering28.072000.01085.002080.670000.05000.08.2482.001
31UnknownSales50.065000.02
32Grace30JackSales45.055MHigh School301268.905250.865000.022000.05.5150.013
3331AliceHR30.05000.030FBachelors51287.514050.255000.03000.08.5475.002
3432BobUnknown40.065000.0Engineering45MMasters203091.0320100.1120000.015000.09.0589.000
3533CharlieEngineering35.0700000.0Sales38MBachelors101879.3015200.580000.07000.07.2370.013
36DavidHR25.048000.034DianaEngineering29FPhD62295.225020.097000.010000.09.6595.000
35EveFinance35FBachelors81588.013060.390000.08000.08.0485.001
36FrankHR50MHigh School25872.5010150.760000.04000.06.5260.012
37EveUnknown35.065000.0GraceSales42FBachelors182081.4125120.485000.07000.07.8374.003
38FrankHenryEngineering28.072000.031MMasters72593.123550.295000.09000.09.1590.000
39UnknownSales50.065000.02IvyFinance27FBachelors31085.002080.670000.05000.08.2482.001
40GraceJackSales45.055MHigh School301268.905250.865000.022000.05.5150.013
" @@ -6078,7 +6652,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 5, "id": "1708d88d-db07-40cb-aeef-fcb6baffe649", "metadata": {}, "outputs": [ @@ -6095,11 +6669,26 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -6107,247 +6696,1910 @@ " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", 
" \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - "
idemp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagdepartment_lbl
1AliceHR6.05.535714FBachelors51287.514050.25.0000003000.08.5475.002
2BobNaN8.0NaNEngineering8.214286MMasters203091.0320100.110.00000015000.09.0589.000
3CharlieEngineeringNaN10.000000Sales6.964286MBachelors101879.3015200.56.9230777000.07.2370.013
4DavidHR5.05.309353DianaEngineering5.357143FPhD62295.225020.08.23076910000.09.6595.000
5EveNaN7.0NaN
Finance6.428571FBachelors81588.01306FrankEngineering5.65.482014
0.37.6923088000.08.0485.001
6FrankHR9.107143MHigh School25872.5010150.75.3846154000.06.5260.012
7NaNGraceSales10.0NaN7.678571FBachelors182081.4125120.47.3076927000.07.8374.003
8GraceSales9.05.431655HenryEngineering5.714286MMasters72593.123550.28.0769239000.09.1590.000
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%normalize columns=age,salary feature_range=5,10 inplace=False" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "5cc8a8b5-011e-4285-89f7-a7ee13b0da22", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Standardized 3 column(s) (mean=0, std=1). Stored in data['last_select_standardized'].\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
9IvyFinance5.000000FBachelors31085.002080.66.1538465000.08.2482.001
-1.52752510JackSales10.000000MHigh School301268.905250.85.7692312000.05.5150.013
11AliceHR-0.716269-0.6602115.535714FBachelors51287.514050.25.0000003000.08.5475.002
-1.09108912BobNaN0.449750NaNEngineering8.214286MMasters203091.0320100.110.00000015000.09.0589.000
-0.65465413CharlieEngineeringNaN1.992082Sales6.964286MBachelors101879.3015200.56.9230777000.07.2370.013
-0.218218DavidHR-1.299278-0.49611214DianaEngineering5.357143FPhD62295.225020.08.23076910000.09.6595.000
0.21821815EveNaN-0.133259NaNFinance6.428571FBachelors81588.013060.37.6923088000.08.0485.001
0.65465416FrankEngineering-0.949473HR9.107143MHigh School25872.5010150.75.3846154000.06.5260.012
17GraceSales7.678571FBachelors182081.4125120.47.3076927000.07.8374.003
18HenryEngineering5.714286MMasters72593.123550.28.0769239000.09.1590.000
19IvyFinance5.000000FBachelors31085.002080.66.1538465000.08.2482.001
20JackSales10.000000MHigh School301268.905250.85.7692312000.05.5150.013
21AliceHR5.535714FBachelors51287.514050.25.0000003000.08.5475.002
22BobEngineering8.214286MMasters203091.0320100.110.00000015000.09.0589.000
23CharlieSales6.964286MBachelors101879.3015200.56.9230777000.07.2370.013
24DianaEngineering5.357143FPhD62295.225020.08.23076910000.09.6595.000
25EveFinance6.428571FBachelors81588.013060.37.6923088000.08.0485.001
26FrankHR9.107143MHigh School25872.5010150.75.3846154000.06.5260.012
27GraceSales7.678571FBachelors182081.4125120.47.3076927000.07.8374.003
28HenryEngineering5.714286MMasters72593.123550.28.0769239000.09.1590.000
29IvyFinance5.000000FBachelors31085.002080.66.1538465000.08.2482.001
30JackSales10.000000MHigh School301268.905250.85.7692312000.05.5150.013
31AliceHR5.535714FBachelors51287.514050.25.0000003000.08.5475.002
32BobEngineering8.214286MMasters203091.0320100.110.00000015000.09.0589.000
33CharlieSales6.964286MBachelors101879.3015200.56.9230777000.07.2370.013
34DianaEngineering5.357143FPhD62295.225020.08.23076910000.09.6595.000
35EveFinance6.428571FBachelors81588.013060.37.6923088000.08.0485.001
36FrankHR9.107143MHigh School25872.5010150.75.3846154000.06.5260.012
37GraceSales7.678571FBachelors182081.4125120.47.3076927000.07.8374.003
38HenryEngineering5.714286MMasters72593.123550.28.0769239000.09.1590.000
39IvyFinance5.000000FBachelors31085.002080.66.1538465000.08.2482.001
40JackSales10.000000MHigh School301268.905250.85.7692312000.05.5150.013
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%normalize columns=age,salary feature_range=5,10 inplace=False" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "5cc8a8b5-011e-4285-89f7-a7ee13b0da22", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Standardized 3 column(s) (mean=0, std=1). Stored in data['last_select_standardized'].\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
-1.527525AliceHR-0.716269-0.660211
-1.091089BobNaN0.449750NaN
-0.654654CharlieEngineeringNaN1.992082
-0.218218DavidHR-1.299278-0.496112
0.218218EveNaN-0.133259NaN
0.654654FrankEngineering-0.949473-0.404522
1.091089NaN1.091089NaNSales1.615769NaN
1.527525GraceSales1.032760-0.431236
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%standardize inplace=False" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "92f0fb87-521e-43dc-8604-9f4342d446e2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Standardized 2 column(s) (mean=0, std=1). Stored in data['last_select_standardized'].\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagdepartment_lbl
1AliceHR-0.904656FBachelors51287.514050.2-1.4150773000.08.5475.002
2BobEngineering0.750203MMasters203091.0320100.12.02986715000.09.0589.000
3CharlieSales-0.022065MBachelors101879.3015200.5-0.0900997000.07.2370.013
4DianaEngineering-1.014980FPhD62295.225020.00.81088710000.09.6595.000
5EveFinance-0.353037FBachelors81588.013060.30.4398938000.08.0485.001
6FrankHR1.301823MHigh School25872.5010150.7-1.1500824000.06.5260.012
7GraceSales0.419231FBachelors182081.4125120.40.1748977000.07.8374.003
8HenryEngineering-0.794332MMasters72593.123550.20.7048899000.09.1590.000
9IvyFinance-1.235628FBachelors31085.002080.6-0.6200905000.08.2482.001
10JackSales1.853442MHigh School301268.905250.8-0.8850862000.05.5150.013
11AliceHR-0.904656FBachelors51287.514050.2-1.4150773000.08.5475.002
12BobEngineering0.750203MMasters203091.0320100.12.02986715000.09.0589.000
13CharlieSales-0.022065MBachelors101879.3015200.5-0.0900997000.07.2370.013
14DianaEngineering-1.014980FPhD62295.225020.00.81088710000.09.6595.000
15EveFinance-0.353037FBachelors81588.013060.30.4398938000.08.0485.001
16FrankHR1.301823MHigh School25872.5010150.7-1.1500824000.06.5260.012
17GraceSales0.419231FBachelors182081.4125120.40.1748977000.07.8374.003
18HenryEngineering-0.794332MMasters72593.123550.20.7048899000.09.1590.000
19IvyFinance-1.235628FBachelors31085.002080.6-0.6200905000.08.2482.001
20JackSales1.853442MHigh School301268.905250.8-0.8850862000.05.5150.013
21AliceHR-0.904656FBachelors51287.514050.2-1.4150773000.08.5475.002
22BobEngineering0.750203MMasters203091.0320100.12.02986715000.09.0589.000
23CharlieSales1.615769NaN-0.022065MBachelors101879.3015200.5-0.0900997000.07.2370.013
1.52752524DianaEngineering-1.014980FPhD62295.225020.00.81088710000.09.6595.000
25EveFinance-0.353037FBachelors81588.013060.30.4398938000.08.0485.001
26FrankHR1.301823MHigh School25872.5010150.7-1.1500824000.06.5260.012
27GraceSales1.032760-0.4312360.419231FBachelors182081.4125120.40.1748977000.07.8374.003
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%standardize inplace=False" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "92f0fb87-521e-43dc-8604-9f4342d446e2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Standardized 2 column(s) (mean=0, std=1). Stored in data['last_select_standardized'].\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
idnamedepartmentagesalary
28HenryEngineering-0.794332MMasters72593.123550.20.7048899000.09.1590.000
29IvyFinance-1.235628FBachelors31085.002080.6-0.6200905000.08.2482.001
30JackSales1.853442MHigh School301268.905250.8-0.8850862000.05.5150.013
31AliceHR-0.716269-0.660211-0.904656FBachelors51287.514050.2-1.4150773000.08.5475.002
232BobNaN0.449750NaNEngineering0.750203MMasters203091.0320100.12.02986715000.09.0589.000
33CharlieSales-0.022065MBachelors101879.3015200.5-0.0900997000.07.2370.013
3Charlie34DianaEngineeringNaN1.992082-1.014980FPhD62295.225020.00.81088710000.09.6595.000
35EveFinance-0.353037FBachelors81588.013060.30.4398938000.08.04DavidHR-1.299278-0.49611285.001
5EveNaN-0.133259NaN36FrankHR1.301823MHigh School25872.5010150.7-1.1500824000.06.5260.012
6FrankEngineering-0.949473-0.40452237GraceSales0.419231FBachelors182081.4125120.40.1748977000.07.8374.003
38HenryEngineering-0.794332MMasters7NaNSales1.615769NaN2593.123550.20.7048899000.09.1590.000
39IvyFinance-1.235628FBachelors31085.00208Grace0.6-0.6200905000.08.2482.001
40JackSales1.032760-0.4312361.853442MHigh School301268.905250.8-0.8850862000.05.5150.013
" @@ -6479,7 +8731,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "992bf2a2-15e2-4c67-a8fc-f0ac3c3e0630", "metadata": {}, "outputs": [ @@ -6515,6 +8767,7 @@ " performance_rating\n", " potential_score\n", " attrition_flag\n", + " department_lbl\n", " \n", " \n", " \n", @@ -6538,6 +8791,7 @@ " 5\n", " 90.0\n", " 0\n", + " 0\n", " \n", " \n", " 25\n", @@ -6559,6 +8813,7 @@ " 4\n", " 85.0\n", " 0\n", + " 1\n", " \n", " \n", " 29\n", @@ -6580,6 +8835,7 @@ " 4\n", " 82.0\n", " 0\n", + " 1\n", " \n", " \n", " 23\n", @@ -6601,6 +8857,7 @@ " 3\n", " 70.0\n", " 1\n", + " 3\n", " \n", " \n", " 6\n", @@ -6622,6 +8879,7 @@ " 2\n", " 60.0\n", " 1\n", + " 2\n", " \n", " \n", " 40\n", @@ -6643,6 +8901,7 @@ " 1\n", " 50.0\n", " 1\n", + " 3\n", " \n", " \n", " 14\n", @@ -6664,6 +8923,7 @@ " 5\n", " 95.0\n", " 0\n", + " 0\n", " \n", " \n", " 22\n", @@ -6685,6 +8945,7 @@ " 5\n", " 89.0\n", " 0\n", + " 0\n", " \n", " \n", " 32\n", @@ -6706,6 +8967,7 @@ " 5\n", " 89.0\n", " 0\n", + " 0\n", " \n", " \n", " 12\n", @@ -6727,6 +8989,7 @@ " 5\n", " 89.0\n", " 0\n", + " 0\n", " \n", " \n", " 34\n", @@ -6748,6 +9011,7 @@ " 5\n", " 95.0\n", " 0\n", + " 0\n", " \n", " \n", " 33\n", @@ -6769,6 +9033,7 @@ " 3\n", " 70.0\n", " 1\n", + " 3\n", " \n", " \n", " 7\n", @@ -6790,6 +9055,7 @@ " 3\n", " 74.0\n", " 0\n", + " 3\n", " \n", " \n", " 26\n", @@ -6811,6 +9077,7 @@ " 2\n", " 60.0\n", " 1\n", + " 2\n", " \n", " \n", " 19\n", @@ -6832,6 +9099,7 @@ " 4\n", " 82.0\n", " 0\n", + " 1\n", " \n", " \n", " 10\n", @@ -6853,6 +9121,7 @@ " 1\n", " 50.0\n", " 1\n", + " 3\n", " \n", " \n", " 36\n", @@ -6874,6 +9143,7 @@ " 2\n", " 60.0\n", " 1\n", + " 2\n", " \n", " \n", " 21\n", @@ -6895,6 +9165,7 @@ " 4\n", " 75.0\n", " 0\n", + " 2\n", " \n", " \n", " 24\n", @@ -6916,6 +9187,7 @@ " 5\n", " 95.0\n", " 0\n", + " 0\n", " \n", " \n", " 31\n", @@ -6937,6 +9209,7 @@ " 4\n", " 75.0\n", " 0\n", + " 2\n", " \n", " \n", "" @@ -6970,6 +9243,7 @@ " performance_rating\n", " 
potential_score\n", " attrition_flag\n", + " department_lbl\n", " \n", " \n", " \n", @@ -6993,6 +9267,7 @@ " 4\n", " 85.0\n", " 0\n", + " 1\n", " \n", " \n", " 30\n", @@ -7014,6 +9289,7 @@ " 1\n", " 50.0\n", " 1\n", + " 3\n", " \n", " \n", " 11\n", @@ -7035,6 +9311,7 @@ " 4\n", " 75.0\n", " 0\n", + " 2\n", " \n", " \n", " 3\n", @@ -7056,6 +9333,7 @@ " 3\n", " 70.0\n", " 1\n", + " 3\n", " \n", " \n", "" @@ -7089,6 +9367,7 @@ " performance_rating\n", " potential_score\n", " attrition_flag\n", + " department_lbl\n", " \n", " \n", " \n", @@ -7112,6 +9391,7 @@ " 1\n", " 50.0\n", " 1\n", + " 3\n", " \n", " \n", " 17\n", @@ -7133,6 +9413,7 @@ " 3\n", " 74.0\n", " 0\n", + " 3\n", " \n", " \n", " 16\n", @@ -7154,6 +9435,7 @@ " 2\n", " 60.0\n", " 1\n", + " 2\n", " \n", " \n", " 27\n", @@ -7175,6 +9457,7 @@ " 3\n", " 74.0\n", " 0\n", + " 3\n", " \n", " \n", " 5\n", @@ -7196,6 +9479,7 @@ " 4\n", " 85.0\n", " 0\n", + " 1\n", " \n", " \n", " 13\n", @@ -7217,6 +9501,7 @@ " 3\n", " 70.0\n", " 1\n", + " 3\n", " \n", " \n", " 38\n", @@ -7238,6 +9523,7 @@ " 5\n", " 90.0\n", " 0\n", + " 0\n", " \n", " \n", " 28\n", @@ -7259,6 +9545,7 @@ " 5\n", " 90.0\n", " 0\n", + " 0\n", " \n", " \n", "" @@ -11712,6 +13999,126 @@ "execution_count": null, "id": "de68a877-5b52-4727-9606-4439152c4506", "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
idcommand_nameargumentsexecution_timestampaffected_columnsoperation_statusmessagedb_nameuser_name
1missingaction=percent2025-10-29 12:31:10ALL_COLUMNSsuccess%missing action=percent examined 1 column(s); total_rows=5.test
2missingaction=percent2025-10-29 12:31:22ALL_COLUMNSsuccess%missing action=percent examined 19 column(s); total_rows=40.test
3dropmissingcolumns=salary2025-10-29 12:31:34salarysuccessDropped 0 row(s) with missing values (in-place). Updated last_select.test
4statsinclude=all2025-10-29 12:31:38ALL_COLUMNSsuccessStats computed for 19 column(s); total_rows=40; percentiles=; include=all.test
5fillmissingcolumns=age,salary strategy=median2025-10-29 12:31:43age\n", + "salarysuccessColumn 'age': filled missing with median=36.5.\n", + "Column 'salary': filled missing with median=82500.0.test
6outliersmethod=zscore z_thresh=2.52025-10-29 12:31:57emp_id\n", + "age\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagsuccessColumn 'emp_id': detected 0 outlier(s) using zscore.\n", + "Column 'age': detected 0 outlier(s) using zscore.\n", + "Column 'years_experience': detected 0 outlier(s) using zscore.\n", + "Column 'projects_completed': detected 0 outlier(s) using zscore.\n", + "Column 'avg_project_score': detected 0 outlier(s) using zscore.\n", + "Column 'certifications': detected 0 outlier(s) using zscore.\n", + "Column 'training_hours': detected 0 outlier(s) using zscore.\n", + "Column 'overtime_hours': detected 0 outlier(s) using zscore.\n", + "Column 'remote_ratio': detected 0 outlier(s) using zscore.\n", + "Column 'salary': detected 0 outlier(s) using zscore.\n", + "Column 'bonus': detected 0 outlier(s) using zscore.\n", + "Column 'satisfaction_score': detected 0 outlier(s) using zscore.\n", + "Column 'performance_rating': detected 0 outlier(s) using zscore.\n", + "Column 'potential_score': detected 0 outlier(s) using zscore.\n", + "Column 'attrition_flag': detected 0 outlier(s) using zscore.test
7dropoutliersmethod=zscore z_thresh=2.52025-10-29 12:32:05emp_id\n", + "age\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagsuccessColumn 'emp_id': detected 0 outlier(s) using zscore.\n", + "Column 'age': detected 0 outlier(s) using zscore.\n", + "Column 'years_experience': detected 0 outlier(s) using zscore.\n", + "Column 'projects_completed': detected 0 outlier(s) using zscore.\n", + "Column 'avg_project_score': detected 0 outlier(s) using zscore.\n", + "Column 'certifications': detected 0 outlier(s) using zscore.\n", + "Column 'training_hours': detected 0 outlier(s) using zscore.\n", + "Column 'overtime_hours': detected 0 outlier(s) using zscore.\n", + "Column 'remote_ratio': detected 0 outlier(s) using zscore.\n", + "Column 'salary': detected 0 outlier(s) using zscore.\n", + "Column 'bonus': detected 0 outlier(s) using zscore.\n", + "Column 'satisfaction_score': detected 0 outlier(s) using zscore.\n", + "Column 'performance_rating': detected 0 outlier(s) using zscore.\n", + "Column 'potential_score': detected 0 outlier(s) using zscore.\n", + "Column 'attrition_flag': detected 0 outlier(s) using zscore.test
8clipoutliersmethod=zscore z_thresh=2.0 2025-10-29 12:32:08emp_id\n", + "age\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagsuccessColumn 'emp_id': clipped 0 value(s) (bounds: -2.8809, 43.8809).\n", + "Column 'age': clipped 0 value(s) (bounds: 19.8406, 56.5594).\n", + "Column 'years_experience': clipped 0 value(s) (bounds: -4.7983, 31.1983).\n", + "Column 'projects_completed': clipped 0 value(s) (bounds: 3.5885, 30.8115).\n", + "Column 'avg_project_score': clipped 0 value(s) (bounds: 67.5459, 100.8341).\n", + "Column 'certifications': clipped 0 value(s) (bounds: -1.0255, 3.0255).\n", + "Column 'training_hours': clipped 0 value(s) (bounds: -1.7946, 51.7946).\n", + "Column 'overtime_hours': clipped 4 value(s) (bounds: -3.2563, 24.8563).\n", + "Column 'remote_ratio': clipped 0 value(s) (bounds: -0.1308, 0.8908).\n", + "Column 'salary': clipped 4 value(s) (bounds: 43482.8069, 119917.1931).\n", + "Column 'bonus': clipped 4 value(s) (bounds: -358.9297, 14358.9297).\n", + "Column 'satisfaction_score': clipped 4 value(s) (bounds: 5.5260, 10.3540).\n", + "Column 'performance_rating': clipped 4 value(s) (bounds: 1.0061, 6.1939).\n", + "Column 'potential_score': clipped 0 value(s) (bounds: 49.7801, 104.2199).\n", + "Column 'attrition_flag': clipped 0 value(s) (bounds: -0.6282, 1.2282).test
9encodemethod=label columns=department drop_original=false2025-10-29 13:04:21departmentsuccessMethod: label\n", + "Created columns:\n", + "department_lbl\n", + "\n", + "Details:\n", + "Column 'department': label-encoded -> department_lbl (unique_values=4)test
10normalizecolumns=age,salary feature_range=5,10 inplace=False2025-10-29 13:04:27age\n", + "salarysuccessFeature range: (5.0, 10.0)\n", + "\n", + "Details:\n", + "Normalized 2 column(s) to range (5.0, 10.0).\n", + "Stored in data['last_select_normalized'].test
11standardizecolumns=age,salary inplace=False2025-10-29 13:04:31age\n", + "salarysuccessStandardized 2 column(s) (mean=0, std=1).\n", + "Stored in data['last_select_standardized'].test
12splitdatatest_size=0.2 val_size=0.1 random_state=422025-10-29 13:04:35ALL_COLUMNSsuccesstrain_name=last_select_train, test_name=last_select_test, val_name=last_select_val\n", + "train_count=28, test_count=8, val_count=4\n", + "test_frac=0.2, val_frac=0.1, shuffle=True, random_state=42test
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "select * from magic_metadata;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b52bf52-58a7-41bb-a568-b2691ed22f02", + "metadata": {}, "outputs": [], "source": [] } diff --git a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py index 317ae81..f335dd6 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py @@ -7,9 +7,16 @@ import shlex from distutils import util import numpy as np - -# sklearn imports (we'll create encoder instances in a version-compatible way) from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder +import logging +import os +import re + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None class Encode(MariaMagic): @@ -39,6 +46,7 @@ def help(self): "%encode method= [columns=col1,col2] " "[inplace=true] [drop_original=true]\n" "Encode categorical columns using label, one-hot, or ordinal encoding (automatic)." + "Execution metadata is recorded in table `magic_metadata`." ) def _str_to_obj(self, s): @@ -85,6 +93,172 @@ def _make_ohe(self, **kwargs): # fallback for newer sklearn where parameter name changed return OneHotEncoder(sparse_output=False, **kwargs) + # -------------------- metadata / DB helpers (best-effort) -------------------- + def _get_mariadb_client(self, kernel): + """Return mariadb_client if present on kernel, else None""" + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + """Return a logger on kernel if present, else create a temporary logger""" + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. 
None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + """ + Attempt to determine the currently used DB. + Prefer SqlFetch if available; otherwise run SELECT DATABASE(); and try to parse. + Returns empty string if none found. + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror(): + return "" + if not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + """Try several places to find the current user name; fallback to OS login or empty string.""" + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except 
Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + """ + Create magic_metadata table if it doesn't exist. + Columns: id, command_name, arguments, execution_timestamp, + affected_columns, operation_status, message, db_name, user_name + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + """ + Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # -------------------- end metadata helpers -------------------- + def execute(self, kernel, data): # get DataFrame df = data.get("last_select") @@ -112,12 +286,47 @@ def execute(self, kernel, data): if not columns: kernel._send_message("stderr", "No columns specified or detected for encoding.") + # log metadata for failure + try: + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message="No columns specified or detected for encoding.", + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # validate existence missing_cols = [c for c in columns if c not in df.columns] if missing_cols: - kernel._send_message("stderr", f"Column(s) not found: {', '.join(missing_cols)}") + 
msg = f"Column(s) not found: {', '.join(missing_cols)}" + kernel._send_message("stderr", msg) + # log metadata for failure + try: + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return inplace = bool(args.get("inplace", True)) @@ -126,6 +335,21 @@ def execute(self, kernel, data): # Work on copy if not inplace result_df = df if inplace else df.copy() + # Prepare metadata context + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + messages = [] + operation_status = "success" + created_columns = [] + try: # We'll store encoder info here to save into data at the end encoder_obj = None @@ -138,11 +362,13 @@ def execute(self, kernel, data): codes, uniques = pd.factorize(result_df[col], sort=True) new_col = f"{col}_lbl" result_df[new_col] = codes + created_columns.append(new_col) # Save mapping value->code for reuse later mapping = {val: idx for idx, val in enumerate(uniques)} label_mappings[col] = mapping if drop_original: result_df.drop(columns=[col], inplace=True) + messages.append(f"Column '{col}': label-encoded -> {new_col} (unique_values={len(uniques)})") encoder_obj = label_mappings @@ -170,7 +396,8 @@ def execute(self, kernel, data): result_df = pd.concat([result_df.drop(columns=columns), ohe_df], axis=1) else: result_df = pd.concat([result_df, ohe_df], axis=1) - + created_columns.extend(feature_names) + messages.append(f"Columns {columns} one-hot encoded -> created 
{len(feature_names)} columns.") encoder_obj = encoder # save fitted OneHotEncoder elif method == "ordinal": @@ -180,14 +407,30 @@ def execute(self, kernel, data): tmp = result_df[columns].astype(object).fillna("___MISSING___") enc_arr = enc.fit_transform(tmp) for i, col in enumerate(columns): - result_df[f"{col}_ord"] = enc_arr[:, i] + new_col = f"{col}_ord" + result_df[new_col] = enc_arr[:, i] + created_columns.append(new_col) if drop_original: result_df.drop(columns=[col], inplace=True) - + messages.append(f"Column '{col}': ordinal-encoded -> {new_col}") encoder_obj = enc else: kernel._send_message("stderr", "Unsupported method. Supported: label, onehot, ordinal.") + # log unsupported method + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns), + operation_status="error", + message="Unsupported method requested.", + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Apply result back to shared data if inplace @@ -211,5 +454,33 @@ def execute(self, kernel, data): self._send_html(kernel, result_df) except Exception as e: - kernel._send_message("stderr", f"Error during encoding: {e}") - return + operation_status = "error" + err_msg = f"Error during encoding: {e}" + kernel._send_message("stderr", err_msg) + messages.append(err_msg) + + # Attempt to insert metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + # store affected (input) columns newline-separated + affected_columns_str = "\n".join(columns) + # store created columns newline-separated (if any) + created_columns_str = "\n".join(created_columns) if created_columns else "" + # Compose metadata message with sections for readability + details = "\n".join(messages) if messages else "Encoding completed without detailed messages." 
+ metadata_message = f"Method: {method}\nCreated columns:\n{created_columns_str}\n\nDetails:\n{details}" + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status=operation_status, + message=metadata_message, + db_name=db_name, + user_name=user_name + ) + except Exception as e: + try: + kernel._send_message("stdout", f"Warning: failed to write metadata: {e}") + except Exception: + pass diff --git a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py index f7a8bb7..f81710d 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py @@ -6,6 +6,15 @@ from distutils import util import pandas as pd from sklearn.preprocessing import MinMaxScaler +import logging +import os +import re + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None class Normalize(MariaMagic): @@ -23,6 +32,8 @@ class Normalize(MariaMagic): %normalize %normalize columns=age,salary %normalize feature_range=5,10 inplace=False + + Execution metadata is recorded in table `magic_metadata`. """ def __init__(self, args=""): @@ -37,7 +48,8 @@ def name(self): def help(self): return ( "%normalize [columns=col1,col2,...] [feature_range=0,1] [inplace=True|False]\n" - "Normalize numeric columns using MinMaxScaler (in-place by default)." + "Normalize numeric columns using MinMaxScaler (in-place by default).\n" + "Execution metadata is recorded in table `magic_metadata`." 
) def _str_to_obj(self, s): @@ -71,21 +83,229 @@ def _send_html(self, kernel, df): except Exception: pass + # -------------------- metadata / DB helpers (best-effort) -------------------- + def _get_mariadb_client(self, kernel): + """Return mariadb_client if present on kernel, else None""" + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + """Return a logger on kernel if present, else create a temporary logger""" + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + # double single-quotes for SQL escaping + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + """ + Attempt to determine the currently used DB. + Prefer SqlFetch if available; otherwise run SELECT DATABASE(); and try to parse. + Returns empty string if none found. + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror(): + return "" + if not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = 
str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + """Try several places to find the current user name; fallback to OS login or empty string.""" + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + """ + Create magic_metadata table if it doesn't exist. + Columns: id, command_name, arguments, execution_timestamp, + affected_columns, operation_status, message, db_name, user_name + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + """ + Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # -------------------- end metadata helpers -------------------- + def execute(self, kernel, data): df = data.get("last_select") - if df is None or df.empty: - kernel._send_message("stderr", "No last_select found or DataFrame is empty.") + # Prepare metadata context early so failures can be logged + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + if df is None or (hasattr(df, "empty") and df.empty): + msg = "No last_select found or DataFrame is empty." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return try: args = self.parse_args(self.args) except Exception: - kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + msg = "Error parsing arguments. Use key=value syntax." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return columns_arg = args.get("columns", None) if isinstance(columns_arg, str): columns = [c.strip() for c in columns_arg.split(",") if c.strip()] + elif isinstance(columns_arg, (list, tuple)): + columns = list(columns_arg) else: columns = None @@ -93,12 +313,48 @@ def execute(self, kernel, data): if isinstance(feature_range_arg, str): parts = [p.strip() for p in feature_range_arg.split(",")] if len(parts) == 2: - feature_range = (float(parts[0]), float(parts[1])) + try: + feature_range = (float(parts[0]), float(parts[1])) + except Exception: + msg = "feature_range values must be numeric." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns) if columns else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return else: - kernel._send_message("stderr", "feature_range must be provided as 'min,max'.") + msg = "feature_range must be provided as 'min,max'." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns) if columns else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return else: - feature_range = (0, 1) + # already a tuple/list? + try: + feature_range = tuple(feature_range_arg) + except Exception: + feature_range = (0, 1) inplace = bool(args.get("inplace", True)) target_df = df if inplace else df.copy(deep=True) @@ -109,28 +365,107 @@ def execute(self, kernel, data): else: missing_cols = [c for c in columns if c not in target_df.columns] if missing_cols: - kernel._send_message("stderr", f"Missing columns: {', '.join(missing_cols)}") + msg = f"Missing columns: {', '.join(missing_cols)}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return target_columns = columns if not target_columns: - kernel._send_message("stderr", "No numeric columns to normalize.") + msg = "No numeric columns to normalize." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return + # Perform normalization + operation_status = "success" + messages = [] try: scaler = MinMaxScaler(feature_range=feature_range) target_df[target_columns] = scaler.fit_transform(target_df[target_columns]) msg = f"Normalized {len(target_columns)} column(s) to range {feature_range}." + messages.append(msg) except Exception as e: - kernel._send_message("stderr", f"Error during normalization: {e}") + operation_status = "error" + err_msg = f"Error during normalization: {e}" + kernel._send_message("stderr", err_msg) + messages.append(err_msg) + # log metadata for failure + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(target_columns), + operation_status=operation_status, + message="\n".join(messages), + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return + # Store results if inplace: data["last_select"] = target_df - msg += " Updated data['last_select'] in-place." + location_msg = "Updated data['last_select'] in-place." + messages.append(location_msg) + kernel._send_message("stdout", f"{msg} {location_msg}") else: data["last_select_normalized"] = target_df - msg += " Stored in data['last_select_normalized']." + location_msg = "Stored in data['last_select_normalized']." 
+ messages.append(location_msg) + kernel._send_message("stdout", f"{msg} {location_msg}") - kernel._send_message("stdout", msg) - self._send_html(kernel, target_df) + # Display DataFrame + try: + self._send_html(kernel, target_df) + except Exception: + pass + + # Insert metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "\n".join(target_columns) + message_str = "\n".join(messages) + metadata_message = f"Feature range: {feature_range}\n\nDetails:\n{message_str}" + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status=operation_status, + message=metadata_message, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass diff --git a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/splitdata.py b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/splitdata.py index 9cb8071..381c1d1 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/splitdata.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/splitdata.py @@ -6,6 +6,15 @@ from distutils import util import pandas as pd from sklearn.model_selection import train_test_split +import logging +import os +import re + +# optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None class SplitData(MariaMagic): @@ -16,30 +25,9 @@ class SplitData(MariaMagic): Split the current data["last_select"] DataFrame into train/test/(validation). - - test_size: float fraction (0-1) or int count. Interpreted relative to the original dataset. - Default: 0.2 - - val_size: float fraction (0-1) or int count. Interpreted relative to the original dataset. - If 0 (default), no validation set is created. 
- - stratify: column name to stratify on (must exist in the DataFrame). - - shuffle: whether to shuffle before splitting (default True). - - random_state: integer seed for reproducibility (default None). - - inplace: if True (default), sets data["last_select"] to the training set and also stores - test/val under the provided names. If False, original last_select is kept and train/test/val - are stored under the provided names. - - train_name/test_name/val_name: keys under which resulting DataFrames will be stored in `data`. - Defaults: last_select_train, last_select_test, last_select_val - - Behavior notes: - - test_size and val_size may be integers (counts) or floats (fractions of the original dataset). - - If both fractions are provided, the code first removes the test set (test_size of original), - then splits the remaining data to create the validation set. The computed relative fraction - for the second split uses val_size relative to the original dataset (so results match user intent). - - If val_size is 0 or not provided, only train/test split occurs. - - Examples: - %splitdata - %splitdata test_size=0.25 val_size=0.1 stratify=target random_state=123 - %splitdata test_size=100 val_size=50 inplace=False + Execution metadata is recorded into table `magic_metadata` with fields: + id, command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name """ def __init__(self, args=""): @@ -55,7 +43,7 @@ def help(self): return ( "%splitdata [test_size=0.2] [val_size=0.1] [stratify=colname] [shuffle=True|False]\n" "[random_state=42] [inplace=True|False] [train_name=name] [test_name=name] [val_name=name]\n" - "Split last_select into train/test/(val)." + "Split last_select into train/test/(val). Execution metadata recorded in magic_metadata." 
) def _str_to_obj(self, s): @@ -91,16 +79,200 @@ def _send_html(self, kernel, df, title=None): except Exception: pass + # --------------- metadata / DB helpers (best-effort) ---------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = 
getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + 
log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + # ---------------- end metadata helpers ---------------- + def execute(self, kernel, data): df = data.get("last_select") + + # prepare metadata context + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + if df is None or df.empty: - kernel._send_message("stderr", "No last_select found or DataFrame is empty.") + msg = "No last_select found or DataFrame is empty." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return try: args = self.parse_args(self.args) except Exception: - kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + msg = "Error parsing arguments. Use key=value syntax." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Defaults @@ -118,7 +290,21 @@ def execute(self, kernel, data): # Validate dataset n_total = len(df) if n_total == 0: - kernel._send_message("stderr", "DataFrame has no rows to split.") + msg = "DataFrame has no rows to split." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Helper to interpret sizes (int count or fraction) @@ -147,21 +333,64 @@ def interpret_size(size_arg, total): test_frac = interpret_size(test_size_arg, n_total) val_frac = interpret_size(val_size_arg, n_total) except ValueError as e: - kernel._send_message("stderr", f"Error interpreting sizes: {e}") + msg = f"Error interpreting sizes: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return if test_frac + val_frac >= 1.0: - kernel._send_message("stderr", "Sum of test_size and val_size must be less than 1.0.") + msg = "Sum of test_size and val_size must be less than 1.0." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Prepare stratify arrays if requested stratify_arr = None if stratify_col: if stratify_col not in df.columns: - kernel._send_message("stderr", f"Stratify column '{stratify_col}' not found in DataFrame.") + msg = f"Stratify column '{stratify_col}' not found in DataFrame." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return stratify_arr = df[stratify_col].values + # Run splits try: # First split off the test set (test_frac of original) if test_frac > 0: @@ -181,10 +410,7 @@ def interpret_size(size_arg, total): train_df = train_val_df val_df = pd.DataFrame(columns=df.columns) else: - # We need to compute val fraction relative to the remaining (train_val_df). - # val_frac was relative to original; relative fraction = val_frac / (1 - test_frac) rel_val_frac = val_frac / (1.0 - test_frac) - # For stratify on second split, use stratify column restricted to train_val_df if provided stratify_arr_second = None if stratify_arr is not None: stratify_arr_second = train_val_df[stratify_col].values @@ -196,13 +422,12 @@ def interpret_size(size_arg, total): stratify=stratify_arr_second if stratify_arr_second is not None else None ) - # Store results in data dict + # Store results in data dict under requested names data[test_name] = test_df data[val_name] = val_df data[train_name] = train_df if inplace: - # follow behavior of other magics: set last_select to training set data["last_select"] = train_df # Report sizes @@ -213,7 +438,6 @@ def interpret_size(size_arg, total): kernel._send_message("stdout", msg) # Display small previews - # Show train + validation (if exists) and test try: if not train_df.empty: self._send_html(kernel, train_df.head(20), title=f"Train ({len(train_df)} rows)") @@ -222,9 +446,47 @@ def interpret_size(size_arg, total): if not test_df.empty: self._send_html(kernel, test_df.head(20), title=f"Test ({len(test_df)} rows)") except Exception: - # non-fatal; already stored the DataFrames pass + # Insert metadata (success) + try: + args_for_db = self.args 
if isinstance(self.args, str) else str(self.args) + affected_columns = stratify_col if stratify_col else "ALL_COLUMNS" + message = ( + f"train_name={train_name}, test_name={test_name}, val_name={val_name}\n" + f"train_count={len(train_df)}, test_count={len(test_df)}, val_count={len(val_df)}\n" + f"test_frac={test_frac}, val_frac={val_frac}, shuffle={shuffle}, random_state={random_state}" + ) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns, + operation_status="success", + message=message, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + except Exception as e: - kernel._send_message("stderr", f"Error during splitting: {e}") + msg = f"Error during splitting: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=stratify_col if stratify_col else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return diff --git a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/standardize.py b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/standardize.py index f288973..d416b00 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/standardize.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/standardize.py @@ -6,6 +6,15 @@ from distutils import util import pandas as pd from sklearn.preprocessing import StandardScaler +import logging +import os +import re + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None class Standardize(MariaMagic): @@ -23,6 +32,8 @@ class Standardize(MariaMagic): 
Examples: %standardize %standardize columns=age,salary inplace=False + + Execution metadata is recorded in table `magic_metadata`. """ def __init__(self, args=""): @@ -37,7 +48,8 @@ def name(self): def help(self): return ( "%standardize [columns=col1,col2,...] [inplace=True|False]\n" - "Standardizes numeric columns using sklearn's StandardScaler (in-place by default)." + "Standardizes numeric columns using sklearn's StandardScaler (in-place by default).\n" + "Execution metadata is recorded in table `magic_metadata`." ) def _str_to_obj(self, s): @@ -71,54 +83,317 @@ def _send_html(self, kernel, df): except Exception: pass + # -------------------- metadata / DB helpers (best-effort) -------------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = 
re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) 
+ affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # -------------------- end metadata helpers -------------------- + def execute(self, kernel, data): df = data.get("last_select") - if df is None or df.empty: - kernel._send_message("stderr", "No last_select found or DataFrame is empty.") + # Prepare metadata context early + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + if df is None or (hasattr(df, "empty") and df.empty): + msg = "No last_select found or DataFrame is empty." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return try: args = self.parse_args(self.args) except Exception: - kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + msg = "Error parsing arguments. Use key=value syntax." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return columns_arg = args.get("columns", None) if isinstance(columns_arg, str): columns = [c.strip() for c in columns_arg.split(",") if c.strip()] + elif isinstance(columns_arg, (list, tuple)): + columns = list(columns_arg) else: columns = None inplace = bool(args.get("inplace", True)) target_df = df if inplace else df.copy(deep=True) + # Determine target columns (numeric) if columns is None: target_columns = [c for c in target_df.columns if pd.api.types.is_numeric_dtype(target_df[c])] else: missing_cols = [c for c in columns if c not in target_df.columns] if missing_cols: - kernel._send_message("stderr", f"Missing columns: {', '.join(missing_cols)}") + msg = f"Missing columns: {', '.join(missing_cols)}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return target_columns = columns if not target_columns: - kernel._send_message("stderr", "No numeric columns to standardize.") + msg = "No numeric columns to standardize." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return + operation_status = "success" + messages = [] try: scaler = StandardScaler() target_df[target_columns] = scaler.fit_transform(target_df[target_columns]) - msg = f"Standardized {len(target_columns)} column(s) (mean=0, std=1)." + summary_msg = f"Standardized {len(target_columns)} column(s) (mean=0, std=1)." + messages.append(summary_msg) except Exception as e: - kernel._send_message("stderr", f"Error during standardization: {e}") + operation_status = "error" + err_msg = f"Error during standardization: {e}" + kernel._send_message("stderr", err_msg) + messages.append(err_msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(target_columns), + operation_status=operation_status, + message="\n".join(messages), + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return + # Store results if inplace: data["last_select"] = target_df - msg += " Updated data['last_select'] in-place." + location_msg = "Updated data['last_select'] in-place." + kernel._send_message("stdout", f"{summary_msg} {location_msg}") else: data["last_select_standardized"] = target_df - msg += " Stored in data['last_select_standardized']." + location_msg = "Stored in data['last_select_standardized']." 
+ kernel._send_message("stdout", f"{summary_msg} {location_msg}") + + # Display DataFrame + try: + self._send_html(kernel, target_df) + except Exception: + pass - kernel._send_message("stdout", msg) - self._send_html(kernel, target_df) + # Insert metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "\n".join(target_columns) + message_str = f"{summary_msg}\n{location_msg}" + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status=operation_status, + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass From b0cf51290e1bc376a0ec1802ee86c1fc9ac4a80d Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Wed, 29 Oct 2025 14:06:54 +0000 Subject: [PATCH 27/38] Logging added to select_features --- Untitled.ipynb | 519 ++++++++++-------- .../ml_pipeline/select_features.py | 450 ++++++++++++++- 2 files changed, 714 insertions(+), 255 deletions(-) diff --git a/Untitled.ipynb b/Untitled.ipynb index d52c114..52c58e0 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -5009,11 +5009,7 @@ " \n", " \n", " emp_id\n", - " name\n", - " department\n", " age\n", - " gender\n", - " education_level\n", " years_experience\n", " projects_completed\n", " avg_project_score\n", @@ -5027,17 +5023,16 @@ " performance_rating\n", " potential_score\n", " attrition_flag\n", + " name_lbl\n", " department_lbl\n", + " gender_lbl\n", + " education_level_lbl\n", " \n", " \n", " \n", " \n", " 1\n", - " Alice\n", - " HR\n", " 30\n", - " F\n", - " Bachelors\n", " 5\n", " 12\n", " 87.5\n", @@ -5051,15 +5046,14 @@ " 4\n", " 75.0\n", " 0\n", + " 0\n", " 2\n", + " 0\n", + " 0\n", " \n", " \n", " 2\n", - " Bob\n", - " Engineering\n", " 45\n", - " M\n", - " Masters\n", " 20\n", " 30\n", " 
91.0\n", @@ -5073,15 +5067,14 @@ " 5\n", " 89.0\n", " 0\n", + " 1\n", " 0\n", + " 1\n", + " 2\n", " \n", " \n", " 3\n", - " Charlie\n", - " Sales\n", " 38\n", - " M\n", - " Bachelors\n", " 10\n", " 18\n", " 79.3\n", @@ -5095,15 +5088,14 @@ " 3\n", " 70.0\n", " 1\n", + " 2\n", " 3\n", + " 1\n", + " 0\n", " \n", " \n", " 4\n", - " Diana\n", - " Engineering\n", " 29\n", - " F\n", - " PhD\n", " 6\n", " 22\n", " 95.2\n", @@ -5117,15 +5109,14 @@ " 5\n", " 95.0\n", " 0\n", + " 3\n", " 0\n", + " 0\n", + " 3\n", " \n", " \n", " 5\n", - " Eve\n", - " Finance\n", " 35\n", - " F\n", - " Bachelors\n", " 8\n", " 15\n", " 88.0\n", @@ -5139,15 +5130,14 @@ " 4\n", " 85.0\n", " 0\n", + " 4\n", " 1\n", + " 0\n", + " 0\n", " \n", " \n", " 6\n", - " Frank\n", - " HR\n", " 50\n", - " M\n", - " High School\n", " 25\n", " 8\n", " 72.5\n", @@ -5161,15 +5151,14 @@ " 2\n", " 60.0\n", " 1\n", + " 5\n", " 2\n", + " 1\n", + " 1\n", " \n", " \n", " 7\n", - " Grace\n", - " Sales\n", " 42\n", - " F\n", - " Bachelors\n", " 18\n", " 20\n", " 81.4\n", @@ -5183,15 +5172,14 @@ " 3\n", " 74.0\n", " 0\n", + " 6\n", " 3\n", + " 0\n", + " 0\n", " \n", " \n", " 8\n", - " Henry\n", - " Engineering\n", " 31\n", - " M\n", - " Masters\n", " 7\n", " 25\n", " 93.1\n", @@ -5205,15 +5193,14 @@ " 5\n", " 90.0\n", " 0\n", + " 7\n", " 0\n", + " 1\n", + " 2\n", " \n", " \n", " 9\n", - " Ivy\n", - " Finance\n", " 27\n", - " F\n", - " Bachelors\n", " 3\n", " 10\n", " 85.0\n", @@ -5227,15 +5214,14 @@ " 4\n", " 82.0\n", " 0\n", + " 8\n", " 1\n", + " 0\n", + " 0\n", " \n", " \n", " 10\n", - " Jack\n", - " Sales\n", " 55\n", - " M\n", - " High School\n", " 30\n", " 12\n", " 68.9\n", @@ -5249,15 +5235,14 @@ " 1\n", " 50.0\n", " 1\n", + " 9\n", " 3\n", + " 1\n", + " 1\n", " \n", " \n", " 11\n", - " Alice\n", - " HR\n", " 30\n", - " F\n", - " Bachelors\n", " 5\n", " 12\n", " 87.5\n", @@ -5271,15 +5256,14 @@ " 4\n", " 75.0\n", " 0\n", + " 0\n", " 2\n", + " 0\n", + " 0\n", " \n", " \n", " 12\n", - " Bob\n", - " Engineering\n", " 
45\n", - " M\n", - " Masters\n", " 20\n", " 30\n", " 91.0\n", @@ -5293,15 +5277,14 @@ " 5\n", " 89.0\n", " 0\n", + " 1\n", " 0\n", + " 1\n", + " 2\n", " \n", " \n", " 13\n", - " Charlie\n", - " Sales\n", " 38\n", - " M\n", - " Bachelors\n", " 10\n", " 18\n", " 79.3\n", @@ -5315,15 +5298,14 @@ " 3\n", " 70.0\n", " 1\n", + " 2\n", " 3\n", + " 1\n", + " 0\n", " \n", " \n", " 14\n", - " Diana\n", - " Engineering\n", " 29\n", - " F\n", - " PhD\n", " 6\n", " 22\n", " 95.2\n", @@ -5337,15 +5319,14 @@ " 5\n", " 95.0\n", " 0\n", + " 3\n", + " 0\n", " 0\n", + " 3\n", " \n", " \n", " 15\n", - " Eve\n", - " Finance\n", " 35\n", - " F\n", - " Bachelors\n", " 8\n", " 15\n", " 88.0\n", @@ -5359,15 +5340,14 @@ " 4\n", " 85.0\n", " 0\n", + " 4\n", " 1\n", + " 0\n", + " 0\n", " \n", " \n", " 16\n", - " Frank\n", - " HR\n", " 50\n", - " M\n", - " High School\n", " 25\n", " 8\n", " 72.5\n", @@ -5381,15 +5361,14 @@ " 2\n", " 60.0\n", " 1\n", + " 5\n", " 2\n", + " 1\n", + " 1\n", " \n", " \n", " 17\n", - " Grace\n", - " Sales\n", " 42\n", - " F\n", - " Bachelors\n", " 18\n", " 20\n", " 81.4\n", @@ -5403,15 +5382,14 @@ " 3\n", " 74.0\n", " 0\n", + " 6\n", " 3\n", + " 0\n", + " 0\n", " \n", " \n", " 18\n", - " Henry\n", - " Engineering\n", " 31\n", - " M\n", - " Masters\n", " 7\n", " 25\n", " 93.1\n", @@ -5425,15 +5403,14 @@ " 5\n", " 90.0\n", " 0\n", + " 7\n", " 0\n", + " 1\n", + " 2\n", " \n", " \n", " 19\n", - " Ivy\n", - " Finance\n", " 27\n", - " F\n", - " Bachelors\n", " 3\n", " 10\n", " 85.0\n", @@ -5447,15 +5424,14 @@ " 4\n", " 82.0\n", " 0\n", + " 8\n", " 1\n", + " 0\n", + " 0\n", " \n", " \n", " 20\n", - " Jack\n", - " Sales\n", " 55\n", - " M\n", - " High School\n", " 30\n", " 12\n", " 68.9\n", @@ -5469,15 +5445,14 @@ " 1\n", " 50.0\n", " 1\n", + " 9\n", " 3\n", + " 1\n", + " 1\n", " \n", " \n", " 21\n", - " Alice\n", - " HR\n", " 30\n", - " F\n", - " Bachelors\n", " 5\n", " 12\n", " 87.5\n", @@ -5491,15 +5466,14 @@ " 4\n", " 75.0\n", " 0\n", + " 0\n", " 2\n", + " 0\n", + " 
0\n", " \n", " \n", " 22\n", - " Bob\n", - " Engineering\n", " 45\n", - " M\n", - " Masters\n", " 20\n", " 30\n", " 91.0\n", @@ -5513,15 +5487,14 @@ " 5\n", " 89.0\n", " 0\n", + " 1\n", " 0\n", + " 1\n", + " 2\n", " \n", " \n", " 23\n", - " Charlie\n", - " Sales\n", " 38\n", - " M\n", - " Bachelors\n", " 10\n", " 18\n", " 79.3\n", @@ -5535,15 +5508,14 @@ " 3\n", " 70.0\n", " 1\n", + " 2\n", " 3\n", + " 1\n", + " 0\n", " \n", " \n", " 24\n", - " Diana\n", - " Engineering\n", " 29\n", - " F\n", - " PhD\n", " 6\n", " 22\n", " 95.2\n", @@ -5557,15 +5529,14 @@ " 5\n", " 95.0\n", " 0\n", + " 3\n", + " 0\n", " 0\n", + " 3\n", " \n", " \n", " 25\n", - " Eve\n", - " Finance\n", " 35\n", - " F\n", - " Bachelors\n", " 8\n", " 15\n", " 88.0\n", @@ -5579,15 +5550,14 @@ " 4\n", " 85.0\n", " 0\n", + " 4\n", " 1\n", + " 0\n", + " 0\n", " \n", " \n", " 26\n", - " Frank\n", - " HR\n", " 50\n", - " M\n", - " High School\n", " 25\n", " 8\n", " 72.5\n", @@ -5601,15 +5571,14 @@ " 2\n", " 60.0\n", " 1\n", + " 5\n", " 2\n", + " 1\n", + " 1\n", " \n", " \n", " 27\n", - " Grace\n", - " Sales\n", " 42\n", - " F\n", - " Bachelors\n", " 18\n", " 20\n", " 81.4\n", @@ -5623,15 +5592,14 @@ " 3\n", " 74.0\n", " 0\n", + " 6\n", " 3\n", + " 0\n", + " 0\n", " \n", " \n", " 28\n", - " Henry\n", - " Engineering\n", " 31\n", - " M\n", - " Masters\n", " 7\n", " 25\n", " 93.1\n", @@ -5645,15 +5613,14 @@ " 5\n", " 90.0\n", " 0\n", + " 7\n", " 0\n", + " 1\n", + " 2\n", " \n", " \n", " 29\n", - " Ivy\n", - " Finance\n", " 27\n", - " F\n", - " Bachelors\n", " 3\n", " 10\n", " 85.0\n", @@ -5667,15 +5634,14 @@ " 4\n", " 82.0\n", " 0\n", + " 8\n", " 1\n", + " 0\n", + " 0\n", " \n", " \n", " 30\n", - " Jack\n", - " Sales\n", " 55\n", - " M\n", - " High School\n", " 30\n", " 12\n", " 68.9\n", @@ -5689,15 +5655,14 @@ " 1\n", " 50.0\n", " 1\n", + " 9\n", " 3\n", + " 1\n", + " 1\n", " \n", " \n", " 31\n", - " Alice\n", - " HR\n", " 30\n", - " F\n", - " Bachelors\n", " 5\n", " 12\n", " 87.5\n", @@ -5711,15 +5676,14 @@ 
" 4\n", " 75.0\n", " 0\n", + " 0\n", " 2\n", + " 0\n", + " 0\n", " \n", " \n", " 32\n", - " Bob\n", - " Engineering\n", " 45\n", - " M\n", - " Masters\n", " 20\n", " 30\n", " 91.0\n", @@ -5733,15 +5697,14 @@ " 5\n", " 89.0\n", " 0\n", + " 1\n", " 0\n", + " 1\n", + " 2\n", " \n", " \n", " 33\n", - " Charlie\n", - " Sales\n", " 38\n", - " M\n", - " Bachelors\n", " 10\n", " 18\n", " 79.3\n", @@ -5755,15 +5718,14 @@ " 3\n", " 70.0\n", " 1\n", + " 2\n", " 3\n", + " 1\n", + " 0\n", " \n", " \n", " 34\n", - " Diana\n", - " Engineering\n", " 29\n", - " F\n", - " PhD\n", " 6\n", " 22\n", " 95.2\n", @@ -5777,15 +5739,14 @@ " 5\n", " 95.0\n", " 0\n", + " 3\n", + " 0\n", " 0\n", + " 3\n", " \n", " \n", " 35\n", - " Eve\n", - " Finance\n", " 35\n", - " F\n", - " Bachelors\n", " 8\n", " 15\n", " 88.0\n", @@ -5799,15 +5760,14 @@ " 4\n", " 85.0\n", " 0\n", + " 4\n", " 1\n", + " 0\n", + " 0\n", " \n", " \n", " 36\n", - " Frank\n", - " HR\n", " 50\n", - " M\n", - " High School\n", " 25\n", " 8\n", " 72.5\n", @@ -5821,15 +5781,14 @@ " 2\n", " 60.0\n", " 1\n", + " 5\n", " 2\n", + " 1\n", + " 1\n", " \n", " \n", " 37\n", - " Grace\n", - " Sales\n", " 42\n", - " F\n", - " Bachelors\n", " 18\n", " 20\n", " 81.4\n", @@ -5843,15 +5802,14 @@ " 3\n", " 74.0\n", " 0\n", + " 6\n", " 3\n", + " 0\n", + " 0\n", " \n", " \n", " 38\n", - " Henry\n", - " Engineering\n", " 31\n", - " M\n", - " Masters\n", " 7\n", " 25\n", " 93.1\n", @@ -5865,15 +5823,14 @@ " 5\n", " 90.0\n", " 0\n", + " 7\n", " 0\n", + " 1\n", + " 2\n", " \n", " \n", " 39\n", - " Ivy\n", - " Finance\n", " 27\n", - " F\n", - " Bachelors\n", " 3\n", " 10\n", " 85.0\n", @@ -5887,15 +5844,14 @@ " 4\n", " 82.0\n", " 0\n", + " 8\n", " 1\n", + " 0\n", + " 0\n", " \n", " \n", " 40\n", - " Jack\n", - " Sales\n", " 55\n", - " M\n", - " High School\n", " 30\n", " 12\n", " 68.9\n", @@ -5909,7 +5865,10 @@ " 1\n", " 50.0\n", " 1\n", + " 9\n", " 3\n", + " 1\n", + " 1\n", " \n", " \n", "" @@ -5920,7 +5879,7 @@ } ], "source": [ - "%encode 
method=label columns=department drop_original=false" + "%encode method=label drop_original=true" ] }, { @@ -10952,32 +10911,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "6c5def76-a36c-45be-8712-d886a1e52e25", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Error during feature selection: could not convert string to float: 'Alice'\n" - ] - } - ], - "source": [ - "%select_features target=attrition_flag method=correlation k=5 problem=classification output_name=top_features" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "b0c717fb-9f2f-47ba-8c0c-73c6934a069f", - "metadata": {}, "outputs": [ { "data": { "text/html": [ - "

Feature Selection Results (method=rf_importance)

\n", + "

Feature Selection Results (method=correlation)

\n", " \n", " \n", " \n", @@ -10986,48 +10927,76 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
Feature
avg_project_score0.197081overtime_hours0.867873
years_experience0.133905avg_project_score0.846331
satisfaction_score0.1315870.845916
salary0.097661potential_score0.828136
bonus0.094199performance_rating0.817918
overtime_hours0.085659remote_ratio0.744150
training_hours0.0767670.742307
age0.0504500.683720
gender_lbl0.654654
certifications0.0465780.654654
department_lbl0.634270
years_experience0.623764
bonus0.480500
salary0.463771
projects_completed0.0443830.441624
remote_ratio0.041728name_lbl0.189934
education_level_lbl0.146310
emp_id0.047260
" @@ -11040,24 +11009,24 @@ "name": "stdout", "output_type": "stream", "text": [ - "Selected 7 features saved to data['selected_features']: avg_project_score, years_experience, satisfaction_score, salary, bonus, overtime_hours, training_hours\n" + "Selected 5 features saved to data['top_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_rating\n" ] } ], "source": [ - "%select_features target=potential_score method=rf_importance k=7 problem=regression" + "%select_features target=attrition_flag method=correlation k=5 problem=classification output_name=top_features" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "c106f132-0c90-4db8-8d7d-7c0cb29f6b10", + "execution_count": 14, + "id": "b0c717fb-9f2f-47ba-8c0c-73c6934a069f", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "

Feature Selection Results (method=chi2)

\n", + "

Feature Selection Results (method=rf_importance)

\n", " \n", " \n", " \n", @@ -11066,48 +11035,76 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
Feature
overtime_hours1.792208department_lbl0.483284
certifications1.428571years_experience0.062483
age1.224733avg_project_score0.061437
avg_project_score1.202767satisfaction_score0.054678
remote_ratio1.1585210.051413
years_experience1.115538training_hours0.040343
training_hours1.071429performance_rating0.039563
satisfaction_score1.015994attrition_flag0.039422
education_level_lbl0.036622
age0.036479
overtime_hours0.035503
bonus0.4688640.026721
salary0.4412090.010314
name_lbl0.010022
projects_completed0.4351590.005388
certifications0.004031
gender_lbl0.001930
emp_id0.000368
" @@ -11120,10 +11117,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "Selected 5 features saved to data['top_features']: overtime_hours, certifications, age, avg_project_score, remote_ratio\n" + "Selected 7 features saved to data['selected_features']: department_lbl, years_experience, avg_project_score, satisfaction_score, remote_ratio, training_hours, performance_rating\n" ] } ], + "source": [ + "%select_features target=potential_score method=rf_importance k=7 problem=regression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c106f132-0c90-4db8-8d7d-7c0cb29f6b10", + "metadata": {}, + "outputs": [], "source": [ "%select_features target=attrition_flag method=chi2 k=5 problem=classification output_name=top_features" ] @@ -14103,7 +14110,65 @@ "salarysuccessStandardized 2 column(s) (mean=0, std=1).\n", "Stored in data['last_select_standardized'].test12splitdatatest_size=0.2 val_size=0.1 random_state=422025-10-29 13:04:35ALL_COLUMNSsuccesstrain_name=last_select_train, test_name=last_select_test, val_name=last_select_val\n", "train_count=28, test_count=8, val_count=4\n", - "test_frac=0.2, val_frac=0.1, shuffle=True, random_state=42test" + "test_frac=0.2, val_frac=0.1, shuffle=True, random_state=42test13encodemethod=label columns=department drop_original=true2025-10-29 13:22:48departmentsuccessMethod: label\n", + "Created columns:\n", + "department_lbl\n", + "\n", + "Details:\n", + "Column 'department': label-encoded -> department_lbl (unique_values=4)test14encodemethod=label drop_original=true2025-10-29 13:23:03name\n", + "gender\n", + "education_levelsuccessMethod: label\n", + "Created columns:\n", + "name_lbl\n", + "gender_lbl\n", + "education_level_lbl\n", + "\n", + "Details:\n", + "Column 'name': label-encoded -> name_lbl (unique_values=10)\n", + "Column 'gender': label-encoded -> gender_lbl (unique_values=2)\n", + "Column 'education_level': label-encoded -> education_level_lbl (unique_values=4)test15encodemethod=label 
drop_original=true2025-10-29 13:24:50name\n", + "department\n", + "gender\n", + "education_levelsuccessMethod: label\n", + "Created columns:\n", + "name_lbl\n", + "department_lbl\n", + "gender_lbl\n", + "education_level_lbl\n", + "\n", + "Details:\n", + "Column 'name': label-encoded -> name_lbl (unique_values=10)\n", + "Column 'department': label-encoded -> department_lbl (unique_values=4)\n", + "Column 'gender': label-encoded -> gender_lbl (unique_values=2)\n", + "Column 'education_level': label-encoded -> education_level_lbl (unique_values=4)test16encodemethod=label drop_original=true2025-10-29 13:30:36name\n", + "department\n", + "gender\n", + "education_levelsuccessMethod: label\n", + "Created columns:\n", + "name_lbl\n", + "department_lbl\n", + "gender_lbl\n", + "education_level_lbl\n", + "\n", + "Details:\n", + "Column 'name': label-encoded -> name_lbl (unique_values=10)\n", + "Column 'department': label-encoded -> department_lbl (unique_values=4)\n", + "Column 'gender': label-encoded -> gender_lbl (unique_values=2)\n", + "Column 'education_level': label-encoded -> education_level_lbl (unique_values=4)test17encodemethod=label drop_original=true2025-10-29 13:41:01name\n", + "department\n", + "gender\n", + "education_levelsuccessMethod: label\n", + "Created columns:\n", + "name_lbl\n", + "department_lbl\n", + "gender_lbl\n", + "education_level_lbl\n", + "\n", + "Details:\n", + "Column 'name': label-encoded -> name_lbl (unique_values=10)\n", + "Column 'department': label-encoded -> department_lbl (unique_values=4)\n", + "Column 'gender': label-encoded -> gender_lbl (unique_values=2)\n", + "Column 'education_level': label-encoded -> education_level_lbl (unique_values=4)test18select_featurestarget=attrition_flag method=correlation k=5 problem=classification output_name=top_features2025-10-29 13:41:24overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_ratingsuccessSelected 5 features saved to data['top_features']: overtime_hours, 
avg_project_score, satisfaction_score, potential_score, performance_ratingtest" ] }, "metadata": {}, diff --git a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_features.py b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_features.py index 1f0ca10..f2224bd 100644 --- a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_features.py +++ b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_features.py @@ -10,6 +10,16 @@ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.linear_model import LogisticRegression, Lasso from sklearn.preprocessing import StandardScaler, MinMaxScaler +import logging +import os +import re + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + class SelectFeatures(MariaMagic): """ @@ -20,17 +30,10 @@ class SelectFeatures(MariaMagic): Identify the best features for training a model on data['last_select']. Uses all columns except the target column as features. - Methods: - - correlation: Absolute Pearson correlation with the target. - - rf_importance: RandomForest feature importance scores. - - rfe: Recursive Feature Elimination with a RandomForest model. - - mutual_info: Mutual Information between features and target. - - chi2: Chi-squared statistic (classification only, non-negative features). - - anova: ANOVA F-test for feature significance. - - l1_selection: L1-based feature selection (LogisticRegression for classification, Lasso for regression). - - variance: Remove features with low variance (threshold-based). - Stores the ranked features in data[output_name] and displays a table of results. + + Execution metadata is recorded in table `magic_metadata`. """ + def __init__(self, args=""): self.args = args @@ -43,6 +46,8 @@ def name(self): def help(self): return "Identify the best features for model training from data['last_select']." 
+ + # -------------------- small utilities -------------------- def _str_to_obj(self, s): try: return int(s) @@ -83,17 +88,204 @@ def _send_html(self, kernel, df, title=None): except Exception: pass + # -------------------- metadata / DB helpers (best-effort) -------------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + 
getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + 
{args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # -------------------- end metadata helpers -------------------- + + def execute(self, kernel, data): + # Prepare metadata context early + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + # Load training DataFrame df = data.get("last_select") if df is None or df.empty: - kernel._send_message("stderr", "No last_select found or DataFrame is empty.") + msg = "No last_select found or DataFrame is empty." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return try: args = self.parse_args(self.args) except Exception: - kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + msg = "Error parsing arguments. Use key=value syntax." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return target = args.get("target") @@ -104,24 +296,80 @@ def execute(self, kernel, data): inplace = bool(args.get("inplace", True)) if not target: - kernel._send_message("stderr", "target argument is required (target=target_col).") + msg = "target argument is required (target=target_col)." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return if target not in df.columns: - kernel._send_message("stderr", f"Target column '{target}' not found in DataFrame.") + msg = f"Target column '{target}' not found in DataFrame." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Use all columns except the target as features features = [col for col in df.columns if col != target] if not features: - kernel._send_message("stderr", "No features available after excluding target column.") + msg = "No features available after excluding target column." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Determine problem type if problem_override: problem = problem_override.lower() if problem not in ("classification", "regression"): - kernel._send_message("stderr", "problem must be 'classification' or 'regression'.") + msg = "problem must be 'classification' or 'regression'." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return else: tgt_ser = df[target] @@ -141,9 +389,45 @@ def execute(self, kernel, data): y = df[target].copy() # Handle missing values (simple imputation for feature selection) - X = X.fillna(X.mean(numeric_only=True)) if problem == "regression" else X.fillna(X.mode().iloc[0]) + try: + if problem == "regression": + X = X.fillna(X.mean(numeric_only=True)) + else: + X = X.fillna(X.mode().iloc[0]) + except Exception: + msg = "Features contain non-numeric data or unhandled missing values." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + if X.isna().any().any(): - kernel._send_message("stderr", "Features contain non-numeric data or unhandled missing values.") + msg = "Features contain non-numeric data or unhandled missing values." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Scale data for methods that require it @@ -152,7 +436,21 @@ def execute(self, kernel, data): try: X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index) except Exception as e: - kernel._send_message("stderr", f"Error scaling data: {e}") + msg = f"Error scaling data: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Feature selection @@ -204,10 +502,38 @@ def execute(self, kernel, data): elif method == "chi2": if problem != "classification": - kernel._send_message("stderr", "chi2 method is only for classification problems.") + msg = "chi2 method is only for classification problems." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return if (X < 0).any().any(): - kernel._send_message("stderr", "chi2 requires non-negative features.") + msg = "chi2 requires non-negative features." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return selector = SelectKBest(score_func=chi2, k=k) selector.fit(X, y) @@ -254,14 +580,42 @@ def execute(self, kernel, data): }) else: - kernel._send_message("stderr", "method must be one of 'correlation', 'rf_importance', 'rfe', 'mutual_info', 'chi2', 'anova', 'l1_selection', or 'variance'.") + msg = "method must be one of 'correlation', 'rf_importance', 'rfe', 'mutual_info', 'chi2', 'anova', 'l1_selection', or 'variance'." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return except Exception as e: - kernel._send_message("stderr", f"Error during feature selection: {e}") + msg = f"Error during feature selection: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return - # Store results + # Store results in data dict try: data[output_name] = selected_features data[output_name + "_meta"] = { @@ -272,11 +626,51 @@ def execute(self, kernel, data): "all_scores": result_df.to_dict() } except Exception as e: - kernel._send_message("stderr", f"Error storing results: {e}") + msg = f"Error storing results: {e}" + kernel._send_message("stderr", msg) + try: + 
self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(selected_features) if 'selected_features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Display results - self._send_html(kernel, result_df, title=f"Feature Selection Results (method={method})") - kernel._send_message("stdout", f"Selected {len(selected_features)} features saved to data['{output_name}']: {', '.join(selected_features)}") + try: + self._send_html(kernel, result_df, title=f"Feature Selection Results (method={method})") + except Exception: + pass + + success_msg = f"Selected {len(selected_features)} features saved to data['{output_name}']: {', '.join(selected_features)}" + kernel._send_message("stdout", success_msg) + + # Insert metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = ",".join(selected_features) + message_str = success_msg + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status="success", + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass - return \ No newline at end of file + return From 45bd68647a6b948ad246409c645587ecf56a5616 Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Wed, 29 Oct 2025 17:11:22 +0000 Subject: [PATCH 28/38] Added preview,apply, rollback --- Untitled.ipynb | 3755 ++++++++++------- .../ml_commands/data_cleaning/clipoutliers.py | 681 ++- .../ml_commands/data_cleaning/dropmissing.py | 697 ++- .../ml_commands/data_cleaning/dropoutliers.py | 642 ++- .../ml_commands/data_cleaning/fillmissing.py | 729 +++- 5 files changed, 
4663 insertions(+), 1841 deletions(-) diff --git a/Untitled.ipynb b/Untitled.ipynb index 52c58e0..3c0797e 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -98,6 +98,26 @@ ");\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f6d3778-2064-4b53-98d0-cc0cac8160d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "drop tables employees;" + ] + }, { "cell_type": "code", "execution_count": null, @@ -121,11 +141,11 @@ " overtime_hours, remote_ratio, salary, bonus, satisfaction_score,\n", " performance_rating, potential_score, attrition_flag)\n", "VALUES\n", - "('Alice', 'HR', 30, 'F', 'Bachelors', 5, 12, 87.5, 1, 40, 5, 0.2, 55000, 3000, 8.5, 4, 75.0, 0),\n", - "('Bob', 'Engineering', 45, 'M', 'Masters', 20, 30, 91.0, 3, 20, 10, 0.1, 120000, 15000, 9.0, 5, 89.0, 0),\n", - "('Charlie', 'Sales', 38, 'M', 'Bachelors', 10, 18, 79.3, 0, 15, 20, 0.5, 80000, 7000, 7.2, 3, 70.0, 1),\n", + "('Alice', 'HR', 30, 'F', NULL, 5, 12, 87.5, 1, 40, 5, 0.2, 55000, 300, 8.5, 4, 75.0, 0),\n", + "('Bob', 'Engineering', 45, 'M', 'Masters', 20, 30, 91.0, 3, 20, 10, 0.1, 1200000, 15000, 9.0, 5, 89.0, 0),\n", + "('Charlie', 'Sales', 38, 'M', 'Bachelors', NULL, 18, 79.3, 0, 15, 20, 0.5, 80000, 7000, 7.2, 3, 70.0, 1),\n", "('Diana', 'Engineering', 29, 'F', 'PhD', 6, 22, 95.2, 2, 50, 2, 0.0, 97000, 10000, 9.6, 5, 95.0, 0),\n", - "('Eve', 'Finance', 35, 'F', 'Bachelors', 8, 15, 88.0, 1, 30, 6, 0.3, 90000, 8000, 8.0, 4, 85.0, 0),\n", + "('Eve', NULL, 35, 'F', 'Bachelors', 8, 15, 88.0, 1, 30, 6, 0.3, 90000, 8000, 8.0, 4, 85.0, 0),\n", "('Frank', 'HR', 50, 'M', 'High School', 25, 8, 72.5, 0, 10, 15, 0.7, 60000, 4000, 6.5, 2, 60.0, 1),\n", "('Grace', 'Sales', 42, 'F', 'Bachelors', 18, 20, 81.4, 1, 25, 12, 0.4, 85000, 7000, 7.8, 3, 74.0, 0),\n", "('Henry', 'Engineering', 31, 'M', 'Masters', 7, 25, 93.1, 2, 35, 5, 0.2, 95000, 9000, 9.1, 5, 90.0, 0),\n", @@ -142,7 
+162,7 @@ { "data": { "text/html": [ - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
2BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
3CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
4DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
5EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
6FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
7GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
8HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
9IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
10JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
11AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
12BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
13CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
14DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
15EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
16FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
17GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
18HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
19IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
20JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
21AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
22BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
23CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
24DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
25EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
26FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
27GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
28HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
29IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
30JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
31AliceHR30FBachelors51287.5014050.2055000.003000.008.50475.000
32BobEngineering45MMasters203091.00320100.10120000.0015000.009.00589.000
33CharlieSales38MBachelors101879.30015200.5080000.007000.007.20370.001
34DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
35EveFinance35FBachelors81588.0013060.3090000.008000.008.00485.000
36FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
37GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
38HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
39IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
40JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
" + "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FNULL51287.5014050.2055000.00300.008.50475.000
2BobEngineering45MMasters203091.00320100.101200000.0015000.009.00589.000
3CharlieSales38MBachelorsNULL1879.30015200.5080000.007000.007.20370.001
4DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
5EveNULL35FBachelors81588.0013060.3090000.008000.008.00485.000
6FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
7GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
8HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
9IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
10JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
" ] }, "metadata": {}, @@ -172,29 +192,99 @@ " \n", " \n", " \n", - " id\n", + " emp_id\n", " 0\n", " 0.0\n", " \n", " \n", " name\n", - " 1\n", - " 12.5\n", + " 0\n", + " 0.0\n", " \n", " \n", " department\n", - " 2\n", - " 25.0\n", + " 1\n", + " 10.0\n", " \n", " \n", " age\n", + " 0\n", + " 0.0\n", + " \n", + " \n", + " gender\n", + " 0\n", + " 0.0\n", + " \n", + " \n", + " education_level\n", " 1\n", - " 12.5\n", + " 10.0\n", + " \n", + " \n", + " years_experience\n", + " 1\n", + " 10.0\n", + " \n", + " \n", + " projects_completed\n", + " 0\n", + " 0.0\n", + " \n", + " \n", + " avg_project_score\n", + " 0\n", + " 0.0\n", + " \n", + " \n", + " certifications\n", + " 0\n", + " 0.0\n", + " \n", + " \n", + " training_hours\n", + " 0\n", + " 0.0\n", + " \n", + " \n", + " overtime_hours\n", + " 0\n", + " 0.0\n", + " \n", + " \n", + " remote_ratio\n", + " 0\n", + " 0.0\n", " \n", " \n", " salary\n", - " 3\n", - " 37.5\n", + " 0\n", + " 0.0\n", + " \n", + " \n", + " bonus\n", + " 0\n", + " 0.0\n", + " \n", + " \n", + " satisfaction_score\n", + " 0\n", + " 0.0\n", + " \n", + " \n", + " performance_rating\n", + " 0\n", + " 0.0\n", + " \n", + " \n", + " potential_score\n", + " 0\n", + " 0.0\n", + " \n", + " \n", + " attrition_flag\n", + " 0\n", + " 0.0\n", " \n", " \n", "" @@ -491,15 +581,15 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "70e58c09-5a55-4093-b135-0fed2202d2d3", + "execution_count": 15, + "id": "63c5d2a7-c1ca-4fcd-a711-cf340c630d5b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Dropped rows with missing values (in-place). 
Updated last_select.\n" + "PREVIEW: would drop 1 row(s) (from 10 to 9).\n" ] }, { @@ -508,125 +598,50 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
idemp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag_would_be_dropped
5EveNaN35FBachelors8.01588.01AliceHR30.050000.0
4DavidHR25.048000.0
6FrankEngineering28.072000.0
8GraceSales45.065000.0
9AliceHR30.050000.0
12DavidHR25.048000.0
14FrankEngineering28.072000.0
16GraceSales45.065000.0
17AliceHR30.050000.0
20DavidHR25.048000.0
22FrankEngineering28.072000.0
24GraceSales45.065000.0
25AliceHR30.050000.0
28DavidHR25.048000.0
30FrankEngineering28.072000.0
32GraceSales45.065000.060.390000.08000.08.0485.00True
" @@ -637,33 +652,75 @@ } ], "source": [ - "%dropmissing columns=salary" + "%dropmissing columns=department mode=preview" ] }, { "cell_type": "code", - "execution_count": 22, - "id": "c674309d-4c1c-4d5a-a879-3b3f4816e4c5", + "execution_count": 9, + "id": "2ef8cbe8-6fc7-46a6-9517-4d329bad53c8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Dropped rows with missing values (in-place). Updated last_select.\n" + "EXPLAIN (estimate):\n", + "
EXPLAIN
{\n", + " "query_block": {\n", + " "select_id": 1,\n", + " "table": {\n", + " "delete": 1,\n", + " "table_name": "employees",\n", + " "access_type": "ALL",\n", + " "rows": 10,\n", + " "attached_condition": "employees.department is null"\n", + " }\n", + " }\n", + "}
\n" ] - }, - { - "data": { - "text/html": [ + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0536ae17-266f-4b77-910b-05b69d4f817c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apply completed: original preserved as test.employees_backup_73b9e64a9b8c4045.\n" + ] + }, + { + "data": { + "text/html": [ "\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -671,36 +728,190 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " 
\n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
idemp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30.050000.030FNaN5.01287.514050.255000.03000.08.5475.00
2BobEngineering45MMasters20.03091.0320100.1120000.015000.09.0589.00
3CharlieEngineeringSales38MBachelorsNaN70000.01879.3015200.580000.07000.07.2370.01
4DavidHR25.048000.0DianaEngineering29FPhD6.02295.225020.097000.010000.09.6595.00
6FrankHR50MHigh School25.0872.5010150.760000.04000.06.5260.01
7GraceSales42FBachelors18.02081.4125120.485000.07000.07.8374.00
8HenryEngineering28.072000.031MMasters7.02593.123550.295000.09000.09.1590.00
9IvyFinance27FBachelors3.01085.00208Grace0.670000.05000.08.2482.00
10JackSales45.055MHigh School30.01268.905250.865000.02000.05.5150.01
" @@ -711,44 +922,287 @@ } ], "source": [ - "%dropmissing columns=salary" + "%dropmissing columns=department table=test.employees mode=apply confirm=true " ] }, { "cell_type": "code", - "execution_count": 8, - "id": "364a4ddb-de96-4f20-b979-292184dadd0e", + "execution_count": 17, + "id": "2fbe1e7a-501e-4af2-9d9e-1177adf4d64c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rollback: restored test.employees_backup_73b9e64a9b8c4045 -> test.employees; previous test.employees renamed to test.employees_prerollback_73b9e64a9b8c4045.\n" + ] + } + ], + "source": [ + "%dropmissing mode=rollback rollback_token=73b9e64a9b8c4045" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "70e58c09-5a55-4093-b135-0fed2202d2d3", "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dropped rows with missing values (in-place). Updated last_select.\n" + ] + }, { "data": { "text/html": [ "\n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
count8.000007.0000005.000001AliceHR30.050000.0
mean4.5000036.14285761000.000004DavidHR25.048000.0
std2.449499.2633436FrankEngineering28.072000.0
8GraceSales45.065000.0
9AliceHR30.050000.0
12DavidHR25.048000.0
14FrankEngineering28.072000.0
16GraceSales45.065000.0
17AliceHR30.050000.0
20DavidHR25.048000.0
22FrankEngineering28.072000.0
24GraceSales45.065000.0
25AliceHR30.050000.0
28DavidHR25.048000.0
30FrankEngineering28.072000.0
32GraceSales45.065000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%dropmissing columns=salary" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "c674309d-4c1c-4d5a-a879-3b3f4816e4c5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dropped rows with missing values (in-place). Updated last_select.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
1AliceHR30.050000.0
3CharlieEngineeringNaN70000.0
4DavidHR25.048000.0
6FrankEngineering28.072000.0
8GraceSales45.065000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%dropmissing columns=salary" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "364a4ddb-de96-4f20-b979-292184dadd0e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1156,7 +1610,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "52d78b76-69c5-41d4-874f-8d8cb8b0cae9", "metadata": {}, "outputs": [ @@ -1164,9 +1618,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "Fill missing completed (in-place). Summary:\n", - "Column 'age': filled missing with median=36.5.\n", - "Column 'salary': filled missing with median=82500.0.\n" + "PREVIEW: missing counts per column:\n", + "years_experience: missing=1\n", + "PREVIEW: computed fill-values (best-effort):\n", + "years_experience: would fill with -> 8.0 (median via local preview)\n" ] }, { @@ -1194,39 +1649,115 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idagesalary
count8.000007.0000005.00000
mean4.5000036.14285761000.00000
std2.449499.26334311269.42767
performance_ratingpotential_scoreattrition_flag_null_columns
1AliceHR30F3CharlieSales38MBachelors51287.514050.255000.03000.08.5475.0NaN1879.30
2BobEngineering45MMasters1520300.580000.07000.07.2370.01years_experience
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%fillmissing columns=years_experience strategy=median mode=preview" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "caf67100-c435-4da1-b993-b46f58981277", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apply completed: original preserved as test.employees_backup_9407f2e1e7db47b2.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -1246,7 +1777,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1267,7 +1798,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1284,11 +1815,11 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1309,7 +1840,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1330,7 +1861,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1351,7 +1882,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1372,7 +1903,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1393,7 +1924,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -1407,731 +1938,440 @@ " \n", " \n", " \n", + " \n", + "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FNaN5.01287.514050.255000.03000.08.5475.00
2BobEngineering45MMasters20.03091.032038MBachelors108.01879.3029FPhD66.02295.22
5EveFinanceNaN35FBachelors88.01588.0150MHigh School2525.0872.5042FBachelors1818.02081.4131MMasters77.02593.1227FBachelors33.01085.0055MHigh School3030.01268.9050.01
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%fillmissing columns=years_experience strategy=median table=test.employees mode=apply confirm=true " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "50047388-14df-4cfc-a670-5c1021b6e471", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rollback: restored test.employees_backup_9407f2e1e7db47b2 -> test.employees; previous test.employees renamed to test.employees_prerollback_9407f2e1e7db47b2.\n" + ] + } + ], + "source": [ + "%fillmissing mode=rollback rollback_token=9407f2e1e7db47b2" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "beac2393-829b-472e-b9cb-d12166e16088", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fill missing completed (in-place). Summary:\n", + "Column 'id': filled missing with mode=1.\n", + "Column 'name': filled missing with mode=Alice.\n", + "Column 'department': filled missing with mode=Engineering.\n", + "Column 'age': filled missing with mode=25.0.\n", + "Column 'salary': filled missing with mode=48000.0.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamedepartmentagesalary
111AliceHR30FBachelors51287.514050.255000.03000.08.5475.0030.050000.0
122BobEngineering45MMasters203091.0320100.1120000.015000.09.0589.0040.048000.0
13CharlieSales38MBachelors101879.3015200.580000.07000.07.2370.01CharlieEngineering25.070000.0
14Diana4DavidHR25.048000.0
5EveEngineering29FPhD35.048000.0
62295.225020.097000.010000.09.6595.00FrankEngineering28.072000.0
15EveFinance35FBachelors81588.013060.390000.08000.08.0485.007AliceSales50.048000.0
16FrankHR50MHigh School25872.5010150.760000.04000.06.5260.01
17GraceSales42FBachelors182081.445.065000.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%fillmissing strategy=mode" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "948523f7-1b1c-4f37-a3ae-161e8ce40d0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fill missing completed (in-place). Summary:\n", + "Column 'name': filled missing with constant value=Unknown.\n", + "Column 'department': filled missing with constant value=Unknown.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalary
125120.485000.07000.07.8374.00AliceHR30.05000.0
18HenryEngineering31MMasters72593.123550.295000.09000.09.1590.00BobUnknown40.065000.0
19IvyFinance27FBachelors31085.002080.670000.05000.08.2CharlieEngineering35.0700000.0
482.00DavidHR25.048000.0
20JackSales55MHigh School301268.905250.8EveUnknown35.065000.02000.05.51
6FrankEngineering28.072000.0
7UnknownSales50.0165000.0
218GraceSales45.065000.0
9AliceHR30FBachelors51287.514050.255000.03000.08.5475.0030.05000.0
22BobEngineering45MMasters203091.0320100.1120000.015000.09.0589.00BobUnknown40.065000.0
2311CharlieSales38MBachelors101879.3015200.580000.07000.07.2370.01Engineering35.0700000.0
24DianaEngineering29FPhD62295.225020.097000.010000.09.6595.0012DavidHR25.048000.0
2513EveFinance35FBachelors81588.013060.390000.08000.08.0485.00Unknown35.065000.0
2614FrankHR50MHigh School25872.5010Engineering28.072000.0
150.760000.04000.06.5260.01UnknownSales50.065000.0
2716GraceSales42FBachelors45.065000.0
17AliceHR30.05000.0
182081.4125120.485000.07000.07.8374.00BobUnknown40.065000.0
28Henry19CharlieEngineering31MMasters72593.123550.295000.09000.09.1590.0035.0700000.0
29IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00DavidHR25.048000.0
30JackSales55MHigh School301268.905250.821EveUnknown35.065000.02000.05.51
22FrankEngineering28.072000.0
23UnknownSales50.0165000.0
3124GraceSales45.065000.0
25AliceHR30FBachelors51287.514050.255000.03000.08.5475.0030.05000.0
3226BobEngineering45MMasters203091.0320100.1120000.015000.09.0589.00Unknown40.065000.0
3327CharlieSales38MBachelors101879.3015200.580000.07000.07.2370.01
34DianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
35EveFinance35FBachelors81588.013060.390000.08000.08.0485.0035.0700000.0
36Frank28DavidHR50MHigh School25872.5010150.760000.04000.06.5260.0125.048000.0
37GraceSales42FBachelors182081.4125120.485000.07000.07.8374.0029EveUnknown35.065000.0
38Henry30FrankEngineering31MMasters72593.123550.295000.09000.09.1590.0028.072000.0
39IvyFinance27FBachelors31085.002080.670000.05000.08.2482.0031UnknownSales50.065000.0
40Jack32GraceSales55MHigh School301268.905250.845.065000.02000.05.5150.01
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%fillmissing columns=age,salary strategy=median" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "beac2393-829b-472e-b9cb-d12166e16088", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fill missing completed (in-place). Summary:\n", - "Column 'id': filled missing with mode=1.\n", - "Column 'name': filled missing with mode=Alice.\n", - "Column 'department': filled missing with mode=Engineering.\n", - "Column 'age': filled missing with mode=25.0.\n", - "Column 'salary': filled missing with mode=48000.0.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2146,35 +2386,114 @@ } ], "source": [ - "%fillmissing strategy=mode" + "%fillmissing columns=name,department strategy=constant value=\"Unknown\"" ] }, { "cell_type": "code", - "execution_count": 5, - "id": "948523f7-1b1c-4f37-a3ae-161e8ce40d0b", + "execution_count": 27, + "id": "81a88f9a-bdf8-4f87-a5d0-a0a88fcc5ace", "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "Fill missing completed (in-place). 
Summary:\n", - "Column 'name': filled missing with constant value=Unknown.\n", - "Column 'department': filled missing with constant value=Unknown.\n" + "Strategy 'constant' requires a 'value=...' argument.\n" ] - }, - { - "data": { - "text/html": [ - "
idnamedepartmentagesalary
133AliceHR30.050000.05000.0
234BobEngineeringUnknown40.048000.065000.0
335CharlieEngineering25.070000.035.0700000.0
436DavidHR25.048000.0
537EveEngineeringUnknown35.048000.065000.0
638FrankEngineering28.072000.0
7Alice39UnknownSales50.048000.065000.0
840GraceSales45.0
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + } + ], + "source": [ + "%fillmissing strategy=constant" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "d76ed0d7-4332-40b0-a5c6-588784807a23", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The result set was successfully written into last_query.csv\n" + ] + } + ], + "source": [ + "%df" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6cf47bfb-ff40-4522-8723-a8bf17398210", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Outlier detection completed (non in-place). Summary:\n", + "Column 'emp_id': detected 0 outlier(s) using iqr.\n", + "Column 'age': detected 0 outlier(s) using iqr.\n", + "Column 'years_experience': detected 0 outlier(s) using iqr.\n", + "Column 'projects_completed': detected 0 outlier(s) using iqr.\n", + "Column 'avg_project_score': detected 0 outlier(s) using iqr.\n", + "Column 'certifications': detected 0 outlier(s) using iqr.\n", + "Column 'training_hours': detected 0 outlier(s) using iqr.\n", + "Column 'overtime_hours': detected 0 outlier(s) using iqr.\n", + "Column 'remote_ratio': detected 0 outlier(s) using iqr.\n", + "Column 'salary': detected 1 outlier(s) using iqr.\n", + "Column 'bonus': detected 0 outlier(s) using iqr.\n", + "Column 'satisfaction_score': detected 0 outlier(s) using iqr.\n", + "Column 'performance_rating': detected 0 outlier(s) using iqr.\n", + "Column 'potential_score': detected 0 outlier(s) using iqr.\n", + "Column 'attrition_flag': detected 0 outlier(s) using iqr.\n", + "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" + ] + }, + { + "data": { + "text/html": [ + "
idnamedepartmentagesalary
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2182,281 +2501,361 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagemp_id_is_outlierage_is_outlieryears_experience_is_outlierprojects_completed_is_outlieravg_project_score_is_outliercertifications_is_outliertraining_hours_is_outlierovertime_hours_is_outlierremote_ratio_is_outliersalary_is_outlierbonus_is_outliersatisfaction_score_is_outlierperformance_rating_is_outlierpotential_score_is_outlierattrition_flag_is_outlier
1AliceHR30.05000.030FNaN5.01287.514050.255000.0300.08.5475.00FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
2BobUnknown40.065000.0Engineering45MMasters20.03091.0320100.11200000.015000.09.0589.00FalseFalseFalseFalseFalseFalseFalseFalseFalseTrueFalseFalseFalseFalseFalse
3CharlieEngineering35.0700000.0Sales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
4DavidHR25.048000.0DianaEngineering29FPhD6.02295.225020.097000.010000.09.6595.00FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
5EveUnknown35.065000.0NaN35FBachelors8.01588.013060.390000.08000.08.0485.00FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
6FrankEngineering28.072000.0
7UnknownSales50.065000.0
8GraceSales45.065000.0
9AliceHR30.05000.0
10BobUnknown40.065000.0
11CharlieEngineering35.0700000.0
12DavidHR50MHigh School25.048000.0
13EveUnknown35.065000.0
14FrankEngineering28.072000.0
872.501015UnknownSales50.065000.00.760000.04000.06.5260.01FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
167GraceSales45.065000.0
17AliceHR30.05000.0
18BobUnknown40.065000.0
19CharlieEngineering35.0700000.0
42FBachelors18.020DavidHR25.048000.0
21EveUnknown35.065000.0
22FrankEngineering28.072000.0
23UnknownSales50.065000.0
24GraceSales45.065000.0
81.4125AliceHR30.05000.0
26BobUnknown40.065000.0
27CharlieEngineering35.0700000.0
28DavidHR25.048000.0
29EveUnknown35.065000.0
30FrankEngineering28.072000.0
31UnknownSales50.065000.0
32GraceSales45.065000.0
33AliceHR30.05000.0
34BobUnknown40.065000.0
35CharlieEngineering35.0700000.0
36DavidHR25.048000.0
37EveUnknown35.065000.0120.485000.07000.07.8374.00FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
38Frank8HenryEngineering28.072000.031MMasters7.02593.123550.295000.09000.09.1590.00FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
39UnknownSales50.065000.09IvyFinance27FBachelors3.01085.002080.670000.05000.08.2482.00FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
40Grace10JackSales45.055MHigh School30.01268.905250.865000.02000.05.5150.01FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
" @@ -2467,49 +2866,13 @@ } ], "source": [ - "%fillmissing columns=name,department strategy=constant value=\"Unknown\"" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "81a88f9a-bdf8-4f87-a5d0-a0a88fcc5ace", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Strategy 'constant' requires a 'value=...' argument.\n" - ] - } - ], - "source": [ - "%fillmissing strategy=constant" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "d76ed0d7-4332-40b0-a5c6-588784807a23", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The result set was successfully written into last_query.csv\n" - ] - } - ], - "source": [ - "%df" + "%outliers" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "6cf47bfb-ff40-4522-8723-a8bf17398210", + "execution_count": 11, + "id": "ea459692-59ac-49be-8ba2-0c4b0b01908d", "metadata": {}, "outputs": [ { @@ -2517,9 +2880,9 @@ "output_type": "stream", "text": [ "Outlier detection completed (non in-place). 
Summary:\n", - "Column 'id': detected 0 outlier(s) using iqr.\n", - "Column 'age': detected 0 outlier(s) using iqr.\n", - "Column 'salary': detected 1 outlier(s) using iqr.\n", + "Column 'id': detected 0 outlier(s) using zscore.\n", + "Column 'age': detected 0 outlier(s) using zscore.\n", + "Column 'salary': detected 0 outlier(s) using zscore.\n", "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" ] }, @@ -2568,7 +2931,7 @@ " 700000.0\n", " False\n", " False\n", - " True\n", + " False\n", " \n", " \n", " 4\n", @@ -2629,13 +2992,13 @@ } ], "source": [ - "%outliers" + "%outliers method=zscore z_thresh=2.5" ] }, { "cell_type": "code", - "execution_count": 11, - "id": "ea459692-59ac-49be-8ba2-0c4b0b01908d", + "execution_count": 12, + "id": "6a53e5f2-cdb4-4d72-afb1-5a7719be8835", "metadata": {}, "outputs": [ { @@ -2643,12 +3006,19 @@ "output_type": "stream", "text": [ "Outlier detection completed (non in-place). Summary:\n", - "Column 'id': detected 0 outlier(s) using zscore.\n", - "Column 'age': detected 0 outlier(s) using zscore.\n", - "Column 'salary': detected 0 outlier(s) using zscore.\n", + "Column 'id': detected 0 outlier(s) using iqr.\n", + "Column 'age': detected 0 outlier(s) using iqr.\n", + "Column 'salary': detected 1 outlier(s) using iqr.\n", "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" ] }, + { + "data": { + "image/png": "" + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "text/html": [ @@ -2694,7 +3064,7 @@ " 700000.0\n", " False\n", " False\n", - " False\n", + " True\n", " \n", " \n", " 4\n", @@ -2755,29 +3125,26 @@ } ], "source": [ - "%outliers method=zscore z_thresh=2.5" + "%outliers plot=True" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "6a53e5f2-cdb4-4d72-afb1-5a7719be8835", + "execution_count": 24, + "id": "636a1612-e7b4-44fd-9cdb-bf7c935719b7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", 
"text": [ - "Outlier detection completed (non in-place). Summary:\n", - "Column 'id': detected 0 outlier(s) using iqr.\n", - "Column 'age': detected 0 outlier(s) using iqr.\n", - "Column 'salary': detected 1 outlier(s) using iqr.\n", - "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" + "Marked outliers in-place. Summary:\n", + "Column 'salary': detected 2 outlier(s) using iqr.\n" ] }, { "data": { - "image/png": "" + "image/png": "" }, "metadata": {}, "output_type": "display_data" @@ -2793,8 +3160,6 @@ " department\n", " age\n", " salary\n", - " id_is_outlier\n", - " age_is_outlier\n", " salary_is_outlier\n", " \n", " \n", @@ -2803,81 +3168,65 @@ " 1\n", " Alice\n", " HR\n", - " 30.000000\n", + " 30.0\n", " 5000.0\n", - " False\n", - " False\n", - " False\n", + " True\n", " \n", " \n", " 2\n", " Bob\n", - " Unknown\n", - " 40.000000\n", - " 178000.0\n", - " False\n", - " False\n", + " NaN\n", + " 40.0\n", + " NaN\n", " False\n", " \n", " \n", " 3\n", " Charlie\n", " Engineering\n", - " 36.142857\n", + " NaN\n", " 700000.0\n", - " False\n", - " False\n", " True\n", " \n", " \n", " 4\n", " David\n", " HR\n", - " 25.000000\n", + " 25.0\n", " 48000.0\n", " False\n", - " False\n", - " False\n", " \n", " \n", " 5\n", " Eve\n", - " Unknown\n", - " 35.000000\n", - " 178000.0\n", - " False\n", - " False\n", + " NaN\n", + " 35.0\n", + " NaN\n", " False\n", " \n", " \n", " 6\n", " Frank\n", " Engineering\n", - " 28.000000\n", - " 72000.0\n", - " False\n", - " False\n", + " 28.0\n", + " 72000.0\n", " False\n", " \n", " \n", " 7\n", - " Unknown\n", + " NaN\n", " Sales\n", - " 50.000000\n", - " 178000.0\n", - " False\n", - " False\n", + " 50.0\n", + " NaN\n", " False\n", " \n", " \n", " 8\n", " Grace\n", " Sales\n", - " 45.000000\n", + " 45.0\n", " 65000.0\n", " False\n", - " False\n", - " False\n", " \n", " \n", "" @@ -2888,29 +3237,112 @@ } ], "source": [ - "%outliers plot=True" + "%outliers columns=salary plot=True" ] }, { 
"cell_type": "code", - "execution_count": 24, - "id": "636a1612-e7b4-44fd-9cdb-bf7c935719b7", + "execution_count": 9, + "id": "3dfbc841-56d4-40f5-9be0-927acf0e6c63", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Marked outliers in-place. Summary:\n", - "Column 'salary': detected 2 outlier(s) using iqr.\n" + "PREVIEW (local): would drop 1 row(s) (from 10 to 9).\n", + "Column 'emp_id': detected 0 outlier(s) using iqr.\n", + "Column 'age': detected 0 outlier(s) using iqr.\n", + "Column 'years_experience': detected 0 outlier(s) using iqr.\n", + "Column 'projects_completed': detected 0 outlier(s) using iqr.\n", + "Column 'avg_project_score': detected 0 outlier(s) using iqr.\n", + "Column 'certifications': detected 0 outlier(s) using iqr.\n", + "Column 'training_hours': detected 0 outlier(s) using iqr.\n", + "Column 'overtime_hours': detected 0 outlier(s) using iqr.\n", + "Column 'remote_ratio': detected 0 outlier(s) using iqr.\n", + "Column 'salary': detected 1 outlier(s) using iqr.\n", + "Column 'bonus': detected 0 outlier(s) using iqr.\n", + "Column 'satisfaction_score': detected 0 outlier(s) using iqr.\n", + "Column 'performance_rating': detected 0 outlier(s) using iqr.\n", + "Column 'potential_score': detected 0 outlier(s) using iqr.\n", + "Column 'attrition_flag': detected 0 outlier(s) using iqr.\n" ] }, { "data": { - "image/png": "" + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag_outlier_cols
2BobEngineering45MMasters20.03091.0320100.11200000.015000.09.0589.00
" + ] }, "metadata": {}, "output_type": "display_data" + } + ], + "source": [ + "%dropoutliers mode=preview" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "444a843d-c62d-4dde-8be4-298e62f2f92b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apply completed: original preserved as test.employees_backup_53731015c85a478a.\n" + ] }, { "data": { @@ -2918,12 +3350,25 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2931,65 +3376,169 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
idemp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarysalary_is_outlierbonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30.05000.0True
2BobNaN40.0NaNFalse
3CharlieEngineering30FNaN700000.0True51287.514050.255000.0300.08.5475.00
4DavidHR25.048000.0FalseDianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
5EveNaN35.0NaNFalse35FBachelors81588.013060.390000.08000.08.0485.00
6FrankEngineering28.072000.0FalseHR50MHigh School25872.5010150.760000.04000.06.5260.01
7NaNGraceSales50.0NaNFalse42FBachelors182081.4125120.485000.07000.07.8374.00
8HenryEngineering31MMasters72593.123550.295000.09000.09.1590.00
9IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
8Grace10JackSales45.055MHigh School301268.905250.865000.0False2000.05.5150.01
" @@ -3000,23 +3549,41 @@ } ], "source": [ - "%outliers columns=salary plot=True" + "%dropoutliers table=test.employees mode=apply confirm=true " ] }, { "cell_type": "code", - "execution_count": 7, - "id": "3dfbc841-56d4-40f5-9be0-927acf0e6c63", + "execution_count": 11, + "id": "da62d244-57bb-46db-a98c-0f19fcf7073a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Dropped 2 row(s) containing outliers (in-place).\n", - "Column 'id': detected 0 outlier(s) using iqr.\n", - "Column 'age': detected 0 outlier(s) using iqr.\n", - "Column 'salary': detected 2 outlier(s) using iqr.\n" + "Rollback: restored test.employees_backup_53731015c85a478a -> test.employees; previous test.employees renamed to test.employees_prerollback_53731015c85a478a.\n" + ] + } + ], + "source": [ + "%dropoutliers mode=rollback rollback_token=53731015c85a478a" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "fdc71a0a-2246-4754-8f52-642be4ea209f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No outliers detected. No rows removed.\n", + "Column 'id': detected 0 outlier(s) using zscore.\n", + "Column 'age': detected 0 outlier(s) using zscore.\n", + "Column 'salary': detected 0 outlier(s) using zscore.\n" ] }, { @@ -3084,23 +3651,36 @@ } ], "source": [ - "%dropoutliers" + "%dropoutliers method=zscore z_thresh=2.5" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "fdc71a0a-2246-4754-8f52-642be4ea209f", + "execution_count": 10, + "id": "b4c02d58-7020-4514-b84a-9a6b5e18fb40", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "No outliers detected. 
No rows removed.\n", - "Column 'id': detected 0 outlier(s) using zscore.\n", - "Column 'age': detected 0 outlier(s) using zscore.\n", - "Column 'salary': detected 0 outlier(s) using zscore.\n" + "Clip outliers completed using iqr.\n", + "Column 'emp_id': clipped 0 value(s) (bounds: -18.5000, 59.5000).\n", + "Column 'age': clipped 0 value(s) (bounds: 7.5000, 67.5000).\n", + "Column 'years_experience': clipped 0 value(s) (bounds: -15.0000, 41.0000).\n", + "Column 'projects_completed': clipped 0 value(s) (bounds: -3.0000, 37.0000).\n", + "Column 'avg_project_score': clipped 0 value(s) (bounds: 61.7500, 108.5500).\n", + "Column 'certifications': clipped 0 value(s) (bounds: -3.0000, 5.0000).\n", + "Column 'training_hours': clipped 0 value(s) (bounds: -15.0000, 65.0000).\n", + "Column 'overtime_hours': clipped 0 value(s) (bounds: -10.0000, 30.0000).\n", + "Column 'remote_ratio': clipped 0 value(s) (bounds: -0.4000, 1.2000).\n", + "Column 'salary': clipped 0 value(s) (bounds: 20000.0000, 140000.0000).\n", + "Column 'bonus': clipped 0 value(s) (bounds: -3500.0000, 16500.0000).\n", + "Column 'satisfaction_score': clipped 0 value(s) (bounds: 4.5000, 11.7000).\n", + "Column 'performance_rating': clipped 0 value(s) (bounds: 0.0000, 8.0000).\n", + "Column 'potential_score': clipped 0 value(s) (bounds: 41.5000, 117.5000).\n", + "Column 'attrition_flag': clipped 0 value(s) (bounds: -1.5000, 2.5000).\n", + "Total values clipped: 0. 
Modified in-place: data['last_select'] updated.\n" ] }, { @@ -3109,127 +3689,240 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - "
idemp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FBachelors51287.514050.255000.03000.08.5475.00
2BobNaN40.0NaNEngineering45MMasters203091.0320100.1120000.015000.09.0589.00
3CharlieSales38MBachelors101879.3015200.580000.07000.07.2370.01
4DavidDianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
5EveFinance35FBachelors81588.013060.390000.08000.08.0485.00
6FrankHR25.048000.050MHigh School25872.5010150.760000.04000.06.5260.01
7GraceSales42FBachelors182081.4125120.485000.07000.07.8374.00
8HenryEngineering31MMasters72593.123550.295000.09000.09.15EveNaN35.0NaN
6FrankEngineering28.072000.090.00
7NaNSales50.0NaN9IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
8Grace10JackSales45.055MHigh School301268.905250.865000.02000.05.5150.01
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%dropoutliers method=zscore z_thresh=2.5" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "b4c02d58-7020-4514-b84a-9a6b5e18fb40", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Clip outliers completed using iqr.\n", - "Column 'emp_id': clipped 0 value(s) (bounds: -18.5000, 59.5000).\n", - "Column 'age': clipped 0 value(s) (bounds: 7.5000, 67.5000).\n", - "Column 'years_experience': clipped 0 value(s) (bounds: -15.0000, 41.0000).\n", - "Column 'projects_completed': clipped 0 value(s) (bounds: -3.0000, 37.0000).\n", - "Column 'avg_project_score': clipped 0 value(s) (bounds: 61.7500, 108.5500).\n", - "Column 'certifications': clipped 0 value(s) (bounds: -3.0000, 5.0000).\n", - "Column 'training_hours': clipped 0 value(s) (bounds: -15.0000, 65.0000).\n", - "Column 'overtime_hours': clipped 0 value(s) (bounds: -10.0000, 30.0000).\n", - "Column 'remote_ratio': clipped 0 value(s) (bounds: -0.4000, 1.2000).\n", - "Column 'salary': clipped 0 value(s) (bounds: 20000.0000, 140000.0000).\n", - "Column 'bonus': clipped 0 value(s) (bounds: -3500.0000, 16500.0000).\n", - "Column 'satisfaction_score': clipped 0 value(s) (bounds: 4.5000, 11.7000).\n", - "Column 'performance_rating': clipped 0 value(s) (bounds: 0.0000, 8.0000).\n", - "Column 'potential_score': clipped 0 value(s) (bounds: 41.5000, 117.5000).\n", - "Column 'attrition_flag': clipped 0 value(s) (bounds: -1.5000, 2.5000).\n", - "Total values clipped: 0. 
Modified in-place: data['last_select'] updated.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3250,7 +3943,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3271,7 +3964,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3292,7 +3985,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3313,7 +4006,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3334,7 +4027,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3355,7 +4048,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3376,7 +4069,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3397,7 +4090,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3418,7 +4111,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3439,7 +4132,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3460,7 +4153,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3481,7 +4174,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3502,7 +4195,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3523,7 +4216,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3544,7 +4237,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3565,7 +4258,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3586,7 +4279,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3607,7 +4300,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3628,7 +4321,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3649,7 +4342,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " 
\n", @@ -3670,7 +4363,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3691,7 +4384,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3712,7 +4405,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3733,7 +4426,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3754,7 +4447,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3775,7 +4468,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3796,7 +4489,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3817,7 +4510,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -3838,235 +4531,379 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
111AliceHR300
212BobEngineering450
313CharlieSales381
414DianaEngineering290
515EveFinance350
616FrankHR501
717GraceSales420
818HenryEngineering310
919IvyFinance270
1020JackSales551
1121AliceHR300
1222BobEngineering450
1323CharlieSales381
1424DianaEngineering290
1525EveFinance350
1626FrankHR501
1727GraceSales420
1828HenryEngineering310
1929IvyFinance270
2030JackSales551
2131AliceHR300
2232BobEngineering450
2333CharlieSales381
2434DianaEngineering290
2535EveFinance350
2636FrankHR501
2737GraceSales420
2838HenryEngineering310
2939IvyFinance270
3040JackSales55MHigh SchoolHigh School301268.905250.865000.02000.05.5150.01
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%clipoutliers" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "60e97acc-b37f-4197-97c1-2a6dcb768b0c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREVIEW (local): would modify 1 value(s) across 15 column(s).\n", + "Column 'emp_id': would clip 0 value(s) locally (bounds: -3.5, 14.5).\n", + "Column 'age': would clip 0 value(s) locally (bounds: 9.25, 65.25).\n", + "Column 'years_experience': would clip 0 value(s) locally (bounds: -15.0, 41.0).\n", + "Column 'projects_completed': would clip 0 value(s) locally (bounds: -2.25, 35.75).\n", + "Column 'avg_project_score': would clip 0 value(s) locally (bounds: 64.1875, 105.88749999999999).\n", + "Column 'certifications': would clip 0 value(s) locally (bounds: -2.625, 4.375).\n", + "Column 'training_hours': would clip 0 value(s) locally (bounds: -10.0, 60.0).\n", + "Column 'overtime_hours': would clip 0 value(s) locally (bounds: -8.25, 27.75).\n", + "Column 'remote_ratio': would clip 0 value(s) locally (bounds: -0.3624999999999999, 1.1374999999999997).\n", + "Column 'salary': would clip 1 value(s) locally (bounds: 25000.0, 135000.0).\n", + "Column 'bonus': would clip 0 value(s) locally (bounds: -2500.0, 15500.0).\n", + "Column 'satisfaction_score': would clip 0 value(s) locally (bounds: 5.062499999999999, 11.162500000000001).\n", + "Column 'performance_rating': would clip 0 value(s) locally (bounds: 0.375, 7.375).\n", + "Column 'potential_score': would clip 0 value(s) locally (bounds: 45.5, 113.5).\n", + "Column 'attrition_flag': would clip 0 value(s) locally (bounds: -1.125, 1.875).\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag_oob_columns
2BobEngineering45MMasters20.0301268.9091.0320100.11200000.015000.09.05250.865000.02000.05.5150.0189.00salary
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%clipoutliers mode=preview" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "41e5e6ae-419f-4a60-afe1-dd0a64b62d6c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apply completed: original preserved as test.employees_backup_656479791b1d48fc.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", 
" \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
311.0AliceHR3030.0FBachelors512NaN5.012.087.514051.040.05.00.255000.03000.0300.08.544.075.000.0
322.0BobEngineering4545.0MMasters203020.030.091.0320103.020.010.00.1120000.0135000.015000.09.055.089.000.0
333.0CharlieSales3838.0MBachelors1018NaN18.079.3015200.015.020.00.580000.07000.07.233.070.011.0
344.0DianaEngineering2929.0FPhD6226.022.095.225022.050.02.00.097000.010000.09.655.095.000.0
355.0EveFinance35NaN35.0FBachelors8158.015.088.013061.030.06.00.390000.08000.08.044.085.000.0
366.0FrankHR5050.0MHigh School25825.08.072.5010150.010.015.00.760000.04000.06.522.060.011.0
377.0GraceSales4242.0FBachelors182018.020.081.4125121.025.012.00.485000.07000.07.833.074.000.0
388.0HenryEngineering3131.0MMasters7257.025.093.123552.035.05.00.295000.09000.09.155.090.000.0
399.0IvyFinance2727.0FBachelors3103.010.085.002080.020.08.00.670000.05000.08.244.082.000.0
4010.0JackSales5555.0MHigh School301230.012.068.905250.05.025.00.865000.02000.05.511.050.011.0
" @@ -4077,7 +4914,25 @@ } ], "source": [ - "%clipoutliers" + "%clipoutliers table=test.employees mode=apply confirm=true " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "78200a79-5413-47a2-ad46-c931b9d05d63", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rollback: restored test.employees_backup_656479791b1d48fc -> test.employees; previous test.employees renamed to test.employees_prerollback_656479791b1d48fc.\n" + ] + } + ], + "source": [ + "%clipoutliers mode=rollback rollback_token=656479791b1d48fc" ] }, { @@ -14168,7 +15023,7 @@ "Column 'name': label-encoded -> name_lbl (unique_values=10)\n", "Column 'department': label-encoded -> department_lbl (unique_values=4)\n", "Column 'gender': label-encoded -> gender_lbl (unique_values=2)\n", - "Column 'education_level': label-encoded -> education_level_lbl (unique_values=4)test18select_featurestarget=attrition_flag method=correlation k=5 problem=classification output_name=top_features2025-10-29 13:41:24overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_ratingsuccessSelected 5 features saved to data['top_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_ratingtest" + "Column 'education_level': label-encoded -> education_level_lbl (unique_values=4)test18select_featurestarget=attrition_flag method=correlation k=5 problem=classification output_name=top_features2025-10-29 13:41:24overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_ratingsuccessSelected 5 features saved to data['top_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_ratingtest19missing2025-10-29 14:23:11ALL_COLUMNSsuccess%missing action=show examined 19 column(s); total_rows=10.test" ] }, "metadata": {}, @@ -14184,6 +15039,26 @@ "execution_count": null, "id": "8b52bf52-58a7-41bb-a568-b2691ed22f02", "metadata": {}, + "outputs": [ + { + 
"data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "drop table magic_metadata;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e8ace40-a7b5-41e6-9225-fc52e197d0ea", + "metadata": {}, "outputs": [], "source": [] } diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py index 3ce0a5f..a14c22e 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py @@ -20,12 +20,17 @@ from datetime import datetime import re import os +import uuid +import time class ClipOutliers(MariaMagic): """ %clipoutliers [columns=col1,col2,...] [method=iqr|zscore] [k=1.5] [z_thresh=3.0] [inplace=True|False] + [mode=preview|apply|rollback] [table=schema.table] [confirm=true|false] + [sample_size=100] [lock_timeout=10] + Clamps (clips) extreme values to computed boundary limits. - method: iqr -> Tukey IQR method using k (default 1.5) @@ -33,11 +38,13 @@ class ClipOutliers(MariaMagic): - columns: comma-separated list of columns to operate on. If omitted, all numeric columns are used. - inplace: if True (default) modifies data["last_select"] in-place. if False stores clipped copy in data["last_select_clipped"]. - Examples: - %clipoutliers -> clip numeric columns using iqr (k=1.5) in-place - %clipoutliers method=zscore z_thresh=2.5 columns=age,salary inplace=False + - mode: + preview -> show what would happen (local + optional DB estimates) + apply -> perform clipping (local or DB) + rollback-> restore DB backup created by apply Additionally, execution metadata is stored into a table `magic_metadata`. """ + def __init__(self, args=""): self.args = args @@ -49,9 +56,10 @@ def name(self): def help(self): return ( - "%clipoutliers [columns=col1,col2,...] 
[method=iqr|zscore] " - "[k=1.5] [z_thresh=3.0] [inplace=True|False]\n" - "Clamps extreme numeric values to computed boundaries (in-place by default)." + "%clipoutliers [columns=col1,col2,...] [method=iqr|zscore] [k=1.5] [z_thresh=3.0] [inplace=True|False]\n" + " [mode=preview|apply|rollback] [table=schema.table] [confirm=true|false]\n" + " [sample_size=100] [lock_timeout=10]\n" + "Clamps extreme numeric values to computed boundaries (in-place by default).\n" "Execution metadata is recorded in table `magic_metadata`." ) @@ -92,7 +100,7 @@ def _send_html(self, kernel, df): {"data": {mime: html}, "metadata": {}}) def _compute_bounds(self, series, method, k=1.5, z_thresh=3.0): - """Compute (lower, upper) clipping bounds.""" + """Compute (lower, upper) clipping bounds for a pandas Series.""" s = series.dropna() if s.empty: return None, None @@ -102,7 +110,7 @@ def _compute_bounds(self, series, method, k=1.5, z_thresh=3.0): iqr = q3 - q1 lower = q1 - k * iqr upper = q3 + k * iqr - return lower, upper + return float(lower), float(upper) elif method == "zscore": mean = s.mean() std = s.std() @@ -110,7 +118,7 @@ def _compute_bounds(self, series, method, k=1.5, z_thresh=3.0): return None, None lower = mean - z_thresh * std upper = mean + z_thresh * std - return lower, upper + return float(lower), float(upper) else: raise ValueError(f"Unknown method {method}") @@ -155,10 +163,8 @@ def _get_db_name(self, kernel): if mariadb_client is None: return "" try: - # mariadb_client.run_statement may return HTML or "Query OK". Use pandas to parse if HTML. 
result = mariadb_client.run_statement("SELECT DATABASE();") if mariadb_client.iserror(): - # can't get db name return "" if not result: return "" @@ -172,7 +178,7 @@ def _get_db_name(self, kernel): return str(val) if val is not None else "" except Exception: # if not parseable by pandas, try regex to extract first cell content - m = re.search(r"(.*?)", result, flags=re.S | re.I) + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) if m: txt = re.sub(r"<.*?>", "", m.group(1)) # strip tags txt = txt.strip() @@ -191,8 +197,7 @@ def _get_db_name(self, kernel): def _ensure_metadata_table(self, kernel, db_name): """ Create magic_metadata table if it doesn't exist. - Columns: id, command_name, arguments, execution_timestamp, - affected_columns, operation_status, message, db_name, user_name + Includes rollback support columns (rollback_token, backup_table, original_table). """ mariadb_client = self._get_mariadb_client(kernel) log = self._get_logger(kernel) @@ -201,7 +206,6 @@ def _ensure_metadata_table(self, kernel, db_name): # nothing to do return - # Use db-qualified name if db_name is present; otherwise create in current schema table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" create_sql = f""" @@ -214,7 +218,10 @@ def _ensure_metadata_table(self, kernel, db_name): operation_status VARCHAR(50), message TEXT, db_name VARCHAR(255), - user_name VARCHAR(255) + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) ); """ try: @@ -225,7 +232,8 @@ def _ensure_metadata_table(self, kernel, db_name): log.error(f"Failed to ensure magic_metadata table: {e}") def _insert_metadata(self, kernel, command_name, arguments, affected_columns, - operation_status, message, db_name, user_name): + operation_status, message, db_name, user_name, + rollback_token=None, backup_table=None, original_table=None): """ Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
""" @@ -243,11 +251,14 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, message_sql = self._sql_escape(message) db_sql = self._sql_escape(db_name) user_sql = self._sql_escape(user_name) + rollback_sql = self._sql_escape(rollback_token) + backup_sql = self._sql_escape(backup_table) + original_sql = self._sql_escape(original_table) insert_sql = f""" INSERT INTO {table_full_name} (command_name, arguments, execution_timestamp, affected_columns, - operation_status, message, db_name, user_name) + operation_status, message, db_name, user_name, rollback_token, backup_table, original_table) VALUES ( {self._sql_escape(command_name)}, {args_sql}, @@ -256,14 +267,17 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, {status_sql}, {message_sql}, {db_sql}, - {user_sql} + {user_sql}, + {rollback_sql}, + {backup_sql}, + {original_sql} ); """ try: mariadb_client.run_statement(insert_sql) # swallow errors but log if mariadb_client.iserror(): - log.error("Error inserting into magic_metadata: %s", insert_sql) + log.error("Error inserting into magic_metadata.") except Exception as e: log.error(f"Exception while inserting metadata: {e}") @@ -274,16 +288,13 @@ def _get_user_name(self, kernel): getattr(kernel, "username", None), getattr(kernel, "user", None), getattr(kernel, "session", None), - # might be kernel.user.identity etc. 
Try simple introspection: ] for cand in candidates: - # cand might be an object; try str if not None if cand is None: continue if isinstance(cand, str) and cand.strip(): return cand try: - # if session-like object with 'user' attribute maybe = getattr(cand, "user", None) if isinstance(maybe, str) and maybe.strip(): return maybe @@ -294,10 +305,133 @@ def _get_user_name(self, kernel): except Exception: return "" + def _acquire_lock(self, mariadb_client, lock_name, timeout=10): + try: + mariadb_client.run_statement(f"SELECT GET_LOCK('{lock_name}', {int(timeout)});") + if mariadb_client.iserror(): + return False + return True + except Exception: + return False + + def _release_lock(self, mariadb_client, lock_name): + try: + mariadb_client.run_statement(f"SELECT RELEASE_LOCK('{lock_name}');") + except Exception: + pass + + def _table_exists(self, mariadb_client, table_full_name): + try: + mariadb_client.run_statement(f"SELECT 1 FROM {table_full_name} LIMIT 1;") + return not mariadb_client.iserror() + except Exception: + return False + + # DB helpers for threshold computation and parsing + def _compute_thresholds_db(self, mariadb_client, table_full, col, method, k=1.5, z_thresh=3.0, sample_size=100): + """ + Sample non-null values from DB and compute thresholds for IQR or zscore. 
+ Returns (ok, {lower:.., upper:..}, message) + """ + try: + out = mariadb_client.run_statement(f"SELECT {col} FROM {table_full} WHERE {col} IS NOT NULL LIMIT {int(sample_size)};") + if mariadb_client.iserror() or not out: + return False, None, "sample query failed" + try: + df_list = pd.read_html(out) + if not df_list or len(df_list) == 0: + return False, None, "no sample rows parsed" + # try numeric conversion + series = pd.to_numeric(df_list[0].iloc[:, 0], errors="coerce").dropna() + if series.empty: + return False, None, "sample contains no numeric values" + if method == "iqr": + q1 = series.quantile(0.25) + q3 = series.quantile(0.75) + iqr = q3 - q1 + lower = q1 - k * iqr + upper = q3 + k * iqr + return True, {"lower": float(lower), "upper": float(upper)}, "iqr via sampling" + elif method == "zscore": + mean = float(series.mean()) + std = float(series.std()) + if std == 0: + return False, None, "std==0 in sample" + lower = mean - float(z_thresh) * std + upper = mean + float(z_thresh) * std + return True, {"lower": float(lower), "upper": float(upper)}, "zscore via sampling" + else: + return False, None, "unknown method" + except Exception: + vals = re.findall(r"(.*?)", str(out), flags=re.S | re.I) + nums = [] + for v in vals: + txt = re.sub(r"<.*?>", "", v).strip() + try: + nums.append(float(txt)) + except Exception: + continue + if not nums: + return False, None, "parsed sample contains no numeric values" + series = pd.Series(nums) + if method == "iqr": + q1 = series.quantile(0.25) + q3 = series.quantile(0.75) + iqr = q3 - q1 + lower = q1 - k * iqr + upper = q3 + k * iqr + return True, {"lower": float(lower), "upper": float(upper)}, "iqr via regex sample" + elif method == "zscore": + mean = float(series.mean()) + std = float(series.std()) + if std == 0: + return False, None, "std==0 in sample" + lower = mean - float(z_thresh) * std + upper = mean + float(z_thresh) * std + return True, {"lower": float(lower), "upper": float(upper)}, "zscore via regex sample" + 
else: + return False, None, "unknown method" + except Exception as e: + return False, None, f"exception computing thresholds: {e}" + + def _parse_count_result(self, res): + """Parse a SELECT COUNT(*) result returned by mariadb_client.run_statement (HTML or text).""" + try: + df_list = pd.read_html(res) + if df_list and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + try: + return int(val) + except Exception: + try: + return int(float(val)) + except Exception: + return None + except Exception: + m = re.search(r"(.*?)", str(res), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + try: + return int(txt) + except Exception: + try: + return int(float(txt)) + except Exception: + return None + # fallback: try to parse raw + try: + txt = str(res).strip() + return int(txt) + except Exception: + try: + return int(float(str(res))) + except Exception: + return None + # ---- End DB helpers ---- def execute(self, kernel, data): - """Execute the %clipoutliers magic with metadata logging.""" + """Execute the %clipoutliers magic with metadata logging and DB support.""" df = data.get("last_select") if df is None: kernel._send_message("stderr", "No last_select found in kernel data.") @@ -310,6 +444,7 @@ def execute(self, kernel, data): except Exception: kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") return + # parse args columns_arg = args.get("columns", None) if isinstance(columns_arg, str): @@ -318,10 +453,12 @@ def execute(self, kernel, data): columns = list(columns_arg) else: columns = None + method = str(args.get("method", "iqr")).lower() if method not in {"iqr", "zscore"}: kernel._send_message("stderr", f"Unknown method '{method}'. 
Allowed: iqr, zscore.") return + try: k = float(args.get("k", 1.5)) except Exception: @@ -331,6 +468,20 @@ def execute(self, kernel, data): except Exception: z_thresh = 3.0 inplace = bool(args.get("inplace", True)) + + # mode and DB args + mode = str(args.get("mode", "preview")).lower() + mode = mode if mode in {"preview", "apply", "rollback"} else "preview" + table_full = args.get("table", None) + confirm = bool(args.get("confirm", False)) + sample_size = int(args.get("sample_size", 100)) + lock_timeout = int(args.get("lock_timeout", 10)) + + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + # Determine numeric columns if columns is None: target_columns = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])] @@ -338,88 +489,450 @@ def execute(self, kernel, data): missing_cols = [c for c in columns if c not in df.columns] if missing_cols: kernel._send_message("stderr", f"Column(s) not found: {', '.join(missing_cols)}") + # log and return + try: + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns) if columns else "", + operation_status="error", + message=f"Column(s) not found: {', '.join(missing_cols)}", + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return target_columns = [c for c in columns if pd.api.types.is_numeric_dtype(df[c])] non_numeric = [c for c in columns if c not in target_columns] if non_numeric: kernel._send_message("stdout", f"Warning: non-numeric columns skipped: {', '.join(non_numeric)}") + if not target_columns: kernel._send_message("stderr", "No numeric target columns found to clip outliers.") + # log and return + try: + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata( + kernel=kernel, + 
command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message="No numeric target columns found to clip outliers.", + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return - # Prepare metadata context - db_name = self._get_db_name(kernel) - user_name = self._get_user_name(kernel) # ensure metadata table exists try: self._ensure_metadata_table(kernel, db_name) except Exception: - # log but continue try: kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") except Exception: pass - target_df = df if inplace else df.copy(deep=True) - messages = [] - total_clipped = 0 - operation_status = "success" - try: - for col in target_columns: - try: - series = target_df[col] - lower, upper = self._compute_bounds(series, method, k=k, z_thresh=z_thresh) + # --- PREVIEW MODE --- + if mode == "preview": + try: + messages = [] + total_would_change = 0 + combined_info = [] + for col in target_columns: + lower, upper = self._compute_bounds(df[col], method, k=k, z_thresh=z_thresh) if lower is None and upper is None: - messages.append(f"Column '{col}': insufficient data to compute bounds; skipped.") + messages.append(f"Column '{col}': insufficient local data to compute bounds; skipped.") + combined_info.append((col, None, None, 0)) continue - # find how many will change - mask = ((series < lower) | (series > upper)) & ~series.isna() + mask = ((df[col] < lower) | (df[col] > upper)) & ~df[col].isna() n_changed = int(mask.sum()) - # clip - target_df[col] = series.clip(lower=lower, upper=upper) - total_clipped += n_changed - messages.append(f"Column '{col}': clipped {n_changed} value(s) (bounds: {lower:.4f}, {upper:.4f}).") + total_would_change += n_changed + messages.append(f"Column '{col}': would clip {n_changed} value(s) locally (bounds: {lower}, {upper}).") + combined_info.append((col, lower, upper, n_changed)) + + n_before = len(df) 
+ n_after = n_before # clipping doesn't remove rows locally + kernel._send_message("stdout", f"PREVIEW (local): would modify {total_would_change} value(s) across {len(target_columns)} column(s).\n" + "\n".join(messages)) + + # sample rows that have any out-of-bounds values + mask_any = pd.Series(False, index=df.index) + for col, lower, upper, _ in combined_info: + if lower is None and upper is None: + continue + mask_any = mask_any | (((df[col] < lower) | (df[col] > upper)) & ~df[col].isna()) + sample_rows = df[mask_any].head(sample_size).copy() + if not sample_rows.empty: + # annotate which columns are OOB for each row + def oob_cols(r): + cols = [c for c, lower, upper, _ in combined_info if lower is not None and upper is not None and (pd.notna(r.get(c)) and (r.get(c) < lower or r.get(c) > upper))] + return ",".join(cols) + sample_rows["_oob_columns"] = sample_rows.apply(oob_cols, axis=1) + try: + self._send_html(kernel, sample_rows) + except Exception: + kernel._send_message("stdout", str(sample_rows.head())) + else: + kernel._send_message("stdout", "PREVIEW (local): no sample rows flagged as out-of-bounds.") + + # DB estimates if requested + if table_full and mariadb_client is not None: + db_messages = [] + predicates = [] + for col in target_columns: + ok, thresholds, msg = self._compute_thresholds_db(mariadb_client, table_full, col, method, k=k, z_thresh=z_thresh, sample_size=sample_size) + if ok and thresholds: + lower = thresholds["lower"] + upper = thresholds["upper"] + predicates.append(f"({col} < {repr(lower)} OR {col} > {repr(upper)})") + db_messages.append(f"{col}: thresholds approx [{lower}, {upper}] ({msg})") + else: + db_messages.append(f"{col}: could not compute thresholds ({msg}) - skipped") + + if predicates: + db_pred = " OR ".join(predicates) + try: + out = mariadb_client.run_statement(f"SELECT COUNT(*) FROM {table_full} WHERE {db_pred};") + cnt = self._parse_count_result(out) + if cnt is None: + kernel._send_message("stdout", "PREVIEW (db): 
could not parse count result (check permissions).") + else: + kernel._send_message("stdout", f"PREVIEW (db): estimated rows with OOB values: {cnt}.") + except Exception: + kernel._send_message("stdout", "PREVIEW (db): failed to run count query (continuing).") + kernel._send_message("stdout", "PREVIEW (db) thresholds:\n" + "\n".join(db_messages)) + else: + kernel._send_message("stdout", "PREVIEW (db): no DB predicates could be computed (insufficient sample/values).") + + # log preview metadata + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='preview', + message='preview_completed', + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Error during preview: {e}") + return + + # --- ROLLBACK MODE --- + if mode == "rollback": + if mariadb_client is None: + kernel._send_message("stderr", "Rollback requested but no mariadb_client available.") + return + token = args.get("rollback_token", None) + try: + if not token: + mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Could not find metadata for rollback (check permissions).") + return + out = mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + m = re.search(r"(.*?)", str(out), flags=re.S | re.I) + if m: + token = re.sub(r"<.*?>", "", m.group(1)).strip() + if not token: + kernel._send_message("stderr", "No rollback_token found; cannot rollback safely.") + return + + # fetch 
backup_table and original_table + out = mariadb_client.run_statement(f"SELECT backup_table, original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + m = re.search(r"(.*?).*?(.*?)", str(out), flags=re.S | re.I) + backup_table = None + original_table = None + if m: + backup_table = re.sub(r"<.*?>", "", m.group(1)).strip() + original_table = re.sub(r"<.*?>", "", m.group(2)).strip() + else: + try: + out_b = mariadb_client.run_statement(f"SELECT backup_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mb = re.search(r"(.*?)", str(out_b), flags=re.S | re.I) + if mb: + backup_table = re.sub(r"<.*?>", "", mb.group(1)).strip() + except Exception: + pass + try: + out_o = mariadb_client.run_statement(f"SELECT original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mo = re.search(r"(.*?)", str(out_o), flags=re.S | re.I) + if mo: + original_table = re.sub(r"<.*?>", "", mo.group(1)).strip() + except Exception: + pass + + if not backup_table: + kernel._send_message("stderr", "No backup table found in metadata for rollback token.") + return + + # perform atomic restore: backup_table -> original_table + lock_name = f"clipoutliers_rb_{token}" + self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + try: + if original_table: + if self._table_exists(mariadb_client, original_table): + original_old = f"{original_table}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {original_table} TO {original_old}, {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename tables during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {original_table}; previous {original_table} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if 
isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={original_table};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {original_table}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={original_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + # try to infer original table name from arguments + out_args = mariadb_client.run_statement(f"SELECT arguments FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + margs = re.search(r"(.*?)", str(out_args), flags=re.S | re.I) + inferred_original = None + if margs: + args_txt = re.sub(r"<.*?>", "", margs.group(1)).strip() + mm = re.search(r"table\s*=\s*([^\s,]+)", args_txt) + if mm: + inferred_original = mm.group(1).strip() + if inferred_original: + if self._table_exists(mariadb_client, inferred_original): + original_old = f"{inferred_original}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {inferred_original} TO {original_old}, {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored 
{backup_table} -> {inferred_original}; previous {inferred_original} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={inferred_original};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to inferred original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {inferred_original}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={inferred_original}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + kernel._send_message("stderr", "Could not determine original table name for rollback. Manual restoration required.") + return + finally: + self._release_lock(mariadb_client, lock_name) + except Exception as e: + kernel._send_message("stderr", f"Rollback error: {e}") + return + + # --- APPLY MODE --- + if mode == "apply": + # DB-target apply if table provided and mariadb_client present + if table_full and mariadb_client is not None: + if not confirm: + kernel._send_message("stderr", "DB apply requires confirm=true to proceed. 
Preview first, then re-run with confirm=true.") + return + + token = str(uuid.uuid4()).replace('-', '')[:16] + backup_table = f"{table_full}_backup_{token}" + new_table = f"{table_full}_vnew_{token}" + clip_map = {} + messages = [] + + # compute thresholds for each column using DB sampling + for col in target_columns: + ok, thresholds, msg = self._compute_thresholds_db(mariadb_client, table_full, col, method, k=k, z_thresh=z_thresh, sample_size=sample_size) + if ok and thresholds: + clip_map[col] = (thresholds["lower"], thresholds["upper"]) + messages.append(f"{col}: thresholds [{thresholds['lower']}, {thresholds['upper']}] ({msg})") + else: + messages.append(f"{col}: could not compute thresholds ({msg}); will leave column unchanged in DB apply") + + # Build SELECT exprs: for clipped cols use LEAST(GREATEST(col, lower), upper) AS col, else `col` + select_exprs = [] + for c in df.columns: + if c in clip_map: + lower, upper = clip_map[c] + # use repr to preserve numeric literal format + select_exprs.append(f"LEAST(GREATEST({c}, {repr(lower)}), {repr(upper)}) AS {c}") + else: + select_exprs.append(c) + select_sql = ", ".join(select_exprs) + + try: + lock_name = f"clipoutliers_apply_{token}" + got_lock = self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + if not got_lock: + kernel._send_message("stderr", "Could not acquire advisory lock; aborting apply.") + return + + # create new table with clipped values + mariadb_client.run_statement(f"CREATE TABLE {new_table} AS SELECT {select_sql} FROM {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to create new table for apply (CTAS failed).") + return + + # atomic rename original -> backup, new -> original + mariadb_client.run_statement(f"RENAME TABLE {table_full} TO {backup_table}, {new_table} TO {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "RENAME TABLE failed (apply may be inconsistent).") + return + + kernel._send_message("stdout", 
f"Apply completed: original preserved as {backup_table}.") + # log metadata (include token so user can rollback) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='applied', + message=f'applied_backup={backup_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=table_full + ) + + # attempt to refresh last_select with a sample + try: + mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + fresh = mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + try: + df_list = pd.read_html(fresh) + if df_list and len(df_list) > 0: + data["last_select"] = df_list[0] + try: + self._send_html(kernel, data["last_select"]) + except Exception: + pass + except Exception: + kernel._send_message("stdout", "Applied to DB; could not refresh last_select from DB.") + except Exception: + pass + except Exception as e: - messages.append(f"Column '{col}': error while clipping: {e}") - # finish up - if inplace: - data["last_select"] = target_df - location_msg = "Modified in-place: data['last_select'] updated." + kernel._send_message("stderr", f"Apply (DB versioned) failed: {e}") + log.exception(e) + finally: + self._release_lock(mariadb_client, lock_name) + return + else: - data["last_select_clipped"] = target_df - location_msg = "Result stored in data['last_select_clipped'] (original unchanged)." - kernel._send_message("stdout", f"Clip outliers completed using {method}.\n" - + "\n".join(messages) - + f"\nTotal values clipped: {total_clipped}. 
{location_msg}") - except Exception as e: - operation_status = "error" - messages.append(f"Fatal error during clipping: {e}") - kernel._send_message("stderr", f"Fatal error during clipping: {e}") + # Local in-place apply on data['last_select'] (existing behavior) + target_df = df if inplace else df.copy(deep=True) + messages = [] + total_clipped = 0 + operation_status = "success" + try: + for col in target_columns: + series = target_df[col] + lower, upper = self._compute_bounds(series, method, k=k, z_thresh=z_thresh) + if lower is None and upper is None: + messages.append(f"Column '{col}': insufficient data to compute bounds; skipped.") + continue + mask = ((series < lower) | (series > upper)) & ~series.isna() + n_changed = int(mask.sum()) + target_df[col] = series.clip(lower=lower, upper=upper) + total_clipped += n_changed + messages.append(f"Column '{col}': clipped {n_changed} value(s) (bounds: {lower:.4f}, {upper:.4f}).") + if inplace: + data["last_select"] = target_df + location_msg = "Modified in-place: data['last_select'] updated." + else: + data["last_select_clipped"] = target_df + location_msg = "Result stored in data['last_select_clipped'] (original unchanged)." + kernel._send_message("stdout", f"Clip outliers completed using {method}.\n" + + "\n".join(messages) + + f"\nTotal values clipped: {total_clipped}. 
{location_msg}") + except Exception as e: + operation_status = "error" + messages.append(f"Fatal error during clipping: {e}") + kernel._send_message("stderr", f"Fatal error during clipping: {e}") - # Attempt to insert metadata (best-effort) - try: - args_for_db = self.args if isinstance(self.args, str) else str(self.args) - affected_columns_str = "\n".join(target_columns) - message_str = "\n".join(messages) - self._insert_metadata( - kernel=kernel, - command_name=self.name(), - arguments=args_for_db, - affected_columns=affected_columns_str, - operation_status=operation_status, - message=message_str, - db_name=db_name, - user_name=user_name - ) - except Exception as e: - # metadata failure shouldn't interrupt user, but warn - try: - kernel._send_message("stdout", f"Warning: failed to write metadata: {e}") - except Exception: - pass + # Insert metadata + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "\n".join(target_columns) + message_str = "\n".join(messages) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status=operation_status, + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception as e: + try: + kernel._send_message("stdout", f"Warning: failed to write metadata: {e}") + except Exception: + pass - # Show output (DataFrame) - try: - self._send_html(kernel, target_df) - except Exception: - pass + # Show output (DataFrame) + try: + self._send_html(kernel, target_df) + except Exception: + pass + + return + + # fallback + kernel._send_message("stderr", "Unknown execution path reached.") + return diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py index c024cbe..8f50a62 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py +++ 
b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py @@ -10,6 +10,10 @@ import math import os import re +import time +import uuid +import json +import html # Attempt to import SqlFetch if available (helps to determine current DB reliably) try: @@ -20,15 +24,18 @@ class DropMissing(MariaMagic): """ - %dropmissing [columns=col1,col2,...] - - Always performs the operation IN-PLACE on data["last_select"]: - - If columns are provided, drop rows where any of those columns is missing. - - If no columns provided, drop rows that have any missing value (any column). - - This magic also logs execution metadata into a table `magic_metadata` with fields: - id, command_name, arguments, execution_timestamp, affected_columns, - operation_status, message, db_name, user_name + %dropmissing [columns=col1,col2,...] [mode=preview|apply|rollback] + [table=schema.table] + [sample_size=100] [confirm=true|false] + [rollback_token=] [lock_timeout=10] + + Notes: + - The "analyze" mode has been removed. + - There is no strategy argument: DB applies always use the safe "versioned" + approach (CTAS + atomic RENAME). This provides a straightforward + rollback path via a backup table and rollback_token. + - When an apply is performed the generated rollback_token is printed to + stdout so users can copy it for a later rollback. """ def __init__(self, args=""): @@ -42,13 +49,13 @@ def name(self): def help(self): return ( - "%dropmissing [columns=col1,col2,...]\n" - "Drops rows with missing values from data['last_select'] (always IN-PLACE).\n" - "Execution metadata is recorded in table `magic_metadata`." + "%dropmissing [columns=col1,col2,...] [mode=preview|apply|rollback] [table=schema.table]\n" + "Preview operates on data['last_select']. Apply will always use a versioned CTAS+RENAME strategy (requires confirm=true when targeting DB).\n" + "Execution metadata recorded in table `magic_metadata`." 
) + # -------------------- Basic helpers --------------------------------- def _str_to_obj(self, s): - """Cast simple strings to Python objects where sensible.""" try: return int(s) except ValueError: @@ -62,7 +69,6 @@ def _str_to_obj(self, s): return s def parse_args(self, input_str): - """Parse key=value arguments (keeps behavior consistent with other magics).""" if not input_str or input_str.strip() == "": return {} pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) @@ -71,27 +77,23 @@ def parse_args(self, input_str): return pairs def _send_html(self, kernel, df): - """Display DataFrame as HTML (fallback to text if needed).""" try: - html = df.to_html(index=False) + html_repr = df.to_html(index=False) mime = "text/html" except Exception: - html = str(df) - mime = "text/plain" - display_content = {"data": {mime: html}, "metadata": {}} + html_repr = "
" + html.escape(str(df)) + "
" + mime = "text/html" + display_content = {"data": {mime: html_repr}, "metadata": {}} kernel.send_response(kernel.iopub_socket, "display_data", display_content) - # ---------- DB / metadata helpers (best-effort) ---------- + # -------------------- DB / metadata helpers --------------------------- def _get_mariadb_client(self, kernel): - """Return mariadb_client if present on kernel, else None""" return getattr(kernel, "mariadb_client", None) def _get_logger(self, kernel): - """Return a logger on kernel if present, else create a temporary logger""" return getattr(kernel, "log", logging.getLogger(__name__)) def _sql_escape(self, val): - """Escape a value for SQL single-quoted literal insert. None -> NULL""" if val is None: return "NULL" if not isinstance(val, str): @@ -99,15 +101,8 @@ def _sql_escape(self, val): return "'" + val.replace("'", "''") + "'" def _get_db_name(self, kernel): - """ - Attempt to determine the currently used DB. - Prefer SqlFetch if available; otherwise run SELECT DATABASE(); and try to parse. - Returns empty string if none found. 
- """ mariadb_client = self._get_mariadb_client(kernel) log = self._get_logger(kernel) - - # Try SqlFetch if available if SqlFetch is not None and mariadb_client is not None: try: sf = SqlFetch(mariadb_client, log) @@ -115,18 +110,13 @@ def _get_db_name(self, kernel): if isinstance(dbname, str): return dbname except Exception: - log.debug("SqlFetch available but get_db_name() failed; falling back to manual query.") - - # Fallback: run SELECT DATABASE(); + log.debug("SqlFetch.get_db_name failed; falling back to SELECT DATABASE()") if mariadb_client is None: return "" try: result = mariadb_client.run_statement("SELECT DATABASE();") - if mariadb_client.iserror(): - return "" - if not result: + if mariadb_client.iserror() or not result: return "" - # Try parsing HTML table via pandas try: df_list = pd.read_html(result) if df_list and isinstance(df_list, list) and len(df_list) > 0: @@ -135,7 +125,6 @@ def _get_db_name(self, kernel): return "" return str(val) if val is not None else "" except Exception: - # regex to extract first content m = re.search(r"(.*?)", str(result), flags=re.S | re.I) if m: txt = re.sub(r"<.*?>", "", m.group(1)).strip() @@ -151,7 +140,6 @@ def _get_db_name(self, kernel): return "" def _get_user_name(self, kernel): - """Try several places to find the current user name; fallback to OS login or empty string.""" candidates = [ getattr(kernel, "user_name", None), getattr(kernel, "username", None), @@ -175,19 +163,11 @@ def _get_user_name(self, kernel): return "" def _ensure_metadata_table(self, kernel, db_name): - """ - Create magic_metadata table if it doesn't exist. 
- Columns: id, command_name, arguments, execution_timestamp, - affected_columns, operation_status, message, db_name, user_name - """ mariadb_client = self._get_mariadb_client(kernel) log = self._get_logger(kernel) - if mariadb_client is None: return - table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" - create_sql = f""" CREATE TABLE IF NOT EXISTS {table_full_name} ( id INT AUTO_INCREMENT PRIMARY KEY, @@ -198,40 +178,40 @@ def _ensure_metadata_table(self, kernel, db_name): operation_status VARCHAR(50), message TEXT, db_name VARCHAR(255), - user_name VARCHAR(255) + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) ); """ try: mariadb_client.run_statement(create_sql) - # If run_statement sets error flag, log it if mariadb_client.iserror(): log.error("Error creating magic_metadata table.") except Exception as e: log.error(f"Failed to ensure magic_metadata table: {e}") def _insert_metadata(self, kernel, command_name, arguments, affected_columns, - operation_status, message, db_name, user_name): - """ - Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
- """ + operation_status, message, db_name, user_name, + rollback_token=None, backup_table=None, original_table=None): mariadb_client = self._get_mariadb_client(kernel) log = self._get_logger(kernel) if mariadb_client is None: return - table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" - args_sql = self._sql_escape(arguments) affected_sql = self._sql_escape(affected_columns) status_sql = self._sql_escape(operation_status) message_sql = self._sql_escape(message) db_sql = self._sql_escape(db_name) user_sql = self._sql_escape(user_name) - + rollback_sql = self._sql_escape(rollback_token) + backup_sql = self._sql_escape(backup_table) + original_sql = self._sql_escape(original_table) insert_sql = f""" INSERT INTO {table_full_name} (command_name, arguments, execution_timestamp, affected_columns, - operation_status, message, db_name, user_name) + operation_status, message, db_name, user_name, rollback_token, backup_table, original_table) VALUES ( {self._sql_escape(command_name)}, {args_sql}, @@ -240,7 +220,10 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, {status_sql}, {message_sql}, {db_sql}, - {user_sql} + {user_sql}, + {rollback_sql}, + {backup_sql}, + {original_sql} ); """ try: @@ -250,10 +233,42 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, except Exception as e: log.error(f"Exception while inserting metadata: {e}") + def _acquire_lock(self, mariadb_client, lock_name, timeout=10): + try: + mariadb_client.run_statement(f"SELECT GET_LOCK('{lock_name}', {int(timeout)});") + if mariadb_client.iserror(): + return False + return True + except Exception: + return False + + def _release_lock(self, mariadb_client, lock_name): + try: + mariadb_client.run_statement(f"SELECT RELEASE_LOCK('{lock_name}');") + except Exception: + pass + # ---------- End DB helpers ---------- + def _build_delete_predicate(self, columns): + """Return SQL predicate that matches rows with missing values in 
given columns. + columns==None means any column is NULL => predicate for any column null can't be generated without schema + so we return None in that case (caller should handle). + """ + if not columns: + return None + clauses = [f"{col} IS NULL" for col in columns] + return " OR ".join(clauses) + + def _table_exists(self, mariadb_client, table_full_name): + try: + mariadb_client.run_statement(f"SELECT 1 FROM {table_full_name} LIMIT 1;") + return not mariadb_client.iserror() + except Exception: + return False + def execute(self, kernel, data): - """Execute the dropmissing magic (always modifies data['last_select']) and log metadata.""" + """Execute the dropmissing magic (supports preview/analyze/apply/rollback and logs metadata).""" df = data.get("last_select") if df is None: kernel._send_message("stderr", "No last_select found in kernel data.") @@ -269,6 +284,7 @@ def execute(self, kernel, data): kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") return + # parse columns columns_arg = args.get("columns", None) if isinstance(columns_arg, str): columns = [c.strip() for c in columns_arg.split(",") if c.strip()] @@ -277,34 +293,49 @@ def execute(self, kernel, data): else: columns = None + # operational args + mode = str(args.get("mode", "preview")).lower() + mode = mode if mode in {"preview", "analyze", "apply", "rollback"} else "preview" + table_full = args.get("table", None) # expected 'schema.table' or 'table' + strategy = str(args.get("strategy", "versioned")).lower() + sample_size = int(args.get("sample_size", 100)) + confirm = bool(args.get("confirm", False)) + pk_col = args.get("pk", None) + rollback_token = args.get("rollback_token", None) + lock_timeout = int(args.get("lock_timeout", 10)) + analyze_real = bool(args.get("analyze_real", False)) + + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + + # validate 
requested columns exist in df if columns is not None: missing_cols = [c for c in columns if c not in df.columns] if missing_cols: - kernel._send_message("stderr", f"Column(s) not found: {', '.join(missing_cols)}") + kernel._send_message("stderr", f"Column(s) not found in last_select: {', '.join(missing_cols)}") # Log metadata for failure try: - db_name = self._get_db_name(kernel) - user_name = self._get_user_name(kernel) self._ensure_metadata_table(kernel, db_name) self._insert_metadata( kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(columns) if columns else "", + affected_columns=','.join(columns) if columns else "", operation_status="error", message=f"Column(s) not found: {', '.join(missing_cols)}", db_name=db_name, user_name=user_name ) except Exception: - # swallow pass return - # metadata context - db_name = self._get_db_name(kernel) - user_name = self._get_user_name(kernel) - # ensure metadata table exists (best-effort) + # prepare predicate + sql_predicate = self._build_delete_predicate(columns) + + # metadata table ensure (best-effort) try: self._ensure_metadata_table(kernel, db_name) except Exception: @@ -313,47 +344,495 @@ def execute(self, kernel, data): except Exception: pass - # perform drop operation - operation_status = "success" - messages = [] - try: - before_count = len(df) - df.dropna(axis=0, subset=columns, inplace=True) - after_count = len(df) - dropped = before_count - after_count - data["last_select"] = df - msg = f"Dropped {dropped} row(s) with missing values (in-place). Updated last_select." 
- kernel._send_message("stdout", msg) - messages.append(msg) - # show resulting dataframe + # --- PREVIEW MODE ------------------------------------------------- + if mode == "preview": try: - self._send_html(kernel, df) - except Exception: - pass - except Exception as e: - operation_status = "error" - err_msg = f"Error while dropping missing values: {e}" - kernel._send_message("stderr", err_msg) - messages.append(err_msg) + before_count = len(df) + if columns is None: + after_df = df.dropna() + else: + after_df = df.dropna(axis=0, subset=columns) + after_count = len(after_df) + dropped = before_count - after_count + + kernel._send_message("stdout", f"PREVIEW: would drop {dropped} row(s) (from {before_count} to {after_count}).") + + # show small sample with before/after preview for rows that would be dropped + if columns is None: + predicate_mask = df.isnull().any(axis=1) + else: + predicate_mask = df[columns].isnull().any(axis=1) + + sample_rows = df[predicate_mask].head(sample_size) + # show 'after' preview as dropped rows (so after preview is empty for those rows) + sample_preview = sample_rows.copy() + sample_preview["_would_be_dropped"] = True + + if not sample_preview.empty: + try: + self._send_html(kernel, sample_preview) + except Exception: + pass + + # If table specified, show EXPLAIN for corresponding DELETE + if table_full and mariadb_client is not None: + if sql_predicate is None: + kernel._send_message("stdout", "Preview: cannot generate DB predicate for 'any column null' without explicit columns.") + else: + delete_sql = f"DELETE FROM {table_full} WHERE {sql_predicate};" + try: + # EXPLAIN (no execute) + mariadb_client.run_statement("EXPLAIN FORMAT=JSON " + delete_sql) + if mariadb_client.iserror(): + kernel._send_message("stdout", "Could not run EXPLAIN on DB — check permissions or SQL syntax.") + else: + kernel._send_message("stdout", "EXPLAIN (estimate) for corresponding DELETE (JSON):") + kernel._send_message("stdout", 
mariadb_client.run_statement("EXPLAIN FORMAT=JSON " + delete_sql)) + except Exception: + kernel._send_message("stdout", "Failed to run EXPLAIN on DB (continuing).") + + # log preview metadata + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns) if columns else 'ALL_COLUMNS', + operation_status='preview', + message=f'preview_dropped={dropped}', + db_name=db_name, + user_name=user_name + ) + except Exception: + pass - # Insert metadata (best-effort) - try: - args_for_db = self.args if isinstance(self.args, str) else str(self.args) - affected_columns_str = "\n".join(columns) if columns else "ALL_COLUMNS" - message_str = "\n".join(messages) - self._insert_metadata( - kernel=kernel, - command_name=self.name(), - arguments=args_for_db, - affected_columns=affected_columns_str, - operation_status=operation_status, - message=message_str, - db_name=db_name, - user_name=user_name - ) - except Exception: - # swallow metadata insertion errors but do not interrupt user flow + except Exception as e: + kernel._send_message("stderr", f"Error during preview: {e}") + return + + # --- ANALYZE MODE ----------------------------------------------- + if mode == "analyze": + if mariadb_client is None or not table_full: + kernel._send_message("stderr", "ANALYZE requires a connected mariadb_client and table= argument.") + return + if sql_predicate is None: + kernel._send_message("stderr", "ANALYZE requires explicit columns= to generate delete predicate.") + return + delete_sql = f"DELETE FROM {table_full} WHERE {sql_predicate};" try: - kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") - except Exception: - pass + # Run EXPLAIN (estimate) + explain_out = mariadb_client.run_statement("EXPLAIN FORMAT=JSON " + delete_sql) + kernel._send_message("stdout", "EXPLAIN (estimate):") + kernel._send_message("stdout", explain_out) + # 
Optionally run EXPLAIN ANALYZE if requested + if analyze_real: + try: + analyze_out = mariadb_client.run_statement("EXPLAIN ANALYZE " + delete_sql) + kernel._send_message("stdout", "EXPLAIN ANALYZE (actual run):") + kernel._send_message("stdout", analyze_out) + except Exception: + kernel._send_message("stdout", "EXPLAIN ANALYZE failed or is not supported on this server.") + # log analyze metadata + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns) if columns else 'ALL_COLUMNS', + operation_status='analyze', + message='analyze_completed', + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + except Exception as e: + kernel._send_message("stderr", f"Error during analyze: {e}") + return + + # --- ROLLBACK MODE --------------------------------------------- + if mode == "rollback": + if mariadb_client is None: + kernel._send_message("stderr", "Rollback requested but no mariadb_client available.") + return + # If rollback_token provided, try to find matching metadata entry + token = rollback_token + try: + if not token: + # try to read latest magic_metadata entry for this command and user + mariadb_client.run_statement(f"SELECT id, rollback_token, backup_table, original_table, arguments, execution_timestamp FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Could not find metadata for rollback (check permissions).") + return + out = mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + m = re.search(r"(.*?)", str(out), flags=re.S | re.I) + if m: + token = 
re.sub(r"<.*?>", "", m.group(1)).strip() + if not token: + kernel._send_message("stderr", "No rollback_token found; cannot rollback safely.") + return + # now find backup_table and original_table associated with token + mariadb_client.run_statement(f"SELECT backup_table, original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + backup_out = mariadb_client.run_statement(f"SELECT backup_table, original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + m = re.search(r"(.*?).*?(.*?)", str(backup_out), flags=re.S | re.I) + backup_table = None + original_table = None + if m: + # depending on HTML ordering we try to extract both; fallback below parses individually + backup_table = re.sub(r"<.*?>", "", m.group(1)).strip() + original_table = re.sub(r"<.*?>", "", m.group(2)).strip() + else: + # fallback: fetch backup_table + try: + out_b = mariadb_client.run_statement(f"SELECT backup_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mb = re.search(r"(.*?)", str(out_b), flags=re.S | re.I) + if mb: + backup_table = re.sub(r"<.*?>", "", mb.group(1)).strip() + except Exception: + pass + try: + out_o = mariadb_client.run_statement(f"SELECT original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mo = re.search(r"(.*?)", str(out_o), flags=re.S | re.I) + if mo: + original_table = re.sub(r"<.*?>", "", mo.group(1)).strip() + except Exception: + pass + + if not backup_table: + kernel._send_message("stderr", "No backup table found in metadata for rollback token.") + return + + # perform atomic swap to restore backup -> original + lock_name = f"dropmissing_rb_{token}" + self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + try: + # If original_table was recorded during apply, prefer to use it + if original_table: + # if original exists, rename original -> 
original_backup_before_rb_{token}, then rename backup -> original + if self._table_exists(mariadb_client, original_table): + # create a unique temp name for the old original + original_old = f"{original_table}_prerollback_{token}" + # atomic multi-rename: rename original -> original_old, backup -> original + mariadb_client.run_statement(f"RENAME TABLE {original_table} TO {original_old}, {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename tables during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {original_table}; previous {original_table} renamed to {original_old}.") + # record rollback metadata + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns) if columns else 'ALL_COLUMNS', + operation_status='rollback', + message=f'restored_to={original_table};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + # original does not exist currently; rename backup -> original directly + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {original_table}.") + # record rollback metadata + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns) if columns else 'ALL_COLUMNS', + operation_status='rollback', + message=f'restored_to={original_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + 
backup_table=backup_table, + original_table=original_table + ) + else: + # No original_table recorded — best-effort: attempt to infer original name from arguments + # try to fetch arguments column + try: + out_args = mariadb_client.run_statement(f"SELECT arguments FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + margs = re.search(r"(.*?)", str(out_args), flags=re.S | re.I) + inferred_original = None + if margs: + args_txt = re.sub(r"<.*?>", "", margs.group(1)).strip() + # try to find table=... inside arguments + mm = re.search(r"table\s*=\s*([^\s,]+)", args_txt) + if mm: + inferred_original = mm.group(1).strip() + if inferred_original: + # same logic as above using inferred_original + if self._table_exists(mariadb_client, inferred_original): + original_old = f"{inferred_original}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {inferred_original} TO {original_old}, {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {inferred_original}; previous {inferred_original} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns) if columns else 'ALL_COLUMNS', + operation_status='rollback', + message=f'restored_to={inferred_original};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to inferred original during rollback (check permissions).") + return + 
kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {inferred_original}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns) if columns else 'ALL_COLUMNS', + operation_status='rollback', + message=f'restored_to={inferred_original}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + kernel._send_message("stderr", "Could not determine original table name for rollback. Manual restoration required.") + return + except Exception as e: + kernel._send_message("stderr", f"Rollback error while inferring original table: {e}") + return + finally: + self._release_lock(mariadb_client, lock_name) + except Exception as e: + kernel._send_message("stderr", f"Rollback error: {e}") + return + + # --- APPLY MODE ----------------------------------------------- + if mode == "apply": + # two main apply targets: DB (table_full provided and mariadb_client present) or local DataFrame + if table_full and mariadb_client is not None: + # safety: require explicit confirmation to run DB changes + if not confirm: + kernel._send_message("stderr", "DB apply requires confirm=true to proceed. 
Preview first, then re-run with confirm=true.") + return + + if sql_predicate is None: + kernel._send_message("stderr", "Apply to DB requires explicit columns= to build a safe predicate (avoid accidental full-table deletes).") + return + + # strategy selection + if strategy == "versioned": + # create a new table (CTAS) containing rows we want to keep (i.e., NOT predicate) + # generate unique backup name + token = str(uuid.uuid4()).replace('-', '')[:16] + backup_table = f"{table_full}_backup_{token}" + new_table = f"{table_full}_vnew_{token}" + delete_pred = sql_predicate + try: + # acquire lock + lock_name = f"dropmissing_apply_{token}" + got_lock = self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + if not got_lock: + kernel._send_message("stderr", "Could not acquire advisory lock; aborting apply.") + return + + # create new table with rows to keep + mariadb_client.run_statement(f"CREATE TABLE {new_table} AS SELECT * FROM {table_full} WHERE NOT ({delete_pred});") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to create new table for apply (CTAS failed).") + return + # basic validation: counts (best-effort) + mariadb_client.run_statement(f"SELECT COUNT(*) FROM {table_full};") + total_old = mariadb_client.run_statement(f"SELECT COUNT(*) FROM {table_full};") + mariadb_client.run_statement(f"SELECT COUNT(*) FROM {new_table};") + + # atomic rename: original -> backup, new -> original + mariadb_client.run_statement(f"RENAME TABLE {table_full} TO {backup_table}, {new_table} TO {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "RENAME TABLE failed (apply may be inconsistent).") + # attempt cleanup + return + + kernel._send_message("stdout", f"Apply completed: original preserved as {backup_table}.") + # log metadata (include token so user can rollback) and record original_table + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) 
else str(self.args), + affected_columns='\n'.join(columns) if columns else 'ALL_COLUMNS', + operation_status='applied', + message=f'applied_backup={backup_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=table_full + ) + # update in-memory last_select to reflect applied state (fetch fresh) + try: + mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + fresh = mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + # try to parse HTML into DataFrame + try: + df_list = pd.read_html(fresh) + if df_list and len(df_list) > 0: + data["last_select"] = df_list[0] + try: + self._send_html(kernel, data["last_select"]) + except Exception: + pass + except Exception: + # cannot parse, just notify + kernel._send_message("stdout", "Applied to DB; could not refresh last_select from DB.") + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Apply (versioned) failed: {e}") + log.exception(e) + finally: + self._release_lock(mariadb_client, lock_name) + return + + elif strategy == "transactional": + # transactional apply: capture changed rows in an audit table, then delete + if not pk_col: + kernel._send_message("stderr", "Transactional strategy requires pk= to capture changed rows for rollback. 
Falling back to versioned strategy.") + # fall back to versioned + args["strategy"] = "versioned" + self.args = "".join([f"{k}={v} " for k, v in args.items()]) + return self.execute(kernel, data) + + token = str(uuid.uuid4()).replace('-', '')[:16] + audit_table = f"{db_name}.magic_audit_{token}" + delete_pred = sql_predicate + try: + lock_name = f"dropmissing_tx_{token}" + got_lock = self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + if not got_lock: + kernel._send_message("stderr", "Could not acquire advisory lock; aborting apply.") + return + + # create audit table + mariadb_client.run_statement(f"CREATE TABLE IF NOT EXISTS {audit_table} (tx_id VARCHAR(64), pk_val TEXT, old_row LONGTEXT, created_at DATETIME);") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to create audit table.") + return + + # insert affected rows into audit table + mariadb_client.run_statement(f"INSERT INTO {audit_table} (tx_id, pk_val, old_row, created_at) SELECT '{token}', CAST({pk_col} AS CHAR), TO_BASE64(ROW_TO_JSON(t)), NOW() FROM {table_full} t WHERE {delete_pred};") + # Note: ROW_TO_JSON and TO_BASE64 may not be available depending on server; this is best-effort + + # delete rows + mariadb_client.run_statement(f"DELETE FROM {table_full} WHERE {delete_pred};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "DELETE failed during transactional apply (check SQL and permissions).") + return + + kernel._send_message("stdout", f"Transactional apply completed; audit table {audit_table} contains old rows for rollback with token {token}.") + # metadata + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns) if columns else 'ALL_COLUMNS', + operation_status='applied', + message=f'audit_table={audit_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=audit_table, + 
original_table=table_full + ) + # refresh in-memory last_select little + try: + mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + fresh = mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + try: + df_list = pd.read_html(fresh) + if df_list and len(df_list) > 0: + data["last_select"] = df_list[0] + try: + self._send_html(kernel, data["last_select"]) + except Exception: + pass + except Exception: + kernel._send_message("stdout", "Applied to DB; could not refresh last_select from DB.") + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Apply (transactional) failed: {e}") + log.exception(e) + finally: + self._release_lock(mariadb_client, lock_name) + return + + else: + kernel._send_message("stderr", f"Unknown strategy: {strategy}") + return + + else: + # operate locally on data['last_select'] (in-place) + operation_status = "success" + messages = [] + try: + before_count = len(df) + if columns is None: + df.dropna(axis=0, inplace=True) + else: + df.dropna(axis=0, subset=columns, inplace=True) + after_count = len(df) + dropped = before_count - after_count + data["last_select"] = df + msg = f"Dropped {dropped} row(s) with missing values (in-place local)." 
+ kernel._send_message("stdout", msg) + messages.append(msg) + try: + self._send_html(kernel, df) + except Exception: + pass + except Exception as e: + operation_status = "error" + err_msg = f"Error while dropping missing values locally: {e}" + kernel._send_message("stderr", err_msg) + messages.append(err_msg) + + # Insert metadata + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "\n".join(columns) if columns else "ALL_COLUMNS" + message_str = "\n".join(messages) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status=operation_status, + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + + return + + # fallback + kernel._send_message("stderr", "Unknown execution path reached.") + return diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py index 1d6bcab..b45af51 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py @@ -10,6 +10,8 @@ import logging import os import re +import uuid +import time # Optional helper to reliably get current DB name (if available in environment) try: @@ -21,13 +23,17 @@ class DropOutliers(MariaMagic): """ %dropoutliers [columns=col1,col2,...] [method=iqr|zscore] [k=1.5] [z_thresh=3.0] + [mode=preview|apply|rollback] [table=schema.table] [confirm=true|false] + [sample_size=100] [lock_timeout=10] - Removes rows (IN-PLACE) from data['last_select'] where any selected numeric column - is detected as an outlier according to the chosen method. 
+ Modes: + - preview: estimate rows removed, show sample rows + - apply: perform removal (in-place local or DB CTAS+RENAME if table= provided) + - rollback: restore DB backup created by apply (requires mariadb_client) - Additionally logs execution metadata into `magic_metadata` table: - id, command_name, arguments, execution_timestamp, affected_columns, - operation_status, message, db_name, user_name + Notes: + - DB apply uses sampling to compute thresholds (best-effort). + - Execution metadata recorded in magic_metadata (includes rollback token). """ def __init__(self, args=""): @@ -42,7 +48,9 @@ def name(self): def help(self): return ( "%dropoutliers [columns=col1,col2,...] [method=iqr|zscore] [k=1.5] [z_thresh=3.0]\n" - "Removes rows containing outliers from data['last_select'] (in-place).\n" + " [mode=preview|apply|rollback] [table=schema.table] [confirm=true|false]\n" + " [sample_size=100]\n" + "Removes rows containing outliers from data['last_select'] or from a DB table (versioned apply).\n" "Execution metadata is recorded in table `magic_metadata`." 
) @@ -74,12 +82,12 @@ def parse_args(self, input_str): def _send_html(self, kernel, df): """Display DataFrame as HTML (fallback to text if needed).""" try: - html = df.to_html(index=False) + html_repr = df.to_html(index=False) mime = "text/html" except Exception: - html = str(df) + html_repr = str(df) mime = "text/plain" - display_content = {"data": {mime: html}, "metadata": {}} + display_content = {"data": {mime: html_repr}, "metadata": {}} kernel.send_response(kernel.iopub_socket, "display_data", display_content) def _detect_outliers_series(self, series, method, k=1.5, z_thresh=3.0): @@ -209,7 +217,10 @@ def _ensure_metadata_table(self, kernel, db_name): operation_status VARCHAR(50), message TEXT, db_name VARCHAR(255), - user_name VARCHAR(255) + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) ); """ try: @@ -220,7 +231,8 @@ def _ensure_metadata_table(self, kernel, db_name): log.error(f"Failed to ensure magic_metadata table: {e}") def _insert_metadata(self, kernel, command_name, arguments, affected_columns, - operation_status, message, db_name, user_name): + operation_status, message, db_name, user_name, + rollback_token=None, backup_table=None, original_table=None): mariadb_client = self._get_mariadb_client(kernel) log = self._get_logger(kernel) if mariadb_client is None: @@ -233,11 +245,14 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, message_sql = self._sql_escape(message) db_sql = self._sql_escape(db_name) user_sql = self._sql_escape(user_name) + rollback_sql = self._sql_escape(rollback_token) + backup_sql = self._sql_escape(backup_table) + original_sql = self._sql_escape(original_table) insert_sql = f""" INSERT INTO {table_full_name} (command_name, arguments, execution_timestamp, affected_columns, - operation_status, message, db_name, user_name) + operation_status, message, db_name, user_name, rollback_token, backup_table, original_table) VALUES ( 
{self._sql_escape(command_name)}, {args_sql}, @@ -246,7 +261,10 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, {status_sql}, {message_sql}, {db_sql}, - {user_sql} + {user_sql}, + {rollback_sql}, + {backup_sql}, + {original_sql} ); """ try: @@ -256,10 +274,130 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, except Exception as e: log.error(f"Exception while inserting metadata: {e}") - # --- end metadata helpers --- + def _acquire_lock(self, mariadb_client, lock_name, timeout=10): + try: + mariadb_client.run_statement(f"SELECT GET_LOCK('{lock_name}', {int(timeout)});") + if mariadb_client.iserror(): + return False + return True + except Exception: + return False + + def _release_lock(self, mariadb_client, lock_name): + try: + mariadb_client.run_statement(f"SELECT RELEASE_LOCK('{lock_name}');") + except Exception: + pass + + def _table_exists(self, mariadb_client, table_full_name): + try: + mariadb_client.run_statement(f"SELECT 1 FROM {table_full_name} LIMIT 1;") + return not mariadb_client.iserror() + except Exception: + return False + + def _compute_thresholds_db(self, mariadb_client, table_full, col, method, k=1.5, z_thresh=3.0, sample_size=100): + """ + Sample non-null values from DB and compute thresholds for IQR or zscore. 
+ Returns (ok, {lower:.., upper:..}, message) + """ + try: + out = mariadb_client.run_statement(f"SELECT {col} FROM {table_full} WHERE {col} IS NOT NULL LIMIT {int(sample_size)};") + if mariadb_client.iserror() or not out: + return False, None, "sample query failed" + try: + df_list = pd.read_html(out) + if not df_list or len(df_list) == 0: + return False, None, "no sample rows parsed" + series = df_list[0].iloc[:, 0].astype(float) # try numeric conversion + if series.dropna().empty: + return False, None, "sample contains no numeric values" + if method == "iqr": + q1 = series.quantile(0.25) + q3 = series.quantile(0.75) + iqr = q3 - q1 + lower = q1 - k * iqr + upper = q3 + k * iqr + return True, {"lower": float(lower), "upper": float(upper)}, "iqr via sampling" + elif method == "zscore": + mean = float(series.mean()) + std = float(series.std()) + if std == 0: + return False, None, "std==0 in sample" + lower = mean - float(z_thresh) * std + upper = mean + float(z_thresh) * std + return True, {"lower": float(lower), "upper": float(upper)}, "zscore via sampling" + else: + return False, None, "unknown method" + except Exception: + # fallback regex parse single-column HTML + vals = re.findall(r"(.*?)", str(out), flags=re.S | re.I) + nums = [] + for v in vals: + txt = re.sub(r"<.*?>", "", v).strip() + try: + nums.append(float(txt)) + except Exception: + continue + if not nums: + return False, None, "parsed sample contains no numeric values" + series = pd.Series(nums) + if method == "iqr": + q1 = series.quantile(0.25) + q3 = series.quantile(0.75) + iqr = q3 - q1 + lower = q1 - k * iqr + upper = q3 + k * iqr + return True, {"lower": float(lower), "upper": float(upper)}, "iqr via regex sample" + elif method == "zscore": + mean = float(series.mean()) + std = float(series.std()) + if std == 0: + return False, None, "std==0 in sample" + lower = mean - float(z_thresh) * std + upper = mean + float(z_thresh) * std + return True, {"lower": float(lower), "upper": float(upper)}, 
"zscore via regex sample" + else: + return False, None, "unknown method" + except Exception as e: + return False, None, f"exception computing thresholds: {e}" + + def _parse_count_result(self, res): + """Parse a SELECT COUNT(*) result returned by mariadb_client.run_statement (HTML or text).""" + try: + df_list = pd.read_html(res) + if df_list and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + try: + return int(val) + except Exception: + try: + return int(float(val)) + except Exception: + return None + except Exception: + m = re.search(r"(.*?)", str(res), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + try: + return int(txt) + except Exception: + try: + return int(float(txt)) + except Exception: + return None + # fallback: try to parse raw + try: + txt = str(res).strip() + return int(txt) + except Exception: + try: + return int(float(str(res))) + except Exception: + return None def execute(self, kernel, data): - """Execute the dropoutliers magic (modifies data['last_select'] in-place) and log metadata.""" + """Execute the dropoutliers magic (preview/apply/rollback).""" df = data.get("last_select") if df is None: kernel._send_message("stderr", "No last_select found in kernel data.") @@ -300,6 +438,20 @@ def execute(self, kernel, data): except Exception: z_thresh = 3.0 + # mode + mode = str(args.get("mode", "preview")).lower() + mode = mode if mode in {"preview", "apply", "rollback"} else "preview" + + table_full = args.get("table", None) + confirm = bool(args.get("confirm", False)) + sample_size = int(args.get("sample_size", 100)) + lock_timeout = int(args.get("lock_timeout", 10)) + + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + # Determine target numeric columns if columns is None: target_columns = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])] @@ -309,8 +461,6 @@ def execute(self, kernel, data): 
kernel._send_message("stderr", f"Column(s) not found: {', '.join(missing_cols)}") # log metadata for failure and return try: - db_name = self._get_db_name(kernel) - user_name = self._get_user_name(kernel) self._ensure_metadata_table(kernel, db_name) self._insert_metadata( kernel=kernel, @@ -335,8 +485,6 @@ def execute(self, kernel, data): kernel._send_message("stderr", "No numeric target columns found to detect outliers.") # log metadata for early exit try: - db_name = self._get_db_name(kernel) - user_name = self._get_user_name(kernel) self._ensure_metadata_table(kernel, db_name) self._insert_metadata( kernel=kernel, @@ -352,9 +500,6 @@ def execute(self, kernel, data): pass return - # Prepare metadata context - db_name = self._get_db_name(kernel) - user_name = self._get_user_name(kernel) # ensure metadata table exists try: self._ensure_metadata_table(kernel, db_name) @@ -364,13 +509,13 @@ def execute(self, kernel, data): except Exception: pass - # Detect outliers per column and combine masks - combined_mask = None - messages = [] - operation_status = "success" - try: - for col in target_columns: - try: + # --- PREVIEW MODE --- + if mode == "preview": + try: + # local detection counts & sample + messages = [] + combined_mask = None + for col in target_columns: mask = self._detect_outliers_series(df[col], method, k=k, z_thresh=z_thresh) n_out = int(mask.sum()) messages.append(f"Column '{col}': detected {n_out} outlier(s) using {method}.") @@ -378,67 +523,388 @@ def execute(self, kernel, data): combined_mask = mask.astype(bool) else: combined_mask = combined_mask | mask.astype(bool) - except Exception as e: - messages.append(f"Column '{col}': error detecting outliers: {e}") - except Exception as e: - operation_status = "error" - messages.append(f"Fatal error while detecting outliers: {e}") - # If no outliers found, log and return (but still record metadata) - if combined_mask is None or not combined_mask.any(): - try: - kernel._send_message("stdout", "No outliers 
detected. No rows removed.\n" + "\n".join(messages)) - self._send_html(kernel, df) - except Exception: - pass - # insert metadata (no rows removed) - try: - self._insert_metadata( - kernel=kernel, - command_name=self.name(), - arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns="\n".join(target_columns), - operation_status=operation_status, - message="\n".join(messages) or "No outliers detected.", - db_name=db_name, - user_name=user_name - ) - except Exception: - pass + n_before = len(df) + n_after = n_before - (int(combined_mask.sum()) if combined_mask is not None else 0) + kernel._send_message("stdout", f"PREVIEW (local): would drop {n_before - n_after} row(s) (from {n_before} to {n_after}).\n" + "\n".join(messages)) + + # show sample rows that would be dropped (local) + if combined_mask is not None and combined_mask.any(): + sample_rows = df[combined_mask].head(sample_size).copy() + sample_rows["_outlier_cols"] = sample_rows.apply(lambda r: ",".join([c for c in target_columns if pd.isnull(r.get(c)) is False and self._detect_outliers_series(pd.Series([r.get(c)]*1), method, k=k, z_thresh=z_thresh).iloc[0]]), axis=1) + try: + self._send_html(kernel, sample_rows) + except Exception: + kernel._send_message("stdout", str(sample_rows.head())) + else: + kernel._send_message("stdout", "PREVIEW (local): no rows with outliers in the sample.") + + # If DB target provided, attempt DB-based estimate (using sampling thresholds) + if table_full and mariadb_client is not None: + db_messages = [] + predicates = [] + for col in target_columns: + ok, thresholds, msg = self._compute_thresholds_db(mariadb_client, table_full, col, method, k=k, z_thresh=z_thresh, sample_size=sample_size) + if ok and thresholds: + lower = thresholds["lower"] + upper = thresholds["upper"] + # ensure numeric literal formatting + predicates.append(f"({col} < {repr(lower)} OR {col} > {repr(upper)})") + db_messages.append(f"{col}: thresholds approx [{lower}, {upper}] 
({msg})") + else: + db_messages.append(f"{col}: could not compute thresholds ({msg}) - skipped in DB predicate") + + if predicates: + db_pred = " OR ".join(predicates) + try: + out = mariadb_client.run_statement(f"SELECT COUNT(*) FROM {table_full} WHERE {db_pred};") + cnt = self._parse_count_result(out) + if cnt is None: + kernel._send_message("stdout", "PREVIEW (db): could not parse count result (check permissions).") + else: + kernel._send_message("stdout", f"PREVIEW (db): estimated rows matching outlier predicate: {cnt}.") + except Exception: + kernel._send_message("stdout", "PREVIEW (db): failed to run count query (continuing).") + kernel._send_message("stdout", "PREVIEW (db) thresholds:\n" + "\n".join(db_messages)) + else: + kernel._send_message("stdout", "PREVIEW (db): no DB predicates could be computed (insufficient sample/values).") + # log preview metadata + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='preview', + message='preview_completed', + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Error during preview: {e}") return - # Drop rows in-place where any target column is an outlier - try: - n_before = len(df) - df.drop(index=df[combined_mask].index, inplace=True) - data["last_select"] = df - n_after = len(df) - removed = n_before - n_after - kernel._send_message("stdout", f"Dropped {removed} row(s) containing outliers (in-place).\n" + "\n".join(messages)) + # --- ROLLBACK MODE --- + if mode == "rollback": + if mariadb_client is None: + kernel._send_message("stderr", "Rollback requested but no mariadb_client available.") + return + token = args.get("rollback_token", None) try: - self._send_html(kernel, df) - except Exception: - pass - except Exception as e: - operation_status = "error" - err = f"Error 
while removing outlier rows: {e}" - kernel._send_message("stderr", err) - messages.append(err) + if not token: + # try to find latest rollback_token for this command + user + mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Could not find metadata for rollback (check permissions).") + return + out = mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + m = re.search(r"(.*?)", str(out), flags=re.S | re.I) + if m: + token = re.sub(r"<.*?>", "", m.group(1)).strip() + if not token: + kernel._send_message("stderr", "No rollback_token found; cannot rollback safely.") + return + + # get backup_table and original_table + out = mariadb_client.run_statement(f"SELECT backup_table, original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + m = re.search(r"(.*?).*?(.*?)", str(out), flags=re.S | re.I) + backup_table = None + original_table = None + if m: + backup_table = re.sub(r"<.*?>", "", m.group(1)).strip() + original_table = re.sub(r"<.*?>", "", m.group(2)).strip() + else: + try: + out_b = mariadb_client.run_statement(f"SELECT backup_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mb = re.search(r"(.*?)", str(out_b), flags=re.S | re.I) + if mb: + backup_table = re.sub(r"<.*?>", "", mb.group(1)).strip() + except Exception: + pass + try: + out_o = mariadb_client.run_statement(f"SELECT original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mo = re.search(r"(.*?)", str(out_o), flags=re.S | re.I) + if mo: + original_table = 
re.sub(r"<.*?>", "", mo.group(1)).strip() + except Exception: + pass + + if not backup_table: + kernel._send_message("stderr", "No backup table found in metadata for rollback token.") + return + + # perform atomic restore: backup_table -> original_table + lock_name = f"dropoutliers_rb_{token}" + self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + try: + if original_table: + if self._table_exists(mariadb_client, original_table): + original_old = f"{original_table}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {original_table} TO {original_old}, {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename tables during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {original_table}; previous {original_table} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={original_table};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {original_table}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={original_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + 
backup_table=backup_table, + original_table=original_table + ) + else: + # try to infer original table name from arguments + out_args = mariadb_client.run_statement(f"SELECT arguments FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + margs = re.search(r"(.*?)", str(out_args), flags=re.S | re.I) + inferred_original = None + if margs: + args_txt = re.sub(r"<.*?>", "", margs.group(1)).strip() + mm = re.search(r"table\s*=\s*([^\s,]+)", args_txt) + if mm: + inferred_original = mm.group(1).strip() + if inferred_original: + if self._table_exists(mariadb_client, inferred_original): + original_old = f"{inferred_original}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {inferred_original} TO {original_old}, {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {inferred_original}; previous {inferred_original} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={inferred_original};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to inferred original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {inferred_original}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else 
str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={inferred_original}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + kernel._send_message("stderr", "Could not determine original table name for rollback. Manual restoration required.") + return + finally: + self._release_lock(mariadb_client, lock_name) + except Exception as e: + kernel._send_message("stderr", f"Rollback error: {e}") + return - # Insert metadata (best-effort) - try: - self._insert_metadata( - kernel=kernel, - command_name=self.name(), - arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns="\n".join(target_columns), - operation_status=operation_status, - message="\n".join(messages), - db_name=db_name, - user_name=user_name - ) - except Exception: - try: - kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") - except Exception: - pass + # --- APPLY MODE --- + if mode == "apply": + # DB-target apply if table provided and mariadb_client present + if table_full and mariadb_client is not None: + if not confirm: + kernel._send_message("stderr", "DB apply requires confirm=true to proceed. 
Preview first, then re-run with confirm=true.") + return + + token = str(uuid.uuid4()).replace('-', '')[:16] + backup_table = f"{table_full}_backup_{token}" + new_table = f"{table_full}_vnew_{token}" + predicates = [] + messages = [] + + # compute thresholds for each column using DB sampling + for col in target_columns: + ok, thresholds, msg = self._compute_thresholds_db(mariadb_client, table_full, col, method, k=k, z_thresh=z_thresh, sample_size=sample_size) + if ok and thresholds: + lower = thresholds["lower"] + upper = thresholds["upper"] + # use repr to keep decimal representation + predicates.append(f"({col} < {repr(lower)} OR {col} > {repr(upper)})") + messages.append(f"{col}: thresholds [{lower}, {upper}] ({msg})") + else: + messages.append(f"{col}: could not compute thresholds ({msg}); this column will not be used in DB predicate") + + if not predicates: + kernel._send_message("stderr", "Could not compute DB predicates for any column; aborting DB apply.") + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='error', + message='db_apply_failed_no_predicates', + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + db_pred = " OR ".join(predicates) + + try: + lock_name = f"dropoutliers_apply_{token}" + got_lock = self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + if not got_lock: + kernel._send_message("stderr", "Could not acquire advisory lock; aborting apply.") + return + + # create new table with rows to keep (NOT predicate) + mariadb_client.run_statement(f"CREATE TABLE {new_table} AS SELECT * FROM {table_full} WHERE NOT ({db_pred});") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to create new table for apply (CTAS failed).") + return + + # atomic rename original -> backup, new -> original + 
mariadb_client.run_statement(f"RENAME TABLE {table_full} TO {backup_table}, {new_table} TO {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "RENAME TABLE failed (apply may be inconsistent).") + return + + kernel._send_message("stdout", f"Apply completed: original preserved as {backup_table}.") + # log metadata with rollback token so rollback can restore + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='applied', + message='applied_db_versioned', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=table_full + ) + + # attempt to refresh last_select with sample + try: + mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + fresh = mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + try: + df_list = pd.read_html(fresh) + if df_list and len(df_list) > 0: + data["last_select"] = df_list[0] + try: + self._send_html(kernel, data["last_select"]) + except Exception: + pass + except Exception: + kernel._send_message("stdout", "Applied to DB; could not refresh last_select from DB.") + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Apply (DB versioned) failed: {e}") + log.exception(e) + finally: + self._release_lock(mariadb_client, lock_name) + return + + else: + # Local in-place apply on data['last_select'] (existing behavior) + combined_mask = None + messages = [] + operation_status = "success" + try: + for col in target_columns: + try: + mask = self._detect_outliers_series(df[col], method, k=k, z_thresh=z_thresh) + n_out = int(mask.sum()) + messages.append(f"Column '{col}': detected {n_out} outlier(s) using {method}.") + if combined_mask is None: + combined_mask = mask.astype(bool) + else: + combined_mask = combined_mask | 
mask.astype(bool) + except Exception as e: + operation_status = "error" + messages.append(f"Column '{col}': error detecting outliers: {e}") + + if combined_mask is None or not combined_mask.any(): + kernel._send_message("stdout", "No outliers detected. No rows removed.\n" + "\n".join(messages)) + try: + self._send_html(kernel, df) + except Exception: + pass + else: + n_before = len(df) + df.drop(index=df[combined_mask].index, inplace=True) + data["last_select"] = df + n_after = len(df) + removed = n_before - n_after + kernel._send_message("stdout", f"Dropped {removed} row(s) containing outliers (in-place).\n" + "\n".join(messages)) + try: + self._send_html(kernel, df) + except Exception: + pass + + except Exception as e: + operation_status = "error" + kernel._send_message("stderr", f"Error while removing outlier rows locally: {e}") + messages.append(str(e)) + + # Insert metadata + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(target_columns), + operation_status=operation_status, + message="\n".join(messages), + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + + return + + # fallback + kernel._send_message("stderr", "Unknown execution path reached.") + return diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py index 4e3b393..928e0ee 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py @@ -8,6 +8,8 @@ import logging import os import re +import uuid +import time # Optional helper to reliably get current DB name (if available) try: @@ -18,18 +20,20 @@ class FillMissing(MariaMagic): """ - %fillmissing 
[columns=col1,col2,...] [strategy=mean|median|mode|constant] [value=const] - - Always performs the operation IN-PLACE on data["last_select"]: - - - If columns provided, fill missing values only for those columns. - - If no columns provided, fill missing values for all columns. - - strategies: - * mean -> uses column mean (numeric columns only) - * median -> uses column median (numeric columns only) - * mode -> uses column mode (most frequent value; works for any dtype) - * constant-> fills with provided value (value must be supplied via value=...) - Execution metadata is recorded into table `magic_metadata`. + %fillmissing [columns=col1,col2,...] [strategy=mean|median|mode|constant] + [value=const] [mode=preview|apply|rollback] [table=schema.table] + [confirm=true|false] [sample_size=100] + + Behavior: + - preview: shows what would be filled (counts, sample rows with nulls, and computed fill values) + - apply: performs the fill (locally or on DB if table= specified) + - rollback: attempts to restore a backup created by an apply (requires mariadb_client + rollback_token or will use latest by user) + + Notes: + - DB apply uses a CTAS + atomic RENAME pattern so the original is preserved as _backup_. + - For DB fill values we compute values using SQL when possible (AVG for mean, GROUP BY+COUNT for mode). + Median uses a sampling fallback to compute the median in Python (best-effort). + - Execution metadata is recorded in table `magic_metadata` (including rollback_token, backup_table, original_table). """ def __init__(self, args=""): @@ -44,8 +48,8 @@ def name(self): def help(self): return ( "%fillmissing [columns=col1,col2,...] [strategy=mean|median|mode|constant] [value=const]\n" - "Fills missing values in data['last_select'] (always IN-PLACE)." - "Execution metadata is recorded in table `magic_metadata`." 
+ " [mode=preview|apply|rollback] [table=schema.table] [confirm=true|false] [sample_size=100]\n" + "Fills missing values in data['last_select'] or in DB table when table= is provided." ) def _str_to_obj(self, s): @@ -77,12 +81,12 @@ def parse_args(self, input_str): def _send_html(self, kernel, df): """Display DataFrame as HTML (fallback to text if needed).""" try: - html = df.to_html(index=False) + html_repr = df.to_html(index=False) mime = "text/html" except Exception: - html = str(df) + html_repr = str(df) mime = "text/plain" - display_content = {"data": {mime: html}, "metadata": {}} + display_content = {"data": {mime: html_repr}, "metadata": {}} kernel.send_response(kernel.iopub_socket, "display_data", display_content) # -------------------- metadata / DB helpers (best-effort) -------------------- @@ -184,8 +188,7 @@ def _get_user_name(self, kernel): def _ensure_metadata_table(self, kernel, db_name): """ Create magic_metadata table if it doesn't exist. - Columns: id, command_name, arguments, execution_timestamp, - affected_columns, operation_status, message, db_name, user_name + Columns include fields to support rollback tracking. """ mariadb_client = self._get_mariadb_client(kernel) log = self._get_logger(kernel) @@ -206,7 +209,10 @@ def _ensure_metadata_table(self, kernel, db_name): operation_status VARCHAR(50), message TEXT, db_name VARCHAR(255), - user_name VARCHAR(255) + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) ); """ try: @@ -217,7 +223,8 @@ def _ensure_metadata_table(self, kernel, db_name): log.error(f"Failed to ensure magic_metadata table: {e}") def _insert_metadata(self, kernel, command_name, arguments, affected_columns, - operation_status, message, db_name, user_name): + operation_status, message, db_name, user_name, + rollback_token=None, backup_table=None, original_table=None): """ Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
""" @@ -235,11 +242,14 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, message_sql = self._sql_escape(message) db_sql = self._sql_escape(db_name) user_sql = self._sql_escape(user_name) + rollback_sql = self._sql_escape(rollback_token) + backup_sql = self._sql_escape(backup_table) + original_sql = self._sql_escape(original_table) insert_sql = f""" INSERT INTO {table_full_name} (command_name, arguments, execution_timestamp, affected_columns, - operation_status, message, db_name, user_name) + operation_status, message, db_name, user_name, rollback_token, backup_table, original_table) VALUES ( {self._sql_escape(command_name)}, {args_sql}, @@ -248,7 +258,10 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, {status_sql}, {message_sql}, {db_sql}, - {user_sql} + {user_sql}, + {rollback_sql}, + {backup_sql}, + {original_sql} ); """ try: @@ -259,10 +272,120 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, except Exception as e: log.error(f"Exception while inserting metadata: {e}") + def _acquire_lock(self, mariadb_client, lock_name, timeout=10): + try: + mariadb_client.run_statement(f"SELECT GET_LOCK('{lock_name}', {int(timeout)});") + if mariadb_client.iserror(): + return False + return True + except Exception: + return False + + def _release_lock(self, mariadb_client, lock_name): + try: + mariadb_client.run_statement(f"SELECT RELEASE_LOCK('{lock_name}');") + except Exception: + pass + + def _table_exists(self, mariadb_client, table_full_name): + try: + mariadb_client.run_statement(f"SELECT 1 FROM {table_full_name} LIMIT 1;") + return not mariadb_client.iserror() + except Exception: + return False + # -------------------- end metadata helpers -------------------- + def _compute_fill_value_db(self, mariadb_client, table_full, col, strategy, const_value, sample_size=100): + """ + Compute fill value for a DB column using SQL when possible; otherwise fall back to sampling. 
+ Returns (success_bool, fill_value or None, message) + """ + try: + # constant + if strategy == "constant": + return True, const_value, "constant provided" + + # mean -> AVG + if strategy == "mean": + try: + out = mariadb_client.run_statement(f"SELECT AVG({col}) FROM {table_full} WHERE {col} IS NOT NULL;") + if mariadb_client.iserror() or not out: + return False, None, "AVG query failed" + # parse result + try: + df_list = pd.read_html(out) + if df_list and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + # convert to numeric if possible + try: + valf = float(val) + return True, valf, "mean via SQL" + except Exception: + return True, val, "mean via SQL (non-numeric parse)" + except Exception: + # regex fallback + m = re.search(r"(.*?)", str(out), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + try: + return True, float(txt), "mean via SQL (regex)" + except Exception: + return True, txt, "mean via SQL (regex)" + return False, None, "Could not parse AVG result" + except Exception: + return False, None, "AVG query exception" + + # mode -> most frequent value via GROUP BY + if strategy == "mode": + try: + out = mariadb_client.run_statement(f"SELECT {col}, COUNT(*) AS cnt FROM {table_full} WHERE {col} IS NOT NULL GROUP BY {col} ORDER BY cnt DESC LIMIT 1;") + if mariadb_client.iserror() or not out: + return False, None, "MODE query failed" + try: + df_list = pd.read_html(out) + if df_list and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + return True, val, "mode via SQL" + except Exception: + m = re.search(r"(.*?)", str(out), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + return True, txt, "mode via SQL (regex)" + return False, None, "Could not parse mode result" + except Exception: + return False, None, "MODE query exception" + + # median -> sampling fallback: select a sample of non-null values and compute median in pandas + if strategy == "median": + try: + out = mariadb_client.run_statement(f"SELECT 
{col} FROM {table_full} WHERE {col} IS NOT NULL LIMIT {int(sample_size)};") + if mariadb_client.iserror() or not out: + return False, None, "Median: sample query failed" + try: + df_list = pd.read_html(out) + if df_list and len(df_list) > 0: + series = df_list[0].iloc[:, 0] + # convert to numeric where possible + try: + series_num = pd.to_numeric(series, errors="coerce").dropna() + if series_num.empty: + return False, None, "Median: non-numeric or all missing in sample" + med = series_num.median() + return True, float(med), "median via sampling" + except Exception: + return False, None, "Median: numeric conversion failed" + except Exception: + return False, None, "Median: parsing sample failed" + except Exception: + return False, None, "Median: sample query exception" + + return False, None, "Unknown strategy" + except Exception as e: + return False, None, f"Exception computing fill value: {e}" + def execute(self, kernel, data): - """Execute the fillmissing magic (always modifies data['last_select']) and log metadata.""" + """Execute the fillmissing magic (preview/apply/rollback) and log metadata.""" df = data.get("last_select") if df is None: kernel._send_message("stderr", "No last_select found in kernel data.") @@ -287,17 +410,47 @@ def execute(self, kernel, data): else: target_columns = None - # determine target columns (None => all columns) + # mode: preview|apply|rollback + mode = str(args.get("mode", "preview")).lower() + mode = mode if mode in {"preview", "apply", "rollback"} else "preview" + + # other args + strategy = args.get("strategy", "mean") + if isinstance(strategy, str): + strategy = strategy.lower() + else: + strategy = str(strategy).lower() + + allowed = {"mean", "median", "mode", "constant"} + if strategy not in allowed: + kernel._send_message("stderr", f"Unknown strategy '{strategy}'. 
Allowed: {', '.join(allowed)}") + return + + value_provided = "value" in args + const_value = args.get("value", None) + + if strategy == "constant" and not value_provided and mode != "preview": + kernel._send_message("stderr", "Strategy 'constant' requires a 'value=...' argument.") + return + + table_full = args.get("table", None) + confirm = bool(args.get("confirm", False)) + sample_size = int(args.get("sample_size", 100)) + + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + + # determine local target columns if not provided if target_columns is None: target_columns = list(df.columns) else: missing_cols = [c for c in target_columns if c not in df.columns] if missing_cols: - kernel._send_message("stderr", f"Column(s) not found: {', '.join(missing_cols)}") + kernel._send_message("stderr", f"Column(s) not found in last_select: {', '.join(missing_cols)}") # log metadata for failure try: - db_name = self._get_db_name(kernel) - user_name = self._get_user_name(kernel) self._ensure_metadata_table(kernel, db_name) self._insert_metadata( kernel=kernel, @@ -313,29 +466,7 @@ def execute(self, kernel, data): pass return - # parse strategy - strategy = args.get("strategy", "mean") - if isinstance(strategy, str): - strategy = strategy.lower() - else: - strategy = str(strategy).lower() - - allowed = {"mean", "median", "mode", "constant"} - if strategy not in allowed: - kernel._send_message("stderr", f"Unknown strategy '{strategy}'. Allowed: {', '.join(allowed)}") - return - - # constant requires value - value_provided = "value" in args - const_value = args.get("value", None) - - if strategy == "constant" and not value_provided: - kernel._send_message("stderr", "Strategy 'constant' requires a 'value=...' 
argument.") - return - - # Prepare metadata context and ensure table exists (best-effort) - db_name = self._get_db_name(kernel) - user_name = self._get_user_name(kernel) + # Ensure metadata table exists (best-effort) try: self._ensure_metadata_table(kernel, db_name) except Exception: @@ -344,80 +475,438 @@ def execute(self, kernel, data): except Exception: pass - # perform filling column by column with sensible handling for dtype - messages = [] - operation_status = "success" - for col in target_columns: + # --- PREVIEW MODE ------------------------------------------------- + if mode == "preview": try: - series = df[col] - if strategy in {"mean", "median"}: - # only numeric columns supported for mean/median - if pd.api.types.is_numeric_dtype(series): - if strategy == "mean": - fill_val = series.mean(skipna=True) + messages = [] + # missing count per column in the DataFrame + missing_counts = {col: int(df[col].isnull().sum()) for col in target_columns} + summary_lines = [f"{col}: missing={count}" for col, count in missing_counts.items()] + kernel._send_message("stdout", "PREVIEW: missing counts per column:\n" + "\n".join(summary_lines)) + + # compute would-be fill values (local logic); for DB-target, attempt to compute via DB if mariadb_client available + computed = {} + for col in target_columns: + # local compute based on df contents + if strategy == "constant": + computed[col] = (True, const_value, "constant provided") + else: + # compute on the sample df (non-null values) + series = df[col].dropna() + if series.empty: + computed[col] = (False, None, "no non-missing values in preview sample") else: - fill_val = series.median(skipna=True) - # If result is NaN (e.g., all values missing), skip and warn - if pd.isna(fill_val): - messages.append(f"Column '{col}': no non-missing values to compute {strategy}. 
Skipped.") - continue - df[col].fillna(fill_val, inplace=True) - messages.append(f"Column '{col}': filled missing with {strategy}={fill_val}.") + if strategy == "mean": + if pd.api.types.is_numeric_dtype(series): + computed[col] = (True, float(series.mean()), "mean via local preview") + else: + computed[col] = (False, None, "not numeric; cannot compute mean locally") + elif strategy == "median": + if pd.api.types.is_numeric_dtype(series): + computed[col] = (True, float(series.median()), "median via local preview") + else: + computed[col] = (False, None, "not numeric; cannot compute median locally") + elif strategy == "mode": + modes = series.mode(dropna=True) + if not modes.empty: + computed[col] = (True, modes.iloc[0], "mode via local preview") + else: + computed[col] = (False, None, "no mode found in local preview") + + # if DB target specified and mariadb_client available, try DB-based compute (overrides local) + if table_full and mariadb_client is not None: + ok, val, msg = self._compute_fill_value_db(mariadb_client, table_full, col, strategy, const_value, sample_size=sample_size) + computed[col] = (ok, val, f"db:{msg}" if msg else "db:unknown") + + # display computed fill values + comp_lines = [] + for col, (ok, val, msg) in computed.items(): + if ok: + comp_lines.append(f"{col}: would fill with -> {val} ({msg})") else: - messages.append(f"Column '{col}' is not numeric; cannot use {strategy}. Skipped.") - continue - - elif strategy == "mode": - # mode works for any dtype; pick first mode if multiple - modes = series.mode(dropna=True) - if modes.empty: - messages.append(f"Column '{col}': no mode (all missing). 
Skipped.") - continue - fill_val = modes.iloc[0] - df[col].fillna(fill_val, inplace=True) - messages.append(f"Column '{col}': filled missing with mode={fill_val}.") - - elif strategy == "constant": - # use the parsed const_value directly - fill_val = const_value - df[col].fillna(fill_val, inplace=True) - messages.append(f"Column '{col}': filled missing with constant value={fill_val}.") + comp_lines.append(f"{col}: could NOT determine fill value ({msg}); would skip") + kernel._send_message("stdout", "PREVIEW: computed fill-values (best-effort):\n" + "\n".join(comp_lines)) + + # show a sample of rows that would be affected (rows with any NULL in target_columns) + mask = df[target_columns].isnull().any(axis=1) + sample_rows = df[mask].head(sample_size) + if not sample_rows.empty: + # Add a helper column to indicate which columns are null in that row + def nulls_in_row(r): + return ",".join([c for c in target_columns if pd.isnull(r.get(c))]) + sample_preview = sample_rows.copy() + sample_preview["_null_columns"] = sample_preview.apply(nulls_in_row, axis=1) + try: + self._send_html(kernel, sample_preview) + except Exception: + kernel._send_message("stdout", str(sample_preview.head())) + else: + kernel._send_message("stdout", "PREVIEW: no rows with missing values in the preview sample.") + + # log preview metadata + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns) if target_columns else 'ALL_COLUMNS', + operation_status='preview', + message='preview_computed_fill_values', + db_name=db_name, + user_name=user_name + ) + except Exception: + pass except Exception as e: - operation_status = "error" - messages.append(f"Column '{col}': error while filling missing values: {e}") + kernel._send_message("stderr", f"Error during preview: {e}") + return - # update the data store and display results - try: - data["last_select"] = df - summary 
= "\n".join(messages) - kernel._send_message("stdout", f"Fill missing completed (in-place). Summary:\n{summary}") + # --- ROLLBACK MODE --------------------------------------------- + if mode == "rollback": + if mariadb_client is None: + kernel._send_message("stderr", "Rollback requested but no mariadb_client available.") + return + token = args.get("rollback_token", None) try: - self._send_html(kernel, df) - except Exception: - pass - except Exception as e: - operation_status = "error" - kernel._send_message("stderr", f"Error while updating last_select or displaying DataFrame: {e}") - messages.append(f"Error while updating last_select or displaying DataFrame: {e}") + if not token: + # find latest metadata for this command and user + mariadb_client.run_statement(f"SELECT rollback_token, backup_table, original_table FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Could not find metadata for rollback (check permissions).") + return + out = mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + m = re.search(r"(.*?)", str(out), flags=re.S | re.I) + if m: + token = re.sub(r"<.*?>", "", m.group(1)).strip() + if not token: + kernel._send_message("stderr", "No rollback_token found; cannot rollback safely.") + return + + # fetch backup_table and original_table + out = mariadb_client.run_statement(f"SELECT backup_table, original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + m = re.search(r"(.*?).*?(.*?)", str(out), flags=re.S | re.I) + backup_table = None + original_table = None + if m: + backup_table = re.sub(r"<.*?>", "", m.group(1)).strip() + original_table = 
re.sub(r"<.*?>", "", m.group(2)).strip() + else: + # fallback single column parses + try: + out_b = mariadb_client.run_statement(f"SELECT backup_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mb = re.search(r"(.*?)", str(out_b), flags=re.S | re.I) + if mb: + backup_table = re.sub(r"<.*?>", "", mb.group(1)).strip() + except Exception: + pass + try: + out_o = mariadb_client.run_statement(f"SELECT original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mo = re.search(r"(.*?)", str(out_o), flags=re.S | re.I) + if mo: + original_table = re.sub(r"<.*?>", "", mo.group(1)).strip() + except Exception: + pass + + if not backup_table: + kernel._send_message("stderr", "No backup table found in metadata for rollback token.") + return + + # perform atomic restore: backup_table -> original_table + lock_name = f"fillmissing_rb_{token}" + self._acquire_lock(mariadb_client, lock_name, timeout=10) + try: + if original_table: + if self._table_exists(mariadb_client, original_table): + original_old = f"{original_table}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {original_table} TO {original_old}, {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename tables during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {original_table}; previous {original_table} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns) if target_columns else 'ALL_COLUMNS', + operation_status='rollback', + message=f'restored_to={original_table};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + 
else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {original_table}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns) if target_columns else 'ALL_COLUMNS', + operation_status='rollback', + message=f'restored_to={original_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + # try to infer original table name from arguments + out_args = mariadb_client.run_statement(f"SELECT arguments FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + margs = re.search(r"(.*?)", str(out_args), flags=re.S | re.I) + inferred_original = None + if margs: + args_txt = re.sub(r"<.*?>", "", margs.group(1)).strip() + mm = re.search(r"table\s*=\s*([^\s,]+)", args_txt) + if mm: + inferred_original = mm.group(1).strip() + if inferred_original: + if self._table_exists(mariadb_client, inferred_original): + original_old = f"{inferred_original}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {inferred_original} TO {original_old}, {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {inferred_original}; previous {inferred_original} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns) if target_columns else 
'ALL_COLUMNS', + operation_status='rollback', + message=f'restored_to={inferred_original};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to inferred original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {inferred_original}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns) if target_columns else 'ALL_COLUMNS', + operation_status='rollback', + message=f'restored_to={inferred_original}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + kernel._send_message("stderr", "Could not determine original table name for rollback. 
Manual restoration required.") + return + finally: + self._release_lock(mariadb_client, lock_name) + except Exception as e: + kernel._send_message("stderr", f"Rollback error: {e}") + return - # Insert metadata (best-effort) - try: - args_for_db = self.args if isinstance(self.args, str) else str(self.args) - affected_columns_str = "\n".join(target_columns) if target_columns else "" - message_str = "\n".join(messages) - self._insert_metadata( - kernel=kernel, - command_name=self.name(), - arguments=args_for_db, - affected_columns=affected_columns_str, - operation_status=operation_status, - message=message_str, - db_name=db_name, - user_name=user_name - ) - except Exception: - try: - kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") - except Exception: - pass + # --- APPLY MODE ----------------------------------------------- + if mode == "apply": + # DB-target apply if table provided and mariadb_client present + if table_full and mariadb_client is not None: + # require explicit confirmation for DB changes + if not confirm: + kernel._send_message("stderr", "DB apply requires confirm=true to proceed. 
Preview first, then re-run with confirm=true.") + return + + # we'll compute fill-values per column (best-effort), construct a CTAS where we apply COALESCE(column, ) + token = str(uuid.uuid4()).replace('-', '')[:16] + backup_table = f"{table_full}_backup_{token}" + new_table = f"{table_full}_vnew_{token}" + fill_map = {} # col -> (ok, val, msg) + + for col in target_columns: + ok, val, msg = self._compute_fill_value_db(mariadb_client, table_full, col, strategy, const_value, sample_size=sample_size) + fill_map[col] = (ok, val, msg) + if not ok: + kernel._send_message("stdout", f"Column '{col}': could not compute fill value ({msg}) — will skip filling this column in DB apply.") + + # Build select expressions: for columns we can fill use COALESCE(col, ) AS col; for others keep col + exprs = [] + for c in list(df.columns): + if c in fill_map and fill_map[c][0]: + val = fill_map[c][1] + # decide whether to quote: try numeric conversion + try: + # allow numeric literal if val is a number + if isinstance(val, (int, float)): + literal = str(val) + else: + # attempt to parse numeric-like string + literal = str(val) + # try to parse float + try: + float(literal) + literal = literal + except Exception: + literal = self._sql_escape(literal) + except Exception: + literal = self._sql_escape(str(val)) + # If literal looks already quoted (i.e. 
started with '), use directly + if isinstance(literal, str) and literal.startswith("'") and literal.endswith("'"): + exprs.append(f"COALESCE({c}, {literal}) AS {c}") + else: + # numeric or unquoted string (we still need to ensure strings are quoted) + try: + # if numeric + float(literal) + exprs.append(f"COALESCE({c}, {literal}) AS {c}") + except Exception: + exprs.append(f"COALESCE({c}, {self._sql_escape(literal)}) AS {c}") + else: + exprs.append(c) + + select_expr = ", ".join(exprs) + try: + lock_name = f"fillmissing_apply_{token}" + got_lock = self._acquire_lock(mariadb_client, lock_name, timeout=10) + if not got_lock: + kernel._send_message("stderr", "Could not acquire advisory lock; aborting apply.") + return + + # create new table with filled values + mariadb_client.run_statement(f"CREATE TABLE {new_table} AS SELECT {select_expr} FROM {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to create new table for apply (CTAS failed).") + return + + # atomic rename original -> backup, new -> original + mariadb_client.run_statement(f"RENAME TABLE {table_full} TO {backup_table}, {new_table} TO {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "RENAME TABLE failed (apply may be inconsistent).") + return + + kernel._send_message("stdout", f"Apply completed: original preserved as {backup_table}.") + # Insert metadata with token so rollback can restore + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns) if target_columns else 'ALL_COLUMNS', + operation_status='applied', + message=f'applied_backup={backup_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=table_full + ) + + # attempt to refresh last_select with a sample + try: + mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT 
{sample_size};") + fresh = mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + try: + df_list = pd.read_html(fresh) + if df_list and len(df_list) > 0: + data["last_select"] = df_list[0] + try: + self._send_html(kernel, data["last_select"]) + except Exception: + pass + except Exception: + kernel._send_message("stdout", "Applied to DB; could not refresh last_select from DB.") + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Apply (DB versioned) failed: {e}") + log.exception(e) + finally: + self._release_lock(mariadb_client, lock_name) + return + + else: + # Local in-place apply on data['last_select'] (behaves like original implementation) + operation_status = "success" + messages = [] + try: + before_count = len(df) + for col in target_columns: + try: + series = df[col] + if strategy in {"mean", "median"}: + if pd.api.types.is_numeric_dtype(series): + if strategy == "mean": + fill_val = series.mean(skipna=True) + else: + fill_val = series.median(skipna=True) + if pd.isna(fill_val): + messages.append(f"Column '{col}': no non-missing values to compute {strategy}. Skipped.") + continue + df[col].fillna(fill_val, inplace=True) + messages.append(f"Column '{col}': filled missing with {strategy}={fill_val}.") + else: + messages.append(f"Column '{col}' is not numeric; cannot use {strategy}. Skipped.") + continue + elif strategy == "mode": + modes = series.mode(dropna=True) + if modes.empty: + messages.append(f"Column '{col}': no mode (all missing). 
Skipped.") + continue + fill_val = modes.iloc[0] + df[col].fillna(fill_val, inplace=True) + messages.append(f"Column '{col}': filled missing with mode={fill_val}.") + elif strategy == "constant": + fill_val = const_value + df[col].fillna(fill_val, inplace=True) + messages.append(f"Column '{col}': filled missing with constant value={fill_val}.") + except Exception as e: + operation_status = "error" + messages.append(f"Column '{col}': error while filling missing values: {e}") + + after_count = len(df) + dropped = 0 # not relevant here + data["last_select"] = df + summary = "\n".join(messages) + kernel._send_message("stdout", f"Fill missing completed (in-place). Summary:\n{summary}") + try: + self._send_html(kernel, df) + except Exception: + pass + except Exception as e: + operation_status = "error" + kernel._send_message("stderr", f"Error while applying fillmissing locally: {e}") + messages.append(f"Error while applying fillmissing locally: {e}") + + # Insert metadata + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "\n".join(target_columns) if target_columns else "" + message_str = "\n".join(messages) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status=operation_status, + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + + return + + # fallback + kernel._send_message("stderr", "Unknown execution path reached.") + return From 7200efcd74c0d584236d14b53966a7faf8f08aed Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Wed, 29 Oct 2025 18:01:40 +0000 Subject: [PATCH 29/38] Added preview, apply, rollback to data_preprocessing --- Untitled.ipynb | 2395 +++++++++++++---- last_query.csv | 20 +- .../ml_commands/data_preprocessing/encode.py | 724 ++++- 
.../data_preprocessing/normalize.py | 581 +++- .../data_preprocessing/standardize.py | 560 +++- 5 files changed, 3394 insertions(+), 886 deletions(-) diff --git a/Untitled.ipynb b/Untitled.ipynb index 3c0797e..2f473dc 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -6737,6 +6737,880 @@ "%encode method=label drop_original=true" ] }, + { + "cell_type": "code", + "execution_count": 5, + "id": "66a8378a-6f87-4b38-a729-5aab1a288cb0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREVIEW (local):\n", + "Local: Column 'department' unique non-null values: 4 (showing up to 10): ['HR', 'Engineering', 'Sales', 'Finance']\n", + "PREVIEW (local) estimated created columns: 4\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FNaN5.01287.514050.255000.0300.08.5475.00
2BobEngineering45MMasters20.03091.0320100.11200000.015000.09.0589.00
3CharlieSales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01
4DianaEngineering29FPhD6.02295.225020.097000.010000.09.6595.00
6FrankHR50MHigh School25.0872.5010150.760000.04000.06.5260.01
7GraceSales42FBachelors18.02081.4125120.485000.07000.07.8374.00
8HenryEngineering31MMasters7.02593.123550.295000.09000.09.1590.00
9IvyFinance27FBachelors3.01085.002080.670000.05000.08.2482.00
10JackSales55MHigh School30.01268.905250.865000.02000.05.5150.01
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%encode method=onehot columns=department drop_original=false mode=preview" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2ebe63dd-3817-423b-bfb1-8e4b57dfb0a9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREVIEW (local):\n", + "Local: Column 'department' unique non-null values: 4 (showing up to 10): ['HR', 'Engineering', 'Sales', 'Finance']\n", + "PREVIEW (local) estimated created columns: 4\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
emp_idnamedepartment_HRdepartment_Engineeringdepartment_Salesdepartment_NULLdepartment_Financedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1Alice10000HR30FNaN5.01287.514050.255000.0300.08.5475.00
2Bob01000Engineering45MMasters20.03091.0320100.11200000.015000.09.0589.00
3Charlie00100Sales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01
4Diana01000Engineering29FPhD6.02295.225020.097000.010000.09.6595.00
6Frank10000HR50MHigh School25.0872.5010150.760000.04000.06.5260.01
7Grace00100Sales42FBachelors18.02081.4125120.485000.07000.07.8374.00
8Henry01000Engineering31MMasters7.02593.123550.295000.09000.09.1590.00
9Ivy00001Finance27FBachelors3.01085.002080.670000.05000.08.2482.00
10Jack00100Sales55MHigh School30.01268.905250.865000.02000.05.5150.01
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%encode method=onehot columns=department drop_original=false " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4760a0c0-869a-4e77-bce1-8985bca8006f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apply completed: original preserved as test.employees_backup_8d3413bf829d4cd8.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
emp_idnamedepartment_HRdepartment_Engineeringdepartment_Salesdepartment_NULLdepartment_Financedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1Alice10000HR30FNaN5.01287.514050.255000.0300.08.5475.00
2Bob01000Engineering45MMasters20.03091.0320100.11200000.015000.09.0589.00
3Charlie00100Sales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01
4Diana01000Engineering29FPhD6.02295.225020.097000.010000.09.6595.00
5Eve00010NaN35FBachelors8.01588.013060.390000.08000.08.0485.00
6Frank10000HR50MHigh School25.0872.5010150.760000.04000.06.5260.01
7Grace00100Sales42FBachelors18.02081.4125120.485000.07000.07.8374.00
8Henry01000Engineering31MMasters7.02593.123550.295000.09000.09.1590.00
9Ivy00001Finance27FBachelors3.01085.002080.670000.05000.08.2482.00
10Jack00100Sales55MHigh School30.01268.905250.865000.02000.05.5150.01
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%encode method=onehot columns=department drop_original=false table=test.employees mode=apply confirm=true " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "65d25956-0729-4e5f-9f41-d91ab3361655", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rollback: restored test.employees_backup_8d3413bf829d4cd8 -> test.employees; previous test.employees renamed to test.employees_prerollback_8d3413bf829d4cd8.\n" + ] + } + ], + "source": [ + "%encode mode=rollback rollback_token=8d3413bf829d4cd8" + ] + }, { "cell_type": "code", "execution_count": 6, @@ -7466,7 +8340,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "id": "1708d88d-db07-40cb-aeef-fcb6baffe649", "metadata": {}, "outputs": [ @@ -7474,7 +8348,22 @@ "name": "stdout", "output_type": "stream", "text": [ - "Normalized 2 column(s) to range (5.0, 10.0). Stored in data['last_select_normalized'].\n" + "PREVIEW (local):\n", + "Local: Column 'emp_id' min=1.0, max=10.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'age' min=27.0, max=55.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'years_experience' min=3.0, max=30.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'projects_completed' min=8.0, max=30.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'avg_project_score' min=68.9, max=95.2 -> range will map to (5.0, 10.0)\n", + "Local: Column 'certifications' min=0.0, max=3.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'training_hours' min=5.0, max=50.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'overtime_hours' min=2.0, max=25.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'remote_ratio' min=0.0, max=0.8 -> range will map to (5.0, 10.0)\n", + "Local: Column 'salary' min=55000.0, max=1200000.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'bonus' min=300.0, max=15000.0 -> 
range will map to (5.0, 10.0)\n", + "Local: Column 'satisfaction_score' min=5.5, max=9.6 -> range will map to (5.0, 10.0)\n", + "Local: Column 'performance_rating' min=1.0, max=5.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'potential_score' min=50.0, max=95.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'attrition_flag' min=0.0, max=1.0 -> range will map to (5.0, 10.0)\n" ] }, { @@ -7484,11 +8373,7 @@ " \n", " \n", " emp_id\n", - " name\n", - " department\n", " age\n", - " gender\n", - " education_level\n", " years_experience\n", " projects_completed\n", " avg_project_score\n", @@ -7502,889 +8387,1023 @@ " performance_rating\n", " potential_score\n", " attrition_flag\n", - " department_lbl\n", + " emp_id_norm_preview\n", + " age_norm_preview\n", + " years_experience_norm_preview\n", + " projects_completed_norm_preview\n", + " avg_project_score_norm_preview\n", + " certifications_norm_preview\n", + " training_hours_norm_preview\n", + " overtime_hours_norm_preview\n", + " remote_ratio_norm_preview\n", + " salary_norm_preview\n", + " bonus_norm_preview\n", + " satisfaction_score_norm_preview\n", + " performance_rating_norm_preview\n", + " potential_score_norm_preview\n", + " attrition_flag_norm_preview\n", " \n", " \n", " \n", " \n", " 1\n", - " Alice\n", - " HR\n", - " 5.535714\n", - " F\n", - " Bachelors\n", - " 5\n", + " 30\n", + " 5.0\n", " 12\n", " 87.5\n", " 1\n", " 40\n", " 5\n", " 0.2\n", - " 5.000000\n", - " 3000.0\n", + " 55000.0\n", + " 300.0\n", " 8.5\n", " 4\n", " 75.0\n", " 0\n", - " 2\n", - " \n", - " \n", - " 2\n", - " Bob\n", - " Engineering\n", - " 8.214286\n", - " M\n", - " Masters\n", - " 20\n", - " 30\n", - " 91.0\n", - " 3\n", - " 20\n", - " 10\n", - " 0.1\n", - " 10.000000\n", - " 15000.0\n", - " 9.0\n", - " 5\n", - " 89.0\n", - " 0\n", - " 0\n", - " \n", - " \n", - " 3\n", - " Charlie\n", - " Sales\n", - " 6.964286\n", - " M\n", - " Bachelors\n", - " 10\n", - " 18\n", - " 79.3\n", - " 0\n", - " 15\n", - " 20\n", - " 0.5\n", 
- " 6.923077\n", - " 7000.0\n", - " 7.2\n", - " 3\n", - " 70.0\n", - " 1\n", - " 3\n", - " \n", - " \n", - " 4\n", - " Diana\n", - " Engineering\n", - " 5.357143\n", - " F\n", - " PhD\n", - " 6\n", - " 22\n", - " 95.2\n", - " 2\n", - " 50\n", - " 2\n", - " 0.0\n", - " 8.230769\n", - " 10000.0\n", - " 9.6\n", - " 5\n", - " 95.0\n", - " 0\n", - " 0\n", - " \n", - " \n", - " 5\n", - " Eve\n", - " Finance\n", - " 6.428571\n", - " F\n", - " Bachelors\n", - " 8\n", - " 15\n", - " 88.0\n", - " 1\n", - " 30\n", - " 6\n", - " 0.3\n", - " 7.692308\n", - " 8000.0\n", - " 8.0\n", - " 4\n", - " 85.0\n", - " 0\n", - " 1\n", - " \n", - " \n", - " 6\n", - " Frank\n", - " HR\n", - " 9.107143\n", - " M\n", - " High School\n", - " 25\n", - " 8\n", - " 72.5\n", - " 0\n", - " 10\n", - " 15\n", - " 0.7\n", - " 5.384615\n", - " 4000.0\n", - " 6.5\n", - " 2\n", - " 60.0\n", - " 1\n", - " 2\n", - " \n", - " \n", - " 7\n", - " Grace\n", - " Sales\n", - " 7.678571\n", - " F\n", - " Bachelors\n", - " 18\n", - " 20\n", - " 81.4\n", - " 1\n", - " 25\n", - " 12\n", - " 0.4\n", - " 7.307692\n", - " 7000.0\n", - " 7.8\n", - " 3\n", - " 74.0\n", - " 0\n", - " 3\n", - " \n", - " \n", - " 8\n", - " Henry\n", - " Engineering\n", - " 5.714286\n", - " M\n", - " Masters\n", - " 7\n", - " 25\n", - " 93.1\n", - " 2\n", - " 35\n", - " 5\n", - " 0.2\n", - " 8.076923\n", - " 9000.0\n", - " 9.1\n", - " 5\n", - " 90.0\n", - " 0\n", - " 0\n", - " \n", - " \n", - " 9\n", - " Ivy\n", - " Finance\n", " 5.000000\n", - " F\n", - " Bachelors\n", - " 3\n", - " 10\n", - " 85.0\n", - " 0\n", - " 20\n", - " 8\n", - " 0.6\n", - " 6.153846\n", - " 5000.0\n", - " 8.2\n", - " 4\n", - " 82.0\n", - " 0\n", - " 1\n", - " \n", - " \n", - " 10\n", - " Jack\n", - " Sales\n", - " 10.000000\n", - " M\n", - " High School\n", - " 30\n", - " 12\n", - " 68.9\n", - " 0\n", - " 5\n", - " 25\n", - " 0.8\n", - " 5.769231\n", - " 2000.0\n", - " 5.5\n", - " 1\n", - " 50.0\n", - " 1\n", - " 3\n", - " \n", - " \n", - " 11\n", - " Alice\n", - " 
HR\n", " 5.535714\n", - " F\n", - " Bachelors\n", - " 5\n", - " 12\n", - " 87.5\n", - " 1\n", - " 40\n", - " 5\n", - " 0.2\n", + " 5.370370\n", + " 5.909091\n", + " 8.536122\n", + " 6.666667\n", + " 8.888889\n", + " 5.652174\n", + " 6.250\n", " 5.000000\n", - " 3000.0\n", - " 8.5\n", - " 4\n", - " 75.0\n", - " 0\n", - " 2\n", + " 5.000000\n", + " 8.658537\n", + " 8.75\n", + " 7.777778\n", + " 5.0\n", " \n", " \n", - " 12\n", - " Bob\n", - " Engineering\n", - " 8.214286\n", - " M\n", - " Masters\n", - " 20\n", + " 2\n", + " 45\n", + " 20.0\n", " 30\n", " 91.0\n", " 3\n", " 20\n", " 10\n", " 0.1\n", - " 10.000000\n", + " 1200000.0\n", " 15000.0\n", " 9.0\n", " 5\n", " 89.0\n", " 0\n", - " 0\n", + " 5.555556\n", + " 8.214286\n", + " 8.148148\n", + " 10.000000\n", + " 9.201521\n", + " 10.000000\n", + " 6.666667\n", + " 6.739130\n", + " 5.625\n", + " 10.000000\n", + " 10.000000\n", + " 9.268293\n", + " 10.00\n", + " 9.333333\n", + " 5.0\n", " \n", " \n", - " 13\n", - " Charlie\n", - " Sales\n", - " 6.964286\n", - " M\n", - " Bachelors\n", - " 10\n", + " 3\n", + " 38\n", + " NaN\n", " 18\n", " 79.3\n", " 0\n", " 15\n", " 20\n", " 0.5\n", - " 6.923077\n", + " 80000.0\n", " 7000.0\n", " 7.2\n", " 3\n", " 70.0\n", " 1\n", - " 3\n", + " 6.111111\n", + " 6.964286\n", + " NaN\n", + " 7.272727\n", + " 6.977186\n", + " 5.000000\n", + " 6.111111\n", + " 8.913043\n", + " 8.125\n", + " 5.109170\n", + " 7.278912\n", + " 7.073171\n", + " 7.50\n", + " 7.222222\n", + " 10.0\n", " \n", " \n", - " 14\n", - " Diana\n", - " Engineering\n", - " 5.357143\n", - " F\n", - " PhD\n", - " 6\n", + " 4\n", + " 29\n", + " 6.0\n", " 22\n", " 95.2\n", " 2\n", " 50\n", " 2\n", " 0.0\n", - " 8.230769\n", + " 97000.0\n", " 10000.0\n", " 9.6\n", " 5\n", " 95.0\n", " 0\n", - " 0\n", + " 6.666667\n", + " 5.357143\n", + " 5.555556\n", + " 8.181818\n", + " 10.000000\n", + " 8.333333\n", + " 10.000000\n", + " 5.000000\n", + " 5.000\n", + " 5.183406\n", + " 8.299320\n", + " 10.000000\n", + " 10.00\n", + " 
10.000000\n", + " 5.0\n", " \n", " \n", - " 15\n", - " Eve\n", - " Finance\n", - " 6.428571\n", - " F\n", - " Bachelors\n", - " 8\n", + " 5\n", + " 35\n", + " 8.0\n", " 15\n", " 88.0\n", " 1\n", " 30\n", " 6\n", " 0.3\n", - " 7.692308\n", + " 90000.0\n", " 8000.0\n", " 8.0\n", " 4\n", " 85.0\n", " 0\n", - " 1\n", + " 7.222222\n", + " 6.428571\n", + " 5.925926\n", + " 6.590909\n", + " 8.631179\n", + " 6.666667\n", + " 7.777778\n", + " 5.869565\n", + " 6.875\n", + " 5.152838\n", + " 7.619048\n", + " 8.048780\n", + " 8.75\n", + " 8.888889\n", + " 5.0\n", " \n", " \n", - " 16\n", - " Frank\n", - " HR\n", - " 9.107143\n", - " M\n", - " High School\n", - " 25\n", + " 6\n", + " 50\n", + " 25.0\n", " 8\n", " 72.5\n", " 0\n", " 10\n", " 15\n", " 0.7\n", - " 5.384615\n", + " 60000.0\n", " 4000.0\n", " 6.5\n", " 2\n", " 60.0\n", " 1\n", - " 2\n", + " 7.777778\n", + " 9.107143\n", + " 9.074074\n", + " 5.000000\n", + " 5.684411\n", + " 5.000000\n", + " 5.555556\n", + " 7.826087\n", + " 9.375\n", + " 5.021834\n", + " 6.258503\n", + " 6.219512\n", + " 6.25\n", + " 6.111111\n", + " 10.0\n", " \n", " \n", - " 17\n", - " Grace\n", - " Sales\n", - " 7.678571\n", - " F\n", - " Bachelors\n", - " 18\n", + " 7\n", + " 42\n", + " 18.0\n", " 20\n", " 81.4\n", " 1\n", " 25\n", " 12\n", " 0.4\n", - " 7.307692\n", + " 85000.0\n", " 7000.0\n", " 7.8\n", " 3\n", " 74.0\n", " 0\n", - " 3\n", + " 8.333333\n", + " 7.678571\n", + " 7.777778\n", + " 7.727273\n", + " 7.376426\n", + " 6.666667\n", + " 7.222222\n", + " 7.173913\n", + " 7.500\n", + " 5.131004\n", + " 7.278912\n", + " 7.804878\n", + " 7.50\n", + " 7.666667\n", + " 5.0\n", " \n", " \n", - " 18\n", - " Henry\n", - " Engineering\n", - " 5.714286\n", - " M\n", - " Masters\n", - " 7\n", + " 8\n", + " 31\n", + " 7.0\n", " 25\n", " 93.1\n", " 2\n", " 35\n", " 5\n", " 0.2\n", - " 8.076923\n", + " 95000.0\n", " 9000.0\n", " 9.1\n", " 5\n", " 90.0\n", " 0\n", - " 0\n", + " 8.888889\n", + " 5.714286\n", + " 5.740741\n", + " 8.863636\n", + " 
9.600760\n", + " 8.333333\n", + " 8.333333\n", + " 5.652174\n", + " 6.250\n", + " 5.174672\n", + " 7.959184\n", + " 9.390244\n", + " 10.00\n", + " 9.444444\n", + " 5.0\n", " \n", " \n", - " 19\n", - " Ivy\n", - " Finance\n", - " 5.000000\n", - " F\n", - " Bachelors\n", - " 3\n", + " 9\n", + " 27\n", + " 3.0\n", " 10\n", " 85.0\n", " 0\n", " 20\n", " 8\n", " 0.6\n", - " 6.153846\n", + " 70000.0\n", " 5000.0\n", " 8.2\n", " 4\n", " 82.0\n", " 0\n", - " 1\n", + " 9.444444\n", + " 5.000000\n", + " 5.000000\n", + " 5.454545\n", + " 8.060837\n", + " 5.000000\n", + " 6.666667\n", + " 6.304348\n", + " 8.750\n", + " 5.065502\n", + " 6.598639\n", + " 8.292683\n", + " 8.75\n", + " 8.555556\n", + " 5.0\n", " \n", " \n", - " 20\n", - " Jack\n", - " Sales\n", - " 10.000000\n", - " M\n", - " High School\n", - " 30\n", + " 10\n", + " 55\n", + " 30.0\n", " 12\n", " 68.9\n", " 0\n", " 5\n", " 25\n", " 0.8\n", - " 5.769231\n", + " 65000.0\n", " 2000.0\n", " 5.5\n", " 1\n", " 50.0\n", " 1\n", - " 3\n", + " 10.000000\n", + " 10.000000\n", + " 10.000000\n", + " 5.909091\n", + " 5.000000\n", + " 5.000000\n", + " 5.000000\n", + " 10.000000\n", + " 10.000\n", + " 5.043668\n", + " 5.578231\n", + " 5.000000\n", + " 5.00\n", + " 5.000000\n", + " 10.0\n", + " \n", + " \n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%normalize feature_range=5,10 mode=preview" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6685559f-8986-4504-97eb-e62e20275bd3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apply completed: original preserved as test.employees_backup_daf864252a6c46f1.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
215.000000AliceHR5.535714FBachelors51287.514050.2NaN5.3703705.9090918.5361226.6666678.8888895.6521746.2505.0000003000.08.5475.0025.0000008.6585378.757.7777785.0
225.555556BobEngineering8.214286MMasters203091.0320100.18.14814810.00000015000.09.0589.0009.20152110.0000006.6666676.7391305.62510.00000010.0000009.26829310.009.3333335.0
236.111111CharlieSales6.964286MBachelors101879.3015200.56.9230777000.07.2370.013Sales6.964286MBachelorsNaN7.2727276.9771865.0000006.1111118.9130438.1255.1091707.2789127.0731717.507.22222210.0
246.666667DianaEngineering5.357143FPhD62295.225020.08.23076910000.09.6595.0005.5555568.18181810.0000008.33333310.0000005.0000005.0005.1834068.29932010.00000010.0010.0000005.0
257.222222EveFinanceNaN6.428571FBachelors81588.013060.37.6923088000.08.0485.0015.9259266.5909098.6311796.6666677.7777785.8695656.8755.1528387.6190488.0487808.758.8888895.0
267.777778FrankHR9.107143MHigh School25872.5010150.75.3846154000.06.5260.0129.0740745.0000005.6844115.0000005.5555567.8260879.3755.0218346.2585036.2195126.256.11111110.0
278.333333GraceSales7.678571FBachelors182081.4125120.47.3076927000.07.8374.0037.7777787.7272737.3764266.6666677.2222227.1739137.5005.1310047.2789127.8048787.507.6666675.0
288.888889HenryEngineering5.714286MMasters72593.123550.28.0769239000.09.1590.0005.7407418.8636369.6007608.3333338.3333335.6521746.2505.1746727.9591849.39024410.009.4444445.0
299.444444IvyFinance5.000000FBachelors31085.002080.66.1538465000.08.2482.0015.0000005.4545458.0608375.0000006.6666676.3043488.7505.0655026.5986398.2926838.758.5555565.0
3010.000000JackSales10.000000MHigh School301268.905250.85.7692312000.05.5150.01310.0000005.9090915.0000005.0000005.00000010.00000010.0005.0436685.5782315.0000005.005.00000010.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%normalize feature_range=5,10 table=test.employees mode=apply confirm=true " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e5bb1249-ba1e-4a9d-86f0-58b528c0e465", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rollback: restored test.employees_backup_daf864252a6c46f1 -> test.employees; previous test.employees renamed to test.employees_prerollback_daf864252a6c46f1.\n" + ] + } + ], + "source": [ + "%normalize mode=rollback rollback_token=daf864252a6c46f1" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "5cc8a8b5-011e-4285-89f7-a7ee13b0da22", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREVIEW (local):\n", + "Local: Column 'emp_id': mean=5.5, std=2.8722813232690143\n", + "Local: Column 'age': mean=38.2, std=9.064215354899728\n", + "Local: Column 'years_experience': mean=13.555555555555555, std=9.2988782012923\n", + "Local: Column 'projects_completed': mean=17.2, std=6.7201190465645775\n", + "Local: Column 'avg_project_score': mean=84.19, std=8.217353588595294\n", + "Local: Column 'certifications': mean=1.0, std=1.0\n", + "Local: Column 'training_hours': mean=25.0, std=13.228756555322953\n", + "Local: Column 'overtime_hours': mean=10.8, std=6.939740629158989\n", + "Local: Column 'remote_ratio': mean=0.38000000000000006, std=0.2521904042583698\n", + "Local: Column 'salary': mean=189700.0, std=337053.12637624354\n", + "Local: Column 'bonus': mean=6730.0, std=4002.0119939850256\n", + "Local: Column 'satisfaction_score': mean=7.9399999999999995, std=1.1918053532351665\n", + "Local: Column 'performance_rating': mean=3.6, std=1.2806248474865698\n", + "Local: Column 'potential_score': mean=77.0, std=13.438749941865872\n", + "Local: Column 'attrition_flag': mean=0.3, std=0.45825756949558405\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagemp_id_std_previewage_std_previewyears_experience_std_previewprojects_completed_std_previewavg_project_score_std_previewcertifications_std_previewtraining_hours_std_previewovertime_hours_std_previewremote_ratio_std_previewsalary_std_previewbonus_std_previewsatisfaction_score_std_previewperformance_rating_std_previewpotential_score_std_previewattrition_flag_std_preview
31AliceHR5.535714FBachelors51305.01287.514050.25.0000003000.055000.0300.08.5475.002-1.566699-0.904656-0.920063-0.7737960.4028060.01.133893-0.835766-0.713746-0.399640-1.6066920.4698750.312348-0.148823-0.654654
32BobEngineering8.214286MMasters2024520.03091.0320100.110.0000001200000.015000.09.0589.000-1.2185440.7502030.6930351.9047280.8287342.0-0.377964-0.115278-1.1102722.9974502.0664610.8894071.0932160.892940-0.654654
33CharlieSales6.964286MBachelors10338NaN1879.3015200.56.92307780000.07000.07.2370.013-0.870388-0.022065NaN0.119046-0.595082-1.0-0.7559291.3256980.475831-0.3254680.067466-0.620907-0.468521-0.5208821.527525
34DianaEngineering5.357143FPhD64296.02295.225020.08.23076997000.010000.09.6595.000-0.522233-1.014980-0.8125230.7142731.3398471.01.889822-1.268059-1.506798-0.2750310.8170891.3928451.0932161.339410-0.654654
535EveFinance6.428571FBachelors88.01588.013060.37.69230890000.08000.08.0485.001-0.174078-0.353037-0.597444-0.3273750.4636530.00.377964-0.691669-0.317221-0.2957990.3173400.0503440.3123480.595293-0.654654
36FrankHR9.107143MHigh School2565025.0872.5010150.75.38461560000.04000.06.5260.0120.1740781.3018231.230734-1.369023-1.422599-1.0-1.1338930.6052101.268883-0.384806-0.682157-1.208251-1.249390-1.2649991.527525
37GraceSales7.678571FBachelors1874218.02081.4125120.47.30769285000.07000.07.8374.0030.5222330.4192310.4779550.416659-0.3395250.00.0000000.1729170.079305-0.3106340.067466-0.117469-0.468521-0.223235-0.654654
38HenryEngineering5.714286MMasters78317.02593.123550.28.07692395000.09000.09.1590.0000.870388-0.794332-0.7049831.1606941.0842911.00.755929-0.835766-0.713746-0.2809650.5672150.9733131.0932160.967352-0.654654
39IvyFinance5.000000FBachelors39273.01085.002080.66.15384670000.05000.08.2482.0011.218544-1.235628-1.135143-1.0714100.098572-1.0-0.377964-0.4034730.872357-0.355137-0.4322830.2181560.3123480.372058-0.654654
40JackSales10.000000MHigh School30105530.01268.905250.85.76923165000.02000.05.5150.0131.5666991.8534421.768433-0.773796-1.860696-1.0-1.5118582.0461861.665408-0.369971-1.181906-2.047314-2.030259-2.0091151.527525
" @@ -8395,20 +9414,20 @@ } ], "source": [ - "%normalize columns=age,salary feature_range=5,10 inplace=False" + "%standardize mode=preview" ] }, { "cell_type": "code", - "execution_count": 18, - "id": "5cc8a8b5-011e-4285-89f7-a7ee13b0da22", + "execution_count": 5, + "id": "25f88a13-b173-4b8e-a2d0-4313b6f47b0c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Standardized 3 column(s) (mean=0, std=1). Stored in data['last_select_standardized'].\n" + "Apply completed: original preserved as test.employees_backup_f15a883f004548a8.\n" ] }, { @@ -8417,69 +9436,237 @@ "\n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
idemp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
-1.527525-1.56669AliceHR-0.716269-0.660211-0.90466FNaN-0.920066-0.773800.4028060.01.13389-0.83577-0.713748-0.399640-1.6066920.4698760.31235-0.148823-0.65459
-1.091089-1.21854BobNaN0.449750NaNEngineering0.75020MMasters0.6930281.904730.8287342.0-0.37796-0.11528-1.1102742.9974502.0664610.8894071.093240.892940-0.65459
-0.654654-0.87038CharlieEngineeringSales-0.02206MBachelorsNaN1.9920820.11905-0.595082-1.0-0.755931.325710.475832-0.3254680.067466-0.620907-0.46853-0.5208821.52738
-0.218218DavidHR-1.299278-0.496112-0.52223DianaEngineering-1.01498FPhD-0.8125260.714281.3398471.01.88982-1.26807-1.506800-0.2750310.8170891.3928451.093241.339410-0.65459
0.218218-0.17408EveNaN-0.133259NaN-0.35304FBachelors-0.597447-0.327380.4636530.00.37796-0.69167-0.317221-0.2957990.3173400.0503440.312350.595293-0.65459
0.6546540.17408FrankEngineering-0.949473-0.404522HR1.30182MHigh School1.230726-1.36903-1.422599-1.0-1.133890.605211.268885-0.384806-0.682157-1.208251-1.24941-1.2649991.52738
1.091089NaN0.52223GraceSales1.615769NaN0.41923FBachelors0.4779490.41666-0.3395250.00.000000.172920.079305-0.3106340.067466-0.117469-0.46853-0.223235-0.65459
1.527525Grace0.87038HenryEngineering-0.79433MMasters-0.7049871.160701.0842911.00.75593-0.83577-0.713748-0.2809650.5672150.9733141.093240.967352-0.65459
1.21854IvyFinance-1.23563FBachelors-1.135145-1.071410.098572-1.0-0.37796-0.403480.872358-0.355137-0.4322830.2181560.312350.372058-0.65459
1.56669JackSales1.032760-0.4312361.85345MHigh School1.768424-0.77380-1.860696-1.0-1.511852.046201.665411-0.369971-1.181906-2.047315-2.03030-2.0091151.52738
" @@ -8490,7 +9677,7 @@ } ], "source": [ - "%standardize inplace=False" + "%standardize table=test.employees mode=apply confirm=true " ] }, { @@ -15056,9 +16243,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "2e8ace40-a7b5-41e6-9225-fc52e197d0ea", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The result set was successfully written into last_query.csv\n" + ] + } + ], + "source": [ + "%df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14bf423d-5e41-4d33-a2ef-a523937b88ef", + "metadata": {}, "outputs": [], "source": [] } diff --git a/last_query.csv b/last_query.csv index 3450d41..6132871 100644 --- a/last_query.csv +++ b/last_query.csv @@ -1,9 +1,11 @@ -id,name,department,age,salary -1,Alice,HR,30.0,50000.0 -2,Bob,Unknown,40.0,61000.0 -3,Charlie,Engineering,36.142857142857146,70000.0 -4,David,HR,25.0,48000.0 -5,Eve,Unknown,35.0,61000.0 -6,Frank,Engineering,28.0,72000.0 -7,Unknown,Sales,50.0,61000.0 -8,Grace,Sales,45.0,65000.0 +emp_id,name,department_HR,department_Engineering,department_Sales,department_NULL,department_Finance,department,age,gender,education_level,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score,performance_rating,potential_score,attrition_flag +1,Alice,1,0,0,0,0,HR,30,F,,5.0,12,87.5,1,40,5,0.2,55000.0,300.0,8.5,4,75.0,0 +2,Bob,0,1,0,0,0,Engineering,45,M,Masters,20.0,30,91.0,3,20,10,0.1,1200000.0,15000.0,9.0,5,89.0,0 +3,Charlie,0,0,1,0,0,Sales,38,M,Bachelors,,18,79.3,0,15,20,0.5,80000.0,7000.0,7.2,3,70.0,1 +4,Diana,0,1,0,0,0,Engineering,29,F,PhD,6.0,22,95.2,2,50,2,0.0,97000.0,10000.0,9.6,5,95.0,0 +5,Eve,0,0,0,1,0,,35,F,Bachelors,8.0,15,88.0,1,30,6,0.3,90000.0,8000.0,8.0,4,85.0,0 +6,Frank,1,0,0,0,0,HR,50,M,High School,25.0,8,72.5,0,10,15,0.7,60000.0,4000.0,6.5,2,60.0,1 
+7,Grace,0,0,1,0,0,Sales,42,F,Bachelors,18.0,20,81.4,1,25,12,0.4,85000.0,7000.0,7.8,3,74.0,0 +8,Henry,0,1,0,0,0,Engineering,31,M,Masters,7.0,25,93.1,2,35,5,0.2,95000.0,9000.0,9.1,5,90.0,0 +9,Ivy,0,0,0,0,1,Finance,27,F,Bachelors,3.0,10,85.0,0,20,8,0.6,70000.0,5000.0,8.2,4,82.0,0 +10,Jack,0,0,1,0,0,Sales,55,M,High School,30.0,12,68.9,0,5,25,0.8,65000.0,2000.0,5.5,1,50.0,1 diff --git a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py index f335dd6..69f494a 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py @@ -11,6 +11,8 @@ import logging import os import re +import uuid +import time # Optional helper to reliably get current DB name (if available) try: @@ -25,11 +27,13 @@ class Encode(MariaMagic): [columns=col1,col2,...] [inplace=true|false] [drop_original=true|false] + [mode=preview|apply|rollback] + [table=schema.table] [confirm=true|false] [sample_size=100] Notes: - If columns omitted, object/category dtype columns are auto-selected. - Default: inplace=true, drop_original=true. - - Requires scikit-learn installed for onehot/ordinal. + - DB apply uses CTAS + atomic RENAME and records rollback metadata. """ def __init__(self, args=""): @@ -44,9 +48,10 @@ def name(self): def help(self): return ( "%encode method= [columns=col1,col2] " - "[inplace=true] [drop_original=true]\n" - "Encode categorical columns using label, one-hot, or ordinal encoding (automatic)." - "Execution metadata is recorded in table `magic_metadata`." + "[inplace=true] [drop_original=true] [mode=preview|apply|rollback]\n" + "[table=schema.table] [confirm=true] [sample_size=100]\n" + "Encode categorical columns. Preview shows what will be created. " + "Apply can operate locally or on a DB table (versioned)." 
) def _str_to_obj(self, s): @@ -90,7 +95,6 @@ def _make_ohe(self, **kwargs): try: return OneHotEncoder(sparse=False, **kwargs) except TypeError: - # fallback for newer sklearn where parameter name changed return OneHotEncoder(sparse_output=False, **kwargs) # -------------------- metadata / DB helpers (best-effort) -------------------- @@ -110,6 +114,16 @@ def _sql_escape(self, val): val = str(val) return "'" + val.replace("'", "''") + "'" + def _safe_colname(self, s): + """Create a safe column identifier from an arbitrary string.""" + if s is None: + return "" + s2 = re.sub(r"[^0-9A-Za-z_]", "_", str(s)) + # ensure not starting with digit + if re.match(r"^[0-9]", s2): + s2 = "_" + s2 + return s2[:200] # cap length + def _get_db_name(self, kernel): """ Attempt to determine the currently used DB. @@ -186,8 +200,7 @@ def _get_user_name(self, kernel): def _ensure_metadata_table(self, kernel, db_name): """ Create magic_metadata table if it doesn't exist. - Columns: id, command_name, arguments, execution_timestamp, - affected_columns, operation_status, message, db_name, user_name + Includes rollback support columns (rollback_token, backup_table, original_table). 
""" mariadb_client = self._get_mariadb_client(kernel) log = self._get_logger(kernel) @@ -196,6 +209,7 @@ def _ensure_metadata_table(self, kernel, db_name): return table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" CREATE TABLE IF NOT EXISTS {table_full_name} ( id INT AUTO_INCREMENT PRIMARY KEY, @@ -206,7 +220,10 @@ def _ensure_metadata_table(self, kernel, db_name): operation_status VARCHAR(50), message TEXT, db_name VARCHAR(255), - user_name VARCHAR(255) + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) ); """ try: @@ -217,7 +234,8 @@ def _ensure_metadata_table(self, kernel, db_name): log.error(f"Failed to ensure magic_metadata table: {e}") def _insert_metadata(self, kernel, command_name, arguments, affected_columns, - operation_status, message, db_name, user_name): + operation_status, message, db_name, user_name, + rollback_token=None, backup_table=None, original_table=None): """ Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
""" @@ -234,11 +252,14 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, message_sql = self._sql_escape(message) db_sql = self._sql_escape(db_name) user_sql = self._sql_escape(user_name) + rollback_sql = self._sql_escape(rollback_token) + backup_sql = self._sql_escape(backup_table) + original_sql = self._sql_escape(original_table) insert_sql = f""" INSERT INTO {table_full_name} (command_name, arguments, execution_timestamp, affected_columns, - operation_status, message, db_name, user_name) + operation_status, message, db_name, user_name, rollback_token, backup_table, original_table) VALUES ( {self._sql_escape(command_name)}, {args_sql}, @@ -247,7 +268,10 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, {status_sql}, {message_sql}, {db_sql}, - {user_sql} + {user_sql}, + {rollback_sql}, + {backup_sql}, + {original_sql} ); """ try: @@ -257,6 +281,56 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, except Exception as e: log.error(f"Exception while inserting metadata: {e}") + def _acquire_lock(self, mariadb_client, lock_name, timeout=10): + try: + mariadb_client.run_statement(f"SELECT GET_LOCK('{lock_name}', {int(timeout)});") + if mariadb_client.iserror(): + return False + return True + except Exception: + return False + + def _release_lock(self, mariadb_client, lock_name): + try: + mariadb_client.run_statement(f"SELECT RELEASE_LOCK('{lock_name}');") + except Exception: + pass + + def _table_exists(self, mariadb_client, table_full_name): + try: + mariadb_client.run_statement(f"SELECT 1 FROM {table_full_name} LIMIT 1;") + return not mariadb_client.iserror() + except Exception: + return False + + def _parse_distinct_results(self, result): + """Return list of values from a run_statement output (HTML or plain).""" + if not result: + return [] + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + series = dfs[0].iloc[:, 0].astype(object) + return [None if (pd.isna(x) 
or x is None) else x for x in series.tolist()] + except Exception: + vals = re.findall(r"(.*?)", str(result), flags=re.S | re.I) + parsed = [] + for v in vals: + txt = re.sub(r"<.*?>", "", v).strip() + if txt.lower() == "null": + parsed.append(None) + else: + parsed.append(txt) + if parsed: + return parsed + # Last fallback: attempt to split raw text lines + try: + txt = str(result).strip() + lines = [l.strip() for l in txt.splitlines() if l.strip()] + return lines + except Exception: + return [] + # -------------------- end metadata helpers -------------------- def execute(self, kernel, data): @@ -332,12 +406,19 @@ def execute(self, kernel, data): inplace = bool(args.get("inplace", True)) drop_original = bool(args.get("drop_original", True)) - # Work on copy if not inplace - result_df = df if inplace else df.copy() + # mode and db args + mode = str(args.get("mode", "preview")).lower() + mode = mode if mode in {"preview", "apply", "rollback"} else "preview" + table_full = args.get("table", None) + confirm = bool(args.get("confirm", False)) + sample_size = int(args.get("sample_size", 100)) - # Prepare metadata context + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) db_name = self._get_db_name(kernel) user_name = self._get_user_name(kernel) + + # ensure metadata table exists try: self._ensure_metadata_table(kernel, db_name) except Exception: @@ -346,141 +427,516 @@ def execute(self, kernel, data): except Exception: pass - messages = [] - operation_status = "success" - created_columns = [] - - try: - # We'll store encoder info here to save into data at the end - encoder_obj = None - label_mappings = None - - if method == "label": - # Use pandas.factorize which handles NaN by assigning -1 codes - label_mappings = {} + # --- PREVIEW MODE --- + if mode == "preview": + try: + messages = [] + created_columns = [] + # Local preview: compute unique counts, sample mappings for col in columns: - codes, uniques = pd.factorize(result_df[col], 
sort=True) - new_col = f"{col}_lbl" - result_df[new_col] = codes - created_columns.append(new_col) - # Save mapping value->code for reuse later - mapping = {val: idx for idx, val in enumerate(uniques)} - label_mappings[col] = mapping - if drop_original: - result_df.drop(columns=[col], inplace=True) - messages.append(f"Column '{col}': label-encoded -> {new_col} (unique_values={len(uniques)})") - - encoder_obj = label_mappings - - elif method == "onehot": - # sklearn OneHotEncoder with version compatibility - encoder = self._make_ohe(handle_unknown="ignore") - # replace NaN with sentinel string so it's treated as a category - tmp = result_df[columns].astype(object).fillna("___MISSING___") - arr = encoder.fit_transform(tmp) - # feature names (sklearn >= 1.0) - try: - feature_names = encoder.get_feature_names_out(columns) - feature_names = [str(fn) for fn in feature_names] - except Exception: - # fallback: build names manually - cats = encoder.categories_ - feature_names = [] - for cname, cat_list in zip(columns, cats): - for cat in cat_list: - feature_names.append(f"{cname}_{str(cat)}") - # create DataFrame of encoded features - ohe_df = pd.DataFrame(arr, columns=feature_names, index=result_df.index) - # concatenate appropriately - if drop_original: - result_df = pd.concat([result_df.drop(columns=columns), ohe_df], axis=1) - else: - result_df = pd.concat([result_df, ohe_df], axis=1) - created_columns.extend(feature_names) - messages.append(f"Columns {columns} one-hot encoded -> created {len(feature_names)} columns.") - encoder_obj = encoder # save fitted OneHotEncoder - - elif method == "ordinal": - # use sklearn OrdinalEncoder for one or multiple columns (automatic ordering) - enc = OrdinalEncoder(dtype=np.float64) - # fillna sentinel so OrdinalEncoder treats missing as a category - tmp = result_df[columns].astype(object).fillna("___MISSING___") - enc_arr = enc.fit_transform(tmp) - for i, col in enumerate(columns): - new_col = f"{col}_ord" - result_df[new_col] = 
enc_arr[:, i] - created_columns.append(new_col) - if drop_original: - result_df.drop(columns=[col], inplace=True) - messages.append(f"Column '{col}': ordinal-encoded -> {new_col}") - encoder_obj = enc - - else: - kernel._send_message("stderr", "Unsupported method. Supported: label, onehot, ordinal.") - # log unsupported method + series = df[col] + uniques = pd.Index(series.dropna().unique()) + n_uniques = len(uniques) + messages.append(f"Local: Column '{col}' unique non-null values: {n_uniques} (showing up to 10): {list(uniques[:10])}") + if method == "label" or method == "ordinal": + created_columns.append(f"{col}_lbl" if method == "label" else f"{col}_ord") + elif method == "onehot": + # onehot creates one column per category + for v in list(uniques[:100]): # cap for preview listing + created_columns.append(f"{col}_{self._safe_colname(v)}") + + kernel._send_message("stdout", "PREVIEW (local):\n" + "\n".join(messages)) + kernel._send_message("stdout", f"PREVIEW (local) estimated created columns: {len(created_columns)}") + + # Show sample rows that will be modified (where any column is not-na) + sample_mask = pd.Series(False, index=df.index) + for col in columns: + sample_mask = sample_mask | df[col].notna() + sample_rows = df[sample_mask].head(sample_size) + if not sample_rows.empty: + try: + self._send_html(kernel, sample_rows) + except Exception: + kernel._send_message("stdout", str(sample_rows.head())) + + # DB preview if requested + if table_full and mariadb_client is not None: + db_msgs = [] + total_estimated_new_cols = 0 + for col in columns: + try: + out = mariadb_client.run_statement(f"SELECT DISTINCT {col} FROM {table_full} LIMIT {sample_size};") + vals = self._parse_distinct_results(out) + nvals = len(vals) + db_msgs.append(f"DB: Column '{col}' distinct values (up to {sample_size}): {vals[:10]} (count_est={nvals})") + if method == "label" or method == "ordinal": + total_estimated_new_cols += 1 + else: + total_estimated_new_cols += nvals + except 
Exception as e: + db_msgs.append(f"DB: Column '{col}' distinct query failed: {e}") + kernel._send_message("stdout", "PREVIEW (db):\n" + "\n".join(db_msgs)) + kernel._send_message("stdout", f"PREVIEW (db) estimated created columns: {total_estimated_new_cols}") + + # log preview metadata try: self._insert_metadata( kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), affected_columns="\n".join(columns), - operation_status="error", - message="Unsupported method requested.", + operation_status='preview', + message='preview_completed', db_name=db_name, user_name=user_name ) except Exception: pass - return - # Apply result back to shared data if inplace - if inplace: - data["last_select"] = result_df - kernel._send_message("stdout", "Encoded columns in-place and updated last_select.") - else: - kernel._send_message("stdout", "Displayed encoded result (last_select not modified).") + except Exception as e: + kernel._send_message("stderr", f"Error during preview: {e}") + return - # Save encoder (or mapping) to shared data for downstream pipeline usage + # --- ROLLBACK MODE --- + if mode == "rollback": + if mariadb_client is None: + kernel._send_message("stderr", "Rollback requested but no mariadb_client available.") + return + token = args.get("rollback_token", None) try: - if encoder_obj is not None: - data["last_select_encoder"] = encoder_obj - elif label_mappings is not None: - data["last_select_encoder"] = label_mappings - except Exception: - # don't fail pipeline just because we couldn't save encoder - pass + if not token: + # try to read latest magic_metadata entry for this command and user + mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Could not find metadata for 
rollback (check permissions).") + return + out = mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + m = re.search(r"(.*?)", str(out), flags=re.S | re.I) + if m: + token = re.sub(r"<.*?>", "", m.group(1)).strip() + if not token: + kernel._send_message("stderr", "No rollback_token found; cannot rollback safely.") + return + + # fetch backup_table and original_table + out = mariadb_client.run_statement(f"SELECT backup_table, original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + m = re.search(r"(.*?).*?(.*?)", str(out), flags=re.S | re.I) + backup_table = None + original_table = None + if m: + backup_table = re.sub(r"<.*?>", "", m.group(1)).strip() + original_table = re.sub(r"<.*?>", "", m.group(2)).strip() + else: + try: + out_b = mariadb_client.run_statement(f"SELECT backup_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mb = re.search(r"(.*?)", str(out_b), flags=re.S | re.I) + if mb: + backup_table = re.sub(r"<.*?>", "", mb.group(1)).strip() + except Exception: + pass + try: + out_o = mariadb_client.run_statement(f"SELECT original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mo = re.search(r"(.*?)", str(out_o), flags=re.S | re.I) + if mo: + original_table = re.sub(r"<.*?>", "", mo.group(1)).strip() + except Exception: + pass + + if not backup_table: + kernel._send_message("stderr", "No backup table found in metadata for rollback token.") + return + + # perform atomic restore: backup_table -> original_table + lock_name = f"encode_rb_{token}" + self._acquire_lock(mariadb_client, lock_name, timeout=10) + try: + if original_table: + if self._table_exists(mariadb_client, original_table): + original_old = f"{original_table}_prerollback_{token}" + 
mariadb_client.run_statement(f"RENAME TABLE {original_table} TO {original_old}, {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename tables during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {original_table}; previous {original_table} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns), + operation_status='rollback', + message=f'restored_to={original_table};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {original_table}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns), + operation_status='rollback', + message=f'restored_to={original_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + # try to infer original table name from arguments + out_args = mariadb_client.run_statement(f"SELECT arguments FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + margs = re.search(r"(.*?)", str(out_args), flags=re.S | re.I) + inferred_original = None + if margs: + args_txt = re.sub(r"<.*?>", "", margs.group(1)).strip() + mm = re.search(r"table\s*=\s*([^\s,]+)", args_txt) + if mm: + inferred_original = 
mm.group(1).strip() + if inferred_original: + if self._table_exists(mariadb_client, inferred_original): + original_old = f"{inferred_original}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {inferred_original} TO {original_old}, {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {inferred_original}; previous {inferred_original} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns), + operation_status='rollback', + message=f'restored_to={inferred_original};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to inferred original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {inferred_original}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns), + operation_status='rollback', + message=f'restored_to={inferred_original}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + kernel._send_message("stderr", "Could not determine original table name for rollback. 
Manual restoration required.") + return + finally: + self._release_lock(mariadb_client, lock_name) + except Exception as e: + kernel._send_message("stderr", f"Rollback error: {e}") + return - # display - self._send_html(kernel, result_df) + # --- APPLY MODE --- + if mode == "apply": + # DB-target apply if table provided and mariadb_client present + if table_full and mariadb_client is not None: + if not confirm: + kernel._send_message("stderr", "DB apply requires confirm=true to proceed. Preview first, then re-run with confirm=true.") + return + + token = str(uuid.uuid4()).replace('-', '')[:16] + backup_table = f"{table_full}_backup_{token}" + new_table = f"{table_full}_vnew_{token}" + + # gather distinct values per column from DB (best-effort) + col_values = {} + messages = [] + for col in columns: + try: + out = mariadb_client.run_statement(f"SELECT DISTINCT {col} FROM {table_full} LIMIT {sample_size};") + vals = self._parse_distinct_results(out) + # we keep the order returned; limit cardinality to avoid explosion + col_values[col] = vals + messages.append(f"{col}: discovered {len(vals)} distinct values (sample limit {sample_size}).") + except Exception as e: + col_values[col] = [] + messages.append(f"{col}: failed to collect distinct values: {e}") + + # Build SELECT expressions + select_exprs = [] + all_columns = list(df.columns) + + created_cols = [] + created_count = 0 + + for c in all_columns: + if c in columns: + vals = col_values.get(c, []) + if method == "label": + # build CASE ... WHEN ... 
THEN idx ELSE NULL END AS col_lbl + cases = [] + for idx, v in enumerate(vals): + if v is None: + cases.append(f"WHEN {c} IS NULL THEN {idx}") + else: + cases.append(f"WHEN {c} = {self._sql_escape(v)} THEN {idx}") + case_sql = " ".join(cases) + new_name = f"{c}_lbl" + select_exprs.append(f"CASE {case_sql} ELSE NULL END AS {new_name}") + created_cols.append(new_name) + created_count += 1 + if not drop_original: + select_exprs.append(c) + elif method == "ordinal": + cases = [] + for idx, v in enumerate(vals): + if v is None: + cases.append(f"WHEN {c} IS NULL THEN {idx}") + else: + cases.append(f"WHEN {c} = {self._sql_escape(v)} THEN {idx}") + new_name = f"{c}_ord" + select_exprs.append(f"CASE {' '.join(cases)} ELSE NULL END AS {new_name}") + created_cols.append(new_name) + created_count += 1 + if not drop_original: + select_exprs.append(c) + elif method == "onehot": + # for each distinct value create column col_ as CASE WHEN col=val THEN 1 ELSE 0 END + for v in vals: + safe = self._safe_colname(v if v is not None else "NULL") + new_name = f"{c}_{safe}" + if v is None: + select_exprs.append(f"CASE WHEN {c} IS NULL THEN 1 ELSE 0 END AS {new_name}") + else: + select_exprs.append(f"CASE WHEN {c} = {self._sql_escape(v)} THEN 1 ELSE 0 END AS {new_name}") + created_cols.append(new_name) + created_count += 1 + if not drop_original: + select_exprs.append(c) + else: + # fallback: keep original + select_exprs.append(c) + else: + # not a targeted column — keep as is + select_exprs.append(c) + + # safety cap + if created_count > 1000: + kernel._send_message("stderr", f"Refusing to create {created_count} encoded columns ( > 1000 ). 
Narrow the columns or reduce distinct values.") + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns), + operation_status='error', + message=f"too_many_created_columns={created_count}", + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + select_sql = ", ".join(select_exprs) + try: + lock_name = f"encode_apply_{token}" + got_lock = self._acquire_lock(mariadb_client, lock_name, timeout=10) + if not got_lock: + kernel._send_message("stderr", "Could not acquire advisory lock; aborting apply.") + return + + # create new table with encoded columns + mariadb_client.run_statement(f"CREATE TABLE {new_table} AS SELECT {select_sql} FROM {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to create new table for apply (CTAS failed).") + return + + # atomic rename original -> backup, new -> original + mariadb_client.run_statement(f"RENAME TABLE {table_full} TO {backup_table}, {new_table} TO {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "RENAME TABLE failed (apply may be inconsistent).") + return + + kernel._send_message("stdout", f"Apply completed: original preserved as {backup_table}.") + # log metadata with token so rollback can restore + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns), + operation_status='applied', + message=f'applied_backup={backup_table};created_columns={created_count}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=table_full + ) - except Exception as e: - operation_status = "error" - err_msg = f"Error during encoding: {e}" - kernel._send_message("stderr", err_msg) - messages.append(err_msg) + # attempt to refresh last_select 
with a sample + try: + mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + fresh = mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + try: + df_list = pd.read_html(fresh) + if df_list and len(df_list) > 0: + data["last_select"] = df_list[0] + try: + self._send_html(kernel, data["last_select"]) + except Exception: + pass + except Exception: + kernel._send_message("stdout", "Applied to DB; could not refresh last_select from DB.") + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Apply (DB versioned) failed: {e}") + log.exception(e) + finally: + self._release_lock(mariadb_client, lock_name) + return - # Attempt to insert metadata (best-effort) - try: - args_for_db = self.args if isinstance(self.args, str) else str(self.args) - # store affected (input) columns newline-separated - affected_columns_str = "\n".join(columns) - # store created columns newline-separated (if any) - created_columns_str = "\n".join(created_columns) if created_columns else "" - # Compose metadata message with sections for readability - details = "\n".join(messages) if messages else "Encoding completed without detailed messages." 
- metadata_message = f"Method: {method}\nCreated columns:\n{created_columns_str}\n\nDetails:\n{details}" - self._insert_metadata( - kernel=kernel, - command_name=self.name(), - arguments=args_for_db, - affected_columns=affected_columns_str, - operation_status=operation_status, - message=metadata_message, - db_name=db_name, - user_name=user_name - ) - except Exception as e: - try: - kernel._send_message("stdout", f"Warning: failed to write metadata: {e}") - except Exception: - pass + else: + # Local in-place apply on data['last_select'] (existing behavior) + result_df = df if inplace else df.copy() + messages = [] + operation_status = "success" + created_columns = [] + try: + encoder_obj = None + label_mappings = None + + if method == "label": + label_mappings = {} + for col in columns: + codes, uniques = pd.factorize(result_df[col], sort=True) + new_col = f"{col}_lbl" + result_df[new_col] = codes + created_columns.append(new_col) + mapping = {val: idx for idx, val in enumerate(uniques)} + label_mappings[col] = mapping + if drop_original: + result_df.drop(columns=[col], inplace=True) + messages.append(f"Column '{col}': label-encoded -> {new_col} (unique_values={len(uniques)})") + encoder_obj = label_mappings + + elif method == "onehot": + encoder = self._make_ohe(handle_unknown="ignore") + tmp = result_df[columns].astype(object).fillna("___MISSING___") + arr = encoder.fit_transform(tmp) + try: + feature_names = encoder.get_feature_names_out(columns) + feature_names = [str(fn) for fn in feature_names] + except Exception: + cats = encoder.categories_ + feature_names = [] + for cname, cat_list in zip(columns, cats): + for cat in cat_list: + feature_names.append(f"{cname}_{str(cat)}") + ohe_df = pd.DataFrame(arr, columns=feature_names, index=result_df.index) + if drop_original: + result_df = pd.concat([result_df.drop(columns=columns), ohe_df], axis=1) + else: + result_df = pd.concat([result_df, ohe_df], axis=1) + created_columns.extend(feature_names) + 
messages.append(f"Columns {columns} one-hot encoded -> created {len(feature_names)} columns.") + encoder_obj = encoder + + elif method == "ordinal": + enc = OrdinalEncoder(dtype=np.float64) + tmp = result_df[columns].astype(object).fillna("___MISSING___") + enc_arr = enc.fit_transform(tmp) + for i, col in enumerate(columns): + new_col = f"{col}_ord" + result_df[new_col] = enc_arr[:, i] + created_columns.append(new_col) + if drop_original: + result_df.drop(columns=[col], inplace=True) + messages.append(f"Column '{col}': ordinal-encoded -> {new_col}") + encoder_obj = enc + + else: + kernel._send_message("stderr", "Unsupported method. Supported: label, onehot, ordinal.") + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns), + operation_status="error", + message="Unsupported method requested.", + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Apply result back to shared data if inplace + if inplace: + data["last_select"] = result_df + kernel._send_message("stdout", "Encoded columns in-place and updated last_select.") + else: + kernel._send_message("stdout", "Displayed encoded result (last_select not modified).") + + # Save encoder (or mapping) to shared data for downstream pipeline usage + try: + if encoder_obj is not None: + data["last_select_encoder"] = encoder_obj + elif label_mappings is not None: + data["last_select_encoder"] = label_mappings + except Exception: + pass + + # display + self._send_html(kernel, result_df) + + except Exception as e: + operation_status = "error" + err_msg = f"Error during encoding: {e}" + kernel._send_message("stderr", err_msg) + messages.append(err_msg) + + # Attempt to insert metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "\n".join(columns) + created_columns_str = 
"\n".join(created_columns) if created_columns else "" + details = "\n".join(messages) if messages else "Encoding completed." + metadata_message = f"Method: {method}\nCreated columns:\n{created_columns_str}\n\nDetails:\n{details}" + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status=operation_status, + message=metadata_message, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + + return + + # fallback + kernel._send_message("stderr", "Unknown execution path reached.") + return diff --git a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py index f81710d..e85fddc 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py @@ -9,6 +9,8 @@ import logging import os import re +import uuid +import time # Optional helper to reliably get current DB name (if available) try: @@ -20,6 +22,8 @@ class Normalize(MariaMagic): """ %normalize [columns=col1,col2,...] [feature_range=0,1] [inplace=True|False] + [mode=preview|apply|rollback] [table=schema.table] [confirm=true|false] + [sample_size=100] [lock_timeout=10] Scales numeric columns to a fixed range (default 0-1) using sklearn's MinMaxScaler. @@ -27,12 +31,13 @@ class Normalize(MariaMagic): - feature_range: lower and upper bounds for scaling (default: 0,1) - inplace: if True (default), modifies data["last_select"] in-place. if False, stores result in data["last_select_normalized"]. 
+ - mode: preview/apply/rollback (preview default) Examples: %normalize %normalize columns=age,salary %normalize feature_range=5,10 inplace=False - + %normalize mode=apply table=schema.emp confirm=true Execution metadata is recorded in table `magic_metadata`. """ @@ -48,8 +53,8 @@ def name(self): def help(self): return ( "%normalize [columns=col1,col2,...] [feature_range=0,1] [inplace=True|False]\n" - "Normalize numeric columns using MinMaxScaler (in-place by default).\n" - "Execution metadata is recorded in table `magic_metadata`." + " [mode=preview|apply|rollback] [table=schema.table] [confirm=true]\n" + "Normalize numeric columns using MinMaxScaler (in-place by default)." ) def _str_to_obj(self, s): @@ -177,8 +182,7 @@ def _get_user_name(self, kernel): def _ensure_metadata_table(self, kernel, db_name): """ Create magic_metadata table if it doesn't exist. - Columns: id, command_name, arguments, execution_timestamp, - affected_columns, operation_status, message, db_name, user_name + Includes rollback support columns (rollback_token, backup_table, original_table). """ mariadb_client = self._get_mariadb_client(kernel) log = self._get_logger(kernel) @@ -197,7 +201,10 @@ def _ensure_metadata_table(self, kernel, db_name): operation_status VARCHAR(50), message TEXT, db_name VARCHAR(255), - user_name VARCHAR(255) + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) ); """ try: @@ -208,7 +215,8 @@ def _ensure_metadata_table(self, kernel, db_name): log.error(f"Failed to ensure magic_metadata table: {e}") def _insert_metadata(self, kernel, command_name, arguments, affected_columns, - operation_status, message, db_name, user_name): + operation_status, message, db_name, user_name, + rollback_token=None, backup_table=None, original_table=None): """ Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
""" @@ -225,11 +233,14 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, message_sql = self._sql_escape(message) db_sql = self._sql_escape(db_name) user_sql = self._sql_escape(user_name) + rollback_sql = self._sql_escape(rollback_token) + backup_sql = self._sql_escape(backup_table) + original_sql = self._sql_escape(original_table) insert_sql = f""" INSERT INTO {table_full_name} (command_name, arguments, execution_timestamp, affected_columns, - operation_status, message, db_name, user_name) + operation_status, message, db_name, user_name, rollback_token, backup_table, original_table) VALUES ( {self._sql_escape(command_name)}, {args_sql}, @@ -238,7 +249,10 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, {status_sql}, {message_sql}, {db_sql}, - {user_sql} + {user_sql}, + {rollback_sql}, + {backup_sql}, + {original_sql} ); """ try: @@ -248,6 +262,77 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, except Exception as e: log.error(f"Exception while inserting metadata: {e}") + def _acquire_lock(self, mariadb_client, lock_name, timeout=10): + try: + mariadb_client.run_statement(f"SELECT GET_LOCK('{lock_name}', {int(timeout)});") + if mariadb_client.iserror(): + return False + return True + except Exception: + return False + + def _release_lock(self, mariadb_client, lock_name): + try: + mariadb_client.run_statement(f"SELECT RELEASE_LOCK('{lock_name}');") + except Exception: + pass + + def _table_exists(self, mariadb_client, table_full_name): + try: + mariadb_client.run_statement(f"SELECT 1 FROM {table_full_name} LIMIT 1;") + return not mariadb_client.iserror() + except Exception: + return False + + def _parse_two_value_result(self, res): + """ + Parse results expected to have two values (MIN and MAX, HTML table or plain). + Returns (val1, val2) or (None, None) if parsing fails. 
+ """ + if not res: + return None, None + try: + dfs = pd.read_html(res) + if dfs and len(dfs) > 0: + r0 = dfs[0].iloc[0, 0] + r1 = dfs[0].iloc[0, 1] if dfs[0].shape[1] > 1 else None + try: + v0 = float(r0) if pd.notna(r0) else None + except Exception: + v0 = None + try: + v1 = float(r1) if pd.notna(r1) else None + except Exception: + v1 = None + return v0, v1 + except Exception: + # regex fallback: pick first two cells + m = re.findall(r"(.*?)", str(res), flags=re.S | re.I) + if m and len(m) >= 1: + def tofloat(txt): + txt = re.sub(r"<.*?>", "", txt).strip() + if txt.lower() == "null" or txt == "": + return None + try: + return float(txt) + except Exception: + return None + v0 = tofloat(m[0]) + v1 = tofloat(m[1]) if len(m) > 1 else None + return v0, v1 + # final fallback: try to split lines + try: + txt = str(res).strip() + parts = [p.strip() for p in txt.split() if p.strip()] + if len(parts) >= 2: + try: + return float(parts[0]), float(parts[1]) + except Exception: + return None, None + except Exception: + return None, None + return None, None + # -------------------- end metadata helpers -------------------- def execute(self, kernel, data): @@ -350,20 +435,27 @@ def execute(self, kernel, data): pass return else: - # already a tuple/list? 
try: feature_range = tuple(feature_range_arg) except Exception: feature_range = (0, 1) inplace = bool(args.get("inplace", True)) - target_df = df if inplace else df.copy(deep=True) + mode = str(args.get("mode", "preview")).lower() + mode = mode if mode in {"preview", "apply", "rollback"} else "preview" + table_full = args.get("table", None) + confirm = bool(args.get("confirm", False)) + sample_size = int(args.get("sample_size", 100)) + lock_timeout = int(args.get("lock_timeout", 10)) + + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) # Select numeric columns if columns is None: - target_columns = [c for c in target_df.columns if pd.api.types.is_numeric_dtype(target_df[c])] + target_columns = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])] else: - missing_cols = [c for c in columns if c not in target_df.columns] + missing_cols = [c for c in columns if c not in df.columns] if missing_cols: msg = f"Missing columns: {', '.join(missing_cols)}" kernel._send_message("stderr", msg) @@ -401,71 +493,408 @@ def execute(self, kernel, data): pass return - # Perform normalization - operation_status = "success" - messages = [] - try: - scaler = MinMaxScaler(feature_range=feature_range) - target_df[target_columns] = scaler.fit_transform(target_df[target_columns]) - msg = f"Normalized {len(target_columns)} column(s) to range {feature_range}." 
- messages.append(msg) - except Exception as e: - operation_status = "error" - err_msg = f"Error during normalization: {e}" - kernel._send_message("stderr", err_msg) - messages.append(err_msg) - # log metadata for failure + # --- PREVIEW MODE --- + if mode == "preview": try: - self._insert_metadata( - kernel=kernel, - command_name=self.name(), - arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns="\n".join(target_columns), - operation_status=operation_status, - message="\n".join(messages), - db_name=db_name, - user_name=user_name - ) - except Exception: - pass - return + messages = [] + # local preview: compute local min/max and show sample transformed values + local_ranges = {} + for col in target_columns: + s = pd.to_numeric(df[col], errors="coerce").dropna() + if s.empty: + messages.append(f"Local: Column '{col}' has no numeric non-null values; skipped.") + local_ranges[col] = (None, None) + continue + lo = float(s.min()) + hi = float(s.max()) + local_ranges[col] = (lo, hi) + messages.append(f"Local: Column '{col}' min={lo}, max={hi} -> range will map to {feature_range}") + + kernel._send_message("stdout", "PREVIEW (local):\n" + "\n".join(messages)) + + # show sample transformed rows for local preview + try: + sample = df[target_columns].head(sample_size).copy() + for col in target_columns: + lo, hi = local_ranges.get(col, (None, None)) + if lo is None or hi is None or hi == lo: + # can't transform deterministically; show original values + sample[col + "_norm_preview"] = sample[col] + else: + # scale + rng = feature_range[1] - feature_range[0] + sample[col + "_norm_preview"] = ((pd.to_numeric(sample[col], errors="coerce") - lo) / (hi - lo)) * rng + feature_range[0] + if not sample.empty: + self._send_html(kernel, sample.head(20)) + except Exception: + pass - # Store results - if inplace: - data["last_select"] = target_df - location_msg = "Updated data['last_select'] in-place." 
- messages.append(location_msg) - kernel._send_message("stdout", f"{msg} {location_msg}") - else: - data["last_select_normalized"] = target_df - location_msg = "Stored in data['last_select_normalized']." - messages.append(location_msg) - kernel._send_message("stdout", f"{msg} {location_msg}") + # DB preview if table provided + if table_full and mariadb_client is not None: + db_msgs = [] + for col in target_columns: + try: + # attempt to get MIN and MAX from DB for each column + res = mariadb_client.run_statement(f"SELECT MIN({col}), MAX({col}) FROM {table_full};") + if mariadb_client.iserror(): + db_msgs.append(f"DB: Column '{col}': MIN/MAX query failed (check permissions).") + continue + minv, maxv = self._parse_two_value_result(res) + if minv is None and maxv is None: + db_msgs.append(f"DB: Column '{col}': could not parse MIN/MAX (empty/unsupported).") + continue + db_msgs.append(f"DB: Column '{col}' min={minv}, max={maxv} -> would map to {feature_range}") + # show SQL expression that would be used + if minv is None or maxv is None: + expr = f"{col} /* cannot compute min/max */" + elif maxv == minv: + expr = f"CASE WHEN {col} IS NULL THEN NULL ELSE {feature_range[0]} END AS {col}" + else: + # normalization SQL: ((col - min) / (max - min)) * (range_max - range_min) + range_min + rng = feature_range[1] - feature_range[0] + expr = f"CASE WHEN {col} IS NULL THEN NULL ELSE (({col} - {repr(minv)}) / ({repr(maxv - minv)})) * {repr(rng)} + {repr(feature_range[0])} END AS {col}" + db_msgs.append(f"DB: Column '{col}' expression: {expr}") + except Exception as e: + db_msgs.append(f"DB: Column '{col}' MIN/MAX query exception: {e}") + kernel._send_message("stdout", "PREVIEW (db):\n" + "\n".join(db_msgs)) + + # log preview metadata + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='preview', + 
message='preview_completed', + db_name=db_name, + user_name=user_name + ) + except Exception: + pass - # Display DataFrame - try: - self._send_html(kernel, target_df) - except Exception: - pass + except Exception as e: + kernel._send_message("stderr", f"Error during preview: {e}") + return - # Insert metadata (best-effort) - try: - args_for_db = self.args if isinstance(self.args, str) else str(self.args) - affected_columns_str = "\n".join(target_columns) - message_str = "\n".join(messages) - metadata_message = f"Feature range: {feature_range}\n\nDetails:\n{message_str}" - self._insert_metadata( - kernel=kernel, - command_name=self.name(), - arguments=args_for_db, - affected_columns=affected_columns_str, - operation_status=operation_status, - message=metadata_message, - db_name=db_name, - user_name=user_name - ) - except Exception: + # --- ROLLBACK MODE --- + if mode == "rollback": + if mariadb_client is None: + kernel._send_message("stderr", "Rollback requested but no mariadb_client available.") + return + token = args.get("rollback_token", None) try: - kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") - except Exception: - pass + if not token: + mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Could not find metadata for rollback (check permissions).") + return + out = mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + m = re.search(r"(.*?)", str(out), flags=re.S | re.I) + if m: + token = re.sub(r"<.*?>", "", m.group(1)).strip() + if not token: + kernel._send_message("stderr", "No rollback_token found; cannot rollback safely.") 
+ return + + # fetch backup_table and original_table + out = mariadb_client.run_statement(f"SELECT backup_table, original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + m = re.search(r"(.*?).*?(.*?)", str(out), flags=re.S | re.I) + backup_table = None + original_table = None + if m: + backup_table = re.sub(r"<.*?>", "", m.group(1)).strip() + original_table = re.sub(r"<.*?>", "", m.group(2)).strip() + else: + try: + out_b = mariadb_client.run_statement(f"SELECT backup_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mb = re.search(r"(.*?)", str(out_b), flags=re.S | re.I) + if mb: + backup_table = re.sub(r"<.*?>", "", mb.group(1)).strip() + except Exception: + pass + try: + out_o = mariadb_client.run_statement(f"SELECT original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mo = re.search(r"(.*?)", str(out_o), flags=re.S | re.I) + if mo: + original_table = re.sub(r"<.*?>", "", mo.group(1)).strip() + except Exception: + pass + + if not backup_table: + kernel._send_message("stderr", "No backup table found in metadata for rollback token.") + return + + # perform atomic restore: backup_table -> original_table + lock_name = f"normalize_rb_{token}" + self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + try: + if original_table: + if self._table_exists(mariadb_client, original_table): + original_old = f"{original_table}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {original_table} TO {original_old}, {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename tables during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {original_table}; previous {original_table} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + 
arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={original_table};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {original_table}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={original_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + # try to infer original table name from arguments + out_args = mariadb_client.run_statement(f"SELECT arguments FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + margs = re.search(r"(.*?)", str(out_args), flags=re.S | re.I) + inferred_original = None + if margs: + args_txt = re.sub(r"<.*?>", "", margs.group(1)).strip() + mm = re.search(r"table\s*=\s*([^\s,]+)", args_txt) + if mm: + inferred_original = mm.group(1).strip() + if inferred_original: + if self._table_exists(mariadb_client, inferred_original): + original_old = f"{inferred_original}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {inferred_original} TO {original_old}, {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: 
restored {backup_table} -> {inferred_original}; previous {inferred_original} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={inferred_original};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to inferred original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {inferred_original}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={inferred_original}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + kernel._send_message("stderr", "Could not determine original table name for rollback. Manual restoration required.") + return + finally: + self._release_lock(mariadb_client, lock_name) + except Exception as e: + kernel._send_message("stderr", f"Rollback error: {e}") + return + + # --- APPLY MODE --- + if mode == "apply": + # DB-target apply if table provided and mariadb_client present + if table_full and mariadb_client is not None: + if not confirm: + kernel._send_message("stderr", "DB apply requires confirm=true to proceed. 
Preview first, then re-run with confirm=true.") + return + + token = str(uuid.uuid4()).replace('-', '')[:16] + backup_table = f"{table_full}_backup_{token}" + new_table = f"{table_full}_vnew_{token}" + + # compute MIN and MAX per column in DB + col_minmax = {} + msgs = [] + for col in target_columns: + try: + res = mariadb_client.run_statement(f"SELECT MIN({col}), MAX({col}) FROM {table_full};") + if mariadb_client.iserror(): + msgs.append(f"{col}: MIN/MAX query failed (permissions?).") + col_minmax[col] = (None, None) + continue + minv, maxv = self._parse_two_value_result(res) + col_minmax[col] = (minv, maxv) + msgs.append(f"{col}: min={minv}, max={maxv}") + except Exception as e: + col_minmax[col] = (None, None) + msgs.append(f"{col}: exception computing min/max: {e}") + + # build select expressions + select_exprs = [] + for c in df.columns: + if c in target_columns: + minv, maxv = col_minmax.get(c, (None, None)) + if minv is None or maxv is None: + # cannot compute; keep original + select_exprs.append(c) + elif maxv == minv: + # constant mapping to feature_range[0] + expr = f"CASE WHEN {c} IS NULL THEN NULL ELSE {repr(feature_range[0])} END AS {c}" + select_exprs.append(expr) + else: + rng = feature_range[1] - feature_range[0] + denom = (maxv - minv) + # ((col - min) / denom) * rng + feature_range[0] + expr = ( + f"CASE WHEN {c} IS NULL THEN NULL ELSE " + f"(({c} - {repr(minv)}) / {repr(denom)}) * {repr(rng)} + {repr(feature_range[0])} END AS {c}" + ) + select_exprs.append(expr) + else: + select_exprs.append(c) + + select_sql = ", ".join(select_exprs) + + try: + lock_name = f"normalize_apply_{token}" + got_lock = self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + if not got_lock: + kernel._send_message("stderr", "Could not acquire advisory lock; aborting apply.") + return + + # create new table with normalized values (CTAS) + mariadb_client.run_statement(f"CREATE TABLE {new_table} AS SELECT {select_sql} FROM {table_full};") + if 
mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to create new table for apply (CTAS failed).") + return + + # atomic rename original -> backup, new -> original + mariadb_client.run_statement(f"RENAME TABLE {table_full} TO {backup_table}, {new_table} TO {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "RENAME TABLE failed (apply may be inconsistent).") + return + + kernel._send_message("stdout", f"Apply completed: original preserved as {backup_table}.") + # log metadata (include token so user can rollback) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='applied', + message=f'applied_backup={backup_table};details={"|".join(msgs)}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=table_full + ) + + # attempt to refresh last_select with a sample + try: + mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + fresh = mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + try: + df_list = pd.read_html(fresh) + if df_list and len(df_list) > 0: + data["last_select"] = df_list[0] + try: + self._send_html(kernel, data["last_select"]) + except Exception: + pass + except Exception: + kernel._send_message("stdout", "Applied to DB; could not refresh last_select from DB.") + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Apply (DB versioned) failed: {e}") + log.exception(e) + finally: + self._release_lock(mariadb_client, lock_name) + return + + else: + # Local in-place apply on data['last_select'] (existing behavior) + target_df = df if inplace else df.copy(deep=True) + operation_status = "success" + messages = [] + try: + scaler = MinMaxScaler(feature_range=feature_range) + target_df[target_columns] = 
scaler.fit_transform(target_df[target_columns]) + msg = f"Normalized {len(target_columns)} column(s) to range {feature_range}." + messages.append(msg) + if inplace: + data["last_select"] = target_df + location_msg = "Updated data['last_select'] in-place." + else: + data["last_select_normalized"] = target_df + location_msg = "Stored in data['last_select_normalized']." + kernel._send_message("stdout", f"{msg} {location_msg}") + except Exception as e: + operation_status = "error" + err_msg = f"Error during normalization: {e}" + kernel._send_message("stderr", err_msg) + messages.append(err_msg) + + # Display DataFrame + try: + self._send_html(kernel, target_df) + except Exception: + pass + + # Insert metadata + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "\n".join(target_columns) + message_str = "\n".join(messages) + metadata_message = f"Feature range: {feature_range}\n\nDetails:\n{message_str}" + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status=operation_status, + message=metadata_message, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + + return + + # fallback + kernel._send_message("stderr", "Unknown execution path reached.") + return diff --git a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/standardize.py b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/standardize.py index d416b00..2115b81 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/standardize.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/standardize.py @@ -9,6 +9,8 @@ import logging import os import re +import uuid +import time # Optional helper to reliably get current DB name (if available) try: @@ -20,6 +22,8 @@ class 
Standardize(MariaMagic): """ %standardize [columns=col1,col2,...] [inplace=True|False] + [mode=preview|apply|rollback] [table=schema.table] + [confirm=true|false] [sample_size=100] [lock_timeout=10] Standardizes numeric columns using sklearn's StandardScaler (zero mean and unit variance). @@ -28,14 +32,11 @@ class Standardize(MariaMagic): If omitted, all numeric columns are used. - inplace: if True (default), modifies data["last_select"] in-place. if False, stores result in data["last_select_standardized"]. - - Examples: - %standardize - %standardize columns=age,salary inplace=False - - Execution metadata is recorded in table `magic_metadata`. + - mode: preview/apply/rollback (preview default). + * preview: show local preview and optional DB stats if table=... provided. + * apply: local in-place (default) or DB versioned apply when table=... and confirm=true. + * rollback: restore a previously-created backup (needs mariadb_client). """ - def __init__(self, args=""): self.args = args @@ -48,8 +49,9 @@ def name(self): def help(self): return ( "%standardize [columns=col1,col2,...] [inplace=True|False]\n" - "Standardizes numeric columns using sklearn's StandardScaler (in-place by default).\n" - "Execution metadata is recorded in table `magic_metadata`." + " [mode=preview|apply|rollback] [table=schema.table] [confirm=true]\n" + " [sample_size=100] [lock_timeout=10]\n" + "Standardizes numeric columns using sklearn's StandardScaler." 
) def _str_to_obj(self, s): @@ -171,6 +173,7 @@ def _ensure_metadata_table(self, kernel, db_name): if mariadb_client is None: return table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + # include rollback columns so apply/rollback can record/locate backups create_sql = f""" CREATE TABLE IF NOT EXISTS {table_full_name} ( id INT AUTO_INCREMENT PRIMARY KEY, @@ -181,7 +184,10 @@ def _ensure_metadata_table(self, kernel, db_name): operation_status VARCHAR(50), message TEXT, db_name VARCHAR(255), - user_name VARCHAR(255) + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) ); """ try: @@ -192,7 +198,8 @@ def _ensure_metadata_table(self, kernel, db_name): log.error(f"Failed to ensure magic_metadata table: {e}") def _insert_metadata(self, kernel, command_name, arguments, affected_columns, - operation_status, message, db_name, user_name): + operation_status, message, db_name, user_name, + rollback_token=None, backup_table=None, original_table=None): mariadb_client = self._get_mariadb_client(kernel) log = self._get_logger(kernel) if mariadb_client is None: @@ -205,11 +212,14 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, message_sql = self._sql_escape(message) db_sql = self._sql_escape(db_name) user_sql = self._sql_escape(user_name) + rollback_sql = self._sql_escape(rollback_token) + backup_sql = self._sql_escape(backup_table) + original_sql = self._sql_escape(original_table) insert_sql = f""" INSERT INTO {table_full_name} (command_name, arguments, execution_timestamp, affected_columns, - operation_status, message, db_name, user_name) + operation_status, message, db_name, user_name, rollback_token, backup_table, original_table) VALUES ( {self._sql_escape(command_name)}, {args_sql}, @@ -218,7 +228,10 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, {status_sql}, {message_sql}, {db_sql}, - {user_sql} + {user_sql}, + 
{rollback_sql}, + {backup_sql}, + {original_sql} ); """ try: @@ -228,6 +241,76 @@ def _insert_metadata(self, kernel, command_name, arguments, affected_columns, except Exception as e: log.error(f"Exception while inserting metadata: {e}") + def _acquire_lock(self, mariadb_client, lock_name, timeout=10): + try: + mariadb_client.run_statement(f"SELECT GET_LOCK('{lock_name}', {int(timeout)});") + if mariadb_client.iserror(): + return False + return True + except Exception: + return False + + def _release_lock(self, mariadb_client, lock_name): + try: + mariadb_client.run_statement(f"SELECT RELEASE_LOCK('{lock_name}');") + except Exception: + pass + + def _table_exists(self, mariadb_client, table_full_name): + try: + mariadb_client.run_statement(f"SELECT 1 FROM {table_full_name} LIMIT 1;") + return not mariadb_client.iserror() + except Exception: + return False + + def _parse_two_value_result(self, res): + """ + Parse results expected to have two values (e.g., AVG and STD). + Returns (val1, val2) or (None, None) if parsing fails. 
+ """ + if not res: + return None, None + try: + dfs = pd.read_html(res) + if dfs and len(dfs) > 0: + r = dfs[0].iloc[0] + v0 = None + v1 = None + try: + v0 = float(r.iloc[0]) if pd.notna(r.iloc[0]) else None + except Exception: + v0 = None + try: + v1 = float(r.iloc[1]) if r.size > 1 and pd.notna(r.iloc[1]) else None + except Exception: + v1 = None + return v0, v1 + except Exception: + vals = re.findall(r"(.*?)", str(res), flags=re.S | re.I) + if vals: + def tofloat(txt): + txt = re.sub(r"<.*?>", "", txt).strip() + if txt.lower() == "null" or txt == "": + return None + try: + return float(txt) + except Exception: + return None + v0 = tofloat(vals[0]) + v1 = tofloat(vals[1]) if len(vals) > 1 else None + return v0, v1 + # fallback: try whitespace split + try: + parts = [p for p in str(res).split() if p.strip()] + if len(parts) >= 2: + try: + return float(parts[0]), float(parts[1]) + except Exception: + return None, None + except Exception: + pass + return None, None + # -------------------- end metadata helpers -------------------- def execute(self, kernel, data): @@ -290,9 +373,18 @@ def execute(self, kernel, data): columns = None inplace = bool(args.get("inplace", True)) - target_df = df if inplace else df.copy(deep=True) + mode = str(args.get("mode", "preview")).lower() + mode = mode if mode in {"preview", "apply", "rollback"} else "preview" + table_full = args.get("table", None) + confirm = bool(args.get("confirm", False)) + sample_size = int(args.get("sample_size", 100)) + lock_timeout = int(args.get("lock_timeout", 10)) + + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) # Determine target columns (numeric) + target_df = df if inplace else df.copy(deep=True) if columns is None: target_columns = [c for c in target_df.columns if pd.api.types.is_numeric_dtype(target_df[c])] else: @@ -334,66 +426,390 @@ def execute(self, kernel, data): pass return - operation_status = "success" - messages = [] - try: - scaler = StandardScaler() - 
target_df[target_columns] = scaler.fit_transform(target_df[target_columns]) - summary_msg = f"Standardized {len(target_columns)} column(s) (mean=0, std=1)." - messages.append(summary_msg) - except Exception as e: - operation_status = "error" - err_msg = f"Error during standardization: {e}" - kernel._send_message("stderr", err_msg) - messages.append(err_msg) + # ---------------- PREVIEW ---------------- + if mode == "preview": try: - self._insert_metadata( - kernel=kernel, - command_name=self.name(), - arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns="\n".join(target_columns), - operation_status=operation_status, - message="\n".join(messages), - db_name=db_name, - user_name=user_name - ) - except Exception: - pass - return + messages = [] + # local preview: means/std and sample transformed + local_stats = {} + for col in target_columns: + s = pd.to_numeric(df[col], errors="coerce").dropna() + if s.empty: + messages.append(f"Local: Column '{col}' has no numeric non-null values; skipped.") + local_stats[col] = (None, None) + continue + mean = float(s.mean()) + std = float(s.std(ddof=0)) # population std to match DB STDDEV_POP + local_stats[col] = (mean, std) + messages.append(f"Local: Column '{col}': mean={mean}, std={std}") + + kernel._send_message("stdout", "PREVIEW (local):\n" + "\n".join(messages)) + + # show sample transformed rows + try: + sample = df[target_columns].head(sample_size).copy() + for col in target_columns: + mean, std = local_stats.get(col, (None, None)) + if mean is None or std is None or std == 0: + # cannot standardize sensibly; show original + sample[col + "_std_preview"] = sample[col] + else: + sample[col + "_std_preview"] = (pd.to_numeric(sample[col], errors="coerce") - mean) / std + if not sample.empty: + self._send_html(kernel, sample.head(20)) + except Exception: + pass - # Store results - if inplace: - data["last_select"] = target_df - location_msg = "Updated data['last_select'] in-place." 
- kernel._send_message("stdout", f"{summary_msg} {location_msg}") - else: - data["last_select_standardized"] = target_df - location_msg = "Stored in data['last_select_standardized']." - kernel._send_message("stdout", f"{summary_msg} {location_msg}") + # DB preview if requested + if table_full and mariadb_client is not None: + db_msgs = [] + for col in target_columns: + try: + # use AVG and STDDEV_POP for stable population std + out = mariadb_client.run_statement(f"SELECT AVG({col}), STDDEV_POP({col}) FROM {table_full};") + if mariadb_client.iserror(): + db_msgs.append(f"DB: Column '{col}': AVG/STD query failed (permissions?).") + continue + mean_db, std_db = self._parse_two_value_result(out) + db_msgs.append(f"DB: Column '{col}': mean={mean_db}, std={std_db}") + if mean_db is None or std_db is None: + db_msgs.append(f"DB: Column '{col}': cannot compute mean/std (NULL).") + continue + if std_db == 0: + expr = f"CASE WHEN {col} IS NULL THEN NULL ELSE 0 END AS {col}" + else: + expr = f"CASE WHEN {col} IS NULL THEN NULL ELSE (({col} - {repr(mean_db)}) / {repr(std_db)}) END AS {col}" + db_msgs.append(f"DB: Column '{col}' expression: {expr}") + except Exception as e: + db_msgs.append(f"DB: Column '{col}' AVG/STD query exception: {e}") + kernel._send_message("stdout", "PREVIEW (db):\n" + "\n".join(db_msgs)) + + # log preview metadata + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='preview', + message='preview_completed', + db_name=db_name, + user_name=user_name + ) + except Exception: + pass - # Display DataFrame - try: - self._send_html(kernel, target_df) - except Exception: - pass + except Exception as e: + kernel._send_message("stderr", f"Error during preview: {e}") + return - # Insert metadata (best-effort) - try: - args_for_db = self.args if isinstance(self.args, str) else str(self.args) - 
affected_columns_str = "\n".join(target_columns) - message_str = f"{summary_msg}\n{location_msg}" - self._insert_metadata( - kernel=kernel, - command_name=self.name(), - arguments=args_for_db, - affected_columns=affected_columns_str, - operation_status=operation_status, - message=message_str, - db_name=db_name, - user_name=user_name - ) - except Exception: + # ---------------- ROLLBACK ---------------- + if mode == "rollback": + if mariadb_client is None: + kernel._send_message("stderr", "Rollback requested but no mariadb_client available.") + return + token = args.get("rollback_token", None) try: - kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") - except Exception: - pass + if not token: + mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Could not find metadata for rollback (check permissions).") + return + out = mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + m = re.search(r"(.*?)", str(out), flags=re.S | re.I) + if m: + token = re.sub(r"<.*?>", "", m.group(1)).strip() + if not token: + kernel._send_message("stderr", "No rollback_token found; cannot rollback safely.") + return + + out = mariadb_client.run_statement(f"SELECT backup_table, original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + m = re.search(r"(.*?).*?(.*?)", str(out), flags=re.S | re.I) + backup_table = None + original_table = None + if m: + backup_table = re.sub(r"<.*?>", "", m.group(1)).strip() + original_table = re.sub(r"<.*?>", "", m.group(2)).strip() + else: + try: + out_b = 
mariadb_client.run_statement(f"SELECT backup_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mb = re.search(r"(.*?)", str(out_b), flags=re.S | re.I) + if mb: + backup_table = re.sub(r"<.*?>", "", mb.group(1)).strip() + except Exception: + pass + try: + out_o = mariadb_client.run_statement(f"SELECT original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mo = re.search(r"(.*?)", str(out_o), flags=re.S | re.I) + if mo: + original_table = re.sub(r"<.*?>", "", mo.group(1)).strip() + except Exception: + pass + + if not backup_table: + kernel._send_message("stderr", "No backup table found in metadata for rollback token.") + return + + lock_name = f"standardize_rb_{token}" + self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + try: + if original_table: + if self._table_exists(mariadb_client, original_table): + original_old = f"{original_table}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {original_table} TO {original_old}, {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename tables during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {original_table}; previous {original_table} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={original_table};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to 
original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {original_table}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={original_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + # try to infer original table name from arguments in metadata + out_args = mariadb_client.run_statement(f"SELECT arguments FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + margs = re.search(r"(.*?)", str(out_args), flags=re.S | re.I) + inferred_original = None + if margs: + args_txt = re.sub(r"<.*?>", "", margs.group(1)).strip() + mm = re.search(r"table\s*=\s*([^\s,]+)", args_txt) + if mm: + inferred_original = mm.group(1).strip() + if inferred_original: + if self._table_exists(mariadb_client, inferred_original): + original_old = f"{inferred_original}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {inferred_original} TO {original_old}, {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {inferred_original}; previous {inferred_original} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={inferred_original};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + 
original_table=inferred_original + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to inferred original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {inferred_original}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={inferred_original}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + kernel._send_message("stderr", "Could not determine original table name for rollback. Manual restoration required.") + return + finally: + self._release_lock(mariadb_client, lock_name) + except Exception as e: + kernel._send_message("stderr", f"Rollback error: {e}") + return + + # ---------------- APPLY ---------------- + if mode == "apply": + # DB-target apply if table provided and mariadb_client present + if table_full and mariadb_client is not None: + if not confirm: + kernel._send_message("stderr", "DB apply requires confirm=true to proceed. 
Preview first, then re-run with confirm=true.") + return + + token = str(uuid.uuid4()).replace('-', '')[:16] + backup_table = f"{table_full}_backup_{token}" + new_table = f"{table_full}_vnew_{token}" + + # collect mean/std from DB per column + col_stats = {} + msgs = [] + for col in target_columns: + try: + out = mariadb_client.run_statement(f"SELECT AVG({col}), STDDEV_POP({col}) FROM {table_full};") + if mariadb_client.iserror(): + msgs.append(f"{col}: AVG/STD query failed.") + col_stats[col] = (None, None) + continue + mean_db, std_db = self._parse_two_value_result(out) + col_stats[col] = (mean_db, std_db) + msgs.append(f"{col}: mean={mean_db}, std={std_db}") + except Exception as e: + col_stats[col] = (None, None) + msgs.append(f"{col}: exception computing stats: {e}") + + # build select expressions (preserve non-target columns) + select_exprs = [] + for c in df.columns: + if c in target_columns: + mean_db, std_db = col_stats.get(c, (None, None)) + if mean_db is None or std_db is None: + # cannot compute, keep original as-is + select_exprs.append(c) + elif std_db == 0: + # constant zero (or map to 0) + select_exprs.append(f"CASE WHEN {c} IS NULL THEN NULL ELSE 0 END AS {c}") + else: + select_exprs.append(f"CASE WHEN {c} IS NULL THEN NULL ELSE (({c} - {repr(mean_db)}) / {repr(std_db)}) END AS {c}") + else: + select_exprs.append(c) + + select_sql = ", ".join(select_exprs) + + try: + lock_name = f"standardize_apply_{token}" + got_lock = self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + if not got_lock: + kernel._send_message("stderr", "Could not acquire advisory lock; aborting apply.") + return + + # create new table CTAS + mariadb_client.run_statement(f"CREATE TABLE {new_table} AS SELECT {select_sql} FROM {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to create new table for apply (CTAS failed).") + return + + # atomic rename: original -> backup, new -> original + mariadb_client.run_statement(f"RENAME 
TABLE {table_full} TO {backup_table}, {new_table} TO {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "RENAME TABLE failed (apply may be inconsistent).") + return + + kernel._send_message("stdout", f"Apply completed: original preserved as {backup_table}.") + # log metadata with rollback token + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='applied', + message=f'applied_backup={backup_table};details={"|".join(msgs)}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=table_full + ) + + # attempt to refresh last_select with a sample + try: + mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + fresh = mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + try: + df_list = pd.read_html(fresh) + if df_list and len(df_list) > 0: + data["last_select"] = df_list[0] + try: + self._send_html(kernel, data["last_select"]) + except Exception: + pass + except Exception: + kernel._send_message("stdout", "Applied to DB; could not refresh last_select from DB.") + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Apply (DB versioned) failed: {e}") + log.exception(e) + finally: + self._release_lock(mariadb_client, lock_name) + return + + else: + # local apply (existing behavior) + operation_status = "success" + messages = [] + try: + scaler = StandardScaler() + target_df[target_columns] = scaler.fit_transform(target_df[target_columns]) + summary_msg = f"Standardized {len(target_columns)} column(s) (mean=0, std=1)." + messages.append(summary_msg) + if inplace: + data["last_select"] = target_df + location_msg = "Updated data['last_select'] in-place." 
+ kernel._send_message("stdout", f"{summary_msg} {location_msg}") + else: + data["last_select_standardized"] = target_df + location_msg = "Stored in data['last_select_standardized']." + kernel._send_message("stdout", f"{summary_msg} {location_msg}") + except Exception as e: + operation_status = "error" + err_msg = f"Error during standardization: {e}" + kernel._send_message("stderr", err_msg) + messages.append(err_msg) + + # show + try: + self._send_html(kernel, target_df) + except Exception: + pass + + # metadata + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "\n".join(target_columns) + message_str = "\n".join(messages) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status=operation_status, + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + + return + + # fallback + kernel._send_message("stderr", "Unknown execution path reached.") + return From 53c37b0e2a321277dd60d927a5006dc68e889cbb Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Wed, 29 Oct 2025 19:15:24 +0000 Subject: [PATCH 30/38] Logging added in ml_pipeline --- Untitled.ipynb | 3790 ++++++++++++----- .../ml_commands/data_cleaning/missing.py | 5 +- .../ml_commands/data_cleaning/outliers.py | 5 +- .../ml_commands/data_cleaning/stats.py | 5 +- .../data_preprocessing/splitdata.py | 5 +- .../ml_commands/ml_pipeline/ml_pipeline.py | 620 ++- .../ml_pipeline/select_features.py | 23 +- .../ml_commands/ml_pipeline/select_model.py | 390 +- .../ml_commands/model_training/savemodel.py | 5 +- 9 files changed, 3756 insertions(+), 1092 deletions(-) diff --git a/Untitled.ipynb b/Untitled.ipynb index 2f473dc..e60867e 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -162,7 +162,7 @@ { "data": { 
"text/html": [ - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FNULL51287.5014050.2055000.00300.008.50475.000
2BobEngineering45MMasters203091.00320100.101200000.0015000.009.00589.000
3CharlieSales38MBachelorsNULL1879.30015200.5080000.007000.007.20370.001
4DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
5EveNULL35FBachelors81588.0013060.3090000.008000.008.00485.000
6FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
7GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
8HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
9IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
10JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
" + "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FNULL51287.5014050.2055000.00300.008.50475.000
2BobEngineering45MMasters203091.00320100.101200000.0015000.009.00589.000
3CharlieSales38MBachelorsNULL1879.30015200.5080000.007000.007.20370.001
4DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
5EveNULL35FBachelors81588.0013060.3090000.008000.008.00485.000
6FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
7GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
8HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
9IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
10JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
11AliceHR30FNULL51287.5014050.2055000.00300.008.50475.000
12BobEngineering45MMasters203091.00320100.101200000.0015000.009.00589.000
13CharlieSales38MBachelorsNULL1879.30015200.5080000.007000.007.20370.001
14DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
15EveNULL35FBachelors81588.0013060.3090000.008000.008.00485.000
16FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
17GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
18HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
19IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
20JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
21AliceHR30FNULL51287.5014050.2055000.00300.008.50475.000
22BobEngineering45MMasters203091.00320100.101200000.0015000.009.00589.000
23CharlieSales38MBachelorsNULL1879.30015200.5080000.007000.007.20370.001
24DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
25EveNULL35FBachelors81588.0013060.3090000.008000.008.00485.000
26FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
27GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
28HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
29IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
30JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
31AliceHR30FNULL51287.5014050.2055000.00300.008.50475.000
32BobEngineering45MMasters203091.00320100.101200000.0015000.009.00589.000
33CharlieSales38MBachelorsNULL1879.30015200.5080000.007000.007.20370.001
34DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
35EveNULL35FBachelors81588.0013060.3090000.008000.008.00485.000
36FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
37GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
38HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
39IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
40JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
" ] }, "metadata": {}, @@ -5846,7 +5846,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 12, "id": "61b6b28f-9b56-4d7f-8613-a406928dbb1c", "metadata": {}, "outputs": [ @@ -5888,7 +5888,7 @@ " \n", " 1\n", " 30\n", - " 5\n", + " 5.0\n", " 12\n", " 87.5\n", " 1\n", @@ -5896,7 +5896,7 @@ " 5\n", " 0.2\n", " 55000.0\n", - " 3000.0\n", + " 300.0\n", " 8.5\n", " 4\n", " 75.0\n", @@ -5904,19 +5904,19 @@ " 0\n", " 2\n", " 0\n", - " 0\n", + " -1\n", " \n", " \n", " 2\n", " 45\n", - " 20\n", + " 20.0\n", " 30\n", " 91.0\n", " 3\n", " 20\n", " 10\n", " 0.1\n", - " 120000.0\n", + " 1200000.0\n", " 15000.0\n", " 9.0\n", " 5\n", @@ -5930,7 +5930,7 @@ " \n", " 3\n", " 38\n", - " 10\n", + " NaN\n", " 18\n", " 79.3\n", " 0\n", @@ -5951,7 +5951,7 @@ " \n", " 4\n", " 29\n", - " 6\n", + " 6.0\n", " 22\n", " 95.2\n", " 2\n", @@ -5972,7 +5972,7 @@ " \n", " 5\n", " 35\n", - " 8\n", + " 8.0\n", " 15\n", " 88.0\n", " 1\n", @@ -5986,14 +5986,14 @@ " 85.0\n", " 0\n", " 4\n", - " 1\n", + " -1\n", " 0\n", " 0\n", " \n", " \n", " 6\n", " 50\n", - " 25\n", + " 25.0\n", " 8\n", " 72.5\n", " 0\n", @@ -6014,7 +6014,7 @@ " \n", " 7\n", " 42\n", - " 18\n", + " 18.0\n", " 20\n", " 81.4\n", " 1\n", @@ -6035,7 +6035,7 @@ " \n", " 8\n", " 31\n", - " 7\n", + " 7.0\n", " 25\n", " 93.1\n", " 2\n", @@ -6056,7 +6056,7 @@ " \n", " 9\n", " 27\n", - " 3\n", + " 3.0\n", " 10\n", " 85.0\n", " 0\n", @@ -6077,7 +6077,7 @@ " \n", " 10\n", " 55\n", - " 30\n", + " 30.0\n", " 12\n", " 68.9\n", " 0\n", @@ -6098,7 +6098,7 @@ " \n", " 11\n", " 30\n", - " 5\n", + " 5.0\n", " 12\n", " 87.5\n", " 1\n", @@ -6106,7 +6106,7 @@ " 5\n", " 0.2\n", " 55000.0\n", - " 3000.0\n", + " 300.0\n", " 8.5\n", " 4\n", " 75.0\n", @@ -6114,19 +6114,19 @@ " 0\n", " 2\n", " 0\n", - " 0\n", + " -1\n", " \n", " \n", " 12\n", " 45\n", - " 20\n", + " 20.0\n", " 30\n", " 91.0\n", " 3\n", " 20\n", " 10\n", " 0.1\n", - " 120000.0\n", + " 1200000.0\n", " 15000.0\n", " 9.0\n", " 5\n", @@ -6140,7 +6140,7 @@ " \n", " 13\n", " 
38\n", - " 10\n", + " NaN\n", " 18\n", " 79.3\n", " 0\n", @@ -6161,7 +6161,7 @@ " \n", " 14\n", " 29\n", - " 6\n", + " 6.0\n", " 22\n", " 95.2\n", " 2\n", @@ -6182,7 +6182,7 @@ " \n", " 15\n", " 35\n", - " 8\n", + " 8.0\n", " 15\n", " 88.0\n", " 1\n", @@ -6196,14 +6196,14 @@ " 85.0\n", " 0\n", " 4\n", - " 1\n", + " -1\n", " 0\n", " 0\n", " \n", " \n", " 16\n", " 50\n", - " 25\n", + " 25.0\n", " 8\n", " 72.5\n", " 0\n", @@ -6224,7 +6224,7 @@ " \n", " 17\n", " 42\n", - " 18\n", + " 18.0\n", " 20\n", " 81.4\n", " 1\n", @@ -6245,7 +6245,7 @@ " \n", " 18\n", " 31\n", - " 7\n", + " 7.0\n", " 25\n", " 93.1\n", " 2\n", @@ -6266,7 +6266,7 @@ " \n", " 19\n", " 27\n", - " 3\n", + " 3.0\n", " 10\n", " 85.0\n", " 0\n", @@ -6287,7 +6287,7 @@ " \n", " 20\n", " 55\n", - " 30\n", + " 30.0\n", " 12\n", " 68.9\n", " 0\n", @@ -6308,7 +6308,7 @@ " \n", " 21\n", " 30\n", - " 5\n", + " 5.0\n", " 12\n", " 87.5\n", " 1\n", @@ -6316,7 +6316,7 @@ " 5\n", " 0.2\n", " 55000.0\n", - " 3000.0\n", + " 300.0\n", " 8.5\n", " 4\n", " 75.0\n", @@ -6324,19 +6324,19 @@ " 0\n", " 2\n", " 0\n", - " 0\n", + " -1\n", " \n", " \n", " 22\n", " 45\n", - " 20\n", + " 20.0\n", " 30\n", " 91.0\n", " 3\n", " 20\n", " 10\n", " 0.1\n", - " 120000.0\n", + " 1200000.0\n", " 15000.0\n", " 9.0\n", " 5\n", @@ -6350,7 +6350,7 @@ " \n", " 23\n", " 38\n", - " 10\n", + " NaN\n", " 18\n", " 79.3\n", " 0\n", @@ -6371,7 +6371,7 @@ " \n", " 24\n", " 29\n", - " 6\n", + " 6.0\n", " 22\n", " 95.2\n", " 2\n", @@ -6392,7 +6392,7 @@ " \n", " 25\n", " 35\n", - " 8\n", + " 8.0\n", " 15\n", " 88.0\n", " 1\n", @@ -6406,14 +6406,14 @@ " 85.0\n", " 0\n", " 4\n", - " 1\n", + " -1\n", " 0\n", " 0\n", " \n", " \n", " 26\n", " 50\n", - " 25\n", + " 25.0\n", " 8\n", " 72.5\n", " 0\n", @@ -6434,7 +6434,7 @@ " \n", " 27\n", " 42\n", - " 18\n", + " 18.0\n", " 20\n", " 81.4\n", " 1\n", @@ -6455,7 +6455,7 @@ " \n", " 28\n", " 31\n", - " 7\n", + " 7.0\n", " 25\n", " 93.1\n", " 2\n", @@ -6476,7 +6476,7 @@ " \n", " 29\n", " 27\n", - " 3\n", + " 3.0\n", 
" 10\n", " 85.0\n", " 0\n", @@ -6497,7 +6497,7 @@ " \n", " 30\n", " 55\n", - " 30\n", + " 30.0\n", " 12\n", " 68.9\n", " 0\n", @@ -6518,7 +6518,7 @@ " \n", " 31\n", " 30\n", - " 5\n", + " 5.0\n", " 12\n", " 87.5\n", " 1\n", @@ -6526,7 +6526,7 @@ " 5\n", " 0.2\n", " 55000.0\n", - " 3000.0\n", + " 300.0\n", " 8.5\n", " 4\n", " 75.0\n", @@ -6534,19 +6534,19 @@ " 0\n", " 2\n", " 0\n", - " 0\n", + " -1\n", " \n", " \n", " 32\n", " 45\n", - " 20\n", + " 20.0\n", " 30\n", " 91.0\n", " 3\n", " 20\n", " 10\n", " 0.1\n", - " 120000.0\n", + " 1200000.0\n", " 15000.0\n", " 9.0\n", " 5\n", @@ -6560,7 +6560,7 @@ " \n", " 33\n", " 38\n", - " 10\n", + " NaN\n", " 18\n", " 79.3\n", " 0\n", @@ -6581,7 +6581,7 @@ " \n", " 34\n", " 29\n", - " 6\n", + " 6.0\n", " 22\n", " 95.2\n", " 2\n", @@ -6602,7 +6602,7 @@ " \n", " 35\n", " 35\n", - " 8\n", + " 8.0\n", " 15\n", " 88.0\n", " 1\n", @@ -6616,14 +6616,14 @@ " 85.0\n", " 0\n", " 4\n", - " 1\n", + " -1\n", " 0\n", " 0\n", " \n", " \n", " 36\n", " 50\n", - " 25\n", + " 25.0\n", " 8\n", " 72.5\n", " 0\n", @@ -6644,7 +6644,7 @@ " \n", " 37\n", " 42\n", - " 18\n", + " 18.0\n", " 20\n", " 81.4\n", " 1\n", @@ -6665,7 +6665,7 @@ " \n", " 38\n", " 31\n", - " 7\n", + " 7.0\n", " 25\n", " 93.1\n", " 2\n", @@ -6686,7 +6686,7 @@ " \n", " 39\n", " 27\n", - " 3\n", + " 3.0\n", " 10\n", " 85.0\n", " 0\n", @@ -6707,7 +6707,7 @@ " \n", " 40\n", " 55\n", - " 30\n", + " 30.0\n", " 12\n", " 68.9\n", " 0\n", @@ -6734,7 +6734,7 @@ } ], "source": [ - "%encode method=label drop_original=true" + "%encode method=label drop_original=true mode=apply" ] }, { @@ -9680,6 +9680,24 @@ "%standardize table=test.employees mode=apply confirm=true " ] }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7c023d4e-a273-4322-bcf2-6eb5a1e290dc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rollback: restored test.employees_backup_f15a883f004548a8 -> test.employees; previous test.employees renamed to 
test.employees_prerollback_f15a883f004548a8.\n" + ] + } + ], + "source": [ + "%standardize mode=rollback rollback_token=f15a883f004548a8" + ] + }, { "cell_type": "code", "execution_count": 6, @@ -12953,7 +12971,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 13, "id": "6c5def76-a36c-45be-8712-d886a1e52e25", "metadata": {}, "outputs": [ @@ -13010,23 +13028,23 @@ " \n", " \n", " department_lbl\n", - " 0.634270\n", + " 0.631068\n", " \n", " \n", " years_experience\n", - " 0.623764\n", + " 0.477280\n", " \n", " \n", - " bonus\n", - " 0.480500\n", + " projects_completed\n", + " 0.441624\n", " \n", " \n", - " salary\n", - " 0.463771\n", + " bonus\n", + " 0.392049\n", " \n", " \n", - " projects_completed\n", - " 0.441624\n", + " salary\n", + " 0.235729\n", " \n", " \n", " name_lbl\n", @@ -13034,7 +13052,7 @@ " \n", " \n", " education_level_lbl\n", - " 0.146310\n", + " 0.074848\n", " \n", " \n", " emp_id\n", @@ -13051,12 +13069,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "Selected 5 features saved to data['top_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_rating\n" + "Selected 5 features saved to data['selected_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_rating\n" ] } ], "source": [ - "%select_features target=attrition_flag method=correlation k=5 problem=classification output_name=top_features" + "%select_features target=attrition_flag method=correlation k=5 problem=classification " ] }, { @@ -13179,15 +13197,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "cc570adc-ee80-42b9-a5a5-7a678224a220", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No features provided and no selected_features found. 
Run %select_features first.\n" + ] + } + ], + "source": [ + "%select_model target=attrition_flag problem=classification" + ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "e535919d-788e-44c3-8a42-7c499044a265", "metadata": {}, "outputs": [ @@ -13195,7 +13223,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Dropped rows with missing values (in-place). Updated last_select.\n" + "PREVIEW: would drop 12 row(s) (from 40 to 28).\n" ] }, { @@ -13223,6 +13251,7 @@ " performance_rating\n", " potential_score\n", " attrition_flag\n", + " _would_be_dropped\n", " \n", " \n", " \n", @@ -13232,8 +13261,8 @@ " HR\n", " 30\n", " F\n", - " Bachelors\n", - " 5\n", + " NaN\n", + " 5.0\n", " 12\n", " 87.5\n", " 1\n", @@ -13241,32 +13270,12 @@ " 5\n", " 0.2\n", " 55000.0\n", - " 3000.0\n", + " 300.0\n", " 8.5\n", " 4\n", " 75.0\n", " 0\n", - " \n", - " \n", - " 2\n", - " Bob\n", - " Engineering\n", - " 45\n", - " M\n", - " Masters\n", - " 20\n", - " 30\n", - " 91.0\n", - " 3\n", - " 20\n", - " 10\n", - " 0.1\n", - " 120000.0\n", - " 15000.0\n", - " 9.0\n", - " 5\n", - " 89.0\n", - " 0\n", + " True\n", " \n", " \n", " 3\n", @@ -13275,7 +13284,7 @@ " 38\n", " M\n", " Bachelors\n", - " 10\n", + " NaN\n", " 18\n", " 79.3\n", " 0\n", @@ -13288,36 +13297,16 @@ " 3\n", " 70.0\n", " 1\n", - " \n", - " \n", - " 4\n", - " Diana\n", - " Engineering\n", - " 29\n", - " F\n", - " PhD\n", - " 6\n", - " 22\n", - " 95.2\n", - " 2\n", - " 50\n", - " 2\n", - " 0.0\n", - " 97000.0\n", - " 10000.0\n", - " 9.6\n", - " 5\n", - " 95.0\n", - " 0\n", + " True\n", " \n", " \n", " 5\n", " Eve\n", - " Finance\n", + " NaN\n", " 35\n", " F\n", " Bachelors\n", - " 8\n", + " 8.0\n", " 15\n", " 88.0\n", " 1\n", @@ -13330,111 +13319,205 @@ " 4\n", " 85.0\n", " 0\n", + " True\n", " \n", " \n", - " 6\n", - " Frank\n", + " 11\n", + " Alice\n", " HR\n", - " 50\n", - " M\n", - " High School\n", - " 25\n", - " 8\n", - " 72.5\n", - " 0\n", - " 10\n", - " 15\n", - " 0.7\n", - " 
60000.0\n", - " 4000.0\n", - " 6.5\n", - " 2\n", - " 60.0\n", + " 30\n", + " F\n", + " NaN\n", + " 5.0\n", + " 12\n", + " 87.5\n", " 1\n", + " 40\n", + " 5\n", + " 0.2\n", + " 55000.0\n", + " 300.0\n", + " 8.5\n", + " 4\n", + " 75.0\n", + " 0\n", + " True\n", " \n", " \n", - " 7\n", - " Grace\n", + " 13\n", + " Charlie\n", " Sales\n", - " 42\n", - " F\n", + " 38\n", + " M\n", " Bachelors\n", + " NaN\n", " 18\n", + " 79.3\n", + " 0\n", + " 15\n", " 20\n", - " 81.4\n", - " 1\n", - " 25\n", - " 12\n", - " 0.4\n", - " 85000.0\n", + " 0.5\n", + " 80000.0\n", " 7000.0\n", - " 7.8\n", + " 7.2\n", " 3\n", - " 74.0\n", - " 0\n", + " 70.0\n", + " 1\n", + " True\n", " \n", " \n", - " 8\n", - " Henry\n", - " Engineering\n", - " 31\n", - " M\n", - " Masters\n", - " 7\n", - " 25\n", - " 93.1\n", - " 2\n", + " 15\n", + " Eve\n", + " NaN\n", " 35\n", + " F\n", + " Bachelors\n", + " 8.0\n", + " 15\n", + " 88.0\n", + " 1\n", + " 30\n", + " 6\n", + " 0.3\n", + " 90000.0\n", + " 8000.0\n", + " 8.0\n", + " 4\n", + " 85.0\n", + " 0\n", + " True\n", + " \n", + " \n", + " 21\n", + " Alice\n", + " HR\n", + " 30\n", + " F\n", + " NaN\n", + " 5.0\n", + " 12\n", + " 87.5\n", + " 1\n", + " 40\n", " 5\n", " 0.2\n", - " 95000.0\n", - " 9000.0\n", - " 9.1\n", - " 5\n", - " 90.0\n", + " 55000.0\n", + " 300.0\n", + " 8.5\n", + " 4\n", + " 75.0\n", " 0\n", + " True\n", " \n", " \n", - " 9\n", - " Ivy\n", - " Finance\n", - " 27\n", - " F\n", + " 23\n", + " Charlie\n", + " Sales\n", + " 38\n", + " M\n", " Bachelors\n", + " NaN\n", + " 18\n", + " 79.3\n", + " 0\n", + " 15\n", + " 20\n", + " 0.5\n", + " 80000.0\n", + " 7000.0\n", + " 7.2\n", " 3\n", - " 10\n", + " 70.0\n", + " 1\n", + " True\n", + " \n", + " \n", + " 25\n", + " Eve\n", + " NaN\n", + " 35\n", + " F\n", + " Bachelors\n", + " 8.0\n", + " 15\n", + " 88.0\n", + " 1\n", + " 30\n", + " 6\n", + " 0.3\n", + " 90000.0\n", + " 8000.0\n", + " 8.0\n", + " 4\n", " 85.0\n", " 0\n", - " 20\n", - " 8\n", - " 0.6\n", - " 70000.0\n", - " 5000.0\n", - " 
8.2\n", + " True\n", + " \n", + " \n", + " 31\n", + " Alice\n", + " HR\n", + " 30\n", + " F\n", + " NaN\n", + " 5.0\n", + " 12\n", + " 87.5\n", + " 1\n", + " 40\n", + " 5\n", + " 0.2\n", + " 55000.0\n", + " 300.0\n", + " 8.5\n", " 4\n", - " 82.0\n", + " 75.0\n", " 0\n", + " True\n", " \n", " \n", - " 10\n", - " Jack\n", + " 33\n", + " Charlie\n", " Sales\n", - " 55\n", + " 38\n", " M\n", - " High School\n", - " 30\n", - " 12\n", - " 68.9\n", + " Bachelors\n", + " NaN\n", + " 18\n", + " 79.3\n", " 0\n", - " 5\n", - " 25\n", - " 0.8\n", - " 65000.0\n", - " 2000.0\n", - " 5.5\n", + " 15\n", + " 20\n", + " 0.5\n", + " 80000.0\n", + " 7000.0\n", + " 7.2\n", + " 3\n", + " 70.0\n", " 1\n", - " 50.0\n", + " True\n", + " \n", + " \n", + " 35\n", + " Eve\n", + " NaN\n", + " 35\n", + " F\n", + " Bachelors\n", + " 8.0\n", + " 15\n", + " 88.0\n", " 1\n", + " 30\n", + " 6\n", + " 0.3\n", + " 90000.0\n", + " 8000.0\n", + " 8.0\n", + " 4\n", + " 85.0\n", + " 0\n", + " True\n", " \n", " \n", "" @@ -13485,19 +13568,21 @@ " department_Finance\n", " department_HR\n", " department_Sales\n", + " department____MISSING___\n", " gender_F\n", " gender_M\n", " education_level_Bachelors\n", " education_level_High School\n", " education_level_Masters\n", " education_level_PhD\n", + " education_level____MISSING___\n", " \n", " \n", " \n", " \n", " 1\n", " 30\n", - " 5\n", + " 5.0\n", " 12\n", " 87.5\n", " 1\n", @@ -13505,7 +13590,7 @@ " 5\n", " 0.2\n", " 55000.0\n", - " 3000.0\n", + " 300.0\n", " 8.5\n", " 4\n", " 75.0\n", @@ -13524,24 +13609,26 @@ " 0.0\n", " 1.0\n", " 0.0\n", - " 1.0\n", " 0.0\n", " 1.0\n", " 0.0\n", " 0.0\n", " 0.0\n", + " 0.0\n", + " 0.0\n", + " 1.0\n", " \n", " \n", " 2\n", " 45\n", - " 20\n", + " 20.0\n", " 30\n", " 91.0\n", " 3\n", " 20\n", " 10\n", " 0.1\n", - " 120000.0\n", + " 1200000.0\n", " 15000.0\n", " 9.0\n", " 5\n", @@ -13562,16 +13649,18 @@ " 0.0\n", " 0.0\n", " 0.0\n", + " 0.0\n", " 1.0\n", " 0.0\n", " 0.0\n", " 1.0\n", " 0.0\n", + " 0.0\n", " \n", " \n", " 
3\n", " 38\n", - " 10\n", + " NaN\n", " 18\n", " 79.3\n", " 0\n", @@ -13599,16 +13688,18 @@ " 0.0\n", " 1.0\n", " 0.0\n", + " 0.0\n", " 1.0\n", " 1.0\n", " 0.0\n", " 0.0\n", " 0.0\n", + " 0.0\n", " \n", " \n", " 4\n", " 29\n", - " 6\n", + " 6.0\n", " 22\n", " 95.2\n", " 2\n", @@ -13635,17 +13726,19 @@ " 0.0\n", " 0.0\n", " 0.0\n", + " 0.0\n", " 1.0\n", " 0.0\n", " 0.0\n", " 0.0\n", " 0.0\n", " 1.0\n", + " 0.0\n", " \n", " \n", " 5\n", " 35\n", - " 8\n", + " 8.0\n", " 15\n", " 88.0\n", " 1\n", @@ -13669,20 +13762,22 @@ " 0.0\n", " 0.0\n", " 0.0\n", - " 1.0\n", " 0.0\n", " 0.0\n", + " 0.0\n", + " 1.0\n", " 1.0\n", " 0.0\n", " 1.0\n", " 0.0\n", " 0.0\n", " 0.0\n", + " 0.0\n", " \n", " \n", " 6\n", " 50\n", - " 25\n", + " 25.0\n", " 8\n", " 72.5\n", " 0\n", @@ -13710,16 +13805,18 @@ " 1.0\n", " 0.0\n", " 0.0\n", + " 0.0\n", " 1.0\n", " 0.0\n", " 1.0\n", " 0.0\n", " 0.0\n", + " 0.0\n", " \n", " \n", " 7\n", " 42\n", - " 18\n", + " 18.0\n", " 20\n", " 81.4\n", " 1\n", @@ -13746,17 +13843,19 @@ " 0.0\n", " 0.0\n", " 1.0\n", + " 0.0\n", " 1.0\n", " 0.0\n", " 1.0\n", " 0.0\n", " 0.0\n", " 0.0\n", + " 0.0\n", " \n", " \n", " 8\n", " 31\n", - " 7\n", + " 7.0\n", " 25\n", " 93.1\n", " 2\n", @@ -13784,16 +13883,18 @@ " 0.0\n", " 0.0\n", " 0.0\n", + " 0.0\n", " 1.0\n", " 0.0\n", " 0.0\n", " 1.0\n", " 0.0\n", + " 0.0\n", " \n", " \n", " 9\n", " 27\n", - " 3\n", + " 3.0\n", " 10\n", " 85.0\n", " 0\n", @@ -13820,17 +13921,19 @@ " 1.0\n", " 0.0\n", " 0.0\n", + " 0.0\n", " 1.0\n", " 0.0\n", " 1.0\n", " 0.0\n", " 0.0\n", " 0.0\n", + " 0.0\n", " \n", " \n", " 10\n", " 55\n", - " 30\n", + " 30.0\n", " 12\n", " 68.9\n", " 0\n", @@ -13858,240 +13961,29 @@ " 0.0\n", " 1.0\n", " 0.0\n", + " 0.0\n", " 1.0\n", " 0.0\n", " 1.0\n", " 0.0\n", " 0.0\n", + " 0.0\n", " \n", - " \n", - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "

Feature Selection Results (method=correlation)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
FeatureScore
overtime_hours0.867873
avg_project_score0.846331
satisfaction_score0.845916
potential_score0.828136
performance_rating0.817918
education_level_High School0.763763
remote_ratio0.744150
training_hours0.742307
age0.683720
gender_M0.654654
certifications0.654654
gender_F0.654654
years_experience0.623764
department_Sales0.523810
name_Charlie0.509175
name_Frank0.509175
name_Jack0.509175
bonus0.480500
salary0.463771
projects_completed0.441624
department_Engineering0.428571
education_level_Masters0.327327
department_Finance0.327327
name_Ivy0.218218
name_Henry0.218218
education_level_Bachelors0.218218
department_HR0.218218
name_Eve0.218218
name_Grace0.218218
name_Diana0.218218
name_Alice0.218218
name_Bob0.218218
education_level_PhD0.218218
emp_id0.189934
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Selected 5 features saved to data['selected_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_rating\n", - "Standardized 5 column(s) (mean=0, std=1). Updated data['last_select'] in-place.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -14107,28 +13999,30 @@ " \n", " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -14145,32 +14039,2096 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagname_Alicename_Bobname_Charliename_Diananame_Evename_Frankname_Gracename_Henryname_Ivyname_Jackdepartment_Engineeringdepartment_Financedepartment_HRdepartment_Salesgender_Fgender_Meducation_level_Bachelorseducation_level_High Schooleducation_level_Masterseducation_level_PhD
1113055.0120.40280687.5140-0.83576650.255000.03000.00.4698750.312348-0.148823300.08.5475.001.00.00.01.00.01.00.01.00.00.00.00.00.01.0
212452020.0300.82873491.0320-0.115278100.1120000.01200000.015000.00.8894071.0932160.8929409.0589.000.01.00.00.00.00.01.00.00.01.00.00.0
3133810NaN18-0.59508279.30151.325698200.580000.07000.0-0.620907-0.468521-0.5208827.2370.010.00.01.01.00.00.00.00.00.00.00.00.00.00.01.00.00.01.01.00.00.00.00.0
14296.02295.225020.097000.010000.09.6595.000.00.00.01.00.00.00.00.00.00.01.00.00.00.00.01.00.00.00.00.01.00.0
15358.01588.013060.390000.08000.08.0485.000.00.00.00.01.00.00.00.00.00.00.00.00.00.01.01.00.01.00.00.00.00.0
165025.0872.5010150.760000.04000.06.5260.010.00.00.00.00.01.00.00.00.00.00.00.01.00.00.00.01.00.01.00.00.00.0
174218.02081.4125120.485000.07000.07.8374.000.00.00.00.00.00.01.00.00.00.00.00.00.01.00.01.00.01.00.00.00.00.0
18317.02593.123550.295000.09000.09.1590.000.00.00.00.00.00.00.01.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
19273.01085.002080.670000.05000.08.2482.000.00.00.00.00.00.00.00.01.00.00.01.00.00.00.01.00.01.00.00.00.00.0
205530.01268.905250.865000.02000.05.5150.010.00.00.00.00.00.00.00.00.01.00.00.00.01.00.00.01.00.01.00.00.00.0
21305.01287.514050.255000.0300.08.5475.001.00.00.00.00.00.00.00.00.00.00.00.01.00.00.01.00.00.00.00.00.01.0
224520.03091.0320100.11200000.015000.09.0589.000.01.00.00.00.00.00.00.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
2338NaN1879.3015200.580000.07000.07.2370.010.00.01.00.00.00.00.00.00.00.00.00.00.01.00.00.01.01.00.00.00.00.0
24296.02295.225020.097000.010000.09.6595.000.00.00.01.00.00.00.00.00.00.01.00.00.00.00.01.00.00.00.00.01.00.0
25358.01588.013060.390000.08000.08.0485.000.00.00.00.01.00.00.00.00.00.00.00.00.00.01.01.00.01.00.00.00.00.0
265025.0872.5010150.760000.04000.06.5260.010.00.00.00.00.01.00.00.00.00.00.00.01.00.00.00.01.00.01.00.00.00.0
274218.02081.4125120.485000.07000.07.8374.000.00.00.00.00.00.01.00.00.00.00.00.00.01.00.01.00.01.00.00.00.00.0
28317.02593.123550.295000.09000.09.1590.000.00.00.00.00.00.00.01.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
29273.01085.002080.670000.05000.08.2482.000.00.00.00.00.00.00.00.01.00.00.01.00.00.00.01.00.01.00.00.00.00.0
305530.01268.905250.865000.02000.05.5150.010.00.00.00.00.00.00.00.00.01.00.00.00.01.00.00.01.00.01.00.00.00.0
31305.01287.514050.255000.0300.08.5475.001.00.00.00.00.00.00.00.00.00.00.00.01.00.00.01.00.00.00.00.00.01.0
324520.03091.0320100.11200000.015000.09.0589.000.01.00.00.00.00.00.00.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
3338NaN1879.3015200.580000.07000.07.2370.010.00.01.00.00.00.00.00.00.00.00.00.00.01.00.00.01.01.00.00.00.00.0
34296.02295.225020.097000.010000.09.6595.000.00.00.01.00.00.00.00.00.00.01.00.00.00.00.01.00.00.00.00.01.00.0
35358.01588.013060.390000.08000.08.0485.000.00.00.00.01.00.00.00.00.00.00.00.00.00.01.01.00.01.00.00.00.00.0
365025.0872.5010150.760000.04000.06.5260.010.00.00.00.00.01.00.00.00.00.00.00.01.00.00.00.01.00.01.00.00.00.0
374218.02081.4125120.485000.07000.07.8374.000.00.00.00.00.00.01.00.00.00.00.00.00.01.00.01.00.01.00.00.00.00.0
38317.02593.123550.295000.09000.09.1590.000.00.00.00.00.00.00.01.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
39273.01085.002080.670000.05000.08.2482.000.00.00.00.00.00.00.00.01.00.00.01.00.00.00.01.00.01.00.00.00.00.0
405530.01268.905250.865000.02000.05.5150.010.00.00.00.00.00.00.00.00.01.00.00.00.01.00.00.01.00.01.00.00.00.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Feature Selection Results (method=correlation)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FeatureScore
overtime_hours0.867873
avg_project_score0.846331
satisfaction_score0.845916
potential_score0.828136
performance_rating0.817918
education_level_High School0.763763
remote_ratio0.744150
training_hours0.742307
age0.683720
gender_F0.654654
gender_M0.654654
certifications0.654654
department_Sales0.523810
name_Charlie0.509175
name_Jack0.509175
name_Frank0.509175
years_experience0.477280
projects_completed0.441624
department_Engineering0.428571
bonus0.392049
education_level_Masters0.327327
salary0.235729
department_HR0.218218
education_level____MISSING___0.218218
education_level_PhD0.218218
name_Diana0.218218
name_Alice0.218218
name_Bob0.218218
department_Finance0.218218
name_Eve0.218218
name_Henry0.218218
name_Ivy0.218218
department____MISSING___0.218218
name_Grace0.218218
education_level_Bachelors0.089087
emp_id0.047260
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected 5 features saved to data['selected_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_rating\n", + "PREVIEW (local):\n", + "Local: Column 'overtime_hours': mean=10.8, std=6.939740629158989\n", + "Local: Column 'avg_project_score': mean=84.19000000000001, std=8.217353588595294\n", + "Local: Column 'satisfaction_score': mean=7.94, std=1.1918053532351665\n", + "Local: Column 'potential_score': mean=77.0, std=13.438749941865872\n", + "Local: Column 'performance_rating': mean=3.6, std=1.2806248474865698\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
overtime_hoursavg_project_scoresatisfaction_scorepotential_scoreperformance_ratingovertime_hours_std_previewavg_project_score_std_previewsatisfaction_score_std_previewpotential_score_std_previewperformance_rating_std_preview
587.58.575.04-0.8357660.4028060.469875-0.1488230.312348
1091.09.089.05-0.1152780.8287340.8894070.8929401.093216
2079.37.270.031.325698-0.595082-0.620907-0.520882-0.468521
295.29.695.05-1.2680591.3398471.3928451.3394101.093216
688.08.085.04-0.6916690.4636530.0503440.5952930.312348
1572.56.560.020.605210-1.422599-1.208251-1.264999-1.249390
1281.47.874.030.172917-0.339525-0.117469-0.223235-0.468521
593.19.190.05-0.8357661.0842910.9733130.9673521.093216
885.08.282.04-0.4034730.0985720.2181560.3720580.312348
2568.95.550.012.046186-1.860696-2.047314-2.009115-2.030259
587.58.575.04-0.8357660.4028060.469875-0.1488230.312348
1091.09.089.05-0.1152780.8287340.8894070.8929401.093216
2079.37.270.031.325698-0.595082-0.620907-0.520882-0.468521
295.29.695.05-1.2680591.3398471.3928451.3394101.093216
688.08.085.04-0.6916690.4636530.0503440.5952930.312348
1572.56.560.020.605210-1.422599-1.208251-1.264999-1.249390
1281.47.874.030.172917-0.339525-0.117469-0.223235-0.468521
593.19.190.05-0.8357661.0842910.9733130.9673521.093216
885.08.282.04-0.4034730.0985720.2181560.3720580.312348
2568.95.550.012.046186-1.860696-2.047314-2.009115-2.030259
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split completed: total=40, train=32, test=8, val=0.\n" + ] + }, + { + "data": { + "text/html": [ + "

Train (32 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", " \n", " \n", " \n", @@ -14181,38 +16139,40 @@ " \n", " \n", " \n", - " \n", " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -14220,63 +16180,66 @@ " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", 
" \n", @@ -14293,35 +16256,37 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", " \n", @@ -14329,37 +16294,39 @@ " \n", " \n", " \n", - " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", " \n", @@ -14367,28 +16334,31 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -14400,6 +16370,7 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", @@ -14411,20 +16382,20 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -14441,11 +16412,13 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagname_Alicename_Bobname_Charliename_Diananame_Evename_Frankname_Gracename_Henryname_Ivyname_Jackdepartment_Engineeringdepartment_Financedepartment_HRdepartment_Salesdepartment____MISSING___gender_Fgender_Meducation_level_Bachelorseducation_level_High Schooleducation_level_Masterseducation_level_PhDeducation_level____MISSING___
305530.01268.905250.865000.02000.05.5150.010.00.00.00.00.00.00.00.00.01.00.00.00.01.00.00.01.00.01.00.00.00.0
2338NaN1879.3015200.580000.07000.07.2370.010.00.01.00.00.00.00.00.00.00.00.00.00.01.00.00.01.01.00.00.00.00.0
19273.01085.002080.670000.05000.08.2482.000.00.00.00.00.00.00.00.01.00.00.01.00.00.00.01.00.01.00.00.00.00.0
274218.02081.4125120.485000.07000.07.8374.000.00.00.00.00.00.01.00.00.00.00.00.00.01.00.01.00.01.00.00.00.00.0
29273.01085.002080.670000.05000.08.2482.000.00.00.00.00.00.00.00.01.00.00.01.00.00.00.01.00.01.00.00.00.00.0
224520.03091.0320100.11200000.015000.09.0589.000.01.00.00.00.00.00.00.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
24296.02295.225020.097000.010000.09.6595.000.00.00.01.00.00.00.00.00.00.01.00.00.00.00.01.00.00.00.00.01.00.0
338NaN1879.3015200.580000.07000.07.2370.010.00.01.00.00.00.00.00.00.00.00.00.00.01.00.00.01.01.00.00.00.00.0
4296.02295.225020.097000.010000.09.6595.000.00.00.01.00.00.00.00.00.00.01.00.00.00.00.01.00.00.00.00.01.00.0
374218.02081.4125120.485000.07000.07.8374.000.00.00.00.00.00.01.00.00.00.00.00.00.01.00.01.00.01.00.00.00.00.0
324520.03091.0320100.11200000.015000.09.0589.000.01.00.00.00.00.00.00.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
5358.01588.013060.390000.08000.08.0485.000.00.00.00.01.00.00.00.00.00.00.00.00.00.01.01.00.01.00.00.00.00.0
35358.01588.013060.390000.08000.08.0485.000.00.00.00.01.00.00.00.00.00.01.00.01.00.01.00.00.00.00.0
39273.01085.002080.670000.05000.08.24296221.339847250-1.2680590.097000.010000.01.3928451.0932161.33941082.000.00.00.01.00.00.00.00.00.01.00.00.01.00.00.01.00.01.00.00.00.00.01.0
5359273.01085.00208150.463653130-0.6916690.390000.08000.00.0503440.3123480.5952930.670000.05000.08.2482.000.00.00.00.01.00.00.00.00.01.00.00.01.00.00.00.01.00.01.00.00.00.00.0
6502525.08-1.42259972.50100.605210150.760000.04000.0-1.208251-1.249390-1.2649996.5260.010.00.01.00.00.00.01.00.01.00.00.00.0
7423338NaN1879.301520-0.3395251250.1729170.485000.00.580000.07000.0-0.117469-0.468521-0.22323507.2370.010.00.01.00.00.00.00.01.00.00.00.00.00.01.01.00.00.01.01.00.00.00.00.0
8317251.084291235-0.8357660.295000.09000.00.9733131.0932160.9673524520.03091.0320100.11200000.015000.09.0589.000.01.00.00.00.00.00.00.01.00.00.01.00.00.00.00.01.00.00.01.00.00.0
9273100.098572020-0.4034730.670000.05000.00.2181560.3123480.3720585530.01268.905250.865000.02000.05.5150.010.00.00.00.01.00.00.00.01.00.00.00.0
1040553030.012-1.86069668.9052.046186250.865000.02000.0-2.047314-2.030259-2.0091155.5150.010.00.00.01.00.00.01.00.01.00.00.00.0
" @@ -14454,17 +16427,10 @@ "metadata": {}, "output_type": "display_data" }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Split completed: total=10, train=8, test=2, val=0.\n" - ] - }, { "data": { "text/html": [ - "

Train (8 rows)

\n", + "

Test (8 rows)

\n", " \n", " \n", " \n", @@ -14496,73 +16462,33 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -14572,67 +16498,36 @@ " \n", " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", " \n", " \n", " \n", @@ -14642,8 +16537,6 @@ " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -14652,22 +16545,27 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -14680,44 +16578,47 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", " 
\n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", " \n", + " \n", " \n", " \n", " \n", @@ -14725,38 +16626,41 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -14765,116 +16669,67 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", - "
emp_iddepartment_Financedepartment_HRdepartment_Salesdepartment____MISSING___gender_Fgender_Meducation_level_Bachelorseducation_level_High Schooleducation_level_Masterseducation_level_PhDeducation_level____MISSING___
65028317.0258-1.4225990100.6052100.760000.04000.0-1.208251-1.249390-1.26499910.00.00.00.00.01.00.00.00.00.00.00.01.00.00.01.00.01.00.00.0
13093.12355120.402806140-0.8357660.255000.03000.00.4698750.312348-0.14882395000.09000.09.1590.001.00.00.00.00.00.00.00.00.01.00.01.00.01.00.00.00.0
10553012-1.860696052.0461860.865000.02000.0-2.047314-2.030259-2.00911510.00.00.00.00.00.00.00.00.01.00.00.00.01.00.01.00.01.00.00.0
2452011300.828734320-0.1152780.1120000.015000.00.8894071.0932160.8929405.01287.514050.255000.0300.08.5475.000.01.00.00.00.00.00.01.00.00.00.00.00.01.00.00.00.00.00.01.0
5253588.0150.46365388.0130-0.69166960.390000.08000.00.0503440.3123480.5952938.0485.000.00.00.00.00.01.00.00.00.01.01.00.01.00.00.00.00.0
21305.01287.514050.255000.0300.08.54296221.339847250-1.2680590.097000.010000.01.3928451.0932161.33941075.001.00.00.00.00.01.00.00.00.00.00.00.01.00.01.00.00.01.00.00.00.00.01.0
7421820-0.3395251250.1729170.485000.07000.0-0.117469-0.468521-0.223235105530.01268.905250.865000.02000.05.5150.010.00.00.00.00.00.01.00.00.00.01.00.00.00.01.00.00.01.00.01.00.0
365025.08317251.084291235-0.8357660.295000.09000.00.9733131.0932160.96735272.5010150.760000.04000.06.5260.010.00.00.00.00.01.00.00.00.00.01.00.00.01.00.00.00.01.00.01.00.00.01.00.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "

Test (2 rows)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", " \n", @@ -14884,26 +16739,28 @@ " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -14912,18 +16769,20 @@ " \n", " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -14952,109 +16811,109 @@ " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -15072,7 +16931,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Best model 'gbm' (mean accuracy=1.0000) saved to data['last_model'].\n", + "Best model 'logistic' (mean accuracy=1.0000) saved to data['last_model'].\n", "[MLPipeline] Automatically selected best model via SelectModel.\n" ] }, @@ -15085,23 +16944,23 @@ "
\n", "

Metrics

\n", "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagname_Alicename_Bobname_Charliename_Diananame_Evename_Frankname_Gracename_Henryname_Ivyname_Jackdepartment_Engineeringdepartment_Financedepartment_HRdepartment_Salesgender_Fgender_Meducation_level_Bachelorseducation_level_High Schooleducation_level_Masterseducation_level_PhD
3381018-0.5950820151.3256980.580000.07000.0-0.620907-0.468521-0.520882174218.02081.4125120.485000.07000.07.8374.000.00.01.00.00.00.00.01.00.00.00.01.00.01.00.01.00.00.00.00.0
9273100.098572020-0.4034730.670000.05000.00.2181560.3123480.37205838317.02593.123550.295000.09000.09.1590.000.00.00.00.00.00.01.00.00.01.00.00.01.00.00.00.01.00.00.01.00.00.0
catboostlogistic1.00000.00001.00000.00001.00.00001.00.00000.40.48990.40.48990.40.4899
gbmrf1.00000.00001.00000.00001.00.00001.00.00000.40.48990.40.48990.40.4899
rf0.90.20000.20.40000.40.48990.40.4899ada1.00000.00001.00000.00001.00.00001.00.0000
logistic0.90.20000.20.40000.20.40000.20.4000gbm1.00000.00001.00000.00001.00.00001.00.0000
ada0.90.20000.20.40000.20.40000.20.4000catboost1.00000.00001.00000.00001.00.00001.00.0000
knn0.80.24490.0xgboost1.00000.00000.01.00000.00000.01.00.00001.00.0000
svm0.93810.07620.86670.16331.00.00000.80.24490.00.00000.00.00000.0
knn0.90950.07440.88000.09800.80.16331.00.0000
mlp0.80.40000.40.48990.40.90950.11700.60000.48990.40.60.4899
xgboost0.80.24490.00.00000.00.00000.00.00000.4000
lightgbm0.80.24490.00.68570.02330.00000.00000.00.0000
\n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", "
Accuracy0.5000
Precision (w)0.2500
Recall (w)0.5000
F1 (w)0.3333
Accuracy1.0000
Precision (w)1.0000
Recall (w)1.0000
F1 (w)1.0000
ROC AUC1.0000
\n", " \n", - "
\"confusion
\n", + "
\"confusion
\n", "

Classification report

\n", "
              precision    recall  f1-score   support\n",
        "\n",
-       "           0       0.50      1.00      0.67         1\n",
-       "           1       0.00      0.00      0.00         1\n",
+       "           0       1.00      1.00      1.00         6\n",
+       "           1       1.00      1.00      1.00         2\n",
        "\n",
-       "    accuracy                           0.50         2\n",
-       "   macro avg       0.25      0.50      0.33         2\n",
-       "weighted avg       0.25      0.50      0.33         2\n",
+       "    accuracy                           1.00         8\n",
+       "   macro avg       1.00      1.00      1.00         8\n",
+       "weighted avg       1.00      1.00      1.00         8\n",
        "
\n", "" ] @@ -15122,14 +16981,44 @@ " \n", " \n", " \n", + " 0\n", + " 0\n", + " 2.931267e-08\n", + " \n", + " \n", + " 0\n", + " 0\n", + " 4.718786e-05\n", + " \n", + " \n", + " 0\n", + " 0\n", + " 1.445187e-06\n", + " \n", + " \n", + " 0\n", + " 0\n", + " 4.718786e-05\n", + " \n", + " \n", + " 1\n", + " 1\n", + " 1.000000e+00\n", + " \n", + " \n", " 1\n", + " 1\n", + " 9.979979e-01\n", + " \n", + " \n", + " 0\n", " 0\n", - " 0.022933\n", + " 2.864653e-02\n", " \n", " \n", " 0\n", " 0\n", - " 0.000011\n", + " 2.931267e-08\n", " \n", " \n", "" @@ -15142,7 +17031,20 @@ "name": "stdout", "output_type": "stream", "text": [ - "Model from data['last_model'] saved to ./models/model.joblib\n", + "Saving model from data['last_model'] to ./models/model.joblib...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to save model: [Errno 2] No such file or directory: './models/model.joblib'\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "[MLPipeline] Model saved to ./models/model.joblib.\n", "[MLPipeline] ML pipeline completed successfully.\n" ] @@ -16048,175 +17950,7 @@ "execution_count": null, "id": "de68a877-5b52-4727-9606-4439152c4506", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
idcommand_nameargumentsexecution_timestampaffected_columnsoperation_statusmessagedb_nameuser_name
1missingaction=percent2025-10-29 12:31:10ALL_COLUMNSsuccess%missing action=percent examined 1 column(s); total_rows=5.test
2missingaction=percent2025-10-29 12:31:22ALL_COLUMNSsuccess%missing action=percent examined 19 column(s); total_rows=40.test
3dropmissingcolumns=salary2025-10-29 12:31:34salarysuccessDropped 0 row(s) with missing values (in-place). Updated last_select.test
4statsinclude=all2025-10-29 12:31:38ALL_COLUMNSsuccessStats computed for 19 column(s); total_rows=40; percentiles=; include=all.test
5fillmissingcolumns=age,salary strategy=median2025-10-29 12:31:43age\n", - "salarysuccessColumn 'age': filled missing with median=36.5.\n", - "Column 'salary': filled missing with median=82500.0.test
6outliersmethod=zscore z_thresh=2.52025-10-29 12:31:57emp_id\n", - "age\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagsuccessColumn 'emp_id': detected 0 outlier(s) using zscore.\n", - "Column 'age': detected 0 outlier(s) using zscore.\n", - "Column 'years_experience': detected 0 outlier(s) using zscore.\n", - "Column 'projects_completed': detected 0 outlier(s) using zscore.\n", - "Column 'avg_project_score': detected 0 outlier(s) using zscore.\n", - "Column 'certifications': detected 0 outlier(s) using zscore.\n", - "Column 'training_hours': detected 0 outlier(s) using zscore.\n", - "Column 'overtime_hours': detected 0 outlier(s) using zscore.\n", - "Column 'remote_ratio': detected 0 outlier(s) using zscore.\n", - "Column 'salary': detected 0 outlier(s) using zscore.\n", - "Column 'bonus': detected 0 outlier(s) using zscore.\n", - "Column 'satisfaction_score': detected 0 outlier(s) using zscore.\n", - "Column 'performance_rating': detected 0 outlier(s) using zscore.\n", - "Column 'potential_score': detected 0 outlier(s) using zscore.\n", - "Column 'attrition_flag': detected 0 outlier(s) using zscore.test
7dropoutliersmethod=zscore z_thresh=2.52025-10-29 12:32:05emp_id\n", - "age\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagsuccessColumn 'emp_id': detected 0 outlier(s) using zscore.\n", - "Column 'age': detected 0 outlier(s) using zscore.\n", - "Column 'years_experience': detected 0 outlier(s) using zscore.\n", - "Column 'projects_completed': detected 0 outlier(s) using zscore.\n", - "Column 'avg_project_score': detected 0 outlier(s) using zscore.\n", - "Column 'certifications': detected 0 outlier(s) using zscore.\n", - "Column 'training_hours': detected 0 outlier(s) using zscore.\n", - "Column 'overtime_hours': detected 0 outlier(s) using zscore.\n", - "Column 'remote_ratio': detected 0 outlier(s) using zscore.\n", - "Column 'salary': detected 0 outlier(s) using zscore.\n", - "Column 'bonus': detected 0 outlier(s) using zscore.\n", - "Column 'satisfaction_score': detected 0 outlier(s) using zscore.\n", - "Column 'performance_rating': detected 0 outlier(s) using zscore.\n", - "Column 'potential_score': detected 0 outlier(s) using zscore.\n", - "Column 'attrition_flag': detected 0 outlier(s) using zscore.test
8clipoutliersmethod=zscore z_thresh=2.0 2025-10-29 12:32:08emp_id\n", - "age\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagsuccessColumn 'emp_id': clipped 0 value(s) (bounds: -2.8809, 43.8809).\n", - "Column 'age': clipped 0 value(s) (bounds: 19.8406, 56.5594).\n", - "Column 'years_experience': clipped 0 value(s) (bounds: -4.7983, 31.1983).\n", - "Column 'projects_completed': clipped 0 value(s) (bounds: 3.5885, 30.8115).\n", - "Column 'avg_project_score': clipped 0 value(s) (bounds: 67.5459, 100.8341).\n", - "Column 'certifications': clipped 0 value(s) (bounds: -1.0255, 3.0255).\n", - "Column 'training_hours': clipped 0 value(s) (bounds: -1.7946, 51.7946).\n", - "Column 'overtime_hours': clipped 4 value(s) (bounds: -3.2563, 24.8563).\n", - "Column 'remote_ratio': clipped 0 value(s) (bounds: -0.1308, 0.8908).\n", - "Column 'salary': clipped 4 value(s) (bounds: 43482.8069, 119917.1931).\n", - "Column 'bonus': clipped 4 value(s) (bounds: -358.9297, 14358.9297).\n", - "Column 'satisfaction_score': clipped 4 value(s) (bounds: 5.5260, 10.3540).\n", - "Column 'performance_rating': clipped 4 value(s) (bounds: 1.0061, 6.1939).\n", - "Column 'potential_score': clipped 0 value(s) (bounds: 49.7801, 104.2199).\n", - "Column 'attrition_flag': clipped 0 value(s) (bounds: -0.6282, 1.2282).test
9encodemethod=label columns=department drop_original=false2025-10-29 13:04:21departmentsuccessMethod: label\n", - "Created columns:\n", - "department_lbl\n", - "\n", - "Details:\n", - "Column 'department': label-encoded -> department_lbl (unique_values=4)test
10normalizecolumns=age,salary feature_range=5,10 inplace=False2025-10-29 13:04:27age\n", - "salarysuccessFeature range: (5.0, 10.0)\n", - "\n", - "Details:\n", - "Normalized 2 column(s) to range (5.0, 10.0).\n", - "Stored in data['last_select_normalized'].test
11standardizecolumns=age,salary inplace=False2025-10-29 13:04:31age\n", - "salarysuccessStandardized 2 column(s) (mean=0, std=1).\n", - "Stored in data['last_select_standardized'].test
12splitdatatest_size=0.2 val_size=0.1 random_state=422025-10-29 13:04:35ALL_COLUMNSsuccesstrain_name=last_select_train, test_name=last_select_test, val_name=last_select_val\n", - "train_count=28, test_count=8, val_count=4\n", - "test_frac=0.2, val_frac=0.1, shuffle=True, random_state=42test
13encodemethod=label columns=department drop_original=true2025-10-29 13:22:48departmentsuccessMethod: label\n", - "Created columns:\n", - "department_lbl\n", - "\n", - "Details:\n", - "Column 'department': label-encoded -> department_lbl (unique_values=4)test
14encodemethod=label drop_original=true2025-10-29 13:23:03name\n", - "gender\n", - "education_levelsuccessMethod: label\n", - "Created columns:\n", - "name_lbl\n", - "gender_lbl\n", - "education_level_lbl\n", - "\n", - "Details:\n", - "Column 'name': label-encoded -> name_lbl (unique_values=10)\n", - "Column 'gender': label-encoded -> gender_lbl (unique_values=2)\n", - "Column 'education_level': label-encoded -> education_level_lbl (unique_values=4)test
15encodemethod=label drop_original=true2025-10-29 13:24:50name\n", - "department\n", - "gender\n", - "education_levelsuccessMethod: label\n", - "Created columns:\n", - "name_lbl\n", - "department_lbl\n", - "gender_lbl\n", - "education_level_lbl\n", - "\n", - "Details:\n", - "Column 'name': label-encoded -> name_lbl (unique_values=10)\n", - "Column 'department': label-encoded -> department_lbl (unique_values=4)\n", - "Column 'gender': label-encoded -> gender_lbl (unique_values=2)\n", - "Column 'education_level': label-encoded -> education_level_lbl (unique_values=4)test
16encodemethod=label drop_original=true2025-10-29 13:30:36name\n", - "department\n", - "gender\n", - "education_levelsuccessMethod: label\n", - "Created columns:\n", - "name_lbl\n", - "department_lbl\n", - "gender_lbl\n", - "education_level_lbl\n", - "\n", - "Details:\n", - "Column 'name': label-encoded -> name_lbl (unique_values=10)\n", - "Column 'department': label-encoded -> department_lbl (unique_values=4)\n", - "Column 'gender': label-encoded -> gender_lbl (unique_values=2)\n", - "Column 'education_level': label-encoded -> education_level_lbl (unique_values=4)test
17encodemethod=label drop_original=true2025-10-29 13:41:01name\n", - "department\n", - "gender\n", - "education_levelsuccessMethod: label\n", - "Created columns:\n", - "name_lbl\n", - "department_lbl\n", - "gender_lbl\n", - "education_level_lbl\n", - "\n", - "Details:\n", - "Column 'name': label-encoded -> name_lbl (unique_values=10)\n", - "Column 'department': label-encoded -> department_lbl (unique_values=4)\n", - "Column 'gender': label-encoded -> gender_lbl (unique_values=2)\n", - "Column 'education_level': label-encoded -> education_level_lbl (unique_values=4)test
18select_featurestarget=attrition_flag method=correlation k=5 problem=classification output_name=top_features2025-10-29 13:41:24overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_ratingsuccessSelected 5 features saved to data['top_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_ratingtest
19missing2025-10-29 14:23:11ALL_COLUMNSsuccess%missing action=show examined 19 column(s); total_rows=10.test
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "select * from magic_metadata;" ] diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py index cfdf393..383e1ae 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py @@ -201,7 +201,10 @@ def _ensure_metadata_table(self, kernel, db_name): operation_status VARCHAR(50), message TEXT, db_name VARCHAR(255), - user_name VARCHAR(255) + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) ); """ try: diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py index 25f1498..d1f0e12 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py @@ -303,7 +303,10 @@ def _ensure_metadata_table(self, kernel, db_name): operation_status VARCHAR(50), message TEXT, db_name VARCHAR(255), - user_name VARCHAR(255) + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) ); """ try: diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/stats.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/stats.py index 700e7ef..ebad038 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/stats.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/stats.py @@ -239,7 +239,10 @@ def _ensure_metadata_table(self, kernel, db_name): operation_status VARCHAR(50), message TEXT, db_name VARCHAR(255), - user_name VARCHAR(255) + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) ); """ try: diff --git 
a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/splitdata.py b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/splitdata.py index 381c1d1..09207dc 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/splitdata.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/splitdata.py @@ -175,7 +175,10 @@ def _ensure_metadata_table(self, kernel, db_name): operation_status VARCHAR(50), message TEXT, db_name VARCHAR(255), - user_name VARCHAR(255) + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) ); """ try: diff --git a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/ml_pipeline.py b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/ml_pipeline.py index 35bd4b4..f233092 100644 --- a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/ml_pipeline.py +++ b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/ml_pipeline.py @@ -8,6 +8,9 @@ import pandas as pd import numpy as np import json +import logging +import os +import re # Import the other pipeline stages (paths kept as in your original snippet) from mariadb_kernel.maria_magics.ml_commands.data_cleaning.missing import Missing @@ -19,6 +22,7 @@ from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.encode import Encode from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.normalize import Normalize from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.standardize import Standardize +from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.splitdata import SplitData # placeholder safety from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.splitdata import SplitData from mariadb_kernel.maria_magics.ml_commands.model_training.train_model import TrainModel from mariadb_kernel.maria_magics.ml_commands.model_training.evaluate_model import EvaluateModel @@ -26,6 +30,12 @@ from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.select_features import 
SelectFeatures from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.select_model import SelectModel +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + class MLPipeline(MariaMagic): """ @@ -90,16 +100,205 @@ def _send_html(self, kernel, df, title=None): def _send_message(self, kernel, channel, message): kernel._send_message(channel, f"[MLPipeline] {message}") + # -------------------- metadata / DB helpers (best-effort) -------------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return 
txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = 
self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # -------------------- end metadata helpers -------------------- + def execute(self, kernel, data): + # Prepare metadata context early + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + df = data.get("last_select") if df is None or df.empty: - self._send_message(kernel, "stderr", "No last_select found or DataFrame is empty.") + msg = "No last_select found or DataFrame is empty." + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False try: args = self.parse_args(self.args) except Exception as e: - self._send_message(kernel, "stderr", f"Error parsing arguments: {e}. Use key=value syntax.") + msg = f"Error parsing arguments: {e}. Use key=value syntax." 
+ self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False # Parse arguments @@ -111,16 +310,72 @@ def execute(self, kernel, data): # Validate required arguments if not target: - self._send_message(kernel, "stderr", "target argument is required (target=target_col).") + msg = "target argument is required (target=target_col)." + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False if not problem: - self._send_message(kernel, "stderr", "problem argument is required (problem=classification|regression).") + msg = "problem argument is required (problem=classification|regression)." + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False if problem not in ("classification", "regression"): - self._send_message(kernel, "stderr", "problem must be 'classification' or 'regression'.") + msg = "problem must be 'classification' or 'regression'." 
+ self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False if target not in df.columns: - self._send_message(kernel, "stderr", f"Target column '{target}' not found in DataFrame.") + msg = f"Target column '{target}' not found in DataFrame." + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False # Parse features or set to all columns except target if not provided @@ -130,18 +385,60 @@ def execute(self, kernel, data): elif isinstance(features_arg, (list, tuple)): features = list(features_arg) else: - self._send_message(kernel, "stderr", "features must be comma-separated string or list.") + msg = "features must be comma-separated string or list." + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False else: features = [col for col in df.columns if col != target] if not features: - self._send_message(kernel, "stderr", "No features available after excluding target column.") + msg = "No features available after excluding target column." 
+ self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False # Validate features missing = [c for c in features if c not in df.columns] if missing: - self._send_message(kernel, "stderr", f"Missing feature columns in DataFrame: {', '.join(missing)}") + msg = f"Missing feature columns in DataFrame: {', '.join(missing)}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False # Set defaults @@ -174,12 +471,40 @@ def execute(self, kernel, data): DropMissing(drop_args).execute(kernel, data) cur_df = data.get("last_select") if cur_df is None or cur_df.empty: - self._send_message(kernel, "stderr", "DataFrame is empty after dropping missing values.") + msg = "DataFrame is empty after dropping missing values." 
+ self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=drop_args, + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False # Refresh working_df reference after cleaning working_df = cur_df except Exception as e: - self._send_message(kernel, "stderr", f"Error handling missing values: {e}") + msg = f"Error handling missing values: {e}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False # Step 2: Encode categorical features @@ -187,7 +512,7 @@ def execute(self, kernel, data): # Recompute cat_columns on current working_df cat_columns = [c for c in features if c in working_df.columns and working_df[c].dtype in ["object", "category"]] if cat_columns: - encode_args = f"method={encode_method} columns={','.join(cat_columns)} inplace=True drop_original=True" + encode_args = f"method={encode_method} columns={','.join(cat_columns)} inplace=True drop_original=True mode=apply confirm=true" # reset any previous encoder data["last_select_encoder"] = None Encode(encode_args).execute(kernel, data) @@ -198,7 +523,21 @@ def execute(self, kernel, data): if encode_method == "onehot": encoder = data.get("last_select_encoder") if not encoder: - self._send_message(kernel, "stderr", "Encoder not found after encoding. Ensure %encode saves the encoder to data['last_select_encoder'].") + msg = "Encoder not found after encoding. Ensure %encode saves the encoder to data['last_select_encoder']." 
+ self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=encode_args, + affected_columns=",".join(cat_columns), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False try: # get_feature_names_out may require passing the original column names @@ -222,7 +561,21 @@ def execute(self, kernel, data): feature_names = [str(fn) for fn in feature_names] features = [c for c in features if c not in cat_columns] + feature_names except Exception as e: - self._send_message(kernel, "stderr", f"Failed to retrieve encoded feature names: {e}") + msg = f"Failed to retrieve encoded feature names: {e}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=encode_args, + affected_columns=",".join(cat_columns), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False elif encode_method == "label": @@ -243,12 +596,40 @@ def execute(self, kernel, data): related_columns = [] for c in cat_columns: related_columns += [col for col in working_df.columns if col.startswith(c + "_") or col.startswith(c + "_lbl") or col.startswith(c + "_ord")] - self._send_message(kernel, "stderr", f"Encoded features not found in DataFrame: {', '.join(missing_encoded)}") + msg = f"Encoded features not found in DataFrame: {', '.join(missing_encoded)}" + self._send_message(kernel, "stderr", msg) if related_columns: self._send_message(kernel, "stderr", f"Available related columns: {', '.join(related_columns)}") + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=encode_args, + affected_columns=",".join(missing_encoded), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False except Exception as e: - 
self._send_message(kernel, "stderr", f"Error during encoding: {e}") + msg = f"Error during encoding: {e}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False # Step 3: Feature selection (if features not provided) @@ -258,16 +639,58 @@ def execute(self, kernel, data): SelectFeatures(select_features_args).execute(kernel, data) features = data.get("selected_features", []) if not features: - self._send_message(kernel, "stderr", "Feature selection failed to return features.") + msg = "Feature selection failed to return features." + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=select_features_args, + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False # Verify selected features exist working_df = data.get("last_select", working_df) missing_features = [f for f in features if f not in working_df.columns] if missing_features: - self._send_message(kernel, "stderr", f"Selected features not found in DataFrame: {', '.join(missing_features)}") + msg = f"Selected features not found in DataFrame: {', '.join(missing_features)}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=select_features_args, + affected_columns=",".join(missing_features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False except Exception as e: - self._send_message(kernel, "stderr", f"Error during feature selection: {e}") + msg = 
f"Error during feature selection: {e}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False # Step 4: Scale numeric features @@ -278,10 +701,23 @@ def execute(self, kernel, data): scale_args = f"columns={','.join(num_columns)} inplace=True" Standardize(scale_args).execute(kernel, data) except Exception as e: - self._send_message(kernel, "stderr", f"Error during scaling: {e}") + msg = f"Error during scaling: {e}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=scale_args if 'scale_args' in locals() else (self.args if isinstance(self.args, str) else str(self.args)), + affected_columns=",".join(num_columns) if 'num_columns' in locals() else (",".join(features) if 'features' in locals() else ""), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False - # Step 5: Split data # Step 5: Split data try: split_args = f"test_size={test_size} val_size={val_size} shuffle={shuffle} " \ @@ -298,11 +734,39 @@ def execute(self, kernel, data): test_df = data.get(test_name) if train_df is None or train_df.empty or test_df is None or test_df.empty: - self._send_message(kernel, "stderr", "Data splitting failed to produce non-empty train/test sets.") + msg = "Data splitting failed to produce non-empty train/test sets." 
+ self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=split_args, + affected_columns=",".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False except Exception as e: - self._send_message(kernel, "stderr", f"Error during data splitting: {e}") + msg = f"Error during data splitting: {e}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=split_args if 'split_args' in locals() else (self.args if isinstance(self.args, str) else str(self.args)), + affected_columns=",".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False # Step 6: Model selection or training @@ -327,14 +791,39 @@ def execute(self, kernel, data): # Validate model creation model_obj = data.get(model_store_name) if model_obj is None: - self._send_message( - kernel, "stderr", - f"No model object created. Ensure SelectModel or TrainModel supports problem='{problem}'." - ) + msg = f"No model object created. Ensure SelectModel or TrainModel supports problem='{problem}'." 
+ self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False except Exception as e: - self._send_message(kernel, "stderr", f"Error during model training/selection: {e}") + msg = f"Error during model training/selection: {e}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False # Step 7: Evaluate model @@ -342,25 +831,86 @@ def execute(self, kernel, data): eval_args = f"model_name={model_store_name} test_name={test_name} problem={problem}" EvaluateModel(eval_args).execute(kernel, data) except Exception as e: - self._send_message(kernel, "stderr", f"Error during model evaluation: {e}") + msg = f"Error during model evaluation: {e}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=eval_args if 'eval_args' in locals() else (self.args if isinstance(self.args, str) else str(self.args)), + affected_columns=",".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False - # Step 8: Save model if requested # Step 8: Save model if requested if save_path: try: # Ensure correct key for SaveModel command - save_args = f"model_name={model_store_name} save_path={save_path}" + save_args = 
f"model_name_in_data={model_store_name} save_path={save_path}" SaveModel(save_args).execute(kernel, data) self._send_message(kernel, "stdout", f"Model saved to {save_path}.") except Exception as e: - self._send_message(kernel, "stderr", f"Error saving model: {e}") + msg = f"Error saving model: {e}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=save_args if 'save_args' in locals() else (self.args if isinstance(self.args, str) else str(self.args)), + affected_columns=model_store_name, + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False else: - self._send_message(kernel, "stderr", "You must provide save_path=/path/to/file.joblib") + msg = "You must provide save_path=/path/to/file.joblib" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return False + # Summary and success metadata + success_msg = "ML pipeline completed successfully." 
+ self._send_message(kernel, "stdout", success_msg) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = ",".join(features) if 'features' in locals() else "" + message_str = f"{success_msg} model={model_store_name} saved_to={save_path}" + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status="success", + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + self._send_message(kernel, "stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass - # Summary - self._send_message(kernel, "stdout", "ML pipeline completed successfully.") return True diff --git a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_features.py b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_features.py index f2224bd..6aca48a 100644 --- a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_features.py +++ b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_features.py @@ -186,7 +186,10 @@ def _ensure_metadata_table(self, kernel, db_name): operation_status VARCHAR(50), message TEXT, db_name VARCHAR(255), - user_name VARCHAR(255) + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) ); """ try: @@ -402,7 +405,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(features), + affected_columns="\n".join(features), operation_status="error", message=msg, db_name=db_name, @@ -420,7 +423,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(features), + affected_columns="\n".join(features), operation_status="error", message=msg, 
db_name=db_name, @@ -443,7 +446,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(features), + affected_columns="\n".join(features), operation_status="error", message=msg, db_name=db_name, @@ -509,7 +512,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(features), + affected_columns="\n".join(features), operation_status="error", message=msg, db_name=db_name, @@ -526,7 +529,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(features), + affected_columns="\n".join(features), operation_status="error", message=msg, db_name=db_name, @@ -587,7 +590,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(features), + affected_columns="\n".join(features), operation_status="error", message=msg, db_name=db_name, @@ -605,7 +608,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(features), + affected_columns="\n".join(features), operation_status="error", message=msg, db_name=db_name, @@ -633,7 +636,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(selected_features) if 'selected_features' in locals() else "", + affected_columns="\n".join(selected_features) if 'selected_features' in locals() else "", operation_status="error", message=msg, db_name=db_name, @@ -655,7 +658,7 @@ def execute(self, kernel, data): # Insert metadata 
(best-effort) try: args_for_db = self.args if isinstance(self.args, str) else str(self.args) - affected_columns_str = ",".join(selected_features) + affected_columns_str = "\n".join(selected_features) message_str = success_msg self._insert_metadata( kernel=kernel, diff --git a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py index 23edc02..bd16bdd 100644 --- a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py +++ b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py @@ -12,6 +12,9 @@ from sklearn.svm import SVC from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor from sklearn.neural_network import MLPClassifier, MLPRegressor +import logging +import os +import re # Optional external libraries _XGBOOST_AVAILABLE = False @@ -35,6 +38,13 @@ except Exception: pass +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + + class SelectModel(MariaMagic): """ %select_model target=target_col @@ -99,6 +109,156 @@ def _send_html(self, kernel, df, title=None): except Exception: pass + # -------------------- small utilities for metadata -------------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. 
None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return 
+ table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # -------------------- end metadata helpers -------------------- + def _choose_model(self, name, problem, params=None): p = params or {} name = 
name.lower() @@ -149,16 +309,55 @@ def _choose_model(self, name, problem, params=None): raise ValueError(f"Unknown model name '{name}'") def execute(self, kernel, data): + # Prepare metadata context early + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + # Load training DataFrame df = data.get("last_select") if df is None or df.empty: - kernel._send_message("stderr", "No last_select found or DataFrame is empty.") + msg = "No last_select found or DataFrame is empty." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return try: args = self.parse_args(self.args) except Exception: - kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + msg = "Error parsing arguments. Use key=value syntax." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return features_arg = args.get("features") @@ -171,14 +370,42 @@ def execute(self, kernel, data): model_params = args.get("model_params", {}) or {} if not target: - kernel._send_message("stderr", "target argument is required (target=target_col).") + msg = "target argument is required (target=target_col)." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Use selected_features if features not provided if not features_arg: features = data.get("selected_features") if not features: - kernel._send_message("stderr", "No features provided and no selected_features found. Run %select_features first.") + msg = "No features provided and no selected_features found. Run %select_features first." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return else: if isinstance(features_arg, str): @@ -186,19 +413,61 @@ def execute(self, kernel, data): elif isinstance(features_arg, (list, tuple)): features = list(features_arg) else: - kernel._send_message("stderr", "features must be comma-separated string or list.") + msg = "features must be comma-separated string or list." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return missing = [c for c in features + [target] if c not in df.columns] if missing: - kernel._send_message("stderr", f"Missing columns in DataFrame: {', '.join(missing)}") + msg = f"Missing columns in DataFrame: {', '.join(missing)}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Determine problem type if problem_override: problem = problem_override.lower() if problem not in ("classification", "regression"): - kernel._send_message("stderr", "problem must be 'classification' or 'regression'.") + msg = "problem must be 'classification' or 'regression'." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return else: tgt_ser = df[target] @@ -235,7 +504,21 @@ def execute(self, kernel, data): if primary_metric is None: primary_metric = "accuracy" if problem == "classification" else "r2" if primary_metric not in metrics[problem]: - kernel._send_message("stderr", f"Invalid primary_metric '{primary_metric}' for {problem}. Choose from {', '.join(metrics[problem])}.") + msg = f"Invalid primary_metric '{primary_metric}' for {problem}. 
Choose from {', '.join(metrics[problem])}." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Prepare data @@ -245,7 +528,21 @@ def execute(self, kernel, data): # Handle missing values X = X.fillna(X.mean(numeric_only=True)) if problem == "regression" else X.fillna(X.mode().iloc[0]) if X.isna().any().any(): - kernel._send_message("stderr", "Features contain non-numeric data or unhandled missing values.") + msg = "Features contain non-numeric data or unhandled missing values." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Evaluate models across all metrics @@ -290,11 +587,26 @@ def execute(self, kernel, data): best_model_name = model_name except Exception as e: + # Log the model-level failure but continue with other models kernel._send_message("stderr", f"Error evaluating model '{model_name}': {e}") continue if not results: - kernel._send_message("stderr", "No models were successfully evaluated.") + msg = "No models were successfully evaluated." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Create results DataFrame @@ -308,7 +620,21 @@ def execute(self, kernel, data): try: best_model.fit(X, y) except Exception as e: - kernel._send_message("stderr", f"Error fitting best model '{best_model_name}': {e}") + msg = f"Error fitting best model '{best_model_name}': {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Store the best model and metadata @@ -327,11 +653,47 @@ def execute(self, kernel, data): if hasattr(best_model, "classes_"): data[output_name + "_meta"]["classes"] = list(getattr(best_model, "classes_")) except Exception as e: - kernel._send_message("stderr", f"Error storing best model: {e}") + msg = f"Error storing best model: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Display results self._send_html(kernel, result_df, title=f"Model Selection Results (primary_metric={primary_metric})") - kernel._send_message("stdout", f"Best model '{best_model_name}' (mean {primary_metric}={best_score:.4f}) saved to data['{output_name}'].") + success_msg = f"Best model 
'{best_model_name}' (mean {primary_metric}={best_score:.4f}) saved to data['{output_name}']." + kernel._send_message("stdout", success_msg) + + # Insert metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "\n".join(features) + message_str = success_msg + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status="success", + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass - return \ No newline at end of file + return diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py b/mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py index d7dfd9c..5974f4f 100644 --- a/mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py @@ -5,7 +5,7 @@ from distutils import util import logging from mariadb_kernel.maria_magics.maria_magic import MariaMagic - +import os def _str_to_obj(s): try: @@ -68,6 +68,8 @@ def execute(self, kernel, data): save_path = args.get("save_path") overwrite = bool(args.get("overwrite", False)) + + if not save_path: kernel._send_message("stderr", "You must provide save_path=/path/to/file.joblib") return @@ -84,6 +86,7 @@ def execute(self, kernel, data): return try: + os.makedirs(os.path.dirname(save_path), exist_ok=True) joblib.dump(model_obj, save_path) kernel._send_message("stdout", f"Model from data['{model_key}'] saved to {save_path}") except Exception as e: From 7cf5d02396ed403207341ce000a5b1474dea5c55 Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Thu, 30 Oct 2025 09:40:40 +0000 Subject: [PATCH 31/38] All loggings in ml_pipeline completed --- Untitled.ipynb | 597 +++++++++++++++++- 
.../ml_commands/ml_pipeline/ml_pipeline.py | 34 +- 2 files changed, 613 insertions(+), 18 deletions(-) diff --git a/Untitled.ipynb b/Untitled.ipynb index e60867e..9bb72bf 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -17950,7 +17950,602 @@ "execution_count": null, "id": "de68a877-5b52-4727-9606-4439152c4506", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
idcommand_nameargumentsexecution_timestampaffected_columnsoperation_statusmessagedb_nameuser_namerollback_tokenbackup_tableoriginal_table
1dropmissingcolumns=department mode=preview2025-10-29 15:22:49departmentpreviewpreview_dropped=1testNULLNULLNULL
2dropmissingcolumns=department table=test.employees mode=apply confirm=true 2025-10-29 15:22:49departmentappliedapplied_backup=test.employees_backup_73b9e64a9b8c4045test73b9e64a9b8c4045test.employees_backup_73b9e64a9b8c4045test.employees
3dropmissingmode=rollback rollback_token=73b9e64a9b8c40452025-10-29 15:23:00ALL_COLUMNSrollbackrestored_to=test.employees;previous_saved_as=test.employees_prerollback_73b9e64a9b8c4045test73b9e64a9b8c4045test.employees_backup_73b9e64a9b8c4045test.employees
4fillmissingcolumns=years_experience strategy=median mode=preview2025-10-29 15:36:42years_experiencepreviewpreview_computed_fill_valuestestNULLNULLNULL
5fillmissingcolumns=years_experience strategy=median table=test.employees mode=apply confirm=true 2025-10-29 15:37:18years_experienceappliedapplied_backup=test.employees_backup_9407f2e1e7db47b2test9407f2e1e7db47b2test.employees_backup_9407f2e1e7db47b2test.employees
6fillmissingmode=rollback rollback_token=9407f2e1e7db47b22025-10-29 15:39:07emp_id\n", + "name\n", + "department\n", + "age\n", + "gender\n", + "education_level\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagrollbackrestored_to=test.employees;previous_saved_as=test.employees_prerollback_9407f2e1e7db47b2test9407f2e1e7db47b2test.employees_backup_9407f2e1e7db47b2test.employees
7outliers2025-10-29 15:54:23errorNo numeric target columns found to detect outliers.testNULLNULLNULL
8outliers2025-10-29 15:54:44emp_id\n", + "age\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagsuccessColumn 'emp_id': detected 0 outlier(s) using iqr.\n", + "Column 'age': detected 0 outlier(s) using iqr.\n", + "Column 'years_experience': detected 0 outlier(s) using iqr.\n", + "Column 'projects_completed': detected 0 outlier(s) using iqr.\n", + "Column 'avg_project_score': detected 0 outlier(s) using iqr.\n", + "Column 'certifications': detected 0 outlier(s) using iqr.\n", + "Column 'training_hours': detected 0 outlier(s) using iqr.\n", + "Column 'overtime_hours': detected 0 outlier(s) using iqr.\n", + "Column 'remote_ratio': detected 0 outlier(s) using iqr.\n", + "Column 'salary': detected 1 outlier(s) using iqr.\n", + "Column 'bonus': detected 0 outlier(s) using iqr.\n", + "Column 'satisfaction_score': detected 0 outlier(s) using iqr.\n", + "Column 'performance_rating': detected 0 outlier(s) using iqr.\n", + "Column 'potential_score': detected 0 outlier(s) using iqr.\n", + "Column 'attrition_flag': detected 0 outlier(s) using iqr.testNULLNULLNULL
9dropoutliersmode=preview2025-10-29 16:00:10emp_id\n", + "age\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagpreviewpreview_completedtestNULLNULLNULL
10dropoutlierstable=test.employees mode=apply confirm=true 2025-10-29 16:00:58emp_id\n", + "age\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagappliedapplied_db_versionedtest53731015c85a478atest.employees_backup_53731015c85a478atest.employees
11dropoutliersmode=rollback rollback_token=53731015c85a478a2025-10-29 16:02:06emp_id\n", + "age\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagrollbackrestored_to=test.employees;previous_saved_as=test.employees_prerollback_53731015c85a478atest53731015c85a478atest.employees_backup_53731015c85a478atest.employees
12clipoutliersmode=preview2025-10-29 17:01:54emp_id\n", + "age\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagpreviewpreview_completedtestNULLNULLNULL
13clipoutlierstable=test.employees mode=apply confirm=true 2025-10-29 17:02:22emp_id\n", + "age\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagappliedapplied_backup=test.employees_backup_656479791b1d48fctest656479791b1d48fctest.employees_backup_656479791b1d48fctest.employees
14clipoutliersmode=rollback rollback_token=656479791b1d48fc2025-10-29 17:04:03emp_id\n", + "age\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagrollbackrestored_to=test.employees;previous_saved_as=test.employees_prerollback_656479791b1d48fctest656479791b1d48fctest.employees_backup_656479791b1d48fctest.employees
15encodemethod=onehot columns=department drop_original=false mode=preview2025-10-29 17:21:19departmenterrorColumn(s) not found: departmenttestNULLNULLNULL
16encodemethod=onehot columns=department drop_original=false mode=preview2025-10-29 17:21:42departmentpreviewpreview_completedtestNULLNULLNULL
17encodemethod=onehot columns=department drop_original=false table=test.employees mode=apply confirm=true 2025-10-29 17:24:34departmentappliedapplied_backup=test.employees_backup_8d3413bf829d4cd8;created_columns=5test8d3413bf829d4cd8test.employees_backup_8d3413bf829d4cd8test.employees
18encodemode=rollback rollback_token=8d3413bf829d4cd82025-10-29 17:26:10name\n", + "department\n", + "gender\n", + "education_levelrollbackrestored_to=test.employees;previous_saved_as=test.employees_prerollback_8d3413bf829d4cd8test8d3413bf829d4cd8test.employees_backup_8d3413bf829d4cd8test.employees
19encodemethod=onehot columns=department drop_original=false 2025-10-29 17:32:31departmentpreviewpreview_completedtestNULLNULLNULL
20normalizecolumns=age,salary feature_range=5,10 mode=preview2025-10-29 17:44:41age\n", + "salarypreviewpreview_completedtestNULLNULLNULL
21normalize feature_range=5,10 mode=preview2025-10-29 17:45:01emp_id\n", + "age\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagpreviewpreview_completedtestNULLNULLNULL
22normalizefeature_range=5,10 table=test.employees mode=apply confirm=true 2025-10-29 17:47:11emp_id\n", + "age\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagappliedapplied_backup=test.employees_backup_daf864252a6c46f1;details=emp_id: min=1.0, max=10.0|age: min=27.0, max=55.0|years_experience: min=3.0, max=30.0|projects_completed: min=8.0, max=30.0|avg_project_score: min=68.9, max=95.2|certifications: min=0.0, max=3.0|training_hours: min=5.0, max=50.0|overtime_hours: min=2.0, max=25.0|remote_ratio: min=0.0, max=0.8|salary: min=55000.0, max=1200000.0|bonus: min=300.0, max=15000.0|satisfaction_score: min=5.5, max=9.6|performance_rating: min=1.0, max=5.0|potential_score: min=50.0, max=95.0|attrition_flag: min=0.0, max=1.0testdaf864252a6c46f1test.employees_backup_daf864252a6c46f1test.employees
23normalizemode=rollback rollback_token=daf864252a6c46f12025-10-29 17:48:30emp_id\n", + "age\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagrollbackrestored_to=test.employees;previous_saved_as=test.employees_prerollback_daf864252a6c46f1testdaf864252a6c46f1test.employees_backup_daf864252a6c46f1test.employees
24standardizemode=preview2025-10-29 17:56:44emp_id\n", + "age\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagpreviewpreview_completedtestNULLNULLNULL
25standardizetable=test.employees mode=apply confirm=true 2025-10-29 17:57:27emp_id\n", + "age\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagappliedapplied_backup=test.employees_backup_f15a883f004548a8;details=emp_id: mean=5.5, std=2.8723|age: mean=38.2, std=9.0642|years_experience: mean=13.5556, std=9.2989|projects_completed: mean=17.2, std=6.7201|avg_project_score: mean=84.19, std=8.217354|certifications: mean=1.0, std=1.0|training_hours: mean=25.0, std=13.2288|overtime_hours: mean=10.8, std=6.9397|remote_ratio: mean=0.38, std=0.25219|salary: mean=189700.0, std=337053.126376|bonus: mean=6730.0, std=4002.011994|satisfaction_score: mean=7.94, std=1.191805|performance_rating: mean=3.6, std=1.2806|potential_score: mean=77.0, std=13.43875|attrition_flag: mean=0.3, std=0.4583testf15a883f004548a8test.employees_backup_f15a883f004548a8test.employees
26standardizemode=rollback rollback_token=f15a883f004548a82025-10-29 17:58:34emp_id\n", + "age\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagrollbackrestored_to=test.employees;previous_saved_as=test.employees_prerollback_f15a883f004548a8testf15a883f004548a8test.employees_backup_f15a883f004548a8test.employees
27encodemethod=label drop_original=true2025-10-29 18:21:31name\n", + "department\n", + "gender\n", + "education_levelpreviewpreview_completedtestNULLNULLNULL
28select_featurestarget=attrition_flag method=correlation k=5 problem=classification output_name=top_features2025-10-29 18:22:01emp_id,name,department,age,gender,education_level,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score,performance_rating,potential_scoreerrorError during feature selection: could not convert string to float: 'Alice'testNULLNULLNULL
29encodemethod=label drop_original=true2025-10-29 18:22:26name\n", + "department\n", + "gender\n", + "education_levelpreviewpreview_completedtestNULLNULLNULL
30encodemethod=label drop_original=true mode=apply2025-10-29 18:22:48name\n", + "department\n", + "gender\n", + "education_levelsuccessMethod: label\n", + "Created columns:\n", + "name_lbl\n", + "department_lbl\n", + "gender_lbl\n", + "education_level_lbl\n", + "\n", + "Details:\n", + "Column 'name': label-encoded -> name_lbl (unique_values=10)\n", + "Column 'department': label-encoded -> department_lbl (unique_values=4)\n", + "Column 'gender': label-encoded -> gender_lbl (unique_values=2)\n", + "Column 'education_level': label-encoded -> education_level_lbl (unique_values=4)testNULLNULLNULL
31select_featurestarget=attrition_flag method=correlation k=5 problem=classification 2025-10-29 18:23:26overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_ratingsuccessSelected 5 features saved to data['selected_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_ratingtestNULLNULLNULL
32select_modeltarget=attrition_flag problem=classification2025-10-29 18:24:09overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_ratingsuccessBest model 'logistic' (mean accuracy=1.0000) saved to data['best_model'].testNULLNULLNULL
33select_modeltarget=attrition_flag problem=classification2025-10-29 18:36:07errorNo features provided and no selected_features found. Run %select_features first.testNULLNULLNULL
34dropmissingcolumns=emp_id,name,department,age,gender,education_level,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score,performance_rating,potential_score,attrition_flag2025-10-29 18:36:38emp_id\n", + "name\n", + "department\n", + "age\n", + "gender\n", + "education_level\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagpreviewpreview_dropped=12testNULLNULLNULL
35encodemethod=onehot columns=name,department,gender,education_level inplace=True drop_original=True2025-10-29 18:36:38name\n", + "department\n", + "gender\n", + "education_levelpreviewpreview_completedtestNULLNULLNULL
36mlpipelinemethod=onehot columns=name,department,gender,education_level inplace=True drop_original=True2025-10-29 18:36:39name,department,gender,education_levelerrorEncoder not found after encoding. Ensure %encode saves the encoder to data['last_select_encoder'].testNULLNULLNULL
37dropmissingcolumns=emp_id,name,department,age,gender,education_level,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score,performance_rating,potential_score,attrition_flag2025-10-29 18:48:09emp_id\n", + "name\n", + "department\n", + "age\n", + "gender\n", + "education_level\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagpreviewpreview_dropped=12testNULLNULLNULL
38encodemethod=onehot columns=name,department,gender,education_level inplace=True drop_original=True2025-10-29 18:48:09name\n", + "department\n", + "gender\n", + "education_levelpreviewpreview_completedtestNULLNULLNULL
39mlpipelinemethod=onehot columns=name,department,gender,education_level inplace=True drop_original=True2025-10-29 18:48:09name_Alice,name_Bob,name_Charlie,name_Diana,name_Eve,name_Frank,name_Grace,name_Henry,name_Ivy,name_Jack,department_HR,department_Engineering,department_Sales,department_nan,department_Finance,gender_F,gender_M,education_level_nan,education_level_Masters,education_level_Bachelors,education_level_PhD,education_level_High SchoolerrorEncoded features not found in DataFrame: name_Alice, name_Bob, name_Charlie, name_Diana, name_Eve, name_Frank, name_Grace, name_Henry, name_Ivy, name_Jack, department_HR, department_Engineering, department_Sales, department_nan, department_Finance, gender_F, gender_M, education_level_nan, education_level_Masters, education_level_Bachelors, education_level_PhD, education_level_High SchooltestNULLNULLNULL
40dropmissingcolumns=emp_id,name,department,age,gender,education_level,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score,performance_rating,potential_score,attrition_flag2025-10-29 18:57:36emp_id\n", + "name\n", + "department\n", + "age\n", + "gender\n", + "education_level\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagpreviewpreview_dropped=12testNULLNULLNULL
41encodemethod=onehot columns=name,department,gender,education_level inplace=True drop_original=True mode=apply confirm=true2025-10-29 18:57:36name\n", + "department\n", + "gender\n", + "education_levelsuccessMethod: onehot\n", + "Created columns:\n", + "name_Alice\n", + "name_Bob\n", + "name_Charlie\n", + "name_Diana\n", + "name_Eve\n", + "name_Frank\n", + "name_Grace\n", + "name_Henry\n", + "name_Ivy\n", + "name_Jack\n", + "department_Engineering\n", + "department_Finance\n", + "department_HR\n", + "department_Sales\n", + "department____MISSING___\n", + "gender_F\n", + "gender_M\n", + "education_level_Bachelors\n", + "education_level_High School\n", + "education_level_Masters\n", + "education_level_PhD\n", + "education_level____MISSING___\n", + "\n", + "Details:\n", + "Columns ['name', 'department', 'gender', 'education_level'] one-hot encoded -> created 22 columns.testNULLNULLNULL
42select_featurestarget=attrition_flag method=correlation k=5 problem=classification inplace=True2025-10-29 18:57:36overtime_hours\n", + "avg_project_score\n", + "satisfaction_score\n", + "potential_score\n", + "performance_ratingsuccessSelected 5 features saved to data['selected_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_ratingtestNULLNULLNULL
43standardizecolumns=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating inplace=True2025-10-29 18:57:37overtime_hours\n", + "avg_project_score\n", + "satisfaction_score\n", + "potential_score\n", + "performance_ratingpreviewpreview_completedtestNULLNULLNULL
44splitdatatest_size=0.2 val_size=0.0 shuffle=True train_name=last_select_train test_name=last_select_test val_name=last_select_val inplace=True stratify=attrition_flag2025-10-29 18:57:37attrition_flagsuccesstrain_name=last_select_train, test_name=last_select_test, val_name=last_select_val\n", + "train_count=32, test_count=8, val_count=0\n", + "test_frac=0.2, val_frac=0.0, shuffle=True, random_state=NonetestNULLNULLNULL
45select_modelfeatures=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating target=attrition_flag cv=5 primary_metric=accuracy problem=classification output_name=last_model inplace=True2025-10-29 18:57:52overtime_hours\n", + "avg_project_score\n", + "satisfaction_score\n", + "potential_score\n", + "performance_ratingsuccessBest model 'logistic' (mean accuracy=1.0000) saved to data['last_model'].testNULLNULLNULL
46mlpipelinetarget=attrition_flag problem=classification save_path=./models/model.joblib2025-10-29 18:57:52overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_ratingsuccessML pipeline completed successfully. model=last_model saved_to=./models/model.joblibtestNULLNULLNULL
47mlpipelinetarget=attrition_flag problem=classification save_path=./models/model.joblib2025-10-29 19:04:49errorTarget column 'attrition_flag' not found in DataFrame.testNULLNULLNULL
48dropmissingcolumns=emp_id,name,department,age,gender,education_level,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score,performance_rating,potential_score,attrition_flag2025-10-29 19:05:13emp_id\n", + "name\n", + "department\n", + "age\n", + "gender\n", + "education_level\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagpreviewpreview_dropped=12testNULLNULLNULL
49encodemethod=onehot columns=name,department,gender,education_level inplace=True drop_original=True mode=apply confirm=true2025-10-29 19:05:13name\n", + "department\n", + "gender\n", + "education_levelsuccessMethod: onehot\n", + "Created columns:\n", + "name_Alice\n", + "name_Bob\n", + "name_Charlie\n", + "name_Diana\n", + "name_Eve\n", + "name_Frank\n", + "name_Grace\n", + "name_Henry\n", + "name_Ivy\n", + "name_Jack\n", + "department_Engineering\n", + "department_Finance\n", + "department_HR\n", + "department_Sales\n", + "department____MISSING___\n", + "gender_F\n", + "gender_M\n", + "education_level_Bachelors\n", + "education_level_High School\n", + "education_level_Masters\n", + "education_level_PhD\n", + "education_level____MISSING___\n", + "\n", + "Details:\n", + "Columns ['name', 'department', 'gender', 'education_level'] one-hot encoded -> created 22 columns.testNULLNULLNULL
50select_featurestarget=attrition_flag method=correlation k=5 problem=classification inplace=True2025-10-29 19:05:14overtime_hours\n", + "avg_project_score\n", + "satisfaction_score\n", + "potential_score\n", + "performance_ratingsuccessSelected 5 features saved to data['selected_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_ratingtestNULLNULLNULL
51standardizecolumns=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating inplace=True2025-10-29 19:05:14overtime_hours\n", + "avg_project_score\n", + "satisfaction_score\n", + "potential_score\n", + "performance_ratingpreviewpreview_completedtestNULLNULLNULL
52splitdatatest_size=0.2 val_size=0.0 shuffle=True train_name=last_select_train test_name=last_select_test val_name=last_select_val inplace=True stratify=attrition_flag2025-10-29 19:05:14attrition_flagsuccesstrain_name=last_select_train, test_name=last_select_test, val_name=last_select_val\n", + "train_count=32, test_count=8, val_count=0\n", + "test_frac=0.2, val_frac=0.0, shuffle=True, random_state=NonetestNULLNULLNULL
53select_modelfeatures=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating target=attrition_flag cv=5 primary_metric=accuracy problem=classification output_name=last_model inplace=True2025-10-29 19:05:27overtime_hours\n", + "avg_project_score\n", + "satisfaction_score\n", + "potential_score\n", + "performance_ratingsuccessBest model 'logistic' (mean accuracy=1.0000) saved to data['last_model'].testNULLNULLNULL
54mlpipelinetarget=attrition_flag problem=classification save_path=./models/model.joblib2025-10-29 19:05:27overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_ratingsuccessML pipeline completed successfully. model=last_model saved_to=./models/model.joblibtestNULLNULLNULL
55dropmissingcolumns=emp_id,name,department,age,gender,education_level,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score,performance_rating,potential_score,attrition_flag2025-10-29 19:08:25emp_id\n", + "name\n", + "department\n", + "age\n", + "gender\n", + "education_level\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagpreviewpreview_dropped=12testNULLNULLNULL
56encodemethod=onehot columns=name,department,gender,education_level inplace=True drop_original=True mode=apply confirm=true2025-10-29 19:08:25name\n", + "department\n", + "gender\n", + "education_levelsuccessMethod: onehot\n", + "Created columns:\n", + "name_Alice\n", + "name_Bob\n", + "name_Charlie\n", + "name_Diana\n", + "name_Eve\n", + "name_Frank\n", + "name_Grace\n", + "name_Henry\n", + "name_Ivy\n", + "name_Jack\n", + "department_Engineering\n", + "department_Finance\n", + "department_HR\n", + "department_Sales\n", + "department____MISSING___\n", + "gender_F\n", + "gender_M\n", + "education_level_Bachelors\n", + "education_level_High School\n", + "education_level_Masters\n", + "education_level_PhD\n", + "education_level____MISSING___\n", + "\n", + "Details:\n", + "Columns ['name', 'department', 'gender', 'education_level'] one-hot encoded -> created 22 columns.testNULLNULLNULL
57select_featurestarget=attrition_flag method=correlation k=5 problem=classification inplace=True2025-10-29 19:08:25overtime_hours\n", + "avg_project_score\n", + "satisfaction_score\n", + "potential_score\n", + "performance_ratingsuccessSelected 5 features saved to data['selected_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_ratingtestNULLNULLNULL
58standardizecolumns=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating inplace=True2025-10-29 19:08:25overtime_hours\n", + "avg_project_score\n", + "satisfaction_score\n", + "potential_score\n", + "performance_ratingpreviewpreview_completedtestNULLNULLNULL
59splitdatatest_size=0.2 val_size=0.0 shuffle=True train_name=last_select_train test_name=last_select_test val_name=last_select_val inplace=True stratify=attrition_flag2025-10-29 19:08:26attrition_flagsuccesstrain_name=last_select_train, test_name=last_select_test, val_name=last_select_val\n", + "train_count=32, test_count=8, val_count=0\n", + "test_frac=0.2, val_frac=0.0, shuffle=True, random_state=NonetestNULLNULLNULL
60select_modelfeatures=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating target=attrition_flag cv=5 primary_metric=accuracy problem=classification output_name=last_model inplace=True2025-10-29 19:08:38overtime_hours\n", + "avg_project_score\n", + "satisfaction_score\n", + "potential_score\n", + "performance_ratingsuccessBest model 'logistic' (mean accuracy=1.0000) saved to data['last_model'].testNULLNULLNULL
61mlpipelinetarget=attrition_flag problem=classification save_path=./models/model.joblib2025-10-29 19:08:38overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_ratingsuccessML pipeline completed successfully. model=last_model saved_to=./models/model.joblibtestNULLNULLNULL
62mlpipelinetarget=attrition_flag problem=classification save_path=./models/model.joblib2025-10-29 19:11:04errorTarget column 'attrition_flag' not found in DataFrame.testNULLNULLNULL
63dropmissingcolumns=emp_id,name,department,age,gender,education_level,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score,performance_rating,potential_score,attrition_flag2025-10-29 19:11:31emp_id\n", + "name\n", + "department\n", + "age\n", + "gender\n", + "education_level\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagpreviewpreview_dropped=12testNULLNULLNULL
64encodemethod=onehot columns=name,department,gender,education_level inplace=True drop_original=True mode=apply confirm=true2025-10-29 19:11:32name\n", + "department\n", + "gender\n", + "education_levelsuccessMethod: onehot\n", + "Created columns:\n", + "name_Alice\n", + "name_Bob\n", + "name_Charlie\n", + "name_Diana\n", + "name_Eve\n", + "name_Frank\n", + "name_Grace\n", + "name_Henry\n", + "name_Ivy\n", + "name_Jack\n", + "department_Engineering\n", + "department_Finance\n", + "department_HR\n", + "department_Sales\n", + "department____MISSING___\n", + "gender_F\n", + "gender_M\n", + "education_level_Bachelors\n", + "education_level_High School\n", + "education_level_Masters\n", + "education_level_PhD\n", + "education_level____MISSING___\n", + "\n", + "Details:\n", + "Columns ['name', 'department', 'gender', 'education_level'] one-hot encoded -> created 22 columns.testNULLNULLNULL
65select_featurestarget=attrition_flag method=correlation k=5 problem=classification inplace=True2025-10-29 19:11:32overtime_hours\n", + "avg_project_score\n", + "satisfaction_score\n", + "potential_score\n", + "performance_ratingsuccessSelected 5 features saved to data['selected_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_ratingtestNULLNULLNULL
66standardizecolumns=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating inplace=True2025-10-29 19:11:32overtime_hours\n", + "avg_project_score\n", + "satisfaction_score\n", + "potential_score\n", + "performance_ratingpreviewpreview_completedtestNULLNULLNULL
67splitdatatest_size=0.2 val_size=0.0 shuffle=True train_name=last_select_train test_name=last_select_test val_name=last_select_val inplace=True stratify=attrition_flag2025-10-29 19:11:32attrition_flagsuccesstrain_name=last_select_train, test_name=last_select_test, val_name=last_select_val\n", + "train_count=32, test_count=8, val_count=0\n", + "test_frac=0.2, val_frac=0.0, shuffle=True, random_state=NonetestNULLNULLNULL
68select_modelfeatures=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating target=attrition_flag cv=5 primary_metric=accuracy problem=classification output_name=last_model inplace=True2025-10-29 19:11:45overtime_hours\n", + "avg_project_score\n", + "satisfaction_score\n", + "potential_score\n", + "performance_ratingsuccessBest model 'logistic' (mean accuracy=1.0000) saved to data['last_model'].testNULLNULLNULL
69mlpipelinetarget=attrition_flag problem=classification save_path=./models/model.joblib2025-10-29 19:11:45overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_ratingsuccessML pipeline completed successfully. model=last_model saved_to=./models/model.joblibtestNULLNULLNULL
70dropmissingcolumns=emp_id,name,department,age,gender,education_level,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score,performance_rating,potential_score,attrition_flag2025-10-29 19:14:14emp_id\n", + "name\n", + "department\n", + "age\n", + "gender\n", + "education_level\n", + "years_experience\n", + "projects_completed\n", + "avg_project_score\n", + "certifications\n", + "training_hours\n", + "overtime_hours\n", + "remote_ratio\n", + "salary\n", + "bonus\n", + "satisfaction_score\n", + "performance_rating\n", + "potential_score\n", + "attrition_flagpreviewpreview_dropped=12testNULLNULLNULL
71encodemethod=onehot columns=name,department,gender,education_level inplace=True drop_original=True mode=apply confirm=true2025-10-29 19:14:14name\n", + "department\n", + "gender\n", + "education_levelsuccessMethod: onehot\n", + "Created columns:\n", + "name_Alice\n", + "name_Bob\n", + "name_Charlie\n", + "name_Diana\n", + "name_Eve\n", + "name_Frank\n", + "name_Grace\n", + "name_Henry\n", + "name_Ivy\n", + "name_Jack\n", + "department_Engineering\n", + "department_Finance\n", + "department_HR\n", + "department_Sales\n", + "department____MISSING___\n", + "gender_F\n", + "gender_M\n", + "education_level_Bachelors\n", + "education_level_High School\n", + "education_level_Masters\n", + "education_level_PhD\n", + "education_level____MISSING___\n", + "\n", + "Details:\n", + "Columns ['name', 'department', 'gender', 'education_level'] one-hot encoded -> created 22 columns.testNULLNULLNULL
72select_featurestarget=attrition_flag method=correlation k=5 problem=classification inplace=True2025-10-29 19:14:14overtime_hours\n", + "avg_project_score\n", + "satisfaction_score\n", + "potential_score\n", + "performance_ratingsuccessSelected 5 features saved to data['selected_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_ratingtestNULLNULLNULL
73standardizecolumns=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating inplace=True2025-10-29 19:14:15overtime_hours\n", + "avg_project_score\n", + "satisfaction_score\n", + "potential_score\n", + "performance_ratingpreviewpreview_completedtestNULLNULLNULL
74splitdatatest_size=0.2 val_size=0.0 shuffle=True train_name=last_select_train test_name=last_select_test val_name=last_select_val inplace=True stratify=attrition_flag2025-10-29 19:14:15attrition_flagsuccesstrain_name=last_select_train, test_name=last_select_test, val_name=last_select_val\n", + "train_count=32, test_count=8, val_count=0\n", + "test_frac=0.2, val_frac=0.0, shuffle=True, random_state=NonetestNULLNULLNULL
75select_modelfeatures=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating target=attrition_flag cv=5 primary_metric=accuracy problem=classification output_name=last_model inplace=True2025-10-29 19:14:29overtime_hours\n", + "avg_project_score\n", + "satisfaction_score\n", + "potential_score\n", + "performance_ratingsuccessBest model 'logistic' (mean accuracy=1.0000) saved to data['last_model'].testNULLNULLNULL
76mlpipelinetarget=attrition_flag problem=classification save_path=./models/model.joblib2025-10-29 19:14:29overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_ratingsuccessML pipeline completed successfully. model=last_model saved_to=./models/model.joblibtestNULLNULLNULL
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "select * from magic_metadata;" ] diff --git a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/ml_pipeline.py b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/ml_pipeline.py index f233092..81913ea 100644 --- a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/ml_pipeline.py +++ b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/ml_pipeline.py @@ -431,7 +431,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(features), + affected_columns="\n".join(features), operation_status="error", message=msg, db_name=db_name, @@ -478,7 +478,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=drop_args, - affected_columns=",".join(features), + affected_columns="\n".join(features), operation_status="error", message=msg, db_name=db_name, @@ -497,7 +497,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(features) if 'features' in locals() else "", + affected_columns="\n".join(features) if 'features' in locals() else "", operation_status="error", message=msg, db_name=db_name, @@ -530,7 +530,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=encode_args, - affected_columns=",".join(cat_columns), + affected_columns="\n".join(cat_columns), operation_status="error", message=msg, db_name=db_name, @@ -568,7 +568,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=encode_args, - affected_columns=",".join(cat_columns), + affected_columns="\n".join(cat_columns), operation_status="error", message=msg, db_name=db_name, @@ -605,7 +605,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=encode_args, - 
affected_columns=",".join(missing_encoded), + affected_columns="\n".join(missing_encoded), operation_status="error", message=msg, db_name=db_name, @@ -622,7 +622,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(features) if 'features' in locals() else "", + affected_columns="\n".join(features) if 'features' in locals() else "", operation_status="error", message=msg, db_name=db_name, @@ -666,7 +666,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=select_features_args, - affected_columns=",".join(missing_features), + affected_columns="\n".join(missing_features), operation_status="error", message=msg, db_name=db_name, @@ -683,7 +683,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(features) if 'features' in locals() else "", + affected_columns="\n".join(features) if 'features' in locals() else "", operation_status="error", message=msg, db_name=db_name, @@ -708,7 +708,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=scale_args if 'scale_args' in locals() else (self.args if isinstance(self.args, str) else str(self.args)), - affected_columns=",".join(num_columns) if 'num_columns' in locals() else (",".join(features) if 'features' in locals() else ""), + affected_columns="\n".join(num_columns) if 'num_columns' in locals() else (",".join(features) if 'features' in locals() else ""), operation_status="error", message=msg, db_name=db_name, @@ -741,7 +741,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=split_args, - affected_columns=",".join(features) if 'features' in locals() else "", + affected_columns="\n".join(features) if 'features' in locals() else "", operation_status="error", message=msg, 
db_name=db_name, @@ -759,7 +759,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=split_args if 'split_args' in locals() else (self.args if isinstance(self.args, str) else str(self.args)), - affected_columns=",".join(features) if 'features' in locals() else "", + affected_columns="\n".join(features) if 'features' in locals() else "", operation_status="error", message=msg, db_name=db_name, @@ -798,7 +798,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(features) if 'features' in locals() else "", + affected_columns="\n".join(features) if 'features' in locals() else "", operation_status="error", message=msg, db_name=db_name, @@ -816,7 +816,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(features) if 'features' in locals() else "", + affected_columns="\n".join(features) if 'features' in locals() else "", operation_status="error", message=msg, db_name=db_name, @@ -838,7 +838,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=eval_args if 'eval_args' in locals() else (self.args if isinstance(self.args, str) else str(self.args)), - affected_columns=",".join(features) if 'features' in locals() else "", + affected_columns="\n".join(features) if 'features' in locals() else "", operation_status="error", message=msg, db_name=db_name, @@ -880,7 +880,7 @@ def execute(self, kernel, data): kernel=kernel, command_name=self.name(), arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=",".join(features) if 'features' in locals() else "", + affected_columns="\n".join(features) if 'features' in locals() else "", operation_status="error", message=msg, db_name=db_name, @@ -895,7 +895,7 @@ def execute(self, kernel, data): 
self._send_message(kernel, "stdout", success_msg) try: args_for_db = self.args if isinstance(self.args, str) else str(self.args) - affected_columns_str = ",".join(features) if 'features' in locals() else "" + affected_columns_str = "\n".join(features) if 'features' in locals() else "" message_str = f"{success_msg} model={model_store_name} saved_to={save_path}" self._insert_metadata( kernel=kernel, From 468a3424784a0e44e87e2da5c94ba5f0962f90a4 Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Thu, 30 Oct 2025 11:58:20 +0000 Subject: [PATCH 32/38] Logging added to ml_pipeline --- Untitled.ipynb | 2087 ++++++----------- .../model_training/evaluate_model.py | 377 ++- .../ml_commands/model_training/loadmodel.py | 304 ++- .../ml_commands/model_training/predict.py | 421 +++- .../ml_commands/model_training/savemodel.py | 312 ++- .../ml_commands/model_training/train_model.py | 390 ++- 6 files changed, 2484 insertions(+), 1407 deletions(-) diff --git a/Untitled.ipynb b/Untitled.ipynb index 9bb72bf..022dc8b 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -5846,7 +5846,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "id": "61b6b28f-9b56-4d7f-8613-a406928dbb1c", "metadata": {}, "outputs": [ @@ -10750,7 +10750,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "id": "992bf2a2-15e2-4c67-a8fc-f0ac3c3e0630", "metadata": {}, "outputs": [ @@ -10768,11 +10768,7 @@ " \n", " \n", " emp_id\n", - " name\n", - " department\n", " age\n", - " gender\n", - " education_level\n", " years_experience\n", " projects_completed\n", " avg_project_score\n", @@ -10786,18 +10782,17 @@ " performance_rating\n", " potential_score\n", " attrition_flag\n", + " name_lbl\n", " department_lbl\n", + " gender_lbl\n", + " education_level_lbl\n", " \n", " \n", " \n", " \n", " 18\n", - " Henry\n", - " Engineering\n", " 31\n", - " M\n", - " Masters\n", - " 7\n", + " 7.0\n", " 25\n", " 93.1\n", " 2\n", @@ -10810,16 +10805,15 @@ " 5\n", " 90.0\n", 
" 0\n", + " 7\n", " 0\n", + " 1\n", + " 2\n", " \n", " \n", " 25\n", - " Eve\n", - " Finance\n", " 35\n", - " F\n", - " Bachelors\n", - " 8\n", + " 8.0\n", " 15\n", " 88.0\n", " 1\n", @@ -10832,16 +10826,15 @@ " 4\n", " 85.0\n", " 0\n", - " 1\n", + " 4\n", + " -1\n", + " 0\n", + " 0\n", " \n", " \n", " 29\n", - " Ivy\n", - " Finance\n", " 27\n", - " F\n", - " Bachelors\n", - " 3\n", + " 3.0\n", " 10\n", " 85.0\n", " 0\n", @@ -10854,16 +10847,15 @@ " 4\n", " 82.0\n", " 0\n", + " 8\n", " 1\n", + " 0\n", + " 0\n", " \n", " \n", " 23\n", - " Charlie\n", - " Sales\n", " 38\n", - " M\n", - " Bachelors\n", - " 10\n", + " NaN\n", " 18\n", " 79.3\n", " 0\n", @@ -10876,16 +10868,15 @@ " 3\n", " 70.0\n", " 1\n", + " 2\n", " 3\n", + " 1\n", + " 0\n", " \n", " \n", " 6\n", - " Frank\n", - " HR\n", " 50\n", - " M\n", - " High School\n", - " 25\n", + " 25.0\n", " 8\n", " 72.5\n", " 0\n", @@ -10898,16 +10889,15 @@ " 2\n", " 60.0\n", " 1\n", + " 5\n", " 2\n", + " 1\n", + " 1\n", " \n", " \n", " 40\n", - " Jack\n", - " Sales\n", " 55\n", - " M\n", - " High School\n", - " 30\n", + " 30.0\n", " 12\n", " 68.9\n", " 0\n", @@ -10920,16 +10910,15 @@ " 1\n", " 50.0\n", " 1\n", + " 9\n", " 3\n", + " 1\n", + " 1\n", " \n", " \n", " 14\n", - " Diana\n", - " Engineering\n", " 29\n", - " F\n", - " PhD\n", - " 6\n", + " 6.0\n", " 22\n", " 95.2\n", " 2\n", @@ -10942,82 +10931,78 @@ " 5\n", " 95.0\n", " 0\n", + " 3\n", + " 0\n", " 0\n", + " 3\n", " \n", " \n", " 22\n", - " Bob\n", - " Engineering\n", " 45\n", - " M\n", - " Masters\n", - " 20\n", + " 20.0\n", " 30\n", " 91.0\n", " 3\n", " 20\n", " 10\n", " 0.1\n", - " 120000.0\n", + " 1200000.0\n", " 15000.0\n", " 9.0\n", " 5\n", " 89.0\n", " 0\n", + " 1\n", " 0\n", + " 1\n", + " 2\n", " \n", " \n", " 32\n", - " Bob\n", - " Engineering\n", " 45\n", - " M\n", - " Masters\n", - " 20\n", + " 20.0\n", " 30\n", " 91.0\n", " 3\n", " 20\n", " 10\n", " 0.1\n", - " 120000.0\n", + " 1200000.0\n", " 15000.0\n", " 9.0\n", " 5\n", " 89.0\n", " 0\n", + " 1\n", " 
0\n", + " 1\n", + " 2\n", " \n", " \n", " 12\n", - " Bob\n", - " Engineering\n", " 45\n", - " M\n", - " Masters\n", - " 20\n", + " 20.0\n", " 30\n", " 91.0\n", " 3\n", " 20\n", " 10\n", " 0.1\n", - " 120000.0\n", + " 1200000.0\n", " 15000.0\n", " 9.0\n", " 5\n", " 89.0\n", " 0\n", + " 1\n", " 0\n", + " 1\n", + " 2\n", " \n", " \n", " 34\n", - " Diana\n", - " Engineering\n", " 29\n", - " F\n", - " PhD\n", - " 6\n", + " 6.0\n", " 22\n", " 95.2\n", " 2\n", @@ -11030,16 +11015,15 @@ " 5\n", " 95.0\n", " 0\n", + " 3\n", + " 0\n", " 0\n", + " 3\n", " \n", " \n", " 33\n", - " Charlie\n", - " Sales\n", " 38\n", - " M\n", - " Bachelors\n", - " 10\n", + " NaN\n", " 18\n", " 79.3\n", " 0\n", @@ -11052,16 +11036,15 @@ " 3\n", " 70.0\n", " 1\n", + " 2\n", " 3\n", + " 1\n", + " 0\n", " \n", " \n", " 7\n", - " Grace\n", - " Sales\n", " 42\n", - " F\n", - " Bachelors\n", - " 18\n", + " 18.0\n", " 20\n", " 81.4\n", " 1\n", @@ -11074,16 +11057,15 @@ " 3\n", " 74.0\n", " 0\n", + " 6\n", " 3\n", + " 0\n", + " 0\n", " \n", " \n", " 26\n", - " Frank\n", - " HR\n", " 50\n", - " M\n", - " High School\n", - " 25\n", + " 25.0\n", " 8\n", " 72.5\n", " 0\n", @@ -11096,16 +11078,15 @@ " 2\n", " 60.0\n", " 1\n", + " 5\n", " 2\n", + " 1\n", + " 1\n", " \n", " \n", " 19\n", - " Ivy\n", - " Finance\n", " 27\n", - " F\n", - " Bachelors\n", - " 3\n", + " 3.0\n", " 10\n", " 85.0\n", " 0\n", @@ -11118,16 +11099,15 @@ " 4\n", " 82.0\n", " 0\n", + " 8\n", " 1\n", + " 0\n", + " 0\n", " \n", " \n", " 10\n", - " Jack\n", - " Sales\n", " 55\n", - " M\n", - " High School\n", - " 30\n", + " 30.0\n", " 12\n", " 68.9\n", " 0\n", @@ -11140,16 +11120,15 @@ " 1\n", " 50.0\n", " 1\n", + " 9\n", " 3\n", + " 1\n", + " 1\n", " \n", " \n", " 36\n", - " Frank\n", - " HR\n", " 50\n", - " M\n", - " High School\n", - " 25\n", + " 25.0\n", " 8\n", " 72.5\n", " 0\n", @@ -11162,16 +11141,15 @@ " 2\n", " 60.0\n", " 1\n", + " 5\n", " 2\n", + " 1\n", + " 1\n", " \n", " \n", " 21\n", - " Alice\n", - " HR\n", " 30\n", - " F\n", - 
" Bachelors\n", - " 5\n", + " 5.0\n", " 12\n", " 87.5\n", " 1\n", @@ -11179,21 +11157,20 @@ " 5\n", " 0.2\n", " 55000.0\n", - " 3000.0\n", + " 300.0\n", " 8.5\n", " 4\n", " 75.0\n", " 0\n", + " 0\n", " 2\n", + " 0\n", + " -1\n", " \n", " \n", " 24\n", - " Diana\n", - " Engineering\n", " 29\n", - " F\n", - " PhD\n", - " 6\n", + " 6.0\n", " 22\n", " 95.2\n", " 2\n", @@ -11206,16 +11183,15 @@ " 5\n", " 95.0\n", " 0\n", + " 3\n", + " 0\n", " 0\n", + " 3\n", " \n", " \n", " 31\n", - " Alice\n", - " HR\n", " 30\n", - " F\n", - " Bachelors\n", - " 5\n", + " 5.0\n", " 12\n", " 87.5\n", " 1\n", @@ -11223,12 +11199,15 @@ " 5\n", " 0.2\n", " 55000.0\n", - " 3000.0\n", + " 300.0\n", " 8.5\n", " 4\n", " 75.0\n", " 0\n", + " 0\n", " 2\n", + " 0\n", + " -1\n", " \n", " \n", "" @@ -11244,11 +11223,7 @@ " \n", " \n", " emp_id\n", - " name\n", - " department\n", " age\n", - " gender\n", - " education_level\n", " years_experience\n", " projects_completed\n", " avg_project_score\n", @@ -11262,18 +11237,17 @@ " performance_rating\n", " potential_score\n", " attrition_flag\n", + " name_lbl\n", " department_lbl\n", + " gender_lbl\n", + " education_level_lbl\n", " \n", " \n", " \n", " \n", " 15\n", - " Eve\n", - " Finance\n", " 35\n", - " F\n", - " Bachelors\n", - " 8\n", + " 8.0\n", " 15\n", " 88.0\n", " 1\n", @@ -11286,16 +11260,15 @@ " 4\n", " 85.0\n", " 0\n", - " 1\n", + " 4\n", + " -1\n", + " 0\n", + " 0\n", " \n", " \n", " 30\n", - " Jack\n", - " Sales\n", " 55\n", - " M\n", - " High School\n", - " 30\n", + " 30.0\n", " 12\n", " 68.9\n", " 0\n", @@ -11308,16 +11281,15 @@ " 1\n", " 50.0\n", " 1\n", + " 9\n", " 3\n", + " 1\n", + " 1\n", " \n", " \n", " 11\n", - " Alice\n", - " HR\n", " 30\n", - " F\n", - " Bachelors\n", - " 5\n", + " 5.0\n", " 12\n", " 87.5\n", " 1\n", @@ -11325,21 +11297,20 @@ " 5\n", " 0.2\n", " 55000.0\n", - " 3000.0\n", + " 300.0\n", " 8.5\n", " 4\n", " 75.0\n", " 0\n", + " 0\n", " 2\n", + " 0\n", + " -1\n", " \n", " \n", " 3\n", - " Charlie\n", - " Sales\n", " 
38\n", - " M\n", - " Bachelors\n", - " 10\n", + " NaN\n", " 18\n", " 79.3\n", " 0\n", @@ -11352,7 +11323,10 @@ " 3\n", " 70.0\n", " 1\n", + " 2\n", " 3\n", + " 1\n", + " 0\n", " \n", " \n", "" @@ -11368,11 +11342,7 @@ " \n", " \n", " emp_id\n", - " name\n", - " department\n", " age\n", - " gender\n", - " education_level\n", " years_experience\n", " projects_completed\n", " avg_project_score\n", @@ -11386,18 +11356,17 @@ " performance_rating\n", " potential_score\n", " attrition_flag\n", + " name_lbl\n", " department_lbl\n", + " gender_lbl\n", + " education_level_lbl\n", " \n", " \n", " \n", " \n", " 20\n", - " Jack\n", - " Sales\n", " 55\n", - " M\n", - " High School\n", - " 30\n", + " 30.0\n", " 12\n", " 68.9\n", " 0\n", @@ -11410,16 +11379,15 @@ " 1\n", " 50.0\n", " 1\n", + " 9\n", " 3\n", + " 1\n", + " 1\n", " \n", " \n", " 17\n", - " Grace\n", - " Sales\n", " 42\n", - " F\n", - " Bachelors\n", - " 18\n", + " 18.0\n", " 20\n", " 81.4\n", " 1\n", @@ -11432,16 +11400,15 @@ " 3\n", " 74.0\n", " 0\n", + " 6\n", " 3\n", + " 0\n", + " 0\n", " \n", " \n", " 16\n", - " Frank\n", - " HR\n", " 50\n", - " M\n", - " High School\n", - " 25\n", + " 25.0\n", " 8\n", " 72.5\n", " 0\n", @@ -11454,16 +11421,15 @@ " 2\n", " 60.0\n", " 1\n", + " 5\n", " 2\n", + " 1\n", + " 1\n", " \n", " \n", " 27\n", - " Grace\n", - " Sales\n", " 42\n", - " F\n", - " Bachelors\n", - " 18\n", + " 18.0\n", " 20\n", " 81.4\n", " 1\n", @@ -11476,16 +11442,15 @@ " 3\n", " 74.0\n", " 0\n", + " 6\n", " 3\n", + " 0\n", + " 0\n", " \n", " \n", " 5\n", - " Eve\n", - " Finance\n", " 35\n", - " F\n", - " Bachelors\n", - " 8\n", + " 8.0\n", " 15\n", " 88.0\n", " 1\n", @@ -11498,16 +11463,15 @@ " 4\n", " 85.0\n", " 0\n", - " 1\n", + " 4\n", + " -1\n", + " 0\n", + " 0\n", " \n", " \n", " 13\n", - " Charlie\n", - " Sales\n", " 38\n", - " M\n", - " Bachelors\n", - " 10\n", + " NaN\n", " 18\n", " 79.3\n", " 0\n", @@ -11520,16 +11484,15 @@ " 3\n", " 70.0\n", " 1\n", + " 2\n", " 3\n", + " 1\n", + " 0\n", " \n", " 
\n", " 38\n", - " Henry\n", - " Engineering\n", " 31\n", - " M\n", - " Masters\n", - " 7\n", + " 7.0\n", " 25\n", " 93.1\n", " 2\n", @@ -11542,16 +11505,15 @@ " 5\n", " 90.0\n", " 0\n", + " 7\n", " 0\n", + " 1\n", + " 2\n", " \n", " \n", " 28\n", - " Henry\n", - " Engineering\n", " 31\n", - " M\n", - " Masters\n", - " 7\n", + " 7.0\n", " 25\n", " 93.1\n", " 2\n", @@ -11564,7 +11526,10 @@ " 5\n", " 90.0\n", " 0\n", + " 7\n", " 0\n", + " 1\n", + " 2\n", " \n", " \n", "" @@ -11598,11 +11563,7 @@ " \n", " \n", " emp_id\n", - " name\n", - " department\n", " age\n", - " gender\n", - " education_level\n", " years_experience\n", " projects_completed\n", " avg_project_score\n", @@ -11616,122 +11577,122 @@ " performance_rating\n", " potential_score\n", " attrition_flag\n", + " name_lbl\n", + " department_lbl\n", + " gender_lbl\n", + " education_level_lbl\n", " \n", " \n", " \n", " \n", - " 31\n", - " Alice\n", - " HR\n", - " 30\n", - " F\n", - " Bachelors\n", - " 5\n", - " 12\n", - " 87.5\n", + " 36\n", + " 50\n", + " 25.0\n", + " 8\n", + " 72.5\n", + " 0\n", + " 10\n", + " 15\n", + " 0.7\n", + " 60000.0\n", + " 4000.0\n", + " 6.5\n", + " 2\n", + " 60.0\n", " 1\n", - " 40\n", " 5\n", - " 0.2\n", - " 55000.0\n", - " 3000.0\n", - " 8.5\n", - " 4\n", - " 75.0\n", - " 0\n", + " 2\n", + " 1\n", + " 1\n", " \n", " \n", + " 33\n", + " 38\n", + " NaN\n", + " 18\n", + " 79.3\n", + " 0\n", + " 15\n", + " 20\n", + " 0.5\n", + " 80000.0\n", + " 7000.0\n", + " 7.2\n", + " 3\n", + " 70.0\n", " 1\n", - " Alice\n", - " HR\n", - " 30\n", - " F\n", - " Bachelors\n", - " 5\n", - " 12\n", - " 87.5\n", + " 2\n", + " 3\n", " 1\n", - " 40\n", - " 5\n", - " 0.2\n", - " 55000.0\n", - " 3000.0\n", - " 8.5\n", - " 4\n", - " 75.0\n", " 0\n", " \n", " \n", - " 19\n", - " Ivy\n", - " Finance\n", - " 27\n", - " F\n", - " Bachelors\n", + " 34\n", + " 29\n", + " 6.0\n", + " 22\n", + " 95.2\n", + " 2\n", + " 50\n", + " 2\n", + " 0.0\n", + " 97000.0\n", + " 10000.0\n", + " 9.6\n", + " 5\n", + " 95.0\n", + " 
0\n", " 3\n", - " 10\n", - " 85.0\n", " 0\n", - " 20\n", - " 8\n", - " 0.6\n", - " 70000.0\n", - " 5000.0\n", - " 8.2\n", - " 4\n", - " 82.0\n", " 0\n", + " 3\n", " \n", " \n", - " 21\n", - " Alice\n", - " HR\n", " 30\n", - " F\n", - " Bachelors\n", - " 5\n", + " 55\n", + " 30.0\n", " 12\n", - " 87.5\n", - " 1\n", - " 40\n", - " 5\n", - " 0.2\n", - " 55000.0\n", - " 3000.0\n", - " 8.5\n", - " 4\n", - " 75.0\n", + " 68.9\n", " 0\n", + " 5\n", + " 25\n", + " 0.8\n", + " 65000.0\n", + " 2000.0\n", + " 5.5\n", + " 1\n", + " 50.0\n", + " 1\n", + " 9\n", + " 3\n", + " 1\n", + " 1\n", " \n", " \n", + " 3\n", + " 38\n", + " NaN\n", + " 18\n", + " 79.3\n", + " 0\n", " 15\n", - " Eve\n", - " Finance\n", - " 35\n", - " F\n", - " Bachelors\n", - " 8\n", - " 15\n", - " 88.0\n", + " 20\n", + " 0.5\n", + " 80000.0\n", + " 7000.0\n", + " 7.2\n", + " 3\n", + " 70.0\n", + " 1\n", + " 2\n", + " 3\n", " 1\n", - " 30\n", - " 6\n", - " 0.3\n", - " 90000.0\n", - " 8000.0\n", - " 8.0\n", - " 4\n", - " 85.0\n", " 0\n", " \n", " \n", - " 14\n", - " Diana\n", - " Engineering\n", + " 4\n", " 29\n", - " F\n", - " PhD\n", - " 6\n", + " 6.0\n", " 22\n", " 95.2\n", " 2\n", @@ -11744,120 +11705,36 @@ " 5\n", " 95.0\n", " 0\n", + " 3\n", + " 0\n", + " 0\n", + " 3\n", " \n", " \n", - " 24\n", - " Diana\n", - " Engineering\n", - " 29\n", - " F\n", - " PhD\n", - " 6\n", - " 22\n", - " 95.2\n", - " 2\n", - " 50\n", - " 2\n", - " 0.0\n", - " 97000.0\n", - " 10000.0\n", - " 9.6\n", - " 5\n", - " 95.0\n", + " 40\n", + " 55\n", + " 30.0\n", + " 12\n", + " 68.9\n", " 0\n", - " \n", - " \n", - " 18\n", - " Henry\n", - " Engineering\n", - " 31\n", - " M\n", - " Masters\n", - " 7\n", - " 25\n", - " 93.1\n", - " 2\n", - " 35\n", - " 5\n", - " 0.2\n", - " 95000.0\n", - " 9000.0\n", - " 9.1\n", " 5\n", - " 90.0\n", - " 0\n", - " \n", - " \n", - " 26\n", - " Frank\n", - " HR\n", - " 50\n", - " M\n", - " High School\n", " 25\n", - " 8\n", - " 72.5\n", - " 0\n", - " 10\n", - " 15\n", - " 0.7\n", - " 60000.0\n", - " 
4000.0\n", - " 6.5\n", - " 2\n", - " 60.0\n", + " 0.8\n", + " 65000.0\n", + " 2000.0\n", + " 5.5\n", " 1\n", - " \n", - " \n", - " 23\n", - " Charlie\n", - " Sales\n", - " 38\n", - " M\n", - " Bachelors\n", - " 10\n", - " 18\n", - " 79.3\n", - " 0\n", - " 15\n", - " 20\n", - " 0.5\n", - " 80000.0\n", - " 7000.0\n", - " 7.2\n", + " 50.0\n", + " 1\n", + " 9\n", " 3\n", - " 70.0\n", + " 1\n", " 1\n", " \n", " \n", - " 29\n", - " Ivy\n", - " Finance\n", - " 27\n", - " F\n", - " Bachelors\n", - " 3\n", " 10\n", - " 85.0\n", - " 0\n", - " 20\n", - " 8\n", - " 0.6\n", - " 70000.0\n", - " 5000.0\n", - " 8.2\n", - " 4\n", - " 82.0\n", - " 0\n", - " \n", - " \n", - " 20\n", - " Jack\n", - " Sales\n", " 55\n", - " M\n", - " High School\n", - " 30\n", + " 30.0\n", " 12\n", " 68.9\n", " 0\n", @@ -11870,36 +11747,36 @@ " 1\n", " 50.0\n", " 1\n", + " 9\n", + " 3\n", + " 1\n", + " 1\n", " \n", " \n", - " 17\n", - " Grace\n", - " Sales\n", - " 42\n", - " F\n", - " Bachelors\n", - " 18\n", - " 20\n", - " 81.4\n", - " 1\n", - " 25\n", + " 31\n", + " 30\n", + " 5.0\n", " 12\n", - " 0.4\n", - " 85000.0\n", - " 7000.0\n", - " 7.8\n", - " 3\n", - " 74.0\n", + " 87.5\n", + " 1\n", + " 40\n", + " 5\n", + " 0.2\n", + " 55000.0\n", + " 300.0\n", + " 8.5\n", + " 4\n", + " 75.0\n", + " 0\n", + " 0\n", + " 2\n", " 0\n", + " -1\n", " \n", " \n", - " 5\n", - " Eve\n", - " Finance\n", + " 15\n", " 35\n", - " F\n", - " Bachelors\n", - " 8\n", + " 8.0\n", " 15\n", " 88.0\n", " 1\n", @@ -11912,15 +11789,15 @@ " 4\n", " 85.0\n", " 0\n", + " 4\n", + " -1\n", + " 0\n", + " 0\n", " \n", " \n", - " 3\n", - " Charlie\n", - " Sales\n", + " 23\n", " 38\n", - " M\n", - " Bachelors\n", - " 10\n", + " NaN\n", " 18\n", " 79.3\n", " 0\n", @@ -11933,15 +11810,57 @@ " 3\n", " 70.0\n", " 1\n", + " 2\n", + " 3\n", + " 1\n", + " 0\n", + " \n", + " \n", + " 5\n", + " 35\n", + " 8.0\n", + " 15\n", + " 88.0\n", + " 1\n", + " 30\n", + " 6\n", + " 0.3\n", + " 90000.0\n", + " 8000.0\n", + " 8.0\n", + " 4\n", + " 85.0\n", + 
" 0\n", + " 4\n", + " -1\n", + " 0\n", + " 0\n", " \n", " \n", + " 1\n", + " 30\n", + " 5.0\n", + " 12\n", + " 87.5\n", + " 1\n", " 40\n", - " Jack\n", - " Sales\n", + " 5\n", + " 0.2\n", + " 55000.0\n", + " 300.0\n", + " 8.5\n", + " 4\n", + " 75.0\n", + " 0\n", + " 0\n", + " 2\n", + " 0\n", + " -1\n", + " \n", + " \n", + " 20\n", " 55\n", - " M\n", - " High School\n", - " 30\n", + " 30.0\n", " 12\n", " 68.9\n", " 0\n", @@ -11954,36 +11873,36 @@ " 1\n", " 50.0\n", " 1\n", - " \n", - " \n", - " 39\n", - " Ivy\n", - " Finance\n", - " 27\n", - " F\n", - " Bachelors\n", + " 9\n", " 3\n", - " 10\n", - " 85.0\n", - " 0\n", - " 20\n", - " 8\n", - " 0.6\n", - " 70000.0\n", - " 5000.0\n", - " 8.2\n", - " 4\n", - " 82.0\n", - " 0\n", + " 1\n", + " 1\n", " \n", " \n", - " 28\n", - " Henry\n", - " Engineering\n", - " 31\n", - " M\n", - " Masters\n", - " 7\n", + " 25\n", + " 35\n", + " 8.0\n", + " 15\n", + " 88.0\n", + " 1\n", + " 30\n", + " 6\n", + " 0.3\n", + " 90000.0\n", + " 8000.0\n", + " 8.0\n", + " 4\n", + " 85.0\n", + " 0\n", + " 4\n", + " -1\n", + " 0\n", + " 0\n", + " \n", + " \n", + " 8\n", + " 31\n", + " 7.0\n", " 25\n", " 93.1\n", " 2\n", @@ -11996,48 +11915,94 @@ " 5\n", " 90.0\n", " 0\n", + " 7\n", + " 0\n", + " 1\n", + " 2\n", + " \n", + " \n", + " 27\n", + " 42\n", + " 18.0\n", + " 20\n", + " 81.4\n", + " 1\n", + " 25\n", + " 12\n", + " 0.4\n", + " 85000.0\n", + " 7000.0\n", + " 7.8\n", + " 3\n", + " 74.0\n", + " 0\n", + " 6\n", + " 3\n", + " 0\n", + " 0\n", + " \n", + " \n", + " 37\n", + " 42\n", + " 18.0\n", + " 20\n", + " 81.4\n", + " 1\n", + " 25\n", + " 12\n", + " 0.4\n", + " 85000.0\n", + " 7000.0\n", + " 7.8\n", + " 3\n", + " 74.0\n", + " 0\n", + " 6\n", + " 3\n", + " 0\n", + " 0\n", " \n", " \n", + " 16\n", + " 50\n", + " 25.0\n", + " 8\n", + " 72.5\n", + " 0\n", + " 10\n", + " 15\n", + " 0.7\n", + " 60000.0\n", + " 4000.0\n", + " 6.5\n", " 2\n", - " Bob\n", - " Engineering\n", + " 60.0\n", + " 1\n", + " 5\n", + " 2\n", + " 1\n", + " 1\n", + " \n", + " 
\n", + " 22\n", " 45\n", - " M\n", - " Masters\n", - " 20\n", + " 20.0\n", " 30\n", " 91.0\n", " 3\n", " 20\n", " 10\n", " 0.1\n", - " 120000.0\n", + " 1200000.0\n", " 15000.0\n", " 9.0\n", " 5\n", " 89.0\n", " 0\n", - " \n", - " \n", - " 30\n", - " Jack\n", - " Sales\n", - " 55\n", - " M\n", - " High School\n", - " 30\n", - " 12\n", - " 68.9\n", - " 0\n", - " 5\n", - " 25\n", - " 0.8\n", - " 65000.0\n", - " 2000.0\n", - " 5.5\n", " 1\n", - " 50.0\n", + " 0\n", " 1\n", + " 2\n", " \n", " \n", "" @@ -12053,11 +12018,7 @@ " \n", " \n", " emp_id\n", - " name\n", - " department\n", " age\n", - " gender\n", - " education_level\n", " years_experience\n", " projects_completed\n", " avg_project_score\n", @@ -12071,80 +12032,59 @@ " performance_rating\n", " potential_score\n", " attrition_flag\n", + " name_lbl\n", + " department_lbl\n", + " gender_lbl\n", + " education_level_lbl\n", " \n", " \n", " \n", " \n", - " 33\n", - " Charlie\n", - " Sales\n", - " 38\n", - " M\n", - " Bachelors\n", - " 10\n", " 18\n", - " 79.3\n", - " 0\n", - " 15\n", - " 20\n", - " 0.5\n", - " 80000.0\n", - " 7000.0\n", - " 7.2\n", - " 3\n", - " 70.0\n", - " 1\n", - " \n", - " \n", + " 31\n", + " 7.0\n", " 25\n", - " Eve\n", - " Finance\n", + " 93.1\n", + " 2\n", " 35\n", - " F\n", - " Bachelors\n", - " 8\n", - " 15\n", - " 88.0\n", - " 1\n", - " 30\n", - " 6\n", - " 0.3\n", - " 90000.0\n", - " 8000.0\n", - " 8.0\n", - " 4\n", - " 85.0\n", + " 5\n", + " 0.2\n", + " 95000.0\n", + " 9000.0\n", + " 9.1\n", + " 5\n", + " 90.0\n", + " 0\n", + " 7\n", " 0\n", + " 1\n", + " 2\n", " \n", " \n", - " 11\n", - " Alice\n", - " HR\n", + " 32\n", + " 45\n", + " 20.0\n", " 30\n", - " F\n", - " Bachelors\n", + " 91.0\n", + " 3\n", + " 20\n", + " 10\n", + " 0.1\n", + " 1200000.0\n", + " 15000.0\n", + " 9.0\n", " 5\n", - " 12\n", - " 87.5\n", + " 89.0\n", + " 0\n", " 1\n", - " 40\n", - " 5\n", - " 0.2\n", - " 55000.0\n", - " 3000.0\n", - " 8.5\n", - " 4\n", - " 75.0\n", " 0\n", + " 1\n", + " 2\n", " \n", " \n", - " 
37\n", - " Grace\n", - " Sales\n", + " 17\n", " 42\n", - " F\n", - " Bachelors\n", - " 18\n", + " 18.0\n", " 20\n", " 81.4\n", " 1\n", @@ -12157,31 +12097,56 @@ " 3\n", " 74.0\n", " 0\n", + " 6\n", + " 3\n", + " 0\n", + " 0\n", " \n", " \n", - " 34\n", - " Diana\n", - " Engineering\n", - " 29\n", - " F\n", - " PhD\n", - " 6\n", - " 22\n", - " 95.2\n", - " 2\n", + " 26\n", " 50\n", + " 25.0\n", + " 8\n", + " 72.5\n", + " 0\n", + " 10\n", + " 15\n", + " 0.7\n", + " 60000.0\n", + " 4000.0\n", + " 6.5\n", " 2\n", - " 0.0\n", - " 97000.0\n", - " 10000.0\n", - " 9.6\n", + " 60.0\n", + " 1\n", " 5\n", - " 95.0\n", - " 0\n", + " 2\n", + " 1\n", + " 1\n", " \n", - " \n", - "" - ] + " \n", + " 35\n", + " 35\n", + " 8.0\n", + " 15\n", + " 88.0\n", + " 1\n", + " 30\n", + " 6\n", + " 0.3\n", + " 90000.0\n", + " 8000.0\n", + " 8.0\n", + " 4\n", + " 85.0\n", + " 0\n", + " 4\n", + " -1\n", + " 0\n", + " 0\n", + " \n", + " \n", + "" + ] }, "metadata": {}, "output_type": "display_data" @@ -12193,11 +12158,7 @@ " \n", " \n", " emp_id\n", - " name\n", - " department\n", " age\n", - " gender\n", - " education_level\n", " years_experience\n", " projects_completed\n", " avg_project_score\n", @@ -12211,17 +12172,59 @@ " performance_rating\n", " potential_score\n", " attrition_flag\n", + " name_lbl\n", + " department_lbl\n", + " gender_lbl\n", + " education_level_lbl\n", " \n", " \n", " \n", " \n", + " 7\n", + " 42\n", + " 18.0\n", + " 20\n", + " 81.4\n", + " 1\n", + " 25\n", + " 12\n", + " 0.4\n", + " 85000.0\n", + " 7000.0\n", + " 7.8\n", + " 3\n", + " 74.0\n", + " 0\n", + " 6\n", + " 3\n", + " 0\n", + " 0\n", + " \n", + " \n", + " 11\n", + " 30\n", + " 5.0\n", + " 12\n", + " 87.5\n", + " 1\n", + " 40\n", + " 5\n", + " 0.2\n", + " 55000.0\n", + " 300.0\n", + " 8.5\n", + " 4\n", + " 75.0\n", + " 0\n", + " 0\n", + " 2\n", + " 0\n", + " -1\n", + " \n", + " \n", " 13\n", - " Charlie\n", - " Sales\n", " 38\n", - " M\n", - " Bachelors\n", - " 10\n", + " NaN\n", " 18\n", " 79.3\n", " 0\n", @@ 
-12234,57 +12237,15 @@ " 3\n", " 70.0\n", " 1\n", - " \n", - " \n", - " 6\n", - " Frank\n", - " HR\n", - " 50\n", - " M\n", - " High School\n", - " 25\n", - " 8\n", - " 72.5\n", - " 0\n", - " 10\n", - " 15\n", - " 0.7\n", - " 60000.0\n", - " 4000.0\n", - " 6.5\n", " 2\n", - " 60.0\n", + " 3\n", " 1\n", - " \n", - " \n", - " 38\n", - " Henry\n", - " Engineering\n", - " 31\n", - " M\n", - " Masters\n", - " 7\n", - " 25\n", - " 93.1\n", - " 2\n", - " 35\n", - " 5\n", - " 0.2\n", - " 95000.0\n", - " 9000.0\n", - " 9.1\n", - " 5\n", - " 90.0\n", " 0\n", " \n", " \n", - " 9\n", - " Ivy\n", - " Finance\n", + " 39\n", " 27\n", - " F\n", - " Bachelors\n", - " 3\n", + " 3.0\n", " 10\n", " 85.0\n", " 0\n", @@ -12297,99 +12258,15 @@ " 4\n", " 82.0\n", " 0\n", - " \n", - " \n", - " 22\n", - " Bob\n", - " Engineering\n", - " 45\n", - " M\n", - " Masters\n", - " 20\n", - " 30\n", - " 91.0\n", - " 3\n", - " 20\n", - " 10\n", - " 0.1\n", - " 120000.0\n", - " 15000.0\n", - " 9.0\n", - " 5\n", - " 89.0\n", - " 0\n", - " \n", - " \n", - " 7\n", - " Grace\n", - " Sales\n", - " 42\n", - " F\n", - " Bachelors\n", - " 18\n", - " 20\n", - " 81.4\n", - " 1\n", - " 25\n", - " 12\n", - " 0.4\n", - " 85000.0\n", - " 7000.0\n", - " 7.8\n", - " 3\n", - " 74.0\n", - " 0\n", - " \n", - " \n", - " 35\n", - " Eve\n", - " Finance\n", - " 35\n", - " F\n", - " Bachelors\n", " 8\n", - " 15\n", - " 88.0\n", " 1\n", - " 30\n", - " 6\n", - " 0.3\n", - " 90000.0\n", - " 8000.0\n", - " 8.0\n", - " 4\n", - " 85.0\n", " 0\n", - " \n", - " \n", - " 10\n", - " Jack\n", - " Sales\n", - " 55\n", - " M\n", - " High School\n", - " 30\n", - " 12\n", - " 68.9\n", " 0\n", - " 5\n", - " 25\n", - " 0.8\n", - " 65000.0\n", - " 2000.0\n", - " 5.5\n", - " 1\n", - " 50.0\n", - " 1\n", " \n", " \n", - " 16\n", - " Frank\n", - " HR\n", + " 6\n", " 50\n", - " M\n", - " High School\n", - " 25\n", + " 25.0\n", " 8\n", " 72.5\n", " 0\n", @@ -12402,15 +12279,36 @@ " 2\n", " 60.0\n", " 1\n", + " 5\n", + " 2\n", + " 1\n", + " 1\n", " 
\n", " \n", - " 4\n", - " Diana\n", - " Engineering\n", + " 12\n", + " 45\n", + " 20.0\n", + " 30\n", + " 91.0\n", + " 3\n", + " 20\n", + " 10\n", + " 0.1\n", + " 1200000.0\n", + " 15000.0\n", + " 9.0\n", + " 5\n", + " 89.0\n", + " 0\n", + " 1\n", + " 0\n", + " 1\n", + " 2\n", + " \n", + " \n", + " 24\n", " 29\n", - " F\n", - " PhD\n", - " 6\n", + " 6.0\n", " 22\n", " 95.2\n", " 2\n", @@ -12423,36 +12321,99 @@ " 5\n", " 95.0\n", " 0\n", + " 3\n", + " 0\n", + " 0\n", + " 3\n", " \n", " \n", - " 27\n", - " Grace\n", - " Sales\n", - " 42\n", - " F\n", - " Bachelors\n", - " 18\n", + " 2\n", + " 45\n", + " 20.0\n", + " 30\n", + " 91.0\n", + " 3\n", " 20\n", - " 81.4\n", + " 10\n", + " 0.1\n", + " 1200000.0\n", + " 15000.0\n", + " 9.0\n", + " 5\n", + " 89.0\n", + " 0\n", " 1\n", - " 25\n", - " 12\n", - " 0.4\n", - " 85000.0\n", - " 7000.0\n", - " 7.8\n", + " 0\n", + " 1\n", + " 2\n", + " \n", + " \n", + " 14\n", + " 29\n", + " 6.0\n", + " 22\n", + " 95.2\n", + " 2\n", + " 50\n", + " 2\n", + " 0.0\n", + " 97000.0\n", + " 10000.0\n", + " 9.6\n", + " 5\n", + " 95.0\n", + " 0\n", " 3\n", - " 74.0\n", " 0\n", + " 0\n", + " 3\n", + " \n", + " \n", + " 21\n", + " 30\n", + " 5.0\n", + " 12\n", + " 87.5\n", + " 1\n", + " 40\n", + " 5\n", + " 0.2\n", + " 55000.0\n", + " 300.0\n", + " 8.5\n", + " 4\n", + " 75.0\n", + " 0\n", + " 0\n", + " 2\n", + " 0\n", + " -1\n", " \n", " \n", + " 9\n", + " 27\n", + " 3.0\n", + " 10\n", + " 85.0\n", + " 0\n", + " 20\n", " 8\n", - " Henry\n", - " Engineering\n", + " 0.6\n", + " 70000.0\n", + " 5000.0\n", + " 8.2\n", + " 4\n", + " 82.0\n", + " 0\n", + " 8\n", + " 1\n", + " 0\n", + " 0\n", + " \n", + " \n", + " 38\n", " 31\n", - " M\n", - " Masters\n", - " 7\n", + " 7.0\n", " 25\n", " 93.1\n", " 2\n", @@ -12465,6 +12426,10 @@ " 5\n", " 90.0\n", " 0\n", + " 7\n", + " 0\n", + " 1\n", + " 2\n", " \n", " \n", "" @@ -12475,7 +12440,7 @@ } ], "source": [ - "%splitdata stratify=department test_size=0.3 val_size=0.1 random_state=123" + "%splitdata 
test_size=0.3 val_size=0.1 random_state=123" ] }, { @@ -12701,7 +12666,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "2543a896-7047-45a7-a118-3adcfb822023", "metadata": {}, "outputs": [ @@ -12709,12 +12674,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "Model 'random_forest' trained and saved to data['last_model']. problem=classification. train_rows=28\n" + "Model 'random_forest' trained and saved to data['last_model']. problem=regression. train_rows=28\n" ] } ], "source": [ - "%train_model target=department features=age,salary model=random_forest n_estimators=50 max_depth=4" + "%train_model target=department_lbl features=age,salary model=random_forest n_estimators=50 max_depth=4" ] }, { @@ -12773,7 +12738,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 8, "id": "4f7d7e57-e6e6-47cd-84f4-ac653b11e9b1", "metadata": {}, "outputs": [ @@ -12809,7 +12774,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "1d90ce87-aafd-4958-8318-09f66793b98e", "metadata": {}, "outputs": [ @@ -12818,30 +12783,15 @@ "text/html": [ "
\n", "\n", - "
\n", - "
\n", - "

Metrics

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Accuracy1.0000
Precision (w)1.0000
Recall (w)1.0000
F1 (w)1.0000
ROC AUC1.0000
\n", - "
\n", - "
\"confusion
\n", - "

Classification report

\n", - "
              precision    recall  f1-score   support\n",
-       "\n",
-       " Engineering       1.00      1.00      1.00         2\n",
-       "     Finance       1.00      1.00      1.00         1\n",
-       "          HR       1.00      1.00      1.00         1\n",
-       "       Sales       1.00      1.00      1.00         4\n",
-       "\n",
-       "    accuracy                           1.00         8\n",
-       "   macro avg       1.00      1.00      1.00         8\n",
-       "weighted avg       1.00      1.00      1.00         8\n",
-       "
\n", + "
\n", + "

Regression metrics

\n", + " \n", + " \n", + " \n", + " \n", + "
RMSE0.1362
MAE0.1108
0.9858
\n", + "
\n", + " \n", "
" ] }, @@ -12854,51 +12804,58 @@ "

Predictions preview (actual vs predicted)

\n", " \n", " \n", - " \n", + " \n", " \n", - " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
departmentdepartment_lbl_predicted_pred_proba
SalesSales[0.01, 0.0, 0.11, 0.88]32.96
SalesSales[0.0, 0.02, 0.01, 0.97]21.84
HRHR[0.03, 0.0, 0.96, 0.01]33.00
SalesSales[0.0, 0.02, 0.01, 0.97]11.23
FinanceFinance[0.0, 0.92, 0.0, 0.08]22.10
SalesSales[0.0, 0.09, 0.0, 0.91]00.16
EngineeringEngineering[0.98, 0.0, 0.02, 0.0]0-0.03
EngineeringEngineering[0.98, 0.0, 0.02, 0.0]00.16
0-0.03
21.84
11.23
0-0.03
" @@ -12914,7 +12871,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 7, "id": "3e1a5300-a034-469a-abed-b50108a7f3a9", "metadata": {}, "outputs": [ @@ -12922,7 +12879,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "✅ Model from data['last_model'] saved to ./models/test_model.joblib\n" + "Model from data['last_model'] saved to ./models/test_model.joblib\n" ] } ], @@ -12932,7 +12889,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 8, "id": "5d5a292a-f4af-47bc-9eaa-44ddd63078ec", "metadata": {}, "outputs": [ @@ -12940,7 +12897,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "✅ Loaded model from ./models/test_model.joblib → data['restored_model']\n" + "Loaded model from ./models/test_model.joblib → data['restored_model']\n" ] } ], @@ -12950,23 +12907,21 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "788d9f75-9fe5-4bd2-b39d-1732eeee5bcd", "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "✅ Using inline feature values for prediction: {'age': 38, 'salary': 80000.0}\n", - "prediction\n", - " Sales\n", - "✅ Predictions stored in data['last_preds'] with shape=(1, 1)\n" + "Model meta missing 'features'. Using numeric columns only if applicable.\n", + "Cannot use inline values: model has no stored feature names.\n" ] } ], "source": [ - "%predict model_name=last_model data_name=[38,\"80000.0\"] output_name=last_preds" + "%predict model_name=restored_model data_name=[38,\"80000.0\"] output_name=last_preds" ] }, { @@ -17954,231 +17909,7 @@ { "data": { "text/html": [ - "
idcommand_nameargumentsexecution_timestampaffected_columnsoperation_statusmessagedb_nameuser_namerollback_tokenbackup_tableoriginal_table
1dropmissingcolumns=department mode=preview2025-10-29 15:22:49departmentpreviewpreview_dropped=1testNULLNULLNULL
2dropmissingcolumns=department table=test.employees mode=apply confirm=true 2025-10-29 15:22:49departmentappliedapplied_backup=test.employees_backup_73b9e64a9b8c4045test73b9e64a9b8c4045test.employees_backup_73b9e64a9b8c4045test.employees
3dropmissingmode=rollback rollback_token=73b9e64a9b8c40452025-10-29 15:23:00ALL_COLUMNSrollbackrestored_to=test.employees;previous_saved_as=test.employees_prerollback_73b9e64a9b8c4045test73b9e64a9b8c4045test.employees_backup_73b9e64a9b8c4045test.employees
4fillmissingcolumns=years_experience strategy=median mode=preview2025-10-29 15:36:42years_experiencepreviewpreview_computed_fill_valuestestNULLNULLNULL
5fillmissingcolumns=years_experience strategy=median table=test.employees mode=apply confirm=true 2025-10-29 15:37:18years_experienceappliedapplied_backup=test.employees_backup_9407f2e1e7db47b2test9407f2e1e7db47b2test.employees_backup_9407f2e1e7db47b2test.employees
6fillmissingmode=rollback rollback_token=9407f2e1e7db47b22025-10-29 15:39:07emp_id\n", - "name\n", - "department\n", - "age\n", - "gender\n", - "education_level\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagrollbackrestored_to=test.employees;previous_saved_as=test.employees_prerollback_9407f2e1e7db47b2test9407f2e1e7db47b2test.employees_backup_9407f2e1e7db47b2test.employees
7outliers2025-10-29 15:54:23errorNo numeric target columns found to detect outliers.testNULLNULLNULL
8outliers2025-10-29 15:54:44emp_id\n", - "age\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagsuccessColumn 'emp_id': detected 0 outlier(s) using iqr.\n", - "Column 'age': detected 0 outlier(s) using iqr.\n", - "Column 'years_experience': detected 0 outlier(s) using iqr.\n", - "Column 'projects_completed': detected 0 outlier(s) using iqr.\n", - "Column 'avg_project_score': detected 0 outlier(s) using iqr.\n", - "Column 'certifications': detected 0 outlier(s) using iqr.\n", - "Column 'training_hours': detected 0 outlier(s) using iqr.\n", - "Column 'overtime_hours': detected 0 outlier(s) using iqr.\n", - "Column 'remote_ratio': detected 0 outlier(s) using iqr.\n", - "Column 'salary': detected 1 outlier(s) using iqr.\n", - "Column 'bonus': detected 0 outlier(s) using iqr.\n", - "Column 'satisfaction_score': detected 0 outlier(s) using iqr.\n", - "Column 'performance_rating': detected 0 outlier(s) using iqr.\n", - "Column 'potential_score': detected 0 outlier(s) using iqr.\n", - "Column 'attrition_flag': detected 0 outlier(s) using iqr.testNULLNULLNULL
9dropoutliersmode=preview2025-10-29 16:00:10emp_id\n", - "age\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagpreviewpreview_completedtestNULLNULLNULL
10dropoutlierstable=test.employees mode=apply confirm=true 2025-10-29 16:00:58emp_id\n", - "age\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagappliedapplied_db_versionedtest53731015c85a478atest.employees_backup_53731015c85a478atest.employees
11dropoutliersmode=rollback rollback_token=53731015c85a478a2025-10-29 16:02:06emp_id\n", - "age\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagrollbackrestored_to=test.employees;previous_saved_as=test.employees_prerollback_53731015c85a478atest53731015c85a478atest.employees_backup_53731015c85a478atest.employees
12clipoutliersmode=preview2025-10-29 17:01:54emp_id\n", - "age\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagpreviewpreview_completedtestNULLNULLNULL
13clipoutlierstable=test.employees mode=apply confirm=true 2025-10-29 17:02:22emp_id\n", - "age\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagappliedapplied_backup=test.employees_backup_656479791b1d48fctest656479791b1d48fctest.employees_backup_656479791b1d48fctest.employees
14clipoutliersmode=rollback rollback_token=656479791b1d48fc2025-10-29 17:04:03emp_id\n", - "age\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagrollbackrestored_to=test.employees;previous_saved_as=test.employees_prerollback_656479791b1d48fctest656479791b1d48fctest.employees_backup_656479791b1d48fctest.employees
15encodemethod=onehot columns=department drop_original=false mode=preview2025-10-29 17:21:19departmenterrorColumn(s) not found: departmenttestNULLNULLNULL
16encodemethod=onehot columns=department drop_original=false mode=preview2025-10-29 17:21:42departmentpreviewpreview_completedtestNULLNULLNULL
17encodemethod=onehot columns=department drop_original=false table=test.employees mode=apply confirm=true 2025-10-29 17:24:34departmentappliedapplied_backup=test.employees_backup_8d3413bf829d4cd8;created_columns=5test8d3413bf829d4cd8test.employees_backup_8d3413bf829d4cd8test.employees
18encodemode=rollback rollback_token=8d3413bf829d4cd82025-10-29 17:26:10name\n", - "department\n", - "gender\n", - "education_levelrollbackrestored_to=test.employees;previous_saved_as=test.employees_prerollback_8d3413bf829d4cd8test8d3413bf829d4cd8test.employees_backup_8d3413bf829d4cd8test.employees
19encodemethod=onehot columns=department drop_original=false 2025-10-29 17:32:31departmentpreviewpreview_completedtestNULLNULLNULL
20normalizecolumns=age,salary feature_range=5,10 mode=preview2025-10-29 17:44:41age\n", - "salarypreviewpreview_completedtestNULLNULLNULL
21normalize feature_range=5,10 mode=preview2025-10-29 17:45:01emp_id\n", - "age\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagpreviewpreview_completedtestNULLNULLNULL
22normalizefeature_range=5,10 table=test.employees mode=apply confirm=true 2025-10-29 17:47:11emp_id\n", - "age\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagappliedapplied_backup=test.employees_backup_daf864252a6c46f1;details=emp_id: min=1.0, max=10.0|age: min=27.0, max=55.0|years_experience: min=3.0, max=30.0|projects_completed: min=8.0, max=30.0|avg_project_score: min=68.9, max=95.2|certifications: min=0.0, max=3.0|training_hours: min=5.0, max=50.0|overtime_hours: min=2.0, max=25.0|remote_ratio: min=0.0, max=0.8|salary: min=55000.0, max=1200000.0|bonus: min=300.0, max=15000.0|satisfaction_score: min=5.5, max=9.6|performance_rating: min=1.0, max=5.0|potential_score: min=50.0, max=95.0|attrition_flag: min=0.0, max=1.0testdaf864252a6c46f1test.employees_backup_daf864252a6c46f1test.employees
23normalizemode=rollback rollback_token=daf864252a6c46f12025-10-29 17:48:30emp_id\n", - "age\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagrollbackrestored_to=test.employees;previous_saved_as=test.employees_prerollback_daf864252a6c46f1testdaf864252a6c46f1test.employees_backup_daf864252a6c46f1test.employees
24standardizemode=preview2025-10-29 17:56:44emp_id\n", - "age\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagpreviewpreview_completedtestNULLNULLNULL
25standardizetable=test.employees mode=apply confirm=true 2025-10-29 17:57:27emp_id\n", - "age\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagappliedapplied_backup=test.employees_backup_f15a883f004548a8;details=emp_id: mean=5.5, std=2.8723|age: mean=38.2, std=9.0642|years_experience: mean=13.5556, std=9.2989|projects_completed: mean=17.2, std=6.7201|avg_project_score: mean=84.19, std=8.217354|certifications: mean=1.0, std=1.0|training_hours: mean=25.0, std=13.2288|overtime_hours: mean=10.8, std=6.9397|remote_ratio: mean=0.38, std=0.25219|salary: mean=189700.0, std=337053.126376|bonus: mean=6730.0, std=4002.011994|satisfaction_score: mean=7.94, std=1.191805|performance_rating: mean=3.6, std=1.2806|potential_score: mean=77.0, std=13.43875|attrition_flag: mean=0.3, std=0.4583testf15a883f004548a8test.employees_backup_f15a883f004548a8test.employees
26standardizemode=rollback rollback_token=f15a883f004548a82025-10-29 17:58:34emp_id\n", - "age\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagrollbackrestored_to=test.employees;previous_saved_as=test.employees_prerollback_f15a883f004548a8testf15a883f004548a8test.employees_backup_f15a883f004548a8test.employees
27encodemethod=label drop_original=true2025-10-29 18:21:31name\n", - "department\n", - "gender\n", - "education_levelpreviewpreview_completedtestNULLNULLNULL
28select_featurestarget=attrition_flag method=correlation k=5 problem=classification output_name=top_features2025-10-29 18:22:01emp_id,name,department,age,gender,education_level,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score,performance_rating,potential_scoreerrorError during feature selection: could not convert string to float: 'Alice'testNULLNULLNULL
29encodemethod=label drop_original=true2025-10-29 18:22:26name\n", - "department\n", - "gender\n", - "education_levelpreviewpreview_completedtestNULLNULLNULL
30encodemethod=label drop_original=true mode=apply2025-10-29 18:22:48name\n", + "
idcommand_nameargumentsexecution_timestampaffected_columnsoperation_statusmessagedb_nameuser_namerollback_tokenbackup_tableoriginal_table
1encodemethod=label drop_original=true mode=apply2025-10-30 11:36:29name\n", "department\n", "gender\n", "education_levelsuccessMethod: label\n", @@ -18192,354 +17923,14 @@ "Column 'name': label-encoded -> name_lbl (unique_values=10)\n", "Column 'department': label-encoded -> department_lbl (unique_values=4)\n", "Column 'gender': label-encoded -> gender_lbl (unique_values=2)\n", - "Column 'education_level': label-encoded -> education_level_lbl (unique_values=4)testNULLNULLNULL
31select_featurestarget=attrition_flag method=correlation k=5 problem=classification 2025-10-29 18:23:26overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_ratingsuccessSelected 5 features saved to data['selected_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_ratingtestNULLNULLNULL
32select_modeltarget=attrition_flag problem=classification2025-10-29 18:24:09overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_ratingsuccessBest model 'logistic' (mean accuracy=1.0000) saved to data['best_model'].testNULLNULLNULL
33select_modeltarget=attrition_flag problem=classification2025-10-29 18:36:07errorNo features provided and no selected_features found. Run %select_features first.testNULLNULLNULL
34dropmissingcolumns=emp_id,name,department,age,gender,education_level,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score,performance_rating,potential_score,attrition_flag2025-10-29 18:36:38emp_id\n", - "name\n", - "department\n", - "age\n", - "gender\n", - "education_level\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagpreviewpreview_dropped=12testNULLNULLNULL
35encodemethod=onehot columns=name,department,gender,education_level inplace=True drop_original=True2025-10-29 18:36:38name\n", - "department\n", - "gender\n", - "education_levelpreviewpreview_completedtestNULLNULLNULL
36mlpipelinemethod=onehot columns=name,department,gender,education_level inplace=True drop_original=True2025-10-29 18:36:39name,department,gender,education_levelerrorEncoder not found after encoding. Ensure %encode saves the encoder to data['last_select_encoder'].testNULLNULLNULL
37dropmissingcolumns=emp_id,name,department,age,gender,education_level,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score,performance_rating,potential_score,attrition_flag2025-10-29 18:48:09emp_id\n", - "name\n", - "department\n", - "age\n", - "gender\n", - "education_level\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagpreviewpreview_dropped=12testNULLNULLNULL
38encodemethod=onehot columns=name,department,gender,education_level inplace=True drop_original=True2025-10-29 18:48:09name\n", - "department\n", - "gender\n", - "education_levelpreviewpreview_completedtestNULLNULLNULL
39mlpipelinemethod=onehot columns=name,department,gender,education_level inplace=True drop_original=True2025-10-29 18:48:09name_Alice,name_Bob,name_Charlie,name_Diana,name_Eve,name_Frank,name_Grace,name_Henry,name_Ivy,name_Jack,department_HR,department_Engineering,department_Sales,department_nan,department_Finance,gender_F,gender_M,education_level_nan,education_level_Masters,education_level_Bachelors,education_level_PhD,education_level_High SchoolerrorEncoded features not found in DataFrame: name_Alice, name_Bob, name_Charlie, name_Diana, name_Eve, name_Frank, name_Grace, name_Henry, name_Ivy, name_Jack, department_HR, department_Engineering, department_Sales, department_nan, department_Finance, gender_F, gender_M, education_level_nan, education_level_Masters, education_level_Bachelors, education_level_PhD, education_level_High SchooltestNULLNULLNULL
40dropmissingcolumns=emp_id,name,department,age,gender,education_level,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score,performance_rating,potential_score,attrition_flag2025-10-29 18:57:36emp_id\n", - "name\n", - "department\n", - "age\n", - "gender\n", - "education_level\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagpreviewpreview_dropped=12testNULLNULLNULL
41encodemethod=onehot columns=name,department,gender,education_level inplace=True drop_original=True mode=apply confirm=true2025-10-29 18:57:36name\n", - "department\n", - "gender\n", - "education_levelsuccessMethod: onehot\n", - "Created columns:\n", - "name_Alice\n", - "name_Bob\n", - "name_Charlie\n", - "name_Diana\n", - "name_Eve\n", - "name_Frank\n", - "name_Grace\n", - "name_Henry\n", - "name_Ivy\n", - "name_Jack\n", - "department_Engineering\n", - "department_Finance\n", - "department_HR\n", - "department_Sales\n", - "department____MISSING___\n", - "gender_F\n", - "gender_M\n", - "education_level_Bachelors\n", - "education_level_High School\n", - "education_level_Masters\n", - "education_level_PhD\n", - "education_level____MISSING___\n", - "\n", - "Details:\n", - "Columns ['name', 'department', 'gender', 'education_level'] one-hot encoded -> created 22 columns.testNULLNULLNULL
42select_featurestarget=attrition_flag method=correlation k=5 problem=classification inplace=True2025-10-29 18:57:36overtime_hours\n", - "avg_project_score\n", - "satisfaction_score\n", - "potential_score\n", - "performance_ratingsuccessSelected 5 features saved to data['selected_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_ratingtestNULLNULLNULL
43standardizecolumns=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating inplace=True2025-10-29 18:57:37overtime_hours\n", - "avg_project_score\n", - "satisfaction_score\n", - "potential_score\n", - "performance_ratingpreviewpreview_completedtestNULLNULLNULL
44splitdatatest_size=0.2 val_size=0.0 shuffle=True train_name=last_select_train test_name=last_select_test val_name=last_select_val inplace=True stratify=attrition_flag2025-10-29 18:57:37attrition_flagsuccesstrain_name=last_select_train, test_name=last_select_test, val_name=last_select_val\n", - "train_count=32, test_count=8, val_count=0\n", - "test_frac=0.2, val_frac=0.0, shuffle=True, random_state=NonetestNULLNULLNULL
45select_modelfeatures=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating target=attrition_flag cv=5 primary_metric=accuracy problem=classification output_name=last_model inplace=True2025-10-29 18:57:52overtime_hours\n", - "avg_project_score\n", - "satisfaction_score\n", - "potential_score\n", - "performance_ratingsuccessBest model 'logistic' (mean accuracy=1.0000) saved to data['last_model'].testNULLNULLNULL
46mlpipelinetarget=attrition_flag problem=classification save_path=./models/model.joblib2025-10-29 18:57:52overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_ratingsuccessML pipeline completed successfully. model=last_model saved_to=./models/model.joblibtestNULLNULLNULL
47mlpipelinetarget=attrition_flag problem=classification save_path=./models/model.joblib2025-10-29 19:04:49errorTarget column 'attrition_flag' not found in DataFrame.testNULLNULLNULL
48dropmissingcolumns=emp_id,name,department,age,gender,education_level,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score,performance_rating,potential_score,attrition_flag2025-10-29 19:05:13emp_id\n", - "name\n", - "department\n", - "age\n", - "gender\n", - "education_level\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagpreviewpreview_dropped=12testNULLNULLNULL
49encodemethod=onehot columns=name,department,gender,education_level inplace=True drop_original=True mode=apply confirm=true2025-10-29 19:05:13name\n", - "department\n", - "gender\n", - "education_levelsuccessMethod: onehot\n", - "Created columns:\n", - "name_Alice\n", - "name_Bob\n", - "name_Charlie\n", - "name_Diana\n", - "name_Eve\n", - "name_Frank\n", - "name_Grace\n", - "name_Henry\n", - "name_Ivy\n", - "name_Jack\n", - "department_Engineering\n", - "department_Finance\n", - "department_HR\n", - "department_Sales\n", - "department____MISSING___\n", - "gender_F\n", - "gender_M\n", - "education_level_Bachelors\n", - "education_level_High School\n", - "education_level_Masters\n", - "education_level_PhD\n", - "education_level____MISSING___\n", - "\n", - "Details:\n", - "Columns ['name', 'department', 'gender', 'education_level'] one-hot encoded -> created 22 columns.testNULLNULLNULL
50select_featurestarget=attrition_flag method=correlation k=5 problem=classification inplace=True2025-10-29 19:05:14overtime_hours\n", - "avg_project_score\n", - "satisfaction_score\n", - "potential_score\n", - "performance_ratingsuccessSelected 5 features saved to data['selected_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_ratingtestNULLNULLNULL
51standardizecolumns=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating inplace=True2025-10-29 19:05:14overtime_hours\n", - "avg_project_score\n", - "satisfaction_score\n", - "potential_score\n", - "performance_ratingpreviewpreview_completedtestNULLNULLNULL
52splitdatatest_size=0.2 val_size=0.0 shuffle=True train_name=last_select_train test_name=last_select_test val_name=last_select_val inplace=True stratify=attrition_flag2025-10-29 19:05:14attrition_flagsuccesstrain_name=last_select_train, test_name=last_select_test, val_name=last_select_val\n", - "train_count=32, test_count=8, val_count=0\n", - "test_frac=0.2, val_frac=0.0, shuffle=True, random_state=NonetestNULLNULLNULL
53select_modelfeatures=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating target=attrition_flag cv=5 primary_metric=accuracy problem=classification output_name=last_model inplace=True2025-10-29 19:05:27overtime_hours\n", - "avg_project_score\n", - "satisfaction_score\n", - "potential_score\n", - "performance_ratingsuccessBest model 'logistic' (mean accuracy=1.0000) saved to data['last_model'].testNULLNULLNULL
54mlpipelinetarget=attrition_flag problem=classification save_path=./models/model.joblib2025-10-29 19:05:27overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_ratingsuccessML pipeline completed successfully. model=last_model saved_to=./models/model.joblibtestNULLNULLNULL
55dropmissingcolumns=emp_id,name,department,age,gender,education_level,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score,performance_rating,potential_score,attrition_flag2025-10-29 19:08:25emp_id\n", - "name\n", - "department\n", - "age\n", - "gender\n", - "education_level\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagpreviewpreview_dropped=12testNULLNULLNULL
56encodemethod=onehot columns=name,department,gender,education_level inplace=True drop_original=True mode=apply confirm=true2025-10-29 19:08:25name\n", - "department\n", - "gender\n", - "education_levelsuccessMethod: onehot\n", - "Created columns:\n", - "name_Alice\n", - "name_Bob\n", - "name_Charlie\n", - "name_Diana\n", - "name_Eve\n", - "name_Frank\n", - "name_Grace\n", - "name_Henry\n", - "name_Ivy\n", - "name_Jack\n", - "department_Engineering\n", - "department_Finance\n", - "department_HR\n", - "department_Sales\n", - "department____MISSING___\n", - "gender_F\n", - "gender_M\n", - "education_level_Bachelors\n", - "education_level_High School\n", - "education_level_Masters\n", - "education_level_PhD\n", - "education_level____MISSING___\n", - "\n", - "Details:\n", - "Columns ['name', 'department', 'gender', 'education_level'] one-hot encoded -> created 22 columns.testNULLNULLNULL
57select_featurestarget=attrition_flag method=correlation k=5 problem=classification inplace=True2025-10-29 19:08:25overtime_hours\n", - "avg_project_score\n", - "satisfaction_score\n", - "potential_score\n", - "performance_ratingsuccessSelected 5 features saved to data['selected_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_ratingtestNULLNULLNULL
58standardizecolumns=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating inplace=True2025-10-29 19:08:25overtime_hours\n", - "avg_project_score\n", - "satisfaction_score\n", - "potential_score\n", - "performance_ratingpreviewpreview_completedtestNULLNULLNULL
59splitdatatest_size=0.2 val_size=0.0 shuffle=True train_name=last_select_train test_name=last_select_test val_name=last_select_val inplace=True stratify=attrition_flag2025-10-29 19:08:26attrition_flagsuccesstrain_name=last_select_train, test_name=last_select_test, val_name=last_select_val\n", - "train_count=32, test_count=8, val_count=0\n", - "test_frac=0.2, val_frac=0.0, shuffle=True, random_state=NonetestNULLNULLNULL
60select_modelfeatures=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating target=attrition_flag cv=5 primary_metric=accuracy problem=classification output_name=last_model inplace=True2025-10-29 19:08:38overtime_hours\n", - "avg_project_score\n", - "satisfaction_score\n", - "potential_score\n", - "performance_ratingsuccessBest model 'logistic' (mean accuracy=1.0000) saved to data['last_model'].testNULLNULLNULL
61mlpipelinetarget=attrition_flag problem=classification save_path=./models/model.joblib2025-10-29 19:08:38overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_ratingsuccessML pipeline completed successfully. model=last_model saved_to=./models/model.joblibtestNULLNULLNULL
62mlpipelinetarget=attrition_flag problem=classification save_path=./models/model.joblib2025-10-29 19:11:04errorTarget column 'attrition_flag' not found in DataFrame.testNULLNULLNULL
63dropmissingcolumns=emp_id,name,department,age,gender,education_level,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score,performance_rating,potential_score,attrition_flag2025-10-29 19:11:31emp_id\n", - "name\n", - "department\n", - "age\n", - "gender\n", - "education_level\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagpreviewpreview_dropped=12testNULLNULLNULL
64encodemethod=onehot columns=name,department,gender,education_level inplace=True drop_original=True mode=apply confirm=true2025-10-29 19:11:32name\n", - "department\n", - "gender\n", - "education_levelsuccessMethod: onehot\n", - "Created columns:\n", - "name_Alice\n", - "name_Bob\n", - "name_Charlie\n", - "name_Diana\n", - "name_Eve\n", - "name_Frank\n", - "name_Grace\n", - "name_Henry\n", - "name_Ivy\n", - "name_Jack\n", - "department_Engineering\n", - "department_Finance\n", - "department_HR\n", - "department_Sales\n", - "department____MISSING___\n", - "gender_F\n", - "gender_M\n", - "education_level_Bachelors\n", - "education_level_High School\n", - "education_level_Masters\n", - "education_level_PhD\n", - "education_level____MISSING___\n", - "\n", - "Details:\n", - "Columns ['name', 'department', 'gender', 'education_level'] one-hot encoded -> created 22 columns.testNULLNULLNULL
65select_featurestarget=attrition_flag method=correlation k=5 problem=classification inplace=True2025-10-29 19:11:32overtime_hours\n", - "avg_project_score\n", - "satisfaction_score\n", - "potential_score\n", - "performance_ratingsuccessSelected 5 features saved to data['selected_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_ratingtestNULLNULLNULL
66standardizecolumns=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating inplace=True2025-10-29 19:11:32overtime_hours\n", - "avg_project_score\n", - "satisfaction_score\n", - "potential_score\n", - "performance_ratingpreviewpreview_completedtestNULLNULLNULL
67splitdatatest_size=0.2 val_size=0.0 shuffle=True train_name=last_select_train test_name=last_select_test val_name=last_select_val inplace=True stratify=attrition_flag2025-10-29 19:11:32attrition_flagsuccesstrain_name=last_select_train, test_name=last_select_test, val_name=last_select_val\n", - "train_count=32, test_count=8, val_count=0\n", - "test_frac=0.2, val_frac=0.0, shuffle=True, random_state=NonetestNULLNULLNULL
68select_modelfeatures=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating target=attrition_flag cv=5 primary_metric=accuracy problem=classification output_name=last_model inplace=True2025-10-29 19:11:45overtime_hours\n", - "avg_project_score\n", - "satisfaction_score\n", - "potential_score\n", - "performance_ratingsuccessBest model 'logistic' (mean accuracy=1.0000) saved to data['last_model'].testNULLNULLNULL
69mlpipelinetarget=attrition_flag problem=classification save_path=./models/model.joblib2025-10-29 19:11:45overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_ratingsuccessML pipeline completed successfully. model=last_model saved_to=./models/model.joblibtestNULLNULLNULL
70dropmissingcolumns=emp_id,name,department,age,gender,education_level,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score,performance_rating,potential_score,attrition_flag2025-10-29 19:14:14emp_id\n", - "name\n", - "department\n", - "age\n", - "gender\n", - "education_level\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagpreviewpreview_dropped=12testNULLNULLNULL
71encodemethod=onehot columns=name,department,gender,education_level inplace=True drop_original=True mode=apply confirm=true2025-10-29 19:14:14name\n", - "department\n", - "gender\n", - "education_levelsuccessMethod: onehot\n", - "Created columns:\n", - "name_Alice\n", - "name_Bob\n", - "name_Charlie\n", - "name_Diana\n", - "name_Eve\n", - "name_Frank\n", - "name_Grace\n", - "name_Henry\n", - "name_Ivy\n", - "name_Jack\n", - "department_Engineering\n", - "department_Finance\n", - "department_HR\n", - "department_Sales\n", - "department____MISSING___\n", - "gender_F\n", - "gender_M\n", - "education_level_Bachelors\n", - "education_level_High School\n", - "education_level_Masters\n", - "education_level_PhD\n", - "education_level____MISSING___\n", - "\n", - "Details:\n", - "Columns ['name', 'department', 'gender', 'education_level'] one-hot encoded -> created 22 columns.testNULLNULLNULL
72select_featurestarget=attrition_flag method=correlation k=5 problem=classification inplace=True2025-10-29 19:14:14overtime_hours\n", - "avg_project_score\n", - "satisfaction_score\n", - "potential_score\n", - "performance_ratingsuccessSelected 5 features saved to data['selected_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_ratingtestNULLNULLNULL
73standardizecolumns=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating inplace=True2025-10-29 19:14:15overtime_hours\n", - "avg_project_score\n", - "satisfaction_score\n", - "potential_score\n", - "performance_ratingpreviewpreview_completedtestNULLNULLNULL
74splitdatatest_size=0.2 val_size=0.0 shuffle=True train_name=last_select_train test_name=last_select_test val_name=last_select_val inplace=True stratify=attrition_flag2025-10-29 19:14:15attrition_flagsuccesstrain_name=last_select_train, test_name=last_select_test, val_name=last_select_val\n", - "train_count=32, test_count=8, val_count=0\n", - "test_frac=0.2, val_frac=0.0, shuffle=True, random_state=NonetestNULLNULLNULL
75select_modelfeatures=overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_rating target=attrition_flag cv=5 primary_metric=accuracy problem=classification output_name=last_model inplace=True2025-10-29 19:14:29overtime_hours\n", - "avg_project_score\n", - "satisfaction_score\n", - "potential_score\n", - "performance_ratingsuccessBest model 'logistic' (mean accuracy=1.0000) saved to data['last_model'].testNULLNULLNULL
76mlpipelinetarget=attrition_flag problem=classification save_path=./models/model.joblib2025-10-29 19:14:29overtime_hours,avg_project_score,satisfaction_score,potential_score,performance_ratingsuccessML pipeline completed successfully. model=last_model saved_to=./models/model.joblibtestNULLNULLNULL
" + "Column 'education_level': label-encoded -> education_level_lbl (unique_values=4)
testNULLNULLNULL
2splitdatastratify=department test_size=0.3 val_size=0.1 random_state=1232025-10-30 11:36:44errorStratify column 'department' not found in DataFrame.testNULLNULLNULL
3splitdatatest_size=0.3 val_size=0.1 random_state=1232025-10-30 11:36:50ALL_COLUMNSsuccesstrain_name=last_select_train, test_name=last_select_test, val_name=last_select_val\n", + "train_count=23, test_count=12, val_count=5\n", + "test_frac=0.3, val_frac=0.1, shuffle=True, random_state=123testNULLNULLNULL
4train_modelmodel=catboost target=department_lbl features=age,salary model_params='{\"iterations\":50}'2025-10-30 11:37:25age\n", + "salarysuccessModel 'catboost' trained and saved to data['last_model']. problem=regression. train_rows=23testNULLNULLNULL
5evaluate_model2025-10-30 11:37:32age\n", + "salaryerrorError computing classification metrics: Classification metrics can't handle a mix of multiclass and continuous targetstestNULLNULLNULL
6train_modeltarget=department_lbl features=age,salary model=random_forest n_estimators=50 max_depth=42025-10-30 11:37:58age\n", + "salarysuccessModel 'random_forest' trained and saved to data['last_model']. problem=regression. train_rows=23testNULLNULLNULL
7evaluate_model2025-10-30 11:38:02age\n", + "salarysuccessEvaluation success. Model='last_model', test='last_select_test', preds_saved='last_preds'. rmse=0.0993, mae=0.0792, r2=0.9924testNULLNULLNULL
8save_modelmodel_name_in_data=last_model save_path=./models/test_model.joblib overwrite=True2025-10-30 11:38:23last_modelsuccessSaved model to ./models/test_model.joblibtestNULLNULLNULL
9load_modelload_path=./models/test_model.joblib target_key=restored_model2025-10-30 11:38:25restored_modelsuccessLoaded model from ./models/test_model.joblib → data['restored_model']testNULLNULLNULL
10predict_modelmodel_name=restored_model data_name=[38,\"80000.0\"] output_name=last_preds2025-10-30 11:38:43errorCannot use inline values: model has no stored feature names.testNULLNULLNULL
11predict_modelmodel_name=last_model data_name=[38,\"80000.0\"] output_name=last_preds2025-10-30 11:39:07age\n", + "salarysuccessPrediction success. model=last_model, data_arg=[38, 80000.0], output=last_preds, shape=(1, 1) inline_values=[38, 80000.0]testNULLNULLNULL
" ] }, "metadata": {}, diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/evaluate_model.py b/mariadb_kernel/maria_magics/ml_commands/model_training/evaluate_model.py index a6020b8..b038f24 100644 --- a/mariadb_kernel/maria_magics/ml_commands/model_training/evaluate_model.py +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/evaluate_model.py @@ -8,6 +8,9 @@ import numpy as np import joblib import json +import logging +import os +import re from sklearn.metrics import ( accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, @@ -23,6 +26,13 @@ import io import base64 +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + + class EvaluateModel(MariaMagic): """ %evaluate_model [model_name=last_model] [test_name=last_select_test] [pred_name=last_preds] @@ -30,6 +40,10 @@ class EvaluateModel(MariaMagic): Nice, visual evaluation of a trained model: metrics card, confusion-matrix plot, classification report and a preview table of actual vs predicted. + + This version adds metadata logging to magic_metadata table: + - creates magic_metadata if needed + - logs error rows on failures and a success row on successful evaluation """ def __init__(self, args=""): self.args = args @@ -122,11 +136,185 @@ def _plot_confusion_matrix_to_datauri(self, cm, labels): data = base64.b64encode(buf.read()).decode("ascii") return f"data:image/png;base64,{data}" + # -------------------- metadata helpers (copied/adapted) -------------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. 
None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return 
+ table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + # -------------------- end metadata helpers -------------------- + def execute(self, kernel, data): + # Prepare metadata context early + db_name = 
self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + try: args = self.parse_args(self.args) except Exception: - kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + msg = "Error parsing arguments. Use key=value syntax." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return model_store_name = args.get("model_name", args.get("model", "last_model")) @@ -137,13 +325,41 @@ def execute(self, kernel, data): # fetch model model = data.get(model_store_name) if model is None: - kernel._send_message("stderr", f"No model found in data['{model_store_name}']. Train and save a model first.") + msg = f"No model found in data['{model_store_name}']. Train and save a model first." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # fetch test set test_df = data.get(test_name) if test_df is None or not isinstance(test_df, pd.DataFrame) or test_df.empty: - kernel._send_message("stderr", f"No test DataFrame found in data['{test_name}'] or it is empty.") + msg = f"No test DataFrame found in data['{test_name}'] or it is empty." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # infer problem if not provided @@ -166,18 +382,74 @@ def execute(self, kernel, data): target_col = possible_targets[0] if not target_col: - kernel._send_message("stderr", "Target column not found in model meta and could not be inferred from test DataFrame. " - "Set data[model_name + '_meta']['target']='' when training, or pass target info in meta.") + msg = ("Target column not found in model meta and could not be inferred from test DataFrame. " + "Set data[model_name + '_meta']['target']='' when training, or pass target info in meta.") + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features) if features else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return if target_col not in test_df.columns: - kernel._send_message("stderr", f"Target column '{target_col}' not present in test DataFrame '{test_name}'.") + msg = f"Target column '{target_col}' not present in test DataFrame '{test_name}'." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features) if features else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return if not features: - kernel._send_message("stderr", "Model metadata does not contain 'features' list. 
Cannot build X_test.") + msg = "Model metadata does not contain 'features' list. Cannot build X_test." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return missing_features = [c for c in features if c not in test_df.columns] if missing_features: - kernel._send_message("stderr", f"Test DataFrame missing feature columns: {', '.join(missing_features)}") + msg = f"Test DataFrame missing feature columns: {', '.join(missing_features)}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return X_test = test_df[features].copy() @@ -187,7 +459,21 @@ def execute(self, kernel, data): try: preds_raw = model.predict(X_test) except Exception as e: - kernel._send_message("stderr", f"Error during prediction: {e}") + msg = f"Error during prediction: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # predict_proba if available @@ -222,6 +508,10 @@ def execute(self, kernel, data): out_lines = [] metrics_html = "" cm_image_uri = None + + # We'll collect a compact metrics summary to store in metadata message on success + metrics_summary = {} + if problem == "classification": y_true_vals = np.asarray(y_true_orig) preds_vals = 
np.asarray(preds_display) @@ -248,9 +538,26 @@ def is_mixed(a, b): f1 = f1_score(y_metric, p_metric, average="weighted", zero_division=0) cm = confusion_matrix(y_metric, p_metric) except Exception as e: - kernel._send_message("stderr", f"Error computing classification metrics: {e}") + msg = f"Error computing classification metrics: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return + metrics_summary.update({"accuracy": float(acc), "precision": float(prec), + "recall": float(rec), "f1": float(f1)}) + # ROC AUC if possible roc_text = "N/A" if pred_proba is not None and model_classes is not None: @@ -264,9 +571,11 @@ def is_mixed(a, b): if proba_arr.ndim == 1: roc_auc = roc_auc_score(y_idx.astype(int), proba_arr.astype(float)) roc_text = f"{roc_auc:.4f}" + metrics_summary["roc_auc"] = float(roc_auc) else: roc_auc = roc_auc_score(y_idx.astype(int), proba_arr, multi_class="ovr", average="weighted") roc_text = f"{roc_auc:.4f}" + metrics_summary["roc_auc"] = float(roc_auc) except Exception: roc_text = "Computation failed." @@ -286,7 +595,6 @@ def is_mixed(a, b): """ # Render confusion matrix as image and embed - # determine label names for axes try: if model_classes is not None: label_names = [str(c) for c in model_classes] @@ -298,7 +606,6 @@ def is_mixed(a, b): cm_image_uri = self._plot_confusion_matrix_to_datauri(cm_arr, label_names) metrics_html += f'
confusion matrix
' except Exception: - # fallback: textual representation included below metrics_html += '
Confusion matrix image failed to render.
' metrics_html += "" # close flex container @@ -319,9 +626,25 @@ def is_mixed(a, b): mae = float(mean_absolute_error(y_true_num, preds_num)) r2 = float(r2_score(y_true_num, preds_num)) except Exception as e: - kernel._send_message("stderr", f"Error computing regression metrics: {e}") + msg = f"Error computing regression metrics: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return + metrics_summary.update({"rmse": float(rmse), "mae": float(mae), "r2": float(r2)}) + metrics_html = f"""

Regression metrics

@@ -343,7 +666,7 @@ def is_mixed(a, b): html_parts.append("

Classification report

") html_parts.append(f"
{report}
") # also add textual confusion matrix below if image not present - if cm_image_uri is None: + if cm_image_uri is None and 'cm' in locals(): html_parts.append("

Confusion matrix

")
                 html_parts.append(str(cm.tolist()))
                 html_parts.append("
") @@ -363,4 +686,30 @@ def is_mixed(a, b): except Exception: pass + # Insert success metadata + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_cols_str = "\n".join(features) + # craft a concise message describing main metrics + if problem == "classification": + main_metrics = ", ".join(f"{k}={v:.4f}" for k, v in metrics_summary.items()) + else: + main_metrics = ", ".join(f"{k}={v:.4f}" for k, v in metrics_summary.items()) + message_str = f"Evaluation success. Model='{model_store_name}', test='{test_name}', preds_saved='{pred_name}'. {main_metrics}" + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_cols_str, + operation_status="success", + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + return diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/loadmodel.py b/mariadb_kernel/maria_magics/ml_commands/model_training/loadmodel.py index 0442b27..1fffe14 100644 --- a/mariadb_kernel/maria_magics/ml_commands/model_training/loadmodel.py +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/loadmodel.py @@ -4,6 +4,15 @@ from distutils import util import logging from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import os +import re +import pandas as pd + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None def _str_to_obj(s): @@ -33,6 +42,14 @@ class LoadModel(MariaMagic): %load_model load_path=/tmp/model.joblib [target_key=last_model] Loads a locally saved .joblib model into the `data` dictionary. 
+ + This version: + - detects save formats: + * raw model object (backwards compatible) + * dict {"model": , "meta": } (round-trip with SaveModel) + - restores meta into data[target_key + '_meta'] when available + - attempts minimal inference of features from model if present (feature_names_in_) + - logs metadata to magic_metadata table (creates it if necessary) """ def __init__(self, args=""): @@ -56,23 +73,298 @@ def parse_args(self, input_str): pairs[k] = _str_to_obj(v) return pairs + # -------------------- small utilities for metadata -------------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" 
+ return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = 
self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + # -------------------- end metadata helpers -------------------- + def execute(self, kernel, data): + # Prepare metadata context early + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + # parse args try: args = self.parse_args(self.args) except Exception: - kernel._send_message("stderr", "Error parsing arguments.") + msg = "Error parsing arguments." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return load_path = args.get("load_path") target_key = args.get("target_key", "last_model") if not load_path: - kernel._send_message("stderr", "You must provide load_path=/path/to/file.joblib") + msg = "You must provide load_path=/path/to/file.joblib" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=target_key or "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return + # Attempt to load try: - model_obj = joblib.load(load_path) - data[target_key] = model_obj - kernel._send_message("stdout", f"Loaded model from {load_path} → data['{target_key}']") + loaded = joblib.load(load_path) + + # Detect saved structure: either raw model, or {"model": model, "meta": meta} + model_obj = None + restored_meta = None + + if isinstance(loaded, dict) and "model" in loaded: + model_obj = loaded.get("model") + restored_meta = loaded.get("meta") if isinstance(loaded.get("meta"), dict) else None + else: + # backwards-compatible: raw model object + model_obj = loaded + restored_meta = None + + # Minimal inference: if no meta restored, but model exposes feature_names_in_, capture it + try: + if not isinstance(restored_meta, dict): + inferred_meta = {} + if hasattr(model_obj, "feature_names_in_"): + try: + inferred_meta["features"] = list(getattr(model_obj, "feature_names_in_")) + except Exception: + pass + # If we inferred something, assign restored_meta to keep behavior consistent + if inferred_meta: + restored_meta = inferred_meta + # store 
model and meta into data + data[target_key] = model_obj + if isinstance(restored_meta, dict) and restored_meta: + data[target_key + "_meta"] = restored_meta + except Exception: + # if for any reason storing meta fails, just store the model + data[target_key] = model_obj + + # Prepare success message + meta_info = "" + try: + if isinstance(restored_meta, dict) and restored_meta: + feat = restored_meta.get("features") + tgt = restored_meta.get("target") or restored_meta.get("target_col") + parts = [] + if feat: + parts.append(f"features[{len(feat)}]") + if tgt: + parts.append(f"target={tgt}") + if parts: + meta_info = " (" + ", ".join(parts) + ")" + except Exception: + meta_info = "" + + success_msg = f"Loaded model from {load_path} → data['{target_key}']{meta_info}" + kernel._send_message("stdout", success_msg) + + # write success metadata + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_cols_str = target_key + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_cols_str, + operation_status="success", + message=success_msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + except Exception as e: - kernel._send_message("stderr", f"Failed to load model: {e}") + msg = f"Failed to load model: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=target_key, + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/predict.py b/mariadb_kernel/maria_magics/ml_commands/model_training/predict.py index 629ef5d..38ea857 100644 --- 
a/mariadb_kernel/maria_magics/ml_commands/model_training/predict.py +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/predict.py @@ -7,6 +7,15 @@ import shlex import json from distutils import util +import logging +import os +import re + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None class Predict(MariaMagic): @@ -16,10 +25,14 @@ class Predict(MariaMagic): You can also provide inline values: %predict_model model_name=last_model data_name=[38, 80000.0] output_name=last_preds + + This version records metadata into magic_metadata table (creates it if needed), + logging errors and a final success entry on completion. """ def __init__(self, args=""): self.args = args + self.log = logging.getLogger(__name__) def type(self): return "Line" @@ -61,11 +74,196 @@ def parse_args(self, input_str): pairs[k] = self._str_to_obj(v) return pairs + def _send_html(self, kernel, df, title=None): + try: + html = df.to_html(index=False) + if title: + html = f"

{title}

" + html + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {"text/html": html}, "metadata": {}}) + except Exception: + pass + + # -------------------- metadata helpers (copied/adapted) -------------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, 
"session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( 
+ {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + # -------------------- end metadata helpers -------------------- + def execute(self, kernel, data): + # Prepare metadata context early + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + # parse args try: args = self.parse_args(self.args) except Exception: - kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + msg = "Error parsing arguments. Use key=value syntax." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return model_name = args.get("model_name", "last_model") @@ -77,7 +275,21 @@ def execute(self, kernel, data): # --- 1. Retrieve model --- model = data.get(model_name) if model is None: - kernel._send_message("stderr", f"No model found in data['{model_name}']. Train one first.") + msg = f"No model found in data['{model_name}']. Train one first." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=model_name or "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # --- 2. Load metadata --- @@ -87,37 +299,149 @@ def execute(self, kernel, data): if not features: kernel._send_message("stderr", "Model meta missing 'features'. Using numeric columns only if applicable.") + # leave features as empty list so we can attempt to infer columns from df later features = [] # --- 3. Determine input mode --- df = None + inline_used = False + inline_vals = None if isinstance(data_arg, list): # Inline list of feature values + inline_used = True + inline_vals = data_arg if not features: - kernel._send_message("stderr", "Cannot use inline values: model has no stored feature names.") + msg = "Cannot use inline values: model has no stored feature names." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features) if features else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return if len(data_arg) != len(features): - kernel._send_message("stderr", f"Number of values ({len(data_arg)}) doesn't match expected features ({len(features)}): {features}") + msg = f"Number of values ({len(data_arg)}) doesn't match expected features ({len(features)}): {features}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return df = pd.DataFrame([data_arg], columns=features) kernel._send_message("stdout", f"Using inline feature values for prediction: {dict(zip(features, data_arg))}") + elif isinstance(data_arg, str) and data_arg.startswith("[") and data_arg.endswith("]"): # If user passed JSON array as string, parse it try: vals = json.loads(data_arg) + inline_used = True + inline_vals = vals + if not features: + msg = "Cannot use inline values: model has no stored feature names." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features) if features else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return if len(vals) != len(features): - kernel._send_message("stderr", f"Number of values ({len(vals)}) doesn't match expected features ({len(features)}): {features}") + msg = f"Number of values ({len(vals)}) doesn't match expected features ({len(features)}): {features}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return df = pd.DataFrame([vals], columns=features) kernel._send_message("stdout", f"Using inline feature values for prediction: {dict(zip(features, vals))}") except Exception as e: - kernel._send_message("stderr", f"Error parsing inline data list: {e}") + msg = f"Error parsing inline data list: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features) if features else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return else: # DataFrame-based mode df = data.get(data_arg) - if df is None or df.empty: - kernel._send_message("stderr", f"No DataFrame found in data['{data_arg}'] or it's empty.") + if df is None or (isinstance(df, pd.DataFrame) and df.empty): + msg = f"No DataFrame found in data['{data_arg}'] or it's empty." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features) if features else data_arg, + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + if not isinstance(df, pd.DataFrame): + msg = f"data['{data_arg}'] is not a pandas DataFrame." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=data_arg, + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # --- 4. Align columns to features --- @@ -126,10 +450,36 @@ def execute(self, kernel, data): extra = [c for c in df_cols if c not in features] if missing: + # we fill missing with zeros (behavior from original) kernel._send_message("stderr", f"Missing columns not in input: {missing}. Filling with zeros.") if extra: kernel._send_message("stderr", f"Ignoring extra columns not seen during training: {extra}.") + # If no features are known, attempt to use numeric columns from df + if not features: + # prefer numeric columns + numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist() + if numeric_cols: + features = numeric_cols + kernel._send_message("stdout", f"Inferred features from numeric columns: {features}") + else: + msg = "No features available and could not infer numeric columns for prediction." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + X = pd.DataFrame({col: df[col] if col in df.columns else 0 for col in features}) # --- 5. Run predictions --- @@ -145,18 +495,59 @@ def execute(self, kernel, data): y_pred = model.predict(X) pred_df = pd.DataFrame(y_pred, columns=["prediction"]) except Exception as e: - kernel._send_message("stderr", f"Error during prediction: {e}") + msg = f"Error during prediction: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # --- 6. 
Save & display --- data[output_name] = pred_df try: - kernel._send_html(pred_df.head(show_cols), title=f"Predictions ({output_name})") + # prefer html display if kernel supports it + try: + self._send_html(kernel, pred_df.head(show_cols), title=f"Predictions ({output_name})") + except Exception: + kernel._send_message("stdout", pred_df.head(show_cols).to_string(index=False)) except Exception: - kernel._send_message("stdout", pred_df.head(show_cols).to_string(index=False)) + pass + + success_msg = f"Predictions stored in data['{output_name}'] with shape={pred_df.shape}" + kernel._send_message("stdout", success_msg) + + # Insert success metadata + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_cols_str = "\n".join(features) if features else "" + # include a short summary mentioning whether inline values were used + inline_part = f" inline_values={inline_vals}" if inline_used else "" + message_str = f"Prediction success. model={model_name}, data_arg={data_arg}, output={output_name}, shape={pred_df.shape}{inline_part}" + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_cols_str, + operation_status="success", + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass - kernel._send_message( - "stdout", - f"Predictions stored in data['{output_name}'] with shape={pred_df.shape}" - ) + return diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py b/mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py index 5974f4f..460edfe 100644 --- a/mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py @@ -6,6 +6,15 @@ import logging from mariadb_kernel.maria_magics.maria_magic import MariaMagic 
import os +import re +import pandas as pd + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + def _str_to_obj(s): try: @@ -34,6 +43,11 @@ class SaveModel(MariaMagic): %save_model model_name_in_data=last_model save_path=/tmp/model.joblib [overwrite=True|False] Saves a trained model (from the `data` dict) to a local file using joblib. + + This version: + - If data contains model_key + "_meta", saves a dict {"model": model, "meta": meta} + - Otherwise saves the raw model object (backwards compatible) + - Writes metadata rows to magic_metadata (creates table if needed) """ def __init__(self, args=""): @@ -57,37 +71,315 @@ def parse_args(self, input_str): pairs[k] = _str_to_obj(v) return pairs + # -------------------- small utilities for metadata -------------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. 
None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return 
+ table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + # -------------------- end metadata helpers -------------------- + def execute(self, kernel, data): + # Prepare metadata context early + db_name = 
self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + # parse args try: args = self.parse_args(self.args) except Exception: - kernel._send_message("stderr", "Error parsing arguments.") + msg = "Error parsing arguments." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return model_key = args.get("model_name_in_data", "last_model") save_path = args.get("save_path") overwrite = bool(args.get("overwrite", False)) - - if not save_path: - kernel._send_message("stderr", "You must provide save_path=/path/to/file.joblib") + msg = "You must provide save_path=/path/to/file.joblib" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=model_key or "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return model_obj = data.get(model_key) if model_obj is None: - kernel._send_message("stderr", f"No model found in data['{model_key}'].") + msg = f"No model found in data['{model_key}']." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=model_key, + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # If file exists and overwrite=False - import os if os.path.exists(save_path) and not overwrite: - kernel._send_message("stderr", f"File {save_path} already exists. Use overwrite=True to replace it.") + msg = f"File {save_path} already exists. Use overwrite=True to replace it." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=model_key, + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return + # Try to include model metadata (if present) into the saved file so LoadModel can restore it + meta = data.get(model_key + "_meta") + try: os.makedirs(os.path.dirname(save_path), exist_ok=True) - joblib.dump(model_obj, save_path) - kernel._send_message("stdout", f"Model from data['{model_key}'] saved to {save_path}") + if isinstance(meta, dict) and meta: + # Save both model and meta in a single object for round-trip + save_obj = {"model": model_obj, "meta": meta} + joblib.dump(save_obj, save_path) + else: + # No meta available, save raw model (backwards compatible) + joblib.dump(model_obj, save_path) + + success_msg = f"Model from data['{model_key}'] saved to {save_path}" + kernel._send_message("stdout", success_msg) + + # Insert success metadata + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = model_key + # If meta exists, include a short description of key meta fields (features/target) in the message + meta_info = "" + try: + if isinstance(meta, dict): + feat = 
meta.get("features") + tgt = meta.get("target") or meta.get("target_col") + parts = [] + if feat: + # limit length to avoid overly long field in DB + parts.append(f"features[{len(feat)}]") + if tgt: + parts.append(f"target={tgt}") + if parts: + meta_info = " (" + ", ".join(parts) + ")" + except Exception: + meta_info = "" + message_str = f"Saved model to {save_path}{meta_info}" + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status="success", + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + except Exception as e: - kernel._send_message("stderr", f"Failed to save model: {e}") + msg = f"Failed to save model: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=model_key, + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/train_model.py b/mariadb_kernel/maria_magics/ml_commands/model_training/train_model.py index b48bfb7..905ad8a 100644 --- a/mariadb_kernel/maria_magics/ml_commands/model_training/train_model.py +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/train_model.py @@ -8,6 +8,9 @@ import numpy as np import joblib import json +import logging +import os +import re from sklearn.model_selection import cross_val_score from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso @@ -38,6 +41,12 @@ except Exception: pass +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + class 
TrainModel(MariaMagic): """ @@ -48,6 +57,10 @@ class TrainModel(MariaMagic): Train a model on data["last_select"] (TRAINING set). This magic DOES NOT perform splitting or scaling — run your preprocessing and %splitdata beforehand. + + This version adds metadata logging to magic_metadata table similar to SelectModel: + - Ensures magic_metadata exists in current database + - Inserts error/success rows for operations """ def __init__(self, args=""): self.args = args @@ -103,6 +116,155 @@ def _send_html(self, kernel, df, title=None): except Exception: pass + # -------------------- small utilities for metadata -------------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", 
m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = 
self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + # -------------------- end metadata helpers -------------------- + def _choose_model(self, name, problem, params=None): p = params or {} name = name.lower() @@ -155,16 +317,55 @@ def _choose_model(self, name, problem, params=None): raise ValueError(f"Unknown model name '{name}'") def execute(self, kernel, data): + # Prepare metadata context early + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + # Load training DataFrame df = data.get("last_select") if df is None or df.empty: - kernel._send_message("stderr", "No last_select found or DataFrame is empty (training set required).") + msg = "No last_select found or DataFrame is empty (training set required)." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return try: args = self.parse_args(self.args) except Exception: - kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + msg = "Error parsing arguments. Use key=value syntax." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return features_arg = args.get("features") @@ -174,15 +375,42 @@ def execute(self, kernel, data): problem_override = args.get("problem", None) test_name = args.get("test_name", "last_select_test") model_store_name = args.get("model_name", "last_model") - # pred_name and save_path intentionally ignored/removed inplace = bool(args.get("inplace", True)) model_params = args.get("model_params", {}) or {} if not features_arg: - kernel._send_message("stderr", "features argument is required (features=col1,col2...).") + msg = "features argument is required (features=col1,col2...)." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return if not target: - kernel._send_message("stderr", "target argument is required (target=target_col).") + msg = "target argument is required (target=target_col)." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # parse features @@ -191,19 +419,61 @@ def execute(self, kernel, data): elif isinstance(features_arg, (list, tuple)): features = list(features_arg) else: - kernel._send_message("stderr", "features must be comma-separated string or list.") + msg = "features must be comma-separated string or list." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return missing = [c for c in features + [target] if c not in df.columns] if missing: - kernel._send_message("stderr", f"Missing columns in training DataFrame: {', '.join(missing)}") + msg = f"Missing columns in training DataFrame: {', '.join(missing)}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Determine problem type if problem_override: problem = problem_override.lower() if problem not in ("classification", "regression"): - kernel._send_message("stderr", "problem must be 'classification' or 'regression'.") + msg = "problem must be 'classification' or 'regression'." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return else: # improved heuristic for problem detection @@ -236,14 +506,42 @@ def execute(self, kernel, data): if isinstance(test_df, pd.DataFrame) and not test_df.empty: missing_test = [c for c in features + [target] if c not in test_df.columns] if missing_test: - kernel._send_message("stderr", f"Test DataFrame '{test_name}' missing columns: {', '.join(missing_test)}") + msg = f"Test DataFrame '{test_name}' missing columns: {', '.join(missing_test)}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Instantiate model try: model = self._choose_model(model_name_arg, problem, params=model_params) except Exception as e: - kernel._send_message("stderr", f"Error creating model: {e}") + msg = f"Error creating model: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Cross-validation on training set if requested (kept) @@ -253,14 +551,42 @@ def execute(self, kernel, data): scoring = "accuracy" if problem == "classification" else "r2" cv_results = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring) except Exception as e: - 
kernel._send_message("stderr", f"Error during cross-validation: {e}") + msg = f"Error during cross-validation: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Fit try: model.fit(X_train, y_train) except Exception as e: - kernel._send_message("stderr", f"Error fitting model: {e}") + msg = f"Error fitting model: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Store only the trained model and minimal meta (no preds, no metrics, no joblib saving) @@ -281,13 +607,49 @@ def execute(self, kernel, data): pass except Exception as e: - kernel._send_message("stderr", f"Error storing model: {e}") + msg = f"Error storing model: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass return # Output concise summary out_lines = [f"Model '{model_name_arg}' trained and saved to data['{model_store_name}']. problem={problem}. 
train_rows={len(X_train)}"] if cv_results is not None: out_lines.append(f"cross-val (cv={cv}) scores: mean={float(np.mean(cv_results)):.4f}, std={float(np.std(cv_results)):.4f}") - kernel._send_message("stdout", "\n".join(out_lines)) + summary_msg = "\n".join(out_lines) + kernel._send_message("stdout", summary_msg) + + # Insert success metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "\n".join(features) + message_str = summary_msg + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status="success", + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass return From 201eec47f064f576cea6fae1fae311bddfd1bfad Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Thu, 30 Oct 2025 12:59:28 +0000 Subject: [PATCH 33/38] Logging completed --- Untitled.ipynb | 122 +++---- .../maria_ingest.py | 322 +++++++++++++++++- .../maria_rag_query.py | 297 +++++++++++++++- .../maria_search.py | 300 +++++++++++++++- .../maria_magics/supported_magics.py | 6 +- 5 files changed, 950 insertions(+), 97 deletions(-) rename mariadb_kernel/maria_magics/{ml_commands/model_training => rag_commands}/maria_ingest.py (64%) rename mariadb_kernel/maria_magics/{ml_commands/model_training => rag_commands}/maria_rag_query.py (63%) rename mariadb_kernel/maria_magics/{ml_commands/model_training => rag_commands}/maria_search.py (62%) diff --git a/Untitled.ipynb b/Untitled.ipynb index 022dc8b..4739c9a 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -12786,9 +12786,9 @@ "
\n", "

Regression metrics

\n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", "
RMSE0.1362
MAE0.1108
0.9858
RMSE0.1237
MAE0.0737
0.9938
\n", "
\n", " \n", @@ -12811,51 +12811,35 @@ " \n", " \n", " 3\n", - " 2.96\n", - " \n", - " \n", - " 2\n", - " 1.84\n", + " 2.88\n", " \n", " \n", " 3\n", - " 3.00\n", - " \n", - " \n", - " 1\n", - " 1.23\n", + " 2.96\n", " \n", " \n", " 2\n", - " 2.10\n", + " 2.03\n", " \n", " \n", - " 0\n", - " 0.16\n", + " 3\n", + " 2.96\n", " \n", " \n", - " 0\n", - " -0.03\n", + " -1\n", + " -0.68\n", " \n", " \n", - " 0\n", - " 0.16\n", + " 3\n", + " 2.96\n", " \n", " \n", " 0\n", - " -0.03\n", - " \n", - " \n", - " 2\n", - " 1.84\n", - " \n", - " \n", - " 1\n", - " 1.23\n", + " 0.00\n", " \n", " \n", " 0\n", - " -0.03\n", + " 0.00\n", " \n", " \n", "" @@ -12897,7 +12881,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Loaded model from ./models/test_model.joblib → data['restored_model']\n" + "Loaded model from ./models/test_model.joblib → data['restored_model'] (features[2], target=department_lbl)\n" ] } ], @@ -12912,11 +12896,37 @@ "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "Model meta missing 'features'. Using numeric columns only if applicable.\n", - "Cannot use inline values: model has no stored feature names.\n" + "Using inline feature values for prediction: {'age': 38, 'salary': 80000.0}\n" + ] + }, + { + "data": { + "text/html": [ + "

Predictions (last_preds)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
prediction
2.96
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Predictions stored in data['last_preds'] with shape=(1, 1)\n" ] } ], @@ -17625,7 +17635,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "400d0901-9fc5-4bb4-bfa9-1912917b4450", "metadata": {}, "outputs": [ @@ -17641,9 +17651,21 @@ " documents=1\n", " chunks_total=40\n", " embeddings_written=40\n", + " native_attempts=40 native_successes=0 native_failures=40\n", + " fallback_successes=40 fallback_failures=0\n", " Server version: 11.8.3-MariaDB-ubu2404\n", "\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Warnings/notes:\n", + "\n", + " - python-docx failed to extract docx: Package not found at '/home/iddhartha/mariadb_kernel/test.docx'\n", + "\n" + ] } ], "source": [ @@ -17909,28 +17931,14 @@ { "data": { "text/html": [ - "
idcommand_nameargumentsexecution_timestampaffected_columnsoperation_statusmessagedb_nameuser_namerollback_tokenbackup_tableoriginal_table
1encodemethod=label drop_original=true mode=apply2025-10-30 11:36:29name\n", - "department\n", - "gender\n", - "education_levelsuccessMethod: label\n", - "Created columns:\n", - "name_lbl\n", - "department_lbl\n", - "gender_lbl\n", - "education_level_lbl\n", - "\n", - "Details:\n", - "Column 'name': label-encoded -> name_lbl (unique_values=10)\n", - "Column 'department': label-encoded -> department_lbl (unique_values=4)\n", - "Column 'gender': label-encoded -> gender_lbl (unique_values=2)\n", - "Column 'education_level': label-encoded -> education_level_lbl (unique_values=4)testNULLNULLNULL
2splitdatastratify=department test_size=0.3 val_size=0.1 random_state=1232025-10-30 11:36:44errorStratify column 'department' not found in DataFrame.testNULLNULLNULL
3splitdatatest_size=0.3 val_size=0.1 random_state=1232025-10-30 11:36:50ALL_COLUMNSsuccesstrain_name=last_select_train, test_name=last_select_test, val_name=last_select_val\n", - "train_count=23, test_count=12, val_count=5\n", - "test_frac=0.3, val_frac=0.1, shuffle=True, random_state=123testNULLNULLNULL
4train_modelmodel=catboost target=department_lbl features=age,salary model_params='{\"iterations\":50}'2025-10-30 11:37:25age\n", - "salarysuccessModel 'catboost' trained and saved to data['last_model']. problem=regression. train_rows=23testNULLNULLNULL
5evaluate_model2025-10-30 11:37:32age\n", - "salaryerrorError computing classification metrics: Classification metrics can't handle a mix of multiclass and continuous targetstestNULLNULLNULL
6train_modeltarget=department_lbl features=age,salary model=random_forest n_estimators=50 max_depth=42025-10-30 11:37:58age\n", - "salarysuccessModel 'random_forest' trained and saved to data['last_model']. problem=regression. train_rows=23testNULLNULLNULL
7evaluate_model2025-10-30 11:38:02age\n", - "salarysuccessEvaluation success. Model='last_model', test='last_select_test', preds_saved='last_preds'. rmse=0.0993, mae=0.0792, r2=0.9924testNULLNULLNULL
8save_modelmodel_name_in_data=last_model save_path=./models/test_model.joblib overwrite=True2025-10-30 11:38:23last_modelsuccessSaved model to ./models/test_model.joblibtestNULLNULLNULL
9load_modelload_path=./models/test_model.joblib target_key=restored_model2025-10-30 11:38:25restored_modelsuccessLoaded model from ./models/test_model.joblib → data['restored_model']testNULLNULLNULL
10predict_modelmodel_name=restored_model data_name=[38,\"80000.0\"] output_name=last_preds2025-10-30 11:38:43errorCannot use inline values: model has no stored feature names.testNULLNULLNULL
11predict_modelmodel_name=last_model data_name=[38,\"80000.0\"] output_name=last_preds2025-10-30 11:39:07age\n", - "salarysuccessPrediction success. model=last_model, data_arg=[38, 80000.0], output=last_preds, shape=(1, 1) inline_values=[38, 80000.0]testNULLNULLNULL
" + "
idcommand_nameargumentsexecution_timestampaffected_columnsoperation_statusmessagedb_nameuser_namerollback_tokenbackup_tableoriginal_table
1maria_ingestdoc_id=search_test_doc title=\"Hybrid Search Test\" text_file=\"./test.docx\"2025-10-30 12:51:35documents,chunks,embeddingssuccessIngest complete.\n", + " documents=1\n", + " chunks_total=40\n", + " embeddings_written=40\n", + " native_attempts=40 native_successes=0 native_failures=40\n", + " fallback_successes=40 fallback_failures=0\n", + " Server version: 11.8.3-MariaDB-ubu2404\n", + "testNULLNULLNULL
" ] }, "metadata": {}, diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/maria_ingest.py b/mariadb_kernel/maria_magics/rag_commands/maria_ingest.py similarity index 64% rename from mariadb_kernel/maria_magics/ml_commands/model_training/maria_ingest.py rename to mariadb_kernel/maria_magics/rag_commands/maria_ingest.py index a3ba58c..f94c11f 100644 --- a/mariadb_kernel/maria_magics/ml_commands/model_training/maria_ingest.py +++ b/mariadb_kernel/maria_magics/rag_commands/maria_ingest.py @@ -39,12 +39,22 @@ except Exception: get_ipython = None +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + class MariaIngest(MariaMagic): """ Ingest text documents into MariaDB, chunk them, and store embeddings. This variant reduces noisy logging and prints only important status/warnings. + + Added: metadata logging into magic_metadata table similar to TrainModel: + - Ensures magic_metadata exists in current database + - Inserts error/success rows for operations """ def __init__(self, args=""): self.args = args @@ -106,6 +116,7 @@ def parse_args(self, input_obj): pairs[k] = self._str_to_obj(v) return pairs + # NOTE: keep existing _sql_escape for SQL literals used in queries (returns unquoted for non-strings) def _sql_escape(self, s): if s is None: return "NULL" @@ -113,6 +124,157 @@ def _sql_escape(self, s): return str(s) return "'" + s.replace("'", "''") + "'" + # Metadata helpers (copied/adapted from TrainModel to provide consistent metadata logging) + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape_meta(self, val): + """Escape a value for SQL single-quoted literal insert. 
None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas if available + try: + import pandas as _pd # local import to avoid global dependency + dfs = _pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and _pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = 
self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape_meta(arguments) + affected_sql = self._sql_escape_meta(affected_columns) + status_sql = self._sql_escape_meta(operation_status) + message_sql = self._sql_escape_meta(message) + db_sql = self._sql_escape_meta(db_name) + user_sql = self._sql_escape_meta(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape_meta(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # ---- 
end metadata helpers ---- + def _simple_chunk(self, text: str, chunk_size: int, overlap: int): if not text: return [] @@ -308,6 +470,15 @@ def execute(self, kernel, data): args = self.parse_args(self.args) except Exception as e: kernel._send_message("stderr", f"Error parsing arguments: {e}\n") + # attempt to write metadata about parse error if possible + try: + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata(kernel, self.name(), self.args, "", "error", + f"Error parsing arguments: {e}", db_name, user_name) + except Exception: + pass return # text arg or file arg preference @@ -384,13 +555,25 @@ def execute(self, kernel, data): docs_to_ingest = [d for d in docs_to_ingest if (d.get("content") or "").strip()] if not docs_to_ingest: - kernel._send_message("stderr", "No non-empty documents to ingest; aborting.\n") + msg = "No non-empty documents to ingest; aborting." + kernel._send_message("stderr", msg + "\n") + # write metadata error (best-effort) + try: + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, db_name, user_name) + except Exception: + pass return # get mariadb client mariadb_client = getattr(kernel, "mariadb_client", None) if mariadb_client is None: - kernel._send_message("stderr", "No mariadb_client available on kernel (can't run ingestion).\n") + msg = "No mariadb_client available on kernel (can't run ingestion)." 
+ kernel._send_message("stderr", msg + "\n") + # cannot write metadata without mariadb_client, so just return return # determine db @@ -399,13 +582,41 @@ def execute(self, kernel, data): dbname = self._parse_single_result(db_name_html) or "" kernel._send_message("stdout", f"Using database: {dbname}\n") except Exception as e: - kernel._send_message("stderr", f"Failed to query current database: {e}\n") + msg = f"Failed to query current database: {e}" + kernel._send_message("stderr", msg + "\n") + # try to write metadata + try: + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, "") + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, "", user_name) + except Exception: + pass return if not dbname: - kernel._send_message("stderr", "No current database selected (use `USE ` before running the magic).\n") + msg = "No current database selected (use `USE ` before running the magic)." + kernel._send_message("stderr", msg + "\n") + try: + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, dbname, user_name) + except Exception: + pass return + # Ensure metadata table exists for this database + try: + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + except Exception: + # non-fatal, continue; metadata will be best-effort later + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).\n") + except Exception: + pass + # create tables: documents, chunks; embeddings handled carefully try: mariadb_client.run_statement( @@ -452,7 +663,13 @@ def execute(self, kernel, data): # tolerate create failure and continue (we'll detect existing table schema) pass except Exception as e: - kernel._send_message("stderr", f"DDL failed: {e}\n") + msg = f"DDL failed: 
{e}" + kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, dbname, user_name) + except Exception: + pass return # detect existing VECTOR dimension (if any) @@ -508,6 +725,12 @@ def execute(self, kernel, data): ) except Exception as e: user_warnings.append(f"Failed to insert document {d_doc_id}: {e}") + # Log per-document failure into metadata (best-effort) + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"doc_id={d_doc_id}", "error", f"Failed to insert document {d_doc_id}: {e}", dbname, user_name) + except Exception: + pass continue # chunk @@ -546,6 +769,13 @@ def execute(self, kernel, data): inserted_chunk_ids.append((idx, None)) except Exception as e: user_warnings.append(f"Failed to insert chunk {idx} for {d_doc_id}: {e}") + # log per-chunk failure + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"doc_id={d_doc_id},chunk_index={idx}", "error", + f"Failed to insert chunk {idx} for {d_doc_id}: {e}", dbname, user_name) + except Exception: + pass inserted_chunk_ids.append((idx, None)) continue @@ -559,6 +789,13 @@ def execute(self, kernel, data): for (i, chunk_db_id), vec in zip(inserted_chunk_ids, embs_norm): if chunk_db_id is None: user_warnings.append(f"No chunk id for doc {d_doc_id} chunk {i}; embedding skipped.") + # log skipped embedding + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"doc_id={d_doc_id},chunk_index={i}", "error", + "No chunk id for embedding; skipped", dbname, user_name) + except Exception: + pass continue vec_list = [float(v) for v in vec.tolist()] @@ -586,11 +823,27 @@ def execute(self, kernel, data): total_emb_rows += 1 else: fallback_failures += 1 + # log failure + self._insert_metadata(kernel, self.name(), self.args 
if isinstance(self.args, str) else str(self.args), + f"chunk_id={chunk_db_id}", "error", + "Fallback JSON embedding write returned zero rows", dbname, user_name) except Exception: fallback_failures += 1 + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"chunk_id={chunk_db_id}", "error", + "Fallback JSON embedding verification failed", dbname, user_name) + except Exception: + pass except Exception as e_json: user_warnings.append(f"Fallback embedding storage failed for chunk_id={chunk_db_id}: {e_json}") fallback_failures += 1 + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"chunk_id={chunk_db_id}", "error", + f"Fallback embedding storage failed: {e_json}", dbname, user_name) + except Exception: + pass continue # Attempt native VECTOR insert @@ -630,6 +883,19 @@ def execute(self, kernel, data): except Exception as e_json: user_warnings.append(f"Fallback embedding storage failed for chunk_id={chunk_db_id}: {e_json}") fallback_failures += 1 + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"chunk_id={chunk_db_id}", "error", + f"Fallback storage after native failure also failed: {e_json}", dbname, user_name) + except Exception: + pass + # log native failure + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"chunk_id={chunk_db_id}", "error", + f"Native vector insert failed: {e_native}", dbname, user_name) + except Exception: + pass continue # Verify native insert succeeded by COUNT(*) @@ -663,13 +929,28 @@ def execute(self, kernel, data): total_emb_rows += 1 else: fallback_failures += 1 + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"chunk_id={chunk_db_id}", "error", + "Fallback JSON write after native verify returned zero rows", dbname, user_name) 
except Exception: fallback_failures += 1 except Exception as e_json: user_warnings.append(f"Fallback embedding storage failed for chunk_id={chunk_db_id}: {e_json}") fallback_failures += 1 + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"chunk_id={chunk_db_id}", "error", + f"Fallback JSON storage after native verify failed: {e_json}", dbname, user_name) + except Exception: + pass except Exception as e_verify: user_warnings.append(f"Verify select for embeddings failed: {e_verify}") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"chunk_id={chunk_db_id}", "error", + f"Verify select for embeddings failed: {e_verify}", dbname, user_name) + except Exception: + pass # Final diagnostics: counts & version try: @@ -689,17 +970,36 @@ def execute(self, kernel, data): version_val = "" # concise output - kernel._send_message("stdout", ( + summary_msg = ( "Ingest complete.\n" f" documents={len(docs_to_ingest)}\n" f" chunks_total={total_chunks}\n" f" embeddings_written={total_emb_rows}\n" f" Server version: {version_val}\n" - )) + ) + kernel._send_message("stdout", summary_msg) - # if user_warnings: - # kernel._send_message("stderr", "Warnings/notes:\n") - # for w in user_warnings: - # kernel._send_message("stderr", f" - {w}\n") + # write success metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "documents,chunks,embeddings" + self._insert_metadata(kernel, self.name(), args_for_db, affected_columns_str, + "success", summary_msg, dbname, user_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).\n") + except Exception: + pass + + # optionally show warnings + if user_warnings: + try: + for w in user_warnings: + try: + pass + except Exception: + pass + except Exception: + pass return diff --git 
a/mariadb_kernel/maria_magics/ml_commands/model_training/maria_rag_query.py b/mariadb_kernel/maria_magics/rag_commands/maria_rag_query.py similarity index 63% rename from mariadb_kernel/maria_magics/ml_commands/model_training/maria_rag_query.py rename to mariadb_kernel/maria_magics/rag_commands/maria_rag_query.py index 83197e9..1d19c20 100644 --- a/mariadb_kernel/maria_magics/ml_commands/model_training/maria_rag_query.py +++ b/mariadb_kernel/maria_magics/rag_commands/maria_rag_query.py @@ -48,6 +48,13 @@ except Exception: _GENAI_AVAILABLE = False +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + + class MariaRAGQuery(MariaMagic): def __init__(self, args=""): self.args = args @@ -125,6 +132,155 @@ def _sql_escape(self, s): return str(s) return "'" + s.replace("'", "''") + "'" + # ---------------- Metadata helpers (added) ---------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape_meta(self, val): + """Escape a value for SQL single-quoted literal insert. 
None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas if available + try: + import pandas as _pd # local import to avoid global dependency + dfs = _pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and _pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = 
self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape_meta(arguments) + affected_sql = self._sql_escape_meta(affected_columns) + status_sql = self._sql_escape_meta(operation_status) + message_sql = self._sql_escape_meta(message) + db_sql = self._sql_escape_meta(db_name) + user_sql = self._sql_escape_meta(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape_meta(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + # 
---------------- Embedding utilities ---------------- def _embed_texts(self, texts, dim=None): """Return normalized numpy embeddings for texts. If sentence-transformers available use it.""" @@ -356,7 +512,8 @@ def _call_gemini(self, system_prompt, user_prompt, model_name=None, max_output_t self.log.debug("google.genai not available in environment.") return None, None - api_key = "AIzaSyBW1n6kIu0o-W3l0-pMBOMc4nzjYfsbETg" + # NOTE: in the snippet provided earlier an API key was hardcoded; here we'll check env vars. + api_key = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GENAI_API_KEY") or "" if not api_key: self.log.debug("No GENAI API key found in GOOGLE_API_KEY or GENAI_API_KEY.") return None, None @@ -453,6 +610,15 @@ def execute(self, kernel, data): args = self.parse_args(self.args) except Exception as e: kernel._send_message("stderr", f"Error parsing arguments: {e}\n") + # best-effort metadata log + try: + dbname = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", f"Error parsing arguments: {e}", dbname, user_name) + except Exception: + pass args = {} query = None @@ -462,7 +628,16 @@ def execute(self, kernel, data): if isinstance(data, str) and data.strip(): query = data.strip() if not query: - kernel._send_message("stderr", "No query supplied. Usage: %maria_rag_query query=\"...\"\n") + msg = "No query supplied. 
Usage: %maria_rag_query query=\"...\"" + kernel._send_message("stderr", msg + "\n") + try: + dbname = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, dbname, user_name) + except Exception: + pass return explain = False @@ -471,9 +646,20 @@ def execute(self, kernel, data): explain = True kernel._send_message("stdout", f"[debug] RAG query received (len={len(query)}): {query}\n") + mariadb_client = getattr(kernel, "mariadb_client", None) if mariadb_client is None: - kernel._send_message("stderr", "No mariadb_client available on kernel (can't run retrieval).\n") + msg = "No mariadb_client available on kernel (can't run retrieval)." + kernel._send_message("stderr", msg + "\n") + # metadata best-effort: cannot insert without client, but attempt helper which will no-op + try: + dbname = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, dbname, user_name) + except Exception: + pass return # determine DB @@ -489,33 +675,95 @@ def execute(self, kernel, data): if m: dbname = m.group(1).strip() except Exception as e: - kernel._send_message("stderr", f"Failed to detect current DB: {e}\n") + msg = f"Failed to detect current DB: {e}" + kernel._send_message("stderr", msg + "\n") + try: + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, "") + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, "", user_name) + except Exception: + pass return if not dbname: - kernel._send_message("stderr", "No current database selected (use `USE ` before running the magic).\n") + msg = "No current database selected (use `USE 
` before running the magic)." + kernel._send_message("stderr", msg + "\n") + try: + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, dbname, user_name) + except Exception: + pass return + # Ensure metadata table exists for this database (best-effort) + try: + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).\n") + except Exception: + pass + # RETRIEVAL: BM25 prefilter (hybrid) candidates = [] - if self.RETRIEVER == "hybrid": - candidates = self._bm25_prefilter(kernel, dbname, query) + try: + if self.RETRIEVER == "hybrid": + candidates = self._bm25_prefilter(kernel, dbname, query) + except Exception: + candidates = [] if not candidates: candidates = self._sample_candidates(kernel, dbname) if not candidates: - kernel._send_message("stderr", "No candidate chunks found (chunks table empty?).\n") + msg = "No candidate chunks found (chunks table empty?)." + kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "chunks", "error", msg, dbname, user_name) + except Exception: + pass return candidate_ids = [c["chunk_id"] for c in candidates if c.get("chunk_id") is not None] emb_map = self._fetch_embeddings_for_candidates(kernel, dbname, candidate_ids) if not emb_map: - kernel._send_message("stderr", "No embeddings found for any candidate chunks.\n") + msg = "No embeddings found for any candidate chunks." 
+ kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "embeddings", "error", msg, dbname, user_name) + except Exception: + pass return # compute query embedding consistent with vector dim - first_vec = next(iter(emb_map.values()))["vec"] - vec_dim = first_vec.shape[0] - q_emb = self._embed_texts([query], dim=vec_dim)[0] + try: + first_vec = next(iter(emb_map.values()))["vec"] + vec_dim = first_vec.shape[0] + except Exception: + msg = "Failed to determine embedding dimensionality." + kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "embeddings", "error", msg, dbname, user_name) + except Exception: + pass + return + + try: + q_emb = self._embed_texts([query], dim=vec_dim)[0] + except Exception as e: + msg = f"Failed to compute query embedding: {e}" + kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "embeddings", "error", msg, dbname, user_name) + except Exception: + pass + return # combine bm25 + vector scored = [] @@ -541,7 +789,13 @@ def execute(self, kernel, data): }) if not scored: - kernel._send_message("stderr", "No scored candidates after combining BM25/vector.\n") + msg = "No scored candidates after combining BM25/vector." 
+ kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "search", "error", msg, dbname, user_name) + except Exception: + pass return # top-K @@ -579,11 +833,13 @@ def execute(self, kernel, data): llm_raw_resp = gemini_raw chain_debug = None + used_llm = False if not llm_answer: ans, evidence, debug = self._fusion_chain_local(query, context_blocks) chain_debug = debug - # **DO NOT** append sources to the answer (per request) llm_answer = ans + else: + used_llm = True # Output answer only (no sources printed) kernel._send_message("stdout", "\n=== ANSWER ===\n") @@ -602,4 +858,17 @@ def execute(self, kernel, data): kernel._send_message("stdout", "\n=== GEMINI RAW RESP (truncated) ===\n") kernel._send_message("stdout", str(llm_raw_resp)[:2000] + "\n") + # write success metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "chunks,embeddings,llm" if used_llm else "chunks,embeddings,local_chain" + msg = f"Returned {len(topk)} results for query. 
used_llm={used_llm}" + self._insert_metadata(kernel, self.name(), args_for_db, affected_columns_str, + "success", msg, dbname, user_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).\n") + except Exception: + pass + return diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/maria_search.py b/mariadb_kernel/maria_magics/rag_commands/maria_search.py similarity index 62% rename from mariadb_kernel/maria_magics/ml_commands/model_training/maria_search.py rename to mariadb_kernel/maria_magics/rag_commands/maria_search.py index 8b6fc11..6d995b5 100644 --- a/mariadb_kernel/maria_magics/ml_commands/model_training/maria_search.py +++ b/mariadb_kernel/maria_magics/rag_commands/maria_search.py @@ -48,6 +48,12 @@ def type(self): return "Line" def name(self): return "maria_search" def help(self): return "Search (hybrid)." +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + class MariaSearch(MariaMagic): def __init__(self, args=""): @@ -123,6 +129,157 @@ def _sql_escape(self, s): return str(s) return "'" + s.replace("'", "''") + "'" + # ----- metadata helpers (same approach as TrainModel / MariaIngest) ----- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape_meta(self, val): + """Escape a value for SQL single-quoted literal insert. 
None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas if available + try: + import pandas as _pd # local import to avoid global dependency + dfs = _pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and _pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = 
self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape_meta(arguments) + affected_sql = self._sql_escape_meta(affected_columns) + status_sql = self._sql_escape_meta(operation_status) + message_sql = self._sql_escape_meta(message) + db_sql = self._sql_escape_meta(db_name) + user_sql = self._sql_escape_meta(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape_meta(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # 
----------------- end metadata helpers ----------------- + def _parse_html_table(self, html): """Return a list-of-dicts or pandas.DataFrame. Best-effort fallback if pandas missing.""" if html is None: @@ -234,6 +391,15 @@ def execute(self, kernel, data): args = self.parse_args(self.args) except Exception as e: kernel._send_message("stderr", f"Error parsing arguments: {e}\n") + # best-effort metadata logging + try: + dbname = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", f"Error parsing arguments: {e}", dbname, user_name) + except Exception: + pass args = {} query = None @@ -246,14 +412,33 @@ def execute(self, kernel, data): query = "testquery" query = str(query).strip() if not query: - kernel._send_message("stderr", "Empty query; nothing to search.\n") + msg = "Empty query; nothing to search." + kernel._send_message("stderr", msg + "\n") + try: + dbname = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, dbname, user_name) + except Exception: + pass return kernel._send_message("stdout", f"[debug] running hybrid search for query (len={len(query)}): {query}\n") mariadb_client = getattr(kernel, "mariadb_client", None) if mariadb_client is None: - kernel._send_message("stderr", "No mariadb_client available on kernel (can't run search).\n") + msg = "No mariadb_client available on kernel (can't run search)." 
+ kernel._send_message("stderr", msg + "\n") + # Can't insert metadata without mariadb_client, but attempt (internal check will no-op) + try: + dbname = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, dbname, user_name) + except Exception: + pass return # determine DB @@ -279,13 +464,39 @@ def execute(self, kernel, data): else: dbname = "" except Exception as e: - kernel._send_message("stderr", f"Failed to query current database: {e}\n") + msg = f"Failed to query current database: {e}" + kernel._send_message("stderr", msg + "\n") + try: + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, "") + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, "", user_name) + except Exception: + pass return if not dbname: - kernel._send_message("stderr", "No current database selected (use `USE ` before running the magic).\n") + msg = "No current database selected (use `USE ` before running the magic)." 
+ kernel._send_message("stderr", msg + "\n") + try: + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, dbname, user_name) + except Exception: + pass return + # Ensure metadata table exists for this database + try: + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).\n") + except Exception: + pass + # --- BM25 prefilter if requested --- candidates = [] try: @@ -328,6 +539,12 @@ def execute(self, kernel, data): }) except Exception as e: kernel._send_message("stderr", f"BM25 prefilter failed: {e}\n") + # Log BM25 prefilter warning as metadata (non-fatal) + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "chunks", "warning", f"BM25 prefilter failed: {e}", dbname, user_name) + except Exception: + pass # if no candidates from BM25, fallback to sample if not candidates: @@ -364,14 +581,31 @@ def execute(self, kernel, data): }) except Exception as e: kernel._send_message("stderr", f"Candidate sampling failed: {e}\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "chunks", "warning", f"Candidate sampling failed: {e}", dbname, user_name) + except Exception: + pass if not candidates: - kernel._send_message("stderr", "No candidate chunks found (empty chunks table?).\n") + msg = "No candidate chunks found (empty chunks table?)." 
+ kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "chunks", "error", msg, dbname, user_name) + except Exception: + pass return candidate_ids = [int(c["chunk_id"]) for c in candidates if c.get("chunk_id") is not None] if not candidate_ids: - kernel._send_message("stderr", "No valid candidate chunk ids.\n") + msg = "No valid candidate chunk ids." + kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "chunks", "error", msg, dbname, user_name) + except Exception: + pass return # --- fetch embeddings: try native embeddings table first --- @@ -493,18 +727,35 @@ def execute(self, kernel, data): "doc_id": r.get("doc_id") or "", "chunk_meta": r.get("chunk_meta") or "" } - except Exception: - pass + except Exception as e: + kernel._send_message("stderr", f"Embeddings JSON fallback failed: {e}\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "embeddings", "warning", f"Embeddings JSON fallback failed: {e}", dbname, user_name) + except Exception: + pass if not emb_map: - kernel._send_message("stderr", "No embeddings found for candidate chunks (neither native nor JSON fallback).\n") + msg = "No embeddings found for candidate chunks (neither native nor JSON fallback)." + kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "embeddings", "error", msg, dbname, user_name) + except Exception: + pass return # compute query embedding (dim inferred from first vector) try: vec_dim = next(iter(emb_map.values()))["vec"].shape[0] except Exception: - kernel._send_message("stderr", "Failed to determine embedding dimensionality.\n") + msg = "Failed to determine embedding dimensionality." 
+ kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "embeddings", "error", msg, dbname, user_name) + except Exception: + pass return try: @@ -513,7 +764,13 @@ def execute(self, kernel, data): if q_norm == 0: q_norm = 1.0 q_emb = q_emb.astype(np.float32) / q_norm except Exception as e: - kernel._send_message("stderr", f"Failed to compute query embedding: {e}\n") + msg = f"Failed to compute query embedding: {e}" + kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "embeddings", "error", msg, dbname, user_name) + except Exception: + pass return # combine scores and rank @@ -540,7 +797,13 @@ def execute(self, kernel, data): }) if not results: - kernel._send_message("stderr", "No scored results to return after filtering.\n") + msg = "No scored results to return after filtering." + kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "search", "error", msg, dbname, user_name) + except Exception: + pass return results.sort(key=lambda r: r["score"], reverse=True) @@ -562,4 +825,17 @@ def execute(self, kernel, data): out = "\n".join(lines) + "\n" kernel._send_message("stdout", out) + + # write success metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "chunks,embeddings" + self._insert_metadata(kernel, self.name(), args_for_db, affected_columns_str, + "success", f"Returned {len(topk)} results for query.", dbname, user_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).\n") + except Exception: + pass + return diff --git a/mariadb_kernel/maria_magics/supported_magics.py b/mariadb_kernel/maria_magics/supported_magics.py 
index 5aab37b..e2ceb18 100644 --- a/mariadb_kernel/maria_magics/supported_magics.py +++ b/mariadb_kernel/maria_magics/supported_magics.py @@ -30,9 +30,9 @@ from mariadb_kernel.maria_magics.ml_commands.model_training.loadmodel import LoadModel from mariadb_kernel.maria_magics.ml_commands.model_training.predict import Predict from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.ml_pipeline import MLPipeline -from mariadb_kernel.maria_magics.ml_commands.model_training.maria_ingest import MariaIngest -from mariadb_kernel.maria_magics.ml_commands.model_training.maria_search import MariaSearch -from mariadb_kernel.maria_magics.ml_commands.model_training.maria_rag_query import MariaRAGQuery +from mariadb_kernel.maria_magics.rag_commands.maria_ingest import MariaIngest +from mariadb_kernel.maria_magics.rag_commands.maria_search import MariaSearch +from mariadb_kernel.maria_magics.rag_commands.maria_rag_query import MariaRAGQuery def get(): return { From 17d5f465fadc40cdaa539c94144aeaaa5ce072e2 Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Fri, 31 Oct 2025 14:54:12 +0000 Subject: [PATCH 34/38] clipoutliers & fillmissing completed --- Iris.csv | 151 + RawMLPipeline.ipynb | 2739 +++++++++++++ Untitled.ipynb | 3615 +++++++++++------ .../ml_commands/data_cleaning/clipoutliers.py | 29 +- .../ml_commands/data_cleaning/fillmissing.py | 23 + 5 files changed, 5352 insertions(+), 1205 deletions(-) create mode 100644 Iris.csv create mode 100644 RawMLPipeline.ipynb diff --git a/Iris.csv b/Iris.csv new file mode 100644 index 0000000..15e8a8f --- /dev/null +++ b/Iris.csv @@ -0,0 +1,151 @@ +Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species +1,5.1,3.5,1.4,0.2,Iris-setosa +2,4.9,3,1.4,0.2,Iris-setosa +3,4.7,3.2,1.3,0.2,Iris-setosa +4,4.6,3.1,1.5,0.2,Iris-setosa +5,5,3.6,1.4,0.2,Iris-setosa +6,5.4,3.9,1.7,0.4,Iris-setosa +7,4.6,NULL,1.4,0.3,Iris-setosa +8,5,3.4,1.5,0.2,Iris-setosa +9,4.4,2.9,1.4,0.2,Iris-setosa +10,4.9,3.1,1.5,0.1,Iris-setosa 
+11,5.4,3.7,1.5,0.2,Iris-setosa +12,4.8,3.4,1.6,0.2,Iris-setosa +13,4.8,3,1.4,0.1,Iris-setosa +14,4.3,3,1.1,0.1, +15,5.8,4,1.2,0.1,Iris-setosa +16,5.7,4.4,1.5,0.1,Iris-setosa +17,5.4,NULL,1.3,0.1,Iris-setosa +18,5.1,3.5,1.4,0.1,Iris-setosa +19,5.7,3.8,1.7,0.1,Iris-setosa +20,5.1,3.8,1.5,0.1,Iris-setosa +21,5.4,3.4,1.7,0.1,Iris-setosa +22,5.1,3.7,1.5,0.1,Iris-setosa +23,4.6,3.6,1,0.1,Iris-setosa +24,5.1,3.3,1.7,0.1,Iris-setosa +25,4.8,3.4,1.9,0.1,Iris-setosa +26,5,3,1.6,0.1,Iris-setosa +27,5,3.4,1.6,0.1,Iris-setosa +28,NULL,3.5,1.5,0.1,Iris-setosa +29,5.2,3.4,1.4,0.1,Iris-setosa +30,4.7,3.2,1.6,0.1,Iris-setosa +31,4.8,3.1,NULL,0.1,Iris-setosa +32,5.4,3.4,1.5,0.1,Iris-setosa +33,5.2,4.1,1.5,0.1,Iris-setosa +34,5.5,4.2,1.4,0.1,Iris-setosa +35,4.9,3.1,1.5,0.1,Iris-setosa +36,5,3.2,1.2,0.1,Iris-setosa +37,5.5,NULL,1.3,0.1,Iris-setosa +38,4.9,3.1,1.5,0.1,Iris-setosa +39,4.4,3,1.3,0.1,Iris-setosa +40,5.1,3.4,1.5,0.1,Iris-setosa +41,5,3.5,1.3,0.1,Iris-setosa +42,4.5,2.3,1.3,0.1,Iris-setosa +43,4.4,3.2,1.3,0.1,Iris-setosa +44,5,3.5,1.6,0.1,Iris-setosa +45,5.1,3.8,1.9,0.1,Iris-setosa +46,4.8,3,1.4,0.1,Iris-setosa +47,5.1,3.8,1.6,0.1,Iris-setosa +48,4.6,3.2,1.4,0.1,Iris-setosa +49,5.3,3.7,1.5,0.1,Iris-setosa +50,5,3.3,1.4,0.1,Iris-setosa +51,7,3.2,4.7,0.1,Iris-versicolor +52,6.4,3.2,4.5,0.1,Iris-versicolor +53,6.9,3.1,4.9,0.1,Iris-versicolor +54,5.5,2.3,4,0.1,Iris-versicolor +55,6.5,2.8,4.6,0.1,Iris-versicolor +56,5.7,2.8,4.5,0.1,Iris-versicolor +57,6.3,3.3,4.7,0.1,Iris-versicolor +58,4.9,2.4,3.3,0.1,Iris-versicolor +59,6.6,2.9,4.6,0.1,Iris-versicolor +60,5.2,2.7,3.9,0.1,Iris-versicolor +61,5,2,3.5,0.1,Iris-versicolor +62,5.9,3,4.2,0.1,Iris-versicolor +63,6,2.2,4,0.1,Iris-versicolor +64,6.1,2.9,4.7,0.1,Iris-versicolor +65,5.6,2.9,3.6,0.1,Iris-versicolor +66,6.7,3.1,4.4,0.1,Iris-versicolor +67,5.6,3,4.5,0.1,Iris-versicolor +68,5.8,2.7,4.1,0.1,Iris-versicolor +69,6.2,2.2,4.5,0.1,Iris-versicolor +70,5.6,2.5,3.9,0.1,Iris-versicolor +71,5.9,3.2,4.8,0.1,Iris-versicolor 
+72,6.1,2.8,4,0.1,Iris-versicolor +73,6.3,2.5,4.9,0.1,Iris-versicolor +74,6.1,2.8,4.7,0.1,Iris-versicolor +75,6.4,2.9,4.3,0.1,Iris-versicolor +76,6.6,3,4.4,0.1,Iris-versicolor +77,6.8,2.8,4.8,0.1,Iris-versicolor +78,6.7,3,5,0.1,Iris-versicolor +79,6,2.9,4.5,0.1,Iris-versicolor +80,5.7,2.6,3.5,0.1,Iris-versicolor +81,5.5,2.4,3.8,0.1,Iris-versicolor +82,5.5,2.4,3.7,0.1,Iris-versicolor +83,5.8,2.7,3.9,0.1,Iris-versicolor +84,6,2.7,5.1,0.1,Iris-versicolor +85,5.4,3,4.5,0.1,Iris-versicolor +86,6,3.4,4.5,0.1,Iris-versicolor +87,6.7,3.1,4.7,0.1,Iris-versicolor +88,6.3,2.3,4.4,0.1,Iris-versicolor +89,5.6,3,4.1,0.1,Iris-versicolor +90,5.5,2.5,4,0.1,Iris-versicolor +91,5.5,2.6,4.4,0.1,Iris-versicolor +92,6.1,3,4.6,0.1,Iris-versicolor +93,5.8,2.6,4,0.1,Iris-versicolor +94,5,2.3,3.3,0.1,Iris-versicolor +95,5.6,2.7,4.2,0.1,Iris-versicolor +96,5.7,3,4.2,0.1,Iris-versicolor +97,5.7,2.9,4.2,0.1,Iris-versicolor +98,6.2,2.9,4.3,0.1,Iris-versicolor +99,5.1,2.5,3,0.1,Iris-versicolor +100,5.7,2.8,4.1,0.1,Iris-versicolor +101,6.3,3.3,6,0.1,Iris-virginica +102,5.8,2.7,5.1,0.1,Iris-virginica +103,7.1,3,5.9,0.1,Iris-virginica +104,6.3,2.9,5.6,0.1,Iris-virginica +105,6.5,3,5.8,0.1,Iris-virginica +106,7.6,3,6.6,0.1,Iris-virginica +107,4.9,2.5,4.5,0.1,Iris-virginica +108,7.3,2.9,6.3,0.1,Iris-virginica +109,6.7,2.5,5.8,0.1,Iris-virginica +110,7.2,3.6,6.1,0.1,Iris-virginica +111,6.5,3.2,5.1,0.1,Iris-virginica +112,6.4,2.7,5.3,0.1,Iris-virginica +113,6.8,3,5.5,0.1,Iris-virginica +114,5.7,2.5,5,0.1,Iris-virginica +115,5.8,2.8,5.1,0.1,Iris-virginica +116,6.4,3.2,5.3,0.1,Iris-virginica +117,6.5,3,5.5,0.1,Iris-virginica +118,7.7,3.8,6.7,0.1,Iris-virginica +119,7.7,2.6,6.9,0.1,Iris-virginica +120,6,2.2,5,0.1,Iris-virginica +121,6.9,3.2,5.7,0.1,Iris-virginica +122,5.6,2.8,4.9,0.1,Iris-virginica +123,7.7,2.8,6.7,0.1,Iris-virginica +124,6.3,2.7,4.9,0.1,Iris-virginica +125,6.7,3.3,5.7,0.1,Iris-virginica +126,7.2,3.2,6,0.1,Iris-virginica +127,6.2,2.8,4.8,0.1,Iris-virginica 
+128,6.1,3,4.9,0.1,Iris-virginica +129,6.4,2.8,5.6,0.1,Iris-virginica +130,7.2,3,5.8,0.1,Iris-virginica +131,7.4,2.8,6.1,0.1,Iris-virginica +132,7.9,3.8,6.4,0.1,Iris-virginica +133,6.4,2.8,5.6,0.1,Iris-virginica +134,6.3,2.8,5.1,0.1,Iris-virginica +135,6.1,2.6,5.6,0.1,Iris-virginica +136,7.7,3,6.1,0.1,Iris-virginica +137,6.3,3.4,5.6,0.1,Iris-virginica +138,6.4,3.1,5.5,0.1,Iris-virginica +139,6,3,4.8,0.1,Iris-virginica +140,6.9,3.1,5.4,0.1,Iris-virginica +141,6.7,3.1,5.6,0.1,Iris-virginica +142,6.9,3.1,5.1,0.1,Iris-virginica +143,5.8,2.7,5.1,0.1,Iris-virginica +144,6.8,3.2,5.9,0.1,Iris-virginica +145,6.7,3.3,5.7,0.1,Iris-virginica +146,6.7,3,5.2,0.1,Iris-virginica +147,6.3,2.5,5,0.1,Iris-virginica +148,6.5,3,5.2,0.1,Iris-virginica +149,6.2,3.4,5.4,0.1, +150,NULL,3,5.1,0.1,Iris-virginica diff --git a/RawMLPipeline.ipynb b/RawMLPipeline.ipynb new file mode 100644 index 0000000..076f7ba --- /dev/null +++ b/RawMLPipeline.ipynb @@ -0,0 +1,2739 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "e5af9aaa-128d-46cc-a84f-9000580b203b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Database
information_schema
mysql
performance_schema
sys
test
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "SHOW DATABASES;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0308f6d-a504-45b4-8f5b-a67181615fff", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "CREATE DATABASE BUGBREW;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e276e32-8767-452f-bbbb-0ddbbafc7fb0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "USE BUGBREW;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "148a24eb-b93b-43dd-9d73-c9414f5d4807", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
15.13.51.40.2Iris-setosa\n", + "
24.931.40.2Iris-setosa\n", + "
34.73.21.30.2Iris-setosa\n", + "
44.63.11.50.2Iris-setosa\n", + "
553.61.40.2Iris-setosa\n", + "
65.43.91.70.4Iris-setosa\n", + "
74.6NULL1.40.3Iris-setosa\n", + "
853.41.50.2Iris-setosa\n", + "
94.42.91.40.2Iris-setosa\n", + "
104.93.11.50.1Iris-setosa\n", + "
115.43.71.50.2Iris-setosa\n", + "
124.83.41.60.2Iris-setosa\n", + "
134.831.40.1Iris-setosa\n", + "
144.331.10.1\n", + "
155.841.20.1Iris-setosa\n", + "
165.74.41.50.1Iris-setosa\n", + "
175.4NULL1.30.1Iris-setosa\n", + "
185.13.51.40.1Iris-setosa\n", + "
195.73.81.70.1Iris-setosa\n", + "
205.13.81.50.1Iris-setosa\n", + "
215.43.41.70.1Iris-setosa\n", + "
225.13.71.50.1Iris-setosa\n", + "
234.63.610.1Iris-setosa\n", + "
245.13.31.70.1Iris-setosa\n", + "
254.83.41.90.1Iris-setosa\n", + "
26531.60.1Iris-setosa\n", + "
2753.41.60.1Iris-setosa\n", + "
28NULL3.51.50.1Iris-setosa\n", + "
295.23.41.40.1Iris-setosa\n", + "
304.73.21.60.1Iris-setosa\n", + "
314.83.1NULL0.1Iris-setosa\n", + "
325.43.41.50.1Iris-setosa\n", + "
335.24.11.50.1Iris-setosa\n", + "
345.54.21.40.1Iris-setosa\n", + "
354.93.11.50.1Iris-setosa\n", + "
3653.21.20.1Iris-setosa\n", + "
375.5NULL1.30.1Iris-setosa\n", + "
384.93.11.50.1Iris-setosa\n", + "
394.431.30.1Iris-setosa\n", + "
405.13.41.50.1Iris-setosa\n", + "
4153.51.30.1Iris-setosa\n", + "
424.52.31.30.1Iris-setosa\n", + "
434.43.21.30.1Iris-setosa\n", + "
4453.51.60.1Iris-setosa\n", + "
455.13.81.90.1Iris-setosa\n", + "
464.831.40.1Iris-setosa\n", + "
475.13.81.60.1Iris-setosa\n", + "
484.63.21.40.1Iris-setosa\n", + "
495.33.71.50.1Iris-setosa\n", + "
5053.31.40.1Iris-setosa\n", + "
5173.24.70.1Iris-versicolor\n", + "
526.43.24.50.1Iris-versicolor\n", + "
536.93.14.90.1Iris-versicolor\n", + "
545.52.340.1Iris-versicolor\n", + "
556.52.84.60.1Iris-versicolor\n", + "
565.72.84.50.1Iris-versicolor\n", + "
576.33.34.70.1Iris-versicolor\n", + "
584.92.43.30.1Iris-versicolor\n", + "
596.62.94.60.1Iris-versicolor\n", + "
605.22.73.90.1Iris-versicolor\n", + "
61523.50.1Iris-versicolor\n", + "
625.934.20.1Iris-versicolor\n", + "
6362.240.1Iris-versicolor\n", + "
646.12.94.70.1Iris-versicolor\n", + "
655.62.93.60.1Iris-versicolor\n", + "
666.73.14.40.1Iris-versicolor\n", + "
675.634.50.1Iris-versicolor\n", + "
685.82.74.10.1Iris-versicolor\n", + "
696.22.24.50.1Iris-versicolor\n", + "
705.62.53.90.1Iris-versicolor\n", + "
715.93.24.80.1Iris-versicolor\n", + "
726.12.840.1Iris-versicolor\n", + "
736.32.54.90.1Iris-versicolor\n", + "
746.12.84.70.1Iris-versicolor\n", + "
756.42.94.30.1Iris-versicolor\n", + "
766.634.40.1Iris-versicolor\n", + "
776.82.84.80.1Iris-versicolor\n", + "
786.7350.1Iris-versicolor\n", + "
7962.94.50.1Iris-versicolor\n", + "
805.72.63.50.1Iris-versicolor\n", + "
815.52.43.80.1Iris-versicolor\n", + "
825.52.43.70.1Iris-versicolor\n", + "
835.82.73.90.1Iris-versicolor\n", + "
8462.75.10.1Iris-versicolor\n", + "
855.434.50.1Iris-versicolor\n", + "
8663.44.50.1Iris-versicolor\n", + "
876.73.14.70.1Iris-versicolor\n", + "
886.32.34.40.1Iris-versicolor\n", + "
895.634.10.1Iris-versicolor\n", + "
905.52.540.1Iris-versicolor\n", + "
915.52.64.40.1Iris-versicolor\n", + "
926.134.60.1Iris-versicolor\n", + "
935.82.640.1Iris-versicolor\n", + "
9452.33.30.1Iris-versicolor\n", + "
955.62.74.20.1Iris-versicolor\n", + "
965.734.20.1Iris-versicolor\n", + "
975.72.94.20.1Iris-versicolor\n", + "
986.22.94.30.1Iris-versicolor\n", + "
995.12.530.1Iris-versicolor\n", + "
1005.72.84.10.1Iris-versicolor\n", + "
1016.33.360.1Iris-virginica\n", + "
1025.82.75.10.1Iris-virginica\n", + "
1037.135.90.1Iris-virginica\n", + "
1046.32.95.60.1Iris-virginica\n", + "
1056.535.80.1Iris-virginica\n", + "
1067.636.60.1Iris-virginica\n", + "
1074.92.54.50.1Iris-virginica\n", + "
1087.32.96.30.1Iris-virginica\n", + "
1096.72.55.80.1Iris-virginica\n", + "
1107.23.66.10.1Iris-virginica\n", + "
1116.53.25.10.1Iris-virginica\n", + "
1126.42.75.30.1Iris-virginica\n", + "
1136.835.50.1Iris-virginica\n", + "
1145.72.550.1Iris-virginica\n", + "
1155.82.85.10.1Iris-virginica\n", + "
1166.43.25.30.1Iris-virginica\n", + "
1176.535.50.1Iris-virginica\n", + "
1187.73.86.70.1Iris-virginica\n", + "
1197.72.66.90.1Iris-virginica\n", + "
12062.250.1Iris-virginica\n", + "
1216.93.25.70.1Iris-virginica\n", + "
1225.62.84.90.1Iris-virginica\n", + "
1237.72.86.70.1Iris-virginica\n", + "
1246.32.74.90.1Iris-virginica\n", + "
1256.73.35.70.1Iris-virginica\n", + "
1267.23.260.1Iris-virginica\n", + "
1276.22.84.80.1Iris-virginica\n", + "
1286.134.90.1Iris-virginica\n", + "
1296.42.85.60.1Iris-virginica\n", + "
1307.235.80.1Iris-virginica\n", + "
1317.42.86.10.1Iris-virginica\n", + "
1327.93.86.40.1Iris-virginica\n", + "
1336.42.85.60.1Iris-virginica\n", + "
1346.32.85.10.1Iris-virginica\n", + "
1356.12.65.60.1Iris-virginica\n", + "
1367.736.10.1Iris-virginica\n", + "
1376.33.45.60.1Iris-virginica\n", + "
1386.43.15.50.1Iris-virginica\n", + "
139634.80.1Iris-virginica\n", + "
1406.93.15.40.1Iris-virginica\n", + "
1416.73.15.60.1Iris-virginica\n", + "
1426.93.15.10.1Iris-virginica\n", + "
1435.82.75.10.1Iris-virginica\n", + "
1446.83.25.90.1Iris-virginica\n", + "
1456.73.35.70.1Iris-virginica\n", + "
1466.735.20.1Iris-virginica\n", + "
1476.32.550.1Iris-virginica\n", + "
1486.535.20.1Iris-virginica\n", + "
1496.23.45.40.1\n", + "
150NULL35.10.1Iris-virginica\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "SELECT * FROM IRIS;" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "f7b8f34e-e73b-45c4-a1c9-9c504ff12c4d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
missingpercent
Id00.00
SepalLengthCm21.33
SepalWidthCm32.00
PetalLengthCm10.67
PetalWidthCm00.00
Species21.33
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%missing" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "cd7df893-e1e7-4372-b484-a22dc3702484", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREVIEW: would drop 8 row(s) (from 150 to 142).\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_would_be_dropped
74.6NaN1.40.3Iris-setosaTrue
144.33.01.10.1NaNTrue
175.4NaN1.30.1Iris-setosaTrue
28NaN3.51.50.1Iris-setosaTrue
314.83.1NaN0.1Iris-setosaTrue
375.5NaN1.30.1Iris-setosaTrue
1496.23.45.40.1NaNTrue
150NaN3.05.10.1Iris-virginicaTrue
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%dropmissing mode=preview" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "bdaff76a-42cb-4d07-8c13-b8365ea38560", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREVIEW: missing counts per column:\n", + "Id: missing=0\n", + "SepalLengthCm: missing=2\n", + "SepalWidthCm: missing=3\n", + "PetalLengthCm: missing=1\n", + "PetalWidthCm: missing=0\n", + "Species: missing=2\n", + "PREVIEW: computed fill-values (best-effort):\n", + "Id: would fill with -> 75.5 (mean via local preview)\n", + "SepalLengthCm: would fill with -> 5.847297297297298 (mean via local preview)\n", + "SepalWidthCm: would fill with -> 3.042857142857143 (mean via local preview)\n", + "PetalLengthCm: would fill with -> 3.773154362416107 (mean via local preview)\n", + "PetalWidthCm: would fill with -> 0.1093333333333333 (mean via local preview)\n", + "Species: could NOT determine fill value (not numeric; cannot compute mean locally); would skip\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_null_columns
74.6NaN1.40.3Iris-setosaSepalWidthCm
144.33.01.10.1NaNSpecies
175.4NaN1.30.1Iris-setosaSepalWidthCm
28NaN3.51.50.1Iris-setosaSepalLengthCm
314.83.1NaN0.1Iris-setosaPetalLengthCm
375.5NaN1.30.1Iris-setosaSepalWidthCm
1496.23.45.40.1NaNSpecies
150NaN3.05.10.1Iris-virginicaSepalLengthCm
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%fillmissing mode=preview" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "78f0e9c1-c61a-4d79-bb3e-31efd05b4bd0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Outlier detection completed (non in-place). Summary:\n", + "Column 'Id': detected 0 outlier(s) using iqr.\n", + "Column 'SepalLengthCm': detected 0 outlier(s) using iqr.\n", + "Column 'SepalWidthCm': detected 4 outlier(s) using iqr.\n", + "Column 'PetalLengthCm': detected 0 outlier(s) using iqr.\n", + "Column 'PetalWidthCm': detected 11 outlier(s) using iqr.\n", + "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpeciesId_is_outlierSepalLengthCm_is_outlierSepalWidthCm_is_outlierPetalLengthCm_is_outlierPetalWidthCm_is_outlier
15.13.51.40.2Iris-setosaFalseFalseFalseFalseTrue
24.93.01.40.2Iris-setosaFalseFalseFalseFalseTrue
34.73.21.30.2Iris-setosaFalseFalseFalseFalseTrue
44.63.11.50.2Iris-setosaFalseFalseFalseFalseTrue
55.03.61.40.2Iris-setosaFalseFalseFalseFalseTrue
65.43.91.70.4Iris-setosaFalseFalseFalseFalseTrue
74.6NaN1.40.3Iris-setosaFalseFalseFalseFalseTrue
85.03.41.50.2Iris-setosaFalseFalseFalseFalseTrue
94.42.91.40.2Iris-setosaFalseFalseFalseFalseTrue
104.93.11.50.1Iris-setosaFalseFalseFalseFalseFalse
115.43.71.50.2Iris-setosaFalseFalseFalseFalseTrue
124.83.41.60.2Iris-setosaFalseFalseFalseFalseTrue
134.83.01.40.1Iris-setosaFalseFalseFalseFalseFalse
144.33.01.10.1NaNFalseFalseFalseFalseFalse
155.84.01.20.1Iris-setosaFalseFalseFalseFalseFalse
165.74.41.50.1Iris-setosaFalseFalseTrueFalseFalse
175.4NaN1.30.1Iris-setosaFalseFalseFalseFalseFalse
185.13.51.40.1Iris-setosaFalseFalseFalseFalseFalse
195.73.81.70.1Iris-setosaFalseFalseFalseFalseFalse
205.13.81.50.1Iris-setosaFalseFalseFalseFalseFalse
215.43.41.70.1Iris-setosaFalseFalseFalseFalseFalse
225.13.71.50.1Iris-setosaFalseFalseFalseFalseFalse
234.63.61.00.1Iris-setosaFalseFalseFalseFalseFalse
245.13.31.70.1Iris-setosaFalseFalseFalseFalseFalse
254.83.41.90.1Iris-setosaFalseFalseFalseFalseFalse
265.03.01.60.1Iris-setosaFalseFalseFalseFalseFalse
275.03.41.60.1Iris-setosaFalseFalseFalseFalseFalse
28NaN3.51.50.1Iris-setosaFalseFalseFalseFalseFalse
295.23.41.40.1Iris-setosaFalseFalseFalseFalseFalse
304.73.21.60.1Iris-setosaFalseFalseFalseFalseFalse
314.83.1NaN0.1Iris-setosaFalseFalseFalseFalseFalse
325.43.41.50.1Iris-setosaFalseFalseFalseFalseFalse
335.24.11.50.1Iris-setosaFalseFalseTrueFalseFalse
345.54.21.40.1Iris-setosaFalseFalseTrueFalseFalse
354.93.11.50.1Iris-setosaFalseFalseFalseFalseFalse
365.03.21.20.1Iris-setosaFalseFalseFalseFalseFalse
375.5NaN1.30.1Iris-setosaFalseFalseFalseFalseFalse
384.93.11.50.1Iris-setosaFalseFalseFalseFalseFalse
394.43.01.30.1Iris-setosaFalseFalseFalseFalseFalse
405.13.41.50.1Iris-setosaFalseFalseFalseFalseFalse
415.03.51.30.1Iris-setosaFalseFalseFalseFalseFalse
424.52.31.30.1Iris-setosaFalseFalseFalseFalseFalse
434.43.21.30.1Iris-setosaFalseFalseFalseFalseFalse
445.03.51.60.1Iris-setosaFalseFalseFalseFalseFalse
455.13.81.90.1Iris-setosaFalseFalseFalseFalseFalse
464.83.01.40.1Iris-setosaFalseFalseFalseFalseFalse
475.13.81.60.1Iris-setosaFalseFalseFalseFalseFalse
484.63.21.40.1Iris-setosaFalseFalseFalseFalseFalse
495.33.71.50.1Iris-setosaFalseFalseFalseFalseFalse
505.03.31.40.1Iris-setosaFalseFalseFalseFalseFalse
517.03.24.70.1Iris-versicolorFalseFalseFalseFalseFalse
526.43.24.50.1Iris-versicolorFalseFalseFalseFalseFalse
536.93.14.90.1Iris-versicolorFalseFalseFalseFalseFalse
545.52.34.00.1Iris-versicolorFalseFalseFalseFalseFalse
556.52.84.60.1Iris-versicolorFalseFalseFalseFalseFalse
565.72.84.50.1Iris-versicolorFalseFalseFalseFalseFalse
576.33.34.70.1Iris-versicolorFalseFalseFalseFalseFalse
584.92.43.30.1Iris-versicolorFalseFalseFalseFalseFalse
596.62.94.60.1Iris-versicolorFalseFalseFalseFalseFalse
605.22.73.90.1Iris-versicolorFalseFalseFalseFalseFalse
615.02.03.50.1Iris-versicolorFalseFalseTrueFalseFalse
625.93.04.20.1Iris-versicolorFalseFalseFalseFalseFalse
636.02.24.00.1Iris-versicolorFalseFalseFalseFalseFalse
646.12.94.70.1Iris-versicolorFalseFalseFalseFalseFalse
655.62.93.60.1Iris-versicolorFalseFalseFalseFalseFalse
666.73.14.40.1Iris-versicolorFalseFalseFalseFalseFalse
675.63.04.50.1Iris-versicolorFalseFalseFalseFalseFalse
685.82.74.10.1Iris-versicolorFalseFalseFalseFalseFalse
696.22.24.50.1Iris-versicolorFalseFalseFalseFalseFalse
705.62.53.90.1Iris-versicolorFalseFalseFalseFalseFalse
715.93.24.80.1Iris-versicolorFalseFalseFalseFalseFalse
726.12.84.00.1Iris-versicolorFalseFalseFalseFalseFalse
736.32.54.90.1Iris-versicolorFalseFalseFalseFalseFalse
746.12.84.70.1Iris-versicolorFalseFalseFalseFalseFalse
756.42.94.30.1Iris-versicolorFalseFalseFalseFalseFalse
766.63.04.40.1Iris-versicolorFalseFalseFalseFalseFalse
776.82.84.80.1Iris-versicolorFalseFalseFalseFalseFalse
786.73.05.00.1Iris-versicolorFalseFalseFalseFalseFalse
796.02.94.50.1Iris-versicolorFalseFalseFalseFalseFalse
805.72.63.50.1Iris-versicolorFalseFalseFalseFalseFalse
815.52.43.80.1Iris-versicolorFalseFalseFalseFalseFalse
825.52.43.70.1Iris-versicolorFalseFalseFalseFalseFalse
835.82.73.90.1Iris-versicolorFalseFalseFalseFalseFalse
846.02.75.10.1Iris-versicolorFalseFalseFalseFalseFalse
855.43.04.50.1Iris-versicolorFalseFalseFalseFalseFalse
866.03.44.50.1Iris-versicolorFalseFalseFalseFalseFalse
876.73.14.70.1Iris-versicolorFalseFalseFalseFalseFalse
886.32.34.40.1Iris-versicolorFalseFalseFalseFalseFalse
895.63.04.10.1Iris-versicolorFalseFalseFalseFalseFalse
905.52.54.00.1Iris-versicolorFalseFalseFalseFalseFalse
915.52.64.40.1Iris-versicolorFalseFalseFalseFalseFalse
926.13.04.60.1Iris-versicolorFalseFalseFalseFalseFalse
935.82.64.00.1Iris-versicolorFalseFalseFalseFalseFalse
945.02.33.30.1Iris-versicolorFalseFalseFalseFalseFalse
955.62.74.20.1Iris-versicolorFalseFalseFalseFalseFalse
965.73.04.20.1Iris-versicolorFalseFalseFalseFalseFalse
975.72.94.20.1Iris-versicolorFalseFalseFalseFalseFalse
986.22.94.30.1Iris-versicolorFalseFalseFalseFalseFalse
995.12.53.00.1Iris-versicolorFalseFalseFalseFalseFalse
1005.72.84.10.1Iris-versicolorFalseFalseFalseFalseFalse
1016.33.36.00.1Iris-virginicaFalseFalseFalseFalseFalse
1025.82.75.10.1Iris-virginicaFalseFalseFalseFalseFalse
1037.13.05.90.1Iris-virginicaFalseFalseFalseFalseFalse
1046.32.95.60.1Iris-virginicaFalseFalseFalseFalseFalse
1056.53.05.80.1Iris-virginicaFalseFalseFalseFalseFalse
1067.63.06.60.1Iris-virginicaFalseFalseFalseFalseFalse
1074.92.54.50.1Iris-virginicaFalseFalseFalseFalseFalse
1087.32.96.30.1Iris-virginicaFalseFalseFalseFalseFalse
1096.72.55.80.1Iris-virginicaFalseFalseFalseFalseFalse
1107.23.66.10.1Iris-virginicaFalseFalseFalseFalseFalse
1116.53.25.10.1Iris-virginicaFalseFalseFalseFalseFalse
1126.42.75.30.1Iris-virginicaFalseFalseFalseFalseFalse
1136.83.05.50.1Iris-virginicaFalseFalseFalseFalseFalse
1145.72.55.00.1Iris-virginicaFalseFalseFalseFalseFalse
1155.82.85.10.1Iris-virginicaFalseFalseFalseFalseFalse
1166.43.25.30.1Iris-virginicaFalseFalseFalseFalseFalse
1176.53.05.50.1Iris-virginicaFalseFalseFalseFalseFalse
1187.73.86.70.1Iris-virginicaFalseFalseFalseFalseFalse
1197.72.66.90.1Iris-virginicaFalseFalseFalseFalseFalse
1206.02.25.00.1Iris-virginicaFalseFalseFalseFalseFalse
1216.93.25.70.1Iris-virginicaFalseFalseFalseFalseFalse
1225.62.84.90.1Iris-virginicaFalseFalseFalseFalseFalse
1237.72.86.70.1Iris-virginicaFalseFalseFalseFalseFalse
1246.32.74.90.1Iris-virginicaFalseFalseFalseFalseFalse
1256.73.35.70.1Iris-virginicaFalseFalseFalseFalseFalse
1267.23.26.00.1Iris-virginicaFalseFalseFalseFalseFalse
1276.22.84.80.1Iris-virginicaFalseFalseFalseFalseFalse
1286.13.04.90.1Iris-virginicaFalseFalseFalseFalseFalse
1296.42.85.60.1Iris-virginicaFalseFalseFalseFalseFalse
1307.23.05.80.1Iris-virginicaFalseFalseFalseFalseFalse
1317.42.86.10.1Iris-virginicaFalseFalseFalseFalseFalse
1327.93.86.40.1Iris-virginicaFalseFalseFalseFalseFalse
1336.42.85.60.1Iris-virginicaFalseFalseFalseFalseFalse
1346.32.85.10.1Iris-virginicaFalseFalseFalseFalseFalse
1356.12.65.60.1Iris-virginicaFalseFalseFalseFalseFalse
1367.73.06.10.1Iris-virginicaFalseFalseFalseFalseFalse
1376.33.45.60.1Iris-virginicaFalseFalseFalseFalseFalse
1386.43.15.50.1Iris-virginicaFalseFalseFalseFalseFalse
1396.03.04.80.1Iris-virginicaFalseFalseFalseFalseFalse
1406.93.15.40.1Iris-virginicaFalseFalseFalseFalseFalse
1416.73.15.60.1Iris-virginicaFalseFalseFalseFalseFalse
1426.93.15.10.1Iris-virginicaFalseFalseFalseFalseFalse
1435.82.75.10.1Iris-virginicaFalseFalseFalseFalseFalse
1446.83.25.90.1Iris-virginicaFalseFalseFalseFalseFalse
1456.73.35.70.1Iris-virginicaFalseFalseFalseFalseFalse
1466.73.05.20.1Iris-virginicaFalseFalseFalseFalseFalse
1476.32.55.00.1Iris-virginicaFalseFalseFalseFalseFalse
1486.53.05.20.1Iris-virginicaFalseFalseFalseFalseFalse
1496.23.45.40.1NaNFalseFalseFalseFalseFalse
150NaN3.05.10.1Iris-virginicaFalseFalseFalseFalseFalse
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%outliers" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "740e0779-369c-4f26-ba1d-be52a342fd90", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREVIEW (local): would modify 15 value(s) across 5 column(s).\n", + "Column 'Id': would clip 0 value(s) locally (bounds: -73.5, 224.5).\n", + "Column 'SepalLengthCm': would clip 0 value(s) locally (bounds: 3.1499999999999986, 8.350000000000001).\n", + "Column 'SepalWidthCm': would clip 4 value(s) locally (bounds: 2.05, 4.05).\n", + "Column 'PetalLengthCm': would clip 0 value(s) locally (bounds: -3.649999999999999, 10.349999999999998).\n", + "Column 'PetalWidthCm': would clip 11 value(s) locally (bounds: 0.1, 0.1).\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_oob_columns
15.13.51.40.2Iris-setosaPetalWidthCm
24.93.01.40.2Iris-setosaPetalWidthCm
34.73.21.30.2Iris-setosaPetalWidthCm
44.63.11.50.2Iris-setosaPetalWidthCm
55.03.61.40.2Iris-setosaPetalWidthCm
65.43.91.70.4Iris-setosaPetalWidthCm
74.6NaN1.40.3Iris-setosaPetalWidthCm
85.03.41.50.2Iris-setosaPetalWidthCm
94.42.91.40.2Iris-setosaPetalWidthCm
115.43.71.50.2Iris-setosaPetalWidthCm
124.83.41.60.2Iris-setosaPetalWidthCm
165.74.41.50.1Iris-setosaSepalWidthCm
335.24.11.50.1Iris-setosaSepalWidthCm
345.54.21.40.1Iris-setosaSepalWidthCm
615.02.03.50.1Iris-versicolorSepalWidthCm
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%clipoutliers mode=preview" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e73470c4-20e2-4a6d-9b5c-ad21d41659e8", + "metadata": {}, + "outputs": [], + "source": [ + "%encode " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MariaDB", + "language": "SQL", + "name": "mariadb_kernel" + }, + "language_info": { + "file_extension": ".sql", + "mimetype": "text/plain", + "name": "SQL" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Untitled.ipynb b/Untitled.ipynb index 4739c9a..4821ba6 100644 --- a/Untitled.ipynb +++ b/Untitled.ipynb @@ -6739,7 +6739,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "66a8378a-6f87-4b38-a729-5aab1a288cb0", "metadata": {}, "outputs": [ @@ -6969,74 +6969,9 @@ " 50.0\n", " 1\n", " \n", - " \n", - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%encode method=onehot columns=department drop_original=false mode=preview" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "2ebe63dd-3817-423b-bfb1-8e4b57dfb0a9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PREVIEW (local):\n", - "Local: Column 'department' unique non-null values: 4 (showing up to 10): ['HR', 'Engineering', 'Sales', 'Finance']\n", - "PREVIEW (local) estimated created columns: 4\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -7056,13 +6991,8 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " 
\n", " \n", " \n", @@ -7082,13 +7012,8 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -7108,13 +7033,8 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -7134,13 +7054,8 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -7160,13 +7075,8 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -7186,13 +7096,8 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -7212,13 +7117,8 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -7238,13 +7138,8 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -7263,72 +7158,9 @@ " \n", " \n", " \n", - " \n", - "
emp_idnamedepartment_HRdepartment_Engineeringdepartment_Salesdepartment_NULLdepartment_Financedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
111Alice10000HR30F0
212Bob01000Engineering45M0
313Charlie00100Sales38M1
414Diana01000Engineering29F0
616Frank10000HR50M1
717Grace00100Sales42F0
818Henry01000Engineering31M0
919Ivy00001Finance27F0
1020Jack00100Sales55M50.01
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%encode method=onehot columns=department drop_original=false " - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "4760a0c0-869a-4e77-bce1-8985bca8006f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Apply completed: original preserved as test.employees_backup_8d3413bf829d4cd8.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -7348,13 +7180,8 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -7374,13 +7201,8 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -7400,13 +7222,8 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -7426,39 +7243,8 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -7478,13 +7264,8 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -7504,13 +7285,8 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", " \n", @@ -7530,13 +7306,8 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", " \n", " \n", " \n", @@ -7556,13 +7327,197 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -7590,38 +7545,22 @@ } ], "source": [ - "%encode method=onehot columns=department drop_original=false table=test.employees mode=apply confirm=true " - ] - }, - { - "cell_type": "code", 
- "execution_count": 7, - "id": "65d25956-0729-4e5f-9f41-d91ab3361655", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rollback: restored test.employees_backup_8d3413bf829d4cd8 -> test.employees; previous test.employees renamed to test.employees_prerollback_8d3413bf829d4cd8.\n" - ] - } - ], - "source": [ - "%encode mode=rollback rollback_token=8d3413bf829d4cd8" + "%encode method=onehot columns=department drop_original=false mode=preview" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "ac369830-92cc-44ba-b7b2-cfa7b620aafe", + "execution_count": 8, + "id": "2ebe63dd-3817-423b-bfb1-8e4b57dfb0a9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Encoded columns in-place and updated last_select.\n" + "PREVIEW (local):\n", + "Local: Column 'department' unique non-null values: 4 (showing up to 10): ['HR', 'Engineering', 'Sales', 'Finance']\n", + "PREVIEW (local) estimated created columns: 4\n" ] }, { @@ -7630,284 +7569,648 @@ "
emp_idnamedepartment_HRdepartment_Engineeringdepartment_Salesdepartment_NULLdepartment_Financedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
121Alice10000HR30F0
222Bob01000Engineering45M0
323Charlie00100Sales38M1
424Diana01000Engineering29F0
5Eve00010NaN35FBachelors8.01588.013060.390000.08000.08.0485.00
626Frank10000HR50M1
727Grace00100Sales42F0
828Henry01000Engineering31M0
929Ivy00001Finance27F0
1030JackSales55MHigh School30.01268.905250.865000.02000.05.5150.01
31AliceHR30FNaN5.01287.514050.255000.0300.08.5475.00
32BobEngineering45MMasters20.03091.0320100.11200000.015000.09.0589.00
33CharlieSales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01
34DianaEngineering29FPhD6.02295.225020.097000.010000.09.6595.00
36FrankHR50MHigh School25.0872.5010150.760000.04000.06.5260.01
37GraceSales42FBachelors18.02081.4125120.485000.07000.07.8374.00
38HenryEngineering31MMasters7.02593.123550.295000.09000.09.1590.00
39IvyFinance27FBachelors3.01085.002080.670000.05000.08.2482.00
40JackSales55M
\n", " \n", " \n", - " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + "
idemp_idnamedepartment_HRdepartment_Engineeringdepartment_Salesdepartment_NULLdepartment_Financedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarydepartment_Engineeringdepartment_HRdepartment_Salesdepartment_Unknownbonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1Alice10000HR30.05000.00.01.00.00.030FNaN5.01287.514050.255000.0300.08.5475.00
2BobUnknown40.065000.00.00.00.01.001000Engineering45MMasters20.03091.0320100.11200000.015000.09.0589.00
3CharlieEngineering35.0700000.01.00.00.00.000100Sales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01
4DavidHR25.048000.00.01.00.0Diana01000Engineering29FPhD6.02295.225020.0
97000.010000.09.65EveUnknown35.065000.00.00.00.01.095.00
6FrankEngineering28.072000.01.00.00.00.010000HR50MHigh School25.0872.5010150.760000.04000.06.5260.01
7UnknownGrace00100Sales50.065000.00.00.01.00.042FBachelors18.02081.4125120.485000.07000.07.8374.00
8GraceSales45.065000.00.00.01.00.0Henry01000Engineering31MMasters7.02593.123550.295000.09000.09.1590.00
9AliceHR30.0Ivy00001Finance27FBachelors3.01085.002080.670000.05000.00.01.00.00.08.2482.00
10BobUnknown40.0Jack00100Sales55MHigh School30.01268.905250.865000.00.00.00.01.0
11CharlieEngineering35.0700000.01.00.00.00.0
12DavidHR25.048000.00.01.00.00.0
13EveUnknown35.065000.00.00.00.01.0
14FrankEngineering28.072000.01.00.00.00.0
15UnknownSales2000.05.5150.065000.00.00.01.00.01
16GraceSales45.065000.00.00.01.00.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%encode method=onehot columns=department drop_original=false " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4760a0c0-869a-4e77-bce1-8985bca8006f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apply completed: original preserved as test.employees_backup_8d3413bf829d4cd8.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
emp_idnamedepartment_HRdepartment_Engineeringdepartment_Salesdepartment_NULLdepartment_Financedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
171Alice10000HR30.05000.00.01.00.00.030FNaN5.01287.514050.255000.0300.08.5475.00
182BobUnknown40.065000.00.00.00.01.001000Engineering45MMasters20.03091.0320100.11200000.015000.09.0589.00
193CharlieEngineering35.0700000.01.00.00.00.000100Sales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01
20DavidHR25.048000.00.01.00.04Diana01000Engineering29FPhD6.02295.225020.097000.010000.09.6595.00
215EveUnknown35.065000.00.00.00.01.000010NaN35FBachelors8.01588.013060.390000.08000.08.0485.00
226FrankEngineering28.072000.01.00.00.00.010000HR50MHigh School25.0872.5010150.760000.04000.06.5260.01
23Unknown7Grace00100Sales50.065000.00.00.01.00.042FBachelors18.02081.4125120.485000.07000.07.8374.00
24Grace8Henry01000Engineering31MMasters7.02593.123550.295000.09000.09.1590.00
9Ivy00001Finance27FBachelors3.01085.002080.670000.05000.08.2482.00
10Jack00100Sales45.055MHigh School30.01268.905250.865000.00.00.01.00.02000.05.5150.01
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%encode method=onehot columns=department drop_original=false table=test.employees mode=apply confirm=true " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "65d25956-0729-4e5f-9f41-d91ab3361655", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rollback: restored test.employees_backup_8d3413bf829d4cd8 -> test.employees; previous test.employees renamed to test.employees_prerollback_8d3413bf829d4cd8.\n" + ] + } + ], + "source": [ + "%encode mode=rollback rollback_token=8d3413bf829d4cd8" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ac369830-92cc-44ba-b7b2-cfa7b620aafe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Encoded columns in-place and updated last_select.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -7918,7 +8221,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -7929,7 +8232,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -7940,7 +8243,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -7951,7 +8254,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -7962,7 +8265,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -7973,7 +8276,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -7984,7 +8287,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -7995,7 +8298,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -8006,7 +8309,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -8017,7 +8320,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -8028,7 +8331,7 @@ " \n", " \n", " \n", - " \n", 
+ " \n", " \n", " \n", " \n", @@ -8039,7 +8342,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -8050,7 +8353,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -8061,7 +8364,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -8072,7 +8375,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -8082,648 +8385,513 @@ " \n", " \n", " \n", - " \n", - "
idnamedepartmentagesalarydepartment_Engineeringdepartment_HRdepartment_Salesdepartment_Unknown
251AliceHR30.00.0
262BobUnknown40.01.0
273CharlieEngineering35.00.0
284DavidHR25.00.0
295EveUnknown35.01.0
306FrankEngineering28.00.0
317UnknownSales50.00.0
328GraceSales45.00.0
339AliceHR30.00.0
3410BobUnknown40.01.0
3511CharlieEngineering35.00.0
3612DavidHR25.00.0
3713EveUnknown35.01.0
3814FrankEngineering28.00.0
3915UnknownSales50.00.0
4016GraceSales45.01.00.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%encode method=onehot columns=department drop_original=false" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "ed2b1c8f-4372-4cb4-9edb-f27bfa07ee83", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Encoded columns in-place and updated last_select.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalarydepartment_lbldepartment_Engineeringdepartment_HRdepartment_Salesdepartment_Unknowndepartment_ord
117AliceHR30.05000.010.01.00.00.01.0
218BobUnknown40.0NaN365000.00.00.00.01.03.0
319CharlieEngineeringNaN35.0700000.001.00.00.00.00.0
420DavidHR25.048000.010.01.00.00.01.0
521EveUnknown35.0NaN365000.00.00.00.01.03.0
622FrankEngineering28.072000.001.00.00.00.00.0
723UnknownSales50.0NaN265000.00.00.01.00.02.0
824GraceSales45.065000.020.00.01.00.02.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%encode method=ordinal columns=department drop_original=false" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "ab1491d5-4a2b-46e8-a079-dab57ae95afe", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Normalized 3 column(s) to range (0.0, 1.0). Updated data['last_select'] in-place.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - "
idnamedepartmentagesalary
0.00000025AliceHR0.200.00000030.05000.00.01.00.00.0
0.14285726BobNaN0.60NaNUnknown40.065000.00.00.00.01.0
0.28571427CharlieEngineeringNaN1.00000035.0700000.01.00.00.00.0
0.42857128DavidHR0.000.06187125.048000.00.01.00.00.0
0.57142929EveNaN0.40NaNUnknown35.065000.00.00.00.01.0
0.71428630FrankEngineering0.120.09640328.072000.01.00.00.00.0
0.857143NaN31UnknownSales1.00NaN50.065000.00.00.01.00.0
1.00000032GraceSales0.800.08633145.065000.00.00.01.00.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%normalize" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "1708d88d-db07-40cb-aeef-fcb6baffe649", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PREVIEW (local):\n", - "Local: Column 'emp_id' min=1.0, max=10.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'age' min=27.0, max=55.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'years_experience' min=3.0, max=30.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'projects_completed' min=8.0, max=30.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'avg_project_score' min=68.9, max=95.2 -> range will map to (5.0, 10.0)\n", - "Local: Column 'certifications' min=0.0, max=3.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'training_hours' min=5.0, max=50.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'overtime_hours' min=2.0, max=25.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'remote_ratio' min=0.0, max=0.8 -> range will map to (5.0, 10.0)\n", - "Local: Column 'salary' min=55000.0, max=1200000.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'bonus' min=300.0, max=15000.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'satisfaction_score' min=5.5, max=9.6 -> range will map to (5.0, 10.0)\n", - "Local: Column 'performance_rating' min=1.0, max=5.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'potential_score' min=50.0, max=95.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'attrition_flag' min=0.0, max=1.0 -> range will map to (5.0, 10.0)\n" - ] - }, - { - "data": { - "text/html": [ - "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
33AliceHR30.05000.00.01.00.00.0
34BobUnknown40.065000.00.00.00.01.0
35CharlieEngineering35.0700000.01.00.00.00.0
36DavidHR25.048000.00.01.00.00.0
37EveUnknown35.065000.00.00.00.01.0
38FrankEngineering28.072000.01.00.00.00.0
39UnknownSales50.065000.00.00.01.00.0
40GraceSales45.065000.00.00.01.00.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%encode method=onehot columns=department drop_original=false" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ed2b1c8f-4372-4cb4-9edb-f27bfa07ee83", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Encoded columns in-place and updated last_select.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
emp_ididnamedepartmentageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagemp_id_norm_previewage_norm_previewyears_experience_norm_previewprojects_completed_norm_previewavg_project_score_norm_previewcertifications_norm_previewtraining_hours_norm_previewovertime_hours_norm_previewremote_ratio_norm_previewsalary_norm_previewbonus_norm_previewsatisfaction_score_norm_previewperformance_rating_norm_previewpotential_score_norm_previewattrition_flag_norm_previewdepartment_lbldepartment_Engineeringdepartment_HRdepartment_Salesdepartment_Unknowndepartment_ord
1305.01287.5AliceHR30.05000.014050.255000.0300.08.5475.005.0000005.5357145.3703705.9090918.5361226.6666678.8888895.6521746.2505.0000005.0000008.6585378.757.7777785.00.01.00.00.01.0
24520.03091.0BobUnknown40.0NaN320100.11200000.015000.09.0589.005.5555568.2142868.14814810.0000009.20152110.0000006.6666676.7391305.62510.00000010.0000009.26829310.009.3333335.00.00.00.01.03.0
338CharlieEngineeringNaN1879.3700000.0015200.580000.07000.07.2370.01.00.00.00.00.0
4DavidHR25.048000.016.1111116.964286NaN7.2727276.9771865.0000006.1111118.9130438.1255.1091707.2789127.0731717.507.22222210.0
4296.02295.225020.097000.010000.09.6595.006.6666675.3571435.5555568.18181810.0000008.33333310.0000005.0000005.0005.1834068.29932010.00000010.0010.0000005.01.00.00.01.0
5358.01588.013060.390000.08000.08.0485.007.2222226.4285715.9259266.5909098.6311796.6666677.7777785.8695656.8755.1528387.6190488.0487808.758.8888895.0EveUnknown35.0NaN30.00.00.01.03.0
65025.0872.5FrankEngineering28.072000.0010150.760000.04000.06.5260.017.7777789.1071439.0740745.0000005.6844115.0000005.5555567.8260879.3755.0218346.2585036.2195126.256.11111110.01.00.00.00.00.0
74218.02081.4125120.485000.07000.07.8374.008.3333337.6785717.7777787.7272737.3764266.6666677.2222227.1739137.5005.1310047.2789127.8048787.507.6666675.0UnknownSales50.0NaN20.00.01.00.02.0
8317.02593.1GraceSales45.065000.023550.295000.09000.09.1590.008.8888895.7142865.7407418.8636369.6007608.3333338.3333335.6521746.2505.1746727.9591849.39024410.009.4444445.00.00.01.00.02.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%encode method=ordinal columns=department drop_original=false" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "ab1491d5-4a2b-46e8-a079-dab57ae95afe", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Normalized 3 column(s) to range (0.0, 1.0). Updated data['last_select'] in-place.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
idnamedepartmentagesalary
9273.01085.002080.670000.05000.08.2482.009.4444445.0000005.0000005.4545458.0608375.0000006.6666676.3043488.7505.0655026.5986398.2926838.758.5555565.00.000000AliceHR0.200.000000
105530.01268.905250.865000.02000.05.5150.0110.00000010.00000010.0000005.9090915.0000005.0000005.00000010.00000010.0005.0436685.5782315.0000005.005.00000010.00.142857BobNaN0.60NaN
0.285714CharlieEngineeringNaN1.000000
0.428571DavidHR0.000.061871
0.571429EveNaN0.40NaN
0.714286FrankEngineering0.120.096403
0.857143NaNSales1.00NaN
1.000000GraceSales0.800.086331
" @@ -8734,20 +8902,35 @@ } ], "source": [ - "%normalize feature_range=5,10 mode=preview" + "%normalize" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "6685559f-8986-4504-97eb-e62e20275bd3", + "execution_count": 5, + "id": "1708d88d-db07-40cb-aeef-fcb6baffe649", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Apply completed: original preserved as test.employees_backup_daf864252a6c46f1.\n" + "PREVIEW (local):\n", + "Local: Column 'emp_id' min=1.0, max=40.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'age' min=27.0, max=55.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'years_experience' min=3.0, max=30.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'projects_completed' min=8.0, max=30.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'avg_project_score' min=68.9, max=95.2 -> range will map to (5.0, 10.0)\n", + "Local: Column 'certifications' min=0.0, max=3.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'training_hours' min=5.0, max=50.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'overtime_hours' min=2.0, max=25.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'remote_ratio' min=0.0, max=0.8 -> range will map to (5.0, 10.0)\n", + "Local: Column 'salary' min=55000.0, max=1200000.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'bonus' min=300.0, max=15000.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'satisfaction_score' min=5.5, max=9.6 -> range will map to (5.0, 10.0)\n", + "Local: Column 'performance_rating' min=1.0, max=5.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'potential_score' min=50.0, max=95.0 -> range will map to (5.0, 10.0)\n", + "Local: Column 'attrition_flag' min=0.0, max=1.0 -> range will map to (5.0, 10.0)\n" ] }, { @@ -8757,11 +8940,7 @@ " \n", " \n", " emp_id\n", - " name\n", - " department\n", " age\n", - " gender\n", - " education_level\n", " years_experience\n", " projects_completed\n", " 
avg_project_score\n", @@ -8775,16 +8954,42 @@ " performance_rating\n", " potential_score\n", " attrition_flag\n", + " emp_id_norm_preview\n", + " age_norm_preview\n", + " years_experience_norm_preview\n", + " projects_completed_norm_preview\n", + " avg_project_score_norm_preview\n", + " certifications_norm_preview\n", + " training_hours_norm_preview\n", + " overtime_hours_norm_preview\n", + " remote_ratio_norm_preview\n", + " salary_norm_preview\n", + " bonus_norm_preview\n", + " satisfaction_score_norm_preview\n", + " performance_rating_norm_preview\n", + " potential_score_norm_preview\n", + " attrition_flag_norm_preview\n", " \n", " \n", " \n", " \n", + " 1\n", + " 30\n", + " 5.0\n", + " 12\n", + " 87.5\n", + " 1\n", + " 40\n", + " 5\n", + " 0.2\n", + " 55000.0\n", + " 300.0\n", + " 8.5\n", + " 4\n", + " 75.0\n", + " 0\n", " 5.000000\n", - " Alice\n", - " HR\n", " 5.535714\n", - " F\n", - " NaN\n", " 5.370370\n", " 5.909091\n", " 8.536122\n", @@ -8800,12 +9005,23 @@ " 5.0\n", " \n", " \n", - " 5.555556\n", - " Bob\n", - " Engineering\n", + " 2\n", + " 45\n", + " 20.0\n", + " 30\n", + " 91.0\n", + " 3\n", + " 20\n", + " 10\n", + " 0.1\n", + " 1200000.0\n", + " 15000.0\n", + " 9.0\n", + " 5\n", + " 89.0\n", + " 0\n", + " 5.128205\n", " 8.214286\n", - " M\n", - " Masters\n", " 8.148148\n", " 10.000000\n", " 9.201521\n", @@ -8821,12 +9037,23 @@ " 5.0\n", " \n", " \n", - " 6.111111\n", - " Charlie\n", - " Sales\n", + " 3\n", + " 38\n", + " NaN\n", + " 18\n", + " 79.3\n", + " 0\n", + " 15\n", + " 20\n", + " 0.5\n", + " 80000.0\n", + " 7000.0\n", + " 7.2\n", + " 3\n", + " 70.0\n", + " 1\n", + " 5.256410\n", " 6.964286\n", - " M\n", - " Bachelors\n", " NaN\n", " 7.272727\n", " 6.977186\n", @@ -8842,12 +9069,23 @@ " 10.0\n", " \n", " \n", - " 6.666667\n", - " Diana\n", - " Engineering\n", + " 4\n", + " 29\n", + " 6.0\n", + " 22\n", + " 95.2\n", + " 2\n", + " 50\n", + " 2\n", + " 0.0\n", + " 97000.0\n", + " 10000.0\n", + " 9.6\n", + " 5\n", + " 95.0\n", + " 0\n", + " 
5.384615\n", " 5.357143\n", - " F\n", - " PhD\n", " 5.555556\n", " 8.181818\n", " 10.000000\n", @@ -8863,12 +9101,23 @@ " 5.0\n", " \n", " \n", - " 7.222222\n", - " Eve\n", - " NaN\n", + " 5\n", + " 35\n", + " 8.0\n", + " 15\n", + " 88.0\n", + " 1\n", + " 30\n", + " 6\n", + " 0.3\n", + " 90000.0\n", + " 8000.0\n", + " 8.0\n", + " 4\n", + " 85.0\n", + " 0\n", + " 5.512821\n", " 6.428571\n", - " F\n", - " Bachelors\n", " 5.925926\n", " 6.590909\n", " 8.631179\n", @@ -8884,12 +9133,23 @@ " 5.0\n", " \n", " \n", - " 7.777778\n", - " Frank\n", - " HR\n", + " 6\n", + " 50\n", + " 25.0\n", + " 8\n", + " 72.5\n", + " 0\n", + " 10\n", + " 15\n", + " 0.7\n", + " 60000.0\n", + " 4000.0\n", + " 6.5\n", + " 2\n", + " 60.0\n", + " 1\n", + " 5.641026\n", " 9.107143\n", - " M\n", - " High School\n", " 9.074074\n", " 5.000000\n", " 5.684411\n", @@ -8905,12 +9165,23 @@ " 10.0\n", " \n", " \n", - " 8.333333\n", - " Grace\n", - " Sales\n", + " 7\n", + " 42\n", + " 18.0\n", + " 20\n", + " 81.4\n", + " 1\n", + " 25\n", + " 12\n", + " 0.4\n", + " 85000.0\n", + " 7000.0\n", + " 7.8\n", + " 3\n", + " 74.0\n", + " 0\n", + " 5.769231\n", " 7.678571\n", - " F\n", - " Bachelors\n", " 7.777778\n", " 7.727273\n", " 7.376426\n", @@ -8926,167 +9197,1103 @@ " 5.0\n", " \n", " \n", - " 8.888889\n", - " Henry\n", - " Engineering\n", - " 5.714286\n", - " M\n", - " Masters\n", - " 5.740741\n", - " 8.863636\n", - " 9.600760\n", - " 8.333333\n", - " 8.333333\n", - " 5.652174\n", - " 6.250\n", - " 5.174672\n", - " 7.959184\n", - " 9.390244\n", - " 10.00\n", - " 9.444444\n", - " 5.0\n", + " 8\n", + " 31\n", + " 7.0\n", + " 25\n", + " 93.1\n", + " 2\n", + " 35\n", + " 5\n", + " 0.2\n", + " 95000.0\n", + " 9000.0\n", + " 9.1\n", + " 5\n", + " 90.0\n", + " 0\n", + " 5.897436\n", + " 5.714286\n", + " 5.740741\n", + " 8.863636\n", + " 9.600760\n", + " 8.333333\n", + " 8.333333\n", + " 5.652174\n", + " 6.250\n", + " 5.174672\n", + " 7.959184\n", + " 9.390244\n", + " 10.00\n", + " 9.444444\n", + " 5.0\n", + " 
\n", + " \n", + " 9\n", + " 27\n", + " 3.0\n", + " 10\n", + " 85.0\n", + " 0\n", + " 20\n", + " 8\n", + " 0.6\n", + " 70000.0\n", + " 5000.0\n", + " 8.2\n", + " 4\n", + " 82.0\n", + " 0\n", + " 6.025641\n", + " 5.000000\n", + " 5.000000\n", + " 5.454545\n", + " 8.060837\n", + " 5.000000\n", + " 6.666667\n", + " 6.304348\n", + " 8.750\n", + " 5.065502\n", + " 6.598639\n", + " 8.292683\n", + " 8.75\n", + " 8.555556\n", + " 5.0\n", + " \n", + " \n", + " 10\n", + " 55\n", + " 30.0\n", + " 12\n", + " 68.9\n", + " 0\n", + " 5\n", + " 25\n", + " 0.8\n", + " 65000.0\n", + " 2000.0\n", + " 5.5\n", + " 1\n", + " 50.0\n", + " 1\n", + " 6.153846\n", + " 10.000000\n", + " 10.000000\n", + " 5.909091\n", + " 5.000000\n", + " 5.000000\n", + " 5.000000\n", + " 10.000000\n", + " 10.000\n", + " 5.043668\n", + " 5.578231\n", + " 5.000000\n", + " 5.00\n", + " 5.000000\n", + " 10.0\n", + " \n", + " \n", + " 11\n", + " 30\n", + " 5.0\n", + " 12\n", + " 87.5\n", + " 1\n", + " 40\n", + " 5\n", + " 0.2\n", + " 55000.0\n", + " 300.0\n", + " 8.5\n", + " 4\n", + " 75.0\n", + " 0\n", + " 6.282051\n", + " 5.535714\n", + " 5.370370\n", + " 5.909091\n", + " 8.536122\n", + " 6.666667\n", + " 8.888889\n", + " 5.652174\n", + " 6.250\n", + " 5.000000\n", + " 5.000000\n", + " 8.658537\n", + " 8.75\n", + " 7.777778\n", + " 5.0\n", + " \n", + " \n", + " 12\n", + " 45\n", + " 20.0\n", + " 30\n", + " 91.0\n", + " 3\n", + " 20\n", + " 10\n", + " 0.1\n", + " 1200000.0\n", + " 15000.0\n", + " 9.0\n", + " 5\n", + " 89.0\n", + " 0\n", + " 6.410256\n", + " 8.214286\n", + " 8.148148\n", + " 10.000000\n", + " 9.201521\n", + " 10.000000\n", + " 6.666667\n", + " 6.739130\n", + " 5.625\n", + " 10.000000\n", + " 10.000000\n", + " 9.268293\n", + " 10.00\n", + " 9.333333\n", + " 5.0\n", + " \n", + " \n", + " 13\n", + " 38\n", + " NaN\n", + " 18\n", + " 79.3\n", + " 0\n", + " 15\n", + " 20\n", + " 0.5\n", + " 80000.0\n", + " 7000.0\n", + " 7.2\n", + " 3\n", + " 70.0\n", + " 1\n", + " 6.538462\n", + " 6.964286\n", + " 
NaN\n", + " 7.272727\n", + " 6.977186\n", + " 5.000000\n", + " 6.111111\n", + " 8.913043\n", + " 8.125\n", + " 5.109170\n", + " 7.278912\n", + " 7.073171\n", + " 7.50\n", + " 7.222222\n", + " 10.0\n", + " \n", + " \n", + " 14\n", + " 29\n", + " 6.0\n", + " 22\n", + " 95.2\n", + " 2\n", + " 50\n", + " 2\n", + " 0.0\n", + " 97000.0\n", + " 10000.0\n", + " 9.6\n", + " 5\n", + " 95.0\n", + " 0\n", + " 6.666667\n", + " 5.357143\n", + " 5.555556\n", + " 8.181818\n", + " 10.000000\n", + " 8.333333\n", + " 10.000000\n", + " 5.000000\n", + " 5.000\n", + " 5.183406\n", + " 8.299320\n", + " 10.000000\n", + " 10.00\n", + " 10.000000\n", + " 5.0\n", + " \n", + " \n", + " 15\n", + " 35\n", + " 8.0\n", + " 15\n", + " 88.0\n", + " 1\n", + " 30\n", + " 6\n", + " 0.3\n", + " 90000.0\n", + " 8000.0\n", + " 8.0\n", + " 4\n", + " 85.0\n", + " 0\n", + " 6.794872\n", + " 6.428571\n", + " 5.925926\n", + " 6.590909\n", + " 8.631179\n", + " 6.666667\n", + " 7.777778\n", + " 5.869565\n", + " 6.875\n", + " 5.152838\n", + " 7.619048\n", + " 8.048780\n", + " 8.75\n", + " 8.888889\n", + " 5.0\n", + " \n", + " \n", + " 16\n", + " 50\n", + " 25.0\n", + " 8\n", + " 72.5\n", + " 0\n", + " 10\n", + " 15\n", + " 0.7\n", + " 60000.0\n", + " 4000.0\n", + " 6.5\n", + " 2\n", + " 60.0\n", + " 1\n", + " 6.923077\n", + " 9.107143\n", + " 9.074074\n", + " 5.000000\n", + " 5.684411\n", + " 5.000000\n", + " 5.555556\n", + " 7.826087\n", + " 9.375\n", + " 5.021834\n", + " 6.258503\n", + " 6.219512\n", + " 6.25\n", + " 6.111111\n", + " 10.0\n", + " \n", + " \n", + " 17\n", + " 42\n", + " 18.0\n", + " 20\n", + " 81.4\n", + " 1\n", + " 25\n", + " 12\n", + " 0.4\n", + " 85000.0\n", + " 7000.0\n", + " 7.8\n", + " 3\n", + " 74.0\n", + " 0\n", + " 7.051282\n", + " 7.678571\n", + " 7.777778\n", + " 7.727273\n", + " 7.376426\n", + " 6.666667\n", + " 7.222222\n", + " 7.173913\n", + " 7.500\n", + " 5.131004\n", + " 7.278912\n", + " 7.804878\n", + " 7.50\n", + " 7.666667\n", + " 5.0\n", + " \n", + " \n", + " 18\n", + " 
31\n", + " 7.0\n", + " 25\n", + " 93.1\n", + " 2\n", + " 35\n", + " 5\n", + " 0.2\n", + " 95000.0\n", + " 9000.0\n", + " 9.1\n", + " 5\n", + " 90.0\n", + " 0\n", + " 7.179487\n", + " 5.714286\n", + " 5.740741\n", + " 8.863636\n", + " 9.600760\n", + " 8.333333\n", + " 8.333333\n", + " 5.652174\n", + " 6.250\n", + " 5.174672\n", + " 7.959184\n", + " 9.390244\n", + " 10.00\n", + " 9.444444\n", + " 5.0\n", + " \n", + " \n", + " 19\n", + " 27\n", + " 3.0\n", + " 10\n", + " 85.0\n", + " 0\n", + " 20\n", + " 8\n", + " 0.6\n", + " 70000.0\n", + " 5000.0\n", + " 8.2\n", + " 4\n", + " 82.0\n", + " 0\n", + " 7.307692\n", + " 5.000000\n", + " 5.000000\n", + " 5.454545\n", + " 8.060837\n", + " 5.000000\n", + " 6.666667\n", + " 6.304348\n", + " 8.750\n", + " 5.065502\n", + " 6.598639\n", + " 8.292683\n", + " 8.75\n", + " 8.555556\n", + " 5.0\n", + " \n", + " \n", + " 20\n", + " 55\n", + " 30.0\n", + " 12\n", + " 68.9\n", + " 0\n", + " 5\n", + " 25\n", + " 0.8\n", + " 65000.0\n", + " 2000.0\n", + " 5.5\n", + " 1\n", + " 50.0\n", + " 1\n", + " 7.435897\n", + " 10.000000\n", + " 10.000000\n", + " 5.909091\n", + " 5.000000\n", + " 5.000000\n", + " 5.000000\n", + " 10.000000\n", + " 10.000\n", + " 5.043668\n", + " 5.578231\n", + " 5.000000\n", + " 5.00\n", + " 5.000000\n", + " 10.0\n", + " \n", + " \n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%normalize feature_range=5,10 mode=preview" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6685559f-8986-4504-97eb-e62e20275bd3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apply completed: original preserved as test.employees_backup_daf864252a6c46f1.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
5.000000AliceHR5.535714FNaN5.3703705.9090918.5361226.6666678.8888895.6521746.2505.0000005.0000008.6585378.757.7777785.0
5.555556BobEngineering8.214286MMasters8.14814810.0000009.20152110.0000006.6666676.7391305.62510.00000010.0000009.26829310.009.3333335.0
6.111111CharlieSales6.964286MBachelorsNaN7.2727276.9771865.0000006.1111118.9130438.1255.1091707.2789127.0731717.507.22222210.0
6.666667DianaEngineering5.357143FPhD5.5555568.18181810.0000008.33333310.0000005.0000005.0005.1834068.29932010.00000010.0010.0000005.0
7.222222EveNaN6.428571FBachelors5.9259266.5909098.6311796.6666677.7777785.8695656.8755.1528387.6190488.0487808.758.8888895.0
7.777778FrankHR9.107143MHigh School9.0740745.0000005.6844115.0000005.5555567.8260879.3755.0218346.2585036.2195126.256.11111110.0
8.333333GraceSales7.678571FBachelors7.7777787.7272737.3764266.6666677.2222227.1739137.5005.1310047.2789127.8048787.507.6666675.0
8.888889HenryEngineering5.714286MMasters5.7407418.8636369.6007608.3333338.3333335.6521746.2505.1746727.9591849.39024410.009.4444445.0
9.444444IvyFinance5.000000FBachelors5.0000005.4545458.0608375.0000006.6666676.3043488.7505.0655026.5986398.2926838.758.5555565.0
10.000000JackSales10.000000MHigh School10.0000005.9090915.0000005.0000005.00000010.00000010.0005.0436685.5782315.0000005.005.00000010.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%normalize feature_range=5,10 table=test.employees mode=apply confirm=true " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e5bb1249-ba1e-4a9d-86f0-58b528c0e465", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rollback: restored test.employees_backup_daf864252a6c46f1 -> test.employees; previous test.employees renamed to test.employees_prerollback_daf864252a6c46f1.\n" + ] + } + ], + "source": [ + "%normalize mode=rollback rollback_token=daf864252a6c46f1" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "5cc8a8b5-011e-4285-89f7-a7ee13b0da22", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREVIEW (local):\n", + "Local: Column 'emp_id': mean=20.5, std=11.543396380615196\n", + "Local: Column 'age': mean=38.2, std=9.064215354899728\n", + "Local: Column 'years_experience': mean=13.555555555555555, std=9.2988782012923\n", + "Local: Column 'projects_completed': mean=17.2, std=6.7201190465645775\n", + "Local: Column 'avg_project_score': mean=84.19000000000001, std=8.217353588595294\n", + "Local: Column 'certifications': mean=1.0, std=1.0\n", + "Local: Column 'training_hours': mean=25.0, std=13.228756555322953\n", + "Local: Column 'overtime_hours': mean=10.8, std=6.939740629158989\n", + "Local: Column 'remote_ratio': mean=0.38, std=0.2521904042583698\n", + "Local: Column 'salary': mean=189700.0, std=337053.12637624354\n", + "Local: Column 'bonus': mean=6730.0, std=4002.0119939850256\n", + "Local: Column 'satisfaction_score': mean=7.94, std=1.1918053532351665\n", + "Local: Column 'performance_rating': mean=3.6, std=1.2806248474865698\n", + "Local: Column 'potential_score': mean=77.0, std=13.438749941865872\n", + "Local: Column 'attrition_flag': mean=0.3, std=0.45825756949558394\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagemp_id_std_previewage_std_previewyears_experience_std_previewprojects_completed_std_previewavg_project_score_std_previewcertifications_std_previewtraining_hours_std_previewovertime_hours_std_previewremote_ratio_std_previewsalary_std_previewbonus_std_previewsatisfaction_score_std_previewperformance_rating_std_previewpotential_score_std_previewattrition_flag_std_preview
1305.01287.514050.255000.0300.08.5475.00-1.689278-0.904656-0.920063-0.7737960.4028060.01.133893-0.835766-0.713746-0.399640-1.6066920.4698750.312348-0.148823-0.654654
24520.03091.0320100.11200000.015000.09.0589.00-1.6026480.7502030.6930351.9047280.8287342.0-0.377964-0.115278-1.1102722.9974502.0664610.8894071.0932160.892940-0.654654
338NaN1879.3015200.580000.07000.07.2370.01-1.516018-0.022065NaN0.119046-0.595082-1.0-0.7559291.3256980.475831-0.3254680.067466-0.620907-0.468521-0.5208821.527525
4296.02295.225020.097000.010000.09.6595.00-1.429389-1.014980-0.8125230.7142731.3398471.01.889822-1.268059-1.506798-0.2750310.8170891.3928451.0932161.339410-0.654654
5358.01588.013060.390000.08000.08.0485.00-1.342759-0.353037-0.597444-0.3273750.4636530.00.377964-0.691669-0.317221-0.2957990.3173400.0503440.3123480.595293-0.654654
65025.0872.5010150.760000.04000.06.5260.01-1.2561291.3018231.230734-1.369023-1.422599-1.0-1.1338930.6052101.268883-0.384806-0.682157-1.208251-1.249390-1.2649991.527525
74218.02081.4125120.485000.07000.07.8374.00-1.1695000.4192310.4779550.416659-0.3395250.00.0000000.1729170.079305-0.3106340.067466-0.117469-0.468521-0.223235-0.654654
8317.02593.123550.295000.09000.09.1590.00-1.082870-0.794332-0.7049831.1606941.0842911.00.755929-0.835766-0.713746-0.2809650.5672150.9733131.0932160.967352-0.654654
9.444444IvyFinance5.000000FBachelors5.0000005.4545458.0608375.0000006.6666676.3043488.7505.0655026.5986398.2926838.758.5555565.09273.01085.002080.670000.05000.08.2482.00-0.996241-1.235628-1.135143-1.0714100.098572-1.0-0.377964-0.4034730.872357-0.355137-0.4322830.2181560.3123480.372058-0.654654
10.000000JackSales10.000000MHigh School10.0000005.9090915.0000005.0000005.00000010.00000010.0005.0436685.5782315.0000005.005.00000010.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%normalize feature_range=5,10 table=test.employees mode=apply confirm=true " - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "e5bb1249-ba1e-4a9d-86f0-58b528c0e465", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rollback: restored test.employees_backup_daf864252a6c46f1 -> test.employees; previous test.employees renamed to test.employees_prerollback_daf864252a6c46f1.\n" - ] - } - ], - "source": [ - "%normalize mode=rollback rollback_token=daf864252a6c46f1" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "5cc8a8b5-011e-4285-89f7-a7ee13b0da22", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PREVIEW (local):\n", - "Local: Column 'emp_id': mean=5.5, std=2.8722813232690143\n", - "Local: Column 'age': mean=38.2, std=9.064215354899728\n", - "Local: Column 'years_experience': mean=13.555555555555555, std=9.2988782012923\n", - "Local: Column 'projects_completed': mean=17.2, std=6.7201190465645775\n", - "Local: Column 'avg_project_score': mean=84.19, std=8.217353588595294\n", - "Local: Column 'certifications': mean=1.0, std=1.0\n", - "Local: Column 'training_hours': mean=25.0, std=13.228756555322953\n", - "Local: Column 'overtime_hours': mean=10.8, std=6.939740629158989\n", - "Local: Column 'remote_ratio': mean=0.38000000000000006, std=0.2521904042583698\n", - "Local: Column 'salary': mean=189700.0, std=337053.12637624354\n", - "Local: Column 'bonus': mean=6730.0, std=4002.0119939850256\n", - "Local: Column 'satisfaction_score': mean=7.9399999999999995, std=1.1918053532351665\n", - "Local: Column 'performance_rating': mean=3.6, std=1.2806248474865698\n", - "Local: Column 'potential_score': mean=77.0, std=13.438749941865872\n", - "Local: Column 'attrition_flag': mean=0.3, std=0.45825756949558405\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -9101,7 +10308,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -9118,7 +10325,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -9133,7 +10340,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -9150,7 +10357,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -9165,7 +10372,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -9182,7 +10389,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -9197,7 +10404,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -9214,7 +10421,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -9229,7 +10436,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -9246,7 +10453,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -9261,7 +10468,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -9278,7 +10485,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -9293,7 +10500,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -9310,7 +10517,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -9325,7 +10532,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -9342,7 +10549,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -9357,7 +10564,7 @@ " \n", " \n", " \n", 
- " \n", + " \n", " \n", " \n", " \n", @@ -9374,7 +10581,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -9389,7 +10596,7 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py index a14c22e..11423a0 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py @@ -573,7 +573,34 @@ def oob_cols(r): cols = [c for c, lower, upper, _ in combined_info if lower is not None and upper is not None and (pd.notna(r.get(c)) and (r.get(c) < lower or r.get(c) > upper))] return ",".join(cols) sample_rows["_oob_columns"] = sample_rows.apply(oob_cols, axis=1) + + # ADD: compute and show clipped preview columns for visibility + for c, lower, upper, _ in combined_info: + clipped_col_name = f"{c}_clipped_preview" + try: + if lower is None and upper is None: + # no computed bounds; copy original values + sample_rows[clipped_col_name] = sample_rows[c] + else: + # use pandas clip to compute what the value would be after clipping + sample_rows[clipped_col_name] = sample_rows[c].clip(lower=lower, upper=upper) + except Exception: + # fallback: try elementwise clipping to avoid exceptions on mixed types + def _clip_val(v): + try: + if pd.isna(v): + return v + if lower is not None and v < lower: + return lower + if upper is not None and v > upper: + return upper + return v + except Exception: + return v + sample_rows[clipped_col_name] = sample_rows[c].apply(_clip_val) + try: + # prefer HTML display; this will include the *_clipped_preview columns self._send_html(kernel, sample_rows) except Exception: kernel._send_message("stdout", str(sample_rows.head())) @@ -627,7 +654,7 @@ def oob_cols(r): except Exception as e: kernel._send_message("stderr", f"Error during preview: {e}") return - + # --- ROLLBACK MODE --- if mode 
== "rollback": if mariadb_client is None: diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py index 928e0ee..bc4ee2a 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py @@ -536,6 +536,29 @@ def nulls_in_row(r): return ",".join([c for c in target_columns if pd.isnull(r.get(c))]) sample_preview = sample_rows.copy() sample_preview["_null_columns"] = sample_preview.apply(nulls_in_row, axis=1) + + # --- NEW: compute filled-preview columns so the user can see what values would be used --- + for c in target_columns: + filled_col = f"{c}_filled_preview" + ok, fill_val, _ = computed.get(c, (False, None, "")) + try: + if ok and fill_val is not None: + # use pandas fillna on the preview sample to show the to-be-filled value + sample_preview[filled_col] = sample_preview[c].fillna(fill_val) + else: + # no computed fill value: show original values so preview is still informative + sample_preview[filled_col] = sample_preview[c] + except Exception: + # fallback elementwise: preserve original when something goes wrong + def _fill_elem(v): + try: + if pd.isna(v) and ok and fill_val is not None: + return fill_val + return v + except Exception: + return v + sample_preview[filled_col] = sample_preview[c].apply(_fill_elem) + try: self._send_html(kernel, sample_preview) except Exception: From 528b7ca649fa08bcd060dcb1c3395bd78c9af79e Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Sat, 1 Nov 2025 11:10:07 +0000 Subject: [PATCH 35/38] Notebooks added --- AutomatedMLPipeline.ipynb | 2446 ++++++++++++ RAG.ipynb | 1292 +++++++ RawMLPipeline.ipynb | 7501 +++++++++++++++++++++++++++++++------ 3 files changed, 10129 insertions(+), 1110 deletions(-) create mode 100644 AutomatedMLPipeline.ipynb create mode 100644 RAG.ipynb diff --git a/AutomatedMLPipeline.ipynb 
b/AutomatedMLPipeline.ipynb new file mode 100644 index 0000000..a5075cc --- /dev/null +++ b/AutomatedMLPipeline.ipynb @@ -0,0 +1,2446 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "1b08e2b8-7680-420d-acd7-17a1f522276f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "USE BUGBREW;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "847e1c93-f379-4175-ad00-d6be5f876d00", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagemp_id_std_previewage_std_previewyears_experience_std_previewprojects_completed_std_previewavg_project_score_std_previewcertifications_std_previewtraining_hours_std_previewovertime_hours_std_previewremote_ratio_std_previewsalary_std_previewbonus_std_previewsatisfaction_score_std_previewperformance_rating_std_previewpotential_score_std_previewattrition_flag_std_preview105530.01268.905250.865000.02000.05.5150.01-0.9096111.8534421.768433-0.773796-1.860696-1.0-1.5118582.0461861.665408-0.369971-1.181906-2.047314-2.030259-2.0091151.527525
111305.012475.00-1.566699-0.822981-0.904656-0.920063-0.773796-0.654654
2124520.030589.00-1.218544-0.7363520.7502030.6930351.904728-0.654654
31338NaN18370.01-0.870388-0.649722-0.022065NaN0.1190461.527525
414296.022595.00-0.522233-0.563093-1.014980-0.8125230.714273-0.654654
515358.015485.00-0.174078-0.476463-0.353037-0.597444-0.327375-0.654654
6165025.08260.010.174078-0.3898331.3018231.230734-1.3690231.527525
7174218.020374.000.522233-0.3032040.4192310.4779550.416659-0.654654
818317.025590.000.870388-0.216574-0.794332-0.7049831.160694-0.654654
919273.010482.001.218544-0.129944-1.235628-1.135143-1.071410-0.654654
10205530.012150.011.566699-0.0433151.8534421.768433-0.773796
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
15.13.51.40.2Iris-setosa\n", + "
24.931.40.2Iris-setosa\n", + "
34.73.21.30.2Iris-setosa\n", + "
44.63.11.50.2Iris-setosa\n", + "
553.61.40.2Iris-setosa\n", + "
65.43.91.70.4Iris-setosa\n", + "
74.6NULL1.40.3Iris-setosa\n", + "
853.41.50.2Iris-setosa\n", + "
94.42.91.40.2Iris-setosa\n", + "
104.93.11.50.1Iris-setosa\n", + "
115.43.71.50.2Iris-setosa\n", + "
124.83.41.60.2Iris-setosa\n", + "
134.831.40.1Iris-setosa\n", + "
144.331.10.1Iris-setosa\n", + "
155.841.20.1Iris-setosa\n", + "
165.74.41.50.1Iris-setosa\n", + "
175.4NULL1.30.1Iris-setosa\n", + "
185.13.51.40.1Iris-setosa\n", + "
195.73.81.70.1Iris-setosa\n", + "
205.13.81.50.1Iris-setosa\n", + "
215.43.41.70.1Iris-setosa\n", + "
225.13.71.50.1Iris-setosa\n", + "
234.63.610.1Iris-setosa\n", + "
245.13.31.70.1Iris-setosa\n", + "
254.83.41.90.1Iris-setosa\n", + "
26531.60.1Iris-setosa\n", + "
2753.41.60.1Iris-setosa\n", + "
28NULL3.51.50.1Iris-setosa\n", + "
295.23.41.40.1Iris-setosa\n", + "
304.73.21.60.1Iris-setosa\n", + "
314.83.1NULL0.1Iris-setosa\n", + "
325.43.41.50.1Iris-setosa\n", + "
335.24.11.50.1Iris-setosa\n", + "
345.54.21.40.1Iris-setosa\n", + "
354.93.11.50.1Iris-setosa\n", + "
3653.21.20.1Iris-setosa\n", + "
375.5NULL1.30.1Iris-setosa\n", + "
384.93.11.50.1Iris-setosa\n", + "
394.431.30.1Iris-setosa\n", + "
405.13.41.50.1Iris-setosa\n", + "
4153.51.30.1Iris-setosa\n", + "
424.52.31.30.1Iris-setosa\n", + "
434.43.21.30.1Iris-setosa\n", + "
4453.51.60.1Iris-setosa\n", + "
455.13.81.90.1Iris-setosa\n", + "
464.831.40.1Iris-setosa\n", + "
475.13.81.60.1Iris-setosa\n", + "
484.63.21.40.1Iris-setosa\n", + "
495.33.71.50.1Iris-setosa\n", + "
5053.31.40.1Iris-setosa\n", + "
5173.24.70.1Iris-versicolor\n", + "
526.43.24.50.1Iris-versicolor\n", + "
536.93.14.90.1Iris-versicolor\n", + "
545.52.340.1Iris-versicolor\n", + "
556.52.84.60.1Iris-versicolor\n", + "
565.72.84.50.1Iris-versicolor\n", + "
576.33.34.70.1Iris-versicolor\n", + "
584.92.43.30.1Iris-versicolor\n", + "
596.62.94.60.1Iris-versicolor\n", + "
605.22.73.90.1Iris-versicolor\n", + "
61523.50.1Iris-versicolor\n", + "
625.934.20.1Iris-versicolor\n", + "
6362.240.1Iris-versicolor\n", + "
646.12.94.70.1Iris-versicolor\n", + "
655.62.93.60.1Iris-versicolor\n", + "
666.73.14.40.1Iris-versicolor\n", + "
675.634.50.1Iris-versicolor\n", + "
685.82.74.10.1Iris-versicolor\n", + "
696.22.24.50.1Iris-versicolor\n", + "
705.62.53.90.1Iris-versicolor\n", + "
715.93.24.80.1Iris-versicolor\n", + "
726.12.840.1Iris-versicolor\n", + "
736.32.54.90.1Iris-versicolor\n", + "
746.12.84.70.1Iris-versicolor\n", + "
756.42.94.30.1Iris-versicolor\n", + "
766.634.40.1Iris-versicolor\n", + "
776.82.84.80.1Iris-versicolor\n", + "
786.7350.1Iris-versicolor\n", + "
7962.94.50.1Iris-versicolor\n", + "
805.72.63.50.1Iris-versicolor\n", + "
815.52.43.80.1Iris-versicolor\n", + "
825.52.43.70.1Iris-versicolor\n", + "
835.82.73.90.1Iris-versicolor\n", + "
8462.75.10.1Iris-versicolor\n", + "
855.434.50.1Iris-versicolor\n", + "
8663.44.50.1Iris-versicolor\n", + "
876.73.14.70.1Iris-versicolor\n", + "
886.32.34.40.1Iris-versicolor\n", + "
895.634.10.1Iris-versicolor\n", + "
905.52.540.1Iris-versicolor\n", + "
915.52.64.40.1Iris-versicolor\n", + "
926.134.60.1Iris-versicolor\n", + "
935.82.640.1Iris-versicolor\n", + "
9452.33.30.1Iris-versicolor\n", + "
955.62.74.20.1Iris-versicolor\n", + "
965.734.20.1Iris-versicolor\n", + "
975.72.94.20.1Iris-versicolor\n", + "
986.22.94.30.1Iris-versicolor\n", + "
995.12.530.1Iris-versicolor\n", + "
1005.72.84.10.1Iris-versicolor\n", + "
1016.33.360.1Iris-virginica\n", + "
1025.82.75.10.1Iris-virginica\n", + "
1037.135.90.1Iris-virginica\n", + "
1046.32.95.60.1Iris-virginica\n", + "
1056.535.80.1Iris-virginica\n", + "
1067.636.60.1Iris-virginica\n", + "
1074.92.54.50.1Iris-virginica\n", + "
1087.32.96.30.1Iris-virginica\n", + "
1096.72.55.80.1Iris-virginica\n", + "
1107.23.66.10.1Iris-virginica\n", + "
1116.53.25.10.1Iris-virginica\n", + "
1126.42.75.30.1Iris-virginica\n", + "
1136.835.50.1Iris-virginica\n", + "
1145.72.550.1Iris-virginica\n", + "
1155.82.85.10.1Iris-virginica\n", + "
1166.43.25.30.1Iris-virginica\n", + "
1176.535.50.1Iris-virginica\n", + "
1187.73.86.70.1Iris-virginica\n", + "
1197.72.66.90.1Iris-virginica\n", + "
12062.250.1Iris-virginica\n", + "
1216.93.25.70.1Iris-virginica\n", + "
1225.62.84.90.1Iris-virginica\n", + "
1237.72.86.70.1Iris-virginica\n", + "
1246.32.74.90.1Iris-virginica\n", + "
1256.73.35.70.1Iris-virginica\n", + "
1267.23.260.1Iris-virginica\n", + "
1276.22.84.80.1Iris-virginica\n", + "
1286.134.90.1Iris-virginica\n", + "
1296.42.85.60.1Iris-virginica\n", + "
1307.235.80.1Iris-virginica\n", + "
1317.42.86.10.1Iris-virginica\n", + "
1327.93.86.40.1Iris-virginica\n", + "
1336.42.85.60.1Iris-virginica\n", + "
1346.32.85.10.1Iris-virginica\n", + "
1356.12.65.60.1Iris-virginica\n", + "
1367.736.10.1Iris-virginica\n", + "
1376.33.45.60.1Iris-virginica\n", + "
1386.43.15.50.1Iris-virginica\n", + "
139634.80.1Iris-virginica\n", + "
1406.93.15.40.1Iris-virginica\n", + "
1416.73.15.60.1Iris-virginica\n", + "
1426.93.15.10.1Iris-virginica\n", + "
1435.82.75.10.1Iris-virginica\n", + "
1446.83.25.90.1Iris-virginica\n", + "
1456.73.35.70.1Iris-virginica\n", + "
1466.735.20.1Iris-virginica\n", + "
1476.32.550.1Iris-virginica\n", + "
1486.535.20.1Iris-virginica\n", + "
1496.23.45.40.1Iris-virginica\n", + "
150NULL35.10.1Iris-virginica\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "SELECT * FROM IRIS;" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "6f73b503-62fb-429a-97a8-bbb95ffcb163", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dropped 6 row(s) with missing values (in-place local).\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
15.13.51.40.2Iris-setosa
24.93.01.40.2Iris-setosa
34.73.21.30.2Iris-setosa
44.63.11.50.2Iris-setosa
55.03.61.40.2Iris-setosa
65.43.91.70.4Iris-setosa
85.03.41.50.2Iris-setosa
94.42.91.40.2Iris-setosa
104.93.11.50.1Iris-setosa
115.43.71.50.2Iris-setosa
124.83.41.60.2Iris-setosa
134.83.01.40.1Iris-setosa
144.33.01.10.1Iris-setosa
155.84.01.20.1Iris-setosa
165.74.41.50.1Iris-setosa
185.13.51.40.1Iris-setosa
195.73.81.70.1Iris-setosa
205.13.81.50.1Iris-setosa
215.43.41.70.1Iris-setosa
225.13.71.50.1Iris-setosa
234.63.61.00.1Iris-setosa
245.13.31.70.1Iris-setosa
254.83.41.90.1Iris-setosa
265.03.01.60.1Iris-setosa
275.03.41.60.1Iris-setosa
295.23.41.40.1Iris-setosa
304.73.21.60.1Iris-setosa
325.43.41.50.1Iris-setosa
335.24.11.50.1Iris-setosa
345.54.21.40.1Iris-setosa
354.93.11.50.1Iris-setosa
365.03.21.20.1Iris-setosa
384.93.11.50.1Iris-setosa
394.43.01.30.1Iris-setosa
405.13.41.50.1Iris-setosa
415.03.51.30.1Iris-setosa
424.52.31.30.1Iris-setosa
434.43.21.30.1Iris-setosa
445.03.51.60.1Iris-setosa
455.13.81.90.1Iris-setosa
464.83.01.40.1Iris-setosa
475.13.81.60.1Iris-setosa
484.63.21.40.1Iris-setosa
495.33.71.50.1Iris-setosa
505.03.31.40.1Iris-setosa
517.03.24.70.1Iris-versicolor
526.43.24.50.1Iris-versicolor
536.93.14.90.1Iris-versicolor
545.52.34.00.1Iris-versicolor
556.52.84.60.1Iris-versicolor
565.72.84.50.1Iris-versicolor
576.33.34.70.1Iris-versicolor
584.92.43.30.1Iris-versicolor
596.62.94.60.1Iris-versicolor
605.22.73.90.1Iris-versicolor
615.02.03.50.1Iris-versicolor
625.93.04.20.1Iris-versicolor
636.02.24.00.1Iris-versicolor
646.12.94.70.1Iris-versicolor
655.62.93.60.1Iris-versicolor
666.73.14.40.1Iris-versicolor
675.63.04.50.1Iris-versicolor
685.82.74.10.1Iris-versicolor
696.22.24.50.1Iris-versicolor
705.62.53.90.1Iris-versicolor
715.93.24.80.1Iris-versicolor
726.12.84.00.1Iris-versicolor
736.32.54.90.1Iris-versicolor
746.12.84.70.1Iris-versicolor
756.42.94.30.1Iris-versicolor
766.63.04.40.1Iris-versicolor
776.82.84.80.1Iris-versicolor
786.73.05.00.1Iris-versicolor
796.02.94.50.1Iris-versicolor
805.72.63.50.1Iris-versicolor
815.52.43.80.1Iris-versicolor
825.52.43.70.1Iris-versicolor
835.82.73.90.1Iris-versicolor
846.02.75.10.1Iris-versicolor
855.43.04.50.1Iris-versicolor
866.03.44.50.1Iris-versicolor
876.73.14.70.1Iris-versicolor
886.32.34.40.1Iris-versicolor
895.63.04.10.1Iris-versicolor
905.52.54.00.1Iris-versicolor
915.52.64.40.1Iris-versicolor
926.13.04.60.1Iris-versicolor
935.82.64.00.1Iris-versicolor
945.02.33.30.1Iris-versicolor
955.62.74.20.1Iris-versicolor
965.73.04.20.1Iris-versicolor
975.72.94.20.1Iris-versicolor
986.22.94.30.1Iris-versicolor
995.12.53.00.1Iris-versicolor
1005.72.84.10.1Iris-versicolor
1016.33.36.00.1Iris-virginica
1025.82.75.10.1Iris-virginica
1037.13.05.90.1Iris-virginica
1046.32.95.60.1Iris-virginica
1056.53.05.80.1Iris-virginica
1067.63.06.60.1Iris-virginica
1074.92.54.50.1Iris-virginica
1087.32.96.30.1Iris-virginica
1096.72.55.80.1Iris-virginica
1107.23.66.10.1Iris-virginica
1116.53.25.10.1Iris-virginica
1126.42.75.30.1Iris-virginica
1136.83.05.50.1Iris-virginica
1145.72.55.00.1Iris-virginica
1155.82.85.10.1Iris-virginica
1166.43.25.30.1Iris-virginica
1176.53.05.50.1Iris-virginica
1187.73.86.70.1Iris-virginica
1197.72.66.90.1Iris-virginica
1206.02.25.00.1Iris-virginica
1216.93.25.70.1Iris-virginica
1225.62.84.90.1Iris-virginica
1237.72.86.70.1Iris-virginica
1246.32.74.90.1Iris-virginica
1256.73.35.70.1Iris-virginica
1267.23.26.00.1Iris-virginica
1276.22.84.80.1Iris-virginica
1286.13.04.90.1Iris-virginica
1296.42.85.60.1Iris-virginica
1307.23.05.80.1Iris-virginica
1317.42.86.10.1Iris-virginica
1327.93.86.40.1Iris-virginica
1336.42.85.60.1Iris-virginica
1346.32.85.10.1Iris-virginica
1356.12.65.60.1Iris-virginica
1367.73.06.10.1Iris-virginica
1376.33.45.60.1Iris-virginica
1386.43.15.50.1Iris-virginica
1396.03.04.80.1Iris-virginica
1406.93.15.40.1Iris-virginica
1416.73.15.60.1Iris-virginica
1426.93.15.10.1Iris-virginica
1435.82.75.10.1Iris-virginica
1446.83.25.90.1Iris-virginica
1456.73.35.70.1Iris-virginica
1466.73.05.20.1Iris-virginica
1476.32.55.00.1Iris-virginica
1486.53.05.20.1Iris-virginica
1496.23.45.40.1Iris-virginica
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Feature Selection Results (method=anova)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FeatureScore
PetalLengthCm1065.303728
Id543.481503
SepalLengthCm113.483483
SepalWidthCm42.504709
PetalWidthCm10.479730
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected 5 features saved to data['selected_features']: PetalLengthCm, Id, SepalLengthCm, SepalWidthCm, PetalWidthCm\n", + "PREVIEW (local):\n", + "Local: Column 'PetalLengthCm': mean=3.8305555555555557, std=1.7358666494553543\n", + "Local: Column 'Id': mean=76.77083333333333, std=42.624119207256875\n", + "Local: Column 'SepalLengthCm': mean=5.86875, std=0.8279382651099859\n", + "Local: Column 'SepalWidthCm': mean=3.0395833333333333, std=0.43095803321076276\n", + "Local: Column 'PetalWidthCm': mean=0.10833333333333332, std=0.03435921354681384\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PetalLengthCmIdSepalLengthCmSepalWidthCmPetalWidthCmPetalLengthCm_std_previewId_std_previewSepalLengthCm_std_previewSepalWidthCm_std_previewPetalWidthCm_std_preview
1.415.13.50.2-1.400197-1.777652-0.9285111.0683562.667892
1.424.93.00.2-1.400197-1.754191-1.170075-0.0918502.667892
1.334.73.20.2-1.457805-1.730730-1.4116390.3722332.667892
1.544.63.10.2-1.342589-1.707269-1.5324210.1401922.667892
1.455.03.60.2-1.400197-1.683808-1.0492931.3003972.667892
1.765.43.90.4-1.227373-1.660347-0.5661651.9965218.488747
1.585.03.40.2-1.342589-1.613425-1.0492930.8363152.667892
1.494.42.90.2-1.400197-1.589964-1.773985-0.3238912.667892
1.5104.93.10.1-1.342589-1.566504-1.1700750.140192-0.242536
1.5115.43.70.2-1.342589-1.543043-0.5661651.5324382.667892
1.6124.83.40.2-1.284981-1.519582-1.2908570.8363152.667892
1.4134.83.00.1-1.400197-1.496121-1.290857-0.091850-0.242536
1.1144.33.00.1-1.573021-1.472660-1.894767-0.091850-0.242536
1.2155.84.00.1-1.515413-1.449199-0.0830382.228562-0.242536
1.5165.74.40.1-1.342589-1.425738-0.2038203.156727-0.242536
1.4185.13.50.1-1.400197-1.378816-0.9285111.068356-0.242536
1.7195.73.80.1-1.227373-1.355355-0.2038201.764480-0.242536
1.5205.13.80.1-1.342589-1.331895-0.9285111.764480-0.242536
1.7215.43.40.1-1.227373-1.308434-0.5661650.836315-0.242536
1.5225.13.70.1-1.342589-1.284973-0.9285111.532438-0.242536
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split completed: total=144, train=115, test=29, val=0.\n" + ] + }, + { + "data": { + "text/html": [ + "

Train (115 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
1396.03.04.80.1Iris-virginica
965.73.04.20.1Iris-versicolor
1386.43.15.50.1Iris-virginica
1074.92.54.50.1Iris-virginica
756.42.94.30.1Iris-versicolor
1267.23.26.00.1Iris-virginica
325.43.41.50.1Iris-setosa
445.03.51.60.1Iris-setosa
886.32.34.40.1Iris-versicolor
625.93.04.20.1Iris-versicolor
1197.72.66.90.1Iris-virginica
55.03.61.40.2Iris-setosa
605.22.73.90.1Iris-versicolor
124.83.41.60.2Iris-setosa
726.12.84.00.1Iris-versicolor
1225.62.84.90.1Iris-virginica
15.13.51.40.2Iris-setosa
24.93.01.40.2Iris-setosa
646.12.94.70.1Iris-versicolor
846.02.75.10.1Iris-versicolor
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Test (29 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
265.03.01.60.1Iris-setosa
926.13.04.60.1Iris-versicolor
405.13.41.50.1Iris-setosa
495.33.71.50.1Iris-setosa
545.52.34.00.1Iris-versicolor
1037.13.05.90.1Iris-virginica
1367.73.06.10.1Iris-virginica
275.03.41.60.1Iris-setosa
895.63.04.10.1Iris-versicolor
484.63.21.40.1Iris-setosa
805.72.63.50.1Iris-versicolor
1237.72.86.70.1Iris-virginica
1206.02.25.00.1Iris-virginica
115.43.71.50.2Iris-setosa
675.63.04.50.1Iris-versicolor
1435.82.75.10.1Iris-virginica
1346.32.85.10.1Iris-virginica
1426.93.15.10.1Iris-virginica
705.62.53.90.1Iris-versicolor
505.03.31.40.1Iris-setosa
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Model Selection Results (primary_metric=accuracy)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Modelaccuracy_Meanaccuracy_Stdf1_Meanf1_Stdprecision_Meanprecision_Stdrecall_Meanrecall_Std
rf0.99130.01740.98150.03700.98520.01810.99050.0190
logistic0.98260.03480.98150.03700.98670.02670.98100.0381
svm0.98260.02130.98260.02140.98520.01810.98210.0220
knn0.98260.03480.98150.03700.98670.02670.98100.0381
gbm0.98260.03480.98150.03700.98670.02670.98100.0381
ada0.98260.03480.98150.03700.98670.02670.98100.0381
catboost0.98260.03480.98150.03700.98670.02670.98100.0381
lightgbm0.98260.03480.98150.03700.98670.02670.98100.0381
xgboost0.97390.03480.97260.03690.97830.02770.97260.0376
mlp0.77390.28230.56470.36210.60880.34380.71190.2479
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best model 'rf' (mean accuracy=0.9913) saved to data['last_model'].\n", + "[MLPipeline] Automatically selected best model via SelectModel.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + "
\n", + "

Metrics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Accuracy1.0000
Precision (w)1.0000
Recall (w)1.0000
F1 (w)1.0000
ROC AUC1.0000
\n", + "
\n", + "
\"confusion
\n", + "

Classification report

\n", + "
                 precision    recall  f1-score   support\n",
+       "\n",
+       "    Iris-setosa       1.00      1.00      1.00         9\n",
+       "Iris-versicolor       1.00      1.00      1.00        10\n",
+       " Iris-virginica       1.00      1.00      1.00        10\n",
+       "\n",
+       "       accuracy                           1.00        29\n",
+       "      macro avg       1.00      1.00      1.00        29\n",
+       "   weighted avg       1.00      1.00      1.00        29\n",
+       "
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Predictions preview (actual vs predicted)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Species_predicted_pred_proba
Iris-setosaIris-setosa[0.99, 0.01, 0.0]
Iris-versicolorIris-versicolor[0.0, 1.0, 0.0]
Iris-setosaIris-setosa[1.0, 0.0, 0.0]
Iris-setosaIris-setosa[0.91, 0.09, 0.0]
Iris-versicolorIris-versicolor[0.01, 0.99, 0.0]
Iris-virginicaIris-virginica[0.0, 0.0, 1.0]
Iris-virginicaIris-virginica[0.0, 0.0, 1.0]
Iris-setosaIris-setosa[1.0, 0.0, 0.0]
Iris-versicolorIris-versicolor[0.0, 1.0, 0.0]
Iris-setosaIris-setosa[0.98, 0.01, 0.01]
Iris-versicolorIris-versicolor[0.0, 1.0, 0.0]
Iris-virginicaIris-virginica[0.0, 0.0, 1.0]
Iris-virginicaIris-virginica[0.0, 0.08, 0.92]
Iris-setosaIris-setosa[1.0, 0.0, 0.0]
Iris-versicolorIris-versicolor[0.0, 1.0, 0.0]
Iris-virginicaIris-virginica[0.0, 0.02, 0.98]
Iris-virginicaIris-virginica[0.0, 0.0, 1.0]
Iris-virginicaIris-virginica[0.0, 0.02, 0.98]
Iris-versicolorIris-versicolor[0.0, 0.99, 0.01]
Iris-setosaIris-setosa[0.73, 0.27, 0.0]
Iris-setosaIris-setosa[0.99, 0.01, 0.0]
Iris-versicolorIris-versicolor[0.0, 0.92, 0.08]
Iris-virginicaIris-virginica[0.0, 0.0, 1.0]
Iris-setosaIris-setosa[1.0, 0.0, 0.0]
Iris-versicolorIris-versicolor[0.0, 0.99, 0.01]
Iris-versicolorIris-versicolor[0.0, 1.0, 0.0]
Iris-virginicaIris-virginica[0.0, 0.0, 1.0]
Iris-virginicaIris-virginica[0.0, 0.01, 0.99]
Iris-versicolorIris-versicolor[0.0, 1.0, 0.0]
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model from data['last_model'] saved to ./auto_models/model.joblib\n", + "[MLPipeline] Model saved to ./auto_models/model.joblib.\n", + "[MLPipeline] ML pipeline completed successfully.\n" + ] + } + ], + "source": [ + "%ml_pipeline target=Species problem=classification save_path='./auto_models/model.joblib'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75dc8ab9-f1ad-477f-a414-9380607df20a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MariaDB", + "language": "SQL", + "name": "mariadb_kernel" + }, + "language_info": { + "file_extension": ".sql", + "mimetype": "text/plain", + "name": "SQL" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/RAG.ipynb b/RAG.ipynb new file mode 100644 index 0000000..1aa70fd --- /dev/null +++ b/RAG.ipynb @@ -0,0 +1,1292 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "3603dec6-c3a3-404a-b965-293237c73e1a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "USE BUGBREW;" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "87ac5ab4-54ae-4e34-82b4-973134b7a60f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using file content from ./test.docx (len=28508)\n", + "\n", + "Using database: BUGBREW\n", + "\n", + "Ingest complete.\n", + " documents=1\n", + " chunks_total=40\n", + " embeddings_written=40\n", + " Server version: 11.8.3-MariaDB-ubu2404\n", + "\n" + ] + } + ], + "source": [ + "%maria_ingest doc_id=search_test_doc title=\"Hybrid Search Test\" text_file=\"./test.docx\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0730da59-cc1c-4fe6-af9d-07a6db9835bf", + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
iddoc_idtitlecontentmetadatacreated_at
1search_test_docHybrid Search TestOur store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. 
\n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. 
\n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. 
\n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. 
\n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. 
\n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. 
\n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. 
\n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. 
\n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. 
\n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. 
\n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. 
\n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. 
\n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. 
\n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. 
\n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. 
\n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. 
\n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. 
\n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.{}2025-10-31 20:17:53
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "select * from documents;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34038236-6e67-4f6a-a779-d5d635a7c781", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
iddoc_idchunk_indexchunk_textchunk_metacreated_at
1search_test_doc0Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:53
2search_test_doc1LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:53
3search_test_doc2nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:54
4search_test_doc3carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY{}2025-10-31 20:17:54
5search_test_doc4nal value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:54
6search_test_doc5LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:54
7search_test_doc6nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:54
8search_test_doc7carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY{}2025-10-31 20:17:54
9search_test_doc8nal value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:54
10search_test_doc9LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:54
11search_test_doc10nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:55
12search_test_doc11carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY{}2025-10-31 20:17:55
13search_test_doc12nal value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:55
14search_test_doc13LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:55
15search_test_doc14nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:55
16search_test_doc15carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY{}2025-10-31 20:17:55
17search_test_doc16nal value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:55
18search_test_doc17LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:55
19search_test_doc18nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:55
20search_test_doc19carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY{}2025-10-31 20:17:56
21search_test_doc20nal value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:56
22search_test_doc21LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:56
23search_test_doc22nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:56
24search_test_doc23carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY{}2025-10-31 20:17:56
25search_test_doc24nal value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:56
26search_test_doc25LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:56
27search_test_doc26nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:56
28search_test_doc27carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY{}2025-10-31 20:17:56
29search_test_doc28nal value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:57
30search_test_doc29LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:57
31search_test_doc30nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:57
32search_test_doc31carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY{}2025-10-31 20:17:57
33search_test_doc32nal value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:57
34search_test_doc33LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:57
35search_test_doc34nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:57
36search_test_doc35carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY{}2025-10-31 20:17:57
37search_test_doc36nal value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:58
38search_test_doc37LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:58
39search_test_doc38nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:58
40search_test_doc39carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.{}2025-10-31 20:17:58
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "select * from chunks;" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "3aec3bdd-c6f3-4ad4-90bb-bab2b445bdaf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] running hybrid search for query (len=19): how to get a refund\n", + "\n", + "chunk_id\tchunk_text...\tscore\tvec_sim\tbm25\tdoc_id\n", + "1\tOur store strives to deliver exceptional value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving th...\t0.819408\t0.484022\t0.209054\tsearch_test_doc\n", + "5\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "9\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "13\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "17\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "21\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. 
Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "25\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "29\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "\n" + ] + } + ], + "source": [ + "%maria_search query=\"how to get a refund\"" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "77e04582-6d81-4717-bb59-ea170f65d5aa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] RAG query received (len=26): How do I request a refund?\n", + "\n", + "\n", + "=== ANSWER ===\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item.\n", + "\n", + "\n" + ] + } + ], + "source": [ + "%maria_rag_query query=\"How do I request a refund?\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cae763f-935f-43ab-8e99-554f382bb5b6", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MariaDB", + "language": "SQL", + "name": "mariadb_kernel" + }, + "language_info": { + "file_extension": ".sql", + "mimetype": "text/plain", + "name": "SQL" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/RawMLPipeline.ipynb b/RawMLPipeline.ipynb index 076f7ba..8942cb2 100644 --- a/RawMLPipeline.ipynb +++ b/RawMLPipeline.ipynb @@ -5,11 +5,21 @@ "execution_count": null, "id": "e5af9aaa-128d-46cc-a84f-9000580b203b", "metadata": {}, + "outputs": [], + "source": [ + "SHOW DATABASES;" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "id": "a0308f6d-a504-45b4-8f5b-a67181615fff", + "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Database
information_schema
mysql
performance_schema
sys
test
" + "Query OK" ] }, "metadata": {}, @@ -17,13 +27,13 @@ } ], "source": [ - "SHOW DATABASES;" + "CREATE DATABASE BUGBREW;" ] }, { "cell_type": "code", "execution_count": null, - "id": "a0308f6d-a504-45b4-8f5b-a67181615fff", + "id": "4e276e32-8767-452f-bbbb-0ddbbafc7fb0", "metadata": {}, "outputs": [ { @@ -37,13 +47,13 @@ } ], "source": [ - "CREATE DATABASE BUGBREW;" + "USE BUGBREW;" ] }, { "cell_type": "code", "execution_count": null, - "id": "4e276e32-8767-452f-bbbb-0ddbbafc7fb0", + "id": "4b8c203d-36f1-4ae2-a614-8c5e62ab29ef", "metadata": {}, "outputs": [ { @@ -57,7 +67,60 @@ } ], "source": [ - "USE BUGBREW;" + "DROP table IRIS;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40898b54-436b-4556-96fa-f3dbffa3fa17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "CREATE TABLE IRIS(\n", + " Id INT PRIMARY KEY,\n", + " SepalLengthCm DOUBLE,\n", + " SepalWidthCm DOUBLE,\n", + " PetalLengthCm DOUBLE,\n", + " PetalWidthCm DOUBLE,\n", + " Species VARCHAR(50)\n", + ");\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42e1bb47-ea97-4b57-9c4a-9d20a6ee5032", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "LOAD DATA LOCAL INFILE '/home/iddhartha/mariadb_kernel/Iris.csv'\n", + "INTO TABLE IRIS\n", + "FIELDS TERMINATED BY ','\n", + "OPTIONALLY ENCLOSED BY '\"'\n", + "LINES TERMINATED BY '\\n'\n", + "IGNORE 1 LINES\n", + "(Id, SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm, Species);" ] }, { @@ -82,7 +145,7 @@ "115.43.71.50.2Iris-setosa\n", "124.83.41.60.2Iris-setosa\n", "134.831.40.1Iris-setosa\n", - "144.331.10.1\n", + "144.331.10.1Iris-setosa\n", "155.841.20.1Iris-setosa\n", "165.74.41.50.1Iris-setosa\n", "175.4NULL1.30.1Iris-setosa\n", @@ -217,7 +280,7 @@ "1466.735.20.1Iris-virginica\n", 
"1476.32.550.1Iris-virginica\n", "1486.535.20.1Iris-virginica\n", - "1496.23.45.40.1\n", + "1496.23.45.40.1Iris-virginica\n", "150NULL35.10.1Iris-virginica\n", "" ] @@ -232,7 +295,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 11, "id": "f7b8f34e-e73b-45c4-a1c9-9c504ff12c4d", "metadata": {}, "outputs": [ @@ -275,8 +338,8 @@ " \n", " \n", " Species\n", - " 2\n", - " 1.33\n", + " 0\n", + " 0.00\n", " \n", " \n", "" @@ -292,120 +355,7 @@ }, { "cell_type": "code", - "execution_count": 32, - "id": "cd7df893-e1e7-4372-b484-a22dc3702484", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PREVIEW: would drop 8 row(s) (from 150 to 142).\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_would_be_dropped
74.6NaN1.40.3Iris-setosaTrue
144.33.01.10.1NaNTrue
175.4NaN1.30.1Iris-setosaTrue
28NaN3.51.50.1Iris-setosaTrue
314.83.1NaN0.1Iris-setosaTrue
375.5NaN1.30.1Iris-setosaTrue
1496.23.45.40.1NaNTrue
150NaN3.05.10.1Iris-virginicaTrue
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%dropmissing mode=preview" - ] - }, - { - "cell_type": "code", - "execution_count": 34, + "execution_count": 12, "id": "bdaff76a-42cb-4d07-8c13-b8365ea38560", "metadata": {}, "outputs": [ @@ -419,14 +369,14 @@ "SepalWidthCm: missing=3\n", "PetalLengthCm: missing=1\n", "PetalWidthCm: missing=0\n", - "Species: missing=2\n", + "Species: missing=0\n", "PREVIEW: computed fill-values (best-effort):\n", - "Id: would fill with -> 75.5 (mean via local preview)\n", - "SepalLengthCm: would fill with -> 5.847297297297298 (mean via local preview)\n", - "SepalWidthCm: would fill with -> 3.042857142857143 (mean via local preview)\n", - "PetalLengthCm: would fill with -> 3.773154362416107 (mean via local preview)\n", - "PetalWidthCm: would fill with -> 0.1093333333333333 (mean via local preview)\n", - "Species: could NOT determine fill value (not numeric; cannot compute mean locally); would skip\n" + "Id: would fill with -> 75.5 (median via local preview)\n", + "SepalLengthCm: would fill with -> 5.8 (median via local preview)\n", + "SepalWidthCm: would fill with -> 3.0 (median via local preview)\n", + "PetalLengthCm: would fill with -> 4.4 (median via local preview)\n", + "PetalWidthCm: would fill with -> 0.1 (median via local preview)\n", + "Species: could NOT determine fill value (not numeric; cannot compute median locally); would skip\n" ] }, { @@ -442,6 +392,12 @@ " PetalWidthCm\n", " Species\n", " _null_columns\n", + " Id_filled_preview\n", + " SepalLengthCm_filled_preview\n", + " SepalWidthCm_filled_preview\n", + " PetalLengthCm_filled_preview\n", + " PetalWidthCm_filled_preview\n", + " Species_filled_preview\n", " \n", " \n", " \n", @@ -453,15 +409,12 @@ " 0.3\n", " Iris-setosa\n", " SepalWidthCm\n", - " \n", - " \n", - " 14\n", - " 4.3\n", + " 7\n", + " 4.6\n", " 3.0\n", - " 1.1\n", - " 0.1\n", - " NaN\n", - " Species\n", + " 1.4\n", + " 0.3\n", + " Iris-setosa\n", " \n", " \n", " 17\n", 
@@ -471,6 +424,12 @@ " 0.1\n", " Iris-setosa\n", " SepalWidthCm\n", + " 17\n", + " 5.4\n", + " 3.0\n", + " 1.3\n", + " 0.1\n", + " Iris-setosa\n", " \n", " \n", " 28\n", @@ -480,6 +439,12 @@ " 0.1\n", " Iris-setosa\n", " SepalLengthCm\n", + " 28\n", + " 5.8\n", + " 3.5\n", + " 1.5\n", + " 0.1\n", + " Iris-setosa\n", " \n", " \n", " 31\n", @@ -489,6 +454,12 @@ " 0.1\n", " Iris-setosa\n", " PetalLengthCm\n", + " 31\n", + " 4.8\n", + " 3.1\n", + " 4.4\n", + " 0.1\n", + " Iris-setosa\n", " \n", " \n", " 37\n", @@ -498,15 +469,12 @@ " 0.1\n", " Iris-setosa\n", " SepalWidthCm\n", - " \n", - " \n", - " 149\n", - " 6.2\n", - " 3.4\n", - " 5.4\n", + " 37\n", + " 5.5\n", + " 3.0\n", + " 1.3\n", " 0.1\n", - " NaN\n", - " Species\n", + " Iris-setosa\n", " \n", " \n", " 150\n", @@ -516,6 +484,12 @@ " 0.1\n", " Iris-virginica\n", " SepalLengthCm\n", + " 150\n", + " 5.8\n", + " 3.0\n", + " 5.1\n", + " 0.1\n", + " Iris-virginica\n", " \n", " \n", "" @@ -526,26 +500,20 @@ } ], "source": [ - "%fillmissing mode=preview" + "%fillmissing strategy=median mode=preview" ] }, { "cell_type": "code", - "execution_count": 35, - "id": "78f0e9c1-c61a-4d79-bb3e-31efd05b4bd0", + "execution_count": 13, + "id": "95bd44b3-5cb4-472e-b4ee-a3321e595b4d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Outlier detection completed (non in-place). 
Summary:\n", - "Column 'Id': detected 0 outlier(s) using iqr.\n", - "Column 'SepalLengthCm': detected 0 outlier(s) using iqr.\n", - "Column 'SepalWidthCm': detected 4 outlier(s) using iqr.\n", - "Column 'PetalLengthCm': detected 0 outlier(s) using iqr.\n", - "Column 'PetalWidthCm': detected 11 outlier(s) using iqr.\n", - "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" + "Apply completed: original preserved as IRIS_backup_29adf350d1ab4121.\n" ] }, { @@ -560,11 +528,6 @@ " PetalLengthCm\n", " PetalWidthCm\n", " Species\n", - " Id_is_outlier\n", - " SepalLengthCm_is_outlier\n", - " SepalWidthCm_is_outlier\n", - " PetalLengthCm_is_outlier\n", - " PetalWidthCm_is_outlier\n", " \n", " \n", " \n", @@ -575,11 +538,6 @@ " 1.4\n", " 0.2\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " True\n", " \n", " \n", " 2\n", @@ -588,11 +546,6 @@ " 1.4\n", " 0.2\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " True\n", " \n", " \n", " 3\n", @@ -601,11 +554,6 @@ " 1.3\n", " 0.2\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " True\n", " \n", " \n", " 4\n", @@ -614,11 +562,6 @@ " 1.5\n", " 0.2\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " True\n", " \n", " \n", " 5\n", @@ -627,11 +570,6 @@ " 1.4\n", " 0.2\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " True\n", " \n", " \n", " 6\n", @@ -640,24 +578,6 @@ " 1.7\n", " 0.4\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " True\n", - " \n", - " \n", - " 7\n", - " 4.6\n", - " NaN\n", - " 1.4\n", - " 0.3\n", - " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " True\n", " \n", " \n", " 8\n", @@ -666,11 +586,6 @@ " 1.5\n", " 0.2\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " True\n", " \n", " \n", " 9\n", @@ -679,11 +594,6 @@ " 1.4\n", " 
0.2\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " True\n", " \n", " \n", " 10\n", @@ -692,11 +602,6 @@ " 1.5\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 11\n", @@ -705,11 +610,6 @@ " 1.5\n", " 0.2\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " True\n", " \n", " \n", " 12\n", @@ -718,11 +618,6 @@ " 1.6\n", " 0.2\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " True\n", " \n", " \n", " 13\n", @@ -731,11 +626,6 @@ " 1.4\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 14\n", @@ -743,12 +633,7 @@ " 3.0\n", " 1.1\n", " 0.1\n", - " NaN\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", + " Iris-setosa\n", " \n", " \n", " 15\n", @@ -757,11 +642,6 @@ " 1.2\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 16\n", @@ -770,24 +650,6 @@ " 1.5\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " True\n", - " False\n", - " False\n", - " \n", - " \n", - " 17\n", - " 5.4\n", - " NaN\n", - " 1.3\n", - " 0.1\n", - " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 18\n", @@ -796,11 +658,6 @@ " 1.4\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 19\n", @@ -809,11 +666,6 @@ " 1.7\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 20\n", @@ -822,11 +674,6 @@ " 1.5\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 21\n", @@ -835,11 +682,6 @@ " 1.7\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 22\n", @@ -848,11 +690,6 @@ 
" 1.5\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 23\n", @@ -861,11 +698,6 @@ " 1.0\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 24\n", @@ -874,11 +706,6 @@ " 1.7\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 25\n", @@ -887,11 +714,6 @@ " 1.9\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 26\n", @@ -900,11 +722,6 @@ " 1.6\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 27\n", @@ -913,24 +730,6 @@ " 1.6\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " \n", - " \n", - " 28\n", - " NaN\n", - " 3.5\n", - " 1.5\n", - " 0.1\n", - " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 29\n", @@ -939,11 +738,6 @@ " 1.4\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 30\n", @@ -952,24 +746,6 @@ " 1.6\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " \n", - " \n", - " 31\n", - " 4.8\n", - " 3.1\n", - " NaN\n", - " 0.1\n", - " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 32\n", @@ -978,11 +754,6 @@ " 1.5\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 33\n", @@ -991,11 +762,6 @@ " 1.5\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " True\n", - " False\n", - " False\n", " \n", " \n", " 34\n", @@ -1004,11 +770,6 @@ " 1.4\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " True\n", - " False\n", - " False\n", " \n", " \n", " 35\n", @@ -1017,11 
+778,6 @@ " 1.5\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 36\n", @@ -1030,24 +786,6 @@ " 1.2\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", - " \n", - " \n", - " 37\n", - " 5.5\n", - " NaN\n", - " 1.3\n", - " 0.1\n", - " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 38\n", @@ -1056,11 +794,6 @@ " 1.5\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 39\n", @@ -1069,11 +802,6 @@ " 1.3\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 40\n", @@ -1082,11 +810,6 @@ " 1.5\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 41\n", @@ -1095,11 +818,6 @@ " 1.3\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 42\n", @@ -1108,11 +826,6 @@ " 1.3\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 43\n", @@ -1121,11 +834,6 @@ " 1.3\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 44\n", @@ -1134,11 +842,6 @@ " 1.6\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 45\n", @@ -1147,11 +850,6 @@ " 1.9\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 46\n", @@ -1160,11 +858,6 @@ " 1.4\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 47\n", @@ -1173,11 +866,6 @@ " 1.6\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 48\n", @@ 
-1186,11 +874,6 @@ " 1.4\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 49\n", @@ -1199,11 +882,6 @@ " 1.5\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 50\n", @@ -1212,11 +890,6 @@ " 1.4\n", " 0.1\n", " Iris-setosa\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 51\n", @@ -1225,11 +898,6 @@ " 4.7\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 52\n", @@ -1238,11 +906,6 @@ " 4.5\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 53\n", @@ -1251,11 +914,6 @@ " 4.9\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 54\n", @@ -1264,11 +922,6 @@ " 4.0\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 55\n", @@ -1277,11 +930,6 @@ " 4.6\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 56\n", @@ -1290,11 +938,6 @@ " 4.5\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 57\n", @@ -1303,11 +946,6 @@ " 4.7\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 58\n", @@ -1316,11 +954,6 @@ " 3.3\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 59\n", @@ -1329,11 +962,6 @@ " 4.6\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 60\n", @@ -1342,11 +970,6 @@ " 3.9\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " 
False\n", " \n", " \n", " 61\n", @@ -1355,11 +978,6 @@ " 3.5\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " True\n", - " False\n", - " False\n", " \n", " \n", " 62\n", @@ -1368,11 +986,6 @@ " 4.2\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 63\n", @@ -1381,11 +994,6 @@ " 4.0\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 64\n", @@ -1394,11 +1002,6 @@ " 4.7\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 65\n", @@ -1407,11 +1010,6 @@ " 3.6\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 66\n", @@ -1420,11 +1018,6 @@ " 4.4\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 67\n", @@ -1433,11 +1026,6 @@ " 4.5\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 68\n", @@ -1446,11 +1034,6 @@ " 4.1\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 69\n", @@ -1459,11 +1042,6 @@ " 4.5\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 70\n", @@ -1472,11 +1050,6 @@ " 3.9\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 71\n", @@ -1485,11 +1058,6 @@ " 4.8\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 72\n", @@ -1498,11 +1066,6 @@ " 4.0\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 73\n", @@ -1511,11 +1074,6 @@ " 4.9\n", " 0.1\n", " Iris-versicolor\n", - " 
False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 74\n", @@ -1524,11 +1082,6 @@ " 4.7\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 75\n", @@ -1537,11 +1090,6 @@ " 4.3\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 76\n", @@ -1550,11 +1098,6 @@ " 4.4\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 77\n", @@ -1563,11 +1106,6 @@ " 4.8\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 78\n", @@ -1576,11 +1114,6 @@ " 5.0\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 79\n", @@ -1589,11 +1122,6 @@ " 4.5\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 80\n", @@ -1602,11 +1130,6 @@ " 3.5\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 81\n", @@ -1615,11 +1138,6 @@ " 3.8\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 82\n", @@ -1628,11 +1146,6 @@ " 3.7\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 83\n", @@ -1641,11 +1154,6 @@ " 3.9\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 84\n", @@ -1654,11 +1162,6 @@ " 5.1\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 85\n", @@ -1667,11 +1170,6 @@ " 4.5\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 86\n", @@ -1680,11 
+1178,6 @@ " 4.5\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 87\n", @@ -1693,11 +1186,6 @@ " 4.7\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 88\n", @@ -1706,11 +1194,6 @@ " 4.4\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 89\n", @@ -1719,11 +1202,6 @@ " 4.1\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 90\n", @@ -1732,11 +1210,6 @@ " 4.0\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 91\n", @@ -1745,11 +1218,6 @@ " 4.4\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 92\n", @@ -1758,11 +1226,6 @@ " 4.6\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 93\n", @@ -1771,11 +1234,6 @@ " 4.0\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 94\n", @@ -1784,11 +1242,6 @@ " 3.3\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 95\n", @@ -1797,11 +1250,6 @@ " 4.2\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 96\n", @@ -1810,11 +1258,6 @@ " 4.2\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 97\n", @@ -1823,11 +1266,6 @@ " 4.2\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 98\n", @@ -1836,11 +1274,6 @@ " 4.3\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " 
False\n", - " False\n", " \n", " \n", " 99\n", @@ -1849,11 +1282,6 @@ " 3.0\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 100\n", @@ -1862,11 +1290,6 @@ " 4.1\n", " 0.1\n", " Iris-versicolor\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 101\n", @@ -1875,11 +1298,6 @@ " 6.0\n", " 0.1\n", " Iris-virginica\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 102\n", @@ -1888,11 +1306,6 @@ " 5.1\n", " 0.1\n", " Iris-virginica\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 103\n", @@ -1901,11 +1314,6 @@ " 5.9\n", " 0.1\n", " Iris-virginica\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 104\n", @@ -1914,11 +1322,6 @@ " 5.6\n", " 0.1\n", " Iris-virginica\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", " \n", " 105\n", @@ -1927,123 +1330,276 @@ " 5.8\n", " 0.1\n", " Iris-virginica\n", - " False\n", - " False\n", - " False\n", - " False\n", - " False\n", " \n", + " \n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%dropmissing columns=SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species mode=apply table=IRIS confirm=true" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf048049-058b-4c46-85e6-6966fb1ff7ae", + "metadata": {}, + "outputs": [], + "source": [ + "%dropmissing mode=rollback rollback_token=" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "23d4d6fa-590c-4f01-9a18-8e62a0e499d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCm
1067.63.06.60.1Iris-virginicaFalsecount100.000000100.000000100.000000100.000000100.000000
mean54.4500005.5360003.0690003.0750000.112000
std30.3968280.6685610.4666011.5339510.040899
min1.0000004.3000002.0000001.0000000.100000
25%28.5000005.0000002.8000001.5000000.100000
50%55.5000005.5000003.0000003.5500000.100000
75%80.2500006.0000003.4000004.5000000.100000
max105.0000007.1000004.4000006.0000000.400000
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%stats" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "78f0e9c1-c61a-4d79-bb3e-31efd05b4bd0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Outlier detection completed (non in-place). Summary:\n", + "Column 'Id': detected 0 outlier(s) using iqr.\n", + "Column 'SepalLengthCm': detected 0 outlier(s) using iqr.\n", + "Column 'SepalWidthCm': detected 1 outlier(s) using iqr.\n", + "Column 'PetalLengthCm': detected 0 outlier(s) using iqr.\n", + "Column 'PetalWidthCm': detected 10 outlier(s) using iqr.\n", + "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2051,38 +1607,38 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2090,12 +1646,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2103,12 +1659,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2116,25 +1672,25 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", + " \n", " \n", " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2142,12 +1698,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2155,12 +1711,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2168,12 +1724,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2181,12 +1737,12 @@ " \n", " 
\n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2194,12 +1750,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2207,12 +1763,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2220,12 +1776,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2233,12 +1789,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2246,12 +1802,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2259,12 +1815,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2272,12 +1828,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2285,12 +1841,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2298,12 +1854,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2311,12 +1867,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2324,12 +1880,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2337,12 +1893,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " 
\n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2350,12 +1906,12 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2363,12 +1919,12 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2376,12 +1932,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2389,12 +1945,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2402,12 +1958,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2415,12 +1971,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2428,12 +1984,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2441,12 +1997,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2454,12 +2010,12 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2467,12 +2023,103 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -2480,12 +2127,77 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2493,12 +2205,12 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -2506,198 +2218,5669 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpeciesId_is_outlierSepalLengthCm_is_outlierSepalWidthCm_is_outlierPetalLengthCm_is_outlierPetalWidthCm_is_outlier
15.13.51.40.2Iris-setosaFalseFalseFalseFalseTrue
10724.92.54.50.1Iris-virginicaFalse3.01.40.2Iris-setosaFalseFalseFalseFalseTrue
1087.32.96.30.1Iris-virginicaFalse34.73.21.30.2Iris-setosaFalseFalseFalseFalseTrue
1096.72.55.80.1Iris-virginicaFalse44.63.11.50.2Iris-setosaFalseFalseFalseFalseTrue
1107.255.03.66.10.1Iris-virginicaFalse1.40.2Iris-setosaFalseFalseFalseFalseTrue
1116.53.25.10.1Iris-virginicaFalse65.43.91.70.4Iris-setosaFalseFalseFalseFalseTrue
1126.42.75.30.1Iris-virginicaFalse85.03.41.50.2Iris-setosaFalseFalseFalseFalseTrue
1136.83.05.50.1Iris-virginicaFalse94.42.91.40.2Iris-setosaFalseFalseFalseFalseTrue
1145.72.55.0104.93.11.50.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1155.82.85.10.1Iris-virginicaFalse115.43.71.50.2Iris-setosaFalseFalseFalseFalseTrue
1166.43.25.30.1Iris-virginicaFalse124.83.41.60.2Iris-setosaFalseFalseFalseFalseTrue
1176.5134.83.05.51.40.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1187.73.86.7144.33.01.10.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1197.72.66.9155.84.01.20.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1206.02.25.0165.74.41.50.1Iris-virginicaFalseIris-setosaFalseFalseTrueFalseFalse
1216.93.25.7185.13.51.40.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1225.62.84.9195.73.81.70.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1237.72.86.7205.13.81.50.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1246.32.74.9215.43.41.70.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1256.73.35.7225.13.71.50.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1267.23.26.0234.63.61.00.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1276.22.84.8245.13.31.70.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1286.13.04.9254.83.41.90.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1296.42.85.6265.03.01.60.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1307.23.05.8275.03.41.60.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1317.42.86.1295.23.41.40.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1327.93.86.4304.73.21.60.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1336.42.85.6325.43.41.50.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1346.32.85.1335.24.11.50.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1356.12.65.6345.54.21.40.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1367.73.06.1354.93.11.50.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1376.33.45.6365.03.21.20.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1386.4384.93.15.51.50.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1396.0394.43.04.81.30.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1406.93.15.4405.13.41.50.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1416.73.15.6415.03.51.30.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1426.93.15.1424.52.31.30.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1435.82.75.1434.43.21.30.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1446.83.25.9445.03.51.60.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1456.73.35.7455.13.81.90.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1466.7464.83.05.21.40.1Iris-virginicaIris-setosaFalseFalseFalseFalse
1476.32.5475.13.81.60.1Iris-setosaFalseFalseFalseFalseFalse
484.63.21.40.1Iris-setosaFalseFalseFalseFalseFalse
495.33.71.50.1Iris-setosaFalseFalseFalseFalseFalse
505.03.31.40.1Iris-virginicaIris-setosaFalseFalseFalseFalseFalse
517.03.24.70.1Iris-versicolorFalseFalseFalseFalseFalse
526.43.24.50.1Iris-versicolorFalseFalseFalseFalseFalse
536.93.14.90.1Iris-versicolorFalseFalseFalseFalseFalse
545.52.34.00.1Iris-versicolorFalseFalseFalseFalse
148556.53.02.84.60.1Iris-versicolorFalseFalseFalseFalseFalse
565.72.84.50.1Iris-versicolorFalseFalseFalseFalseFalse
576.33.34.70.1Iris-versicolorFalseFalseFalseFalseFalse
584.92.43.30.1Iris-versicolorFalseFalseFalseFalseFalse
596.62.94.60.1Iris-versicolorFalseFalseFalseFalseFalse
605.22.73.90.1Iris-virginicaIris-versicolorFalseFalseFalseFalse
1496.23.45.4615.02.03.50.1NaNIris-versicolorFalseFalseFalseFalse
150NaN625.93.05.14.20.1Iris-virginicaIris-versicolorFalseFalseFalseFalseFalse
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%outliers" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "740e0779-369c-4f26-ba1d-be52a342fd90", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PREVIEW (local): would modify 15 value(s) across 5 column(s).\n", - "Column 'Id': would clip 0 value(s) locally (bounds: -73.5, 224.5).\n", - "Column 'SepalLengthCm': would clip 0 value(s) locally (bounds: 3.1499999999999986, 8.350000000000001).\n", - "Column 'SepalWidthCm': would clip 4 value(s) locally (bounds: 2.05, 4.05).\n", - "Column 'PetalLengthCm': would clip 0 value(s) locally (bounds: -3.649999999999999, 10.349999999999998).\n", - "Column 'PetalWidthCm': would clip 11 value(s) locally (bounds: 0.1, 0.1).\n" - ] + " \n", + " 63\n", + " 6.0\n", + " 2.2\n", + " 4.0\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 64\n", + " 6.1\n", + " 2.9\n", + " 4.7\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 65\n", + " 5.6\n", + " 2.9\n", + " 3.6\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 66\n", + " 6.7\n", + " 3.1\n", + " 4.4\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 67\n", + " 5.6\n", + " 3.0\n", + " 4.5\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 68\n", + " 5.8\n", + " 2.7\n", + " 4.1\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 69\n", + " 6.2\n", + " 2.2\n", + " 4.5\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " 
False\n", + " False\n", + " \n", + " \n", + " 70\n", + " 5.6\n", + " 2.5\n", + " 3.9\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 71\n", + " 5.9\n", + " 3.2\n", + " 4.8\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 72\n", + " 6.1\n", + " 2.8\n", + " 4.0\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 73\n", + " 6.3\n", + " 2.5\n", + " 4.9\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 74\n", + " 6.1\n", + " 2.8\n", + " 4.7\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 75\n", + " 6.4\n", + " 2.9\n", + " 4.3\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 76\n", + " 6.6\n", + " 3.0\n", + " 4.4\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 77\n", + " 6.8\n", + " 2.8\n", + " 4.8\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 78\n", + " 6.7\n", + " 3.0\n", + " 5.0\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 79\n", + " 6.0\n", + " 2.9\n", + " 4.5\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 80\n", + " 5.7\n", + " 2.6\n", + " 3.5\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 81\n", + " 5.5\n", + " 2.4\n", + " 3.8\n", + " 0.1\n", + " 
Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 82\n", + " 5.5\n", + " 2.4\n", + " 3.7\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 83\n", + " 5.8\n", + " 2.7\n", + " 3.9\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 84\n", + " 6.0\n", + " 2.7\n", + " 5.1\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 85\n", + " 5.4\n", + " 3.0\n", + " 4.5\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 86\n", + " 6.0\n", + " 3.4\n", + " 4.5\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 87\n", + " 6.7\n", + " 3.1\n", + " 4.7\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 88\n", + " 6.3\n", + " 2.3\n", + " 4.4\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 89\n", + " 5.6\n", + " 3.0\n", + " 4.1\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 90\n", + " 5.5\n", + " 2.5\n", + " 4.0\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 91\n", + " 5.5\n", + " 2.6\n", + " 4.4\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 92\n", + " 6.1\n", + " 3.0\n", + " 4.6\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 93\n", 
+ " 5.8\n", + " 2.6\n", + " 4.0\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 94\n", + " 5.0\n", + " 2.3\n", + " 3.3\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 95\n", + " 5.6\n", + " 2.7\n", + " 4.2\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 96\n", + " 5.7\n", + " 3.0\n", + " 4.2\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 97\n", + " 5.7\n", + " 2.9\n", + " 4.2\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 98\n", + " 6.2\n", + " 2.9\n", + " 4.3\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 99\n", + " 5.1\n", + " 2.5\n", + " 3.0\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 100\n", + " 5.7\n", + " 2.8\n", + " 4.1\n", + " 0.1\n", + " Iris-versicolor\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 101\n", + " 6.3\n", + " 3.3\n", + " 6.0\n", + " 0.1\n", + " Iris-virginica\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 102\n", + " 5.8\n", + " 2.7\n", + " 5.1\n", + " 0.1\n", + " Iris-virginica\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 103\n", + " 7.1\n", + " 3.0\n", + " 5.9\n", + " 0.1\n", + " Iris-virginica\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + " 104\n", + " 6.3\n", + " 2.9\n", + " 5.6\n", + " 0.1\n", + " Iris-virginica\n", + " False\n", + " False\n", + " False\n", + " 
False\n", + " False\n", + " \n", + " \n", + " 105\n", + " 6.5\n", + " 3.0\n", + " 5.8\n", + " 0.1\n", + " Iris-virginica\n", + " False\n", + " False\n", + " False\n", + " False\n", + " False\n", + " \n", + " \n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%outliers" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "39e17585-f0af-4b7c-b8c1-8a4a28c76ee8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREVIEW (local): would drop 11 row(s) (from 100 to 89).\n", + "Column 'Id': detected 0 outlier(s) using iqr.\n", + "Column 'SepalLengthCm': detected 0 outlier(s) using iqr.\n", + "Column 'SepalWidthCm': detected 1 outlier(s) using iqr.\n", + "Column 'PetalLengthCm': detected 0 outlier(s) using iqr.\n", + "Column 'PetalWidthCm': detected 10 outlier(s) using iqr.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_outlier_cols
15.13.51.40.2Iris-setosa
24.93.01.40.2Iris-setosa
34.73.21.30.2Iris-setosa
44.63.11.50.2Iris-setosa
55.03.61.40.2Iris-setosa
65.43.91.70.4Iris-setosa
85.03.41.50.2Iris-setosa
94.42.91.40.2Iris-setosa
115.43.71.50.2Iris-setosa
124.83.41.60.2Iris-setosa
165.74.41.50.1Iris-setosa
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%dropoutliers mode=preview" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "06543b64-2e3e-4be4-aa0b-fd90d4d1125f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: non-numeric columns skipped: Species\n", + "Apply completed: original preserved as IRIS_backup_89cc522a9f0c4aad.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
15.13.51.40.1Iris-setosa
24.93.01.40.1Iris-setosa
34.73.21.30.1Iris-setosa
44.63.11.50.1Iris-setosa
55.03.61.40.1Iris-setosa
65.43.91.70.1Iris-setosa
85.03.41.50.1Iris-setosa
94.42.91.40.1Iris-setosa
104.93.11.50.1Iris-setosa
115.43.71.50.1Iris-setosa
124.83.41.60.1Iris-setosa
134.83.01.40.1Iris-setosa
144.33.01.10.1Iris-setosa
155.84.01.20.1Iris-setosa
165.74.31.50.1Iris-setosa
185.13.51.40.1Iris-setosa
195.73.81.70.1Iris-setosa
205.13.81.50.1Iris-setosa
215.43.41.70.1Iris-setosa
225.13.71.50.1Iris-setosa
234.63.61.00.1Iris-setosa
245.13.31.70.1Iris-setosa
254.83.41.90.1Iris-setosa
265.03.01.60.1Iris-setosa
275.03.41.60.1Iris-setosa
295.23.41.40.1Iris-setosa
304.73.21.60.1Iris-setosa
325.43.41.50.1Iris-setosa
335.24.11.50.1Iris-setosa
345.54.21.40.1Iris-setosa
354.93.11.50.1Iris-setosa
365.03.21.20.1Iris-setosa
384.93.11.50.1Iris-setosa
394.43.01.30.1Iris-setosa
405.13.41.50.1Iris-setosa
415.03.51.30.1Iris-setosa
424.52.31.30.1Iris-setosa
434.43.21.30.1Iris-setosa
445.03.51.60.1Iris-setosa
455.13.81.90.1Iris-setosa
464.83.01.40.1Iris-setosa
475.13.81.60.1Iris-setosa
484.63.21.40.1Iris-setosa
495.33.71.50.1Iris-setosa
505.03.31.40.1Iris-setosa
517.03.24.70.1Iris-versicolor
526.43.24.50.1Iris-versicolor
536.93.14.90.1Iris-versicolor
545.52.34.00.1Iris-versicolor
556.52.84.60.1Iris-versicolor
565.72.84.50.1Iris-versicolor
576.33.34.70.1Iris-versicolor
584.92.43.30.1Iris-versicolor
596.62.94.60.1Iris-versicolor
605.22.73.90.1Iris-versicolor
615.02.03.50.1Iris-versicolor
625.93.04.20.1Iris-versicolor
636.02.24.00.1Iris-versicolor
646.12.94.70.1Iris-versicolor
655.62.93.60.1Iris-versicolor
666.73.14.40.1Iris-versicolor
675.63.04.50.1Iris-versicolor
685.82.74.10.1Iris-versicolor
696.22.24.50.1Iris-versicolor
705.62.53.90.1Iris-versicolor
715.93.24.80.1Iris-versicolor
726.12.84.00.1Iris-versicolor
736.32.54.90.1Iris-versicolor
746.12.84.70.1Iris-versicolor
756.42.94.30.1Iris-versicolor
766.63.04.40.1Iris-versicolor
776.82.84.80.1Iris-versicolor
786.73.05.00.1Iris-versicolor
796.02.94.50.1Iris-versicolor
805.72.63.50.1Iris-versicolor
815.52.43.80.1Iris-versicolor
825.52.43.70.1Iris-versicolor
835.82.73.90.1Iris-versicolor
846.02.75.10.1Iris-versicolor
855.43.04.50.1Iris-versicolor
866.03.44.50.1Iris-versicolor
876.73.14.70.1Iris-versicolor
886.32.34.40.1Iris-versicolor
895.63.04.10.1Iris-versicolor
905.52.54.00.1Iris-versicolor
915.52.64.40.1Iris-versicolor
926.13.04.60.1Iris-versicolor
935.82.64.00.1Iris-versicolor
945.02.33.30.1Iris-versicolor
955.62.74.20.1Iris-versicolor
965.73.04.20.1Iris-versicolor
975.72.94.20.1Iris-versicolor
986.22.94.30.1Iris-versicolor
995.12.53.00.1Iris-versicolor
1005.72.84.10.1Iris-versicolor
1016.33.36.00.1Iris-virginica
1025.82.75.10.1Iris-virginica
1037.13.05.90.1Iris-virginica
1046.32.95.60.1Iris-virginica
1056.53.05.80.1Iris-virginica
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%clipoutliers columns=SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species mode=apply table=IRIS confirm=true" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "1bb7b324-237b-4817-9877-8f6bea4d455b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREVIEW (local):\n", + "Local: Column 'Species' unique non-null values: 3 (showing up to 10): ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']\n", + "PREVIEW (local) estimated created columns: 3\n", + "Preview sample with encoded columns (showing 100 rows):\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpeciesSpecies_Iris-setosaSpecies_Iris-versicolorSpecies_Iris-virginica
15.13.51.40.1Iris-setosa1.00.00.0
24.93.01.40.1Iris-setosa1.00.00.0
34.73.21.30.1Iris-setosa1.00.00.0
44.63.11.50.1Iris-setosa1.00.00.0
55.03.61.40.1Iris-setosa1.00.00.0
65.43.91.70.1Iris-setosa1.00.00.0
85.03.41.50.1Iris-setosa1.00.00.0
94.42.91.40.1Iris-setosa1.00.00.0
104.93.11.50.1Iris-setosa1.00.00.0
115.43.71.50.1Iris-setosa1.00.00.0
124.83.41.60.1Iris-setosa1.00.00.0
134.83.01.40.1Iris-setosa1.00.00.0
144.33.01.10.1Iris-setosa1.00.00.0
155.84.01.20.1Iris-setosa1.00.00.0
165.74.31.50.1Iris-setosa1.00.00.0
185.13.51.40.1Iris-setosa1.00.00.0
195.73.81.70.1Iris-setosa1.00.00.0
205.13.81.50.1Iris-setosa1.00.00.0
215.43.41.70.1Iris-setosa1.00.00.0
225.13.71.50.1Iris-setosa1.00.00.0
234.63.61.00.1Iris-setosa1.00.00.0
245.13.31.70.1Iris-setosa1.00.00.0
254.83.41.90.1Iris-setosa1.00.00.0
265.03.01.60.1Iris-setosa1.00.00.0
275.03.41.60.1Iris-setosa1.00.00.0
295.23.41.40.1Iris-setosa1.00.00.0
304.73.21.60.1Iris-setosa1.00.00.0
325.43.41.50.1Iris-setosa1.00.00.0
335.24.11.50.1Iris-setosa1.00.00.0
345.54.21.40.1Iris-setosa1.00.00.0
354.93.11.50.1Iris-setosa1.00.00.0
365.03.21.20.1Iris-setosa1.00.00.0
384.93.11.50.1Iris-setosa1.00.00.0
394.43.01.30.1Iris-setosa1.00.00.0
405.13.41.50.1Iris-setosa1.00.00.0
415.03.51.30.1Iris-setosa1.00.00.0
424.52.31.30.1Iris-setosa1.00.00.0
434.43.21.30.1Iris-setosa1.00.00.0
445.03.51.60.1Iris-setosa1.00.00.0
455.13.81.90.1Iris-setosa1.00.00.0
464.83.01.40.1Iris-setosa1.00.00.0
475.13.81.60.1Iris-setosa1.00.00.0
484.63.21.40.1Iris-setosa1.00.00.0
495.33.71.50.1Iris-setosa1.00.00.0
505.03.31.40.1Iris-setosa1.00.00.0
517.03.24.70.1Iris-versicolor0.01.00.0
526.43.24.50.1Iris-versicolor0.01.00.0
536.93.14.90.1Iris-versicolor0.01.00.0
545.52.34.00.1Iris-versicolor0.01.00.0
556.52.84.60.1Iris-versicolor0.01.00.0
565.72.84.50.1Iris-versicolor0.01.00.0
576.33.34.70.1Iris-versicolor0.01.00.0
584.92.43.30.1Iris-versicolor0.01.00.0
596.62.94.60.1Iris-versicolor0.01.00.0
605.22.73.90.1Iris-versicolor0.01.00.0
615.02.03.50.1Iris-versicolor0.01.00.0
625.93.04.20.1Iris-versicolor0.01.00.0
636.02.24.00.1Iris-versicolor0.01.00.0
646.12.94.70.1Iris-versicolor0.01.00.0
655.62.93.60.1Iris-versicolor0.01.00.0
666.73.14.40.1Iris-versicolor0.01.00.0
675.63.04.50.1Iris-versicolor0.01.00.0
685.82.74.10.1Iris-versicolor0.01.00.0
696.22.24.50.1Iris-versicolor0.01.00.0
705.62.53.90.1Iris-versicolor0.01.00.0
715.93.24.80.1Iris-versicolor0.01.00.0
726.12.84.00.1Iris-versicolor0.01.00.0
736.32.54.90.1Iris-versicolor0.01.00.0
746.12.84.70.1Iris-versicolor0.01.00.0
756.42.94.30.1Iris-versicolor0.01.00.0
766.63.04.40.1Iris-versicolor0.01.00.0
776.82.84.80.1Iris-versicolor0.01.00.0
786.73.05.00.1Iris-versicolor0.01.00.0
796.02.94.50.1Iris-versicolor0.01.00.0
805.72.63.50.1Iris-versicolor0.01.00.0
815.52.43.80.1Iris-versicolor0.01.00.0
825.52.43.70.1Iris-versicolor0.01.00.0
835.82.73.90.1Iris-versicolor0.01.00.0
846.02.75.10.1Iris-versicolor0.01.00.0
855.43.04.50.1Iris-versicolor0.01.00.0
866.03.44.50.1Iris-versicolor0.01.00.0
876.73.14.70.1Iris-versicolor0.01.00.0
886.32.34.40.1Iris-versicolor0.01.00.0
895.63.04.10.1Iris-versicolor0.01.00.0
905.52.54.00.1Iris-versicolor0.01.00.0
915.52.64.40.1Iris-versicolor0.01.00.0
926.13.04.60.1Iris-versicolor0.01.00.0
935.82.64.00.1Iris-versicolor0.01.00.0
945.02.33.30.1Iris-versicolor0.01.00.0
955.62.74.20.1Iris-versicolor0.01.00.0
965.73.04.20.1Iris-versicolor0.01.00.0
975.72.94.20.1Iris-versicolor0.01.00.0
986.22.94.30.1Iris-versicolor0.01.00.0
995.12.53.00.1Iris-versicolor0.01.00.0
1005.72.84.10.1Iris-versicolor0.01.00.0
1016.33.36.00.1Iris-virginica0.00.01.0
1025.82.75.10.1Iris-virginica0.00.01.0
1037.13.05.90.1Iris-virginica0.00.01.0
1046.32.95.60.1Iris-virginica0.00.01.0
1056.53.05.80.1Iris-virginica0.00.01.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%encode method=onehot columns=Species drop_original=false" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "b12612ad-d4b2-4864-a3a1-0874dfbe414d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Encoded columns in-place and updated last_select.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_lbl
15.13.51.40.10
24.93.01.40.10
34.73.21.30.10
44.63.11.50.10
55.03.61.40.10
65.43.91.70.10
85.03.41.50.10
94.42.91.40.10
104.93.11.50.10
115.43.71.50.10
124.83.41.60.10
134.83.01.40.10
144.33.01.10.10
155.84.01.20.10
165.74.31.50.10
185.13.51.40.10
195.73.81.70.10
205.13.81.50.10
215.43.41.70.10
225.13.71.50.10
234.63.61.00.10
245.13.31.70.10
254.83.41.90.10
265.03.01.60.10
275.03.41.60.10
295.23.41.40.10
304.73.21.60.10
325.43.41.50.10
335.24.11.50.10
345.54.21.40.10
354.93.11.50.10
365.03.21.20.10
384.93.11.50.10
394.43.01.30.10
405.13.41.50.10
415.03.51.30.10
424.52.31.30.10
434.43.21.30.10
445.03.51.60.10
455.13.81.90.10
464.83.01.40.10
475.13.81.60.10
484.63.21.40.10
495.33.71.50.10
505.03.31.40.10
517.03.24.70.11
526.43.24.50.11
536.93.14.90.11
545.52.34.00.11
556.52.84.60.11
565.72.84.50.11
576.33.34.70.11
584.92.43.30.11
596.62.94.60.11
605.22.73.90.11
615.02.03.50.11
625.93.04.20.11
636.02.24.00.11
646.12.94.70.11
655.62.93.60.11
666.73.14.40.11
675.63.04.50.11
685.82.74.10.11
696.22.24.50.11
705.62.53.90.11
715.93.24.80.11
726.12.84.00.11
736.32.54.90.11
746.12.84.70.11
756.42.94.30.11
766.63.04.40.11
776.82.84.80.11
786.73.05.00.11
796.02.94.50.11
805.72.63.50.11
815.52.43.80.11
825.52.43.70.11
835.82.73.90.11
846.02.75.10.11
855.43.04.50.11
866.03.44.50.11
876.73.14.70.11
886.32.34.40.11
895.63.04.10.11
905.52.54.00.11
915.52.64.40.11
926.13.04.60.11
935.82.64.00.11
945.02.33.30.11
955.62.74.20.11
965.73.04.20.11
975.72.94.20.11
986.22.94.30.11
995.12.53.00.11
1005.72.84.10.11
1016.33.36.00.12
1025.82.75.10.12
1037.13.05.90.12
1046.32.95.60.12
1056.53.05.80.12
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%encode method=label columns=Species drop_original=true mode=apply confirm=true" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "8ba776f8-84ff-477b-a5e3-c02e678c839c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREVIEW (local):\n", + "Local: Column 'Id': mean=54.45, std=30.244462303039874\n", + "Local: Column 'SepalLengthCm': mean=5.535999999999999, std=0.6652097413598211\n", + "Local: Column 'SepalWidthCm': mean=3.067999999999999, std=0.46149322855270586\n", + "Local: Column 'PetalLengthCm': mean=3.0750000000000006, std=1.526261773091366\n", + "Local: Column 'PetalWidthCm': mean=0.09999999999999998, std=2.7755575615628914e-17\n", + "Local: Column 'Species_lbl': mean=0.6, std=0.5830951894845301\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_lblId_std_previewSepalLengthCm_std_previewSepalWidthCm_std_previewPetalLengthCm_std_previewPetalWidthCm_std_previewSpecies_lbl_std_preview
15.13.51.40.10-1.767266-0.6554320.936092-1.0974531.0-1.028992
24.93.01.40.10-1.734202-0.956089-0.147348-1.0974531.0-1.028992
34.73.21.30.10-1.701138-1.2567460.286028-1.1629721.0-1.028992
44.63.11.50.10-1.668074-1.4070750.069340-1.0319331.0-1.028992
55.03.61.40.10-1.635010-0.8057611.152780-1.0974531.0-1.028992
65.43.91.70.10-1.601946-0.2044471.802843-0.9008941.0-1.028992
85.03.41.50.10-1.535818-0.8057610.719404-1.0319331.0-1.028992
94.42.91.40.10-1.502754-1.707732-0.364036-1.0974531.0-1.028992
104.93.11.50.10-1.469691-0.9560890.069340-1.0319331.0-1.028992
115.43.71.50.10-1.436627-0.2044471.369468-1.0319331.0-1.028992
124.83.41.60.10-1.403563-1.1064180.719404-0.9664141.0-1.028992
134.83.01.40.10-1.370499-1.106418-0.147348-1.0974531.0-1.028992
144.33.01.10.10-1.337435-1.858061-0.147348-1.2940111.0-1.028992
155.84.01.20.10-1.3043710.3968672.019531-1.2284921.0-1.028992
165.74.31.50.10-1.2713070.2465392.669595-1.0319331.0-1.028992
185.13.51.40.10-1.205179-0.6554320.936092-1.0974531.0-1.028992
195.73.81.70.10-1.1721150.2465391.586155-0.9008941.0-1.028992
205.13.81.50.10-1.139051-0.6554321.586155-1.0319331.0-1.028992
215.43.41.70.10-1.105988-0.2044470.719404-0.9008941.0-1.028992
225.13.71.50.10-1.072924-0.6554321.369468-1.0319331.0-1.028992
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%standardize mode=preview" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "dbca0744-2672-4c67-bdab-b94ec62b8cff", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Normalized 3 column(s) to range (0.0, 1.0). Updated data['last_select'] in-place.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_lbl
10.2857140.6521740.080.10
20.2142860.4347830.080.10
30.1428570.5217390.060.10
40.1071430.4782610.100.10
50.2500000.6956520.080.10
60.3928570.8260870.140.10
80.2500000.6086960.100.10
90.0357140.3913040.080.10
100.2142860.4782610.100.10
110.3928570.7391300.100.10
120.1785710.6086960.120.10
130.1785710.4347830.080.10
140.0000000.4347830.020.10
150.5357140.8695650.040.10
160.5000001.0000000.100.10
180.2857140.6521740.080.10
190.5000000.7826090.140.10
200.2857140.7826090.100.10
210.3928570.6086960.140.10
220.2857140.7391300.100.10
230.1071430.6956520.000.10
240.2857140.5652170.140.10
250.1785710.6086960.180.10
260.2500000.4347830.120.10
270.2500000.6086960.120.10
290.3214290.6086960.080.10
300.1428570.5217390.120.10
320.3928570.6086960.100.10
330.3214290.9130430.100.10
340.4285710.9565220.080.10
350.2142860.4782610.100.10
360.2500000.5217390.040.10
380.2142860.4782610.100.10
390.0357140.4347830.060.10
400.2857140.6086960.100.10
410.2500000.6521740.060.10
420.0714290.1304350.060.10
430.0357140.5217390.060.10
440.2500000.6521740.120.10
450.2857140.7826090.180.10
460.1785710.4347830.080.10
470.2857140.7826090.120.10
480.1071430.5217390.080.10
490.3571430.7391300.100.10
500.2500000.5652170.080.10
510.9642860.5217390.740.11
520.7500000.5217390.700.11
530.9285710.4782610.780.11
540.4285710.1304350.600.11
550.7857140.3478260.720.11
560.5000000.3478260.700.11
570.7142860.5652170.740.11
580.2142860.1739130.460.11
590.8214290.3913040.720.11
600.3214290.3043480.580.11
610.2500000.0000000.500.11
620.5714290.4347830.640.11
630.6071430.0869570.600.11
640.6428570.3913040.740.11
650.4642860.3913040.520.11
660.8571430.4782610.680.11
670.4642860.4347830.700.11
680.5357140.3043480.620.11
690.6785710.0869570.700.11
700.4642860.2173910.580.11
710.5714290.5217390.760.11
720.6428570.3478260.600.11
730.7142860.2173910.780.11
740.6428570.3478260.740.11
750.7500000.3913040.660.11
760.8214290.4347830.680.11
770.8928570.3478260.760.11
780.8571430.4347830.800.11
790.6071430.3913040.700.11
800.5000000.2608700.500.11
810.4285710.1739130.560.11
820.4285710.1739130.540.11
830.5357140.3043480.580.11
840.6071430.3043480.820.11
850.3928570.4347830.700.11
860.6071430.6086960.700.11
870.8571430.4782610.740.11
880.7142860.1304350.680.11
890.4642860.4347830.620.11
900.4285710.2173910.600.11
910.4285710.2608700.680.11
920.6428570.4347830.720.11
930.5357140.2608700.600.11
940.2500000.1304350.460.11
950.4642860.3043480.640.11
960.5000000.4347830.640.11
970.5000000.3913040.640.11
980.6785710.3913040.660.11
990.2857140.2173910.400.11
1000.5000000.3478260.620.11
1010.7142860.5652171.000.12
1020.5357140.3043480.820.12
1031.0000000.4347830.980.12
1040.7142860.3913040.920.12
1050.7857140.4347830.960.12
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%normalize columns=SepalLengthCm,SepalWidthCm,PetalLengthCm mode=apply confirm=true " + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "76f81e4f-4f72-4f6b-b31e-339e660de710", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split completed: total=100, train=70, test=20, val=10.\n" + ] + }, + { + "data": { + "text/html": [ + "

Train (70 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_lbl
770.8928570.3478260.760.11
600.3214290.3043480.580.11
400.2857140.6086960.100.10
140.0000000.4347830.020.10
740.6428570.3478260.740.11
10.2857140.6521740.080.10
270.2500000.6086960.120.10
850.3928570.4347830.700.11
680.5357140.3043480.620.11
20.2142860.4347830.080.10
450.2857140.7826090.180.10
800.5000000.2608700.500.11
670.4642860.4347830.700.11
1020.5357140.3043480.820.12
950.4642860.3043480.640.11
840.6071430.3043480.820.11
40.1071430.4782610.100.10
980.6785710.3913040.660.11
150.5357140.8695650.040.10
300.1428570.5217390.120.10
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Validation (10 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_lbl
810.4285710.1739130.560.11
260.2500000.4347830.120.10
380.2142860.4782610.100.10
940.2500000.1304350.460.11
110.3928570.7391300.100.10
80.2500000.6086960.100.10
560.5000000.3478260.700.11
720.6428570.3478260.600.11
190.5000000.7826090.140.10
910.4285710.2608700.680.11
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Test (20 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_lbl
700.4642860.2173910.580.11
590.8214290.3913040.720.11
570.7142860.5652170.740.11
750.7500000.3913040.660.11
90.0357140.3913040.080.10
330.3214290.9130430.100.10
230.1071430.6956520.000.10
540.4285710.1304350.600.11
210.3928570.6086960.140.10
250.1785710.6086960.180.10
160.5000001.0000000.100.10
1031.0000000.4347830.980.12
500.2500000.5652170.080.10
580.2142860.1739130.460.11
120.1785710.6086960.120.10
520.7500000.5217390.700.11
640.6428570.3913040.740.11
440.2500000.6521740.120.10
760.8214290.4347830.680.11
780.8571430.4347830.800.11
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%splitdata test_size=0.2 val_size=0.1 stratify=Species_lbl random_state=42" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "512becef-92a8-4fa9-a856-39f60c661caa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Feature Selection Results (method=anova)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FeatureScore
PetalLengthCm624.413887
Id122.246919
SepalLengthCm49.475597
PetalWidthCm33.500000
SepalWidthCm24.338860
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected 4 features saved to data['selected_features']: PetalLengthCm, Id, SepalLengthCm, PetalWidthCm\n" + ] + } + ], + "source": [ + "%select_features target=Species_lbl problem=classification k=4" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "28490e5c-dad8-44c5-9710-75802aa037e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Model Selection Results (primary_metric=accuracy)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Modelaccuracy_Meanaccuracy_Stdf1_Meanf1_Stdprecision_Meanprecision_Stdrecall_Meanrecall_Std
gbm1.00000.00000.92890.14221.00000.00000.91430.1292
ada1.00000.00000.92890.14220.92500.15001.00000.0000
lightgbm0.98570.02860.92890.14220.92500.15000.93330.1333
rf0.98570.02860.92890.14220.92500.15000.93330.1333
catboost0.98570.02860.92890.14220.92500.15000.93330.1333
logistic0.95710.05710.95200.05970.95000.06670.97140.0381
xgboost0.95710.05710.84760.18740.83930.19760.85710.1756
knn0.95710.05710.95200.05970.95000.06670.97140.0381
svm0.90000.09690.68440.16890.67280.17190.70480.1576
mlp0.82860.21480.47020.28700.72200.31040.56670.2494
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best model 'gbm' (mean accuracy=1.0000) saved to data['best_model'].\n" + ] + } + ], + "source": [ + "%select_model target=Species_lbl problem=classification " + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "a48fcd76-8b4e-44b7-8bad-bc91d76c8f5e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model 'logistic' trained and saved to data['last_model']. problem=classification. train_rows=70\n" + ] + } + ], + "source": [ + "%train_model features=PetalLengthCm,SepalLengthCm,PetalWidthCm,SepalWidthCm target=Species_lbl problem=classification model=logistic" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "9afbac09-ac9b-47ec-b3cb-29c650abc225", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + "
\n", + "

Metrics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Accuracy0.9500
Precision (w)0.9045
Recall (w)0.9500
F1 (w)0.9262
ROC AUCRequires numeric y for ROC AUC.
\n", + "
\n", + "
\"confusion
\n", + "

Classification report

\n", + "
              precision    recall  f1-score   support\n",
+       "\n",
+       "           0       1.00      1.00      1.00         9\n",
+       "           1       0.91      1.00      0.95        10\n",
+       "           2       0.00      0.00      0.00         1\n",
+       "\n",
+       "    accuracy                           0.95        20\n",
+       "   macro avg       0.64      0.67      0.65        20\n",
+       "weighted avg       0.90      0.95      0.93        20\n",
+       "
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" }, { "data": { "text/html": [ - "\n", + "

Predictions preview (actual vs predicted)

\n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_oob_columnsSpecies_lbl_predicted_pred_proba
15.13.51.40.2Iris-setosaPetalWidthCm1[0.1568254770832685, 0.7713064473235676, 0.07186807559316397]
24.93.01.40.2Iris-setosaPetalWidthCm11[0.06718779974737324, 0.8435489946780091, 0.0892632055746176]
34.73.21.30.2Iris-setosaPetalWidthCm11[0.11193644578263154, 0.7807944605541814, 0.10726909366318703]
44.63.11.50.2Iris-setosaPetalWidthCm11[0.0969971439337914, 0.8149505429793529, 0.0880523130868557]
55.03.61.40.2Iris-setosaPetalWidthCm00[0.8433008108096057, 0.13829791543550074, 0.01840127375489365]
65.43.91.70.4Iris-setosaPetalWidthCm00[0.895686730493734, 0.08380524435902056, 0.020508025147245475]
74.6NaN1.40.3Iris-setosaPetalWidthCm00[0.9268617694515947, 0.061193135907732926, 0.011945094640672366]
85.03.41.50.2Iris-setosaPetalWidthCm11[0.13151754326324983, 0.8012709935000745, 0.06721146323667562]
94.42.91.40.2Iris-setosaPetalWidthCm00[0.7596738139774813, 0.2071521192267119, 0.033174066795806915]
115.43.71.50.2Iris-setosaPetalWidthCm00[0.8118943280777537, 0.1607394103543395, 0.027366261567906593]
124.83.41.60.2Iris-setosaPetalWidthCm00[0.8754735560817294, 0.09876077960574704, 0.025765664312523556]
165.74.41.50.1Iris-setosaSepalWidthCm21[0.018507285282619034, 0.8871550456187827, 0.09433766909859825]
335.24.11.50.1Iris-setosaSepalWidthCm00[0.8331662419100963, 0.14412605014620486, 0.02270770794369895]
345.54.21.40.1Iris-setosaSepalWidthCm11[0.32161813792276145, 0.6197736419781975, 0.05860822009904103]
615.02.03.50.1Iris-versicolorSepalWidthCm00[0.8460107506358353, 0.13157965022216966, 0.022409599141995116]
11[0.11013288426842004, 0.7886084264808748, 0.10125868925070508]
11[0.08986421569943458, 0.8189349996674764, 0.09120078463308896]
00[0.8381775823858857, 0.13751583333001305, 0.02430658428410121]
11[0.0855996123051164, 0.8222134845068737, 0.09218690318800986]
11[0.050674387944242026, 0.8545552152483651, 0.09477039680739305]
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%evaluate_model" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "7544f465-842c-4d37-8c68-5837bb235411", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model from data['last_model'] saved to ./saved_models/model.joblib\n" + ] + } + ], + "source": [ + "%savemodel model_name_in_data=last_model save_path='./saved_models/model.joblib'" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "63b78371-7c0f-40f0-8bcf-d6ecf5628feb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded model from ./saved_models/model.joblib → data['last_model'] (features[4], target=Species_lbl)\n" + ] + } + ], + "source": [ + "%loadmodel load_path='./saved_models/model.joblib'" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "f50c0979-343d-43fb-a132-efd77299370b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using inline feature values for prediction: {'PetalLengthCm': 6.4, 'SepalLengthCm': 4.2, 'PetalWidthCm': 2.8, 'SepalWidthCm': 1.2}\n" + ] + }, + { + "data": { + "text/html": [ + "

Predictions (last_preds)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
prediction
1
" @@ -2705,21 +7888,119 @@ }, "metadata": {}, "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Predictions stored in data['last_preds'] with shape=(1, 1)\n" + ] } ], "source": [ - "%clipoutliers mode=preview" + "%predict model_name=last_model data_name=[6.4,4.2,2.8,1.2] output_name=last_preds" ] }, { "cell_type": "code", "execution_count": null, - "id": "e73470c4-20e2-4a6d-9b5c-ad21d41659e8", + "id": "248bd31c-42bd-4747-b1ca-94b31c97d70d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
idcommand_nameargumentsexecution_timestampaffected_columnsoperation_statusmessagedb_nameuser_namerollback_tokenbackup_tableoriginal_table
1missing2025-11-01 10:26:00ALL_COLUMNSsuccess%missing action=show examined 6 column(s); total_rows=150.BUGBREWNULLNULLNULL
2fillmissingstrategy=median mode=preview2025-11-01 10:26:08Id\n", + "SepalLengthCm\n", + "SepalWidthCm\n", + "PetalLengthCm\n", + "PetalWidthCm\n", + "Speciespreviewpreview_computed_fill_valuesBUGBREWNULLNULLNULL
3dropmissingcolumns=SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species mode=apply table=IRIS confirm=true2025-11-01 10:26:15SepalLengthCm\n", + "SepalWidthCm\n", + "PetalLengthCm\n", + "PetalWidthCm\n", + "Speciesappliedapplied_backup=IRIS_backup_c8cac3251a844d7bBUGBREWc8cac3251a844d7bIRIS_backup_c8cac3251a844d7bIRIS
4missing2025-11-01 10:29:32ALL_COLUMNSsuccess%missing action=show examined 6 column(s); total_rows=150.BUGBREWNULLNULLNULL
5fillmissingstrategy=median mode=preview2025-11-01 10:29:38Id\n", + "SepalLengthCm\n", + "SepalWidthCm\n", + "PetalLengthCm\n", + "PetalWidthCm\n", + "Speciespreviewpreview_computed_fill_valuesBUGBREWNULLNULLNULL
6dropmissingcolumns=SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species mode=apply table=IRIS confirm=true2025-11-01 10:29:46SepalLengthCm\n", + "SepalWidthCm\n", + "PetalLengthCm\n", + "PetalWidthCm\n", + "Speciesappliedapplied_backup=IRIS_backup_29adf350d1ab4121BUGBREW29adf350d1ab4121IRIS_backup_29adf350d1ab4121IRIS
7stats2025-11-01 10:30:13ALL_COLUMNSsuccessStats computed for 5 column(s); total_rows=100; percentiles=; include=numeric.BUGBREWNULLNULLNULL
8outliers2025-11-01 10:30:18Id\n", + "SepalLengthCm\n", + "SepalWidthCm\n", + "PetalLengthCm\n", + "PetalWidthCmsuccessColumn 'Id': detected 0 outlier(s) using iqr.\n", + "Column 'SepalLengthCm': detected 0 outlier(s) using iqr.\n", + "Column 'SepalWidthCm': detected 1 outlier(s) using iqr.\n", + "Column 'PetalLengthCm': detected 0 outlier(s) using iqr.\n", + "Column 'PetalWidthCm': detected 10 outlier(s) using iqr.BUGBREWNULLNULLNULL
9dropoutliersmode=preview2025-11-01 10:30:25Id\n", + "SepalLengthCm\n", + "SepalWidthCm\n", + "PetalLengthCm\n", + "PetalWidthCmpreviewpreview_completedBUGBREWNULLNULLNULL
10clipoutlierscolumns=SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species mode=apply table=IRIS confirm=true2025-11-01 10:30:29SepalLengthCm\n", + "SepalWidthCm\n", + "PetalLengthCm\n", + "PetalWidthCmappliedapplied_backup=IRIS_backup_89cc522a9f0c4aadBUGBREW89cc522a9f0c4aadIRIS_backup_89cc522a9f0c4aadIRIS
11encodemethod=onehot columns=Species drop_original=false2025-11-01 10:30:33Speciespreviewpreview_completedBUGBREWNULLNULLNULL
12encodemethod=label columns=Species drop_original=true mode=apply confirm=true2025-11-01 10:30:39SpeciessuccessMethod: label\n", + "Created columns:\n", + "Species_lbl\n", + "\n", + "Details:\n", + "Column 'Species': label-encoded -> Species_lbl (unique_values=3)BUGBREWNULLNULLNULL
13standardizemode=preview2025-11-01 10:30:44Id\n", + "SepalLengthCm\n", + "SepalWidthCm\n", + "PetalLengthCm\n", + "PetalWidthCm\n", + "Species_lblpreviewpreview_completedBUGBREWNULLNULLNULL
14normalizecolumns=SepalLengthCm,SepalWidthCm,PetalLengthCm mode=apply confirm=true 2025-11-01 10:30:50SepalLengthCm\n", + "SepalWidthCm\n", + "PetalLengthCmsuccessFeature range: (0.0, 1.0)\n", + "\n", + "Details:\n", + "Normalized 3 column(s) to range (0.0, 1.0).BUGBREWNULLNULLNULL
15splitdatatest_size=0.2 val_size=0.1 stratify=Species_lbl random_state=422025-11-01 10:30:55Species_lblsuccesstrain_name=last_select_train, test_name=last_select_test, val_name=last_select_val\n", + "train_count=70, test_count=20, val_count=10\n", + "test_frac=0.2, val_frac=0.1, shuffle=True, random_state=42BUGBREWNULLNULLNULL
16select_featurestarget=Species_lbl problem=classification k=42025-11-01 10:31:07PetalLengthCm\n", + "Id\n", + "SepalLengthCm\n", + "PetalWidthCmsuccessSelected 4 features saved to data['selected_features']: PetalLengthCm, Id, SepalLengthCm, PetalWidthCmBUGBREWNULLNULLNULL
17select_modeltarget=Species_lbl problem=classification 2025-11-01 10:31:35PetalLengthCm\n", + "Id\n", + "SepalLengthCm\n", + "PetalWidthCmsuccessBest model 'gbm' (mean accuracy=1.0000) saved to data['best_model'].BUGBREWNULLNULLNULL
18train_modelfeatures=PetalLengthCm,SepalLengthCm,PetalWidthCm,SepalWidthCm target=Species_lbl problem=classification model=logistic2025-11-01 10:31:43PetalLengthCm\n", + "SepalLengthCm\n", + "PetalWidthCm\n", + "SepalWidthCmsuccessModel 'logistic' trained and saved to data['last_model']. problem=classification. train_rows=70BUGBREWNULLNULLNULL
19evaluate_model2025-11-01 10:31:46PetalLengthCm\n", + "SepalLengthCm\n", + "PetalWidthCm\n", + "SepalWidthCmsuccessEvaluation success. Model='last_model', test='last_select_test', preds_saved='last_preds'. accuracy=0.9500, precision=0.9045, recall=0.9500, f1=0.9262BUGBREWNULLNULLNULL
20save_modelmodel_name_in_data=last_model save_path='./saved_models/model.joblib'2025-11-01 10:31:54last_modelsuccessSaved model to ./saved_models/model.joblib (features[4], target=Species_lbl)BUGBREWNULLNULLNULL
21load_modelload_path='./saved_models/model.joblib'2025-11-01 10:31:56last_modelsuccessLoaded model from ./saved_models/model.joblib → data['last_model'] (features[4], target=Species_lbl)BUGBREWNULLNULLNULL
22predict_modelmodel_name=last_model data_name=[6.4,4.2,2.8,1.2] output_name=last_preds2025-11-01 10:31:58PetalLengthCm\n", + "SepalLengthCm\n", + "PetalWidthCm\n", + "SepalWidthCmsuccessPrediction success. model=last_model, data_arg=[6.4, 4.2, 2.8, 1.2], output=last_preds, shape=(1, 1) inline_values=[6.4, 4.2, 2.8, 1.2]BUGBREWNULLNULLNULL
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "%encode " + "SELECT * FROM magic_metadata;" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0706b5d-44a3-4bc3-a5c9-aff61a301cb4", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "411fb892-8b7c-4495-ae2e-d24554d872a8", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 88fedd41788f3f4ca43335afbb4c70ffa6cb22e1 Mon Sep 17 00:00:00 2001 From: Sneha Jain <126079866+JainSneha6@users.noreply.github.com> Date: Sat, 1 Nov 2025 16:44:49 +0530 Subject: [PATCH 36/38] Add initial content to abc.ipynb --- DemoNotebooks/abc.ipynb | 1 + 1 file changed, 1 insertion(+) create mode 100644 DemoNotebooks/abc.ipynb diff --git a/DemoNotebooks/abc.ipynb b/DemoNotebooks/abc.ipynb new file mode 100644 index 0000000..5eae659 --- /dev/null +++ b/DemoNotebooks/abc.ipynb @@ -0,0 +1 @@ +sdhdh From c3871bca92c8cfc64b2e8ce7d7cafc4e389a0439 Mon Sep 17 00:00:00 2001 From: JainSneha6 Date: Sat, 1 Nov 2025 11:20:14 +0000 Subject: [PATCH 37/38] Created a DemoNotebooks folder --- .../AutomatedMLPipeline.ipynb | 0 Iris.csv => DemoNotebooks/Iris.csv | 0 RAG.ipynb => DemoNotebooks/RAG.ipynb | 0 .../RawMLPipeline.ipynb | 0 DemoNotebooks/abc.ipynb | 1 - test.docx => DemoNotebooks/test.docx | 0 test.txt => DemoNotebooks/test.txt | 0 Untitled.ipynb | 19220 ---------------- last_query.csv | 11 - .../ml_commands/data_cleaning/missing.py | 798 +- .../maria_magics/supported_magics.py | 136 +- 11 files changed, 467 insertions(+), 19699 deletions(-) rename AutomatedMLPipeline.ipynb => DemoNotebooks/AutomatedMLPipeline.ipynb (100%) rename Iris.csv => DemoNotebooks/Iris.csv (100%) rename RAG.ipynb => DemoNotebooks/RAG.ipynb (100%) rename RawMLPipeline.ipynb => DemoNotebooks/RawMLPipeline.ipynb (100%) delete mode 100644 DemoNotebooks/abc.ipynb rename test.docx => DemoNotebooks/test.docx (100%) rename test.txt => 
DemoNotebooks/test.txt (100%) delete mode 100644 Untitled.ipynb delete mode 100644 last_query.csv diff --git a/AutomatedMLPipeline.ipynb b/DemoNotebooks/AutomatedMLPipeline.ipynb similarity index 100% rename from AutomatedMLPipeline.ipynb rename to DemoNotebooks/AutomatedMLPipeline.ipynb diff --git a/Iris.csv b/DemoNotebooks/Iris.csv similarity index 100% rename from Iris.csv rename to DemoNotebooks/Iris.csv diff --git a/RAG.ipynb b/DemoNotebooks/RAG.ipynb similarity index 100% rename from RAG.ipynb rename to DemoNotebooks/RAG.ipynb diff --git a/RawMLPipeline.ipynb b/DemoNotebooks/RawMLPipeline.ipynb similarity index 100% rename from RawMLPipeline.ipynb rename to DemoNotebooks/RawMLPipeline.ipynb diff --git a/DemoNotebooks/abc.ipynb b/DemoNotebooks/abc.ipynb deleted file mode 100644 index 5eae659..0000000 --- a/DemoNotebooks/abc.ipynb +++ /dev/null @@ -1 +0,0 @@ -sdhdh diff --git a/test.docx b/DemoNotebooks/test.docx similarity index 100% rename from test.docx rename to DemoNotebooks/test.docx diff --git a/test.txt b/DemoNotebooks/test.txt similarity index 100% rename from test.txt rename to DemoNotebooks/test.txt diff --git a/Untitled.ipynb b/Untitled.ipynb deleted file mode 100644 index 4821ba6..0000000 --- a/Untitled.ipynb +++ /dev/null @@ -1,19220 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "370f7ad6-27b0-4b77-855f-5b42056d8f7d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Database
information_schema
mysql
performance_schema
sys
test
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "show databases;" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "4f793644-f458-4091-9bec-6680a0c2b849", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Your SQL code doesn't end with delimiter `;`\n" - ] - } - ], - "source": [ - "create database test;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ee0b2971-9407-4de7-a800-79dd56eac54d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query OK" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "use test;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8f15544d-ecac-4f56-988e-b577b8fb9ff7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query OK" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "CREATE TABLE employees (\n", - " emp_id INT PRIMARY KEY AUTO_INCREMENT,\n", - " name VARCHAR(50),\n", - " department VARCHAR(50),\n", - " age INT,\n", - " gender VARCHAR(10),\n", - " education_level VARCHAR(30),\n", - " years_experience INT,\n", - " projects_completed INT,\n", - " avg_project_score DECIMAL(5,2),\n", - " certifications INT,\n", - " training_hours INT,\n", - " overtime_hours INT,\n", - " remote_ratio DECIMAL(3,2),\n", - " salary DECIMAL(10,2),\n", - " bonus DECIMAL(10,2),\n", - " satisfaction_score DECIMAL(3,2),\n", - " performance_rating INT, -- target variable for classification\n", - " potential_score DECIMAL(5,2), -- target variable for regression\n", - " attrition_flag INT -- 1 = left company, 0 = stayed\n", - ");\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f6d3778-2064-4b53-98d0-cc0cac8160d0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query OK" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "drop 
tables employees;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d12775b4-23a8-4ba2-9de9-d0d3f727f53c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query OK" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "INSERT INTO employees\n", - "(name, department, age, gender, education_level, years_experience,\n", - " projects_completed, avg_project_score, certifications, training_hours,\n", - " overtime_hours, remote_ratio, salary, bonus, satisfaction_score,\n", - " performance_rating, potential_score, attrition_flag)\n", - "VALUES\n", - "('Alice', 'HR', 30, 'F', NULL, 5, 12, 87.5, 1, 40, 5, 0.2, 55000, 300, 8.5, 4, 75.0, 0),\n", - "('Bob', 'Engineering', 45, 'M', 'Masters', 20, 30, 91.0, 3, 20, 10, 0.1, 1200000, 15000, 9.0, 5, 89.0, 0),\n", - "('Charlie', 'Sales', 38, 'M', 'Bachelors', NULL, 18, 79.3, 0, 15, 20, 0.5, 80000, 7000, 7.2, 3, 70.0, 1),\n", - "('Diana', 'Engineering', 29, 'F', 'PhD', 6, 22, 95.2, 2, 50, 2, 0.0, 97000, 10000, 9.6, 5, 95.0, 0),\n", - "('Eve', NULL, 35, 'F', 'Bachelors', 8, 15, 88.0, 1, 30, 6, 0.3, 90000, 8000, 8.0, 4, 85.0, 0),\n", - "('Frank', 'HR', 50, 'M', 'High School', 25, 8, 72.5, 0, 10, 15, 0.7, 60000, 4000, 6.5, 2, 60.0, 1),\n", - "('Grace', 'Sales', 42, 'F', 'Bachelors', 18, 20, 81.4, 1, 25, 12, 0.4, 85000, 7000, 7.8, 3, 74.0, 0),\n", - "('Henry', 'Engineering', 31, 'M', 'Masters', 7, 25, 93.1, 2, 35, 5, 0.2, 95000, 9000, 9.1, 5, 90.0, 0),\n", - "('Ivy', 'Finance', 27, 'F', 'Bachelors', 3, 10, 85.0, 0, 20, 8, 0.6, 70000, 5000, 8.2, 4, 82.0, 0),\n", - "('Jack', 'Sales', 55, 'M', 'High School', 30, 12, 68.9, 0, 5, 25, 0.8, 65000, 2000, 5.5, 1, 50.0, 1);" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af099a2a-a6e7-4297-b243-b64fd7be07c4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FNULL51287.5014050.2055000.00300.008.50475.000
2BobEngineering45MMasters203091.00320100.101200000.0015000.009.00589.000
3CharlieSales38MBachelorsNULL1879.30015200.5080000.007000.007.20370.001
4DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
5EveNULL35FBachelors81588.0013060.3090000.008000.008.00485.000
6FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
7GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
8HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
9IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
10JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
11AliceHR30FNULL51287.5014050.2055000.00300.008.50475.000
12BobEngineering45MMasters203091.00320100.101200000.0015000.009.00589.000
13CharlieSales38MBachelorsNULL1879.30015200.5080000.007000.007.20370.001
14DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
15EveNULL35FBachelors81588.0013060.3090000.008000.008.00485.000
16FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
17GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
18HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
19IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
20JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
21AliceHR30FNULL51287.5014050.2055000.00300.008.50475.000
22BobEngineering45MMasters203091.00320100.101200000.0015000.009.00589.000
23CharlieSales38MBachelorsNULL1879.30015200.5080000.007000.007.20370.001
24DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
25EveNULL35FBachelors81588.0013060.3090000.008000.008.00485.000
26FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
27GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
28HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
29IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
30JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
31AliceHR30FNULL51287.5014050.2055000.00300.008.50475.000
32BobEngineering45MMasters203091.00320100.101200000.0015000.009.00589.000
33CharlieSales38MBachelorsNULL1879.30015200.5080000.007000.007.20370.001
34DianaEngineering29FPhD62295.2025020.0097000.0010000.009.60595.000
35EveNULL35FBachelors81588.0013060.3090000.008000.008.00485.000
36FrankHR50MHigh School25872.50010150.7060000.004000.006.50260.001
37GraceSales42FBachelors182081.40125120.4085000.007000.007.80374.000
38HenryEngineering31MMasters72593.1023550.2095000.009000.009.10590.000
39IvyFinance27FBachelors31085.0002080.6070000.005000.008.20482.000
40JackSales55MHigh School301268.9005250.8065000.002000.005.50150.001
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "SELECT * FROM employees;" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "184962da-5d3c-4f96-83de-2578df4706fe", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
missingpercent
emp_id00.0
name00.0
department110.0
age00.0
gender00.0
education_level110.0
years_experience110.0
projects_completed00.0
avg_project_score00.0
certifications00.0
training_hours00.0
overtime_hours00.0
remote_ratio00.0
salary00.0
bonus00.0
satisfaction_score00.0
performance_rating00.0
potential_score00.0
attrition_flag00.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%missing" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "d9679f73-93a4-4e00-8561-e01207ab9015", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
percent
id0.0
name12.5
department25.0
age12.5
salary37.5
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%missing action=percent" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "771a152f-02ce-42e5-a006-6444bf2626db", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
dtypemissingpercent
idint6400.0
nameobject412.5
departmentobject825.0
agefloat64412.5
salaryfloat641237.5
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%missing action=summary" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "28756366-6f31-4045-b2ea-7d47fdf08ff8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Line
namehelp
%line
The %line magic command follows a syntax very similar to that of
DataFrame.plot.line from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.line.html

Example:
> %line x=column1 y=column2

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Line class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%bar
The %bar magic command follows a syntax very similar to that of
DataFrame.plot.bar from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.bar.html

Example:
> %bar x=column1 y=column2 stacked=True

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Bar class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%pie
The %pie magic command follows a syntax very similar to that of
DataFrame.plot.pie from Pandas.
Please refer to this link for an exhaustive list of options:
pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.pie.html

Example:
> %pie y=column_name

The whole purpose of this magic command is to allow the user to display
the result of the last query (e.g. SELECT, SHOW,...) in a nice and simple
matplotlib plot.

Internally, the Pie class receives the data of the last query from the kernel
as a Pandas DataFrame, it generates a plot PNG image, wraps the image into
a nice display_data Jupyter message and then sends it further.
%df
The %df magic command has the following syntax:
> %df [filename]

It writes the result of the last query executed in the notebook
into an external CSV formatted file.
The purpose of this magic command is to allow users to export query
data from their MariaDB databases and then quickly import it
into a Python Notebook where more complex analytics can be performed.

If no arguments are specified, the kernel writes the data into a
CSV file named 'last_query.csv'.
%lsmagic
The %lsmagic magic command prints the magics currently
supported by the kernel. It also prints the help text for each command
%load
The %load magic command has the following syntax:
> %load csv_file_path table_name [skip_row_num] [seperator] [character set] [enclosing character]
The %load magic command can load CSV file for updating specific table data.

This command does not create a table if the one specified as argument doesn't exist,
the user needs to create the destination table with the proper schema to match the data in the CSV file.

CSV file first line may be header, can set [skip row num] to 1 for skipping header.

The separator between columns defaults to ',', but can be reconfigured.

The character set used by the CSV file defaults to utf8.

The character used to enclose entries to escape the field delimiter defaults to `"`.

Any argument can be enclosed by ' ' or " ", handling cases that argument contains spaces.
missing%missing [action=show|percent|summary] [columns=col1,col2]
Display missing-value information from the last query result.
dropmissing%dropmissing [columns=col1,col2,...]
Drops rows with missing values from data['last_select'] (always IN-PLACE).
stats%stats [columns=col1,col2] [include=all|numeric|object] [percentiles=25,50,75] [transpose=true|false]
Show statistical summary (uses pandas.DataFrame.describe under the hood).
fillmissing%fillmissing [columns=col1,col2,...] [strategy=mean|median|mode|constant] [value=const]
Fills missing values in data['last_select'] (always IN-PLACE).
outliers%outliers [columns=col1,col2,...] [method=iqr|zscore] [k=1.5] [z_thresh=3.0] [plot=True|False]
Detects outliers in data['last_select'] (non in-place). Results placed in data['last_select_outliers'].
dropoutliers%dropoutliers [columns=col1,col2,...] [method=iqr|zscore] [k=1.5] [z_thresh=3.0]
Removes rows containing outliers from data['last_select'] (in-place).
clipoutliers%clipoutliers [columns=col1,col2,...] [method=iqr|zscore] [k=1.5] [z_thresh=3.0] [inplace=True|False]
Clamps extreme numeric values to computed boundaries (in-place by default).
encode%encode method=<label|onehot|ordinal> [columns=col1,col2] [inplace=true] [drop_original=true]
Encode categorical columns using label, one-hot, or ordinal encoding (automatic).
normalize%normalize [columns=col1,col2,...] [feature_range=0,1] [inplace=True|False]
Normalize numeric columns using MinMaxScaler (in-place by default).
standardize%standardize [columns=col1,col2,...] [inplace=True|False]
Standardizes numeric columns using sklearn's StandardScaler (in-place by default).
splitdata%splitdata [test_size=0.2] [val_size=0.1] [stratify=colname] [shuffle=True|False]
[random_state=42] [inplace=True|False] [train_name=name] [test_name=name] [val_name=name]
Split last_select into train/test/(val).
train_modelTrain a model on data['last_select'] (no split or scaling).
evaluate_modelEvaluate a trained model on a test DataFrame and show metrics + predictions.
Cell
namehelp
%%delimiter
The %%delimiter magic command is a cell magic. This means
that it operates over the entire cell within it is used.

Its purpose is to run an SQL statement using a different
delimiter than the default ";". The main usecase should be
Stored Procedures and Stored Functions.

Example:
--------cell
%%delimiter //
CREATE PROCEDURE proc ()
BEGIN
select 1;
END;
//
--------end-of-cell

Please note that the SQL statement needs to end with the
delimiter specified by the magic command.
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%lsmagic" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "e01f05ec-ebc9-4344-a888-fceb40da7a8f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dropped rows with missing values (in-place). Updated last_select.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalary
1AliceHR30.050000.0
4DavidHR25.048000.0
6FrankEngineering28.072000.0
8GraceSales45.065000.0
9AliceHR30.050000.0
12DavidHR25.048000.0
14FrankEngineering28.072000.0
16GraceSales45.065000.0
17AliceHR30.050000.0
20DavidHR25.048000.0
22FrankEngineering28.072000.0
24GraceSales45.065000.0
25AliceHR30.050000.0
28DavidHR25.048000.0
30FrankEngineering28.072000.0
32GraceSales45.065000.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%dropmissing" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "63c5d2a7-c1ca-4fcd-a711-cf340c630d5b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PREVIEW: would drop 1 row(s) (from 10 to 9).\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag_would_be_dropped
5EveNaN35FBachelors8.01588.013060.390000.08000.08.0485.00True
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%dropmissing columns=department mode=preview" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "2ef8cbe8-6fc7-46a6-9517-4d329bad53c8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "EXPLAIN (estimate):\n", - "
EXPLAIN
{\n", - " "query_block": {\n", - " "select_id": 1,\n", - " "table": {\n", - " "delete": 1,\n", - " "table_name": "employees",\n", - " "access_type": "ALL",\n", - " "rows": 10,\n", - " "attached_condition": "employees.department is null"\n", - " }\n", - " }\n", - "}
\n" - ] - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "0536ae17-266f-4b77-910b-05b69d4f817c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Apply completed: original preserved as test.employees_backup_73b9e64a9b8c4045.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FNaN5.01287.514050.255000.03000.08.5475.00
2BobEngineering45MMasters20.03091.0320100.1120000.015000.09.0589.00
3CharlieSales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01
4DianaEngineering29FPhD6.02295.225020.097000.010000.09.6595.00
6FrankHR50MHigh School25.0872.5010150.760000.04000.06.5260.01
7GraceSales42FBachelors18.02081.4125120.485000.07000.07.8374.00
8HenryEngineering31MMasters7.02593.123550.295000.09000.09.1590.00
9IvyFinance27FBachelors3.01085.002080.670000.05000.08.2482.00
10JackSales55MHigh School30.01268.905250.865000.02000.05.5150.01
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%dropmissing columns=department table=test.employees mode=apply confirm=true " - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "2fbe1e7a-501e-4af2-9d9e-1177adf4d64c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rollback: restored test.employees_backup_73b9e64a9b8c4045 -> test.employees; previous test.employees renamed to test.employees_prerollback_73b9e64a9b8c4045.\n" - ] - } - ], - "source": [ - "%dropmissing mode=rollback rollback_token=73b9e64a9b8c4045" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "70e58c09-5a55-4093-b135-0fed2202d2d3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dropped rows with missing values (in-place). Updated last_select.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalary
1AliceHR30.050000.0
4DavidHR25.048000.0
6FrankEngineering28.072000.0
8GraceSales45.065000.0
9AliceHR30.050000.0
12DavidHR25.048000.0
14FrankEngineering28.072000.0
16GraceSales45.065000.0
17AliceHR30.050000.0
20DavidHR25.048000.0
22FrankEngineering28.072000.0
24GraceSales45.065000.0
25AliceHR30.050000.0
28DavidHR25.048000.0
30FrankEngineering28.072000.0
32GraceSales45.065000.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%dropmissing columns=salary" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "c674309d-4c1c-4d5a-a879-3b3f4816e4c5", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dropped rows with missing values (in-place). Updated last_select.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalary
1AliceHR30.050000.0
3CharlieEngineeringNaN70000.0
4DavidHR25.048000.0
6FrankEngineering28.072000.0
8GraceSales45.065000.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%dropmissing columns=salary" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "364a4ddb-de96-4f20-b979-292184dadd0e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idagesalary
count8.000007.0000005.00000
mean4.5000036.14285761000.00000
std2.449499.26334311269.42767
min1.0000025.00000048000.00000
25%2.7500029.00000050000.00000
50%4.5000035.00000065000.00000
75%6.2500042.50000070000.00000
max8.0000050.00000072000.00000
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%stats" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "0031cec8-4a24-491b-a910-f3effa939afd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalary
count8.00000767.0000005.00000
uniqueNaN73NaNNaN
topNaNAliceHRNaNNaN
freqNaN12NaNNaN
mean4.50000NaNNaN36.14285761000.00000
std2.44949NaNNaN9.26334311269.42767
min1.00000NaNNaN25.00000048000.00000
25%2.75000NaNNaN29.00000050000.00000
50%4.50000NaNNaN35.00000065000.00000
75%6.25000NaNNaN42.50000070000.00000
max8.00000NaNNaN50.00000072000.00000
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%stats include=all" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "829f6342-96b1-4d81-8519-2a53c091dfb1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fill missing completed (in-place). Summary:\n", - "Column 'id': filled missing with mean=21.321428571428573.\n", - "Column 'name' is not numeric; cannot use mean. Skipped.\n", - "Column 'department' is not numeric; cannot use mean. Skipped.\n", - "Column 'age': filled missing with mean=37.07692307692308.\n", - "Column 'salary': filled missing with mean=127500.0.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalary
18BobNaN40.000000127500.0
25AliceHR30.0000005000.0
29EveNaN35.000000127500.0
23NaNSales50.000000127500.0
6FrankEngineering28.00000072000.0
40GraceSales45.00000065000.0
14FrankEngineering28.00000072000.0
22FrankEngineering28.00000072000.0
32GraceSales45.00000065000.0
12DavidHR25.00000048000.0
34BobNaN40.000000127500.0
33AliceHR30.0000005000.0
7NaNSales50.000000127500.0
26BobNaN40.000000127500.0
19CharlieEngineering37.076923700000.0
10BobNaN40.000000127500.0
36DavidHR25.00000048000.0
21EveNaN35.000000127500.0
24GraceSales45.00000065000.0
31NaNSales50.000000127500.0
39NaNSales50.000000127500.0
4DavidHR25.00000048000.0
9AliceHR30.0000005000.0
1AliceHR30.0000005000.0
2BobNaN40.000000127500.0
8GraceSales45.00000065000.0
37EveNaN35.000000127500.0
35CharlieEngineering37.076923700000.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%fillmissing" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "52d78b76-69c5-41d4-874f-8d8cb8b0cae9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PREVIEW: missing counts per column:\n", - "years_experience: missing=1\n", - "PREVIEW: computed fill-values (best-effort):\n", - "years_experience: would fill with -> 8.0 (median via local preview)\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag_null_columns
3CharlieSales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01years_experience
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%fillmissing columns=years_experience strategy=median mode=preview" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "caf67100-c435-4da1-b993-b46f58981277", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Apply completed: original preserved as test.employees_backup_9407f2e1e7db47b2.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FNaN5.01287.514050.255000.03000.08.5475.00
2BobEngineering45MMasters20.03091.0320100.1120000.015000.09.0589.00
3CharlieSales38MBachelors8.01879.3015200.580000.07000.07.2370.01
4DianaEngineering29FPhD6.02295.225020.097000.010000.09.6595.00
5EveNaN35FBachelors8.01588.013060.390000.08000.08.0485.00
6FrankHR50MHigh School25.0872.5010150.760000.04000.06.5260.01
7GraceSales42FBachelors18.02081.4125120.485000.07000.07.8374.00
8HenryEngineering31MMasters7.02593.123550.295000.09000.09.1590.00
9IvyFinance27FBachelors3.01085.002080.670000.05000.08.2482.00
10JackSales55MHigh School30.01268.905250.865000.02000.05.5150.01
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%fillmissing columns=years_experience strategy=median table=test.employees mode=apply confirm=true " - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "50047388-14df-4cfc-a670-5c1021b6e471", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rollback: restored test.employees_backup_9407f2e1e7db47b2 -> test.employees; previous test.employees renamed to test.employees_prerollback_9407f2e1e7db47b2.\n" - ] - } - ], - "source": [ - "%fillmissing mode=rollback rollback_token=9407f2e1e7db47b2" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "beac2393-829b-472e-b9cb-d12166e16088", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fill missing completed (in-place). Summary:\n", - "Column 'id': filled missing with mode=1.\n", - "Column 'name': filled missing with mode=Alice.\n", - "Column 'department': filled missing with mode=Engineering.\n", - "Column 'age': filled missing with mode=25.0.\n", - "Column 'salary': filled missing with mode=48000.0.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalary
1AliceHR30.050000.0
2BobEngineering40.048000.0
3CharlieEngineering25.070000.0
4DavidHR25.048000.0
5EveEngineering35.048000.0
6FrankEngineering28.072000.0
7AliceSales50.048000.0
8GraceSales45.065000.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%fillmissing strategy=mode" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "948523f7-1b1c-4f37-a3ae-161e8ce40d0b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fill missing completed (in-place). Summary:\n", - "Column 'name': filled missing with constant value=Unknown.\n", - "Column 'department': filled missing with constant value=Unknown.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalary
1AliceHR30.05000.0
2BobUnknown40.065000.0
3CharlieEngineering35.0700000.0
4DavidHR25.048000.0
5EveUnknown35.065000.0
6FrankEngineering28.072000.0
7UnknownSales50.065000.0
8GraceSales45.065000.0
9AliceHR30.05000.0
10BobUnknown40.065000.0
11CharlieEngineering35.0700000.0
12DavidHR25.048000.0
13EveUnknown35.065000.0
14FrankEngineering28.072000.0
15UnknownSales50.065000.0
16GraceSales45.065000.0
17AliceHR30.05000.0
18BobUnknown40.065000.0
19CharlieEngineering35.0700000.0
20DavidHR25.048000.0
21EveUnknown35.065000.0
22FrankEngineering28.072000.0
23UnknownSales50.065000.0
24GraceSales45.065000.0
25AliceHR30.05000.0
26BobUnknown40.065000.0
27CharlieEngineering35.0700000.0
28DavidHR25.048000.0
29EveUnknown35.065000.0
30FrankEngineering28.072000.0
31UnknownSales50.065000.0
32GraceSales45.065000.0
33AliceHR30.05000.0
34BobUnknown40.065000.0
35CharlieEngineering35.0700000.0
36DavidHR25.048000.0
37EveUnknown35.065000.0
38FrankEngineering28.072000.0
39UnknownSales50.065000.0
40GraceSales45.065000.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%fillmissing columns=name,department strategy=constant value=\"Unknown\"" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "81a88f9a-bdf8-4f87-a5d0-a0a88fcc5ace", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Strategy 'constant' requires a 'value=...' argument.\n" - ] - } - ], - "source": [ - "%fillmissing strategy=constant" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "d76ed0d7-4332-40b0-a5c6-588784807a23", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The result set was successfully written into last_query.csv\n" - ] - } - ], - "source": [ - "%df" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "6cf47bfb-ff40-4522-8723-a8bf17398210", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Outlier detection completed (non in-place). 
Summary:\n", - "Column 'emp_id': detected 0 outlier(s) using iqr.\n", - "Column 'age': detected 0 outlier(s) using iqr.\n", - "Column 'years_experience': detected 0 outlier(s) using iqr.\n", - "Column 'projects_completed': detected 0 outlier(s) using iqr.\n", - "Column 'avg_project_score': detected 0 outlier(s) using iqr.\n", - "Column 'certifications': detected 0 outlier(s) using iqr.\n", - "Column 'training_hours': detected 0 outlier(s) using iqr.\n", - "Column 'overtime_hours': detected 0 outlier(s) using iqr.\n", - "Column 'remote_ratio': detected 0 outlier(s) using iqr.\n", - "Column 'salary': detected 1 outlier(s) using iqr.\n", - "Column 'bonus': detected 0 outlier(s) using iqr.\n", - "Column 'satisfaction_score': detected 0 outlier(s) using iqr.\n", - "Column 'performance_rating': detected 0 outlier(s) using iqr.\n", - "Column 'potential_score': detected 0 outlier(s) using iqr.\n", - "Column 'attrition_flag': detected 0 outlier(s) using iqr.\n", - "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagemp_id_is_outlierage_is_outlieryears_experience_is_outlierprojects_completed_is_outlieravg_project_score_is_outliercertifications_is_outliertraining_hours_is_outlierovertime_hours_is_outlierremote_ratio_is_outliersalary_is_outlierbonus_is_outliersatisfaction_score_is_outlierperformance_rating_is_outlierpotential_score_is_outlierattrition_flag_is_outlier
1AliceHR30FNaN5.01287.514050.255000.0300.08.5475.00FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
2BobEngineering45MMasters20.03091.0320100.11200000.015000.09.0589.00FalseFalseFalseFalseFalseFalseFalseFalseFalseTrueFalseFalseFalseFalseFalse
3CharlieSales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
4DianaEngineering29FPhD6.02295.225020.097000.010000.09.6595.00FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
5EveNaN35FBachelors8.01588.013060.390000.08000.08.0485.00FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
6FrankHR50MHigh School25.0872.5010150.760000.04000.06.5260.01FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
7GraceSales42FBachelors18.02081.4125120.485000.07000.07.8374.00FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
8HenryEngineering31MMasters7.02593.123550.295000.09000.09.1590.00FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
9IvyFinance27FBachelors3.01085.002080.670000.05000.08.2482.00FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
10JackSales55MHigh School30.01268.905250.865000.02000.05.5150.01FalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalseFalse
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%outliers" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "ea459692-59ac-49be-8ba2-0c4b0b01908d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Outlier detection completed (non in-place). Summary:\n", - "Column 'id': detected 0 outlier(s) using zscore.\n", - "Column 'age': detected 0 outlier(s) using zscore.\n", - "Column 'salary': detected 0 outlier(s) using zscore.\n", - "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalaryid_is_outlierage_is_outliersalary_is_outlier
1AliceHR30.0000005000.0FalseFalseFalse
2BobUnknown40.000000178000.0FalseFalseFalse
3CharlieEngineering36.142857700000.0FalseFalseFalse
4DavidHR25.00000048000.0FalseFalseFalse
5EveUnknown35.000000178000.0FalseFalseFalse
6FrankEngineering28.00000072000.0FalseFalseFalse
7UnknownSales50.000000178000.0FalseFalseFalse
8GraceSales45.00000065000.0FalseFalseFalse
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%outliers method=zscore z_thresh=2.5" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "6a53e5f2-cdb4-4d72-afb1-5a7719be8835", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Outlier detection completed (non in-place). Summary:\n", - "Column 'id': detected 0 outlier(s) using iqr.\n", - "Column 'age': detected 0 outlier(s) using iqr.\n", - "Column 'salary': detected 1 outlier(s) using iqr.\n", - "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" - ] - }, - { - "data": { - "image/png": "" - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalaryid_is_outlierage_is_outliersalary_is_outlier
1AliceHR30.0000005000.0FalseFalseFalse
2BobUnknown40.000000178000.0FalseFalseFalse
3CharlieEngineering36.142857700000.0FalseFalseTrue
4DavidHR25.00000048000.0FalseFalseFalse
5EveUnknown35.000000178000.0FalseFalseFalse
6FrankEngineering28.00000072000.0FalseFalseFalse
7UnknownSales50.000000178000.0FalseFalseFalse
8GraceSales45.00000065000.0FalseFalseFalse
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%outliers plot=True" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "636a1612-e7b4-44fd-9cdb-bf7c935719b7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Marked outliers in-place. Summary:\n", - "Column 'salary': detected 2 outlier(s) using iqr.\n" - ] - }, - { - "data": { - "image/png": "" - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalarysalary_is_outlier
1AliceHR30.05000.0True
2BobNaN40.0NaNFalse
3CharlieEngineeringNaN700000.0True
4DavidHR25.048000.0False
5EveNaN35.0NaNFalse
6FrankEngineering28.072000.0False
7NaNSales50.0NaNFalse
8GraceSales45.065000.0False
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%outliers columns=salary plot=True" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "3dfbc841-56d4-40f5-9be0-927acf0e6c63", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PREVIEW (local): would drop 1 row(s) (from 10 to 9).\n", - "Column 'emp_id': detected 0 outlier(s) using iqr.\n", - "Column 'age': detected 0 outlier(s) using iqr.\n", - "Column 'years_experience': detected 0 outlier(s) using iqr.\n", - "Column 'projects_completed': detected 0 outlier(s) using iqr.\n", - "Column 'avg_project_score': detected 0 outlier(s) using iqr.\n", - "Column 'certifications': detected 0 outlier(s) using iqr.\n", - "Column 'training_hours': detected 0 outlier(s) using iqr.\n", - "Column 'overtime_hours': detected 0 outlier(s) using iqr.\n", - "Column 'remote_ratio': detected 0 outlier(s) using iqr.\n", - "Column 'salary': detected 1 outlier(s) using iqr.\n", - "Column 'bonus': detected 0 outlier(s) using iqr.\n", - "Column 'satisfaction_score': detected 0 outlier(s) using iqr.\n", - "Column 'performance_rating': detected 0 outlier(s) using iqr.\n", - "Column 'potential_score': detected 0 outlier(s) using iqr.\n", - "Column 'attrition_flag': detected 0 outlier(s) using iqr.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag_outlier_cols
2BobEngineering45MMasters20.03091.0320100.11200000.015000.09.0589.00
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%dropoutliers mode=preview" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "444a843d-c62d-4dde-8be4-298e62f2f92b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Apply completed: original preserved as test.employees_backup_53731015c85a478a.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FNaN51287.514050.255000.0300.08.5475.00
4DianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
5EveNaN35FBachelors81588.013060.390000.08000.08.0485.00
6FrankHR50MHigh School25872.5010150.760000.04000.06.5260.01
7GraceSales42FBachelors182081.4125120.485000.07000.07.8374.00
8HenryEngineering31MMasters72593.123550.295000.09000.09.1590.00
9IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
10JackSales55MHigh School301268.905250.865000.02000.05.5150.01
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%dropoutliers table=test.employees mode=apply confirm=true " - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "da62d244-57bb-46db-a98c-0f19fcf7073a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rollback: restored test.employees_backup_53731015c85a478a -> test.employees; previous test.employees renamed to test.employees_prerollback_53731015c85a478a.\n" - ] - } - ], - "source": [ - "%dropoutliers mode=rollback rollback_token=53731015c85a478a" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "fdc71a0a-2246-4754-8f52-642be4ea209f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No outliers detected. No rows removed.\n", - "Column 'id': detected 0 outlier(s) using zscore.\n", - "Column 'age': detected 0 outlier(s) using zscore.\n", - "Column 'salary': detected 0 outlier(s) using zscore.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalary
2BobNaN40.0NaN
4DavidHR25.048000.0
5EveNaN35.0NaN
6FrankEngineering28.072000.0
7NaNSales50.0NaN
8GraceSales45.065000.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%dropoutliers method=zscore z_thresh=2.5" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "b4c02d58-7020-4514-b84a-9a6b5e18fb40", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Clip outliers completed using iqr.\n", - "Column 'emp_id': clipped 0 value(s) (bounds: -18.5000, 59.5000).\n", - "Column 'age': clipped 0 value(s) (bounds: 7.5000, 67.5000).\n", - "Column 'years_experience': clipped 0 value(s) (bounds: -15.0000, 41.0000).\n", - "Column 'projects_completed': clipped 0 value(s) (bounds: -3.0000, 37.0000).\n", - "Column 'avg_project_score': clipped 0 value(s) (bounds: 61.7500, 108.5500).\n", - "Column 'certifications': clipped 0 value(s) (bounds: -3.0000, 5.0000).\n", - "Column 'training_hours': clipped 0 value(s) (bounds: -15.0000, 65.0000).\n", - "Column 'overtime_hours': clipped 0 value(s) (bounds: -10.0000, 30.0000).\n", - "Column 'remote_ratio': clipped 0 value(s) (bounds: -0.4000, 1.2000).\n", - "Column 'salary': clipped 0 value(s) (bounds: 20000.0000, 140000.0000).\n", - "Column 'bonus': clipped 0 value(s) (bounds: -3500.0000, 16500.0000).\n", - "Column 'satisfaction_score': clipped 0 value(s) (bounds: 4.5000, 11.7000).\n", - "Column 'performance_rating': clipped 0 value(s) (bounds: 0.0000, 8.0000).\n", - "Column 'potential_score': clipped 0 value(s) (bounds: 41.5000, 117.5000).\n", - "Column 'attrition_flag': clipped 0 value(s) (bounds: -1.5000, 2.5000).\n", - "Total values clipped: 0. 
Modified in-place: data['last_select'] updated.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FBachelors51287.514050.255000.03000.08.5475.00
2BobEngineering45MMasters203091.0320100.1120000.015000.09.0589.00
3CharlieSales38MBachelors101879.3015200.580000.07000.07.2370.01
4DianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
5EveFinance35FBachelors81588.013060.390000.08000.08.0485.00
6FrankHR50MHigh School25872.5010150.760000.04000.06.5260.01
7GraceSales42FBachelors182081.4125120.485000.07000.07.8374.00
8HenryEngineering31MMasters72593.123550.295000.09000.09.1590.00
9IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
10JackSales55MHigh School301268.905250.865000.02000.05.5150.01
11AliceHR30FBachelors51287.514050.255000.03000.08.5475.00
12BobEngineering45MMasters203091.0320100.1120000.015000.09.0589.00
13CharlieSales38MBachelors101879.3015200.580000.07000.07.2370.01
14DianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
15EveFinance35FBachelors81588.013060.390000.08000.08.0485.00
16FrankHR50MHigh School25872.5010150.760000.04000.06.5260.01
17GraceSales42FBachelors182081.4125120.485000.07000.07.8374.00
18HenryEngineering31MMasters72593.123550.295000.09000.09.1590.00
19IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
20JackSales55MHigh School301268.905250.865000.02000.05.5150.01
21AliceHR30FBachelors51287.514050.255000.03000.08.5475.00
22BobEngineering45MMasters203091.0320100.1120000.015000.09.0589.00
23CharlieSales38MBachelors101879.3015200.580000.07000.07.2370.01
24DianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
25EveFinance35FBachelors81588.013060.390000.08000.08.0485.00
26FrankHR50MHigh School25872.5010150.760000.04000.06.5260.01
27GraceSales42FBachelors182081.4125120.485000.07000.07.8374.00
28HenryEngineering31MMasters72593.123550.295000.09000.09.1590.00
29IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
30JackSales55MHigh School301268.905250.865000.02000.05.5150.01
31AliceHR30FBachelors51287.514050.255000.03000.08.5475.00
32BobEngineering45MMasters203091.0320100.1120000.015000.09.0589.00
33CharlieSales38MBachelors101879.3015200.580000.07000.07.2370.01
34DianaEngineering29FPhD62295.225020.097000.010000.09.6595.00
35EveFinance35FBachelors81588.013060.390000.08000.08.0485.00
36FrankHR50MHigh School25872.5010150.760000.04000.06.5260.01
37GraceSales42FBachelors182081.4125120.485000.07000.07.8374.00
38HenryEngineering31MMasters72593.123550.295000.09000.09.1590.00
39IvyFinance27FBachelors31085.002080.670000.05000.08.2482.00
40JackSales55MHigh School301268.905250.865000.02000.05.5150.01
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%clipoutliers" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "60e97acc-b37f-4197-97c1-2a6dcb768b0c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PREVIEW (local): would modify 1 value(s) across 15 column(s).\n", - "Column 'emp_id': would clip 0 value(s) locally (bounds: -3.5, 14.5).\n", - "Column 'age': would clip 0 value(s) locally (bounds: 9.25, 65.25).\n", - "Column 'years_experience': would clip 0 value(s) locally (bounds: -15.0, 41.0).\n", - "Column 'projects_completed': would clip 0 value(s) locally (bounds: -2.25, 35.75).\n", - "Column 'avg_project_score': would clip 0 value(s) locally (bounds: 64.1875, 105.88749999999999).\n", - "Column 'certifications': would clip 0 value(s) locally (bounds: -2.625, 4.375).\n", - "Column 'training_hours': would clip 0 value(s) locally (bounds: -10.0, 60.0).\n", - "Column 'overtime_hours': would clip 0 value(s) locally (bounds: -8.25, 27.75).\n", - "Column 'remote_ratio': would clip 0 value(s) locally (bounds: -0.3624999999999999, 1.1374999999999997).\n", - "Column 'salary': would clip 1 value(s) locally (bounds: 25000.0, 135000.0).\n", - "Column 'bonus': would clip 0 value(s) locally (bounds: -2500.0, 15500.0).\n", - "Column 'satisfaction_score': would clip 0 value(s) locally (bounds: 5.062499999999999, 11.162500000000001).\n", - "Column 'performance_rating': would clip 0 value(s) locally (bounds: 0.375, 7.375).\n", - "Column 'potential_score': would clip 0 value(s) locally (bounds: 45.5, 113.5).\n", - "Column 'attrition_flag': would clip 0 value(s) locally (bounds: -1.125, 1.875).\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag_oob_columns
2BobEngineering45MMasters20.03091.0320100.11200000.015000.09.0589.00salary
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%clipoutliers mode=preview" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "41e5e6ae-419f-4a60-afe1-dd0a64b62d6c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Apply completed: original preserved as test.employees_backup_656479791b1d48fc.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1.0AliceHR30.0FNaN5.012.087.51.040.05.00.255000.0300.08.54.075.00.0
2.0BobEngineering45.0MMasters20.030.091.03.020.010.00.1135000.015000.09.05.089.00.0
3.0CharlieSales38.0MBachelorsNaN18.079.30.015.020.00.580000.07000.07.23.070.01.0
4.0DianaEngineering29.0FPhD6.022.095.22.050.02.00.097000.010000.09.65.095.00.0
5.0EveNaN35.0FBachelors8.015.088.01.030.06.00.390000.08000.08.04.085.00.0
6.0FrankHR50.0MHigh School25.08.072.50.010.015.00.760000.04000.06.52.060.01.0
7.0GraceSales42.0FBachelors18.020.081.41.025.012.00.485000.07000.07.83.074.00.0
8.0HenryEngineering31.0MMasters7.025.093.12.035.05.00.295000.09000.09.15.090.00.0
9.0IvyFinance27.0FBachelors3.010.085.00.020.08.00.670000.05000.08.24.082.00.0
10.0JackSales55.0MHigh School30.012.068.90.05.025.00.865000.02000.05.51.050.01.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%clipoutliers table=test.employees mode=apply confirm=true " - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "78200a79-5413-47a2-ad46-c931b9d05d63", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rollback: restored test.employees_backup_656479791b1d48fc -> test.employees; previous test.employees renamed to test.employees_prerollback_656479791b1d48fc.\n" - ] - } - ], - "source": [ - "%clipoutliers mode=rollback rollback_token=656479791b1d48fc" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "d146b7c7-8860-4962-a9e8-78675b068982", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Clip outliers completed using zscore.\n", - "Column 'emp_id': clipped 0 value(s) (bounds: -2.8809, 43.8809).\n", - "Column 'age': clipped 0 value(s) (bounds: 19.8406, 56.5594).\n", - "Column 'years_experience': clipped 0 value(s) (bounds: -4.7983, 31.1983).\n", - "Column 'projects_completed': clipped 0 value(s) (bounds: 3.5885, 30.8115).\n", - "Column 'avg_project_score': clipped 0 value(s) (bounds: 67.5459, 100.8341).\n", - "Column 'certifications': clipped 0 value(s) (bounds: -1.0255, 3.0255).\n", - "Column 'training_hours': clipped 0 value(s) (bounds: -1.7946, 51.7946).\n", - "Column 'overtime_hours': clipped 4 value(s) (bounds: -3.2563, 24.8563).\n", - "Column 'remote_ratio': clipped 0 value(s) (bounds: -0.1308, 0.8908).\n", - "Column 'salary': clipped 4 value(s) (bounds: 43482.8069, 119917.1931).\n", - "Column 'bonus': clipped 4 value(s) (bounds: -358.9297, 14358.9297).\n", - "Column 'satisfaction_score': clipped 4 value(s) (bounds: 5.5260, 10.3540).\n", - "Column 'performance_rating': clipped 4 value(s) (bounds: 1.0061, 6.1939).\n", - "Column 'potential_score': clipped 0 value(s) (bounds: 49.7801, 104.2199).\n", - "Column 'attrition_flag': clipped 0 value(s) 
(bounds: -0.6282, 1.2282).\n", - "Total values clipped: 20. Modified in-place: data['last_select'] updated.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FBachelors51287.51405.0000000.255000.000003000.0000008.5000004.00000075.00
2BobEngineering45MMasters203091.032010.0000000.1119917.1930714358.9296889.0000005.00000089.00
3CharlieSales38MBachelors101879.301520.0000000.580000.000007000.0000007.2000003.00000070.01
4DianaEngineering29FPhD62295.22502.0000000.097000.0000010000.0000009.6000005.00000095.00
5EveFinance35FBachelors81588.01306.0000000.390000.000008000.0000008.0000004.00000085.00
6FrankHR50MHigh School25872.501015.0000000.760000.000004000.0000006.5000002.00000060.01
7GraceSales42FBachelors182081.412512.0000000.485000.000007000.0000007.8000003.00000074.00
8HenryEngineering31MMasters72593.12355.0000000.295000.000009000.0000009.1000005.00000090.00
9IvyFinance27FBachelors31085.00208.0000000.670000.000005000.0000008.2000004.00000082.00
10JackSales55MHigh School301268.90524.8562970.865000.000002000.0000005.5260241.00612250.01
11AliceHR30FBachelors51287.51405.0000000.255000.000003000.0000008.5000004.00000075.00
12BobEngineering45MMasters203091.032010.0000000.1119917.1930714358.9296889.0000005.00000089.00
13CharlieSales38MBachelors101879.301520.0000000.580000.000007000.0000007.2000003.00000070.01
14DianaEngineering29FPhD62295.22502.0000000.097000.0000010000.0000009.6000005.00000095.00
15EveFinance35FBachelors81588.01306.0000000.390000.000008000.0000008.0000004.00000085.00
16FrankHR50MHigh School25872.501015.0000000.760000.000004000.0000006.5000002.00000060.01
17GraceSales42FBachelors182081.412512.0000000.485000.000007000.0000007.8000003.00000074.00
18HenryEngineering31MMasters72593.12355.0000000.295000.000009000.0000009.1000005.00000090.00
19IvyFinance27FBachelors31085.00208.0000000.670000.000005000.0000008.2000004.00000082.00
20JackSales55MHigh School301268.90524.8562970.865000.000002000.0000005.5260241.00612250.01
21AliceHR30FBachelors51287.51405.0000000.255000.000003000.0000008.5000004.00000075.00
22BobEngineering45MMasters203091.032010.0000000.1119917.1930714358.9296889.0000005.00000089.00
23CharlieSales38MBachelors101879.301520.0000000.580000.000007000.0000007.2000003.00000070.01
24DianaEngineering29FPhD62295.22502.0000000.097000.0000010000.0000009.6000005.00000095.00
25EveFinance35FBachelors81588.01306.0000000.390000.000008000.0000008.0000004.00000085.00
26FrankHR50MHigh School25872.501015.0000000.760000.000004000.0000006.5000002.00000060.01
27GraceSales42FBachelors182081.412512.0000000.485000.000007000.0000007.8000003.00000074.00
28HenryEngineering31MMasters72593.12355.0000000.295000.000009000.0000009.1000005.00000090.00
29IvyFinance27FBachelors31085.00208.0000000.670000.000005000.0000008.2000004.00000082.00
30JackSales55MHigh School301268.90524.8562970.865000.000002000.0000005.5260241.00612250.01
31AliceHR30FBachelors51287.51405.0000000.255000.000003000.0000008.5000004.00000075.00
32BobEngineering45MMasters203091.032010.0000000.1119917.1930714358.9296889.0000005.00000089.00
33CharlieSales38MBachelors101879.301520.0000000.580000.000007000.0000007.2000003.00000070.01
34DianaEngineering29FPhD62295.22502.0000000.097000.0000010000.0000009.6000005.00000095.00
35EveFinance35FBachelors81588.01306.0000000.390000.000008000.0000008.0000004.00000085.00
36FrankHR50MHigh School25872.501015.0000000.760000.000004000.0000006.5000002.00000060.01
37GraceSales42FBachelors182081.412512.0000000.485000.000007000.0000007.8000003.00000074.00
38HenryEngineering31MMasters72593.12355.0000000.295000.000009000.0000009.1000005.00000090.00
39IvyFinance27FBachelors31085.00208.0000000.670000.000005000.0000008.2000004.00000082.00
40JackSales55MHigh School301268.90524.8562970.865000.000002000.0000005.5260241.00612250.01
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%clipoutliers method=zscore z_thresh=2.0 " - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "61b6b28f-9b56-4d7f-8613-a406928dbb1c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Encoded columns in-place and updated last_select.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagname_lbldepartment_lblgender_lbleducation_level_lbl
1305.01287.514050.255000.0300.08.5475.00020-1
24520.03091.0320100.11200000.015000.09.0589.001012
338NaN1879.3015200.580000.07000.07.2370.012310
4296.02295.225020.097000.010000.09.6595.003003
5358.01588.013060.390000.08000.08.0485.004-100
65025.0872.5010150.760000.04000.06.5260.015211
74218.02081.4125120.485000.07000.07.8374.006300
8317.02593.123550.295000.09000.09.1590.007012
9273.01085.002080.670000.05000.08.2482.008100
105530.01268.905250.865000.02000.05.5150.019311
11305.01287.514050.255000.0300.08.5475.00020-1
124520.03091.0320100.11200000.015000.09.0589.001012
1338NaN1879.3015200.580000.07000.07.2370.012310
14296.02295.225020.097000.010000.09.6595.003003
15358.01588.013060.390000.08000.08.0485.004-100
165025.0872.5010150.760000.04000.06.5260.015211
174218.02081.4125120.485000.07000.07.8374.006300
18317.02593.123550.295000.09000.09.1590.007012
19273.01085.002080.670000.05000.08.2482.008100
205530.01268.905250.865000.02000.05.5150.019311
21305.01287.514050.255000.0300.08.5475.00020-1
224520.03091.0320100.11200000.015000.09.0589.001012
2338NaN1879.3015200.580000.07000.07.2370.012310
24296.02295.225020.097000.010000.09.6595.003003
25358.01588.013060.390000.08000.08.0485.004-100
265025.0872.5010150.760000.04000.06.5260.015211
274218.02081.4125120.485000.07000.07.8374.006300
28317.02593.123550.295000.09000.09.1590.007012
29273.01085.002080.670000.05000.08.2482.008100
305530.01268.905250.865000.02000.05.5150.019311
31305.01287.514050.255000.0300.08.5475.00020-1
324520.03091.0320100.11200000.015000.09.0589.001012
3338NaN1879.3015200.580000.07000.07.2370.012310
34296.02295.225020.097000.010000.09.6595.003003
35358.01588.013060.390000.08000.08.0485.004-100
365025.0872.5010150.760000.04000.06.5260.015211
374218.02081.4125120.485000.07000.07.8374.006300
38317.02593.123550.295000.09000.09.1590.007012
39273.01085.002080.670000.05000.08.2482.008100
405530.01268.905250.865000.02000.05.5150.019311
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%encode method=label drop_original=true mode=apply" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "66a8378a-6f87-4b38-a729-5aab1a288cb0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PREVIEW (local):\n", - "Local: Column 'department' unique non-null values: 4 (showing up to 10): ['HR', 'Engineering', 'Sales', 'Finance']\n", - "PREVIEW (local) estimated created columns: 4\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1AliceHR30FNaN5.01287.514050.255000.0300.08.5475.00
2BobEngineering45MMasters20.03091.0320100.11200000.015000.09.0589.00
3CharlieSales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01
4DianaEngineering29FPhD6.02295.225020.097000.010000.09.6595.00
6FrankHR50MHigh School25.0872.5010150.760000.04000.06.5260.01
7GraceSales42FBachelors18.02081.4125120.485000.07000.07.8374.00
8HenryEngineering31MMasters7.02593.123550.295000.09000.09.1590.00
9IvyFinance27FBachelors3.01085.002080.670000.05000.08.2482.00
10JackSales55MHigh School30.01268.905250.865000.02000.05.5150.01
11AliceHR30FNaN5.01287.514050.255000.0300.08.5475.00
12BobEngineering45MMasters20.03091.0320100.11200000.015000.09.0589.00
13CharlieSales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01
14DianaEngineering29FPhD6.02295.225020.097000.010000.09.6595.00
16FrankHR50MHigh School25.0872.5010150.760000.04000.06.5260.01
17GraceSales42FBachelors18.02081.4125120.485000.07000.07.8374.00
18HenryEngineering31MMasters7.02593.123550.295000.09000.09.1590.00
19IvyFinance27FBachelors3.01085.002080.670000.05000.08.2482.00
20JackSales55MHigh School30.01268.905250.865000.02000.05.5150.01
21AliceHR30FNaN5.01287.514050.255000.0300.08.5475.00
22BobEngineering45MMasters20.03091.0320100.11200000.015000.09.0589.00
23CharlieSales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01
24DianaEngineering29FPhD6.02295.225020.097000.010000.09.6595.00
26FrankHR50MHigh School25.0872.5010150.760000.04000.06.5260.01
27GraceSales42FBachelors18.02081.4125120.485000.07000.07.8374.00
28HenryEngineering31MMasters7.02593.123550.295000.09000.09.1590.00
29IvyFinance27FBachelors3.01085.002080.670000.05000.08.2482.00
30JackSales55MHigh School30.01268.905250.865000.02000.05.5150.01
31AliceHR30FNaN5.01287.514050.255000.0300.08.5475.00
32BobEngineering45MMasters20.03091.0320100.11200000.015000.09.0589.00
33CharlieSales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01
34DianaEngineering29FPhD6.02295.225020.097000.010000.09.6595.00
36FrankHR50MHigh School25.0872.5010150.760000.04000.06.5260.01
37GraceSales42FBachelors18.02081.4125120.485000.07000.07.8374.00
38HenryEngineering31MMasters7.02593.123550.295000.09000.09.1590.00
39IvyFinance27FBachelors3.01085.002080.670000.05000.08.2482.00
40JackSales55MHigh School30.01268.905250.865000.02000.05.5150.01
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%encode method=onehot columns=department drop_original=false mode=preview" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "2ebe63dd-3817-423b-bfb1-8e4b57dfb0a9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PREVIEW (local):\n", - "Local: Column 'department' unique non-null values: 4 (showing up to 10): ['HR', 'Engineering', 'Sales', 'Finance']\n", - "PREVIEW (local) estimated created columns: 4\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idnamedepartment_HRdepartment_Engineeringdepartment_Salesdepartment_NULLdepartment_Financedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1Alice10000HR30FNaN5.01287.514050.255000.0300.08.5475.00
2Bob01000Engineering45MMasters20.03091.0320100.11200000.015000.09.0589.00
3Charlie00100Sales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01
4Diana01000Engineering29FPhD6.02295.225020.097000.010000.09.6595.00
6Frank10000HR50MHigh School25.0872.5010150.760000.04000.06.5260.01
7Grace00100Sales42FBachelors18.02081.4125120.485000.07000.07.8374.00
8Henry01000Engineering31MMasters7.02593.123550.295000.09000.09.1590.00
9Ivy00001Finance27FBachelors3.01085.002080.670000.05000.08.2482.00
10Jack00100Sales55MHigh School30.01268.905250.865000.02000.05.5150.01
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%encode method=onehot columns=department drop_original=false " - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "4760a0c0-869a-4e77-bce1-8985bca8006f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Apply completed: original preserved as test.employees_backup_8d3413bf829d4cd8.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idnamedepartment_HRdepartment_Engineeringdepartment_Salesdepartment_NULLdepartment_Financedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
1Alice10000HR30FNaN5.01287.514050.255000.0300.08.5475.00
2Bob01000Engineering45MMasters20.03091.0320100.11200000.015000.09.0589.00
3Charlie00100Sales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01
4Diana01000Engineering29FPhD6.02295.225020.097000.010000.09.6595.00
5Eve00010NaN35FBachelors8.01588.013060.390000.08000.08.0485.00
6Frank10000HR50MHigh School25.0872.5010150.760000.04000.06.5260.01
7Grace00100Sales42FBachelors18.02081.4125120.485000.07000.07.8374.00
8Henry01000Engineering31MMasters7.02593.123550.295000.09000.09.1590.00
9Ivy00001Finance27FBachelors3.01085.002080.670000.05000.08.2482.00
10Jack00100Sales55MHigh School30.01268.905250.865000.02000.05.5150.01
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%encode method=onehot columns=department drop_original=false table=test.employees mode=apply confirm=true " - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "65d25956-0729-4e5f-9f41-d91ab3361655", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rollback: restored test.employees_backup_8d3413bf829d4cd8 -> test.employees; previous test.employees renamed to test.employees_prerollback_8d3413bf829d4cd8.\n" - ] - } - ], - "source": [ - "%encode mode=rollback rollback_token=8d3413bf829d4cd8" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "ac369830-92cc-44ba-b7b2-cfa7b620aafe", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Encoded columns in-place and updated last_select.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalarydepartment_Engineeringdepartment_HRdepartment_Salesdepartment_Unknown
1AliceHR30.05000.00.01.00.00.0
2BobUnknown40.065000.00.00.00.01.0
3CharlieEngineering35.0700000.01.00.00.00.0
4DavidHR25.048000.00.01.00.00.0
5EveUnknown35.065000.00.00.00.01.0
6FrankEngineering28.072000.01.00.00.00.0
7UnknownSales50.065000.00.00.01.00.0
8GraceSales45.065000.00.00.01.00.0
9AliceHR30.05000.00.01.00.00.0
10BobUnknown40.065000.00.00.00.01.0
11CharlieEngineering35.0700000.01.00.00.00.0
12DavidHR25.048000.00.01.00.00.0
13EveUnknown35.065000.00.00.00.01.0
14FrankEngineering28.072000.01.00.00.00.0
15UnknownSales50.065000.00.00.01.00.0
16GraceSales45.065000.00.00.01.00.0
17AliceHR30.05000.00.01.00.00.0
18BobUnknown40.065000.00.00.00.01.0
19CharlieEngineering35.0700000.01.00.00.00.0
20DavidHR25.048000.00.01.00.00.0
21EveUnknown35.065000.00.00.00.01.0
22FrankEngineering28.072000.01.00.00.00.0
23UnknownSales50.065000.00.00.01.00.0
24GraceSales45.065000.00.00.01.00.0
25AliceHR30.05000.00.01.00.00.0
26BobUnknown40.065000.00.00.00.01.0
27CharlieEngineering35.0700000.01.00.00.00.0
28DavidHR25.048000.00.01.00.00.0
29EveUnknown35.065000.00.00.00.01.0
30FrankEngineering28.072000.01.00.00.00.0
31UnknownSales50.065000.00.00.01.00.0
32GraceSales45.065000.00.00.01.00.0
33AliceHR30.05000.00.01.00.00.0
34BobUnknown40.065000.00.00.00.01.0
35CharlieEngineering35.0700000.01.00.00.00.0
36DavidHR25.048000.00.01.00.00.0
37EveUnknown35.065000.00.00.00.01.0
38FrankEngineering28.072000.01.00.00.00.0
39UnknownSales50.065000.00.00.01.00.0
40GraceSales45.065000.00.00.01.00.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%encode method=onehot columns=department drop_original=false" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "ed2b1c8f-4372-4cb4-9edb-f27bfa07ee83", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Encoded columns in-place and updated last_select.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalarydepartment_lbldepartment_Engineeringdepartment_HRdepartment_Salesdepartment_Unknowndepartment_ord
1AliceHR30.05000.010.01.00.00.01.0
2BobUnknown40.0NaN30.00.00.01.03.0
3CharlieEngineeringNaN700000.001.00.00.00.00.0
4DavidHR25.048000.010.01.00.00.01.0
5EveUnknown35.0NaN30.00.00.01.03.0
6FrankEngineering28.072000.001.00.00.00.00.0
7UnknownSales50.0NaN20.00.01.00.02.0
8GraceSales45.065000.020.00.01.00.02.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%encode method=ordinal columns=department drop_original=false" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "ab1491d5-4a2b-46e8-a079-dab57ae95afe", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Normalized 3 column(s) to range (0.0, 1.0). Updated data['last_select'] in-place.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalary
0.000000AliceHR0.200.000000
0.142857BobNaN0.60NaN
0.285714CharlieEngineeringNaN1.000000
0.428571DavidHR0.000.061871
0.571429EveNaN0.40NaN
0.714286FrankEngineering0.120.096403
0.857143NaNSales1.00NaN
1.000000GraceSales0.800.086331
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%normalize" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "1708d88d-db07-40cb-aeef-fcb6baffe649", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PREVIEW (local):\n", - "Local: Column 'emp_id' min=1.0, max=40.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'age' min=27.0, max=55.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'years_experience' min=3.0, max=30.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'projects_completed' min=8.0, max=30.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'avg_project_score' min=68.9, max=95.2 -> range will map to (5.0, 10.0)\n", - "Local: Column 'certifications' min=0.0, max=3.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'training_hours' min=5.0, max=50.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'overtime_hours' min=2.0, max=25.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'remote_ratio' min=0.0, max=0.8 -> range will map to (5.0, 10.0)\n", - "Local: Column 'salary' min=55000.0, max=1200000.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'bonus' min=300.0, max=15000.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'satisfaction_score' min=5.5, max=9.6 -> range will map to (5.0, 10.0)\n", - "Local: Column 'performance_rating' min=1.0, max=5.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'potential_score' min=50.0, max=95.0 -> range will map to (5.0, 10.0)\n", - "Local: Column 'attrition_flag' min=0.0, max=1.0 -> range will map to (5.0, 10.0)\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagemp_id_norm_previewage_norm_previewyears_experience_norm_previewprojects_completed_norm_previewavg_project_score_norm_previewcertifications_norm_previewtraining_hours_norm_previewovertime_hours_norm_previewremote_ratio_norm_previewsalary_norm_previewbonus_norm_previewsatisfaction_score_norm_previewperformance_rating_norm_previewpotential_score_norm_previewattrition_flag_norm_preview
1305.01287.514050.255000.0300.08.5475.005.0000005.5357145.3703705.9090918.5361226.6666678.8888895.6521746.2505.0000005.0000008.6585378.757.7777785.0
24520.03091.0320100.11200000.015000.09.0589.005.1282058.2142868.14814810.0000009.20152110.0000006.6666676.7391305.62510.00000010.0000009.26829310.009.3333335.0
338NaN1879.3015200.580000.07000.07.2370.015.2564106.964286NaN7.2727276.9771865.0000006.1111118.9130438.1255.1091707.2789127.0731717.507.22222210.0
4296.02295.225020.097000.010000.09.6595.005.3846155.3571435.5555568.18181810.0000008.33333310.0000005.0000005.0005.1834068.29932010.00000010.0010.0000005.0
5358.01588.013060.390000.08000.08.0485.005.5128216.4285715.9259266.5909098.6311796.6666677.7777785.8695656.8755.1528387.6190488.0487808.758.8888895.0
65025.0872.5010150.760000.04000.06.5260.015.6410269.1071439.0740745.0000005.6844115.0000005.5555567.8260879.3755.0218346.2585036.2195126.256.11111110.0
74218.02081.4125120.485000.07000.07.8374.005.7692317.6785717.7777787.7272737.3764266.6666677.2222227.1739137.5005.1310047.2789127.8048787.507.6666675.0
8317.02593.123550.295000.09000.09.1590.005.8974365.7142865.7407418.8636369.6007608.3333338.3333335.6521746.2505.1746727.9591849.39024410.009.4444445.0
9273.01085.002080.670000.05000.08.2482.006.0256415.0000005.0000005.4545458.0608375.0000006.6666676.3043488.7505.0655026.5986398.2926838.758.5555565.0
105530.01268.905250.865000.02000.05.5150.016.15384610.00000010.0000005.9090915.0000005.0000005.00000010.00000010.0005.0436685.5782315.0000005.005.00000010.0
11305.01287.514050.255000.0300.08.5475.006.2820515.5357145.3703705.9090918.5361226.6666678.8888895.6521746.2505.0000005.0000008.6585378.757.7777785.0
124520.03091.0320100.11200000.015000.09.0589.006.4102568.2142868.14814810.0000009.20152110.0000006.6666676.7391305.62510.00000010.0000009.26829310.009.3333335.0
1338NaN1879.3015200.580000.07000.07.2370.016.5384626.964286NaN7.2727276.9771865.0000006.1111118.9130438.1255.1091707.2789127.0731717.507.22222210.0
14296.02295.225020.097000.010000.09.6595.006.6666675.3571435.5555568.18181810.0000008.33333310.0000005.0000005.0005.1834068.29932010.00000010.0010.0000005.0
15358.01588.013060.390000.08000.08.0485.006.7948726.4285715.9259266.5909098.6311796.6666677.7777785.8695656.8755.1528387.6190488.0487808.758.8888895.0
165025.0872.5010150.760000.04000.06.5260.016.9230779.1071439.0740745.0000005.6844115.0000005.5555567.8260879.3755.0218346.2585036.2195126.256.11111110.0
174218.02081.4125120.485000.07000.07.8374.007.0512827.6785717.7777787.7272737.3764266.6666677.2222227.1739137.5005.1310047.2789127.8048787.507.6666675.0
18317.02593.123550.295000.09000.09.1590.007.1794875.7142865.7407418.8636369.6007608.3333338.3333335.6521746.2505.1746727.9591849.39024410.009.4444445.0
19273.01085.002080.670000.05000.08.2482.007.3076925.0000005.0000005.4545458.0608375.0000006.6666676.3043488.7505.0655026.5986398.2926838.758.5555565.0
205530.01268.905250.865000.02000.05.5150.017.43589710.00000010.0000005.9090915.0000005.0000005.00000010.00000010.0005.0436685.5782315.0000005.005.00000010.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%normalize feature_range=5,10 mode=preview" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "6685559f-8986-4504-97eb-e62e20275bd3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Apply completed: original preserved as test.employees_backup_daf864252a6c46f1.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
5.000000AliceHR5.535714FNaN5.3703705.9090918.5361226.6666678.8888895.6521746.2505.0000005.0000008.6585378.757.7777785.0
5.555556BobEngineering8.214286MMasters8.14814810.0000009.20152110.0000006.6666676.7391305.62510.00000010.0000009.26829310.009.3333335.0
6.111111CharlieSales6.964286MBachelorsNaN7.2727276.9771865.0000006.1111118.9130438.1255.1091707.2789127.0731717.507.22222210.0
6.666667DianaEngineering5.357143FPhD5.5555568.18181810.0000008.33333310.0000005.0000005.0005.1834068.29932010.00000010.0010.0000005.0
7.222222EveNaN6.428571FBachelors5.9259266.5909098.6311796.6666677.7777785.8695656.8755.1528387.6190488.0487808.758.8888895.0
7.777778FrankHR9.107143MHigh School9.0740745.0000005.6844115.0000005.5555567.8260879.3755.0218346.2585036.2195126.256.11111110.0
8.333333GraceSales7.678571FBachelors7.7777787.7272737.3764266.6666677.2222227.1739137.5005.1310047.2789127.8048787.507.6666675.0
8.888889HenryEngineering5.714286MMasters5.7407418.8636369.6007608.3333338.3333335.6521746.2505.1746727.9591849.39024410.009.4444445.0
9.444444IvyFinance5.000000FBachelors5.0000005.4545458.0608375.0000006.6666676.3043488.7505.0655026.5986398.2926838.758.5555565.0
10.000000JackSales10.000000MHigh School10.0000005.9090915.0000005.0000005.00000010.00000010.0005.0436685.5782315.0000005.005.00000010.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%normalize feature_range=5,10 table=test.employees mode=apply confirm=true " - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "e5bb1249-ba1e-4a9d-86f0-58b528c0e465", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rollback: restored test.employees_backup_daf864252a6c46f1 -> test.employees; previous test.employees renamed to test.employees_prerollback_daf864252a6c46f1.\n" - ] - } - ], - "source": [ - "%normalize mode=rollback rollback_token=daf864252a6c46f1" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "5cc8a8b5-011e-4285-89f7-a7ee13b0da22", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PREVIEW (local):\n", - "Local: Column 'emp_id': mean=20.5, std=11.543396380615196\n", - "Local: Column 'age': mean=38.2, std=9.064215354899728\n", - "Local: Column 'years_experience': mean=13.555555555555555, std=9.2988782012923\n", - "Local: Column 'projects_completed': mean=17.2, std=6.7201190465645775\n", - "Local: Column 'avg_project_score': mean=84.19000000000001, std=8.217353588595294\n", - "Local: Column 'certifications': mean=1.0, std=1.0\n", - "Local: Column 'training_hours': mean=25.0, std=13.228756555322953\n", - "Local: Column 'overtime_hours': mean=10.8, std=6.939740629158989\n", - "Local: Column 'remote_ratio': mean=0.38, std=0.2521904042583698\n", - "Local: Column 'salary': mean=189700.0, std=337053.12637624354\n", - "Local: Column 'bonus': mean=6730.0, std=4002.0119939850256\n", - "Local: Column 'satisfaction_score': mean=7.94, std=1.1918053532351665\n", - "Local: Column 'performance_rating': mean=3.6, std=1.2806248474865698\n", - "Local: Column 'potential_score': mean=77.0, std=13.438749941865872\n", - "Local: Column 'attrition_flag': mean=0.3, std=0.45825756949558394\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagemp_id_std_previewage_std_previewyears_experience_std_previewprojects_completed_std_previewavg_project_score_std_previewcertifications_std_previewtraining_hours_std_previewovertime_hours_std_previewremote_ratio_std_previewsalary_std_previewbonus_std_previewsatisfaction_score_std_previewperformance_rating_std_previewpotential_score_std_previewattrition_flag_std_preview
1305.01287.514050.255000.0300.08.5475.00-1.689278-0.904656-0.920063-0.7737960.4028060.01.133893-0.835766-0.713746-0.399640-1.6066920.4698750.312348-0.148823-0.654654
24520.03091.0320100.11200000.015000.09.0589.00-1.6026480.7502030.6930351.9047280.8287342.0-0.377964-0.115278-1.1102722.9974502.0664610.8894071.0932160.892940-0.654654
338NaN1879.3015200.580000.07000.07.2370.01-1.516018-0.022065NaN0.119046-0.595082-1.0-0.7559291.3256980.475831-0.3254680.067466-0.620907-0.468521-0.5208821.527525
4296.02295.225020.097000.010000.09.6595.00-1.429389-1.014980-0.8125230.7142731.3398471.01.889822-1.268059-1.506798-0.2750310.8170891.3928451.0932161.339410-0.654654
5358.01588.013060.390000.08000.08.0485.00-1.342759-0.353037-0.597444-0.3273750.4636530.00.377964-0.691669-0.317221-0.2957990.3173400.0503440.3123480.595293-0.654654
65025.0872.5010150.760000.04000.06.5260.01-1.2561291.3018231.230734-1.369023-1.422599-1.0-1.1338930.6052101.268883-0.384806-0.682157-1.208251-1.249390-1.2649991.527525
74218.02081.4125120.485000.07000.07.8374.00-1.1695000.4192310.4779550.416659-0.3395250.00.0000000.1729170.079305-0.3106340.067466-0.117469-0.468521-0.223235-0.654654
8317.02593.123550.295000.09000.09.1590.00-1.082870-0.794332-0.7049831.1606941.0842911.00.755929-0.835766-0.713746-0.2809650.5672150.9733131.0932160.967352-0.654654
9273.01085.002080.670000.05000.08.2482.00-0.996241-1.235628-1.135143-1.0714100.098572-1.0-0.377964-0.4034730.872357-0.355137-0.4322830.2181560.3123480.372058-0.654654
105530.01268.905250.865000.02000.05.5150.01-0.9096111.8534421.768433-0.773796-1.860696-1.0-1.5118582.0461861.665408-0.369971-1.181906-2.047314-2.030259-2.0091151.527525
11305.01287.514050.255000.0300.08.5475.00-0.822981-0.904656-0.920063-0.7737960.4028060.01.133893-0.835766-0.713746-0.399640-1.6066920.4698750.312348-0.148823-0.654654
124520.03091.0320100.11200000.015000.09.0589.00-0.7363520.7502030.6930351.9047280.8287342.0-0.377964-0.115278-1.1102722.9974502.0664610.8894071.0932160.892940-0.654654
1338NaN1879.3015200.580000.07000.07.2370.01-0.649722-0.022065NaN0.119046-0.595082-1.0-0.7559291.3256980.475831-0.3254680.067466-0.620907-0.468521-0.5208821.527525
14296.02295.225020.097000.010000.09.6595.00-0.563093-1.014980-0.8125230.7142731.3398471.01.889822-1.268059-1.506798-0.2750310.8170891.3928451.0932161.339410-0.654654
15358.01588.013060.390000.08000.08.0485.00-0.476463-0.353037-0.597444-0.3273750.4636530.00.377964-0.691669-0.317221-0.2957990.3173400.0503440.3123480.595293-0.654654
165025.0872.5010150.760000.04000.06.5260.01-0.3898331.3018231.230734-1.369023-1.422599-1.0-1.1338930.6052101.268883-0.384806-0.682157-1.208251-1.249390-1.2649991.527525
174218.02081.4125120.485000.07000.07.8374.00-0.3032040.4192310.4779550.416659-0.3395250.00.0000000.1729170.079305-0.3106340.067466-0.117469-0.468521-0.223235-0.654654
18317.02593.123550.295000.09000.09.1590.00-0.216574-0.794332-0.7049831.1606941.0842911.00.755929-0.835766-0.713746-0.2809650.5672150.9733131.0932160.967352-0.654654
19273.01085.002080.670000.05000.08.2482.00-0.129944-1.235628-1.135143-1.0714100.098572-1.0-0.377964-0.4034730.872357-0.355137-0.4322830.2181560.3123480.372058-0.654654
205530.01268.905250.865000.02000.05.5150.01-0.0433151.8534421.768433-0.773796-1.860696-1.0-1.5118582.0461861.665408-0.369971-1.181906-2.047314-2.030259-2.0091151.527525
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%standardize mode=preview" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "25f88a13-b173-4b8e-a2d0-4313b6f47b0c", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Apply completed: original preserved as test.employees_backup_f15a883f004548a8.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag
-1.56669AliceHR-0.90466FNaN-0.920066-0.773800.4028060.01.13389-0.83577-0.713748-0.399640-1.6066920.4698760.31235-0.148823-0.65459
-1.21854BobEngineering0.75020MMasters0.6930281.904730.8287342.0-0.37796-0.11528-1.1102742.9974502.0664610.8894071.093240.892940-0.65459
-0.87038CharlieSales-0.02206MBachelorsNaN0.11905-0.595082-1.0-0.755931.325710.475832-0.3254680.067466-0.620907-0.46853-0.5208821.52738
-0.52223DianaEngineering-1.01498FPhD-0.8125260.714281.3398471.01.88982-1.26807-1.506800-0.2750310.8170891.3928451.093241.339410-0.65459
-0.17408EveNaN-0.35304FBachelors-0.597447-0.327380.4636530.00.37796-0.69167-0.317221-0.2957990.3173400.0503440.312350.595293-0.65459
0.17408FrankHR1.30182MHigh School1.230726-1.36903-1.422599-1.0-1.133890.605211.268885-0.384806-0.682157-1.208251-1.24941-1.2649991.52738
0.52223GraceSales0.41923FBachelors0.4779490.41666-0.3395250.00.000000.172920.079305-0.3106340.067466-0.117469-0.46853-0.223235-0.65459
0.87038HenryEngineering-0.79433MMasters-0.7049871.160701.0842911.00.75593-0.83577-0.713748-0.2809650.5672150.9733141.093240.967352-0.65459
1.21854IvyFinance-1.23563FBachelors-1.135145-1.071410.098572-1.0-0.37796-0.403480.872358-0.355137-0.4322830.2181560.312350.372058-0.65459
1.56669JackSales1.85345MHigh School1.768424-0.77380-1.860696-1.0-1.511852.046201.665411-0.369971-1.181906-2.047315-2.03030-2.0091151.52738
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%standardize table=test.employees mode=apply confirm=true " - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "7c023d4e-a273-4322-bcf2-6eb5a1e290dc", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Rollback: restored test.employees_backup_f15a883f004548a8 -> test.employees; previous test.employees renamed to test.employees_prerollback_f15a883f004548a8.\n" - ] - } - ], - "source": [ - "%standardize mode=rollback rollback_token=f15a883f004548a8" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "92f0fb87-521e-43dc-8604-9f4342d446e2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Standardized 2 column(s) (mean=0, std=1). Stored in data['last_select_standardized'].\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagdepartment_lbl
1AliceHR-0.904656FBachelors51287.514050.2-1.4150773000.08.5475.002
2BobEngineering0.750203MMasters203091.0320100.12.02986715000.09.0589.000
3CharlieSales-0.022065MBachelors101879.3015200.5-0.0900997000.07.2370.013
4DianaEngineering-1.014980FPhD62295.225020.00.81088710000.09.6595.000
5EveFinance-0.353037FBachelors81588.013060.30.4398938000.08.0485.001
6FrankHR1.301823MHigh School25872.5010150.7-1.1500824000.06.5260.012
7GraceSales0.419231FBachelors182081.4125120.40.1748977000.07.8374.003
8HenryEngineering-0.794332MMasters72593.123550.20.7048899000.09.1590.000
9IvyFinance-1.235628FBachelors31085.002080.6-0.6200905000.08.2482.001
10JackSales1.853442MHigh School301268.905250.8-0.8850862000.05.5150.013
11AliceHR-0.904656FBachelors51287.514050.2-1.4150773000.08.5475.002
12BobEngineering0.750203MMasters203091.0320100.12.02986715000.09.0589.000
13CharlieSales-0.022065MBachelors101879.3015200.5-0.0900997000.07.2370.013
14DianaEngineering-1.014980FPhD62295.225020.00.81088710000.09.6595.000
15EveFinance-0.353037FBachelors81588.013060.30.4398938000.08.0485.001
16FrankHR1.301823MHigh School25872.5010150.7-1.1500824000.06.5260.012
17GraceSales0.419231FBachelors182081.4125120.40.1748977000.07.8374.003
18HenryEngineering-0.794332MMasters72593.123550.20.7048899000.09.1590.000
19IvyFinance-1.235628FBachelors31085.002080.6-0.6200905000.08.2482.001
20JackSales1.853442MHigh School301268.905250.8-0.8850862000.05.5150.013
21AliceHR-0.904656FBachelors51287.514050.2-1.4150773000.08.5475.002
22BobEngineering0.750203MMasters203091.0320100.12.02986715000.09.0589.000
23CharlieSales-0.022065MBachelors101879.3015200.5-0.0900997000.07.2370.013
24DianaEngineering-1.014980FPhD62295.225020.00.81088710000.09.6595.000
25EveFinance-0.353037FBachelors81588.013060.30.4398938000.08.0485.001
26FrankHR1.301823MHigh School25872.5010150.7-1.1500824000.06.5260.012
27GraceSales0.419231FBachelors182081.4125120.40.1748977000.07.8374.003
28HenryEngineering-0.794332MMasters72593.123550.20.7048899000.09.1590.000
29IvyFinance-1.235628FBachelors31085.002080.6-0.6200905000.08.2482.001
30JackSales1.853442MHigh School301268.905250.8-0.8850862000.05.5150.013
31AliceHR-0.904656FBachelors51287.514050.2-1.4150773000.08.5475.002
32BobEngineering0.750203MMasters203091.0320100.12.02986715000.09.0589.000
33CharlieSales-0.022065MBachelors101879.3015200.5-0.0900997000.07.2370.013
34DianaEngineering-1.014980FPhD62295.225020.00.81088710000.09.6595.000
35EveFinance-0.353037FBachelors81588.013060.30.4398938000.08.0485.001
36FrankHR1.301823MHigh School25872.5010150.7-1.1500824000.06.5260.012
37GraceSales0.419231FBachelors182081.4125120.40.1748977000.07.8374.003
38HenryEngineering-0.794332MMasters72593.123550.20.7048899000.09.1590.000
39IvyFinance-1.235628FBachelors31085.002080.6-0.6200905000.08.2482.001
40JackSales1.853442MHigh School301268.905250.8-0.8850862000.05.5150.013
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%standardize columns=age,salary inplace=False" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "6a7c652c-35b8-4c51-b00a-9ccb7c2c78b0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Split completed: total=8, train=6, test=2, val=0.\n" - ] - }, - { - "data": { - "text/html": [ - "

Train (6 rows)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalary
5EveNaN35.0NaN
7NaNSales50.0NaN
3CharlieEngineeringNaN700000.0
8GraceSales45.065000.0
2BobNaN40.0NaN
6FrankEngineering28.072000.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "

Test (2 rows)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalary
4DavidHR25.048000.0
1AliceHR30.05000.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%splitdata" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "992bf2a2-15e2-4c67-a8fc-f0ac3c3e0630", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Split completed: total=40, train=28, test=8, val=4.\n" - ] - }, - { - "data": { - "text/html": [ - "

Train (28 rows)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagname_lbldepartment_lblgender_lbleducation_level_lbl
18317.02593.123550.295000.09000.09.1590.007012
25358.01588.013060.390000.08000.08.0485.004-100
29273.01085.002080.670000.05000.08.2482.008100
2338NaN1879.3015200.580000.07000.07.2370.012310
65025.0872.5010150.760000.04000.06.5260.015211
405530.01268.905250.865000.02000.05.5150.019311
14296.02295.225020.097000.010000.09.6595.003003
224520.03091.0320100.11200000.015000.09.0589.001012
324520.03091.0320100.11200000.015000.09.0589.001012
124520.03091.0320100.11200000.015000.09.0589.001012
34296.02295.225020.097000.010000.09.6595.003003
3338NaN1879.3015200.580000.07000.07.2370.012310
74218.02081.4125120.485000.07000.07.8374.006300
265025.0872.5010150.760000.04000.06.5260.015211
19273.01085.002080.670000.05000.08.2482.008100
105530.01268.905250.865000.02000.05.5150.019311
365025.0872.5010150.760000.04000.06.5260.015211
21305.01287.514050.255000.0300.08.5475.00020-1
24296.02295.225020.097000.010000.09.6595.003003
31305.01287.514050.255000.0300.08.5475.00020-1
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "

Validation (4 rows)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagname_lbldepartment_lblgender_lbleducation_level_lbl
15358.01588.013060.390000.08000.08.0485.004-100
305530.01268.905250.865000.02000.05.5150.019311
11305.01287.514050.255000.0300.08.5475.00020-1
338NaN1879.3015200.580000.07000.07.2370.012310
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "

Test (8 rows)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagname_lbldepartment_lblgender_lbleducation_level_lbl
205530.01268.905250.865000.02000.05.5150.019311
174218.02081.4125120.485000.07000.07.8374.006300
165025.0872.5010150.760000.04000.06.5260.015211
274218.02081.4125120.485000.07000.07.8374.006300
5358.01588.013060.390000.08000.08.0485.004-100
1338NaN1879.3015200.580000.07000.07.2370.012310
38317.02593.123550.295000.09000.09.1590.007012
28317.02593.123550.295000.09000.09.1590.007012
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%splitdata test_size=0.2 val_size=0.1 random_state=42" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "000a2e5b-1918-4371-8b47-d3a4547a1759", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Split completed: total=40, train=23, test=12, val=5.\n" - ] - }, - { - "data": { - "text/html": [ - "

Train (23 rows)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagname_lbldepartment_lblgender_lbleducation_level_lbl
365025.0872.5010150.760000.04000.06.5260.015211
3338NaN1879.3015200.580000.07000.07.2370.012310
34296.02295.225020.097000.010000.09.6595.003003
305530.01268.905250.865000.02000.05.5150.019311
338NaN1879.3015200.580000.07000.07.2370.012310
4296.02295.225020.097000.010000.09.6595.003003
405530.01268.905250.865000.02000.05.5150.019311
105530.01268.905250.865000.02000.05.5150.019311
31305.01287.514050.255000.0300.08.5475.00020-1
15358.01588.013060.390000.08000.08.0485.004-100
2338NaN1879.3015200.580000.07000.07.2370.012310
5358.01588.013060.390000.08000.08.0485.004-100
1305.01287.514050.255000.0300.08.5475.00020-1
205530.01268.905250.865000.02000.05.5150.019311
25358.01588.013060.390000.08000.08.0485.004-100
8317.02593.123550.295000.09000.09.1590.007012
274218.02081.4125120.485000.07000.07.8374.006300
374218.02081.4125120.485000.07000.07.8374.006300
165025.0872.5010150.760000.04000.06.5260.015211
224520.03091.0320100.11200000.015000.09.0589.001012
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "

Validation (5 rows)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagname_lbldepartment_lblgender_lbleducation_level_lbl
18317.02593.123550.295000.09000.09.1590.007012
324520.03091.0320100.11200000.015000.09.0589.001012
174218.02081.4125120.485000.07000.07.8374.006300
265025.0872.5010150.760000.04000.06.5260.015211
35358.01588.013060.390000.08000.08.0485.004-100
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "

Test (12 rows)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagname_lbldepartment_lblgender_lbleducation_level_lbl
74218.02081.4125120.485000.07000.07.8374.006300
11305.01287.514050.255000.0300.08.5475.00020-1
1338NaN1879.3015200.580000.07000.07.2370.012310
39273.01085.002080.670000.05000.08.2482.008100
65025.0872.5010150.760000.04000.06.5260.015211
124520.03091.0320100.11200000.015000.09.0589.001012
24296.02295.225020.097000.010000.09.6595.003003
24520.03091.0320100.11200000.015000.09.0589.001012
14296.02295.225020.097000.010000.09.6595.003003
21305.01287.514050.255000.0300.08.5475.00020-1
9273.01085.002080.670000.05000.08.2482.008100
38317.02593.123550.295000.09000.09.1590.007012
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%splitdata test_size=0.3 val_size=0.1 random_state=123" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "29b7abe8-4825-4096-86a0-7026c21de397", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Split completed: total=23, train=17, test=6, val=0.\n" - ] - }, - { - "data": { - "text/html": [ - "

Train (17 rows)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalary
28DavidHR25.048000.0
20DavidHR25.048000.0
35CharlieEngineeringNaN700000.0
8GraceSales45.065000.0
18BobUnknown40.0NaN
24GraceSales45.065000.0
5EveUnknown35.0NaN
33AliceHR30.05000.0
3CharlieEngineeringNaN700000.0
30FrankEngineering28.072000.0
17AliceHR30.05000.0
22FrankEngineering28.072000.0
7UnknownSales50.0NaN
2BobUnknown40.0NaN
12DavidHR25.048000.0
34BobUnknown40.0NaN
23UnknownSales50.0NaN
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "

Test (6 rows)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idnamedepartmentagesalary
32GraceSales45.065000.0
37EveUnknown35.0NaN
25AliceHR30.05000.0
19CharlieEngineeringNaN700000.0
6FrankEngineering28.072000.0
21EveUnknown35.0NaN
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%splitdata test_size=0.25 shuffle=False inplace=False train_name=mytrain test_name=mytest" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "2543a896-7047-45a7-a118-3adcfb822023", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model 'random_forest' trained and saved to data['last_model']. problem=regression. train_rows=28\n" - ] - } - ], - "source": [ - "%train_model target=department_lbl features=age,salary model=random_forest n_estimators=50 max_depth=4" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "4c75dd88-2c4a-462f-8ee5-48b50cae3d69", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model 'linear_regression' trained and saved to data['last_model']. problem=regression. train_rows=23\n" - ] - } - ], - "source": [ - "%train_model target=salary features=age model=linear_regression" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "2778c96c-80b3-4d80-8199-75ff6b7154e7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model 'gbm' trained and saved to data['last_model']. problem=classification. train_rows=23\n" - ] - } - ], - "source": [ - "%train_model target=department features=age,salary model=gbm n_estimators=100 learning_rate=0.05" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "8f84d412-6352-4396-b5d6-d693262d3b7b", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model 'logistic_regression' trained and saved to data['last_model']. problem=classification. 
train_rows=23\n" - ] - } - ], - "source": [ - "%train_model target=department features=age,salary model=logistic_regression max_iter=500" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "4f7d7e57-e6e6-47cd-84f4-ac653b11e9b1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model 'catboost' trained and saved to data['last_model']. problem=regression. train_rows=23\n" - ] - } - ], - "source": [ - "%train_model model=catboost target=department_lbl features=age,salary model_params='{\"iterations\":50}'" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "1cdea559-1c4b-493d-a7bc-b9565f7f3b7d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model 'xgboost' trained and saved to data['last_model']. problem=classification. train_rows=23\n" - ] - } - ], - "source": [ - "%train_model model=xgboost target=department_lbl features=age,salary model_params='{\"n_estimators\":100, \"max_depth\":3}' problem=\"classification\"" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "1d90ce87-aafd-4958-8318-09f66793b98e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "
\n", - "

Regression metrics

\n", - " \n", - " \n", - " \n", - " \n", - "
RMSE0.1237
MAE0.0737
0.9938
\n", - "
\n", - " \n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "

Predictions preview (actual vs predicted)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
department_lbl_predicted
32.88
32.96
22.03
32.96
-1-0.68
32.96
00.00
00.00
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "%evaluate_model" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "3e1a5300-a034-469a-abed-b50108a7f3a9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model from data['last_model'] saved to ./models/test_model.joblib\n" - ] - } - ], - "source": [ - "%savemodel model_name_in_data=last_model save_path=./models/test_model.joblib overwrite=True" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "5d5a292a-f4af-47bc-9eaa-44ddd63078ec", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loaded model from ./models/test_model.joblib → data['restored_model'] (features[2], target=department_lbl)\n" - ] - } - ], - "source": [ - "%loadmodel load_path=./models/test_model.joblib target_key=restored_model" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "788d9f75-9fe5-4bd2-b39d-1732eeee5bcd", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using inline feature values for prediction: {'age': 38, 'salary': 80000.0}\n" - ] - }, - { - "data": { - "text/html": [ - "

Predictions (last_preds)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
prediction
2.96
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Predictions stored in data['last_preds'] with shape=(1, 1)\n" - ] - } - ], - "source": [ - "%predict model_name=restored_model data_name=[38,\"80000.0\"] output_name=last_preds" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "6c5def76-a36c-45be-8712-d886a1e52e25", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

Feature Selection Results (method=correlation)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
FeatureScore
overtime_hours0.867873
avg_project_score0.846331
satisfaction_score0.845916
potential_score0.828136
performance_rating0.817918
remote_ratio0.744150
training_hours0.742307
age0.683720
gender_lbl0.654654
certifications0.654654
department_lbl0.631068
years_experience0.477280
projects_completed0.441624
bonus0.392049
salary0.235729
name_lbl0.189934
education_level_lbl0.074848
emp_id0.047260
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Selected 5 features saved to data['selected_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_rating\n" - ] - } - ], - "source": [ - "%select_features target=attrition_flag method=correlation k=5 problem=classification " - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "b0c717fb-9f2f-47ba-8c0c-73c6934a069f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "

Feature Selection Results (method=rf_importance)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
FeatureScore
department_lbl0.483284
years_experience0.062483
avg_project_score0.061437
satisfaction_score0.054678
remote_ratio0.051413
training_hours0.040343
performance_rating0.039563
attrition_flag0.039422
education_level_lbl0.036622
age0.036479
overtime_hours0.035503
bonus0.026721
salary0.010314
name_lbl0.010022
projects_completed0.005388
certifications0.004031
gender_lbl0.001930
emp_id0.000368
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Selected 7 features saved to data['selected_features']: department_lbl, years_experience, avg_project_score, satisfaction_score, remote_ratio, training_hours, performance_rating\n" - ] - } - ], - "source": [ - "%select_features target=potential_score method=rf_importance k=7 problem=regression" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c106f132-0c90-4db8-8d7d-7c0cb29f6b10", - "metadata": {}, - "outputs": [], - "source": [ - "%select_features target=attrition_flag method=chi2 k=5 problem=classification output_name=top_features" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "cc570adc-ee80-42b9-a5a5-7a678224a220", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "No features provided and no selected_features found. Run %select_features first.\n" - ] - } - ], - "source": [ - "%select_model target=attrition_flag problem=classification" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "e535919d-788e-44c3-8a42-7c499044a265", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "PREVIEW: would drop 12 row(s) (from 40 to 28).\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idnamedepartmentagegendereducation_levelyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flag_would_be_dropped
1AliceHR30FNaN5.01287.514050.255000.0300.08.5475.00True
3CharlieSales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01True
5EveNaN35FBachelors8.01588.013060.390000.08000.08.0485.00True
11AliceHR30FNaN5.01287.514050.255000.0300.08.5475.00True
13CharlieSales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01True
15EveNaN35FBachelors8.01588.013060.390000.08000.08.0485.00True
21AliceHR30FNaN5.01287.514050.255000.0300.08.5475.00True
23CharlieSales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01True
25EveNaN35FBachelors8.01588.013060.390000.08000.08.0485.00True
31AliceHR30FNaN5.01287.514050.255000.0300.08.5475.00True
33CharlieSales38MBachelorsNaN1879.3015200.580000.07000.07.2370.01True
35EveNaN35FBachelors8.01588.013060.390000.08000.08.0485.00True
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Encoded columns in-place and updated last_select.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagname_Alicename_Bobname_Charliename_Diananame_Evename_Frankname_Gracename_Henryname_Ivyname_Jackdepartment_Engineeringdepartment_Financedepartment_HRdepartment_Salesdepartment____MISSING___gender_Fgender_Meducation_level_Bachelorseducation_level_High Schooleducation_level_Masterseducation_level_PhDeducation_level____MISSING___
1305.01287.514050.255000.0300.08.5475.001.00.00.00.00.00.00.00.00.00.00.00.01.00.00.01.00.00.00.00.00.01.0
24520.03091.0320100.11200000.015000.09.0589.000.01.00.00.00.00.00.00.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
338NaN1879.3015200.580000.07000.07.2370.010.00.01.00.00.00.00.00.00.00.00.00.00.01.00.00.01.01.00.00.00.00.0
4296.02295.225020.097000.010000.09.6595.000.00.00.01.00.00.00.00.00.00.01.00.00.00.00.01.00.00.00.00.01.00.0
5358.01588.013060.390000.08000.08.0485.000.00.00.00.01.00.00.00.00.00.00.00.00.00.01.01.00.01.00.00.00.00.0
65025.0872.5010150.760000.04000.06.5260.010.00.00.00.00.01.00.00.00.00.00.00.01.00.00.00.01.00.01.00.00.00.0
74218.02081.4125120.485000.07000.07.8374.000.00.00.00.00.00.01.00.00.00.00.00.00.01.00.01.00.01.00.00.00.00.0
8317.02593.123550.295000.09000.09.1590.000.00.00.00.00.00.00.01.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
9273.01085.002080.670000.05000.08.2482.000.00.00.00.00.00.00.00.01.00.00.01.00.00.00.01.00.01.00.00.00.00.0
105530.01268.905250.865000.02000.05.5150.010.00.00.00.00.00.00.00.00.01.00.00.00.01.00.00.01.00.01.00.00.00.0
11305.01287.514050.255000.0300.08.5475.001.00.00.00.00.00.00.00.00.00.00.00.01.00.00.01.00.00.00.00.00.01.0
124520.03091.0320100.11200000.015000.09.0589.000.01.00.00.00.00.00.00.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
1338NaN1879.3015200.580000.07000.07.2370.010.00.01.00.00.00.00.00.00.00.00.00.00.01.00.00.01.01.00.00.00.00.0
14296.02295.225020.097000.010000.09.6595.000.00.00.01.00.00.00.00.00.00.01.00.00.00.00.01.00.00.00.00.01.00.0
15358.01588.013060.390000.08000.08.0485.000.00.00.00.01.00.00.00.00.00.00.00.00.00.01.01.00.01.00.00.00.00.0
165025.0872.5010150.760000.04000.06.5260.010.00.00.00.00.01.00.00.00.00.00.00.01.00.00.00.01.00.01.00.00.00.0
174218.02081.4125120.485000.07000.07.8374.000.00.00.00.00.00.01.00.00.00.00.00.00.01.00.01.00.01.00.00.00.00.0
18317.02593.123550.295000.09000.09.1590.000.00.00.00.00.00.00.01.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
19273.01085.002080.670000.05000.08.2482.000.00.00.00.00.00.00.00.01.00.00.01.00.00.00.01.00.01.00.00.00.00.0
205530.01268.905250.865000.02000.05.5150.010.00.00.00.00.00.00.00.00.01.00.00.00.01.00.00.01.00.01.00.00.00.0
21305.01287.514050.255000.0300.08.5475.001.00.00.00.00.00.00.00.00.00.00.00.01.00.00.01.00.00.00.00.00.01.0
224520.03091.0320100.11200000.015000.09.0589.000.01.00.00.00.00.00.00.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
2338NaN1879.3015200.580000.07000.07.2370.010.00.01.00.00.00.00.00.00.00.00.00.00.01.00.00.01.01.00.00.00.00.0
24296.02295.225020.097000.010000.09.6595.000.00.00.01.00.00.00.00.00.00.01.00.00.00.00.01.00.00.00.00.01.00.0
25358.01588.013060.390000.08000.08.0485.000.00.00.00.01.00.00.00.00.00.00.00.00.00.01.01.00.01.00.00.00.00.0
265025.0872.5010150.760000.04000.06.5260.010.00.00.00.00.01.00.00.00.00.00.00.01.00.00.00.01.00.01.00.00.00.0
274218.02081.4125120.485000.07000.07.8374.000.00.00.00.00.00.01.00.00.00.00.00.00.01.00.01.00.01.00.00.00.00.0
28317.02593.123550.295000.09000.09.1590.000.00.00.00.00.00.00.01.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
29273.01085.002080.670000.05000.08.2482.000.00.00.00.00.00.00.00.01.00.00.01.00.00.00.01.00.01.00.00.00.00.0
305530.01268.905250.865000.02000.05.5150.010.00.00.00.00.00.00.00.00.01.00.00.00.01.00.00.01.00.01.00.00.00.0
31305.01287.514050.255000.0300.08.5475.001.00.00.00.00.00.00.00.00.00.00.00.01.00.00.01.00.00.00.00.00.01.0
324520.03091.0320100.11200000.015000.09.0589.000.01.00.00.00.00.00.00.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
3338NaN1879.3015200.580000.07000.07.2370.010.00.01.00.00.00.00.00.00.00.00.00.00.01.00.00.01.01.00.00.00.00.0
34296.02295.225020.097000.010000.09.6595.000.00.00.01.00.00.00.00.00.00.01.00.00.00.00.01.00.00.00.00.01.00.0
35358.01588.013060.390000.08000.08.0485.000.00.00.00.01.00.00.00.00.00.00.00.00.00.01.01.00.01.00.00.00.00.0
365025.0872.5010150.760000.04000.06.5260.010.00.00.00.00.01.00.00.00.00.00.00.01.00.00.00.01.00.01.00.00.00.0
374218.02081.4125120.485000.07000.07.8374.000.00.00.00.00.00.01.00.00.00.00.00.00.01.00.01.00.01.00.00.00.00.0
38317.02593.123550.295000.09000.09.1590.000.00.00.00.00.00.00.01.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
39273.01085.002080.670000.05000.08.2482.000.00.00.00.00.00.00.00.01.00.00.01.00.00.00.01.00.01.00.00.00.00.0
405530.01268.905250.865000.02000.05.5150.010.00.00.00.00.00.00.00.00.01.00.00.00.01.00.00.01.00.01.00.00.00.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "

Feature Selection Results (method=correlation)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
FeatureScore
overtime_hours0.867873
avg_project_score0.846331
satisfaction_score0.845916
potential_score0.828136
performance_rating0.817918
education_level_High School0.763763
remote_ratio0.744150
training_hours0.742307
age0.683720
gender_F0.654654
gender_M0.654654
certifications0.654654
department_Sales0.523810
name_Charlie0.509175
name_Jack0.509175
name_Frank0.509175
years_experience0.477280
projects_completed0.441624
department_Engineering0.428571
bonus0.392049
education_level_Masters0.327327
salary0.235729
department_HR0.218218
education_level____MISSING___0.218218
education_level_PhD0.218218
name_Diana0.218218
name_Alice0.218218
name_Bob0.218218
department_Finance0.218218
name_Eve0.218218
name_Henry0.218218
name_Ivy0.218218
department____MISSING___0.218218
name_Grace0.218218
education_level_Bachelors0.089087
emp_id0.047260
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Selected 5 features saved to data['selected_features']: overtime_hours, avg_project_score, satisfaction_score, potential_score, performance_rating\n", - "PREVIEW (local):\n", - "Local: Column 'overtime_hours': mean=10.8, std=6.939740629158989\n", - "Local: Column 'avg_project_score': mean=84.19000000000001, std=8.217353588595294\n", - "Local: Column 'satisfaction_score': mean=7.94, std=1.1918053532351665\n", - "Local: Column 'potential_score': mean=77.0, std=13.438749941865872\n", - "Local: Column 'performance_rating': mean=3.6, std=1.2806248474865698\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
overtime_hoursavg_project_scoresatisfaction_scorepotential_scoreperformance_ratingovertime_hours_std_previewavg_project_score_std_previewsatisfaction_score_std_previewpotential_score_std_previewperformance_rating_std_preview
587.58.575.04-0.8357660.4028060.469875-0.1488230.312348
1091.09.089.05-0.1152780.8287340.8894070.8929401.093216
2079.37.270.031.325698-0.595082-0.620907-0.520882-0.468521
295.29.695.05-1.2680591.3398471.3928451.3394101.093216
688.08.085.04-0.6916690.4636530.0503440.5952930.312348
1572.56.560.020.605210-1.422599-1.208251-1.264999-1.249390
1281.47.874.030.172917-0.339525-0.117469-0.223235-0.468521
593.19.190.05-0.8357661.0842910.9733130.9673521.093216
885.08.282.04-0.4034730.0985720.2181560.3720580.312348
2568.95.550.012.046186-1.860696-2.047314-2.009115-2.030259
587.58.575.04-0.8357660.4028060.469875-0.1488230.312348
1091.09.089.05-0.1152780.8287340.8894070.8929401.093216
2079.37.270.031.325698-0.595082-0.620907-0.520882-0.468521
295.29.695.05-1.2680591.3398471.3928451.3394101.093216
688.08.085.04-0.6916690.4636530.0503440.5952930.312348
1572.56.560.020.605210-1.422599-1.208251-1.264999-1.249390
1281.47.874.030.172917-0.339525-0.117469-0.223235-0.468521
593.19.190.05-0.8357661.0842910.9733130.9673521.093216
885.08.282.04-0.4034730.0985720.2181560.3720580.312348
2568.95.550.012.046186-1.860696-2.047314-2.009115-2.030259
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Split completed: total=40, train=32, test=8, val=0.\n" - ] - }, - { - "data": { - "text/html": [ - "

Train (32 rows)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagname_Alicename_Bobname_Charliename_Diananame_Evename_Frankname_Gracename_Henryname_Ivyname_Jackdepartment_Engineeringdepartment_Financedepartment_HRdepartment_Salesdepartment____MISSING___gender_Fgender_Meducation_level_Bachelorseducation_level_High Schooleducation_level_Masterseducation_level_PhDeducation_level____MISSING___
305530.01268.905250.865000.02000.05.5150.010.00.00.00.00.00.00.00.00.01.00.00.00.01.00.00.01.00.01.00.00.00.0
2338NaN1879.3015200.580000.07000.07.2370.010.00.01.00.00.00.00.00.00.00.00.00.00.01.00.00.01.01.00.00.00.00.0
19273.01085.002080.670000.05000.08.2482.000.00.00.00.00.00.00.00.01.00.00.01.00.00.00.01.00.01.00.00.00.00.0
274218.02081.4125120.485000.07000.07.8374.000.00.00.00.00.00.01.00.00.00.00.00.00.01.00.01.00.01.00.00.00.00.0
29273.01085.002080.670000.05000.08.2482.000.00.00.00.00.00.00.00.01.00.00.01.00.00.00.01.00.01.00.00.00.00.0
224520.03091.0320100.11200000.015000.09.0589.000.01.00.00.00.00.00.00.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
24296.02295.225020.097000.010000.09.6595.000.00.00.01.00.00.00.00.00.00.01.00.00.00.00.01.00.00.00.00.01.00.0
338NaN1879.3015200.580000.07000.07.2370.010.00.01.00.00.00.00.00.00.00.00.00.00.01.00.00.01.01.00.00.00.00.0
4296.02295.225020.097000.010000.09.6595.000.00.00.01.00.00.00.00.00.00.01.00.00.00.00.01.00.00.00.00.01.00.0
374218.02081.4125120.485000.07000.07.8374.000.00.00.00.00.00.01.00.00.00.00.00.00.01.00.01.00.01.00.00.00.00.0
324520.03091.0320100.11200000.015000.09.0589.000.01.00.00.00.00.00.00.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
5358.01588.013060.390000.08000.08.0485.000.00.00.00.01.00.00.00.00.00.00.00.00.00.01.01.00.01.00.00.00.00.0
35358.01588.013060.390000.08000.08.0485.000.00.00.00.01.00.00.00.00.00.00.00.00.00.01.01.00.01.00.00.00.00.0
39273.01085.002080.670000.05000.08.2482.000.00.00.00.00.00.00.00.01.00.00.01.00.00.00.01.00.01.00.00.00.00.0
9273.01085.002080.670000.05000.08.2482.000.00.00.00.00.00.00.00.01.00.00.01.00.00.00.01.00.01.00.00.00.00.0
65025.0872.5010150.760000.04000.06.5260.010.00.00.00.00.01.00.00.00.00.00.00.01.00.00.00.01.00.01.00.00.00.0
3338NaN1879.3015200.580000.07000.07.2370.010.00.01.00.00.00.00.00.00.00.00.00.00.01.00.00.01.01.00.00.00.00.0
24520.03091.0320100.11200000.015000.09.0589.000.01.00.00.00.00.00.00.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
205530.01268.905250.865000.02000.05.5150.010.00.00.00.00.00.00.00.00.01.00.00.00.01.00.00.01.00.01.00.00.00.0
405530.01268.905250.865000.02000.05.5150.010.00.00.00.00.00.00.00.00.01.00.00.00.01.00.00.01.00.01.00.00.00.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "

Test (8 rows)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
emp_idageyears_experienceprojects_completedavg_project_scorecertificationstraining_hoursovertime_hoursremote_ratiosalarybonussatisfaction_scoreperformance_ratingpotential_scoreattrition_flagname_Alicename_Bobname_Charliename_Diananame_Evename_Frankname_Gracename_Henryname_Ivyname_Jackdepartment_Engineeringdepartment_Financedepartment_HRdepartment_Salesdepartment____MISSING___gender_Fgender_Meducation_level_Bachelorseducation_level_High Schooleducation_level_Masterseducation_level_PhDeducation_level____MISSING___
28317.02593.123550.295000.09000.09.1590.000.00.00.00.00.00.00.01.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
11305.01287.514050.255000.0300.08.5475.001.00.00.00.00.00.00.00.00.00.00.00.01.00.00.01.00.00.00.00.00.01.0
25358.01588.013060.390000.08000.08.0485.000.00.00.00.01.00.00.00.00.00.00.00.00.00.01.01.00.01.00.00.00.00.0
21305.01287.514050.255000.0300.08.5475.001.00.00.00.00.00.00.00.00.00.00.00.01.00.00.01.00.00.00.00.00.01.0
105530.01268.905250.865000.02000.05.5150.010.00.00.00.00.00.00.00.00.01.00.00.00.01.00.00.01.00.01.00.00.00.0
365025.0872.5010150.760000.04000.06.5260.010.00.00.00.00.01.00.00.00.00.00.00.01.00.00.00.01.00.01.00.00.00.0
174218.02081.4125120.485000.07000.07.8374.000.00.00.00.00.00.01.00.00.00.00.00.00.01.00.01.00.01.00.00.00.00.0
38317.02593.123550.295000.09000.09.1590.000.00.00.00.00.00.00.01.00.00.01.00.00.00.00.00.01.00.00.01.00.00.0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "

Model Selection Results (primary_metric=accuracy)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Modelaccuracy_Meanaccuracy_Stdf1_Meanf1_Stdprecision_Meanprecision_Stdrecall_Meanrecall_Std
logistic1.00000.00001.00000.00001.00.00001.00.0000
rf1.00000.00001.00000.00001.00.00001.00.0000
ada1.00000.00001.00000.00001.00.00001.00.0000
gbm1.00000.00001.00000.00001.00.00001.00.0000
catboost1.00000.00001.00000.00001.00.00001.00.0000
xgboost1.00000.00001.00000.00001.00.00001.00.0000
svm0.93810.07620.86670.16331.00.00000.80.2449
knn0.90950.07440.88000.09800.80.16331.00.0000
mlp0.90950.11700.60000.48990.60.48990.80.4000
lightgbm0.68570.02330.00000.00000.00.00000.00.0000
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Best model 'logistic' (mean accuracy=1.0000) saved to data['last_model'].\n", - "[MLPipeline] Automatically selected best model via SelectModel.\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "
\n", - "
\n", - "

Metrics

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Accuracy1.0000
Precision (w)1.0000
Recall (w)1.0000
F1 (w)1.0000
ROC AUC1.0000
\n", - "
\n", - "
\"confusion
\n", - "

Classification report

\n", - "
              precision    recall  f1-score   support\n",
-       "\n",
-       "           0       1.00      1.00      1.00         6\n",
-       "           1       1.00      1.00      1.00         2\n",
-       "\n",
-       "    accuracy                           1.00         8\n",
-       "   macro avg       1.00      1.00      1.00         8\n",
-       "weighted avg       1.00      1.00      1.00         8\n",
-       "
\n", - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "

Predictions preview (actual vs predicted)

\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
attrition_flag_predicted_pred_proba
002.931267e-08
004.718786e-05
001.445187e-06
004.718786e-05
111.000000e+00
119.979979e-01
002.864653e-02
002.931267e-08
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saving model from data['last_model'] to ./models/model.joblib...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Failed to save model: [Errno 2] No such file or directory: './models/model.joblib'\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[MLPipeline] Model saved to ./models/model.joblib.\n", - "[MLPipeline] ML pipeline completed successfully.\n" - ] - } - ], - "source": [ - "%ml_pipeline target=attrition_flag problem=classification save_path=./models/model.joblib" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4247d68c-6f93-4297-b1fd-fa09bf6362f8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query OK" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "drop table employees;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "87d73330-b792-4d19-9eac-daa9cb0c7d1a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
idcommand_nameargumentsexecution_timestampaffected_columnsoperation_statusmessagedb_nameuser_name
1clipoutliers2025-10-28 07:39:14emp_id\n", - "age\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagsuccessColumn 'emp_id': clipped 0 value(s) (bounds: -18.5000, 59.5000).\n", - "Column 'age': clipped 0 value(s) (bounds: 7.5000, 67.5000).\n", - "Column 'years_experience': clipped 0 value(s) (bounds: -15.0000, 41.0000).\n", - "Column 'projects_completed': clipped 0 value(s) (bounds: -3.0000, 37.0000).\n", - "Column 'avg_project_score': clipped 0 value(s) (bounds: 61.7500, 108.5500).\n", - "Column 'certifications': clipped 0 value(s) (bounds: -3.0000, 5.0000).\n", - "Column 'training_hours': clipped 0 value(s) (bounds: -15.0000, 65.0000).\n", - "Column 'overtime_hours': clipped 0 value(s) (bounds: -10.0000, 30.0000).\n", - "Column 'remote_ratio': clipped 0 value(s) (bounds: -0.4000, 1.2000).\n", - "Column 'salary': clipped 0 value(s) (bounds: 20000.0000, 140000.0000).\n", - "Column 'bonus': clipped 0 value(s) (bounds: -3500.0000, 16500.0000).\n", - "Column 'satisfaction_score': clipped 0 value(s) (bounds: 4.5000, 11.7000).\n", - "Column 'performance_rating': clipped 0 value(s) (bounds: 0.0000, 8.0000).\n", - "Column 'potential_score': clipped 0 value(s) (bounds: 41.5000, 117.5000).\n", - "Column 'attrition_flag': clipped 0 value(s) (bounds: -1.5000, 2.5000).test
2clipoutliersmethod=zscore z_thresh=2.0 2025-10-28 07:39:16emp_id\n", - "age\n", - "years_experience\n", - "projects_completed\n", - "avg_project_score\n", - "certifications\n", - "training_hours\n", - "overtime_hours\n", - "remote_ratio\n", - "salary\n", - "bonus\n", - "satisfaction_score\n", - "performance_rating\n", - "potential_score\n", - "attrition_flagsuccessColumn 'emp_id': clipped 0 value(s) (bounds: -2.8809, 43.8809).\n", - "Column 'age': clipped 0 value(s) (bounds: 19.8406, 56.5594).\n", - "Column 'years_experience': clipped 0 value(s) (bounds: -4.7983, 31.1983).\n", - "Column 'projects_completed': clipped 0 value(s) (bounds: 3.5885, 30.8115).\n", - "Column 'avg_project_score': clipped 0 value(s) (bounds: 67.5459, 100.8341).\n", - "Column 'certifications': clipped 0 value(s) (bounds: -1.0255, 3.0255).\n", - "Column 'training_hours': clipped 0 value(s) (bounds: -1.7946, 51.7946).\n", - "Column 'overtime_hours': clipped 4 value(s) (bounds: -3.2563, 24.8563).\n", - "Column 'remote_ratio': clipped 0 value(s) (bounds: -0.1308, 0.8908).\n", - "Column 'salary': clipped 4 value(s) (bounds: 43482.8069, 119917.1931).\n", - "Column 'bonus': clipped 4 value(s) (bounds: -358.9297, 14358.9297).\n", - "Column 'satisfaction_score': clipped 4 value(s) (bounds: 5.5260, 10.3540).\n", - "Column 'performance_rating': clipped 4 value(s) (bounds: 1.0061, 6.1939).\n", - "Column 'potential_score': clipped 0 value(s) (bounds: 49.7801, 104.2199).\n", - "Column 'attrition_flag': clipped 0 value(s) (bounds: -0.6282, 1.2282).test
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "select * from magic_metadata;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "87d02cd4-7308-4a7f-b598-064deb297357", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query OK" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "drop table magic_metadata;" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "572ea337-547d-4fcb-b6dc-87cb972ce5b7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] stored content length=0 preview= \n", - "[debug] using text from args (len=157)\n", - "[debug] stored content length=0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[warning] document content inserted into DB appears empty (possible client/encoding issue).\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ingest complete. documents=1 chunks_total=1 embeddings_written=1\n", - "Notes: - embedding model used: all-MiniLM-L6-v2 (dim=384) - Native VECTOR column was created/used where available.\n" - ] - } - ], - "source": [ - "%maria_ingest doc_id=doc1 title=\"LangChain intro\" chunk_size=500 text=\"LangChain helps you build applications that combine LLMs with your data, APIs, and tools. It is often used to create chatbots, retrieval systems, and agents.\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c59929ab-dbe6-45f1-8f9e-796e71bab31e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
VERSION()
11.8.3-MariaDB-ubu2404
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "SELECT VERSION();" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8e7a3356-c3f8-422e-8eeb-49b8f8d1838b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Tables_in_test
chunks
documents
embeddings
employees
magic_metadata
models_store
sample_sales
saved_models
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "show tables;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4cbd3171-f8fb-4258-a7cc-c7288206b71f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
iddoc_idtitlecontentmetadatacreated_at
1doc_932786199{}2025-10-28 15:37:27
2doc1LangChain introLangChain helps you build applications that combine LLMs with your data, APIs, and tools. It is often used to create chatbots, retrieval systems, and agents.{}2025-10-28 15:49:11
5doc_965545306{}2025-10-28 16:12:19
6doc_749541677{}2025-10-28 17:18:47
7doc_307906215{}2025-10-28 17:26:03
8doc_266524367{}2025-10-28 17:27:25
9doc_77662266{}2025-10-28 17:45:42
11doc_from_fileReportThe magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. 
Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. 
Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?{}2025-10-28 18:13:44
13search_test_docHybrid Search TestThe magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. 
Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. 
Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?{}2025-10-28 18:34:51
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "select * from documents;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "812c46fb-ef81-4503-9197-00efe6353db6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
iddoc_idchunk_indexchunk_textchunk_metacreated_at
1doc10LangChain helps you build applications that combine LLMs with your data, APIs, and tools. It is often used to create chatbots, retrieval systems, and agents.{}2025-10-28 17:46:53
2doc_from_file0The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?{}2025-10-28 18:13:44
3doc_from_file0The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (P{}2025-10-28 18:17:50
4doc_from_file1you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional e{}2025-10-28 18:17:50
5doc_from_file2hich would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an{}2025-10-28 18:17:50
6doc_from_file3s update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or{}2025-10-28 18:17:51
7doc_from_file4this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't{}2025-10-28 18:17:51
8doc_from_file5instead of this canvas update. Which would you like?\n", - "\n", - "The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed.\n", - "\n", - "If you'd like, I can:\n", - "\n", - "add support for PPTX / HTML extraction,\n", - "\n", - "automatically strip front-matter from markdown files,\n", - "\n", - "or produce a small PR-style diff instead of this canvas update. Which would you like?{}2025-10-28 18:17:51
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "select * from chunks;" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "d96148c4-0e6b-4874-81d4-71a0b9dfd9fb", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] stored content length=0 preview=\n", - "\n", - "[debug] stored content length=0\n", - "Ingest complete. documents=1 chunks_total=0 embeddings_written=0\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Warning: no chunks were created. If your document text is present in the `documents` table but chunk_text is missing, check client encoding and ensure the cell body was passed to the kernel. Use `SELECT content FROM documents WHERE doc_id=\"...\";` to inspect.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Notes:\n", - " - embedding model used: all-MiniLM-L6-v2 (dim=384)\n", - " - Native VECTOR column was created/used where available.\n", - "\n" - ] - } - ], - "source": [ - "%%maria_ingest doc_id=doc1 title=\"LangChain intro\" chunk_size=500\n", - "LangChain helps you build applications that combine LLMs with your data, APIs, and tools.\n", - "It is often used to create chatbots, retrieval systems, and agents." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "400d0901-9fc5-4bb4-bfa9-1912917b4450", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using file content from ./test.docx (len=28508)\n", - "\n", - "Using database: test\n", - "\n", - "Ingest complete.\n", - " documents=1\n", - " chunks_total=40\n", - " embeddings_written=40\n", - " native_attempts=40 native_successes=0 native_failures=40\n", - " fallback_successes=40 fallback_failures=0\n", - " Server version: 11.8.3-MariaDB-ubu2404\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Warnings/notes:\n", - "\n", - " - python-docx failed to extract docx: Package not found at '/home/iddhartha/mariadb_kernel/test.docx'\n", - "\n" - ] - } - ], - "source": [ - "%maria_ingest doc_id=search_test_doc title=\"Hybrid Search Test\" text_file=\"./test.docx\"" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "4ba9b623-929e-46f0-9580-93d86a226670", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] running hybrid search for query (len=19): how to get a refund\n", - "\n", - "chunk_id\tchunk_text...\tscore\tvec_sim\tbm25\tdoc_id\n", - "1\tOur store strives to deliver exceptional value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving th...\t0.819408\t0.484022\t0.209054\tsearch_test_doc\n", - "5\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", - "9\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. 
Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", - "13\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", - "17\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", - "21\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", - "25\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", - "29\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. 
Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", - "\n" - ] - } - ], - "source": [ - "%maria_search query=\"how to get a refund\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "96166ac5-627d-4bf4-a91c-370bc2df3dbd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query OK" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "drop table documents;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4bdee425-e3ec-493d-83d6-37aa5fa49db7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query OK" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "drop table chunks;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19c330dd-4031-4b51-b063-925cfe7cda96", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query OK" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "select * from embeddings;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5fc7507-8871-4190-ba0e-58fc10496d7c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
iddoc_idchunk_indexchunk_textchunk_metacreated_at
1search_test_doc0Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", - "\n", - "REFUND AND RETURN POLICY\n", - "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", - "Refunds are processed within 5–7 business days once the returned item is inspected. \n", - "Digital products, such as downloadable content or gift cards, are non-refundable.\n", - "\n", - "EXCHANGE POLICY\n", - "We offer one free exchange per order for issues such as size or color mismatch. \n", - "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", - "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", - "\n", - "SHIPPING AND DELIVERY\n", - "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-29 07:41:20
2search_test_doc1LIVERY\n", - "Free standard shipping applies to all orders over $75 within the continental United States. \n", - "International shipping rates vary depending on region and weight.\n", - "Express delivery options are available at an additional cost.\n", - "Customers will receive a tracking number once the order has been dispatched.\n", - "\n", - "PAYMENT METHODS\n", - "We accept major credit cards, PayPal, and Apple Pay. \n", - "For corporate purchases, wire transfers are supported upon request. \n", - "All transactions are encrypted using industry-standard SSL technology.\n", - "\n", - "WARRANTY INFORMATION\n", - "All electronics include a one-year limited warranty covering manufacturing defects. \n", - "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", - "Warranty claims do not cover accidental damage or misuse.\n", - "\n", - "TECHNICAL SUPPORT{}2025-10-29 07:41:20
3search_test_doc2nd proof of purchase. \n", - "Warranty claims do not cover accidental damage or misuse.\n", - "\n", - "TECHNICAL SUPPORT\n", - "Our helpdesk operates 24/7 via email and live chat.\n", - "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", - "We also maintain a searchable online knowledge base for common problems.\n", - "\n", - "DATA PRIVACY AND SECURITY\n", - "We are fully compliant with GDPR and CCPA regulations. \n", - "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", - "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", - "\n", - "SUSTAINABILITY COMMITMENTS\n", - "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", - "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-29 07:41:20
4search_test_doc3carbon-neutral shipping. \n", - "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", - "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", - "\n", - "LOYALTY PROGRAM\n", - "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", - "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", - "\n", - "CUSTOMER FEEDBACK\n", - "We value user feedback and continuously improve based on reviews. \n", - "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", - "\n", - "Thank you for shopping with us and supporting sustainable retail practices.{}2025-10-29 07:41:20
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "select * from chunks;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "acdb1c5b-87c1-445c-afb5-d9204cf07c65", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Tables_in_test
chunks
documents
embeddings
employees
magic_metadata
models_store
sample_sales
saved_models
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "show tables;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d13a0a43-a023-4682-b169-6e61c9b41a35", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
embeddings_count
0
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "SELECT COUNT(*) AS embeddings_count FROM embeddings;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c147c1e9-3243-43de-b035-6901dcf09caf", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query OK" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "drop table embeddings;" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "4e3f9dd8-33e1-4631-88e6-30cbce31e7e0", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[debug] RAG query received (len=26): How do I request a refund?\n", - "\n", - "\n", - "=== ANSWER ===\n", - "\n", - "You can request a refund within 30 days of receiving your item. The product must be unused, in its original packaging, and include proof of purchase [DOCID::chunk_0]. Digital products like downloadable content or gift cards are not eligible for refunds [DOCID::chunk_0]. Refunds are processed within 5-7 business days after the returned item is inspected [DOCID::chunk_0].\n", - "\n", - "\n" - ] - } - ], - "source": [ - "%maria_rag_query query=\"How do I request a refund?\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "de68a877-5b52-4727-9606-4439152c4506", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
idcommand_nameargumentsexecution_timestampaffected_columnsoperation_statusmessagedb_nameuser_namerollback_tokenbackup_tableoriginal_table
1maria_ingestdoc_id=search_test_doc title=\"Hybrid Search Test\" text_file=\"./test.docx\"2025-10-30 12:51:35documents,chunks,embeddingssuccessIngest complete.\n", - " documents=1\n", - " chunks_total=40\n", - " embeddings_written=40\n", - " native_attempts=40 native_successes=0 native_failures=40\n", - " fallback_successes=40 fallback_failures=0\n", - " Server version: 11.8.3-MariaDB-ubu2404\n", - "testNULLNULLNULL
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "select * from magic_metadata;" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8b52bf52-58a7-41bb-a568-b2691ed22f02", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query OK" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "drop table magic_metadata;" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "2e8ace40-a7b5-41e6-9225-fc52e197d0ea", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The result set was successfully written into last_query.csv\n" - ] - } - ], - "source": [ - "%df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "14bf423d-5e41-4d33-a2ef-a523937b88ef", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "MariaDB", - "language": "SQL", - "name": "mariadb_kernel" - }, - "language_info": { - "file_extension": ".sql", - "mimetype": "text/plain", - "name": "SQL" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/last_query.csv b/last_query.csv deleted file mode 100644 index 6132871..0000000 --- a/last_query.csv +++ /dev/null @@ -1,11 +0,0 @@ -emp_id,name,department_HR,department_Engineering,department_Sales,department_NULL,department_Finance,department,age,gender,education_level,years_experience,projects_completed,avg_project_score,certifications,training_hours,overtime_hours,remote_ratio,salary,bonus,satisfaction_score,performance_rating,potential_score,attrition_flag -1,Alice,1,0,0,0,0,HR,30,F,,5.0,12,87.5,1,40,5,0.2,55000.0,300.0,8.5,4,75.0,0 -2,Bob,0,1,0,0,0,Engineering,45,M,Masters,20.0,30,91.0,3,20,10,0.1,1200000.0,15000.0,9.0,5,89.0,0 -3,Charlie,0,0,1,0,0,Sales,38,M,Bachelors,,18,79.3,0,15,20,0.5,80000.0,7000.0,7.2,3,70.0,1 -4,Diana,0,1,0,0,0,Engineering,29,F,PhD,6.0,22,95.2,2,50,2,0.0,97000.0,10000.0,9.6,5,95.0,0 
-5,Eve,0,0,0,1,0,,35,F,Bachelors,8.0,15,88.0,1,30,6,0.3,90000.0,8000.0,8.0,4,85.0,0 -6,Frank,1,0,0,0,0,HR,50,M,High School,25.0,8,72.5,0,10,15,0.7,60000.0,4000.0,6.5,2,60.0,1 -7,Grace,0,0,1,0,0,Sales,42,F,Bachelors,18.0,20,81.4,1,25,12,0.4,85000.0,7000.0,7.8,3,74.0,0 -8,Henry,0,1,0,0,0,Engineering,31,M,Masters,7.0,25,93.1,2,35,5,0.2,95000.0,9000.0,9.1,5,90.0,0 -9,Ivy,0,0,0,0,1,Finance,27,F,Bachelors,3.0,10,85.0,0,20,8,0.6,70000.0,5000.0,8.2,4,82.0,0 -10,Jack,0,0,1,0,0,Sales,55,M,High School,30.0,12,68.9,0,5,25,0.8,65000.0,2000.0,5.5,1,50.0,1 diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py index 383e1ae..243bf17 100644 --- a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/missing.py @@ -1,399 +1,399 @@ -# Copyright (c) MariaDB Foundation. -# Distributed under the terms of the Modified BSD License. - -from mariadb_kernel.maria_magics.maria_magic import MariaMagic -import pandas as pd -import shlex -from distutils import util -import logging -import os -import re - -# Optional helper to reliably get current DB name (if available) -try: - from mariadb_kernel.sql_fetch import SqlFetch -except Exception: - SqlFetch = None - - -class Missing(MariaMagic): - """ - %missing [action=show|percent|summary] [columns=col1,col2] - - Examples: - %missing -> shows count+percent of missing for all columns - %missing action=percent -> shows percent only - %missing action=summary -> shows dtype, missing, percent - - This magic also logs execution metadata into a table `magic_metadata` with fields: - id, command_name, arguments, execution_timestamp, affected_columns, - operation_status, message, db_name, user_name - """ - - def __init__(self, args=""): - self.args = args - - def type(self): - return "Line" - - def name(self): - return "missing" - - def help(self): - return ( - "%missing [action=show|percent|summary] 
[columns=col1,col2]\n" - "Display missing-value information from the last query result.\n" - "Execution metadata is recorded in table `magic_metadata`." - ) - - def _str_to_obj(self, s): - """Cast strings to Python objects where possible.""" - try: - return int(s) - except ValueError: - try: - return float(s) - except ValueError: - pass - try: - return bool(util.strtobool(s)) - except ValueError: - return s - - def parse_args(self, input_str): - """Parse key=value arguments.""" - if not input_str or input_str.strip() == "": - return {} - pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) - for k, v in pairs.items(): - pairs[k] = self._str_to_obj(v) - return pairs - - def _send_html(self, kernel, df): - """Display DataFrame as HTML in the notebook.""" - try: - html = df.to_html() - mime = "text/html" - except Exception: - html = str(df) - mime = "text/plain" - - display_content = {"data": {mime: html}, "metadata": {}} - kernel.send_response(kernel.iopub_socket, "display_data", display_content) - - # -------------------- metadata / DB helpers (best-effort) -------------------- - def _get_mariadb_client(self, kernel): - """Return mariadb_client if present on kernel, else None""" - return getattr(kernel, "mariadb_client", None) - - def _get_logger(self, kernel): - """Return a logger on kernel if present, else create a temporary logger""" - return getattr(kernel, "log", logging.getLogger(__name__)) - - def _sql_escape(self, val): - """Escape a value for SQL single-quoted literal insert. None -> NULL""" - if val is None: - return "NULL" - if not isinstance(val, str): - val = str(val) - return "'" + val.replace("'", "''") + "'" - - def _get_db_name(self, kernel): - """ - Attempt to determine the currently used DB. - Prefer SqlFetch if available; otherwise run SELECT DATABASE(); and try to parse. - Returns empty string if none found. 
- """ - mariadb_client = self._get_mariadb_client(kernel) - log = self._get_logger(kernel) - - # Try SqlFetch if available - if SqlFetch is not None and mariadb_client is not None: - try: - sf = SqlFetch(mariadb_client, log) - dbname = sf.get_db_name() - if isinstance(dbname, str): - return dbname - except Exception: - log.debug("SqlFetch available but .get_db_name() failed; falling back.") - - # Fallback: run SELECT DATABASE(); - if mariadb_client is None: - return "" - try: - result = mariadb_client.run_statement("SELECT DATABASE();") - if mariadb_client.iserror(): - return "" - if not result: - return "" - # If result is raw HTML table, try to parse with pandas - try: - df_list = pd.read_html(result) - if df_list and isinstance(df_list, list) and len(df_list) > 0: - val = df_list[0].iloc[0, 0] - if isinstance(val, float) and pd.isna(val): - return "" - return str(val) if val is not None else "" - except Exception: - # if not parseable by pandas, try regex to extract first cell content - m = re.search(r"(.*?)", str(result), flags=re.S | re.I) - if m: - txt = re.sub(r"<.*?>", "", m.group(1)) # strip tags - txt = txt.strip() - if txt.lower() == "null" or txt == "": - return "" - return txt - # If result is plain text (like the DB name) - txt = str(result).strip() - if txt.lower() == "null" or txt == "": - return "" - return txt - except Exception: - return "" - return "" - - def _get_user_name(self, kernel): - """Try several places to find the current user name; fallback to OS login or empty string.""" - candidates = [ - getattr(kernel, "user_name", None), - getattr(kernel, "username", None), - getattr(kernel, "user", None), - getattr(kernel, "session", None), - ] - for cand in candidates: - if cand is None: - continue - if isinstance(cand, str) and cand.strip(): - return cand - try: - maybe = getattr(cand, "user", None) - if isinstance(maybe, str) and maybe.strip(): - return maybe - except Exception: - pass - try: - return os.getlogin() - except Exception: - 
return "" - - def _ensure_metadata_table(self, kernel, db_name): - """ - Create magic_metadata table if it doesn't exist. - Columns: id, command_name, arguments, execution_timestamp, - affected_columns, operation_status, message, db_name, user_name - """ - mariadb_client = self._get_mariadb_client(kernel) - log = self._get_logger(kernel) - - if mariadb_client is None: - # nothing to do - return - - table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" - - create_sql = f""" - CREATE TABLE IF NOT EXISTS {table_full_name} ( - id INT AUTO_INCREMENT PRIMARY KEY, - command_name VARCHAR(255), - arguments TEXT, - execution_timestamp DATETIME, - affected_columns TEXT, - operation_status VARCHAR(50), - message TEXT, - db_name VARCHAR(255), - user_name VARCHAR(255), - rollback_token VARCHAR(255), - backup_table VARCHAR(255), - original_table VARCHAR(255) - ); - """ - try: - mariadb_client.run_statement(create_sql) - if mariadb_client.iserror(): - log.error("Error creating magic_metadata table.") - except Exception as e: - log.error(f"Failed to ensure magic_metadata table: {e}") - - def _insert_metadata(self, kernel, command_name, arguments, affected_columns, - operation_status, message, db_name, user_name): - """ - Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
- """ - mariadb_client = self._get_mariadb_client(kernel) - log = self._get_logger(kernel) - if mariadb_client is None: - return - - table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" - - # Escape values - args_sql = self._sql_escape(arguments) - affected_sql = self._sql_escape(affected_columns) - status_sql = self._sql_escape(operation_status) - message_sql = self._sql_escape(message) - db_sql = self._sql_escape(db_name) - user_sql = self._sql_escape(user_name) - - insert_sql = f""" - INSERT INTO {table_full_name} - (command_name, arguments, execution_timestamp, affected_columns, - operation_status, message, db_name, user_name) - VALUES ( - {self._sql_escape(command_name)}, - {args_sql}, - NOW(), - {affected_sql}, - {status_sql}, - {message_sql}, - {db_sql}, - {user_sql} - ); - """ - try: - mariadb_client.run_statement(insert_sql) - if mariadb_client.iserror(): - log.error("Error inserting into magic_metadata.") - except Exception as e: - log.error(f"Exception while inserting metadata: {e}") - - # -------------------- end metadata helpers -------------------- - - def execute(self, kernel, data): - """Main execution for %missing magic.""" - df = data.get("last_select") - # Prepare metadata context early so we can log failures - db_name = self._get_db_name(kernel) - user_name = self._get_user_name(kernel) - try: - self._ensure_metadata_table(kernel, db_name) - except Exception: - try: - kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") - except Exception: - pass - - if df is None or (hasattr(df, "empty") and df.empty): - msg = "No data available to inspect for missing values." 
- kernel._send_message("stderr", msg) - # log metadata for failure - try: - self._insert_metadata( - kernel=kernel, - command_name=self.name(), - arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns="", - operation_status="error", - message=msg, - db_name=db_name, - user_name=user_name, - ) - except Exception: - pass - return - - try: - args = self.parse_args(self.args) - except Exception: - msg = "Error parsing arguments." - kernel._send_message("stderr", msg) - try: - self._insert_metadata( - kernel=kernel, - command_name=self.name(), - arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns="", - operation_status="error", - message=msg, - db_name=db_name, - user_name=user_name, - ) - except Exception: - pass - return - - action = args.get("action", "show") - cols_arg = args.get("columns", None) - - if isinstance(cols_arg, str): - columns = [c.strip() for c in cols_arg.split(",") if c.strip()] - elif isinstance(cols_arg, (list, tuple)): - columns = list(cols_arg) - else: - columns = None - - try: - subdf = df[columns] if columns else df - except KeyError as e: - msg = f"Column not found: {e}" - kernel._send_message("stderr", msg) - # log metadata for failure - try: - self._insert_metadata( - kernel=kernel, - command_name=self.name(), - arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns="\n".join(columns) if columns else "", - operation_status="error", - message=msg, - db_name=db_name, - user_name=user_name, - ) - except Exception: - pass - return - - # Compute missing information - try: - missing_counts = subdf.isnull().sum() - total = len(subdf) - if total == 0: - percent = pd.Series([0] * len(missing_counts), index=missing_counts.index) - else: - percent = (missing_counts / total * 100).round(2) - - out = pd.DataFrame({"missing": missing_counts, "percent": percent}) - if action == "percent": - out = out[["percent"]] - elif action == "summary": - 
out["dtype"] = subdf.dtypes.astype(str) - out = out[["dtype", "missing", "percent"]] - - # Display results - self._send_html(kernel, out) - - # Prepare metadata success info - affected_columns_str = "\n".join(columns) if columns else "ALL_COLUMNS" - message = f"%missing action={action} examined {len(out)} column(s); total_rows={total}." - operation_status = "success" - - try: - self._insert_metadata( - kernel=kernel, - command_name=self.name(), - arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns=affected_columns_str, - operation_status=operation_status, - message=message, - db_name=db_name, - user_name=user_name, - ) - except Exception: - # do not interrupt normal flow if logging fails - pass - - except Exception as e: - msg = f"Error while computing missing information: {e}" - kernel._send_message("stderr", msg) - try: - self._insert_metadata( - kernel=kernel, - command_name=self.name(), - arguments=self.args if isinstance(self.args, str) else str(self.args), - affected_columns="\n".join(columns) if columns else "ALL_COLUMNS", - operation_status="error", - message=msg, - db_name=db_name, - user_name=user_name, - ) - except Exception: - pass +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. 
+ +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import pandas as pd +import shlex +from distutils import util +import logging +import os +import re + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + + +class Missing(MariaMagic): + """ + %missing [action=show|percent|summary] [columns=col1,col2] + + Examples: + %missing -> shows count+percent of missing for all columns + %missing action=percent -> shows percent only + %missing action=summary -> shows dtype, missing, percent + + This magic also logs execution metadata into a table `magic_metadata` with fields: + id, command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name + """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "missing" + + def help(self): + return ( + "%missing [action=show|percent|summary] [columns=col1,col2]\n" + "Display missing-value information from the last query result.\n" + "Execution metadata is recorded in table `magic_metadata`." 
+ ) + + def _str_to_obj(self, s): + """Cast strings to Python objects where possible.""" + try: + return int(s) + except ValueError: + try: + return float(s) + except ValueError: + pass + try: + return bool(util.strtobool(s)) + except ValueError: + return s + + def parse_args(self, input_str): + """Parse key=value arguments.""" + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df): + """Display DataFrame as HTML in the notebook.""" + try: + html = df.to_html() + mime = "text/html" + except Exception: + html = str(df) + mime = "text/plain" + + display_content = {"data": {mime: html}, "metadata": {}} + kernel.send_response(kernel.iopub_socket, "display_data", display_content) + + # -------------------- metadata / DB helpers (best-effort) -------------------- + def _get_mariadb_client(self, kernel): + """Return mariadb_client if present on kernel, else None""" + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + """Return a logger on kernel if present, else create a temporary logger""" + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + """ + Attempt to determine the currently used DB. + Prefer SqlFetch if available; otherwise run SELECT DATABASE(); and try to parse. + Returns empty string if none found. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + # Fallback: run SELECT DATABASE(); + if mariadb_client is None: + return "" + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror(): + return "" + if not result: + return "" + # If result is raw HTML table, try to parse with pandas + try: + df_list = pd.read_html(result) + if df_list and isinstance(df_list, list) and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + # if not parseable by pandas, try regex to extract first cell content + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)) # strip tags + txt = txt.strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + # If result is plain text (like the DB name) + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + """Try several places to find the current user name; fallback to OS login or empty string.""" + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + 
return "" + + def _ensure_metadata_table(self, kernel, db_name): + """ + Create magic_metadata table if it doesn't exist. + Columns: id, command_name, arguments, execution_timestamp, + affected_columns, operation_status, message, db_name, user_name + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if mariadb_client is None: + # nothing to do + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + """ + Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + # Escape values + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # -------------------- end metadata helpers -------------------- + + def execute(self, kernel, data): + """Main execution for %missing magic.""" + df = data.get("last_select") + # Prepare metadata context early so we can log failures + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + if df is None or (hasattr(df, "empty") and df.empty): + msg = "No data available to inspect for missing values." 
+ kernel._send_message("stderr", msg) + # log metadata for failure + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name, + ) + except Exception: + pass + return + + try: + args = self.parse_args(self.args) + except Exception: + msg = "Error parsing arguments." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name, + ) + except Exception: + pass + return + + action = args.get("action", "show") + cols_arg = args.get("columns", None) + + if isinstance(cols_arg, str): + columns = [c.strip() for c in cols_arg.split(",") if c.strip()] + elif isinstance(cols_arg, (list, tuple)): + columns = list(cols_arg) + else: + columns = None + + try: + subdf = df[columns] if columns else df + except KeyError as e: + msg = f"Column not found: {e}" + kernel._send_message("stderr", msg) + # log metadata for failure + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns) if columns else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name, + ) + except Exception: + pass + return + + # Compute missing information + try: + missing_counts = subdf.isnull().sum() + total = len(subdf) + if total == 0: + percent = pd.Series([0] * len(missing_counts), index=missing_counts.index) + else: + percent = (missing_counts / total * 100).round(2) + + out = pd.DataFrame({"missing": missing_counts, "percent": percent}) + if action == "percent": + out = out[["percent"]] + elif action == "summary": + 
out["dtype"] = subdf.dtypes.astype(str) + out = out[["dtype", "missing", "percent"]] + + # Display results + self._send_html(kernel, out) + + # Prepare metadata success info + affected_columns_str = "\n".join(columns) if columns else "ALL_COLUMNS" + message = f"%missing action={action} examined {len(out)} column(s); total_rows={total}." + operation_status = "success" + + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=affected_columns_str, + operation_status=operation_status, + message=message, + db_name=db_name, + user_name=user_name, + ) + except Exception: + # do not interrupt normal flow if logging fails + pass + + except Exception as e: + msg = f"Error while computing missing information: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns) if columns else "ALL_COLUMNS", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name, + ) + except Exception: + pass diff --git a/mariadb_kernel/maria_magics/supported_magics.py b/mariadb_kernel/maria_magics/supported_magics.py index e2ceb18..69d940c 100644 --- a/mariadb_kernel/maria_magics/supported_magics.py +++ b/mariadb_kernel/maria_magics/supported_magics.py @@ -1,68 +1,68 @@ -""" Maintains a list of magic commands supported by the kernel """ - -# Copyright (c) MariaDB Foundation. -# Distributed under the terms of the Modified BSD License. 
- -from mariadb_kernel.maria_magics.line import Line -from mariadb_kernel.maria_magics.df import DF -from mariadb_kernel.maria_magics.lsmagic import LSMagic -from mariadb_kernel.maria_magics.maria_magic import MariaMagic -from mariadb_kernel.maria_magics.bar import Bar -from mariadb_kernel.maria_magics.pie import Pie -from mariadb_kernel.maria_magics.delimiter import Delimiter -from mariadb_kernel.maria_magics.load import Load -from mariadb_kernel.maria_magics.ml_commands.data_cleaning.missing import Missing -from mariadb_kernel.maria_magics.ml_commands.data_cleaning.dropmissing import DropMissing -from mariadb_kernel.maria_magics.ml_commands.data_cleaning.stats import Stats -from mariadb_kernel.maria_magics.ml_commands.data_cleaning.fillmissing import FillMissing -from mariadb_kernel.maria_magics.ml_commands.data_cleaning.outliers import Outliers -from mariadb_kernel.maria_magics.ml_commands.data_cleaning.dropoutliers import DropOutliers -from mariadb_kernel.maria_magics.ml_commands.data_cleaning.clipoutliers import ClipOutliers -from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.encode import Encode -from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.normalize import Normalize -from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.standardize import Standardize -from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.splitdata import SplitData -from mariadb_kernel.maria_magics.ml_commands.model_training.train_model import TrainModel -from mariadb_kernel.maria_magics.ml_commands.model_training.evaluate_model import EvaluateModel -from mariadb_kernel.maria_magics.ml_commands.model_training.savemodel import SaveModel -from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.select_features import SelectFeatures -from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.select_model import SelectModel -from mariadb_kernel.maria_magics.ml_commands.model_training.loadmodel import LoadModel -from 
mariadb_kernel.maria_magics.ml_commands.model_training.predict import Predict -from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.ml_pipeline import MLPipeline -from mariadb_kernel.maria_magics.rag_commands.maria_ingest import MariaIngest -from mariadb_kernel.maria_magics.rag_commands.maria_search import MariaSearch -from mariadb_kernel.maria_magics.rag_commands.maria_rag_query import MariaRAGQuery - -def get(): - return { - "line": Line, - "bar": Bar, - "pie": Pie, - "df": DF, - "lsmagic": LSMagic, - "delimiter": Delimiter, - "load": Load, - "missing": Missing, - "dropmissing": DropMissing, - "stats": Stats, - "fillmissing": FillMissing, - "outliers": Outliers, - "dropoutliers": DropOutliers, - "clipoutliers": ClipOutliers, - "encode": Encode, - "normalize": Normalize, - "standardize": Standardize, - "splitdata": SplitData, - "train_model": TrainModel, - "evaluate_model": EvaluateModel, - "savemodel": SaveModel, - "loadmodel": LoadModel, - "predict": Predict, - "select_features": SelectFeatures, - "select_model": SelectModel, - "ml_pipeline": MLPipeline, - "maria_ingest": MariaIngest, - "maria_search": MariaSearch, - "maria_rag_query": MariaRAGQuery, - } +""" Maintains a list of magic commands supported by the kernel """ + +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. 
+ +from mariadb_kernel.maria_magics.line import Line +from mariadb_kernel.maria_magics.df import DF +from mariadb_kernel.maria_magics.lsmagic import LSMagic +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +from mariadb_kernel.maria_magics.bar import Bar +from mariadb_kernel.maria_magics.pie import Pie +from mariadb_kernel.maria_magics.delimiter import Delimiter +from mariadb_kernel.maria_magics.load import Load +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.missing import Missing +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.dropmissing import DropMissing +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.stats import Stats +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.fillmissing import FillMissing +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.outliers import Outliers +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.dropoutliers import DropOutliers +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.clipoutliers import ClipOutliers +from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.encode import Encode +from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.normalize import Normalize +from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.standardize import Standardize +from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.splitdata import SplitData +from mariadb_kernel.maria_magics.ml_commands.model_training.train_model import TrainModel +from mariadb_kernel.maria_magics.ml_commands.model_training.evaluate_model import EvaluateModel +from mariadb_kernel.maria_magics.ml_commands.model_training.savemodel import SaveModel +from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.select_features import SelectFeatures +from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.select_model import SelectModel +from mariadb_kernel.maria_magics.ml_commands.model_training.loadmodel import LoadModel +from 
mariadb_kernel.maria_magics.ml_commands.model_training.predict import Predict +from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.ml_pipeline import MLPipeline +from mariadb_kernel.maria_magics.rag_commands.maria_ingest import MariaIngest +from mariadb_kernel.maria_magics.rag_commands.maria_search import MariaSearch +from mariadb_kernel.maria_magics.rag_commands.maria_rag_query import MariaRAGQuery + +def get(): + return { + "line": Line, + "bar": Bar, + "pie": Pie, + "df": DF, + "lsmagic": LSMagic, + "delimiter": Delimiter, + "load": Load, + "missing": Missing, + "dropmissing": DropMissing, + "stats": Stats, + "fillmissing": FillMissing, + "outliers": Outliers, + "dropoutliers": DropOutliers, + "clipoutliers": ClipOutliers, + "encode": Encode, + "normalize": Normalize, + "standardize": Standardize, + "splitdata": SplitData, + "train_model": TrainModel, + "evaluate_model": EvaluateModel, + "savemodel": SaveModel, + "loadmodel": LoadModel, + "predict": Predict, + "select_features": SelectFeatures, + "select_model": SelectModel, + "ml_pipeline": MLPipeline, + "maria_ingest": MariaIngest, + "maria_search": MariaSearch, + "maria_rag_query": MariaRAGQuery, + } From a2b0373cafc74f724e6d41624395324c8dcbee48 Mon Sep 17 00:00:00 2001 From: SiddharthaChakrabarty Date: Sat, 1 Nov 2025 11:31:31 +0000 Subject: [PATCH 38/38] Added requirements.txt --- requirements.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/requirements.txt b/requirements.txt index f95ca0d..1fdba48 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,12 @@ setuptools-scm ipykernel beautifulsoup4 mycli +numpy +joblib +scikit-learn +sentence-transformers>=2.2.0 +google-generativeai>=0.3.0 +xgboost>=1.5.0 +lightgbm>=3.3.0 +catboost>=1.0.0 +scipy \ No newline at end of file