diff --git a/.gitignore b/.gitignore index dda68c2..d0e3462 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,8 @@ mariadb_kernel/_version.py # Ignore vscode editor setting .vscode/ + +.eggs/ +catboost_info/ +mariadb_kernel.egg-info/ +models/ diff --git a/DemoNotebooks/AutomatedMLPipeline.ipynb b/DemoNotebooks/AutomatedMLPipeline.ipynb new file mode 100644 index 0000000..a5075cc --- /dev/null +++ b/DemoNotebooks/AutomatedMLPipeline.ipynb @@ -0,0 +1,2446 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "1b08e2b8-7680-420d-acd7-17a1f522276f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "USE BUGBREW;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "847e1c93-f379-4175-ad00-d6be5f876d00", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
15.13.51.40.2Iris-setosa\n", + "
24.931.40.2Iris-setosa\n", + "
34.73.21.30.2Iris-setosa\n", + "
44.63.11.50.2Iris-setosa\n", + "
553.61.40.2Iris-setosa\n", + "
65.43.91.70.4Iris-setosa\n", + "
74.6NULL1.40.3Iris-setosa\n", + "
853.41.50.2Iris-setosa\n", + "
94.42.91.40.2Iris-setosa\n", + "
104.93.11.50.1Iris-setosa\n", + "
115.43.71.50.2Iris-setosa\n", + "
124.83.41.60.2Iris-setosa\n", + "
134.831.40.1Iris-setosa\n", + "
144.331.10.1Iris-setosa\n", + "
155.841.20.1Iris-setosa\n", + "
165.74.41.50.1Iris-setosa\n", + "
175.4NULL1.30.1Iris-setosa\n", + "
185.13.51.40.1Iris-setosa\n", + "
195.73.81.70.1Iris-setosa\n", + "
205.13.81.50.1Iris-setosa\n", + "
215.43.41.70.1Iris-setosa\n", + "
225.13.71.50.1Iris-setosa\n", + "
234.63.610.1Iris-setosa\n", + "
245.13.31.70.1Iris-setosa\n", + "
254.83.41.90.1Iris-setosa\n", + "
26531.60.1Iris-setosa\n", + "
2753.41.60.1Iris-setosa\n", + "
28NULL3.51.50.1Iris-setosa\n", + "
295.23.41.40.1Iris-setosa\n", + "
304.73.21.60.1Iris-setosa\n", + "
314.83.1NULL0.1Iris-setosa\n", + "
325.43.41.50.1Iris-setosa\n", + "
335.24.11.50.1Iris-setosa\n", + "
345.54.21.40.1Iris-setosa\n", + "
354.93.11.50.1Iris-setosa\n", + "
3653.21.20.1Iris-setosa\n", + "
375.5NULL1.30.1Iris-setosa\n", + "
384.93.11.50.1Iris-setosa\n", + "
394.431.30.1Iris-setosa\n", + "
405.13.41.50.1Iris-setosa\n", + "
4153.51.30.1Iris-setosa\n", + "
424.52.31.30.1Iris-setosa\n", + "
434.43.21.30.1Iris-setosa\n", + "
4453.51.60.1Iris-setosa\n", + "
455.13.81.90.1Iris-setosa\n", + "
464.831.40.1Iris-setosa\n", + "
475.13.81.60.1Iris-setosa\n", + "
484.63.21.40.1Iris-setosa\n", + "
495.33.71.50.1Iris-setosa\n", + "
5053.31.40.1Iris-setosa\n", + "
5173.24.70.1Iris-versicolor\n", + "
526.43.24.50.1Iris-versicolor\n", + "
536.93.14.90.1Iris-versicolor\n", + "
545.52.340.1Iris-versicolor\n", + "
556.52.84.60.1Iris-versicolor\n", + "
565.72.84.50.1Iris-versicolor\n", + "
576.33.34.70.1Iris-versicolor\n", + "
584.92.43.30.1Iris-versicolor\n", + "
596.62.94.60.1Iris-versicolor\n", + "
605.22.73.90.1Iris-versicolor\n", + "
61523.50.1Iris-versicolor\n", + "
625.934.20.1Iris-versicolor\n", + "
6362.240.1Iris-versicolor\n", + "
646.12.94.70.1Iris-versicolor\n", + "
655.62.93.60.1Iris-versicolor\n", + "
666.73.14.40.1Iris-versicolor\n", + "
675.634.50.1Iris-versicolor\n", + "
685.82.74.10.1Iris-versicolor\n", + "
696.22.24.50.1Iris-versicolor\n", + "
705.62.53.90.1Iris-versicolor\n", + "
715.93.24.80.1Iris-versicolor\n", + "
726.12.840.1Iris-versicolor\n", + "
736.32.54.90.1Iris-versicolor\n", + "
746.12.84.70.1Iris-versicolor\n", + "
756.42.94.30.1Iris-versicolor\n", + "
766.634.40.1Iris-versicolor\n", + "
776.82.84.80.1Iris-versicolor\n", + "
786.7350.1Iris-versicolor\n", + "
7962.94.50.1Iris-versicolor\n", + "
805.72.63.50.1Iris-versicolor\n", + "
815.52.43.80.1Iris-versicolor\n", + "
825.52.43.70.1Iris-versicolor\n", + "
835.82.73.90.1Iris-versicolor\n", + "
8462.75.10.1Iris-versicolor\n", + "
855.434.50.1Iris-versicolor\n", + "
8663.44.50.1Iris-versicolor\n", + "
876.73.14.70.1Iris-versicolor\n", + "
886.32.34.40.1Iris-versicolor\n", + "
895.634.10.1Iris-versicolor\n", + "
905.52.540.1Iris-versicolor\n", + "
915.52.64.40.1Iris-versicolor\n", + "
926.134.60.1Iris-versicolor\n", + "
935.82.640.1Iris-versicolor\n", + "
9452.33.30.1Iris-versicolor\n", + "
955.62.74.20.1Iris-versicolor\n", + "
965.734.20.1Iris-versicolor\n", + "
975.72.94.20.1Iris-versicolor\n", + "
986.22.94.30.1Iris-versicolor\n", + "
995.12.530.1Iris-versicolor\n", + "
1005.72.84.10.1Iris-versicolor\n", + "
1016.33.360.1Iris-virginica\n", + "
1025.82.75.10.1Iris-virginica\n", + "
1037.135.90.1Iris-virginica\n", + "
1046.32.95.60.1Iris-virginica\n", + "
1056.535.80.1Iris-virginica\n", + "
1067.636.60.1Iris-virginica\n", + "
1074.92.54.50.1Iris-virginica\n", + "
1087.32.96.30.1Iris-virginica\n", + "
1096.72.55.80.1Iris-virginica\n", + "
1107.23.66.10.1Iris-virginica\n", + "
1116.53.25.10.1Iris-virginica\n", + "
1126.42.75.30.1Iris-virginica\n", + "
1136.835.50.1Iris-virginica\n", + "
1145.72.550.1Iris-virginica\n", + "
1155.82.85.10.1Iris-virginica\n", + "
1166.43.25.30.1Iris-virginica\n", + "
1176.535.50.1Iris-virginica\n", + "
1187.73.86.70.1Iris-virginica\n", + "
1197.72.66.90.1Iris-virginica\n", + "
12062.250.1Iris-virginica\n", + "
1216.93.25.70.1Iris-virginica\n", + "
1225.62.84.90.1Iris-virginica\n", + "
1237.72.86.70.1Iris-virginica\n", + "
1246.32.74.90.1Iris-virginica\n", + "
1256.73.35.70.1Iris-virginica\n", + "
1267.23.260.1Iris-virginica\n", + "
1276.22.84.80.1Iris-virginica\n", + "
1286.134.90.1Iris-virginica\n", + "
1296.42.85.60.1Iris-virginica\n", + "
1307.235.80.1Iris-virginica\n", + "
1317.42.86.10.1Iris-virginica\n", + "
1327.93.86.40.1Iris-virginica\n", + "
1336.42.85.60.1Iris-virginica\n", + "
1346.32.85.10.1Iris-virginica\n", + "
1356.12.65.60.1Iris-virginica\n", + "
1367.736.10.1Iris-virginica\n", + "
1376.33.45.60.1Iris-virginica\n", + "
1386.43.15.50.1Iris-virginica\n", + "
139634.80.1Iris-virginica\n", + "
1406.93.15.40.1Iris-virginica\n", + "
1416.73.15.60.1Iris-virginica\n", + "
1426.93.15.10.1Iris-virginica\n", + "
1435.82.75.10.1Iris-virginica\n", + "
1446.83.25.90.1Iris-virginica\n", + "
1456.73.35.70.1Iris-virginica\n", + "
1466.735.20.1Iris-virginica\n", + "
1476.32.550.1Iris-virginica\n", + "
1486.535.20.1Iris-virginica\n", + "
1496.23.45.40.1Iris-virginica\n", + "
150NULL35.10.1Iris-virginica\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "SELECT * FROM IRIS;" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "6f73b503-62fb-429a-97a8-bbb95ffcb163", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dropped 6 row(s) with missing values (in-place local).\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
15.13.51.40.2Iris-setosa
24.93.01.40.2Iris-setosa
34.73.21.30.2Iris-setosa
44.63.11.50.2Iris-setosa
55.03.61.40.2Iris-setosa
65.43.91.70.4Iris-setosa
85.03.41.50.2Iris-setosa
94.42.91.40.2Iris-setosa
104.93.11.50.1Iris-setosa
115.43.71.50.2Iris-setosa
124.83.41.60.2Iris-setosa
134.83.01.40.1Iris-setosa
144.33.01.10.1Iris-setosa
155.84.01.20.1Iris-setosa
165.74.41.50.1Iris-setosa
185.13.51.40.1Iris-setosa
195.73.81.70.1Iris-setosa
205.13.81.50.1Iris-setosa
215.43.41.70.1Iris-setosa
225.13.71.50.1Iris-setosa
234.63.61.00.1Iris-setosa
245.13.31.70.1Iris-setosa
254.83.41.90.1Iris-setosa
265.03.01.60.1Iris-setosa
275.03.41.60.1Iris-setosa
295.23.41.40.1Iris-setosa
304.73.21.60.1Iris-setosa
325.43.41.50.1Iris-setosa
335.24.11.50.1Iris-setosa
345.54.21.40.1Iris-setosa
354.93.11.50.1Iris-setosa
365.03.21.20.1Iris-setosa
384.93.11.50.1Iris-setosa
394.43.01.30.1Iris-setosa
405.13.41.50.1Iris-setosa
415.03.51.30.1Iris-setosa
424.52.31.30.1Iris-setosa
434.43.21.30.1Iris-setosa
445.03.51.60.1Iris-setosa
455.13.81.90.1Iris-setosa
464.83.01.40.1Iris-setosa
475.13.81.60.1Iris-setosa
484.63.21.40.1Iris-setosa
495.33.71.50.1Iris-setosa
505.03.31.40.1Iris-setosa
517.03.24.70.1Iris-versicolor
526.43.24.50.1Iris-versicolor
536.93.14.90.1Iris-versicolor
545.52.34.00.1Iris-versicolor
556.52.84.60.1Iris-versicolor
565.72.84.50.1Iris-versicolor
576.33.34.70.1Iris-versicolor
584.92.43.30.1Iris-versicolor
596.62.94.60.1Iris-versicolor
605.22.73.90.1Iris-versicolor
615.02.03.50.1Iris-versicolor
625.93.04.20.1Iris-versicolor
636.02.24.00.1Iris-versicolor
646.12.94.70.1Iris-versicolor
655.62.93.60.1Iris-versicolor
666.73.14.40.1Iris-versicolor
675.63.04.50.1Iris-versicolor
685.82.74.10.1Iris-versicolor
696.22.24.50.1Iris-versicolor
705.62.53.90.1Iris-versicolor
715.93.24.80.1Iris-versicolor
726.12.84.00.1Iris-versicolor
736.32.54.90.1Iris-versicolor
746.12.84.70.1Iris-versicolor
756.42.94.30.1Iris-versicolor
766.63.04.40.1Iris-versicolor
776.82.84.80.1Iris-versicolor
786.73.05.00.1Iris-versicolor
796.02.94.50.1Iris-versicolor
805.72.63.50.1Iris-versicolor
815.52.43.80.1Iris-versicolor
825.52.43.70.1Iris-versicolor
835.82.73.90.1Iris-versicolor
846.02.75.10.1Iris-versicolor
855.43.04.50.1Iris-versicolor
866.03.44.50.1Iris-versicolor
876.73.14.70.1Iris-versicolor
886.32.34.40.1Iris-versicolor
895.63.04.10.1Iris-versicolor
905.52.54.00.1Iris-versicolor
915.52.64.40.1Iris-versicolor
926.13.04.60.1Iris-versicolor
935.82.64.00.1Iris-versicolor
945.02.33.30.1Iris-versicolor
955.62.74.20.1Iris-versicolor
965.73.04.20.1Iris-versicolor
975.72.94.20.1Iris-versicolor
986.22.94.30.1Iris-versicolor
995.12.53.00.1Iris-versicolor
1005.72.84.10.1Iris-versicolor
1016.33.36.00.1Iris-virginica
1025.82.75.10.1Iris-virginica
1037.13.05.90.1Iris-virginica
1046.32.95.60.1Iris-virginica
1056.53.05.80.1Iris-virginica
1067.63.06.60.1Iris-virginica
1074.92.54.50.1Iris-virginica
1087.32.96.30.1Iris-virginica
1096.72.55.80.1Iris-virginica
1107.23.66.10.1Iris-virginica
1116.53.25.10.1Iris-virginica
1126.42.75.30.1Iris-virginica
1136.83.05.50.1Iris-virginica
1145.72.55.00.1Iris-virginica
1155.82.85.10.1Iris-virginica
1166.43.25.30.1Iris-virginica
1176.53.05.50.1Iris-virginica
1187.73.86.70.1Iris-virginica
1197.72.66.90.1Iris-virginica
1206.02.25.00.1Iris-virginica
1216.93.25.70.1Iris-virginica
1225.62.84.90.1Iris-virginica
1237.72.86.70.1Iris-virginica
1246.32.74.90.1Iris-virginica
1256.73.35.70.1Iris-virginica
1267.23.26.00.1Iris-virginica
1276.22.84.80.1Iris-virginica
1286.13.04.90.1Iris-virginica
1296.42.85.60.1Iris-virginica
1307.23.05.80.1Iris-virginica
1317.42.86.10.1Iris-virginica
1327.93.86.40.1Iris-virginica
1336.42.85.60.1Iris-virginica
1346.32.85.10.1Iris-virginica
1356.12.65.60.1Iris-virginica
1367.73.06.10.1Iris-virginica
1376.33.45.60.1Iris-virginica
1386.43.15.50.1Iris-virginica
1396.03.04.80.1Iris-virginica
1406.93.15.40.1Iris-virginica
1416.73.15.60.1Iris-virginica
1426.93.15.10.1Iris-virginica
1435.82.75.10.1Iris-virginica
1446.83.25.90.1Iris-virginica
1456.73.35.70.1Iris-virginica
1466.73.05.20.1Iris-virginica
1476.32.55.00.1Iris-virginica
1486.53.05.20.1Iris-virginica
1496.23.45.40.1Iris-virginica
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Feature Selection Results (method=anova)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FeatureScore
PetalLengthCm1065.303728
Id543.481503
SepalLengthCm113.483483
SepalWidthCm42.504709
PetalWidthCm10.479730
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected 5 features saved to data['selected_features']: PetalLengthCm, Id, SepalLengthCm, SepalWidthCm, PetalWidthCm\n", + "PREVIEW (local):\n", + "Local: Column 'PetalLengthCm': mean=3.8305555555555557, std=1.7358666494553543\n", + "Local: Column 'Id': mean=76.77083333333333, std=42.624119207256875\n", + "Local: Column 'SepalLengthCm': mean=5.86875, std=0.8279382651099859\n", + "Local: Column 'SepalWidthCm': mean=3.0395833333333333, std=0.43095803321076276\n", + "Local: Column 'PetalWidthCm': mean=0.10833333333333332, std=0.03435921354681384\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PetalLengthCmIdSepalLengthCmSepalWidthCmPetalWidthCmPetalLengthCm_std_previewId_std_previewSepalLengthCm_std_previewSepalWidthCm_std_previewPetalWidthCm_std_preview
1.415.13.50.2-1.400197-1.777652-0.9285111.0683562.667892
1.424.93.00.2-1.400197-1.754191-1.170075-0.0918502.667892
1.334.73.20.2-1.457805-1.730730-1.4116390.3722332.667892
1.544.63.10.2-1.342589-1.707269-1.5324210.1401922.667892
1.455.03.60.2-1.400197-1.683808-1.0492931.3003972.667892
1.765.43.90.4-1.227373-1.660347-0.5661651.9965218.488747
1.585.03.40.2-1.342589-1.613425-1.0492930.8363152.667892
1.494.42.90.2-1.400197-1.589964-1.773985-0.3238912.667892
1.5104.93.10.1-1.342589-1.566504-1.1700750.140192-0.242536
1.5115.43.70.2-1.342589-1.543043-0.5661651.5324382.667892
1.6124.83.40.2-1.284981-1.519582-1.2908570.8363152.667892
1.4134.83.00.1-1.400197-1.496121-1.290857-0.091850-0.242536
1.1144.33.00.1-1.573021-1.472660-1.894767-0.091850-0.242536
1.2155.84.00.1-1.515413-1.449199-0.0830382.228562-0.242536
1.5165.74.40.1-1.342589-1.425738-0.2038203.156727-0.242536
1.4185.13.50.1-1.400197-1.378816-0.9285111.068356-0.242536
1.7195.73.80.1-1.227373-1.355355-0.2038201.764480-0.242536
1.5205.13.80.1-1.342589-1.331895-0.9285111.764480-0.242536
1.7215.43.40.1-1.227373-1.308434-0.5661650.836315-0.242536
1.5225.13.70.1-1.342589-1.284973-0.9285111.532438-0.242536
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split completed: total=144, train=115, test=29, val=0.\n" + ] + }, + { + "data": { + "text/html": [ + "

Train (115 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
1396.03.04.80.1Iris-virginica
965.73.04.20.1Iris-versicolor
1386.43.15.50.1Iris-virginica
1074.92.54.50.1Iris-virginica
756.42.94.30.1Iris-versicolor
1267.23.26.00.1Iris-virginica
325.43.41.50.1Iris-setosa
445.03.51.60.1Iris-setosa
886.32.34.40.1Iris-versicolor
625.93.04.20.1Iris-versicolor
1197.72.66.90.1Iris-virginica
55.03.61.40.2Iris-setosa
605.22.73.90.1Iris-versicolor
124.83.41.60.2Iris-setosa
726.12.84.00.1Iris-versicolor
1225.62.84.90.1Iris-virginica
15.13.51.40.2Iris-setosa
24.93.01.40.2Iris-setosa
646.12.94.70.1Iris-versicolor
846.02.75.10.1Iris-versicolor
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Test (29 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
265.03.01.60.1Iris-setosa
926.13.04.60.1Iris-versicolor
405.13.41.50.1Iris-setosa
495.33.71.50.1Iris-setosa
545.52.34.00.1Iris-versicolor
1037.13.05.90.1Iris-virginica
1367.73.06.10.1Iris-virginica
275.03.41.60.1Iris-setosa
895.63.04.10.1Iris-versicolor
484.63.21.40.1Iris-setosa
805.72.63.50.1Iris-versicolor
1237.72.86.70.1Iris-virginica
1206.02.25.00.1Iris-virginica
115.43.71.50.2Iris-setosa
675.63.04.50.1Iris-versicolor
1435.82.75.10.1Iris-virginica
1346.32.85.10.1Iris-virginica
1426.93.15.10.1Iris-virginica
705.62.53.90.1Iris-versicolor
505.03.31.40.1Iris-setosa
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Model Selection Results (primary_metric=accuracy)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Modelaccuracy_Meanaccuracy_Stdf1_Meanf1_Stdprecision_Meanprecision_Stdrecall_Meanrecall_Std
rf0.99130.01740.98150.03700.98520.01810.99050.0190
logistic0.98260.03480.98150.03700.98670.02670.98100.0381
svm0.98260.02130.98260.02140.98520.01810.98210.0220
knn0.98260.03480.98150.03700.98670.02670.98100.0381
gbm0.98260.03480.98150.03700.98670.02670.98100.0381
ada0.98260.03480.98150.03700.98670.02670.98100.0381
catboost0.98260.03480.98150.03700.98670.02670.98100.0381
lightgbm0.98260.03480.98150.03700.98670.02670.98100.0381
xgboost0.97390.03480.97260.03690.97830.02770.97260.0376
mlp0.77390.28230.56470.36210.60880.34380.71190.2479
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best model 'rf' (mean accuracy=0.9913) saved to data['last_model'].\n", + "[MLPipeline] Automatically selected best model via SelectModel.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + "
\n", + "

Metrics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Accuracy1.0000
Precision (w)1.0000
Recall (w)1.0000
F1 (w)1.0000
ROC AUC1.0000
\n", + "
\n", + "
\"confusion
\n", + "

Classification report

\n", + "
                 precision    recall  f1-score   support\n",
+       "\n",
+       "    Iris-setosa       1.00      1.00      1.00         9\n",
+       "Iris-versicolor       1.00      1.00      1.00        10\n",
+       " Iris-virginica       1.00      1.00      1.00        10\n",
+       "\n",
+       "       accuracy                           1.00        29\n",
+       "      macro avg       1.00      1.00      1.00        29\n",
+       "   weighted avg       1.00      1.00      1.00        29\n",
+       "
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Predictions preview (actual vs predicted)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Species_predicted_pred_proba
Iris-setosaIris-setosa[0.99, 0.01, 0.0]
Iris-versicolorIris-versicolor[0.0, 1.0, 0.0]
Iris-setosaIris-setosa[1.0, 0.0, 0.0]
Iris-setosaIris-setosa[0.91, 0.09, 0.0]
Iris-versicolorIris-versicolor[0.01, 0.99, 0.0]
Iris-virginicaIris-virginica[0.0, 0.0, 1.0]
Iris-virginicaIris-virginica[0.0, 0.0, 1.0]
Iris-setosaIris-setosa[1.0, 0.0, 0.0]
Iris-versicolorIris-versicolor[0.0, 1.0, 0.0]
Iris-setosaIris-setosa[0.98, 0.01, 0.01]
Iris-versicolorIris-versicolor[0.0, 1.0, 0.0]
Iris-virginicaIris-virginica[0.0, 0.0, 1.0]
Iris-virginicaIris-virginica[0.0, 0.08, 0.92]
Iris-setosaIris-setosa[1.0, 0.0, 0.0]
Iris-versicolorIris-versicolor[0.0, 1.0, 0.0]
Iris-virginicaIris-virginica[0.0, 0.02, 0.98]
Iris-virginicaIris-virginica[0.0, 0.0, 1.0]
Iris-virginicaIris-virginica[0.0, 0.02, 0.98]
Iris-versicolorIris-versicolor[0.0, 0.99, 0.01]
Iris-setosaIris-setosa[0.73, 0.27, 0.0]
Iris-setosaIris-setosa[0.99, 0.01, 0.0]
Iris-versicolorIris-versicolor[0.0, 0.92, 0.08]
Iris-virginicaIris-virginica[0.0, 0.0, 1.0]
Iris-setosaIris-setosa[1.0, 0.0, 0.0]
Iris-versicolorIris-versicolor[0.0, 0.99, 0.01]
Iris-versicolorIris-versicolor[0.0, 1.0, 0.0]
Iris-virginicaIris-virginica[0.0, 0.0, 1.0]
Iris-virginicaIris-virginica[0.0, 0.01, 0.99]
Iris-versicolorIris-versicolor[0.0, 1.0, 0.0]
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model from data['last_model'] saved to ./auto_models/model.joblib\n", + "[MLPipeline] Model saved to ./auto_models/model.joblib.\n", + "[MLPipeline] ML pipeline completed successfully.\n" + ] + } + ], + "source": [ + "%ml_pipeline target=Species problem=classification save_path='./auto_models/model.joblib'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75dc8ab9-f1ad-477f-a414-9380607df20a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MariaDB", + "language": "SQL", + "name": "mariadb_kernel" + }, + "language_info": { + "file_extension": ".sql", + "mimetype": "text/plain", + "name": "SQL" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/DemoNotebooks/Iris.csv b/DemoNotebooks/Iris.csv new file mode 100644 index 0000000..15e8a8f --- /dev/null +++ b/DemoNotebooks/Iris.csv @@ -0,0 +1,151 @@ +Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species +1,5.1,3.5,1.4,0.2,Iris-setosa +2,4.9,3,1.4,0.2,Iris-setosa +3,4.7,3.2,1.3,0.2,Iris-setosa +4,4.6,3.1,1.5,0.2,Iris-setosa +5,5,3.6,1.4,0.2,Iris-setosa +6,5.4,3.9,1.7,0.4,Iris-setosa +7,4.6,NULL,1.4,0.3,Iris-setosa +8,5,3.4,1.5,0.2,Iris-setosa +9,4.4,2.9,1.4,0.2,Iris-setosa +10,4.9,3.1,1.5,0.1,Iris-setosa +11,5.4,3.7,1.5,0.2,Iris-setosa +12,4.8,3.4,1.6,0.2,Iris-setosa +13,4.8,3,1.4,0.1,Iris-setosa +14,4.3,3,1.1,0.1, +15,5.8,4,1.2,0.1,Iris-setosa +16,5.7,4.4,1.5,0.1,Iris-setosa +17,5.4,NULL,1.3,0.1,Iris-setosa +18,5.1,3.5,1.4,0.1,Iris-setosa +19,5.7,3.8,1.7,0.1,Iris-setosa +20,5.1,3.8,1.5,0.1,Iris-setosa +21,5.4,3.4,1.7,0.1,Iris-setosa +22,5.1,3.7,1.5,0.1,Iris-setosa +23,4.6,3.6,1,0.1,Iris-setosa +24,5.1,3.3,1.7,0.1,Iris-setosa +25,4.8,3.4,1.9,0.1,Iris-setosa +26,5,3,1.6,0.1,Iris-setosa +27,5,3.4,1.6,0.1,Iris-setosa +28,NULL,3.5,1.5,0.1,Iris-setosa +29,5.2,3.4,1.4,0.1,Iris-setosa 
+30,4.7,3.2,1.6,0.1,Iris-setosa +31,4.8,3.1,NULL,0.1,Iris-setosa +32,5.4,3.4,1.5,0.1,Iris-setosa +33,5.2,4.1,1.5,0.1,Iris-setosa +34,5.5,4.2,1.4,0.1,Iris-setosa +35,4.9,3.1,1.5,0.1,Iris-setosa +36,5,3.2,1.2,0.1,Iris-setosa +37,5.5,NULL,1.3,0.1,Iris-setosa +38,4.9,3.1,1.5,0.1,Iris-setosa +39,4.4,3,1.3,0.1,Iris-setosa +40,5.1,3.4,1.5,0.1,Iris-setosa +41,5,3.5,1.3,0.1,Iris-setosa +42,4.5,2.3,1.3,0.1,Iris-setosa +43,4.4,3.2,1.3,0.1,Iris-setosa +44,5,3.5,1.6,0.1,Iris-setosa +45,5.1,3.8,1.9,0.1,Iris-setosa +46,4.8,3,1.4,0.1,Iris-setosa +47,5.1,3.8,1.6,0.1,Iris-setosa +48,4.6,3.2,1.4,0.1,Iris-setosa +49,5.3,3.7,1.5,0.1,Iris-setosa +50,5,3.3,1.4,0.1,Iris-setosa +51,7,3.2,4.7,0.1,Iris-versicolor +52,6.4,3.2,4.5,0.1,Iris-versicolor +53,6.9,3.1,4.9,0.1,Iris-versicolor +54,5.5,2.3,4,0.1,Iris-versicolor +55,6.5,2.8,4.6,0.1,Iris-versicolor +56,5.7,2.8,4.5,0.1,Iris-versicolor +57,6.3,3.3,4.7,0.1,Iris-versicolor +58,4.9,2.4,3.3,0.1,Iris-versicolor +59,6.6,2.9,4.6,0.1,Iris-versicolor +60,5.2,2.7,3.9,0.1,Iris-versicolor +61,5,2,3.5,0.1,Iris-versicolor +62,5.9,3,4.2,0.1,Iris-versicolor +63,6,2.2,4,0.1,Iris-versicolor +64,6.1,2.9,4.7,0.1,Iris-versicolor +65,5.6,2.9,3.6,0.1,Iris-versicolor +66,6.7,3.1,4.4,0.1,Iris-versicolor +67,5.6,3,4.5,0.1,Iris-versicolor +68,5.8,2.7,4.1,0.1,Iris-versicolor +69,6.2,2.2,4.5,0.1,Iris-versicolor +70,5.6,2.5,3.9,0.1,Iris-versicolor +71,5.9,3.2,4.8,0.1,Iris-versicolor +72,6.1,2.8,4,0.1,Iris-versicolor +73,6.3,2.5,4.9,0.1,Iris-versicolor +74,6.1,2.8,4.7,0.1,Iris-versicolor +75,6.4,2.9,4.3,0.1,Iris-versicolor +76,6.6,3,4.4,0.1,Iris-versicolor +77,6.8,2.8,4.8,0.1,Iris-versicolor +78,6.7,3,5,0.1,Iris-versicolor +79,6,2.9,4.5,0.1,Iris-versicolor +80,5.7,2.6,3.5,0.1,Iris-versicolor +81,5.5,2.4,3.8,0.1,Iris-versicolor +82,5.5,2.4,3.7,0.1,Iris-versicolor +83,5.8,2.7,3.9,0.1,Iris-versicolor +84,6,2.7,5.1,0.1,Iris-versicolor +85,5.4,3,4.5,0.1,Iris-versicolor +86,6,3.4,4.5,0.1,Iris-versicolor +87,6.7,3.1,4.7,0.1,Iris-versicolor +88,6.3,2.3,4.4,0.1,Iris-versicolor 
+89,5.6,3,4.1,0.1,Iris-versicolor +90,5.5,2.5,4,0.1,Iris-versicolor +91,5.5,2.6,4.4,0.1,Iris-versicolor +92,6.1,3,4.6,0.1,Iris-versicolor +93,5.8,2.6,4,0.1,Iris-versicolor +94,5,2.3,3.3,0.1,Iris-versicolor +95,5.6,2.7,4.2,0.1,Iris-versicolor +96,5.7,3,4.2,0.1,Iris-versicolor +97,5.7,2.9,4.2,0.1,Iris-versicolor +98,6.2,2.9,4.3,0.1,Iris-versicolor +99,5.1,2.5,3,0.1,Iris-versicolor +100,5.7,2.8,4.1,0.1,Iris-versicolor +101,6.3,3.3,6,0.1,Iris-virginica +102,5.8,2.7,5.1,0.1,Iris-virginica +103,7.1,3,5.9,0.1,Iris-virginica +104,6.3,2.9,5.6,0.1,Iris-virginica +105,6.5,3,5.8,0.1,Iris-virginica +106,7.6,3,6.6,0.1,Iris-virginica +107,4.9,2.5,4.5,0.1,Iris-virginica +108,7.3,2.9,6.3,0.1,Iris-virginica +109,6.7,2.5,5.8,0.1,Iris-virginica +110,7.2,3.6,6.1,0.1,Iris-virginica +111,6.5,3.2,5.1,0.1,Iris-virginica +112,6.4,2.7,5.3,0.1,Iris-virginica +113,6.8,3,5.5,0.1,Iris-virginica +114,5.7,2.5,5,0.1,Iris-virginica +115,5.8,2.8,5.1,0.1,Iris-virginica +116,6.4,3.2,5.3,0.1,Iris-virginica +117,6.5,3,5.5,0.1,Iris-virginica +118,7.7,3.8,6.7,0.1,Iris-virginica +119,7.7,2.6,6.9,0.1,Iris-virginica +120,6,2.2,5,0.1,Iris-virginica +121,6.9,3.2,5.7,0.1,Iris-virginica +122,5.6,2.8,4.9,0.1,Iris-virginica +123,7.7,2.8,6.7,0.1,Iris-virginica +124,6.3,2.7,4.9,0.1,Iris-virginica +125,6.7,3.3,5.7,0.1,Iris-virginica +126,7.2,3.2,6,0.1,Iris-virginica +127,6.2,2.8,4.8,0.1,Iris-virginica +128,6.1,3,4.9,0.1,Iris-virginica +129,6.4,2.8,5.6,0.1,Iris-virginica +130,7.2,3,5.8,0.1,Iris-virginica +131,7.4,2.8,6.1,0.1,Iris-virginica +132,7.9,3.8,6.4,0.1,Iris-virginica +133,6.4,2.8,5.6,0.1,Iris-virginica +134,6.3,2.8,5.1,0.1,Iris-virginica +135,6.1,2.6,5.6,0.1,Iris-virginica +136,7.7,3,6.1,0.1,Iris-virginica +137,6.3,3.4,5.6,0.1,Iris-virginica +138,6.4,3.1,5.5,0.1,Iris-virginica +139,6,3,4.8,0.1,Iris-virginica +140,6.9,3.1,5.4,0.1,Iris-virginica +141,6.7,3.1,5.6,0.1,Iris-virginica +142,6.9,3.1,5.1,0.1,Iris-virginica +143,5.8,2.7,5.1,0.1,Iris-virginica +144,6.8,3.2,5.9,0.1,Iris-virginica 
+145,6.7,3.3,5.7,0.1,Iris-virginica +146,6.7,3,5.2,0.1,Iris-virginica +147,6.3,2.5,5,0.1,Iris-virginica +148,6.5,3,5.2,0.1,Iris-virginica +149,6.2,3.4,5.4,0.1, +150,NULL,3,5.1,0.1,Iris-virginica diff --git a/DemoNotebooks/RAG.ipynb b/DemoNotebooks/RAG.ipynb new file mode 100644 index 0000000..1aa70fd --- /dev/null +++ b/DemoNotebooks/RAG.ipynb @@ -0,0 +1,1292 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "3603dec6-c3a3-404a-b965-293237c73e1a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "USE BUGBREW;" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "87ac5ab4-54ae-4e34-82b4-973134b7a60f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using file content from ./test.docx (len=28508)\n", + "\n", + "Using database: BUGBREW\n", + "\n", + "Ingest complete.\n", + " documents=1\n", + " chunks_total=40\n", + " embeddings_written=40\n", + " Server version: 11.8.3-MariaDB-ubu2404\n", + "\n" + ] + } + ], + "source": [ + "%maria_ingest doc_id=search_test_doc title=\"Hybrid Search Test\" text_file=\"./test.docx\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0730da59-cc1c-4fe6-af9d-07a6db9835bf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
iddoc_idtitlecontentmetadatacreated_at
1search_test_docHybrid Search TestOur store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. 
\n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. 
\n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. 
\n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. 
\n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. 
\n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. 
\n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. 
\n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. 
\n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. 
\n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. 
\n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. 
\n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. 
\n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. 
\n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. 
\n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. 
\n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. 
\n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. 
\n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.{}2025-10-31 20:17:53
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "select * from documents;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34038236-6e67-4f6a-a779-d5d635a7c781", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
iddoc_idchunk_indexchunk_textchunk_metacreated_at
1search_test_doc0Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:53
2search_test_doc1LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:53
3search_test_doc2nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:54
4search_test_doc3carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY{}2025-10-31 20:17:54
5search_test_doc4nal value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:54
6search_test_doc5LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:54
7search_test_doc6nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:54
8search_test_doc7carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY{}2025-10-31 20:17:54
9search_test_doc8nal value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:54
10search_test_doc9LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:54
11search_test_doc10nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:55
12search_test_doc11carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY{}2025-10-31 20:17:55
13search_test_doc12nal value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:55
14search_test_doc13LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:55
15search_test_doc14nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:55
16search_test_doc15carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY{}2025-10-31 20:17:55
17search_test_doc16nal value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:55
18search_test_doc17LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:55
19search_test_doc18nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:55
20search_test_doc19carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY{}2025-10-31 20:17:56
21search_test_doc20nal value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:56
22search_test_doc21LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:56
23search_test_doc22nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:56
24search_test_doc23carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY{}2025-10-31 20:17:56
25search_test_doc24nal value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:56
26search_test_doc25LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:56
27search_test_doc26nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:56
28search_test_doc27carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY{}2025-10-31 20:17:56
29search_test_doc28nal value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:57
30search_test_doc29LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:57
31search_test_doc30nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:57
32search_test_doc31carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY{}2025-10-31 20:17:57
33search_test_doc32nal value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:57
34search_test_doc33LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:57
35search_test_doc34nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:57
36search_test_doc35carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.\n", + "\n", + "Our store strives to deliver exceptional value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY{}2025-10-31 20:17:57
37search_test_doc36nal value and transparency across every stage of the customer experience.\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase.\n", + "Refunds are processed within 5–7 business days once the returned item is inspected. \n", + "Digital products, such as downloadable content or gift cards, are non-refundable.\n", + "\n", + "EXCHANGE POLICY\n", + "We offer one free exchange per order for issues such as size or color mismatch. \n", + "To start an exchange, customers should visit the Returns Center on our website and provide their order ID. \n", + "Replacement items are shipped as soon as the returned package is scanned by the courier.\n", + "\n", + "SHIPPING AND DELIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States.{}2025-10-31 20:17:58
38search_test_doc37LIVERY\n", + "Free standard shipping applies to all orders over $75 within the continental United States. \n", + "International shipping rates vary depending on region and weight.\n", + "Express delivery options are available at an additional cost.\n", + "Customers will receive a tracking number once the order has been dispatched.\n", + "\n", + "PAYMENT METHODS\n", + "We accept major credit cards, PayPal, and Apple Pay. \n", + "For corporate purchases, wire transfers are supported upon request. \n", + "All transactions are encrypted using industry-standard SSL technology.\n", + "\n", + "WARRANTY INFORMATION\n", + "All electronics include a one-year limited warranty covering manufacturing defects. \n", + "To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT{}2025-10-31 20:17:58
39search_test_doc38nd proof of purchase. \n", + "Warranty claims do not cover accidental damage or misuse.\n", + "\n", + "TECHNICAL SUPPORT\n", + "Our helpdesk operates 24/7 via email and live chat.\n", + "Common troubleshooting topics include product setup, firmware updates, and connectivity issues. \n", + "We also maintain a searchable online knowledge base for common problems.\n", + "\n", + "DATA PRIVACY AND SECURITY\n", + "We are fully compliant with GDPR and CCPA regulations. \n", + "Customer data is never sold to third parties and is used strictly for order processing and service improvement. \n", + "Users can request data deletion or export at any time by emailing privacy@ourstore.com.\n", + "\n", + "SUSTAINABILITY COMMITMENTS\n", + "We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use.{}2025-10-31 20:17:58
40search_test_doc39carbon-neutral shipping. \n", + "Customers may opt for “eco-packaging” at checkout to reduce plastic use. \n", + "We also partner with certified recyclers to properly dispose of returned electronic devices.\n", + "\n", + "LOYALTY PROGRAM\n", + "Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases.\n", + "Exclusive perks include early access to sales, extended return windows, and birthday rewards.\n", + "\n", + "CUSTOMER FEEDBACK\n", + "We value user feedback and continuously improve based on reviews. \n", + "Suggestions can be submitted through the feedback form located at the bottom of our website.\n", + "\n", + "Thank you for shopping with us and supporting sustainable retail practices.{}2025-10-31 20:17:58
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "select * from chunks;" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "3aec3bdd-c6f3-4ad4-90bb-bab2b445bdaf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] running hybrid search for query (len=19): how to get a refund\n", + "\n", + "chunk_id\tchunk_text...\tscore\tvec_sim\tbm25\tdoc_id\n", + "1\tOur store strives to deliver exceptional value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving th...\t0.819408\t0.484022\t0.209054\tsearch_test_doc\n", + "5\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "9\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "13\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "17\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "21\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. 
Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "25\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "29\tnal value and transparency across every stage of the customer experience. REFUND AND RETURN POLICY Customers may request a refund within 30 days of receiving their item. Products must be unused, in...\t0.799462\t0.427034\t0.209054\tsearch_test_doc\n", + "\n" + ] + } + ], + "source": [ + "%maria_search query=\"how to get a refund\"" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "77e04582-6d81-4717-bb59-ea170f65d5aa", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[debug] RAG query received (len=26): How do I request a refund?\n", + "\n", + "\n", + "=== ANSWER ===\n", + "\n", + "REFUND AND RETURN POLICY\n", + "Customers may request a refund within 30 days of receiving their item.\n", + "\n", + "\n" + ] + } + ], + "source": [ + "%maria_rag_query query=\"How do I request a refund?\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cae763f-935f-43ab-8e99-554f382bb5b6", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MariaDB", + "language": "SQL", + "name": "mariadb_kernel" + }, + "language_info": { + "file_extension": ".sql", + "mimetype": "text/plain", + "name": "SQL" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/DemoNotebooks/RawMLPipeline.ipynb b/DemoNotebooks/RawMLPipeline.ipynb new file mode 100644 index 0000000..8942cb2 --- /dev/null +++ b/DemoNotebooks/RawMLPipeline.ipynb @@ -0,0 +1,8020 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "e5af9aaa-128d-46cc-a84f-9000580b203b", + "metadata": {}, + "outputs": [], + 
"source": [ + "SHOW DATABASES;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0308f6d-a504-45b4-8f5b-a67181615fff", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "CREATE DATABASE BUGBREW;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e276e32-8767-452f-bbbb-0ddbbafc7fb0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "USE BUGBREW;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b8c203d-36f1-4ae2-a614-8c5e62ab29ef", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "DROP table IRIS;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40898b54-436b-4556-96fa-f3dbffa3fa17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "CREATE TABLE IRIS(\n", + " Id INT PRIMARY KEY,\n", + " SepalLengthCm DOUBLE,\n", + " SepalWidthCm DOUBLE,\n", + " PetalLengthCm DOUBLE,\n", + " PetalWidthCm DOUBLE,\n", + " Species VARCHAR(50)\n", + ");\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42e1bb47-ea97-4b57-9c4a-9d20a6ee5032", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query OK" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "LOAD DATA LOCAL INFILE '/home/iddhartha/mariadb_kernel/Iris.csv'\n", + "INTO TABLE IRIS\n", + "FIELDS TERMINATED BY ','\n", + "OPTIONALLY ENCLOSED BY '\"'\n", + "LINES TERMINATED BY '\\n'\n", + "IGNORE 1 LINES\n", + "(Id, SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm, Species);" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "148a24eb-b93b-43dd-9d73-c9414f5d4807", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
15.13.51.40.2Iris-setosa\n", + "
24.931.40.2Iris-setosa\n", + "
34.73.21.30.2Iris-setosa\n", + "
44.63.11.50.2Iris-setosa\n", + "
553.61.40.2Iris-setosa\n", + "
65.43.91.70.4Iris-setosa\n", + "
74.6NULL1.40.3Iris-setosa\n", + "
853.41.50.2Iris-setosa\n", + "
94.42.91.40.2Iris-setosa\n", + "
104.93.11.50.1Iris-setosa\n", + "
115.43.71.50.2Iris-setosa\n", + "
124.83.41.60.2Iris-setosa\n", + "
134.831.40.1Iris-setosa\n", + "
144.331.10.1Iris-setosa\n", + "
155.841.20.1Iris-setosa\n", + "
165.74.41.50.1Iris-setosa\n", + "
175.4NULL1.30.1Iris-setosa\n", + "
185.13.51.40.1Iris-setosa\n", + "
195.73.81.70.1Iris-setosa\n", + "
205.13.81.50.1Iris-setosa\n", + "
215.43.41.70.1Iris-setosa\n", + "
225.13.71.50.1Iris-setosa\n", + "
234.63.610.1Iris-setosa\n", + "
245.13.31.70.1Iris-setosa\n", + "
254.83.41.90.1Iris-setosa\n", + "
26531.60.1Iris-setosa\n", + "
2753.41.60.1Iris-setosa\n", + "
28NULL3.51.50.1Iris-setosa\n", + "
295.23.41.40.1Iris-setosa\n", + "
304.73.21.60.1Iris-setosa\n", + "
314.83.1NULL0.1Iris-setosa\n", + "
325.43.41.50.1Iris-setosa\n", + "
335.24.11.50.1Iris-setosa\n", + "
345.54.21.40.1Iris-setosa\n", + "
354.93.11.50.1Iris-setosa\n", + "
3653.21.20.1Iris-setosa\n", + "
375.5NULL1.30.1Iris-setosa\n", + "
384.93.11.50.1Iris-setosa\n", + "
394.431.30.1Iris-setosa\n", + "
405.13.41.50.1Iris-setosa\n", + "
4153.51.30.1Iris-setosa\n", + "
424.52.31.30.1Iris-setosa\n", + "
434.43.21.30.1Iris-setosa\n", + "
4453.51.60.1Iris-setosa\n", + "
455.13.81.90.1Iris-setosa\n", + "
464.831.40.1Iris-setosa\n", + "
475.13.81.60.1Iris-setosa\n", + "
484.63.21.40.1Iris-setosa\n", + "
495.33.71.50.1Iris-setosa\n", + "
5053.31.40.1Iris-setosa\n", + "
5173.24.70.1Iris-versicolor\n", + "
526.43.24.50.1Iris-versicolor\n", + "
536.93.14.90.1Iris-versicolor\n", + "
545.52.340.1Iris-versicolor\n", + "
556.52.84.60.1Iris-versicolor\n", + "
565.72.84.50.1Iris-versicolor\n", + "
576.33.34.70.1Iris-versicolor\n", + "
584.92.43.30.1Iris-versicolor\n", + "
596.62.94.60.1Iris-versicolor\n", + "
605.22.73.90.1Iris-versicolor\n", + "
61523.50.1Iris-versicolor\n", + "
625.934.20.1Iris-versicolor\n", + "
6362.240.1Iris-versicolor\n", + "
646.12.94.70.1Iris-versicolor\n", + "
655.62.93.60.1Iris-versicolor\n", + "
666.73.14.40.1Iris-versicolor\n", + "
675.634.50.1Iris-versicolor\n", + "
685.82.74.10.1Iris-versicolor\n", + "
696.22.24.50.1Iris-versicolor\n", + "
705.62.53.90.1Iris-versicolor\n", + "
715.93.24.80.1Iris-versicolor\n", + "
726.12.840.1Iris-versicolor\n", + "
736.32.54.90.1Iris-versicolor\n", + "
746.12.84.70.1Iris-versicolor\n", + "
756.42.94.30.1Iris-versicolor\n", + "
766.634.40.1Iris-versicolor\n", + "
776.82.84.80.1Iris-versicolor\n", + "
786.7350.1Iris-versicolor\n", + "
7962.94.50.1Iris-versicolor\n", + "
805.72.63.50.1Iris-versicolor\n", + "
815.52.43.80.1Iris-versicolor\n", + "
825.52.43.70.1Iris-versicolor\n", + "
835.82.73.90.1Iris-versicolor\n", + "
8462.75.10.1Iris-versicolor\n", + "
855.434.50.1Iris-versicolor\n", + "
8663.44.50.1Iris-versicolor\n", + "
876.73.14.70.1Iris-versicolor\n", + "
886.32.34.40.1Iris-versicolor\n", + "
895.634.10.1Iris-versicolor\n", + "
905.52.540.1Iris-versicolor\n", + "
915.52.64.40.1Iris-versicolor\n", + "
926.134.60.1Iris-versicolor\n", + "
935.82.640.1Iris-versicolor\n", + "
9452.33.30.1Iris-versicolor\n", + "
955.62.74.20.1Iris-versicolor\n", + "
965.734.20.1Iris-versicolor\n", + "
975.72.94.20.1Iris-versicolor\n", + "
986.22.94.30.1Iris-versicolor\n", + "
995.12.530.1Iris-versicolor\n", + "
1005.72.84.10.1Iris-versicolor\n", + "
1016.33.360.1Iris-virginica\n", + "
1025.82.75.10.1Iris-virginica\n", + "
1037.135.90.1Iris-virginica\n", + "
1046.32.95.60.1Iris-virginica\n", + "
1056.535.80.1Iris-virginica\n", + "
1067.636.60.1Iris-virginica\n", + "
1074.92.54.50.1Iris-virginica\n", + "
1087.32.96.30.1Iris-virginica\n", + "
1096.72.55.80.1Iris-virginica\n", + "
1107.23.66.10.1Iris-virginica\n", + "
1116.53.25.10.1Iris-virginica\n", + "
1126.42.75.30.1Iris-virginica\n", + "
1136.835.50.1Iris-virginica\n", + "
1145.72.550.1Iris-virginica\n", + "
1155.82.85.10.1Iris-virginica\n", + "
1166.43.25.30.1Iris-virginica\n", + "
1176.535.50.1Iris-virginica\n", + "
1187.73.86.70.1Iris-virginica\n", + "
1197.72.66.90.1Iris-virginica\n", + "
12062.250.1Iris-virginica\n", + "
1216.93.25.70.1Iris-virginica\n", + "
1225.62.84.90.1Iris-virginica\n", + "
1237.72.86.70.1Iris-virginica\n", + "
1246.32.74.90.1Iris-virginica\n", + "
1256.73.35.70.1Iris-virginica\n", + "
1267.23.260.1Iris-virginica\n", + "
1276.22.84.80.1Iris-virginica\n", + "
1286.134.90.1Iris-virginica\n", + "
1296.42.85.60.1Iris-virginica\n", + "
1307.235.80.1Iris-virginica\n", + "
1317.42.86.10.1Iris-virginica\n", + "
1327.93.86.40.1Iris-virginica\n", + "
1336.42.85.60.1Iris-virginica\n", + "
1346.32.85.10.1Iris-virginica\n", + "
1356.12.65.60.1Iris-virginica\n", + "
1367.736.10.1Iris-virginica\n", + "
1376.33.45.60.1Iris-virginica\n", + "
1386.43.15.50.1Iris-virginica\n", + "
139634.80.1Iris-virginica\n", + "
1406.93.15.40.1Iris-virginica\n", + "
1416.73.15.60.1Iris-virginica\n", + "
1426.93.15.10.1Iris-virginica\n", + "
1435.82.75.10.1Iris-virginica\n", + "
1446.83.25.90.1Iris-virginica\n", + "
1456.73.35.70.1Iris-virginica\n", + "
1466.735.20.1Iris-virginica\n", + "
1476.32.550.1Iris-virginica\n", + "
1486.535.20.1Iris-virginica\n", + "
1496.23.45.40.1Iris-virginica\n", + "
150NULL35.10.1Iris-virginica\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "SELECT * FROM IRIS;" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f7b8f34e-e73b-45c4-a1c9-9c504ff12c4d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
missingpercent
Id00.00
SepalLengthCm21.33
SepalWidthCm32.00
PetalLengthCm10.67
PetalWidthCm00.00
Species00.00
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%missing" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "bdaff76a-42cb-4d07-8c13-b8365ea38560", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREVIEW: missing counts per column:\n", + "Id: missing=0\n", + "SepalLengthCm: missing=2\n", + "SepalWidthCm: missing=3\n", + "PetalLengthCm: missing=1\n", + "PetalWidthCm: missing=0\n", + "Species: missing=0\n", + "PREVIEW: computed fill-values (best-effort):\n", + "Id: would fill with -> 75.5 (median via local preview)\n", + "SepalLengthCm: would fill with -> 5.8 (median via local preview)\n", + "SepalWidthCm: would fill with -> 3.0 (median via local preview)\n", + "PetalLengthCm: would fill with -> 4.4 (median via local preview)\n", + "PetalWidthCm: would fill with -> 0.1 (median via local preview)\n", + "Species: could NOT determine fill value (not numeric; cannot compute median locally); would skip\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_null_columnsId_filled_previewSepalLengthCm_filled_previewSepalWidthCm_filled_previewPetalLengthCm_filled_previewPetalWidthCm_filled_previewSpecies_filled_preview
74.6NaN1.40.3Iris-setosaSepalWidthCm74.63.01.40.3Iris-setosa
175.4NaN1.30.1Iris-setosaSepalWidthCm175.43.01.30.1Iris-setosa
28NaN3.51.50.1Iris-setosaSepalLengthCm285.83.51.50.1Iris-setosa
314.83.1NaN0.1Iris-setosaPetalLengthCm314.83.14.40.1Iris-setosa
375.5NaN1.30.1Iris-setosaSepalWidthCm375.53.01.30.1Iris-setosa
150NaN3.05.10.1Iris-virginicaSepalLengthCm1505.83.05.10.1Iris-virginica
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%fillmissing strategy=median mode=preview" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "95bd44b3-5cb4-472e-b4ee-a3321e595b4d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Apply completed: original preserved as IRIS_backup_29adf350d1ab4121.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
15.13.51.40.2Iris-setosa
24.93.01.40.2Iris-setosa
34.73.21.30.2Iris-setosa
44.63.11.50.2Iris-setosa
55.03.61.40.2Iris-setosa
65.43.91.70.4Iris-setosa
85.03.41.50.2Iris-setosa
94.42.91.40.2Iris-setosa
104.93.11.50.1Iris-setosa
115.43.71.50.2Iris-setosa
124.83.41.60.2Iris-setosa
134.83.01.40.1Iris-setosa
144.33.01.10.1Iris-setosa
155.84.01.20.1Iris-setosa
165.74.41.50.1Iris-setosa
185.13.51.40.1Iris-setosa
195.73.81.70.1Iris-setosa
205.13.81.50.1Iris-setosa
215.43.41.70.1Iris-setosa
225.13.71.50.1Iris-setosa
234.63.61.00.1Iris-setosa
245.13.31.70.1Iris-setosa
254.83.41.90.1Iris-setosa
265.03.01.60.1Iris-setosa
275.03.41.60.1Iris-setosa
295.23.41.40.1Iris-setosa
304.73.21.60.1Iris-setosa
325.43.41.50.1Iris-setosa
335.24.11.50.1Iris-setosa
345.54.21.40.1Iris-setosa
354.93.11.50.1Iris-setosa
365.03.21.20.1Iris-setosa
384.93.11.50.1Iris-setosa
394.43.01.30.1Iris-setosa
405.13.41.50.1Iris-setosa
415.03.51.30.1Iris-setosa
424.52.31.30.1Iris-setosa
434.43.21.30.1Iris-setosa
445.03.51.60.1Iris-setosa
455.13.81.90.1Iris-setosa
464.83.01.40.1Iris-setosa
475.13.81.60.1Iris-setosa
484.63.21.40.1Iris-setosa
495.33.71.50.1Iris-setosa
505.03.31.40.1Iris-setosa
517.03.24.70.1Iris-versicolor
526.43.24.50.1Iris-versicolor
536.93.14.90.1Iris-versicolor
545.52.34.00.1Iris-versicolor
556.52.84.60.1Iris-versicolor
565.72.84.50.1Iris-versicolor
576.33.34.70.1Iris-versicolor
584.92.43.30.1Iris-versicolor
596.62.94.60.1Iris-versicolor
605.22.73.90.1Iris-versicolor
615.02.03.50.1Iris-versicolor
625.93.04.20.1Iris-versicolor
636.02.24.00.1Iris-versicolor
646.12.94.70.1Iris-versicolor
655.62.93.60.1Iris-versicolor
666.73.14.40.1Iris-versicolor
675.63.04.50.1Iris-versicolor
685.82.74.10.1Iris-versicolor
696.22.24.50.1Iris-versicolor
705.62.53.90.1Iris-versicolor
715.93.24.80.1Iris-versicolor
726.12.84.00.1Iris-versicolor
736.32.54.90.1Iris-versicolor
746.12.84.70.1Iris-versicolor
756.42.94.30.1Iris-versicolor
766.63.04.40.1Iris-versicolor
776.82.84.80.1Iris-versicolor
786.73.05.00.1Iris-versicolor
796.02.94.50.1Iris-versicolor
805.72.63.50.1Iris-versicolor
815.52.43.80.1Iris-versicolor
825.52.43.70.1Iris-versicolor
835.82.73.90.1Iris-versicolor
846.02.75.10.1Iris-versicolor
855.43.04.50.1Iris-versicolor
866.03.44.50.1Iris-versicolor
876.73.14.70.1Iris-versicolor
886.32.34.40.1Iris-versicolor
895.63.04.10.1Iris-versicolor
905.52.54.00.1Iris-versicolor
915.52.64.40.1Iris-versicolor
926.13.04.60.1Iris-versicolor
935.82.64.00.1Iris-versicolor
945.02.33.30.1Iris-versicolor
955.62.74.20.1Iris-versicolor
965.73.04.20.1Iris-versicolor
975.72.94.20.1Iris-versicolor
986.22.94.30.1Iris-versicolor
995.12.53.00.1Iris-versicolor
1005.72.84.10.1Iris-versicolor
1016.33.36.00.1Iris-virginica
1025.82.75.10.1Iris-virginica
1037.13.05.90.1Iris-virginica
1046.32.95.60.1Iris-virginica
1056.53.05.80.1Iris-virginica
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%dropmissing columns=SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species mode=apply table=IRIS confirm=true" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf048049-058b-4c46-85e6-6966fb1ff7ae", + "metadata": {}, + "outputs": [], + "source": [ + "%dropmissing mode=rollback rollback_token=" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "23d4d6fa-590c-4f01-9a18-8e62a0e499d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCm
count100.000000100.000000100.000000100.000000100.000000
mean54.4500005.5360003.0690003.0750000.112000
std30.3968280.6685610.4666011.5339510.040899
min1.0000004.3000002.0000001.0000000.100000
25%28.5000005.0000002.8000001.5000000.100000
50%55.5000005.5000003.0000003.5500000.100000
75%80.2500006.0000003.4000004.5000000.100000
max105.0000007.1000004.4000006.0000000.400000
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%stats" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "78f0e9c1-c61a-4d79-bb3e-31efd05b4bd0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Outlier detection completed (non in-place). Summary:\n", + "Column 'Id': detected 0 outlier(s) using iqr.\n", + "Column 'SepalLengthCm': detected 0 outlier(s) using iqr.\n", + "Column 'SepalWidthCm': detected 1 outlier(s) using iqr.\n", + "Column 'PetalLengthCm': detected 0 outlier(s) using iqr.\n", + "Column 'PetalWidthCm': detected 10 outlier(s) using iqr.\n", + "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpeciesId_is_outlierSepalLengthCm_is_outlierSepalWidthCm_is_outlierPetalLengthCm_is_outlierPetalWidthCm_is_outlier
15.13.51.40.2Iris-setosaFalseFalseFalseFalseTrue
24.93.01.40.2Iris-setosaFalseFalseFalseFalseTrue
34.73.21.30.2Iris-setosaFalseFalseFalseFalseTrue
44.63.11.50.2Iris-setosaFalseFalseFalseFalseTrue
55.03.61.40.2Iris-setosaFalseFalseFalseFalseTrue
65.43.91.70.4Iris-setosaFalseFalseFalseFalseTrue
85.03.41.50.2Iris-setosaFalseFalseFalseFalseTrue
94.42.91.40.2Iris-setosaFalseFalseFalseFalseTrue
104.93.11.50.1Iris-setosaFalseFalseFalseFalseFalse
115.43.71.50.2Iris-setosaFalseFalseFalseFalseTrue
124.83.41.60.2Iris-setosaFalseFalseFalseFalseTrue
134.83.01.40.1Iris-setosaFalseFalseFalseFalseFalse
144.33.01.10.1Iris-setosaFalseFalseFalseFalseFalse
155.84.01.20.1Iris-setosaFalseFalseFalseFalseFalse
165.74.41.50.1Iris-setosaFalseFalseTrueFalseFalse
185.13.51.40.1Iris-setosaFalseFalseFalseFalseFalse
195.73.81.70.1Iris-setosaFalseFalseFalseFalseFalse
205.13.81.50.1Iris-setosaFalseFalseFalseFalseFalse
215.43.41.70.1Iris-setosaFalseFalseFalseFalseFalse
225.13.71.50.1Iris-setosaFalseFalseFalseFalseFalse
234.63.61.00.1Iris-setosaFalseFalseFalseFalseFalse
245.13.31.70.1Iris-setosaFalseFalseFalseFalseFalse
254.83.41.90.1Iris-setosaFalseFalseFalseFalseFalse
265.03.01.60.1Iris-setosaFalseFalseFalseFalseFalse
275.03.41.60.1Iris-setosaFalseFalseFalseFalseFalse
295.23.41.40.1Iris-setosaFalseFalseFalseFalseFalse
304.73.21.60.1Iris-setosaFalseFalseFalseFalseFalse
325.43.41.50.1Iris-setosaFalseFalseFalseFalseFalse
335.24.11.50.1Iris-setosaFalseFalseFalseFalseFalse
345.54.21.40.1Iris-setosaFalseFalseFalseFalseFalse
354.93.11.50.1Iris-setosaFalseFalseFalseFalseFalse
365.03.21.20.1Iris-setosaFalseFalseFalseFalseFalse
384.93.11.50.1Iris-setosaFalseFalseFalseFalseFalse
394.43.01.30.1Iris-setosaFalseFalseFalseFalseFalse
405.13.41.50.1Iris-setosaFalseFalseFalseFalseFalse
415.03.51.30.1Iris-setosaFalseFalseFalseFalseFalse
424.52.31.30.1Iris-setosaFalseFalseFalseFalseFalse
434.43.21.30.1Iris-setosaFalseFalseFalseFalseFalse
445.03.51.60.1Iris-setosaFalseFalseFalseFalseFalse
455.13.81.90.1Iris-setosaFalseFalseFalseFalseFalse
464.83.01.40.1Iris-setosaFalseFalseFalseFalseFalse
475.13.81.60.1Iris-setosaFalseFalseFalseFalseFalse
484.63.21.40.1Iris-setosaFalseFalseFalseFalseFalse
495.33.71.50.1Iris-setosaFalseFalseFalseFalseFalse
505.03.31.40.1Iris-setosaFalseFalseFalseFalseFalse
517.03.24.70.1Iris-versicolorFalseFalseFalseFalseFalse
526.43.24.50.1Iris-versicolorFalseFalseFalseFalseFalse
536.93.14.90.1Iris-versicolorFalseFalseFalseFalseFalse
545.52.34.00.1Iris-versicolorFalseFalseFalseFalseFalse
556.52.84.60.1Iris-versicolorFalseFalseFalseFalseFalse
565.72.84.50.1Iris-versicolorFalseFalseFalseFalseFalse
576.33.34.70.1Iris-versicolorFalseFalseFalseFalseFalse
584.92.43.30.1Iris-versicolorFalseFalseFalseFalseFalse
596.62.94.60.1Iris-versicolorFalseFalseFalseFalseFalse
605.22.73.90.1Iris-versicolorFalseFalseFalseFalseFalse
615.02.03.50.1Iris-versicolorFalseFalseFalseFalseFalse
625.93.04.20.1Iris-versicolorFalseFalseFalseFalseFalse
636.02.24.00.1Iris-versicolorFalseFalseFalseFalseFalse
646.12.94.70.1Iris-versicolorFalseFalseFalseFalseFalse
655.62.93.60.1Iris-versicolorFalseFalseFalseFalseFalse
666.73.14.40.1Iris-versicolorFalseFalseFalseFalseFalse
675.63.04.50.1Iris-versicolorFalseFalseFalseFalseFalse
685.82.74.10.1Iris-versicolorFalseFalseFalseFalseFalse
696.22.24.50.1Iris-versicolorFalseFalseFalseFalseFalse
705.62.53.90.1Iris-versicolorFalseFalseFalseFalseFalse
715.93.24.80.1Iris-versicolorFalseFalseFalseFalseFalse
726.12.84.00.1Iris-versicolorFalseFalseFalseFalseFalse
736.32.54.90.1Iris-versicolorFalseFalseFalseFalseFalse
746.12.84.70.1Iris-versicolorFalseFalseFalseFalseFalse
756.42.94.30.1Iris-versicolorFalseFalseFalseFalseFalse
766.63.04.40.1Iris-versicolorFalseFalseFalseFalseFalse
776.82.84.80.1Iris-versicolorFalseFalseFalseFalseFalse
786.73.05.00.1Iris-versicolorFalseFalseFalseFalseFalse
796.02.94.50.1Iris-versicolorFalseFalseFalseFalseFalse
805.72.63.50.1Iris-versicolorFalseFalseFalseFalseFalse
815.52.43.80.1Iris-versicolorFalseFalseFalseFalseFalse
825.52.43.70.1Iris-versicolorFalseFalseFalseFalseFalse
835.82.73.90.1Iris-versicolorFalseFalseFalseFalseFalse
846.02.75.10.1Iris-versicolorFalseFalseFalseFalseFalse
855.43.04.50.1Iris-versicolorFalseFalseFalseFalseFalse
866.03.44.50.1Iris-versicolorFalseFalseFalseFalseFalse
876.73.14.70.1Iris-versicolorFalseFalseFalseFalseFalse
886.32.34.40.1Iris-versicolorFalseFalseFalseFalseFalse
895.63.04.10.1Iris-versicolorFalseFalseFalseFalseFalse
905.52.54.00.1Iris-versicolorFalseFalseFalseFalseFalse
915.52.64.40.1Iris-versicolorFalseFalseFalseFalseFalse
926.13.04.60.1Iris-versicolorFalseFalseFalseFalseFalse
935.82.64.00.1Iris-versicolorFalseFalseFalseFalseFalse
945.02.33.30.1Iris-versicolorFalseFalseFalseFalseFalse
955.62.74.20.1Iris-versicolorFalseFalseFalseFalseFalse
965.73.04.20.1Iris-versicolorFalseFalseFalseFalseFalse
975.72.94.20.1Iris-versicolorFalseFalseFalseFalseFalse
986.22.94.30.1Iris-versicolorFalseFalseFalseFalseFalse
995.12.53.00.1Iris-versicolorFalseFalseFalseFalseFalse
1005.72.84.10.1Iris-versicolorFalseFalseFalseFalseFalse
1016.33.36.00.1Iris-virginicaFalseFalseFalseFalseFalse
1025.82.75.10.1Iris-virginicaFalseFalseFalseFalseFalse
1037.13.05.90.1Iris-virginicaFalseFalseFalseFalseFalse
1046.32.95.60.1Iris-virginicaFalseFalseFalseFalseFalse
1056.53.05.80.1Iris-virginicaFalseFalseFalseFalseFalse
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%outliers" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "39e17585-f0af-4b7c-b8c1-8a4a28c76ee8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREVIEW (local): would drop 11 row(s) (from 100 to 89).\n", + "Column 'Id': detected 0 outlier(s) using iqr.\n", + "Column 'SepalLengthCm': detected 0 outlier(s) using iqr.\n", + "Column 'SepalWidthCm': detected 1 outlier(s) using iqr.\n", + "Column 'PetalLengthCm': detected 0 outlier(s) using iqr.\n", + "Column 'PetalWidthCm': detected 10 outlier(s) using iqr.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_outlier_cols
15.13.51.40.2Iris-setosa
24.93.01.40.2Iris-setosa
34.73.21.30.2Iris-setosa
44.63.11.50.2Iris-setosa
55.03.61.40.2Iris-setosa
65.43.91.70.4Iris-setosa
85.03.41.50.2Iris-setosa
94.42.91.40.2Iris-setosa
115.43.71.50.2Iris-setosa
124.83.41.60.2Iris-setosa
165.74.41.50.1Iris-setosa
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%dropoutliers mode=preview" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "06543b64-2e3e-4be4-aa0b-fd90d4d1125f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Warning: non-numeric columns skipped: Species\n", + "Apply completed: original preserved as IRIS_backup_89cc522a9f0c4aad.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies
15.13.51.40.1Iris-setosa
24.93.01.40.1Iris-setosa
34.73.21.30.1Iris-setosa
44.63.11.50.1Iris-setosa
55.03.61.40.1Iris-setosa
65.43.91.70.1Iris-setosa
85.03.41.50.1Iris-setosa
94.42.91.40.1Iris-setosa
104.93.11.50.1Iris-setosa
115.43.71.50.1Iris-setosa
124.83.41.60.1Iris-setosa
134.83.01.40.1Iris-setosa
144.33.01.10.1Iris-setosa
155.84.01.20.1Iris-setosa
165.74.31.50.1Iris-setosa
185.13.51.40.1Iris-setosa
195.73.81.70.1Iris-setosa
205.13.81.50.1Iris-setosa
215.43.41.70.1Iris-setosa
225.13.71.50.1Iris-setosa
234.63.61.00.1Iris-setosa
245.13.31.70.1Iris-setosa
254.83.41.90.1Iris-setosa
265.03.01.60.1Iris-setosa
275.03.41.60.1Iris-setosa
295.23.41.40.1Iris-setosa
304.73.21.60.1Iris-setosa
325.43.41.50.1Iris-setosa
335.24.11.50.1Iris-setosa
345.54.21.40.1Iris-setosa
354.93.11.50.1Iris-setosa
365.03.21.20.1Iris-setosa
384.93.11.50.1Iris-setosa
394.43.01.30.1Iris-setosa
405.13.41.50.1Iris-setosa
415.03.51.30.1Iris-setosa
424.52.31.30.1Iris-setosa
434.43.21.30.1Iris-setosa
445.03.51.60.1Iris-setosa
455.13.81.90.1Iris-setosa
464.83.01.40.1Iris-setosa
475.13.81.60.1Iris-setosa
484.63.21.40.1Iris-setosa
495.33.71.50.1Iris-setosa
505.03.31.40.1Iris-setosa
517.03.24.70.1Iris-versicolor
526.43.24.50.1Iris-versicolor
536.93.14.90.1Iris-versicolor
545.52.34.00.1Iris-versicolor
556.52.84.60.1Iris-versicolor
565.72.84.50.1Iris-versicolor
576.33.34.70.1Iris-versicolor
584.92.43.30.1Iris-versicolor
596.62.94.60.1Iris-versicolor
605.22.73.90.1Iris-versicolor
615.02.03.50.1Iris-versicolor
625.93.04.20.1Iris-versicolor
636.02.24.00.1Iris-versicolor
646.12.94.70.1Iris-versicolor
655.62.93.60.1Iris-versicolor
666.73.14.40.1Iris-versicolor
675.63.04.50.1Iris-versicolor
685.82.74.10.1Iris-versicolor
696.22.24.50.1Iris-versicolor
705.62.53.90.1Iris-versicolor
715.93.24.80.1Iris-versicolor
726.12.84.00.1Iris-versicolor
736.32.54.90.1Iris-versicolor
746.12.84.70.1Iris-versicolor
756.42.94.30.1Iris-versicolor
766.63.04.40.1Iris-versicolor
776.82.84.80.1Iris-versicolor
786.73.05.00.1Iris-versicolor
796.02.94.50.1Iris-versicolor
805.72.63.50.1Iris-versicolor
815.52.43.80.1Iris-versicolor
825.52.43.70.1Iris-versicolor
835.82.73.90.1Iris-versicolor
846.02.75.10.1Iris-versicolor
855.43.04.50.1Iris-versicolor
866.03.44.50.1Iris-versicolor
876.73.14.70.1Iris-versicolor
886.32.34.40.1Iris-versicolor
895.63.04.10.1Iris-versicolor
905.52.54.00.1Iris-versicolor
915.52.64.40.1Iris-versicolor
926.13.04.60.1Iris-versicolor
935.82.64.00.1Iris-versicolor
945.02.33.30.1Iris-versicolor
955.62.74.20.1Iris-versicolor
965.73.04.20.1Iris-versicolor
975.72.94.20.1Iris-versicolor
986.22.94.30.1Iris-versicolor
995.12.53.00.1Iris-versicolor
1005.72.84.10.1Iris-versicolor
1016.33.36.00.1Iris-virginica
1025.82.75.10.1Iris-virginica
1037.13.05.90.1Iris-virginica
1046.32.95.60.1Iris-virginica
1056.53.05.80.1Iris-virginica
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%clipoutliers columns=SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species mode=apply table=IRIS confirm=true" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "1bb7b324-237b-4817-9877-8f6bea4d455b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREVIEW (local):\n", + "Local: Column 'Species' unique non-null values: 3 (showing up to 10): ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']\n", + "PREVIEW (local) estimated created columns: 3\n", + "Preview sample with encoded columns (showing 100 rows):\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpeciesSpecies_Iris-setosaSpecies_Iris-versicolorSpecies_Iris-virginica
15.13.51.40.1Iris-setosa1.00.00.0
24.93.01.40.1Iris-setosa1.00.00.0
34.73.21.30.1Iris-setosa1.00.00.0
44.63.11.50.1Iris-setosa1.00.00.0
55.03.61.40.1Iris-setosa1.00.00.0
65.43.91.70.1Iris-setosa1.00.00.0
85.03.41.50.1Iris-setosa1.00.00.0
94.42.91.40.1Iris-setosa1.00.00.0
104.93.11.50.1Iris-setosa1.00.00.0
115.43.71.50.1Iris-setosa1.00.00.0
124.83.41.60.1Iris-setosa1.00.00.0
134.83.01.40.1Iris-setosa1.00.00.0
144.33.01.10.1Iris-setosa1.00.00.0
155.84.01.20.1Iris-setosa1.00.00.0
165.74.31.50.1Iris-setosa1.00.00.0
185.13.51.40.1Iris-setosa1.00.00.0
195.73.81.70.1Iris-setosa1.00.00.0
205.13.81.50.1Iris-setosa1.00.00.0
215.43.41.70.1Iris-setosa1.00.00.0
225.13.71.50.1Iris-setosa1.00.00.0
234.63.61.00.1Iris-setosa1.00.00.0
245.13.31.70.1Iris-setosa1.00.00.0
254.83.41.90.1Iris-setosa1.00.00.0
265.03.01.60.1Iris-setosa1.00.00.0
275.03.41.60.1Iris-setosa1.00.00.0
295.23.41.40.1Iris-setosa1.00.00.0
304.73.21.60.1Iris-setosa1.00.00.0
325.43.41.50.1Iris-setosa1.00.00.0
335.24.11.50.1Iris-setosa1.00.00.0
345.54.21.40.1Iris-setosa1.00.00.0
354.93.11.50.1Iris-setosa1.00.00.0
365.03.21.20.1Iris-setosa1.00.00.0
384.93.11.50.1Iris-setosa1.00.00.0
394.43.01.30.1Iris-setosa1.00.00.0
405.13.41.50.1Iris-setosa1.00.00.0
415.03.51.30.1Iris-setosa1.00.00.0
424.52.31.30.1Iris-setosa1.00.00.0
434.43.21.30.1Iris-setosa1.00.00.0
445.03.51.60.1Iris-setosa1.00.00.0
455.13.81.90.1Iris-setosa1.00.00.0
464.83.01.40.1Iris-setosa1.00.00.0
475.13.81.60.1Iris-setosa1.00.00.0
484.63.21.40.1Iris-setosa1.00.00.0
495.33.71.50.1Iris-setosa1.00.00.0
505.03.31.40.1Iris-setosa1.00.00.0
517.03.24.70.1Iris-versicolor0.01.00.0
526.43.24.50.1Iris-versicolor0.01.00.0
536.93.14.90.1Iris-versicolor0.01.00.0
545.52.34.00.1Iris-versicolor0.01.00.0
556.52.84.60.1Iris-versicolor0.01.00.0
565.72.84.50.1Iris-versicolor0.01.00.0
576.33.34.70.1Iris-versicolor0.01.00.0
584.92.43.30.1Iris-versicolor0.01.00.0
596.62.94.60.1Iris-versicolor0.01.00.0
605.22.73.90.1Iris-versicolor0.01.00.0
615.02.03.50.1Iris-versicolor0.01.00.0
625.93.04.20.1Iris-versicolor0.01.00.0
636.02.24.00.1Iris-versicolor0.01.00.0
646.12.94.70.1Iris-versicolor0.01.00.0
655.62.93.60.1Iris-versicolor0.01.00.0
666.73.14.40.1Iris-versicolor0.01.00.0
675.63.04.50.1Iris-versicolor0.01.00.0
685.82.74.10.1Iris-versicolor0.01.00.0
696.22.24.50.1Iris-versicolor0.01.00.0
705.62.53.90.1Iris-versicolor0.01.00.0
715.93.24.80.1Iris-versicolor0.01.00.0
726.12.84.00.1Iris-versicolor0.01.00.0
736.32.54.90.1Iris-versicolor0.01.00.0
746.12.84.70.1Iris-versicolor0.01.00.0
756.42.94.30.1Iris-versicolor0.01.00.0
766.63.04.40.1Iris-versicolor0.01.00.0
776.82.84.80.1Iris-versicolor0.01.00.0
786.73.05.00.1Iris-versicolor0.01.00.0
796.02.94.50.1Iris-versicolor0.01.00.0
805.72.63.50.1Iris-versicolor0.01.00.0
815.52.43.80.1Iris-versicolor0.01.00.0
825.52.43.70.1Iris-versicolor0.01.00.0
835.82.73.90.1Iris-versicolor0.01.00.0
846.02.75.10.1Iris-versicolor0.01.00.0
855.43.04.50.1Iris-versicolor0.01.00.0
866.03.44.50.1Iris-versicolor0.01.00.0
876.73.14.70.1Iris-versicolor0.01.00.0
886.32.34.40.1Iris-versicolor0.01.00.0
895.63.04.10.1Iris-versicolor0.01.00.0
905.52.54.00.1Iris-versicolor0.01.00.0
915.52.64.40.1Iris-versicolor0.01.00.0
926.13.04.60.1Iris-versicolor0.01.00.0
935.82.64.00.1Iris-versicolor0.01.00.0
945.02.33.30.1Iris-versicolor0.01.00.0
955.62.74.20.1Iris-versicolor0.01.00.0
965.73.04.20.1Iris-versicolor0.01.00.0
975.72.94.20.1Iris-versicolor0.01.00.0
986.22.94.30.1Iris-versicolor0.01.00.0
995.12.53.00.1Iris-versicolor0.01.00.0
1005.72.84.10.1Iris-versicolor0.01.00.0
1016.33.36.00.1Iris-virginica0.00.01.0
1025.82.75.10.1Iris-virginica0.00.01.0
1037.13.05.90.1Iris-virginica0.00.01.0
1046.32.95.60.1Iris-virginica0.00.01.0
1056.53.05.80.1Iris-virginica0.00.01.0
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%encode method=onehot columns=Species drop_original=false" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "b12612ad-d4b2-4864-a3a1-0874dfbe414d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Encoded columns in-place and updated last_select.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_lbl
15.13.51.40.10
24.93.01.40.10
34.73.21.30.10
44.63.11.50.10
55.03.61.40.10
65.43.91.70.10
85.03.41.50.10
94.42.91.40.10
104.93.11.50.10
115.43.71.50.10
124.83.41.60.10
134.83.01.40.10
144.33.01.10.10
155.84.01.20.10
165.74.31.50.10
185.13.51.40.10
195.73.81.70.10
205.13.81.50.10
215.43.41.70.10
225.13.71.50.10
234.63.61.00.10
245.13.31.70.10
254.83.41.90.10
265.03.01.60.10
275.03.41.60.10
295.23.41.40.10
304.73.21.60.10
325.43.41.50.10
335.24.11.50.10
345.54.21.40.10
354.93.11.50.10
365.03.21.20.10
384.93.11.50.10
394.43.01.30.10
405.13.41.50.10
415.03.51.30.10
424.52.31.30.10
434.43.21.30.10
445.03.51.60.10
455.13.81.90.10
464.83.01.40.10
475.13.81.60.10
484.63.21.40.10
495.33.71.50.10
505.03.31.40.10
517.03.24.70.11
526.43.24.50.11
536.93.14.90.11
545.52.34.00.11
556.52.84.60.11
565.72.84.50.11
576.33.34.70.11
584.92.43.30.11
596.62.94.60.11
605.22.73.90.11
615.02.03.50.11
625.93.04.20.11
636.02.24.00.11
646.12.94.70.11
655.62.93.60.11
666.73.14.40.11
675.63.04.50.11
685.82.74.10.11
696.22.24.50.11
705.62.53.90.11
715.93.24.80.11
726.12.84.00.11
736.32.54.90.11
746.12.84.70.11
756.42.94.30.11
766.63.04.40.11
776.82.84.80.11
786.73.05.00.11
796.02.94.50.11
805.72.63.50.11
815.52.43.80.11
825.52.43.70.11
835.82.73.90.11
846.02.75.10.11
855.43.04.50.11
866.03.44.50.11
876.73.14.70.11
886.32.34.40.11
895.63.04.10.11
905.52.54.00.11
915.52.64.40.11
926.13.04.60.11
935.82.64.00.11
945.02.33.30.11
955.62.74.20.11
965.73.04.20.11
975.72.94.20.11
986.22.94.30.11
995.12.53.00.11
1005.72.84.10.11
1016.33.36.00.12
1025.82.75.10.12
1037.13.05.90.12
1046.32.95.60.12
1056.53.05.80.12
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%encode method=label columns=Species drop_original=true mode=apply confirm=true" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "8ba776f8-84ff-477b-a5e3-c02e678c839c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PREVIEW (local):\n", + "Local: Column 'Id': mean=54.45, std=30.244462303039874\n", + "Local: Column 'SepalLengthCm': mean=5.535999999999999, std=0.6652097413598211\n", + "Local: Column 'SepalWidthCm': mean=3.067999999999999, std=0.46149322855270586\n", + "Local: Column 'PetalLengthCm': mean=3.0750000000000006, std=1.526261773091366\n", + "Local: Column 'PetalWidthCm': mean=0.09999999999999998, std=2.7755575615628914e-17\n", + "Local: Column 'Species_lbl': mean=0.6, std=0.5830951894845301\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_lblId_std_previewSepalLengthCm_std_previewSepalWidthCm_std_previewPetalLengthCm_std_previewPetalWidthCm_std_previewSpecies_lbl_std_preview
15.13.51.40.10-1.767266-0.6554320.936092-1.0974531.0-1.028992
24.93.01.40.10-1.734202-0.956089-0.147348-1.0974531.0-1.028992
34.73.21.30.10-1.701138-1.2567460.286028-1.1629721.0-1.028992
44.63.11.50.10-1.668074-1.4070750.069340-1.0319331.0-1.028992
55.03.61.40.10-1.635010-0.8057611.152780-1.0974531.0-1.028992
65.43.91.70.10-1.601946-0.2044471.802843-0.9008941.0-1.028992
85.03.41.50.10-1.535818-0.8057610.719404-1.0319331.0-1.028992
94.42.91.40.10-1.502754-1.707732-0.364036-1.0974531.0-1.028992
104.93.11.50.10-1.469691-0.9560890.069340-1.0319331.0-1.028992
115.43.71.50.10-1.436627-0.2044471.369468-1.0319331.0-1.028992
124.83.41.60.10-1.403563-1.1064180.719404-0.9664141.0-1.028992
134.83.01.40.10-1.370499-1.106418-0.147348-1.0974531.0-1.028992
144.33.01.10.10-1.337435-1.858061-0.147348-1.2940111.0-1.028992
155.84.01.20.10-1.3043710.3968672.019531-1.2284921.0-1.028992
165.74.31.50.10-1.2713070.2465392.669595-1.0319331.0-1.028992
185.13.51.40.10-1.205179-0.6554320.936092-1.0974531.0-1.028992
195.73.81.70.10-1.1721150.2465391.586155-0.9008941.0-1.028992
205.13.81.50.10-1.139051-0.6554321.586155-1.0319331.0-1.028992
215.43.41.70.10-1.105988-0.2044470.719404-0.9008941.0-1.028992
225.13.71.50.10-1.072924-0.6554321.369468-1.0319331.0-1.028992
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%standardize mode=preview" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "dbca0744-2672-4c67-bdab-b94ec62b8cff", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Normalized 3 column(s) to range (0.0, 1.0). Updated data['last_select'] in-place.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_lbl
10.2857140.6521740.080.10
20.2142860.4347830.080.10
30.1428570.5217390.060.10
40.1071430.4782610.100.10
50.2500000.6956520.080.10
60.3928570.8260870.140.10
80.2500000.6086960.100.10
90.0357140.3913040.080.10
100.2142860.4782610.100.10
110.3928570.7391300.100.10
120.1785710.6086960.120.10
130.1785710.4347830.080.10
140.0000000.4347830.020.10
150.5357140.8695650.040.10
160.5000001.0000000.100.10
180.2857140.6521740.080.10
190.5000000.7826090.140.10
200.2857140.7826090.100.10
210.3928570.6086960.140.10
220.2857140.7391300.100.10
230.1071430.6956520.000.10
240.2857140.5652170.140.10
250.1785710.6086960.180.10
260.2500000.4347830.120.10
270.2500000.6086960.120.10
290.3214290.6086960.080.10
300.1428570.5217390.120.10
320.3928570.6086960.100.10
330.3214290.9130430.100.10
340.4285710.9565220.080.10
350.2142860.4782610.100.10
360.2500000.5217390.040.10
380.2142860.4782610.100.10
390.0357140.4347830.060.10
400.2857140.6086960.100.10
410.2500000.6521740.060.10
420.0714290.1304350.060.10
430.0357140.5217390.060.10
440.2500000.6521740.120.10
450.2857140.7826090.180.10
460.1785710.4347830.080.10
470.2857140.7826090.120.10
480.1071430.5217390.080.10
490.3571430.7391300.100.10
500.2500000.5652170.080.10
510.9642860.5217390.740.11
520.7500000.5217390.700.11
530.9285710.4782610.780.11
540.4285710.1304350.600.11
550.7857140.3478260.720.11
560.5000000.3478260.700.11
570.7142860.5652170.740.11
580.2142860.1739130.460.11
590.8214290.3913040.720.11
600.3214290.3043480.580.11
610.2500000.0000000.500.11
620.5714290.4347830.640.11
630.6071430.0869570.600.11
640.6428570.3913040.740.11
650.4642860.3913040.520.11
660.8571430.4782610.680.11
670.4642860.4347830.700.11
680.5357140.3043480.620.11
690.6785710.0869570.700.11
700.4642860.2173910.580.11
710.5714290.5217390.760.11
720.6428570.3478260.600.11
730.7142860.2173910.780.11
740.6428570.3478260.740.11
750.7500000.3913040.660.11
760.8214290.4347830.680.11
770.8928570.3478260.760.11
780.8571430.4347830.800.11
790.6071430.3913040.700.11
800.5000000.2608700.500.11
810.4285710.1739130.560.11
820.4285710.1739130.540.11
830.5357140.3043480.580.11
840.6071430.3043480.820.11
850.3928570.4347830.700.11
860.6071430.6086960.700.11
870.8571430.4782610.740.11
880.7142860.1304350.680.11
890.4642860.4347830.620.11
900.4285710.2173910.600.11
910.4285710.2608700.680.11
920.6428570.4347830.720.11
930.5357140.2608700.600.11
940.2500000.1304350.460.11
950.4642860.3043480.640.11
960.5000000.4347830.640.11
970.5000000.3913040.640.11
980.6785710.3913040.660.11
990.2857140.2173910.400.11
1000.5000000.3478260.620.11
1010.7142860.5652171.000.12
1020.5357140.3043480.820.12
1031.0000000.4347830.980.12
1040.7142860.3913040.920.12
1050.7857140.4347830.960.12
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%normalize columns=SepalLengthCm,SepalWidthCm,PetalLengthCm mode=apply confirm=true " + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "76f81e4f-4f72-4f6b-b31e-339e660de710", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split completed: total=100, train=70, test=20, val=10.\n" + ] + }, + { + "data": { + "text/html": [ + "

Train (70 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_lbl
770.8928570.3478260.760.11
600.3214290.3043480.580.11
400.2857140.6086960.100.10
140.0000000.4347830.020.10
740.6428570.3478260.740.11
10.2857140.6521740.080.10
270.2500000.6086960.120.10
850.3928570.4347830.700.11
680.5357140.3043480.620.11
20.2142860.4347830.080.10
450.2857140.7826090.180.10
800.5000000.2608700.500.11
670.4642860.4347830.700.11
1020.5357140.3043480.820.12
950.4642860.3043480.640.11
840.6071430.3043480.820.11
40.1071430.4782610.100.10
980.6785710.3913040.660.11
150.5357140.8695650.040.10
300.1428570.5217390.120.10
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Validation (10 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_lbl
810.4285710.1739130.560.11
260.2500000.4347830.120.10
380.2142860.4782610.100.10
940.2500000.1304350.460.11
110.3928570.7391300.100.10
80.2500000.6086960.100.10
560.5000000.3478260.700.11
720.6428570.3478260.600.11
190.5000000.7826090.140.10
910.4285710.2608700.680.11
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Test (20 rows)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdSepalLengthCmSepalWidthCmPetalLengthCmPetalWidthCmSpecies_lbl
700.4642860.2173910.580.11
590.8214290.3913040.720.11
570.7142860.5652170.740.11
750.7500000.3913040.660.11
90.0357140.3913040.080.10
330.3214290.9130430.100.10
230.1071430.6956520.000.10
540.4285710.1304350.600.11
210.3928570.6086960.140.10
250.1785710.6086960.180.10
160.5000001.0000000.100.10
1031.0000000.4347830.980.12
500.2500000.5652170.080.10
580.2142860.1739130.460.11
120.1785710.6086960.120.10
520.7500000.5217390.700.11
640.6428570.3913040.740.11
440.2500000.6521740.120.10
760.8214290.4347830.680.11
780.8571430.4347830.800.11
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%splitdata test_size=0.2 val_size=0.1 stratify=Species_lbl random_state=42" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "512becef-92a8-4fa9-a856-39f60c661caa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Feature Selection Results (method=anova)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FeatureScore
PetalLengthCm624.413887
Id122.246919
SepalLengthCm49.475597
PetalWidthCm33.500000
SepalWidthCm24.338860
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected 4 features saved to data['selected_features']: PetalLengthCm, Id, SepalLengthCm, PetalWidthCm\n" + ] + } + ], + "source": [ + "%select_features target=Species_lbl problem=classification k=4" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "28490e5c-dad8-44c5-9710-75802aa037e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Model Selection Results (primary_metric=accuracy)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Modelaccuracy_Meanaccuracy_Stdf1_Meanf1_Stdprecision_Meanprecision_Stdrecall_Meanrecall_Std
gbm1.00000.00000.92890.14221.00000.00000.91430.1292
ada1.00000.00000.92890.14220.92500.15001.00000.0000
lightgbm0.98570.02860.92890.14220.92500.15000.93330.1333
rf0.98570.02860.92890.14220.92500.15000.93330.1333
catboost0.98570.02860.92890.14220.92500.15000.93330.1333
logistic0.95710.05710.95200.05970.95000.06670.97140.0381
xgboost0.95710.05710.84760.18740.83930.19760.85710.1756
knn0.95710.05710.95200.05970.95000.06670.97140.0381
svm0.90000.09690.68440.16890.67280.17190.70480.1576
mlp0.82860.21480.47020.28700.72200.31040.56670.2494
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best model 'gbm' (mean accuracy=1.0000) saved to data['best_model'].\n" + ] + } + ], + "source": [ + "%select_model target=Species_lbl problem=classification " + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "a48fcd76-8b4e-44b7-8bad-bc91d76c8f5e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model 'logistic' trained and saved to data['last_model']. problem=classification. train_rows=70\n" + ] + } + ], + "source": [ + "%train_model features=PetalLengthCm,SepalLengthCm,PetalWidthCm,SepalWidthCm target=Species_lbl problem=classification model=logistic" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "9afbac09-ac9b-47ec-b3cb-29c650abc225", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "
\n", + "
\n", + "

Metrics

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Accuracy0.9500
Precision (w)0.9045
Recall (w)0.9500
F1 (w)0.9262
ROC AUCRequires numeric y for ROC AUC.
\n", + "
\n", + "
\"confusion
\n", + "

Classification report

\n", + "
              precision    recall  f1-score   support\n",
+       "\n",
+       "           0       1.00      1.00      1.00         9\n",
+       "           1       0.91      1.00      0.95        10\n",
+       "           2       0.00      0.00      0.00         1\n",
+       "\n",
+       "    accuracy                           0.95        20\n",
+       "   macro avg       0.64      0.67      0.65        20\n",
+       "weighted avg       0.90      0.95      0.93        20\n",
+       "
\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

Predictions preview (actual vs predicted)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Species_lbl_predicted_pred_proba
11[0.1568254770832685, 0.7713064473235676, 0.07186807559316397]
11[0.06718779974737324, 0.8435489946780091, 0.0892632055746176]
11[0.11193644578263154, 0.7807944605541814, 0.10726909366318703]
11[0.0969971439337914, 0.8149505429793529, 0.0880523130868557]
00[0.8433008108096057, 0.13829791543550074, 0.01840127375489365]
00[0.895686730493734, 0.08380524435902056, 0.020508025147245475]
00[0.9268617694515947, 0.061193135907732926, 0.011945094640672366]
11[0.13151754326324983, 0.8012709935000745, 0.06721146323667562]
00[0.7596738139774813, 0.2071521192267119, 0.033174066795806915]
00[0.8118943280777537, 0.1607394103543395, 0.027366261567906593]
00[0.8754735560817294, 0.09876077960574704, 0.025765664312523556]
21[0.018507285282619034, 0.8871550456187827, 0.09433766909859825]
00[0.8331662419100963, 0.14412605014620486, 0.02270770794369895]
11[0.32161813792276145, 0.6197736419781975, 0.05860822009904103]
00[0.8460107506358353, 0.13157965022216966, 0.022409599141995116]
11[0.11013288426842004, 0.7886084264808748, 0.10125868925070508]
11[0.08986421569943458, 0.8189349996674764, 0.09120078463308896]
00[0.8381775823858857, 0.13751583333001305, 0.02430658428410121]
11[0.0855996123051164, 0.8222134845068737, 0.09218690318800986]
11[0.050674387944242026, 0.8545552152483651, 0.09477039680739305]
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "%evaluate_model" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "7544f465-842c-4d37-8c68-5837bb235411", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model from data['last_model'] saved to ./saved_models/model.joblib\n" + ] + } + ], + "source": [ + "%savemodel model_name_in_data=last_model save_path='./saved_models/model.joblib'" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "63b78371-7c0f-40f0-8bcf-d6ecf5628feb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded model from ./saved_models/model.joblib → data['last_model'] (features[4], target=Species_lbl)\n" + ] + } + ], + "source": [ + "%loadmodel load_path='./saved_models/model.joblib'" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "f50c0979-343d-43fb-a132-efd77299370b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using inline feature values for prediction: {'PetalLengthCm': 6.4, 'SepalLengthCm': 4.2, 'PetalWidthCm': 2.8, 'SepalWidthCm': 1.2}\n" + ] + }, + { + "data": { + "text/html": [ + "

Predictions (last_preds)

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
prediction
1
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Predictions stored in data['last_preds'] with shape=(1, 1)\n" + ] + } + ], + "source": [ + "%predict model_name=last_model data_name=[6.4,4.2,2.8,1.2] output_name=last_preds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "248bd31c-42bd-4747-b1ca-94b31c97d70d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
idcommand_nameargumentsexecution_timestampaffected_columnsoperation_statusmessagedb_nameuser_namerollback_tokenbackup_tableoriginal_table
1missing2025-11-01 10:26:00ALL_COLUMNSsuccess%missing action=show examined 6 column(s); total_rows=150.BUGBREWNULLNULLNULL
2fillmissingstrategy=median mode=preview2025-11-01 10:26:08Id\n", + "SepalLengthCm\n", + "SepalWidthCm\n", + "PetalLengthCm\n", + "PetalWidthCm\n", + "Speciespreviewpreview_computed_fill_valuesBUGBREWNULLNULLNULL
3dropmissingcolumns=SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species mode=apply table=IRIS confirm=true2025-11-01 10:26:15SepalLengthCm\n", + "SepalWidthCm\n", + "PetalLengthCm\n", + "PetalWidthCm\n", + "Speciesappliedapplied_backup=IRIS_backup_c8cac3251a844d7bBUGBREWc8cac3251a844d7bIRIS_backup_c8cac3251a844d7bIRIS
4missing2025-11-01 10:29:32ALL_COLUMNSsuccess%missing action=show examined 6 column(s); total_rows=150.BUGBREWNULLNULLNULL
5fillmissingstrategy=median mode=preview2025-11-01 10:29:38Id\n", + "SepalLengthCm\n", + "SepalWidthCm\n", + "PetalLengthCm\n", + "PetalWidthCm\n", + "Speciespreviewpreview_computed_fill_valuesBUGBREWNULLNULLNULL
6dropmissingcolumns=SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species mode=apply table=IRIS confirm=true2025-11-01 10:29:46SepalLengthCm\n", + "SepalWidthCm\n", + "PetalLengthCm\n", + "PetalWidthCm\n", + "Speciesappliedapplied_backup=IRIS_backup_29adf350d1ab4121BUGBREW29adf350d1ab4121IRIS_backup_29adf350d1ab4121IRIS
7stats2025-11-01 10:30:13ALL_COLUMNSsuccessStats computed for 5 column(s); total_rows=100; percentiles=; include=numeric.BUGBREWNULLNULLNULL
8outliers2025-11-01 10:30:18Id\n", + "SepalLengthCm\n", + "SepalWidthCm\n", + "PetalLengthCm\n", + "PetalWidthCmsuccessColumn 'Id': detected 0 outlier(s) using iqr.\n", + "Column 'SepalLengthCm': detected 0 outlier(s) using iqr.\n", + "Column 'SepalWidthCm': detected 1 outlier(s) using iqr.\n", + "Column 'PetalLengthCm': detected 0 outlier(s) using iqr.\n", + "Column 'PetalWidthCm': detected 10 outlier(s) using iqr.BUGBREWNULLNULLNULL
9dropoutliersmode=preview2025-11-01 10:30:25Id\n", + "SepalLengthCm\n", + "SepalWidthCm\n", + "PetalLengthCm\n", + "PetalWidthCmpreviewpreview_completedBUGBREWNULLNULLNULL
10clipoutlierscolumns=SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species mode=apply table=IRIS confirm=true2025-11-01 10:30:29SepalLengthCm\n", + "SepalWidthCm\n", + "PetalLengthCm\n", + "PetalWidthCmappliedapplied_backup=IRIS_backup_89cc522a9f0c4aadBUGBREW89cc522a9f0c4aadIRIS_backup_89cc522a9f0c4aadIRIS
11encodemethod=onehot columns=Species drop_original=false2025-11-01 10:30:33Speciespreviewpreview_completedBUGBREWNULLNULLNULL
12encodemethod=label columns=Species drop_original=true mode=apply confirm=true2025-11-01 10:30:39SpeciessuccessMethod: label\n", + "Created columns:\n", + "Species_lbl\n", + "\n", + "Details:\n", + "Column 'Species': label-encoded -> Species_lbl (unique_values=3)BUGBREWNULLNULLNULL
13standardizemode=preview2025-11-01 10:30:44Id\n", + "SepalLengthCm\n", + "SepalWidthCm\n", + "PetalLengthCm\n", + "PetalWidthCm\n", + "Species_lblpreviewpreview_completedBUGBREWNULLNULLNULL
14normalizecolumns=SepalLengthCm,SepalWidthCm,PetalLengthCm mode=apply confirm=true 2025-11-01 10:30:50SepalLengthCm\n", + "SepalWidthCm\n", + "PetalLengthCmsuccessFeature range: (0.0, 1.0)\n", + "\n", + "Details:\n", + "Normalized 3 column(s) to range (0.0, 1.0).BUGBREWNULLNULLNULL
15splitdatatest_size=0.2 val_size=0.1 stratify=Species_lbl random_state=422025-11-01 10:30:55Species_lblsuccesstrain_name=last_select_train, test_name=last_select_test, val_name=last_select_val\n", + "train_count=70, test_count=20, val_count=10\n", + "test_frac=0.2, val_frac=0.1, shuffle=True, random_state=42BUGBREWNULLNULLNULL
16select_featurestarget=Species_lbl problem=classification k=42025-11-01 10:31:07PetalLengthCm\n", + "Id\n", + "SepalLengthCm\n", + "PetalWidthCmsuccessSelected 4 features saved to data['selected_features']: PetalLengthCm, Id, SepalLengthCm, PetalWidthCmBUGBREWNULLNULLNULL
17select_modeltarget=Species_lbl problem=classification 2025-11-01 10:31:35PetalLengthCm\n", + "Id\n", + "SepalLengthCm\n", + "PetalWidthCmsuccessBest model 'gbm' (mean accuracy=1.0000) saved to data['best_model'].BUGBREWNULLNULLNULL
18train_modelfeatures=PetalLengthCm,SepalLengthCm,PetalWidthCm,SepalWidthCm target=Species_lbl problem=classification model=logistic2025-11-01 10:31:43PetalLengthCm\n", + "SepalLengthCm\n", + "PetalWidthCm\n", + "SepalWidthCmsuccessModel 'logistic' trained and saved to data['last_model']. problem=classification. train_rows=70BUGBREWNULLNULLNULL
19evaluate_model2025-11-01 10:31:46PetalLengthCm\n", + "SepalLengthCm\n", + "PetalWidthCm\n", + "SepalWidthCmsuccessEvaluation success. Model='last_model', test='last_select_test', preds_saved='last_preds'. accuracy=0.9500, precision=0.9045, recall=0.9500, f1=0.9262BUGBREWNULLNULLNULL
20save_modelmodel_name_in_data=last_model save_path='./saved_models/model.joblib'2025-11-01 10:31:54last_modelsuccessSaved model to ./saved_models/model.joblib (features[4], target=Species_lbl)BUGBREWNULLNULLNULL
21load_modelload_path='./saved_models/model.joblib'2025-11-01 10:31:56last_modelsuccessLoaded model from ./saved_models/model.joblib → data['last_model'] (features[4], target=Species_lbl)BUGBREWNULLNULLNULL
22predict_modelmodel_name=last_model data_name=[6.4,4.2,2.8,1.2] output_name=last_preds2025-11-01 10:31:58PetalLengthCm\n", + "SepalLengthCm\n", + "PetalWidthCm\n", + "SepalWidthCmsuccessPrediction success. model=last_model, data_arg=[6.4, 4.2, 2.8, 1.2], output=last_preds, shape=(1, 1) inline_values=[6.4, 4.2, 2.8, 1.2]BUGBREWNULLNULLNULL
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "SELECT * FROM magic_metadata;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0706b5d-44a3-4bc3-a5c9-aff61a301cb4", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "411fb892-8b7c-4495-ae2e-d24554d872a8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MariaDB", + "language": "SQL", + "name": "mariadb_kernel" + }, + "language_info": { + "file_extension": ".sql", + "mimetype": "text/plain", + "name": "SQL" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/DemoNotebooks/test.docx b/DemoNotebooks/test.docx new file mode 100644 index 0000000..66f28d5 --- /dev/null +++ b/DemoNotebooks/test.docx @@ -0,0 +1,529 @@ +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. + +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. 
+ +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. +For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. + +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. + +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. 
+ +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. +For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. 
+ +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. + +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. + +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. 
+For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. + +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. + +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. + +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. 
Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. +For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. 
+ +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. + +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. + +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. 
+For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. + +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. + +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. + +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. 
Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. +For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. 
+ +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. + +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. + +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. 
+For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. + +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. + +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. + +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. 
Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. +For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. 
+ +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. + +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. + +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. 
+For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. + +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. + +Our store strives to deliver exceptional value and transparency across every stage of the customer experience. + +REFUND AND RETURN POLICY +Customers may request a refund within 30 days of receiving their item. 
Products must be unused, in original packaging, and accompanied by proof of purchase. +Refunds are processed within 5–7 business days once the returned item is inspected. +Digital products, such as downloadable content or gift cards, are non-refundable. + +EXCHANGE POLICY +We offer one free exchange per order for issues such as size or color mismatch. +To start an exchange, customers should visit the Returns Center on our website and provide their order ID. +Replacement items are shipped as soon as the returned package is scanned by the courier. + +SHIPPING AND DELIVERY +Free standard shipping applies to all orders over $75 within the continental United States. +International shipping rates vary depending on region and weight. +Express delivery options are available at an additional cost. +Customers will receive a tracking number once the order has been dispatched. + +PAYMENT METHODS +We accept major credit cards, PayPal, and Apple Pay. +For corporate purchases, wire transfers are supported upon request. +All transactions are encrypted using industry-standard SSL technology. + +WARRANTY INFORMATION +All electronics include a one-year limited warranty covering manufacturing defects. +To make a warranty claim, contact support@ourstore.com with your serial number and proof of purchase. +Warranty claims do not cover accidental damage or misuse. + +TECHNICAL SUPPORT +Our helpdesk operates 24/7 via email and live chat. +Common troubleshooting topics include product setup, firmware updates, and connectivity issues. +We also maintain a searchable online knowledge base for common problems. + +DATA PRIVACY AND SECURITY +We are fully compliant with GDPR and CCPA regulations. +Customer data is never sold to third parties and is used strictly for order processing and service improvement. +Users can request data deletion or export at any time by emailing privacy@ourstore.com. 
+ +SUSTAINABILITY COMMITMENTS +We use recycled materials in 80% of our packaging and are working toward carbon-neutral shipping. +Customers may opt for “eco-packaging” at checkout to reduce plastic use. +We also partner with certified recyclers to properly dispose of returned electronic devices. + +LOYALTY PROGRAM +Members of our loyalty program earn 1 point per dollar spent, redeemable for discounts on future purchases. +Exclusive perks include early access to sales, extended return windows, and birthday rewards. + +CUSTOMER FEEDBACK +We value user feedback and continuously improve based on reviews. +Suggestions can be submitted through the feedback form located at the bottom of our website. + +Thank you for shopping with us and supporting sustainable retail practices. \ No newline at end of file diff --git a/DemoNotebooks/test.txt b/DemoNotebooks/test.txt new file mode 100644 index 0000000..8843993 --- /dev/null +++ b/DemoNotebooks/test.txt @@ -0,0 +1,109 @@ +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. 
+ +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. 
+ +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? + +The magic now emits warnings if a file can't be read or requires an optional extractor (PyPDF2 / python-docx) but those libraries are not installed. + +If you'd like, I can: + +add support for PPTX / HTML extraction, + +automatically strip front-matter from markdown files, + +or produce a small PR-style diff instead of this canvas update. Which would you like? \ No newline at end of file diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py new file mode 100644 index 0000000..11423a0 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/clipoutliers.py @@ -0,0 +1,965 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. 
+from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +import numpy as np +from collections import namedtuple +import enum +from typing import Callable, List, NamedTuple, Tuple +import pandas +from pandas.core.frame import DataFrame +# note: we don't strictly rely on SqlFetch import path. We'll attempt to use it if available. +try: + from mariadb_kernel.sql_fetch import SqlFetch # optional; used if present +except Exception: + SqlFetch = None +import logging +import math +from datetime import datetime +import re +import os +import uuid +import time + + +class ClipOutliers(MariaMagic): + """ + %clipoutliers [columns=col1,col2,...] [method=iqr|zscore] + [k=1.5] [z_thresh=3.0] [inplace=True|False] + [mode=preview|apply|rollback] [table=schema.table] [confirm=true|false] + [sample_size=100] [lock_timeout=10] + + Clamps (clips) extreme values to computed boundary limits. + - method: + iqr -> Tukey IQR method using k (default 1.5) + zscore -> mean ± z_thresh * std (default z_thresh=3.0) + - columns: comma-separated list of columns to operate on. If omitted, all numeric columns are used. + - inplace: if True (default) modifies data["last_select"] in-place. + if False stores clipped copy in data["last_select_clipped"]. + - mode: + preview -> show what would happen (local + optional DB estimates) + apply -> perform clipping (local or DB) + rollback-> restore DB backup created by apply + Additionally, execution metadata is stored into a table `magic_metadata`. + """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "clipoutliers" + + def help(self): + return ( + "%clipoutliers [columns=col1,col2,...] 
[method=iqr|zscore] [k=1.5] [z_thresh=3.0] [inplace=True|False]\n" + " [mode=preview|apply|rollback] [table=schema.table] [confirm=true|false]\n" + " [sample_size=100] [lock_timeout=10]\n" + "Clamps extreme numeric values to computed boundaries (in-place by default).\n" + "Execution metadata is recorded in table `magic_metadata`." + ) + + def _str_to_obj(self, s): + """Convert strings like numbers or bools into Python objects.""" + try: + return int(s) + except ValueError: + try: + return float(s) + except ValueError: + pass + try: + return bool(util.strtobool(s)) + except Exception: + if isinstance(s, str) and len(s) >= 2 and ((s[0] == s[-1] == '"') or (s[0] == s[-1] == "'")): + return s[1:-1] + return s + + def parse_args(self, input_str): + """Parse key=value arguments.""" + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df): + """Display DataFrame as HTML.""" + try: + html = df.to_html(index=False) + mime = "text/html" + except Exception: + html = str(df) + mime = "text/plain" + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {mime: html}, "metadata": {}}) + + def _compute_bounds(self, series, method, k=1.5, z_thresh=3.0): + """Compute (lower, upper) clipping bounds for a pandas Series.""" + s = series.dropna() + if s.empty: + return None, None + if method == "iqr": + q1 = s.quantile(0.25) + q3 = s.quantile(0.75) + iqr = q3 - q1 + lower = q1 - k * iqr + upper = q3 + k * iqr + return float(lower), float(upper) + elif method == "zscore": + mean = s.mean() + std = s.std() + if std == 0 or np.isnan(std): + return None, None + lower = mean - z_thresh * std + upper = mean + z_thresh * std + return float(lower), float(upper) + else: + raise ValueError(f"Unknown method {method}") + + # ---- New DB / metadata helpers ---- + def _sql_escape(self, val): + 
"""Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + # double single-quotes for SQL escaping + return "'" + val.replace("'", "''") + "'" + + def _get_mariadb_client(self, kernel): + """Return mariadb_client if present on kernel, else None""" + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + """Return a logger on kernel if present, else create a temporary logger""" + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _get_db_name(self, kernel): + """ + Attempt to determine the currently used DB. + Prefer SqlFetch if available; otherwise run SELECT DATABASE(); and try to parse. + Returns empty string if none found. + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + # fallthrough to manual approach + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + # Fallback: run SELECT DATABASE(); + if mariadb_client is None: + return "" + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror(): + return "" + if not result: + return "" + # If result is raw HTML table, try to parse with pandas + try: + df_list = pandas.read_html(result) + if df_list and isinstance(df_list, list) and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + if isinstance(val, float) and pandas.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + # if not parseable by pandas, try regex to extract first cell content + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)) # strip tags + txt = txt.strip() + if txt.lower() == "null" or txt == 
"": + return "" + return txt + # If result is plain text (like the DB name) + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _ensure_metadata_table(self, kernel, db_name): + """ + Create magic_metadata table if it doesn't exist. + Includes rollback support columns (rollback_token, backup_table, original_table). + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if mariadb_client is None: + # nothing to do + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error(f"Error creating magic_metadata table: {mariadb_client.run_statement('SHOW WARNINGS;')}") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name, + rollback_token=None, backup_table=None, original_table=None): + """ + Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + # Escape values + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + rollback_sql = self._sql_escape(rollback_token) + backup_sql = self._sql_escape(backup_table) + original_sql = self._sql_escape(original_table) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name, rollback_token, backup_table, original_table) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql}, + {rollback_sql}, + {backup_sql}, + {original_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + # swallow errors but log + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + def _get_user_name(self, kernel): + """Try several places to find the current user name; fallback to OS login or empty string.""" + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _acquire_lock(self, mariadb_client, lock_name, timeout=10): + try: + 
mariadb_client.run_statement(f"SELECT GET_LOCK('{lock_name}', {int(timeout)});") + if mariadb_client.iserror(): + return False + return True + except Exception: + return False + + def _release_lock(self, mariadb_client, lock_name): + try: + mariadb_client.run_statement(f"SELECT RELEASE_LOCK('{lock_name}');") + except Exception: + pass + + def _table_exists(self, mariadb_client, table_full_name): + try: + mariadb_client.run_statement(f"SELECT 1 FROM {table_full_name} LIMIT 1;") + return not mariadb_client.iserror() + except Exception: + return False + + # DB helpers for threshold computation and parsing + def _compute_thresholds_db(self, mariadb_client, table_full, col, method, k=1.5, z_thresh=3.0, sample_size=100): + """ + Sample non-null values from DB and compute thresholds for IQR or zscore. + Returns (ok, {lower:.., upper:..}, message) + """ + try: + out = mariadb_client.run_statement(f"SELECT {col} FROM {table_full} WHERE {col} IS NOT NULL LIMIT {int(sample_size)};") + if mariadb_client.iserror() or not out: + return False, None, "sample query failed" + try: + df_list = pd.read_html(out) + if not df_list or len(df_list) == 0: + return False, None, "no sample rows parsed" + # try numeric conversion + series = pd.to_numeric(df_list[0].iloc[:, 0], errors="coerce").dropna() + if series.empty: + return False, None, "sample contains no numeric values" + if method == "iqr": + q1 = series.quantile(0.25) + q3 = series.quantile(0.75) + iqr = q3 - q1 + lower = q1 - k * iqr + upper = q3 + k * iqr + return True, {"lower": float(lower), "upper": float(upper)}, "iqr via sampling" + elif method == "zscore": + mean = float(series.mean()) + std = float(series.std()) + if std == 0: + return False, None, "std==0 in sample" + lower = mean - float(z_thresh) * std + upper = mean + float(z_thresh) * std + return True, {"lower": float(lower), "upper": float(upper)}, "zscore via sampling" + else: + return False, None, "unknown method" + except Exception: + vals = re.findall(r"(.*?)", 
str(out), flags=re.S | re.I) + nums = [] + for v in vals: + txt = re.sub(r"<.*?>", "", v).strip() + try: + nums.append(float(txt)) + except Exception: + continue + if not nums: + return False, None, "parsed sample contains no numeric values" + series = pd.Series(nums) + if method == "iqr": + q1 = series.quantile(0.25) + q3 = series.quantile(0.75) + iqr = q3 - q1 + lower = q1 - k * iqr + upper = q3 + k * iqr + return True, {"lower": float(lower), "upper": float(upper)}, "iqr via regex sample" + elif method == "zscore": + mean = float(series.mean()) + std = float(series.std()) + if std == 0: + return False, None, "std==0 in sample" + lower = mean - float(z_thresh) * std + upper = mean + float(z_thresh) * std + return True, {"lower": float(lower), "upper": float(upper)}, "zscore via regex sample" + else: + return False, None, "unknown method" + except Exception as e: + return False, None, f"exception computing thresholds: {e}" + + def _parse_count_result(self, res): + """Parse a SELECT COUNT(*) result returned by mariadb_client.run_statement (HTML or text).""" + try: + df_list = pd.read_html(res) + if df_list and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + try: + return int(val) + except Exception: + try: + return int(float(val)) + except Exception: + return None + except Exception: + m = re.search(r"(.*?)", str(res), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + try: + return int(txt) + except Exception: + try: + return int(float(txt)) + except Exception: + return None + # fallback: try to parse raw + try: + txt = str(res).strip() + return int(txt) + except Exception: + try: + return int(float(str(res))) + except Exception: + return None + + # ---- End DB helpers ---- + + def execute(self, kernel, data): + """Execute the %clipoutliers magic with metadata logging and DB support.""" + df = data.get("last_select") + if df is None: + kernel._send_message("stderr", "No last_select found in kernel data.") + return + if hasattr(df, 
"empty") and df.empty: + kernel._send_message("stderr", "There is no data to process (empty DataFrame).") + return + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + return + + # parse args + columns_arg = args.get("columns", None) + if isinstance(columns_arg, str): + columns = [c.strip() for c in columns_arg.split(",") if c.strip()] + elif isinstance(columns_arg, (list, tuple)): + columns = list(columns_arg) + else: + columns = None + + method = str(args.get("method", "iqr")).lower() + if method not in {"iqr", "zscore"}: + kernel._send_message("stderr", f"Unknown method '{method}'. Allowed: iqr, zscore.") + return + + try: + k = float(args.get("k", 1.5)) + except Exception: + k = 1.5 + try: + z_thresh = float(args.get("z_thresh", 3.0)) + except Exception: + z_thresh = 3.0 + inplace = bool(args.get("inplace", True)) + + # mode and DB args + mode = str(args.get("mode", "preview")).lower() + mode = mode if mode in {"preview", "apply", "rollback"} else "preview" + table_full = args.get("table", None) + confirm = bool(args.get("confirm", False)) + sample_size = int(args.get("sample_size", 100)) + lock_timeout = int(args.get("lock_timeout", 10)) + + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + + # Determine numeric columns + if columns is None: + target_columns = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])] + else: + missing_cols = [c for c in columns if c not in df.columns] + if missing_cols: + kernel._send_message("stderr", f"Column(s) not found: {', '.join(missing_cols)}") + # log and return + try: + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns) if columns 
else "", + operation_status="error", + message=f"Column(s) not found: {', '.join(missing_cols)}", + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + target_columns = [c for c in columns if pd.api.types.is_numeric_dtype(df[c])] + non_numeric = [c for c in columns if c not in target_columns] + if non_numeric: + kernel._send_message("stdout", f"Warning: non-numeric columns skipped: {', '.join(non_numeric)}") + + if not target_columns: + kernel._send_message("stderr", "No numeric target columns found to clip outliers.") + # log and return + try: + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message="No numeric target columns found to clip outliers.", + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # ensure metadata table exists + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + # --- PREVIEW MODE --- + if mode == "preview": + try: + messages = [] + total_would_change = 0 + combined_info = [] + for col in target_columns: + lower, upper = self._compute_bounds(df[col], method, k=k, z_thresh=z_thresh) + if lower is None and upper is None: + messages.append(f"Column '{col}': insufficient local data to compute bounds; skipped.") + combined_info.append((col, None, None, 0)) + continue + mask = ((df[col] < lower) | (df[col] > upper)) & ~df[col].isna() + n_changed = int(mask.sum()) + total_would_change += n_changed + messages.append(f"Column '{col}': would clip {n_changed} value(s) locally (bounds: {lower}, {upper}).") + combined_info.append((col, lower, upper, n_changed)) + + n_before = len(df) + n_after = n_before # clipping doesn't remove rows locally + 
kernel._send_message("stdout", f"PREVIEW (local): would modify {total_would_change} value(s) across {len(target_columns)} column(s).\n" + "\n".join(messages)) + + # sample rows that have any out-of-bounds values + mask_any = pd.Series(False, index=df.index) + for col, lower, upper, _ in combined_info: + if lower is None and upper is None: + continue + mask_any = mask_any | (((df[col] < lower) | (df[col] > upper)) & ~df[col].isna()) + sample_rows = df[mask_any].head(sample_size).copy() + if not sample_rows.empty: + # annotate which columns are OOB for each row + def oob_cols(r): + cols = [c for c, lower, upper, _ in combined_info if lower is not None and upper is not None and (pd.notna(r.get(c)) and (r.get(c) < lower or r.get(c) > upper))] + return ",".join(cols) + sample_rows["_oob_columns"] = sample_rows.apply(oob_cols, axis=1) + + # ADD: compute and show clipped preview columns for visibility + for c, lower, upper, _ in combined_info: + clipped_col_name = f"{c}_clipped_preview" + try: + if lower is None and upper is None: + # no computed bounds; copy original values + sample_rows[clipped_col_name] = sample_rows[c] + else: + # use pandas clip to compute what the value would be after clipping + sample_rows[clipped_col_name] = sample_rows[c].clip(lower=lower, upper=upper) + except Exception: + # fallback: try elementwise clipping to avoid exceptions on mixed types + def _clip_val(v): + try: + if pd.isna(v): + return v + if lower is not None and v < lower: + return lower + if upper is not None and v > upper: + return upper + return v + except Exception: + return v + sample_rows[clipped_col_name] = sample_rows[c].apply(_clip_val) + + try: + # prefer HTML display; this will include the *_clipped_preview columns + self._send_html(kernel, sample_rows) + except Exception: + kernel._send_message("stdout", str(sample_rows.head())) + else: + kernel._send_message("stdout", "PREVIEW (local): no sample rows flagged as out-of-bounds.") + + # DB estimates if requested + if 
table_full and mariadb_client is not None: + db_messages = [] + predicates = [] + for col in target_columns: + ok, thresholds, msg = self._compute_thresholds_db(mariadb_client, table_full, col, method, k=k, z_thresh=z_thresh, sample_size=sample_size) + if ok and thresholds: + lower = thresholds["lower"] + upper = thresholds["upper"] + predicates.append(f"({col} < {repr(lower)} OR {col} > {repr(upper)})") + db_messages.append(f"{col}: thresholds approx [{lower}, {upper}] ({msg})") + else: + db_messages.append(f"{col}: could not compute thresholds ({msg}) - skipped") + + if predicates: + db_pred = " OR ".join(predicates) + try: + out = mariadb_client.run_statement(f"SELECT COUNT(*) FROM {table_full} WHERE {db_pred};") + cnt = self._parse_count_result(out) + if cnt is None: + kernel._send_message("stdout", "PREVIEW (db): could not parse count result (check permissions).") + else: + kernel._send_message("stdout", f"PREVIEW (db): estimated rows with OOB values: {cnt}.") + except Exception: + kernel._send_message("stdout", "PREVIEW (db): failed to run count query (continuing).") + kernel._send_message("stdout", "PREVIEW (db) thresholds:\n" + "\n".join(db_messages)) + else: + kernel._send_message("stdout", "PREVIEW (db): no DB predicates could be computed (insufficient sample/values).") + + # log preview metadata + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='preview', + message='preview_completed', + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Error during preview: {e}") + return + + # --- ROLLBACK MODE --- + if mode == "rollback": + if mariadb_client is None: + kernel._send_message("stderr", "Rollback requested but no mariadb_client available.") + return + token = args.get("rollback_token", None) + try: + if not 
token: + mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Could not find metadata for rollback (check permissions).") + return + out = mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + m = re.search(r"(.*?)", str(out), flags=re.S | re.I) + if m: + token = re.sub(r"<.*?>", "", m.group(1)).strip() + if not token: + kernel._send_message("stderr", "No rollback_token found; cannot rollback safely.") + return + + # fetch backup_table and original_table + out = mariadb_client.run_statement(f"SELECT backup_table, original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + m = re.search(r"(.*?).*?(.*?)", str(out), flags=re.S | re.I) + backup_table = None + original_table = None + if m: + backup_table = re.sub(r"<.*?>", "", m.group(1)).strip() + original_table = re.sub(r"<.*?>", "", m.group(2)).strip() + else: + try: + out_b = mariadb_client.run_statement(f"SELECT backup_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mb = re.search(r"(.*?)", str(out_b), flags=re.S | re.I) + if mb: + backup_table = re.sub(r"<.*?>", "", mb.group(1)).strip() + except Exception: + pass + try: + out_o = mariadb_client.run_statement(f"SELECT original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mo = re.search(r"(.*?)", str(out_o), flags=re.S | re.I) + if mo: + original_table = re.sub(r"<.*?>", "", mo.group(1)).strip() + except Exception: + pass + + if not backup_table: + kernel._send_message("stderr", "No backup table found in metadata for rollback 
token.") + return + + # perform atomic restore: backup_table -> original_table + lock_name = f"clipoutliers_rb_{token}" + self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + try: + if original_table: + if self._table_exists(mariadb_client, original_table): + original_old = f"{original_table}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {original_table} TO {original_old}, {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename tables during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {original_table}; previous {original_table} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={original_table};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {original_table}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={original_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + # try to infer original table name from arguments + out_args = mariadb_client.run_statement(f"SELECT 
arguments FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + margs = re.search(r"(.*?)", str(out_args), flags=re.S | re.I) + inferred_original = None + if margs: + args_txt = re.sub(r"<.*?>", "", margs.group(1)).strip() + mm = re.search(r"table\s*=\s*([^\s,]+)", args_txt) + if mm: + inferred_original = mm.group(1).strip() + if inferred_original: + if self._table_exists(mariadb_client, inferred_original): + original_old = f"{inferred_original}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {inferred_original} TO {original_old}, {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {inferred_original}; previous {inferred_original} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={inferred_original};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to inferred original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {inferred_original}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={inferred_original}', + db_name=db_name, + 
user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + kernel._send_message("stderr", "Could not determine original table name for rollback. Manual restoration required.") + return + finally: + self._release_lock(mariadb_client, lock_name) + except Exception as e: + kernel._send_message("stderr", f"Rollback error: {e}") + return + + # --- APPLY MODE --- + if mode == "apply": + # DB-target apply if table provided and mariadb_client present + if table_full and mariadb_client is not None: + if not confirm: + kernel._send_message("stderr", "DB apply requires confirm=true to proceed. Preview first, then re-run with confirm=true.") + return + + token = str(uuid.uuid4()).replace('-', '')[:16] + backup_table = f"{table_full}_backup_{token}" + new_table = f"{table_full}_vnew_{token}" + clip_map = {} + messages = [] + + # compute thresholds for each column using DB sampling + for col in target_columns: + ok, thresholds, msg = self._compute_thresholds_db(mariadb_client, table_full, col, method, k=k, z_thresh=z_thresh, sample_size=sample_size) + if ok and thresholds: + clip_map[col] = (thresholds["lower"], thresholds["upper"]) + messages.append(f"{col}: thresholds [{thresholds['lower']}, {thresholds['upper']}] ({msg})") + else: + messages.append(f"{col}: could not compute thresholds ({msg}); will leave column unchanged in DB apply") + + # Build SELECT exprs: for clipped cols use LEAST(GREATEST(col, lower), upper) AS col, else `col` + select_exprs = [] + for c in df.columns: + if c in clip_map: + lower, upper = clip_map[c] + # use repr to preserve numeric literal format + select_exprs.append(f"LEAST(GREATEST({c}, {repr(lower)}), {repr(upper)}) AS {c}") + else: + select_exprs.append(c) + select_sql = ", ".join(select_exprs) + + try: + lock_name = f"clipoutliers_apply_{token}" + got_lock = self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + if not got_lock: + kernel._send_message("stderr", 
"Could not acquire advisory lock; aborting apply.") + return + + # create new table with clipped values + mariadb_client.run_statement(f"CREATE TABLE {new_table} AS SELECT {select_sql} FROM {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to create new table for apply (CTAS failed).") + return + + # atomic rename original -> backup, new -> original + mariadb_client.run_statement(f"RENAME TABLE {table_full} TO {backup_table}, {new_table} TO {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "RENAME TABLE failed (apply may be inconsistent).") + return + + kernel._send_message("stdout", f"Apply completed: original preserved as {backup_table}.") + # log metadata (include token so user can rollback) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='applied', + message=f'applied_backup={backup_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=table_full + ) + + # attempt to refresh last_select with a sample + try: + mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + fresh = mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + try: + df_list = pd.read_html(fresh) + if df_list and len(df_list) > 0: + data["last_select"] = df_list[0] + try: + self._send_html(kernel, data["last_select"]) + except Exception: + pass + except Exception: + kernel._send_message("stdout", "Applied to DB; could not refresh last_select from DB.") + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Apply (DB versioned) failed: {e}") + log.exception(e) + finally: + self._release_lock(mariadb_client, lock_name) + return + + else: + # Local in-place apply on data['last_select'] (existing behavior) + target_df = df 
if inplace else df.copy(deep=True) + messages = [] + total_clipped = 0 + operation_status = "success" + try: + for col in target_columns: + series = target_df[col] + lower, upper = self._compute_bounds(series, method, k=k, z_thresh=z_thresh) + if lower is None and upper is None: + messages.append(f"Column '{col}': insufficient data to compute bounds; skipped.") + continue + mask = ((series < lower) | (series > upper)) & ~series.isna() + n_changed = int(mask.sum()) + target_df[col] = series.clip(lower=lower, upper=upper) + total_clipped += n_changed + messages.append(f"Column '{col}': clipped {n_changed} value(s) (bounds: {lower:.4f}, {upper:.4f}).") + if inplace: + data["last_select"] = target_df + location_msg = "Modified in-place: data['last_select'] updated." + else: + data["last_select_clipped"] = target_df + location_msg = "Result stored in data['last_select_clipped'] (original unchanged)." + kernel._send_message("stdout", f"Clip outliers completed using {method}.\n" + + "\n".join(messages) + + f"\nTotal values clipped: {total_clipped}. 
{location_msg}") + except Exception as e: + operation_status = "error" + messages.append(f"Fatal error during clipping: {e}") + kernel._send_message("stderr", f"Fatal error during clipping: {e}") + + # Insert metadata + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "\n".join(target_columns) + message_str = "\n".join(messages) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status=operation_status, + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception as e: + try: + kernel._send_message("stdout", f"Warning: failed to write metadata: {e}") + except Exception: + pass + + # Show output (DataFrame) + try: + self._send_html(kernel, target_df) + except Exception: + pass + + return + + # fallback + kernel._send_message("stderr", "Unknown execution path reached.") + return diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py new file mode 100644 index 0000000..8f50a62 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropmissing.py @@ -0,0 +1,838 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +from collections import namedtuple +import logging +import math +import os +import re +import time +import uuid +import json +import html + +# Attempt to import SqlFetch if available (helps to determine current DB reliably) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + + +class DropMissing(MariaMagic): + """ + %dropmissing [columns=col1,col2,...] 
[mode=preview|apply|rollback] + [table=schema.table] + [sample_size=100] [confirm=true|false] + [rollback_token=] [lock_timeout=10] + + Notes: + - The "analyze" mode has been removed. + - There is no strategy argument: DB applies always use the safe "versioned" + approach (CTAS + atomic RENAME). This provides a straightforward + rollback path via a backup table and rollback_token. + - When an apply is performed the generated rollback_token is printed to + stdout so users can copy it for a later rollback. + """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "dropmissing" + + def help(self): + return ( + "%dropmissing [columns=col1,col2,...] [mode=preview|apply|rollback] [table=schema.table]\n" + "Preview operates on data['last_select']. Apply will always use a versioned CTAS+RENAME strategy (requires confirm=true when targeting DB).\n" + "Execution metadata recorded in table `magic_metadata`." + ) + + # -------------------- Basic helpers --------------------------------- + def _str_to_obj(self, s): + try: + return int(s) + except ValueError: + try: + return float(s) + except ValueError: + pass + try: + return bool(util.strtobool(s)) + except Exception: + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df): + try: + html_repr = df.to_html(index=False) + mime = "text/html" + except Exception: + html_repr = "
" + html.escape(str(df)) + "
" + mime = "text/html" + display_content = {"data": {mime: html_repr}, "metadata": {}} + kernel.send_response(kernel.iopub_socket, "display_data", display_content) + + # -------------------- DB / metadata helpers --------------------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch.get_db_name failed; falling back to SELECT DATABASE()") + if mariadb_client is None: + return "" + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + try: + df_list = pd.read_html(result) + if df_list and isinstance(df_list, list) and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if 
isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name, + rollback_token=None, backup_table=None, original_table=None): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + rollback_sql = self._sql_escape(rollback_token) + backup_sql = self._sql_escape(backup_table) + original_sql = self._sql_escape(original_table) + insert_sql = f""" + INSERT INTO {table_full_name} + 
(command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name, rollback_token, backup_table, original_table) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql}, + {rollback_sql}, + {backup_sql}, + {original_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + def _acquire_lock(self, mariadb_client, lock_name, timeout=10): + try: + mariadb_client.run_statement(f"SELECT GET_LOCK('{lock_name}', {int(timeout)});") + if mariadb_client.iserror(): + return False + return True + except Exception: + return False + + def _release_lock(self, mariadb_client, lock_name): + try: + mariadb_client.run_statement(f"SELECT RELEASE_LOCK('{lock_name}');") + except Exception: + pass + + # ---------- End DB helpers ---------- + + def _build_delete_predicate(self, columns): + """Return SQL predicate that matches rows with missing values in given columns. + columns==None means any column is NULL => predicate for any column null can't be generated without schema + so we return None in that case (caller should handle). 
+ """ + if not columns: + return None + clauses = [f"{col} IS NULL" for col in columns] + return " OR ".join(clauses) + + def _table_exists(self, mariadb_client, table_full_name): + try: + mariadb_client.run_statement(f"SELECT 1 FROM {table_full_name} LIMIT 1;") + return not mariadb_client.iserror() + except Exception: + return False + + def execute(self, kernel, data): + """Execute the dropmissing magic (supports preview/analyze/apply/rollback and logs metadata).""" + df = data.get("last_select") + if df is None: + kernel._send_message("stderr", "No last_select found in kernel data.") + return + + if hasattr(df, "empty") and df.empty: + kernel._send_message("stderr", "There is no data to process (empty DataFrame).") + return + + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + return + + # parse columns + columns_arg = args.get("columns", None) + if isinstance(columns_arg, str): + columns = [c.strip() for c in columns_arg.split(",") if c.strip()] + elif isinstance(columns_arg, (list, tuple)): + columns = list(columns_arg) + else: + columns = None + + # operational args + mode = str(args.get("mode", "preview")).lower() + mode = mode if mode in {"preview", "analyze", "apply", "rollback"} else "preview" + table_full = args.get("table", None) # expected 'schema.table' or 'table' + strategy = str(args.get("strategy", "versioned")).lower() + sample_size = int(args.get("sample_size", 100)) + confirm = bool(args.get("confirm", False)) + pk_col = args.get("pk", None) + rollback_token = args.get("rollback_token", None) + lock_timeout = int(args.get("lock_timeout", 10)) + analyze_real = bool(args.get("analyze_real", False)) + + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + + # validate requested columns exist in df + if columns is not None: + missing_cols = [c 
for c in columns if c not in df.columns] + if missing_cols: + kernel._send_message("stderr", f"Column(s) not found in last_select: {', '.join(missing_cols)}") + # Log metadata for failure + try: + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=','.join(columns) if columns else "", + operation_status="error", + message=f"Column(s) not found: {', '.join(missing_cols)}", + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # prepare predicate + sql_predicate = self._build_delete_predicate(columns) + + # metadata table ensure (best-effort) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + # --- PREVIEW MODE ------------------------------------------------- + if mode == "preview": + try: + before_count = len(df) + if columns is None: + after_df = df.dropna() + else: + after_df = df.dropna(axis=0, subset=columns) + after_count = len(after_df) + dropped = before_count - after_count + + kernel._send_message("stdout", f"PREVIEW: would drop {dropped} row(s) (from {before_count} to {after_count}).") + + # show small sample with before/after preview for rows that would be dropped + if columns is None: + predicate_mask = df.isnull().any(axis=1) + else: + predicate_mask = df[columns].isnull().any(axis=1) + + sample_rows = df[predicate_mask].head(sample_size) + # show 'after' preview as dropped rows (so after preview is empty for those rows) + sample_preview = sample_rows.copy() + sample_preview["_would_be_dropped"] = True + + if not sample_preview.empty: + try: + self._send_html(kernel, sample_preview) + except Exception: + pass + + # If table specified, show EXPLAIN for corresponding DELETE + if table_full and mariadb_client is not None: + 
if sql_predicate is None: + kernel._send_message("stdout", "Preview: cannot generate DB predicate for 'any column null' without explicit columns.") + else: + delete_sql = f"DELETE FROM {table_full} WHERE {sql_predicate};" + try: + # EXPLAIN (no execute) + mariadb_client.run_statement("EXPLAIN FORMAT=JSON " + delete_sql) + if mariadb_client.iserror(): + kernel._send_message("stdout", "Could not run EXPLAIN on DB — check permissions or SQL syntax.") + else: + kernel._send_message("stdout", "EXPLAIN (estimate) for corresponding DELETE (JSON):") + kernel._send_message("stdout", mariadb_client.run_statement("EXPLAIN FORMAT=JSON " + delete_sql)) + except Exception: + kernel._send_message("stdout", "Failed to run EXPLAIN on DB (continuing).") + + # log preview metadata + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns) if columns else 'ALL_COLUMNS', + operation_status='preview', + message=f'preview_dropped={dropped}', + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Error during preview: {e}") + return + + # --- ANALYZE MODE ----------------------------------------------- + if mode == "analyze": + if mariadb_client is None or not table_full: + kernel._send_message("stderr", "ANALYZE requires a connected mariadb_client and table= argument.") + return + if sql_predicate is None: + kernel._send_message("stderr", "ANALYZE requires explicit columns= to generate delete predicate.") + return + delete_sql = f"DELETE FROM {table_full} WHERE {sql_predicate};" + try: + # Run EXPLAIN (estimate) + explain_out = mariadb_client.run_statement("EXPLAIN FORMAT=JSON " + delete_sql) + kernel._send_message("stdout", "EXPLAIN (estimate):") + kernel._send_message("stdout", explain_out) + # Optionally run EXPLAIN ANALYZE if requested + if analyze_real: + try: + analyze_out = 
mariadb_client.run_statement("EXPLAIN ANALYZE " + delete_sql) + kernel._send_message("stdout", "EXPLAIN ANALYZE (actual run):") + kernel._send_message("stdout", analyze_out) + except Exception: + kernel._send_message("stdout", "EXPLAIN ANALYZE failed or is not supported on this server.") + # log analyze metadata + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns) if columns else 'ALL_COLUMNS', + operation_status='analyze', + message='analyze_completed', + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + except Exception as e: + kernel._send_message("stderr", f"Error during analyze: {e}") + return + + # --- ROLLBACK MODE --------------------------------------------- + if mode == "rollback": + if mariadb_client is None: + kernel._send_message("stderr", "Rollback requested but no mariadb_client available.") + return + # If rollback_token provided, try to find matching metadata entry + token = rollback_token + try: + if not token: + # try to read latest magic_metadata entry for this command and user + mariadb_client.run_statement(f"SELECT id, rollback_token, backup_table, original_table, arguments, execution_timestamp FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Could not find metadata for rollback (check permissions).") + return + out = mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + m = re.search(r"(.*?)", str(out), flags=re.S | re.I) + if m: + token = re.sub(r"<.*?>", "", m.group(1)).strip() + if not token: + kernel._send_message("stderr", "No 
rollback_token found; cannot rollback safely.") + return + # now find backup_table and original_table associated with token + mariadb_client.run_statement(f"SELECT backup_table, original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + backup_out = mariadb_client.run_statement(f"SELECT backup_table, original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + m = re.search(r"(.*?).*?(.*?)", str(backup_out), flags=re.S | re.I) + backup_table = None + original_table = None + if m: + # depending on HTML ordering we try to extract both; fallback below parses individually + backup_table = re.sub(r"<.*?>", "", m.group(1)).strip() + original_table = re.sub(r"<.*?>", "", m.group(2)).strip() + else: + # fallback: fetch backup_table + try: + out_b = mariadb_client.run_statement(f"SELECT backup_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mb = re.search(r"(.*?)", str(out_b), flags=re.S | re.I) + if mb: + backup_table = re.sub(r"<.*?>", "", mb.group(1)).strip() + except Exception: + pass + try: + out_o = mariadb_client.run_statement(f"SELECT original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mo = re.search(r"(.*?)", str(out_o), flags=re.S | re.I) + if mo: + original_table = re.sub(r"<.*?>", "", mo.group(1)).strip() + except Exception: + pass + + if not backup_table: + kernel._send_message("stderr", "No backup table found in metadata for rollback token.") + return + + # perform atomic swap to restore backup -> original + lock_name = f"dropmissing_rb_{token}" + self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + try: + # If original_table was recorded during apply, prefer to use it + if original_table: + # if original exists, rename original -> original_backup_before_rb_{token}, then rename backup -> original + if self._table_exists(mariadb_client, original_table): + # 
create a unique temp name for the old original + original_old = f"{original_table}_prerollback_{token}" + # atomic multi-rename: rename original -> original_old, backup -> original + mariadb_client.run_statement(f"RENAME TABLE {original_table} TO {original_old}, {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename tables during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {original_table}; previous {original_table} renamed to {original_old}.") + # record rollback metadata + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns) if columns else 'ALL_COLUMNS', + operation_status='rollback', + message=f'restored_to={original_table};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + # original does not exist currently; rename backup -> original directly + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {original_table}.") + # record rollback metadata + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns) if columns else 'ALL_COLUMNS', + operation_status='rollback', + message=f'restored_to={original_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + # No original_table recorded — best-effort: attempt to infer original name 
from arguments + # try to fetch arguments column + try: + out_args = mariadb_client.run_statement(f"SELECT arguments FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + margs = re.search(r"(.*?)", str(out_args), flags=re.S | re.I) + inferred_original = None + if margs: + args_txt = re.sub(r"<.*?>", "", margs.group(1)).strip() + # try to find table=... inside arguments + mm = re.search(r"table\s*=\s*([^\s,]+)", args_txt) + if mm: + inferred_original = mm.group(1).strip() + if inferred_original: + # same logic as above using inferred_original + if self._table_exists(mariadb_client, inferred_original): + original_old = f"{inferred_original}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {inferred_original} TO {original_old}, {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {inferred_original}; previous {inferred_original} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns) if columns else 'ALL_COLUMNS', + operation_status='rollback', + message=f'restored_to={inferred_original};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to inferred original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {inferred_original}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + 
arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns) if columns else 'ALL_COLUMNS', + operation_status='rollback', + message=f'restored_to={inferred_original}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + kernel._send_message("stderr", "Could not determine original table name for rollback. Manual restoration required.") + return + except Exception as e: + kernel._send_message("stderr", f"Rollback error while inferring original table: {e}") + return + finally: + self._release_lock(mariadb_client, lock_name) + except Exception as e: + kernel._send_message("stderr", f"Rollback error: {e}") + return + + # --- APPLY MODE ----------------------------------------------- + if mode == "apply": + # two main apply targets: DB (table_full provided and mariadb_client present) or local DataFrame + if table_full and mariadb_client is not None: + # safety: require explicit confirmation to run DB changes + if not confirm: + kernel._send_message("stderr", "DB apply requires confirm=true to proceed. 
Preview first, then re-run with confirm=true.") + return + + if sql_predicate is None: + kernel._send_message("stderr", "Apply to DB requires explicit columns= to build a safe predicate (avoid accidental full-table deletes).") + return + + # strategy selection + if strategy == "versioned": + # create a new table (CTAS) containing rows we want to keep (i.e., NOT predicate) + # generate unique backup name + token = str(uuid.uuid4()).replace('-', '')[:16] + backup_table = f"{table_full}_backup_{token}" + new_table = f"{table_full}_vnew_{token}" + delete_pred = sql_predicate + try: + # acquire lock + lock_name = f"dropmissing_apply_{token}" + got_lock = self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + if not got_lock: + kernel._send_message("stderr", "Could not acquire advisory lock; aborting apply.") + return + + # create new table with rows to keep + mariadb_client.run_statement(f"CREATE TABLE {new_table} AS SELECT * FROM {table_full} WHERE NOT ({delete_pred});") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to create new table for apply (CTAS failed).") + return + # basic validation: counts (best-effort) + mariadb_client.run_statement(f"SELECT COUNT(*) FROM {table_full};") + total_old = mariadb_client.run_statement(f"SELECT COUNT(*) FROM {table_full};") + mariadb_client.run_statement(f"SELECT COUNT(*) FROM {new_table};") + + # atomic rename: original -> backup, new -> original + mariadb_client.run_statement(f"RENAME TABLE {table_full} TO {backup_table}, {new_table} TO {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "RENAME TABLE failed (apply may be inconsistent).") + # attempt cleanup + return + + kernel._send_message("stdout", f"Apply completed: original preserved as {backup_table}.") + # log metadata (include token so user can rollback) and record original_table + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) 
else str(self.args), + affected_columns='\n'.join(columns) if columns else 'ALL_COLUMNS', + operation_status='applied', + message=f'applied_backup={backup_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=table_full + ) + # update in-memory last_select to reflect applied state (fetch fresh) + try: + mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + fresh = mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + # try to parse HTML into DataFrame + try: + df_list = pd.read_html(fresh) + if df_list and len(df_list) > 0: + data["last_select"] = df_list[0] + try: + self._send_html(kernel, data["last_select"]) + except Exception: + pass + except Exception: + # cannot parse, just notify + kernel._send_message("stdout", "Applied to DB; could not refresh last_select from DB.") + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Apply (versioned) failed: {e}") + log.exception(e) + finally: + self._release_lock(mariadb_client, lock_name) + return + + elif strategy == "transactional": + # transactional apply: capture changed rows in an audit table, then delete + if not pk_col: + kernel._send_message("stderr", "Transactional strategy requires pk= to capture changed rows for rollback. 
Falling back to versioned strategy.") + # fall back to versioned + args["strategy"] = "versioned" + self.args = "".join([f"{k}={v} " for k, v in args.items()]) + return self.execute(kernel, data) + + token = str(uuid.uuid4()).replace('-', '')[:16] + audit_table = f"{db_name}.magic_audit_{token}" + delete_pred = sql_predicate + try: + lock_name = f"dropmissing_tx_{token}" + got_lock = self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + if not got_lock: + kernel._send_message("stderr", "Could not acquire advisory lock; aborting apply.") + return + + # create audit table + mariadb_client.run_statement(f"CREATE TABLE IF NOT EXISTS {audit_table} (tx_id VARCHAR(64), pk_val TEXT, old_row LONGTEXT, created_at DATETIME);") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to create audit table.") + return + + # insert affected rows into audit table + mariadb_client.run_statement(f"INSERT INTO {audit_table} (tx_id, pk_val, old_row, created_at) SELECT '{token}', CAST({pk_col} AS CHAR), TO_BASE64(ROW_TO_JSON(t)), NOW() FROM {table_full} t WHERE {delete_pred};") + # Note: ROW_TO_JSON and TO_BASE64 may not be available depending on server; this is best-effort + + # delete rows + mariadb_client.run_statement(f"DELETE FROM {table_full} WHERE {delete_pred};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "DELETE failed during transactional apply (check SQL and permissions).") + return + + kernel._send_message("stdout", f"Transactional apply completed; audit table {audit_table} contains old rows for rollback with token {token}.") + # metadata + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns) if columns else 'ALL_COLUMNS', + operation_status='applied', + message=f'audit_table={audit_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=audit_table, + 
original_table=table_full + ) + # refresh in-memory last_select little + try: + mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + fresh = mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + try: + df_list = pd.read_html(fresh) + if df_list and len(df_list) > 0: + data["last_select"] = df_list[0] + try: + self._send_html(kernel, data["last_select"]) + except Exception: + pass + except Exception: + kernel._send_message("stdout", "Applied to DB; could not refresh last_select from DB.") + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Apply (transactional) failed: {e}") + log.exception(e) + finally: + self._release_lock(mariadb_client, lock_name) + return + + else: + kernel._send_message("stderr", f"Unknown strategy: {strategy}") + return + + else: + # operate locally on data['last_select'] (in-place) + operation_status = "success" + messages = [] + try: + before_count = len(df) + if columns is None: + df.dropna(axis=0, inplace=True) + else: + df.dropna(axis=0, subset=columns, inplace=True) + after_count = len(df) + dropped = before_count - after_count + data["last_select"] = df + msg = f"Dropped {dropped} row(s) with missing values (in-place local)." 
+ kernel._send_message("stdout", msg) + messages.append(msg) + try: + self._send_html(kernel, df) + except Exception: + pass + except Exception as e: + operation_status = "error" + err_msg = f"Error while dropping missing values locally: {e}" + kernel._send_message("stderr", err_msg) + messages.append(err_msg) + + # Insert metadata + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "\n".join(columns) if columns else "ALL_COLUMNS" + message_str = "\n".join(messages) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status=operation_status, + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + + return + + # fallback + kernel._send_message("stderr", "Unknown execution path reached.") + return diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py new file mode 100644 index 0000000..b45af51 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/dropoutliers.py @@ -0,0 +1,910 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +import numpy as np +from collections import namedtuple +import logging +import os +import re +import uuid +import time + +# Optional helper to reliably get current DB name (if available in environment) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + + +class DropOutliers(MariaMagic): + """ + %dropoutliers [columns=col1,col2,...] 
[method=iqr|zscore] [k=1.5] [z_thresh=3.0] + [mode=preview|apply|rollback] [table=schema.table] [confirm=true|false] + [sample_size=100] [lock_timeout=10] + + Modes: + - preview: estimate rows removed, show sample rows + - apply: perform removal (in-place local or DB CTAS+RENAME if table= provided) + - rollback: restore DB backup created by apply (requires mariadb_client) + + Notes: + - DB apply uses sampling to compute thresholds (best-effort). + - Execution metadata recorded in magic_metadata (includes rollback token). + """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "dropoutliers" + + def help(self): + return ( + "%dropoutliers [columns=col1,col2,...] [method=iqr|zscore] [k=1.5] [z_thresh=3.0]\n" + " [mode=preview|apply|rollback] [table=schema.table] [confirm=true|false]\n" + " [sample_size=100]\n" + "Removes rows containing outliers from data['last_select'] or from a DB table (versioned apply).\n" + "Execution metadata is recorded in table `magic_metadata`." 
+ ) + + def _str_to_obj(self, s): + """Cast simple strings to Python objects where sensible.""" + try: + return int(s) + except ValueError: + try: + return float(s) + except ValueError: + pass + try: + return bool(util.strtobool(s)) + except Exception: + if isinstance(s, str) and len(s) >= 2 and ((s[0] == s[-1] == '"') or (s[0] == s[-1] == "'")): + return s[1:-1] + return s + + def parse_args(self, input_str): + """Parse key=value arguments (keeps behavior consistent with other magics).""" + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df): + """Display DataFrame as HTML (fallback to text if needed).""" + try: + html_repr = df.to_html(index=False) + mime = "text/html" + except Exception: + html_repr = str(df) + mime = "text/plain" + display_content = {"data": {mime: html_repr}, "metadata": {}} + kernel.send_response(kernel.iopub_socket, "display_data", display_content) + + def _detect_outliers_series(self, series, method, k=1.5, z_thresh=3.0): + """Return boolean mask of outliers for a pandas Series (True where outlier).""" + if series.dropna().empty: + return pd.Series(False, index=series.index) + + if method == "iqr": + q1 = series.quantile(0.25) + q3 = series.quantile(0.75) + iqr = q3 - q1 + lower = q1 - k * iqr + upper = q3 + k * iqr + mask = (series < lower) | (series > upper) + return mask.fillna(False) + + elif method == "zscore": + mean = series.mean(skipna=True) + std = series.std(skipna=True) + if std == 0 or np.isnan(std): + return pd.Series(False, index=series.index) + z = (series - mean) / std + mask = z.abs() > float(z_thresh) + return mask.fillna(False) + + else: + raise ValueError(f"Unknown method {method}") + + # --- metadata / DB helper methods (best-effort) --- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + 
def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape value to single-quoted SQL literal (None -> NULL).""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + """ + Attempt to determine current DB. Use SqlFetch if present; otherwise run SELECT DATABASE(); parse result. + Returns empty string if none. + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + return sf.get_db_name() or "" + except Exception: + log.debug("SqlFetch.get_db_name() failed; falling back to manual query.") + + if mariadb_client is None: + return "" + try: + res = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror(): + return "" + if not res: + return "" + # try parsing HTML table via pandas + try: + dfs = pd.read_html(res) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + # fallback: regex extract first td + m = re.search(r"(.*?)", str(res), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(res).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and 
maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name, + rollback_token=None, backup_table=None, original_table=None): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + rollback_sql = self._sql_escape(rollback_token) + backup_sql = self._sql_escape(backup_table) + original_sql = self._sql_escape(original_table) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name, rollback_token, 
backup_table, original_table) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql}, + {rollback_sql}, + {backup_sql}, + {original_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + def _acquire_lock(self, mariadb_client, lock_name, timeout=10): + try: + mariadb_client.run_statement(f"SELECT GET_LOCK('{lock_name}', {int(timeout)});") + if mariadb_client.iserror(): + return False + return True + except Exception: + return False + + def _release_lock(self, mariadb_client, lock_name): + try: + mariadb_client.run_statement(f"SELECT RELEASE_LOCK('{lock_name}');") + except Exception: + pass + + def _table_exists(self, mariadb_client, table_full_name): + try: + mariadb_client.run_statement(f"SELECT 1 FROM {table_full_name} LIMIT 1;") + return not mariadb_client.iserror() + except Exception: + return False + + def _compute_thresholds_db(self, mariadb_client, table_full, col, method, k=1.5, z_thresh=3.0, sample_size=100): + """ + Sample non-null values from DB and compute thresholds for IQR or zscore. 
+ Returns (ok, {lower:.., upper:..}, message) + """ + try: + out = mariadb_client.run_statement(f"SELECT {col} FROM {table_full} WHERE {col} IS NOT NULL LIMIT {int(sample_size)};") + if mariadb_client.iserror() or not out: + return False, None, "sample query failed" + try: + df_list = pd.read_html(out) + if not df_list or len(df_list) == 0: + return False, None, "no sample rows parsed" + series = df_list[0].iloc[:, 0].astype(float) # try numeric conversion + if series.dropna().empty: + return False, None, "sample contains no numeric values" + if method == "iqr": + q1 = series.quantile(0.25) + q3 = series.quantile(0.75) + iqr = q3 - q1 + lower = q1 - k * iqr + upper = q3 + k * iqr + return True, {"lower": float(lower), "upper": float(upper)}, "iqr via sampling" + elif method == "zscore": + mean = float(series.mean()) + std = float(series.std()) + if std == 0: + return False, None, "std==0 in sample" + lower = mean - float(z_thresh) * std + upper = mean + float(z_thresh) * std + return True, {"lower": float(lower), "upper": float(upper)}, "zscore via sampling" + else: + return False, None, "unknown method" + except Exception: + # fallback regex parse single-column HTML + vals = re.findall(r"(.*?)", str(out), flags=re.S | re.I) + nums = [] + for v in vals: + txt = re.sub(r"<.*?>", "", v).strip() + try: + nums.append(float(txt)) + except Exception: + continue + if not nums: + return False, None, "parsed sample contains no numeric values" + series = pd.Series(nums) + if method == "iqr": + q1 = series.quantile(0.25) + q3 = series.quantile(0.75) + iqr = q3 - q1 + lower = q1 - k * iqr + upper = q3 + k * iqr + return True, {"lower": float(lower), "upper": float(upper)}, "iqr via regex sample" + elif method == "zscore": + mean = float(series.mean()) + std = float(series.std()) + if std == 0: + return False, None, "std==0 in sample" + lower = mean - float(z_thresh) * std + upper = mean + float(z_thresh) * std + return True, {"lower": float(lower), "upper": float(upper)}, 
"zscore via regex sample" + else: + return False, None, "unknown method" + except Exception as e: + return False, None, f"exception computing thresholds: {e}" + + def _parse_count_result(self, res): + """Parse a SELECT COUNT(*) result returned by mariadb_client.run_statement (HTML or text).""" + try: + df_list = pd.read_html(res) + if df_list and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + try: + return int(val) + except Exception: + try: + return int(float(val)) + except Exception: + return None + except Exception: + m = re.search(r"(.*?)", str(res), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + try: + return int(txt) + except Exception: + try: + return int(float(txt)) + except Exception: + return None + # fallback: try to parse raw + try: + txt = str(res).strip() + return int(txt) + except Exception: + try: + return int(float(str(res))) + except Exception: + return None + + def execute(self, kernel, data): + """Execute the dropoutliers magic (preview/apply/rollback).""" + df = data.get("last_select") + if df is None: + kernel._send_message("stderr", "No last_select found in kernel data.") + return + + if hasattr(df, "empty") and df.empty: + kernel._send_message("stderr", "There is no data to process (empty DataFrame).") + return + + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.") + return + + # parse columns argument + columns_arg = args.get("columns", None) + if isinstance(columns_arg, str): + columns = [c.strip() for c in columns_arg.split(",") if c.strip()] + elif isinstance(columns_arg, (list, tuple)): + columns = list(columns_arg) + else: + columns = None + + # method and params + method = str(args.get("method", "iqr")).lower() + if method not in {"iqr", "zscore"}: + kernel._send_message("stderr", f"Unknown method '{method}'. 
Allowed: iqr, zscore.") + return + + try: + k = float(args.get("k", 1.5)) + except Exception: + k = 1.5 + + try: + z_thresh = float(args.get("z_thresh", 3.0)) + except Exception: + z_thresh = 3.0 + + # mode + mode = str(args.get("mode", "preview")).lower() + mode = mode if mode in {"preview", "apply", "rollback"} else "preview" + + table_full = args.get("table", None) + confirm = bool(args.get("confirm", False)) + sample_size = int(args.get("sample_size", 100)) + lock_timeout = int(args.get("lock_timeout", 10)) + + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + + # Determine target numeric columns + if columns is None: + target_columns = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])] + else: + missing_cols = [c for c in columns if c not in df.columns] + if missing_cols: + kernel._send_message("stderr", f"Column(s) not found: {', '.join(missing_cols)}") + # log metadata for failure and return + try: + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns) if columns else "", + operation_status="error", + message=f"Column(s) not found: {', '.join(missing_cols)}", + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + # keep only numeric columns + target_columns = [c for c in columns if pd.api.types.is_numeric_dtype(df[c])] + non_numeric = [c for c in columns if c not in target_columns] + if non_numeric: + kernel._send_message("stdout", f"Warning: non-numeric columns skipped: {', '.join(non_numeric)}") + + if not target_columns: + kernel._send_message("stderr", "No numeric target columns found to detect outliers.") + # log metadata for early exit + try: + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata( + kernel=kernel, 
+ command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message="No numeric target columns found to detect outliers.", + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # ensure metadata table exists + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + # --- PREVIEW MODE --- + if mode == "preview": + try: + # local detection counts & sample + messages = [] + combined_mask = None + for col in target_columns: + mask = self._detect_outliers_series(df[col], method, k=k, z_thresh=z_thresh) + n_out = int(mask.sum()) + messages.append(f"Column '{col}': detected {n_out} outlier(s) using {method}.") + if combined_mask is None: + combined_mask = mask.astype(bool) + else: + combined_mask = combined_mask | mask.astype(bool) + + n_before = len(df) + n_after = n_before - (int(combined_mask.sum()) if combined_mask is not None else 0) + kernel._send_message("stdout", f"PREVIEW (local): would drop {n_before - n_after} row(s) (from {n_before} to {n_after}).\n" + "\n".join(messages)) + + # show sample rows that would be dropped (local) + if combined_mask is not None and combined_mask.any(): + sample_rows = df[combined_mask].head(sample_size).copy() + sample_rows["_outlier_cols"] = sample_rows.apply(lambda r: ",".join([c for c in target_columns if pd.isnull(r.get(c)) is False and self._detect_outliers_series(pd.Series([r.get(c)]*1), method, k=k, z_thresh=z_thresh).iloc[0]]), axis=1) + try: + self._send_html(kernel, sample_rows) + except Exception: + kernel._send_message("stdout", str(sample_rows.head())) + else: + kernel._send_message("stdout", "PREVIEW (local): no rows with outliers in the sample.") + + # If DB target provided, attempt DB-based estimate (using sampling thresholds) + if table_full and 
mariadb_client is not None: + db_messages = [] + predicates = [] + for col in target_columns: + ok, thresholds, msg = self._compute_thresholds_db(mariadb_client, table_full, col, method, k=k, z_thresh=z_thresh, sample_size=sample_size) + if ok and thresholds: + lower = thresholds["lower"] + upper = thresholds["upper"] + # ensure numeric literal formatting + predicates.append(f"({col} < {repr(lower)} OR {col} > {repr(upper)})") + db_messages.append(f"{col}: thresholds approx [{lower}, {upper}] ({msg})") + else: + db_messages.append(f"{col}: could not compute thresholds ({msg}) - skipped in DB predicate") + + if predicates: + db_pred = " OR ".join(predicates) + try: + out = mariadb_client.run_statement(f"SELECT COUNT(*) FROM {table_full} WHERE {db_pred};") + cnt = self._parse_count_result(out) + if cnt is None: + kernel._send_message("stdout", "PREVIEW (db): could not parse count result (check permissions).") + else: + kernel._send_message("stdout", f"PREVIEW (db): estimated rows matching outlier predicate: {cnt}.") + except Exception: + kernel._send_message("stdout", "PREVIEW (db): failed to run count query (continuing).") + kernel._send_message("stdout", "PREVIEW (db) thresholds:\n" + "\n".join(db_messages)) + else: + kernel._send_message("stdout", "PREVIEW (db): no DB predicates could be computed (insufficient sample/values).") + # log preview metadata + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='preview', + message='preview_completed', + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Error during preview: {e}") + return + + # --- ROLLBACK MODE --- + if mode == "rollback": + if mariadb_client is None: + kernel._send_message("stderr", "Rollback requested but no mariadb_client available.") + return + token = 
args.get("rollback_token", None) + try: + if not token: + # try to find latest rollback_token for this command + user + mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Could not find metadata for rollback (check permissions).") + return + out = mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + m = re.search(r"(.*?)", str(out), flags=re.S | re.I) + if m: + token = re.sub(r"<.*?>", "", m.group(1)).strip() + if not token: + kernel._send_message("stderr", "No rollback_token found; cannot rollback safely.") + return + + # get backup_table and original_table + out = mariadb_client.run_statement(f"SELECT backup_table, original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + m = re.search(r"(.*?).*?(.*?)", str(out), flags=re.S | re.I) + backup_table = None + original_table = None + if m: + backup_table = re.sub(r"<.*?>", "", m.group(1)).strip() + original_table = re.sub(r"<.*?>", "", m.group(2)).strip() + else: + try: + out_b = mariadb_client.run_statement(f"SELECT backup_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mb = re.search(r"(.*?)", str(out_b), flags=re.S | re.I) + if mb: + backup_table = re.sub(r"<.*?>", "", mb.group(1)).strip() + except Exception: + pass + try: + out_o = mariadb_client.run_statement(f"SELECT original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mo = re.search(r"(.*?)", str(out_o), flags=re.S | re.I) + if mo: + original_table = re.sub(r"<.*?>", "", mo.group(1)).strip() + except Exception: + 
pass + + if not backup_table: + kernel._send_message("stderr", "No backup table found in metadata for rollback token.") + return + + # perform atomic restore: backup_table -> original_table + lock_name = f"dropoutliers_rb_{token}" + self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + try: + if original_table: + if self._table_exists(mariadb_client, original_table): + original_old = f"{original_table}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {original_table} TO {original_old}, {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename tables during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {original_table}; previous {original_table} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={original_table};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {original_table}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={original_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + 
else: + # try to infer original table name from arguments + out_args = mariadb_client.run_statement(f"SELECT arguments FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + margs = re.search(r"(.*?)", str(out_args), flags=re.S | re.I) + inferred_original = None + if margs: + args_txt = re.sub(r"<.*?>", "", margs.group(1)).strip() + mm = re.search(r"table\s*=\s*([^\s,]+)", args_txt) + if mm: + inferred_original = mm.group(1).strip() + if inferred_original: + if self._table_exists(mariadb_client, inferred_original): + original_old = f"{inferred_original}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {inferred_original} TO {original_old}, {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {inferred_original}; previous {inferred_original} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={inferred_original};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to inferred original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {inferred_original}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + 
operation_status='rollback', + message=f'restored_to={inferred_original}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + kernel._send_message("stderr", "Could not determine original table name for rollback. Manual restoration required.") + return + finally: + self._release_lock(mariadb_client, lock_name) + except Exception as e: + kernel._send_message("stderr", f"Rollback error: {e}") + return + + # --- APPLY MODE --- + if mode == "apply": + # DB-target apply if table provided and mariadb_client present + if table_full and mariadb_client is not None: + if not confirm: + kernel._send_message("stderr", "DB apply requires confirm=true to proceed. Preview first, then re-run with confirm=true.") + return + + token = str(uuid.uuid4()).replace('-', '')[:16] + backup_table = f"{table_full}_backup_{token}" + new_table = f"{table_full}_vnew_{token}" + predicates = [] + messages = [] + + # compute thresholds for each column using DB sampling + for col in target_columns: + ok, thresholds, msg = self._compute_thresholds_db(mariadb_client, table_full, col, method, k=k, z_thresh=z_thresh, sample_size=sample_size) + if ok and thresholds: + lower = thresholds["lower"] + upper = thresholds["upper"] + # use repr to keep decimal representation + predicates.append(f"({col} < {repr(lower)} OR {col} > {repr(upper)})") + messages.append(f"{col}: thresholds [{lower}, {upper}] ({msg})") + else: + messages.append(f"{col}: could not compute thresholds ({msg}); this column will not be used in DB predicate") + + if not predicates: + kernel._send_message("stderr", "Could not compute DB predicates for any column; aborting DB apply.") + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='error', + message='db_apply_failed_no_predicates', + 
db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + db_pred = " OR ".join(predicates) + + try: + lock_name = f"dropoutliers_apply_{token}" + got_lock = self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + if not got_lock: + kernel._send_message("stderr", "Could not acquire advisory lock; aborting apply.") + return + + # create new table with rows to keep (NOT predicate) + mariadb_client.run_statement(f"CREATE TABLE {new_table} AS SELECT * FROM {table_full} WHERE NOT ({db_pred});") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to create new table for apply (CTAS failed).") + return + + # atomic rename original -> backup, new -> original + mariadb_client.run_statement(f"RENAME TABLE {table_full} TO {backup_table}, {new_table} TO {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "RENAME TABLE failed (apply may be inconsistent).") + return + + kernel._send_message("stdout", f"Apply completed: original preserved as {backup_table}.") + # log metadata with rollback token so rollback can restore + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='applied', + message='applied_db_versioned', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=table_full + ) + + # attempt to refresh last_select with sample + try: + mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + fresh = mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + try: + df_list = pd.read_html(fresh) + if df_list and len(df_list) > 0: + data["last_select"] = df_list[0] + try: + self._send_html(kernel, data["last_select"]) + except Exception: + pass + except Exception: + kernel._send_message("stdout", "Applied to DB; could not refresh 
last_select from DB.") + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Apply (DB versioned) failed: {e}") + log.exception(e) + finally: + self._release_lock(mariadb_client, lock_name) + return + + else: + # Local in-place apply on data['last_select'] (existing behavior) + combined_mask = None + messages = [] + operation_status = "success" + try: + for col in target_columns: + try: + mask = self._detect_outliers_series(df[col], method, k=k, z_thresh=z_thresh) + n_out = int(mask.sum()) + messages.append(f"Column '{col}': detected {n_out} outlier(s) using {method}.") + if combined_mask is None: + combined_mask = mask.astype(bool) + else: + combined_mask = combined_mask | mask.astype(bool) + except Exception as e: + operation_status = "error" + messages.append(f"Column '{col}': error detecting outliers: {e}") + + if combined_mask is None or not combined_mask.any(): + kernel._send_message("stdout", "No outliers detected. No rows removed.\n" + "\n".join(messages)) + try: + self._send_html(kernel, df) + except Exception: + pass + else: + n_before = len(df) + df.drop(index=df[combined_mask].index, inplace=True) + data["last_select"] = df + n_after = len(df) + removed = n_before - n_after + kernel._send_message("stdout", f"Dropped {removed} row(s) containing outliers (in-place).\n" + "\n".join(messages)) + try: + self._send_html(kernel, df) + except Exception: + pass + + except Exception as e: + operation_status = "error" + kernel._send_message("stderr", f"Error while removing outlier rows locally: {e}") + messages.append(str(e)) + + # Insert metadata + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(target_columns), + operation_status=operation_status, + message="\n".join(messages), + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to 
write metadata (continuing).") + except Exception: + pass + + return + + # fallback + kernel._send_message("stderr", "Unknown execution path reached.") + return diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py new file mode 100644 index 0000000..bc4ee2a --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/fillmissing.py @@ -0,0 +1,935 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +import logging +import os +import re +import uuid +import time + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + + +class FillMissing(MariaMagic): + """ + %fillmissing [columns=col1,col2,...] [strategy=mean|median|mode|constant] + [value=const] [mode=preview|apply|rollback] [table=schema.table] + [confirm=true|false] [sample_size=100] + + Behavior: + - preview: shows what would be filled (counts, sample rows with nulls, and computed fill values) + - apply: performs the fill (locally or on DB if table= specified) + - rollback: attempts to restore a backup created by an apply (requires mariadb_client + rollback_token or will use latest by user) + + Notes: + - DB apply uses a CTAS + atomic RENAME pattern so the original is preserved as _backup_. + - For DB fill values we compute values using SQL when possible (AVG for mean, GROUP BY+COUNT for mode). + Median uses a sampling fallback to compute the median in Python (best-effort). + - Execution metadata is recorded in table `magic_metadata` (including rollback_token, backup_table, original_table). 
+ """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "fillmissing" + + def help(self): + return ( + "%fillmissing [columns=col1,col2,...] [strategy=mean|median|mode|constant] [value=const]\n" + " [mode=preview|apply|rollback] [table=schema.table] [confirm=true|false] [sample_size=100]\n" + "Fills missing values in data['last_select'] or in DB table when table= is provided." + ) + + def _str_to_obj(self, s): + """Cast simple strings to Python objects where sensible.""" + try: + return int(s) + except ValueError: + try: + return float(s) + except ValueError: + pass + try: + return bool(util.strtobool(s)) + except Exception: + # Remove surrounding quotes if present so value="abc" becomes abc (still as string) + if isinstance(s, str) and len(s) >= 2 and ((s[0] == s[-1] == '"') or (s[0] == s[-1] == "'")): + return s[1:-1] + return s + + def parse_args(self, input_str): + """Parse key=value arguments (keeps behavior consistent with other magics).""" + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df): + """Display DataFrame as HTML (fallback to text if needed).""" + try: + html_repr = df.to_html(index=False) + mime = "text/html" + except Exception: + html_repr = str(df) + mime = "text/plain" + display_content = {"data": {mime: html_repr}, "metadata": {}} + kernel.send_response(kernel.iopub_socket, "display_data", display_content) + + # -------------------- metadata / DB helpers (best-effort) -------------------- + def _get_mariadb_client(self, kernel): + """Return mariadb_client if present on kernel, else None""" + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + """Return a logger on kernel if present, else create a temporary logger""" + return getattr(kernel, "log", 
logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + # double single-quotes for SQL escaping + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + """ + Attempt to determine the currently used DB. + Prefer SqlFetch if available; otherwise run SELECT DATABASE(); and try to parse. + Returns empty string if none found. + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + # Fallback: run SELECT DATABASE(); + if mariadb_client is None: + return "" + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror(): + return "" + if not result: + return "" + # If result is raw HTML table, try to parse with pandas + try: + df_list = pd.read_html(result) + if df_list and isinstance(df_list, list) and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + # if not parseable by pandas, try regex to extract first cell content + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)) # strip tags + txt = txt.strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + # If result is plain text (like the DB name) + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + """Try several places to find the current user name; 
fallback to OS login or empty string.""" + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + """ + Create magic_metadata table if it doesn't exist. + Columns include fields to support rollback tracking. + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if mariadb_client is None: + # nothing to do + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name, + rollback_token=None, backup_table=None, original_table=None): + """ + Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + # Escape values + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + rollback_sql = self._sql_escape(rollback_token) + backup_sql = self._sql_escape(backup_table) + original_sql = self._sql_escape(original_table) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name, rollback_token, backup_table, original_table) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql}, + {rollback_sql}, + {backup_sql}, + {original_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + # swallow errors but log + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + def _acquire_lock(self, mariadb_client, lock_name, timeout=10): + try: + mariadb_client.run_statement(f"SELECT GET_LOCK('{lock_name}', {int(timeout)});") + if mariadb_client.iserror(): + return False + return True + except Exception: + return False + + def _release_lock(self, mariadb_client, lock_name): + try: + mariadb_client.run_statement(f"SELECT RELEASE_LOCK('{lock_name}');") + except Exception: + pass + + def _table_exists(self, mariadb_client, table_full_name): + try: + mariadb_client.run_statement(f"SELECT 1 FROM {table_full_name} LIMIT 1;") + return not mariadb_client.iserror() + except Exception: + return False + + # -------------------- end metadata helpers 
-------------------- + + def _compute_fill_value_db(self, mariadb_client, table_full, col, strategy, const_value, sample_size=100): + """ + Compute fill value for a DB column using SQL when possible; otherwise fall back to sampling. + Returns (success_bool, fill_value or None, message) + """ + try: + # constant + if strategy == "constant": + return True, const_value, "constant provided" + + # mean -> AVG + if strategy == "mean": + try: + out = mariadb_client.run_statement(f"SELECT AVG({col}) FROM {table_full} WHERE {col} IS NOT NULL;") + if mariadb_client.iserror() or not out: + return False, None, "AVG query failed" + # parse result + try: + df_list = pd.read_html(out) + if df_list and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + # convert to numeric if possible + try: + valf = float(val) + return True, valf, "mean via SQL" + except Exception: + return True, val, "mean via SQL (non-numeric parse)" + except Exception: + # regex fallback + m = re.search(r"(.*?)", str(out), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + try: + return True, float(txt), "mean via SQL (regex)" + except Exception: + return True, txt, "mean via SQL (regex)" + return False, None, "Could not parse AVG result" + except Exception: + return False, None, "AVG query exception" + + # mode -> most frequent value via GROUP BY + if strategy == "mode": + try: + out = mariadb_client.run_statement(f"SELECT {col}, COUNT(*) AS cnt FROM {table_full} WHERE {col} IS NOT NULL GROUP BY {col} ORDER BY cnt DESC LIMIT 1;") + if mariadb_client.iserror() or not out: + return False, None, "MODE query failed" + try: + df_list = pd.read_html(out) + if df_list and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + return True, val, "mode via SQL" + except Exception: + m = re.search(r"(.*?)", str(out), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + return True, txt, "mode via SQL (regex)" + return False, None, "Could not parse mode result" + except 
Exception: + return False, None, "MODE query exception" + + # median -> sampling fallback: select a sample of non-null values and compute median in pandas + if strategy == "median": + try: + out = mariadb_client.run_statement(f"SELECT {col} FROM {table_full} WHERE {col} IS NOT NULL LIMIT {int(sample_size)};") + if mariadb_client.iserror() or not out: + return False, None, "Median: sample query failed" + try: + df_list = pd.read_html(out) + if df_list and len(df_list) > 0: + series = df_list[0].iloc[:, 0] + # convert to numeric where possible + try: + series_num = pd.to_numeric(series, errors="coerce").dropna() + if series_num.empty: + return False, None, "Median: non-numeric or all missing in sample" + med = series_num.median() + return True, float(med), "median via sampling" + except Exception: + return False, None, "Median: numeric conversion failed" + except Exception: + return False, None, "Median: parsing sample failed" + except Exception: + return False, None, "Median: sample query exception" + + return False, None, "Unknown strategy" + except Exception as e: + return False, None, f"Exception computing fill value: {e}" + + def execute(self, kernel, data): + """Execute the fillmissing magic (preview/apply/rollback) and log metadata.""" + df = data.get("last_select") + if df is None: + kernel._send_message("stderr", "No last_select found in kernel data.") + return + + if hasattr(df, "empty") and df.empty: + kernel._send_message("stderr", "There is no data to process (empty DataFrame).") + return + + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments. 
def execute(self, kernel, data):
    """Run the %fillmissing magic and log the outcome to ``magic_metadata``.

    Modes (``mode=`` argument):
      preview  – report per-column missing counts and the fill values that
                 WOULD be used (DB-computed when ``table=`` is given);
      apply    – fill values, either versioned in the DB (``table=`` plus
                 ``confirm=true``: CTAS + atomic RENAME, original kept as a
                 backup table) or in-place on ``data['last_select']``;
      rollback – restore the pre-apply table recorded in ``magic_metadata``
                 (by ``rollback_token=`` or the latest entry for this user).

    Other arguments: ``columns=``, ``strategy=mean|median|mode|constant``,
    ``value=`` (for constant), ``sample_size=``.
    """
    df = data.get("last_select")
    if df is None:
        kernel._send_message("stderr", "No last_select found in kernel data.")
        return

    if hasattr(df, "empty") and df.empty:
        kernel._send_message("stderr", "There is no data to process (empty DataFrame).")
        return

    try:
        args = self.parse_args(self.args)
    except Exception:
        kernel._send_message("stderr", "Error parsing arguments. Use key=value syntax.")
        return

    # Parse the columns argument (comma-separated string or sequence).
    columns_arg = args.get("columns", None)
    if isinstance(columns_arg, str):
        target_columns = [c.strip() for c in columns_arg.split(",") if c.strip()]
    elif isinstance(columns_arg, (list, tuple)):
        target_columns = list(columns_arg)
    else:
        target_columns = None

    # mode: preview|apply|rollback (anything else falls back to preview)
    mode = str(args.get("mode", "preview")).lower()
    mode = mode if mode in {"preview", "apply", "rollback"} else "preview"

    strategy = args.get("strategy", "mean")
    if isinstance(strategy, str):
        strategy = strategy.lower()
    else:
        strategy = str(strategy).lower()

    allowed = {"mean", "median", "mode", "constant"}
    if strategy not in allowed:
        kernel._send_message("stderr", f"Unknown strategy '{strategy}'. Allowed: {', '.join(allowed)}")
        return

    value_provided = "value" in args
    const_value = args.get("value", None)

    # 'constant' needs an explicit value except in preview, where we can
    # still show the missing counts.
    if strategy == "constant" and not value_provided and mode != "preview":
        kernel._send_message("stderr", "Strategy 'constant' requires a 'value=...' argument.")
        return

    table_full = args.get("table", None)
    confirm = bool(args.get("confirm", False))
    sample_size = int(args.get("sample_size", 100))

    mariadb_client = self._get_mariadb_client(kernel)
    log = self._get_logger(kernel)
    db_name = self._get_db_name(kernel)
    user_name = self._get_user_name(kernel)

    # Determine local target columns if not provided; otherwise validate.
    if target_columns is None:
        target_columns = list(df.columns)
    else:
        missing_cols = [c for c in target_columns if c not in df.columns]
        if missing_cols:
            kernel._send_message("stderr", f"Column(s) not found in last_select: {', '.join(missing_cols)}")
            # Log metadata for the failure (best-effort).
            try:
                self._ensure_metadata_table(kernel, db_name)
                self._insert_metadata(
                    kernel=kernel,
                    command_name=self.name(),
                    arguments=self.args if isinstance(self.args, str) else str(self.args),
                    affected_columns="\n".join(target_columns),
                    operation_status="error",
                    message=f"Column(s) not found: {', '.join(missing_cols)}",
                    db_name=db_name,
                    user_name=user_name
                )
            except Exception:
                pass
            return

    # Ensure the audit table exists (best-effort).
    try:
        self._ensure_metadata_table(kernel, db_name)
    except Exception:
        try:
            kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).")
        except Exception:
            pass

    # --- PREVIEW MODE -------------------------------------------------
    if mode == "preview":
        try:
            # Missing count per column in the preview DataFrame.
            missing_counts = {col: int(df[col].isnull().sum()) for col in target_columns}
            summary_lines = [f"{col}: missing={count}" for col, count in missing_counts.items()]
            kernel._send_message("stdout", "PREVIEW: missing counts per column:\n" + "\n".join(summary_lines))

            # Compute would-be fill values locally; a DB-side computation
            # (when table= is given) overrides the local estimate.
            computed = {}
            for col in target_columns:
                if strategy == "constant":
                    computed[col] = (True, const_value, "constant provided")
                else:
                    series = df[col].dropna()
                    if series.empty:
                        computed[col] = (False, None, "no non-missing values in preview sample")
                    else:
                        if strategy == "mean":
                            if pd.api.types.is_numeric_dtype(series):
                                computed[col] = (True, float(series.mean()), "mean via local preview")
                            else:
                                computed[col] = (False, None, "not numeric; cannot compute mean locally")
                        elif strategy == "median":
                            if pd.api.types.is_numeric_dtype(series):
                                computed[col] = (True, float(series.median()), "median via local preview")
                            else:
                                computed[col] = (False, None, "not numeric; cannot compute median locally")
                        elif strategy == "mode":
                            modes = series.mode(dropna=True)
                            if not modes.empty:
                                computed[col] = (True, modes.iloc[0], "mode via local preview")
                            else:
                                computed[col] = (False, None, "no mode found in local preview")

                if table_full and mariadb_client is not None:
                    ok, val, msg = self._compute_fill_value_db(
                        mariadb_client, table_full, col, strategy, const_value, sample_size=sample_size
                    )
                    computed[col] = (ok, val, f"db:{msg}" if msg else "db:unknown")

            comp_lines = []
            for col, (ok, val, msg) in computed.items():
                if ok:
                    comp_lines.append(f"{col}: would fill with -> {val} ({msg})")
                else:
                    comp_lines.append(f"{col}: could NOT determine fill value ({msg}); would skip")
            kernel._send_message("stdout", "PREVIEW: computed fill-values (best-effort):\n" + "\n".join(comp_lines))

            # Show a sample of rows that would be affected (any NULL in
            # the target columns), with per-row null markers and
            # "<col>_filled_preview" columns showing the to-be value.
            mask = df[target_columns].isnull().any(axis=1)
            sample_rows = df[mask].head(sample_size)
            if not sample_rows.empty:
                def nulls_in_row(r):
                    return ",".join([c for c in target_columns if pd.isnull(r.get(c))])
                sample_preview = sample_rows.copy()
                sample_preview["_null_columns"] = sample_preview.apply(nulls_in_row, axis=1)

                for c in target_columns:
                    filled_col = f"{c}_filled_preview"
                    ok, fill_val, _ = computed.get(c, (False, None, ""))
                    try:
                        if ok and fill_val is not None:
                            sample_preview[filled_col] = sample_preview[c].fillna(fill_val)
                        else:
                            # No computed value: echo the original column so
                            # the preview stays informative.
                            sample_preview[filled_col] = sample_preview[c]
                    except Exception:
                        # Elementwise fallback preserving originals on error.
                        def _fill_elem(v):
                            try:
                                if pd.isna(v) and ok and fill_val is not None:
                                    return fill_val
                                return v
                            except Exception:
                                return v
                        sample_preview[filled_col] = sample_preview[c].apply(_fill_elem)

                try:
                    self._send_html(kernel, sample_preview)
                except Exception:
                    kernel._send_message("stdout", str(sample_preview.head()))
            else:
                kernel._send_message("stdout", "PREVIEW: no rows with missing values in the preview sample.")

            # Log preview metadata (best-effort).
            try:
                self._insert_metadata(
                    kernel=kernel,
                    command_name=self.name(),
                    arguments=self.args if isinstance(self.args, str) else str(self.args),
                    affected_columns='\n'.join(target_columns) if target_columns else 'ALL_COLUMNS',
                    operation_status='preview',
                    message='preview_computed_fill_values',
                    db_name=db_name,
                    user_name=user_name
                )
            except Exception:
                pass

        except Exception as e:
            kernel._send_message("stderr", f"Error during preview: {e}")
        return

    # --- ROLLBACK MODE ---------------------------------------------
    if mode == "rollback":
        if mariadb_client is None:
            kernel._send_message("stderr", "Rollback requested but no mariadb_client available.")
            return
        token = args.get("rollback_token", None)
        try:
            if not token:
                # Find the latest metadata row for this command and user.
                mariadb_client.run_statement(
                    f"SELECT rollback_token, backup_table, original_table FROM {db_name}.magic_metadata "
                    f"WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} "
                    f"ORDER BY execution_timestamp DESC LIMIT 1;"
                )
                if mariadb_client.iserror():
                    kernel._send_message("stderr", "Could not find metadata for rollback (check permissions).")
                    return
                out = mariadb_client.run_statement(
                    f"SELECT rollback_token FROM {db_name}.magic_metadata "
                    f"WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} "
                    f"ORDER BY execution_timestamp DESC LIMIT 1;"
                )
                m = re.search(r"<td>(.*?)</td>", str(out), flags=re.S | re.I)
                if m:
                    token = re.sub(r"<.*?>", "", m.group(1)).strip()
            if not token:
                kernel._send_message("stderr", "No rollback_token found; cannot rollback safely.")
                return

            # Fetch backup_table and original_table for this token.
            out = mariadb_client.run_statement(
                f"SELECT backup_table, original_table FROM {db_name}.magic_metadata "
                f"WHERE rollback_token={self._sql_escape(token)} LIMIT 1;"
            )
            m = re.search(r"<td>(.*?)</td>.*?<td>(.*?)</td>", str(out), flags=re.S | re.I)
            backup_table = None
            original_table = None
            if m:
                backup_table = re.sub(r"<.*?>", "", m.group(1)).strip()
                original_table = re.sub(r"<.*?>", "", m.group(2)).strip()
            else:
                # Fallback: parse each column with its own single-cell query.
                try:
                    out_b = mariadb_client.run_statement(
                        f"SELECT backup_table FROM {db_name}.magic_metadata "
                        f"WHERE rollback_token={self._sql_escape(token)} LIMIT 1;"
                    )
                    mb = re.search(r"<td>(.*?)</td>", str(out_b), flags=re.S | re.I)
                    if mb:
                        backup_table = re.sub(r"<.*?>", "", mb.group(1)).strip()
                except Exception:
                    pass
                try:
                    out_o = mariadb_client.run_statement(
                        f"SELECT original_table FROM {db_name}.magic_metadata "
                        f"WHERE rollback_token={self._sql_escape(token)} LIMIT 1;"
                    )
                    mo = re.search(r"<td>(.*?)</td>", str(out_o), flags=re.S | re.I)
                    if mo:
                        original_table = re.sub(r"<.*?>", "", mo.group(1)).strip()
                except Exception:
                    pass

            if not backup_table:
                kernel._send_message("stderr", "No backup table found in metadata for rollback token.")
                return

            # Perform an atomic restore: backup_table -> original_table.
            lock_name = f"fillmissing_rb_{token}"
            self._acquire_lock(mariadb_client, lock_name, timeout=10)
            try:
                if original_table:
                    if self._table_exists(mariadb_client, original_table):
                        # Keep the current table around under a _prerollback name.
                        original_old = f"{original_table}_prerollback_{token}"
                        mariadb_client.run_statement(
                            f"RENAME TABLE {original_table} TO {original_old}, {backup_table} TO {original_table};"
                        )
                        if mariadb_client.iserror():
                            kernel._send_message("stderr", "Failed to rename tables during rollback (check permissions).")
                            return
                        kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {original_table}; previous {original_table} renamed to {original_old}.")
                        self._insert_metadata(
                            kernel=kernel,
                            command_name=self.name(),
                            arguments=self.args if isinstance(self.args, str) else str(self.args),
                            affected_columns='\n'.join(target_columns) if target_columns else 'ALL_COLUMNS',
                            operation_status='rollback',
                            message=f'restored_to={original_table};previous_saved_as={original_old}',
                            db_name=db_name,
                            user_name=user_name,
                            rollback_token=token,
                            backup_table=backup_table,
                            original_table=original_table
                        )
                    else:
                        mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {original_table};")
                        if mariadb_client.iserror():
                            kernel._send_message("stderr", "Failed to rename backup to original during rollback (check permissions).")
                            return
                        kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {original_table}.")
                        self._insert_metadata(
                            kernel=kernel,
                            command_name=self.name(),
                            arguments=self.args if isinstance(self.args, str) else str(self.args),
                            affected_columns='\n'.join(target_columns) if target_columns else 'ALL_COLUMNS',
                            operation_status='rollback',
                            message=f'restored_to={original_table}',
                            db_name=db_name,
                            user_name=user_name,
                            rollback_token=token,
                            backup_table=backup_table,
                            original_table=original_table
                        )
                else:
                    # Metadata lacks the original table name: try to infer it
                    # from the recorded arguments ("table=..." token).
                    out_args = mariadb_client.run_statement(
                        f"SELECT arguments FROM {db_name}.magic_metadata "
                        f"WHERE rollback_token={self._sql_escape(token)} LIMIT 1;"
                    )
                    margs = re.search(r"<td>(.*?)</td>", str(out_args), flags=re.S | re.I)
                    inferred_original = None
                    if margs:
                        args_txt = re.sub(r"<.*?>", "", margs.group(1)).strip()
                        mm = re.search(r"table\s*=\s*([^\s,]+)", args_txt)
                        if mm:
                            inferred_original = mm.group(1).strip()
                    if inferred_original:
                        if self._table_exists(mariadb_client, inferred_original):
                            original_old = f"{inferred_original}_prerollback_{token}"
                            mariadb_client.run_statement(
                                f"RENAME TABLE {inferred_original} TO {original_old}, {backup_table} TO {inferred_original};"
                            )
                            if mariadb_client.iserror():
                                kernel._send_message("stderr", "Failed to rename during rollback (check permissions).")
                                return
                            kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {inferred_original}; previous {inferred_original} renamed to {original_old}.")
                            self._insert_metadata(
                                kernel=kernel,
                                command_name=self.name(),
                                arguments=self.args if isinstance(self.args, str) else str(self.args),
                                affected_columns='\n'.join(target_columns) if target_columns else 'ALL_COLUMNS',
                                operation_status='rollback',
                                message=f'restored_to={inferred_original};previous_saved_as={original_old}',
                                db_name=db_name,
                                user_name=user_name,
                                rollback_token=token,
                                backup_table=backup_table,
                                original_table=inferred_original
                            )
                        else:
                            mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {inferred_original};")
                            if mariadb_client.iserror():
                                kernel._send_message("stderr", "Failed to rename backup to inferred original during rollback (check permissions).")
                                return
                            kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {inferred_original}.")
                            self._insert_metadata(
                                kernel=kernel,
                                command_name=self.name(),
                                arguments=self.args if isinstance(self.args, str) else str(self.args),
                                affected_columns='\n'.join(target_columns) if target_columns else 'ALL_COLUMNS',
                                operation_status='rollback',
                                message=f'restored_to={inferred_original}',
                                db_name=db_name,
                                user_name=user_name,
                                rollback_token=token,
                                backup_table=backup_table,
                                original_table=inferred_original
                            )
                    else:
                        kernel._send_message("stderr", "Could not determine original table name for rollback. Manual restoration required.")
                        return
            finally:
                self._release_lock(mariadb_client, lock_name)
        except Exception as e:
            kernel._send_message("stderr", f"Rollback error: {e}")
        return

    # --- APPLY MODE -----------------------------------------------
    if mode == "apply":
        # DB-target apply when table= is given and a client is present.
        if table_full and mariadb_client is not None:
            # Require explicit confirmation for destructive DB changes.
            if not confirm:
                kernel._send_message("stderr", "DB apply requires confirm=true to proceed. Preview first, then re-run with confirm=true.")
                return

            # Versioned apply: CTAS with COALESCE(col, <fill>) per fillable
            # column, then atomic RENAME; the original survives as a backup.
            token = str(uuid.uuid4()).replace('-', '')[:16]
            backup_table = f"{table_full}_backup_{token}"
            new_table = f"{table_full}_vnew_{token}"
            fill_map = {}  # col -> (ok, val, msg)

            for col in target_columns:
                ok, val, msg = self._compute_fill_value_db(
                    mariadb_client, table_full, col, strategy, const_value, sample_size=sample_size
                )
                fill_map[col] = (ok, val, msg)
                if not ok:
                    kernel._send_message("stdout", f"Column '{col}': could not compute fill value ({msg}) — will skip filling this column in DB apply.")

            # Build the select list: COALESCE for fillable columns, the
            # bare column otherwise. Numeric-looking values stay unquoted;
            # everything else goes through _sql_escape.
            exprs = []
            for c in list(df.columns):
                if c in fill_map and fill_map[c][0]:
                    val = fill_map[c][1]
                    if isinstance(val, (int, float)):
                        literal = str(val)
                    else:
                        s_val = str(val)
                        try:
                            float(s_val)
                            literal = s_val
                        except Exception:
                            literal = self._sql_escape(s_val)
                    exprs.append(f"COALESCE({c}, {literal}) AS {c}")
                else:
                    exprs.append(c)

            select_expr = ", ".join(exprs)
            # Defined before the try so the finally clause can never hit an
            # unbound name.
            lock_name = f"fillmissing_apply_{token}"
            try:
                got_lock = self._acquire_lock(mariadb_client, lock_name, timeout=10)
                if not got_lock:
                    kernel._send_message("stderr", "Could not acquire advisory lock; aborting apply.")
                    return

                # Create the new table with filled values.
                mariadb_client.run_statement(f"CREATE TABLE {new_table} AS SELECT {select_expr} FROM {table_full};")
                if mariadb_client.iserror():
                    kernel._send_message("stderr", "Failed to create new table for apply (CTAS failed).")
                    return

                # Atomic swap: original -> backup, new -> original.
                mariadb_client.run_statement(f"RENAME TABLE {table_full} TO {backup_table}, {new_table} TO {table_full};")
                if mariadb_client.iserror():
                    kernel._send_message("stderr", "RENAME TABLE failed (apply may be inconsistent).")
                    return

                kernel._send_message("stdout", f"Apply completed: original preserved as {backup_table}.")
                # Record the token so mode=rollback can restore.
                self._insert_metadata(
                    kernel=kernel,
                    command_name=self.name(),
                    arguments=self.args if isinstance(self.args, str) else str(self.args),
                    affected_columns='\n'.join(target_columns) if target_columns else 'ALL_COLUMNS',
                    operation_status='applied',
                    message=f'applied_backup={backup_table}',
                    db_name=db_name,
                    user_name=user_name,
                    rollback_token=token,
                    backup_table=backup_table,
                    original_table=table_full
                )

                # Best-effort refresh of last_select with a post-apply sample.
                try:
                    fresh = mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};")
                    try:
                        df_list = pd.read_html(fresh)
                        if df_list and len(df_list) > 0:
                            data["last_select"] = df_list[0]
                            try:
                                self._send_html(kernel, data["last_select"])
                            except Exception:
                                pass
                    except Exception:
                        kernel._send_message("stdout", "Applied to DB; could not refresh last_select from DB.")
                except Exception:
                    pass

            except Exception as e:
                kernel._send_message("stderr", f"Apply (DB versioned) failed: {e}")
                log.exception(e)
            finally:
                self._release_lock(mariadb_client, lock_name)
            return

        else:
            # Local in-place apply on data['last_select'].
            operation_status = "success"
            messages = []
            try:
                for col in target_columns:
                    try:
                        series = df[col]
                        if strategy in {"mean", "median"}:
                            if pd.api.types.is_numeric_dtype(series):
                                if strategy == "mean":
                                    fill_val = series.mean(skipna=True)
                                else:
                                    fill_val = series.median(skipna=True)
                                if pd.isna(fill_val):
                                    messages.append(f"Column '{col}': no non-missing values to compute {strategy}. Skipped.")
                                    continue
                                df[col].fillna(fill_val, inplace=True)
                                messages.append(f"Column '{col}': filled missing with {strategy}={fill_val}.")
                            else:
                                messages.append(f"Column '{col}' is not numeric; cannot use {strategy}. Skipped.")
                                continue
                        elif strategy == "mode":
                            modes = series.mode(dropna=True)
                            if modes.empty:
                                messages.append(f"Column '{col}': no mode (all missing). Skipped.")
                                continue
                            fill_val = modes.iloc[0]
                            df[col].fillna(fill_val, inplace=True)
                            messages.append(f"Column '{col}': filled missing with mode={fill_val}.")
                        elif strategy == "constant":
                            fill_val = const_value
                            df[col].fillna(fill_val, inplace=True)
                            messages.append(f"Column '{col}': filled missing with constant value={fill_val}.")
                    except Exception as e:
                        operation_status = "error"
                        messages.append(f"Column '{col}': error while filling missing values: {e}")

                data["last_select"] = df
                summary = "\n".join(messages)
                kernel._send_message("stdout", f"Fill missing completed (in-place). Summary:\n{summary}")
                try:
                    self._send_html(kernel, df)
                except Exception:
                    pass
            except Exception as e:
                operation_status = "error"
                kernel._send_message("stderr", f"Error while applying fillmissing locally: {e}")
                messages.append(f"Error while applying fillmissing locally: {e}")

            # Audit the local apply (best-effort).
            try:
                args_for_db = self.args if isinstance(self.args, str) else str(self.args)
                affected_columns_str = "\n".join(target_columns) if target_columns else ""
                message_str = "\n".join(messages)
                self._insert_metadata(
                    kernel=kernel,
                    command_name=self.name(),
                    arguments=args_for_db,
                    affected_columns=affected_columns_str,
                    operation_status=operation_status,
                    message=message_str,
                    db_name=db_name,
                    user_name=user_name
                )
            except Exception:
                try:
                    kernel._send_message("stdout", "Warning: failed to write metadata (continuing).")
                except Exception:
                    pass

            return

    # Defensive fallback; unreachable with the mode normalization above.
    kernel._send_message("stderr", "Unknown execution path reached.")
    return
+# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import pandas as pd +import shlex +from distutils import util +import logging +import os +import re + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + + +class Missing(MariaMagic): + """ + %missing [action=show|percent|summary] [columns=col1,col2] + + Examples: + %missing -> shows count+percent of missing for all columns + %missing action=percent -> shows percent only + %missing action=summary -> shows dtype, missing, percent + + This magic also logs execution metadata into a table `magic_metadata` with fields: + id, command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name + """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "missing" + + def help(self): + return ( + "%missing [action=show|percent|summary] [columns=col1,col2]\n" + "Display missing-value information from the last query result.\n" + "Execution metadata is recorded in table `magic_metadata`." 
+ ) + + def _str_to_obj(self, s): + """Cast strings to Python objects where possible.""" + try: + return int(s) + except ValueError: + try: + return float(s) + except ValueError: + pass + try: + return bool(util.strtobool(s)) + except ValueError: + return s + + def parse_args(self, input_str): + """Parse key=value arguments.""" + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df): + """Display DataFrame as HTML in the notebook.""" + try: + html = df.to_html() + mime = "text/html" + except Exception: + html = str(df) + mime = "text/plain" + + display_content = {"data": {mime: html}, "metadata": {}} + kernel.send_response(kernel.iopub_socket, "display_data", display_content) + + # -------------------- metadata / DB helpers (best-effort) -------------------- + def _get_mariadb_client(self, kernel): + """Return mariadb_client if present on kernel, else None""" + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + """Return a logger on kernel if present, else create a temporary logger""" + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + """ + Attempt to determine the currently used DB. + Prefer SqlFetch if available; otherwise run SELECT DATABASE(); and try to parse. + Returns empty string if none found. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + # Fallback: run SELECT DATABASE(); + if mariadb_client is None: + return "" + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror(): + return "" + if not result: + return "" + # If result is raw HTML table, try to parse with pandas + try: + df_list = pd.read_html(result) + if df_list and isinstance(df_list, list) and len(df_list) > 0: + val = df_list[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + # if not parseable by pandas, try regex to extract first cell content + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)) # strip tags + txt = txt.strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + # If result is plain text (like the DB name) + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + """Try several places to find the current user name; fallback to OS login or empty string.""" + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + 
return "" + + def _ensure_metadata_table(self, kernel, db_name): + """ + Create magic_metadata table if it doesn't exist. + Columns: id, command_name, arguments, execution_timestamp, + affected_columns, operation_status, message, db_name, user_name + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if mariadb_client is None: + # nothing to do + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + """ + Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + # Escape values + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # -------------------- end metadata helpers -------------------- + + def execute(self, kernel, data): + """Main execution for %missing magic.""" + df = data.get("last_select") + # Prepare metadata context early so we can log failures + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + if df is None or (hasattr(df, "empty") and df.empty): + msg = "No data available to inspect for missing values." 
+ kernel._send_message("stderr", msg) + # log metadata for failure + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name, + ) + except Exception: + pass + return + + try: + args = self.parse_args(self.args) + except Exception: + msg = "Error parsing arguments." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name, + ) + except Exception: + pass + return + + action = args.get("action", "show") + cols_arg = args.get("columns", None) + + if isinstance(cols_arg, str): + columns = [c.strip() for c in cols_arg.split(",") if c.strip()] + elif isinstance(cols_arg, (list, tuple)): + columns = list(cols_arg) + else: + columns = None + + try: + subdf = df[columns] if columns else df + except KeyError as e: + msg = f"Column not found: {e}" + kernel._send_message("stderr", msg) + # log metadata for failure + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns) if columns else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name, + ) + except Exception: + pass + return + + # Compute missing information + try: + missing_counts = subdf.isnull().sum() + total = len(subdf) + if total == 0: + percent = pd.Series([0] * len(missing_counts), index=missing_counts.index) + else: + percent = (missing_counts / total * 100).round(2) + + out = pd.DataFrame({"missing": missing_counts, "percent": percent}) + if action == "percent": + out = out[["percent"]] + elif action == "summary": + 
out["dtype"] = subdf.dtypes.astype(str) + out = out[["dtype", "missing", "percent"]] + + # Display results + self._send_html(kernel, out) + + # Prepare metadata success info + affected_columns_str = "\n".join(columns) if columns else "ALL_COLUMNS" + message = f"%missing action={action} examined {len(out)} column(s); total_rows={total}." + operation_status = "success" + + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=affected_columns_str, + operation_status=operation_status, + message=message, + db_name=db_name, + user_name=user_name, + ) + except Exception: + # do not interrupt normal flow if logging fails + pass + + except Exception as e: + msg = f"Error while computing missing information: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns) if columns else "ALL_COLUMNS", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name, + ) + except Exception: + pass diff --git a/mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py new file mode 100644 index 0000000..d1f0e12 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/data_cleaning/outliers.py @@ -0,0 +1,586 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. 
# Copyright (c) MariaDB Foundation.
# Distributed under the terms of the Modified BSD License.

from mariadb_kernel.maria_magics.maria_magic import MariaMagic
import shlex
import base64
import pandas as pd
import numpy as np
import io
import matplotlib
matplotlib.use("Agg")  # headless backend: the kernel renders to PNG, never a window
import matplotlib.pyplot as plt
import logging
import os
import re

# Optional helper to reliably get current DB name (if available)
try:
    from mariadb_kernel.sql_fetch import SqlFetch
except Exception:
    SqlFetch = None


class Outliers(MariaMagic):
    """
    %outliers [columns=col1,col2,...] [method=iqr|zscore] [k=1.5] [z_thresh=3.0] [plot=True|False]

    Detects outliers (NON IN-PLACE) and stores a copy of the DataFrame with boolean
    indicator columns in data['last_select_outliers'].

    Additionally logs execution metadata into `magic_metadata` table:
    id, command_name, arguments, execution_timestamp, affected_columns,
    operation_status, message, db_name, user_name
    """

    # Word sets accepted as booleans by _str_to_obj. These mirror the old
    # distutils.util.strtobool, which was removed in Python 3.12 (PEP 632).
    _TRUTHY = {"y", "yes", "t", "true", "on", "1"}
    _FALSY = {"n", "no", "f", "false", "off", "0"}

    def __init__(self, args=""):
        # Raw argument string as typed after the magic name.
        self.args = args

    def type(self):
        return "Line"

    def name(self):
        return "outliers"

    def help(self):
        return (
            "%outliers [columns=col1,col2,...] [method=iqr|zscore] [k=1.5] [z_thresh=3.0] [plot=True|False]\n"
            "Detects outliers in data['last_select'] (non in-place). Results placed in data['last_select_outliers']."
            "Execution metadata is recorded in table `magic_metadata`."
        )

    def _str_to_obj(self, s):
        """Cast a string token to int/float/bool; strip matching quotes otherwise."""
        try:
            return int(s)
        except ValueError:
            pass
        try:
            return float(s)
        except ValueError:
            pass
        word = str(s).strip().lower()
        if word in self._TRUTHY:
            return True
        if word in self._FALSY:
            return False
        # Unwrap a value the user wrapped in matching quotes.
        if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in "\"'":
            return s[1:-1]
        return s

    def parse_args(self, input_str):
        """Parse space-separated key=value arguments; values coerced via _str_to_obj.

        Raises ValueError on a token that has no '=' (surfaced as a usage error).
        """
        if not input_str or not input_str.strip():
            return {}
        parsed = {}
        for token in shlex.split(input_str):
            key, sep, value = token.partition("=")
            if not sep:
                raise ValueError(f"Expected key=value, got: {token!r}")
            parsed[key] = self._str_to_obj(value)
        return parsed

    def _send_html(self, kernel, df):
        """Render *df* as an HTML table (plain text fallback) on the display channel."""
        try:
            payload, mime = df.to_html(index=False), "text/html"
        except Exception:
            payload, mime = str(df), "text/plain"
        kernel.send_response(
            kernel.iopub_socket,
            "display_data",
            {"data": {mime: payload}, "metadata": {}},
        )

    def _send_image(self, kernel, fig):
        """Send a matplotlib figure as a base64-encoded PNG display_data message.

        Jupyter messages are JSON-serialized, so the image/png payload must be
        base64 text, not raw bytes (raw bytes fail serialization downstream).
        """
        buf = io.BytesIO()
        try:
            fig.tight_layout()
        except Exception:
            pass
        fig.savefig(buf, format="png", bbox_inches="tight")
        plt.close(fig)
        encoded = base64.b64encode(buf.getvalue()).decode("ascii")
        kernel.send_response(
            kernel.iopub_socket,
            "display_data",
            {"data": {"image/png": encoded}, "metadata": {}},
        )

    def _detect_outliers_series(self, series, method, k=1.5, z_thresh=3.0):
        """Return a boolean mask (True = outlier) for *series*.

        method="iqr": outside [Q1 - k*IQR, Q3 + k*IQR].
        method="zscore": |z| > z_thresh. Raises ValueError for other methods.
        NaNs are never flagged as outliers.
        """
        if series.dropna().empty:
            return pd.Series(False, index=series.index)

        if method == "iqr":
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            lower = q1 - k * iqr
            upper = q3 + k * iqr
            return ((series < lower) | (series > upper)).fillna(False)

        if method == "zscore":
            mean = series.mean(skipna=True)
            std = series.std(skipna=True)
            # A constant column has no spread, hence no z-score outliers.
            if std == 0 or np.isnan(std):
                return pd.Series(False, index=series.index)
            z = (series - mean) / std
            return (z.abs() > float(z_thresh)).fillna(False)

        raise ValueError(f"Unknown method {method}")

    def _build_plots(self, df_numeric, outlier_masks):
        """Build a two-panel figure: boxplots (top) and per-column scatter (bottom).

        Detected outliers are overlaid on the boxplots and highlighted in red
        on the scatter panel.
        """
        cols = list(df_numeric.columns)
        if not cols:
            fig = plt.figure(figsize=(6, 3))
            plt.text(0.5, 0.5, "No numeric columns to plot", ha="center", va="center")
            return fig

        ncols = len(cols)
        fig = plt.figure(figsize=(max(6, ncols * 1.2), 6))
        gs = fig.add_gridspec(2, 1, height_ratios=[1, 1.2], hspace=0.35)

        # Top panel: boxplot without default fliers; we overlay our own marks.
        ax_box = fig.add_subplot(gs[0, 0])
        df_numeric.boxplot(column=cols, ax=ax_box, showfliers=False)
        ax_box.set_title("Box plot (detected outliers overlaid)")
        ax_box.set_xlabel("")
        ax_box.set_ylabel("Value")

        xs = np.arange(1, len(cols) + 1)
        for i, col in enumerate(cols):
            mask = outlier_masks.get(col)
            if mask is None:
                continue
            out_vals = df_numeric.loc[mask, col]
            if out_vals.empty:
                continue
            # Slight horizontal jitter so coincident points stay readable.
            jitter = np.random.normal(scale=0.05, size=len(out_vals))
            ax_box.scatter(np.full(len(out_vals), xs[i]) + jitter, out_vals.values,
                           marker='x', s=50, linewidths=1.0, zorder=6)

        # Bottom panel: index-vs-value scatter per column.
        ax_scatter = fig.add_subplot(gs[1, 0])
        for i, col in enumerate(cols):
            series = df_numeric[col]
            mask = outlier_masks.get(col, pd.Series(False, index=series.index))
            # Small per-column x offset avoids overlap on shared indices.
            x_offset = (i - (ncols - 1) / 2) * 0.08
            xs_plot = series.index.values.astype(float) + x_offset
            ax_scatter.scatter(xs_plot, series.values, alpha=0.6, label=col, s=20)
            if mask.any():
                ax_scatter.scatter(series.index.values.astype(float)[mask],
                                   series[mask].values,
                                   color='red', edgecolors='k', s=50,
                                   label=f"{col} outlier", zorder=7)

        ax_scatter.set_title("Scatter plot (index vs value) — outliers highlighted")
        ax_scatter.set_xlabel("Row index")
        ax_scatter.set_ylabel("Value")
        # Deduplicate legend entries (one per label).
        handles, labels = ax_scatter.get_legend_handles_labels()
        by_label = dict(zip(labels, handles))
        ax_scatter.legend(by_label.values(), by_label.keys(),
                          fontsize='small', loc='best', ncol=2)

        return fig

    # -------------------- metadata / DB helpers (best-effort) --------------------
    def _get_mariadb_client(self, kernel):
        """Return kernel.mariadb_client if present, else None."""
        return getattr(kernel, "mariadb_client", None)

    def _get_logger(self, kernel):
        """Return kernel.log if present, else a module-level logger."""
        return getattr(kernel, "log", logging.getLogger(__name__))

    def _sql_escape(self, val):
        """Escape *val* as a SQL single-quoted literal; None becomes NULL."""
        if val is None:
            return "NULL"
        if not isinstance(val, str):
            val = str(val)
        return "'" + val.replace("'", "''") + "'"

    def _get_db_name(self, kernel):
        """Best-effort lookup of the currently selected database name.

        Prefers SqlFetch when importable; otherwise parses the output of
        ``SELECT DATABASE();``. Returns "" when it cannot be determined.
        """
        mariadb_client = self._get_mariadb_client(kernel)
        log = self._get_logger(kernel)

        if SqlFetch is not None and mariadb_client is not None:
            try:
                dbname = SqlFetch(mariadb_client, log).get_db_name()
                if isinstance(dbname, str):
                    return dbname
            except Exception:
                log.debug("SqlFetch available but .get_db_name() failed; falling back.")

        if mariadb_client is None:
            return ""
        try:
            result = mariadb_client.run_statement("SELECT DATABASE();")
            if mariadb_client.iserror() or not result:
                return ""
            # The client may return an HTML table; try pandas first.
            try:
                tables = pd.read_html(result)
                if tables:
                    value = tables[0].iloc[0, 0]
                    if isinstance(value, float) and pd.isna(value):
                        return ""
                    return str(value) if value is not None else ""
            except Exception:
                # Not pandas-parseable: scrape the first table cell and strip tags.
                match = re.search(r"<td[^>]*>(.*?)</td>", str(result), flags=re.S | re.I)
                if match:
                    text = re.sub(r"<[^>]*>", "", match.group(1)).strip()
                    if text == "" or text.lower() == "null":
                        return ""
                    return text
            # Plain-text result (e.g. just the DB name).
            text = str(result).strip()
            if text == "" or text.lower() == "null":
                return ""
            return text
        except Exception:
            return ""
        return ""

    def _get_user_name(self, kernel):
        """Best-effort lookup of the current user; falls back to OS login or ""."""
        for cand in (
            getattr(kernel, "user_name", None),
            getattr(kernel, "username", None),
            getattr(kernel, "user", None),
            getattr(kernel, "session", None),
        ):
            if cand is None:
                continue
            if isinstance(cand, str) and cand.strip():
                return cand
            try:
                nested = getattr(cand, "user", None)
                if isinstance(nested, str) and nested.strip():
                    return nested
            except Exception:
                pass
        try:
            return os.getlogin()
        except Exception:
            return ""

    def _ensure_metadata_table(self, kernel, db_name):
        """Create the magic_metadata audit table if it does not exist (best-effort)."""
        mariadb_client = self._get_mariadb_client(kernel)
        log = self._get_logger(kernel)
        if mariadb_client is None:
            return

        # NOTE(review): db_name is interpolated as a SQL identifier and cannot
        # be parameterized; it comes from the server's own SELECT DATABASE().
        # Verify it can never be attacker-controlled.
        table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata"

        create_sql = f"""
        CREATE TABLE IF NOT EXISTS {table_full_name} (
            id INT AUTO_INCREMENT PRIMARY KEY,
            command_name VARCHAR(255),
            arguments TEXT,
            execution_timestamp DATETIME,
            affected_columns TEXT,
            operation_status VARCHAR(50),
            message TEXT,
            db_name VARCHAR(255),
            user_name VARCHAR(255),
            rollback_token VARCHAR(255),
            backup_table VARCHAR(255),
            original_table VARCHAR(255)
        );
        """
        try:
            mariadb_client.run_statement(create_sql)
            if mariadb_client.iserror():
                log.error("Error creating magic_metadata table.")
        except Exception as e:
            log.error(f"Failed to ensure magic_metadata table: {e}")

    def _insert_metadata(self, kernel, command_name, arguments, affected_columns,
                         operation_status, message, db_name, user_name):
        """Insert one audit row into magic_metadata (timestamp = NOW())."""
        mariadb_client = self._get_mariadb_client(kernel)
        log = self._get_logger(kernel)
        if mariadb_client is None:
            return

        table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata"

        # All values go through _sql_escape; only the table identifier is
        # interpolated unescaped (see note in _ensure_metadata_table).
        insert_sql = f"""
        INSERT INTO {table_full_name}
            (command_name, arguments, execution_timestamp, affected_columns,
             operation_status, message, db_name, user_name)
        VALUES (
            {self._sql_escape(command_name)},
            {self._sql_escape(arguments)},
            NOW(),
            {self._sql_escape(affected_columns)},
            {self._sql_escape(operation_status)},
            {self._sql_escape(message)},
            {self._sql_escape(db_name)},
            {self._sql_escape(user_name)}
        );
        """
        try:
            mariadb_client.run_statement(insert_sql)
            if mariadb_client.iserror():
                log.error("Error inserting into magic_metadata.")
        except Exception as e:
            log.error(f"Exception while inserting metadata: {e}")

    def _log_metadata(self, kernel, db_name, user_name, affected_columns,
                      operation_status, message):
        """Best-effort audit logging wrapper around _insert_metadata; never raises."""
        try:
            self._insert_metadata(
                kernel=kernel,
                command_name=self.name(),
                arguments=self.args if isinstance(self.args, str) else str(self.args),
                affected_columns=affected_columns,
                operation_status=operation_status,
                message=message,
                db_name=db_name,
                user_name=user_name,
            )
        except Exception:
            # Logging must never interrupt the magic's normal flow.
            pass

    # -------------------- end metadata helpers --------------------

    def execute(self, kernel, data):
        """Entry point for %outliers: detect outliers (non in-place) and log metadata."""
        df = data.get("last_select")
        # Resolve audit context up front so failures can also be logged.
        db_name = self._get_db_name(kernel)
        user_name = self._get_user_name(kernel)
        try:
            self._ensure_metadata_table(kernel, db_name)
        except Exception:
            try:
                kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).")
            except Exception:
                pass

        if df is None:
            msg = "No last_select found in kernel data."
            kernel._send_message("stderr", msg)
            self._log_metadata(kernel, db_name, user_name, "", "error", msg)
            return

        if hasattr(df, "empty") and df.empty:
            msg = "There is no data to process (empty DataFrame)."
            kernel._send_message("stderr", msg)
            self._log_metadata(kernel, db_name, user_name, "", "error", msg)
            return

        try:
            args = self.parse_args(self.args)
        except Exception:
            msg = "Error parsing arguments. Use key=value syntax."
            kernel._send_message("stderr", msg)
            self._log_metadata(kernel, db_name, user_name, "", "error", msg)
            return

        # columns argument -> list of names, or None for "all numeric"
        columns_arg = args.get("columns", None)
        if isinstance(columns_arg, str):
            columns = [c.strip() for c in columns_arg.split(",") if c.strip()]
        elif isinstance(columns_arg, (list, tuple)):
            columns = list(columns_arg)
        else:
            columns = None

        method = str(args.get("method", "iqr")).lower()
        if method not in {"iqr", "zscore"}:
            msg = f"Unknown method '{method}'. Allowed: iqr, zscore."
            kernel._send_message("stderr", msg)
            self._log_metadata(kernel, db_name, user_name, "", "error", msg)
            return

        try:
            k = float(args.get("k", 1.5))
        except Exception:
            k = 1.5

        try:
            z_thresh = float(args.get("z_thresh", 3.0))
        except Exception:
            z_thresh = 3.0

        plot = bool(args.get("plot", False))

        # Determine target numeric columns.
        if columns is None:
            target_columns = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
        else:
            missing_cols = [c for c in columns if c not in df.columns]
            if missing_cols:
                msg = f"Column(s) not found: {', '.join(missing_cols)}"
                kernel._send_message("stderr", msg)
                self._log_metadata(kernel, db_name, user_name,
                                   "\n".join(columns) if columns else "", "error", msg)
                return
            # Keep only numeric columns; warn about skipped ones.
            target_columns = [c for c in columns if pd.api.types.is_numeric_dtype(df[c])]
            non_numeric = [c for c in columns if c not in target_columns]
            if non_numeric:
                kernel._send_message("stdout", f"Warning: non-numeric columns skipped: {', '.join(non_numeric)}")

        if not target_columns:
            msg = "No numeric target columns found to detect outliers."
            kernel._send_message("stderr", msg)
            self._log_metadata(kernel, db_name, user_name, "", "error", msg)
            return

        # Work on a deep copy so the original last_select is never mutated.
        result_df = df.copy(deep=True)

        outlier_masks = {}
        messages = []
        operation_status = "success"
        try:
            for col in target_columns:
                try:
                    mask = self._detect_outliers_series(result_df[col], method, k=k, z_thresh=z_thresh)
                    outlier_masks[col] = mask
                    n_out = int(mask.sum())
                    messages.append(f"Column '{col}': detected {n_out} outlier(s) using {method}.")
                    # Boolean indicator column goes on the copy only.
                    result_df[f"{col}_is_outlier"] = mask.astype(bool)
                except Exception as e:
                    messages.append(f"Column '{col}': error detecting outliers: {e}")
        except Exception as e:
            operation_status = "error"
            messages.append(f"Fatal error while detecting outliers: {e}")

        # Store result under a separate key; original stays unchanged.
        data["last_select_outliers"] = result_df

        kernel._send_message("stdout", "Outlier detection completed (non in-place). Summary:\n" + "\n".join(messages))
        kernel._send_message("stdout", "Results stored in data['last_select_outliers'] (original data['last_select'] unchanged).")

        if plot:
            try:
                df_numeric = result_df[target_columns]
                fig = self._build_plots(df_numeric, outlier_masks)
                self._send_image(kernel, fig)
            except Exception as e:
                plot_error = f"Error while plotting: {e}"
                kernel._send_message("stderr", plot_error)
                messages.append(plot_error)
                operation_status = "error"

        # Show the annotated copy.
        try:
            self._send_html(kernel, data["last_select_outliers"])
        except Exception:
            pass

        # Final audit row; warn (but do not fail) if it cannot be written.
        try:
            self._insert_metadata(
                kernel=kernel,
                command_name=self.name(),
                arguments=self.args if isinstance(self.args, str) else str(self.args),
                affected_columns="\n".join(target_columns),
                operation_status=operation_status,
                message="\n".join(messages),
                db_name=db_name,
                user_name=user_name,
            )
        except Exception:
            try:
                kernel._send_message("stdout", "Warning: failed to write metadata (continuing).")
            except Exception:
                pass
# Copyright (c) MariaDB Foundation.
# Distributed under the terms of the Modified BSD License.

from mariadb_kernel.maria_magics.maria_magic import MariaMagic
import pandas as pd
import shlex
import logging
import os
import re

# Optional helper to reliably get current DB name (if available)
try:
    from mariadb_kernel.sql_fetch import SqlFetch
except Exception:
    SqlFetch = None


class Stats(MariaMagic):
    """
    %stats [columns=col1,col2] [include=all|numeric|object] [percentiles=25,50,75] [transpose=true|false]

    Produce a statistical summary of the DataFrame in data["last_select"].

    Examples:
        %stats
            -> numeric summary (count, mean, std, min, 25%, 50%, 75%, max)
        %stats include=all
            -> include all dtypes (object, category, datetime etc.)
        %stats columns=age,salary
            -> summary only for the specified columns
        %stats percentiles=10,90
            -> include the 10th and 90th percentiles (values can be 0-100 or 0-1)
        %stats transpose=true
            -> show summary transposed (rows <-> columns)

    Execution metadata is recorded into table `magic_metadata`.
    """

    # Word sets accepted as booleans by _str_to_obj. These mirror the old
    # distutils.util.strtobool, which was removed in Python 3.12 (PEP 632).
    _TRUTHY = {"y", "yes", "t", "true", "on", "1"}
    _FALSY = {"n", "no", "f", "false", "off", "0"}

    def __init__(self, args=""):
        # Raw argument string as typed after the magic name.
        self.args = args

    def type(self):
        return "Line"

    def name(self):
        return "stats"

    def help(self):
        return (
            "%stats [columns=col1,col2] [include=all|numeric|object] "
            "[percentiles=25,50,75] [transpose=true|false]\n"
            "Show statistical summary (uses pandas.DataFrame.describe under the hood)."
            "Execution metadata is recorded in table `magic_metadata`."
        )

    def _str_to_obj(self, s):
        """Cast a string token to int, float, or bool; return it unchanged otherwise."""
        try:
            return int(s)
        except ValueError:
            pass
        try:
            return float(s)
        except ValueError:
            pass
        word = str(s).strip().lower()
        if word in self._TRUTHY:
            return True
        if word in self._FALSY:
            return False
        return s

    def parse_args(self, input_str):
        """Parse space-separated key=value arguments; values coerced via _str_to_obj.

        Raises ValueError on a token that has no '=' (surfaced as a usage error).
        """
        if not input_str or not input_str.strip():
            return {}
        parsed = {}
        for token in shlex.split(input_str):
            key, sep, value = token.partition("=")
            if not sep:
                raise ValueError(f"Expected key=value, got: {token!r}")
            parsed[key] = self._str_to_obj(value)
        return parsed

    def _send_html(self, kernel, df):
        """Render *df* as an HTML table (plain text fallback) on the display channel."""
        try:
            payload, mime = df.to_html(), "text/html"
        except Exception:
            payload, mime = str(df), "text/plain"
        kernel.send_response(
            kernel.iopub_socket,
            "display_data",
            {"data": {mime: payload}, "metadata": {}},
        )

    def _parse_percentiles(self, pct_arg):
        """Normalize a percentiles argument into a sorted list of floats in [0, 1].

        Accepts a comma-separated string or a sequence; values may be given as
        0-100 (e.g. 25) or 0-1 (e.g. 0.25). Bad tokens are ignored. Returns
        None when nothing usable remains (pandas then uses its defaults).
        """
        if pct_arg is None:
            return None
        raw = pct_arg if isinstance(pct_arg, (list, tuple)) else str(pct_arg).split(",")
        values = []
        for item in raw:
            token = str(item).strip()
            if not token:
                continue
            try:
                v = float(token)
            except ValueError:
                continue  # silently skip unparseable tokens
            if v > 1:
                v = v / 100.0
            if 0 <= v <= 1:
                values.append(v)
        # pandas.describe requires sorted, unique percentiles.
        values = sorted(set(values))
        return values if values else None

    # -------------------- metadata / DB helpers (best-effort) --------------------
    def _get_mariadb_client(self, kernel):
        """Return kernel.mariadb_client if present, else None."""
        return getattr(kernel, "mariadb_client", None)

    def _get_logger(self, kernel):
        """Return kernel.log if present, else a module-level logger."""
        return getattr(kernel, "log", logging.getLogger(__name__))

    def _sql_escape(self, val):
        """Escape *val* as a SQL single-quoted literal; None becomes NULL."""
        if val is None:
            return "NULL"
        if not isinstance(val, str):
            val = str(val)
        # Double single-quotes per SQL string-literal escaping rules.
        return "'" + val.replace("'", "''") + "'"

    def _get_db_name(self, kernel):
        """Best-effort lookup of the currently selected database name.

        Prefers SqlFetch when importable; otherwise parses the output of
        ``SELECT DATABASE();``. Returns "" when it cannot be determined.
        """
        mariadb_client = self._get_mariadb_client(kernel)
        log = self._get_logger(kernel)

        if SqlFetch is not None and mariadb_client is not None:
            try:
                dbname = SqlFetch(mariadb_client, log).get_db_name()
                if isinstance(dbname, str):
                    return dbname
            except Exception:
                log.debug("SqlFetch available but .get_db_name() failed; falling back.")

        if mariadb_client is None:
            return ""
        try:
            result = mariadb_client.run_statement("SELECT DATABASE();")
            if mariadb_client.iserror() or not result:
                return ""
            # The client may return an HTML table; try pandas first.
            try:
                tables = pd.read_html(result)
                if tables:
                    value = tables[0].iloc[0, 0]
                    if isinstance(value, float) and pd.isna(value):
                        return ""
                    return str(value) if value is not None else ""
            except Exception:
                # Not pandas-parseable: scrape the first table cell and strip tags.
                match = re.search(r"<td[^>]*>(.*?)</td>", str(result), flags=re.S | re.I)
                if match:
                    text = re.sub(r"<[^>]*>", "", match.group(1)).strip()
                    if text == "" or text.lower() == "null":
                        return ""
                    return text
            # Plain-text result (e.g. just the DB name).
            text = str(result).strip()
            if text == "" or text.lower() == "null":
                return ""
            return text
        except Exception:
            return ""
        return ""

    def _get_user_name(self, kernel):
        """Best-effort lookup of the current user; falls back to OS login or ""."""
        for cand in (
            getattr(kernel, "user_name", None),
            getattr(kernel, "username", None),
            getattr(kernel, "user", None),
            getattr(kernel, "session", None),
        ):
            if cand is None:
                continue
            if isinstance(cand, str) and cand.strip():
                return cand
            try:
                nested = getattr(cand, "user", None)
                if isinstance(nested, str) and nested.strip():
                    return nested
            except Exception:
                pass
        try:
            return os.getlogin()
        except Exception:
            return ""

    def _ensure_metadata_table(self, kernel, db_name):
        """Create the magic_metadata audit table if it does not exist (best-effort)."""
        mariadb_client = self._get_mariadb_client(kernel)
        log = self._get_logger(kernel)
        if mariadb_client is None:
            return

        # NOTE(review): db_name is interpolated as a SQL identifier and cannot
        # be parameterized; it comes from the server's own SELECT DATABASE().
        # Verify it can never be attacker-controlled.
        table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata"

        create_sql = f"""
        CREATE TABLE IF NOT EXISTS {table_full_name} (
            id INT AUTO_INCREMENT PRIMARY KEY,
            command_name VARCHAR(255),
            arguments TEXT,
            execution_timestamp DATETIME,
            affected_columns TEXT,
            operation_status VARCHAR(50),
            message TEXT,
            db_name VARCHAR(255),
            user_name VARCHAR(255),
            rollback_token VARCHAR(255),
            backup_table VARCHAR(255),
            original_table VARCHAR(255)
        );
        """
        try:
            mariadb_client.run_statement(create_sql)
            if mariadb_client.iserror():
                log.error("Error creating magic_metadata table.")
        except Exception as e:
            log.error(f"Failed to ensure magic_metadata table: {e}")

    def _insert_metadata(self, kernel, command_name, arguments, affected_columns,
                         operation_status, message, db_name, user_name):
        """Insert one audit row into magic_metadata (timestamp = NOW())."""
        mariadb_client = self._get_mariadb_client(kernel)
        log = self._get_logger(kernel)
        if mariadb_client is None:
            return

        table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata"

        # All values go through _sql_escape; only the table identifier is
        # interpolated unescaped (see note in _ensure_metadata_table).
        insert_sql = f"""
        INSERT INTO {table_full_name}
            (command_name, arguments, execution_timestamp, affected_columns,
             operation_status, message, db_name, user_name)
        VALUES (
            {self._sql_escape(command_name)},
            {self._sql_escape(arguments)},
            NOW(),
            {self._sql_escape(affected_columns)},
            {self._sql_escape(operation_status)},
            {self._sql_escape(message)},
            {self._sql_escape(db_name)},
            {self._sql_escape(user_name)}
        );
        """
        try:
            mariadb_client.run_statement(insert_sql)
            if mariadb_client.iserror():
                log.error("Error inserting into magic_metadata.")
        except Exception as e:
            log.error(f"Exception while inserting metadata: {e}")

    # -------------------- end metadata helpers --------------------
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + if hasattr(df, "empty") and df.empty: + msg = "There is no data to summarize (empty DataFrame)." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + try: + args = self.parse_args(self.args) + except Exception: + msg = "Error parsing arguments. Use key=value syntax." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # columns handling + cols_arg = args.get("columns", None) + if isinstance(cols_arg, str): + columns = [c.strip() for c in cols_arg.split(",") if c.strip()] + elif isinstance(cols_arg, (list, tuple)): + columns = list(cols_arg) + else: + columns = None + + # include: pandas.describe 'include' parameter (None default -> numeric) + include = args.get("include", "numeric") + if include not in ("numeric", "object", "all"): + # allow user to pass pandas dtypes-like include, but restrict to these for simplicity + include = "numeric" + include_param = None + if include == "all": + include_param = "all" + elif include == "object": + include_param = object + else: + include_param = None # pandas default -> numeric only + + # percentiles + percentiles_arg = 
args.get("percentiles", None) + percentiles = self._parse_percentiles(percentiles_arg) + + transpose = bool(args.get("transpose", False)) + + # subset dataframe if columns specified + try: + subdf = df[columns] if columns is not None else df + except KeyError as e: + msg = f"Column not found: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns) if columns else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # call pandas describe + try: + describe_kwargs = {} + if percentiles is not None: + describe_kwargs["percentiles"] = percentiles + if include_param is not None: + describe_kwargs["include"] = include_param + + result = subdf.describe(**describe_kwargs) + # For object dtypes, pandas describe may include top/freq; that's fine. + if transpose: + try: + result = result.transpose() + except Exception: + # fallback without transposing if something goes wrong + pass + + self._send_html(kernel, result) + + # Insert metadata (success) + affected_columns_str = "\n".join(columns) if columns else "ALL_COLUMNS" + pct_str = ",".join(str(p) for p in (percentiles or [])) if percentiles else "" + message = f"Stats computed for {len(result.columns) if hasattr(result, 'columns') else 'N'} column(s); total_rows={len(subdf)}; percentiles={pct_str}; include={include}." 
+ operation_status = "success" + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=affected_columns_str, + operation_status=operation_status, + message=message, + db_name=db_name, + user_name=user_name + ) + except Exception: + # do not interrupt flow if logging fails + pass + + except Exception as e: + msg = f"Error computing statistics: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns) if columns else "ALL_COLUMNS", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass diff --git a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py new file mode 100644 index 0000000..69f494a --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/encode.py @@ -0,0 +1,942 @@ +# encode.py +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import pandas as pd +import shlex +from distutils import util +import numpy as np +from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder +import logging +import os +import re +import uuid +import time + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + + +class Encode(MariaMagic): + """ + %encode method= + [columns=col1,col2,...] + [inplace=true|false] + [drop_original=true|false] + [mode=preview|apply|rollback] + [table=schema.table] [confirm=true|false] [sample_size=100] + + Notes: + - If columns omitted, object/category dtype columns are auto-selected. 
+ - Default: inplace=true, drop_original=true. + - DB apply uses CTAS + atomic RENAME and records rollback metadata. + """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "encode" + + def help(self): + return ( + "%encode method= [columns=col1,col2] " + "[inplace=true] [drop_original=true] [mode=preview|apply|rollback]\n" + "[table=schema.table] [confirm=true] [sample_size=100]\n" + "Encode categorical columns. Preview shows what will be created. " + "Apply can operate locally or on a DB table (versioned)." + ) + + def _str_to_obj(self, s): + """Cast to int/float/bool when possible, otherwise return string.""" + try: + return int(s) + except (ValueError, TypeError): + pass + try: + return float(s) + except (ValueError, TypeError): + pass + try: + return bool(util.strtobool(str(s))) + except Exception: + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df): + try: + html = df.to_html(index=False) + mime = "text/html" + except Exception: + html = str(df) + mime = "text/plain" + display_content = {"data": {mime: html}, "metadata": {}} + kernel.send_response(kernel.iopub_socket, "display_data", display_content) + + def _make_ohe(self, **kwargs): + """ + Create OneHotEncoder in a sklearn-version compatible way. + Older sklearn versions accept `sparse`; newer use `sparse_output`. 
+ """ + try: + return OneHotEncoder(sparse=False, **kwargs) + except TypeError: + return OneHotEncoder(sparse_output=False, **kwargs) + + # -------------------- metadata / DB helpers (best-effort) -------------------- + def _get_mariadb_client(self, kernel): + """Return mariadb_client if present on kernel, else None""" + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + """Return a logger on kernel if present, else create a temporary logger""" + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _safe_colname(self, s): + """Create a safe column identifier from an arbitrary string.""" + if s is None: + return "" + s2 = re.sub(r"[^0-9A-Za-z_]", "_", str(s)) + # ensure not starting with digit + if re.match(r"^[0-9]", s2): + s2 = "_" + s2 + return s2[:200] # cap length + + def _get_db_name(self, kernel): + """ + Attempt to determine the currently used DB. + Prefer SqlFetch if available; otherwise run SELECT DATABASE(); and try to parse. + Returns empty string if none found. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror(): + return "" + if not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + """Try several places to find the current user name; fallback to OS login or empty string.""" + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + """ + Create magic_metadata table if it doesn't exist. + Includes rollback support columns (rollback_token, backup_table, original_table). 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name, + rollback_token=None, backup_table=None, original_table=None): + """ + Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + rollback_sql = self._sql_escape(rollback_token) + backup_sql = self._sql_escape(backup_table) + original_sql = self._sql_escape(original_table) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name, rollback_token, backup_table, original_table) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql}, + {rollback_sql}, + {backup_sql}, + {original_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + def _acquire_lock(self, mariadb_client, lock_name, timeout=10): + try: + mariadb_client.run_statement(f"SELECT GET_LOCK('{lock_name}', {int(timeout)});") + if mariadb_client.iserror(): + return False + return True + except Exception: + return False + + def _release_lock(self, mariadb_client, lock_name): + try: + mariadb_client.run_statement(f"SELECT RELEASE_LOCK('{lock_name}');") + except Exception: + pass + + def _table_exists(self, mariadb_client, table_full_name): + try: + mariadb_client.run_statement(f"SELECT 1 FROM {table_full_name} LIMIT 1;") + return not mariadb_client.iserror() + except Exception: + return False + + def _parse_distinct_results(self, result): + """Return list of values from a run_statement output 
(HTML or plain).""" + if not result: + return [] + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + series = dfs[0].iloc[:, 0].astype(object) + return [None if (pd.isna(x) or x is None) else x for x in series.tolist()] + except Exception: + vals = re.findall(r"(.*?)", str(result), flags=re.S | re.I) + parsed = [] + for v in vals: + txt = re.sub(r"<.*?>", "", v).strip() + if txt.lower() == "null": + parsed.append(None) + else: + parsed.append(txt) + if parsed: + return parsed + # Last fallback: attempt to split raw text lines + try: + txt = str(result).strip() + lines = [l.strip() for l in txt.splitlines() if l.strip()] + return lines + except Exception: + return [] + + # -------------------- end metadata helpers -------------------- + + def execute(self, kernel, data): + # get DataFrame + df = data.get("last_select") + if df is None: + kernel._send_message("stderr", "No last_select found in kernel data.") + return + if hasattr(df, "empty") and df.empty: + kernel._send_message("stderr", "There is no data to encode (empty DataFrame).") + return + + try: + args = self.parse_args(self.args) + except Exception: + kernel._send_message("stderr", "Error parsing arguments.") + return + + method = str(args.get("method", "label")).lower() + cols_arg = args.get("columns", None) + if isinstance(cols_arg, str): + columns = [c.strip() for c in cols_arg.split(",") if c.strip()] + elif isinstance(cols_arg, (list, tuple)): + columns = list(cols_arg) + else: + columns = list(df.select_dtypes(include=["object", "category"]).columns) + + if not columns: + kernel._send_message("stderr", "No columns specified or detected for encoding.") + # log metadata for failure + try: + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + 
operation_status="error", + message="No columns specified or detected for encoding.", + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # validate existence + missing_cols = [c for c in columns if c not in df.columns] + if missing_cols: + msg = f"Column(s) not found: {', '.join(missing_cols)}" + kernel._send_message("stderr", msg) + # log metadata for failure + try: + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + inplace = bool(args.get("inplace", True)) + drop_original = bool(args.get("drop_original", True)) + + # mode and db args + mode = str(args.get("mode", "preview")).lower() + mode = mode if mode in {"preview", "apply", "rollback"} else "preview" + table_full = args.get("table", None) + confirm = bool(args.get("confirm", False)) + sample_size = int(args.get("sample_size", 100)) + + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + + # ensure metadata table exists + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + # --- PREVIEW MODE --- + if mode == "preview": + try: + messages = [] + created_columns = [] + # Local preview: compute unique counts, sample mappings + for col in columns: + series = df[col] + uniques = pd.Index(series.dropna().unique()) + n_uniques = len(uniques) + messages.append(f"Local: Column '{col}' unique non-null values: {n_uniques} (showing up to 10): 
{list(uniques[:10])}") + if method == "label" or method == "ordinal": + created_columns.append(f"{col}_lbl" if method == "label" else f"{col}_ord") + elif method == "onehot": + # onehot creates one column per category + for v in list(uniques[:100]): # cap for preview listing + created_columns.append(f"{col}_{self._safe_colname(v)}") + + kernel._send_message("stdout", "PREVIEW (local):\n" + "\n".join(messages)) + kernel._send_message("stdout", f"PREVIEW (local) estimated created columns: {len(created_columns)}") + + # Show sample rows that will be modified (where any column is not-na) + sample_mask = pd.Series(False, index=df.index) + for col in columns: + sample_mask = sample_mask | df[col].notna() + sample_rows = df[sample_mask].head(sample_size) + if not sample_rows.empty: + try: + self._send_html(kernel, sample_rows) + except Exception: + kernel._send_message("stdout", str(sample_rows.head())) + + # DB preview if requested + if table_full and mariadb_client is not None: + db_msgs = [] + total_estimated_new_cols = 0 + for col in columns: + try: + out = mariadb_client.run_statement(f"SELECT DISTINCT {col} FROM {table_full} LIMIT {sample_size};") + vals = self._parse_distinct_results(out) + nvals = len(vals) + db_msgs.append(f"DB: Column '{col}' distinct values (up to {sample_size}): {vals[:10]} (count_est={nvals})") + if method == "label" or method == "ordinal": + total_estimated_new_cols += 1 + else: + total_estimated_new_cols += nvals + except Exception as e: + db_msgs.append(f"DB: Column '{col}' distinct query failed: {e}") + kernel._send_message("stdout", "PREVIEW (db):\n" + "\n".join(db_msgs)) + kernel._send_message("stdout", f"PREVIEW (db) estimated created columns: {total_estimated_new_cols}") + + # log preview metadata + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns), + operation_status='preview', + 
message='preview_completed', + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Error during preview: {e}") + return + + # --- ROLLBACK MODE --- + if mode == "rollback": + if mariadb_client is None: + kernel._send_message("stderr", "Rollback requested but no mariadb_client available.") + return + token = args.get("rollback_token", None) + try: + if not token: + # try to read latest magic_metadata entry for this command and user + mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Could not find metadata for rollback (check permissions).") + return + out = mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + m = re.search(r"(.*?)", str(out), flags=re.S | re.I) + if m: + token = re.sub(r"<.*?>", "", m.group(1)).strip() + if not token: + kernel._send_message("stderr", "No rollback_token found; cannot rollback safely.") + return + + # fetch backup_table and original_table + out = mariadb_client.run_statement(f"SELECT backup_table, original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + m = re.search(r"(.*?).*?(.*?)", str(out), flags=re.S | re.I) + backup_table = None + original_table = None + if m: + backup_table = re.sub(r"<.*?>", "", m.group(1)).strip() + original_table = re.sub(r"<.*?>", "", m.group(2)).strip() + else: + try: + out_b = mariadb_client.run_statement(f"SELECT backup_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mb = re.search(r"(.*?)", str(out_b), flags=re.S | re.I) + if 
mb: + backup_table = re.sub(r"<.*?>", "", mb.group(1)).strip() + except Exception: + pass + try: + out_o = mariadb_client.run_statement(f"SELECT original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mo = re.search(r"(.*?)", str(out_o), flags=re.S | re.I) + if mo: + original_table = re.sub(r"<.*?>", "", mo.group(1)).strip() + except Exception: + pass + + if not backup_table: + kernel._send_message("stderr", "No backup table found in metadata for rollback token.") + return + + # perform atomic restore: backup_table -> original_table + lock_name = f"encode_rb_{token}" + self._acquire_lock(mariadb_client, lock_name, timeout=10) + try: + if original_table: + if self._table_exists(mariadb_client, original_table): + original_old = f"{original_table}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {original_table} TO {original_old}, {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename tables during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {original_table}; previous {original_table} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns), + operation_status='rollback', + message=f'restored_to={original_table};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {original_table}.") + 
self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns), + operation_status='rollback', + message=f'restored_to={original_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + # try to infer original table name from arguments + out_args = mariadb_client.run_statement(f"SELECT arguments FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + margs = re.search(r"(.*?)", str(out_args), flags=re.S | re.I) + inferred_original = None + if margs: + args_txt = re.sub(r"<.*?>", "", margs.group(1)).strip() + mm = re.search(r"table\s*=\s*([^\s,]+)", args_txt) + if mm: + inferred_original = mm.group(1).strip() + if inferred_original: + if self._table_exists(mariadb_client, inferred_original): + original_old = f"{inferred_original}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {inferred_original} TO {original_old}, {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {inferred_original}; previous {inferred_original} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns), + operation_status='rollback', + message=f'restored_to={inferred_original};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", 
"Failed to rename backup to inferred original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {inferred_original}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns), + operation_status='rollback', + message=f'restored_to={inferred_original}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + kernel._send_message("stderr", "Could not determine original table name for rollback. Manual restoration required.") + return + finally: + self._release_lock(mariadb_client, lock_name) + except Exception as e: + kernel._send_message("stderr", f"Rollback error: {e}") + return + + # --- APPLY MODE --- + if mode == "apply": + # DB-target apply if table provided and mariadb_client present + if table_full and mariadb_client is not None: + if not confirm: + kernel._send_message("stderr", "DB apply requires confirm=true to proceed. 
Preview first, then re-run with confirm=true.") + return + + token = str(uuid.uuid4()).replace('-', '')[:16] + backup_table = f"{table_full}_backup_{token}" + new_table = f"{table_full}_vnew_{token}" + + # gather distinct values per column from DB (best-effort) + col_values = {} + messages = [] + for col in columns: + try: + out = mariadb_client.run_statement(f"SELECT DISTINCT {col} FROM {table_full} LIMIT {sample_size};") + vals = self._parse_distinct_results(out) + # we keep the order returned; limit cardinality to avoid explosion + col_values[col] = vals + messages.append(f"{col}: discovered {len(vals)} distinct values (sample limit {sample_size}).") + except Exception as e: + col_values[col] = [] + messages.append(f"{col}: failed to collect distinct values: {e}") + + # Build SELECT expressions + select_exprs = [] + all_columns = list(df.columns) + + created_cols = [] + created_count = 0 + + for c in all_columns: + if c in columns: + vals = col_values.get(c, []) + if method == "label": + # build CASE ... WHEN ... 
THEN idx ELSE NULL END AS col_lbl + cases = [] + for idx, v in enumerate(vals): + if v is None: + cases.append(f"WHEN {c} IS NULL THEN {idx}") + else: + cases.append(f"WHEN {c} = {self._sql_escape(v)} THEN {idx}") + case_sql = " ".join(cases) + new_name = f"{c}_lbl" + select_exprs.append(f"CASE {case_sql} ELSE NULL END AS {new_name}") + created_cols.append(new_name) + created_count += 1 + if not drop_original: + select_exprs.append(c) + elif method == "ordinal": + cases = [] + for idx, v in enumerate(vals): + if v is None: + cases.append(f"WHEN {c} IS NULL THEN {idx}") + else: + cases.append(f"WHEN {c} = {self._sql_escape(v)} THEN {idx}") + new_name = f"{c}_ord" + select_exprs.append(f"CASE {' '.join(cases)} ELSE NULL END AS {new_name}") + created_cols.append(new_name) + created_count += 1 + if not drop_original: + select_exprs.append(c) + elif method == "onehot": + # for each distinct value create column col_ as CASE WHEN col=val THEN 1 ELSE 0 END + for v in vals: + safe = self._safe_colname(v if v is not None else "NULL") + new_name = f"{c}_{safe}" + if v is None: + select_exprs.append(f"CASE WHEN {c} IS NULL THEN 1 ELSE 0 END AS {new_name}") + else: + select_exprs.append(f"CASE WHEN {c} = {self._sql_escape(v)} THEN 1 ELSE 0 END AS {new_name}") + created_cols.append(new_name) + created_count += 1 + if not drop_original: + select_exprs.append(c) + else: + # fallback: keep original + select_exprs.append(c) + else: + # not a targeted column — keep as is + select_exprs.append(c) + + # safety cap + if created_count > 1000: + kernel._send_message("stderr", f"Refusing to create {created_count} encoded columns ( > 1000 ). 
Narrow the columns or reduce distinct values.") + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns), + operation_status='error', + message=f"too_many_created_columns={created_count}", + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + select_sql = ", ".join(select_exprs) + try: + lock_name = f"encode_apply_{token}" + got_lock = self._acquire_lock(mariadb_client, lock_name, timeout=10) + if not got_lock: + kernel._send_message("stderr", "Could not acquire advisory lock; aborting apply.") + return + + # create new table with encoded columns + mariadb_client.run_statement(f"CREATE TABLE {new_table} AS SELECT {select_sql} FROM {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to create new table for apply (CTAS failed).") + return + + # atomic rename original -> backup, new -> original + mariadb_client.run_statement(f"RENAME TABLE {table_full} TO {backup_table}, {new_table} TO {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "RENAME TABLE failed (apply may be inconsistent).") + return + + kernel._send_message("stdout", f"Apply completed: original preserved as {backup_table}.") + # log metadata with token so rollback can restore + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(columns), + operation_status='applied', + message=f'applied_backup={backup_table};created_columns={created_count}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=table_full + ) + + # attempt to refresh last_select with a sample + try: + mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + fresh = mariadb_client.run_statement(f"SELECT * FROM 
{table_full} LIMIT {sample_size};") + try: + df_list = pd.read_html(fresh) + if df_list and len(df_list) > 0: + data["last_select"] = df_list[0] + try: + self._send_html(kernel, data["last_select"]) + except Exception: + pass + except Exception: + kernel._send_message("stdout", "Applied to DB; could not refresh last_select from DB.") + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Apply (DB versioned) failed: {e}") + log.exception(e) + finally: + self._release_lock(mariadb_client, lock_name) + return + + else: + # Local in-place apply on data['last_select'] (existing behavior) + result_df = df if inplace else df.copy() + messages = [] + operation_status = "success" + created_columns = [] + try: + encoder_obj = None + label_mappings = None + + if method == "label": + label_mappings = {} + for col in columns: + codes, uniques = pd.factorize(result_df[col], sort=True) + new_col = f"{col}_lbl" + result_df[new_col] = codes + created_columns.append(new_col) + mapping = {val: idx for idx, val in enumerate(uniques)} + label_mappings[col] = mapping + if drop_original: + result_df.drop(columns=[col], inplace=True) + messages.append(f"Column '{col}': label-encoded -> {new_col} (unique_values={len(uniques)})") + encoder_obj = label_mappings + + elif method == "onehot": + encoder = self._make_ohe(handle_unknown="ignore") + tmp = result_df[columns].astype(object).fillna("___MISSING___") + arr = encoder.fit_transform(tmp) + try: + feature_names = encoder.get_feature_names_out(columns) + feature_names = [str(fn) for fn in feature_names] + except Exception: + cats = encoder.categories_ + feature_names = [] + for cname, cat_list in zip(columns, cats): + for cat in cat_list: + feature_names.append(f"{cname}_{str(cat)}") + ohe_df = pd.DataFrame(arr, columns=feature_names, index=result_df.index) + if drop_original: + result_df = pd.concat([result_df.drop(columns=columns), ohe_df], axis=1) + else: + result_df = pd.concat([result_df, ohe_df], 
axis=1) + created_columns.extend(feature_names) + messages.append(f"Columns {columns} one-hot encoded -> created {len(feature_names)} columns.") + encoder_obj = encoder + + elif method == "ordinal": + enc = OrdinalEncoder(dtype=np.float64) + tmp = result_df[columns].astype(object).fillna("___MISSING___") + enc_arr = enc.fit_transform(tmp) + for i, col in enumerate(columns): + new_col = f"{col}_ord" + result_df[new_col] = enc_arr[:, i] + created_columns.append(new_col) + if drop_original: + result_df.drop(columns=[col], inplace=True) + messages.append(f"Column '{col}': ordinal-encoded -> {new_col}") + encoder_obj = enc + + else: + kernel._send_message("stderr", "Unsupported method. Supported: label, onehot, ordinal.") + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns), + operation_status="error", + message="Unsupported method requested.", + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Apply result back to shared data if inplace + if inplace: + data["last_select"] = result_df + kernel._send_message("stdout", "Encoded columns in-place and updated last_select.") + else: + kernel._send_message("stdout", "Displayed encoded result (last_select not modified).") + + # Save encoder (or mapping) to shared data for downstream pipeline usage + try: + if encoder_obj is not None: + data["last_select_encoder"] = encoder_obj + elif label_mappings is not None: + data["last_select_encoder"] = label_mappings + except Exception: + pass + + # display + self._send_html(kernel, result_df) + + except Exception as e: + operation_status = "error" + err_msg = f"Error during encoding: {e}" + kernel._send_message("stderr", err_msg) + messages.append(err_msg) + + # Attempt to insert metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = 
"\n".join(columns) + created_columns_str = "\n".join(created_columns) if created_columns else "" + details = "\n".join(messages) if messages else "Encoding completed." + metadata_message = f"Method: {method}\nCreated columns:\n{created_columns_str}\n\nDetails:\n{details}" + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status=operation_status, + message=metadata_message, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + + return + + # fallback + kernel._send_message("stderr", "Unknown execution path reached.") + return diff --git a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py new file mode 100644 index 0000000..e85fddc --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/normalize.py @@ -0,0 +1,900 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +from sklearn.preprocessing import MinMaxScaler +import logging +import os +import re +import uuid +import time + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + + +class Normalize(MariaMagic): + """ + %normalize [columns=col1,col2,...] [feature_range=0,1] [inplace=True|False] + [mode=preview|apply|rollback] [table=schema.table] [confirm=true|false] + [sample_size=100] [lock_timeout=10] + + Scales numeric columns to a fixed range (default 0-1) using sklearn's MinMaxScaler. + + - columns: list of columns to normalize. If omitted, all numeric columns are used. 
+ - feature_range: lower and upper bounds for scaling (default: 0,1) + - inplace: if True (default), modifies data["last_select"] in-place. + if False, stores result in data["last_select_normalized"]. + - mode: preview/apply/rollback (preview default) + + Examples: + %normalize + %normalize columns=age,salary + %normalize feature_range=5,10 inplace=False + %normalize mode=apply table=schema.emp confirm=true + Execution metadata is recorded in table `magic_metadata`. + """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "normalize" + + def help(self): + return ( + "%normalize [columns=col1,col2,...] [feature_range=0,1] [inplace=True|False]\n" + " [mode=preview|apply|rollback] [table=schema.table] [confirm=true]\n" + "Normalize numeric columns using MinMaxScaler (in-place by default)." + ) + + def _str_to_obj(self, s): + try: + return int(s) + except ValueError: + try: + return float(s) + except ValueError: + pass + try: + return bool(util.strtobool(s)) + except Exception: + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df): + try: + html = df.to_html(index=False) + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {"text/html": html}, "metadata": {}}) + except Exception: + pass + + # -------------------- metadata / DB helpers (best-effort) -------------------- + def _get_mariadb_client(self, kernel): + """Return mariadb_client if present on kernel, else None""" + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + """Return a logger on kernel if present, else create a temporary logger""" + return getattr(kernel, 
"log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + # double single-quotes for SQL escaping + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + """ + Attempt to determine the currently used DB. + Prefer SqlFetch if available; otherwise run SELECT DATABASE(); and try to parse. + Returns empty string if none found. + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror(): + return "" + if not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + """Try several places to find the current user name; fallback to OS login or empty string.""" + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if 
isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + """ + Create magic_metadata table if it doesn't exist. + Includes rollback support columns (rollback_token, backup_table, original_table). + """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name, + rollback_token=None, backup_table=None, original_table=None): + """ + Insert a metadata row into magic_metadata. Uses NOW() for timestamp. 
+ """ + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + rollback_sql = self._sql_escape(rollback_token) + backup_sql = self._sql_escape(backup_table) + original_sql = self._sql_escape(original_table) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name, rollback_token, backup_table, original_table) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql}, + {rollback_sql}, + {backup_sql}, + {original_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + def _acquire_lock(self, mariadb_client, lock_name, timeout=10): + try: + mariadb_client.run_statement(f"SELECT GET_LOCK('{lock_name}', {int(timeout)});") + if mariadb_client.iserror(): + return False + return True + except Exception: + return False + + def _release_lock(self, mariadb_client, lock_name): + try: + mariadb_client.run_statement(f"SELECT RELEASE_LOCK('{lock_name}');") + except Exception: + pass + + def _table_exists(self, mariadb_client, table_full_name): + try: + mariadb_client.run_statement(f"SELECT 1 FROM {table_full_name} LIMIT 1;") + return not mariadb_client.iserror() + except Exception: + return False + + def _parse_two_value_result(self, res): + """ + Parse results expected to have two values (MIN and 
MAX, HTML table or plain). + Returns (val1, val2) or (None, None) if parsing fails. + """ + if not res: + return None, None + try: + dfs = pd.read_html(res) + if dfs and len(dfs) > 0: + r0 = dfs[0].iloc[0, 0] + r1 = dfs[0].iloc[0, 1] if dfs[0].shape[1] > 1 else None + try: + v0 = float(r0) if pd.notna(r0) else None + except Exception: + v0 = None + try: + v1 = float(r1) if pd.notna(r1) else None + except Exception: + v1 = None + return v0, v1 + except Exception: + # regex fallback: pick first two ", str(res), flags=re.S | re.I) + if m and len(m) >= 1: + def tofloat(txt): + txt = re.sub(r"<.*?>", "", txt).strip() + if txt.lower() == "null" or txt == "": + return None + try: + return float(txt) + except Exception: + return None + v0 = tofloat(m[0]) + v1 = tofloat(m[1]) if len(m) > 1 else None + return v0, v1 + # final fallback: try to split lines + try: + txt = str(res).strip() + parts = [p.strip() for p in txt.split() if p.strip()] + if len(parts) >= 2: + try: + return float(parts[0]), float(parts[1]) + except Exception: + return None, None + except Exception: + return None, None + return None, None + + # -------------------- end metadata helpers -------------------- + + def execute(self, kernel, data): + df = data.get("last_select") + # Prepare metadata context early so failures can be logged + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + if df is None or (hasattr(df, "empty") and df.empty): + msg = "No last_select found or DataFrame is empty." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + try: + args = self.parse_args(self.args) + except Exception: + msg = "Error parsing arguments. Use key=value syntax." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + columns_arg = args.get("columns", None) + if isinstance(columns_arg, str): + columns = [c.strip() for c in columns_arg.split(",") if c.strip()] + elif isinstance(columns_arg, (list, tuple)): + columns = list(columns_arg) + else: + columns = None + + feature_range_arg = args.get("feature_range", "0,1") + if isinstance(feature_range_arg, str): + parts = [p.strip() for p in feature_range_arg.split(",")] + if len(parts) == 2: + try: + feature_range = (float(parts[0]), float(parts[1])) + except Exception: + msg = "feature_range values must be numeric." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns) if columns else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + else: + msg = "feature_range must be provided as 'min,max'." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns) if columns else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + else: + try: + feature_range = tuple(feature_range_arg) + except Exception: + feature_range = (0, 1) + + inplace = bool(args.get("inplace", True)) + mode = str(args.get("mode", "preview")).lower() + mode = mode if mode in {"preview", "apply", "rollback"} else "preview" + table_full = args.get("table", None) + confirm = bool(args.get("confirm", False)) + sample_size = int(args.get("sample_size", 100)) + lock_timeout = int(args.get("lock_timeout", 10)) + + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Select numeric columns + if columns is None: + target_columns = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])] + else: + missing_cols = [c for c in columns if c not in df.columns] + if missing_cols: + msg = f"Missing columns: {', '.join(missing_cols)}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + target_columns = columns + + if not target_columns: + msg = "No numeric columns to normalize." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # --- PREVIEW MODE --- + if mode == "preview": + try: + messages = [] + # local preview: compute local min/max and show sample transformed values + local_ranges = {} + for col in target_columns: + s = pd.to_numeric(df[col], errors="coerce").dropna() + if s.empty: + messages.append(f"Local: Column '{col}' has no numeric non-null values; skipped.") + local_ranges[col] = (None, None) + continue + lo = float(s.min()) + hi = float(s.max()) + local_ranges[col] = (lo, hi) + messages.append(f"Local: Column '{col}' min={lo}, max={hi} -> range will map to {feature_range}") + + kernel._send_message("stdout", "PREVIEW (local):\n" + "\n".join(messages)) + + # show sample transformed rows for local preview + try: + sample = df[target_columns].head(sample_size).copy() + for col in target_columns: + lo, hi = local_ranges.get(col, (None, None)) + if lo is None or hi is None or hi == lo: + # can't transform deterministically; show original values + sample[col + "_norm_preview"] = sample[col] + else: + # scale + rng = feature_range[1] - feature_range[0] + sample[col + "_norm_preview"] = ((pd.to_numeric(sample[col], errors="coerce") - lo) / (hi - lo)) * rng + feature_range[0] + if not sample.empty: + self._send_html(kernel, sample.head(20)) + except Exception: + pass + + # DB preview if table provided + if table_full and mariadb_client is not None: + db_msgs = [] + for col in target_columns: + try: + # attempt to get MIN and MAX from DB for each column + res = mariadb_client.run_statement(f"SELECT MIN({col}), MAX({col}) FROM {table_full};") + if mariadb_client.iserror(): + db_msgs.append(f"DB: Column '{col}': MIN/MAX query failed (check permissions).") 
+ continue + minv, maxv = self._parse_two_value_result(res) + if minv is None and maxv is None: + db_msgs.append(f"DB: Column '{col}': could not parse MIN/MAX (empty/unsupported).") + continue + db_msgs.append(f"DB: Column '{col}' min={minv}, max={maxv} -> would map to {feature_range}") + # show SQL expression that would be used + if minv is None or maxv is None: + expr = f"{col} /* cannot compute min/max */" + elif maxv == minv: + expr = f"CASE WHEN {col} IS NULL THEN NULL ELSE {feature_range[0]} END AS {col}" + else: + # normalization SQL: ((col - min) / (max - min)) * (range_max - range_min) + range_min + rng = feature_range[1] - feature_range[0] + expr = f"CASE WHEN {col} IS NULL THEN NULL ELSE (({col} - {repr(minv)}) / ({repr(maxv - minv)})) * {repr(rng)} + {repr(feature_range[0])} END AS {col}" + db_msgs.append(f"DB: Column '{col}' expression: {expr}") + except Exception as e: + db_msgs.append(f"DB: Column '{col}' MIN/MAX query exception: {e}") + kernel._send_message("stdout", "PREVIEW (db):\n" + "\n".join(db_msgs)) + + # log preview metadata + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='preview', + message='preview_completed', + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Error during preview: {e}") + return + + # --- ROLLBACK MODE --- + if mode == "rollback": + if mariadb_client is None: + kernel._send_message("stderr", "Rollback requested but no mariadb_client available.") + return + token = args.get("rollback_token", None) + try: + if not token: + mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + if mariadb_client.iserror(): + 
kernel._send_message("stderr", "Could not find metadata for rollback (check permissions).") + return + out = mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + m = re.search(r"(.*?)", str(out), flags=re.S | re.I) + if m: + token = re.sub(r"<.*?>", "", m.group(1)).strip() + if not token: + kernel._send_message("stderr", "No rollback_token found; cannot rollback safely.") + return + + # fetch backup_table and original_table + out = mariadb_client.run_statement(f"SELECT backup_table, original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + m = re.search(r"(.*?).*?(.*?)", str(out), flags=re.S | re.I) + backup_table = None + original_table = None + if m: + backup_table = re.sub(r"<.*?>", "", m.group(1)).strip() + original_table = re.sub(r"<.*?>", "", m.group(2)).strip() + else: + try: + out_b = mariadb_client.run_statement(f"SELECT backup_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mb = re.search(r"(.*?)", str(out_b), flags=re.S | re.I) + if mb: + backup_table = re.sub(r"<.*?>", "", mb.group(1)).strip() + except Exception: + pass + try: + out_o = mariadb_client.run_statement(f"SELECT original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mo = re.search(r"(.*?)", str(out_o), flags=re.S | re.I) + if mo: + original_table = re.sub(r"<.*?>", "", mo.group(1)).strip() + except Exception: + pass + + if not backup_table: + kernel._send_message("stderr", "No backup table found in metadata for rollback token.") + return + + # perform atomic restore: backup_table -> original_table + lock_name = f"normalize_rb_{token}" + self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + try: + if original_table: + if self._table_exists(mariadb_client, 
original_table): + original_old = f"{original_table}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {original_table} TO {original_old}, {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename tables during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {original_table}; previous {original_table} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={original_table};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {original_table}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={original_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + # try to infer original table name from arguments + out_args = mariadb_client.run_statement(f"SELECT arguments FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + margs = re.search(r"(.*?)", str(out_args), flags=re.S | re.I) + inferred_original = None + if margs: + args_txt = re.sub(r"<.*?>", "", margs.group(1)).strip() 
+ mm = re.search(r"table\s*=\s*([^\s,]+)", args_txt) + if mm: + inferred_original = mm.group(1).strip() + if inferred_original: + if self._table_exists(mariadb_client, inferred_original): + original_old = f"{inferred_original}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {inferred_original} TO {original_old}, {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {inferred_original}; previous {inferred_original} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={inferred_original};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to inferred original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {inferred_original}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={inferred_original}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + kernel._send_message("stderr", "Could not determine original table name for rollback. 
Manual restoration required.") + return + finally: + self._release_lock(mariadb_client, lock_name) + except Exception as e: + kernel._send_message("stderr", f"Rollback error: {e}") + return + + # --- APPLY MODE --- + if mode == "apply": + # DB-target apply if table provided and mariadb_client present + if table_full and mariadb_client is not None: + if not confirm: + kernel._send_message("stderr", "DB apply requires confirm=true to proceed. Preview first, then re-run with confirm=true.") + return + + token = str(uuid.uuid4()).replace('-', '')[:16] + backup_table = f"{table_full}_backup_{token}" + new_table = f"{table_full}_vnew_{token}" + + # compute MIN and MAX per column in DB + col_minmax = {} + msgs = [] + for col in target_columns: + try: + res = mariadb_client.run_statement(f"SELECT MIN({col}), MAX({col}) FROM {table_full};") + if mariadb_client.iserror(): + msgs.append(f"{col}: MIN/MAX query failed (permissions?).") + col_minmax[col] = (None, None) + continue + minv, maxv = self._parse_two_value_result(res) + col_minmax[col] = (minv, maxv) + msgs.append(f"{col}: min={minv}, max={maxv}") + except Exception as e: + col_minmax[col] = (None, None) + msgs.append(f"{col}: exception computing min/max: {e}") + + # build select expressions + select_exprs = [] + for c in df.columns: + if c in target_columns: + minv, maxv = col_minmax.get(c, (None, None)) + if minv is None or maxv is None: + # cannot compute; keep original + select_exprs.append(c) + elif maxv == minv: + # constant mapping to feature_range[0] + expr = f"CASE WHEN {c} IS NULL THEN NULL ELSE {repr(feature_range[0])} END AS {c}" + select_exprs.append(expr) + else: + rng = feature_range[1] - feature_range[0] + denom = (maxv - minv) + # ((col - min) / denom) * rng + feature_range[0] + expr = ( + f"CASE WHEN {c} IS NULL THEN NULL ELSE " + f"(({c} - {repr(minv)}) / {repr(denom)}) * {repr(rng)} + {repr(feature_range[0])} END AS {c}" + ) + select_exprs.append(expr) + else: + select_exprs.append(c) + + select_sql 
= ", ".join(select_exprs) + + try: + lock_name = f"normalize_apply_{token}" + got_lock = self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + if not got_lock: + kernel._send_message("stderr", "Could not acquire advisory lock; aborting apply.") + return + + # create new table with normalized values (CTAS) + mariadb_client.run_statement(f"CREATE TABLE {new_table} AS SELECT {select_sql} FROM {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to create new table for apply (CTAS failed).") + return + + # atomic rename original -> backup, new -> original + mariadb_client.run_statement(f"RENAME TABLE {table_full} TO {backup_table}, {new_table} TO {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "RENAME TABLE failed (apply may be inconsistent).") + return + + kernel._send_message("stdout", f"Apply completed: original preserved as {backup_table}.") + # log metadata (include token so user can rollback) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='applied', + message=f'applied_backup={backup_table};details={"|".join(msgs)}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=table_full + ) + + # attempt to refresh last_select with a sample + try: + mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + fresh = mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + try: + df_list = pd.read_html(fresh) + if df_list and len(df_list) > 0: + data["last_select"] = df_list[0] + try: + self._send_html(kernel, data["last_select"]) + except Exception: + pass + except Exception: + kernel._send_message("stdout", "Applied to DB; could not refresh last_select from DB.") + except Exception: + pass + + except Exception as e: + 
kernel._send_message("stderr", f"Apply (DB versioned) failed: {e}") + log.exception(e) + finally: + self._release_lock(mariadb_client, lock_name) + return + + else: + # Local in-place apply on data['last_select'] (existing behavior) + target_df = df if inplace else df.copy(deep=True) + operation_status = "success" + messages = [] + try: + scaler = MinMaxScaler(feature_range=feature_range) + target_df[target_columns] = scaler.fit_transform(target_df[target_columns]) + msg = f"Normalized {len(target_columns)} column(s) to range {feature_range}." + messages.append(msg) + if inplace: + data["last_select"] = target_df + location_msg = "Updated data['last_select'] in-place." + else: + data["last_select_normalized"] = target_df + location_msg = "Stored in data['last_select_normalized']." + kernel._send_message("stdout", f"{msg} {location_msg}") + except Exception as e: + operation_status = "error" + err_msg = f"Error during normalization: {e}" + kernel._send_message("stderr", err_msg) + messages.append(err_msg) + + # Display DataFrame + try: + self._send_html(kernel, target_df) + except Exception: + pass + + # Insert metadata + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "\n".join(target_columns) + message_str = "\n".join(messages) + metadata_message = f"Feature range: {feature_range}\n\nDetails:\n{message_str}" + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status=operation_status, + message=metadata_message, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + + return + + # fallback + kernel._send_message("stderr", "Unknown execution path reached.") + return diff --git a/mariadb_kernel/maria_magics/ml_commands/data_preprocessing/splitdata.py 
# Copyright (c) MariaDB Foundation.
# Distributed under the terms of the Modified BSD License.

from mariadb_kernel.maria_magics.maria_magic import MariaMagic
import shlex
from distutils import util  # NOTE(review): distutils is removed in Python 3.12; strtobool call below is try-wrapped
import pandas as pd
from sklearn.model_selection import train_test_split
import logging
import os
import re

# Optional helper to reliably get current DB name (if available).
try:
    from mariadb_kernel.sql_fetch import SqlFetch
except Exception:
    SqlFetch = None


class SplitData(MariaMagic):
    """
    %splitdata [test_size=0.2] [val_size=0.1] [stratify=colname] [shuffle=True|False]
               [random_state=42] [inplace=True|False] [train_name=last_select_train]
               [test_name=last_select_test] [val_name=last_select_val]

    Split the current data["last_select"] DataFrame into train/test/(validation).

    Execution metadata is recorded into table `magic_metadata` with fields:
    id, command_name, arguments, execution_timestamp, affected_columns,
    operation_status, message, db_name, user_name
    """

    def __init__(self, args=""):
        self.args = args

    def type(self):
        return "Line"

    def name(self):
        return "splitdata"

    def help(self):
        return (
            "%splitdata [test_size=0.2] [val_size=0.1] [stratify=colname] [shuffle=True|False]\n"
            "[random_state=42] [inplace=True|False] [train_name=name] [test_name=name] [val_name=name]\n"
            "Split last_select into train/test/(val). Execution metadata recorded in magic_metadata."
        )

    def _str_to_obj(self, s):
        """Best-effort token conversion: int -> float -> bool -> unquoted str -> raw str."""
        try:
            return int(s)
        except ValueError:
            try:
                return float(s)
            except ValueError:
                pass
        try:
            return bool(util.strtobool(s))
        except Exception:
            # Strip a matching pair of single or double quotes, otherwise pass through.
            if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'):
                return s[1:-1]
            return s

    def parse_args(self, input_str):
        """Parse shlex-split `key=value` tokens into a dict of typed values."""
        if not input_str or input_str.strip() == "":
            return {}
        pairs = dict(token.split("=", 1) for token in shlex.split(input_str))
        for k, v in pairs.items():
            pairs[k] = self._str_to_obj(v)
        return pairs

    def _send_html(self, kernel, df, title=None):
        """Render a DataFrame as an HTML table on iopub; best-effort, never raises."""
        try:
            html = df.to_html(index=False)
            if title:
                # FIX: the title markup f-string was garbled in the source file;
                # reconstructed as a simple heading above the table.
                html = f"<h4>{title}</h4>" + html
            kernel.send_response(kernel.iopub_socket, "display_data",
                                 {"data": {"text/html": html}, "metadata": {}})
        except Exception:
            pass

    # --------------- metadata / DB helpers (best-effort) ----------------
    def _get_mariadb_client(self, kernel):
        return getattr(kernel, "mariadb_client", None)

    def _get_logger(self, kernel):
        return getattr(kernel, "log", logging.getLogger(__name__))

    def _sql_escape(self, val):
        """Escape a value for a SQL single-quoted literal; None -> NULL."""
        if val is None:
            return "NULL"
        if not isinstance(val, str):
            val = str(val)
        return "'" + val.replace("'", "''") + "'"

    def _get_db_name(self, kernel):
        """Return the current database name, or "" if it cannot be determined."""
        mariadb_client = self._get_mariadb_client(kernel)
        log = self._get_logger(kernel)

        # Prefer SqlFetch when the helper module is importable.
        if SqlFetch is not None and mariadb_client is not None:
            try:
                sf = SqlFetch(mariadb_client, log)
                dbname = sf.get_db_name()
                if isinstance(dbname, str):
                    return dbname
            except Exception:
                log.debug("SqlFetch available but .get_db_name() failed; falling back.")

        if mariadb_client is None:
            return ""

        try:
            result = mariadb_client.run_statement("SELECT DATABASE();")
            if mariadb_client.iserror() or not result:
                return ""
            # The client returns an HTML table; try pandas first, regex second.
            try:
                dfs = pd.read_html(result)
                if dfs and len(dfs) > 0:
                    val = dfs[0].iloc[0, 0]
                    if isinstance(val, float) and pd.isna(val):
                        return ""
                    return str(val) if val is not None else ""
            except Exception:
                m = re.search(r"<td>(.*?)</td>", str(result), flags=re.S | re.I)
                if m:
                    txt = re.sub(r"<.*?>", "", m.group(1)).strip()
                    if txt.lower() == "null" or txt == "":
                        return ""
                    return txt
                txt = str(result).strip()
                if txt.lower() == "null" or txt == "":
                    return ""
                return txt
        except Exception:
            return ""
        return ""

    def _get_user_name(self, kernel):
        """Best-effort user name from kernel attributes, falling back to os.getlogin()."""
        candidates = [
            getattr(kernel, "user_name", None),
            getattr(kernel, "username", None),
            getattr(kernel, "user", None),
            getattr(kernel, "session", None),
        ]
        for cand in candidates:
            if cand is None:
                continue
            if isinstance(cand, str) and cand.strip():
                return cand
            try:
                # A session-like object may expose .user
                maybe = getattr(cand, "user", None)
                if isinstance(maybe, str) and maybe.strip():
                    return maybe
            except Exception:
                pass
        try:
            return os.getlogin()
        except Exception:
            return ""

    def _ensure_metadata_table(self, kernel, db_name):
        """Create magic_metadata (with rollback columns) if missing; best-effort."""
        mariadb_client = self._get_mariadb_client(kernel)
        log = self._get_logger(kernel)
        if mariadb_client is None:
            return
        table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata"
        create_sql = f"""
        CREATE TABLE IF NOT EXISTS {table_full_name} (
            id INT AUTO_INCREMENT PRIMARY KEY,
            command_name VARCHAR(255),
            arguments TEXT,
            execution_timestamp DATETIME,
            affected_columns TEXT,
            operation_status VARCHAR(50),
            message TEXT,
            db_name VARCHAR(255),
            user_name VARCHAR(255),
            rollback_token VARCHAR(255),
            backup_table VARCHAR(255),
            original_table VARCHAR(255)
        );
        """
        try:
            mariadb_client.run_statement(create_sql)
            if mariadb_client.iserror():
                log.error("Error creating magic_metadata table.")
        except Exception as e:
            log.error(f"Failed to ensure magic_metadata table: {e}")

    def _insert_metadata(self, kernel, command_name, arguments, affected_columns,
                         operation_status, message, db_name, user_name):
        """Insert one audit row into magic_metadata; values are SQL-escaped literals."""
        mariadb_client = self._get_mariadb_client(kernel)
        log = self._get_logger(kernel)
        if mariadb_client is None:
            return
        table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata"

        insert_sql = f"""
        INSERT INTO {table_full_name}
        (command_name, arguments, execution_timestamp, affected_columns,
         operation_status, message, db_name, user_name)
        VALUES (
            {self._sql_escape(command_name)},
            {self._sql_escape(arguments)},
            NOW(),
            {self._sql_escape(affected_columns)},
            {self._sql_escape(operation_status)},
            {self._sql_escape(message)},
            {self._sql_escape(db_name)},
            {self._sql_escape(user_name)}
        );
        """
        try:
            mariadb_client.run_statement(insert_sql)
            if mariadb_client.iserror():
                log.error("Error inserting into magic_metadata.")
        except Exception as e:
            log.error(f"Exception while inserting metadata: {e}")
    # ---------------- end metadata helpers ----------------

    def _fail(self, kernel, msg, db_name, user_name, affected=""):
        """Report an error to the user and record it in metadata, then let caller return.

        Extracted helper: the stderr + best-effort metadata-insert + return pattern
        was repeated verbatim six times in execute().
        """
        kernel._send_message("stderr", msg)
        try:
            self._insert_metadata(
                kernel=kernel,
                command_name=self.name(),
                arguments=self.args if isinstance(self.args, str) else str(self.args),
                affected_columns=affected,
                operation_status="error",
                message=msg,
                db_name=db_name,
                user_name=user_name
            )
        except Exception:
            pass

    def execute(self, kernel, data):
        """Split data["last_select"] into train/test/(val) and record metadata."""
        df = data.get("last_select")

        # Prepare metadata context up front so every outcome can be logged.
        db_name = self._get_db_name(kernel)
        user_name = self._get_user_name(kernel)
        try:
            self._ensure_metadata_table(kernel, db_name)
        except Exception:
            try:
                kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).")
            except Exception:
                pass

        if df is None or df.empty:
            self._fail(kernel, "No last_select found or DataFrame is empty.", db_name, user_name)
            return

        try:
            args = self.parse_args(self.args)
        except Exception:
            self._fail(kernel, "Error parsing arguments. Use key=value syntax.", db_name, user_name)
            return

        # Defaults
        test_size_arg = args.get("test_size", 0.2)
        val_size_arg = args.get("val_size", 0.0)
        stratify_col = args.get("stratify", None)
        shuffle = bool(args.get("shuffle", True))
        random_state = args.get("random_state", None)
        inplace = bool(args.get("inplace", True))

        train_name = args.get("train_name", "last_select_train")
        test_name = args.get("test_name", "last_select_test")
        val_name = args.get("val_name", "last_select_val")

        n_total = len(df)
        if n_total == 0:
            self._fail(kernel, "DataFrame has no rows to split.", db_name, user_name)
            return

        # FIX: sklearn's train_test_split raises an opaque ValueError when
        # stratify is combined with shuffle=False; reject the combination early.
        if stratify_col and not shuffle:
            self._fail(kernel, "stratify requires shuffle=True.", db_name, user_name)
            return

        def interpret_size(size_arg, total):
            """Interpret a size as a fraction in [0,1) or an integer row count."""
            if isinstance(size_arg, int):
                if size_arg < 0:
                    raise ValueError("Sizes must be non-negative.")
                return float(size_arg) / total
            try:
                size_f = float(size_arg)
            except Exception:
                raise ValueError("Size must be an int or float.")
            if size_f < 0:
                raise ValueError("Sizes must be non-negative.")
            if 0 <= size_f < 1:
                return size_f
            # >= 1 and integer-like: treat as an absolute row count.
            if size_f >= 1 and abs(size_f - int(size_f)) < 1e-9:
                if int(size_f) > total:
                    raise ValueError("Size count larger than dataset.")
                return float(int(size_f)) / total
            raise ValueError("If numeric and >=1, size must be an integer count <= total rows.")

        try:
            test_frac = interpret_size(test_size_arg, n_total)
            val_frac = interpret_size(val_size_arg, n_total)
        except ValueError as e:
            self._fail(kernel, f"Error interpreting sizes: {e}", db_name, user_name)
            return

        if test_frac + val_frac >= 1.0:
            self._fail(kernel, "Sum of test_size and val_size must be less than 1.0.", db_name, user_name)
            return

        # Prepare stratify array if requested.
        stratify_arr = None
        if stratify_col:
            if stratify_col not in df.columns:
                self._fail(kernel, f"Stratify column '{stratify_col}' not found in DataFrame.",
                           db_name, user_name)
                return
            stratify_arr = df[stratify_col].values

        try:
            # First split off the test set (test_frac of the original).
            if test_frac > 0:
                train_val_df, test_df = train_test_split(
                    df,
                    test_size=test_frac,
                    shuffle=shuffle,
                    random_state=random_state,
                    stratify=stratify_arr if stratify_arr is not None else None
                )
            else:
                train_val_df = df.copy(deep=True)
                test_df = pd.DataFrame(columns=df.columns)

            if val_frac <= 0:
                train_df = train_val_df
                val_df = pd.DataFrame(columns=df.columns)
            else:
                # Rescale the requested val fraction to the remaining rows.
                rel_val_frac = val_frac / (1.0 - test_frac)
                stratify_arr_second = None
                if stratify_arr is not None:
                    stratify_arr_second = train_val_df[stratify_col].values
                train_df, val_df = train_test_split(
                    train_val_df,
                    test_size=rel_val_frac,
                    shuffle=shuffle,
                    random_state=random_state,
                    stratify=stratify_arr_second if stratify_arr_second is not None else None
                )

            # Store results in the shared data dict under the requested names.
            data[test_name] = test_df
            data[val_name] = val_df
            data[train_name] = train_df
            if inplace:
                data["last_select"] = train_df

            msg = (
                f"Split completed: total={n_total}, train={len(train_df)}, "
                f"test={len(test_df)}, val={len(val_df)}."
            )
            kernel._send_message("stdout", msg)

            # Display small previews (best-effort).
            try:
                if not train_df.empty:
                    self._send_html(kernel, train_df.head(20), title=f"Train ({len(train_df)} rows)")
                if not val_df.empty:
                    self._send_html(kernel, val_df.head(20), title=f"Validation ({len(val_df)} rows)")
                if not test_df.empty:
                    self._send_html(kernel, test_df.head(20), title=f"Test ({len(test_df)} rows)")
            except Exception:
                pass

            # Record success metadata.
            try:
                args_for_db = self.args if isinstance(self.args, str) else str(self.args)
                affected_columns = stratify_col if stratify_col else "ALL_COLUMNS"
                message = (
                    f"train_name={train_name}, test_name={test_name}, val_name={val_name}\n"
                    f"train_count={len(train_df)}, test_count={len(test_df)}, val_count={len(val_df)}\n"
                    f"test_frac={test_frac}, val_frac={val_frac}, shuffle={shuffle}, random_state={random_state}"
                )
                self._insert_metadata(
                    kernel=kernel,
                    command_name=self.name(),
                    arguments=args_for_db,
                    affected_columns=affected_columns,
                    operation_status="success",
                    message=message,
                    db_name=db_name,
                    user_name=user_name
                )
            except Exception:
                try:
                    kernel._send_message("stdout", "Warning: failed to write metadata (continuing).")
                except Exception:
                    pass

        except Exception as e:
            self._fail(kernel, f"Error during splitting: {e}", db_name, user_name,
                       affected=stratify_col if stratify_col else "")
            return
# Copyright (c) MariaDB Foundation.
# Distributed under the terms of the Modified BSD License.

from mariadb_kernel.maria_magics.maria_magic import MariaMagic
import shlex
from distutils import util  # NOTE(review): distutils is removed in Python 3.12; strtobool use is try-wrapped
import pandas as pd
from sklearn.preprocessing import StandardScaler
import logging
import os
import re
import uuid
import time

# Optional helper to reliably get current DB name (if available).
try:
    from mariadb_kernel.sql_fetch import SqlFetch
except Exception:
    SqlFetch = None


class Standardize(MariaMagic):
    """
    %standardize [columns=col1,col2,...] [inplace=True|False]
                 [mode=preview|apply|rollback] [table=schema.table]
                 [confirm=true|false] [sample_size=100] [lock_timeout=10]

    Standardizes numeric columns using sklearn's StandardScaler
    (zero mean and unit variance).

    - columns: comma-separated list of columns to standardize.
      If omitted, all numeric columns are used.
    - inplace: if True (default), modifies data["last_select"] in-place.
      if False, stores result in data["last_select_standardized"].
    - mode: preview/apply/rollback (preview default).
      * preview: show local preview and optional DB stats if table=... provided.
      * apply: local in-place (default) or DB versioned apply when table=... and confirm=true.
      * rollback: restore a previously-created backup (needs mariadb_client).
    """

    def __init__(self, args=""):
        self.args = args

    def type(self):
        return "Line"

    def name(self):
        return "standardize"

    def help(self):
        return (
            "%standardize [columns=col1,col2,...] [inplace=True|False]\n"
            "   [mode=preview|apply|rollback] [table=schema.table] [confirm=true]\n"
            "   [sample_size=100] [lock_timeout=10]\n"
            "Standardizes numeric columns using sklearn's StandardScaler."
        )

    def _str_to_obj(self, s):
        """Best-effort token conversion: int -> float -> bool -> unquoted str -> raw str."""
        try:
            return int(s)
        except ValueError:
            try:
                return float(s)
            except ValueError:
                pass
        try:
            return bool(util.strtobool(s))
        except Exception:
            # Strip a matching pair of single or double quotes, otherwise pass through.
            if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'):
                return s[1:-1]
            return s

    def parse_args(self, input_str):
        """Parse shlex-split `key=value` tokens into a dict of typed values."""
        if not input_str or input_str.strip() == "":
            return {}
        pairs = dict(token.split("=", 1) for token in shlex.split(input_str))
        for k, v in pairs.items():
            pairs[k] = self._str_to_obj(v)
        return pairs

    def _send_html(self, kernel, df):
        """Render a DataFrame as an HTML table on iopub; best-effort, never raises."""
        try:
            html = df.to_html(index=False)
            kernel.send_response(kernel.iopub_socket, "display_data",
                                 {"data": {"text/html": html}, "metadata": {}})
        except Exception:
            pass

    # -------------------- metadata / DB helpers (best-effort) --------------------
    def _get_mariadb_client(self, kernel):
        return getattr(kernel, "mariadb_client", None)

    def _get_logger(self, kernel):
        return getattr(kernel, "log", logging.getLogger(__name__))

    def _sql_escape(self, val):
        """Escape a value for SQL single-quoted literal insert. None -> NULL"""
        if val is None:
            return "NULL"
        if not isinstance(val, str):
            val = str(val)
        return "'" + val.replace("'", "''") + "'"

    def _get_db_name(self, kernel):
        """Return the current database name, or "" if it cannot be determined."""
        mariadb_client = self._get_mariadb_client(kernel)
        log = self._get_logger(kernel)

        # Prefer SqlFetch when the helper module is importable.
        if SqlFetch is not None and mariadb_client is not None:
            try:
                sf = SqlFetch(mariadb_client, log)
                dbname = sf.get_db_name()
                if isinstance(dbname, str):
                    return dbname
            except Exception:
                log.debug("SqlFetch available but .get_db_name() failed; falling back.")

        if mariadb_client is None:
            return ""

        try:
            result = mariadb_client.run_statement("SELECT DATABASE();")
            if mariadb_client.iserror() or not result:
                return ""
            # The client returns an HTML table; try pandas first, regex second.
            try:
                dfs = pd.read_html(result)
                if dfs and len(dfs) > 0:
                    val = dfs[0].iloc[0, 0]
                    if isinstance(val, float) and pd.isna(val):
                        return ""
                    return str(val) if val is not None else ""
            except Exception:
                m = re.search(r"<td>(.*?)</td>", str(result), flags=re.S | re.I)
                if m:
                    txt = re.sub(r"<.*?>", "", m.group(1)).strip()
                    if txt.lower() == "null" or txt == "":
                        return ""
                    return txt
                txt = str(result).strip()
                if txt.lower() == "null" or txt == "":
                    return ""
                return txt
        except Exception:
            return ""
        return ""

    def _get_user_name(self, kernel):
        """Best-effort user name from kernel attributes, falling back to os.getlogin()."""
        candidates = [
            getattr(kernel, "user_name", None),
            getattr(kernel, "username", None),
            getattr(kernel, "user", None),
            getattr(kernel, "session", None),
        ]
        for cand in candidates:
            if cand is None:
                continue
            if isinstance(cand, str) and cand.strip():
                return cand
            try:
                # A session-like object may expose .user
                maybe = getattr(cand, "user", None)
                if isinstance(maybe, str) and maybe.strip():
                    return maybe
            except Exception:
                pass
        try:
            return os.getlogin()
        except Exception:
            return ""

    def _ensure_metadata_table(self, kernel, db_name):
        """Create magic_metadata if missing; includes rollback columns so
        apply/rollback can record and locate backup tables. Best-effort."""
        mariadb_client = self._get_mariadb_client(kernel)
        log = self._get_logger(kernel)
        if mariadb_client is None:
            return
        table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata"
        create_sql = f"""
        CREATE TABLE IF NOT EXISTS {table_full_name} (
            id INT AUTO_INCREMENT PRIMARY KEY,
            command_name VARCHAR(255),
            arguments TEXT,
            execution_timestamp DATETIME,
            affected_columns TEXT,
            operation_status VARCHAR(50),
            message TEXT,
            db_name VARCHAR(255),
            user_name VARCHAR(255),
            rollback_token VARCHAR(255),
            backup_table VARCHAR(255),
            original_table VARCHAR(255)
        );
        """
        try:
            mariadb_client.run_statement(create_sql)
            if mariadb_client.iserror():
                log.error("Error creating magic_metadata table.")
        except Exception as e:
            log.error(f"Failed to ensure magic_metadata table: {e}")

    def _insert_metadata(self, kernel, command_name, arguments, affected_columns,
                         operation_status, message, db_name, user_name,
                         rollback_token=None, backup_table=None, original_table=None):
        """Insert one audit row (optionally with rollback bookkeeping columns)."""
        mariadb_client = self._get_mariadb_client(kernel)
        log = self._get_logger(kernel)
        if mariadb_client is None:
            return
        table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata"

        insert_sql = f"""
        INSERT INTO {table_full_name}
        (command_name, arguments, execution_timestamp, affected_columns,
         operation_status, message, db_name, user_name, rollback_token, backup_table, original_table)
        VALUES (
            {self._sql_escape(command_name)},
            {self._sql_escape(arguments)},
            NOW(),
            {self._sql_escape(affected_columns)},
            {self._sql_escape(operation_status)},
            {self._sql_escape(message)},
            {self._sql_escape(db_name)},
            {self._sql_escape(user_name)},
            {self._sql_escape(rollback_token)},
            {self._sql_escape(backup_table)},
            {self._sql_escape(original_table)}
        );
        """
        try:
            mariadb_client.run_statement(insert_sql)
            if mariadb_client.iserror():
                log.error("Error inserting into magic_metadata.")
        except Exception as e:
            log.error(f"Exception while inserting metadata: {e}")

    def _acquire_lock(self, mariadb_client, lock_name, timeout=10):
        """Take a MariaDB advisory lock (GET_LOCK); returns False on any failure."""
        try:
            mariadb_client.run_statement(f"SELECT GET_LOCK('{lock_name}', {int(timeout)});")
            if mariadb_client.iserror():
                return False
            return True
        except Exception:
            return False

    def _release_lock(self, mariadb_client, lock_name):
        """Release a previously acquired advisory lock; best-effort."""
        try:
            mariadb_client.run_statement(f"SELECT RELEASE_LOCK('{lock_name}');")
        except Exception:
            pass

    def _table_exists(self, mariadb_client, table_full_name):
        """Probe a table with SELECT 1 ... LIMIT 1; True iff no client error."""
        try:
            mariadb_client.run_statement(f"SELECT 1 FROM {table_full_name} LIMIT 1;")
            return not mariadb_client.iserror()
        except Exception:
            return False
+ """ + if not res: + return None, None + try: + dfs = pd.read_html(res) + if dfs and len(dfs) > 0: + r = dfs[0].iloc[0] + v0 = None + v1 = None + try: + v0 = float(r.iloc[0]) if pd.notna(r.iloc[0]) else None + except Exception: + v0 = None + try: + v1 = float(r.iloc[1]) if r.size > 1 and pd.notna(r.iloc[1]) else None + except Exception: + v1 = None + return v0, v1 + except Exception: + vals = re.findall(r"(.*?)", str(res), flags=re.S | re.I) + if vals: + def tofloat(txt): + txt = re.sub(r"<.*?>", "", txt).strip() + if txt.lower() == "null" or txt == "": + return None + try: + return float(txt) + except Exception: + return None + v0 = tofloat(vals[0]) + v1 = tofloat(vals[1]) if len(vals) > 1 else None + return v0, v1 + # fallback: try whitespace split + try: + parts = [p for p in str(res).split() if p.strip()] + if len(parts) >= 2: + try: + return float(parts[0]), float(parts[1]) + except Exception: + return None, None + except Exception: + pass + return None, None + + # -------------------- end metadata helpers -------------------- + + def execute(self, kernel, data): + df = data.get("last_select") + # Prepare metadata context early + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + if df is None or (hasattr(df, "empty") and df.empty): + msg = "No last_select found or DataFrame is empty." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + try: + args = self.parse_args(self.args) + except Exception: + msg = "Error parsing arguments. Use key=value syntax." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + columns_arg = args.get("columns", None) + if isinstance(columns_arg, str): + columns = [c.strip() for c in columns_arg.split(",") if c.strip()] + elif isinstance(columns_arg, (list, tuple)): + columns = list(columns_arg) + else: + columns = None + + inplace = bool(args.get("inplace", True)) + mode = str(args.get("mode", "preview")).lower() + mode = mode if mode in {"preview", "apply", "rollback"} else "preview" + table_full = args.get("table", None) + confirm = bool(args.get("confirm", False)) + sample_size = int(args.get("sample_size", 100)) + lock_timeout = int(args.get("lock_timeout", 10)) + + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Determine target columns (numeric) + target_df = df if inplace else df.copy(deep=True) + if columns is None: + target_columns = [c for c in target_df.columns if pd.api.types.is_numeric_dtype(target_df[c])] + else: + missing_cols = [c for c in columns if c not in target_df.columns] + if missing_cols: + msg = f"Missing columns: {', '.join(missing_cols)}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(columns), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + target_columns = columns + + if not target_columns: + msg = "No numeric columns to standardize." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # ---------------- PREVIEW ---------------- + if mode == "preview": + try: + messages = [] + # local preview: means/std and sample transformed + local_stats = {} + for col in target_columns: + s = pd.to_numeric(df[col], errors="coerce").dropna() + if s.empty: + messages.append(f"Local: Column '{col}' has no numeric non-null values; skipped.") + local_stats[col] = (None, None) + continue + mean = float(s.mean()) + std = float(s.std(ddof=0)) # population std to match DB STDDEV_POP + local_stats[col] = (mean, std) + messages.append(f"Local: Column '{col}': mean={mean}, std={std}") + + kernel._send_message("stdout", "PREVIEW (local):\n" + "\n".join(messages)) + + # show sample transformed rows + try: + sample = df[target_columns].head(sample_size).copy() + for col in target_columns: + mean, std = local_stats.get(col, (None, None)) + if mean is None or std is None or std == 0: + # cannot standardize sensibly; show original + sample[col + "_std_preview"] = sample[col] + else: + sample[col + "_std_preview"] = (pd.to_numeric(sample[col], errors="coerce") - mean) / std + if not sample.empty: + self._send_html(kernel, sample.head(20)) + except Exception: + pass + + # DB preview if requested + if table_full and mariadb_client is not None: + db_msgs = [] + for col in target_columns: + try: + # use AVG and STDDEV_POP for stable population std + out = mariadb_client.run_statement(f"SELECT AVG({col}), STDDEV_POP({col}) FROM {table_full};") + if mariadb_client.iserror(): + db_msgs.append(f"DB: Column '{col}': AVG/STD query failed (permissions?).") + continue + mean_db, std_db = self._parse_two_value_result(out) + db_msgs.append(f"DB: Column 
'{col}': mean={mean_db}, std={std_db}") + if mean_db is None or std_db is None: + db_msgs.append(f"DB: Column '{col}': cannot compute mean/std (NULL).") + continue + if std_db == 0: + expr = f"CASE WHEN {col} IS NULL THEN NULL ELSE 0 END AS {col}" + else: + expr = f"CASE WHEN {col} IS NULL THEN NULL ELSE (({col} - {repr(mean_db)}) / {repr(std_db)}) END AS {col}" + db_msgs.append(f"DB: Column '{col}' expression: {expr}") + except Exception as e: + db_msgs.append(f"DB: Column '{col}' AVG/STD query exception: {e}") + kernel._send_message("stdout", "PREVIEW (db):\n" + "\n".join(db_msgs)) + + # log preview metadata + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='preview', + message='preview_completed', + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Error during preview: {e}") + return + + # ---------------- ROLLBACK ---------------- + if mode == "rollback": + if mariadb_client is None: + kernel._send_message("stderr", "Rollback requested but no mariadb_client available.") + return + token = args.get("rollback_token", None) + try: + if not token: + mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Could not find metadata for rollback (check permissions).") + return + out = mariadb_client.run_statement(f"SELECT rollback_token FROM {db_name}.magic_metadata WHERE command_name={self._sql_escape(self.name())} AND user_name={self._sql_escape(user_name)} ORDER BY execution_timestamp DESC LIMIT 1;") + m = re.search(r"(.*?)", str(out), flags=re.S | re.I) + if m: + token = re.sub(r"<.*?>", "", 
m.group(1)).strip() + if not token: + kernel._send_message("stderr", "No rollback_token found; cannot rollback safely.") + return + + out = mariadb_client.run_statement(f"SELECT backup_table, original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + m = re.search(r"(.*?).*?(.*?)", str(out), flags=re.S | re.I) + backup_table = None + original_table = None + if m: + backup_table = re.sub(r"<.*?>", "", m.group(1)).strip() + original_table = re.sub(r"<.*?>", "", m.group(2)).strip() + else: + try: + out_b = mariadb_client.run_statement(f"SELECT backup_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mb = re.search(r"(.*?)", str(out_b), flags=re.S | re.I) + if mb: + backup_table = re.sub(r"<.*?>", "", mb.group(1)).strip() + except Exception: + pass + try: + out_o = mariadb_client.run_statement(f"SELECT original_table FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + mo = re.search(r"(.*?)", str(out_o), flags=re.S | re.I) + if mo: + original_table = re.sub(r"<.*?>", "", mo.group(1)).strip() + except Exception: + pass + + if not backup_table: + kernel._send_message("stderr", "No backup table found in metadata for rollback token.") + return + + lock_name = f"standardize_rb_{token}" + self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + try: + if original_table: + if self._table_exists(mariadb_client, original_table): + original_old = f"{original_table}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {original_table} TO {original_old}, {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename tables during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {original_table}; previous {original_table} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + 
command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={original_table};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {original_table};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {original_table}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={original_table}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=original_table + ) + else: + # try to infer original table name from arguments in metadata + out_args = mariadb_client.run_statement(f"SELECT arguments FROM {db_name}.magic_metadata WHERE rollback_token={self._sql_escape(token)} LIMIT 1;") + margs = re.search(r"(.*?)", str(out_args), flags=re.S | re.I) + inferred_original = None + if margs: + args_txt = re.sub(r"<.*?>", "", margs.group(1)).strip() + mm = re.search(r"table\s*=\s*([^\s,]+)", args_txt) + if mm: + inferred_original = mm.group(1).strip() + if inferred_original: + if self._table_exists(mariadb_client, inferred_original): + original_old = f"{inferred_original}_prerollback_{token}" + mariadb_client.run_statement(f"RENAME TABLE {inferred_original} TO {original_old}, {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename during rollback (check permissions).") + return + 
kernel._send_message("stdout", f"Rollback: restored {backup_table} -> {inferred_original}; previous {inferred_original} renamed to {original_old}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={inferred_original};previous_saved_as={original_old}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + mariadb_client.run_statement(f"RENAME TABLE {backup_table} TO {inferred_original};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to rename backup to inferred original during rollback (check permissions).") + return + kernel._send_message("stdout", f"Rollback: renamed {backup_table} -> {inferred_original}.") + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='rollback', + message=f'restored_to={inferred_original}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=inferred_original + ) + else: + kernel._send_message("stderr", "Could not determine original table name for rollback. Manual restoration required.") + return + finally: + self._release_lock(mariadb_client, lock_name) + except Exception as e: + kernel._send_message("stderr", f"Rollback error: {e}") + return + + # ---------------- APPLY ---------------- + if mode == "apply": + # DB-target apply if table provided and mariadb_client present + if table_full and mariadb_client is not None: + if not confirm: + kernel._send_message("stderr", "DB apply requires confirm=true to proceed. 
Preview first, then re-run with confirm=true.") + return + + token = str(uuid.uuid4()).replace('-', '')[:16] + backup_table = f"{table_full}_backup_{token}" + new_table = f"{table_full}_vnew_{token}" + + # collect mean/std from DB per column + col_stats = {} + msgs = [] + for col in target_columns: + try: + out = mariadb_client.run_statement(f"SELECT AVG({col}), STDDEV_POP({col}) FROM {table_full};") + if mariadb_client.iserror(): + msgs.append(f"{col}: AVG/STD query failed.") + col_stats[col] = (None, None) + continue + mean_db, std_db = self._parse_two_value_result(out) + col_stats[col] = (mean_db, std_db) + msgs.append(f"{col}: mean={mean_db}, std={std_db}") + except Exception as e: + col_stats[col] = (None, None) + msgs.append(f"{col}: exception computing stats: {e}") + + # build select expressions (preserve non-target columns) + select_exprs = [] + for c in df.columns: + if c in target_columns: + mean_db, std_db = col_stats.get(c, (None, None)) + if mean_db is None or std_db is None: + # cannot compute, keep original as-is + select_exprs.append(c) + elif std_db == 0: + # constant zero (or map to 0) + select_exprs.append(f"CASE WHEN {c} IS NULL THEN NULL ELSE 0 END AS {c}") + else: + select_exprs.append(f"CASE WHEN {c} IS NULL THEN NULL ELSE (({c} - {repr(mean_db)}) / {repr(std_db)}) END AS {c}") + else: + select_exprs.append(c) + + select_sql = ", ".join(select_exprs) + + try: + lock_name = f"standardize_apply_{token}" + got_lock = self._acquire_lock(mariadb_client, lock_name, timeout=lock_timeout) + if not got_lock: + kernel._send_message("stderr", "Could not acquire advisory lock; aborting apply.") + return + + # create new table CTAS + mariadb_client.run_statement(f"CREATE TABLE {new_table} AS SELECT {select_sql} FROM {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "Failed to create new table for apply (CTAS failed).") + return + + # atomic rename: original -> backup, new -> original + mariadb_client.run_statement(f"RENAME 
TABLE {table_full} TO {backup_table}, {new_table} TO {table_full};") + if mariadb_client.iserror(): + kernel._send_message("stderr", "RENAME TABLE failed (apply may be inconsistent).") + return + + kernel._send_message("stdout", f"Apply completed: original preserved as {backup_table}.") + # log metadata with rollback token + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns='\n'.join(target_columns), + operation_status='applied', + message=f'applied_backup={backup_table};details={"|".join(msgs)}', + db_name=db_name, + user_name=user_name, + rollback_token=token, + backup_table=backup_table, + original_table=table_full + ) + + # attempt to refresh last_select with a sample + try: + mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + fresh = mariadb_client.run_statement(f"SELECT * FROM {table_full} LIMIT {sample_size};") + try: + df_list = pd.read_html(fresh) + if df_list and len(df_list) > 0: + data["last_select"] = df_list[0] + try: + self._send_html(kernel, data["last_select"]) + except Exception: + pass + except Exception: + kernel._send_message("stdout", "Applied to DB; could not refresh last_select from DB.") + except Exception: + pass + + except Exception as e: + kernel._send_message("stderr", f"Apply (DB versioned) failed: {e}") + log.exception(e) + finally: + self._release_lock(mariadb_client, lock_name) + return + + else: + # local apply (existing behavior) + operation_status = "success" + messages = [] + try: + scaler = StandardScaler() + target_df[target_columns] = scaler.fit_transform(target_df[target_columns]) + summary_msg = f"Standardized {len(target_columns)} column(s) (mean=0, std=1)." + messages.append(summary_msg) + if inplace: + data["last_select"] = target_df + location_msg = "Updated data['last_select'] in-place." 
+ kernel._send_message("stdout", f"{summary_msg} {location_msg}") + else: + data["last_select_standardized"] = target_df + location_msg = "Stored in data['last_select_standardized']." + kernel._send_message("stdout", f"{summary_msg} {location_msg}") + except Exception as e: + operation_status = "error" + err_msg = f"Error during standardization: {e}" + kernel._send_message("stderr", err_msg) + messages.append(err_msg) + + # show + try: + self._send_html(kernel, target_df) + except Exception: + pass + + # metadata + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "\n".join(target_columns) + message_str = "\n".join(messages) + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status=operation_status, + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + + return + + # fallback + kernel._send_message("stderr", "Unknown execution path reached.") + return diff --git a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/ml_pipeline.py b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/ml_pipeline.py new file mode 100644 index 0000000..81913ea --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/ml_pipeline.py @@ -0,0 +1,916 @@ +# mlpipeline.py +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. 
+ +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +import numpy as np +import json +import logging +import os +import re + +# Import the other pipeline stages (paths kept as in your original snippet) +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.missing import Missing +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.dropmissing import DropMissing +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.fillmissing import FillMissing +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.outliers import Outliers +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.dropoutliers import DropOutliers +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.clipoutliers import ClipOutliers +from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.encode import Encode +from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.normalize import Normalize +from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.standardize import Standardize +from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.splitdata import SplitData # placeholder safety +from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.splitdata import SplitData +from mariadb_kernel.maria_magics.ml_commands.model_training.train_model import TrainModel +from mariadb_kernel.maria_magics.ml_commands.model_training.evaluate_model import EvaluateModel +from mariadb_kernel.maria_magics.ml_commands.model_training.savemodel import SaveModel +from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.select_features import SelectFeatures +from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.select_model import SelectModel + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + + +class MLPipeline(MariaMagic): + """ + %mlpipeline target=target_col 
problem=classification|regression [features=col1,col2,...] [model=rf|auto] + [save_path=/path/to/model.joblib] + + Automates an end-to-end ML pipeline on data['last_select'] with minimal input. + """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "mlpipeline" + + def help(self): + return ( + "%mlpipeline target=target_col problem=classification|regression [features=col1,col2,...] [model=rf|auto]\n" + "[save_path=/path/to/model.joblib]\n" + "Automates an ML pipeline: cleaning, encoding, feature selection, preprocessing, model selection, training, and evaluation." + ) + + def _str_to_obj(self, s): + try: + return int(s) + except ValueError: + try: + return float(s) + except ValueError: + pass + try: + return bool(util.strtobool(s)) + except Exception: + try: + return json.loads(s) + except Exception: + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df, title=None): + try: + html = df.to_html(index=False) + if title: + html = f"

{title}

" + html + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {"text/html": html}, "metadata": {}}) + except Exception: + pass + + def _send_message(self, kernel, channel, message): + kernel._send_message(channel, f"[MLPipeline] {message}") + + # -------------------- metadata / DB helpers (best-effort) -------------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, 
"user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, 
arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # -------------------- end metadata helpers -------------------- + + def execute(self, kernel, data): + # Prepare metadata context early + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + df = data.get("last_select") + if df is None or df.empty: + msg = "No last_select found or DataFrame is empty." + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + + try: + args = self.parse_args(self.args) + except Exception as e: + msg = f"Error parsing arguments: {e}. Use key=value syntax." 
+ self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + + # Parse arguments + target = args.get("target") + problem = args.get("problem") + features_arg = args.get("features") + model_name_arg = args.get("model", "auto") + save_path = args.get("save_path", None) + + # Validate required arguments + if not target: + msg = "target argument is required (target=target_col)." + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + if not problem: + msg = "problem argument is required (problem=classification|regression)." + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + if problem not in ("classification", "regression"): + msg = "problem must be 'classification' or 'regression'." 
+ self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + if target not in df.columns: + msg = f"Target column '{target}' not found in DataFrame." + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + + # Parse features or set to all columns except target if not provided + if features_arg: + if isinstance(features_arg, str): + features = [c.strip() for c in features_arg.split(",") if c.strip()] + elif isinstance(features_arg, (list, tuple)): + features = list(features_arg) + else: + msg = "features must be comma-separated string or list." + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + else: + features = [col for col in df.columns if col != target] + if not features: + msg = "No features available after excluding target column." 
+ self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + + # Validate features + missing = [c for c in features if c not in df.columns] + if missing: + msg = f"Missing feature columns in DataFrame: {', '.join(missing)}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + + # Set defaults + inplace = True + missing_strategy = "drop" + outlier_action = "none" + encode_method = "onehot" + scale_method = "standardize" + test_size = 0.2 + val_size = 0.0 + stratify = target if problem == "classification" else None + shuffle = True + random_state = None + model_store_name = "last_model" + train_name = "last_select_train" + test_name = "last_select_test" + val_name = "last_select_val" + feature_method = "correlation" + k_features = 5 + primary_metric = "accuracy" if problem == "classification" else "r2" + cv = 0 + + # Work on a copy if not inplace + working_df = df if inplace else df.copy(deep=True) + data["last_select"] = working_df + + # Step 1: Handle missing values + try: + drop_args = f"columns={','.join(features + [target])}" + DropMissing(drop_args).execute(kernel, data) + cur_df = data.get("last_select") + if cur_df is None or cur_df.empty: + msg = "DataFrame is empty after dropping missing values." 
+ self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=drop_args, + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + # Refresh working_df reference after cleaning + working_df = cur_df + except Exception as e: + msg = f"Error handling missing values: {e}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + + # Step 2: Encode categorical features + try: + # Recompute cat_columns on current working_df + cat_columns = [c for c in features if c in working_df.columns and working_df[c].dtype in ["object", "category"]] + if cat_columns: + encode_args = f"method={encode_method} columns={','.join(cat_columns)} inplace=True drop_original=True mode=apply confirm=true" + # reset any previous encoder + data["last_select_encoder"] = None + Encode(encode_args).execute(kernel, data) + + # after Encode runs, refresh working_df from shared data to see new columns + working_df = data.get("last_select", working_df) + + if encode_method == "onehot": + encoder = data.get("last_select_encoder") + if not encoder: + msg = "Encoder not found after encoding. Ensure %encode saves the encoder to data['last_select_encoder']." 
+ self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=encode_args, + affected_columns="\n".join(cat_columns), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + try: + # get_feature_names_out may require passing the original column names + try: + feature_names = list(encoder.get_feature_names_out(cat_columns)) + except Exception: + # fallback for older sklearn or if encoder doesn't support that call + cats = getattr(encoder, "categories_", None) + feature_names = [] + if cats is not None: + for cname, cat_list in zip(cat_columns, cats): + for cat in cat_list: + feature_names.append(f"{cname}_{str(cat)}") + else: + # As a last resort, build feature names from current working_df columns + # by selecting columns that start with the column name + "_" + feature_names = [] + for cname in cat_columns: + feature_names += [c for c in working_df.columns if c.startswith(cname + "_")] + # remove duplicates and ensure these features exist + feature_names = [str(fn) for fn in feature_names] + features = [c for c in features if c not in cat_columns] + feature_names + except Exception as e: + msg = f"Failed to retrieve encoded feature names: {e}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=encode_args, + affected_columns="\n".join(cat_columns), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + + elif encode_method == "label": + # label encoding created columns
_lbl + features = [f"{c}_lbl" if c in cat_columns else c for c in features] + + elif encode_method == "ordinal": + # ordinal encoding created columns _ord + features = [f"{c}_ord" if c in cat_columns else c for c in features] + + # Refresh working_df again (defensive) + working_df = data.get("last_select", working_df) + + # Verify encoded features exist + missing_encoded = [f for f in features if f not in working_df.columns] + if missing_encoded: + # helpful debug output: list what columns do exist that are related + related_columns = [] + for c in cat_columns: + related_columns += [col for col in working_df.columns if col.startswith(c + "_") or col.startswith(c + "_lbl") or col.startswith(c + "_ord")] + msg = f"Encoded features not found in DataFrame: {', '.join(missing_encoded)}" + self._send_message(kernel, "stderr", msg) + if related_columns: + self._send_message(kernel, "stderr", f"Available related columns: {', '.join(related_columns)}") + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=encode_args, + affected_columns="\n".join(missing_encoded), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + except Exception as e: + msg = f"Error during encoding: {e}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + + # Step 3: Feature selection (if features not provided) + if not features_arg: + try: + select_features_args = f"target={target} method={feature_method} k={k_features} problem={problem} inplace={inplace}" + SelectFeatures(select_features_args).execute(kernel, data) + features = 
data.get("selected_features", []) + if not features: + msg = "Feature selection failed to return features." + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=select_features_args, + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + # Verify selected features exist + working_df = data.get("last_select", working_df) + missing_features = [f for f in features if f not in working_df.columns] + if missing_features: + msg = f"Selected features not found in DataFrame: {', '.join(missing_features)}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=select_features_args, + affected_columns="\n".join(missing_features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + except Exception as e: + msg = f"Error during feature selection: {e}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + + # Step 4: Scale numeric features + try: + working_df = data.get("last_select", working_df) + num_columns = [c for c in features if c in working_df.columns and pd.api.types.is_numeric_dtype(working_df[c])] + if num_columns: + scale_args = f"columns={','.join(num_columns)} inplace=True" + Standardize(scale_args).execute(kernel, data) + except Exception as e: + msg = f"Error during scaling: {e}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + 
command_name=self.name(), + arguments=scale_args if 'scale_args' in locals() else (self.args if isinstance(self.args, str) else str(self.args)), + affected_columns="\n".join(num_columns) if 'num_columns' in locals() else (",".join(features) if 'features' in locals() else ""), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + + # Step 5: Split data + try: + split_args = f"test_size={test_size} val_size={val_size} shuffle={shuffle} " \ + f"train_name={train_name} test_name={test_name} val_name={val_name} inplace={inplace}" + if stratify: + split_args += f" stratify={stratify}" + if random_state is not None: + split_args += f" random_state={random_state}" + + SplitData(split_args).execute(kernel, data) + + # Safely check that the split produced valid DataFrames + train_df = data.get(train_name) + test_df = data.get(test_name) + + if train_df is None or train_df.empty or test_df is None or test_df.empty: + msg = "Data splitting failed to produce non-empty train/test sets." 
+ self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=split_args, + affected_columns="\n".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + + except Exception as e: + msg = f"Error during data splitting: {e}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=split_args if 'split_args' in locals() else (self.args if isinstance(self.args, str) else str(self.args)), + affected_columns="\n".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + + # Step 6: Model selection or training + try: + # Treat 'auto' and None the same → use SelectModel + if not model_name_arg or model_name_arg == "auto": + select_model_args = ( + f"features={','.join(features)} target={target} cv=5 " + f"primary_metric={primary_metric} problem={problem} output_name={model_store_name} inplace={inplace}" + ) + SelectModel(select_model_args).execute(kernel, data) + self._send_message(kernel, "stdout", "Automatically selected best model via SelectModel.") + else: + # Train a specific model + train_args = ( + f"model={model_name_arg} features={','.join(features)} target={target} " + f"model_name={model_store_name} test_name={test_name} cv={cv} inplace={inplace} problem={problem}" + ) + TrainModel(train_args).execute(kernel, data) + self._send_message(kernel, "stdout", f"Trained specified model '{model_name_arg}'.") + + # Validate model creation + model_obj = data.get(model_store_name) + if model_obj is None: + msg = f"No model object created. Ensure SelectModel or TrainModel supports problem='{problem}'." 
+ self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + + except Exception as e: + msg = f"Error during model training/selection: {e}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + + # Step 7: Evaluate model + try: + eval_args = f"model_name={model_store_name} test_name={test_name} problem={problem}" + EvaluateModel(eval_args).execute(kernel, data) + except Exception as e: + msg = f"Error during model evaluation: {e}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=eval_args if 'eval_args' in locals() else (self.args if isinstance(self.args, str) else str(self.args)), + affected_columns="\n".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + + # Step 8: Save model if requested + if save_path: + try: + # Ensure correct key for SaveModel command + save_args = f"model_name_in_data={model_store_name} save_path={save_path}" + SaveModel(save_args).execute(kernel, data) + self._send_message(kernel, "stdout", f"Model saved to {save_path}.") + except Exception as e: + msg = f"Error saving model: {e}" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + 
kernel=kernel, + command_name=self.name(), + arguments=save_args if 'save_args' in locals() else (self.args if isinstance(self.args, str) else str(self.args)), + affected_columns=model_store_name, + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + else: + msg = "You must provide save_path=/path/to/file.joblib" + self._send_message(kernel, "stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return False + + # Summary and success metadata + success_msg = "ML pipeline completed successfully." + self._send_message(kernel, "stdout", success_msg) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "\n".join(features) if 'features' in locals() else "" + message_str = f"{success_msg} model={model_store_name} saved_to={save_path}" + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status="success", + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + self._send_message(kernel, "stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + + return True diff --git a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_features.py b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_features.py new file mode 100644 index 0000000..6aca48a --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_features.py @@ -0,0 +1,679 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. 
+ +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +import numpy as np +from sklearn.feature_selection import SelectKBest, f_classif, f_regression, RFE, mutual_info_classif, mutual_info_regression, chi2, VarianceThreshold +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import LogisticRegression, Lasso +from sklearn.preprocessing import StandardScaler, MinMaxScaler +import logging +import os +import re + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + + +class SelectFeatures(MariaMagic): + """ + %select_features target=target_col + [method=correlation|rf_importance|rfe|mutual_info|chi2|anova|l1_selection|variance] + [k=5] [problem=classification|regression] + [output_name=selected_features] [inplace=True|False] + + Identify the best features for training a model on data['last_select']. + Uses all columns except the target column as features. + + Execution metadata is recorded in table `magic_metadata`. + """ + + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "select_features" + + def help(self): + return "Identify the best features for model training from data['last_select']." 
+ + + # -------------------- small utilities -------------------- + def _str_to_obj(self, s): + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + try: + import json + return json.loads(s) + except Exception: + pass + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df, title=None): + try: + html = df.to_html(index=False) + if title: + html = f"

{title}

" + html + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {"text/html": html}, "metadata": {}}) + except Exception: + pass + + # -------------------- metadata / DB helpers (best-effort) -------------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, 
"session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( 
+ {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # -------------------- end metadata helpers -------------------- + + + def execute(self, kernel, data): + # Prepare metadata context early + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + # Load training DataFrame + df = data.get("last_select") + if df is None or df.empty: + msg = "No last_select found or DataFrame is empty." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + try: + args = self.parse_args(self.args) + except Exception: + msg = "Error parsing arguments. Use key=value syntax." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + target = args.get("target") + method = args.get("method", "correlation").lower() + k = args.get("k", 5) + problem_override = args.get("problem", None) + output_name = args.get("output_name", "selected_features") + inplace = bool(args.get("inplace", True)) + + if not target: + msg = "target argument is required (target=target_col)." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + if target not in df.columns: + msg = f"Target column '{target}' not found in DataFrame." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Use all columns except the target as features + features = [col for col in df.columns if col != target] + if not features: + msg = "No features available after excluding target column." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Determine problem type + if problem_override: + problem = problem_override.lower() + if problem not in ("classification", "regression"): + msg = "problem must be 'classification' or 'regression'." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + else: + tgt_ser = df[target] + if pd.api.types.is_numeric_dtype(tgt_ser): + nunique = int(tgt_ser.nunique(dropna=True)) + non_null_count = max(1, len(tgt_ser.dropna())) + uniq_prop = nunique / non_null_count + if pd.api.types.is_float_dtype(tgt_ser) or nunique > 20 or uniq_prop > 0.05: + problem = "regression" + else: + problem = "classification" + else: + problem = "classification" + + # Prepare data + X = df[features].copy() + y = df[target].copy() + + # Handle missing values (simple imputation for feature selection) + try: + if problem == "regression": + X = X.fillna(X.mean(numeric_only=True)) + else: + X = X.fillna(X.mode().iloc[0]) + except Exception: + msg = "Features contain non-numeric data or unhandled missing values." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + if X.isna().any().any(): + msg = "Features contain non-numeric data or unhandled missing values." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Scale data for methods that require it + if method in ("chi2", "l1_selection"): + scaler = MinMaxScaler() if method == "chi2" else StandardScaler() + try: + X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index) + except Exception as e: + msg = f"Error scaling data: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Feature selection + try: + if method == "correlation": + correlations = X.corrwith(y, method="pearson").abs() + scores = correlations.sort_values(ascending=False) + selected_features = scores.head(k).index.tolist() + result_df = pd.DataFrame({ + "Feature": scores.index, + "Score": scores.values + }) + + elif method == "rf_importance": + model = RandomForestClassifier() if problem == "classification" else RandomForestRegressor() + model.fit(X, y) + importances = pd.Series(model.feature_importances_, index=features) + scores = 
importances.sort_values(ascending=False) + selected_features = scores.head(k).index.tolist() + result_df = pd.DataFrame({ + "Feature": scores.index, + "Score": scores.values + }) + + elif method == "rfe": + estimator = RandomForestClassifier() if problem == "classification" else RandomForestRegressor() + selector = RFE(estimator, n_features_to_select=k) + selector.fit(X, y) + ranking = pd.Series(selector.ranking_, index=features) + scores = 1 / (ranking + 1) + selected_features = ranking[ranking == 1].index.tolist() + result_df = pd.DataFrame({ + "Feature": ranking.index, + "Score": scores, + "Ranking": ranking + }).sort_values("Score", ascending=False) + + elif method == "mutual_info": + score_func = mutual_info_classif if problem == "classification" else mutual_info_regression + selector = SelectKBest(score_func=score_func, k=k) + selector.fit(X, y) + scores = pd.Series(selector.scores_, index=features) + scores = scores.sort_values(ascending=False) + selected_features = scores.head(k).index.tolist() + result_df = pd.DataFrame({ + "Feature": scores.index, + "Score": scores.values + }) + + elif method == "chi2": + if problem != "classification": + msg = "chi2 method is only for classification problems." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + if (X < 0).any().any(): + msg = "chi2 requires non-negative features." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + selector = SelectKBest(score_func=chi2, k=k) + selector.fit(X, y) + scores = pd.Series(selector.scores_, index=features) + scores = scores.sort_values(ascending=False) + selected_features = scores.head(k).index.tolist() + result_df = pd.DataFrame({ + "Feature": scores.index, + "Score": scores.values + }) + + elif method == "anova": + score_func = f_classif if problem == "classification" else f_regression + selector = SelectKBest(score_func=score_func, k=k) + selector.fit(X, y) + scores = pd.Series(selector.scores_, index=features) + scores = scores.sort_values(ascending=False) + selected_features = scores.head(k).index.tolist() + result_df = pd.DataFrame({ + "Feature": scores.index, + "Score": scores.values + }) + + elif method == "l1_selection": + model = LogisticRegression(penalty="l1", solver="liblinear", max_iter=1000) if problem == "classification" else Lasso(alpha=0.01) + model.fit(X, y) + scores = pd.Series(np.abs(model.coef_.ravel() if problem == "classification" else model.coef_), index=features) + scores = scores.sort_values(ascending=False) + selected_features = scores[scores > 0].head(k).index.tolist() + result_df = pd.DataFrame({ + "Feature": scores.index, + "Score": scores.values + }) + + elif method == "variance": + selector = VarianceThreshold(threshold=0.0) + selector.fit(X) + variances = pd.Series(selector.variances_, index=features) + scores = variances.sort_values(ascending=False) + selected_features = scores.head(k).index.tolist() + result_df = pd.DataFrame({ + "Feature": scores.index, + "Score": scores.values + }) + + else: + msg = "method must be one of 'correlation', 'rf_importance', 'rfe', 
'mutual_info', 'chi2', 'anova', 'l1_selection', or 'variance'." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + except Exception as e: + msg = f"Error during feature selection: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Store results in data dict + try: + data[output_name] = selected_features + data[output_name + "_meta"] = { + "method": method, + "problem": problem, + "target": target, + "k": k, + "all_scores": result_df.to_dict() + } + except Exception as e: + msg = f"Error storing results: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(selected_features) if 'selected_features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Display results + try: + self._send_html(kernel, result_df, title=f"Feature Selection Results (method={method})") + except Exception: + pass + + success_msg = f"Selected {len(selected_features)} features saved to data['{output_name}']: {', '.join(selected_features)}" + kernel._send_message("stdout", success_msg) + + # Insert metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = 
"\n".join(selected_features) + message_str = success_msg + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status="success", + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + + return diff --git a/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py new file mode 100644 index 0000000..bd16bdd --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/ml_pipeline/select_model.py @@ -0,0 +1,699 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +import numpy as np +from sklearn.model_selection import cross_val_score +from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor +from sklearn.svm import SVC +from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor +from sklearn.neural_network import MLPClassifier, MLPRegressor +import logging +import os +import re + +# Optional external libraries +_XGBOOST_AVAILABLE = False +_LIGHTGBM_AVAILABLE = False +_CATBOOST_AVAILABLE = False +try: + from xgboost import XGBClassifier, XGBRegressor + _XGBOOST_AVAILABLE = True +except Exception: + pass + +try: + from lightgbm import LGBMClassifier, LGBMRegressor + _LIGHTGBM_AVAILABLE = True +except Exception: + pass + +try: + from catboost import CatBoostClassifier, CatBoostRegressor + _CATBOOST_AVAILABLE = True +except Exception: + pass + +# Optional helper to 
reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + + +class SelectModel(MariaMagic): + """ + %select_model target=target_col + [features=col1,col2] [cv=5] [primary_metric=accuracy|r2|f1|precision|recall|mse|mae] + [problem=classification|regression] [output_name=best_model] + [inplace=True|False] [model_params={'rf': {'n_estimators': 100}, 'logistic': {'C': 1.0}}] + + Select the best model by comparing all available models on data['last_select'] using cross-validation. + If features are not provided, uses data['selected_features'] from %select_features. + Tests all metrics (classification: accuracy, f1, precision, recall; regression: r2, mse, mae). + Stores the best model in data[output_name] based on primary_metric and displays a table of performances. + """ + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "select_model" + + def help(self): + return "Select the best model for training from data['last_select'] using cross-validation." + + def _str_to_obj(self, s): + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + try: + import json + return json.loads(s) + except Exception: + pass + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df, title=None): + try: + html = df.to_html(index=False) + if title: + html = f"

{title}

" + html + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {"text/html": html}, "metadata": {}}) + except Exception: + pass + + # -------------------- small utilities for metadata -------------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", 
None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + 
{self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # -------------------- end metadata helpers -------------------- + + def _choose_model(self, name, problem, params=None): + p = params or {} + name = name.lower() + if name in ("logistic", "logistic_regression", "lr"): + if problem != "classification": + raise ValueError("LogisticRegression is for classification problems.") + return LogisticRegression(max_iter=1000, **p) + if name in ("rf", "random_forest"): + return RandomForestClassifier(**p) if problem == "classification" else RandomForestRegressor(**p) + if name in ("svc", "svm"): + if problem != "classification": + raise ValueError("SVC is for classification problems.") + return SVC(probability=True, **p) + if name in ("linear", "linear_regression"): + if problem != "regression": + raise ValueError("LinearRegression is for regression problems.") + return LinearRegression(**p) + if name == "ridge": + if problem != "regression": + raise ValueError("Ridge is for regression problems.") + return Ridge(**p) + if name == "lasso": + if problem != "regression": + raise ValueError("Lasso is for regression problems.") + return Lasso(**p) + if name == "knn": + return KNeighborsClassifier(**p) if problem == "classification" else KNeighborsRegressor(**p) + if name == "gbm": + return GradientBoostingClassifier(**p) if problem == "classification" else GradientBoostingRegressor(**p) + if name == "ada": + return AdaBoostClassifier(**p) if problem == "classification" else AdaBoostRegressor(**p) + if name == "mlp": + return MLPClassifier(max_iter=1000, **p) if problem == "classification" else MLPRegressor(max_iter=1000, **p) + if name == "xgboost": + if not 
_XGBOOST_AVAILABLE: + raise ImportError("xgboost not available in this environment.") + return XGBClassifier(**p) if problem == "classification" else XGBRegressor(**p) + if name == "lightgbm": + if not _LIGHTGBM_AVAILABLE: + raise ImportError("lightgbm not available in this environment.") + return LGBMClassifier(**p) if problem == "classification" else LGBMRegressor(**p) + if name == "catboost": + if not _CATBOOST_AVAILABLE: + raise ImportError("catboost not available in this environment.") + p = dict(p) + p.setdefault("verbose", False) + return CatBoostClassifier(**p) if problem == "classification" else CatBoostRegressor(**p) + raise ValueError(f"Unknown model name '{name}'") + + def execute(self, kernel, data): + # Prepare metadata context early + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + # Load training DataFrame + df = data.get("last_select") + if df is None or df.empty: + msg = "No last_select found or DataFrame is empty." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + try: + args = self.parse_args(self.args) + except Exception: + msg = "Error parsing arguments. Use key=value syntax." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + features_arg = args.get("features") + target = args.get("target") + cv = int(args.get("cv", 5) or 5) + primary_metric = args.get("primary_metric", None) + problem_override = args.get("problem", None) + output_name = args.get("output_name", "best_model") + inplace = bool(args.get("inplace", True)) + model_params = args.get("model_params", {}) or {} + + if not target: + msg = "target argument is required (target=target_col)." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Use selected_features if features not provided + if not features_arg: + features = data.get("selected_features") + if not features: + msg = "No features provided and no selected_features found. Run %select_features first." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + else: + if isinstance(features_arg, str): + features = [c.strip() for c in features_arg.split(",") if c.strip()] + elif isinstance(features_arg, (list, tuple)): + features = list(features_arg) + else: + msg = "features must be comma-separated string or list." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + missing = [c for c in features + [target] if c not in df.columns] + if missing: + msg = f"Missing columns in DataFrame: {', '.join(missing)}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Determine problem type + if problem_override: + problem = problem_override.lower() + if problem not in ("classification", "regression"): + msg = "problem must be 'classification' or 'regression'." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + else: + tgt_ser = df[target] + if pd.api.types.is_numeric_dtype(tgt_ser): + nunique = int(tgt_ser.nunique(dropna=True)) + non_null_count = max(1, len(tgt_ser.dropna())) + uniq_prop = nunique / non_null_count + if pd.api.types.is_float_dtype(tgt_ser) or nunique > 20 or uniq_prop > 0.05: + problem = "regression" + else: + problem = "classification" + else: + problem = "classification" + + # Define all available models based on problem type + classification_models = ["logistic", "rf", "svm", "knn", "gbm", "ada", "mlp"] + regression_models = ["linear", "ridge", "lasso", "rf", "knn", "gbm", "ada", "mlp"] + if _XGBOOST_AVAILABLE: + classification_models.append("xgboost") + regression_models.append("xgboost") + if _LIGHTGBM_AVAILABLE: + classification_models.append("lightgbm") + regression_models.append("lightgbm") + if _CATBOOST_AVAILABLE: + classification_models.append("catboost") + regression_models.append("catboost") + models = classification_models if problem == "classification" else regression_models + + # Define all metrics + metrics = { + "classification": ["accuracy", "f1", "precision", "recall"], + "regression": ["r2", "mse", "mae"] + } + if primary_metric is None: + primary_metric = "accuracy" if problem == "classification" else "r2" + if primary_metric not in metrics[problem]: + msg = f"Invalid primary_metric '{primary_metric}' for {problem}. Choose from {', '.join(metrics[problem])}." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Prepare data + X = df[features].copy() + y = df[target].copy() + + # Handle missing values + X = X.fillna(X.mean(numeric_only=True)) if problem == "regression" else X.fillna(X.mode().iloc[0]) + if X.isna().any().any(): + msg = "Features contain non-numeric data or unhandled missing values." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Evaluate models across all metrics + results = [] + best_model = None + best_score = -float("inf") if primary_metric not in ("mse", "mae") else float("inf") + best_model_name = None + + for model_name in models: + try: + # Get model-specific parameters + params = model_params.get(model_name, {}) if isinstance(model_params, dict) else {} + model = self._choose_model(model_name, problem, params) + model_result = {"Model": model_name} + + # Evaluate all metrics + for metric in metrics[problem]: + scoring = metric if metric in ("accuracy", "f1", "precision", "recall", "r2") else ( + "neg_mean_squared_error" if metric == "mse" else "neg_mean_absolute_error" + ) + cv_scores = cross_val_score(model, X, y, cv=cv, scoring=scoring) + mean_score = np.mean(cv_scores) + std_score = np.std(cv_scores) + if metric in ("mse", "mae"): + mean_score = -mean_score # Convert to positive + model_result[f"{metric}_Mean"] = mean_score + model_result[f"{metric}_Std"] = std_score + + 
results.append(model_result) + + # Update best model based on primary_metric + current_score = model_result[f"{primary_metric}_Mean"] + if primary_metric in ("mse", "mae"): + if current_score < best_score: + best_score = current_score + best_model = model + best_model_name = model_name + else: + if current_score > best_score: + best_score = current_score + best_model = model + best_model_name = model_name + + except Exception as e: + # Log the model-level failure but continue with other models + kernel._send_message("stderr", f"Error evaluating model '{model_name}': {e}") + continue + + if not results: + msg = "No models were successfully evaluated." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Create results DataFrame + result_df = pd.DataFrame(results) + for metric in metrics[problem]: + result_df[f"{metric}_Mean"] = result_df[f"{metric}_Mean"].round(4) + result_df[f"{metric}_Std"] = result_df[f"{metric}_Std"].round(4) + result_df = result_df.sort_values(f"{primary_metric}_Mean", ascending=primary_metric in ("mse", "mae")) + + # Fit the best model on the full training data + try: + best_model.fit(X, y) + except Exception as e: + msg = f"Error fitting best model '{best_model_name}': {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Store the best model and metadata + try: + data[output_name] = best_model + data[output_name + "_meta"] = { + 
"model_name": best_model_name, + "problem": problem, + "features": features, + "target": target, + "primary_metric": primary_metric, + "cv": cv, + "score": float(best_score), + "all_results": result_df.to_dict() + } + if hasattr(best_model, "classes_"): + data[output_name + "_meta"]["classes"] = list(getattr(best_model, "classes_")) + except Exception as e: + msg = f"Error storing best model: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Display results + self._send_html(kernel, result_df, title=f"Model Selection Results (primary_metric={primary_metric})") + success_msg = f"Best model '{best_model_name}' (mean {primary_metric}={best_score:.4f}) saved to data['{output_name}']." 
+ kernel._send_message("stdout", success_msg) + + # Insert metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "\n".join(features) + message_str = success_msg + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status="success", + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + + return diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/evaluate_model.py b/mariadb_kernel/maria_magics/ml_commands/model_training/evaluate_model.py new file mode 100644 index 0000000..b038f24 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/evaluate_model.py @@ -0,0 +1,715 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. 

from mariadb_kernel.maria_magics.maria_magic import MariaMagic
import shlex
from distutils import util  # NOTE(review): distutils is removed in Python 3.12 (PEP 632) — confirm target version
import pandas as pd
import numpy as np
import joblib
import json
import logging
import os
import re

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,
    mean_squared_error, mean_absolute_error, r2_score,
    roc_auc_score, classification_report
)
from sklearn.preprocessing import LabelEncoder

import matplotlib
# The non-interactive backend must be selected before pyplot is imported,
# so confusion-matrix figures render in headless notebook environments.
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import io
import base64

# SqlFetch (when importable) is the preferred way to resolve the current
# database name; it is optional, so degrade gracefully without it.
try:
    from mariadb_kernel.sql_fetch import SqlFetch
except Exception:
    SqlFetch = None


class EvaluateModel(MariaMagic):
    """
    %evaluate_model [model_name=last_model] [test_name=last_select_test] [pred_name=last_preds]
                    [problem=classification|regression]

    Visual evaluation of a trained model: a metrics card, a confusion-matrix
    plot, a classification report, and a preview table of actual vs predicted
    values.

    Every run is also logged to the magic_metadata table (created on demand):
    error rows on failure, one success row on a completed evaluation.
    """

    def __init__(self, args=""):
        # Raw argument string as typed after the magic name; parsed lazily
        # in execute() via parse_args().
        self.args = args

    def type(self):
        # This magic consumes a single line, not a whole cell.
        return "Line"

    def name(self):
        return "evaluate_model"

    def help(self):
        return "Evaluate a trained model on a test DataFrame and show metrics + predictions."
+ + # reuse helpers from previous version + def _str_to_obj(self, s): + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + try: + return json.loads(s) + except Exception: + pass + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df, title=None): + try: + html = df.to_html(index=False) + if title: + html = f"

{title}

" + html + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {"text/html": html}, "metadata": {}}) + except Exception: + pass + + def _send_raw_html(self, kernel, html): + """Send raw HTML to the frontend.""" + try: + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {"text/html": html}, "metadata": {}}) + except Exception: + pass + + def _plot_confusion_matrix_to_datauri(self, cm, labels): + """Draw confusion matrix (matplotlib) and return data URI PNG.""" + fig, ax = plt.subplots(figsize=(6, 5)) + im = ax.imshow(cm, interpolation='nearest') + ax.set_title("Confusion matrix") + ax.set_xlabel("Predicted") + ax.set_ylabel("Actual") + + # Set tick labels + ax.set_xticks(np.arange(len(labels))) + ax.set_yticks(np.arange(len(labels))) + ax.set_xticklabels(labels, rotation=45, ha="right") + ax.set_yticklabels(labels) + + # Annotate cells + thresh = cm.max() / 2.0 if cm.size else 0 + for i in range(cm.shape[0]): + for j in range(cm.shape[1]): + ax.text(j, i, format(int(cm[i, j]), 'd'), + ha="center", va="center", + fontsize=10) + + fig.tight_layout() + + buf = io.BytesIO() + fig.savefig(buf, format="png", bbox_inches="tight") + plt.close(fig) + buf.seek(0) + data = base64.b64encode(buf.read()).decode("ascii") + return f"data:image/png;base64,{data}" + + # -------------------- metadata helpers (copied/adapted) -------------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. 
None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return 
+ table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + # -------------------- end metadata helpers -------------------- + + def execute(self, kernel, data): + # Prepare metadata context early + db_name = 
self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + try: + args = self.parse_args(self.args) + except Exception: + msg = "Error parsing arguments. Use key=value syntax." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + model_store_name = args.get("model_name", args.get("model", "last_model")) + test_name = args.get("test_name", "last_select_test") + pred_name = args.get("pred_name", "last_preds") + problem_override = args.get("problem", None) + + # fetch model + model = data.get(model_store_name) + if model is None: + msg = f"No model found in data['{model_store_name}']. Train and save a model first." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # fetch test set + test_df = data.get(test_name) + if test_df is None or not isinstance(test_df, pd.DataFrame) or test_df.empty: + msg = f"No test DataFrame found in data['{test_name}'] or it is empty." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # infer problem if not provided + if problem_override: + problem = problem_override.lower() + else: + is_classifier = any(attr in dir(model) for attr in ("predict_proba", "decision_function", "classes_")) + problem = "classification" if is_classifier else "regression" + + # get meta + meta = data.get(model_store_name + "_meta", {}) or {} + features = meta.get("features") + target_col = meta.get("target") or meta.get("target_col") + + # fallback target inference + if not target_col: + if features: + possible_targets = [c for c in test_df.columns if c not in features] + if len(possible_targets) == 1: + target_col = possible_targets[0] + + if not target_col: + msg = ("Target column not found in model meta and could not be inferred from test DataFrame. " + "Set data[model_name + '_meta']['target']='' when training, or pass target info in meta.") + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features) if features else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + if target_col not in test_df.columns: + msg = f"Target column '{target_col}' not present in test DataFrame '{test_name}'." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features) if features else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + if not features: + msg = "Model metadata does not contain 'features' list. Cannot build X_test." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + missing_features = [c for c in features if c not in test_df.columns] + if missing_features: + msg = f"Test DataFrame missing feature columns: {', '.join(missing_features)}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + X_test = test_df[features].copy() + y_true_orig = test_df[target_col].copy() + + # Predict + try: + preds_raw = model.predict(X_test) + except Exception as e: + msg = f"Error during prediction: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # predict_proba if available + pred_proba = None + if problem == "classification" and hasattr(model, 
"predict_proba"): + try: + proba = model.predict_proba(X_test) + if proba.ndim == 2 and proba.shape[1] == 2: + pred_proba = proba[:, 1].tolist() + else: + pred_proba = proba.tolist() + except Exception: + pred_proba = None + + # human-readable preds + preds_display = preds_raw + model_classes = getattr(model, "classes_", None) + try: + if model_classes is not None and pd.api.types.is_integer_dtype(np.asarray(preds_raw).dtype): + preds_display = np.asarray(model_classes)[np.asarray(preds_raw).astype(int)] + except Exception: + preds_display = preds_raw + + # predictions DataFrame + preds_df = test_df.copy(deep=True) + preds_df["_predicted"] = preds_display + if pred_proba is not None: + preds_df["_pred_proba"] = pred_proba + data[pred_name] = preds_df + + # metrics calculation + out_lines = [] + metrics_html = "" + cm_image_uri = None + + # We'll collect a compact metrics summary to store in metadata message on success + metrics_summary = {} + + if problem == "classification": + y_true_vals = np.asarray(y_true_orig) + preds_vals = np.asarray(preds_display) + + def is_mixed(a, b): + return (pd.api.types.is_numeric_dtype(a) and not pd.api.types.is_numeric_dtype(b)) or \ + (pd.api.types.is_numeric_dtype(b) and not pd.api.types.is_numeric_dtype(a)) + + if is_mixed(y_true_vals.dtype, preds_vals.dtype): + y_metric = np.asarray(y_true_orig.astype(str)) + p_metric = np.asarray(pd.Series(preds_display).astype(str)) + else: + if pd.api.types.is_numeric_dtype(y_true_vals) and pd.api.types.is_numeric_dtype(preds_vals): + y_metric = y_true_vals.astype(float) + p_metric = preds_vals.astype(float) + else: + y_metric = np.asarray(y_true_orig.astype(str)) + p_metric = np.asarray(pd.Series(preds_display).astype(str)) + + try: + acc = accuracy_score(y_metric, p_metric) + prec = precision_score(y_metric, p_metric, average="weighted", zero_division=0) + rec = recall_score(y_metric, p_metric, average="weighted", zero_division=0) + f1 = f1_score(y_metric, p_metric, average="weighted", 
zero_division=0) + cm = confusion_matrix(y_metric, p_metric) + except Exception as e: + msg = f"Error computing classification metrics: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + metrics_summary.update({"accuracy": float(acc), "precision": float(prec), + "recall": float(rec), "f1": float(f1)}) + + # ROC AUC if possible + roc_text = "N/A" + if pred_proba is not None and model_classes is not None: + try: + class_to_idx = {str(c): i for i, c in enumerate(model_classes)} + y_idx = np.array([class_to_idx.get(str(v), None) for v in y_true_orig]) + if None in y_idx: + roc_text = "Not computable: some test classes missing from model.classes_." + else: + proba_arr = np.asarray(pred_proba) + if proba_arr.ndim == 1: + roc_auc = roc_auc_score(y_idx.astype(int), proba_arr.astype(float)) + roc_text = f"{roc_auc:.4f}" + metrics_summary["roc_auc"] = float(roc_auc) + else: + roc_auc = roc_auc_score(y_idx.astype(int), proba_arr, multi_class="ovr", average="weighted") + roc_text = f"{roc_auc:.4f}" + metrics_summary["roc_auc"] = float(roc_auc) + except Exception: + roc_text = "Computation failed." + + # Prepare metrics HTML card + metrics_html = f""" +
+
+

Metrics

+
cells + m = re.findall(r"(.*?)
+ + + + + +
Accuracy{acc:.4f}
Precision (w){prec:.4f}
Recall (w){rec:.4f}
F1 (w){f1:.4f}
ROC AUC{roc_text}
+ + """ + + # Render confusion matrix as image and embed + try: + if model_classes is not None: + label_names = [str(c) for c in model_classes] + else: + # derive from the union of unique labels in y_metric and p_metric + uniq = sorted(set(np.unique(y_metric).tolist() + np.unique(p_metric).tolist()), key=lambda x: str(x)) + label_names = [str(x) for x in uniq] + cm_arr = np.asarray(cm, dtype=int) + cm_image_uri = self._plot_confusion_matrix_to_datauri(cm_arr, label_names) + metrics_html += f'
confusion matrix
' + except Exception: + metrics_html += '
Confusion matrix image failed to render.
' + + metrics_html += "" # close flex container + + # classification report text + try: + target_names = [str(c) for c in model_classes] if model_classes is not None else None + report = classification_report(y_metric, p_metric, zero_division=0, target_names=target_names) + except Exception: + report = "Classification report not available." + + else: + # regression branch + try: + preds_num = np.asarray(preds_raw).astype(float) + y_true_num = np.asarray(y_true_orig).astype(float) + rmse = float(np.sqrt(mean_squared_error(y_true_num, preds_num))) + mae = float(mean_absolute_error(y_true_num, preds_num)) + r2 = float(r2_score(y_true_num, preds_num)) + except Exception as e: + msg = f"Error computing regression metrics: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + metrics_summary.update({"rmse": float(rmse), "mae": float(mae), "r2": float(r2)}) + + metrics_html = f""" +
+

Regression metrics

+ + + + +
RMSE{rmse:.4f}
MAE{mae:.4f}
{r2:.4f}
+
+ """ + report = None + + # Build final HTML to display (metrics + classification report text) + html_parts = [ + f"
", + metrics_html + ] + if problem == "classification": + html_parts.append("

Classification report

") + html_parts.append(f"
{report}
") + # also add textual confusion matrix below if image not present + if cm_image_uri is None and 'cm' in locals(): + html_parts.append("

Confusion matrix

")
+                html_parts.append(str(cm.tolist()))
+                html_parts.append("
") + html_parts.append("
") + + # send HTML + try: + self._send_raw_html(kernel, "\n".join(html_parts)) + except Exception: + pass + + # then show predictions table (actual vs predicted) using your helper + try: + # show a limited set (up to 200 rows) + display_df = preds_df[[target_col, "_predicted"] + (["_pred_proba"] if "_pred_proba" in preds_df.columns else [])] + self._send_html(kernel, display_df.head(200), title="Predictions preview (actual vs predicted)") + except Exception: + pass + + # Insert success metadata + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_cols_str = "\n".join(features) + # craft a concise message describing main metrics + if problem == "classification": + main_metrics = ", ".join(f"{k}={v:.4f}" for k, v in metrics_summary.items()) + else: + main_metrics = ", ".join(f"{k}={v:.4f}" for k, v in metrics_summary.items()) + message_str = f"Evaluation success. Model='{model_store_name}', test='{test_name}', preds_saved='{pred_name}'. {main_metrics}" + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_cols_str, + operation_status="success", + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + + return diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/loadmodel.py b/mariadb_kernel/maria_magics/ml_commands/model_training/loadmodel.py new file mode 100644 index 0000000..1fffe14 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/loadmodel.py @@ -0,0 +1,370 @@ +import joblib +import shlex +import json +from distutils import util +import logging +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import os +import re +import pandas as pd + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import 
SqlFetch +except Exception: + SqlFetch = None + + +def _str_to_obj(s): + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + try: + return json.loads(s) + except Exception: + pass + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + +class LoadModel(MariaMagic): + """ + %load_model load_path=/tmp/model.joblib [target_key=last_model] + + Loads a locally saved .joblib model into the `data` dictionary. + + This version: + - detects save formats: + * raw model object (backwards compatible) + * dict {"model": , "meta": } (round-trip with SaveModel) + - restores meta into data[target_key + '_meta'] when available + - attempts minimal inference of features from model if present (feature_names_in_) + - logs metadata to magic_metadata table (creates it if necessary) + """ + + def __init__(self, args=""): + self.args = args + self.log = logging.getLogger(__name__) + + def type(self): + return "Line" + + def name(self): + return "load_model" + + def help(self): + return "Load a saved model from a local .joblib file." + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = _str_to_obj(v) + return pairs + + # -------------------- small utilities for metadata -------------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. 
None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return 
+ table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + # -------------------- end metadata helpers -------------------- + + def execute(self, kernel, data): + # Prepare metadata context early + db_name = 
self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + # parse args + try: + args = self.parse_args(self.args) + except Exception: + msg = "Error parsing arguments." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + load_path = args.get("load_path") + target_key = args.get("target_key", "last_model") + + if not load_path: + msg = "You must provide load_path=/path/to/file.joblib" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=target_key or "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Attempt to load + try: + loaded = joblib.load(load_path) + + # Detect saved structure: either raw model, or {"model": model, "meta": meta} + model_obj = None + restored_meta = None + + if isinstance(loaded, dict) and "model" in loaded: + model_obj = loaded.get("model") + restored_meta = loaded.get("meta") if isinstance(loaded.get("meta"), dict) else None + else: + # backwards-compatible: raw model object + model_obj = loaded + restored_meta = None + + # Minimal inference: if no meta restored, but model exposes feature_names_in_, capture it + try: + if not isinstance(restored_meta, dict): + inferred_meta = {} + if hasattr(model_obj, "feature_names_in_"): + try: + inferred_meta["features"] = list(getattr(model_obj, "feature_names_in_")) + except 
Exception: + pass + # If we inferred something, assign restored_meta to keep behavior consistent + if inferred_meta: + restored_meta = inferred_meta + # store model and meta into data + data[target_key] = model_obj + if isinstance(restored_meta, dict) and restored_meta: + data[target_key + "_meta"] = restored_meta + except Exception: + # if for any reason storing meta fails, just store the model + data[target_key] = model_obj + + # Prepare success message + meta_info = "" + try: + if isinstance(restored_meta, dict) and restored_meta: + feat = restored_meta.get("features") + tgt = restored_meta.get("target") or restored_meta.get("target_col") + parts = [] + if feat: + parts.append(f"features[{len(feat)}]") + if tgt: + parts.append(f"target={tgt}") + if parts: + meta_info = " (" + ", ".join(parts) + ")" + except Exception: + meta_info = "" + + success_msg = f"Loaded model from {load_path} → data['{target_key}']{meta_info}" + kernel._send_message("stdout", success_msg) + + # write success metadata + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_cols_str = target_key + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_cols_str, + operation_status="success", + message=success_msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + + except Exception as e: + msg = f"Failed to load model: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=target_key, + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/predict.py 
b/mariadb_kernel/maria_magics/ml_commands/model_training/predict.py new file mode 100644 index 0000000..38ea857 --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/predict.py @@ -0,0 +1,553 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import pandas as pd +import numpy as np +import shlex +import json +from distutils import util +import logging +import os +import re + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + + +class Predict(MariaMagic): + """ + %predict_model model_name=last_model data_name=last_select_test output_name=last_preds + [show_cols=10] [proba=True|False] + + You can also provide inline values: + %predict_model model_name=last_model data_name=[38, 80000.0] output_name=last_preds + + This version records metadata into magic_metadata table (creates it if needed), + logging errors and a final success entry on completion. + """ + + def __init__(self, args=""): + self.args = args + self.log = logging.getLogger(__name__) + + def type(self): + return "Line" + + def name(self): + return "predict_model" + + def help(self): + return "Run predictions using a trained model stored in data[model_name], with optional inline feature values." 
+ + def _str_to_obj(self, s): + # try to interpret numbers, booleans, lists, or JSON + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + try: + return json.loads(s) + except Exception: + pass + # strip quotes + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df, title=None): + try: + html = df.to_html(index=False) + if title: + html = f"

{title}

" + html + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {"text/html": html}, "metadata": {}}) + except Exception: + pass + + # -------------------- metadata helpers (copied/adapted) -------------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, 
"session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( 
+ {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + # -------------------- end metadata helpers -------------------- + + def execute(self, kernel, data): + # Prepare metadata context early + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + # parse args + try: + args = self.parse_args(self.args) + except Exception: + msg = "Error parsing arguments. Use key=value syntax." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + model_name = args.get("model_name", "last_model") + data_arg = args.get("data_name", "last_select_test") + output_name = args.get("output_name", "last_preds") + show_cols = int(args.get("show_cols", 10)) + show_proba = bool(args.get("proba", False)) + + # --- 1. Retrieve model --- + model = data.get(model_name) + if model is None: + msg = f"No model found in data['{model_name}']. Train one first." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=model_name or "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # --- 2. Load metadata --- + meta = data.get(model_name + "_meta", {}) + features = meta.get("features") + problem = meta.get("problem", "regression") + + if not features: + kernel._send_message("stderr", "Model meta missing 'features'. Using numeric columns only if applicable.") + # leave features as empty list so we can attempt to infer columns from df later + features = [] + + # --- 3. Determine input mode --- + df = None + inline_used = False + inline_vals = None + if isinstance(data_arg, list): + # Inline list of feature values + inline_used = True + inline_vals = data_arg + if not features: + msg = "Cannot use inline values: model has no stored feature names." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features) if features else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + if len(data_arg) != len(features): + msg = f"Number of values ({len(data_arg)}) doesn't match expected features ({len(features)}): {features}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + df = pd.DataFrame([data_arg], columns=features) + kernel._send_message("stdout", f"Using inline feature values for prediction: {dict(zip(features, data_arg))}") + + elif isinstance(data_arg, str) and data_arg.startswith("[") and data_arg.endswith("]"): + # If user passed JSON array as string, parse it + try: + vals = json.loads(data_arg) + inline_used = True + inline_vals = vals + if not features: + msg = "Cannot use inline values: model has no stored feature names." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features) if features else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + if len(vals) != len(features): + msg = f"Number of values ({len(vals)}) doesn't match expected features ({len(features)}): {features}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + df = pd.DataFrame([vals], columns=features) + kernel._send_message("stdout", f"Using inline feature values for prediction: {dict(zip(features, vals))}") + except Exception as e: + msg = f"Error parsing inline data list: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features) if features else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + else: + # DataFrame-based mode + df = data.get(data_arg) + if df is None or (isinstance(df, pd.DataFrame) and df.empty): + msg = f"No DataFrame found in data['{data_arg}'] or it's empty." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features) if features else data_arg, + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + if not isinstance(df, pd.DataFrame): + msg = f"data['{data_arg}'] is not a pandas DataFrame." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=data_arg, + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # --- 4. Align columns to features --- + df_cols = df.columns.tolist() + missing = [c for c in features if c not in df_cols] + extra = [c for c in df_cols if c not in features] + + if missing: + # we fill missing with zeros (behavior from original) + kernel._send_message("stderr", f"Missing columns not in input: {missing}. Filling with zeros.") + if extra: + kernel._send_message("stderr", f"Ignoring extra columns not seen during training: {extra}.") + + # If no features are known, attempt to use numeric columns from df + if not features: + # prefer numeric columns + numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist() + if numeric_cols: + features = numeric_cols + kernel._send_message("stdout", f"Inferred features from numeric columns: {features}") + else: + msg = "No features available and could not infer numeric columns for prediction." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + X = pd.DataFrame({col: df[col] if col in df.columns else 0 for col in features}) + + # --- 5. Run predictions --- + try: + if show_proba and problem == "classification" and hasattr(model, "predict_proba"): + preds = model.predict_proba(X) + if hasattr(model, "classes_"): + class_labels = [str(c) for c in model.classes_] + pred_df = pd.DataFrame(preds, columns=[f"proba_{c}" for c in class_labels]) + else: + pred_df = pd.DataFrame(preds, columns=[f"proba_{i}" for i in range(preds.shape[1])]) + else: + y_pred = model.predict(X) + pred_df = pd.DataFrame(y_pred, columns=["prediction"]) + except Exception as e: + msg = f"Error during prediction: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=",".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # --- 6. 
Save & display --- + data[output_name] = pred_df + + try: + # prefer html display if kernel supports it + try: + self._send_html(kernel, pred_df.head(show_cols), title=f"Predictions ({output_name})") + except Exception: + kernel._send_message("stdout", pred_df.head(show_cols).to_string(index=False)) + except Exception: + pass + + success_msg = f"Predictions stored in data['{output_name}'] with shape={pred_df.shape}" + kernel._send_message("stdout", success_msg) + + # Insert success metadata + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_cols_str = "\n".join(features) if features else "" + # include a short summary mentioning whether inline values were used + inline_part = f" inline_values={inline_vals}" if inline_used else "" + message_str = f"Prediction success. model={model_name}, data_arg={data_arg}, output={output_name}, shape={pred_df.shape}{inline_part}" + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_cols_str, + operation_status="success", + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + + return diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py b/mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py new file mode 100644 index 0000000..460edfe --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/savemodel.py @@ -0,0 +1,385 @@ +import joblib +import shlex +import json +import time +from distutils import util +import logging +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import os +import re +import pandas as pd + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + + +def _str_to_obj(s): + try: + 
return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + try: + return json.loads(s) + except Exception: + pass + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + +class SaveModel(MariaMagic): + """ + %save_model model_name_in_data=last_model save_path=/tmp/model.joblib [overwrite=True|False] + + Saves a trained model (from the `data` dict) to a local file using joblib. + + This version: + - If data contains model_key + "_meta", saves a dict {"model": model, "meta": meta} + - Otherwise saves the raw model object (backwards compatible) + - Writes metadata rows to magic_metadata (creates table if needed) + """ + + def __init__(self, args=""): + self.args = args + self.log = logging.getLogger(__name__) + + def type(self): + return "Line" + + def name(self): + return "save_model" + + def help(self): + return "Save a trained model to a local .joblib file." + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = _str_to_obj(v) + return pairs + + # -------------------- small utilities for metadata -------------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. 
None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return 
+ table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + # -------------------- end metadata helpers -------------------- + + def execute(self, kernel, data): + # Prepare metadata context early + db_name = 
self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + # parse args + try: + args = self.parse_args(self.args) + except Exception: + msg = "Error parsing arguments." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + model_key = args.get("model_name_in_data", "last_model") + save_path = args.get("save_path") + overwrite = bool(args.get("overwrite", False)) + + if not save_path: + msg = "You must provide save_path=/path/to/file.joblib" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=model_key or "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + model_obj = data.get(model_key) + if model_obj is None: + msg = f"No model found in data['{model_key}']." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=model_key, + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # If file exists and overwrite=False + if os.path.exists(save_path) and not overwrite: + msg = f"File {save_path} already exists. Use overwrite=True to replace it." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=model_key, + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Try to include model metadata (if present) into the saved file so LoadModel can restore it + meta = data.get(model_key + "_meta") + + try: + os.makedirs(os.path.dirname(save_path), exist_ok=True) + if isinstance(meta, dict) and meta: + # Save both model and meta in a single object for round-trip + save_obj = {"model": model_obj, "meta": meta} + joblib.dump(save_obj, save_path) + else: + # No meta available, save raw model (backwards compatible) + joblib.dump(model_obj, save_path) + + success_msg = f"Model from data['{model_key}'] saved to {save_path}" + kernel._send_message("stdout", success_msg) + + # Insert success metadata + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = model_key + # If meta exists, include a short description of key meta fields (features/target) in the message + meta_info = "" + try: + if isinstance(meta, dict): + feat = meta.get("features") + tgt = meta.get("target") or meta.get("target_col") + parts = [] + if feat: + # limit length to avoid overly long field in DB + parts.append(f"features[{len(feat)}]") + if tgt: + parts.append(f"target={tgt}") + if parts: + meta_info = " (" + ", ".join(parts) + ")" + except Exception: + meta_info = "" + message_str = f"Saved model to {save_path}{meta_info}" + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status="success", + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except 
Exception: + pass + + except Exception as e: + msg = f"Failed to save model: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns=model_key, + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return diff --git a/mariadb_kernel/maria_magics/ml_commands/model_training/train_model.py b/mariadb_kernel/maria_magics/ml_commands/model_training/train_model.py new file mode 100644 index 0000000..905ad8a --- /dev/null +++ b/mariadb_kernel/maria_magics/ml_commands/model_training/train_model.py @@ -0,0 +1,655 @@ +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +import shlex +from distutils import util +import pandas as pd +import numpy as np +import joblib +import json +import logging +import os +import re + +from sklearn.model_selection import cross_val_score +from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor +from sklearn.svm import SVC +from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor +from sklearn.neural_network import MLPClassifier, MLPRegressor + +# Optional external libraries +_XGBOOST_AVAILABLE = False +_LIGHTGBM_AVAILABLE = False +_CATBOOST_AVAILABLE = False +try: + from xgboost import XGBClassifier, XGBRegressor + _XGBOOST_AVAILABLE = True +except Exception: + pass + +try: + from lightgbm import LGBMClassifier, LGBMRegressor + _LIGHTGBM_AVAILABLE = True +except Exception: + pass + +try: + from catboost import CatBoostClassifier, CatBoostRegressor + _CATBOOST_AVAILABLE = True +except Exception: + pass + +# 
Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + + +class TrainModel(MariaMagic): + """ + %train_model model= features=col1,col2 target=target_col + [cv=0] [problem=classification|regression] + [model_name=last_model] [pred_name=last_preds] [test_name=last_select_test] + [save_path=/path/to/model.joblib] [inplace=True|False] [model_params={'n':1}] + + Train a model on data["last_select"] (TRAINING set). This magic DOES NOT perform + splitting or scaling — run your preprocessing and %splitdata beforehand. + + This version adds metadata logging to magic_metadata table similar to SelectModel: + - Ensures magic_metadata exists in current database + - Inserts error/success rows for operations + """ + def __init__(self, args=""): + self.args = args + + def type(self): + return "Line" + + def name(self): + return "train_model" + + def help(self): + return "Train a model on data['last_select'] (no split or scaling)." + + def _str_to_obj(self, s): + # try int/float/bool, then JSON, then string unquote + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + # try json + try: + return json.loads(s) + except Exception: + pass + # strip quotes + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_str): + if not input_str or input_str.strip() == "": + return {} + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _send_html(self, kernel, df, title=None): + try: + html = df.to_html(index=False) + if title: + html = f"

{title}

" + html + kernel.send_response(kernel.iopub_socket, "display_data", + {"data": {"text/html": html}, "metadata": {}}) + except Exception: + pass + + # -------------------- small utilities for metadata -------------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas + try: + dfs = pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", 
None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape(arguments) + affected_sql = self._sql_escape(affected_columns) + status_sql = self._sql_escape(operation_status) + message_sql = self._sql_escape(message) + db_sql = self._sql_escape(db_name) + user_sql = self._sql_escape(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + 
{self._sql_escape(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + # -------------------- end metadata helpers -------------------- + + def _choose_model(self, name, problem, params=None): + p = params or {} + name = name.lower() + # Classification vs regression models where appropriate + if name in ("logistic", "logistic_regression", "lr"): + if problem != "classification": + raise ValueError("LogisticRegression is for classification problems.") + return LogisticRegression(max_iter=1000, **p) + if name in ("rf", "random_forest"): + return RandomForestClassifier(**p) if problem == "classification" else RandomForestRegressor(**p) + if name in ("svc", "svm"): + if problem != "classification": + raise ValueError("SVC is for classification problems.") + return SVC(probability=True, **p) + if name in ("linear", "linear_regression"): + if problem != "regression": + raise ValueError("LinearRegression is for regression problems.") + return LinearRegression(**p) + if name == "ridge": + if problem != "regression": + raise ValueError("Ridge is for regression problems.") + return Ridge(**p) + if name == "lasso": + if problem != "regression": + raise ValueError("Lasso is for regression problems.") + return Lasso(**p) + if name == "knn": + return KNeighborsClassifier(**p) if problem == "classification" else KNeighborsRegressor(**p) + if name == "gbm": + return GradientBoostingClassifier(**p) if problem == "classification" else GradientBoostingRegressor(**p) + if name == "ada": + return AdaBoostClassifier(**p) if problem == "classification" else AdaBoostRegressor(**p) + if name == "mlp": + return MLPClassifier(max_iter=1000, **p) if problem == "classification" else MLPRegressor(max_iter=1000, 
**p) + if name == "xgboost": + if not _XGBOOST_AVAILABLE: + raise ImportError("xgboost not available in this environment.") + return XGBClassifier(**p) if problem == "classification" else XGBRegressor(**p) + if name == "lightgbm": + if not _LIGHTGBM_AVAILABLE: + raise ImportError("lightgbm not available in this environment.") + return LGBMClassifier(**p) if problem == "classification" else LGBMRegressor(**p) + if name == "catboost": + if not _CATBOOST_AVAILABLE: + raise ImportError("catboost not available in this environment.") + # CatBoost often prints to stdout; keep default verbose False + p = dict(p) + p.setdefault("verbose", False) + return CatBoostClassifier(**p) if problem == "classification" else CatBoostRegressor(**p) + raise ValueError(f"Unknown model name '{name}'") + + def execute(self, kernel, data): + # Prepare metadata context early + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + try: + self._ensure_metadata_table(kernel, db_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).") + except Exception: + pass + + # Load training DataFrame + df = data.get("last_select") + if df is None or df.empty: + msg = "No last_select found or DataFrame is empty (training set required)." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + try: + args = self.parse_args(self.args) + except Exception: + msg = "Error parsing arguments. Use key=value syntax." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + features_arg = args.get("features") + target = args.get("target") + model_name_arg = args.get("model", "rf") + cv = int(args.get("cv", 0) or 0) + problem_override = args.get("problem", None) + test_name = args.get("test_name", "last_select_test") + model_store_name = args.get("model_name", "last_model") + inplace = bool(args.get("inplace", True)) + model_params = args.get("model_params", {}) or {} + + if not features_arg: + msg = "features argument is required (features=col1,col2...)." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + if not target: + msg = "target argument is required (target=target_col)." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # parse features + if isinstance(features_arg, str): + features = [c.strip() for c in features_arg.split(",") if c.strip()] + elif isinstance(features_arg, (list, tuple)): + features = list(features_arg) + else: + msg = "features must be comma-separated string or list." 
+ kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + missing = [c for c in features + [target] if c not in df.columns] + if missing: + msg = f"Missing columns in training DataFrame: {', '.join(missing)}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Determine problem type + if problem_override: + problem = problem_override.lower() + if problem not in ("classification", "regression"): + msg = "problem must be 'classification' or 'regression'." + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + else: + # improved heuristic for problem detection + tgt_ser = df[target] + + if pd.api.types.is_numeric_dtype(tgt_ser): + nunique = int(tgt_ser.nunique(dropna=True)) + non_null_count = max(1, len(tgt_ser.dropna())) + uniq_prop = nunique / non_null_count + + # treat as regression if: + # - float dtype, or + # - many distinct values (>20), or + # - distinct proportion high (e.g. 
>5% of rows) + if pd.api.types.is_float_dtype(tgt_ser) or (nunique > 20) or (uniq_prop > 0.05): + problem = "regression" + else: + # few distinct integer-like values -> classification (categorical target) + problem = "classification" + else: + problem = "classification" + + # Prepare X_train, y_train + X_train = df[features].copy() + y_train = df[target].copy() + + # NOTE: test set (if present) will be ignored in this modified flow — no predictions or metrics. + # Keep reading test_df only to validate presence but do not use it. + test_df = data.get(test_name) + if isinstance(test_df, pd.DataFrame) and not test_df.empty: + missing_test = [c for c in features + [target] if c not in test_df.columns] + if missing_test: + msg = f"Test DataFrame '{test_name}' missing columns: {', '.join(missing_test)}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Instantiate model + try: + model = self._choose_model(model_name_arg, problem, params=model_params) + except Exception as e: + msg = f"Error creating model: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Cross-validation on training set if requested (kept) + cv_results = None + if cv and cv > 1: + try: + scoring = "accuracy" if problem == "classification" else "r2" + cv_results = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring) + except Exception as e: + msg = f"Error during cross-validation: 
{e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Fit + try: + model.fit(X_train, y_train) + except Exception as e: + msg = f"Error fitting model: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features), + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Store only the trained model and minimal meta (no preds, no metrics, no joblib saving) + try: + data[model_store_name] = model + + # Save metadata including target so evaluate_model can find it + meta = data.setdefault(model_store_name + "_meta", {}) + meta["problem"] = problem + meta["features"] = features + meta["target"] = target + + # If model exposes classes_, save them for easier decoding later + if hasattr(model, "classes_"): + try: + meta["classes"] = list(getattr(model, "classes_")) + except Exception: + pass + + except Exception as e: + msg = f"Error storing model: {e}" + kernel._send_message("stderr", msg) + try: + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=self.args if isinstance(self.args, str) else str(self.args), + affected_columns="\n".join(features) if 'features' in locals() else "", + operation_status="error", + message=msg, + db_name=db_name, + user_name=user_name + ) + except Exception: + pass + return + + # Output concise summary + out_lines = [f"Model '{model_name_arg}' trained and saved to data['{model_store_name}']. problem={problem}. 
train_rows={len(X_train)}"] + if cv_results is not None: + out_lines.append(f"cross-val (cv={cv}) scores: mean={float(np.mean(cv_results)):.4f}, std={float(np.std(cv_results)):.4f}") + summary_msg = "\n".join(out_lines) + kernel._send_message("stdout", summary_msg) + + # Insert success metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "\n".join(features) + message_str = summary_msg + self._insert_metadata( + kernel=kernel, + command_name=self.name(), + arguments=args_for_db, + affected_columns=affected_columns_str, + operation_status="success", + message=message_str, + db_name=db_name, + user_name=user_name + ) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).") + except Exception: + pass + + return diff --git a/mariadb_kernel/maria_magics/rag_commands/maria_ingest.py b/mariadb_kernel/maria_magics/rag_commands/maria_ingest.py new file mode 100644 index 0000000..f94c11f --- /dev/null +++ b/mariadb_kernel/maria_magics/rag_commands/maria_ingest.py @@ -0,0 +1,1005 @@ +# mariadb_kernel/maria_magics/maria_ingest.py +import shlex +import json +import math +import logging +import os +import io +import re +import numpy as np +from distutils import util +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +from mariadb_kernel.mariadb_client import MariaDBClient + +# optional sentence-transformers +_ST_AVAILABLE = False +try: + from sentence_transformers import SentenceTransformer + _ST_AVAILABLE = True +except Exception: + _ST_AVAILABLE = False + +# optional file extractors +_PYPDF2_AVAILABLE = False +_PYDOCX_AVAILABLE = False +try: + import PyPDF2 + _PYPDF2_AVAILABLE = True +except Exception: + _PYPDF2_AVAILABLE = False +try: + import docx + _PYDOCX_AVAILABLE = True +except Exception: + _PYDOCX_AVAILABLE = False + +# IPython history fallback +try: + from IPython import get_ipython +except Exception: + get_ipython = None + +# 
Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + + +class MariaIngest(MariaMagic): + """ + Ingest text documents into MariaDB, chunk them, and store embeddings. + + This variant reduces noisy logging and prints only important status/warnings. + + Added: metadata logging into magic_metadata table similar to TrainModel: + - Ensures magic_metadata exists in current database + - Inserts error/success rows for operations + """ + def __init__(self, args=""): + self.args = args + self.log = logging.getLogger("MariaIngest") + + def type(self): + return "Cell" + + def name(self): + return "maria_ingest" + + def help(self): + return "Ingest docs -> chunk -> embeddings. Uses native VECTOR when compatible; otherwise falls back to JSON. (cleaned logs)" + + # ---- utilities ---- + def _str_to_obj(self, s): + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + try: + return json.loads(s) + except Exception: + pass + if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_obj): + if input_obj is None: + return {} + if isinstance(input_obj, dict): + return input_obj + if not isinstance(input_obj, str): + try: + return dict(input_obj) + except Exception: + return {} + input_str = input_obj.strip() + if input_str == "": + return {} + try: + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + except Exception: + pairs = {} + for token in input_str.split(): + if "=" in token: + k, v = token.split("=", 1) + pairs[k] = v + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + # NOTE: keep existing _sql_escape for SQL literals used in queries (returns unquoted for non-strings) + def _sql_escape(self, s): + if s is None: + return "NULL" + if 
not isinstance(s, str): + return str(s) + return "'" + s.replace("'", "''") + "'" + + # Metadata helpers (copied/adapted from TrainModel to provide consistent metadata logging) + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape_meta(self, val): + """Escape a value for SQL single-quoted literal insert. None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas if available + try: + import pandas as _pd # local import to avoid global dependency + dfs = _pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and _pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, 
"user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape_meta(arguments) + affected_sql = self._sql_escape_meta(affected_columns) + status_sql = self._sql_escape_meta(operation_status) + message_sql = self._sql_escape_meta(message) + db_sql = self._sql_escape_meta(db_name) + user_sql = self._sql_escape_meta(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, 
affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape_meta(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # ---- end metadata helpers ---- + + def _simple_chunk(self, text: str, chunk_size: int, overlap: int): + if not text: + return [] + t = text.strip() + if len(t) <= chunk_size: + return [t] + chunks = [] + start = 0 + L = len(t) + while start < L: + end = min(L, start + chunk_size) + if end < L: + look_ahead = t[end: min(L, end + 100)] + idx_nl = look_ahead.find("\n") + idx_dot = look_ahead.find(".") + if idx_nl != -1: + end += idx_nl + 1 + elif idx_dot != -1: + end += idx_dot + 1 + chunk = t[start:end].strip() + if chunk: + chunks.append(chunk) + if end >= L: + break + start = max(0, end - overlap) + if not chunks and t: + chunks = [t] + return chunks + + def _embed_batch(self, texts, dim=384): + model_name = "all-MiniLM-L6-v2" + if len(texts) == 0: + return np.zeros((0, dim), dtype=np.float32) + if _ST_AVAILABLE: + try: + st = SentenceTransformer(model_name) + embs = st.encode(texts, convert_to_numpy=True, show_progress_bar=False) + embs = np.array(embs, dtype=np.float32) + if embs.ndim == 1: + embs = np.expand_dims(embs, 0) + if embs.shape[1] != dim: + self.log.warning("Embedding dim mismatch: model returned %d, expected %d. 
Adjusting.", + embs.shape[1], dim) + if embs.shape[1] > dim: + embs = embs[:, :dim].astype(np.float32) + else: + pad = np.zeros((embs.shape[0], dim - embs.shape[1]), dtype=np.float32) + embs = np.concatenate([embs, pad], axis=1) + return embs + except Exception as e: + self.log.exception("sentence-transformers failed, falling back to deterministic embeddings: %s", e) + rng = np.random.RandomState(12345) + embs = rng.normal(size=(len(texts), dim)).astype(np.float32) + norms = np.linalg.norm(embs, axis=1, keepdims=True) + norms[norms == 0] = 1.0 + embs = embs / norms + return embs + + def _parse_single_result(self, html): + if html is None: + return None + try: + import pandas as _pd + df = _pd.read_html(html)[0] + if df.size == 0: + return None + return df.iloc[0, 0] + except Exception: + try: + m = re.search(r"]*>(.*?)", html, flags=re.S) + if m: + return m.group(1).strip() + except Exception: + pass + return None + + def _read_file_content(self, path: str): + warnings = [] + if not path: + return "", warnings + try: + p = os.path.expanduser(os.path.expandvars(path)) + if not os.path.isabs(p): + p = os.path.abspath(p) + if not os.path.exists(p): + warnings.append(f"file not found: {p}") + return "", warnings + _, ext = os.path.splitext(p.lower()) + if ext in ('.txt', '.md', '.text', '.json', '.ndjson'): + with io.open(p, 'r', encoding='utf-8', errors='replace') as fh: + return fh.read(), warnings + if ext == '.pdf': + if _PYPDF2_AVAILABLE: + try: + text_parts = [] + with open(p, 'rb') as fh: + reader = PyPDF2.PdfReader(fh) + for page in reader.pages: + try: + text_parts.append(page.extract_text() or '') + except Exception: + pass + return ''.join(text_parts), warnings + except Exception as e: + warnings.append(f"PyPDF2 failed to extract PDF text: {e}") + else: + warnings.append("PyPDF2 not available; cannot extract PDF text.") + return "", warnings + if ext in ('.docx',): + if _PYDOCX_AVAILABLE: + try: + doc = docx.Document(p) + paragraphs = [pr.text for pr in 
doc.paragraphs] + return '\n'.join(paragraphs), warnings + except Exception as e: + warnings.append(f"python-docx failed to extract docx: {e}") + else: + warnings.append("python-docx not available; cannot extract docx text.") + return "", warnings + try: + with io.open(p, 'r', encoding='utf-8', errors='replace') as fh: + return fh.read(), warnings + except Exception: + try: + with io.open(p, 'r', encoding='latin-1', errors='replace') as fh: + return fh.read(), warnings + except Exception as e: + warnings.append(f"Failed to read file: {e}") + return "", warnings + except Exception as e: + return "", [str(e)] + + def _get_existing_vector_dim(self, mariadb_client, dbname): + """ + If an embeddings table exists, parse SHOW CREATE TABLE to extract the VECTOR(...) dimension. + Returns int dimension if found, otherwise None. + """ + try: + resp = mariadb_client.run_statement("SHOW CREATE TABLE embeddings;") + if not resp: + return None + txt = str(resp) + m = re.search(r"embedding_vector\s+vector\((\d+)\)", txt, flags=re.I) + if m: + try: + return int(m.group(1)) + except Exception: + return None + m2 = re.search(r"vector\((\d+)\)", txt, flags=re.I) + if m2: + try: + return int(m2.group(1)) + except Exception: + return None + except Exception: + pass + return None + + # ---- main execution ---- + def execute(self, kernel, data): + # collect user-facing warnings/errors to print concisely at the end + user_warnings = [] + + # --- Extract cell content robustly --- + cell_text = "" + try: + if isinstance(data, dict): + if "cell" in data and isinstance(data["cell"], dict): + if "body" in data["cell"] and isinstance(data["cell"]["body"], str): + cell_text = data["cell"]["body"] + elif "code" in data["cell"] and isinstance(data["cell"]["code"], str): + cell_text = data["cell"]["code"] + elif any(k in data for k in ("code", "content", "message", "data")): + for k in ("code", "content", "message", "data"): + if k in data and isinstance(data[k], str): + cell_text = data[k] + break 
+ elif isinstance(data, str): + cell_text = data + else: + try: + cell_text = str(data) + except Exception: + cell_text = "" + except Exception as e: + kernel._send_message("stderr", f"[debug] could not extract cell text: {e}\n") + cell_text = "" + + cell_text = cell_text.strip() if cell_text else "" + + # --- Parse arguments --- + try: + args = self.parse_args(self.args) + except Exception as e: + kernel._send_message("stderr", f"Error parsing arguments: {e}\n") + # attempt to write metadata about parse error if possible + try: + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata(kernel, self.name(), self.args, "", "error", + f"Error parsing arguments: {e}", db_name, user_name) + except Exception: + pass + return + + # text arg or file arg preference + provided_text = args.get("text") if isinstance(args, dict) else None + file_arg = None + for k in ("text_file", "file", "path"): + if isinstance(args, dict) and args.get(k): + file_arg = args.get(k) + break + + if isinstance(provided_text, str) and provided_text.strip(): + cell_text = provided_text + kernel._send_message("stdout", f"Using text from args (len={len(cell_text)})\n") + elif file_arg: + file_contents, warnings = self._read_file_content(file_arg) + for w in warnings: + user_warnings.append(w) + if file_contents: + cell_text = file_contents + kernel._send_message("stdout", f"Using file content from {file_arg} (len={len(cell_text)})\n") + else: + kernel._send_message("stderr", f"Failed to read file or file contained no text: {file_arg}\n") + + # metadata and settings + doc_id = args.get("doc_id") or f"doc_{int(np.floor(np.random.random()*1e9))}" + title = args.get("title") or "" + chunk_size = int(args.get("chunk_size", 800) or 800) + overlap = int(args.get("overlap", 100) or 100) + embedding_dim = 384 # expected embedding dim from model + metadata = args.get("metadata", {}) or {} + + # build docs + 
docs_to_ingest = [] + maybe_json = (cell_text or "").strip() + try: + if maybe_json.startswith("[") or maybe_json.startswith("{"): + parsed = json.loads(maybe_json) + if isinstance(parsed, list): + for d in parsed: + docs_to_ingest.append({ + "doc_id": d.get("doc_id") or d.get("id") or f"doc_{int(np.floor(np.random.random()*1e9))}", + "title": d.get("title") or "", + "content": d.get("content") or "", + "metadata": d.get("metadata") or {} + }) + elif isinstance(parsed, dict) and ("content" in parsed or "doc_id" in parsed): + docs_to_ingest.append({ + "doc_id": parsed.get("doc_id") or parsed.get("id") or doc_id, + "title": parsed.get("title") or title, + "content": parsed.get("content") or "", + "metadata": parsed.get("metadata") or metadata + }) + else: + docs_to_ingest.append({ + "doc_id": doc_id, + "title": title, + "content": cell_text, + "metadata": metadata + }) + else: + docs_to_ingest.append({ + "doc_id": doc_id, + "title": title, + "content": cell_text, + "metadata": metadata + }) + except Exception: + docs_to_ingest.append({ + "doc_id": doc_id, + "title": title, + "content": cell_text, + "metadata": metadata + }) + + docs_to_ingest = [d for d in docs_to_ingest if (d.get("content") or "").strip()] + if not docs_to_ingest: + msg = "No non-empty documents to ingest; aborting." + kernel._send_message("stderr", msg + "\n") + # write metadata error (best-effort) + try: + db_name = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, db_name) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, db_name, user_name) + except Exception: + pass + return + + # get mariadb client + mariadb_client = getattr(kernel, "mariadb_client", None) + if mariadb_client is None: + msg = "No mariadb_client available on kernel (can't run ingestion)." 
+ kernel._send_message("stderr", msg + "\n") + # cannot write metadata without mariadb_client, so just return + return + + # determine db + try: + db_name_html = mariadb_client.run_statement("SELECT DATABASE();") + dbname = self._parse_single_result(db_name_html) or "" + kernel._send_message("stdout", f"Using database: {dbname}\n") + except Exception as e: + msg = f"Failed to query current database: {e}" + kernel._send_message("stderr", msg + "\n") + # try to write metadata + try: + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, "") + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, "", user_name) + except Exception: + pass + return + + if not dbname: + msg = "No current database selected (use `USE ` before running the magic)." + kernel._send_message("stderr", msg + "\n") + try: + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, dbname, user_name) + except Exception: + pass + return + + # Ensure metadata table exists for this database + try: + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + except Exception: + # non-fatal, continue; metadata will be best-effort later + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).\n") + except Exception: + pass + + # create tables: documents, chunks; embeddings handled carefully + try: + mariadb_client.run_statement( + f""" + CREATE TABLE IF NOT EXISTS `{dbname}`.`documents` ( + id BIGINT AUTO_INCREMENT PRIMARY KEY, + doc_id VARCHAR(191) UNIQUE, + title TEXT, + content LONGTEXT, + metadata JSON, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) ENGINE=InnoDB; + """ + ) + mariadb_client.run_statement( + f""" + CREATE TABLE IF NOT EXISTS `{dbname}`.`chunks` ( + id BIGINT 
AUTO_INCREMENT PRIMARY KEY, + doc_id VARCHAR(191), + chunk_index INT, + chunk_text LONGTEXT, + chunk_meta JSON, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, + UNIQUE KEY uq_doc_chunk (doc_id, chunk_index), + FULLTEXT KEY ft_chunk_text (chunk_text) + ) ENGINE=InnoDB; + """ + ) + # create embeddings table if missing — keep existing definition if present + try: + mariadb_client.run_statement( + f""" + CREATE TABLE IF NOT EXISTS `{dbname}`.`embeddings` ( + id BIGINT AUTO_INCREMENT PRIMARY KEY, + chunk_id BIGINT UNIQUE, + model VARCHAR(128), + dim INT, + embedding_vector VECTOR({embedding_dim}), + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) ENGINE=InnoDB; + """ + ) + except Exception: + # tolerate create failure and continue (we'll detect existing table schema) + pass + except Exception as e: + msg = f"DDL failed: {e}" + kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, dbname, user_name) + except Exception: + pass + return + + # detect existing VECTOR dimension (if any) + existing_vec_dim = self._get_existing_vector_dim(mariadb_client, dbname) + use_native_vector = True + if existing_vec_dim is None: + use_native_vector = True + else: + if existing_vec_dim != embedding_dim: + use_native_vector = False + user_warnings.append(f"embeddings.embedding_vector exists with dim={existing_vec_dim}; ingest dim={embedding_dim}. 
Will use JSON fallback.") + + # ensure embeddings_json exists (fallback) + try: + mariadb_client.run_statement( + f""" + CREATE TABLE IF NOT EXISTS `{dbname}`.`embeddings_json` ( + id BIGINT AUTO_INCREMENT PRIMARY KEY, + chunk_id BIGINT UNIQUE, + model VARCHAR(128), + dim INT, + embedding_json JSON, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ) ENGINE=InnoDB; + """ + ) + except Exception: + pass + + # ingest loop with concise counters + total_chunks = 0 + total_emb_rows = 0 + native_attempts = 0 + native_successes = 0 + native_failures = 0 + fallback_successes = 0 + fallback_failures = 0 + + for doc in docs_to_ingest: + d_doc_id = doc.get("doc_id") + d_title = doc.get("title") + d_content = doc.get("content") or "" + d_meta = doc.get("metadata") or {} + + # insert document row + try: + mariadb_client.run_statement( + f""" + INSERT INTO `{dbname}`.`documents` (doc_id, title, content, metadata) + VALUES ({self._sql_escape(d_doc_id)}, {self._sql_escape(d_title)}, {self._sql_escape(d_content)}, {self._sql_escape(json.dumps(d_meta))}) + ON DUPLICATE KEY UPDATE title=VALUES(title), content=VALUES(content), metadata=VALUES(metadata); + """ + ) + except Exception as e: + user_warnings.append(f"Failed to insert document {d_doc_id}: {e}") + # Log per-document failure into metadata (best-effort) + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"doc_id={d_doc_id}", "error", f"Failed to insert document {d_doc_id}: {e}", dbname, user_name) + except Exception: + pass + continue + + # chunk + chunks = self._simple_chunk(d_content, chunk_size, overlap) + if not chunks and d_content: + chunks = [d_content] + total_chunks += len(chunks) + + # insert chunks and collect ids + inserted_chunk_ids = [] + for idx, chunk_text in enumerate(chunks): + try: + mariadb_client.run_statement( + f""" + INSERT INTO `{dbname}`.`chunks` (doc_id, chunk_index, chunk_text, chunk_meta) + VALUES ({self._sql_escape(d_doc_id)}, {idx}, 
{self._sql_escape(chunk_text)}, {self._sql_escape(json.dumps({}))}); + """ + ) + # get last insert id (best-effort) + try: + last_html = mariadb_client.run_statement("SELECT LAST_INSERT_ID();") + last_val = self._parse_single_result(last_html) + last_id = int(last_val) if last_val is not None else None + except Exception: + last_id = None + if last_id is not None: + inserted_chunk_ids.append((idx, last_id)) + else: + try: + sel_html = mariadb_client.run_statement( + f"SELECT id FROM `{dbname}`.`chunks` WHERE doc_id = {self._sql_escape(d_doc_id)} AND chunk_index = {idx} LIMIT 1;" + ) + sel_val = self._parse_single_result(sel_html) + inserted_chunk_ids.append((idx, int(sel_val)) if sel_val is not None else (idx, None)) + except Exception: + inserted_chunk_ids.append((idx, None)) + except Exception as e: + user_warnings.append(f"Failed to insert chunk {idx} for {d_doc_id}: {e}") + # log per-chunk failure + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"doc_id={d_doc_id},chunk_index={idx}", "error", + f"Failed to insert chunk {idx} for {d_doc_id}: {e}", dbname, user_name) + except Exception: + pass + inserted_chunk_ids.append((idx, None)) + continue + + # embeddings: compute and insert (native if allowed, else JSON) + if chunks: + embs = self._embed_batch(chunks, embedding_dim) + norms = np.linalg.norm(embs, axis=1, keepdims=True) + norms[norms == 0] = 1.0 + embs_norm = (embs / norms).astype(np.float32) + + for (i, chunk_db_id), vec in zip(inserted_chunk_ids, embs_norm): + if chunk_db_id is None: + user_warnings.append(f"No chunk id for doc {d_doc_id} chunk {i}; embedding skipped.") + # log skipped embedding + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"doc_id={d_doc_id},chunk_index={i}", "error", + "No chunk id for embedding; skipped", dbname, user_name) + except Exception: + pass + continue + + vec_list = [float(v) for v in 
vec.tolist()] + vec_literal = "[" + ",".join(repr(x) for x in vec_list) + "]" + + if not use_native_vector: + # always use JSON fallback + try: + emb_json_literal = self._sql_escape(json.dumps(vec_list)) + mariadb_client.run_statement( + f""" + INSERT INTO `{dbname}`.`embeddings_json` (chunk_id, model, dim, embedding_json) + VALUES ({chunk_db_id}, {self._sql_escape('all-MiniLM-L6-v2')}, {embedding_dim}, {emb_json_literal}) + ON DUPLICATE KEY UPDATE model=VALUES(model), dim=VALUES(dim), embedding_json=VALUES(embedding_json); + """ + ) + # verify + try: + verify_json = mariadb_client.run_statement( + f"SELECT COUNT(*) FROM `{dbname}`.`embeddings_json` WHERE chunk_id = {chunk_db_id};" + ) + cnt = self._parse_single_result(verify_json) + if cnt and int(cnt) > 0: + fallback_successes += 1 + total_emb_rows += 1 + else: + fallback_failures += 1 + # log failure + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"chunk_id={chunk_db_id}", "error", + "Fallback JSON embedding write returned zero rows", dbname, user_name) + except Exception: + fallback_failures += 1 + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"chunk_id={chunk_db_id}", "error", + "Fallback JSON embedding verification failed", dbname, user_name) + except Exception: + pass + except Exception as e_json: + user_warnings.append(f"Fallback embedding storage failed for chunk_id={chunk_db_id}: {e_json}") + fallback_failures += 1 + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"chunk_id={chunk_db_id}", "error", + f"Fallback embedding storage failed: {e_json}", dbname, user_name) + except Exception: + pass + continue + + # Attempt native VECTOR insert + native_attempts += 1 + try: + mariadb_client.run_statement( + f""" + INSERT INTO `{dbname}`.`embeddings` (chunk_id, model, dim, embedding_vector) + VALUES ({chunk_db_id}, 
{self._sql_escape('all-MiniLM-L6-v2')}, {embedding_dim}, {vec_literal}) + ON DUPLICATE KEY UPDATE model=VALUES(model), dim=VALUES(dim), embedding_vector=VALUES(embedding_vector); + """ + ) + except Exception as e_native: + native_failures += 1 + # try fallback JSON + try: + emb_json_literal = self._sql_escape(json.dumps(vec_list)) + mariadb_client.run_statement( + f""" + INSERT INTO `{dbname}`.`embeddings_json` (chunk_id, model, dim, embedding_json) + VALUES ({chunk_db_id}, {self._sql_escape('all-MiniLM-L6-v2')}, {embedding_dim}, {emb_json_literal}) + ON DUPLICATE KEY UPDATE model=VALUES(model), dim=VALUES(dim), embedding_json=VALUES(embedding_json); + """ + ) + try: + verify_json = mariadb_client.run_statement( + f"SELECT COUNT(*) FROM `{dbname}`.`embeddings_json` WHERE chunk_id = {chunk_db_id};" + ) + cnt = self._parse_single_result(verify_json) + if cnt and int(cnt) > 0: + fallback_successes += 1 + total_emb_rows += 1 + else: + fallback_failures += 1 + except Exception: + fallback_failures += 1 + except Exception as e_json: + user_warnings.append(f"Fallback embedding storage failed for chunk_id={chunk_db_id}: {e_json}") + fallback_failures += 1 + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"chunk_id={chunk_db_id}", "error", + f"Fallback storage after native failure also failed: {e_json}", dbname, user_name) + except Exception: + pass + # log native failure + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"chunk_id={chunk_db_id}", "error", + f"Native vector insert failed: {e_native}", dbname, user_name) + except Exception: + pass + continue + + # Verify native insert succeeded by COUNT(*) + try: + verify = mariadb_client.run_statement( + f"SELECT COUNT(*) FROM `{dbname}`.`embeddings` WHERE chunk_id = {chunk_db_id};" + ) + cnt = self._parse_single_result(verify) + if cnt and int(cnt) > 0: + native_successes += 1 + total_emb_rows 
+= 1 + else: + # fallback to JSON + native_failures += 1 + try: + emb_json_literal = self._sql_escape(json.dumps(vec_list)) + mariadb_client.run_statement( + f""" + INSERT INTO `{dbname}`.`embeddings_json` (chunk_id, model, dim, embedding_json) + VALUES ({chunk_db_id}, {self._sql_escape('all-MiniLM-L6-v2')}, {embedding_dim}, {emb_json_literal}) + ON DUPLICATE KEY UPDATE model=VALUES(model), dim=VALUES(dim), embedding_json=VALUES(embedding_json); + """ + ) + try: + verify_json = mariadb_client.run_statement( + f"SELECT COUNT(*) FROM `{dbname}`.`embeddings_json` WHERE chunk_id = {chunk_db_id};" + ) + cntj = self._parse_single_result(verify_json) + if cntj and int(cntj) > 0: + fallback_successes += 1 + total_emb_rows += 1 + else: + fallback_failures += 1 + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"chunk_id={chunk_db_id}", "error", + "Fallback JSON write after native verify returned zero rows", dbname, user_name) + except Exception: + fallback_failures += 1 + except Exception as e_json: + user_warnings.append(f"Fallback embedding storage failed for chunk_id={chunk_db_id}: {e_json}") + fallback_failures += 1 + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"chunk_id={chunk_db_id}", "error", + f"Fallback JSON storage after native verify failed: {e_json}", dbname, user_name) + except Exception: + pass + except Exception as e_verify: + user_warnings.append(f"Verify select for embeddings failed: {e_verify}") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + f"chunk_id={chunk_db_id}", "error", + f"Verify select for embeddings failed: {e_verify}", dbname, user_name) + except Exception: + pass + + # Final diagnostics: counts & version + try: + cnt_emb = mariadb_client.run_statement("SELECT COUNT(*) FROM embeddings;") + cnt_emb_val = self._parse_single_result(cnt_emb) or "0" + except 
Exception: + cnt_emb_val = "N/A" + try: + cnt_json = mariadb_client.run_statement("SELECT COUNT(*) FROM embeddings_json;") + cnt_json_val = self._parse_single_result(cnt_json) or "0" + except Exception: + cnt_json_val = "N/A" + try: + version = mariadb_client.run_statement("SELECT VERSION();") + version_val = self._parse_single_result(version) or "" + except Exception: + version_val = "" + + # concise output + summary_msg = ( + "Ingest complete.\n" + f" documents={len(docs_to_ingest)}\n" + f" chunks_total={total_chunks}\n" + f" embeddings_written={total_emb_rows}\n" + f" Server version: {version_val}\n" + ) + kernel._send_message("stdout", summary_msg) + + # write success metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "documents,chunks,embeddings" + self._insert_metadata(kernel, self.name(), args_for_db, affected_columns_str, + "success", summary_msg, dbname, user_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).\n") + except Exception: + pass + + # optionally show warnings + if user_warnings: + try: + for w in user_warnings: + try: + pass + except Exception: + pass + except Exception: + pass + + return diff --git a/mariadb_kernel/maria_magics/rag_commands/maria_rag_query.py b/mariadb_kernel/maria_magics/rag_commands/maria_rag_query.py new file mode 100644 index 0000000..1d19c20 --- /dev/null +++ b/mariadb_kernel/maria_magics/rag_commands/maria_rag_query.py @@ -0,0 +1,874 @@ +# mariadb_kernel/maria_magics/maria_rag_query.py +""" +%maria_rag_query + +Single-command RAG: retrieve relevant chunks, run fusion chain (LLM via Gemini) and return answer. + +Hardcoded settings: + - retriever = "hybrid" + - k = 6 + - llm_model = "gemini-2.5-flash" (hardcoded) + - prompt = "default" + - bm25_weight = 0.3 + +Usage: + %maria_rag_query query="How do I cancel my subscription?" 
+ %maria_rag_query query="How do I cancel my subscription?" explain=true + +Notes: + - The code will attempt to use the Google GenAI Python client (google.genai). It checks + the environment variables GOOGLE_API_KEY or GENAI_API_KEY for the API key. + - If the GenAI client or API key is unavailable, the magic falls back to a local fusion chain. +""" + +import shlex +import json +import logging +import re +import os +import numpy as np +from distutils import util + +from mariadb_kernel.maria_magics.maria_magic import MariaMagic + +# optional sentence-transformers +_ST_AVAILABLE = False +try: + from sentence_transformers import SentenceTransformer + _ST_AVAILABLE = True +except Exception: + _ST_AVAILABLE = False + +# optional Google GenAI (Gemini) client +_GENAI_AVAILABLE = False +try: + from google import genai + from google.genai import types + _GENAI_AVAILABLE = True +except Exception: + _GENAI_AVAILABLE = False + +# Optional helper to reliably get current DB name (if available) +try: + from mariadb_kernel.sql_fetch import SqlFetch +except Exception: + SqlFetch = None + + +class MariaRAGQuery(MariaMagic): + def __init__(self, args=""): + self.args = args + self.log = logging.getLogger("MariaRAGQuery") + + # HARDCODED SETTINGS (per your request) + self.RETRIEVER = "hybrid" + self.K = 6 + self.BM25_WEIGHT = 0.3 + self.CANDIDATE_N = 500 + self.LLM_MODEL = "gemini-2.5-flash" # using Gemini model per your snippet + self.PROMPT_NAME = "default" + self.EMBED_DIM = 384 + + def type(self): + return "Line" + + def name(self): + return "maria_rag_query" + + def help(self): + return "%maria_rag_query query=\"...\" — retrieve+fusion RAG (hardcoded settings)" + + # ---------------- Parsing helpers ---------------- + def _str_to_obj(self, s): + try: + return int(s) + except Exception: + pass + try: + return float(s) + except Exception: + pass + try: + return bool(util.strtobool(s)) + except Exception: + pass + try: + return json.loads(s) + except Exception: + pass + if 
isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'): + return s[1:-1] + return s + + def parse_args(self, input_obj): + if input_obj is None: + return {} + if isinstance(input_obj, dict): + return input_obj + if not isinstance(input_obj, str): + try: + return dict(input_obj) + except Exception: + return {} + input_str = input_obj.strip() + if not input_str: + return {} + try: + pairs = dict(token.split("=", 1) for token in shlex.split(input_str)) + except Exception: + pairs = {} + for token in input_str.split(): + if "=" in token: + k, v = token.split("=", 1) + pairs[k] = v + for k, v in pairs.items(): + pairs[k] = self._str_to_obj(v) + return pairs + + def _sql_escape(self, s): + if s is None: + return "NULL" + if not isinstance(s, str): + return str(s) + return "'" + s.replace("'", "''") + "'" + + # ---------------- Metadata helpers (added) ---------------- + def _get_mariadb_client(self, kernel): + return getattr(kernel, "mariadb_client", None) + + def _get_logger(self, kernel): + return getattr(kernel, "log", logging.getLogger(__name__)) + + def _sql_escape_meta(self, val): + """Escape a value for SQL single-quoted literal insert. 
None -> NULL""" + if val is None: + return "NULL" + if not isinstance(val, str): + val = str(val) + return "'" + val.replace("'", "''") + "'" + + def _get_db_name(self, kernel): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + + # Try SqlFetch if available + if SqlFetch is not None and mariadb_client is not None: + try: + sf = SqlFetch(mariadb_client, log) + dbname = sf.get_db_name() + if isinstance(dbname, str): + return dbname + except Exception: + log.debug("SqlFetch available but .get_db_name() failed; falling back.") + + if mariadb_client is None: + return "" + + try: + result = mariadb_client.run_statement("SELECT DATABASE();") + if mariadb_client.iserror() or not result: + return "" + # Try to parse HTML table with pandas if available + try: + import pandas as _pd # local import to avoid global dependency + dfs = _pd.read_html(result) + if dfs and len(dfs) > 0: + val = dfs[0].iloc[0, 0] + if isinstance(val, float) and _pd.isna(val): + return "" + return str(val) if val is not None else "" + except Exception: + m = re.search(r"(.*?)", str(result), flags=re.S | re.I) + if m: + txt = re.sub(r"<.*?>", "", m.group(1)).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + txt = str(result).strip() + if txt.lower() == "null" or txt == "": + return "" + return txt + except Exception: + return "" + return "" + + def _get_user_name(self, kernel): + candidates = [ + getattr(kernel, "user_name", None), + getattr(kernel, "username", None), + getattr(kernel, "user", None), + getattr(kernel, "session", None), + ] + for cand in candidates: + if cand is None: + continue + if isinstance(cand, str) and cand.strip(): + return cand + try: + maybe = getattr(cand, "user", None) + if isinstance(maybe, str) and maybe.strip(): + return maybe + except Exception: + pass + try: + return os.getlogin() + except Exception: + return "" + + def _ensure_metadata_table(self, kernel, db_name): + mariadb_client = 
self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + create_sql = f""" + CREATE TABLE IF NOT EXISTS {table_full_name} ( + id INT AUTO_INCREMENT PRIMARY KEY, + command_name VARCHAR(255), + arguments TEXT, + execution_timestamp DATETIME, + affected_columns TEXT, + operation_status VARCHAR(50), + message TEXT, + db_name VARCHAR(255), + user_name VARCHAR(255), + rollback_token VARCHAR(255), + backup_table VARCHAR(255), + original_table VARCHAR(255) + ); + """ + try: + mariadb_client.run_statement(create_sql) + if mariadb_client.iserror(): + log.error("Error creating magic_metadata table.") + except Exception as e: + log.error(f"Failed to ensure magic_metadata table: {e}") + + def _insert_metadata(self, kernel, command_name, arguments, affected_columns, + operation_status, message, db_name, user_name): + mariadb_client = self._get_mariadb_client(kernel) + log = self._get_logger(kernel) + if mariadb_client is None: + return + table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata" + + args_sql = self._sql_escape_meta(arguments) + affected_sql = self._sql_escape_meta(affected_columns) + status_sql = self._sql_escape_meta(operation_status) + message_sql = self._sql_escape_meta(message) + db_sql = self._sql_escape_meta(db_name) + user_sql = self._sql_escape_meta(user_name) + + insert_sql = f""" + INSERT INTO {table_full_name} + (command_name, arguments, execution_timestamp, affected_columns, + operation_status, message, db_name, user_name) + VALUES ( + {self._sql_escape_meta(command_name)}, + {args_sql}, + NOW(), + {affected_sql}, + {status_sql}, + {message_sql}, + {db_sql}, + {user_sql} + ); + """ + try: + mariadb_client.run_statement(insert_sql) + if mariadb_client.iserror(): + log.error("Error inserting into magic_metadata.") + except Exception as e: + log.error(f"Exception while inserting metadata: {e}") + + # 
---------------- Embedding utilities ---------------- + def _embed_texts(self, texts, dim=None): + """Return normalized numpy embeddings for texts. If sentence-transformers available use it.""" + if dim is None: + dim = self.EMBED_DIM + if len(texts) == 0: + return np.zeros((0, dim), dtype=np.float32) + if _ST_AVAILABLE: + try: + st = SentenceTransformer("all-MiniLM-L6-v2") + embs = st.encode(texts, convert_to_numpy=True, show_progress_bar=False) + embs = np.array(embs, dtype=np.float32) + if embs.ndim == 1: + embs = np.expand_dims(embs, 0) + if embs.shape[1] != dim: + if embs.shape[1] > dim: + embs = embs[:, :dim].astype(np.float32) + else: + pad = np.zeros((embs.shape[0], dim - embs.shape[1]), dtype=np.float32) + embs = np.concatenate([embs, pad], axis=1) + norms = np.linalg.norm(embs, axis=1, keepdims=True) + norms[norms == 0] = 1.0 + return (embs / norms).astype(np.float32) + except Exception as e: + self.log.debug("sentence-transformers failure: %s", e) + # deterministic fallback + rng = np.random.RandomState(12345) + embs = rng.normal(size=(len(texts), dim)).astype(np.float32) + norms = np.linalg.norm(embs, axis=1, keepdims=True) + norms[norms == 0] = 1.0 + return (embs / norms).astype(np.float32) + + def _parse_html_table(self, html): + """Best-effort HTML -> list-of-dicts parser used for mariadb_client outputs.""" + if html is None: + return None + s = str(html) + rows = re.findall(r"]*>(.*?)", s, flags=re.S | re.I) + parsed = [] + header = [] + for r in rows: + ths = re.findall(r"]*>(.*?)", r, flags=re.S | re.I) + if ths and not header: + header = [re.sub(r"<[^>]+>", "", t).strip() for t in ths] + continue + tds = re.findall(r"]*>(.*?)", r, flags=re.S | re.I) + if not tds: + continue + cells = [re.sub(r"<[^>]+>", "", t).strip() for t in tds] + if header and len(cells) == len(header): + parsed.append(dict(zip(header, cells))) + else: + parsed.append({str(i): cells[i] if i < len(cells) else "" for i in range(len(cells))}) + return parsed if parsed else None 
+ + def _parse_vector_literal(self, val): + if val is None: + return None + if isinstance(val, (list, tuple, np.ndarray)): + return np.array(val, dtype=np.float32) + s = str(val).strip() + if s.startswith("[") and s.endswith("]"): + try: + arr = json.loads(s) + return np.array(arr, dtype=np.float32) + except Exception: + pass + nums = re.findall(r"[-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?", s) + if not nums: + return None + try: + return np.array([float(x) for x in nums], dtype=np.float32) + except Exception: + return None + + # ---------------- Retrieval helpers ---------------- + def _bm25_prefilter(self, kernel, dbname, query_text): + mariadb_client = getattr(kernel, "mariadb_client", None) + if mariadb_client is None: + return [] + q_esc = self._sql_escape(query_text) + sql = ( + f"SELECT id AS chunk_id, doc_id, chunk_index, chunk_text, " + f"MATCH(chunk_text) AGAINST ({q_esc} IN NATURAL LANGUAGE MODE) AS bm25_score " + f"FROM `{dbname}`.`chunks` " + f"WHERE MATCH(chunk_text) AGAINST ({q_esc} IN NATURAL LANGUAGE MODE) " + f"ORDER BY bm25_score DESC LIMIT {self.CANDIDATE_N};" + ) + try: + html = mariadb_client.run_statement(sql) + rows = self._parse_html_table(html) + if not rows: + return [] + cand = [] + for r in rows: + try: + cid = int(r.get("chunk_id") or r.get("id")) + except Exception: + continue + cand.append({ + "chunk_id": cid, + "doc_id": r.get("doc_id") or "", + "chunk_index": int(r.get("chunk_index") or r.get("0") or 0), + "chunk_text": r.get("chunk_text") or "", + "bm25_score": float(r.get("bm25_score") or 0.0) + }) + return cand + except Exception as e: + self.log.debug("BM25 query failed: %s", e) + return [] + + def _sample_candidates(self, kernel, dbname): + mariadb_client = getattr(kernel, "mariadb_client", None) + if mariadb_client is None: + return [] + sql = f"SELECT id AS chunk_id, doc_id, chunk_index, chunk_text FROM `{dbname}`.`chunks` ORDER BY RAND() LIMIT {self.CANDIDATE_N};" + try: + html = mariadb_client.run_statement(sql) + rows = 
self._parse_html_table(html) + if not rows: + return [] + cand = [] + for r in rows: + try: + cid = int(r.get("chunk_id") or r.get("id")) + except Exception: + continue + cand.append({ + "chunk_id": cid, + "doc_id": r.get("doc_id") or "", + "chunk_index": int(r.get("chunk_index") or 0), + "chunk_text": r.get("chunk_text") or "" + }) + return cand + except Exception as e: + self.log.debug("Sampling failed: %s", e) + return [] + + def _fetch_embeddings_for_candidates(self, kernel, dbname, candidate_ids): + mariadb_client = getattr(kernel, "mariadb_client", None) + if mariadb_client is None: + return {} + if not candidate_ids: + return {} + + ids_sql = ",".join(str(int(x)) for x in candidate_ids) + # first attempt native embeddings join + try: + sql = ( + f"SELECT e.chunk_id, e.embedding_vector, c.chunk_text, c.doc_id, c.chunk_index " + f"FROM `{dbname}`.`embeddings` e " + f"JOIN `{dbname}`.`chunks` c ON e.chunk_id = c.id " + f"WHERE e.chunk_id IN ({ids_sql});" + ) + html = mariadb_client.run_statement(sql) + rows = self._parse_html_table(html) + emb_map = {} + if rows: + for r in rows: + try: + cid = int(r.get("chunk_id") or r.get("0")) + except Exception: + continue + emb_raw = r.get("embedding_vector") or r.get("embedding") or None + vec = self._parse_vector_literal(emb_raw) + if vec is None: + continue + norm = np.linalg.norm(vec) or 1.0 + emb_map[cid] = { + "vec": (vec / norm).astype(np.float32), + "chunk_text": r.get("chunk_text") or "", + "doc_id": r.get("doc_id") or "", + "chunk_index": int(r.get("chunk_index") or 0) + } + if emb_map: + return emb_map + except Exception as e: + self.log.debug("native embeddings fetch failed: %s", e) + + # fallback to embeddings_json + try: + sql_json = ( + f"SELECT ej.chunk_id, ej.embedding_json, c.chunk_text, c.doc_id, c.chunk_index " + f"FROM `{dbname}`.`embeddings_json` ej " + f"JOIN `{dbname}`.`chunks` c ON ej.chunk_id = c.id " + f"WHERE ej.chunk_id IN ({ids_sql});" + ) + html_json = mariadb_client.run_statement(sql_json) 
+ rows_json = self._parse_html_table(html_json) + emb_map = {} + if rows_json: + for r in rows_json: + try: + cid = int(r.get("chunk_id") or r.get("0")) + except Exception: + continue + emb_raw = r.get("embedding_json") or r.get("embedding") or None + vec = None + if emb_raw is not None: + try: + if isinstance(emb_raw, (list, tuple)): + vec = np.array(emb_raw, dtype=np.float32) + else: + vec = np.array(json.loads(emb_raw), dtype=np.float32) + except Exception: + vec = self._parse_vector_literal(emb_raw) + if vec is None: + continue + norm = np.linalg.norm(vec) or 1.0 + emb_map[cid] = { + "vec": (vec / norm).astype(np.float32), + "chunk_text": r.get("chunk_text") or "", + "doc_id": r.get("doc_id") or "", + "chunk_index": int(r.get("chunk_index") or 0) + } + return emb_map + except Exception as e: + self.log.debug("embeddings_json fetch failed: %s", e) + return {} + + # ---------------- Gemini LLM call ---------------- + def _call_gemini(self, system_prompt, user_prompt, model_name=None, max_output_tokens=400): + """ + Call Gemini using google.genai per the snippet the user provided. + Looks for API key in GOOGLE_API_KEY or GENAI_API_KEY environment variables. + Returns (text, raw_response) or (None, None) on failure. + """ + if not _GENAI_AVAILABLE: + self.log.debug("google.genai not available in environment.") + return None, None + + # NOTE: in the snippet provided earlier an API key was hardcoded; here we'll check env vars. 
+ api_key = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GENAI_API_KEY") or "" + if not api_key: + self.log.debug("No GENAI API key found in GOOGLE_API_KEY or GENAI_API_KEY.") + return None, None + + try: + client = genai.Client(api_key=api_key) + # Build combined content: put system + user into 'contents' - simple approach + contents = system_prompt + "\n\n" + user_prompt + resp = client.models.generate_content( + model=model_name or self.LLM_MODEL, + contents=contents, + config=types.GenerateContentConfig( + max_output_tokens=max_output_tokens, + thinking_config=types.ThinkingConfig(thinking_budget=0) + ) + ) + # The user's snippet used resp.text + text = getattr(resp, "text", None) + if text is None: + # some genai client versions put result in resp.output or resp.candidates + try: + # try attribute "candidates" + if hasattr(resp, "candidates") and resp.candidates: + text = getattr(resp.candidates[0], "content", None) or getattr(resp.candidates[0], "text", None) + elif hasattr(resp, "output"): + text = str(resp.output) + else: + text = str(resp) + except Exception: + text = str(resp) + return text, resp + except Exception as e: + self.log.debug("Gemini call failed: %s", e) + return None, None + + # ---------------- Local fusion fallback ---------------- + def _fusion_chain_local(self, question, context_blocks): + """ + Local, deterministic fusion map-reduce: + - Map: pick sentences from each context containing question tokens + - Reduce: join deduplicated sentences into a compact answer + """ + q = question.lower() + q_tokens = set(re.findall(r"\w+", q)) + picked = [] + evidence = [] + debug = {"map": [], "reduce": None} + + for b in context_blocks: + text = b["chunk_text"] + sentences = re.split(r'(?<=[\.\?\!])\s+', text) + picks = [] + for s in sentences: + st = s.strip() + if not st: + continue + s_tokens = set(re.findall(r"\w+", st.lower())) + if len(q_tokens & s_tokens) > 0: + picks.append(st) + if not picks and sentences: + picks = 
[sentences[0].strip()] + picks = picks[:3] + if picks: + picked.extend(picks) + evidence.append({ + "doc_id": b["doc_id"], + "chunk_index": b["chunk_index"], + "snippet": " ".join(picks)[:400] + }) + debug["map"].append({"chunk_id": b["chunk_id"], "picked_count": len(picks)}) + + # Reduce: deduplicate and join + uniq = [] + seen = set() + for s in picked: + key = s.strip().lower() + if key not in seen: + seen.add(key) + uniq.append(s.strip()) + + if not uniq: + answer = "I couldn't find a clear answer in the retrieved documents." + else: + answer = " ".join(uniq[:8]) + if len(answer) > 800: + answer = answer[:797] + "..." + debug["reduce"] = {"picked_sentences": len(uniq)} + return answer, evidence, debug + + # ---------------- Main entry ---------------- + def execute(self, kernel, data): + # parse args and query + try: + args = self.parse_args(self.args) + except Exception as e: + kernel._send_message("stderr", f"Error parsing arguments: {e}\n") + # best-effort metadata log + try: + dbname = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", f"Error parsing arguments: {e}", dbname, user_name) + except Exception: + pass + args = {} + + query = None + if isinstance(args, dict): + query = args.get("query") or args.get("q") + if not query: + if isinstance(data, str) and data.strip(): + query = data.strip() + if not query: + msg = "No query supplied. 
Usage: %maria_rag_query query=\"...\"" + kernel._send_message("stderr", msg + "\n") + try: + dbname = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, dbname, user_name) + except Exception: + pass + return + + explain = False + if isinstance(args, dict): + if args.get("explain") in (True, "true", "True", 1, "1"): + explain = True + + kernel._send_message("stdout", f"[debug] RAG query received (len={len(query)}): {query}\n") + + mariadb_client = getattr(kernel, "mariadb_client", None) + if mariadb_client is None: + msg = "No mariadb_client available on kernel (can't run retrieval)." + kernel._send_message("stderr", msg + "\n") + # metadata best-effort: cannot insert without client, but attempt helper which will no-op + try: + dbname = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, dbname, user_name) + except Exception: + pass + return + + # determine DB + try: + db_html = mariadb_client.run_statement("SELECT DATABASE();") + db_parsed = self._parse_html_table(db_html) + dbname = "" + if db_parsed and isinstance(db_parsed, list) and len(db_parsed) > 0: + first = db_parsed[0] + dbname = next(iter(first.values())) + else: + m = re.search(r"]*>(.*?)", str(db_html), flags=re.S) + if m: + dbname = m.group(1).strip() + except Exception as e: + msg = f"Failed to detect current DB: {e}" + kernel._send_message("stderr", msg + "\n") + try: + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, "") + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, "", user_name) + except Exception: + pass + return + + 
if not dbname: + msg = "No current database selected (use `USE ` before running the magic)." + kernel._send_message("stderr", msg + "\n") + try: + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, dbname, user_name) + except Exception: + pass + return + + # Ensure metadata table exists for this database (best-effort) + try: + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).\n") + except Exception: + pass + + # RETRIEVAL: BM25 prefilter (hybrid) + candidates = [] + try: + if self.RETRIEVER == "hybrid": + candidates = self._bm25_prefilter(kernel, dbname, query) + except Exception: + candidates = [] + if not candidates: + candidates = self._sample_candidates(kernel, dbname) + if not candidates: + msg = "No candidate chunks found (chunks table empty?)." + kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "chunks", "error", msg, dbname, user_name) + except Exception: + pass + return + + candidate_ids = [c["chunk_id"] for c in candidates if c.get("chunk_id") is not None] + emb_map = self._fetch_embeddings_for_candidates(kernel, dbname, candidate_ids) + if not emb_map: + msg = "No embeddings found for any candidate chunks." 
+ kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "embeddings", "error", msg, dbname, user_name) + except Exception: + pass + return + + # compute query embedding consistent with vector dim + try: + first_vec = next(iter(emb_map.values()))["vec"] + vec_dim = first_vec.shape[0] + except Exception: + msg = "Failed to determine embedding dimensionality." + kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "embeddings", "error", msg, dbname, user_name) + except Exception: + pass + return + + try: + q_emb = self._embed_texts([query], dim=vec_dim)[0] + except Exception as e: + msg = f"Failed to compute query embedding: {e}" + kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "embeddings", "error", msg, dbname, user_name) + except Exception: + pass + return + + # combine bm25 + vector + scored = [] + bm25_values = [float(c.get("bm25_score", 0.0) or 0.0) for c in candidates] + bm25_max = max(bm25_values) if bm25_values else 0.0 + for c in candidates: + cid = c.get("chunk_id") + if cid not in emb_map: + continue + emb_info = emb_map[cid] + sim = float(np.dot(q_emb, emb_info["vec"])) + bm25_raw = float(c.get("bm25_score", 0.0) or 0.0) + bm25_norm = (bm25_raw / bm25_max) if bm25_max > 0 else 0.0 + combined = (self.BM25_WEIGHT * bm25_norm) + ((1.0 - self.BM25_WEIGHT) * ((sim + 1.0) / 2.0)) + scored.append({ + "chunk_id": cid, + "doc_id": emb_info.get("doc_id"), + "chunk_index": emb_info.get("chunk_index"), + "chunk_text": emb_info.get("chunk_text"), + "vec_sim": sim, + "bm25": bm25_raw, + "score": combined + }) + + if not scored: + msg = "No scored candidates after combining BM25/vector." 
+ kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "search", "error", msg, dbname, user_name) + except Exception: + pass + return + + # top-K + scored.sort(key=lambda r: r["score"], reverse=True) + topk = scored[: self.K] + + # assemble context blocks with citations (internal only) + context_blocks = [] + for s in topk: + context_blocks.append({ + "chunk_id": s["chunk_id"], + "doc_id": s["doc_id"], + "chunk_index": s["chunk_index"], + "chunk_text": s["chunk_text"], + "vec_sim": s["vec_sim"], + "bm25": s["bm25"], + "score": s["score"] + }) + + # Build prompt / context to send to Gemini + context_text = "" + for i, b in enumerate(context_blocks): + citation = f"[{b['doc_id']}::chunk_{b['chunk_index']}]" + context_text += f"--- SOURCE {i+1} {citation} ---\n{b['chunk_text']}\n\n" + + system_prompt = "You are a helpful assistant that answers questions based on provided documents. When you use information from a source include a citation tag like [DOCID::chunk_X]." + user_prompt = f"QUESTION:\n{query}\n\nCONTEXT:\n{context_text}\n\nINSTRUCTIONS:\nAnswer the question concisely.\n" + + # Try Gemini via google.genai + llm_answer = None + llm_raw_resp = None + gemini_text, gemini_raw = self._call_gemini(system_prompt, user_prompt, model_name=self.LLM_MODEL, max_output_tokens=512) + if gemini_text: + llm_answer = gemini_text + llm_raw_resp = gemini_raw + + chain_debug = None + used_llm = False + if not llm_answer: + ans, evidence, debug = self._fusion_chain_local(query, context_blocks) + chain_debug = debug + llm_answer = ans + else: + used_llm = True + + # Output answer only (no sources printed) + kernel._send_message("stdout", "\n=== ANSWER ===\n") + kernel._send_message("stdout", llm_answer + "\n\n") + + # NOTE: sources are intentionally NOT printed here. 
+ + if explain: + kernel._send_message("stdout", "\n=== EXPLAIN: retrieval candidates (top 20 shown) ===\n") + for s in scored[:20]: + kernel._send_message("stdout", f"chunk_id={s['chunk_id']} doc_id={s['doc_id']} chunk_index={s['chunk_index']} score={s['score']:.6f} vec_sim={s['vec_sim']:.6f} bm25={s['bm25']:.6f}\n") + if chain_debug is not None: + kernel._send_message("stdout", "\n=== EXPLAIN: chain debug ===\n") + kernel._send_message("stdout", json.dumps(chain_debug, indent=2) + "\n") + if llm_raw_resp is not None: + kernel._send_message("stdout", "\n=== GEMINI RAW RESP (truncated) ===\n") + kernel._send_message("stdout", str(llm_raw_resp)[:2000] + "\n") + + # write success metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "chunks,embeddings,llm" if used_llm else "chunks,embeddings,local_chain" + msg = f"Returned {len(topk)} results for query. used_llm={used_llm}" + self._insert_metadata(kernel, self.name(), args_for_db, affected_columns_str, + "success", msg, dbname, user_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).\n") + except Exception: + pass + + return diff --git a/mariadb_kernel/maria_magics/rag_commands/maria_search.py b/mariadb_kernel/maria_magics/rag_commands/maria_search.py new file mode 100644 index 0000000..6d995b5 --- /dev/null +++ b/mariadb_kernel/maria_magics/rag_commands/maria_search.py @@ -0,0 +1,841 @@ +# maria_kernel/maria_magics/maria_search.py +""" +%maria_search + +Hybrid BM25 + vector search. Hardcoded settings: + - MODEL_NAME = "all-MiniLM-L6-v2" + - K = 8 + - BM25_WEIGHT = 0.3 + - CANDIDATE_N = 500 + +Usage: + %maria_search query="refund policy for returns" + %maria_search "refund policy for returns" # raw-line fallback +If no query supplied, defaults to "testquery". 
# maria_kernel/maria_magics/maria_search.py
"""
%maria_search

Hybrid BM25 + vector search. Hardcoded settings:
  - MODEL_NAME = "all-MiniLM-L6-v2"
  - K = 8
  - BM25_WEIGHT = 0.3
  - CANDIDATE_N = 500

Usage:
  %maria_search query="refund policy for returns"
  %maria_search "refund policy for returns"   # raw-line fallback
If no query supplied, defaults to "testquery".
"""

import json
import logging
import os  # needed by _get_user_name (os.getlogin was a silent NameError before)
import re
import shlex

import numpy as np

# optional sentence-transformers
_ST_AVAILABLE = False
try:
    from sentence_transformers import SentenceTransformer
    _ST_AVAILABLE = True
except Exception:
    _ST_AVAILABLE = False

# optional pandas for parsing HTML tables
_PANDAS_AVAILABLE = False
try:
    import pandas as _pd
    _PANDAS_AVAILABLE = True
except Exception:
    _PANDAS_AVAILABLE = False

try:
    from mariadb_kernel.maria_magics.maria_magic import MariaMagic
except Exception:
    # lightweight fallback if run standalone for tests
    class MariaMagic:
        def __init__(self, *a, **k):
            pass

        def type(self):
            return "Line"

        def name(self):
            return "maria_search"

        def help(self):
            return "Search (hybrid)."

# Optional helper to reliably get current DB name (if available)
try:
    from mariadb_kernel.sql_fetch import SqlFetch
except Exception:
    SqlFetch = None


class MariaSearch(MariaMagic):
    """Line magic implementing hybrid BM25 + vector search over `chunks`."""

    # Accepted boolean spellings. Replaces distutils.util.strtobool, which was
    # removed from the standard library in Python 3.12.
    _TRUTHY = frozenset({"y", "yes", "t", "true", "on", "1"})
    _FALSY = frozenset({"n", "no", "f", "false", "off", "0"})

    # Class-level cache: loading SentenceTransformer weights per call is slow.
    _st_model = None

    def __init__(self, args=""):
        self.args = args
        self.log = logging.getLogger("MariaSearch")
        # Hardcoded retrieval settings (per design)
        self.MODEL_NAME = "all-MiniLM-L6-v2"
        self.K = 8
        self.CANDIDATE_N = 500
        self.BM25_WEIGHT = 0.3

    def type(self):
        return "Line"

    def name(self):
        return "maria_search"

    def help(self):
        return "%maria_search query=\"text\" — hybrid BM25 + vector search (hardcoded model/weights)"

    # ----------------- utilities -----------------
    def _str_to_obj(self, s):
        """Coerce a string argument to int/float/bool/JSON; else strip matched
        surrounding quotes; else return unchanged."""
        try:
            return int(s)
        except Exception:
            pass
        try:
            return float(s)
        except Exception:
            pass
        if isinstance(s, str):
            low = s.strip().lower()
            if low in self._TRUTHY:
                return True
            if low in self._FALSY:
                return False
        try:
            return json.loads(s)
        except Exception:
            pass
        if isinstance(s, str) and len(s) >= 2 and s[0] == s[-1] and s[0] in ("'", '"'):
            return s[1:-1]
        return s

    def parse_args(self, input_obj):
        """Parse 'k=v k2="v 2"' magic arguments into a dict of coerced values.

        Accepts a dict (returned as-is), any mapping-convertible object, or a
        string tokenized with shlex (naive whitespace split as a fallback for
        unbalanced quotes). Tokens without '=' are ignored.
        """
        if input_obj is None:
            return {}
        if isinstance(input_obj, dict):
            return input_obj
        if not isinstance(input_obj, str):
            try:
                return dict(input_obj)
            except Exception:
                return {}
        input_str = input_obj.strip()
        if input_str == "":
            return {}
        try:
            pairs = dict(token.split("=", 1) for token in shlex.split(input_str))
        except Exception:
            pairs = {}
            for token in input_str.split():
                if "=" in token:
                    k, v = token.split("=", 1)
                    pairs[k] = v
        return {k: self._str_to_obj(v) for k, v in pairs.items()}

    def _sql_escape(self, s):
        """Quote a string for use as a SQL literal (doubling single quotes)."""
        if s is None:
            return "NULL"
        if not isinstance(s, str):
            return str(s)
        return "'" + s.replace("'", "''") + "'"

    # ----- metadata helpers (same approach as TrainModel / MariaIngest) -----
    def _get_mariadb_client(self, kernel):
        return getattr(kernel, "mariadb_client", None)

    def _get_logger(self, kernel):
        return getattr(kernel, "log", logging.getLogger(__name__))

    def _sql_escape_meta(self, val):
        """Escape a value for a SQL single-quoted literal insert. None -> NULL."""
        if val is None:
            return "NULL"
        if not isinstance(val, str):
            val = str(val)
        return "'" + val.replace("'", "''") + "'"

    def _get_db_name(self, kernel):
        """Return the current database name, or "" when undeterminable.

        Prefers SqlFetch when available; otherwise runs SELECT DATABASE() and
        parses the HTML result (pandas first, regex fallback).
        """
        mariadb_client = self._get_mariadb_client(kernel)
        log = self._get_logger(kernel)

        if SqlFetch is not None and mariadb_client is not None:
            try:
                sf = SqlFetch(mariadb_client, log)
                dbname = sf.get_db_name()
                if isinstance(dbname, str):
                    return dbname
            except Exception:
                log.debug("SqlFetch available but .get_db_name() failed; falling back.")

        if mariadb_client is None:
            return ""

        try:
            result = mariadb_client.run_statement("SELECT DATABASE();")
            if mariadb_client.iserror() or not result:
                return ""
            try:
                import pandas as _pd  # local import to avoid a hard dependency
                dfs = _pd.read_html(result)
                if dfs and len(dfs) > 0:
                    val = dfs[0].iloc[0, 0]
                    if isinstance(val, float) and _pd.isna(val):
                        return ""
                    return str(val) if val is not None else ""
            except Exception:
                m = re.search(r"<td[^>]*>(.*?)</td>", str(result), flags=re.S | re.I)
                if m:
                    txt = re.sub(r"<.*?>", "", m.group(1)).strip()
                    if txt.lower() == "null" or txt == "":
                        return ""
                    return txt
                txt = str(result).strip()
                if txt.lower() == "null" or txt == "":
                    return ""
                return txt
        except Exception:
            return ""
        return ""

    def _get_user_name(self, kernel):
        """Best-effort user identity: kernel attributes first, then OS login."""
        candidates = [
            getattr(kernel, "user_name", None),
            getattr(kernel, "username", None),
            getattr(kernel, "user", None),
            getattr(kernel, "session", None),
        ]
        for cand in candidates:
            if cand is None:
                continue
            if isinstance(cand, str) and cand.strip():
                return cand
            try:
                maybe = getattr(cand, "user", None)
                if isinstance(maybe, str) and maybe.strip():
                    return maybe
            except Exception:
                pass
        try:
            return os.getlogin()
        except Exception:
            return ""

    def _ensure_metadata_table(self, kernel, db_name):
        """Create the magic_metadata audit table if it does not exist."""
        mariadb_client = self._get_mariadb_client(kernel)
        log = self._get_logger(kernel)
        if mariadb_client is None:
            return
        table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata"
        create_sql = f"""
        CREATE TABLE IF NOT EXISTS {table_full_name} (
            id INT AUTO_INCREMENT PRIMARY KEY,
            command_name VARCHAR(255),
            arguments TEXT,
            execution_timestamp DATETIME,
            affected_columns TEXT,
            operation_status VARCHAR(50),
            message TEXT,
            db_name VARCHAR(255),
            user_name VARCHAR(255),
            rollback_token VARCHAR(255),
            backup_table VARCHAR(255),
            original_table VARCHAR(255)
        );
        """
        try:
            mariadb_client.run_statement(create_sql)
            if mariadb_client.iserror():
                log.error("Error creating magic_metadata table.")
        except Exception as e:
            log.error(f"Failed to ensure magic_metadata table: {e}")

    def _insert_metadata(self, kernel, command_name, arguments, affected_columns,
                         operation_status, message, db_name, user_name):
        """Insert one audit row into magic_metadata (values SQL-escaped)."""
        mariadb_client = self._get_mariadb_client(kernel)
        log = self._get_logger(kernel)
        if mariadb_client is None:
            return
        table_full_name = f"{db_name}.magic_metadata" if db_name else "magic_metadata"

        insert_sql = f"""
        INSERT INTO {table_full_name}
        (command_name, arguments, execution_timestamp, affected_columns,
         operation_status, message, db_name, user_name)
        VALUES (
            {self._sql_escape_meta(command_name)},
            {self._sql_escape_meta(arguments)},
            NOW(),
            {self._sql_escape_meta(affected_columns)},
            {self._sql_escape_meta(operation_status)},
            {self._sql_escape_meta(message)},
            {self._sql_escape_meta(db_name)},
            {self._sql_escape_meta(user_name)}
        );
        """
        try:
            mariadb_client.run_statement(insert_sql)
            if mariadb_client.iserror():
                log.error("Error inserting into magic_metadata.")
        except Exception as e:
            log.error(f"Exception while inserting metadata: {e}")

    # ----------------- end metadata helpers -----------------

    def _parse_html_table(self, html):
        """Parse an HTML table into a pandas.DataFrame (when available) or a
        list of dicts via a regex fallback. Returns None when unparseable."""
        if html is None:
            return None
        if _PANDAS_AVAILABLE:
            try:
                dfs = _pd.read_html(html)
                if dfs:
                    return dfs[0]
            except Exception:
                pass
        try:
            tbl = re.search(r"<table[^>]*>(.*?)</table>", str(html), flags=re.S | re.I)
            if not tbl:
                return None
            rows = re.findall(r"<tr[^>]*>(.*?)</tr>", tbl.group(1), flags=re.S | re.I)
            if not rows:
                return None
            headers = None
            parsed = []
            for r in rows:
                if headers is None:
                    ths = re.findall(r"<th[^>]*>(.*?)</th>", r, flags=re.S | re.I)
                    if ths:
                        headers = [re.sub(r"<[^>]+>", "", c).strip() for c in ths]
                        continue
                tds = re.findall(r"<td[^>]*>(.*?)</td>", r, flags=re.S | re.I)
                cells = [re.sub(r"<[^>]+>", "", c).strip() for c in tds]
                if not cells:
                    continue
                if headers and len(cells) == len(headers):
                    parsed.append(dict(zip(headers, cells)))
                else:
                    parsed.append({str(i): cell for i, cell in enumerate(cells)})
            return parsed
        except Exception:
            return None

    def _is_nonempty_table(self, table):
        """True if *table* (DataFrame or list-of-dicts) has at least one row."""
        if table is None:
            return False
        if _PANDAS_AVAILABLE and isinstance(table, _pd.DataFrame):
            return not table.empty
        if isinstance(table, list):
            return len(table) > 0
        # other truthy values (strings etc.) count as empty for our use
        return False

    def _embed_texts(self, texts, dim=384):
        """Return float32 embeddings for *texts*, shape (len(texts), dim).

        NOTE(review): the sentence-transformers path returns raw (unnormalized)
        vectors while the fallback path normalizes — confirm downstream scoring
        expects this asymmetry before changing it.
        """
        if len(texts) == 0:
            return np.zeros((0, dim), dtype=np.float32)
        if _ST_AVAILABLE:
            try:
                if type(self)._st_model is None:
                    type(self)._st_model = SentenceTransformer(self.MODEL_NAME)
                embs = type(self)._st_model.encode(
                    texts, convert_to_numpy=True, show_progress_bar=False
                )
                embs = np.array(embs, dtype=np.float32)
                if embs.ndim == 1:
                    embs = np.expand_dims(embs, 0)
                if embs.shape[1] != dim:
                    self.log.warning("Embedding dim mismatch: model returned %d, expected %d. Adjusting.",
                                     embs.shape[1], dim)
                    if embs.shape[1] > dim:
                        embs = embs[:, :dim].astype(np.float32)
                    else:
                        pad = np.zeros((embs.shape[0], dim - embs.shape[1]), dtype=np.float32)
                        embs = np.concatenate([embs, pad], axis=1)
                return embs
            except Exception as e:
                self.log.exception("sentence-transformers failed, falling back to deterministic embeddings: %s", e)
        # Deterministic, seeded fallback (normalized rows).
        rng = np.random.RandomState(12345)
        embs = rng.normal(size=(len(texts), dim)).astype(np.float32)
        norms = np.linalg.norm(embs, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return embs / norms

    def _parse_vector_literal(self, val):
        """Parse a JSON/text/sequence vector into a float32 numpy array."""
        if val is None:
            return None
        if isinstance(val, (list, tuple, np.ndarray)):
            try:
                return np.array(val, dtype=np.float32)
            except Exception:
                pass
        s = str(val).strip()
        if s.startswith("[") and s.endswith("]"):
            try:
                return np.array(json.loads(s), dtype=np.float32)
            except Exception:
                pass
        nums = re.findall(r"[-+]?[0-9]*\.?[0-9]+(?:[eE][-+]?[0-9]+)?", s)
        if not nums:
            return None
        try:
            return np.array([float(x) for x in nums], dtype=np.float32)
        except Exception:
            return None
data.strip() + if not query: + query = "testquery" + query = str(query).strip() + if not query: + msg = "Empty query; nothing to search." + kernel._send_message("stderr", msg + "\n") + try: + dbname = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, dbname, user_name) + except Exception: + pass + return + + kernel._send_message("stdout", f"[debug] running hybrid search for query (len={len(query)}): {query}\n") + + mariadb_client = getattr(kernel, "mariadb_client", None) + if mariadb_client is None: + msg = "No mariadb_client available on kernel (can't run search)." + kernel._send_message("stderr", msg + "\n") + # Can't insert metadata without mariadb_client, but attempt (internal check will no-op) + try: + dbname = self._get_db_name(kernel) + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, dbname, user_name) + except Exception: + pass + return + + # determine DB + try: + db_html = mariadb_client.run_statement("SELECT DATABASE();") + dbname = None + parsed_db = self._parse_html_table(db_html) + if parsed_db is None: + m = re.search(r"]*>(.*?)", str(db_html), flags=re.S) + dbname = m.group(1).strip() if m else "" + else: + if _PANDAS_AVAILABLE and hasattr(parsed_db, "iloc") and isinstance(parsed_db, _pd.DataFrame): + if not parsed_db.empty: + dbname = str(parsed_db.iloc[0, 0]) + else: + dbname = "" + elif isinstance(parsed_db, list) and len(parsed_db) > 0: + first = parsed_db[0] + if isinstance(first, dict): + dbname = next(iter(first.values())) + else: + dbname = first.get("0") if "0" in first else "" + else: + dbname = "" + except Exception as e: + msg = f"Failed to query current database: {e}" + 
kernel._send_message("stderr", msg + "\n") + try: + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, "") + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, "", user_name) + except Exception: + pass + return + + if not dbname: + msg = "No current database selected (use `USE ` before running the magic)." + kernel._send_message("stderr", msg + "\n") + try: + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "", "error", msg, dbname, user_name) + except Exception: + pass + return + + # Ensure metadata table exists for this database + try: + user_name = self._get_user_name(kernel) + self._ensure_metadata_table(kernel, dbname) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to ensure metadata table (continuing).\n") + except Exception: + pass + + # --- BM25 prefilter if requested --- + candidates = [] + try: + if self.BM25_WEIGHT > 0: + q_esc = self._sql_escape(query) + sql = ( + f"SELECT id AS chunk_id, doc_id, chunk_index, chunk_text, " + f"MATCH(chunk_text) AGAINST ({q_esc} IN NATURAL LANGUAGE MODE) AS bm25_score " + f"FROM `{dbname}`.`chunks` " + f"WHERE MATCH(chunk_text) AGAINST ({q_esc} IN NATURAL LANGUAGE MODE) " + f"ORDER BY bm25_score DESC LIMIT {self.CANDIDATE_N};" + ) + html = mariadb_client.run_statement(sql) + df = self._parse_html_table(html) + if self._is_nonempty_table(df): + if _PANDAS_AVAILABLE and hasattr(_pd, "DataFrame") and isinstance(df, _pd.DataFrame): + for _, row in df.iterrows(): + try: + cid = int(row.get("chunk_id") if "chunk_id" in row else row.get("id")) + except Exception: + cid = None + candidates.append({ + "chunk_id": cid, + "chunk_text": row.get("chunk_text") if "chunk_text" in row else "", + "doc_id": row.get("doc_id") if "doc_id" in row else "", + "bm25_score": 
float(row.get("bm25_score") if "bm25_score" in row else 0.0) if row is not None else 0.0 + }) + else: + # parsed list-of-dicts + for r in df: + try: + cid = int(r.get("chunk_id") or r.get("id") or next(iter(r.values()))) + except Exception: + cid = None + candidates.append({ + "chunk_id": cid, + "chunk_text": r.get("chunk_text") or r.get(next(iter(r.keys()))) or "", + "doc_id": r.get("doc_id") or "", + "bm25_score": float(r.get("bm25_score") or 0.0) + }) + except Exception as e: + kernel._send_message("stderr", f"BM25 prefilter failed: {e}\n") + # Log BM25 prefilter warning as metadata (non-fatal) + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "chunks", "warning", f"BM25 prefilter failed: {e}", dbname, user_name) + except Exception: + pass + + # if no candidates from BM25, fallback to sample + if not candidates: + try: + sql_sample = ( + f"SELECT id AS chunk_id, doc_id, chunk_index, chunk_text " + f"FROM `{dbname}`.`chunks` " + f"ORDER BY RAND() LIMIT {self.CANDIDATE_N};" + ) + html = mariadb_client.run_statement(sql_sample) + df = self._parse_html_table(html) + if self._is_nonempty_table(df): + if _PANDAS_AVAILABLE and hasattr(_pd, "DataFrame") and isinstance(df, _pd.DataFrame): + for _, row in df.iterrows(): + try: + cid = int(row.get("chunk_id") if "chunk_id" in row else row.get("id")) + except Exception: + cid = None + candidates.append({ + "chunk_id": cid, + "chunk_text": row.get("chunk_text") if "chunk_text" in row else "", + "doc_id": row.get("doc_id") if "doc_id" in row else "" + }) + else: + for r in df: + try: + cid = int(r.get("chunk_id") or r.get("id") or next(iter(r.values()))) + except Exception: + cid = None + candidates.append({ + "chunk_id": cid, + "chunk_text": r.get("chunk_text") or "", + "doc_id": r.get("doc_id") or "" + }) + except Exception as e: + kernel._send_message("stderr", f"Candidate sampling failed: {e}\n") + try: + self._insert_metadata(kernel, self.name(), self.args 
if isinstance(self.args, str) else str(self.args), + "chunks", "warning", f"Candidate sampling failed: {e}", dbname, user_name) + except Exception: + pass + + if not candidates: + msg = "No candidate chunks found (empty chunks table?)." + kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "chunks", "error", msg, dbname, user_name) + except Exception: + pass + return + + candidate_ids = [int(c["chunk_id"]) for c in candidates if c.get("chunk_id") is not None] + if not candidate_ids: + msg = "No valid candidate chunk ids." + kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "chunks", "error", msg, dbname, user_name) + except Exception: + pass + return + + # --- fetch embeddings: try native embeddings table first --- + id_list_sql = ",".join(str(int(x)) for x in candidate_ids) + emb_rows = None + try: + sql_emb = ( + f"SELECT e.chunk_id, e.embedding_vector, c.chunk_text, c.doc_id, c.chunk_meta " + f"FROM `{dbname}`.`embeddings` e " + f"JOIN `{dbname}`.`chunks` c ON e.chunk_id = c.id " + f"WHERE e.chunk_id IN ({id_list_sql});" + ) + html = mariadb_client.run_statement(sql_emb) + emb_rows = self._parse_html_table(html) + except Exception: + emb_rows = None + + emb_map = {} + # parse native embeddings if returned + if self._is_nonempty_table(emb_rows): + if _PANDAS_AVAILABLE and hasattr(_pd, "DataFrame") and isinstance(emb_rows, _pd.DataFrame): + for _, row in emb_rows.iterrows(): + try: + cid = int(row.get("chunk_id") if "chunk_id" in row else row.get("chunk_id")) + except Exception: + continue + emb_raw = row.get("embedding_vector") if "embedding_vector" in row else row.get("embedding") if "embedding" in row else None + vec = self._parse_vector_literal(emb_raw) + if vec is None: + continue + norm = np.linalg.norm(vec) + if norm == 0: norm = 1.0 + 
emb_map[cid] = { + "vec": vec.astype(np.float32) / norm, + "chunk_text": row.get("chunk_text") if "chunk_text" in row else "", + "doc_id": row.get("doc_id") if "doc_id" in row else "", + "chunk_meta": row.get("chunk_meta") if "chunk_meta" in row else "" + } + else: + for r in emb_rows: + try: + cid = int(r.get("chunk_id") or r.get(next(iter(r.keys())))) + except Exception: + continue + emb_raw = r.get("embedding_vector") or r.get("embedding_json") or r.get("embedding_bin") or None + vec = self._parse_vector_literal(emb_raw) + if vec is None: + continue + norm = np.linalg.norm(vec) + if norm == 0: norm = 1.0 + emb_map[cid] = { + "vec": vec.astype(np.float32) / norm, + "chunk_text": r.get("chunk_text") or "", + "doc_id": r.get("doc_id") or "", + "chunk_meta": r.get("chunk_meta") or "" + } + + # If native embeddings empty for candidates, try embeddings_json fallback + if not emb_map: + try: + sql_json = ( + f"SELECT ej.chunk_id, ej.embedding_json, c.chunk_text, c.doc_id, c.chunk_meta " + f"FROM `{dbname}`.`embeddings_json` ej " + f"JOIN `{dbname}`.`chunks` c ON ej.chunk_id = c.id " + f"WHERE ej.chunk_id IN ({id_list_sql});" + ) + html_json = mariadb_client.run_statement(sql_json) + rows_json = self._parse_html_table(html_json) + if self._is_nonempty_table(rows_json): + if _PANDAS_AVAILABLE and hasattr(_pd, "DataFrame") and isinstance(rows_json, _pd.DataFrame): + for _, row in rows_json.iterrows(): + try: + cid = int(row.get("chunk_id") if "chunk_id" in row else row.get(0)) + except Exception: + continue + emb_raw = row.get("embedding_json") if "embedding_json" in row else row.get("embedding") or None + vec = None + if emb_raw is not None: + try: + if isinstance(emb_raw, (list, tuple)): + vec = np.array(emb_raw, dtype=np.float32) + else: + vec = np.array(json.loads(emb_raw), dtype=np.float32) + except Exception: + vec = self._parse_vector_literal(emb_raw) + if vec is None: + continue + norm = np.linalg.norm(vec) + if norm == 0: norm = 1.0 + emb_map[cid] = { + "vec": 
vec.astype(np.float32) / norm, + "chunk_text": row.get("chunk_text") if "chunk_text" in row else "", + "doc_id": row.get("doc_id") if "doc_id" in row else "", + "chunk_meta": row.get("chunk_meta") if "chunk_meta" in row else "" + } + else: + for r in rows_json: + try: + cid = int(r.get("chunk_id") or next(iter(r.values()))) + except Exception: + continue + emb_raw = r.get("embedding_json") or r.get(next(iter([k for k in r.keys() if 'embedding' in k.lower()])), None) + vec = None + if emb_raw is not None: + try: + if isinstance(emb_raw, (list, tuple)): + vec = np.array(emb_raw, dtype=np.float32) + else: + vec = np.array(json.loads(emb_raw), dtype=np.float32) + except Exception: + vec = self._parse_vector_literal(emb_raw) + if vec is None: + continue + norm = np.linalg.norm(vec) + if norm == 0: norm = 1.0 + emb_map[cid] = { + "vec": vec.astype(np.float32) / norm, + "chunk_text": r.get("chunk_text") or "", + "doc_id": r.get("doc_id") or "", + "chunk_meta": r.get("chunk_meta") or "" + } + except Exception as e: + kernel._send_message("stderr", f"Embeddings JSON fallback failed: {e}\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "embeddings", "warning", f"Embeddings JSON fallback failed: {e}", dbname, user_name) + except Exception: + pass + + if not emb_map: + msg = "No embeddings found for candidate chunks (neither native nor JSON fallback)." + kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "embeddings", "error", msg, dbname, user_name) + except Exception: + pass + return + + # compute query embedding (dim inferred from first vector) + try: + vec_dim = next(iter(emb_map.values()))["vec"].shape[0] + except Exception: + msg = "Failed to determine embedding dimensionality." 
+ kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "embeddings", "error", msg, dbname, user_name) + except Exception: + pass + return + + try: + q_emb = self._embed_texts([query], dim=vec_dim)[0] + q_norm = np.linalg.norm(q_emb) + if q_norm == 0: q_norm = 1.0 + q_emb = q_emb.astype(np.float32) / q_norm + except Exception as e: + msg = f"Failed to compute query embedding: {e}" + kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "embeddings", "error", msg, dbname, user_name) + except Exception: + pass + return + + # combine scores and rank + results = [] + bm25_scores = [float(c.get("bm25_score", 0.0) or 0.0) for c in candidates] + bm25_max = max(bm25_scores) if bm25_scores else 0.0 + for c in candidates: + cid = c.get("chunk_id") + if cid not in emb_map: + continue + emb_info = emb_map[cid] + sim = float(np.dot(q_emb, emb_info["vec"])) + bm25_raw = float(c.get("bm25_score", 0.0) or 0.0) + bm25_norm = (bm25_raw / bm25_max) if bm25_max > 0 else 0.0 + combined = (self.BM25_WEIGHT * bm25_norm) + ((1.0 - self.BM25_WEIGHT) * ((sim + 1.0) / 2.0)) + results.append({ + "chunk_id": cid, + "chunk_text": emb_info.get("chunk_text") or c.get("chunk_text", ""), + "doc_id": emb_info.get("doc_id") or c.get("doc_id", ""), + "chunk_meta": emb_info.get("chunk_meta") or c.get("chunk_meta", ""), + "vec_sim": sim, + "bm25": bm25_raw, + "score": combined + }) + + if not results: + msg = "No scored results to return after filtering." 
+ kernel._send_message("stderr", msg + "\n") + try: + self._insert_metadata(kernel, self.name(), self.args if isinstance(self.args, str) else str(self.args), + "search", "error", msg, dbname, user_name) + except Exception: + pass + return + + results.sort(key=lambda r: r["score"], reverse=True) + topk = results[: self.K] + + # output table + lines = [] + header = ["chunk_id", "chunk_text...", "score", "vec_sim", "bm25", "doc_id"] + lines.append("\t".join(header)) + for r in topk: + text_preview = (r["chunk_text"] or "").replace("\n", " ") + if len(text_preview) > 200: + text_preview = text_preview[:197] + "..." + score_s = f"{r['score']:.6f}" + vec_s = f"{r['vec_sim']:.6f}" + bm25_s = f"{r['bm25']:.6f}" + line = "\t".join([str(r["chunk_id"]), text_preview, score_s, vec_s, bm25_s, str(r.get("doc_id", ""))]) + lines.append(line) + + out = "\n".join(lines) + "\n" + kernel._send_message("stdout", out) + + # write success metadata (best-effort) + try: + args_for_db = self.args if isinstance(self.args, str) else str(self.args) + affected_columns_str = "chunks,embeddings" + self._insert_metadata(kernel, self.name(), args_for_db, affected_columns_str, + "success", f"Returned {len(topk)} results for query.", dbname, user_name) + except Exception: + try: + kernel._send_message("stdout", "Warning: failed to write metadata (continuing).\n") + except Exception: + pass + + return diff --git a/mariadb_kernel/maria_magics/supported_magics.py b/mariadb_kernel/maria_magics/supported_magics.py index cb5b8fe..69d940c 100644 --- a/mariadb_kernel/maria_magics/supported_magics.py +++ b/mariadb_kernel/maria_magics/supported_magics.py @@ -1,25 +1,68 @@ -""" Maintains a list of magic commands supported by the kernel """ - -# Copyright (c) MariaDB Foundation. -# Distributed under the terms of the Modified BSD License. 
- -from mariadb_kernel.maria_magics.line import Line -from mariadb_kernel.maria_magics.df import DF -from mariadb_kernel.maria_magics.lsmagic import LSMagic -from mariadb_kernel.maria_magics.maria_magic import MariaMagic -from mariadb_kernel.maria_magics.bar import Bar -from mariadb_kernel.maria_magics.pie import Pie -from mariadb_kernel.maria_magics.delimiter import Delimiter -from mariadb_kernel.maria_magics.load import Load - - -def get(): - return { - "line": Line, - "bar": Bar, - "pie": Pie, - "df": DF, - "lsmagic": LSMagic, - "delimiter": Delimiter, - "load": Load, - } +""" Maintains a list of magic commands supported by the kernel """ + +# Copyright (c) MariaDB Foundation. +# Distributed under the terms of the Modified BSD License. + +from mariadb_kernel.maria_magics.line import Line +from mariadb_kernel.maria_magics.df import DF +from mariadb_kernel.maria_magics.lsmagic import LSMagic +from mariadb_kernel.maria_magics.maria_magic import MariaMagic +from mariadb_kernel.maria_magics.bar import Bar +from mariadb_kernel.maria_magics.pie import Pie +from mariadb_kernel.maria_magics.delimiter import Delimiter +from mariadb_kernel.maria_magics.load import Load +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.missing import Missing +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.dropmissing import DropMissing +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.stats import Stats +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.fillmissing import FillMissing +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.outliers import Outliers +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.dropoutliers import DropOutliers +from mariadb_kernel.maria_magics.ml_commands.data_cleaning.clipoutliers import ClipOutliers +from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.encode import Encode +from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.normalize import Normalize +from 
mariadb_kernel.maria_magics.ml_commands.data_preprocessing.standardize import Standardize +from mariadb_kernel.maria_magics.ml_commands.data_preprocessing.splitdata import SplitData +from mariadb_kernel.maria_magics.ml_commands.model_training.train_model import TrainModel +from mariadb_kernel.maria_magics.ml_commands.model_training.evaluate_model import EvaluateModel +from mariadb_kernel.maria_magics.ml_commands.model_training.savemodel import SaveModel +from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.select_features import SelectFeatures +from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.select_model import SelectModel +from mariadb_kernel.maria_magics.ml_commands.model_training.loadmodel import LoadModel +from mariadb_kernel.maria_magics.ml_commands.model_training.predict import Predict +from mariadb_kernel.maria_magics.ml_commands.ml_pipeline.ml_pipeline import MLPipeline +from mariadb_kernel.maria_magics.rag_commands.maria_ingest import MariaIngest +from mariadb_kernel.maria_magics.rag_commands.maria_search import MariaSearch +from mariadb_kernel.maria_magics.rag_commands.maria_rag_query import MariaRAGQuery + +def get(): + return { + "line": Line, + "bar": Bar, + "pie": Pie, + "df": DF, + "lsmagic": LSMagic, + "delimiter": Delimiter, + "load": Load, + "missing": Missing, + "dropmissing": DropMissing, + "stats": Stats, + "fillmissing": FillMissing, + "outliers": Outliers, + "dropoutliers": DropOutliers, + "clipoutliers": ClipOutliers, + "encode": Encode, + "normalize": Normalize, + "standardize": Standardize, + "splitdata": SplitData, + "train_model": TrainModel, + "evaluate_model": EvaluateModel, + "savemodel": SaveModel, + "loadmodel": LoadModel, + "predict": Predict, + "select_features": SelectFeatures, + "select_model": SelectModel, + "ml_pipeline": MLPipeline, + "maria_ingest": MariaIngest, + "maria_search": MariaSearch, + "maria_rag_query": MariaRAGQuery, + } diff --git a/requirements.txt b/requirements.txt index f95ca0d..1fdba48 100644 
--- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,12 @@ setuptools-scm ipykernel beautifulsoup4 mycli +numpy +joblib +scikit-learn +sentence-transformers>=2.2.0 +google-generativeai>=0.3.0 +xgboost>=1.5.0 +lightgbm>=3.3.0 +catboost>=1.0.0 +scipy \ No newline at end of file diff --git a/sample_sales_export.csv b/sample_sales_export.csv new file mode 100644 index 0000000..913be90 --- /dev/null +++ b/sample_sales_export.csv @@ -0,0 +1,7 @@ +month,sales +Jan,10 +Feb,20 +Mar,15 +Apr,30 +May,22 +Jun,18