mostly-ai
diff --git a/examples/benchmark.ipynb b/examples/benchmark.ipynb
Lines changed: 1 addition & 0 deletions
diff --git a/mostlyai/qa/_distances.py b/mostlyai/qa/_distances.py
Lines changed: 146 additions & 56 deletions
diff --git a/mostlyai/qa/assets/html/report_template.html b/mostlyai/qa/assets/html/report_template.html
Lines changed: 19 additions & 8 deletions
diff --git a/mostlyai/qa/metrics.py b/mostlyai/qa/metrics.py
Lines changed: 13 additions & 2 deletions
@@ -111,6 +111,7 @@
    "source": [
     "import matplotlib.pyplot as plt\n",
     "\n",
+    "\n",
     "def plot_dataset(df, dataset):\n",
     "    # Define the color mapping for each synthesizer\n",
     "    color_mapping = {\n",
 
@@ -30,96 +30,186 @@
 
 def calculate_distances(
     *, syn_embeds: np.ndarray, trn_embeds: np.ndarray, hol_embeds: np.ndarray | None
-) -> tuple[np.ndarray, np.ndarray | None]:
-    """
-    Calculates distances to the closest records (DCR). Once for synthetic to training, and once for synthetic to
-    holdout data.
+) -> tuple[np.ndarray, np.ndarray | None, np.ndarray | None]:
     """
+    Calculates distances to the closest records (DCR).
+
+    Args:
+        syn_embeds: Embeddings of synthetic data.
+        trn_embeds: Embeddings of training data.
+        hol_embeds: Embeddings of holdout data.
 
+    Returns:
+        Tuple containing:
+            - dcr_syn_trn: DCR for synthetic to training.
+            - dcr_syn_hol: DCR for synthetic to holdout; None if `hol_embeds` is None.
+            - dcr_trn_hol: DCR for training to holdout; None if `hol_embeds` is None.
+    """
     if hol_embeds is not None:
         assert trn_embeds.shape == hol_embeds.shape
-    # calculate DCR using L2 metric
-    index = NearestNeighbors(n_neighbors=1, algorithm="brute", metric="l2", n_jobs=min(cpu_count() - 1, 16))
-    index.fit(syn_embeds)
+    # calculate DCR for synthetic to training
+    index_syn = NearestNeighbors(n_neighbors=1, algorithm="brute", metric="l2", n_jobs=min(cpu_count() - 1, 16))
+    index_syn.fit(syn_embeds)
     _LOG.info(f"calculate DCRs for {len(syn_embeds):,} synthetic to {len(trn_embeds):,} training")
-    dcrs_trn, _ = index.kneighbors(trn_embeds)
-    dcr_trn = dcrs_trn[:, 0]
+    dcrs_syn_trn, _ = index_syn.kneighbors(trn_embeds)
+    dcr_syn_trn = dcrs_syn_trn[:, 0]
+
+    dcr_syn_hol = None
+    dcr_trn_hol = None
+
     if hol_embeds is not None:
+        # calculate DCR for synthetic to holdout
         _LOG.info(f"calculate DCRs for {len(syn_embeds):,} synthetic to {len(hol_embeds):,} holdout")
-        dcrs_hol, _ = index.kneighbors(hol_embeds)
-        dcr_hol = dcrs_hol[:, 0]
-    else:
-        dcr_hol = None
-    dcr_trn_deciles = np.round(np.quantile(dcr_trn, np.linspace(0, 1, 11)), 3)
-    _LOG.info(f"DCR deciles for synthetic to training: {dcr_trn_deciles}")
-    if dcr_hol is not None:
-        dcr_hol_deciles = np.round(np.quantile(dcr_hol, np.linspace(0, 1, 11)), 3)
-        _LOG.info(f"DCR deciles for synthetic to holdout:  {dcr_hol_deciles}")
-        # calculate share of dcr_trn != dcr_hol
-        _LOG.info(f"share of dcr_trn < dcr_hol: {np.mean(dcr_trn < dcr_hol):.1%}")
-        _LOG.info(f"share of dcr_trn > dcr_hol: {np.mean(dcr_trn > dcr_hol):.1%}")
-    return dcr_trn, dcr_hol
-
-
-def plot_distances(plot_title: str, dcr_trn: np.ndarray, dcr_hol: np.ndarray | None) -> go.Figure:
+        dcrs_syn_hol, _ = index_syn.kneighbors(hol_embeds)
+        dcr_syn_hol = dcrs_syn_hol[:, 0]
+
+        # calculate DCR for training to holdout
+        _LOG.info(f"calculate DCRs for {len(trn_embeds):,} training to {len(hol_embeds):,} holdout")
+        index_trn = NearestNeighbors(n_neighbors=1, algorithm="brute", metric="l2", n_jobs=min(cpu_count() - 1, 16))
+        index_trn.fit(trn_embeds)
+        dcrs_trn_hol, _ = index_trn.kneighbors(hol_embeds)
+        dcr_trn_hol = dcrs_trn_hol[:, 0]
+
+    dcr_syn_trn_deciles = np.round(np.quantile(dcr_syn_trn, np.linspace(0, 1, 11)), 3)
+    _LOG.info(f"DCR deciles for synthetic to training: {dcr_syn_trn_deciles}")
+    if dcr_syn_hol is not None:
+        dcr_syn_hol_deciles = np.round(np.quantile(dcr_syn_hol, np.linspace(0, 1, 11)), 3)
+        _LOG.info(f"DCR deciles for synthetic to holdout:  {dcr_syn_hol_deciles}")
+        # calculate share of dcr_syn_trn != dcr_syn_hol
+        _LOG.info(f"share of dcr_syn_trn < dcr_syn_hol: {np.mean(dcr_syn_trn < dcr_syn_hol):.1%}")
+        _LOG.info(f"share of dcr_syn_trn > dcr_syn_hol: {np.mean(dcr_syn_trn > dcr_syn_hol):.1%}")
+
+    if dcr_trn_hol is not None:
+        dcr_trn_hol_deciles = np.round(np.quantile(dcr_trn_hol, np.linspace(0, 1, 11)), 3)
+        _LOG.info(f"DCR deciles for training to holdout:  {dcr_trn_hol_deciles}")
+
+    return dcr_syn_trn, dcr_syn_hol, dcr_trn_hol
+
+
+def plot_distances(
+    plot_title: str, dcr_syn_trn: np.ndarray, dcr_syn_hol: np.ndarray | None, dcr_trn_hol: np.ndarray | None
+) -> go.Figure:
     # calculate quantiles
     y = np.linspace(0, 1, 101)
-    x_trn = np.quantile(dcr_trn, y)
-    if dcr_hol is not None:
-        x_hol = np.quantile(dcr_hol, y)
+    x_syn_trn = np.quantile(dcr_syn_trn, y)
+    if dcr_syn_hol is not None:
+        x_syn_hol = np.quantile(dcr_syn_hol, y)
+    else:
+        x_syn_hol = None
+
+    if dcr_trn_hol is not None:
+        x_trn_hol = np.quantile(dcr_trn_hol, y)
     else:
-        x_hol = None
+        x_trn_hol = None
+
     # prepare layout
     layout = go.Layout(
         title=dict(text=f"<b>{plot_title}</b>", x=0.5, y=0.98),
         title_font=CHARTS_FONTS["title"],
         font=CHARTS_FONTS["base"],
-        hoverlabel=CHARTS_FONTS["hover"],
+        hoverlabel=dict(
+            **CHARTS_FONTS["hover"],
+            namelength=-1,  # Show full length of hover labels
+        ),
         plot_bgcolor=CHARTS_COLORS["background"],
         autosize=True,
         height=500,
         margin=dict(l=20, r=20, b=20, t=40, pad=5),
-        showlegend=False,
-        hovermode="x unified",
+        showlegend=True,
         yaxis=dict(
             showticklabels=False,
             zeroline=True,
             zerolinewidth=1,
             zerolinecolor="#999999",
             rangemode="tozero",
+            showline=True,
+            linewidth=1,
+            linecolor="#999999",
+        ),
+        yaxis2=dict(
+            overlaying="y",
+            side="right",
+            tickformat=".0%",
+            showgrid=False,
+            range=[0, 1],
+            showline=True,
+            linewidth=1,
+            linecolor="#999999",
+        ),
+        xaxis=dict(
+            showline=True,
+            linewidth=1,
+            linecolor="#999999",
+            hoverformat=".3f",
         ),
     )
-    fig = go.Figure(layout=layout).set_subplots(
-        rows=1,
-        cols=1,
-    )
-    # plot content
-    cum_trn_scatter = go.Scatter(
-        mode="lines",
-        x=x_trn,
-        y=y,
-        name="DCR training",
-        line=dict(color=CHARTS_COLORS["synthetic"], width=5),
-        yhoverformat=".0%",
-    )
-    fig.add_trace(cum_trn_scatter, row=1, col=1)
-    if x_hol is not None:
-        cum_hol_scatter = go.Scatter(
+    fig = go.Figure(layout=layout)
+
+    traces = []
+
+    # training vs holdout (light gray)
+    if x_trn_hol is not None:
+        traces.append(
+            go.Scatter(
+                mode="lines",
+                x=x_trn_hol,
+                y=y,
+                name="Training vs. Holdout Data",
+                line=dict(color="#999999", width=5),
+                yaxis="y2",
+            )
+        )
+
+    # synthetic vs holdout (gray)
+    if x_syn_hol is not None:
+        traces.append(
+            go.Scatter(
+                mode="lines",
+                x=x_syn_hol,
+                y=y,
+                name="Synthetic vs. Holdout Data",
+                line=dict(color="#666666", width=5),
+                yaxis="y2",
+            )
+        )
+
+    # synthetic vs training (green)
+    traces.append(
+        go.Scatter(
             mode="lines",
-            x=x_hol,
+            x=x_syn_trn,
             y=y,
-            name="DCR holdout",
-            line=dict(color=CHARTS_COLORS["original"], width=5),
-            yhoverformat=".0%",
+            name="Synthetic vs. Training Data",
+            line=dict(color="#24db96", width=5),
+            yaxis="y2",
+        )
+    )
+
+    for trace in traces:
+        fig.add_trace(trace)
+
+    fig.update_layout(
+        legend=dict(
+            orientation="h",
+            yanchor="bottom",
+            y=-0.15,
+            xanchor="center",
+            x=0.5,
+            font=dict(size=10),
+            traceorder="reversed",
         )
-        fig.add_trace(cum_hol_scatter, row=1, col=1)
+    )
+
     return fig
 
 
 def plot_store_distances(
-    dcr_trn: np.ndarray,
-    dcr_hol: np.ndarray | None,
+    dcr_syn_trn: np.ndarray,
+    dcr_syn_hol: np.ndarray | None,
+    dcr_trn_hol: np.ndarray | None,
     workspace: TemporaryWorkspace,
 ) -> None:
-    fig = plot_distances("Cumulative Distributions of Distance to Closest Records (DCR)", dcr_trn, dcr_hol)
+    fig = plot_distances(
+        "Cumulative Distributions of Distance to Closest Records (DCR)", dcr_syn_trn, dcr_syn_hol, dcr_trn_hol
+    )
     workspace.store_figure_html(fig, "distances_dcr")
@@ -393,10 +393,11 @@ <h2 id="distances" class="anchor">Distances</h2>
         <table class='table' style="text-align: left">
           <thead>
             <tr>
-              <td style="width: 33%"> </td>
-              <td style="width: 33%">Synthetic vs. Training Data</td>
+              <td style="width: 25%"> </td>
+              <td style="width: 25%">Synthetic vs. Training Data</td>
               {% if metrics.distances.ims_holdout is not none %}
-              <td style="width: 33%"><small class="muted-text">(Synthetic vs. Holdout Data)</small></td>
+              <td style="width: 25%"><small style="color: #666666;">Synthetic vs. Holdout Data</small></td>
+              <td style="width: 25%"><small style="color: #999999;">Training vs. Holdout Data</small></td>
               {% endif %}
             </tr>
           </thead>
@@ -405,16 +406,26 @@ <h2 id="distances" class="anchor">Distances</h2>
               <td>Identical Matches</td>
               <td>{{ "{:.1%}".format(metrics.distances.ims_training) }}</td>
               {% if metrics.distances.ims_holdout is not none %}
-              <td><small class="muted-text">({{ "{:.1%}".format(metrics.distances.ims_holdout) }})</small></td>
+              <td><small style="color: #666666;">{{ "{:.1%}".format(metrics.distances.ims_holdout) }}</small></td>
+              <td><small style="color: #999999;">{{ "{:.1%}".format(metrics.distances.ims_trn_hol) if metrics.distances.ims_trn_hol is not none else "N/A" }}</small></td>
               {% endif %}
             </tr>
             <tr>
               <td>Average Distances</td>
               <td>{{ "{:.3f}".format(metrics.distances.dcr_training) }}</td>
               {% if metrics.distances.dcr_holdout is not none %}
-              <td><small class="muted-text">({{ "{:.3f}".format(metrics.distances.dcr_holdout) }})</small></td>
+              <td><small style="color: #666666;">{{ "{:.3f}".format(metrics.distances.dcr_holdout) }}</small></td>
+              <td><small style="color: #999999;">{{ "{:.3f}".format(metrics.distances.dcr_trn_hol) if metrics.distances.dcr_trn_hol is not none else "N/A" }}</small></td>
               {% endif %}
             </tr>
+            {% if metrics.distances.dcr_share is not none %}
+            <tr>
+              <td>DCR Share</td>
+              <td>{{ "{:.1%}".format(metrics.distances.dcr_share) }}</td>
+              <td></td>
+              <td></td>
+            </tr>
+            {% endif %}
           </tbody>
         </table>
         <br />
@@ -432,9 +443,9 @@ <h2 id="distances" class="anchor">Distances</h2>
       <div class="explainer-body">
         Synthetic data shall be as close to the original training samples, as it is close to original holdout samples, which serve us as a reference.
         This can be asserted empirically by measuring distances between synthetic samples to their closest original samples, whereas training and holdout sets are sampled to be of equal size.
-        For the visualization above, the distances of synthetic samples to the training samples are displayed in green, and the distances of synthetic samples to the holdout samples (if available) displayed in gray.
-        A green line that is significantly left of the gray line implies that synthetic samples are closer to the training samples than to the holdout samples, indicating that the data has overfitted to the training data.
-        A green line that overlays with the gray line validates that the trained model indeed represents the general rules, that can be found in training just as well as in holdout samples.
+        A green line that is significantly left of the dark gray line implies that synthetic samples are closer to the training samples than to the holdout samples, indicating that the data has overfitted to the training data.
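+        A green line that overlaps with the dark gray line validates that the trained model indeed represents the general rules that can be found in training just as well as in holdout samples. The light gray line shows the distances between training and holdout samples (if available) and serves as an additional reference.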
+        The DCR share indicates the proportion of synthetic samples that are closer to a training sample than to a holdout sample, and ideally, this value should not significantly exceed 50%, as a higher value could indicate overfitting.
       </div>
     </div>
   </div>
 
@@ -192,6 +192,12 @@ class Distances(CustomBaseModel):
         "`ims_training`.",
         ge=0.0,
     )
+    ims_trn_hol: float | None = Field(
+        default=None,
+        alias="imsTrnHol",
+        description="Share of training samples that are identical to a holdout sample.",
+        ge=0.0,
+    )
     dcr_training: float | None = Field(
         default=None,
         alias="dcrTraining",
@@ -201,8 +207,13 @@ class Distances(CustomBaseModel):
     dcr_holdout: float | None = Field(
         default=None,
         alias="dcrHoldout",
-        description="Average L2 nearest-neighbor distance between synthetic and holdout samples. Serves as a "
-        "reference for `dcr_training`.",
+        description="Average L2 nearest-neighbor distance between synthetic and holdout samples. Serves as a reference for `dcr_training`.",
+        ge=0.0,
+    )
+    dcr_trn_hol: float | None = Field(
+        default=None,
+        alias="dcrTrnHol",
+        description="Average L2 nearest-neighbor distance between training and holdout samples. Serves as a reference for `dcr_training`.",
         ge=0.0,
     )
     dcr_share: float | None = Field(