diff --git a/experimental/README.md b/experimental/README.md new file mode 100644 index 0000000..de36d79 --- /dev/null +++ b/experimental/README.md @@ -0,0 +1,6 @@ +# Experimental + +This folder contains experimental work on dpmm. + +## `audit_dpmm` +Contains the source code and experiments for the paper Tight Auditing of Differential Privacy in MST and AIM. \ No newline at end of file diff --git a/experimental/audit_dpmm/.gitignore b/experimental/audit_dpmm/.gitignore new file mode 100644 index 0000000..992b3f4 --- /dev/null +++ b/experimental/audit_dpmm/.gitignore @@ -0,0 +1,212 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock +#poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. +# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +#pdm.lock +#pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +#pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. +# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +# .vscode/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Cursor +# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to +# exclude from AI features like autocomplete and code analysis. 
"""Auditing utilities for empirical mu-GDP lower bounds.

Source code for the experiments of the paper "Tight Auditing of Differential
Privacy in MST and AIM" (G. Ganev, M.S.M.S. Annamalai, B. Kulynych).

From the accompanying README: the experiments require Python 3.11, all
dependencies are listed in environment.yaml, and to replicate the experiments
and plots one runs `run_attack.py` followed by `run_audit.ipynb`.
"""

# ---------------------------------------------------------------------------
# experimental/audit_dpmm/code/audit_utils.py
# ---------------------------------------------------------------------------
import numpy as np

from scipy import integrate
from scipy.optimize import brentq, root_scalar
from scipy.stats import beta as beta_dist
from scipy.stats import binomtest, chi2, norm


def mu_lower_from_two_groups(y_D, y_Dp, alpha=0.1, sens=1.0):
    """
    Lower-bound mu from repeated noisy releases on two neighbouring datasets.

    y_D:   array of releases on dataset D (same query, rerun mechanism many times)
    y_Dp:  array of releases on dataset D' (neighboring dataset)
    alpha: one-sided error; the bound holds with confidence ~1-alpha
    sens:  query sensitivity Δ (1.0 in our case)

    Model: Y = q(D) + N(0, sigma^2) and Y' = q(D') + N(0, sigma^2)
           mu = Δ / sigma

    Returns a tuple (mu_lo, sigma_up): the lower bound on mu and the matching
    upper bound on sigma. When the chi-square quantile is not positive
    (e.g. alpha == 0) no finite upper bound on sigma exists, so the vacuous
    pair (0.0, inf) is returned -- still a 2-tuple, so callers can always
    unpack the result.

    Raises ValueError if either group has fewer than 2 samples.
    """
    y_D = np.asarray(y_D, dtype=float)
    y_Dp = np.asarray(y_Dp, dtype=float)

    m, n = len(y_D), len(y_Dp)
    if m < 2 or n < 2:
        raise ValueError("Need at least 2 samples in each group to estimate sigma.")

    s2_D = np.var(y_D, ddof=1)
    s2_Dp = np.var(y_Dp, ddof=1)

    nu = (m - 1) + (n - 1)  # degrees of freedom
    sp2 = ((m - 1) * s2_D + (n - 1) * s2_Dp) / nu  # pooled variance

    # One-sided upper bound on sigma^2 via chi-square (use LOWER quantile!)
    chi2_lower = chi2.ppf(alpha, nu)
    if chi2_lower <= 0:
        return 0.0, float("inf")

    sigma_up = np.sqrt(nu * sp2 / chi2_lower)
    mu_lo = sens / sigma_up
    return float(max(mu_lo, 0.0)), float(sigma_up)


def _conf_upper_binom_cp(k, n, alpha_one_sided=0.05):
    """
    Upper bound for a binomial proportion using Clopper–Pearson.

    alpha_one_sided is the desired one-sided error rate.
    We approximate this by taking the upper endpoint of a two-sided CI
    with confidence_level = 1 - alpha_one_sided (conservative but standard).

    Returns 1.0 (the trivial bound) when n <= 0.
    """
    if n <= 0:
        return 1.0
    ci = binomtest(int(k), int(n)).proportion_ci(
        confidence_level=1 - alpha_one_sided, method="exact"
    )
    return ci.high


def _gdp_beta(mu, alpha):
    """Trade-off curve of mu-GDP: minimal FNR at FPR alpha, Φ(Φ⁻¹(1-α) - μ).

    This is the exact inverse of `_mu_from_fpr_fnr` (mu = Φ⁻¹(1-fpr) - Φ⁻¹(fnr)).
    """
    return norm.cdf(norm.ppf(1.0 - alpha) - mu)


######
class JointBetaMu:
    """
    Inspired from Bayesian Estimation of Differential Privacy (https://arxiv.org/abs/2206.05199)
    and https://github.com/microsoft/responsible-ai-toolbox-privacy/blob/66d2d45b8f57683b0390cfa63774abb70235e5da/privacy_estimates/joint_density.py#L118

    Joint-beta (Jeffreys) model for (FPR, FNR) + μ-GDP region inversion.

    Posterior:
      FPR ~ Beta(0.5+FP, 0.5+TN)
      FNR ~ Beta(0.5+FN, 0.5+TP)
    independent.
    """

    def __init__(self, fp, tn, fn, tp):
        self.fpr_post = beta_dist(0.5 + fp, 0.5 + tn)
        self.fnr_post = beta_dist(0.5 + fn, 0.5 + tp)

    def prob_mu_private(self, mu, epsabs=1e-6):
        """
        Probability mass of μ-GDP feasible region:
          fnr >= beta_from_mu(fpr, mu)
        under independent posteriors.
        """
        def integrand(fpr):
            # The original call was get_beta_from_gdp(fpr, mu), while the
            # notebook shipped with this code calls get_beta_from_gdp(mu, alpha).
            # Evaluating the GDP trade-off explicitly removes any dependence
            # on the library's positional-argument order.
            b = _gdp_beta(mu, fpr)
            return self.fpr_post.pdf(fpr) * (1.0 - self.fnr_post.cdf(b))

        p, _ = integrate.quad(integrand, 0.0, 1.0, epsabs=epsabs)
        return float(np.clip(p, 0.0, 1.0))

    def mu_lo(self, alpha=0.1, xtol=1e-3, max_mu=50.0):
        """
        Returns μ_lo such that the μ-GDP region contains alpha posterior mass.
        This mirrors the epsilon_estimation.DensityModel().eps_lo convention.
        """
        assert 0 < alpha < 1

        def objective(mu):
            return self.prob_mu_private(mu, epsabs=max(xtol / 5, 1e-6)) - alpha

        # If even μ=0 already contains >= alpha mass, lower bound is 0
        if objective(0.0) >= 0.0:
            return 0.0

        # Grow the upper bracket geometrically, capped at max_mu.
        lo, hi = 0.0, 1.0
        while objective(hi) < 0.0:
            hi *= 2
            if hi >= max_mu:
                hi = max_mu
                break

        # If still not enough mass even at max_mu, return max_mu (very conservative)
        if objective(hi) < 0.0:
            return float(max_mu)

        res = root_scalar(objective, bracket=[lo, hi], xtol=xtol, method="brentq")
        return float(res.root)
######


def _threshold_grid_from_scores(
    valid_scores,
    mode="quantiles",  # ["all_unique", "quantiles"]
    n_thresholds=200,
):
    """Build the sorted, de-duplicated candidate thresholds from validation scores."""
    if mode == "all_unique":
        return np.unique(valid_scores)
    if mode == "quantiles":
        qs = np.linspace(0, 1, n_thresholds)
        return np.unique(np.quantile(valid_scores, qs))
    raise ValueError("threshold_mode must be 'all_unique' or 'quantiles'")


def _eval_thresholds(scores_out, scores_in, thresholds, ci_method="bonferroni_cp", alpha=0.1):
    """Compute confusion/rates for each threshold.

    Returns a dict of parallel arrays (one entry per threshold) plus the
    scalar group sizes "P" (number of 'in' scores) and "N" ('out' scores).
    """
    P = len(scores_in)
    N = len(scores_out)

    TP, FN, FP, TN = [], [], [], []
    FPR, FNR, ADV, MU_HAT, MU_LOWER = [], [], [], [], []

    for t in thresholds:
        tp, fn, fp, tn = _confusion_at_threshold(scores_out, scores_in, float(t))
        fpr, fnr, adv = _rates_from_confusion(tp, fn, fp, tn)
        mu_hat = _mu_from_fpr_fnr(fpr, fnr)
        mu_lower = _mu_lo_from_counts(tp=tp, fn=fn, fp=fp, tn=tn, ci_method=ci_method, alpha=alpha)

        TP.append(tp)
        FN.append(fn)
        FP.append(fp)
        TN.append(tn)
        FPR.append(fpr)
        FNR.append(fnr)
        ADV.append(adv)
        MU_HAT.append(mu_hat)
        MU_LOWER.append(mu_lower)

    return {
        "thresholds": thresholds.astype(float),
        "TP": np.array(TP, dtype=int),
        "FN": np.array(FN, dtype=int),
        "FP": np.array(FP, dtype=int),
        "TN": np.array(TN, dtype=int),
        "FPR": np.array(FPR, dtype=float),
        "FNR": np.array(FNR, dtype=float),
        "advantage": np.array(ADV, dtype=float),
        "mu_hat": np.array(MU_HAT, dtype=float),
        "mu_lower": np.array(MU_LOWER, dtype=float),
        "P": np.array([P], dtype=int)[0],
        "N": np.array([N], dtype=int)[0],
    }


def _confusion_at_threshold(scores_out, scores_in, t):
    """Return (TP, FN, FP, TN) when predicting 'in' if score>=t."""
    P = len(scores_in)
    N = len(scores_out)
    tp = int(np.sum(scores_in >= t))
    fn = int(P - tp)
    fp = int(np.sum(scores_out >= t))
    tn = int(N - fp)
    return tp, fn, fp, tn


def _rates_from_confusion(tp, fn, fp, tn):
    """Return (FPR, FNR, advantage) where advantage = TPR - FPR.

    Empty groups are guarded with max(., 1), so their rates come out as 0.
    """
    P = tp + fn
    N = tn + fp
    fpr = fp / max(N, 1)
    fnr = fn / max(P, 1)
    tpr = tp / max(P, 1)
    adv = tpr - fpr
    return float(fpr), float(fnr), float(adv)


def _mu_from_fpr_fnr(fpr, fnr):
    """Compute μ from a single (FPR,FNR) point.

    Rates are clipped away from 0 and 1 so the normal quantiles stay finite;
    negative results are clipped to 0.
    """
    clip_eps = 1e-6
    fpr = np.clip(fpr, clip_eps, 1 - clip_eps)
    fnr = np.clip(fnr, clip_eps, 1 - clip_eps)
    mu = norm.ppf(1 - fpr) - norm.ppf(fnr)
    mu = np.clip(mu, 0, None)
    return mu


def _select_optimal_threshold(curve, threshold_selection):
    """Select optimal threshold from curve (validation curve)."""
    if threshold_selection == "max_advantage":
        idx = int(np.argmax(curve["advantage"]))
    elif threshold_selection == "max_mu_hat":
        idx = int(np.argmax(curve["mu_hat"]))
    elif threshold_selection == "max_mu_lower":
        idx = int(np.argmax(curve["mu_lower"]))
    else:
        raise ValueError("threshold_selection must be 'max_advantage', 'max_mu_hat' or 'max_mu_lower'")

    return float(curve["thresholds"][idx])


def _mu_lo_from_counts(tp, fn, fp, tn,
    ci_method="bonferroni_cp",  # "bonferroni_cp", "joint_beta"
    alpha=0.1,
):
    """Compute a single μ lower bound from one confusion tuple."""
    P = tp + fn
    N = tn + fp

    if ci_method == "bonferroni_cp":
        # Bonferroni across (FPR,FNR): alpha/2 per rate, then plug both upper
        # bounds into the point formula.
        alpha_each = alpha / 2
        fpr_u = _conf_upper_binom_cp(fp, N, alpha_one_sided=alpha_each)
        fnr_u = _conf_upper_binom_cp(fn, P, alpha_one_sided=alpha_each)
        return _mu_from_fpr_fnr(fpr_u, fnr_u)

    if ci_method == "joint_beta":
        jb = JointBetaMu(fp=fp, tn=tn, fn=fn, tp=tp)
        return jb.mu_lo(alpha=alpha)

    raise ValueError("ci_method must be 'bonferroni_cp' or 'joint_beta'")


def run_audit(
    out_data,
    in_data,
    n_train,
    n_valid,
    n_test,
    classifier="xgboost",  # "xgboost", "random_forest"
    threshold_mode="quantiles",  # "all_unique", "quantiles"
    n_thresholds=200,
    threshold_selection="max_advantage",  # "max_advantage", "max_mu_hat", "max_mu_lower"
    ci_method="joint_beta",  # "bonferroni_cp", "joint_beta"
    alpha=0.1,
    random_state=None,
):
    """
    1) Train attack model on TRAIN split.
    2) Compute scores on VALID and TEST (but DO NOT use TEST for threshold selection).
    3) Choose threshold t* using VALID only by objective:
        - max_advantage: maximize TPR - FPR
        - max_mu_hat: maximize μ_hat computed from (FPR,FNR)
        - max_mu_lower: maximize the μ lower bound (per `ci_method`)
    4) Return artifacts + full VALID diagnostics curves for plotting.

    The splits are taken in order from each of `out_data` / `in_data`:
    first n_train rows, then n_valid, then n_test; any further rows are ignored.
    `classifier="xgboost"` selects scikit-learn's GradientBoostingClassifier.

    Raises ValueError for an unknown `classifier`, or when either dataset has
    fewer than n_train + n_valid + n_test rows.
    """
    # Imported here so the pure numpy/scipy helpers in this module remain
    # importable without scikit-learn.
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.metrics import roc_auc_score

    n_eval = n_valid + n_test
    n_needed = n_train + n_eval
    if len(out_data) < n_needed or len(in_data) < n_needed:
        raise ValueError(
            "out_data and in_data must each contain at least "
            "n_train + n_valid + n_test rows"
        )

    # --- train attack model ---
    X_train = np.concatenate([out_data[:n_train], in_data[:n_train]])
    y_train = np.array([0] * n_train + [1] * n_train)

    if classifier == "xgboost":
        clf = GradientBoostingClassifier(random_state=random_state)
    elif classifier == "random_forest":
        clf = RandomForestClassifier(random_state=random_state)
    else:
        raise ValueError("classifier must be 'xgboost' or 'random_forest'")
    clf.fit(X_train, y_train)

    # --- compute scores for valid+test ---
    # Bounded slice: exactly n_valid + n_test rows per group, so the combined
    # AUC labels below always match the scores even when extra rows are given.
    out_scores_all = clf.predict_proba(out_data[n_train:n_needed])[:, 1]
    in_scores_all = clf.predict_proba(in_data[n_train:n_needed])[:, 1]

    out_scores_valid = out_scores_all[:n_valid]
    out_scores_test = out_scores_all[n_valid:n_eval]
    in_scores_valid = in_scores_all[:n_valid]
    in_scores_test = in_scores_all[n_valid:n_eval]

    # --- AUC diagnostics (do NOT use for selection) ---
    # valid auc
    y_valid = np.array([0] * n_valid + [1] * n_valid)
    scores_valid = np.concatenate([out_scores_valid, in_scores_valid])
    auc_valid = roc_auc_score(y_valid, scores_valid)
    auc_valid = max(auc_valid, 1 - auc_valid)

    # test auc
    y_test = np.array([0] * n_test + [1] * n_test)
    scores_test = np.concatenate([out_scores_test, in_scores_test])
    auc_test = roc_auc_score(y_test, scores_test)
    auc_test = max(auc_test, 1 - auc_test)

    # valid/test auc
    y_vt = np.array([0] * n_eval + [1] * n_eval)
    scores_vt = np.concatenate([out_scores_all, in_scores_all])
    auc_vt = roc_auc_score(y_vt, scores_vt)
    auc_vt = max(auc_vt, 1 - auc_vt)

    # --- extract thresholds grid from VALID only ---
    valid_scores = np.concatenate([out_scores_valid, in_scores_valid])
    thresholds = _threshold_grid_from_scores(valid_scores, mode=threshold_mode, n_thresholds=n_thresholds)

    # --- evaluate curve and select threshold on VALID only ---
    valid_curve = _eval_thresholds(out_scores_valid, in_scores_valid, thresholds, ci_method=ci_method, alpha=alpha)
    opt_t = _select_optimal_threshold(valid_curve, threshold_selection)
    valid_curve["opt_t"] = opt_t

    # --- evaluate/estimate on TEST ---
    tp, fn, fp, tn = _confusion_at_threshold(out_scores_test, in_scores_test, opt_t)
    fpr, fnr, adv = _rates_from_confusion(tp, fn, fp, tn)
    mu_hat = _mu_from_fpr_fnr(fpr, fnr)

    mu_lower = _mu_lo_from_counts(
        tp=tp, fn=fn, fp=fp, tn=tn,
        ci_method=ci_method,
        alpha=alpha,
    )

    return {
        "valid_test": {
            "auc": auc_vt,
        },
        "valid": {
            "auc": auc_valid,
            "curve": valid_curve,
        },
        "test": {
            "auc": auc_test,
            "point": {
                "TP": tp,
                "FN": fn,
                "FP": fp,
                "TN": tn,
                "FPR": fpr,
                "FNR": fnr,
                "advantage": adv,
                "mu_hat": mu_hat,
                "mu_lower": mu_lower,
            },
        },
    }


# ---------------------------------------------------------------------------
# experimental/audit_dpmm/code/mst/adp2gdp.py
# ---------------------------------------------------------------------------

# convert mu-GDP to (eps, delta)-DP using Equation (6) from Tight Auditing DPML paper
def delta_from_eps_mu(eps, mu):
    """Return the delta for which a mu-GDP mechanism is (eps, delta)-DP."""
    return norm.cdf(-eps / mu + mu / 2) - np.exp(eps) * norm.cdf(-eps / mu - mu / 2)


def mu_from_eps_delta(eps, delta):
    """Invert `delta_from_eps_mu` in mu for fixed eps by bracketed root finding.

    Raises RuntimeError if no upper bracket below 1e6 reaches `delta`.
    """
    # bracket search
    lo, hi = 1e-6, 50.0

    # expand hi if needed
    while delta_from_eps_mu(eps, hi) < delta:
        hi *= 2
        if hi > 1e6:
            raise RuntimeError("Failed to bracket μ")

    return brentq(lambda m: delta_from_eps_mu(eps, m) - delta, lo, hi)
# ---------------------------------------------------------------------------
# experimental/audit_dpmm/code/mst/mst.py
# ---------------------------------------------------------------------------
# A generative model training algorithm based on
# "Winning the NIST Contest: A scalable and general approach to differentially private synthetic data"
# by Ryan McKenna, Gerome Miklau, Daniel Sheldon
# Adapted from: https://github.com/ryan112358/private-pgm/blob/1da21c8b38149b05f1385b8e54116568b700b4fa/mechanisms/mst.py
# and
# Adapted from: https://github.com/sassoftware/dpmm/blob/752fd57480ec593a3b2b5950fd445e98cdedd7e3/src/dpmm/models/mst.py


import numpy as np
from logging import getLogger
from typing import Tuple, Optional
from numpy.random import RandomState

from dpmm.models.base.mbi import Dataset, Domain
from dpmm.models.base.mechanisms import cdp_rho
from dpmm.models.base.memory import model_size
from dpmm.models.base.mechanisms import Mechanism

from mst import mu_from_eps_delta


# This is a generalization of the winning mechanism from the
# 2018 NIST Differential Privacy Synthetic Data Competition.
#
# Unlike the original implementation, this one can work for any discrete dataset,
# and does not rely on public provisional data for measurement selection.


logger = getLogger("dpmm")


class MST(Mechanism):
    """
    Maximum Spanning Tree (MST) mechanism is a differentially private generative model relying
    on selecting an optimal set of marginals to approximate the joint distribution of the data.

    This audit variant is deliberately restricted (see the HARDCODED markers):
    the whole privacy budget is spent on measuring the 1-way marginals, and
    `_fit` selects only those 1-way marginals.

    NOTE(review): the upstream description states that higher-order marginals
    are selected with the exponential mechanism and that marginals are measured
    with the Laplace mechanism. Here only a noise scale `self.sigma` is derived
    (from rho-zCDP or from mu-GDP); the measurement itself happens in
    `Mechanism.measure`, which is not visible in this file -- confirm the noise
    distribution there.

    Ref: https://arxiv.org/pdf/2108.04978

    :param domain: The domain of the data.
    :type domain: Domain
    :param epsilon: Privacy budget.
    :type epsilon: float, optional
    :param delta: Privacy parameter.
    :type delta: float, optional
    :param n_iters: Number of iterations for inference.
    :type n_iters: int
    :param compress: Whether to compress the data.
    :type compress: bool
    :param GDP: If True, convert (epsilon, delta) directly to mu-GDP and set
        sigma = 1 / mu; otherwise go through rho-zCDP and set
        sigma = sqrt(1 / (2 * rho)).
    :type GDP: bool
    :param prng: Random state for reproducibility.
    :type prng: RandomState, optional
    :param max_model_size: Maximum model size in MB.
    :type max_model_size: int, optional
    :param structural_zeros: Structural zeros in the data.
    :type structural_zeros: dict, optional
    :param n_jobs: Number of parallel jobs.
    :type n_jobs: int
    """

    def __init__(
        self,
        domain: Domain,
        epsilon: Optional[float] = None,
        delta: Optional[float] = None,
        n_iters: int = 5000,
        compress: bool = False,
        GDP: bool = False,
        prng: Optional[RandomState] = None,
        max_model_size: Optional[int] = None,
        structural_zeros: Optional[dict] = None,
        n_jobs: int = -1,
    ):
        super().__init__(
            epsilon=epsilon,
            delta=delta,
            prng=prng,
            max_model_size=max_model_size,
            compress=compress,
            domain=domain,
            structural_zeros=structural_zeros,
            n_jobs=n_jobs,
        )

        if GDP:
            # HARDCODED -- 2 (convert ADP directly to GDP)
            self.rho = None
            mu = mu_from_eps_delta(self.epsilon, self.delta)
            self.sigma = 1 / mu
        else:
            self.rho = cdp_rho(self.epsilon, self.delta)
            # HARDCODED -- 1 (use all DP budget on 1-way marginals measurement)
            self.sigma = np.sqrt(1 / (2 * self.rho))

        self.n_iters = n_iters

    def _fit(self, data: Dataset, public: bool = False) -> Tuple[Dataset, list]:
        """
        Fit the MST mechanism to the data.

        :param data: The dataset.
        :type data: Dataset
        :param public: Whether the data is public. Defaults to False.
        :type public: bool, optional
        :return: The dataset and measurement log.
        :rtype: Tuple[Dataset, list]
        """
        # select all 1-way marginals
        self.cliques = cliques_1 = [(col,) for col in data.domain]

        log1 = self.measure(data, cliques=cliques_1, public=public)
        # compress domain of all 1-way marginals
        if self.compress:
            log1 = self.compressor.fit(log1)
            data = self.compressor.transform(data)

        self.model_size = model_size(data, cliques_1)
        # HARDCODED - 3 (only select all 1-way marginals)

        return data, log1


# ---------------------------------------------------------------------------
# experimental/audit_dpmm/code/run_attack.py
# ---------------------------------------------------------------------------
import string
import pickle
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from itertools import product
from multiprocessing import Pool, cpu_count

from mst import MST


N_ROWS = 10
N_COLS = 3
N_ALL = 5000
LEN_SYNTH = 25

EPSILON = 1
DELTA = 1e-2


def featurize_df_queries(df, queries):
    """Black-box features: for each query row, count the rows of `df` equal to it."""
    features = np.zeros(len(queries))
    for i, query in enumerate(queries):
        features[i] = (df == query).all(axis=1).sum()
    return features.astype(int)


def featurize_model(model, columns):
    """White-box features: the measured marginal of every column, 2 slots each.

    Reads `model.measures`; each entry is indexed as entry[1] (measured values)
    and entry[3] (the tuple of columns it covers).
    NOTE(review): the exact layout of a measure entry is defined in dpmm, not
    here -- confirm. The 2 slots per column assume binary columns.
    """
    meas = model.measures

    measures = np.zeros(2 * len(columns))
    for col_idx, col in enumerate(columns):
        # Prefer the smallest projection that contains this column.
        col_proj = sorted([_meas for _meas in meas if col in _meas[3]], key=lambda x: len(x[3]))

        proj = col_proj[0][3]
        _meas = col_proj[0][1]
        _meas = _meas.reshape(*[_meas.size // 2**(len(proj) - 1) for _ in proj])

        if len(col_proj[0][3]) > 1:
            # Marginalise every axis except the one belonging to `col`.
            axis = col_proj[0][3].index(col)
            _meas = np.sum(_meas, axis=tuple([i for i in range(len(_meas.shape)) if i != axis]))

        measures[2 * col_idx: (2 * col_idx) + _meas.shape[0]] = _meas

    return measures


def _fit_and_featurize(df, columns, domain, queries, epsilon, delta, len_synth):
    """Fit one MST model on `df`, sample from it, and return its feature vector."""
    gen = MST(epsilon=epsilon, delta=delta, domain=domain, compress=False, n_jobs=1)
    gen.fit(df)
    synth = gen.generate(len_synth)
    return np.concatenate([featurize_df_queries(synth, queries), featurize_model(gen, columns)])


def one_iteration(args):
    """Run one audit repetition: features of a model trained without / with the target row.

    `args` is the task tuple built in `main`; returns (i, out_feats, in_feats).
    NOTE(review): no `prng` is passed to MST, so independence across worker
    processes relies on how dpmm seeds its default generator -- confirm.
    """
    i, df_out, df_in, columns, domain, queries, epsilon, delta, len_synth = args

    # out data
    out_feats = _fit_and_featurize(df_out, columns, domain, queries, epsilon, delta, len_synth)

    # in data
    in_feats = _fit_and_featurize(df_in, columns, domain, queries, epsilon, delta, len_synth)

    return i, out_feats, in_feats


def main():
    """Generate N_ALL out/in feature vectors in parallel and pickle them."""
    # Create the output directory up front so that a missing ../data cannot
    # discard the results after all N_ALL fit/generate runs have completed.
    out_path = Path('../data/features.pkl')
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # data
    columns = list(string.ascii_uppercase[:N_COLS])
    domain = {col: 2 for col in columns}

    df_out = pd.DataFrame(np.zeros((N_ROWS, N_COLS), dtype=int), columns=columns)
    df_in = pd.DataFrame(np.vstack([np.ones((1, N_COLS), dtype=int), np.zeros((N_ROWS, N_COLS), dtype=int)]), columns=columns)

    # black-box + white-box features
    queries = np.array(list(product([0, 1], repeat=N_COLS)))
    n_features = len(queries) + 2 * len(columns)
    data = {"out": np.zeros([N_ALL, n_features]), "in": np.zeros([N_ALL, n_features])}

    # build tasks
    tasks = [(i, df_out, df_in, columns, domain, queries, EPSILON, DELTA, LEN_SYNTH) for i in range(N_ALL)]
    n_cpu = max(1, cpu_count() - 1)

    with Pool(processes=n_cpu, maxtasksperchild=1) as pool:
        # Results arrive out of order; each one carries its own row index i.
        for i, out_row, in_row in tqdm(
            pool.imap_unordered(one_iteration, tasks, chunksize=1),
            total=N_ALL,
            desc="it",
            leave=False,
        ):
            data["out"][i, :] = out_row
            data["in"][i, :] = in_row

    with out_path.open('wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)


if __name__ == "__main__":
    main()
"import matplotlib.pyplot as plt\n", + "\n", + "from riskcal.analysis import get_beta_from_adp, get_beta_from_zcdp, get_beta_from_gdp, get_advantage_from_gdp\n", + "from dpmm.models.base.mechanisms import cdp_rho\n", + "from mst import mu_from_eps_delta\n", + "from audit_utils import run_audit, mu_lower_from_two_groups" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7626c21", + "metadata": {}, + "outputs": [], + "source": [ + "EPSILON = 1.0\n", + "DELTA = 1e-2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c4f6b01", + "metadata": {}, + "outputs": [], + "source": [ + "N_TRAIN = 2000\n", + "N_VALID = 1000\n", + "N_TEST = 2000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5320fcef", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9931c23", + "metadata": {}, + "outputs": [], + "source": [ + "THEORY_RHO= cdp_rho(EPSILON, DELTA)\n", + "IMPLIED_MU = np.sqrt(2*THEORY_RHO)\n", + "print(f\"Implied mu: {IMPLIED_MU} <---\")\n", + "\n", + "\n", + "THEORY_MU = mu_from_eps_delta(EPSILON, DELTA)\n", + "print(f\"Theory mu: {THEORY_MU}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05bc4f2b", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62395d6f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab427b5d", + "metadata": {}, + "outputs": [], + "source": [ + "with open('../data/features.pkl', 'rb') as handle:\n", + " features = pickle.load(handle)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cace5da0-acde-43a3-8edd-efd41826350a", + "metadata": {}, + "outputs": [], + "source": [ + "default_results = run_audit(features[\"out\"],\n", + " features[\"in\"],\n", + " n_train=N_TRAIN,\n", + " n_valid=N_VALID,\n", + " n_test=N_TEST,\n", + " 
random_state=13)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b6388eb-0304-4888-b59f-f983595e9be1", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Empirical mu: {default_results['test']['point']['mu_lower']} <--\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7aa8b52-3664-4640-9988-7069c5c8ffdd", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e124a42-0704-443d-ba5f-e043a66566c5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0347e26c-e25d-40ac-a404-ea5b3518a1b1", + "metadata": {}, + "outputs": [], + "source": [ + "def adp_frontier_from_eps_delta(epsilon, delta, n_points=500):\n", + " \"\"\"\n", + " Returns theoretical ADP frontier curve\n", + " \"\"\"\n", + " clip_eps = 1e-6\n", + " alpha = np.linspace(clip_eps, 1 - clip_eps, n_points)\n", + " beta = get_beta_from_adp(epsilon, delta, alpha)\n", + " return alpha, beta\n", + "\n", + "\n", + "def zcdp_frontier_from_rho(rho, n_points=500):\n", + " \"\"\"\n", + " Returns theoretical zCDP frontier curve\n", + " \"\"\"\n", + " clip_eps = 1e-6\n", + " alpha = np.linspace(clip_eps, 1 - clip_eps, n_points)\n", + " beta = get_beta_from_zcdp(rho, alpha)\n", + " return alpha, beta\n", + "\n", + "\n", + "def gdp_frontier_from_mu(mu, n_points=500):\n", + " \"\"\"\n", + " Returns theoretical GDP frontier curve\n", + " \"\"\"\n", + " clip_eps = 1e-6\n", + " alpha = np.linspace(clip_eps, 1 - clip_eps, n_points)\n", + " beta = get_beta_from_gdp(mu, alpha)\n", + " return alpha, beta\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f91bbc97-491d-4579-8a86-c4a8345f3288", + "metadata": {}, + "outputs": [], + "source": [ + "val_curve = default_results[\"valid\"][\"curve\"]\n", + "fpr = val_curve[\"FPR\"]\n", + "fnr = val_curve[\"FNR\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "id": "4860c03e-354b-4746-ac8d-5e1ec92820b6", + "metadata": {}, + "outputs": [], + "source": [ + "alpha_th_eps, beta_th_eps = adp_frontier_from_eps_delta(EPSILON, DELTA)\n", + "alpha_th_rho, beta_th_rho = zcdp_frontier_from_rho(THEORY_RHO)\n", + "alpha_th_mu_imp, beta_th_mu_imp = gdp_frontier_from_mu(IMPLIED_MU)\n", + "alpha_th_mu, beta_th_mu = gdp_frontier_from_mu(THEORY_MU)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6a7ac340-cebf-44b6-815c-8320ae0de57d", + "metadata": {}, + "outputs": [], + "source": [ + "fig = plt.figure(figsize=(6,6))\n", + "plt.gca().set_aspect('equal', adjustable='box')\n", + "\n", + "# Empirical audit (primary)\n", + "plt.plot(\n", + " fpr, fnr,\n", + " color=\"black\",\n", + " linewidth=3.0,\n", + " alpha=0.95,\n", + " label=\"Empirical audit\"\n", + ")\n", + "\n", + "# μ-GDP via zCDP (primary theory)\n", + "plt.plot(\n", + " alpha_th_mu_imp, beta_th_mu_imp,\n", + " color=\"red\",\n", + " linewidth=3.0,\n", + " alpha=0.95,\n", + " label=r\"$\\mu$-GDP (via $\\rho$-zCDP)\"\n", + ")\n", + "\n", + "# μ-GDP direct (secondary theory)\n", + "plt.plot(\n", + " alpha_th_mu, beta_th_mu,\n", + " color=\"red\",\n", + " linestyle=\"--\",\n", + " linewidth=2.0,\n", + " alpha=0.65,\n", + " label=r\"$\\mu$-GDP (via $(\\epsilon,\\delta)$-DP)\"\n", + ")\n", + "\n", + "# zCDP frontier (context)\n", + "plt.plot(\n", + " alpha_th_rho, beta_th_rho,\n", + " color=\"royalblue\",\n", + " linestyle=\"-.\",\n", + " linewidth=2.0,\n", + " alpha=0.65,\n", + " label=r\"$\\rho$-zCDP (context)\"\n", + ")\n", + "\n", + "# (ε,δ)-DP frontier (context)\n", + "plt.plot(\n", + " alpha_th_eps, beta_th_eps,\n", + " color=\"gray\",\n", + " linestyle=\":\",\n", + " linewidth=2.0,\n", + " alpha=0.5,\n", + " label=r\"$(\\epsilon,\\delta)$-DP (context)\"\n", + ")\n", + "\n", + "# 45-degree random-guess baseline: β = 1 − α\n", + "alpha_diag = np.linspace(0, 1, 200)\n", + "plt.plot(\n", + " alpha_diag, 1 - alpha_diag,\n", + " color=\"gray\",\n", + " 
linestyle=\"--\",\n", + " linewidth=2.0,\n", + " alpha=0.5,\n", + " label=\"Random guess\"\n", + ")\n", + "\n", + "plt.xlabel(\"FPR (α)\", fontsize=12)\n", + "plt.ylabel(\"FNR (β)\", fontsize=12)\n", + "plt.xlim(0, 1)\n", + "plt.ylim(0, 1)\n", + "\n", + "plt.legend(loc=\"upper right\", fontsize=12)\n", + "\n", + "plt.grid(alpha=0.12)\n", + "plt.tight_layout()\n", + "plt.show()\n", + "# fig.savefig(\"../data/tradeoff.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2586db4f-3bb5-42cb-9314-db821e0aaf11", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a179941-78da-46f2-9488-0ae27463d978", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c9c2c8d", + "metadata": {}, + "outputs": [], + "source": [ + "t = val_curve[\"thresholds\"]\n", + "fpr = val_curve[\"FPR\"]\n", + "fnr = val_curve[\"FNR\"]\n", + "tpr = 1 - fnr\n", + "adv = val_curve[\"advantage\"]\n", + "mu_hat = val_curve[\"mu_hat\"]\n", + "\n", + "# selected threshold index\n", + "t_sel = val_curve[\"opt_t\"]\n", + "idx_sel = int(np.argmin(np.abs(t - t_sel)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee19b711", + "metadata": {}, + "outputs": [], + "source": [ + "# --- Figure & axes ---\n", + "fig, ax = plt.subplots()\n", + "\n", + "# FORCE square plotting box\n", + "# ax.set_box_aspect(1)\n", + "\n", + "# --- Baseline ---\n", + "# ax.axhline(y=0.0, color=\"lightgray\", linestyle=\":\", linewidth=1.5, label=\"Random guess (adv.=0)\")\n", + "\n", + "# --- FPR / FNR ---\n", + "ax.plot(t, fpr, color=\"tab:blue\", linewidth=2, alpha=0.65, label=\"FPR (α)\")\n", + "\n", + "ax.plot(t, fnr, color=\"tab:orange\", linewidth=2, alpha=0.65, label=\"FNR (β)\")\n", + "\n", + "# --- Advantage (primary signal) ---\n", + "ax.plot(t, adv, color=\"darkgreen\", linewidth=3, alpha=0.95, label=\"Empirical advantage\")\n", + "\n", + "# 
--- Theory reference ---\n", + "adv_theory = get_advantage_from_gdp(IMPLIED_MU)\n", + "\n", + "ax.axhline(y=adv_theory, color=\"gray\", linestyle=\"-.\", linewidth=2, alpha=0.5, label=r\"Theory advantage ($\\mu$-GDP)\")\n", + "\n", + "# --- Selected threshold ---\n", + "ax.axvline(t_sel, color=\"black\", linestyle=\"--\", linewidth=2, alpha=0.95, label=r\"Selected $\\tau^*$\")\n", + "\n", + "# Star marker\n", + "# ax.scatter([t_sel], [adv[idx_sel]], s=160, marker=\"*\", color=\"black\", zorder=5)\n", + "\n", + "# Annotation\n", + "# ax.text(t_sel + 0.015, adv[idx_sel], rf\"$\\tau^\\star={t_sel:.2f}$\", fontsize=11, va=\"center\")\n", + "\n", + "# --- Axes styling ---\n", + "ax.set_xlabel(r\"Threshold $\\tau$\", fontsize=12)\n", + "ax.set_ylabel(\"Rate\", fontsize=12)\n", + "\n", + "ax.set_xlim(0,1)\n", + "ax.set_ylim(0,1)\n", + "\n", + "# --- Legend (inside, clean) ---\n", + "handles, labels = ax.get_legend_handles_labels()\n", + "order = [\n", + " labels.index(r\"Selected $\\tau^*$\"),\n", + " labels.index(\"Empirical advantage\"),\n", + " labels.index(r\"Theory advantage ($\\mu$-GDP)\"),\n", + " labels.index(\"FPR (α)\"),\n", + " labels.index(\"FNR (β)\"),\n", + "]\n", + "ax.legend([handles[i] for i in order], [labels[i] for i in order],\n", + " loc=\"upper right\", fontsize=11)\n", + "\n", + "plt.grid(alpha=0.12)\n", + "plt.tight_layout()\n", + "plt.show()\n", + "# fig.savefig(\"../data/valid.pdf\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0094d407", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d17410be", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fd890bce-a532-4cb2-9a49-0574abd917aa", + "metadata": {}, + "outputs": [], + "source": [ + "results_all = {}\n", + "\n", + "# baseline\n", + "baseline_ww_out, baseline_ww_in = features[\"out\"][:, 9], features[\"in\"][:, 9]\n", + "results, _ 
= mu_lower_from_two_groups(baseline_ww_out, baseline_ww_in)\n", + "results_all[\"Baseline\"] = results\n", + "\n", + "# threshold_selection -- max_mu_hat\n", + "results = run_audit(features[\"out\"],\n", + " features[\"in\"],\n", + " n_train=N_TRAIN,\n", + " n_valid=N_VALID,\n", + " n_test=N_TEST,\n", + " threshold_selection=\"max_mu_hat\",\n", + " random_state=13)\n", + "results_all[r\"$\\hat{\\mu}$\"] = results['test']['point']['mu_lower']\n", + "\n", + "# ci_method -- bonferroni_cp\n", + "results = run_audit(features[\"out\"],\n", + " features[\"in\"],\n", + " n_train=N_TRAIN,\n", + " n_valid=N_VALID,\n", + " n_test=N_TEST,\n", + " ci_method=\"bonferroni_cp\",\n", + " random_state=13)\n", + "results_all[\"Clopper–Pearson\"] = results['test']['point']['mu_lower']\n", + "\n", + "# D_out size -- 2\n", + "# with open('../data/features_2.pkl', 'rb') as handle:\n", + "# features_2 = pickle.load(handle)\n", + "# results = run_audit(features_2[\"out\"],\n", + "# features_2[\"in\"],\n", + "# n_train=N_TRAIN,\n", + "# n_valid=N_VALID,\n", + "# n_test=N_TEST,\n", + "# random_state=13)\n", + "# results_all[\"$|D_{out}| = 2$\"] = results['test']['point']['mu_lower']\n", + "results_all[\"$|D_{out}| = 2$\"] = 0.29832043950664605\n", + "\n", + "# classifier -- random_forest\n", + "results = run_audit(features[\"out\"],\n", + " features[\"in\"],\n", + " n_train=N_TRAIN,\n", + " n_valid=N_VALID,\n", + " n_test=N_TEST,\n", + " classifier=\"random_forest\",\n", + " random_state=13)\n", + "results_all[\"Random Forest\"] = results['test']['point']['mu_lower']\n", + "\n", + "# threat model -- black-box\n", + "results = run_audit(features[\"out\"][:, :8],\n", + " features[\"in\"][:, :8],\n", + " n_train=N_TRAIN,\n", + " n_valid=N_VALID,\n", + " n_test=N_TEST,\n", + " random_state=13)\n", + "results_all[\"Black-box\"] = results['test']['point']['mu_lower']\n", + "\n", + "# threat model -- white-box\n", + "results = run_audit(features[\"out\"][:, 8:],\n", + " features[\"in\"][:, 8:],\n", 
+ " n_train=N_TRAIN,\n", + " n_valid=N_VALID,\n", + " n_test=N_TEST,\n", + " random_state=13)\n", + "results_all[\"White-box\"] = results['test']['point']['mu_lower']\n", + "\n", + "# default\n", + "results_all[\"Default\"] = default_results['test']['point']['mu_lower']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aeac441-8509-40e0-953b-a59fbcbbfeb0", + "metadata": {}, + "outputs": [], + "source": [ + "# results = {\n", + "# \"Baseline\": 0.2601335097836669,\n", + "# r\"$\\hat{\\mu}$\": 0.0,\n", + "# \"Clopper–Pearson\": 0.2796166719825678,\n", + "# r\"$|D_{out}| = 2$\": 0.29832043950664605,\n", + "# \"Random Forest\": 0.32397972624803917,\n", + "# \"Black-box\": 0.3901927720079095,\n", + "# \"White-box\": 0.4150704207293557,\n", + "# \"Default\": 0.42617713009324837,\n", + "# }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f775da6-6557-4b71-8dd5-39046d05e422", + "metadata": {}, + "outputs": [], + "source": [ + "names = list(results_all.keys())\n", + "values = list(results_all.values())\n", + "\n", + "fig = plt.figure(figsize=(7, 5))\n", + "\n", + "bars = plt.bar(names, values, color=\"black\", label=\"Empirical audit\")\n", + "\n", + "# ---- highlight the Default bar ----\n", + "for bar, name in zip(bars, names):\n", + " if name == \"Default\":\n", + " bar.set_edgecolor(\"red\")\n", + " bar.set_linewidth(5)\n", + "\n", + "for bar in bars:\n", + " h = bar.get_height()\n", + " plt.text(bar.get_x() + bar.get_width()/2, h + 0.003, f\"{h:.2f}\",\n", + " ha=\"center\", va=\"bottom\", fontsize=11)\n", + "\n", + "# ---- theoretical line ----\n", + "plt.axhline(\n", + " IMPLIED_MU,\n", + " linestyle=\"-\",\n", + " linewidth=3.0,\n", + " color=\"red\",\n", + " alpha=0.95,\n", + " label=r\"Theory $\\mu$ (via $\\rho$-zCDP)\",\n", + ")\n", + "\n", + "plt.legend(\n", + " loc=\"upper left\",\n", + " bbox_to_anchor=(0.01, 0.95), # move down a bit\n", + " fontsize=12\n", + ")\n", + "\n", + "plt.ylabel(r\"$\\mu_{emp}$\", 
fontsize=12)\n", + "# plt.xlabel(\"Ablation setting\")\n", + "\n", + "plt.grid(axis=\"y\", linestyle=\"--\", alpha=0.12)\n", + "plt.xticks(rotation=30, fontsize=12)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "# fig.savefig(\"../data/abl.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51dc3a22-1a31-46e9-bdd7-79c99d30f5e5", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pgm-audit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/experimental/audit_dpmm/data/abl.pdf b/experimental/audit_dpmm/data/abl.pdf new file mode 100644 index 0000000..96bdff2 Binary files /dev/null and b/experimental/audit_dpmm/data/abl.pdf differ diff --git a/experimental/audit_dpmm/data/features.pkl b/experimental/audit_dpmm/data/features.pkl new file mode 100644 index 0000000..ac41a5d Binary files /dev/null and b/experimental/audit_dpmm/data/features.pkl differ diff --git a/experimental/audit_dpmm/data/tradeoff.pdf b/experimental/audit_dpmm/data/tradeoff.pdf new file mode 100644 index 0000000..b4527b6 Binary files /dev/null and b/experimental/audit_dpmm/data/tradeoff.pdf differ diff --git a/experimental/audit_dpmm/data/valid.pdf b/experimental/audit_dpmm/data/valid.pdf new file mode 100644 index 0000000..ea9c12b Binary files /dev/null and b/experimental/audit_dpmm/data/valid.pdf differ diff --git a/experimental/audit_dpmm/environment.yml b/experimental/audit_dpmm/environment.yml new file mode 100644 index 0000000..ef0713f --- /dev/null +++ b/experimental/audit_dpmm/environment.yml @@ -0,0 +1,14 @@ +name: pgm-audit +channels: + - defaults +dependencies: + - python=3.11 + - 
tqdm + - scikit-learn + - pandas + - matplotlib + - jupyterlab + - pip + - pip: + - dpmm + - riskcal