Source code for missmecha.generate.marcat

import numpy as np
import pandas as pd

class MARCatType1:
    """
    MAR Mechanism - Categorical Type 1 (Category-Conditioned Row-Wise Masking)

    Simulates Missing At Random (MAR) by introducing missingness **across rows**,
    conditioned on the value of a categorical feature. Each category is assigned
    a random masking probability, scaled so that the expected fraction of masked
    rows (weighted by how often each category occurs in the data seen by
    `fit()`) matches `missing_rate`.

    A masked row has **every** column set to NaN, including the conditioning
    column itself.

    Parameters
    ----------
    missing_rate : float, default=0.1
        Target fraction of rows to mask across the dataset.
    seed : int, default=1
        Random seed for reproducibility.
    cat_column : str or None
        Name of the categorical column used to drive missingness.
        If None, a column is randomly selected from the input DataFrame
        during `fit()` and stored back on `self.cat_column`.
    """

    def __init__(self, missing_rate=0.1, seed=1, cat_column=None):
        self.missing_rate = missing_rate
        self.seed = seed
        self.cat_column = cat_column
        self.fitted = False

    def _verbose(self, msg):
        """Print `msg` prefixed with the class name."""
        print(f"[{self.__class__.__name__}] {msg}")

    def fit(self, X, y=None):
        """
        Fit the masking distribution conditioned on a categorical column.

        Draws one uniform random number per category and rescales the draws so
        that the frequency-weighted average probability equals `missing_rate`.
        Probabilities are clipped to ``[0, 1]``, so the target can be undershot
        when a very high rate would push a category above 1.

        `X` is not modified.

        Parameters
        ----------
        X : pd.DataFrame
            Input DataFrame containing the categorical column.
        y : Ignored
            Included for interface compatibility.

        Returns
        -------
        self : MARCatType1
            Fitted object with `classes` and `class_probs` set.

        Raises
        ------
        ValueError
            If `X` is not a pandas DataFrame.
        """
        rng = np.random.default_rng(self.seed)

        if not isinstance(X, pd.DataFrame):
            raise ValueError("MARCatType1 currently requires pandas DataFrame input.")

        # If no category column was specified, pick one at random
        # (it is assumed to be categorical).
        if self.cat_column is None:
            self.cat_column = rng.choice(X.columns)
            self._verbose(f"No categorical column specified. Randomly selected '{self.cat_column}' as cat_column.")

        # Work on a local categorical view of the column so the caller's
        # DataFrame keeps its original dtype.
        column = X[self.cat_column]
        if not isinstance(column.dtype, pd.CategoricalDtype):
            column = column.astype("category")

        self.classes = column.cat.categories
        raw = rng.uniform(0, 1, len(self.classes))

        # Relative frequency of each category over all rows; code -1 marks
        # missing values, which belong to no category and are never masked.
        codes = column.cat.codes.to_numpy()
        counts = np.bincount(codes[codes >= 0], minlength=len(self.classes))
        n_rows = len(column)
        weights = counts / n_rows if n_rows else np.zeros(len(self.classes))

        # Scale so that sum(weight_c * prob_c) == missing_rate, i.e. the
        # expected share of masked rows hits the target.
        expected = float(np.dot(weights, raw))
        if expected > 0:
            probs = raw * (self.missing_rate / expected)
        else:
            # No observed category to weight by (empty frame or all-NaN column).
            probs = np.full(len(self.classes), float(self.missing_rate))
        probs = np.clip(probs, 0.0, 1.0)

        self.class_probs = dict(zip(self.classes, probs))
        self.fitted = True
        return self

    def transform(self, X):
        """
        Apply row-wise missingness based on category-conditioned probabilities.

        For each fitted category, every row of that category is masked
        independently with the category's probability; all columns of a masked
        row are set to NaN. The generator is re-seeded from `self.seed` on each
        call, so repeated calls on the same input return the same result.

        `X` is not modified.

        Parameters
        ----------
        X : pd.DataFrame
            Input DataFrame to transform.

        Returns
        -------
        X_missing : pd.DataFrame
            New DataFrame with row-level missing values introduced.

        Raises
        ------
        RuntimeError
            If called before `fit()`.
        ValueError
            If `X` is not a pandas DataFrame.
        """
        if not self.fitted:
            raise RuntimeError("Call .fit() before transform().")
        if not isinstance(X, pd.DataFrame):
            raise ValueError("MARCatType1 currently requires pandas DataFrame input.")

        rng = np.random.default_rng(self.seed)
        values = X[self.cat_column]

        # Positional per-row mask, filled one category at a time in the order
        # the categories were fitted.
        row_mask = np.zeros(len(X), dtype=bool)
        for cat, prob in self.class_probs.items():
            in_cat = (values == cat).fillna(False).to_numpy(dtype=bool)
            row_mask[in_cat] = rng.random(size=int(in_cat.sum())) < prob

        # DataFrame.mask needs an array condition with the frame's exact shape,
        # so expand the row mask across every column.
        full_mask = np.repeat(row_mask[:, None], X.shape[1], axis=1)
        return X.mask(full_mask)
# Registry mapping a categorical-MAR mechanism type id to its implementing class.
# Only id 1 is registered; entries for ids 2-8 (MARType2 ... MARType8) were
# left commented out in the original source.
MARCAT_TYPES = {
    1: MARCatType1,
}