Source code for missmecha.generate.mcar

import numpy as np


[docs] class MCARType1: """ MCAR Mechanism - Type 1 (Uniform Independent Masking) Randomly masks entries with a uniform probability across the entire dataset. This mechanism applies a global missing rate independently at each cell. Parameters ---------- missing_rate : float, default=0.1 The proportion of values to randomly set as missing (0 ≤ missing_rate ≤ 1). seed : int, default=1 Random seed for reproducibility. """ def __init__(self, missing_rate=0.1, seed=42): self.missing_rate = missing_rate self.seed = seed self.fitted = False # 一般MCAR不需要 fit, 但我们保持统一接口
[docs] def fit(self, X, y=None): """ Placeholder fit method for interface compatibility. MCARType1 does not require fitting, but this method sets a flag for internal consistency. """ # MCAR 不依赖 X 或 y,fit 只是设置标志位 self.fitted = True return self
[docs] def transform(self, X): """ Apply MCARType1 transformation to introduce missingness. Each entry in the dataset has an independent probability of being set to NaN. Parameters ---------- X : np.ndarray Input array to apply missingness (converted to float). Returns ------- X_missing : np.ndarray The same array with missing values inserted. """ if not self.fitted: raise RuntimeError("MCARType1 must be fit before calling transform.") rng = np.random.default_rng(self.seed) X = X.astype(float) mask = rng.uniform(0, 1, size=X.shape) < self.missing_rate X_missing = X.copy() X_missing[mask] = np.nan return X_missing
[docs] class MCARType2: """ MCAR Mechanism - Type 2 (Random Cell Selection) Randomly selects a fixed number of entries based on the overall missing rate, and masks exactly that number of cells across the dataset. Parameters ---------- missing_rate : float, default=0.1 The proportion of values to randomly set as missing (0 ≤ missing_rate ≤ 1). seed : int, default=1 Random seed for reproducibility. """ def __init__(self, missing_rate=0.1, seed=1): self.missing_rate = missing_rate self.seed = seed self.fitted = False
[docs] def fit(self, X, y=None): """ Placeholder fit method for interface compatibility. MCARType2 does not require fitting, but this method sets a flag for internal consistency. """ # MCAR 不依赖 X/y,fit 仅作为流程接口 self.fitted = True return self
[docs] def transform(self, X): """ Apply MCARType2 transformation to introduce missingness. Randomly masks a fixed number of values across the entire array, based on the global missing rate. Parameters ---------- X : np.ndarray Input array to apply missingness (converted to float). Returns ------- X_missing : np.ndarray Array with missing entries randomly inserted. """ if not self.fitted: raise RuntimeError("MCARType2 must be fit before calling transform.") if not isinstance(X, np.ndarray): raise TypeError("Input must be a NumPy array.") if not (0 <= self.missing_rate <= 1): raise ValueError("missing_rate must be between 0 and 1.") rng = np.random.default_rng(self.seed) X = X.astype(float) total_elements = X.size num_missing = int(round(total_elements * self.missing_rate)) X_missing = X.copy() flat_indices = rng.choice(total_elements, size=num_missing, replace=False) multi_indices = np.unravel_index(flat_indices, X.shape) X_missing[multi_indices] = np.nan return X_missing
[docs] class MCARType3: """ MCAR Mechanism - Type 3 (Column-wise Balanced Missingness) Applies missingness to each column independently, with approximately equal number of missing entries per column. Parameters ---------- missing_rate : float, default=0.1 The total proportion of missing values in the dataset. seed : int, default=1 Random seed for reproducibility. """ def __init__(self, missing_rate=0.1, seed=1): self.missing_rate = missing_rate self.seed = seed self.fitted = False
[docs] def fit(self, X, y=None): """ Placeholder fit method for interface compatibility. MCARType2 does not require fitting, but this method sets a flag for internal consistency. """ self.fitted = True return self
[docs] def transform(self, X): """ Apply MCARType3 transformation to introduce missingness. Ensures that missing values are approximately evenly distributed across columns. Parameters ---------- X : np.ndarray Input array to apply missingness (converted to float). Returns ------- X_missing : np.ndarray Array with missing values inserted in a column-balanced way. """ if not self.fitted: raise RuntimeError("MCARType3 must be fit before calling transform.") if not isinstance(X, np.ndarray): raise TypeError("Input must be a NumPy array.") if not (0 <= self.missing_rate <= 1): raise ValueError("missing_rate must be between 0 and 1.") rng = np.random.default_rng(self.seed) X = X.astype(float) n, p = X.shape total_cells = n * p total_missing = int(round(total_cells * self.missing_rate)) missing_per_col = total_missing // p X_missing = X.copy() for j in range(p): if missing_per_col > 0: rows = rng.choice(n, size=missing_per_col, replace=False) X_missing[rows, j] = np.nan return X_missing
MCAR_TYPES = { 1: MCARType1, 2: MCARType2, 3: MCARType3, }