Source code for missmecha.generate.mnarcat
import numpy as np
import random
[docs]
class MNARCatType1:
    """
    MNAR mechanism for mixed-type tabular data (column-wise variant).

    Missingness is injected column by column, using a rule selected by the
    type declared for that column in ``col_info``:

    - numerical: values strictly below the ``q``-quantile of the column
      are masked.
    - ordinal: rows whose value is at least ``max_rank - 2`` (where
      ``max_rank`` is the largest value in the column's ordinal map) are
      targeted.
    - nominal: rows belonging to one randomly chosen category are targeted.

    For ordinal and nominal columns exactly ``int(n_rows * q)`` entries are
    masked. If fewer rows are targeted than that, all of them are masked and
    the remainder is drawn at random from the other rows.

    Parameters
    ----------
    q : float, default=0.2
        Quantile (numerical columns) or proportion of rows (ordinal and
        nominal columns) used for masking. Must lie in ``[0, 1]``.
    seed : int, default=1
        Seed of the private random generator used by ``transform``.
    """

    def __init__(self, q=0.2, seed=1):
        self.q = q
        self.seed = seed
        self.fitted = False

    def fit(self, X, col_info):
        """
        Store the column description; no statistics are learned from ``X``.

        Parameters
        ----------
        X : np.ndarray
            Input data array (unused, accepted for API compatibility).
        col_info : dict
            Maps a column index (anything ``int()`` accepts) to a container
            holding one of the keys ``"numerical"``, ``"ordinal"`` or
            ``"nominal"``. For ordinal columns the value stored under
            ``"ordinal"`` is a mapping whose values are the numeric ranks.

        Returns
        -------
        self : MNARCatType1
            Returns self.
        """
        self.col_info = col_info
        self.fitted = True
        return self

    def transform(self, X):
        """
        Apply MNAR masking to the columns listed in ``col_info``.

        Parameters
        ----------
        X : array-like of shape (n_rows, n_columns)
            Input data; it is converted to a float array and not modified.

        Returns
        -------
        X_missing : np.ndarray
            Float copy of ``X`` with ``np.nan`` injected.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        ValueError
            If ``q`` is outside ``[0, 1]`` or an ordinal column has an empty
            ordinal map.
        """
        if not self.fitted:
            raise RuntimeError("Call .fit() before .transform().")
        if not 0 <= self.q <= 1:
            raise ValueError(f"q must be in the range [0, 1], got {self.q!r}.")

        # A private generator gives the same draws as seeding the module-level
        # one, without clobbering the caller's global random state.
        rng = random.Random(self.seed)

        X = np.asarray(X, dtype=float)
        X_missing = X.copy()
        n = X.shape[0]
        if n == 0:
            return X_missing

        percentile = self.q * 100
        num_to_remove = int(n * self.q)

        for col, col_type in self.col_info.items():
            col_idx = int(col)
            column = X[:, col_idx]

            if "numerical" in col_type:
                if np.isnan(column).all():
                    continue  # nothing observed, so nothing to mask
                # nanpercentile: entries that are already missing must not
                # turn the threshold into NaN and disable the masking.
                threshold = np.nanpercentile(column, percentile)
                X_missing[column < threshold, col_idx] = np.nan

            elif "ordinal" in col_type:
                ordinal_map = col_type.get("ordinal", {})
                if not ordinal_map:
                    raise ValueError(
                        f"Column {col!r} is declared ordinal but its "
                        "ordinal map is empty."
                    )
                max_val = max(ordinal_map.values())
                # Target the top ranks: everything within 2 of the maximum.
                targeted = np.where(column >= (max_val - 2))[0].tolist()
                remove_indices = self._sample_indices(
                    targeted, n, num_to_remove, rng
                )
                X_missing[remove_indices, col_idx] = np.nan

            elif "nominal" in col_type:
                # NaN never equals itself, so every missing entry would show
                # up as its own "category" and could be chosen; drop them.
                categories = [v for v in set(column) if not np.isnan(v)]
                if not categories:
                    continue
                chosen_val = rng.choice(categories)
                targeted = np.where(column == chosen_val)[0].tolist()
                remove_indices = self._sample_indices(
                    targeted, n, num_to_remove, rng
                )
                X_missing[remove_indices, col_idx] = np.nan

        return X_missing

    def _sample_indices(self, primary_indices, total, k, rng=None):
        """
        Pick ``k`` row indices, preferring ``primary_indices``.

        Parameters
        ----------
        primary_indices : list of int
            Preferred row indices.
        total : int
            Number of rows; fallback indices come from ``range(total)``.
        k : int
            Number of indices to return.
        rng : random.Random, optional
            Generator to draw from; the module-level ``random`` functions
            are used when omitted.

        Returns
        -------
        list of int
            ``k`` indices sampled from ``primary_indices`` when it holds at
            least ``k`` entries; otherwise all of ``primary_indices``
            followed by randomly drawn indices from the remaining rows.
        """
        rng = random if rng is None else rng
        if len(primary_indices) >= k:
            return rng.sample(primary_indices, k)
        # The fallback pool is only built when it is actually needed.
        secondary = list(set(range(total)) - set(primary_indices))
        fill = rng.sample(secondary, k - len(primary_indices))
        return primary_indices + fill
# Registry of MNAR categorical mechanisms, keyed by integer type id.
# Only id 1 is registered; no other variants are defined in this module.
MNARCAT_TYPES = {1: MNARCatType1}