# Source code for missmecha.generate.mnarcat

import numpy as np
import random


class MNARCatType1:
    """
    MNAR mechanism for categorical and ordinal features (column-wise variant).

    Missingness is injected column by column, according to the type recorded
    for each column in ``col_info``:

    - numerical: values strictly below the ``q``-th quantile are masked.
    - ordinal: rows whose value lies within 2 of the top rank are targeted.
    - nominal: rows belonging to one randomly chosen category are targeted.

    For ordinal and nominal columns exactly ``int(n_rows * q)`` rows are
    masked. When the targeted group is larger than that, a random subset of
    it is masked; when it is smaller, the whole group is masked and the
    remainder is drawn at random from the other rows.

    Parameters
    ----------
    q : float, default=0.2
        Quantile (numerical columns) or proportion of rows (ordinal and
        nominal columns) to mask. Must lie in ``[0, 1]``.
    seed : int, default=1
        Random seed for reproducibility.
    """

    def __init__(self, q=0.2, seed=1):
        self.q = q
        self.seed = seed
        self.fitted = False

    def fit(self, X, col_info):
        """
        Record the column types (no statistics are learned from ``X``).

        Parameters
        ----------
        X : np.ndarray
            Input data array. Unused; accepted for API compatibility.
        col_info : dict
            Maps a column index to its type. The type is either a string
            or a dict and is recognised by containing ``"numerical"``,
            ``"ordinal"`` or ``"nominal"``. An ordinal column may supply
            ``{"ordinal": {label: rank, ...}}`` to declare its rank scale.

        Returns
        -------
        self : MNARCatType1
            Returns self.
        """
        self.col_info = col_info
        self.fitted = True
        return self

    def transform(self, X):
        """
        Apply MNAR masking to categorical/ordinal/numerical columns.

        Parameters
        ----------
        X : array-like of shape (n_rows, n_columns)
            Input data to transform. It is converted to ``float`` and is
            not modified.

        Returns
        -------
        X_missing : np.ndarray
            Float copy of ``X`` with ``np.nan`` injected.

        Raises
        ------
        RuntimeError
            If called before :meth:`fit`.
        ValueError
            If ``q`` is outside ``[0, 1]``.
        """
        if not self.fitted:
            raise RuntimeError("Call .fit() before .transform().")
        if not 0 <= self.q <= 1:
            raise ValueError(f"q must be in [0, 1], got {self.q!r}.")

        # A private generator yields the same draws as seeding the global
        # ``random`` module would, without clobbering the caller's RNG state.
        rng = random.Random(self.seed)

        X = np.asarray(X, dtype=float)
        X_missing = X.copy()
        n = X.shape[0]
        percentile = self.q * 100
        num_to_remove = int(n * self.q)

        for col, col_type in self.col_info.items():
            col_idx = int(col)
            column = X[:, col_idx]

            if "numerical" in col_type:
                # nanpercentile ignores values that are already missing;
                # plain percentile would return NaN and mask nothing.
                threshold = np.nanpercentile(column, percentile)
                X_missing[:, col_idx] = np.where(
                    column < threshold, np.nan, column
                )

            elif "ordinal" in col_type:
                ordinal_map = (
                    col_type.get("ordinal") if hasattr(col_type, "get") else None
                )
                if ordinal_map:
                    max_val = max(ordinal_map.values())
                else:
                    # No rank scale supplied: use the largest observed value.
                    max_val = np.nanmax(column)
                # Target the top ranks: everything within 2 of the maximum.
                top_indices = np.where(column >= (max_val - 2))[0].tolist()
                remove_indices = self._sample_indices(
                    top_indices, n, num_to_remove, rng
                )
                if remove_indices:
                    X_missing[remove_indices, col_idx] = np.nan

            elif "nominal" in col_type:
                # NaN never compares equal to itself, so every pre-existing
                # NaN would otherwise count as its own "category".
                unique_vals = [v for v in set(column) if not np.isnan(v)]
                if unique_vals:
                    chosen_val = rng.choice(unique_vals)
                    chosen_indices = np.where(column == chosen_val)[0].tolist()
                else:
                    chosen_indices = []
                remove_indices = self._sample_indices(
                    chosen_indices, n, num_to_remove, rng
                )
                if remove_indices:
                    X_missing[remove_indices, col_idx] = np.nan

        return X_missing

    def _sample_indices(self, primary_indices, total, k, rng=None):
        """
        Pick ``k`` row indices, preferring ``primary_indices``.

        Parameters
        ----------
        primary_indices : sequence of int
            Preferred row indices.
        total : int
            Number of rows; fallback indices are drawn from ``range(total)``.
        k : int
            Number of indices to return.
        rng : random.Random, optional
            Generator used for sampling. Defaults to the global ``random``
            module.

        Returns
        -------
        list of int
            A random ``k``-subset of ``primary_indices`` when it holds at
            least ``k`` entries; otherwise all of ``primary_indices``
            followed by randomly drawn indices from the remaining rows.
        """
        rng = random if rng is None else rng
        primary_indices = list(primary_indices)
        if len(primary_indices) >= k:
            return rng.sample(primary_indices, k)
        # Build the fallback pool only when it is actually needed.
        secondary = list(set(range(total)) - set(primary_indices))
        fill = rng.sample(secondary, k - len(primary_indices))
        return primary_indices + fill



# Registry mapping an integer variant id to its MNAR categorical mechanism
# class. Only variant 1 is registered here.
MNARCAT_TYPES = {1: MNARCatType1}