Source code for copul.data_uniformer

import numpy as np
from scipy import stats


[docs] class DataUniformer: """Class to transform data to uniform margins using empirical CDF. Transforms multivariate data to have uniform margins on [0,1] by converting values to ranks and then scaling appropriately. """ def __init__(self): pass
[docs] def uniform(self, data, touch_boundaries=False): """Transform data to uniform margins (ranks scaled to [0,1]). Parameters ---------- data : numpy.ndarray or list Array of shape (n_samples, n_features) to be transformed touch_boundaries : bool, optional If False (default), the transformed values lie strictly in (0,1). If True, the transformed values will exactly include 0.0 and 1.0 for the min and max of each column. Returns ------- numpy.ndarray Transformed data with values in [0,1]. """ # Ensure data is a numpy array data = np.asarray(data, dtype=np.float64) # Fast path for 1D arrays if data.ndim == 1: return self._transform_column(data, touch_boundaries) # Multi-dimensional case n_samples, n_features = data.shape # Preallocate output array transformed_data = np.empty_like(data) # Serial transformation for j in range(n_features): transformed_data[:, j] = self._transform_column( data[:, j], touch_boundaries ) return transformed_data
def _transform_column(self, column, touch_boundaries=False): """Transform a single column to uniform margins. Parameters ---------- column : numpy.ndarray (1D) Column to transform touch_boundaries : bool See `uniform` docstring Returns ------- numpy.ndarray Transformed column with values in [0,1]. """ n_samples = len(column) # If there's only one sample, choose a sensible default if n_samples == 1: if touch_boundaries: # Could choose 0.0 or 1.0, but 0.0 is typical to "touch" the boundary return np.array([0.0], dtype=np.float64) else: # Typically we stay in (0,1), 0.5 is a neutral midpoint return np.array([0.5], dtype=np.float64) # Compute ranks using 'average' to handle ties gracefully ranks = stats.rankdata(column, method="average") if touch_boundaries: # Map ranks to [0,1]: # smallest rank (1) -> 0.0, largest rank (n_samples) -> 1.0 return (ranks - 1.0) / (n_samples - 1.0) else: # Original behavior: map ranks to (0,1) => (1/(n+1), ..., n/(n+1)) return ranks / (n_samples + 1.0)