# Source code for copul.data_uniformer
import numpy as np
from scipy import stats
# [docs]
class DataUniformer:
    """Transform data to uniform margins using the empirical CDF.

    Every column of the input is replaced by the ranks of its values
    (tied values share their average rank), and the ranks are then
    rescaled to the unit interval.
    """

    def __init__(self):
        """Create a transformer; no state is stored on the instance."""

    def uniform(self, data, touch_boundaries=False):
        """Transform data to uniform margins (ranks scaled to [0,1]).

        Parameters
        ----------
        data : numpy.ndarray or list
            Either a 1D sequence of samples or an array of shape
            (n_samples, n_features). The input is converted to float64.
        touch_boundaries : bool, optional
            If False (default), ranks are divided by ``n_samples + 1`` so
            the transformed values lie strictly in (0,1).
            If True, ranks are mapped linearly so that rank 1 becomes 0.0
            and rank ``n_samples`` becomes 1.0. Because tied values share
            an average rank, 0.0 (or 1.0) is only reached when the column
            minimum (or maximum) is unique.

        Returns
        -------
        numpy.ndarray
            Transformed data with values in [0,1], same shape as the
            converted input.

        Raises
        ------
        ValueError
            If ``data`` is neither 1D nor 2D.
        """
        data = np.asarray(data, dtype=np.float64)

        # A 1D input is treated as a single column.
        if data.ndim == 1:
            return self._transform_column(data, touch_boundaries)

        # Reject scalars and >2D arrays explicitly instead of failing later
        # with an opaque tuple-unpacking error.
        if data.ndim != 2:
            raise ValueError(
                "data must be 1D or 2D (n_samples, n_features), "
                "got an array with {} dimension(s)".format(data.ndim)
            )

        transformed_data = np.empty_like(data)
        # Each column is ranked independently of the others.
        for j in range(data.shape[1]):
            transformed_data[:, j] = self._transform_column(
                data[:, j], touch_boundaries
            )
        return transformed_data

    def _transform_column(self, column, touch_boundaries=False):
        """Transform a single column to uniform margins.

        Parameters
        ----------
        column : numpy.ndarray (1D)
            Column to transform.
        touch_boundaries : bool
            See the ``touch_boundaries`` parameter of `uniform`.

        Returns
        -------
        numpy.ndarray
            Transformed column with values in [0,1]. A single-sample
            column yields ``[0.0]`` when ``touch_boundaries`` is True and
            ``[0.5]`` otherwise.
        """
        n_samples = len(column)

        # A lone sample has no meaningful rank; it is also special-cased so
        # the boundary-touching formula below never divides by zero.
        if n_samples == 1:
            lone_value = 0.0 if touch_boundaries else 0.5
            return np.array([lone_value], dtype=np.float64)

        # 'average' gives tied values the mean of the ranks they span.
        ranks = stats.rankdata(column, method="average")

        if touch_boundaries:
            # Rank 1 -> 0.0 and rank n_samples -> 1.0.
            return (ranks - 1.0) / (n_samples - 1.0)

        # Ranks 1..n map to 1/(n+1) .. n/(n+1), strictly inside (0,1).
        return ranks / (n_samples + 1.0)