Source code for EventStream.data.preprocessing.standard_scaler
"""Pre-processor that normalizes data to have zero mean and unit variance."""
import polars as pl
from .preprocessor import Preprocessor
[docs]
class StandardScaler(Preprocessor):
    """Center-and-scale pre-processor built entirely from Polars expressions.

    Concrete `Preprocessor` that standardizes a column: the fitted mean is subtracted and the result is
    divided by the fitted standard deviation. Both fitting and prediction are returned as `pl.Expr`
    objects rather than computed eagerly, so they can be used in both a select and a groupby aggregation
    context.

    Note that prediction is a plain subtraction followed by a division; no special handling is applied
    here for a fitted standard deviation that is zero or null.

    Examples:
        >>> import polars as pl
        >>> S = StandardScaler()
        >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5]})
        >>> params = S.fit_from_polars(pl.col("a")).alias("params")
        >>> df.select(params)["params"].to_list()
        [{'mean_': 3.0, 'std_': 1.5811388300841898}]
        >>> norm = S.predict_from_polars(pl.col("a"), params).alias("a_norm")
        >>> df.select(norm)["a_norm"].to_list()
        [-1.2649110640673518, -0.6324555320336759, 0.0, 0.6324555320336759, 1.2649110640673518]
    """

    @classmethod
    def params_schema(cls) -> dict[str, pl.DataType]:
        r"""Describe the fields of the fitted-parameter struct.

        Returns:
            A mapping with the keys "mean\_" and "std\_", in that order, each typed as `pl.Float64`.
        """
        # Both fitted statistics share one dtype, so build the mapping from the field names.
        return {field_name: pl.Float64 for field_name in ("mean_", "std_")}

    def fit_from_polars(self, column: pl.Expr) -> pl.Expr:
        r"""Build the expression that fits the scaler's parameters on `column`.

        Arguments:
            column: The Polars expression for the column containing the raw data to be pre-processed.

        Returns:
            pl.Expr: A struct expression with two fields: "mean\_", holding `column.mean()`, and "std\_",
            holding `column.std()`.
        """
        mean_expr = column.mean().alias("mean_")
        std_expr = column.std().alias("std_")
        return pl.struct([mean_expr, std_expr])

    @classmethod
    def predict_from_polars(cls, column: pl.Expr, model_column: pl.Expr) -> pl.Expr:
        r"""Build the expression that standardizes `column` with previously fitted parameters.

        Arguments:
            column: The Polars expression for the column containing the raw data to be centered and scaled.
            model_column: The Polars expression for a struct column containing "mean\_" and "std\_" fields.

        Returns:
            pl.Expr: `(column - model_column.struct.field("mean_")) / model_column.struct.field("std_")`
        """
        fitted_mean = model_column.struct.field("mean_")
        fitted_std = model_column.struct.field("std_")
        centered = column - fitted_mean
        return centered / fitted_std