Эффективный метод добавления производных данных в Polars multiIndex LazyFrame

Эффективный метод добавления производных данных в Polars multiIndex LazyFrame ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Эффективный метод добавления производных данных в Polars multiIndex LazyFrame

Цитата

Сообщение Anonymous » 02 ноя 2025, 12:45

Я работаю с Polars, и мне нужно добавить производные данные в мультииндексный LazyFrame.
Чтобы изменить данные, я делаю pivot (разворот) DataFrame без каких-либо агрегаций, выполняю вычисления, а затем с помощью unpivot возвращаю DataFrame в исходный формат. Эту операцию нужно выполнять на разных уровнях индекса. В документации Polars сказано, что операция pivot недоступна в ленивом (lazy) режиме. Из-за pivot/unpivot приходится переключаться между жадным (eager) и ленивым (lazy) режимами. Есть ли более эффективный способ добиться этого без переключения между eager- и lazy-режимами?
Вот пример:
import numpy as np
import polars as pl

def get_raw_data() -> pl.LazyFrame:
    """Build a small, deterministic long-format example LazyFrame.

    The frame has the columns ``id``, ``measure``, ``date`` and ``value``.
    Ids ``A``, ``B`` and ``C`` carry 3, 3 and 2 dates respectively, each for
    the measures ``height`` and ``width`` (16 rows in total). Values come
    from a generator seeded with 111 and are rounded to two decimals.

    Returns:
        The example data as a LazyFrame.
    """
    names = np.array(['A', 'B', 'C'])
    measures = np.array(['height', 'width'])
    repeats: np.ndarray = np.array([3, 3, 2])
    n_measures = measures.size

    # Every id spans all of its dates once per measure.
    id_col = np.repeat(names, repeats * n_measures)
    # Within an id, each measure is repeated once per date.
    measure_col = np.concatenate([np.repeat(measures, count) for count in repeats])
    # Dates restart at 0 for every (id, measure) pair.
    date_col = np.concatenate(
        [np.arange(length) for length in np.repeat(repeats, n_measures)]
    )
    value_col = np.round(
        np.random.default_rng(111).random(n_measures * repeats.sum()), 2
    )

    frame = pl.DataFrame({
        'id': id_col,
        'measure': measure_col,
        'date': date_col,
        'value': value_col,
    })
    return frame.lazy()

# Materialize the example LazyFrame and print it; the expected output follows.
print(get_raw_data().collect())
# shape: (16, 4)
# ┌─────┬─────────┬──────┬───────┐
# │ id ┆ measure ┆ date ┆ value │
# │ --- ┆ --- ┆ --- ┆ --- │
# │ str ┆ str ┆ i64 ┆ f64 │
# ╞═════╪═════════╪══════╪═══════╡
# │ A ┆ height ┆ 0 ┆ 0.15 │
# │ A ┆ height ┆ 1 ┆ 0.17 │
# │ A ┆ height ┆ 2 ┆ 0.51 │
# │ A ┆ width ┆ 0 ┆ 0.66 │
# │ A ┆ width ┆ 1 ┆ 0.77 │
# │ … ┆ … ┆ … ┆ … │
# │ B ┆ width ┆ 2 ┆ 0.72 │
# │ C ┆ height ┆ 0 ┆ 0.08 │
# │ C ┆ height ┆ 1 ┆ 0.42 │
# │ C ┆ width ┆ 0 ┆ 0.4 │
# │ C ┆ width ┆ 1 ┆ 0.94 │
# └─────┴─────────┴──────┴───────┘

def expr_add_categories() -> list[pl.Expr]:
    """Return the expressions that derive new measure columns.

    The expressions refer to the columns ``height`` and ``width``, i.e. they
    are meant for a frame in which every measure is its own column.

    Returns:
        A list of expressions (not a single ``pl.Expr``): ``ratio`` is
        ``height / width``.
    """
    return [(pl.col('height') / pl.col('width')).alias('ratio')]

def expr_add_ids() -> list[pl.Expr]:
    """Return the expressions that derive new id columns.

    The expressions refer to the columns ``A``, ``B`` and ``C``, i.e. they
    are meant for a frame in which every id is its own column.

    Returns:
        A list of expressions (not a single ``pl.Expr``): ``AB`` is ``A / B``
        and ``AC`` is ``A / C``.
    """
    return [
        (pl.col('A') / pl.col('B')).alias('AB'),
        (pl.col('A') / pl.col('C')).alias('AC'),
    ]

def add_categories(df: pl.LazyFrame) -> pl.LazyFrame:
    """Add derived measures to a long-format LazyFrame.

    The frame is pivoted so that every measure becomes a column, the
    expressions from ``expr_add_categories`` are evaluated, and the result is
    unpivoted back to the ``id`` / ``measure`` / ``date`` / ``value`` layout.

    Args:
        df: Frame with the columns ``id``, ``measure``, ``date`` and ``value``.

    Returns:
        A LazyFrame in the same layout that also contains the derived
        measures, sorted by ``id``, ``measure``, ``date``. Rows containing a
        null are dropped.
    """
    return (
        df
        .collect()  # pivot requires eager mode
        .pivot(on='measure', index=['id', 'date'], values='value')
        .lazy()     # back to lazy mode
        .with_columns(expr_add_categories())
        .unpivot(index=['id', 'date'], variable_name='measure')
        .drop_nulls()
        .select('id', 'measure', 'date', 'value')
        .sort('id', 'measure', 'date')
        # Only the leading sort key is globally ordered after a multi-column
        # sort; 'measure' and 'date' restart within every id, so flagging
        # them as sorted would be false and could yield incorrect results.
        .set_sorted('id')
    )

def add_ids(df: pl.LazyFrame) -> pl.LazyFrame:
    """Add derived ids to a long-format LazyFrame.

    The frame is pivoted so that every id becomes a column, the expressions
    from ``expr_add_ids`` are evaluated, and the result is unpivoted back to
    the ``id`` / ``measure`` / ``date`` / ``value`` layout.

    Args:
        df: Frame with the columns ``id``, ``measure``, ``date`` and ``value``.

    Returns:
        A LazyFrame in the same layout that also contains the derived ids,
        sorted by ``id``, ``measure``, ``date``. Rows containing a null are
        dropped.
    """
    return (
        df
        .collect()  # pivot requires eager mode
        .pivot(on='id', index=['measure', 'date'], values='value')
        .lazy()     # back to lazy mode
        .with_columns(expr_add_ids())
        .unpivot(index=['measure', 'date'], variable_name='id')
        .drop_nulls()
        .select('id', 'measure', 'date', 'value')
        .sort('id', 'measure', 'date')
        # Only the leading sort key is globally ordered after a multi-column
        # sort; 'measure' and 'date' restart within every id, so flagging
        # them as sorted would be false and could yield incorrect results.
        .set_sorted('id')
    )

def get_modified_data() -> pl.LazyFrame:
    """Return the example data extended with derived measures and ids.

    The raw data is passed through ``add_categories`` first and through
    ``add_ids`` second.
    """
    raw = get_raw_data()
    with_categories = add_categories(raw)
    return add_ids(with_categories)

# Run the whole pipeline and print the result; the expected output follows.
print(get_modified_data().collect())
# shape: (39, 4)
# ┌─────┬─────────┬──────┬──────────┐
# │ id ┆ measure ┆ date ┆ value │
# │ --- ┆ --- ┆ --- ┆ --- │
# │ str ┆ str ┆ i64 ┆ f64 │
# ╞═════╪═════════╪══════╪══════════╡
# │ A ┆ height ┆ 0 ┆ 0.15 │
# │ A ┆ height ┆ 1 ┆ 0.17 │
# │ A ┆ height ┆ 2 ┆ 0.51 │
# │ A ┆ ratio ┆ 0 ┆ 0.227273 │
# │ A ┆ ratio ┆ 1 ┆ 0.220779 │
# │ … ┆ … ┆ … ┆ … │
# │ C ┆ height ┆ 1 ┆ 0.42 │
# │ C ┆ ratio ┆ 0 ┆ 0.2 │
# │ C ┆ ratio ┆ 1 ┆ 0.446809 │
# │ C ┆ width ┆ 0 ┆ 0.4 │
# │ C ┆ width ┆ 1 ┆ 0.94 │
# └─────┴─────────┴──────┴──────────┘

# *************************************************************
# Python: 3.12.0
# Numpy: 1.26.4
# Polars: 0.20.31
# *************************************************************

Правка:
Предположим, что содержимое DataFrame динамическое.
Уникальные значения на любом из уровней индекса заранее неизвестны.
Правка:
Вот пример «lazypivot» в виде пользовательской функции (UDF). Я не знаю, как передать функцию агрегирования, не прибегая к лямбда-функции.
Буду рад более удачным предложениям.
import warnings
from typing import Callable
import polars as pl

def lazypivot(df: pl.LazyFrame,
              on: str | list[str] | None,
              index: str | list[str] | None,
              values: str | list[str] | None,
              column_values: str | list[str] | None = None,
              aggregate_function: Callable | None = None,
              maintain_order: bool = True,
              sort_columns: bool = True,
              ) -> pl.LazyFrame:
    """Pivot a LazyFrame with or without aggregation.

    Every entry of ``column_values`` becomes one output column that holds the
    ``values`` of the rows where ``on`` equals that entry, aggregated per
    ``index`` group.

    Args:
        df: Frame in long format.
        on: Column whose values become the new column names.
            NOTE(review): the annotation allows a list, but ``pl.col(on)`` is
            compared against scalar values, so a single column name appears
            to be expected -- confirm before passing a list.
        index: Column(s) to group by; the result has one row per group.
        values: Column that provides the cell values.
        column_values: Values of ``on`` to turn into columns. A single string
            is treated as one column name. When omitted, the frame is
            collected to discover the values and a ``UserWarning`` is issued.
        aggregate_function: Callable that takes a ``pl.Expr`` and returns the
            aggregated ``pl.Expr``. An unbound expression method such as
            ``pl.Expr.sum`` works, so no lambda is required. Defaults to
            taking the first value.
        maintain_order: Forwarded to ``group_by``.
        sort_columns: Only used when ``column_values`` is omitted: sort the
            discovered values; otherwise keep their order of first appearance.

    Returns:
        The pivoted LazyFrame.
    """
    if column_values is None:
        # Collect unique column values if not provided
        warnings.warn(
            'No column_values provided. Switching between eager and lazy mode necessary to collect unique column values.',
            UserWarning
        )
        collected_df = df.collect()
        # Keep first-appearance order when not sorting so that the resulting
        # column order is deterministic.
        discovered = collected_df[on].unique(maintain_order=not sort_columns)
        column_values = discovered.sort() if sort_columns else discovered
        df = collected_df.lazy()
    elif isinstance(column_values, str):
        # A bare string would otherwise be iterated character by character.
        column_values = [column_values]

    # Without an explicit aggregation, keep the first matching value.
    if aggregate_function is None:
        aggregate_function = pl.Expr.first

    agg_expr = [
        aggregate_function(pl.col(values).filter(pl.col(on) == value)).alias(value)
        for value in column_values
    ]

    # Perform the pivot
    return df.group_by(index, maintain_order=maintain_order).agg(agg_expr)

# Demo input in long format; note that idx "B" has no "z" row.
df = pl.DataFrame(
{
"idx": ["A", "A", "A", "B", "B"],
"cat": ["x", "y", "z", "x", "y"],
"val": [1, 2, 3, 4, 5],
}
)
print(df)

# Column values supplied up front: lazypivot itself does not collect the frame.
print('pivot with column_values:')
df_new = df.lazy().pipe(lazypivot, on="cat", index="idx", values="val", column_values=['x', 'y', 'z']).collect()
print(df_new)

# Column values omitted: lazypivot collects the frame to discover them and
# issues a UserWarning.
print('pivot without column_values:')
df_new = df.lazy().pipe(lazypivot, on="cat", index="idx", values="val").collect()
print(df_new)

# shape: (5, 3)
# ┌─────┬─────┬─────┐
# │ idx ┆ cat ┆ val │
# │ --- ┆ --- ┆ --- │
# │ str ┆ str ┆ i64 │
# ╞═════╪═════╪═════╡
# │ A ┆ x ┆ 1 │
# │ A ┆ y ┆ 2 │
# │ A ┆ z ┆ 3 │
# │ B ┆ x ┆ 4 │
# │ B ┆ y ┆ 5 │
# └─────┴─────┴─────┘

# pivot with column_values:
# shape: (2, 4)
# ┌─────┬─────┬─────┬──────┐
# │ idx ┆ x ┆ y ┆ z │
# │ --- ┆ --- ┆ --- ┆ --- │
# │ str ┆ i64 ┆ i64 ┆ i64 │
# ╞═════╪═════╪═════╪══════╡
# │ A ┆ 1 ┆ 2 ┆ 3 │
# │ B ┆ 4 ┆ 5 ┆ null │
# └─────┴─────┴─────┴──────┘

# pivot without column_values:
# shape: (2, 4)
# ┌─────┬─────┬─────┬──────┐
# │ idx ┆ x ┆ y ┆ z │
# │ --- ┆ --- ┆ --- ┆ --- │
# │ str ┆ i64 ┆ i64 ┆ i64 │
# ╞═════╪═════╪═════╪══════╡
# │ A ┆ 1 ┆ 2 ┆ 3 │
# │ B ┆ 4 ┆ 5 ┆ null │
# └─────┴─────┴─────┴──────┘
# UserWarning: No column_values provided. Switching between eager and lazy mode necessary to collect unique column values.

Подробнее здесь: https://stackoverflow.com/questions/78688633/efficient-method-for-adding-derived-data-to-a-polars-multiindex-lazyframe

1762076756

Anonymous

Я работаю с Polars, и мне нужно добавить производные данные в мультииндексный LazyFrame.
Чтобы изменить данные, я делаю pivot (разворот) DataFrame без каких-либо агрегаций, выполняю вычисления, а затем с помощью unpivot возвращаю DataFrame в исходный формат. Эту операцию нужно выполнять на разных уровнях индекса. В документации Polars сказано, что операция pivot недоступна в ленивом (lazy) режиме. Из-за pivot/unpivot приходится переключаться между жадным (eager) и ленивым (lazy) режимами. Есть ли более эффективный способ добиться этого без переключения между eager- и lazy-режимами?
Вот пример:
import numpy as np
import polars as pl

def get_raw_data() -> pl.LazyFrame:
    """Build a small, deterministic long-format example LazyFrame.

    The frame has the columns ``id``, ``measure``, ``date`` and ``value``.
    Ids ``A``, ``B`` and ``C`` carry 3, 3 and 2 dates respectively, each for
    the measures ``height`` and ``width`` (16 rows in total). Values come
    from a generator seeded with 111 and are rounded to two decimals.

    Returns:
        The example data as a LazyFrame.
    """
    names = np.array(['A', 'B', 'C'])
    measures = np.array(['height', 'width'])
    repeats: np.ndarray = np.array([3, 3, 2])
    n_measures = measures.size

    # Every id spans all of its dates once per measure.
    id_col = np.repeat(names, repeats * n_measures)
    # Within an id, each measure is repeated once per date.
    measure_col = np.concatenate([np.repeat(measures, count) for count in repeats])
    # Dates restart at 0 for every (id, measure) pair.
    date_col = np.concatenate(
        [np.arange(length) for length in np.repeat(repeats, n_measures)]
    )
    value_col = np.round(
        np.random.default_rng(111).random(n_measures * repeats.sum()), 2
    )

    frame = pl.DataFrame({
        'id': id_col,
        'measure': measure_col,
        'date': date_col,
        'value': value_col,
    })
    return frame.lazy()

# Materialize the example LazyFrame and print it; the expected output follows.
print(get_raw_data().collect())
# shape: (16, 4)
# ┌─────┬─────────┬──────┬───────┐
# │ id  ┆ measure ┆ date ┆ value │
# │ --- ┆ ---     ┆ ---  ┆ ---   │
# │ str ┆ str     ┆ i64  ┆ f64   │
# ╞═════╪═════════╪══════╪═══════╡
# │ A   ┆ height  ┆ 0    ┆ 0.15  │
# │ A   ┆ height  ┆ 1    ┆ 0.17  │
# │ A   ┆ height  ┆ 2    ┆ 0.51  │
# │ A   ┆ width   ┆ 0    ┆ 0.66  │
# │ A   ┆ width   ┆ 1    ┆ 0.77  │
# │ …   ┆ …       ┆ …    ┆ …     │
# │ B   ┆ width   ┆ 2    ┆ 0.72  │
# │ C   ┆ height  ┆ 0    ┆ 0.08  │
# │ C   ┆ height  ┆ 1    ┆ 0.42  │
# │ C   ┆ width   ┆ 0    ┆ 0.4   │
# │ C   ┆ width   ┆ 1    ┆ 0.94  │
# └─────┴─────────┴──────┴───────┘

def expr_add_categories() -> list[pl.Expr]:
    """Return the expressions that derive new measure columns.

    The expressions refer to the columns ``height`` and ``width``, i.e. they
    are meant for a frame in which every measure is its own column.

    Returns:
        A list of expressions (not a single ``pl.Expr``): ``ratio`` is
        ``height / width``.
    """
    return [(pl.col('height') / pl.col('width')).alias('ratio')]

def expr_add_ids() -> list[pl.Expr]:
    """Return the expressions that derive new id columns.

    The expressions refer to the columns ``A``, ``B`` and ``C``, i.e. they
    are meant for a frame in which every id is its own column.

    Returns:
        A list of expressions (not a single ``pl.Expr``): ``AB`` is ``A / B``
        and ``AC`` is ``A / C``.
    """
    return [
        (pl.col('A') / pl.col('B')).alias('AB'),
        (pl.col('A') / pl.col('C')).alias('AC'),
    ]

def add_categories(df: pl.LazyFrame) -> pl.LazyFrame:
    """Add derived measures to a long-format LazyFrame.

    The frame is pivoted so that every measure becomes a column, the
    expressions from ``expr_add_categories`` are evaluated, and the result is
    unpivoted back to the ``id`` / ``measure`` / ``date`` / ``value`` layout.

    Args:
        df: Frame with the columns ``id``, ``measure``, ``date`` and ``value``.

    Returns:
        A LazyFrame in the same layout that also contains the derived
        measures, sorted by ``id``, ``measure``, ``date``. Rows containing a
        null are dropped.
    """
    return (
        df
        .collect()  # pivot requires eager mode
        .pivot(on='measure', index=['id', 'date'], values='value')
        .lazy()     # back to lazy mode
        .with_columns(expr_add_categories())
        .unpivot(index=['id', 'date'], variable_name='measure')
        .drop_nulls()
        .select('id', 'measure', 'date', 'value')
        .sort('id', 'measure', 'date')
        # Only the leading sort key is globally ordered after a multi-column
        # sort; 'measure' and 'date' restart within every id, so flagging
        # them as sorted would be false and could yield incorrect results.
        .set_sorted('id')
    )

def add_ids(df: pl.LazyFrame) -> pl.LazyFrame:
    """Add derived ids to a long-format LazyFrame.

    The frame is pivoted so that every id becomes a column, the expressions
    from ``expr_add_ids`` are evaluated, and the result is unpivoted back to
    the ``id`` / ``measure`` / ``date`` / ``value`` layout.

    Args:
        df: Frame with the columns ``id``, ``measure``, ``date`` and ``value``.

    Returns:
        A LazyFrame in the same layout that also contains the derived ids,
        sorted by ``id``, ``measure``, ``date``. Rows containing a null are
        dropped.
    """
    return (
        df
        .collect()  # pivot requires eager mode
        .pivot(on='id', index=['measure', 'date'], values='value')
        .lazy()     # back to lazy mode
        .with_columns(expr_add_ids())
        .unpivot(index=['measure', 'date'], variable_name='id')
        .drop_nulls()
        .select('id', 'measure', 'date', 'value')
        .sort('id', 'measure', 'date')
        # Only the leading sort key is globally ordered after a multi-column
        # sort; 'measure' and 'date' restart within every id, so flagging
        # them as sorted would be false and could yield incorrect results.
        .set_sorted('id')
    )

def get_modified_data() -> pl.LazyFrame:
    """Return the example data extended with derived measures and ids.

    The raw data is passed through ``add_categories`` first and through
    ``add_ids`` second.
    """
    raw = get_raw_data()
    with_categories = add_categories(raw)
    return add_ids(with_categories)

# Run the whole pipeline and print the result; the expected output follows.
print(get_modified_data().collect())
# shape: (39, 4)
# ┌─────┬─────────┬──────┬──────────┐
# │ id  ┆ measure ┆ date ┆ value    │
# │ --- ┆ ---     ┆ ---  ┆ ---      │
# │ str ┆ str     ┆ i64  ┆ f64      │
# ╞═════╪═════════╪══════╪══════════╡
# │ A   ┆ height  ┆ 0    ┆ 0.15     │
# │ A   ┆ height  ┆ 1    ┆ 0.17     │
# │ A   ┆ height  ┆ 2    ┆ 0.51     │
# │ A   ┆ ratio   ┆ 0    ┆ 0.227273 │
# │ A   ┆ ratio   ┆ 1    ┆ 0.220779 │
# │ …   ┆ …       ┆ …    ┆ …        │
# │ C   ┆ height  ┆ 1    ┆ 0.42     │
# │ C   ┆ ratio   ┆ 0    ┆ 0.2      │
# │ C   ┆ ratio   ┆ 1    ┆ 0.446809 │
# │ C   ┆ width   ┆ 0    ┆ 0.4      │
# │ C   ┆ width   ┆ 1    ┆ 0.94     │
# └─────┴─────────┴──────┴──────────┘

# *************************************************************
# Python: 3.12.0
# Numpy: 1.26.4
# Polars: 0.20.31
# *************************************************************

Правка:
Предположим, что содержимое DataFrame динамическое.
Уникальные значения на любом из уровней индекса заранее неизвестны.
Правка:
Вот пример «lazypivot» в виде пользовательской функции (UDF). Я не знаю, как передать функцию агрегирования, не прибегая к лямбда-функции.
Буду рад более удачным предложениям.
import warnings
from typing import Callable
import polars as pl

def lazypivot(df: pl.LazyFrame,
              on: str | list[str] | None,
              index: str | list[str] | None,
              values: str | list[str] | None,
              column_values: str | list[str] | None = None,
              aggregate_function: Callable | None = None,
              maintain_order: bool = True,
              sort_columns: bool = True,
              ) -> pl.LazyFrame:
    """Pivot a LazyFrame with or without aggregation.

    Every entry of ``column_values`` becomes one output column that holds the
    ``values`` of the rows where ``on`` equals that entry, aggregated per
    ``index`` group.

    Args:
        df: Frame in long format.
        on: Column whose values become the new column names.
            NOTE(review): the annotation allows a list, but ``pl.col(on)`` is
            compared against scalar values, so a single column name appears
            to be expected -- confirm before passing a list.
        index: Column(s) to group by; the result has one row per group.
        values: Column that provides the cell values.
        column_values: Values of ``on`` to turn into columns. A single string
            is treated as one column name. When omitted, the frame is
            collected to discover the values and a ``UserWarning`` is issued.
        aggregate_function: Callable that takes a ``pl.Expr`` and returns the
            aggregated ``pl.Expr``. An unbound expression method such as
            ``pl.Expr.sum`` works, so no lambda is required. Defaults to
            taking the first value.
        maintain_order: Forwarded to ``group_by``.
        sort_columns: Only used when ``column_values`` is omitted: sort the
            discovered values; otherwise keep their order of first appearance.

    Returns:
        The pivoted LazyFrame.
    """
    if column_values is None:
        # Collect unique column values if not provided
        warnings.warn(
            'No column_values provided.  Switching between eager and lazy mode necessary to collect unique column values.',
            UserWarning
        )
        collected_df = df.collect()
        # Keep first-appearance order when not sorting so that the resulting
        # column order is deterministic.
        discovered = collected_df[on].unique(maintain_order=not sort_columns)
        column_values = discovered.sort() if sort_columns else discovered
        df = collected_df.lazy()
    elif isinstance(column_values, str):
        # A bare string would otherwise be iterated character by character.
        column_values = [column_values]

    # Without an explicit aggregation, keep the first matching value.
    if aggregate_function is None:
        aggregate_function = pl.Expr.first

    agg_expr = [
        aggregate_function(pl.col(values).filter(pl.col(on) == value)).alias(value)
        for value in column_values
    ]

    # Perform the pivot
    return df.group_by(index, maintain_order=maintain_order).agg(agg_expr)

# Demo input in long format; note that idx "B" has no "z" row.
df = pl.DataFrame(
{
"idx": ["A", "A", "A", "B", "B"],
"cat": ["x", "y", "z", "x", "y"],
"val": [1, 2, 3, 4, 5],
}
)
print(df)

# Column values supplied up front: lazypivot itself does not collect the frame.
print('pivot with column_values:')
df_new = df.lazy().pipe(lazypivot, on="cat", index="idx", values="val", column_values=['x', 'y', 'z']).collect()
print(df_new)

# Column values omitted: lazypivot collects the frame to discover them and
# issues a UserWarning.
print('pivot without column_values:')
df_new = df.lazy().pipe(lazypivot, on="cat", index="idx", values="val").collect()
print(df_new)

# shape: (5, 3)
# ┌─────┬─────┬─────┐
# │ idx ┆ cat ┆ val │
# │ --- ┆ --- ┆ --- │
# │ str ┆ str ┆ i64 │
# ╞═════╪═════╪═════╡
# │ A   ┆ x   ┆ 1   │
# │ A   ┆ y   ┆ 2   │
# │ A   ┆ z   ┆ 3   │
# │ B   ┆ x   ┆ 4   │
# │ B   ┆ y   ┆ 5   │
# └─────┴─────┴─────┘

# pivot with column_values:
# shape: (2, 4)
# ┌─────┬─────┬─────┬──────┐
# │ idx ┆ x   ┆ y   ┆ z    │
# │ --- ┆ --- ┆ --- ┆ ---  │
# │ str ┆ i64 ┆ i64 ┆ i64  │
# ╞═════╪═════╪═════╪══════╡
# │ A   ┆ 1   ┆ 2   ┆ 3    │
# │ B   ┆ 4   ┆ 5   ┆ null │
# └─────┴─────┴─────┴──────┘

# pivot without column_values:
# shape: (2, 4)
# ┌─────┬─────┬─────┬──────┐
# │ idx ┆ x   ┆ y   ┆ z    │
# │ --- ┆ --- ┆ --- ┆ ---  │
# │ str ┆ i64 ┆ i64 ┆ i64  │
# ╞═════╪═════╪═════╪══════╡
# │ A   ┆ 1   ┆ 2   ┆ 3    │
# │ B   ┆ 4   ┆ 5   ┆ null │
# └─────┴─────┴─────┴──────┘
# UserWarning: No column_values provided. Switching between eager and lazy mode necessary to collect unique column values.
 

Подробнее здесь: [url]https://stackoverflow.com/questions/78688633/efficient-method-for-adding-derived-data-to-a-polars-multiindex-lazyframe[/url]