Код: Выделить всё
import time
import polars as pl
import numpy as np
# Number of rows in the benchmark frame. (Named explicitly: a bare `l` is
# easily misread as `1` or `I`.)
n_rows = 600_000_000

# Two columns of random integers drawn from [0, 100). Column "a" is used
# further down as the partition key for the lazy sinks.
data = pl.DataFrame(
    {
        "a": np.random.randint(0, 100, n_rows),
        "b": np.random.randint(0, 100, n_rows),
    }
)
# Benchmark: eager write of the in-memory frame to a single local Parquet
# file (zstd, level 6).
# perf_counter() is used rather than time.time() because it is a monotonic,
# high-resolution clock, so the measured interval cannot be skewed by
# wall-clock adjustments during the run.
t0 = time.perf_counter()
data.write_parquet(
    "/tmp/in-memory.parquet",
    compression="zstd",
    compression_level=6,
)
t1 = time.perf_counter()
print(f"in-memory, local write: {t1 - t0:0.2f}s")
# Benchmark: eager write of the in-memory frame to a single Parquet object
# on S3 (zstd, level 6).
# perf_counter() is used rather than time.time() because it is a monotonic,
# high-resolution clock, so the measured interval cannot be skewed by
# wall-clock adjustments during the run.
t0 = time.perf_counter()
data.write_parquet(
    "s3://bucket/in-memory.parquet",
    compression="zstd",
    compression_level=6,
)
t1 = time.perf_counter()
print(f"in-memory, s3 write: {t1 - t0:0.2f}s")
# Lazy view over the same in-memory frame; reused by both sink benchmarks.
lazy = data.lazy()

# Benchmark: sink the lazy frame to local disk under /tmp/lazy, partitioned
# by column "a" (zstd, level 6, output order not preserved).
# NOTE(review): unlike the eager write_parquet benchmarks, which each write a
# single file, this writes partitioned output, so the timings are not a
# like-for-like comparison.
# perf_counter() is used rather than time.time() because it is a monotonic,
# high-resolution clock suited to measuring elapsed intervals.
t0 = time.perf_counter()
lazy.sink_parquet(
    pl.PartitionByKey(
        base_path="/tmp/lazy",
        # File name is derived from the first partition key; presumably this
        # is the value of column "a" -- verify how polars formats the key.
        file_path=lambda ctx: f"{ctx.keys[0]}.parquet",
        by="a",
        include_key=True,
    ),
    mkdir=True,
    compression="zstd",
    compression_level=6,
    maintain_order=False,
)
t1 = time.perf_counter()
print(f"lazy, local write: {t1 - t0:0.2f}s")
# Benchmark: sink the lazy frame to S3 under s3://bucket/lazy/, partitioned
# by column "a" (zstd, level 6, output order not preserved).
# NOTE(review): this writes partitioned output rather than a single object,
# so compare it against the eager S3 write with that difference in mind.
# perf_counter() is used rather than time.time() because it is a monotonic,
# high-resolution clock suited to measuring elapsed intervals.
t0 = time.perf_counter()
lazy.sink_parquet(
    pl.PartitionByKey(
        base_path="s3://bucket/lazy/",
        # File name is derived from the first partition key; presumably this
        # is the value of column "a" -- verify how polars formats the key.
        file_path=lambda ctx: f"{ctx.keys[0]}.parquet",
        by="a",
        include_key=True,
    ),
    mkdir=True,
    compression="zstd",
    compression_level=6,
    maintain_order=False,
)
t1 = time.perf_counter()
print(f"lazy, s3 write: {t1 - t0:0.2f}s")
Код: Выделить всё
in-memory, local write: 11.13s
in-memory, s3 write: 5.34s
lazy, local write: 3.73s
lazy, s3 write: 18.62s
Подробнее здесь: https://stackoverflow.com/questions/797 ... local-disk