Код: Выделить всё
import io

import pandas as pd
import polars as pl
import pytest
@pytest.fixture
def path() -> str:
    """Provide the relative path of the CSV file used by the CSV tests."""
    csv_location = "files/rows.csv"
    return csv_location
@pytest.fixture
def path_xlsx() -> str:
    """Provide the relative path of the Excel workbook used by the XLSX tests."""
    workbook_location = "files/bankdataset.xlsx"
    return workbook_location
def test_load_pandas(path: str):
    """Read the whole CSV file with pandas; the result is discarded."""
    pd.read_csv(filepath_or_buffer=path)
def test_load_polars(path: str):
    """Scan the CSV file lazily with Polars, then materialise it in full."""
    # Build the lazy query first, then execute it; the result is discarded.
    lazy_query = pl.scan_csv(path, low_memory=True)
    lazy_query.collect()
def test_load_pandas_xlsx(path_xlsx: str):
    """Read the Excel workbook with pandas; the result is discarded."""
    # sheet_name=None asks pandas for every sheet rather than only the first.
    pd.read_excel(io=path_xlsx, sheet_name=None)
def test_load_polars_xlsx(path_xlsx: str):
    """Read every sheet of the Excel workbook with Polars; the result is discarded.

    The pandas counterpart passes ``sheet_name=None``, which in pandas means
    "all sheets". In Polars ``sheet_name=None`` is merely the default and
    loads only the first sheet, so the two tests were not doing the same work.
    ``sheet_id=0`` is the Polars spelling for "load all sheets".
    """
    pl.read_excel(path_xlsx, sheet_id=0)
def test_load_pandas_partial(path: str):
    """Read only the first 20 CSV rows with pandas and check the row count."""
    frame = pd.read_csv(path, nrows=20)
    row_count = len(frame)
    assert row_count == 20
def test_load_polars_partial(path: str):
    """Lazily take the first 20 CSV rows with Polars and check the row count."""
    # The row limit is part of the lazy query, applied before collect().
    first_rows = pl.scan_csv(path).head(n=20)
    frame = first_rows.collect()
    assert len(frame) == 20
def test_load_pandas_partial_xlsx(path_xlsx: str):
    """Read the first 20 rows of the Excel workbook with pandas and check the count."""
    frame = pd.read_excel(path_xlsx, nrows=20)
    row_count = len(frame)
    assert row_count == 20
def test_load_polars_partial_xlsx(path_xlsx: str):
    """Read the Excel workbook with Polars, keep the first 20 rows, check the count."""
    # The sheet is read eagerly first; head() only truncates the resulting frame.
    whole_sheet = pl.read_excel(path_xlsx)
    frame = whole_sheet.head(n=20)
    assert len(frame) == 20
def test_load_polars_partial_buffer(path: str):
    """Sink the first 20 CSV rows into an in-memory buffer, then read them back.

    The lazy query is limited to 20 rows and written as CSV into a
    ``BytesIO`` buffer; the buffer's bytes are then parsed with
    ``pl.read_csv`` and the row count is checked.

    Requires the module-level ``import io``; the file as originally posted
    used ``io.BytesIO`` without importing ``io``, which raises ``NameError``.
    """
    with io.BytesIO() as buffer:
        pl.scan_csv(path).limit(20).sink_csv(buffer)
        # getvalue() must be called while the buffer is still open.
        df = pl.read_csv(buffer.getvalue())
        assert len(df) == 20
Вот вывод memray:
================================================================================================================ MEMRAY REPORT ================================================================================================================
Allocation results for tests/test_load.py::test_load_polars_xlsx at the high watermark
- load_sheet_eager:/home/monopoly/workspace/toys/sheetz/.venv/lib/python3.13/site-packages/fastexcel/__init__.py:424 -> 473.4MiB
- _call_with_frames_removed::488 -> 20.4KiB
- read_excel:/home/monopoly/workspace/toys/sheetz/.venv/lib/python3.13/site-packages/fastexcel/__init__.py:514 -> 13.9KiB
- _compile_bytecode::784 -> 9.6KiB
- inner:/home/linuxbrew/.linuxbrew/opt/[email protected]/lib/python3.13/typing.py:429 -> 9.0KiB
Allocation results for tests/test_load.py::test_load_polars_partial_xlsx at the high watermark
- load_sheet_eager:/home/monopoly/workspace/toys/sheetz/.venv/lib/python3.13/site-packages/fastexcel/__init__.py:424 -> 473.4MiB
- read_excel:/home/monopoly/workspace/toys/sheetz/.venv/lib/python3.13/site-packages/fastexcel/__init__.py:514 -> 13.9KiB
- _read_spreadsheet:/home/monopoly/workspace/toys/sheetz/.venv/lib/python3.13/site-packages/polars/io/spreadsheet/functions.py:684 -> 536.0B
Allocation results for tests/test_load.py::test_load_pandas_xlsx at the high watermark
- feed:/home/linuxbrew/.linuxbrew/opt/[email protected]/lib/python3.13/xml/etree/ElementTree.py:1291 -> 245.1MiB
- parse_cell:/home/monopoly/workspace/toys/sheetz/.venv/lib/python3.13/site-packages/openpyxl/worksheet/_reader.py:244 -> 63.0MiB
- maybe_infer_to_datetimelike:/home/monopoly/workspace/toys/sheetz/.venv/lib/python3.13/site-packages/pandas/core/dtypes/cast.py:1198 -> 39.3MiB
- _rows_to_cols:/home/monopoly/workspace/toys/sheetz/.venv/lib/python3.13/site-packages/pandas/io/parsers/python_parser.py:1066 -> 38.3MiB
- _infer_types:/home/monopoly/workspace/toys/sheetz/.venv/lib/python3.13/site-packages/pandas/io/parsers/base_parser.py:720 -> 15.3MiB
Allocation results for tests/test_load.py::test_load_polars at the high watermark
- collect:/home/monopoly/workspace/toys/sheetz/.venv/lib/python3.13/site-packages/polars/lazyframe/frame.py:2332 -> 96.0B
Allocation results for tests/test_load.py::test_load_pandas at the high watermark
- read:/home/monopoly/workspace/toys/sheetz/.venv/lib/python3.13/site-packages/pandas/io/parsers/c_parser_wrapper.py:234 -> 18.6MiB
- __init__:/home/monopoly/workspace/toys/sheetz/.venv/lib/python3.13/site-packages/pandas/io/parsers/c_parser_wrapper.py:93 -> 6.0MiB
- get_handle:/home/monopoly/workspace/toys/sheetz/.venv/lib/python3.13/site-packages/pandas/io/common.py:873 -> 4.0KiB
- _clean_options:/home/monopoly/workspace/toys/sheetz/.venv/lib/python3.13/site-packages/pandas/io/parsers/readers.py:1688 -> 1.5KiB
- read_csv:/home/monopoly/workspace/toys/sheetz/.venv/lib/python3.13/site-packages/pandas/io/parsers/readers.py:1009 -> 1.5KiB
Allocation results for tests/test_load.py::test_load_polars_partial_buffer at the high watermark
- _check_empty:/home/monopoly/workspace/toys/sheetz/.venv/lib/python3.13/site-packages/polars/io/_utils.py:282 -> 1.3KiB
- read_csv:/home/monopoly/workspace/toys/sheetz/.venv/lib/python3.13/site-packages/polars/io/csv/functions.py:572 -> 768.0B
- read_csv:/home/monopoly/workspace/toys/sheetz/.venv/lib/python3.13/site-packages/polars/io/csv/functions.py:549 -> 728.0B
</code>
Похоже, что pandas превосходит Polars во всех отношениях по эффективности использования памяти. Я что-то делаю не так? Я пытаюсь использовать LazyFrame через pl.scan_csv, но даже это не помогает.
Подробнее здесь: https://stackoverflow.com/questions/796 ... han-pandas