import fastavro
import pyarrow as pa
import pyarrow.parquet as pq
def convert_large_avro_to_parquet(avro_file_path, parquet_file_path, batch_size=10000):
    """Convert a (potentially large) Avro file to Parquet in fixed-size batches.

    Records are streamed with fastavro and written incrementally through a
    single pyarrow ParquetWriter, so peak memory is bounded by ``batch_size``
    records rather than the whole file.

    Args:
        avro_file_path: Path to the input Avro file.
        parquet_file_path: Path of the Parquet file to create.
        batch_size: Number of records buffered before each Parquet write.
    """
    with open(avro_file_path, 'rb') as avro_file:
        avro_reader = fastavro.reader(avro_file)
        parquet_writer = None
        try:
            batch = []  # records buffered for the current batch
            for record in avro_reader:
                batch.append(record)
                # Flush in batches to avoid holding the whole file in memory.
                if len(batch) >= batch_size:
                    # from_pylist builds the Arrow table directly from the
                    # record dicts; the original went through pd.DataFrame
                    # without ever importing pandas (NameError).
                    table = pa.Table.from_pylist(batch)
                    if parquet_writer is None:
                        # Initialize the writer from the first batch's schema.
                        parquet_writer = pq.ParquetWriter(parquet_file_path, table.schema)
                    parquet_writer.write_table(table)
                    batch = []  # clear the buffer
            # Write any remaining records (last partial batch).
            if batch:
                table = pa.Table.from_pylist(batch)
                if parquet_writer is None:
                    parquet_writer = pq.ParquetWriter(parquet_file_path, table.schema)
                parquet_writer.write_table(table)
        finally:
            # Close the Parquet writer if it was ever initialized.
            if parquet_writer is not None:
                parquet_writer.close()
    print(f"Successfully converted {avro_file_path} to {parquet_file_path}")
# Example usage
convert_large_avro_to_parquet("large_file.avro", "large_file.parquet", batch_size=50000)
Подробнее здесь: https://stackoverflow.com/questions/79246908/how-to-convert-everything-in-python
Как конвертировать все в Python [закрыто] ⇐ Python
Программы на Python
1733216968
Anonymous
import fastavro
import pyarrow as pa
import pyarrow.parquet as pq
def convert_large_avro_to_parquet(avro_file_path, parquet_file_path, batch_size=10000):
    """Convert a (potentially large) Avro file to Parquet in fixed-size batches.

    Records are streamed with fastavro and written incrementally through a
    single pyarrow ParquetWriter, so peak memory is bounded by ``batch_size``
    records rather than the whole file.

    Args:
        avro_file_path: Path to the input Avro file.
        parquet_file_path: Path of the Parquet file to create.
        batch_size: Number of records buffered before each Parquet write.
    """
    with open(avro_file_path, 'rb') as avro_file:
        avro_reader = fastavro.reader(avro_file)
        parquet_writer = None
        try:
            batch = []  # records buffered for the current batch
            for record in avro_reader:
                batch.append(record)
                # Flush in batches to avoid holding the whole file in memory.
                if len(batch) >= batch_size:
                    # from_pylist builds the Arrow table directly from the
                    # record dicts; the original went through pd.DataFrame
                    # without ever importing pandas (NameError).
                    table = pa.Table.from_pylist(batch)
                    if parquet_writer is None:
                        # Initialize the writer from the first batch's schema.
                        parquet_writer = pq.ParquetWriter(parquet_file_path, table.schema)
                    parquet_writer.write_table(table)
                    batch = []  # clear the buffer
            # Write any remaining records (last partial batch).
            if batch:
                table = pa.Table.from_pylist(batch)
                if parquet_writer is None:
                    parquet_writer = pq.ParquetWriter(parquet_file_path, table.schema)
                parquet_writer.write_table(table)
        finally:
            # Close the Parquet writer if it was ever initialized.
            if parquet_writer is not None:
                parquet_writer.close()
    print(f"Successfully converted {avro_file_path} to {parquet_file_path}")
# Example usage
convert_large_avro_to_parquet("large_file.avro", "large_file.parquet", batch_size=50000)
Подробнее здесь: [url]https://stackoverflow.com/questions/79246908/how-to-convert-everything-in-python[/url]
Ответить
1 сообщение
• Страница 1 из 1
Перейти
- Кемерово-IT
- ↳ Javascript
- ↳ C#
- ↳ JAVA
- ↳ Elasticsearch aggregation
- ↳ Python
- ↳ Php
- ↳ Android
- ↳ Html
- ↳ Jquery
- ↳ C++
- ↳ IOS
- ↳ CSS
- ↳ Excel
- ↳ Linux
- ↳ Apache
- ↳ MySql
- Детский мир
- Для души
- ↳ Музыкальные инструменты даром
- ↳ Печатная продукция даром
- Внешняя красота и здоровье
- ↳ Одежда и обувь для взрослых даром
- ↳ Товары для здоровья
- ↳ Физкультура и спорт
- Техника - даром!
- ↳ Автомобилистам
- ↳ Компьютерная техника
- ↳ Плиты: газовые и электрические
- ↳ Холодильники
- ↳ Стиральные машины
- ↳ Телевизоры
- ↳ Телефоны, смартфоны, планшеты
- ↳ Швейные машинки
- ↳ Прочая электроника и техника
- ↳ Фототехника
- Ремонт и интерьер
- ↳ Стройматериалы, инструмент
- ↳ Мебель и предметы интерьера даром
- ↳ Сантехника
- Другие темы
- ↳ Разное даром
- ↳ Давай меняться!
- ↳ Отдам\возьму за копеечку
- ↳ Работа и подработка в Кемерове
- ↳ Давай с тобой поговорим...
Мобильная версия