Оптимизация вставки нескольких таблиц SQLAlchemy в MySQL с отношениями

Оптимизация вставки нескольких таблиц SQLAlchemy в MySQL с отношениями ⇐ Python

1 сообщение • Страница 1 из 1

Anonymous

Оптимизация вставки нескольких таблиц SQLAlchemy в MySQL с отношениями

Цитата

Сообщение Anonymous » 24 дек 2024, 01:16

У меня есть функция, которая вставляет большие объёмы данных (модели, информационные панели, формулы, графики) в базу данных, сохраняя при этом связи между ними (например, «родитель — потомок», внешние ключи). Текущая реализация работает, но медленно — из-за вызова db.flush() после каждой пакетной вставки. Я хочу оптимизировать её: уменьшить количество вызовов flush и повысить производительность, сохранив при этом связи.
СУБД — MySQL (Aurora).

Имя: SQLAlchemy

Версия: 2.0.36
Проблема:

Оптимизированный код неправильно заполняет сопоставления старых и новых идентификаторов после flush. Связи (например, идентификаторы родителей, внешние ключи) остаются неустановленными или недействительными, что приводит к ошибкам валидации и битым ссылкам.
Цель:

Как оптимизировать эти вставки, сохранив при этом связи и обеспечив правильное заполнение сопоставлений идентификаторов?
Что я пробовал:

Уменьшил количество вызовов flush, сгруппировав вставки по этапам (например, модели → информационные панели → формулы).
Использовал db.bulk_insert_mappings для более быстрой вставки, где это возможно.
Пытался обновить родительские идентификаторы и ссылки после минимальных сбросов.
Что я ожидал:

Ускорение выполнения за счёт минимизации вызовов flush и повторных обращений к базе данных.
Корректное сопоставление старых и новых идентификаторов после flush, чтобы связи обновлялись правильно.
код:
@time_function(log_to_db_enabled=True)
def apply_template(self, db: Session, data: TemplateJsonSchema, company_id: int, payload: ApplyTemplateSchema):
    """
    Apply a template by cloning its models, dashboards, formulas and graphs.

    An ActiveTemplate row is created first. Every model, dashboard and formula
    in ``data`` is then re-created with a fresh primary key, and the references
    between them (model_id, parent_id, dashboard_id, formula_id) are rewritten
    from the template's ids to the newly generated ids. Graphs are created last
    through GraphDomain. The session is committed at the end; on any exception
    it is rolled back and the exception is re-raised.

    Note: the schema objects inside ``data`` are mutated in place.

    @param db: Session used for all inserts; committed on success, rolled back on error.
    @param data: TemplateJsonSchema holding the models, dashboards, formulas and graphs to clone.
    @param company_id: Company id stamped on the ActiveTemplate, the models and the dashboards.
    @param payload: ApplyTemplateSchema providing template_id, target_branch_id and start_date.
    @return: The newly created ActiveTemplate instance.
    """
    try:
        graph_domain = GraphDomain(
            GraphRepository(),
            GraphOptionsRepository(),
            GraphLayoutDataRepository(),
            GraphDataPointRepository(),
            DataGroupRepository(),
        )

        template_id = payload.template_id
        branch_id = payload.target_branch_id

        active_template = ActiveTemplate(company_id=company_id, template_id=template_id,
                                         start_date=payload.start_date, branch_id=branch_id)
        db.add(active_template)
        # Flush so that active_template.id can be read by the models below.
        db.flush()

        # Stage 1: models. Keyed by the template's (old) id so later stages can
        # translate references to the cloned rows.
        old_id_to_model = {}
        for model_attributes in data.models:
            old_id = model_attributes.id
            model_attributes.active_template_id = active_template.id
            model_attributes.company_id = company_id
            model_attributes.branch_id = branch_id
            model = ModelCalculation(**model_attributes.model_dump())
            model.id = None  # let the database assign a new primary key
            old_id_to_model[old_id] = model

        db.add_all(old_id_to_model.values())
        # Flush so the new model ids are readable by dashboards and formulas.
        db.flush()

        # Stage 2: dashboards and formulas. Both only read the model ids
        # flushed above, so they are added together and share a single flush.
        old_id_to_dashboard = {}
        for dashboard_attributes in data.dashboards:
            old_id = dashboard_attributes.id
            # NOTE(review): an unknown model_id raises AttributeError here and
            # aborts the whole template, whereas formulas with an unknown
            # model_id are only logged and skipped - confirm which is intended.
            dashboard_attributes.model_id = old_id_to_model.get(dashboard_attributes.model_id).id
            dashboard_attributes.branch_id = branch_id
            dashboard_attributes.company_id = company_id
            dashboard = Dashboard(**dashboard_attributes.model_dump())
            dashboard.id = None
            old_id_to_dashboard[old_id] = dashboard

        db.add_all(old_id_to_dashboard.values())

        old_id_to_formula = {}
        old_id_to_parent_id = {}  # original parent_id of each formula, by old formula id

        for formula_attributes in data.formulas:
            old_id = formula_attributes.id
            new_model = old_id_to_model.get(formula_attributes.model_id)
            if new_model is None:
                # No exception is active here, so no exc_info is attached.
                logging.error(f"Model with id {formula_attributes.model_id} not found on formula {old_id}")
                continue
            formula_attributes.model_id = new_model.id
            formula_attributes.branch_id = branch_id

            # The parent's new id is unknown until every formula has been
            # flushed, so remember the original and insert with no parent.
            old_id_to_parent_id[old_id] = formula_attributes.parent_id
            formula_attributes.parent_id = None

            formula = Formula(**formula_attributes.model_dump())
            formula.id = None
            old_id_to_formula[old_id] = formula

        db.add_all(old_id_to_formula.values())
        # Single flush for dashboards and formulas.
        db.flush()

        formula_id_original_to_clone = {old_id: formula.id for old_id, formula in old_id_to_formula.items()}

        # Re-attach parents using the old-id -> new-id mapping. A parent that
        # was skipped above is absent from the mapping and yields None.
        for old_id, formula in old_id_to_formula.items():
            original_parent_id = old_id_to_parent_id[old_id]

            if original_parent_id:  # only set if there was a parent originally
                formula.parent_id = formula_id_original_to_clone.get(original_parent_id)

        # id_mapping type Dict[Type[Base], Dict[int, int]]
        id_mapping = {Formula: formula_id_original_to_clone}

        # NOTE(review): parent_id was already translated in the loop above; if
        # this helper also translates parent_id (update_parent_id=True), a new
        # id that happens to equal some old id could be remapped a second time
        # - confirm against the helper's implementation.
        update_cloned_formulas_expression(cloned_objects=list(old_id_to_formula.values()), id_mapping=id_mapping,
                                          allow_original_parent_id=True, update_parent_id=True)
        # Flush the updated cloned formulas before graphs are created.
        db.flush()

        # Stage 3: graphs, pointing at the cloned dashboards and formulas.
        for graph_attributes in data.graphs:
            old_dashboard_id = graph_attributes.graph_data.dashboard_id
            graph_attributes.graph_data.dashboard_id = old_id_to_dashboard.get(old_dashboard_id).id
            graph_attributes.graph_data.branch_id = branch_id

            for data_group in graph_attributes.data_groups:
                for data_point in data_group.data_points:
                    # Falls back to the original formula id when no clone exists.
                    data_point.formula_id = formula_id_original_to_clone.get(data_point.formula_id, data_point.formula_id)
            graph_domain.create(db=db, data=GraphCreateSchema(**graph_attributes.model_dump()))

        db.commit()
        return active_template

    except Exception as e:
        logging.error(f"Error applying template: {e}", exc_info=True)
        db.rollback()
        raise

Подробнее здесь: https://stackoverflow.com/questions/792 ... ationships

1734992184

Anonymous

У меня есть функция, которая вставляет большие объёмы данных (модели, информационные панели, формулы, графики) в базу данных, сохраняя при этом связи между ними (например, «родитель — потомок», внешние ключи). Текущая реализация работает, но медленно — из-за вызова db.flush() после каждой пакетной вставки. Я хочу оптимизировать её: уменьшить количество вызовов flush и повысить производительность, сохранив при этом связи.
СУБД — MySQL (Aurora).

Имя: SQLAlchemy

Версия: 2.0.36
Проблема:

Оптимизированный код неправильно заполняет сопоставления старых и новых идентификаторов после flush. Связи (например, идентификаторы родителей, внешние ключи) остаются неустановленными или недействительными, что приводит к ошибкам валидации и битым ссылкам.
Цель:

Как оптимизировать эти вставки, сохранив при этом связи и обеспечив правильное заполнение сопоставлений идентификаторов?
Что я пробовал:

Уменьшил количество вызовов flush, сгруппировав вставки по этапам (например, модели → информационные панели → формулы).
Использовал db.bulk_insert_mappings для более быстрой вставки, где это возможно.
Пытался обновить родительские идентификаторы и ссылки после минимальных сбросов.
Что я ожидал:

Ускорение выполнения за счёт минимизации вызовов flush и повторных обращений к базе данных.
Корректное сопоставление старых и новых идентификаторов после flush, чтобы связи обновлялись правильно.
код:
@time_function(log_to_db_enabled=True)
def apply_template(self, db: Session, data: TemplateJsonSchema, company_id: int, payload: ApplyTemplateSchema):
    """
    Apply a template by cloning its models, dashboards, formulas and graphs.

    An ActiveTemplate row is created first. Every model, dashboard and formula
    in ``data`` is then re-created with a fresh primary key, and the references
    between them (model_id, parent_id, dashboard_id, formula_id) are rewritten
    from the template's ids to the newly generated ids. Graphs are created last
    through GraphDomain. The session is committed at the end; on any exception
    it is rolled back and the exception is re-raised.

    Note: the schema objects inside ``data`` are mutated in place.

    @param db: Session used for all inserts; committed on success, rolled back on error.
    @param data: TemplateJsonSchema holding the models, dashboards, formulas and graphs to clone.
    @param company_id: Company id stamped on the ActiveTemplate, the models and the dashboards.
    @param payload: ApplyTemplateSchema providing template_id, target_branch_id and start_date.
    @return: The newly created ActiveTemplate instance.
    """
    try:
        graph_domain = GraphDomain(
            GraphRepository(),
            GraphOptionsRepository(),
            GraphLayoutDataRepository(),
            GraphDataPointRepository(),
            DataGroupRepository(),
        )

        template_id = payload.template_id
        branch_id = payload.target_branch_id

        active_template = ActiveTemplate(company_id=company_id, template_id=template_id,
                                         start_date=payload.start_date, branch_id=branch_id)
        db.add(active_template)
        # Flush so that active_template.id can be read by the models below.
        db.flush()

        # Stage 1: models. Keyed by the template's (old) id so later stages can
        # translate references to the cloned rows.
        old_id_to_model = {}
        for model_attributes in data.models:
            old_id = model_attributes.id
            model_attributes.active_template_id = active_template.id
            model_attributes.company_id = company_id
            model_attributes.branch_id = branch_id
            model = ModelCalculation(**model_attributes.model_dump())
            model.id = None  # let the database assign a new primary key
            old_id_to_model[old_id] = model

        db.add_all(old_id_to_model.values())
        # Flush so the new model ids are readable by dashboards and formulas.
        db.flush()

        # Stage 2: dashboards and formulas. Both only read the model ids
        # flushed above, so they are added together and share a single flush.
        old_id_to_dashboard = {}
        for dashboard_attributes in data.dashboards:
            old_id = dashboard_attributes.id
            # NOTE(review): an unknown model_id raises AttributeError here and
            # aborts the whole template, whereas formulas with an unknown
            # model_id are only logged and skipped - confirm which is intended.
            dashboard_attributes.model_id = old_id_to_model.get(dashboard_attributes.model_id).id
            dashboard_attributes.branch_id = branch_id
            dashboard_attributes.company_id = company_id
            dashboard = Dashboard(**dashboard_attributes.model_dump())
            dashboard.id = None
            old_id_to_dashboard[old_id] = dashboard

        db.add_all(old_id_to_dashboard.values())

        old_id_to_formula = {}
        old_id_to_parent_id = {}  # original parent_id of each formula, by old formula id

        for formula_attributes in data.formulas:
            old_id = formula_attributes.id
            new_model = old_id_to_model.get(formula_attributes.model_id)
            if new_model is None:
                # No exception is active here, so no exc_info is attached.
                logging.error(f"Model with id {formula_attributes.model_id} not found on formula {old_id}")
                continue
            formula_attributes.model_id = new_model.id
            formula_attributes.branch_id = branch_id

            # The parent's new id is unknown until every formula has been
            # flushed, so remember the original and insert with no parent.
            old_id_to_parent_id[old_id] = formula_attributes.parent_id
            formula_attributes.parent_id = None

            formula = Formula(**formula_attributes.model_dump())
            formula.id = None
            old_id_to_formula[old_id] = formula

        db.add_all(old_id_to_formula.values())
        # Single flush for dashboards and formulas.
        db.flush()

        formula_id_original_to_clone = {old_id: formula.id for old_id, formula in old_id_to_formula.items()}

        # Re-attach parents using the old-id -> new-id mapping. A parent that
        # was skipped above is absent from the mapping and yields None.
        for old_id, formula in old_id_to_formula.items():
            original_parent_id = old_id_to_parent_id[old_id]

            if original_parent_id:  # only set if there was a parent originally
                formula.parent_id = formula_id_original_to_clone.get(original_parent_id)

        # id_mapping type Dict[Type[Base], Dict[int, int]]
        id_mapping = {Formula: formula_id_original_to_clone}

        # NOTE(review): parent_id was already translated in the loop above; if
        # this helper also translates parent_id (update_parent_id=True), a new
        # id that happens to equal some old id could be remapped a second time
        # - confirm against the helper's implementation.
        update_cloned_formulas_expression(cloned_objects=list(old_id_to_formula.values()), id_mapping=id_mapping,
                                          allow_original_parent_id=True, update_parent_id=True)
        # Flush the updated cloned formulas before graphs are created.
        db.flush()

        # Stage 3: graphs, pointing at the cloned dashboards and formulas.
        for graph_attributes in data.graphs:
            old_dashboard_id = graph_attributes.graph_data.dashboard_id
            graph_attributes.graph_data.dashboard_id = old_id_to_dashboard.get(old_dashboard_id).id
            graph_attributes.graph_data.branch_id = branch_id

            for data_group in graph_attributes.data_groups:
                for data_point in data_group.data_points:
                    # Falls back to the original formula id when no clone exists.
                    data_point.formula_id = formula_id_original_to_clone.get(data_point.formula_id, data_point.formula_id)
            graph_domain.create(db=db, data=GraphCreateSchema(**graph_attributes.model_dump()))

        db.commit()
        return active_template

    except Exception as e:
        logging.error(f"Error applying template: {e}", exc_info=True)
        db.rollback()
        raise
 

Подробнее здесь: [url]https://stackoverflow.com/questions/79291014/optimizing-sqlalchemy-multi-table-insert-to-mysql-with-relationships[/url]