Anonymous
How can I fuse embeddings in a way that increases efficiency and score?
Post
Anonymous » 29 Nov 2024, 09:03
I am working on a problem where the goal is to augment traditional embeddings with LLM-generated embeddings (I use the last_hidden_state for this). So far I have tried simply combining them and using a cross-attention mechanism. While combining the embeddings gives results similar to using only the traditional embeddings (and definitely no better), the cross-attention mechanism unexpectedly made performance worse. Are there other methods that could potentially improve the score? The code is below:
Code for the simple combination (the projected LLM embeddings are added element-wise to the traditional embeddings):
Code:
def forward(self, depot_xy, node_xy_demand_tw, llm_embeddings):
moe_loss = 0
# Get traditional embeddings
if isinstance(self.embedding_depot, MoE) or isinstance(self.embedding_node, MoE):
embedded_depot, loss_depot = self.embedding_depot(depot_xy)
embedded_node, loss_node = self.embedding_node(node_xy_demand_tw)
moe_loss = moe_loss + loss_depot + loss_node
else:
embedded_depot = self.embedding_depot(depot_xy)
embedded_node = self.embedding_node(node_xy_demand_tw)
# Project LLM embeddings and normalize
# print(320, self.llm_projection[0].weight.dtype, llm_embeddings.dtype)
projected_llm = self.llm_projection(llm_embeddings)
projected_llm = self.layer_norm(projected_llm)
# Combine traditional embeddings with LLM embeddings
depot_combined = embedded_depot + projected_llm[:, :1, :] # For depot
node_combined = embedded_node + projected_llm[:, 1:, :] # For nodes
out = torch.cat((depot_combined, node_combined), dim=1)
for layer in self.layers:
out, loss = layer(out)
moe_loss = moe_loss + loss
return out, moe_loss
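To make it clearer what I mean by "other methods": one direction would be to replace the plain element-wise addition above with a learned gate, roughly as sketched below. GatedFusion is purely illustrative and is not part of my model.
Code:
import torch
import torch.nn as nn

# Purely illustrative sketch (not part of my model): a learned gate that mixes
# the traditional and LLM embeddings instead of adding them element-wise.
class GatedFusion(nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        # The gate sees both embeddings and produces per-dimension weights in (0, 1)
        self.gate = nn.Sequential(
            nn.Linear(2 * embedding_dim, embedding_dim),
            nn.Sigmoid(),
        )

    def forward(self, trad_emb, llm_emb):
        # trad_emb, llm_emb: (batch, seq, embedding_dim)
        g = self.gate(torch.cat((trad_emb, llm_emb), dim=-1))
        return g * trad_emb + (1 - g) * llm_emb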
Code for the cross-attention-based fusion, together with the cross-attention class:
Code:
########################################
# CROSS ATTENTION
########################################
class CrossAttentionFusion(nn.Module):
def __init__(self, embedding_dim, head_num, qkv_dim):
super().__init__()
self.head_num = head_num
# Cross attention layers for traditional -> LLM
self.Wq_trad = nn.Linear(embedding_dim, head_num * qkv_dim, bias=False).to(dtype=torch.bfloat16)
self.Wk_llm = nn.Linear(embedding_dim, head_num * qkv_dim, bias=False).to(dtype=torch.bfloat16)
self.Wv_llm = nn.Linear(embedding_dim, head_num * qkv_dim, bias=False).to(dtype=torch.bfloat16)
# Cross attention layers for LLM -> traditional
self.Wq_llm = nn.Linear(embedding_dim, head_num * qkv_dim, bias=False).to(dtype=torch.bfloat16)
self.Wk_trad = nn.Linear(embedding_dim, head_num * qkv_dim, bias=False).to(dtype=torch.bfloat16)
self.Wv_trad = nn.Linear(embedding_dim, head_num * qkv_dim, bias=False).to(dtype=torch.bfloat16)
# Output projections
self.W_out_trad = nn.Linear(head_num * qkv_dim, embedding_dim).to(dtype=torch.bfloat16)
self.W_out_llm = nn.Linear(head_num * qkv_dim, embedding_dim).to(dtype=torch.bfloat16)
# Layer norms
self.norm_trad = nn.LayerNorm(embedding_dim).to(dtype=torch.bfloat16)
self.norm_llm = nn.LayerNorm(embedding_dim).to(dtype=torch.bfloat16)
def forward(self, trad_emb, llm_emb):
# Cross attention: traditional -> LLM
# print(f"trad_emb dtype: {trad_emb.dtype}, shape: {trad_emb.shape}")
# print(f"llm_emb dtype: {llm_emb.dtype}, shape: {llm_emb.shape}")
# print(f"Wq_trad type: {self.Wq_trad.weight.dtype}, shape: {self.Wq_trad.weight.shape}")
q_trad = reshape_by_heads(self.Wq_trad(trad_emb), self.head_num)
k_llm = reshape_by_heads(self.Wk_llm(llm_emb), self.head_num)
v_llm = reshape_by_heads(self.Wv_llm(llm_emb), self.head_num)
trad_attends_llm = multi_head_attention(q_trad, k_llm, v_llm)
trad_fused = self.W_out_trad(trad_attends_llm)
trad_out = self.norm_trad(trad_emb + trad_fused)
# Cross attention: LLM -> traditional
q_llm = reshape_by_heads(self.Wq_llm(llm_emb), self.head_num)
k_trad = reshape_by_heads(self.Wk_trad(trad_emb), self.head_num)
v_trad = reshape_by_heads(self.Wv_trad(trad_emb), self.head_num)
llm_attends_trad = multi_head_attention(q_llm, k_trad, v_trad)
llm_fused = self.W_out_llm(llm_attends_trad)
llm_out = self.norm_llm(llm_emb + llm_fused)
# Combine the cross-attended features
fused_embeddings = trad_out + llm_out
return fused_embeddings
########################################
# ENCODER
########################################
class MTL_Encoder(nn.Module):
def __init__(self, **model_params):
super().__init__()
self.model_params = model_params
embedding_dim = self.model_params['embedding_dim']
hidden_dim = self.model_params['ff_hidden_dim']
encoder_layer_num = self.model_params['encoder_layer_num']
head_num = self.model_params['head_num']
qkv_dim = self.model_params['qkv_dim']
llama_hidden_size = 4096 # Llama-2 7B hidden size
# Project Llama embeddings to the model's embedding dimension with dtype torch.bfloat16
self.llm_projection = nn.Sequential(
nn.Linear(llama_hidden_size, hidden_dim),
nn.ReLU(),
nn.Linear(hidden_dim, embedding_dim)
).to(dtype=torch.bfloat16)
# Add layer normalization for better embedding fusion
self.layer_norm = nn.LayerNorm(embedding_dim).to(dtype=torch.bfloat16)
self.layer_norm_trad = nn.LayerNorm(embedding_dim).to(dtype=torch.bfloat16)
if self.model_params['num_experts'] > 1 and "Raw" in self.model_params['expert_loc']:
self.embedding_depot = MoE(input_size=2, output_size=embedding_dim,
num_experts=self.model_params['num_experts'],
k=self.model_params['topk'], T=1.0,
noisy_gating=True,
routing_level=self.model_params['routing_level'],
routing_method=self.model_params['routing_method'],
moe_model="Linear")
self.embedding_node = MoE(input_size=5, output_size=embedding_dim,
num_experts=self.model_params['num_experts'],
k=self.model_params['topk'], T=1.0,
noisy_gating=True,
routing_level=self.model_params['routing_level'],
routing_method=self.model_params['routing_method'],
moe_model="Linear")
else:
self.embedding_depot = nn.Linear(2, embedding_dim)
self.embedding_node = nn.Linear(5, embedding_dim)
# Cross-attention fusion module
self.cross_attention_fusion = CrossAttentionFusion(
embedding_dim=embedding_dim,
head_num=head_num,
qkv_dim=qkv_dim
)
self.layers = nn.ModuleList([EncoderLayer(i, **model_params)
for i in range(encoder_layer_num)])
def forward(self, depot_xy, node_xy_demand_tw, llm_embeddings):
moe_loss = 0
# Get traditional embeddings
if isinstance(self.embedding_depot, MoE) or isinstance(self.embedding_node, MoE):
embedded_depot, loss_depot = self.embedding_depot(depot_xy)
embedded_node, loss_node = self.embedding_node(node_xy_demand_tw)
moe_loss = moe_loss + loss_depot + loss_node
else:
embedded_depot = self.embedding_depot(depot_xy)
embedded_node = self.embedding_node(node_xy_demand_tw)
# Combine depot and node embeddings
traditional_embeddings = torch.cat((embedded_depot, embedded_node), dim=1).to(dtype=torch.bfloat16)
# Project and normalize LLM embeddings
projected_llm = self.llm_projection(llm_embeddings)
projected_llm = self.layer_norm(projected_llm)
# Normalize traditional embeddings
traditional_embeddings = self.layer_norm_trad(traditional_embeddings)
# Apply cross-attention fusion
fused_embeddings = self.cross_attention_fusion(
traditional_embeddings,
projected_llm
)
# Pass through encoder layers
out = fused_embeddings
for layer in self.layers:
out, loss = layer(out)
moe_loss = moe_loss + loss
return out, moe_loss
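For reference, this is roughly how the fusion module is exercised in isolation. The sizes below are placeholders, and the snippet relies on the reshape_by_heads and multi_head_attention helpers used elsewhere in my code.
Code:
# Placeholder sizes, only to illustrate the expected shapes and dtypes;
# relies on reshape_by_heads / multi_head_attention from the rest of the codebase.
fusion = CrossAttentionFusion(embedding_dim=128, head_num=8, qkv_dim=16)
trad = torch.randn(2, 51, 128, dtype=torch.bfloat16)  # (batch, 1 depot + 50 nodes, dim)
llm = torch.randn(2, 51, 128, dtype=torch.bfloat16)   # projected LLM embeddings, same shape
fused = fusion(trad, llm)                              # -> (2, 51, 128), bfloat16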
And this is how I obtain the LLM embeddings:
Code:
with torch.no_grad():
    outputs = self.llama(**inputs)
    # Take the first token of the last hidden state
    # (for Llama-2 this is the BOS token; the model has no [CLS] token)
    new_embeddings = outputs.hidden_states[-1][:, 0, :]
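A side note on this step: since Llama-2 is a decoder-only model, there is no [CLS] token, so index 0 is simply the BOS position. An alternative I could use instead is mean pooling over the non-padding tokens; the snippet below is only a sketch and assumes that inputs contains an attention_mask.
Code:
# Sketch only: mean-pool the last hidden state over non-padding tokens
# instead of taking the first token; assumes inputs contains an attention_mask.
mask = inputs["attention_mask"].unsqueeze(-1)      # (batch, seq, 1)
hidden = outputs.hidden_states[-1]                 # (batch, seq, hidden_size)
new_embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)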
Am I doing something wrong? Or are LLM-generated embeddings even needed on top of the traditional ones?
More details here:
https://stackoverflow.com/questions/79233998/how-can-i-fuse-embeddings-in-a-manner-such-that-it-increase-efficiency-and-score