Training an AI model for a Java-to-Python code translator and vice versa [closed]


Post by Anonymous »

I don't have much experience training AI models, but I got help from ChatGPT. I want to train my own model that translates a user's code into another language. It is currently meant to support three languages (C++, Java, Python). I tried training the model on Java and Python, and I added a dataset of paired code samples, but it still produces no output. The dataset is on Kaggle.
# Cell 1: Install Dependencies
!pip install --upgrade pip
!pip install torch transformers tqdm pandas sentencepiece
# Cell 2: Imports
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm
import re
# Cell 3: Data Cleaning Functions
def clean_code(code, language):
    """Remove comments from the code based on the programming language."""
    if language == 'Java':
        # Remove single-line comments
        code = re.sub(r'//.*', '', code)
        # Remove multi-line comments
        code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
    elif language == 'Python':
        # Remove single-line comments
        code = re.sub(r'#.*', '', code)

    return code.strip()
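
# Quick sanity check for clean_code (illustrative; this snippet is a made-up
# example, not taken from the Kaggle dataset):
sample = 'int x = 1; // counter\n/* block\ncomment */ x++;'
print(clean_code(sample, 'Java'))  # prints the snippet with both comment styles removed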

def load_dataset(file_path):
    """Load and clean the dataset from a CSV file."""
    dataset = pd.read_csv(file_path)

    # Drop rows with missing code so clean_code never receives NaN floats
    dataset = dataset.dropna(subset=['Java', 'Python'])

    # Clean the Java and Python code columns
    dataset['Java'] = dataset['Java'].apply(lambda x: clean_code(x, 'Java'))
    dataset['Python'] = dataset['Python'].apply(lambda x: clean_code(x, 'Python'))

    return dataset['Java'].tolist(), dataset['Python'].tolist()
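
# Sanity check on the loaded data (assumes the CSV path used in Cell 6 below).
# An empty or misaligned dataset is a common reason training "succeeds" but the
# model produces no useful output, so it is worth verifying early:
java_codes, python_codes = load_dataset("/kaggle/input/dataset/data.csv")
print(f"{len(java_codes)} aligned code pairs")
print("First Java sample:", java_codes[0][:200])
print("First Python sample:", python_codes[0][:200])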
# Cell 4: Dataset Class Implementation
class CodeTranslationDataset(Dataset):
    def __init__(self, source_codes, target_codes, tokenizer, direction='java_to_python', max_length=512):
        self.source_codes = source_codes
        self.target_codes = target_codes
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.direction = direction
        self.task_prefix = "Translate Java to Python: " if direction == 'java_to_python' else "Translate Python to Java: "

    def __len__(self):
        return len(self.source_codes)

    def __getitem__(self, idx):
        input_text = f"{self.task_prefix}{self.source_codes[idx]}"
        target_text = self.target_codes[idx]

        input_tokens = self.tokenizer(
            input_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        target_tokens = self.tokenizer(
            target_text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Replace padding token ids in the labels with -100 so the loss
        # ignores padded positions; training the loss on pad tokens is a
        # common reason a fine-tuned T5 learns to emit empty output.
        labels = target_tokens["input_ids"].squeeze()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_tokens["input_ids"].squeeze(),
            "attention_mask": input_tokens["attention_mask"].squeeze(),
            "labels": labels,
        }
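
# Illustrative check of a single dataset item (uses java_codes/python_codes
# from the sanity check above; tokenizer choice matches the t5-small default):
_tok = T5Tokenizer.from_pretrained("t5-small")
_ds = CodeTranslationDataset(java_codes, python_codes, _tok)
_item = _ds[0]
print(_item["input_ids"].shape, _item["labels"].shape)  # both torch.Size([512])
print((_item["labels"] == -100).sum().item(), "label positions masked out of the loss")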
# Cell 5: Code Translator Class
class CodeTranslator:
    def __init__(self, direction='java_to_python', model_name="t5-small", device="cuda"):
        self.direction = direction
        self.device = torch.device(device if torch.cuda.is_available() else "cpu")
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name).to(self.device)
        self.task_prefix = "Translate Java to Python: " if direction == 'java_to_python' else "Translate Python to Java: "
        print(f"Initialized {direction} model on {self.device}")

    def save_model(self, save_path="/kaggle/working/code_translator.pth"):
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'tokenizer_config': self.tokenizer.init_kwargs,
            'direction': self.direction
        }, save_path)
        print(f"Model saved to {save_path}")

    def train(self, train_source, train_target, val_source=None, val_target=None, batch_size=8, epochs=10, learning_rate=2e-5):
        train_dataset = CodeTranslationDataset(train_source, train_target, self.tokenizer, self.direction)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        val_loader = None
        if val_source and val_target:
            val_dataset = CodeTranslationDataset(val_source, val_target, self.tokenizer, self.direction)
            val_loader = DataLoader(val_dataset, batch_size=batch_size)

        optimizer = torch.optim.AdamW(self.model.parameters(), lr=learning_rate)

        for epoch in range(epochs):
            self.model.train()
            total_loss = 0
            progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

            for batch in progress_bar:
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                labels = batch["labels"].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                loss = outputs.loss
                total_loss += loss.item()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                progress_bar.set_postfix({"batch_loss": loss.item()})

            avg_loss = total_loss / len(train_loader)
            print(f"Train loss: {avg_loss:.4f}")

            # Evaluate on validation set if available
            if val_loader:
                val_loss = self._evaluate(val_loader)
                print(f"Validation loss: {val_loss:.4f}")

    def _evaluate(self, val_loader):
        self.model.eval()
        total_loss = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)
                labels = batch["labels"].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                total_loss += outputs.loss.item()

        return total_loss / len(val_loader)

    def translate(self, code):
        self.model.eval()
        # Strip leading/trailing whitespace so the task prefix concatenates cleanly
        input_text = f"{self.task_prefix}{code.strip()}"

        # No padding needed at inference for a single example
        inputs = self.tokenizer(
            input_text,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=512,
                num_beams=5,
                early_stopping=True
            )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)
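
# Optional smoke test (illustrative): the base t5-small checkpoint has never
# seen this task, so the output before fine-tuning will be meaningless, but
# running it once confirms the tokenize/generate/decode path works end to end:
_probe = CodeTranslator(direction='java_to_python', device='cuda')
print(_probe.translate('System.out.println("hi");'))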
# Cell 6: Load and Prepare Dataset
file_path = "/kaggle/input/dataset/data.csv" # Update this path to your CSV file
train_source, train_target = load_dataset(file_path)

# Split the dataset into training and validation sets
split = int(len(train_source) * 0.75)
val_source, val_target = train_source[split:], train_target[split:]

# Initialize the translator for Java to Python
java_to_python_translator = CodeTranslator(direction='java_to_python', device='cuda')

# Train the Java to Python model
java_to_python_translator.train(
    train_source=train_source[:split],
    train_target=train_target[:split],
    val_source=val_source,
    val_target=val_target,
    batch_size=4,       # Adjust batch size to fit GPU memory
    epochs=10,          # Increase epochs for better training
    learning_rate=5e-5  # Adjust learning rate
)

# Initialize the translator for Python to Java
python_to_java_translator = CodeTranslator(direction='python_to_java', device='cuda')

# Train the Python to Java model
python_to_java_translator.train(
    train_source=train_target[:split],  # Use Python as source
    train_target=train_source[:split],  # Use Java as target
    val_source=val_target,              # Validation set
    val_target=val_source,              # Validation set
    batch_size=4,       # Adjust batch size to fit GPU memory
    epochs=10,          # Increase epochs for better training
    learning_rate=5e-5  # Adjust learning rate
)
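
# save_model is defined above but never called; persisting both fine-tuned
# models keeps them available after the Kaggle session ends (file names here
# are suggestions, not from the original post):
java_to_python_translator.save_model("/kaggle/working/java_to_python.pth")
python_to_java_translator.save_model("/kaggle/working/python_to_java.pth")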
# Cell 7: Testing Phase
def run_tests(translator):
    print(f"\n{'='*30} Testing {translator.direction.upper()} {'='*30}")

    if translator.direction == 'java_to_python':
        test_inputs = test_java_cases
        source_lang = 'Java'
        target_lang = 'Python'
    else:
        test_inputs = test_python_cases
        source_lang = 'Python'
        target_lang = 'Java'

    for code in test_inputs:
        translated = translator.translate(code)
        print(f"{source_lang}:\n{code}\n{target_lang}:\n{translated}\n{'='*70}")

# Example test cases
test_java_cases = [
    "public class Main { public static void main(String[] args) { System.out.println(\"Hello World\"); } }",
    # Plausible completion of the loop truncated in the original post (the '<'
    # was likely eaten by the forum's HTML renderer):
    "int sum = 0; for(int i=0; i<10; i++) { sum += i; }",
]

More details here: https://stackoverflow.com/questions/794 ... vise-versa