Как я могу гарантировать, что преобразование текста в речь Azure переключается между языками? — Python

Программы на Python
Ответить
Anonymous
 Как я могу гарантировать, что преобразование текста в речь Azure переключается между языками?

Сообщение Anonymous »

Я работаю над приложением, использующим Azure, Gemini, Python и Dart, и хочу обеспечить правильное произношение между языками. Например, я хочу перевести между немецким и испанским языками: цель состоит в том, чтобы слова «привет» -> «хола» произносились правильно на обоих языках. Azure отлично справляется с предложениями, но с дословным переводом у него проблемы.
Вот мой код. Не могли бы вы помочь мне решить проблему?
  • translation_service.py

Код: Выделить всё

class TranslationService:
    """Translate text with a Gemini chat session and voice the result.

    The service keeps one chat session primed with formatting instructions,
    sends each prompt to it, hands the generated text to the TTS service and
    packs everything into a ``Translation`` entity.
    """

    def __init__(self):
        """Configure Gemini, create the TTS service and start the chat session.

        Raises:
            ValueError: If ``GEMINI_API_KEY`` is not set in the environment.
        """
        load_dotenv()
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY not found in environment variables")

        genai.configure(api_key=api_key)

        self.generation_config = {
            "temperature": 1,
            "top_p": 0.95,
            "top_k": 40,
            "max_output_tokens": 8192,
            "response_mime_type": "text/plain",
        }

        self.model = GenerativeModel(
            model_name="gemini-2.0-flash-exp",
            generation_config=self.generation_config,
        )

        self.tts_service = EnhancedTTSService()

        # Initialize chat session with translation instructions.
        # NOTE(review): the prompt below is a plain string (not an f-string and
        # never passed through .format), so "{input_text}" is sent literally.
        self.chat_session = self.model.start_chat(
            history=[
                {
                    "role": "user",
                    "parts": [
                        """
Translate the following text providing word-by-word translations with arrows:

Text: "{input_text}"

Required format for each language:

1. GERMAN TRANSLATION:
a) Complete sentence:
"Full sentence in German"

b) Word-by-word:
Ich -> Yo
suche -> busco
einen -> un
Job -> trabajo
damit -> para que
ich -> yo
finanziell -> económicamente
unabhängig -> independiente
sein -> ser
kann -> pueda

c) Formal complete sentence:
"Full formal sentence in German"

d) Formal word-by-word:
Ich -> Yo
suche -> busco
nach -> por
einem -> un
Job -> trabajo
um -> para
finanziell -> económicamente
unabhängig -> independiente
zu -> ser
sein -> ser

2. ENGLISH TRANSLATION:
[Same format as German]

3. FRENCH TRANSLATION:
[Same format as German]

4.  ITALIAN TRANSLATION:
[Same format as German]

IMPORTANT FORMATTING RULES:
- Use -> for word translations (NOT = or parentheses)
- Put each word-translation pair on a new line
- Keep one space before and after the arrow
- Include articles, prepositions, and auxiliary verbs as separate entries
- Maintain consistent spacing throughout
"""
                    ],
                }
            ]
        )

    def _restore_accents(self, text: str) -> str:
        """Replace ASCII accent work-arounds with the accented letters.

        A vowel followed by an acute accent mark (´) becomes the accented
        vowel, and "n~" / "N~" become "ñ" / "Ñ".

        Args:
            text: Text that may contain the two-character work-arounds.

        Returns:
            The text with every work-around replaced.
        """
        accent_map = {
            "a": "á", "e": "é", "i": "í", "o": "ó", "u": "ú", "n": "ñ",
            "A": "Á", "E": "É", "I": "Í", "O": "Ó", "U": "Ú", "N": "Ñ",
        }

        def accented(match):
            return accent_map[match.group(1)]

        text = re.sub(r"([aeiouAEIOU])´", accented, text)
        return re.sub(r"([nN])~", accented, text)

    async def process_prompt(self, text: str, source_lang: str, target_lang: str) -> "Translation":
        """Send ``text`` to the chat session, synthesize audio and build the result.

        Args:
            text: Prompt forwarded to the Gemini chat session.
            source_lang: Stored on the result as ``source_language``.
            target_lang: Stored on the result as ``target_language``.

        Returns:
            A ``Translation`` holding the generated text and the audio file name
            (``None`` when synthesis failed).

        Raises:
            Exception: Wrapping any failure, with the original error chained.
        """
        try:
            # NOTE(review): send_message is called synchronously inside this
            # coroutine, so the event loop waits while Gemini answers.
            response = self.chat_session.send_message(text)
            generated_text = response.text

            print(f"Generated text from Gemini: {generated_text[:100]}...")

            audio_filename = await self.tts_service.text_to_speech(
                text=generated_text
            )

            if audio_filename:
                print(f"Successfully generated audio: {audio_filename}")
            else:
                print("Audio generation failed")

            # NOTE(review): _generate_grammar_explanations is not defined in the
            # visible part of this class - confirm it exists in the real file.
            return Translation(
                original_text=text,
                translated_text=generated_text,
                source_language=source_lang,
                target_language=target_lang,
                audio_path=audio_filename,
                translations={"main": generated_text},
                word_by_word=self._generate_word_by_word(text, generated_text),
                grammar_explanations=self._generate_grammar_explanations(generated_text),
            )

        except Exception as e:
            print(f"Error in process_prompt: {str(e)}")
            # Chain the cause so the original traceback is not lost.
            raise Exception(f"Translation processing failed: {str(e)}") from e

    def _generate_word_by_word(self, original: str, translated: str) -> dict[str, dict[str, str]]:
        """Pair whitespace-separated words of both texts by position.

        Words of ``original`` without a counterpart in ``translated`` are
        dropped; a repeated word keeps its last pairing.
        """
        return {
            word: {"translation": counterpart, "pos": "unknown"}
            for word, counterpart in zip(original.split(), translated.split())
        }

    def _auto_fix_spelling(self, text: str) -> str:
        """Correct unknown words in ``text`` while keeping its layout intact.

        Uses ``self.spell`` (an object offering ``unknown`` and ``correction``)
        when one has been attached to the instance; without it the text is
        returned unchanged. Corrections keep the casing style of the word they
        replace (ALL CAPS or Capitalized). Whitespace and punctuation are left
        exactly as they were.
        """
        # __init__ does not create a spell checker, so tolerate its absence.
        spell = getattr(self, "spell", None)
        if spell is None:
            return text

        def fix(match):
            word = match.group(0)
            if not spell.unknown([word]):
                return word
            correction = spell.correction(word)
            if not correction:
                return word
            if word.isupper():
                return correction.upper()
            if word[0].isupper():
                return correction.capitalize()
            return correction

        # Substituting in place (instead of tokenising and re-joining) keeps
        # the original spacing around words and punctuation.
        return re.sub(r"\b\w+\b", fix, text)
  • speech_service.py

Код: Выделить всё

import speech_recognition as sr
from ...domain.entities.translation import Translation
from ..services.translation_service import TranslationService

class SpeechService:
    """Recognize speech from an audio file and pass it to the translation service."""

    def __init__(self):
        """Create the speech recognizer and the translation service."""
        self.recognizer = sr.Recognizer()
        self.translation_service = TranslationService()

    async def process_audio(
        self,
        audio_file_path: str,
        source_lang: str = 'es',
        target_lang: str = 'de',
    ) -> Translation:
        """Transcribe an audio file and run the transcript through translation.

        Args:
            audio_file_path: Path of the audio file to read.
            source_lang: Language code used for recognition and reported as the
                source language, so both always agree.
            target_lang: Language code reported as the target language.

        Returns:
            The ``Translation`` produced by the translation service.

        Raises:
            Exception: Wrapping any failure, with the original error chained.
        """
        try:
            with sr.AudioFile(audio_file_path) as source:
                audio = self.recognizer.record(source)
            text = self.recognizer.recognize_google(audio, language=source_lang)

            # Process the recognized text through translation service
            return await self.translation_service.process_prompt(
                text=text,
                source_lang=source_lang,
                target_lang=target_lang,
            )
        except Exception as e:
            raise Exception(f"Speech processing failed: {str(e)}") from e

  • tts_service.py

Код: Выделить всё

import asyncio
import os
import re
from datetime import datetime
from typing import Optional
from xml.sax.saxutils import escape

from azure.cognitiveservices.speech import (
    ResultReason,
    SpeechConfig,
    SpeechSynthesisOutputFormat,
    SpeechSynthesizer,
)
from azure.cognitiveservices.speech.audio import AudioOutputConfig

class EnhancedTTSService:
    """Synthesize text to MP3 files through the Azure Speech SDK."""

    def __init__(self):
        """Read the Azure credentials and prepare the speech configuration.

        Raises:
            ValueError: If ``AZURE_SPEECH_KEY`` or ``AZURE_SPEECH_REGION`` is
                missing from the environment.
        """
        # Initialize Speech Config
        self.subscription_key = os.getenv("AZURE_SPEECH_KEY")
        self.region = os.getenv("AZURE_SPEECH_REGION")

        if not self.subscription_key or not self.region:
            raise ValueError("Azure Speech credentials not found in environment variables")

        # Create speech config
        self.speech_config = SpeechConfig(
            subscription=self.subscription_key,
            region=self.region,
        )
        self.speech_config.set_speech_synthesis_output_format(
            SpeechSynthesisOutputFormat.Audio16Khz32KBitRateMonoMp3
        )

        # Voice used for each detected language code
        self.voice_mapping = {
            'en': 'en-US-JennyMultilingualNeural',
            'es': 'es-ES-ArabellaMultilingualNeural',
            'de': 'de-DE-SeraphinaMultilingualNeural',
        }

    def _get_temp_directory(self) -> str:
        """Create (if needed) and return the directory that stores audio files."""
        if os.name == 'nt':  # Windows
            temp_dir = os.path.join(os.environ.get('TEMP', ''), 'tts_audio')
        else:  # Unix/Linux
            temp_dir = '/tmp/tts_audio'
        os.makedirs(temp_dir, exist_ok=True)
        return temp_dir

    def _detect_language(self, text: str) -> str:
        """Guess the language code of ``text`` from its special characters.

        German is checked first, then Spanish; text without any of the marker
        characters is reported as English.

        Returns:
            ``'de'``, ``'es'`` or ``'en'``.
        """
        # Simple language detection based on character patterns
        if re.search(r'[äöüßÄÖÜ]', text):
            return 'de'
        # Upper-case accented letters count too, matching the German check.
        if re.search(r'[áéíóúñÁÉÍÓÚÑ¿¡]', text):
            return 'es'
        return 'en'

    def _generate_ssml(self, text: str) -> str:
        """Wrap ``text`` in an SSML document for the voice of its language.

        The text is XML-escaped, the voice is picked from ``voice_mapping`` by
        the detected language (falling back to the English voice), and the
        locale is taken from the first two parts of the voice name.

        Returns:
            The SSML document as a string.
        """
        primary_lang = self._detect_language(text)
        voice_name = self.voice_mapping.get(primary_lang, self.voice_mapping['en'])
        # "de-DE-SeraphinaMultilingualNeural" -> "de-DE"
        locale = "-".join(voice_name.split("-")[:2])

        # Escape &, < and > so arbitrary text cannot break the XML document.
        escaped_text = escape(text)

        return "\n".join([
            f'<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="{locale}">',
            f'<voice name="{voice_name}">',
            f'<lang xml:lang="{locale}">',
            escaped_text,
            '</lang>',
            '</voice>',
            '</speak>',
        ])

    async def text_to_speech(self, text: str, output_path: Optional[str] = None) -> Optional[str]:
        """Synthesize ``text`` into an MP3 file.

        Args:
            text: Text to speak.
            output_path: Target file; when omitted a timestamped file is
                created in the audio temp directory.

        Returns:
            The base name of the written file, or ``None`` when synthesis was
            canceled or any error occurred (errors are printed, not raised).
        """
        synthesizer = None
        try:
            print(f"\nStarting TTS process for text: {text[:100]}...")  # First 100 chars

            # Generate output path if not provided
            if not output_path:
                temp_dir = self._get_temp_directory()
                # Microseconds keep two requests in the same second from
                # writing to the same file.
                timestamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
                output_path = os.path.join(temp_dir, f"speech_{timestamp}.mp3")

            # Configure audio output
            audio_config = AudioOutputConfig(filename=output_path)

            # Create synthesizer for this request
            synthesizer = SpeechSynthesizer(
                speech_config=self.speech_config,
                audio_config=audio_config,
            )

            # Generate and validate SSML
            ssml = self._generate_ssml(text)
            print(f"Generated SSML length: {len(ssml)} characters")

            # Perform synthesis in a worker thread so the blocking .get()
            # does not stall the event loop.
            print("Starting speech synthesis...")
            result = await asyncio.get_running_loop().run_in_executor(
                None,
                lambda: synthesizer.speak_ssml_async(ssml).get()
            )

            # Handle result
            if result.reason == ResultReason.SynthesizingAudioCompleted:
                print("Speech synthesis completed successfully")
                return os.path.basename(output_path)

            if result.reason == ResultReason.Canceled:
                print(f"Speech synthesis canceled: {result.cancellation_details.reason}")
                print(f"Error details: {result.cancellation_details.error_details}")
                return None

            return None

        except Exception as e:
            print(f"Exception in text_to_speech: {str(e)}")
            return None

        finally:
            # Best-effort cleanup: a failure here must not mask the result.
            if synthesizer:
                try:
                    synthesizer.stop_speaking_async()
                except Exception as cleanup_error:
                    print(f"Failed to stop synthesizer: {cleanup_error}")

  • routes.py

Код: Выделить всё

# server/app/infrastructure/api/routes.py
from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Optional
from ...application.services.translation_service import TranslationService
from ...domain.entities.translation import Translation
import os

# FastAPI application object that the routes below are registered on.
app = FastAPI()

# Add CORS middleware.
# NOTE(review): wildcard origins are combined with allow_credentials=True -
# confirm this is intended before deploying.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, replace with your frontend domain
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Single translation service instance shared by all requests; it is created
# at import time of this module.
translation_service = TranslationService()

class PromptRequest(BaseModel):
    """Request body of the conversation endpoint."""

    # Text forwarded to the translation service.
    text: str
    # Language codes; both fall back to "en" when the client omits them.
    source_lang: Optional[str] = "en"
    target_lang: Optional[str] = "en"

@app.post("/api/conversation", response_model=Translation)
async def start_conversation(prompt: PromptRequest):
    """Run the prompt through the translation service and return its result.

    Raises:
        HTTPException: Status 500 carrying the error text when processing fails.
    """
    try:
        response = await translation_service.process_prompt(
            prompt.text,
            prompt.source_lang,
            prompt.target_lang,
        )
        print(f"Generated audio path: {response.audio_path}")  # Debug log
        return response
    except Exception as e:
        # Keep the original error chained for server-side tracebacks.
        raise HTTPException(status_code=500, detail=str(e)) from e

@app.get("/api/audio/{filename}")
async def get_audio(filename: str):
    """Serve a previously generated audio file from the TTS temp directory.

    Args:
        filename: Name of the file; any directory part is discarded.

    Raises:
        HTTPException: Status 404 when the file does not exist, status 500 for
            any other failure.
    """
    try:
        # Get the audio directory
        if os.name == 'nt':  # Windows
            temp_dir = os.path.join(os.environ.get('TEMP', ''), 'tts_audio')
        else:  # Unix/Linux
            temp_dir = '/tmp/tts_audio'

        # Clean the filename and build the full path
        filename = os.path.basename(filename)  # Prevent path traversal
        file_path = os.path.join(temp_dir, filename)

        # isfile (not exists) so a directory is never handed to FileResponse.
        if not os.path.isfile(file_path):
            raise HTTPException(status_code=404, detail="Audio file not found")

        return FileResponse(
            path=file_path,
            media_type='audio/mp3',
            filename=filename,
            headers={
                "Accept-Ranges": "bytes",
                "Content-Type": "audio/mp3",
                "Cache-Control": "no-cache",
                "Access-Control-Allow-Origin": "*",
            },
        )
    except HTTPException:
        # Deliberate HTTP errors (the 404 above) must not be rewritten as 500.
        raise
    except Exception as e:
        print(f"Error serving audio file: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e)) from e

  • main.py

Код: Выделить всё

import uvicorn
from app.infrastructure.api.routes import app
import os

if __name__ == "__main__":
    # Create the audio directory before the server starts.
    audio_dir = '/tmp/tts_audio' if os.name != 'nt' else os.path.join(os.environ.get('TEMP', ''), 'tts_audio')
    os.makedirs(audio_dir, exist_ok=True)
    os.chmod(audio_dir, 0o755)  # rwx for owner, r-x for group and others

    # Pass the application as an import string: reload=True needs to
    # re-import the app in a fresh process, which an object does not allow.
    uvicorn.run("app.infrastructure.api.routes:app", host="127.0.0.1", port=8000, reload=True)
Это служба, которую я использую в Azure:
[img]https://i.sstatic.net/XI2UDAIc.png[/img]

Я пробовал библиотеку «langid», но, похоже, в моем случае она не работает. Моя цель — услышать правильное произношение англо-испанских и немецко-испанских пар слов во время дословного перевода.

Подробнее здесь: https://stackoverflow.com/questions/793 ... -languages
Ответить

Быстрый ответ

Изменение регистра текста: 
Смайлики
:) :( :oops: :roll: :wink: :muza: :clever: :sorry: :angel: :read: *x)
Ещё смайлики…
   
К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми.

Максимально разрешённый размер вложения: 15 МБ.

Вернуться в «Python»