Интеграция живого преобразования речи в текст Deepgram с Twilio на Python?Python

Программы на Python
Ответить Пред. темаСлед. тема
Anonymous
 Интеграция живого преобразования речи в текст Deepgram с Twilio на Python?

Сообщение Anonymous »

Меня заинтриговали Блэнд и Вапи, и я захотел создать своего собственного помощника по вызову с помощью искусственного интеллекта. Для вызова я решил использовать Twilio. Я попробовал использовать живую модель преобразования речи в текст от AssemblyAI, но для этого мне необходимо предоставить данные моей кредитной карты. Поэтому я попробовал использовать Deepgram. Проблема в том, что ни одно из моих решений до сих пор не работало. Хороших руководств по этой проблеме почти нет, учитывая, что каждая другая модель преобразования речи в текст имеет свое собственное руководство, даже несколько раз.
В последнее время я застрял в очень запутанной ситуации. часть программы. Взгляните на код вместе с выводом терминала.
Вот код из main.py:

Код: Выделить всё

import os
import asyncio

from flask import Flask, request, Response
from flask_sock import Sock
import ngrok
from twilio.rest import Client
from twilio.twiml.voice_response import Connect, VoiceResponse, Say, Stream
from dotenv import load_dotenv
load_dotenv()

from twilio_transcriber import TwilioTranscriber

# Flask settings
PORT = 5000
DEBUG = False
INCOMING_CALL_ROUTE = '/'
WEBSOCKET_ROUTE = '/realtime'

# Twilio authentication
account_sid = os.environ['TWILIO_ACCOUNT_SID']
api_key = os.environ['TWILIO_API_KEY_SID']
api_secret = os.environ['TWILIO_API_SECRET']
client = Client(api_key, api_secret, account_sid)

# Twilio phone number to call
TWILIO_NUMBER = os.environ['TWILIO_NUMBER']

# ngrok authentication
ngrok.set_auth_token(os.getenv("NGROK_AUTHTOKEN"))
app = Flask(__name__)
sock = Sock(app)

@app.route(INCOMING_CALL_ROUTE, methods=['GET', 'POST'])
def receive_call():
if request.method == 'POST':
xml = f"""


You will currently be talking to an AI assistant. By continuing this call, you are consenting for the transcript of this call to be anonymously recorded and used for improving our AI calling functions.





""".strip()
return Response(xml, mimetype='text/xml')
else:
return f"This is the calling URL for any incoming number."

@app.route(INCOMING_CALL_ROUTE + "num/", methods=['GET', 'POST'])
def receive_outgoing_call(Number):
if request.method == 'POST':
xml = f"""


You will currently be talking to an AI assistant.  By continuing this call, you are consenting for the transcript of this call to be anonymously recorded and used for improving our AI calling functions.





""".strip()
return Response(xml, mimetype='text/xml')
else:
return f"This is the calling URL for {Number}."

@sock.route(WEBSOCKET_ROUTE + "/")
def transcription_websocket(ws, Number):
asyncio.run(handle_websocket(ws, Number))

async def handle_websocket(ws, Number):
transcriber = TwilioTranscriber(ws, Number)
await transcriber.connect()

if __name__ == "__main__":
try:
# Open Ngrok tunnel
listener = ngrok.forward(f"http://localhost:{PORT}")
print(f"Ngrok tunnel opened at {listener.url()} for port {PORT}")
NGROK_URL = listener.url()

# Set ngrok URL to be the webhook for the appropriate Twilio number
twilio_numbers = client.incoming_phone_numbers.list()
twilio_number_sid = [num.sid for num in twilio_numbers if num.phone_number == TWILIO_NUMBER][0]
client.incoming_phone_numbers(twilio_number_sid).update(account_sid, voice_url=f"{NGROK_URL}{INCOMING_CALL_ROUTE}")

call = client.calls.create(
url=NGROK_URL + "/num/+91xxxxxxxxxx",
to='+91xxxxxxxxxx',
from_=TWILIO_NUMBER,
)
print(call.sid)

# run the app
app.run(port=PORT, debug=DEBUG)
finally:
# Always disconnect the ngrok tunnel
ngrok.disconnect()
Вот код из twilio_transcriber.py:

Код: Выделить всё

import os
import json
import base64
import asyncio
import websockets
from dotenv import load_dotenv
from pydub import AudioSegment

load_dotenv()

DEEPGRAM_API_KEY = os.getenv('DEEPGRAM_API_KEY')
TWILIO_SAMPLE_RATE = 8000  # Hz
DEEPGRAM_WS_URL = 'wss://api.deepgram.com/v1/listen?encoding=mulaw&sample_rate=8000&channels=2&multichannel=true'

class TwilioTranscriber:
def __init__(self, twilio_ws, number):
self.twilio_ws = twilio_ws
self.deepgram_ws = None
self.ph_no = number
self.audio_queue = asyncio.Queue()
self.callsid_queue = asyncio.Queue()

async def connect(self):
headers = {'Authorization': f'Token {DEEPGRAM_API_KEY}'}
print('Connecting to Deepgram WebSocket...')
try:
self.deepgram_ws = await websockets.connect(DEEPGRAM_WS_URL, extra_headers=headers)
print('Connected to Deepgram WebSocket.')
except Exception as e:
print(f'Failed to connect to Deepgram WebSocket: {e}')
return

await asyncio.wait([
asyncio.ensure_future(self.deepgram_sender()),
asyncio.ensure_future(self.deepgram_receiver()),
asyncio.ensure_future(self.twilio_receiver())
])

async def deepgram_sender(self):
print('deepgram_sender started')
while True:
chunk = await self.audio_queue.get()
if chunk is None:
break
try:
await self.deepgram_ws.send(chunk)
print('Sent audio chunk to Deepgram.')
except Exception as e:
print(f'Failed to send audio chunk to Deepgram: {e}')
break

async def deepgram_receiver(self):
print('deepgram_receiver started')
callsid = await self.callsid_queue.get()
async for message in self.deepgram_ws:
try:
print(f'Received message from Deepgram: {message}')
transcription = json.loads(message)
print(f'Transcription from {self.ph_no}: {transcription}')
except Exception as e:
print(f'Failed to process Deepgram message:  {e}')
break

async def twilio_receiver(self):
print('twilio_receiver started')
BUFFER_SIZE = 20 * 160
inbuffer = bytearray()
outbuffer = bytearray()
inbound_chunks_started = False
outbound_chunks_started = False
latest_inbound_timestamp = 0
latest_outbound_timestamp = 0

async for message in self.twilio_ws:
print(f'Received message from Twilio: {message}')
try:
data = json.loads(message)
print(f'Parsed JSON data: {data}')

if data['event'] == 'start':
start = data['start']
callsid = start['callSid']
self.callsid_queue.put_nowait(callsid)
print(f'Received start event for callSid: {callsid}')
elif data['event'] == 'connected':
print('Received connected event')
continue
elif data['event'] == 'media':
media = data['media']
chunk = base64.b64decode(media['payload'])
print(f'Received media chunk: {len(chunk)} bytes')

if media['track'] == 'inbound':
if inbound_chunks_started:
if latest_inbound_timestamp + 20 < int(media['timestamp']):
bytes_to_fill = 8 * (int(media['timestamp']) - (latest_inbound_timestamp + 20))
inbuffer.extend(b'\xff' * bytes_to_fill)
else:
inbound_chunks_started = True
latest_inbound_timestamp = int(media['timestamp'])
latest_outbound_timestamp = int(media['timestamp']) - 20
latest_inbound_timestamp = int(media['timestamp'])
inbuffer.extend(chunk)
elif media['track'] == 'outbound':
if latest_outbound_timestamp + 20 < int(media['timestamp']):
bytes_to_fill = 8 * (int(media['timestamp']) - (latest_outbound_timestamp + 20))
outbuffer.extend(b'\xff' * bytes_to_fill)
latest_outbound_timestamp = int(media['timestamp'])
outbuffer.extend(chunk)
elif data['event'] == 'stop':
print('Received stop event')
break

while len(inbuffer) >= BUFFER_SIZE and len(outbuffer) >= BUFFER_SIZE:
asinbound = AudioSegment(inbuffer[:BUFFER_SIZE], sample_width=1, frame_rate=TWILIO_SAMPLE_RATE, channels=1)
asoutbound = AudioSegment(outbuffer[:BUFFER_SIZE], sample_width=1, frame_rate=TWILIO_SAMPLE_RATE, channels=1)
mixed = AudioSegment.from_mono_audiosegments(asinbound, asoutbound)
self.audio_queue.put_nowait(mixed.raw_data)
inbuffer = inbuffer[BUFFER_SIZE:]
outbuffer = outbuffer[BUFFER_SIZE:]
except Exception as e:
print(f'Error processing Twilio message: {e}')
break

self.audio_queue.put_nowait(b'')

# The code below is just for demonstration purposes and assumes you have an established websocket connection `twilio_ws` and a phone number `number`.
# You would need to replace these with your actual websocket connection and phone number.
# Example usage:
# transcriber = TwilioTranscriber(twilio_ws, number)
# asyncio.run(transcriber.connect())
Вот вывод терминала:

Код: Выделить всё

C:\Users\admin\AppData\Local\Programs\Python\Python312\Lib\site-packages\pydub\utils.py:170: RuntimeWarning: Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work
warn("Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work", RuntimeWarning)
Ngrok tunnel opened at https://7018-2405-201-8026-300f-3828-7548-4a2-67a.ngrok-free.app for port 5000
CAca39dfd003f34e29a1d0fef5f89aeb59
* Serving Flask app 'main'
* Debug mode: off
WARNING: This is a development server. Do not use it in a production deployment.  Use a production WSGI server instead.
* Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [31/May/2024 10:45:02] "POST /num/+919382280814 HTTP/1.1" 200 -
Connecting to Deepgram WebSocket...
Connected to Deepgram WebSocket.
deepgram_sender started
deepgram_receiver started
twilio_receiver started
Если вы думаете, что я вставил неполный вывод терминала, то нет. Он фактически застревает здесь, и я не получаю ответа ни от моего телефонного звонка, ни от терминала.

Подробнее здесь: https://stackoverflow.com/questions/785 ... -in-python
Реклама
Ответить Пред. темаСлед. тема

Быстрый ответ

Изменение регистра текста: 
Смайлики
:) :( :oops: :roll: :wink: :muza: :clever: :sorry: :angel: :read: *x)
Ещё смайлики…
   
К этому ответу прикреплено по крайней мере одно вложение.

Если вы не хотите добавлять вложения, оставьте поля пустыми.

Максимально разрешённый размер вложения: 15 МБ.

  • Похожие темы
    Ответы
    Просмотры
    Последнее сообщение

Вернуться в «Python»