Google Speech V2: real-time streaming from the microphone (Python)

Post by Anonymous »

I can't seem to find anywhere in the documentation how to use the Google Speech V2 API. For some reason V2 appears to be cheaper than V1 (according to Google's Speech pricing table, though I have no idea why the legacy version would be the more expensive one), and V2 also supports automatic language detection.
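A side note on the automatic language detection mentioned above: in V2 it appears to be configured by listing several candidate languages in RecognitionConfig.language_codes rather than passing a single code. A hedged sketch (the specific codes below are only an illustration, not something from this project):

# Hedged sketch: V2's RecognitionConfig has a repeated language_codes field, and
# listing several codes is how detection among them seems to be requested.
from google.cloud.speech_v2.types import cloud_speech

multi_language_config = cloud_speech.RecognitionConfig(
    auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
    language_codes=["en-US", "es-ES"],  # candidate languages to detect among
    model="long",
)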
Here is a working V1 example that listens to the microphone and transcribes in real time:
import queue
import re
import sys
import os

from google.cloud import speech

import pyaudio

# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10)  # 100ms

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "key_google.json"


class MicrophoneStream:
    """Opens a recording stream as a generator yielding the audio chunks."""

    def __init__(self: object, rate: int = RATE, chunk: int = CHUNK) -> None:
        """The audio -- and generator -- is guaranteed to be on the main thread."""
        self._rate = rate
        self._chunk = chunk

        # Create a thread-safe buffer of audio data
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self: object) -> object:
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            # The API currently only supports 1-channel (mono) audio
            channels=1,
            rate=self._rate,
            input=True,
            frames_per_buffer=self._chunk,
            # Run the audio stream asynchronously to fill the buffer object.
            # This is necessary so that the input device's buffer doesn't
            # overflow while the calling thread makes network requests, etc.
            stream_callback=self._fill_buffer,
        )

        self.closed = False

        return self

    def __exit__(
        self: object,
        type: object,
        value: object,
        traceback: object,
    ) -> None:
        """Closes the stream, regardless of whether the connection was lost or not."""
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        # Signal the generator to terminate so that the client's
        # streaming_recognize method will not block the process termination.
        self._buff.put(None)
        self._audio_interface.terminate()

    def _fill_buffer(
        self: object,
        in_data: object,
        frame_count: int,
        time_info: object,
        status_flags: object,
    ) -> object:
        """Continuously collect data from the audio stream, into the buffer.

        Args:
            in_data: The audio data as a bytes object
            frame_count: The number of frames captured
            time_info: The time information
            status_flags: The status flags

        Returns:
            The audio data as a bytes object
        """
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self: object) -> object:
        """Generates audio chunks from the stream of audio data in chunks.

        Args:
            self: The MicrophoneStream object

        Returns:
            A generator that outputs audio chunks.
        """
        while not self.closed:
            # Use a blocking get() to ensure there's at least one chunk of
            # data, and stop iteration if the chunk is None, indicating the
            # end of the audio stream.
            chunk = self._buff.get()
            if chunk is None:
                return
            data = [chunk]

            # Now consume whatever other data's still buffered.
            while True:
                try:
                    chunk = self._buff.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break

            yield b"".join(data)


def listen_print_loop(responses: object) -> None:  # Changed the return type to None
    num_chars_printed = 0
    all_transcripts = []  # To store all transcripts

    for response in responses:
        if not response.results:
            continue

        result = response.results[0]
        if not result.alternatives:
            continue

        transcript = result.alternatives[0].transcript
        overwrite_chars = " " * (num_chars_printed - len(transcript))

        if not result.is_final:
            sys.stdout.write(transcript + overwrite_chars + "\r")
            sys.stdout.flush()
            num_chars_printed = len(transcript)
        else:
            print(transcript + overwrite_chars)
            all_transcripts.append(transcript)  # Storing the transcript

            if re.search(r"\b(exit|quit)\b", transcript, re.I):
                print("Exiting..")
                print("All Transcripts: ", all_transcripts)  # Print all transcripts if needed
                break

            num_chars_printed = 0


def main() -> None:
    """Transcribe speech from audio file."""
    # See http://g.co/cloud/speech/docs/languages
    # for a list of supported languages.
    language_code = "es"  # a BCP-47 language tag

    client = speech.SpeechClient()
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=RATE,
        language_code=language_code,
    )

    streaming_config = speech.StreamingRecognitionConfig(
        config=config, interim_results=True
    )

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        requests = (
            speech.StreamingRecognizeRequest(audio_content=content)
            for content in audio_generator
        )

        responses = client.streaming_recognize(streaming_config, requests)

        # Now, put the transcription responses to use.
        listen_print_loop(responses)


if __name__ == "__main__":
    main()

However, I have no idea how to convert the same code to use version 2. Here is the only example Google provides, but that code essentially reads a .wav or some other pre-recorded file, whereas I need to do this in real time. Does anyone know how to use V2 to transcribe a live stream?
Here is my attempt (but it doesn't work at all):
import os
import queue
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech
import pyaudio

# Audio recording parameters
RATE = 16000
CHUNK = int(RATE / 10)  # 100ms

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "key_google.json"


class MicrophoneStream:
    def __init__(self, rate, chunk):
        self._rate = rate
        self._chunk = chunk
        self._buff = queue.Queue()
        self.closed = True

    def __enter__(self):
        self._audio_interface = pyaudio.PyAudio()
        self._audio_stream = self._audio_interface.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self._rate,
            input=True,
            frames_per_buffer=self._chunk,
            stream_callback=self._fill_buffer,
        )
        self.closed = False
        return self

    def __exit__(self, type, value, traceback):
        self._audio_stream.stop_stream()
        self._audio_stream.close()
        self.closed = True
        self._buff.put(None)
        self._audio_interface.terminate()

    def _fill_buffer(self, in_data, frame_count, time_info, status_flags):
        self._buff.put(in_data)
        return None, pyaudio.paContinue

    def generator(self):
        while not self.closed:
            chunk = self._buff.get()
            if chunk is None:
                return
            data = [chunk]
            while True:
                try:
                    chunk = self._buff.get(block=False)
                    if chunk is None:
                        return
                    data.append(chunk)
                except queue.Empty:
                    break
            yield b"".join(data)


def main():
    project_id = "stellar-cumulus-379717"

    client = SpeechClient()

    recognition_config = cloud_speech.RecognitionConfig(
        auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        model="long",
    )

    streaming_config = cloud_speech.StreamingRecognitionConfig(config=recognition_config)
    config_request = cloud_speech.StreamingRecognizeRequest(
        recognizer=f"projects/{project_id}/locations/global/recognizers/_",
        streaming_config=streaming_config,
    )

    with MicrophoneStream(RATE, CHUNK) as stream:
        audio_generator = stream.generator()
        audio_requests = (
            cloud_speech.StreamingRecognizeRequest(audio_content=content)
            for content in audio_generator
        )

        def requests():
            yield config_request
            yield from audio_requests

        responses = client.streaming_recognize(requests=requests())

        for response in responses:
            for result in response.results:
                print(f"Transcript: {result.alternatives[0].transcript}")


if __name__ == "__main__":
    main()

I'm getting a bunch of errors:
Traceback (most recent call last):
  File "C:\Python3\Lib\site-packages\google\api_core\grpc_helpers.py", line 162, in error_remapped_callable
    return _StreamingResponseIterator(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python3\Lib\site-packages\google\api_core\grpc_helpers.py", line 88, in __init__
    self._stored_first_result = next(self._wrapped)
                                ^^^^^^^^^^^^^^^^^^^
  File "C:\Python3\Lib\site-packages\grpc\_channel.py", line 541, in __next__
    return self._next()
           ^^^^^^^^^^^^
  File "C:\Python3\Lib\site-packages\grpc\_channel.py", line 967, in _next
    raise self
grpc._channel._MultiThreadedRendezvous:

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "D:\AI_workers\pocketchat\stt2.py", line 98, in <module>
    main()
  File "D:\AI_workers\pocketchat\stt2.py", line 90, in main
    responses = client.streaming_recognize(requests=requests())
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python3\Lib\site-packages\google\cloud\speech_v2\services\speech\client.py", line 1639, in streaming_recognize
    response = rpc(
               ^^^^
  File "C:\Python3\Lib\site-packages\google\api_core\gapic_v1\method.py", line 113, in __call__
    return wrapped_func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Python3\Lib\site-packages\google\api_core\grpc_helpers.py", line 166, in error_remapped_callable
    raise exceptions.from_grpc_error(exc) from exc
google.api_core.exceptions.Unknown: None Exception iterating requests!
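For context while debugging: "Exception iterating requests!" is raised when the request generator itself throws during iteration, so the failure is probably in how the StreamingRecognizeRequest objects are constructed rather than in the gRPC call. One likely culprit (an assumption on my part, not something the traceback confirms): in the speech_v2 types the audio bytes field seems to be named audio, not audio_content as in V1, and interim results seem to be requested through StreamingRecognitionFeatures. A minimal, untested sketch of the request generator under those assumptions:

# Hedged sketch: assumes speech_v2's StreamingRecognizeRequest takes `audio`
# (V1 used `audio_content`) and that StreamingRecognitionFeatures carries
# interim_results. build_requests() is a hypothetical helper, not part of the API.
from google.cloud.speech_v2.types import cloud_speech


def build_requests(project_id: str, audio_generator):
    """Yield the config request first, then one request per audio chunk."""
    recognition_config = cloud_speech.RecognitionConfig(
        auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),
        language_codes=["en-US"],
        model="long",
    )
    streaming_config = cloud_speech.StreamingRecognitionConfig(
        config=recognition_config,
        streaming_features=cloud_speech.StreamingRecognitionFeatures(
            interim_results=True,
        ),
    )
    # The first request on the stream carries the recognizer and the config...
    yield cloud_speech.StreamingRecognizeRequest(
        recognizer=f"projects/{project_id}/locations/global/recognizers/_",
        streaming_config=streaming_config,
    )
    # ...and every subsequent request carries only raw audio bytes.
    for chunk in audio_generator:
        yield cloud_speech.StreamingRecognizeRequest(audio=chunk)

If that assumption holds, iterating client.streaming_recognize(requests=build_requests(project_id, stream.generator())) would mirror the V1 response loop above.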


More details here: https://stackoverflow.com/questions/770 ... g-from-mic