Код: Выделить всё
import speech_recognition as sr
from pyannote.audio import Model, Inference
from scipy.spatial.distance import cdist
import torch
import numpy
recognizer = sr.Recognizer()
recognizer.dynamic_energy_threshold = False
recognizer.energy_threshold=1100
recognizer.pause_threshold=3
embedding_model = Model.from_pretrained("pyannote/embedding", use_auth_token="(my token)", device="cpu")
inference = Inference(embedding_model, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"), window="whole")
Speaker_embedding = inference("voice_sample.wav")
with sr.Microphone() as source:
audio = recognizer.listen(source, timeout=10)
recognized_text = recognizer.recognize_google(audio, language="en")
audio_data = np.frombuffer(audio.get_raw_data(), dtype=np.int16).astype(np.float32) / 32768.0
live_audio = {"waveform": torch.tensor(audio_data).unsqueeze(0), "sample_rate": 16000}
live_embedding = inference(live_audio)
distance = cdist(live_embedding, Speaker_embedding, metric="cosine")[0, 0]
print(distance)
print(recognized_text)
А еще, как ни странно, пример из https://huggingface.co/pyannote/embedding:
Код: Выделить всё
from pyannote.audio import Model
model = Model.from_pretrained("pyannote/embedding", use_auth_token="ACCESS_TOKEN_GOES_HERE")
from pyannote.audio import Inference
inference = Inference(model, window="whole")
embedding1 = inference("speaker1.wav")
embedding2 = inference("speaker2.wav")
# `embeddingX` is (1 x D) numpy array extracted from the file as a whole.
from scipy.spatial.distance import cdist
distance = cdist(embedding1, embedding2, metric="cosine")[0,0]
Код: Выделить всё
ValueError: XA must be a 2-dimensional arrayПодробнее здесь: https://stackoverflow.com/questions/793 ... ltaneously
Мобильная версия