import torch
from datasets import load_dataset
from transformers import pipeline
from PIL import Image
import soundfile as sf
import wavmark
from playsound import playsound
from utils import *

# `device` is expected to be provided by utils; define a fallback so the
# module also runs standalone.
if "device" not in globals():
    device = "cuda" if torch.cuda.is_available() else "cpu"


class TextandImageQuery:
    QuestionText = None
    RawImage = None
    AnswerText = None  # output of run()

    def funcTextandImageQuery(self, raw_image_path, question):
        '''
        Answer the question about the image with a BLIP VQA pipeline.
        '''
        raw_image = Image.open(raw_image_path).convert("RGB")
        pipe = pipeline("visual-question-answering", model="Salesforce/blip-vqa-base")
        output = pipe(raw_image, question, top_k=1)[0]
        return output["answer"]

    def run(self):
        self.AnswerText = self.funcTextandImageQuery(self.RawImage, self.QuestionText)


class SpeechRecognition:
    QuestionAudio = None
    QuestionText = None  # output of run()

    def funcSpeechRecognition(self, audio_path):
        '''
        Transcribe the question audio with Whisper; play it back first so the
        user can verify the recording.
        '''
        if self.QuestionText is None:
            playsound(audio_path)
        speech_reco = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-base",
            device=device,
        )
        res = speech_reco(audio_path)
        return res["text"]

    def run(self):
        self.QuestionText = self.funcSpeechRecognition(self.QuestionAudio)


class SpeechSynthesis:
    AnswerText = None
    AnswerAudio = None  # output of run(): path to the watermarked wav file

    def funcSpeechSynthesis(self, answer_text):
        '''
        Synthesise the answer with SpeechT5, watermark it with wavmark and
        play the result.
        '''
        synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts")
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        # You can replace this embedding with your own as well.
        speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
        speech = synthesiser(
            "The answer to your question is: " + answer_text,
            forward_params={"speaker_embeddings": speaker_embedding},
        )

        # Embed a 16-bit watermark payload into the synthesised audio.
        payload = [0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1]
        model = wavmark.load_model().to(device)
        signal, sample_rate = speech["audio"], speech["sampling_rate"]
        watermarked_signal, _ = wavmark.encode_watermark(model, signal, payload, show_progress=True)

        # Save the watermarked audio as a new wav and play it back.
        path_output = "AudioAnswer.wav"
        sf.write(path_output, watermarked_signal, samplerate=sample_rate)
        playsound(path_output)
        return path_output

    def run(self):
        self.AnswerAudio = self.funcSpeechSynthesis(self.AnswerText)
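

# A minimal sketch of how the three stages chain together: spoken question ->
# transcript -> visual question answering -> spoken, watermarked answer.
# The input paths below ("question.wav", "scene.jpg") are placeholders, not
# files shipped with this module.
if __name__ == "__main__":
    recogniser = SpeechRecognition()
    recogniser.QuestionAudio = "question.wav"  # placeholder path to the recorded question
    recogniser.run()

    vqa = TextandImageQuery()
    vqa.QuestionText = recogniser.QuestionText
    vqa.RawImage = "scene.jpg"                 # placeholder path to the image being queried
    vqa.run()

    tts = SpeechSynthesis()
    tts.AnswerText = vqa.AnswerText
    tts.run()                                  # writes and plays AudioAnswer.wav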