# AIMs_files.py

import torch
from transformers import AutoModelForQuestionAnswering
from datasets import load_dataset
from transformers import AutoTokenizer, pipeline
from tqdm.auto import tqdm
import numpy as np
import collections
import evaluate
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, pipeline
from evaluate import load
from scipy.io.wavfile import write
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset
from PIL import Image
import soundfile as sf
from utils import *
import wavmark
from playsound import playsound


class TextandImageQuery():
    """Answer a natural-language question about an image.

    Usage: assign ``RawImage`` and ``QuestionText``, call ``run()``, then
    read the result from ``AnswerText``.
    """

    # Inputs
    QuestionText = None  # question asked about the image
    RawImage = None      # handed to PIL's ``Image.open`` by ``run``

    # Output
    AnswerText = None    # set by ``run``

    def funcTextandImageQuery(self, raw_image_path, question):
        """Run visual question answering on one image.

        Args:
            raw_image_path: Image location, opened with ``Image.open``.
            question: Question text passed to the pipeline.

        Returns:
            The ``'answer'`` field of the top-ranked pipeline result.
        """
        # convert() returns a new, fully loaded image, so the source file
        # can be closed right away instead of lingering until garbage
        # collection.
        with Image.open(raw_image_path) as source_image:
            raw_image = source_image.convert("RGB")

        # NOTE(review): the pipeline is rebuilt on every call; consider
        # caching it if this method is called repeatedly.
        pipe = pipeline("visual-question-answering", model="Salesforce/blip-vqa-base")

        # top_k=1 keeps only the best-scoring answer.
        output = pipe(raw_image, question, top_k=1)[0]
        return output['answer']

    def run(self):
        """Answer ``QuestionText`` about ``RawImage`` and store it in ``AnswerText``."""
        self.AnswerText = self.funcTextandImageQuery(self.RawImage, self.QuestionText)

class SpeechRecognition():
    """Transcribe a spoken question into text.

    Usage: assign ``QuestionAudio``, call ``run()``, then read the
    transcription from ``QuestionText``.
    """

    # Input
    QuestionAudio = None  # passed to both ``playsound`` and the ASR pipeline

    # Output
    QuestionText = None   # set by ``run``

    def funcSpeechRecognition(self, input):
        """Transcribe the audio referenced by *input*.

        If no transcription has been stored yet (``QuestionText`` is
        ``None``), the audio is played back before it is transcribed.

        Args:
            input: Audio reference accepted by both ``playsound`` and the
                speech-recognition pipeline.

        Returns:
            The ``"text"`` field of the pipeline result.
        """
        if self.QuestionText is None:
            playsound(input)

        # NOTE(review): ``device`` is not defined in this section of the
        # file; presumably it comes from ``from utils import *`` — confirm.
        speech_reco = pipeline(
            "automatic-speech-recognition", model="openai/whisper-base", device=device
        )
        res = speech_reco(input)
        return res["text"]

    def run(self):
        """Transcribe ``QuestionAudio`` and store the text in ``QuestionText``."""
        self.QuestionText = self.funcSpeechRecognition(self.QuestionAudio)


class SpeechSynthesis():
    """Speak an answer aloud and save it as a watermarked WAV file.

    Usage: assign ``AnswerText``, call ``run()``, then read the path of
    the generated file from ``AnswerAudio``.
    """

    # Input
    AnswerText = None   # text to be spoken

    # Output
    AnswerAudio = None  # path of the written WAV file, set by ``run``

    def funcSpeechSynthesis(self, input):
        """Synthesise *input* as speech, watermark it, save it and play it.

        Args:
            input: Answer text; it is appended to a fixed spoken prefix.

        Returns:
            Path of the WAV file that was written (``"AudioAnswer.wav"``).
        """
        synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts")

        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        # Entry 7306 selects the speaker voice; swap in another x-vector to
        # change it. unsqueeze(0) adds a leading dimension.
        speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

        # The space after the colon keeps the prefix from being glued to the
        # first word of the answer.
        speech = synthesiser("The answer to your question is: " + input,
                             forward_params={"speaker_embeddings": speaker_embedding})

        # Fixed 16-bit payload embedded into the audio as a watermark.
        payload = [0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1]
        # NOTE(review): ``device`` is not defined in this section of the
        # file; presumably it comes from ``from utils import *`` — confirm.
        model = wavmark.load_model().to(device)
        signal, sample_rate = speech["audio"], speech["sampling_rate"]

        watermarked_signal, _ = wavmark.encode_watermark(model, signal, payload, show_progress=True)

        # Write at the rate the synthesiser reported rather than a
        # hard-coded value, so the file header always matches the samples.
        path_output = "AudioAnswer.wav"
        sf.write(path_output, watermarked_signal, samplerate=sample_rate)
        playsound(path_output)

        return path_output

    def run(self):
        """Synthesise ``AnswerText`` and store the output path in ``AnswerAudio``."""
        self.AnswerAudio = self.funcSpeechSynthesis(self.AnswerText)