# SpeechanswertoMultimodalQuestion.py
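"""
Speech answer to a multimodal question.

Chains three Hugging Face pipelines: Whisper transcribes a spoken question,
BLIP answers it against a context image, and SpeechT5 reads the answer aloud,
writing the result to AudioAnswer.wav.
"""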
import torch
from transformers import pipeline
from PIL import Image
import soundfile as sf
from datasets import load_dataset

# Run all pipelines on GPU when available, otherwise on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class AutomaticSpeechRecognition:
    # Input: path to the spoken question.
    QuestionAudio = None
    # Output: transcribed question text.
    QuestionText = None

    def funcAutomaticSpeechRecognition(self, audio_path):
        '''
        Transcribe the question audio into text with a Whisper ASR pipeline.
        '''
        speech_reco = pipeline(
            "automatic-speech-recognition", model="openai/whisper-base", device=device
        )
        res = speech_reco(audio_path)
        return res["text"]

    def run(self):
        self.QuestionText = self.funcAutomaticSpeechRecognition(self.QuestionAudio)

class TextandImageQuery:
    # Inputs: question text from the ASR stage and the path to the context image.
    QuestionText = None
    RawImage = None

    # Output: answer text.
    AnswerText = None

    def funcTextandImageQuery(self, raw_image_path, question):
        '''
        Answer the question about the image with a BLIP VQA pipeline.
        '''
        raw_image = Image.open(raw_image_path).convert("RGB")
        pipe = pipeline("visual-question-answering", model="Salesforce/blip-vqa-base", device=device)

        # top_k=1 keeps only the highest-scoring answer.
        output = pipe(raw_image, question, top_k=1)[0]
        return output['answer']

    def run(self):
        self.AnswerText = self.funcTextandImageQuery(self.RawImage, self.QuestionText)

class TextToSpeech:
    # Input: answer text from the VQA stage.
    AnswerText = None
    # Output: path to the synthesised answer audio.
    AnswerAudio = None

    def funcTextToSpeech(self, text):
        '''
        Synthesise the answer text into a WAV file with SpeechT5.
        '''
        synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts", device=device)

        # Condition the voice on a fixed speaker x-vector from the CMU ARCTIC set.
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
        # You can replace this embedding with your own as well.
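        # (Assumption, not stated in this file: these x-vectors were extracted with
        # speechbrain/spkrec-xvect-voxceleb, so a 512-dim embedding from that model
        # should work as a drop-in replacement.)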

        speech = synthesiser(
            text, forward_params={"speaker_embeddings": speaker_embedding}
        )

        # Write the waveform to disk at the sampling rate reported by the model.
        path_output = "AudioAnswer.wav"
        sf.write(path_output, speech["audio"], samplerate=speech["sampling_rate"])
        return path_output

    def run(self):
        self.AnswerAudio = self.funcTextToSpeech(self.AnswerText)

if __name__ == '__main__':
    # Stage 1: transcribe the spoken question.
    AIM_ASR = AutomaticSpeechRecognition()
    AIM_ASR.QuestionAudio = "path/to/audio/question"
    AIM_ASR.run()
    # print(AIM_ASR.QuestionText)

    # Stage 2: answer the question against the context image.
    AIM_TIQ = TextandImageQuery()
    AIM_TIQ.QuestionText = AIM_ASR.QuestionText
    AIM_TIQ.RawImage = "path/to/context/image"
    AIM_TIQ.run()
    # print(AIM_TIQ.AnswerText)

    # Stage 3: speak the answer.
    AIM_TTS = TextToSpeech()
    AIM_TTS.AnswerText = AIM_TIQ.AnswerText
    AIM_TTS.run()
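    # The synthesised answer is now at AIM_TTS.AnswerAudio ("AudioAnswer.wav").
    # print(AIM_TTS.AnswerAudio)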