Update AMQ.py, requirements.txt

aabd38aa · Carl De Sousa Trias · ec3184e2 · aabd38aa · aabd38aa
Commit aabd38aa authored Sep 27, 2024 by Carl De Sousa Trias
--- a/AMQ.py
+++ b/AMQ.py
+import torch
+from transformers import pipeline
+from PIL import Image
+import soundfile as sf
+from datasets import load_dataset
+# Target device for inference: CUDA when torch reports it as available, CPU otherwise.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+class AutomaticSpeechRecognition:
+    """Turn a spoken question into text with a speech-recognition pipeline.
+
+    Usage: assign ``QuestionAudio``, call ``run()``, then read ``QuestionText``.
+    """
+
+    # Input: the audio to transcribe; it is handed to the pipeline unchanged.
+    QuestionAudio = None
+    # Output: the transcription, filled in by run().
+    QuestionText = None
+
+    def funcAutomaticSpeechRecognition(self, input):
+        """Return the text recognised in ``input``.
+
+        A new ``openai/whisper-base`` pipeline is built on the module-level
+        ``device`` each time this method is called.
+        """
+        recognizer = pipeline(
+            task="automatic-speech-recognition",
+            model="openai/whisper-base",
+            device=device,
+        )
+        # The pipeline result is indexed by key; only its "text" entry is used.
+        return recognizer(input)["text"]
+
+    def run(self):
+        """Transcribe ``QuestionAudio`` and store the text in ``QuestionText``."""
+        transcript = self.funcAutomaticSpeechRecognition(self.QuestionAudio)
+        self.QuestionText = transcript
+
+class TextandImageQuery():
+    """Answer a text question about an image with a VQA pipeline.
+
+    Usage: assign ``QuestionText`` and ``RawImage``, call ``run()``, then
+    read ``AnswerText``.
+    """
+
+    # Inputs: the question string and the path of the image it refers to.
+    QuestionText = None
+    RawImage = None
+
+    # Output: the answer string, filled in by run().
+    AnswerText = None
+
+    def funcTextandImageQuery(self, raw_image_path,question):
+        """Return the top-ranked answer to ``question`` about an image.
+
+        Args:
+            raw_image_path: Location of the image; passed to ``Image.open``.
+            question: The question to ask about the image.
+
+        Returns:
+            The ``'answer'`` field of the first (``top_k=1``) pipeline result.
+        """
+        # Image.open() is lazy and holds the file open. convert() reads the
+        # pixel data and returns a separate image object, so the source file
+        # can be released as soon as the with-block ends instead of whenever
+        # the garbage collector gets to it.
+        with Image.open(raw_image_path) as image_file:
+            raw_image = image_file.convert("RGB")
+
+        pipe = pipeline(
+            "visual-question-answering",
+            model="Salesforce/blip-vqa-base",
+            device=device,
+        )
+
+        # top_k=1 asks for a single candidate; take it from the result list.
+        output = pipe(raw_image, question, top_k=1)[0]
+        return output['answer']
+
+    def run(self):
+        """Answer ``QuestionText`` about ``RawImage``; store it in ``AnswerText``."""
+        self.AnswerText = self.funcTextandImageQuery(self.RawImage, self.QuestionText)
+
+class TextToSpeech():
+    """Synthesise speech for a text answer and write it to an audio file.
+
+    Usage: assign ``AnswerText``, call ``run()``, then read ``AnswerAudio``
+    (the path of the written file).
+    """
+
+    # Input: the text to be spoken.
+    AnswerText = None
+    # Output: path of the audio file written by run().
+    AnswerAudio = None
+
+    def funcTextToSpeech(self, input, *, path_output="AudioAnswer.wav",
+                         speaker_index=7306):
+        """Synthesise ``input`` as speech and save it to ``path_output``.
+
+        Args:
+            input: The text to synthesise.
+            path_output: Destination file for the audio. Defaults to
+                ``"AudioAnswer.wav"``, the path this method always used
+                before the parameter existed.
+            speaker_index: Row of the ``Matthijs/cmu-arctic-xvectors``
+                validation split whose ``"xvector"`` is used as the speaker
+                embedding. Defaults to 7306, the previously fixed value.
+
+        Returns:
+            ``path_output``, after the file has been written.
+        """
+        synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts", device=device)
+
+        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+        # unsqueeze(0) adds a leading dimension to the selected x-vector.
+        # You can replace this embedding with your own as well.
+        speaker_embedding = torch.tensor(
+            embeddings_dataset[speaker_index]["xvector"]
+        ).unsqueeze(0)
+
+        speech = synthesiser(
+            input,
+            forward_params={"speaker_embeddings": speaker_embedding},
+        )
+
+        # The pipeline result supplies both the samples and their sampling rate.
+        sf.write(path_output, speech["audio"], samplerate=speech["sampling_rate"])
+        return path_output
+
+    def run(self):
+        """Synthesise ``AnswerText`` and store the output path in ``AnswerAudio``."""
+        self.AnswerAudio = self.funcTextToSpeech(self.AnswerText)
+
+if __name__ == '__main__':
+    # Stage 1: transcribe the spoken question.
+    AIM_ASR = AutomaticSpeechRecognition()
+    AIM_ASR.QuestionAudio = "path/to/audio/question"
+    AIM_ASR.run()
+
+    # Stage 2: answer the transcribed question about the context image.
+    # (To skip stage 1, assign a plain question string to
+    # AIM_TIQ.QuestionText instead of the transcription.)
+    AIM_TIQ = TextandImageQuery()
+    AIM_TIQ.RawImage = "path/to/context/image"
+    AIM_TIQ.QuestionText = AIM_ASR.QuestionText
+    AIM_TIQ.run()
+    print(AIM_TIQ.AnswerText)
+
+    # Stage 3: synthesise the answer as speech.
+    AIM_TTS = TextToSpeech()
+    AIM_TTS.AnswerText = AIM_TIQ.AnswerText
+    AIM_TTS.run()
--- a/requirements.txt
+++ b/requirements.txt
+torch
+transformers
+datasets
+pillow
+soundfile