MPAI-NNW / ReferenceSoftwareV12

Commit 14c5c660
Authored Aug 28, 2024 by Carl De Sousa Trias

Update AIMs and AIW

Parent: d771523b
Changes: 4 files
Case3/all_AIW/AIM_folder/SpeechRecognition.py → Case3/all_AIW/AIM_folder/AutomaticSpeechRecognition.py

@@ -2,17 +2,15 @@ import torch
 from transformers import pipeline
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-class SpeechRecognition():
+class AutomaticSpeechRecognition():
     QuestionAudio = None
     ##
     QuestionText = None
 
-    def funcSpeechRecognition(self, input):
+    def funcAutomaticSpeechRecognition(self, input):
         '''
         Verify the inference
         '''
-        if self.QuestionText == None:
-            playsound(input)
         speech_reco = pipeline(
             "automatic-speech-recognition",
             model="openai/whisper-base",
             device=device
         )
@@ -20,10 +18,10 @@ class SpeechRecognition():
         return res["text"]
 
     def run(self):
-        self.QuestionText = self.funcSpeechRecognition(self.QuestionAudio)
+        self.QuestionText = self.funcAutomaticSpeechRecognition(self.QuestionAudio)
 
 if __name__ == '__main__':
-    module = SpeechRecognition()
+    module = AutomaticSpeechRecognition()
     module.QuestionAudio = "path/to/audiofile"
     module.run()
     print(module.QuestionText)
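Because both the file and the class were renamed, any AIW script that still imports the old SpeechRecognition module needs a matching update. A minimal sketch of such a caller, assuming the AIM is imported from the Case3/all_AIW/AIM_folder/ package shown in the paths above (the exact import path is an assumption, not something this commit defines):

# Hypothetical caller update; the package path is assumed from the
# Case3/all_AIW/AIM_folder/ layout, not defined by this commit.
# Old import (pre-rename):
#   from AIM_folder.SpeechRecognition import SpeechRecognition
# New import (post-rename):
from AIM_folder.AutomaticSpeechRecognition import AutomaticSpeechRecognition

module = AutomaticSpeechRecognition()
module.QuestionAudio = "path/to/audiofile"  # placeholder path, as in the diff
module.run()
print(module.QuestionText)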
Case3/all_AIW/AIM_folder/SpeechanswertoMultimodalQuestion.py
0 → 100644 (new file)

import torch
from transformers import pipeline
from PIL import Image
import soundfile as sf
from datasets import load_dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class AutomaticSpeechRecognition():
    QuestionAudio = None
    ##
    QuestionText = None

    def funcAutomaticSpeechRecognition(self, input):
        '''
        Verify the inference
        '''
        speech_reco = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-base",
            device=device
        )
        res = speech_reco(input)
        return res["text"]

    def run(self):
        self.QuestionText = self.funcAutomaticSpeechRecognition(self.QuestionAudio)

class TextandImageQuery():
    QuestionText = None
    RawImage = None
    ##
    AnswerText = None

    def funcTextandImageQuery(self, raw_image_path, question):
        '''
        Apply an NN to answer the question
        '''
        raw_image = Image.open(raw_image_path).convert("RGB")
        pipe = pipeline(
            "visual-question-answering",
            model="Salesforce/blip-vqa-base",
            device=device
        )
        output = pipe(raw_image, question, top_k=1)[0]
        return output['answer']

    def run(self):
        self.AnswerText = self.funcTextandImageQuery(self.RawImage, self.QuestionText)

class TextToSpeech():
    AnswerText = None
    AnswerAudio = None

    def funcTextToSpeech(self, input):
        synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts", device=device)
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
        # You can replace this embedding with your own as well.
        speech = synthesiser(input, forward_params={"speaker_embeddings": speaker_embedding})
        path_output = "AudioAnswer.wav"
        sf.write(path_output, speech["audio"], samplerate=speech["sampling_rate"])
        return path_output

    def run(self):
        self.AnswerAudio = self.funcTextToSpeech(self.AnswerText)

if __name__ == '__main__':
    AIM_ASR = AutomaticSpeechRecognition()
    AIM_ASR.QuestionAudio = "path/to/audio/question"
    AIM_ASR.run()
    #print(AIM_ASR.QuestionText)

    AIM_TIQ = TextandImageQuery()
    AIM_TIQ.QuestionText = AIM_ASR.QuestionText
    AIM_TIQ.RawImage = "path/to/context/image"
    AIM_TIQ.run()
    #print(AIM_TIQ.AnswerText)

    AIM_TTS = TextToSpeech()
    AIM_TTS.AnswerText = AIM_TIQ.AnswerText
    AIM_TTS.run()
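The comment inside funcTextToSpeech notes that the speaker embedding can be replaced. A minimal sketch of swapping in a different x-vector from the same CMU ARCTIC dataset; index 1234 is an arbitrary example row, whereas the repository fixes index 7306:

# Sketch only: use a different CMU ARCTIC x-vector as the SpeechT5 speaker.
import torch
from datasets import load_dataset
from transformers import pipeline

synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts")
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[1234]["xvector"]).unsqueeze(0)  # arbitrary row
speech = synthesiser("Text as a string",
                     forward_params={"speaker_embeddings": speaker_embedding})
# speech is a dict with "audio" and "sampling_rate", as consumed by sf.write above.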
Case3/all_AIW/AIM_folder/SpeechSynthesis.py → Case3/all_AIW/AIM_folder/TextToSpeech.py

@@ -4,11 +4,11 @@ import soundfile as sf
 from datasets import load_dataset
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-class SpeechSynthesis():
+class TextToSpeech():
     AnswerText = None
     AnswerAudio = None
 
-    def funcSpeechSynthesis(self, input):
+    def funcTextToSpeech(self, input):
         synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts", device=device)
         embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
@@ -24,9 +24,9 @@ class SpeechSynthesis():
         return path_output
 
     def run(self):
-        self.AnswerAudio = self.funcSpeechSynthesis(self.AnswerText)
+        self.AnswerAudio = self.funcTextToSpeech(self.AnswerText)
 
 if __name__ == '__main__':
-    module = SpeechSynthesis()
-    module.AnswerText = "is it a boy?"
+    module = TextToSpeech()
+    module.AnswerText = "Text as a string"
     module.run()
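The old SpeechRecognition.py played audio with playsound, which this commit removes; the TTS AIMs only write the synthesised answer to disk and return its path. To actually hear the result, a minimal sketch assuming the sounddevice package is available (it is not a dependency introduced by this commit):

# Sketch only: play back the WAV written by funcTextToSpeech.
# sounddevice is an assumed extra dependency, not part of this commit.
import soundfile as sf
import sounddevice as sd

audio, samplerate = sf.read("AudioAnswer.wav")  # path returned by funcTextToSpeech
sd.play(audio, samplerate)
sd.wait()  # block until playback finishes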
Case3/all_AIW/AIM_folder/TextandImageQuery.py

 import torch
 from transformers import pipeline
 from PIL import Image
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 ...