Add AIMS implementation as standalone file

d771523b · Carl De Sousa Trias · 6b23cdc1 · d771523b · d771523b · d771523b
Commit d771523b authored Aug 27, 2024 by Carl De Sousa Trias
--- a/Case3/all_AIW/AIM_folder/SpeechRecognition.py
+++ b/Case3/all_AIW/AIM_folder/SpeechRecognition.py
+import torch
+from transformers import pipeline
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+class SpeechRecognition():
+    '''
+    Speech-to-text module built on the "openai/whisper-base" pipeline.
+
+    Set ``QuestionAudio`` (an audio file path in the demo below), call
+    ``run()``, then read the transcription from ``QuestionText``.
+    '''
+    # Input: audio to transcribe.
+    QuestionAudio = None
+    ##
+    # Output: text stored by run().
+    QuestionText = None
+
+    def funcSpeechRecognition(self, input):
+        '''
+        Transcribe ``input`` and return the recognised text.
+
+        While no transcription has been stored yet (``QuestionText`` is
+        None) the audio is played back first so the inference can be
+        verified by ear. Playback is skipped when the optional ``playsound``
+        package is not installed.
+
+        The parameter keeps its original name ``input`` so keyword callers
+        continue to work, even though it shadows the builtin.
+        '''
+        if self.QuestionText is None:
+            # playsound is only needed for this manual check: import it
+            # lazily and carry on without playback when it is unavailable
+            # instead of failing with a NameError.
+            try:
+                from playsound import playsound
+            except ImportError:
+                playsound = None
+            if playsound is not None:
+                playsound(input)
+        speech_reco = pipeline(
+            "automatic-speech-recognition", model="openai/whisper-base", device=device
+        )
+        res = speech_reco(input)
+        return res["text"]
+
+    def run(self):
+        '''
+        Transcribe ``QuestionAudio`` and store the result in ``QuestionText``.
+
+        Raises ValueError if ``QuestionAudio`` has not been set.
+        '''
+        if self.QuestionAudio is None:
+            raise ValueError("QuestionAudio must be set before calling run()")
+        self.QuestionText = self.funcSpeechRecognition(self.QuestionAudio)
+
+if __name__ == '__main__':
+    # Manual smoke test: transcribe one audio file and print the text.
+    recogniser = SpeechRecognition()
+    recogniser.QuestionAudio = "path/to/audiofile"
+    recogniser.run()
+    print(recogniser.QuestionText)
--- a/Case3/all_AIW/AIM_folder/SpeechSynthesis.py
+++ b/Case3/all_AIW/AIM_folder/SpeechSynthesis.py
+import torch
+from transformers import pipeline
+import soundfile as sf
+from datasets import load_dataset
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+class SpeechSynthesis():
+    '''
+    Text-to-speech module built on the "microsoft/speecht5_tts" pipeline.
+
+    Set ``AnswerText``, call ``run()``, then read the path of the written
+    audio file from ``AnswerAudio``.
+    '''
+    # Input: text to synthesise.
+    AnswerText = None
+    # Output: path of the audio file written by run().
+    AnswerAudio = None
+
+    def funcSpeechSynthesis(
+        self, input, path_output="AudioAnswer.wav", speaker_index=7306
+    ):
+        '''
+        Synthesise ``input`` to speech, write it to disk and return the path.
+
+        input: text to synthesise (name kept for backward compatibility,
+            even though it shadows the builtin).
+        path_output: destination file passed to ``soundfile.write``;
+            defaults to the previously hard-coded "AudioAnswer.wav".
+        speaker_index: row of the "Matthijs/cmu-arctic-xvectors" validation
+            split whose x-vector is used as speaker embedding; defaults to
+            the previously hard-coded 7306.
+        '''
+        synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts", device=device)
+
+        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+        # unsqueeze(0) adds a leading dimension of size 1 to the x-vector.
+        speaker_embedding = torch.tensor(
+            embeddings_dataset[speaker_index]["xvector"]
+        ).unsqueeze(0)
+        # You can replace this embedding with your own as well.
+
+        speech = synthesiser(
+            input, forward_params={"speaker_embeddings": speaker_embedding}
+        )
+
+        sf.write(path_output, speech["audio"], samplerate=speech["sampling_rate"])
+        return path_output
+
+    def run(self):
+        '''
+        Synthesise ``AnswerText`` and store the output path in ``AnswerAudio``.
+
+        Raises ValueError if ``AnswerText`` has not been set, before any
+        model or dataset is loaded.
+        '''
+        if self.AnswerText is None:
+            raise ValueError("AnswerText must be set before calling run()")
+        self.AnswerAudio = self.funcSpeechSynthesis(self.AnswerText)
+
+if __name__ == '__main__':
+    # Manual smoke test: synthesise a sample sentence; the path of the
+    # written audio file ends up in AnswerAudio.
+    synthesis = SpeechSynthesis()
+    synthesis.AnswerText = "is it a boy?"
+    synthesis.run()
--- a/Case3/all_AIW/AIM_folder/TextandImageQuery.py
+++ b/Case3/all_AIW/AIM_folder/TextandImageQuery.py
+import torch
+from PIL import Image
+from transformers import pipeline
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+class TextandImageQuery():
+    '''
+    Visual question answering module built on "Salesforce/blip-vqa-base".
+
+    Set ``QuestionText`` and ``RawImage`` (an image path), call ``run()``,
+    then read the answer from ``AnswerText``.
+    '''
+    # Inputs: the question and the path of the image it refers to.
+    QuestionText = None
+    RawImage = None
+
+    ##
+    # Output: answer stored by run().
+    AnswerText = None
+
+    def funcTextandImageQuery(self, raw_image_path,question):
+        '''
+        Apply an NN to answer the question
+
+        raw_image_path: path of the image, opened with PIL and converted
+            to RGB.
+        question: question about the image.
+        Returns the ``answer`` field of the pipeline's top result.
+        '''
+        # Open the file in a context manager so its handle is released
+        # right away; convert() returns an independent in-memory copy.
+        with Image.open(raw_image_path) as image_file:
+            raw_image = image_file.convert("RGB")
+        pipe = pipeline("visual-question-answering", model="Salesforce/blip-vqa-base",device=device)
+
+        # top_k=1 requests a single candidate; [0] selects it.
+        output = pipe(raw_image, question, top_k=1)[0]
+        return output['answer']
+
+    def run(self):
+        '''
+        Answer ``QuestionText`` about ``RawImage`` and store it in ``AnswerText``.
+
+        Raises ValueError if either input has not been set.
+        '''
+        if self.RawImage is None or self.QuestionText is None:
+            raise ValueError("RawImage and QuestionText must be set before calling run()")
+        self.AnswerText = self.funcTextandImageQuery(self.RawImage, self.QuestionText)
+
+if __name__ == '__main__':
+    # Manual smoke test: ask one question about one image and print the answer.
+    vqa = TextandImageQuery()
+    vqa.RawImage = "path/to/the/contextImage"
+    vqa.QuestionText = "Question as a string"
+    vqa.run()
+    print(vqa.AnswerText)