Commit 14c5c660 authored by Carl De Sousa Trias
Browse files

Update AIMs and AIW

parent d771523b
...@@ -2,17 +2,15 @@ import torch ...@@ -2,17 +2,15 @@ import torch
from transformers import pipeline from transformers import pipeline
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class SpeechRecognition(): class AutomaticSpeechRecognition():
QuestionAudio = None QuestionAudio = None
## ##
QuestionText = None QuestionText = None
def funcSpeechRecognition(self, input): def funcAutomaticSpeechRecognition(self, input):
''' '''
Verify the inference Verify the inference
''' '''
if self.QuestionText == None:
playsound(input)
speech_reco = pipeline( speech_reco = pipeline(
"automatic-speech-recognition", model="openai/whisper-base", device=device "automatic-speech-recognition", model="openai/whisper-base", device=device
) )
...@@ -20,10 +18,10 @@ class SpeechRecognition(): ...@@ -20,10 +18,10 @@ class SpeechRecognition():
return res["text"] return res["text"]
def run(self): def run(self):
self.QuestionText = self.funcSpeechRecognition(self.QuestionAudio) self.QuestionText = self.funcAutomaticSpeechRecognition(self.QuestionAudio)
if __name__ == '__main__': if __name__ == '__main__':
module = SpeechRecognition() module = AutomaticSpeechRecognition()
module.QuestionAudio = "path/to/audiofile" module.QuestionAudio = "path/to/audiofile"
module.run() module.run()
print(module.QuestionText) print(module.QuestionText)
import torch
from transformers import pipeline
from PIL import Image
import soundfile as sf
from datasets import load_dataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class AutomaticSpeechRecognition():
    """Speech-to-text module: turns an audio question into text.

    Usage: assign ``QuestionAudio``, call ``run()``, then read
    ``QuestionText``.
    """

    # Input: audio to transcribe (the demo at the bottom of this file
    # assigns a file path string).
    QuestionAudio = None
    ##
    # Output: transcription written by run().
    QuestionText = None

    def funcAutomaticSpeechRecognition(self, input):
        """Transcribe ``input`` with the ``openai/whisper-base`` pipeline.

        Args:
            input: Audio handed unchanged to the speech-recognition
                pipeline.

        Returns:
            The ``"text"`` entry of the pipeline result.

        Raises:
            ValueError: If ``input`` is None.
        """
        # Fail fast: otherwise the model is loaded before the missing
        # input is noticed somewhere inside the pipeline call.
        if input is None:
            raise ValueError(
                "QuestionAudio must be set before running "
                "AutomaticSpeechRecognition"
            )
        speech_reco = pipeline(
            "automatic-speech-recognition", model="openai/whisper-base", device=device
        )
        res = speech_reco(input)
        return res["text"]

    def run(self):
        """Transcribe ``QuestionAudio`` and store the text in ``QuestionText``."""
        self.QuestionText = self.funcAutomaticSpeechRecognition(self.QuestionAudio)
class TextandImageQuery():
    """Visual question answering module: answers a text question about an image.

    Usage: assign ``QuestionText`` and ``RawImage``, call ``run()``, then
    read ``AnswerText``.
    """

    # Input: the question to ask about the image.
    QuestionText = None
    # Input: location of the image; it is passed to ``Image.open``.
    RawImage = None
    ##
    # Output: answer written by run().
    AnswerText = None

    def funcTextandImageQuery(self, raw_image_path, question):
        """Answer ``question`` about the image at ``raw_image_path``.

        The image is converted to RGB and sent, together with the question,
        to the ``Salesforce/blip-vqa-base`` visual-question-answering
        pipeline; only the top-ranked result is requested.

        Args:
            raw_image_path: Image location accepted by ``Image.open``.
            question: The question text.

        Returns:
            The ``'answer'`` entry of the top-ranked pipeline result.

        Raises:
            ValueError: If ``raw_image_path`` or ``question`` is None.
        """
        # Fail fast on unset inputs, before any model is loaded.
        if raw_image_path is None:
            raise ValueError(
                "RawImage must be set before running TextandImageQuery"
            )
        if question is None:
            raise ValueError(
                "QuestionText must be set before running TextandImageQuery"
            )
        # The context manager closes the underlying file; convert() returns
        # an independent image that stays usable afterwards.
        with Image.open(raw_image_path) as image_file:
            raw_image = image_file.convert("RGB")
        pipe = pipeline("visual-question-answering", model="Salesforce/blip-vqa-base", device=device)
        output = pipe(raw_image, question, top_k=1)[0]
        return output['answer']

    def run(self):
        """Query the model with ``RawImage`` and ``QuestionText``; store ``AnswerText``."""
        self.AnswerText = self.funcTextandImageQuery(self.RawImage, self.QuestionText)
class TextToSpeech():
    """Text-to-speech module: synthesises an audio file from answer text.

    Usage: assign ``AnswerText``, call ``run()``, then read ``AnswerAudio``
    (the path of the written audio file).
    """

    # Input: the text to speak.
    AnswerText = None
    # Output: path of the audio file written by run().
    AnswerAudio = None

    def funcTextToSpeech(self, input, path_output="AudioAnswer.wav"):
        """Synthesise ``input`` with ``microsoft/speecht5_tts`` and save it.

        Args:
            input: Text to synthesise.
            path_output: Destination of the audio file. Defaults to
                ``"AudioAnswer.wav"``, the location previously hard-coded.

        Returns:
            ``path_output``, after the audio has been written there.

        Raises:
            ValueError: If ``input`` is None.
        """
        # Fail fast on an unset input, before the model and the embeddings
        # dataset are loaded.
        if input is None:
            raise ValueError(
                "AnswerText must be set before running TextToSpeech"
            )
        synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts", device=device)
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
        # You can replace this embedding with your own as well.
        speech = synthesiser(
            input,
            forward_params={"speaker_embeddings": speaker_embedding},
        )
        sf.write(path_output, speech["audio"], samplerate=speech["sampling_rate"])
        return path_output

    def run(self):
        """Synthesise ``AnswerText`` and store the output path in ``AnswerAudio``."""
        self.AnswerAudio = self.funcTextToSpeech(self.AnswerText)
if __name__ == '__main__':
    # Demo workflow: spoken question -> text -> answer about an image -> spoken answer.

    # Stage 1: transcribe the audio question.
    asr_stage = AutomaticSpeechRecognition()
    asr_stage.QuestionAudio = "path/to/audio/question"
    asr_stage.run()

    # Stage 2: answer the transcribed question using the context image.
    query_stage = TextandImageQuery()
    query_stage.RawImage = "path/to/context/image"
    query_stage.QuestionText = asr_stage.QuestionText
    query_stage.run()

    # Stage 3: turn the textual answer into an audio file.
    tts_stage = TextToSpeech()
    tts_stage.AnswerText = query_stage.AnswerText
    tts_stage.run()
...@@ -4,11 +4,11 @@ import soundfile as sf ...@@ -4,11 +4,11 @@ import soundfile as sf
from datasets import load_dataset from datasets import load_dataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class SpeechSynthesis(): class TextToSpeech():
AnswerText = None AnswerText = None
AnswerAudio = None AnswerAudio = None
def funcSpeechSynthesis(self, input): def funcTextToSpeech(self, input):
synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts",device=device) synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts",device=device)
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation") embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
...@@ -24,9 +24,9 @@ class SpeechSynthesis(): ...@@ -24,9 +24,9 @@ class SpeechSynthesis():
return path_output return path_output
def run(self): def run(self):
self.AnswerAudio = self.funcSpeechSynthesis(self.AnswerText) self.AnswerAudio = self.funcTextToSpeech(self.AnswerText)
if __name__ == '__main__': if __name__ == '__main__':
module = SpeechSynthesis() module = TextToSpeech()
module.AnswerText="is it a boy?" module.AnswerText="Text as a string"
module.run() module.run()
import torch
from transformers import pipeline from transformers import pipeline
from PIL import Image from PIL import Image
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment