MPAI-NNW / ReferenceSoftwareV12

Commit 14c5c660
Authored Aug 28, 2024 by Carl De Sousa Trias

Update AIMs and AIW

Parent: d771523b
Changes: 4 files
Case3/all_AIW/AIM_folder/SpeechRecognition.py → Case3/all_AIW/AIM_folder/AutomaticSpeechRecognition.py

@@ -2,17 +2,15 @@ import torch
 from transformers import pipeline
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-class SpeechRecognition():
+class AutomaticSpeechRecognition():
     QuestionAudio = None
     ##
     QuestionText = None
 
-    def funcSpeechRecognition(self, input):
+    def funcAutomaticSpeechRecognition(self, input):
         '''
         Verify the inference
         '''
-        if self.QuestionText == None:
-            playsound(input)
         speech_reco = pipeline(
             "automatic-speech-recognition",
             model="openai/whisper-base",
             device=device
         )
@@ -20,10 +18,10 @@ class SpeechRecognition():
         return res["text"]
 
     def run(self):
-        self.QuestionText = self.funcSpeechRecognition(self.QuestionAudio)
+        self.QuestionText = self.funcAutomaticSpeechRecognition(self.QuestionAudio)
 
 if __name__ == '__main__':
-    module = SpeechRecognition()
+    module = AutomaticSpeechRecognition()
     module.QuestionAudio = "path/to/audiofile"
     module.run()
     print(module.QuestionText)
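Because both the file and the class were renamed, any AIW script that still imports the old SpeechRecognition module needs a matching update. A minimal sketch of such a caller, assuming the AIM is imported from the Case3/all_AIW/AIM_folder/ package shown in the paths above (the exact import path is an assumption, not something this commit defines):

# Hypothetical caller update; the package path is assumed from the
# Case3/all_AIW/AIM_folder/ layout, not defined by this commit.
# Old import (pre-rename):
#   from AIM_folder.SpeechRecognition import SpeechRecognition
# New import (post-rename):
from AIM_folder.AutomaticSpeechRecognition import AutomaticSpeechRecognition

module = AutomaticSpeechRecognition()
module.QuestionAudio = "path/to/audiofile"  # placeholder path, as in the diff
module.run()
print(module.QuestionText)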
Case3/all_AIW/AIM_folder/SpeechanswertoMultimodalQuestion.py
0 → 100644 (new file)

import torch
from transformers import pipeline
from PIL import Image
import soundfile as sf
from datasets import load_dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class AutomaticSpeechRecognition():
    QuestionAudio = None
    ##
    QuestionText = None

    def funcAutomaticSpeechRecognition(self, input):
        '''
        Verify the inference
        '''
        speech_reco = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-base",
            device=device
        )
        res = speech_reco(input)
        return res["text"]

    def run(self):
        self.QuestionText = self.funcAutomaticSpeechRecognition(self.QuestionAudio)

class TextandImageQuery():
    QuestionText = None
    RawImage = None
    ##
    AnswerText = None

    def funcTextandImageQuery(self, raw_image_path, question):
        '''
        Apply an NN to answer the question
        '''
        raw_image = Image.open(raw_image_path).convert("RGB")
        pipe = pipeline(
            "visual-question-answering",
            model="Salesforce/blip-vqa-base",
            device=device
        )
        output = pipe(raw_image, question, top_k=1)[0]
        return output['answer']

    def run(self):
        self.AnswerText = self.funcTextandImageQuery(self.RawImage, self.QuestionText)

class TextToSpeech():
    AnswerText = None
    AnswerAudio = None

    def funcTextToSpeech(self, input):
        synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts", device=device)
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
        # You can replace this embedding with your own as well.
        speech = synthesiser(input, forward_params={"speaker_embeddings": speaker_embedding})
        path_output = "AudioAnswer.wav"
        sf.write(path_output, speech["audio"], samplerate=speech["sampling_rate"])
        return path_output

    def run(self):
        self.AnswerAudio = self.funcTextToSpeech(self.AnswerText)

if __name__ == '__main__':
    AIM_ASR = AutomaticSpeechRecognition()
    AIM_ASR.QuestionAudio = "path/to/audio/question"
    AIM_ASR.run()
    #print(AIM_ASR.QuestionText)

    AIM_TIQ = TextandImageQuery()
    AIM_TIQ.QuestionText = AIM_ASR.QuestionText
    AIM_TIQ.RawImage = "path/to/context/image"
    AIM_TIQ.run()
    #print(AIM_TIQ.AnswerText)

    AIM_TTS = TextToSpeech()
    AIM_TTS.AnswerText = AIM_TIQ.AnswerText
    AIM_TTS.run()
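The comment inside funcTextToSpeech notes that the speaker embedding can be replaced. A minimal sketch of swapping in a different x-vector from the same CMU ARCTIC dataset; index 1234 is an arbitrary example row, whereas the repository fixes index 7306:

# Sketch only: use a different CMU ARCTIC x-vector as the SpeechT5 speaker.
import torch
from datasets import load_dataset
from transformers import pipeline

synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts")
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[1234]["xvector"]).unsqueeze(0)  # arbitrary row
speech = synthesiser("Text as a string",
                     forward_params={"speaker_embeddings": speaker_embedding})
# speech is a dict with "audio" and "sampling_rate", as consumed by sf.write above.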
Case3/all_AIW/AIM_folder/SpeechSynthesis.py → Case3/all_AIW/AIM_folder/TextToSpeech.py

@@ -4,11 +4,11 @@ import soundfile as sf
 from datasets import load_dataset
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-class SpeechSynthesis():
+class TextToSpeech():
     AnswerText = None
     AnswerAudio = None
 
-    def funcSpeechSynthesis(self, input):
+    def funcTextToSpeech(self, input):
         synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts", device=device)
         embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
@@ -24,9 +24,9 @@ class SpeechSynthesis():
         return path_output
 
     def run(self):
-        self.AnswerAudio = self.funcSpeechSynthesis(self.AnswerText)
+        self.AnswerAudio = self.funcTextToSpeech(self.AnswerText)
 
 if __name__ == '__main__':
-    module = SpeechSynthesis()
-    module.AnswerText = "is it a boy?"
+    module = TextToSpeech()
+    module.AnswerText = "Text as a string"
     module.run()
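The old SpeechRecognition.py played audio with playsound, which this commit removes; the TTS AIMs only write the synthesised answer to disk and return its path. To actually hear the result, a minimal sketch assuming the sounddevice package is available (it is not a dependency introduced by this commit):

# Sketch only: play back the WAV written by funcTextToSpeech.
# sounddevice is an assumed extra dependency, not part of this commit.
import soundfile as sf
import sounddevice as sd

audio, samplerate = sf.read("AudioAnswer.wav")  # path returned by funcTextToSpeech
sd.play(audio, samplerate)
sd.wait()  # block until playback finishes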
Case3/all_AIW/AIM_folder/TextandImageQuery.py

 import torch
 from transformers import pipeline
 from PIL import Image
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 ...