Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
MPAI-MMC
Implementation of MMC-AMQ
Commits
aabd38aa
Commit
aabd38aa
authored
Sep 27, 2024
by
Carl De Sousa Trias
Browse files
Update AMQ.py, requirements.txt
parent
ec3184e2
Pipeline
#53
failed with stages
in 0 seconds
Changes
2
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
AMQ.py
0 → 100644
View file @
aabd38aa
import
torch
from
transformers
import
pipeline
from
PIL
import
Image
import
soundfile
as
sf
from
datasets
import
load_dataset
device
=
torch
.
device
(
"cuda"
if
torch
.
cuda
.
is_available
()
else
"cpu"
)
class
AutomaticSpeechRecognition
():
QuestionAudio
=
None
##
QuestionText
=
None
def
funcAutomaticSpeechRecognition
(
self
,
input
):
'''
Verify the inference
'''
speech_reco
=
pipeline
(
"automatic-speech-recognition"
,
model
=
"openai/whisper-base"
,
device
=
device
)
res
=
speech_reco
(
input
)
return
res
[
"text"
]
def
run
(
self
):
self
.
QuestionText
=
self
.
funcAutomaticSpeechRecognition
(
self
.
QuestionAudio
)
class
TextandImageQuery
():
QuestionText
=
None
RawImage
=
None
##
AnswerText
=
None
def
funcTextandImageQuery
(
self
,
raw_image_path
,
question
):
'''
Apply an NN to answer the question
'''
raw_image
=
Image
.
open
(
raw_image_path
).
convert
(
"RGB"
)
pipe
=
pipeline
(
"visual-question-answering"
,
model
=
"Salesforce/blip-vqa-base"
,
device
=
device
)
output
=
pipe
(
raw_image
,
question
,
top_k
=
1
)[
0
]
return
output
[
'answer'
]
def
run
(
self
):
self
.
AnswerText
=
self
.
funcTextandImageQuery
(
self
.
RawImage
,
self
.
QuestionText
)
class
TextToSpeech
():
AnswerText
=
None
AnswerAudio
=
None
def
funcTextToSpeech
(
self
,
input
):
synthesiser
=
pipeline
(
"text-to-speech"
,
"microsoft/speecht5_tts"
,
device
=
device
)
embeddings_dataset
=
load_dataset
(
"Matthijs/cmu-arctic-xvectors"
,
split
=
"validation"
)
speaker_embedding
=
torch
.
tensor
(
embeddings_dataset
[
7306
][
"xvector"
]).
unsqueeze
(
0
)
# You can replace this embedding with your own as well.
speech
=
synthesiser
(
input
,
forward_params
=
{
"speaker_embeddings"
:
speaker_embedding
})
path_output
=
"AudioAnswer.wav"
sf
.
write
(
path_output
,
speech
[
"audio"
],
samplerate
=
speech
[
"sampling_rate"
])
return
path_output
def
run
(
self
):
self
.
AnswerAudio
=
self
.
funcTextToSpeech
(
self
.
AnswerText
)
if
__name__
==
'__main__'
:
AIM_ASR
=
AutomaticSpeechRecognition
()
AIM_ASR
.
QuestionAudio
=
"path/to/audio/question"
AIM_ASR
.
run
()
#print(AIM_ASR.QuestionText)
AIM_TIQ
=
TextandImageQuery
()
AIM_TIQ
.
QuestionText
=
AIM_ASR
.
QuestionText
##AIM_TIQ.QuestionText="question as a string"
AIM_TIQ
.
RawImage
=
"path/to/context/image"
AIM_TIQ
.
run
()
print
(
AIM_TIQ
.
AnswerText
)
AIM_TTS
=
TextToSpeech
()
AIM_TTS
.
AnswerText
=
AIM_TIQ
.
AnswerText
AIM_TTS
.
run
()
requirements.txt
0 → 100644
View file @
aabd38aa
torch
transformers
datasets
pillow
soundfile
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment