Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in / Register
Toggle navigation
Menu
Open sidebar
MPAI-NNW
ReferenceSoftwareV12
Commits
d771523b
Commit
d771523b
authored
Aug 27, 2024
by
Carl De Sousa Trias
Browse files
Add AIMS implementation as standalone file
parent
6b23cdc1
Changes
3
Hide whitespace changes
Inline
Side-by-side
Case3/all_AIW/AIM_folder/SpeechRecognition.py
0 → 100644
View file @
d771523b
import
torch
from
transformers
import
pipeline
# Select the compute device once at import time: the CUDA GPU when torch
# reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
class SpeechRecognition():
    '''
    Transcribe an audio input to text with the "openai/whisper-base"
    automatic-speech-recognition pipeline.

    Usage: assign ``QuestionAudio``, call ``run()``, then read ``QuestionText``.
    '''
    # Audio handed to the recognition pipeline (the demo at the bottom of the
    # file assigns a file path).
    QuestionAudio = None
    ##
    # Transcription written by run(); stays None until run() has been called.
    QuestionText = None

    def _playAudio(self, input):
        '''
        Play `input` through a module-level ``playsound`` callable, if any.

        This module does not import ``playsound``, so calling it directly
        raised NameError. Looking it up at call time keeps playback working
        when a player has been brought into module scope and skips it
        otherwise.

        Returns True when the player was invoked, False when none is in scope.
        '''
        player = globals().get("playsound")
        if not callable(player):
            return False
        player(input)
        return True

    def funcSpeechRecognition(self, input):
        '''
        Verify the inference

        Plays the audio back (only while no transcription has been stored
        yet, and only if a player is available), then runs the speech
        recognition pipeline on `input` and returns the recognised text.

        NOTE(review): the original indentation was lost; this assumes only the
        playback call was guarded by the ``QuestionText`` check - confirm.
        '''
        if self.QuestionText is None:
            self._playAudio(input)
        speech_reco = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-base",
            device=device,
        )
        res = speech_reco(input)
        return res["text"]

    def run(self):
        '''Transcribe ``QuestionAudio`` and store the result in ``QuestionText``.'''
        self.QuestionText = self.funcSpeechRecognition(self.QuestionAudio)
if __name__ == '__main__':
    # Demo: transcribe one audio file and print the recognised text.
    recognizer = SpeechRecognition()
    recognizer.QuestionAudio = "path/to/audiofile"
    recognizer.run()
    print(recognizer.QuestionText)
Case3/all_AIW/AIM_folder/SpeechSynthesis.py
0 → 100644
View file @
d771523b
import
torch
from
transformers
import
pipeline
import
soundfile
as
sf
from
datasets
import
load_dataset
# Select the compute device once at import time: the CUDA GPU when torch
# reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
class SpeechSynthesis():
    '''
    Turn a text answer into a WAV file with the "microsoft/speecht5_tts"
    text-to-speech pipeline.

    Usage: assign ``AnswerText``, call ``run()``, then read ``AnswerAudio``.
    '''
    # Text to synthesise.
    AnswerText = None
    # Path of the written audio file; set by run().
    AnswerAudio = None

    def funcSpeechSynthesis(self, input):
        '''
        Synthesise `input`, write the audio to "AudioAnswer.wav" and return
        that path.
        '''
        tts = pipeline(
            "text-to-speech",
            model="microsoft/speecht5_tts",
            device=device,
        )
        xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        # You can replace this embedding with your own as well.
        xvector = xvectors[7306]["xvector"]
        voice = torch.tensor(xvector).unsqueeze(0)
        result = tts(input, forward_params={"speaker_embeddings": voice})
        wav_path = "AudioAnswer.wav"
        sf.write(wav_path, result["audio"], samplerate=result["sampling_rate"])
        return wav_path

    def run(self):
        '''Synthesise ``AnswerText`` and store the output path in ``AnswerAudio``.'''
        self.AnswerAudio = self.funcSpeechSynthesis(self.AnswerText)
if __name__ == '__main__':
    # Demo: synthesise a fixed sentence; run() stores the output path in
    # AnswerAudio.
    synthesizer = SpeechSynthesis()
    synthesizer.AnswerText = "is it a boy?"
    synthesizer.run()
Case3/all_AIW/AIM_folder/TextandImageQuery.py
0 → 100644
View file @
d771523b
from
transformers
import
pipeline
from
PIL
import
Image
# This file uses torch below but its import block only brings in
# transformers and PIL, so the module failed with NameError on import.
import torch

# Select the compute device once at import time: the CUDA GPU when torch
# reports one as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class TextandImageQuery():
    '''
    Answer a text question about an image with the
    "Salesforce/blip-vqa-base" visual-question-answering pipeline.

    Usage: assign ``QuestionText`` and ``RawImage``, call ``run()``, then read
    ``AnswerText``.
    '''
    # Question passed to the pipeline.
    QuestionText = None
    # Image location handed to Image.open (the demo at the bottom of the file
    # assigns a path).
    RawImage = None
    ##
    # Answer written by run(); stays None until run() has been called.
    AnswerText = None

    def funcTextandImageQuery(self, raw_image_path, question):
        '''
        Apply an NN to answer the question

        Opens the image at `raw_image_path`, converts it to RGB, queries the
        pipeline with `question` and returns the 'answer' field of the first
        result (``top_k=1``).
        '''
        # Image.open keeps the underlying file open; convert() returns a
        # separate image, so the source can be closed as soon as the copy exists.
        with Image.open(raw_image_path) as source_image:
            raw_image = source_image.convert("RGB")
        pipe = pipeline(
            "visual-question-answering",
            model="Salesforce/blip-vqa-base",
            device=device,
        )
        output = pipe(raw_image, question, top_k=1)[0]
        return output['answer']

    def run(self):
        '''Answer ``QuestionText`` about ``RawImage`` and store it in ``AnswerText``.'''
        self.AnswerText = self.funcTextandImageQuery(self.RawImage, self.QuestionText)
if __name__ == '__main__':
    # Demo: ask one question about one image and print the answer.
    query = TextandImageQuery()
    query.QuestionText = "Question as a string"
    query.RawImage = "path/to/the/contextImage"
    query.run()
    print(query.AnswerText)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment