"""Text-to-speech helper built on the Hugging Face SpeechT5 pipeline."""

from typing import Optional

import torch
from transformers import pipeline
import soundfile as sf
from datasets import load_dataset

# Run inference on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model, speaker-embedding source and default output location.
_TTS_MODEL_NAME = "microsoft/speecht5_tts"
_XVECTOR_DATASET = "Matthijs/cmu-arctic-xvectors"
_XVECTOR_SPLIT = "validation"
_XVECTOR_INDEX = 7306
_DEFAULT_OUTPUT_PATH = "AudioAnswer.wav"


class SpeechSynthesis:
    """Turn a text answer into a WAV file.

    Set ``AnswerText`` and call :meth:`run`; the path of the written audio
    file is then available in ``AnswerAudio``.
    """

    # Text to synthesise; must be set before calling run().
    AnswerText: Optional[str] = None
    # Path of the audio file written by the last run() call.
    AnswerAudio: Optional[str] = None

    # Lazily created and shared by all instances, so the model and the
    # embeddings dataset are loaded once instead of on every synthesis call.
    _synthesiser = None
    _speaker_embedding = None

    @classmethod
    def _get_synthesiser(cls):
        """Return the cached text-to-speech pipeline, creating it on first use."""
        if cls._synthesiser is None:
            cls._synthesiser = pipeline(
                "text-to-speech", _TTS_MODEL_NAME, device=device
            )
        return cls._synthesiser

    @classmethod
    def _get_speaker_embedding(cls):
        """Return the cached speaker x-vector, loading it on first use."""
        if cls._speaker_embedding is None:
            embeddings_dataset = load_dataset(
                _XVECTOR_DATASET, split=_XVECTOR_SPLIT
            )
            # You can replace this embedding with your own as well.
            xvector = embeddings_dataset[_XVECTOR_INDEX]["xvector"]
            # unsqueeze(0) adds a leading dimension of size 1 to the vector.
            cls._speaker_embedding = torch.tensor(xvector).unsqueeze(0)
        return cls._speaker_embedding

    def funcSpeechSynthesis(self, input, path_output=_DEFAULT_OUTPUT_PATH):  # noqa: A002
        """Synthesise ``input`` as speech and write it to ``path_output``.

        Args:
            input: Non-empty text to speak. The name shadows the builtin but
                is kept so existing keyword callers keep working.
            path_output: Destination audio file; defaults to
                ``"AudioAnswer.wav"`` in the current working directory.

        Returns:
            The path the audio was written to (``path_output``).

        Raises:
            ValueError: If ``input`` is not a string or contains only
                whitespace.
        """
        # Fail early with a clear message instead of an error raised from
        # inside the pipeline (AnswerText defaults to None).
        if not isinstance(input, str) or not input.strip():
            raise ValueError(
                "Text to synthesise must be a non-empty string, "
                f"got {input!r}"
            )

        synthesiser = self._get_synthesiser()
        speaker_embedding = self._get_speaker_embedding()

        speech = synthesiser(
            input, forward_params={"speaker_embeddings": speaker_embedding}
        )
        sf.write(path_output, speech["audio"], samplerate=speech["sampling_rate"])
        return path_output

    def run(self):
        """Synthesise ``AnswerText`` and store the output path in ``AnswerAudio``."""
        self.AnswerAudio = self.funcSpeechSynthesis(self.AnswerText)


if __name__ == '__main__':
    module = SpeechSynthesis()
    module.AnswerText = "is it a boy?"
    module.run()