"""Answer a free-text question about an image with a visual-question-answering model."""

from functools import lru_cache

import torch
from PIL import Image
from transformers import pipeline

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model identifier handed to ``transformers.pipeline``.
_VQA_MODEL_NAME = "Salesforce/blip-vqa-base"


@lru_cache(maxsize=1)
def _load_vqa_pipeline():
    """Build the visual-question-answering pipeline once and reuse it.

    Constructing the pipeline loads the model, so the instance is cached
    instead of being rebuilt for every question.
    """
    return pipeline(
        "visual-question-answering",
        model=_VQA_MODEL_NAME,
        device=device,
    )


class TextandImageQuery():
    """Ask a question about an image and store the model's answer.

    Set ``QuestionText`` and ``RawImage``, call ``run()``, then read the
    result from ``AnswerText``.
    """

    # Inputs.
    QuestionText = None  # question to ask, as a string
    RawImage = None      # path of the image the question refers to
    # Output.
    AnswerText = None    # answer string produced by run()

    def funcTextandImageQuery(self, raw_image_path, question):
        """Apply the VQA pipeline to answer ``question`` about an image.

        Args:
            raw_image_path: Anything ``PIL.Image.open`` accepts (the
                ``__main__`` example passes a filesystem path).
            question: The question text passed to the pipeline.

        Returns:
            The ``'answer'`` field of the top-ranked pipeline result.
        """
        # Open inside a context manager so the underlying file handle is
        # released promptly; convert() returns an independent RGB copy.
        with Image.open(raw_image_path) as source_image:
            raw_image = source_image.convert("RGB")

        pipe = _load_vqa_pipeline()
        # top_k=1 requests a single candidate; the pipeline result is
        # indexed at [0] to take that candidate.
        output = pipe(raw_image, question, top_k=1)[0]
        return output['answer']

    def run(self):
        """Answer ``QuestionText`` about ``RawImage`` and store it in ``AnswerText``.

        Raises:
            ValueError: If ``RawImage`` or ``QuestionText`` has not been set.
        """
        if self.RawImage is None or self.QuestionText is None:
            raise ValueError(
                "RawImage and QuestionText must both be set before calling run()"
            )
        self.AnswerText = self.funcTextandImageQuery(self.RawImage, self.QuestionText)


if __name__ == '__main__':
    module = TextandImageQuery()
    module.QuestionText = "Question as a string"
    module.RawImage = "path/to/the/contextImage"
    module.run()
    print(module.AnswerText)