Multi-Modal Models with Hugging Face
James Chapman
Curriculum Manager, DataCamp
import requests
from PIL import Image

url = "https://www.worldanimalprotection.org/cdn-cgi/image/width=1920,format=auto/globalassets/images/elephants/1033551-elephant.jpg"
image = Image.open(requests.get(url, stream=True).raw)
text = "What animal is in this photo?"
from transformers import ViltProcessor, ViltForQuestionAnswering
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
# Preprocess the image-question pair and run the model
encoding = processor(image, text, return_tensors="pt")
outputs = model(**encoding)

# The highest-scoring logit corresponds to the predicted answer label
idx = outputs.logits.argmax(-1).item()
print("Predicted answer:", model.config.id2label[idx])
Predicted answer: elephant
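The logits can also be turned into probabilities to see how confident the model is and which other answers it considered. A minimal sketch, assuming torch is available and reusing outputs and model from above:

import torch

# Convert logits to probabilities and show the five most likely answers
probs = torch.softmax(outputs.logits, dim=-1)
top_probs, top_ids = probs[0].topk(5)
for p, i in zip(top_probs, top_ids):
    print(f"{model.config.id2label[i.item()]}: {p.item():.3f}")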
from datasets import load_dataset
from transformers import pipeline

dataset = load_dataset("lmms-lab/DocVQA")
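Before plotting, it can help to check what the dataset contains. A minimal sketch, assuming the dataset loads with a test split and an image field as used below:

# Inspect the available splits and the fields of one example
print(dataset)
print(dataset["test"][2].keys())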
import matplotlib.pyplot as plt

plt.imshow(dataset["test"][2]["image"])
plt.show()
Document question answering requires pytesseract (installed via pip) together with the Tesseract OCR engine (installed via apt-get, an .exe installer, or homebrew/macports).

from transformers import pipeline

pipe = pipeline("document-question-answering", "impira/layoutlm-document-qa")
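If the pipeline raises an OCR-related error, a quick way to confirm that Tesseract is reachable is to query it from Python. A minimal sketch, assuming pytesseract is installed:

import pytesseract

# Raises an error if the Tesseract binary cannot be found
print(pytesseract.get_tesseract_version())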
result = pipe(
    dataset["test"][2]["image"],
    "What was the gross income in 2011-2012?"
)
print(result)
[{'score': 0.05149758607149124,
'answer': '3 36073 Crores', ...}]
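The low score suggests the model is not very confident in this answer. One option is to request several candidate answers and compare their scores; a hedged sketch, assuming the document-question-answering pipeline accepts a top_k argument as the text question-answering pipeline does:

# Request the three highest-scoring candidate answers (top_k is an assumption here)
results = pipe(
    dataset["test"][2]["image"],
    "What was the gross income in 2011-2012?",
    top_k=3
)
for r in results:
    print(f"{r['answer']} (score: {r['score']:.3f})")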