Multi-Modal Models with Hugging Face
James Chapman
Curriculum Manager, DataCamp
# Classify a single Flickr30k test image with an image-classification pipeline.
from datasets import load_dataset
from transformers import pipeline

dataset = load_dataset("nlphuji/flickr30k")
image = dataset['test'][134]["image"]

pipe = pipeline("image-classification", "google/mobilenet_v2_1.0_224")  # 224x224 input
pred = pipe(image)
# Report the label of the first entry in the pipeline's result list.
print("Predicted class:", pred[0]['label'])
Predicted class: ballplayer, baseball player
# Detect objects in `image` and print one line per detection.
# NOTE(review): the next line is a bare expression (presumably there to display
# the image in a notebook cell); it does NOT rebind `image`, so detection below
# still runs on whatever `image` was assigned earlier - confirm this is intended.
dataset['test'][52]["image"]

pipe = pipeline("object-detection", "facebook/detr-resnet-50", revision="no_timm")
# threshold=0.95 asks the pipeline to drop low-confidence detections.
outputs = pipe(image, threshold=0.95)

# The print must live inside the loop so every detection is reported,
# not just the last one.
for obj in outputs:
    box = obj['box']
    print(f"Detected {obj['label']} with confidence {obj['score']:.2f} at ({box['xmin']}, {box['ymin']}) to ({box['xmax']}, {box['ymax']})")
Detected person with confidence 0.97 at (381, 131) to (499, 330)
Detected person with confidence 0.96 at (381, 36) to (427, 103)
Detected person with confidence 0.98 at (253, 39) to (294, 125)
Detected person with confidence 1.00 at (144, 36) to (296, 170)
Detected person with confidence 0.95 at (280, 60) to (399, 294)
# Draw the detected bounding boxes on top of the image.
import matplotlib.pyplot as plt
import matplotlib.patches as patches

ax = plt.gca()
colors = ['r', 'g', 'b', 'y', 'm', 'c', 'k']

plt.imshow(image)
for n, obj in enumerate(outputs):
    box = obj['box']
    # Rectangle takes the anchor corner (xmin, ymin) followed by width and height.
    rect = patches.Rectangle(
        (box['xmin'], box['ymin']),
        box['xmax'] - box['xmin'],
        box['ymax'] - box['ymin'],
        linewidth=1,
        # Cycle through the palette so more detections than colors cannot
        # raise IndexError.
        edgecolor=colors[n % len(colors)],
        facecolor='none',
    )
    ax.add_patch(rect)
plt.show()
# 1 (foreground) or 0 (background)
# NOTE: trust_remote_code=True allows Python code shipped in the model
# repository to execute locally; only enable it for repositories you trust.
pipe = pipeline("image-segmentation", model="briaai/RMBG-1.4", trust_remote_code=True)
outputs = pipe(image)

plt.imshow(outputs)
plt.show()
Multi-Modal Models with Hugging Face