Efficient AI Model Training with PyTorch
Dennis Lee
Data Engineer
Image application
Audio application
print(dataset)
Dataset({
features: ['img', 'label'],
num_rows: 1000
})
print(dataset[0]["img"])
<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=720x480>
AutoImageProcessor
# AutoImageProcessor loads all preprocessing steps for the checkpoint.
from transformers import AutoImageProcessor

model = "microsoft/swin-tiny-patch4-window7-224"
image_processor = AutoImageProcessor.from_pretrained(model)

# Add a "pixel_values" column: run the processor over every image in a batch.
dataset = dataset.map(
    lambda batch: {
        "pixel_values": [
            image_processor(image, return_tensors="pt").pixel_values
            for image in batch["img"]
        ]
    },
    batched=True,
)
print(dataset)
Dataset({
features: ['img', 'label', 'pixel_values'],
num_rows: 1000
})
print(dataset)
DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'label'],
        num_rows: 1000
    }),
    ...
})
# One training example is at most 1 second of 16 kHz audio.
sampling_rate = 16000  # 16 kHz
max_duration = 1       # 1 second

# Number of raw audio samples in one clip of max_duration seconds.
max_length = max_duration * sampling_rate
print(f"max_length = {max_length:,} samples")
max_length = 16,000 samples
# AutoFeatureExtractor loads the audio preprocessing for the checkpoint.
from transformers import AutoFeatureExtractor

model = "facebook/wav2vec2-base"
feature_extractor = AutoFeatureExtractor.from_pretrained(model)


def preprocess_function(split_data):
    """Turn a batch of raw audio examples into model input features.

    Reads the raw waveform arrays from split_data["audio"] and runs the
    feature extractor, truncating each clip to at most max_duration
    seconds at the extractor's sampling rate.
    """
    waveforms = [example["array"] for example in split_data["audio"]]
    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=int(feature_extractor.sampling_rate * max_duration),
        truncation=True,
    )
Apply preprocess_function to the dataset.
remove_columns: removes the audio and file columns.
batched: processes dataset examples in batches.
# Run preprocess_function over the train split in batches, dropping the
# raw "audio" and "file" columns once the features are computed.
dataset = dataset["train"].map(
    preprocess_function,
    remove_columns=["audio", "file"],
    batched=True,
)
print(dataset)
DatasetDict({
    train: Dataset({
        features: ['label', 'input_values'],
        num_rows: 1000
    })
})
DataLoader: prepares the data for loading and iterating during training.
accelerator.prepare(): places the data on CPUs or GPUs based on availability.
accelerator.prepare() works with PyTorch DataLoaders (torch.utils.data.DataLoader).
from accelerate import Accelerator
from torch.utils.data import DataLoader

# Shuffled mini-batch iteration over the preprocessed dataset.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# prepare() wraps the dataloader so batches land on the available device.
accelerator = Accelerator()
dataloader = accelerator.prepare(dataloader)
Efficient AI Model Training with PyTorch