Introduction to LLMs in Python
Jasmin Ludolf
Senior Data Science Content Developer, DataCamp
Pipelines: pipeline()

Auto classes (AutoModel class)


Pre-training

Pre-training Fine-tuning
from datasets import load_dataset

# Load the "imdb" train split, then keep one shard out of four
# (index 0) so the example works on a subset of the data.
train_data = load_dataset("imdb", split="train")
train_data = train_data.shard(num_shards=4, index=0)

# Same treatment for the test split. Each split is sharded from itself:
# the previous `data.shard(...)` referred to a name that was never defined.
test_data = load_dataset("imdb", split="test")
test_data = test_data.shard(num_shards=4, index=0)
# load_dataset(): loads a dataset from the Hugging Face Hub
from transformers import AutoModel, AutoTokenizer
from transformers import AutoModelForSequenceClassification

# Load the model (with a sequence-classification head) and its tokenizer
# from the same "bert-base-uncased" checkpoint name.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

# Load each "imdb" split and keep one shard out of four (index 0).
# Each split is sharded from itself; `data` was never defined.
train_data = load_dataset("imdb", split="train")
train_data = train_data.shard(num_shards=4, index=0)
test_data = load_dataset("imdb", split="test")
test_data = test_data.shard(num_shards=4, index=0)

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the data
# Padding and truncation to max_length=64 are requested together with
# return_tensors="pt" (PyTorch tensors) for both splits.
tokenized_training_data = tokenizer(
    train_data["text"],
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=64,
)
tokenized_test_data = tokenizer(
    test_data["text"],
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=64,
)

print(tokenized_training_data)
{'input_ids': tensor([[ 101, 1045, 12524, 1045, 2572, 8025, 1011, 3756,
2013, 2026, 2678, 3573, 2138, 1997, 2035, 1996, 6704, 2008, 5129, 2009,
2043, 2009, 2001, 2034, 2207, 1999, 3476, 1012, 1045, 2036, ...
def tokenize_function(text_data):
    """Tokenize the ``"text"`` field of ``text_data``.

    Intended to be passed to ``Dataset.map``. Relies on a ``tokenizer``
    object defined at module level.

    Args:
        text_data: Mapping with a ``"text"`` entry. Presumably a single
            row when mapped with ``batched=False`` and a batch of rows
            when mapped with ``batched=True`` (confirm against the
            ``datasets`` documentation).

    Returns:
        Whatever ``tokenizer(...)`` returns when called with padding,
        truncation to ``max_length=64`` and ``return_tensors="pt"``.
    """
    return tokenizer(
        text_data["text"],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=64,
    )


# Tokenize in batches
tokenized_in_batches = train_data.map(tokenize_function, batched=True)

# Tokenize row by row
tokenized_by_row = train_data.map(tokenize_function, batched=False)
Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1563
})


Introduction to LLMs in Python