Deep Learning for Text with PyTorch
Shubham Jain
Instructor
Applications:
Types of text classification: binary, multi-class, multi-label
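As a rough illustration of how these three setups differ in PyTorch (not from the course; the hidden size, class counts, and loss choices below are assumptions), the classification head and loss are typically chosen as follows:

from torch import nn

hidden_dim = 128      # assumed size of the text encoder's output
num_classes = 5       # assumed number of mutually exclusive classes
num_labels = 4        # assumed number of independent labels

# Binary: one logit per example, trained with BCEWithLogitsLoss
binary_head = nn.Linear(hidden_dim, 1)
binary_loss = nn.BCEWithLogitsLoss()

# Multi-class: one logit per class, trained with CrossEntropyLoss
multi_class_head = nn.Linear(hidden_dim, num_classes)
multi_class_loss = nn.CrossEntropyLoss()

# Multi-label: one logit per label, trained with BCEWithLogitsLoss
multi_label_head = nn.Linear(hidden_dim, num_labels)
multi_label_loss = nn.BCEWithLogitsLoss()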
torch.nn.Embedding
Embedding for 'the': tensor([-0.4689, 0.3164, -0.2971, -0.1291, 0.4064])
Embedding for 'cat': tensor([-0.0978, -0.4764, 0.0476, 0.1044, -0.3976])
Embedding for 'sat': tensor([ 0.2731, 0.4431, 0.1275, 0.1434, -0.4721])
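The per-word vectors above come from an embedding lookup. A minimal sketch of that lookup follows; the three-word vocabulary and embedding_dim=5 are assumptions, and the randomly initialized values will differ from those shown.

import torch
from torch import nn

vocab = ["the", "cat", "sat"]  # assumed toy vocabulary
word_to_idx = {word: i for i, word in enumerate(vocab)}

embedding = nn.Embedding(num_embeddings=len(vocab), embedding_dim=5)

for word in vocab:
    idx = torch.LongTensor([word_to_idx[word]])
    vector = embedding(idx).squeeze(0).detach()
    print(f"Embedding for '{word}': {vector}")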
import torch
from torch import nn

words = ["The", "cat", "sat", "on", "the", "mat"]
word_to_idx = {word: i for i, word in enumerate(words)}
inputs = torch.LongTensor([word_to_idx[w] for w in words])
embedding = nn.Embedding(num_embeddings=len(words), embedding_dim=10)
output = embedding(inputs)
print(output)
tensor([[ 1.0624,  0.6792,  0.0459,  ..., -1.0828, -0.4475,  0.4868],
        ...,
        [ 1.5766,  0.0106,  0.1161,  ..., -0.0859,  1.3160,  1.3621]])
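Each of the six input tokens is looked up in the embedding table and mapped to a 10-dimensional vector, so the output tensor has shape (6, 10).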
def preprocess_sentences(text):
    # Tokenization
    # Stemming
    ...
    # Word to index mapping
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, encoded_sentences):
        self.data = encoded_sentences

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]
def text_processing_pipeline(text):
    tokens = preprocess_sentences(text)
    dataset = TextDataset(tokens)
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
    return dataloader

text = "Your sample text here."
dataloader = text_processing_pipeline(text)
embedding = nn.Embedding(num_embeddings=10, embedding_dim=50)
for batch in dataloader:
    output = embedding(batch)
    print(output)
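Putting the pieces above together, here is a self-contained sketch that fills the elided preprocessing with simple whitespace tokenization and a word-to-index mapping; these choices (and the lowercasing step) are assumptions rather than the course's exact code.

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

def preprocess_sentences(text):
    # Tokenization: lowercase and split on whitespace (assumed)
    tokens = text.lower().split()
    # Word to index mapping over the vocabulary seen in the text
    word_to_idx = {word: i for i, word in enumerate(sorted(set(tokens)))}
    # Encode each token as its integer index
    return [word_to_idx[token] for token in tokens]

class TextDataset(Dataset):
    def __init__(self, encoded_sentences):
        self.data = encoded_sentences

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return torch.tensor(self.data[index])

def text_processing_pipeline(text):
    tokens = preprocess_sentences(text)
    dataset = TextDataset(tokens)
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
    return dataloader

dataloader = text_processing_pipeline("Your sample text here.")

embedding = nn.Embedding(num_embeddings=10, embedding_dim=50)
for batch in dataloader:
    print(embedding(batch).shape)  # (batch_size, 50)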