Deep Learning for Text with PyTorch
Shubham Jain
Data Scientist
Use cases:
Prerequisite course: Intermediate Deep Learning with PyTorch
torchtext
from torchtext.data.utils import get_tokenizer

# "basic_english" normalizes to lowercase and splits off punctuation.
tokenizer = get_tokenizer("basic_english")
tokens = tokenizer("I am reading a book now. I love to read books!")
print(tokens)
# ['i', 'am', 'reading', 'a', 'book', 'now', '.', 'i', 'love', 'to', 'read',
#  'books', '!']
# NOTE(review): the original slide showed capitalized "I" tokens, but
# basic_english lowercases its input — the lowercase output above is what
# this code actually prints.
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords

# Build a set for O(1) membership tests during filtering.
stop_words = set(stopwords.words('english'))

tokens = ["I", "am", "reading", "a", "book", "now", ".", "I", "love", "to", "read", "books", "!"]
# Case-insensitive stopword removal; punctuation tokens are kept.
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
print(filtered_tokens)
# ['reading', 'book', '.', 'love', 'read', 'books', '!']
import nltk
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
filtered_tokens = ["reading", "book", ".", "love", "read", "books", "!"]
# Reduce each token to its Porter stem, e.g. "reading" -> "read",
# "books" -> "book"; punctuation passes through unchanged.
stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
print(stemmed_tokens)
# ['read', 'book', '.', 'love', 'read', 'book', '!']
from nltk.probability import FreqDist

stemmed_tokens = ["read", "book", ".", "love", "read", "book", "!"]
# Count token frequencies over the whole sequence.
freq_dist = FreqDist(stemmed_tokens)

# Keep tokens that occur at least `threshold` times (rare-word removal).
# NOTE(review): the original used `> threshold`, which with threshold=2 and a
# maximum frequency of 2 would always yield [] — `>=` produces the expected
# output shown below.
threshold = 2
common_tokens = [token for token in stemmed_tokens if freq_dist[token] >= threshold]
print(common_tokens)
# ['read', 'book', 'read', 'book']
Tokenization, stopword removal, stemming, and rare word removal
Deep Learning for Text with PyTorch