Feature Engineering for NLP in Python
Rounak Banik
Data Scientist
"I don't know." # 13 characters
# Sample sentence whose length we want to measure
text = "I don't know."
# len() counts every character, including spaces and punctuation
num_char = len(text)
# Display the character count
print(num_char)
# Output: 13
# Create a 'num_chars' feature
# Calls len() on each entry of the 'review' column and stores the results
# in a new 'num_chars' column.
# NOTE(review): assumes df is a pandas DataFrame whose 'review' entries are
# strings -- confirm against the code that loads df.
df['num_chars'] = df['review'].apply(len)
# Sample sentence to break into words
text = "Mary had a little lamb."
# split() with no arguments separates on whitespace; punctuation stays
# attached to the neighbouring word
words = text.split()
# Show the resulting list of words
print(words)
# Output: ['Mary', 'had', 'a', 'little', 'lamb.']
# Show how many words were found
print(len(words))
# Output: 5
# Function that returns number of words in string
def word_count(string: str) -> int:
    """Return the number of whitespace-separated words in ``string``.

    Args:
        string: Text whose words should be counted.

    Returns:
        The number of tokens produced by ``str.split()``. An empty or
        whitespace-only string yields 0.
    """
    # split() with no arguments splits on runs of whitespace and discards
    # empty strings, so repeated spaces do not inflate the count.
    return len(string.split())
# Create num_words feature in df
# Calls word_count on each entry of the 'review' column and stores the
# results in a new 'num_words' column.
# NOTE(review): assumes df is a pandas DataFrame whose 'review' entries are
# strings -- confirm against the code that loads df.
df['num_words'] = df['review'].apply(word_count)
# Function that returns average word length
def avg_word_length(x: str) -> float:
    """Return the mean length of the whitespace-separated words in ``x``.

    Args:
        x: Text to analyse.

    Returns:
        The total number of characters in all words divided by the number
        of words. Returns 0.0 when ``x`` contains no words (empty or
        whitespace-only), instead of raising ``ZeroDivisionError``.
    """
    # Split the string into words
    words = x.split()
    # No words means no meaningful average; avoid dividing by zero.
    if not words:
        return 0.0
    # Sum the length of every word and divide by the word count
    return sum(len(word) for word in words) / len(words)
# Create a new feature avg_word_length
# Apply the avg_word_length function to every entry of the 'review' column.
# The previous code passed `doc_density`, a name that does not match the
# function this feature is built from.
# NOTE(review): assumes df is a pandas DataFrame whose 'review' entries are
# strings -- confirm against the code that loads df.
df['avg_word_length'] = df['review'].apply(avg_word_length)
# Function that returns number of hashtags
def hashtag_count(string: str) -> int:
    """Return the number of words in ``string`` that start with ``'#'``.

    Args:
        string: Text (for example a tweet) to scan for hashtags.

    Returns:
        The count of whitespace-separated tokens whose first character is
        ``'#'``. A ``'#'`` in the middle of a token is not counted.
    """
    # Split the string into words
    words = string.split()
    # Count the words that begin with the hashtag marker
    return sum(1 for word in words if word.startswith('#'))
# Example call: the sample tweet has two words that begin with '#'
# (#FirstTweet and #Happy); '@janedoe' begins with '@', so it is not counted.
hashtag_count("@janedoe This is my first tweet! #FirstTweet #Happy")
# Value displayed for the call above:
2
Feature Engineering for NLP in Python