Machine Learning with PySpark
Andrew Collier
Data Scientist, Fathom Data
# Print the DataFrame without truncating long values in the text column
books.show(truncate=False)
+---+--------------------------------------+
|id |text                                  |
+---+--------------------------------------+
|0  |Forever, or a Long, Long Time         | ---> 'Long' is only present in this title
|1  |Winnie-the-Pooh                       |
|2  |Ten Little Fingers and Ten Little Toes|
|3  |Five Get into Trouble                 | -+-> 'Five' is present in all of these titles
|4  |Five Have a Wonderful Time            |  |
|5  |Five Get into a Fix                   |  |
|6  |Five Have Plenty of Fun               | -+
+---+--------------------------------------+
from pyspark.sql.functions import regexp_replace
# Regular expression (REGEX) to match a comma or hyphen together with any
# whitespace around it. Consuming the surrounding whitespace means every match
# collapses to exactly one space. Replacing only the punctuation character would
# turn ", " into two spaces, and Tokenizer (which splits on each whitespace
# character) would then emit empty tokens.
REGEX = '\\s*[,\\-]\\s*'
books = books.withColumn('text', regexp_replace(books.text, REGEX, ' '))
Before -> After
+---+-----------------------------+ +---+-----------------------------+
|id |text | |id |text |
+---+-----------------------------+ +---+-----------------------------+
|0 |Forever, or a Long, Long Time| |0 |Forever or a Long Long Time|
|1 |Winnie-the-Pooh | |1 |Winnie the Pooh |
+---+-----------------------------+ +---+-----------------------------+
from pyspark.ml.feature import Tokenizer
# Split each title in 'text' into a list of lowercase tokens, stored in 'tokens'
books = Tokenizer(inputCol="text", outputCol="tokens").transform(books)
+--------------------------------------+----------------------------------------------+
|text |tokens |
+--------------------------------------+----------------------------------------------+
|Forever or a Long Long Time |[forever, or, a, long, long, time] |
|Winnie the Pooh |[winnie, the, pooh] |
|Ten Little Fingers and Ten Little Toes|[ten, little, fingers, and, ten, little, toes]|
|Five Get into Trouble |[five, get, into, trouble] |
|Five Have a Wonderful Time |[five, have, a, wonderful, time] |
+--------------------------------------+----------------------------------------------+
from pyspark.ml.feature import StopWordsRemover
# Create a remover; no arguments are passed, so it uses its default settings
stopwords = StopWordsRemover()
# Take a look at the list of stop words
stopwords.getStopWords()
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', ...]
# Specify the input and output column names
stopwords = stopwords.setInputCol('tokens').setOutputCol('words')
# Filter the stop words out of 'tokens'; what remains is stored in 'words'
books = stopwords.transform(books)
+----------------------------------------------+-----------------------------------------+
|tokens |words |
+----------------------------------------------+-----------------------------------------+
|[forever, or, a, long, long, time] |[forever, long, long, time] |
|[winnie, the, pooh] |[winnie, pooh] |
|[ten, little, fingers, and, ten, little, toes]|[ten, little, fingers, ten, little, toes]|
|[five, get, into, trouble] |[five, get, trouble] |
|[five, have, a, wonderful, time] |[five, wonderful, time] |
+----------------------------------------------+-----------------------------------------+
from pyspark.ml.feature import HashingTF
# Hash each word into one of 32 buckets (numFeatures)
hasher = HashingTF(inputCol="words", outputCol="hash", numFeatures=32)
# Count how many words of each row land in each bucket; stored as a sparse vector in 'hash'
books = hasher.transform(books)
+---+-----------------------------------------+-----------------------------------+
|id |words |hash |
+---+-----------------------------------------+-----------------------------------+
|0 |[forever, long, long, time] |(32,[8,13,14],[2.0,1.0,1.0]) |
|1 |[winnie, pooh] |(32,[1,31],[1.0,1.0]) |
|2 |[ten, little, fingers, ten, little, toes]|(32,[1,15,25,30],[2.0,2.0,1.0,1.0])|
|3 |[five, get, trouble] |(32,[6,7,23],[1.0,1.0,1.0]) |
|4 |[five, wonderful, time] |(32,[6,13,25],[1.0,1.0,1.0]) |
+---+-----------------------------------------+-----------------------------------+
from pyspark.ml.feature import IDF
# Fit IDF on the hashed counts, then re-weight them into the 'features' column.
# Buckets that occur in more rows receive smaller weights.
books = IDF(inputCol="hash", outputCol="features").fit(books).transform(books)
+-----------------------------------------+-------------------------------------------+
|words |features |
+-----------------------------------------+-------------------------------------------+
|[forever, long, long, time] |(32,[8,13,14],[2.598,1.299,1.704]) |
|[winnie, pooh] |(32,[1,31],[1.299,1.704]) |
|[ten, little, fingers, ten, little, toes]|(32,[1,15,25,30],[2.598,3.409,1.011,1.704])|
|[five, get, trouble] |(32,[6,7,23],[0.788,1.704,1.299]) |
|[five, wonderful, time] |(32,[6,13,25],[0.788,1.299,1.011]) |
+-----------------------------------------+-------------------------------------------+
Machine Learning with PySpark