Building Recommendation Engines with PySpark
Jamen Long
Data Scientist at Nike
+------+--------------+-------------+-----------+--------------------+----+
|userId|Good Will H...|Batman For...|Incredibles|Shawshank Redemption|Coco|
+------+--------------+-------------+-----------+--------------------+----+
|z097s3| 2| 3| null| 4| 4|
|z176c4| 1| null| 4| 3| 4|
|m821i6| 3| 4| null| 3| 5|
|t872c7| 1| 2| 4| 5|null|
|b728q0| 2| null| 5| 2|null|
|f540n1| 2| 1| null| 3| 1|
|w066f1| 5| null| 5| 2| 5|
|v081u6| 1| null| 5| 1| 1|
|j197o6| 3| 2| 2| 4|null|
|n202j1| 2| null| 2| null| 2|
|p755a0| 2| 3| 4| 5| 5|
|t791a0| 5| 5| null| 1| 4|
|c460j6| 4| 1| null| 4| 4|
|z595b3| 1| 2| 4| null| 1|
|h296x8| 4| 3| 5| 2| 4|
|a610z0| 2| 1| null| 4| 4|
|g025o2| 5| 4| 2| 2|null|
|u902e2| null| 3| 4| 1| 5|
|t893x2| 1| 4| null| null| 5|
|x668y8| 2| 3| 5| 2|null|
+------+--------------+-------------+-----------+--------------------+----+
+------+--------------------+------+
|userId| variable|rating|
+------+--------------------+------+
|z097s3| Good Will Hunting| 2|
|z097s3| Batman Forever| 3|
|z097s3|The Shawshank Red...| 4|
|z097s3| Coco| 4|
|z176c4| Good Will Hunting| 1|
|z176c4| The Incredibles| 4|
|z176c4|The Shawshank Red...| 3|
|z176c4| Coco| 4|
|m821i6| Good Will Hunting| 3|
|m821i6| Batman Forever| 4|
|m821i6|The Shawshank Red...| 3|
|m821i6| Coco| 5|
|t872c7| Good Will Hunting| 1|
|t872c7| Batman Forever| 2|
|t872c7| The Incredibles| 4|
|t872c7|The Shawshank Red...| 5|
|b728q0| Good Will Hunting| 2|
|b728q0| The Incredibles| 5|
|b728q0|The Shawshank Red...| 2|
|f540n1| Good Will Hunting| 2|
+------+--------------------+------+
+------+--------------------+------+
|userId| variable|rating|
+------+--------------------+------+
z097s3 |z097s3| Good Will Hunting| 2|
|-----> |z097s3| Batman Forever| 3|
|-----> |z097s3|The Shawshank Red...| 4|
|-----> |z097s3| Coco| 4|
z176c4 |z176c4| Good Will Hunting| 1|
|-----> |z176c4| The Incredibles| 4|
|-----> |z176c4|The Shawshank Red...| 3|
|-----> |z176c4| Coco| 4|
m821i6 |m821i6| Good Will Hunting| 3|
|-----> |m821i6| Batman Forever| 4|
|-----> |m821i6|The Shawshank Red...| 3|
|-----> |m821i6| Coco| 5|
t872c7 |t872c7| Good Will Hunting| 1|
|-----> |t872c7| Batman Forever| 2|
|-----> |t872c7| The Incredibles| 4|
|-----> |t872c7|The Shawshank Red...| 5|
b728q0 |b728q0| Good Will Hunting| 2|
|-----> |b728q0| The Incredibles| 5|
|-----> |b728q0|The Shawshank Red...| 2|
+------+--------------------+------+
df.printSchema()
root
|-- userId: string (nullable = true)
|-- variable: string (nullable = false)
|-- rating: long (nullable = true)
df.printSchema()
root
|-- userId: string (nullable = true)
|-- variable: string (nullable = false)
|-- rating: long (nullable = true)
ratings.show()
+------+--------------+-------------+-----------+--------------------+----+
|userId|Good Will H...|Batman For...|Incredibles|Shawshank Redemption|Coco|
+------+--------------+-------------+-----------+--------------------+----+
|z097s3| 2| 3| null| 4| 4|
|z176c4| 1| null| 4| 3| 4|
|m821i6| 3| 4| null| 3| 5|
|t872c7| 1| 2| 4| 5|null|
|b728q0| 2| null| 5| 2|null|
|f540n1| 2| 1| null| 3| 1|
|w066f1| 5| null| 5| 2| 5|
|v081u6| 1| null| 5| 1| 1|
|j197o6| 3| 2| 2| 4|null|
|n202j1| 2| null| 2| null| 2|
|p755a0| 2| 3| 4| 5| 5|
|t791a0| 5| 5| null| 1| 4|
|c460j6| 4| 1| null| 4| 4|
|z595b3| 1| 2| 4| null| 1|
|h296x8| 4| 3| 5| 2| 4|
|a610z0| 2| 1| null| 4| 4|
|g025o2| 5| 4| 2| 2|null|
|u902e2| null| 3| 4| 1| 5|
|t893x2| 1| 4| null| null| 5|
|x668y8| 2| 3| 5| 2|null|
+------+--------------+-------------+-----------+--------------------+----+
# Function to convert conventional dataframe into row-based ("long") dataframe
wide_to_long
<function __main__.to_long>
# Convert the conventional (wide) dataframe into a row-based ("long") dataframe
long_ratings = wide_to_long(ratings)
long_ratings.show()
+------+--------------------+------+
|userId| variable|rating|
+------+--------------------+------+
|z097s3| Good Will Hunting| 2|
|z097s3| Batman Forever| 3|
|z097s3|The Shawshank Red...| 4|
|z097s3| Coco| 4|
|z176c4| Good Will Hunting| 1|
|z176c4| The Incredibles| 4|
|z176c4|The Shawshank Red...| 3|
|z176c4| Coco| 4|
|m821i6| Good Will Hunting| 3|
|m821i6| Batman Forever| 4|
|m821i6|The Shawshank Red...| 3|
|m821i6| Coco| 5|
|t872c7| Good Will Hunting| 1|
|t872c7| Batman Forever| 2|
|t872c7| The Incredibles| 4|
|t872c7|The Shawshank Red...| 5|
|b728q0| Good Will Hunting| 2|
|b728q0| The Incredibles| 5|
|b728q0|The Shawshank Red...| 2|
|f540n1| Good Will Hunting| 2|
+------+--------------------+------+
userIds and movieIds
users = long_ratings.select('userId').distinct()
users.show()
+------+
|userId|
+------+
|j197o6|
|m821i6|
|g025o2|
|z176c4|
|a610z0|
|c460j6|
|w066f1|
|v081u6|
|t791a0|
|f540n1|
|n202j1|
|t872c7|
|h296x8|
|p755a0|
|t893x2|
|u902e2|
|z097s3|
|z595b3|
+------+
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import monotonically_increasing_id
users = users.coalesce(1)
from pyspark.sql.functions import monotonically_increasing_id
users = users.coalesce(1)
users = users.withColumn(
"userIntId", monotonically_increasing_id()).persist()
users.show()
+------+---------+
|userId|userIntId|
+------+---------+
|j197o6| 0|
|m821i6| 1|
|g025o2| 2|
|z176c4| 3|
|a610z0| 4|
|c460j6| 5|
|w066f1| 6|
|v081u6| 7|
|t791a0| 8|
|f540n1| 9|
|n202j1| 10|
|t872c7| 11|
|h296x8| 12|
|p755a0| 13|
|t893x2| 14|
+------+---------+
movies = long_ratings.select("variable").distinct()
movies = movies.coalesce(1)
movies = movies.withColumn(
"movieId", monotonically_increasing_id()).persist()
movies.show()
+--------------------+-------+
| variable|movieId|
+--------------------+-------+
| The Incredibles| 0|
| Coco| 1|
|The Shawshank Red...| 2|
| Good Will Hunting| 3|
| Batman Forever| 4|
+--------------------+-------+
ratings_w_int_ids = long_ratings.join(
users, "userId", "left").join(movies, "variable", "left")
ratings_w_int_ids.show()
+--------------------+------+------+---------+-------+
| variable|userId|rating|userIntId|movieId|
+--------------------+------+------+---------+-------+
| Good Will Hunting|z097s3| 2| 16| 3|
| Batman Forever|z097s3| 3| 16| 4|
|The Shawshank Red...|z097s3| 4| 16| 2|
| Coco|z097s3| 4| 16| 1|
| Good Will Hunting|z176c4| 1| 3| 3|
| The Incredibles|z176c4| 4| 3| 0|
|The Shawshank Red...|z176c4| 3| 3| 2|
| Coco|z176c4| 4| 3| 1|
| Good Will Hunting|m821i6| 3| 1| 3|
| Batman Forever|m821i6| 4| 1| 4|
|The Shawshank Red...|m821i6| 3| 1| 2|
| Coco|m821i6| 5| 1| 1|
| Good Will Hunting|t872c7| 1| 11| 3|
| Batman Forever|t872c7| 2| 11| 4|
| The Incredibles|t872c7| 4| 11| 0|
|The Shawshank Red...|t872c7| 5| 11| 2|
+--------------------+------+------+---------+-------+
from pyspark.sql.functions import col
ratings_data = ratings_w_int_ids.select(
col("userIntId").alias("userId"),
col("movieId"),
col("rating"))
ratings_data.show()
+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
| 16| 3| 2|
| 16| 4| 3|
| 16| 2| 4|
| 16| 1| 4|
| 3| 3| 1|
| 3| 0| 4|
| 3| 2| 3|
| 3| 1| 4|
| 1| 3| 3|
| 1| 4| 4|
| 1| 2| 3|
| 1| 1| 5|
| 11| 3| 1|
| 11| 4| 2|
| 11| 0| 4|
| 11| 2| 5|
+------+-------+------+
Building Recommendation Engines with PySpark