Introduction to Spark SQL in Python
Mark Plutowski
Data Scientist
query3 = """
SELECT
id,
word AS w1,
LEAD(word,1) OVER(PARTITION BY part ORDER BY id ) AS w2,
LEAD(word,2) OVER(PARTITION BY part ORDER BY id ) AS w3
FROM df
"""
query3agg = """
SELECT w1, w2, w3, COUNT(*) as count FROM (
SELECT
word AS w1,
LEAD(word,1) OVER(PARTITION BY part ORDER BY id ) AS w2,
LEAD(word,2) OVER(PARTITION BY part ORDER BY id ) AS w3
FROM df
)
GROUP BY w1, w2, w3
ORDER BY count DESC
"""
spark.sql(query3agg).show()
+-----+-----+-----+-----+
| w1| w2| w3|count|
+-----+-----+-----+-----+
| one| of| the| 49|
| i|think| that| 46|
| it| is| a| 46|
| it| was| a| 45|
| that| it| was| 38|
| out| of| the| 35|
|.....|.....|.....|.....|
+-----+-----+-----+-----+
| w1| w2| w3|count|
+-----+-----+-----+-----+
| one| of| the| 49|
| i|think| that| 46|
| it| is| a| 46|
| it| was| a| 45|
| that| it| was| 38|
| out| of| the| 35|
| that| i| have| 35|
|there| was| a| 34|
| i| do| not| 34|
| that| it| is| 33|
| that| he| was| 30|
| that| he| had| 30|
| that| i| was| 28|
+-----+-----+-----+-----+
query3agg = """
SELECT w1, w2, w3, length(w1)+length(w2)+length(w3) as length FROM (
SELECT
word AS w1,
LEAD(word,1) OVER(PARTITION BY part ORDER BY id ) AS w2,
LEAD(word,2) OVER(PARTITION BY part ORDER BY id ) AS w3
FROM df
WHERE part <> 0 and part <> 13
)
GROUP BY w1, w2, w3
ORDER BY length DESC
"""
spark.sql(query3agg).show(truncate=False)
+-------------------+-------------------+---------------+------+
| w1| w2| w3|length|
+-------------------+-------------------+---------------+------+
|comfortable-looking| building| two-storied| 38|
| widespread|comfortable-looking| building| 37|
| extraordinary| circumstances| connected| 35|
| simple-minded| nonconformist| clergyman| 35|
| particularly| malignant| boot-slitting| 34|
| unsystematic| sensational| literature| 33|
| oppressively| respectable| frock-coat| 33|
| relentless| keen-witted| ready-handed| 33|
| travelling-cloak| and| close-fitting| 32|
| ruddy-faced| white-aproned| landlord| 32|
| fellow-countryman| colonel| lysander| 32|
+-------------------+-------------------+---------------+------+
Introduction to Spark SQL in Python