Introduction to Spark SQL in Python
Mark Plutowski
Data Scientist
from pyspark.sql.functions import split, explode
from pyspark.sql.functions import length

# Select the rows whose 'sentence' column is empty
df.where(length('sentence') == 0)
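Combining these pieces, a minimal sketch (assuming a DataFrame df with a string column 'sentence') that puts one word per row and discards the empty tokens left by repeated spaces:

from pyspark.sql.functions import split, explode, length

# Split each sentence on spaces and give each word its own row
df_words = df.select(explode(split('sentence', ' ')).alias('word'))

# Drop empty tokens
df_words = df_words.where(length('word') > 0)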
from pyspark.sql.functions import udf

# Printing a DataFrame displays its column names and types
print(df)
DataFrame[textdata: string]
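For a tree-formatted view of the same schema, printSchema() also works:

df.printSchema()
# root
#  |-- textdata: string (nullable = true)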
from pyspark.sql.types import BooleanType

# UDF returning True when the string is null or shorter than 10 characters
short_udf = udf(lambda x: True if not x or len(x) < 10 else False,
                BooleanType())

df.select(short_udf('textdata').alias("is short"))\
  .show(3)
+--------+
|is short|
+--------+
| false|
| true|
| false|
+--------+
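Because short_udf returns BooleanType, its result can also be used directly as a filter predicate; a quick sketch against the same df:

# Keep only the rows flagged as short
df_short = df.where(short_udf('textdata'))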
from pyspark.sql.types import StringType, IntegerType, FloatType, ArrayType

# in_udf (defined below) removes the last word from each array
df3.select('word array', in_udf('word array').alias('without endword'))\
   .show(5, truncate=30)
+-----------------------------+----------------------+
| word array| without endword|
+-----------------------------+----------------------+
|[then, how, many, are, there]|[then, how, many, are]|
| [how, many]| [how]|
| [i, donot, know]| [i, donot]|
| [quite, so]| [quite]|
| [you, have, not, observed]| [you, have, not]|
+-----------------------------+----------------------+
from pyspark.sql.types import StringType, ArrayType

# Removes the last item in the array; returns [] for empty or one-item arrays
in_udf = udf(lambda x: x[0:len(x)-1] if x and len(x) > 1 else [],
             ArrayType(StringType()))
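The same trimming reads a bit more naturally with negative slicing; an equivalent sketch (in_udf2 is just an illustrative name):

# x[:-1] drops the last element of the list
in_udf2 = udf(lambda x: x[:-1] if x and len(x) > 1 else [],
              ArrayType(StringType()))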
Example: the dense vector [1.0, 0.0, 0.0, 3.0] has the sparse representation
(4, [0, 3], [1.0, 3.0]): size 4, nonzero indices [0, 3], nonzero values [1.0, 3.0].

Two useful checks when a UDF receives a vector:
hasattr(x, "toArray")   # True if x behaves like an MLlib vector
x.numNonzeros()         # number of nonzero entries
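Putting those checks together, a minimal sketch of a UDF that pulls the first element out of a vector column; the column name 'features' and the name first_udf are assumptions for illustration:

from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

# Return the first element of a vector, or 0.0 when the value is null,
# is not a vector, or has no nonzero entries
first_udf = udf(lambda x:
                float(x.toArray()[0])
                if x is not None and hasattr(x, "toArray") and x.numNonzeros() > 0
                else 0.0,
                FloatType())

df.select(first_udf('features').alias('first_element')).show(3)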