Databricks Concepts
Kevin Barlow
Data Practitioner
SQL
-- Create a managed Delta table from a filtered query (CTAS)
CREATE TABLE table_name
USING delta          -- Delta Lake is the default format on Databricks; shown for clarity
AS
SELECT *
FROM source_table
WHERE date >= '2023-01-01'
Python, R, Scala
# Creating a new table in PySpark
# NOTE(review): assumes `col` is imported from pyspark.sql.functions
source_df = spark.read.table('source_table')
filtered_df = source_df.filter(col('date') >= '2023-01-01')
filtered_df.write.saveAsTable('table_name')
Schema manipulation
# PySpark schema manipulation
# FIX: withColumn() takes the new column's NAME as a plain string,
# not a Column object -- `withColumn(col('newCol'), ...)` raises a TypeError.
# drop() accepts a string name as well.
(df
    .withColumn('newCol', ...)   # add/replace a column from an expression
    .drop('oldCol'))             # remove a column by name
Filtering
# PySpark filtering
# FIX: `col('id') IS NOT NULL` is SQL syntax, not valid Python --
# the Column API equivalent is col('id').isNotNull().
(df
    .filter(col('date') >= target_date)   # keep rows on/after target_date
    .filter(col('id').isNotNull()))       # drop rows with a null id
Nested data
# Nested data
# FIX: explode() and flatten() are functions from pyspark.sql.functions,
# not DataFrame methods -- apply them inside select().
df.select(explode(col('arrayCol')))   # one output row per array element
df.select(flatten(col('items')))      # merge an array of arrays into one flat array
Aggregation
# Aggregation: total sales per region
# NOTE(review): `sum` here is pyspark.sql.functions.sum, not the builtin
grouped = df.groupBy(col('region'))
grouped.agg(sum(col('sales')))
Auto Loader processes new data files as they land in a data lake.
# Auto Loader: incrementally pick up new files as they land
(spark.readStream
    .format("cloudFiles")                    # Auto Loader source
    .option("cloudFiles.format", "json")     # format of the landing files
    .load(file_path))
# Stream enrichment: read from Kafka, left-join a lookup DataFrame,
# and write the result back to another Kafka topic.
# FIX: the Kafka source AND sink both require kafka.bootstrap.servers,
# and the Kafka sink requires a checkpointLocation for fault tolerance.
(spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "<host:port>")   # required by the Kafka source
    .option("subscribe", "<topic>")
    .load()
    .join(table_df,
          on="<id>", how="left")                        # enrich stream with static table
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "<host:port>")   # required by the Kafka sink
    .option("topic", "<topic>")
    .option("checkpointLocation", "<checkpoint_path>")  # required for exactly-once recovery
    .start())
Databricks Concepts