Introduction to BigQuery
Matt Forrest
Field CTO
There are three main optimization rules:
SELECT *
, and only select the columns of data we needWHERE
clause early and oftenINT64
data type.WITH filter_my_data AS (SELECT
-- Filter data with
-- WHERE in the CTE first
)
SELECT
-- This query will run
-- faster with less data
JOIN a USING (user_id)
In BigQuery, use
BOOL
INT
FLOAT
DATE
Data types with WHERE
, STRING
, or BYTE
are not optimal.
Not optimal
SELECT user_id, date_ordered
FROM dataset.table
WHERE product = 'shoes'
Optimal
SELECT user_id, date_ordered
FROM dataset.table
WHERE product_id = 1234
ORDER BY
should always be at the outermost (end) of our queryORDER BY
within a window clauseNot optimal
WITH order_total AS (SELECT
user_id,
sum(product_price) as order_sum
FROM orders
GROUP BY user_id
-- Order by is not at the end of the query
ORDER BY last_purchase_date
)
SELECT order_total.order_sum,
users.user_name
FROM dataset.users users
JOIN order_total USING (user_id);
Optimal
WITH order_total AS (SELECT
user_id,
last_purchase_date
sum(product_price) as order_sum
GROUP BY user_id
)
SELECT order_total.order_sum,
users.user_name
FROM dataset.users users
JOIN a USING (user_id)
-- Order by should always be at the end
ORDER BY orders_total.last_purchase_date;
EXISTS
COUNT
to solve this use caseSELECT EXISTS (
-- Write the main query as a subquery within the exists call
SELECT
user_id
FROM
dataset.table
WHERE
product_category = 'home_goods'
AND status = 'Closed Account'
);
APPROX_TOP_SUM
or APPROX_COUNT_DISTINCT
.WHERE
clause.Introduction to BigQuery