Data Manipulation with data.table in R
Matt Dowle, Arun Srinivasan
Instructors, DataCamp
# Same example as seen before
## LHS := RHS Form
batrips[, c("is_dur_gt_1hour", "week_day") :=
.(duration > 3600, wday(start_date)]
# Same as above, but in `:=`() functional form
batrips[, `:=`(is_dur_gt_1hour = duration > 3600,
week_day = wday(start_date))]
# Update by reference with by
batrips[, n_zip_code := .N, by = zip_code]
# Functional form batrips[, `:=`(end_dur_first = duration[1], end_dur_last = duration[.N]), by = end_station]
# LHS := RHS form batrips[, c("end_dur_first", "end_dur_last") := list(duration[1], duration[.N]), by = end_station]
batrips[1:5]
trip_id duration ... end_station ... end_dur_first end_dur_last
139545 435 ... Townsend at 7th ... 435 660
139546 432 ... Townsend at 7th ... 435 660
139547 1523 ... Beale at Market ... 1523 229
139549 1620 ... Powell Street BART ... 1620 540
139550 1617 ... Powell Street BART ... 1620 540
For each unique combination of start_station
and end_station
, if median duration:
batrips[, trip_category := {
med_dur = median(duration, na.rm = TRUE)
if (med_dur < 600) "short"
else if (med_dur >= 600 & med_dur <= 1800) "medium"
else "long"
},
by = .(start_station, end_station)]
batrips[1:3]
trip_id duration ... zip_code trip_category
139545 435 ... 94612 short
139546 432 ... 94107 short
139547 1523 ... 94112 short
bin_median_duration <- function(dur) {
med_dur <- median(dur, na.rm = TRUE)
if (med_dur < 600) "short"
else if (med_dur >= 600 & med_dur <= 1800) "medium"
else "long"
}
batrips[, trip_category := bin_median_duration(duration),
by = .(start_station, end_station)]
batrips[duration > 500, min_dur_gt_500 := min(duration),
by = .(start_station, end_station)]
batrips[1:3]
trip_id duration ... zip_code min_dur_gt_500
139545 435 ... 94612 NA
139546 432 ... 94107 NA
139547 1523 ... 94112 502
Data Manipulation with data.table in R