Machine Learning with Tree-Based Models in R
Sandro Raabe
Data Scientist
head(diabetes)
# A tibble: 6 x 9
outcome pregnancies glucose blood_pressure skin_thickness insulin bmi age
<fct> <int> <int> <int> <int> <int> <dbl> <int>
1 yes 6 148 72 35 0 33.6 50
2 no 1 85 66 29 0 26.6 31
3 yes 8 183 64 0 0 23.3 32
# Reserve 10% of the rows for testing (without `prop`, 75% go to training)
diabetes_split <- initial_split(
  data = diabetes,
  prop = 0.9
)
diabetes_split
<Analysis/Assess/Total>
<692/76/768>
# Pull the two partitions out of the split object
diabetes_train <- training(diabetes_split)
diabetes_test <- testing(diabetes_split)
# Share of all rows that landed in the training set
nrow(diabetes_train) / nrow(diabetes)
[1] 0.9007812
# Tally the 'yes' and 'no' outcomes in the training set
counts_train <- table(diabetes_train[["outcome"]])
counts_train
no yes
490 86
# Fraction of training rows whose outcome is 'yes'
prop_yes_train <- counts_train["yes"] / sum(counts_train)
prop_yes_train
0.15
# Tally the 'yes' and 'no' outcomes in the test set
counts_test <- table(diabetes_test[["outcome"]])
counts_test
no yes
28 48
# Fraction of test rows whose outcome is 'yes'
prop_yes_test <- counts_test["yes"] / sum(counts_test)
prop_yes_test
0.63
# Stratify on `outcome` so both partitions keep a similar 'yes'/'no' balance
initial_split(data = diabetes, prop = 0.9, strata = outcome)
outcome variable
Machine Learning with Tree-Based Models in R