How to Cross‑Validate and Change the Model

Code Along

R Code

library(tidyverse)
library(tidymodels)

# Load the Pokemon data. The outcome is converted to a factor here, before any
# resampling or preprocessing: classification models need a factor outcome,
# and a recipe step that rewrites the outcome is re-applied whenever new data
# is baked, which breaks at prediction time when only predictors are supplied.
pokemon <- read_csv("data/pokemon-data.csv") %>%
  mutate(early_gen = as.factor(early_gen))
pokemon %>% glimpse()

# Fix the RNG state so the fold assignment below is reproducible.
set.seed(123)

# 5-fold cross-validation, stratified on the outcome.
pokemon_folds <- vfold_cv(pokemon, v = 5, strata = early_gen)
pokemon_folds

# Model early_gen from three predictors. No preprocessing steps are needed
# because the outcome is already a factor.
pokemon_rec <- recipe(early_gen ~ height_m + weight_kg + hp, data = pokemon)
# Logistic regression classifier fitted with the "glm" engine.
log_mod <- logistic_reg() %>%
  set_mode("classification") %>%
  set_engine("glm")

# Boosted-tree classifier fitted with the "xgboost" engine:
# 1000 trees, mtry of 3, and a learning rate of 0.05.
boost_mod <- boost_tree(mtry = 3, trees = 1000, learn_rate = 0.05) %>%
  set_mode("classification") %>%
  set_engine("xgboost")
# Bundle the shared recipe with each model specification so both models are
# preprocessed in exactly the same way.
log_wf <- workflow() %>%
  add_model(log_mod) %>%
  add_recipe(pokemon_rec)

boost_wf <- workflow() %>%
  add_model(boost_mod) %>%
  add_recipe(pokemon_rec)
# Fit each workflow on the same folds and score it with accuracy, so the two
# models are compared on identical resamples.
log_res <- log_wf %>%
  fit_resamples(
    resamples = pokemon_folds,
    metrics = metric_set(accuracy)
  )

boost_res <- boost_wf %>%
  fit_resamples(
    resamples = pokemon_folds,
    metrics = metric_set(accuracy)
  )

# Summarise the per-fold metrics and tag each summary with its model name.
log_summary <- log_res %>%
  collect_metrics() %>%
  mutate(model = "Logistic Regression")

boost_summary <- boost_res %>%
  collect_metrics() %>%
  mutate(model = "XGBoost")

# Side-by-side comparison: mean accuracy and its standard error per model.
bind_rows(log_summary, boost_summary) %>%
  select(model, mean, std_err)
library(vip)

# Refit the boosted-tree workflow on the full data set; the cross-validation
# fits above were only used to estimate accuracy.
boost_fit <- fit(boost_wf, data = pokemon)

# Pull the fitted parsnip model out of the workflow and plot variable
# importance. extract_fit_parsnip() replaces the deprecated
# pull_workflow_fit().
boost_fit %>%
  extract_fit_parsnip() %>%
  vip(num_features = 10)

Code‑along: Python

import pandas as pd

# Read the Pokemon data set and preview the first rows.
pokemon_df = pd.read_csv("data/pokemon-data.csv")
pokemon_df.head()

# Names of the predictor columns and of the outcome column.
independent_columns = ['height_m', 'weight_kg', 'hp']
dependent_column = 'early_gen'

# Predictors as a DataFrame, outcome as a Series.
X, y = pokemon_df[independent_columns], pokemon_df[dependent_column]

We will build two pipelines: one for a logistic regression model and one for a gradient-boosted tree model.

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

def _with_median_imputer(estimator):
    """Return a two-step pipeline: median imputation, then `estimator`.

    The steps are named 'imputer' and 'model' so the fitted estimator can be
    looked up later through `named_steps['model']`.
    """
    imputer = SimpleImputer(strategy='median')
    return Pipeline(steps=[('imputer', imputer), ('model', estimator)])


# Both models share the same preprocessing so their scores are comparable.
lr_pipeline = _with_median_imputer(LogisticRegression())

boost_pipeline = _with_median_imputer(
    GradientBoostingClassifier(n_estimators=1000, learning_rate=0.05)
)
from sklearn.model_selection import StratifiedKFold, cross_val_score

# One seed shared by the fold split and the boosted model.
random_seed = 22790

# The gradient boosting step is built without a random_state; pin it here so
# repeated runs give the same cross-validation scores and feature importances.
boost_pipeline.set_params(model__random_state=random_seed)

# Stratified 5-fold split: rows are shuffled once, reproducibly, before folding.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_seed)

# Both models are scored on the same folds. `scoring` can also be 'f1', the
# harmonic mean of precision and sensitivity (recall).
lr_scores = cross_val_score(lr_pipeline, X, y, cv=skf, scoring='accuracy')
boost_scores = cross_val_score(boost_pipeline, X, y, cv=skf, scoring='accuracy')
import numpy as np 

# Average accuracy across the cross-validation folds for each model.
print('logistic regression mean accuracy:', np.mean(lr_scores))
# The second model is a gradient boosting classifier, not a random forest,
# so the label names it accordingly.
print('gradient boosting mean accuracy:', np.mean(boost_scores))
# Refit the boosting pipeline on all rows, then take the fitted estimator out
# of the pipeline (fit() returns the pipeline itself, so the calls chain).
model = boost_pipeline.fit(X, y).named_steps['model']

# Pair each importance score with the name of its predictor column.
importances = model.feature_importances_
feature_importances = pd.Series(data=importances, index=independent_columns)

# Show the predictors from most to least important.
ranked_importances = feature_importances.sort_values(ascending=False)
print(ranked_importances)