How to Cross-Validate and Change the Model

Code Along

Getting started

Process

  • Again, create a .R file in /module-4
  • Then, copy, paste, and run the code from this presentation as we talk through each step

Quick discussion

  • Which feature engineering steps do you think are most important for the OULAD data we have been using?

Code-along: R

library(tidyverse)

ggplot(starwars, aes(x = height)) +
    geom_histogram()

library(ggrepel)

ggplot(starwars, aes(x = height, y = mass, label = name)) +
    geom_point() +
    geom_label_repel()

Loading, setting up: create a .R file in /module-4 and run this code

library(tidyverse)
library(tidymodels)

starwars_recoded <- starwars %>% # starwars is a built-in dataset, available just by typing its name
    mutate(species_human = as.factor(ifelse(species == "Human", "human", "not human"))) # classification needs a factor outcome

starwars_recoded %>% 
    count(species_human) # how many humans are there? note the NA rows where species is missing
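Because ifelse() passes missing values through, characters whose species is NA end up as NA in species_human. A minimal sketch of one way to handle this before modeling (dropping those rows; an optional step, not part of the original code-along):

starwars_recoded <- starwars_recoded %>% 
    drop_na(species_human) # remove characters whose species is unknown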
train_test_split <- initial_split(starwars_recoded, prop = .70)

data_train <- training(train_test_split)
# during Step 1
kfcv <- vfold_cv(data_train, v = 10) # this is what differs from what we did before
# previously, we simply used data_train to fit our model once
kfcv
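If you want to peek inside one of the 10 resamples, rsample's analysis() and assessment() functions pull out the two parts of a fold (purely for intuition; not needed for the rest of the code-along):

first_fold <- kfcv$splits[[1]]
dim(analysis(first_fold))   # roughly 9/10 of data_train, used to fit the model
dim(assessment(first_fold)) # the held-out 1/10, used to evaluate it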
# predicting humans based on the variables we used in LL1, plus birth_year and eye_color
my_rec <- recipe(species_human ~ height + mass + birth_year + eye_color, data = data_train) %>% 
    step_novel(eye_color) # handle factor levels in new data that were not seen during training
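To inspect what the recipe produces, prep() and bake() can be run on it directly; the commented suggestions below (step_dummy(), step_impute_median()) are optional additions, not part of the original slides:

my_rec %>% 
    prep() %>%              # estimate the recipe from data_train
    bake(new_data = NULL)   # view the processed training data

# to one-hot encode eye_color, add step_dummy(eye_color) after step_novel();
# if fitting errors on missing values, step_impute_median(all_numeric_predictors())
# is one way to handle the NAs in height, mass, and birth_year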
# during Step 3
my_mod <- rand_forest() %>% # different
    set_engine("ranger", importance = "impurity") %>% # different and with importance
    set_mode("classification")
class_metrics <- metric_set(accuracy, sensitivity, specificity, ppv, npv, kap) # this is new

my_wf <- workflow() %>% # bundle the recipe and model into a workflow
    add_recipe(my_rec) %>% 
    add_model(my_mod)

fitted_model_resamples <- fit_resamples(my_wf, 
                                        resamples = kfcv, # different
                                        metrics = class_metrics)
collect_metrics(fitted_model_resamples)
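collect_metrics() averages each metric across the 10 folds by default; passing summarize = FALSE returns the fold-by-fold values instead, which is useful for seeing how much the estimates vary:

collect_metrics(fitted_model_resamples, summarize = FALSE)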
final_fit <- last_fit(my_wf, train_test_split, metrics = class_metrics)

collect_metrics(final_fit)

library(vip) # for variable importance plots

final_fit %>% 
    pluck(".workflow", 1) %>%   
    pull_workflow_fit() %>% 
    vip(num_features = 10)
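Beyond the summary metrics and the importance plot, the held-out test-set predictions from last_fit() can also be pulled out and cross-tabulated; a short sketch using collect_predictions() and yardstick's conf_mat():

final_fit %>% 
    collect_predictions() %>% 
    conf_mat(truth = species_human, estimate = .pred_class)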

Code-along: Python


Loading, setting up

# Imports for data handling, visualization, and modeling
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
import shap

# Star Wars data: seaborn has no built-in 'starwars' dataset, so we read a CSV
# exported from R's dplyr::starwars (the file name here is an assumption)
starwars = pd.read_csv('starwars.csv')

Plotting with matplotlib


# Plot 1: Histogram
plt.figure(figsize=(10, 6))
sns.histplot(starwars['height'], kde=False)
plt.xlabel('Height')
plt.ylabel('Count')
plt.title('Distribution of Heights')
plt.show()

# Plot 2: Scatter plot with labels
plt.figure(figsize=(10, 6))
sns.scatterplot(data=starwars, x='height', y='mass')
# label each point with the character's name, skipping rows with missing height/mass
for _, row in starwars.dropna(subset=['height', 'mass']).iterrows():
    plt.annotate(row['name'], (row['height'], row['mass']), fontsize=9)
plt.xlabel('Height')
plt.ylabel('Mass')
plt.title('Height vs Mass with Labels')
plt.show()
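The labels above will overlap wherever characters cluster together. A rough Python analog of ggrepel is the third-party adjustText package (an extra dependency and an assumption about your environment; not part of the original code-along):

from adjustText import adjust_text

plt.figure(figsize=(10, 6))
labeled = starwars.dropna(subset=['height', 'mass'])
sns.scatterplot(data=labeled, x='height', y='mass')
texts = [plt.text(row['height'], row['mass'], row['name'], fontsize=9)
         for _, row in labeled.iterrows()]
adjust_text(texts, arrowprops=dict(arrowstyle='-', color='gray'))
plt.show()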

Cross-validation and RF modeling

# Cross-validation setup (k-fold, mirroring vfold_cv() in the R code-along)
# A synthetic dataset stands in for the recoded Star Wars data here
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
kf = KFold(n_splits=10, shuffle=True, random_state=42)
kf_splits = list(kf.split(X))  # list of (train_indices, test_indices) pairs, one per fold

# Random Forest Model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Cross-validation and metric collection
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
print('Cross-validation accuracy scores:', cv_scores)
print('Mean cross-validation accuracy:', cv_scores.mean())
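The R code-along collected several metrics at once with metric_set(). A rough scikit-learn counterpart is cross_validate() with a list of built-in scorers (a sketch; the particular scorers chosen here are just examples):

from sklearn.model_selection import cross_validate

multi_scores = cross_validate(model, X, y, cv=kf,
                              scoring=['accuracy', 'precision', 'recall', 'balanced_accuracy'])
for name, values in multi_scores.items():
    if name.startswith('test_'):
        print(name, values.mean())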

# Fit the model on the entire dataset for feature importance
model.fit(X, y)
explainer = shap.TreeExplainer(model)
# for a classifier, shap_values is a list with one array per class (or a 3-D array
# in newer shap versions); the bar summary plot below accepts the list form
shap_values = explainer.shap_values(X)

# Plot feature importance
shap.summary_plot(shap_values, X, plot_type="bar", max_display=10)
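As a cross-check on the SHAP ranking, scikit-learn's random forest also exposes impurity-based importances directly, much like the importance = "impurity" setting used with ranger in the R code-along:

import numpy as np

# impurity-based (Gini) importances from the fitted forest
importances = model.feature_importances_
top = np.argsort(importances)[::-1][:10]
for idx in top:
    print(f'feature_{idx}: {importances[idx]:.3f}')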