Code Along
/module-4
Loading, setting up: create a .py file in /module-4 and run this code
::: panel-tabset
# Star Wars data
# Star Wars data: load the dataset and plot a histogram of character heights.
import seaborn as sns
import matplotlib.pyplot as plt  # required for figure/labels/show below; was never imported

# NOTE(review): 'starwars' is not one of seaborn's bundled example datasets
# (it originates in R's dplyr package) — sns.load_dataset() will raise a
# ValueError unless a matching CSV is present in the local seaborn-data
# cache. Confirm the intended data source.
starwars = sns.load_dataset('starwars')

# Plot 1: Histogram of heights (raw counts, no KDE overlay)
plt.figure(figsize=(10, 6))
sns.histplot(starwars['height'], kde=False)
plt.xlabel('Height')
plt.ylabel('Count')
plt.title('Distribution of Heights')
plt.show()
# Plot 2: scatter of height vs mass, with each point labelled by name.
import matplotlib.pyplot as plt  # was never imported anywhere in the file

plt.figure(figsize=(10, 6))
sns.scatterplot(data=starwars, x='height', y='mass')
# Label every point. The original loop body was not indented under the
# `for` (an IndentationError as written); it also re-indexed each Series
# by position on every pass — iterating the three columns together with
# zip() fixes both.
for label, h, m in zip(starwars['name'], starwars['height'], starwars['mass']):
    plt.annotate(label, (h, m), fontsize=9)
plt.xlabel('Height')
plt.ylabel('Mass')
plt.title('Height vs Mass with Labels')
plt.show()
# Cross-validation setup (v-fold / k-fold).
# These three imports were missing: KFold, cross_val_score, and
# RandomForestClassifier were used below but never brought into scope.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

# Synthetic binary classification problem: 1000 samples, 20 features.
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# 20 folds over 1000 samples -> 50 held-out samples per fold; shuffle so
# fold membership is randomized (seeded for reproducibility).
kf = KFold(n_splits=20, shuffle=True, random_state=42)
# (The original also built `kf_splits = list(kf.split(X))`, which was
# never used — dropped.)

# Random Forest model to evaluate.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Cross-validation and metric collection: one accuracy score per fold.
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
print('Cross-validation accuracy scores:', cv_scores)
print('Mean cross-validation accuracy:', cv_scores.mean())
# Fit the model on the entire dataset, then explain it with SHAP.
import shap  # was used below but never imported

model.fit(X, y)
# TreeExplainer computes exact SHAP values for tree ensembles.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
# Plot feature importance: bar chart of mean |SHAP| for the top 10 features.
shap.summary_plot(shap_values, X, plot_type="bar", max_display=10)