Code Along
/module-3
Loading, setting up: create a .py file in /lab-1 and run this code
Model building with training data
Model evaluating with testing data
Only run this once you’re done training/tuning your model! This way, the performance estimates will be unbiased.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Expects a pandas DataFrame named 'starwars' to already be in scope.
# Work on a copy so the original frame is left untouched.
starwars_recoded = starwars.copy()
# Binary target label: 'human' only for an exact 'Human' species value;
# every other value (including missing ones) becomes 'not human'.
starwars_recoded['species_human'] = [
    'human' if species == 'Human' else 'not human'
    for species in starwars_recoded['species']
]
# Split data
feature_cols = ['height', 'mass', 'birth_year', 'eye_color']
# The numeric features are passed through to LogisticRegression unchanged, and
# scikit-learn estimators raise a ValueError on NaN inputs. Keep only rows with
# a value for every feature so that fitting and predicting cannot fail on
# missing data.
complete_rows = starwars_recoded.dropna(subset=feature_cols)
X = complete_rows[feature_cols]
y = complete_rows['species_human']
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Engineer features and specify model
preprocessor = ColumnTransformer(
    transformers=[
        # Numeric columns go to the classifier unchanged.
        ('num', 'passthrough', ['height', 'mass', 'birth_year']),
        # One-hot encode eye colour, dropping the first level as the reference.
        # handle_unknown='ignore' encodes a colour that appears only in the
        # test split as all zeros instead of raising at predict time.
        # NOTE: combining drop= with handle_unknown='ignore' needs
        # scikit-learn >= 1.0.
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), ['eye_color']),
    ]
)
# Chain preprocessing and the classifier so both are fitted on the training
# data only and re-applied identically at prediction time.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])
# Train the full pipeline (preprocessing + classifier) on the training split.
model.fit(X_train, y_train)
# Score the held-out split; 'human' is treated as the positive class for the
# class-specific metrics.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, pos_label='human')
    for metric in (precision_score, recall_score, f1_score)
)
# Report each metric on its own line, rounded to two decimals.
print(
    f"Accuracy: {accuracy:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1 Score: {f1:.2f}",
    sep="\n",
)
# Aside: Joins (using pandas)
# No join key is given, so pandas joins on the columns the two frames share.
# Inner join: keep only rows whose key appears in both frames.
merged_inner = band_members.merge(band_instruments, how='inner')
# Left join: keep every row of band_members, filling unmatched columns with NaN.
merged_left = band_members.merge(band_instruments, how='left')