12.1 Linear Regression and Classification

import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.io as pio
pio.templates.default = "plotly_white"
pd.options.plotting.backend = "plotly"

Objectives

  • explain how scaling features affects a linear regression’s coefficients
  • preprocess data to prepare it for modeling
  • explain how to use a linear model to make classifications

Example: Ames Housing again

Data link: ames_home_sales.csv

data = pd.read_csv("data/ames_home_sales.csv")
data['sale_price'] = data.eval("Sale_Price / 1000")
ames = data.query('Gr_Liv_Area < 4000 and Sale_Condition == "Normal"')
ames_train, ames_test = train_test_split(ames, random_state=42, test_size=0.2)
print(f"Training set size: {len(ames_train)} homes")
print(f"Test set size: {len(ames_test)} homes")
Training set size: 1929 homes
Test set size: 483 homes
def evaluate(y_true, y_pred):
    return pd.Series({
        'MAE': mean_absolute_error(y_true, y_pred),
        'MAPE': mean_absolute_percentage_error(y_true, y_pred),
    })

Fit linreg with all numeric features

target_column = 'sale_price'
feature_columns = [column for column in ames.columns if column != target_column]
print(f"Number of features: {len(feature_columns)}")
Number of features: 74
transformer = make_column_transformer(
    ('passthrough', make_column_selector(dtype_include=np.number)),
    verbose_feature_names_out=False
).set_output(transform='pandas')
example_transformed_output = transformer.fit_transform(ames_train[feature_columns])
example_transformed_output.head()
Lot_Frontage Lot_Area Year_Built Year_Remod_Add Mas_Vnr_Area BsmtFin_SF_1 BsmtFin_SF_2 Bsmt_Unf_SF Total_Bsmt_SF First_Flr_SF ... Enclosed_Porch Three_season_porch Screen_Porch Pool_Area Misc_Val Mo_Sold Year_Sold Sale_Price Longitude Latitude
2025 60 8730 1915 2003 0 7 0 698 698 698 ... 0 0 259 0 0 7 2007 153575 -93.625284 42.027088
2039 40 4800 1916 1990 0 4 0 999 1196 1196 ... 0 0 0 0 0 7 2007 109900 -93.625297 42.023518
1143 65 8450 2001 2001 0 3 0 355 827 827 ... 0 0 0 0 0 9 2008 190500 -93.688803 42.036089
1537 0 14100 1935 1997 632 6 0 536 728 1968 ... 0 0 0 0 0 5 2008 381000 -93.642597 42.016010
2589 85 15428 1951 1991 0 6 0 143 884 884 ... 0 0 195 0 0 6 2006 142000 -93.615474 42.038836

5 rows × 34 columns

linreg = LinearRegression()
pipeline = (
    make_pipeline(transformer, linreg)
    .fit(
        X=ames_train[feature_columns],
        y=ames_train[target_column]
    )
)
evaluate(ames_train[target_column], pipeline.predict(ames_train[feature_columns]))
MAE     5.577521e-14
MAPE    3.379632e-16
dtype: float64
evaluate(ames_test[target_column], pipeline.predict(ames_test[feature_columns]))
MAE     6.127144e-14
MAPE    3.855149e-16
dtype: float64

This model has a big problem—can you figure out what it is?

Coefficients

pd.DataFrame({
    'coefficient': linreg.coef_,
    'feature': example_transformed_output.columns
}).sort_values('coefficient').style.hide(axis='index')
coefficient feature
-0.000000 Bedroom_AbvGr
-0.000000 Half_Bath
-0.000000 BsmtFin_SF_1
-0.000000 Latitude
-0.000000 Full_Bath
-0.000000 Bsmt_Full_Bath
-0.000000 Bsmt_Half_Bath
-0.000000 Lot_Frontage
-0.000000 Year_Built
-0.000000 Year_Sold
-0.000000 First_Flr_SF
-0.000000 Second_Flr_SF
-0.000000 Mo_Sold
-0.000000 Three_season_porch
0.000000 Misc_Val
0.000000 Lot_Area
0.000000 Pool_Area
0.000000 BsmtFin_SF_2
0.000000 Garage_Area
0.000000 Bsmt_Unf_SF
0.000000 Wood_Deck_SF
0.000000 Screen_Porch
0.000000 Total_Bsmt_SF
0.000000 Mas_Vnr_Area
0.000000 Gr_Liv_Area
0.000000 Open_Porch_SF
0.000000 Enclosed_Porch
0.000000 Year_Remod_Add
0.000000 TotRms_AbvGrd
0.000000 Garage_Cars
0.000000 Fireplaces
0.000000 Kitchen_AbvGr
0.000000 Longitude
0.001000 Sale_Price
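
The problem: the raw Sale_Price column (the target, just in dollars rather than thousands of dollars) is still among the features, so the model can read the answer straight off that column as Sale_Price / 1000 and ignore everything else. A quick check (a small sketch reusing the objects above) confirms the leak:

# The "predictions" are just the leaked Sale_Price column divided by 1000.
print(np.allclose(
    pipeline.predict(ames_test[feature_columns]),
    ames_test["Sale_Price"] / 1000
))  # expected: True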

Fixed Model

data['sale_price'] = data.eval("Sale_Price / 1000")
del data['Sale_Price']
ames = data.query('Gr_Liv_Area < 4000 and Sale_Condition == "Normal"')
ames_train, ames_test = train_test_split(ames, random_state=42, test_size=0.2)
print(f"Training set size: {len(ames_train)} homes")
print(f"Test set size: {len(ames_test)} homes")
Training set size: 1929 homes
Test set size: 483 homes
target_column = 'sale_price'
feature_columns = [column for column in ames.columns if column != target_column]
print(f"Number of features: {len(feature_columns)}")
Number of features: 73
example_transformed_output = transformer.fit_transform(ames_train[feature_columns])
example_transformed_output.head()
Lot_Frontage Lot_Area Year_Built Year_Remod_Add Mas_Vnr_Area BsmtFin_SF_1 BsmtFin_SF_2 Bsmt_Unf_SF Total_Bsmt_SF First_Flr_SF ... Open_Porch_SF Enclosed_Porch Three_season_porch Screen_Porch Pool_Area Misc_Val Mo_Sold Year_Sold Longitude Latitude
2025 60 8730 1915 2003 0 7 0 698 698 698 ... 0 0 0 259 0 0 7 2007 -93.625284 42.027088
2039 40 4800 1916 1990 0 4 0 999 1196 1196 ... 0 0 0 0 0 0 7 2007 -93.625297 42.023518
1143 65 8450 2001 2001 0 3 0 355 827 827 ... 68 0 0 0 0 0 9 2008 -93.688803 42.036089
1537 0 14100 1935 1997 632 6 0 536 728 1968 ... 12 0 0 0 0 0 5 2008 -93.642597 42.016010
2589 85 15428 1951 1991 0 6 0 143 884 884 ... 0 0 0 195 0 0 6 2006 -93.615474 42.038836

5 rows × 33 columns

pipeline = (
    make_pipeline(transformer, linreg)
    .fit(
        X=ames_train[feature_columns],
        y=ames_train[target_column]
    )
)
evaluate(ames_train[target_column], pipeline.predict(ames_train[feature_columns]))
MAE     18.850437
MAPE     0.114801
dtype: float64
evaluate(ames_test[target_column], pipeline.predict(ames_test[feature_columns]))
MAE     19.213870
MAPE     0.115735
dtype: float64
pd.DataFrame({
    'coefficient': linreg.coef_,
    'feature': example_transformed_output.columns
}).sort_values('coefficient').style.hide(axis='index')
coefficient feature
-71.665185 Longitude
-28.509150 Kitchen_AbvGr
-12.480190 Bedroom_AbvGr
-3.278390 Half_Bath
-1.537291 Bsmt_Half_Bath
-0.356834 Full_Bath
-0.262162 Year_Sold
-0.130585 Mo_Sold
-0.049549 Pool_Area
-0.022937 Bsmt_Unf_SF
-0.019980 Three_season_porch
-0.013728 BsmtFin_SF_2
0.000162 Misc_Val
0.000468 Lot_Area
0.015786 Wood_Deck_SF
0.015850 Open_Porch_SF
0.018071 Garage_Area
0.020334 Enclosed_Porch
0.033388 Gr_Liv_Area
0.033920 Mas_Vnr_Area
0.040027 First_Flr_SF
0.040589 Second_Flr_SF
0.044038 Screen_Porch
0.049294 Total_Bsmt_SF
0.106604 Lot_Frontage
0.316885 Year_Built
0.475925 Year_Remod_Add
0.759080 BsmtFin_SF_1
2.456884 TotRms_AbvGrd
2.721155 Bsmt_Full_Bath
4.355288 Garage_Cars
6.240939 Fireplaces
173.744691 Latitude

Aside: some EDA

How does location affect price?

px.scatter(ames_train, x="Longitude", y="Latitude", color="sale_price", opacity=0.5)

We see some expensive homes in the northwest.

(
    px.scatter(ames_train, x="Gr_Liv_Area", y="sale_price", trendline="ols")
    .update_traces(marker_opacity=0.5)
)
(
    px.scatter(ames_train, x="Lot_Area", y="sale_price", trendline="ols")
    .update_traces(marker_size=2)
)

Scaling

  • Each coefficient: how much the model’s prediction changes when adding 1 to that feature.
  • But the features are on different scales
    • e.g. Lot_Area is in square feet: 1 square foot shouldn’t make much difference.
    • but 1 degree of Latitude is a huge difference
    • So the coefficient for Lot_Area would have to be much larger to have the same effect
  • Scaling the features puts the coefficients on a common scale, so a coefficient’s magnitude roughly indicates that feature’s importance

Standard Scaler

  • StandardScaler subtracts the mean and divides by the standard deviation
  • This makes the mean 0 and the standard deviation 1
  • So the coefficients reflect how much impact a one-standard-deviation change in the feature has on the model
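
As a quick check (a small sketch; Lot_Area is just one convenient numeric column), the scaler’s output matches doing the arithmetic by hand:

# Hand-rolled standardization vs. StandardScaler on a single column.
lot_area = ames_train[["Lot_Area"]]
by_hand = (lot_area - lot_area.mean()) / lot_area.std(ddof=0)  # scikit-learn uses the population std (ddof=0)
print(np.allclose(by_hand.to_numpy(), StandardScaler().fit_transform(lot_area)))  # expected: True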

Before scaling:

px.histogram(ames_train, x="Lot_Area")

After scaling:

px.histogram(StandardScaler().fit_transform(ames_train[["Lot_Area"]]))

Fit linreg with scaled features

transformer = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=np.number)),
    verbose_feature_names_out=False
).set_output(transform='pandas')
example_transformed_output = transformer.fit_transform(ames_train[feature_columns])
example_transformed_output.describe()
Lot_Frontage Lot_Area Year_Built Year_Remod_Add Mas_Vnr_Area BsmtFin_SF_1 BsmtFin_SF_2 Bsmt_Unf_SF Total_Bsmt_SF First_Flr_SF ... Open_Porch_SF Enclosed_Porch Three_season_porch Screen_Porch Pool_Area Misc_Val Mo_Sold Year_Sold Longitude Latitude
count 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 ... 1.929000e+03 1929.000000 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03
mean 2.946782e-17 -3.499303e-17 -2.431095e-16 2.210086e-15 -6.975585e-17 4.788520e-17 -1.105043e-17 -1.086626e-16 -1.381304e-16 -2.983616e-16 ... -1.841739e-17 0.000000 -4.143912e-18 2.394260e-17 -1.289217e-17 -2.578434e-17 -7.919476e-17 8.463364e-14 8.121146e-14 1.870801e-13
std 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 ... 1.000259e+00 1.000259 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00
min -1.628377e+00 -1.277895e+00 -3.316716e+00 -1.633718e+00 -5.524025e-01 -1.381565e+00 -3.149501e-01 -1.257265e+00 -2.515019e+00 -2.210675e+00 ... -7.186146e-01 -0.366553 -9.841904e-02 -2.873348e-01 -5.022095e-02 -9.468768e-02 -1.958047e+00 -1.407398e+00 -1.953488e+00 -2.639919e+00
25% -5.956124e-01 -3.851042e-01 -5.597878e-01 -8.946482e-01 -5.524025e-01 -9.332828e-01 -3.149501e-01 -7.617441e-01 -5.891262e-01 -7.264986e-01 ... -7.186146e-01 -0.366553 -9.841904e-02 -2.873348e-01 -5.022095e-02 -9.468768e-02 -8.162477e-01 -6.467171e-01 -7.099499e-01 -6.837438e-01
50% 1.420769e-01 -9.435963e-02 5.286277e-02 3.864055e-01 -5.524025e-01 -4.850003e-01 -3.149501e-01 -2.209596e-01 -1.322178e-01 -1.958435e-01 ... -3.306563e-01 -0.366553 -9.841904e-02 -2.873348e-01 -5.022095e-02 -9.468768e-02 -5.504789e-02 1.139642e-01 7.902661e-02 4.449880e-02
75% 6.141980e-01 2.034333e-01 9.718387e-01 9.283898e-01 2.980240e-01 1.308130e+00 -3.149501e-01 5.771144e-01 5.457752e-01 6.139582e-01 ... 3.806008e-01 -0.366553 -9.841904e-02 -2.873348e-01 -5.022095e-02 -9.468768e-02 7.061519e-01 8.746454e-01 7.950497e-01 7.821180e-01
max 7.607493e+00 2.271000e+01 1.380272e+00 1.322560e+00 8.831614e+00 1.308130e+00 8.189344e+00 4.307813e+00 5.360508e+00 5.787845e+00 ... 8.495397e+00 15.003108 1.675049e+01 8.625200e+00 2.738375e+01 2.750843e+01 2.228552e+00 1.635327e+00 2.476003e+00 1.651120e+00

8 rows × 33 columns

Notice that the mean is 0 and the standard deviation is 1.

pipeline = (
    make_pipeline(transformer, linreg)
    .fit(
        X=ames_train[feature_columns],
        y=ames_train[target_column]
    )
)
evaluate(ames_train[target_column], pipeline.predict(ames_train[feature_columns]))
MAE     18.850437
MAPE     0.114801
dtype: float64

Notice: the error didn’t change. Rescaling the features doesn’t change an (unregularized) linear regression’s predictions at all; the coefficients simply rescale to compensate.

evaluate(ames_test[target_column], pipeline.predict(ames_test[feature_columns]))
MAE     19.213870
MAPE     0.115735
dtype: float64

Coefficients

pd.DataFrame({
    'coefficient': linreg.coef_,
    'feature': example_transformed_output.columns
}).sort_values('coefficient').style.hide(axis='index')
coefficient feature
-10.052806 Bedroom_AbvGr
-9.627956 Bsmt_Unf_SF
-5.627494 Kitchen_AbvGr
-2.463411 BsmtFin_SF_2
-1.872343 Longitude
-1.639373 Half_Bath
-1.444905 Pool_Area
-0.482634 Three_season_porch
-0.370264 Bsmt_Half_Bath
-0.344641 Year_Sold
-0.343103 Mo_Sold
-0.194243 Full_Bath
0.090967 Misc_Val
0.980514 Open_Porch_SF
1.338853 Enclosed_Porch
1.403742 Bsmt_Full_Bath
1.693307 BsmtFin_SF_1
2.089566 Wood_Deck_SF
2.421134 Screen_Porch
3.113002 Latitude
3.163880 Garage_Cars
3.188761 Lot_Area
3.612784 Lot_Frontage
3.640026 Garage_Area
3.741238 TotRms_AbvGrd
4.069916 Fireplaces
5.783524 Mas_Vnr_Area
9.310250 Year_Built
9.659265 Year_Remod_Add
14.482290 First_Flr_SF
15.916276 Gr_Liv_Area
17.262078 Second_Flr_SF
20.066838 Total_Bsmt_SF

Each coef means: how much the model’s prediction changes when adding 1 standard deviation to that feature.
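
As a sanity check (a small sketch; Gr_Liv_Area is just one convenient feature), bumping a feature by one training-set standard deviation should move the prediction by roughly that feature’s coefficient:

# Add one (population) standard deviation of Gr_Liv_Area to a single home;
# the prediction should change by about the Gr_Liv_Area coefficient above (~15.9, in thousands of dollars).
example_home = ames_train[feature_columns].iloc[[0]].copy()
baseline_prediction = pipeline.predict(example_home)[0]
example_home["Gr_Liv_Area"] += ames_train["Gr_Liv_Area"].std(ddof=0)
print(pipeline.predict(example_home)[0] - baseline_prediction)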

Hyperparameters

  • Parameters are the parts of the model that are learned from data
    • coefficients in a linear regression, splits in a decision tree, …
  • Hyperparameters are the parts of the model that are set before training
    • depth of a decision tree, which variables are used, how they are transformed, …
    • not learned directly from data
    • but modeler can choose them based on data
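
To make the distinction concrete in scikit-learn terms (a small sketch reusing the fitted linreg from above): constructor arguments are hyperparameters, while the attributes that end in an underscore are parameters learned by .fit():

# Hyperparameters: chosen before fitting (constructor arguments).
print(LinearRegression().get_params())
# Parameters: learned from data by .fit() (attributes ending in "_").
print(linreg.coef_[:3], linreg.intercept_)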

Linear Classifiers

“Logistic Regression”

Load up a familiar example: blood test for autism

autism = pd.read_csv("data/autism.csv", skiprows=[1])
  • Last time we fit a decision tree. But that assumed hard cut-offs for each feature.
  • Linear models could say: each additional percentage oxidized (or whatever) leads to a 0.1% increase in risk of autism.

The real world isn’t black and white; linear models can help model the gray areas.

Set up as before:

autism_train, autism_test = train_test_split(autism, random_state=42, test_size=0.25)

feature_columns = list(autism.columns[1:-1])
target_column = "Group"

positive_outcome = "ASD"
negative_outcome = "NEU"

What it looks like

model = make_pipeline(
    StandardScaler(),
    LogisticRegression(C=4e-2, penalty='l1', solver='liblinear')
).fit(
    X=autism_train[feature_columns],
    y=autism_train[target_column] == positive_outcome
)
pd.DataFrame({
    'coefficient': model.named_steps["logisticregression"].coef_[0],
    'feature': feature_columns
}).sort_values("coefficient").query("coefficient != 0").plot.scatter(x='coefficient', y='feature')

How this classifier makes predictions

First, it predicts a risk score (negative = low risk, positive = high risk).

print(f"y = {model.named_steps['logisticregression'].intercept_[0]:.2f}", end="")
for coef, feature in zip(model.named_steps["logisticregression"].coef_[0], feature_columns):
    if coef != 0:
        print(f"+ {coef:.2f} * {feature}", end="")
y = 0.00+ 0.21 * 8-OHG+ -0.03 * tGSH/GSSG+ 0.16 * Chlorotyrosine+ 0.06 * Nitrotyrosine+ 0.57 * % oxidized

Note: these are the coefficients of the scaled features.

Example for the first row of the test set:

example_risk_score = model.decision_function(autism_test[feature_columns].iloc[[0]])
example_risk_score
array([1.05828559])

Then it uses the logistic function to turn risk score into probability:

def logistic(x):
    return 1 / (1 + np.exp(-x))
# show the logistic function. draw a line at 0.5
x = np.linspace(-10, 10, 100)
(
    px.line(x=x, y=logistic(x))
    .add_hline(y=0.5)
    .update_layout(xaxis_title="risk score", yaxis_title="probability")
)

For the example above:

logistic(example_risk_score)
array([0.74236278])
model.predict_proba(autism_test[feature_columns].iloc[[0]])
array([[0.25763722, 0.74236278]])

Intuition: Risk

  • First, linear model predicts a risk score (negative = low risk, positive = high risk).
    • Technically: log of the odds: log(prob of yes / prob of no)
    • Odds of 1: equal chance of yes and no; odds of 2: twice as likely to be yes as no
  • Then, logistic function turns risk score into probability
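
A quick check of the log-odds claim, reusing the example row from above (a small sketch):

# The risk score from decision_function is exactly the log-odds of the predicted probability.
p = model.predict_proba(autism_test[feature_columns].iloc[[0]])[0, 1]
print(np.log(p / (1 - p)))  # ≈ 1.058, the same as example_risk_score above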

Other examples:

  • Risk of a customer churning
  • Risk of a transaction being fraudulent
  • Chance of one team winning: difference in skill scores (Elo)

Intuition: Logistic function

  • Monotonic: higher risk score = higher probability
  • but non-linear: as risk score increases, probability increases more slowly
    • twice the risk score does not mean twice the probability
    • when probability is near 0.5, a small change in risk score can make a big change in probability (tipping between a yes or no decision, so it’s important to get it right)
    • near 1, a small change in risk score doesn’t change probability much (already pretty sure of the answer)
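
A quick numerical illustration of that flattening, using the logistic function defined above (a small sketch):

# Near a risk score of 0, one unit moves the probability a lot; far from 0, it barely moves.
print(logistic(1) - logistic(0))  # ≈ 0.23
print(logistic(5) - logistic(4))  # ≈ 0.011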

Hyperparameters: regularization

  • Notice that we had many possible features, but only a few coefficients are non-zero
  • We used a penalty to encourage the model to use fewer features
  • Without the penalty, we would have a coefficient for each feature
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(penalty=None)
).fit(
    X=autism_train[feature_columns],
    y=autism_train[target_column] == positive_outcome
)
pd.DataFrame({
    'coefficient': model.named_steps["logisticregression"].coef_[0],
    'feature': feature_columns
}).sort_values("coefficient").plot.scatter(x='coefficient', y='feature')

Aside: Perhaps penalty=None should have been the default; see Scikit-learn’s Defaults are Wrong – r y x, r. Also, I don’t know why penalty=None gives a qualitatively different set of coefs from a tiny penalty (i.e., a very large C, since C is the inverse of the regularization strength).

Types of Regularization

  • L1: penalty is sum of absolute values of coefficients
    • encourages some coefficients to be exactly 0: only include features that are worth it.
  • L2: penalty is sum of squares of coefficients
    • discourages large coefficients (no one feature can “drive” the decision)
  • Elastic Net: combination of L1 and L2

Apply to both linear regression and linear classification (“logistic regression”).
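
As a rough comparison (a sketch reusing C=4e-2 from earlier; the exact counts depend on the data and solver), an L1 fit zeroes out most coefficients while an L2 fit at the same C only shrinks them:

# Count non-zero coefficients under L1 vs. L2 regularization at the same C.
l1_fit = make_pipeline(
    StandardScaler(),
    LogisticRegression(C=4e-2, penalty='l1', solver='liblinear')
).fit(autism_train[feature_columns], autism_train[target_column] == positive_outcome)
l2_fit = make_pipeline(
    StandardScaler(),
    LogisticRegression(C=4e-2, penalty='l2')
).fit(autism_train[feature_columns], autism_train[target_column] == positive_outcome)
print("L1 non-zero:", (l1_fit.named_steps["logisticregression"].coef_[0] != 0).sum())
print("L2 non-zero:", (l2_fit.named_steps["logisticregression"].coef_[0] != 0).sum())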

A nice visual description of the difference between L1 and L2: Some things you (maybe) didn’t know about Linear Regression – r y x, r

But how do we pick good values for the hyperparameters?

  • Penalties will make the model work worse on the training set, but (hopefully) better on unseen data.
  • So we’ll use a validation set to tune the hyperparameters.
  • …but how will we know how well our tuned model will work on actually unseen data?

Cross-Validation

What problem does cross-validation solve?

  • We want to use a validation set to tune hyperparameters
  • But we also want to use a validation set to evaluate the model
  • If we use the same validation set for both, we’ll overfit to the validation set
  • So we need to use different validation sets for tuning and evaluation
  • But we don’t have that many data points
  • So we need to use some of the data for both tuning and evaluation

Leave-One-Out CV

Train on all but one data point, then evaluate on the one left out. Repeat for each data point.

from sklearn.model_selection import LeaveOneOut, cross_val_score
scores = cross_val_score(
    model, 
    X=autism_train[feature_columns],
    y=autism_train[target_column] == positive_outcome,
    cv=LeaveOneOut())
scores
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1.])
scores.mean()
0.9090909090909091

K-Fold CV

Training a different model for each left-out data point can take a long time. So: divide the data into k groups (“folds”), and leave out one group at a time.
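
For example, 5-fold cross-validation trains only 5 models instead of one per data point (a sketch; the exact scores will differ a bit from leave-one-out):

from sklearn.model_selection import KFold
kfold_scores = cross_val_score(
    model,
    X=autism_train[feature_columns],
    y=autism_train[target_column] == positive_outcome,
    cv=KFold(n_splits=5, shuffle=True, random_state=42))
print(kfold_scores, kfold_scores.mean())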