12.1 Linear Regression and Classification

import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.io as pio
pio.templates.default = "plotly_white"
pd.options.plotting.backend = "plotly"

Objectives

  • explain how scaling features affects a linear regression’s coefficients
  • preprocess data to prepare it for modeling
  • explain how to use a linear model to make classifications

Example: Ames Housing again

Data link: ames_home_sales.csv

data = pd.read_csv("data/ames_home_sales.csv")
data['sale_price'] = data.eval("Sale_Price / 1000")
ames = data.query('Gr_Liv_Area < 4000 and Sale_Condition == "Normal"')
ames_train, ames_test = train_test_split(ames, random_state=42, test_size=0.2)
print(f"Training set size: {len(ames_train)} homes")
print(f"Test set size: {len(ames_test)} homes")
Training set size: 1929 homes
Test set size: 483 homes
def evaluate(y_true, y_pred):
    return pd.Series({
        'MAE': mean_absolute_error(y_true, y_pred),
        'MAPE': mean_absolute_percentage_error(y_true, y_pred),
    })

Fit linreg with all numeric features

target_column = 'sale_price'
feature_columns = [column for column in ames.columns if column != target_column]
print(f"Number of features: {len(feature_columns)}")
Number of features: 74
transformer = make_column_transformer(
    ('passthrough', make_column_selector(dtype_include=np.number)),
    verbose_feature_names_out=False
).set_output(transform='pandas')
example_transformed_output = transformer.fit_transform(ames_train[feature_columns])
example_transformed_output.head()
Lot_Frontage Lot_Area Year_Built Year_Remod_Add Mas_Vnr_Area BsmtFin_SF_1 BsmtFin_SF_2 Bsmt_Unf_SF Total_Bsmt_SF First_Flr_SF ... Enclosed_Porch Three_season_porch Screen_Porch Pool_Area Misc_Val Mo_Sold Year_Sold Sale_Price Longitude Latitude
2025 60 8730 1915 2003 0 7 0 698 698 698 ... 0 0 259 0 0 7 2007 153575 -93.625284 42.027088
2039 40 4800 1916 1990 0 4 0 999 1196 1196 ... 0 0 0 0 0 7 2007 109900 -93.625297 42.023518
1143 65 8450 2001 2001 0 3 0 355 827 827 ... 0 0 0 0 0 9 2008 190500 -93.688803 42.036089
1537 0 14100 1935 1997 632 6 0 536 728 1968 ... 0 0 0 0 0 5 2008 381000 -93.642597 42.016010
2589 85 15428 1951 1991 0 6 0 143 884 884 ... 0 0 195 0 0 6 2006 142000 -93.615474 42.038836

5 rows × 34 columns

linreg = LinearRegression()
pipeline = (
    make_pipeline(transformer, linreg)
    .fit(
        X=ames_train[feature_columns],
        y=ames_train[target_column]
    )
)
evaluate(ames_train[target_column], pipeline.predict(ames_train[feature_columns]))
MAE     5.577521e-14
MAPE    3.379632e-16
dtype: float64
evaluate(ames_test[target_column], pipeline.predict(ames_test[feature_columns]))
MAE     6.127144e-14
MAPE    3.855149e-16
dtype: float64

This model has a big problem—can you figure out what it is?

Coefficients

pd.DataFrame({
    'coefficient': linreg.coef_,
    'feature': example_transformed_output.columns
}).sort_values('coefficient').style.hide(axis='index')
coefficient feature
-0.000000 Bedroom_AbvGr
-0.000000 Half_Bath
-0.000000 BsmtFin_SF_1
-0.000000 Latitude
-0.000000 Full_Bath
-0.000000 Bsmt_Full_Bath
-0.000000 Bsmt_Half_Bath
-0.000000 Lot_Frontage
-0.000000 Year_Built
-0.000000 Year_Sold
-0.000000 First_Flr_SF
-0.000000 Second_Flr_SF
-0.000000 Mo_Sold
-0.000000 Three_season_porch
0.000000 Misc_Val
0.000000 Lot_Area
0.000000 Pool_Area
0.000000 BsmtFin_SF_2
0.000000 Garage_Area
0.000000 Bsmt_Unf_SF
0.000000 Wood_Deck_SF
0.000000 Screen_Porch
0.000000 Total_Bsmt_SF
0.000000 Mas_Vnr_Area
0.000000 Gr_Liv_Area
0.000000 Open_Porch_SF
0.000000 Enclosed_Porch
0.000000 Year_Remod_Add
0.000000 TotRms_AbvGrd
0.000000 Garage_Cars
0.000000 Fireplaces
0.000000 Kitchen_AbvGr
0.000000 Longitude
0.001000 Sale_Price
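
The problem: the raw Sale_Price column (the target, just in dollars rather than thousands of dollars) is still among the features, so the model can read the answer straight off that column as Sale_Price / 1000 and ignore everything else. A quick check (a small sketch reusing the objects above) confirms the leak:

# The "predictions" are just the leaked Sale_Price column divided by 1000.
print(np.allclose(
    pipeline.predict(ames_test[feature_columns]),
    ames_test["Sale_Price"] / 1000
))  # expected: True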

Fixed Model

data['sale_price'] = data.eval("Sale_Price / 1000")
del data['Sale_Price']
ames = data.query('Gr_Liv_Area < 4000 and Sale_Condition == "Normal"')
ames_train, ames_test = train_test_split(ames, random_state=42, test_size=0.2)
print(f"Training set size: {len(ames_train)} homes")
print(f"Test set size: {len(ames_test)} homes")
Training set size: 1929 homes
Test set size: 483 homes
target_column = 'sale_price'
feature_columns = [column for column in ames.columns if column != target_column]
print(f"Number of features: {len(feature_columns)}")
Number of features: 73
example_transformed_output = transformer.fit_transform(ames_train[feature_columns])
example_transformed_output.head()
Lot_Frontage Lot_Area Year_Built Year_Remod_Add Mas_Vnr_Area BsmtFin_SF_1 BsmtFin_SF_2 Bsmt_Unf_SF Total_Bsmt_SF First_Flr_SF ... Open_Porch_SF Enclosed_Porch Three_season_porch Screen_Porch Pool_Area Misc_Val Mo_Sold Year_Sold Longitude Latitude
2025 60 8730 1915 2003 0 7 0 698 698 698 ... 0 0 0 259 0 0 7 2007 -93.625284 42.027088
2039 40 4800 1916 1990 0 4 0 999 1196 1196 ... 0 0 0 0 0 0 7 2007 -93.625297 42.023518
1143 65 8450 2001 2001 0 3 0 355 827 827 ... 68 0 0 0 0 0 9 2008 -93.688803 42.036089
1537 0 14100 1935 1997 632 6 0 536 728 1968 ... 12 0 0 0 0 0 5 2008 -93.642597 42.016010
2589 85 15428 1951 1991 0 6 0 143 884 884 ... 0 0 0 195 0 0 6 2006 -93.615474 42.038836

5 rows × 33 columns

pipeline = (
    make_pipeline(transformer, linreg)
    .fit(
        X=ames_train[feature_columns],
        y=ames_train[target_column]
    )
)
evaluate(ames_train[target_column], pipeline.predict(ames_train[feature_columns]))
MAE     18.850437
MAPE     0.114801
dtype: float64
evaluate(ames_test[target_column], pipeline.predict(ames_test[feature_columns]))
MAE     19.213870
MAPE     0.115735
dtype: float64
pd.DataFrame({
    'coefficient': linreg.coef_,
    'feature': example_transformed_output.columns
}).sort_values('coefficient').style.hide(axis='index')
coefficient feature
-71.665185 Longitude
-28.509150 Kitchen_AbvGr
-12.480190 Bedroom_AbvGr
-3.278390 Half_Bath
-1.537291 Bsmt_Half_Bath
-0.356834 Full_Bath
-0.262162 Year_Sold
-0.130585 Mo_Sold
-0.049549 Pool_Area
-0.022937 Bsmt_Unf_SF
-0.019980 Three_season_porch
-0.013728 BsmtFin_SF_2
0.000162 Misc_Val
0.000468 Lot_Area
0.015786 Wood_Deck_SF
0.015850 Open_Porch_SF
0.018071 Garage_Area
0.020334 Enclosed_Porch
0.033388 Gr_Liv_Area
0.033920 Mas_Vnr_Area
0.040027 First_Flr_SF
0.040589 Second_Flr_SF
0.044038 Screen_Porch
0.049294 Total_Bsmt_SF
0.106604 Lot_Frontage
0.316885 Year_Built
0.475925 Year_Remod_Add
0.759080 BsmtFin_SF_1
2.456884 TotRms_AbvGrd
2.721155 Bsmt_Full_Bath
4.355288 Garage_Cars
6.240939 Fireplaces
173.744691 Latitude

Aside: some EDA

How does location affect price?

px.scatter(ames_train, x="Longitude", y="Latitude", color="sale_price", opacity=0.5)

We see some expensive homes in the northwest.

(
    px.scatter(ames_train, x="Gr_Liv_Area", y="sale_price", trendline="ols")
    .update_traces(marker_opacity=0.5)
)
(
    px.scatter(ames_train, x="Lot_Area", y="sale_price", trendline="ols")
    .update_traces(marker_size=2)
)

Scaling

  • Each coefficient: how much the model’s prediction changes when adding 1 to that feature.
  • But the features are on different scales
    • e.g. Lot_Area is in square feet: 1 square foot shouldn’t make much difference.
    • but 1 degree of Latitude is a huge difference
    • So the coefficient for Lot_Area would have to be much larger to have the same effect
  • Scaling the features puts the coefficients on a common scale, so a coefficient’s magnitude roughly indicates that feature’s importance

Standard Scaler

  • StandardScaler subtracts the mean and divides by the standard deviation
  • This makes the mean 0 and the standard deviation 1
  • So the coefficients reflect how much impact a one-standard-deviation change in the feature has on the model
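
As a quick check (a small sketch; Lot_Area is just one convenient numeric column), the scaler’s output matches doing the arithmetic by hand:

# Hand-rolled standardization vs. StandardScaler on a single column.
lot_area = ames_train[["Lot_Area"]]
by_hand = (lot_area - lot_area.mean()) / lot_area.std(ddof=0)  # scikit-learn uses the population std (ddof=0)
print(np.allclose(by_hand.to_numpy(), StandardScaler().fit_transform(lot_area)))  # expected: True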

Before scaling:

px.histogram(ames_train, x="Lot_Area")

After scaling:

px.histogram(StandardScaler().fit_transform(ames_train[["Lot_Area"]]))

Fit linreg with scaled features

transformer = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=np.number)),
    verbose_feature_names_out=False
).set_output(transform='pandas')
example_transformed_output = transformer.fit_transform(ames_train[feature_columns])
example_transformed_output.describe()
Lot_Frontage Lot_Area Year_Built Year_Remod_Add Mas_Vnr_Area BsmtFin_SF_1 BsmtFin_SF_2 Bsmt_Unf_SF Total_Bsmt_SF First_Flr_SF ... Open_Porch_SF Enclosed_Porch Three_season_porch Screen_Porch Pool_Area Misc_Val Mo_Sold Year_Sold Longitude Latitude
count 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 ... 1.929000e+03 1929.000000 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03 1.929000e+03
mean 2.946782e-17 -3.499303e-17 -2.431095e-16 2.210086e-15 -6.975585e-17 4.788520e-17 -1.105043e-17 -1.086626e-16 -1.381304e-16 -2.983616e-16 ... -1.841739e-17 0.000000 -4.143912e-18 2.394260e-17 -1.289217e-17 -2.578434e-17 -7.919476e-17 8.463364e-14 8.121146e-14 1.870801e-13
std 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 ... 1.000259e+00 1.000259 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00 1.000259e+00
min -1.628377e+00 -1.277895e+00 -3.316716e+00 -1.633718e+00 -5.524025e-01 -1.381565e+00 -3.149501e-01 -1.257265e+00 -2.515019e+00 -2.210675e+00 ... -7.186146e-01 -0.366553 -9.841904e-02 -2.873348e-01 -5.022095e-02 -9.468768e-02 -1.958047e+00 -1.407398e+00 -1.953488e+00 -2.639919e+00
25% -5.956124e-01 -3.851042e-01 -5.597878e-01 -8.946482e-01 -5.524025e-01 -9.332828e-01 -3.149501e-01 -7.617441e-01 -5.891262e-01 -7.264986e-01 ... -7.186146e-01 -0.366553 -9.841904e-02 -2.873348e-01 -5.022095e-02 -9.468768e-02 -8.162477e-01 -6.467171e-01 -7.099499e-01 -6.837438e-01
50% 1.420769e-01 -9.435963e-02 5.286277e-02 3.864055e-01 -5.524025e-01 -4.850003e-01 -3.149501e-01 -2.209596e-01 -1.322178e-01 -1.958435e-01 ... -3.306563e-01 -0.366553 -9.841904e-02 -2.873348e-01 -5.022095e-02 -9.468768e-02 -5.504789e-02 1.139642e-01 7.902661e-02 4.449880e-02
75% 6.141980e-01 2.034333e-01 9.718387e-01 9.283898e-01 2.980240e-01 1.308130e+00 -3.149501e-01 5.771144e-01 5.457752e-01 6.139582e-01 ... 3.806008e-01 -0.366553 -9.841904e-02 -2.873348e-01 -5.022095e-02 -9.468768e-02 7.061519e-01 8.746454e-01 7.950497e-01 7.821180e-01
max 7.607493e+00 2.271000e+01 1.380272e+00 1.322560e+00 8.831614e+00 1.308130e+00 8.189344e+00 4.307813e+00 5.360508e+00 5.787845e+00 ... 8.495397e+00 15.003108 1.675049e+01 8.625200e+00 2.738375e+01 2.750843e+01 2.228552e+00 1.635327e+00 2.476003e+00 1.651120e+00

8 rows × 33 columns

Notice that the mean is 0 and the standard deviation is 1.

pipeline = (
    make_pipeline(transformer, linreg)
    .fit(
        X=ames_train[feature_columns],
        y=ames_train[target_column]
    )
)
evaluate(ames_train[target_column], pipeline.predict(ames_train[feature_columns]))
MAE     18.850437
MAPE     0.114801
dtype: float64

Notice: the error didn’t change. Rescaling the features doesn’t change an (unregularized) linear regression’s predictions at all; the coefficients simply rescale to compensate.

evaluate(ames_test[target_column], pipeline.predict(ames_test[feature_columns]))
MAE     19.213870
MAPE     0.115735
dtype: float64

Coefficients

pd.DataFrame({
    'coefficient': linreg.coef_,
    'feature': example_transformed_output.columns
}).sort_values('coefficient').style.hide(axis='index')
coefficient feature
-10.052806 Bedroom_AbvGr
-9.627956 Bsmt_Unf_SF
-5.627494 Kitchen_AbvGr
-2.463411 BsmtFin_SF_2
-1.872343 Longitude
-1.639373 Half_Bath
-1.444905 Pool_Area
-0.482634 Three_season_porch
-0.370264 Bsmt_Half_Bath
-0.344641 Year_Sold
-0.343103 Mo_Sold
-0.194243 Full_Bath
0.090967 Misc_Val
0.980514 Open_Porch_SF
1.338853 Enclosed_Porch
1.403742 Bsmt_Full_Bath
1.693307 BsmtFin_SF_1
2.089566 Wood_Deck_SF
2.421134 Screen_Porch
3.113002 Latitude
3.163880 Garage_Cars
3.188761 Lot_Area
3.612784 Lot_Frontage
3.640026 Garage_Area
3.741238 TotRms_AbvGrd
4.069916 Fireplaces
5.783524 Mas_Vnr_Area
9.310250 Year_Built
9.659265 Year_Remod_Add
14.482290 First_Flr_SF
15.916276 Gr_Liv_Area
17.262078 Second_Flr_SF
20.066838 Total_Bsmt_SF

Each coef means: how much the model’s prediction changes when adding 1 standard deviation to that feature.
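
As a sanity check (a small sketch; Gr_Liv_Area is just one convenient feature), bumping a feature by one training-set standard deviation should move the prediction by roughly that feature’s coefficient:

# Add one (population) standard deviation of Gr_Liv_Area to a single home;
# the prediction should change by about the Gr_Liv_Area coefficient above (~15.9, in thousands of dollars).
example_home = ames_train[feature_columns].iloc[[0]].copy()
baseline_prediction = pipeline.predict(example_home)[0]
example_home["Gr_Liv_Area"] += ames_train["Gr_Liv_Area"].std(ddof=0)
print(pipeline.predict(example_home)[0] - baseline_prediction)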

Hyperparameters

  • Parameters are the parts of the model that are learned from data
    • coefficients in a linear regression, splits in a decision tree, …
  • Hyperparameters are the parts of the model that are set before training
    • depth of a decision tree, which variables are used, how they are transformed, …
    • not learned directly from data
    • but modeler can choose them based on data
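
To make the distinction concrete in scikit-learn terms (a small sketch reusing the fitted linreg from above): constructor arguments are hyperparameters, while the attributes that end in an underscore are parameters learned by .fit():

# Hyperparameters: chosen before fitting (constructor arguments).
print(LinearRegression().get_params())
# Parameters: learned from data by .fit() (attributes ending in "_").
print(linreg.coef_[:3], linreg.intercept_)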

Linear Classifiers

“Logistic Regression”

Load up a familiar example: blood test for autism

autism = pd.read_csv("data/autism.csv", skiprows=[1])
  • Last time we fit a decision tree. But that assumed hard cut-offs for each feature.
  • Linear models could say: each additional percentage oxidized (or whatever) leads to a 0.1% increase in risk of autism.

The real world isn’t black and white; linear models can help model the gray areas.

Set up as before:

autism_train, autism_test = train_test_split(autism, random_state=42, test_size=0.25)

feature_columns = list(autism.columns[1:-1])
target_column = "Group"

positive_outcome = "ASD"
negative_outcome = "NEU"

What it looks like

model = make_pipeline(
    StandardScaler(),
    LogisticRegression(C=4e-2, penalty='l1', solver='liblinear')
).fit(
    X=autism_train[feature_columns],
    y=autism_train[target_column] == positive_outcome
)
pd.DataFrame({
    'coefficient': model.named_steps["logisticregression"].coef_[0],
    'feature': feature_columns
}).sort_values("coefficient").query("coefficient != 0").plot.scatter(x='coefficient', y='feature')

How this classifier makes predictions

First, it predicts a risk score (negative = low risk, positive = high risk).

print(f"y = {model.named_steps['logisticregression'].intercept_[0]:.2f}", end="")
for coef, feature in zip(model.named_steps["logisticregression"].coef_[0], feature_columns):
    if coef != 0:
        print(f"+ {coef:.2f} * {feature}", end="")
y = 0.00+ 0.21 * 8-OHG+ -0.03 * tGSH/GSSG+ 0.16 * Chlorotyrosine+ 0.06 * Nitrotyrosine+ 0.57 * % oxidized

Note: these are the coefficients of the scaled features.

Example for the first row of the test set:

example_risk_score = model.decision_function(autism_test[feature_columns].iloc[[0]])
example_risk_score
array([1.05828559])

Then it uses the logistic function to turn risk score into probability:

def logistic(x):
    return 1 / (1 + np.exp(-x))
# show the logistic function. draw a line at 0.5
x = np.linspace(-10, 10, 100)
(
    px.line(x=x, y=logistic(x))
    .add_hline(y=0.5)
    .update_layout(xaxis_title="risk score", yaxis_title="probability")
)

For the example above:

logistic(example_risk_score)
array([0.74236278])
model.predict_proba(autism_test[feature_columns].iloc[[0]])
array([[0.25763722, 0.74236278]])

Intuition: Risk

  • First, linear model predicts a risk score (negative = low risk, positive = high risk).
    • Technically: log of the odds: log(prob of yes / prob of no)
    • Odds of 1: equal chance of yes and no; odds of 2: twice as likely to be yes as no
  • Then, logistic function turns risk score into probability
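
A quick check of the log-odds claim, reusing the example row from above (a small sketch):

# The risk score from decision_function is exactly the log-odds of the predicted probability.
p = model.predict_proba(autism_test[feature_columns].iloc[[0]])[0, 1]
print(np.log(p / (1 - p)))  # ≈ 1.058, the same as example_risk_score above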

Other examples:

  • Risk of a customer churning
  • Risk of a transaction being fraudulent
  • Chance of one team winning: difference in skill scores (Elo)

Intuition: Logistic function

  • Monotonic: higher risk score = higher probability
  • but non-linear: as risk score increases, probability increases more slowly
    • twice the risk score does not mean twice the probability
    • when probability is near 0.5, a small change in risk score can make a big change in probability (tipping between a yes or no decision, so it’s important to get it right)
    • near 1, a small change in risk score doesn’t change probability much (already pretty sure of the answer)
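
A quick numerical illustration of that flattening, using the logistic function defined above (a small sketch):

# Near a risk score of 0, one unit moves the probability a lot; far from 0, it barely moves.
print(logistic(1) - logistic(0))  # ≈ 0.23
print(logistic(5) - logistic(4))  # ≈ 0.011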

Hyperparameters: regularization

  • Notice that we had many possible features, but only a few coefficients are non-zero
  • We used a penalty to encourage the model to use fewer features
  • Without the penalty, we would have a coefficient for each feature
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(penalty=None)
).fit(
    X=autism_train[feature_columns],
    y=autism_train[target_column] == positive_outcome
)
pd.DataFrame({
    'coefficient': model.named_steps["logisticregression"].coef_[0],
    'feature': feature_columns
}).sort_values("coefficient").plot.scatter(x='coefficient', y='feature')

Aside: Perhaps penalty=None should have been the default; see Scikit-learn’s Defaults are Wrong – r y x, r. Also, I don’t know why penalty=None gives a qualitatively different set of coefs from a tiny penalty (i.e., a very large C, since C is the inverse of the regularization strength).

Types of Regularization

  • L1: penalty is sum of absolute values of coefficients
    • encourages some coefficients to be exactly 0: only include features that are worth it.
  • L2: penalty is sum of squares of coefficients
    • discourages large coefficients (no one feature can “drive” the decision)
  • Elastic Net: combination of L1 and L2

Apply to both linear regression and linear classification (“logistic regression”).
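
As a rough comparison (a sketch reusing C=4e-2 from earlier; the exact counts depend on the data and solver), an L1 fit zeroes out most coefficients while an L2 fit at the same C only shrinks them:

# Count non-zero coefficients under L1 vs. L2 regularization at the same C.
l1_fit = make_pipeline(
    StandardScaler(),
    LogisticRegression(C=4e-2, penalty='l1', solver='liblinear')
).fit(autism_train[feature_columns], autism_train[target_column] == positive_outcome)
l2_fit = make_pipeline(
    StandardScaler(),
    LogisticRegression(C=4e-2, penalty='l2')
).fit(autism_train[feature_columns], autism_train[target_column] == positive_outcome)
print("L1 non-zero:", (l1_fit.named_steps["logisticregression"].coef_[0] != 0).sum())
print("L2 non-zero:", (l2_fit.named_steps["logisticregression"].coef_[0] != 0).sum())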

A nice visual description of the difference between L1 and L2: Some things you (maybe) didn’t know about Linear Regression – r y x, r

But how do we pick good values for the hyperparameters?

  • Penalties will make the model work worse on the training set, but (hopefully) better on unseen data.
  • So we’ll use a validation set to tune the hyperparameters.
  • …but how will we know how well our tuned model will work on actually unseen data?

Cross-Validation

What problem does cross-validation solve?

  • We want to use a validation set to tune hyperparameters
  • But we also want to use a validation set to evaluate the model
  • If we use the same validation set for both, we’ll overfit to the validation set
  • So we need to use different validation sets for tuning and evaluation
  • But we don’t have that many data points
  • So we need to use some of the data for both tuning and evaluation

Leave-One-Out CV

Train on all but one data point, then evaluate on the one left out. Repeat for each data point.

from sklearn.model_selection import LeaveOneOut, cross_val_score
scores = cross_val_score(
    model, 
    X=autism_train[feature_columns],
    y=autism_train[target_column] == positive_outcome,
    cv=LeaveOneOut())
scores
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1.])
scores.mean()
0.9090909090909091

K-Fold CV

Training a different model for each left-out data point can take a long time. So: divide the data into k groups (“folds”), and leave out one group at a time.
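
For example, 5-fold cross-validation trains only 5 models instead of one per data point (a sketch; the exact scores will differ a bit from leave-one-out):

from sklearn.model_selection import KFold
kfold_scores = cross_val_score(
    model,
    X=autism_train[feature_columns],
    y=autism_train[target_column] == positive_outcome,
    cv=KFold(n_splits=5, shuffle=True, random_state=42))
print(kfold_scores, kfold_scores.mean())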