11.1 Model Types

Objectives

What type of model for what type of data?

  • Describe how a Random Forest makes predictions
  • Describe how a linear model makes predictions
  • Compare and contrast linear and tree models

Setting Up

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_text
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_absolute_percentage_error
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_white"
data = pd.read_csv("../../static/data/ames/ames_home_sales.csv")
data['sale_price'] = data.eval("Sale_Price / 1000")
ames = data.query('Gr_Liv_Area < 4000 and Sale_Condition == "Normal"')
ames_train, ames_test = train_test_split(ames, random_state=42, test_size=0.2)
print(f"Training set size: {len(ames_train)} homes")
print(f"Test set size: {len(ames_test)} homes")
Training set size: 1929 homes
Test set size: 483 homes
def evaluate(y_true, y_pred):
    return pd.Series({
        'MAE': mean_absolute_error(y_true, y_pred),
        'MAPE': mean_absolute_percentage_error(y_true, y_pred),
    })

Increasing Depth -> Overfitting

feature_columns = ['Latitude', 'Longitude']
error_data = []
for max_depth in range(2, 30, 2):
    model = DecisionTreeRegressor(max_depth=max_depth, random_state=42).fit(
        X=ames_train[feature_columns],
        y=ames_train['sale_price'])
    
    train_preds = model.predict(ames_train[feature_columns])
    test_preds = model.predict(ames_test[feature_columns])
    error_data.append(dict(
        evaluate(ames_train['sale_price'], train_preds).to_dict(), max_depth=max_depth, dataset='train'))
    error_data.append(dict(
        evaluate(ames_test['sale_price'], test_preds).to_dict(), max_depth=max_depth, dataset='test'))
error_df = pd.DataFrame(error_data)
px.line(error_df, x='max_depth', y='MAE', color='dataset', range_y=[0, 50])

Random Forests Reduce Overfitting

from sklearn.ensemble import RandomForestRegressor
error_data = []
for max_depth in range(2, 30, 2):
    model = RandomForestRegressor(max_depth=max_depth, random_state=42).fit(
        X=ames_train[feature_columns],
        y=ames_train['sale_price'])

    train_preds = model.predict(ames_train[feature_columns])
    test_preds = model.predict(ames_test[feature_columns])
    error_data.append(dict(
        evaluate(ames_train['sale_price'], train_preds).to_dict(), max_depth=max_depth, dataset='train'))
    error_data.append(dict(
        evaluate(ames_test['sale_price'], test_preds).to_dict(), max_depth=max_depth, dataset='test'))
error_df = pd.DataFrame(error_data)
px.line(error_df, x='max_depth', y='MAE', color='dataset', range_y=[0, 50])

A Random Forest has many trees.

model = RandomForestRegressor(max_depth=3, random_state=42, n_estimators=5).fit(
    X=ames_train[feature_columns],
    y=ames_train['sale_price'])
len(model.estimators_)
5
plot_tree(model.estimators_[0], feature_names=feature_columns, filled=True, impurity=False);
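The export_text helper imported in the setup prints the same tree as indented text, which can be easier to read inline than the plot:

print(export_text(model.estimators_[0], feature_names=feature_columns))  # one split rule per line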

Show all 5 trees:

import matplotlib.pyplot as plt
for tree in model.estimators_:
    plt.figure()  # draw each tree in its own figure
    plot_tree(tree, feature_names=feature_columns, filled=True, impurity=False)
    plt.show()
[Output: five tree diagrams. Four of the trees split first on Longitude <= -93.63 and one splits first on Latitude <= 42.046; their deeper splits and leaf values all differ slightly, because each tree saw different training rows.]

Each tree makes different predictions

  • It was trained on a different subset of the data (a bootstrap sample; sketched below)
  • It was trained on a different subset of the features at each split
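The "different subset of the data" is a bootstrap sample: rows drawn from the training set with replacement. A minimal sketch of the sampling idea (not sklearn's internal code, which does this automatically when bootstrap=True, the default):

rng = np.random.default_rng(42)
n_rows = len(ames_train)
bootstrap_idx = rng.integers(0, n_rows, size=n_rows)  # draw row indices with replacement
# sampling with replacement leaves out roughly a third of the rows each time
print(f"{len(np.unique(bootstrap_idx))} of {n_rows} distinct rows included")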
predictions = [tree.predict(ames_train[feature_columns].values) for tree in model.estimators_]
predictions = pd.DataFrame(predictions).T
predictions.columns = [f"Tree {i}" for i in range(len(predictions.columns))]
predictions.head()
       Tree 0      Tree 1      Tree 2      Tree 3      Tree 4
0  131.717607  139.294034  130.875931  132.314868  129.090174
1  131.717607  139.294034  130.875931  132.314868  129.090174
2  165.883551  175.668367  170.709185  160.354198  167.262236
3  208.365563  177.624237  204.302481  209.576263  208.732229
4  150.007986  139.294034  147.325621  145.077258  146.075060

To make a prediction

  • Each tree makes a prediction
  • The predictions are averaged
predictions.mean(axis=1).head()
0    132.658523
1    132.658523
2    167.975507
3    201.720155
4    145.555992
dtype: float64
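As a quick check, the forest's own predict method returns exactly this average of its trees:

forest_preds = model.predict(ames_train[feature_columns])
print(np.allclose(forest_preds, predictions.mean(axis=1)))  # True: the forest averages its trees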

Value of Diversity

I looked and there before me
was a great multitude that no one could count,
from every nation, tribe, people and language,
standing before the throne.

Revelation 7:9, as quoted in Calvin’s “From Every Nation”

  • Random Forests work because they combine diverse perspectives (from different training data, different choices)
  • Reflects value of diversity in God’s Kingdom (see also Rev 5:9, 1 Cor 12, etc.)

Linear Models

Fit a Linear Model

from sklearn.linear_model import LinearRegression

feature_columns = ['Gr_Liv_Area'] # we'll add more later
linreg = LinearRegression().fit(
    X=ames_train[feature_columns],
    y=ames_train['sale_price'])
print(f"Intercept: {linreg.intercept_:.2f}")
print(f"Coef: {linreg.coef_[0]:.2f}")
Intercept: 15.98
Coef: 0.11

Prediction equation: \[\text{Sale Price} = 15.98 + 0.11 \times \text{Gr Liv Area}\]

Aside: you may have seen this in stats class

  • Stats often asks: “What is the relationship between living area and sale price?”
  • Machine learning asks: “How can I predict sale price?”

What computations can a linear model do?

  • Trees: only simple conditional logic
    • example: if Gr_Liv_Area <= 2000 then go left, else go right
      • (the comparison may be <= or <, depending on the implementation)
    • predict the average of the training data at the leaf
  • Linear models
    • add up terms
    • each term: multiply some feature by a constant (“coefficient”); see the sketch below
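To make the contrast concrete, here is a toy sketch; the tree's leaf values are hypothetical, while the linear coefficients are the ones fitted above:

def toy_tree_predict(gr_liv_area):
    # conditional logic: route to a leaf, then predict that leaf's training average
    if gr_liv_area <= 2000:
        return 150.0  # hypothetical leaf average, in $1000s
    return 300.0      # hypothetical leaf average, in $1000s

def toy_linear_predict(gr_liv_area):
    # weighted sum: intercept plus coefficient times feature
    return 15.98 + 0.11 * gr_liv_area

print(toy_tree_predict(1500), toy_linear_predict(1500))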

Example

First test set house:

test_liv_area = ames_test['Gr_Liv_Area'].iloc[0]
test_liv_area
1008

So we predict:

linreg.intercept_ + linreg.coef_[0] * test_liv_area
124.78893753303895

Actual was:

ames_test['sale_price'].iloc[0]
131.0
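The fitted model gives the same number directly, with less chance of a typo:

linreg.predict(ames_test[feature_columns].iloc[[0]])  # same value as the manual computation above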

Do remodeled homes sell for more?

Year Remod/Add: Remodel date (same as construction date if no remodeling or additions) (from dataset documentation)

ames_train['remodeled'] = ames_train['Year_Remod_Add'] != ames_train['Year_Built']
px.scatter(ames_train, x="Gr_Liv_Area", y="sale_price", color="remodeled", trendline="ols")

Conditional Logic: Simple Conditions

How could a linear model treat remodeled homes differently from non-remodeled?

if remodeled:
  Sale_Price = intercept_remodeled + coef_sqft * Gr_Liv_Area
else:
  Sale_Price = intercept_other + coef_sqft * Gr_Liv_Area

Solution: indicator variables

Sale_Price = 
   intercept_other 
   + coef_sqft      * Gr_Liv_Area
   + coef_remodeled * (1 if remodeled else 0)
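Plugging in the two possible values of the indicator recovers the if/else above:

if remodeled = 0:
   Sale_Price = intercept_other + coef_sqft * Gr_Liv_Area
if remodeled = 1:
   Sale_Price = (intercept_other + coef_remodeled) + coef_sqft * Gr_Liv_Area

So remodeled homes get a different effective intercept, exactly as in the two-branch version.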

Indicator Variables

Sale_Price = 
   intercept_other 
   + coef_sqft      * Gr_Liv_Area
   + coef_remodeled * (1 if remodeled else 0)
feature_columns = ['Gr_Liv_Area', 'remodeled']
linreg = LinearRegression().fit(
    X=ames_train[feature_columns],
    y=ames_train['sale_price'])
print(f"Intercept: {linreg.intercept_:.2f}")
print(f"coef_sqft: {linreg.coef_[0]:.2f}")
print(f"coef_remodeled: {linreg.coef_[1]:.2f}")
Intercept: 21.46
coef_sqft: 0.11
coef_remodeled: -16.11
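Reading the coefficients: holding living area fixed, the fitted model predicts that a remodeled home sells for about $16k less (perhaps because it is the older homes that get remodeled). A quick check with a hypothetical 1500 sq ft home:

base = linreg.intercept_ + linreg.coef_[0] * 1500  # not remodeled
print(f"not remodeled: {base:.1f}, remodeled: {base + linreg.coef_[1]:.1f}")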

More than two categories

Bldg Type (Nominal): Type of dwelling
       1Fam Single-family Detached  
       2FmCon   Two-family Conversion; originally built as one-family dwelling
       Duplx    Duplex
       TwnhsE   Townhouse End Unit
       TwnhsI   Townhouse Inside Unit
px.scatter(ames_train, x="Gr_Liv_Area", y="sale_price", color="Bldg_Type", trendline="ols")

One-Hot Encoder

  • Input: a column with N categories
  • Output: N columns, one per category, with 1 if that category is present, 0 otherwise
from sklearn.preprocessing import OneHotEncoder
one_hot = OneHotEncoder(sparse_output=False)
one_hot.fit(ames_train[['Bldg_Type']])
one_hot.categories_
[array(['Duplex', 'OneFam', 'Twnhs', 'TwnhsE', 'TwoFmCon'], dtype=object)]
pd.DataFrame(
    one_hot.transform(ames_train[['Bldg_Type']]),
    columns=one_hot.categories_[0]).head()
   Duplex  OneFam  Twnhs  TwnhsE  TwoFmCon
0     0.0     1.0    0.0     0.0       0.0
1     0.0     1.0    0.0     0.0       0.0
2     0.0     1.0    0.0     0.0       0.0
3     0.0     1.0    0.0     0.0       0.0
4     0.0     1.0    0.0     0.0       0.0
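One practical aside: by default, OneHotEncoder raises an error at transform time if it encounters a category it never saw during fit; passing handle_unknown='ignore' encodes such rows as all zeros instead:

one_hot_safe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')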

Preprocessing multiple columns

from sklearn.compose import make_column_transformer
transformer = make_column_transformer(
    ('passthrough', ['Gr_Liv_Area']),
    (OneHotEncoder(sparse_output=False), ['Bldg_Type']),
    remainder='drop')
transformer.set_output(transform='pandas')
transformer.fit_transform(ames_train)
passthrough__Gr_Liv_Area onehotencoder__Bldg_Type_Duplex onehotencoder__Bldg_Type_OneFam onehotencoder__Bldg_Type_Twnhs onehotencoder__Bldg_Type_TwnhsE onehotencoder__Bldg_Type_TwoFmCon
2025 1396 0.0 1.0 0.0 0.0 0.0
2039 1196 0.0 1.0 0.0 0.0 0.0
1143 1677 0.0 1.0 0.0 0.0 0.0
1537 3447 0.0 1.0 0.0 0.0 0.0
2589 884 0.0 1.0 0.0 0.0 0.0
... ... ... ... ... ... ...
1934 1008 0.0 1.0 0.0 0.0 0.0
1263 1253 0.0 1.0 0.0 0.0 0.0
1303 928 0.0 1.0 0.0 0.0 0.0
1490 1127 0.0 1.0 0.0 0.0 0.0
987 1567 0.0 1.0 0.0 0.0 0.0

1929 rows × 6 columns
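The fitted transformer applies the same encoding to new data, so the test set comes out with the same six columns:

transformer.transform(ames_test).shape  # (483, 6)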

Putting it all together

from sklearn.pipeline import make_pipeline
model = make_pipeline(
    make_column_transformer(
        ('passthrough', ['Gr_Liv_Area']),
        (OneHotEncoder(sparse_output=False), ['Bldg_Type']),
        remainder='drop'),
    LinearRegression())
model.fit(ames_train, ames_train['sale_price'])
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('passthrough', 'passthrough',
                                                  ['Gr_Liv_Area']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(sparse_output=False),
                                                  ['Bldg_Type'])])),
                ('linearregression', LinearRegression())])

What does the model look like?

Prediction equation: \[\text{Sale Price} = -5.61 + 0.11 \times \text{Gr Liv Area} - 34.00 \times \text{Bldg Type 1Fam} + 20.31 \times \text{Bldg Type 2FmCon} + 6.51 \times \text{Bldg Type Duplx} + 42.10 \times \text{Bldg Type TwnhsE}\]
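The labels in that equation use the documentation codes, while sklearn's encoder uses slightly different category names (OneFam, TwoFmCon, and so on). A sketch of how to read the equation straight off the fitted pipeline, avoiding any manual relabeling:

linreg_step = model.named_steps['linearregression']
feature_names = model.named_steps['columntransformer'].get_feature_names_out()
print(f"intercept: {linreg_step.intercept_:.2f}")
for name, coef in zip(feature_names, linreg_step.coef_):
    print(f"{coef:+.2f} * {name}")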

What predictions does it make?

train_predictions = model.predict(ames_train)
evaluate(ames_train['sale_price'], train_predictions)
MAE     31.667456
MAPE     0.192769
dtype: float64
test_predictions = model.predict(ames_test)
evaluate(ames_test['sale_price'], test_predictions)
MAE     32.479024
MAPE     0.195997
dtype: float64