Show the R package imports
library(tidyverse)
library(splines)
library(mgcv)
library(tidyverse)
library(splines)
library(mgcv)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from patsy import dmatrix
import statsmodels.api as sm
import statsmodels.formula.api as smf
Some of the figures in this presentation are taken from “An Introduction to Statistical Learning, with applications in R” (Springer, 2021) with permission from the authors: G. James, D. Witten, T. Hastie and R. Tibshirani.
The legend of the Laffer curve goes like this: Arthur Laffer, then an economics professor at the University of Chicago, had dinner one night in 1974 with Dick Cheney, Donald Rumsfeld, and Wall Street Journal editor Jude Wanniski at an upscale hotel restaurant in Washington DC. They were tussling over President Ford’s tax plan, and eventually, as intellectuals do when the tussling gets heavy, Laffer commandeered a napkin and drew a picture. The picture looked like this:
\texttt{sales} \approx \beta_0 + \beta_1 \times \texttt{TV}
\texttt{sales} \approx \beta_0 + \beta_1 \times \texttt{TV}+ \beta_2 \times \texttt{radio}
We’ll focus on one predictor and one response variable for most of this lecture.
Using a term like nonlinear science is like referring to the bulk of zoology as the study of non-elephant animals. (Stanisław Ulam)
Download a file called Mx_1x1.txt
from the Human Mortality Database.
No-one is allowed to distribute the data, but you can download it for free. Here are the first few rows to get a sense of what it looks like.
with open('Mx_1x1.txt', 'r') as file:
for i in range(10):
print(file.readline(), end='')
Luxembourg, Death rates (period 1x1), Last modified: 09 Aug 2023; Methods Protocol: v6 (2017)
Year Age Female Male Total
1960 0 0.023863 0.039607 0.031891
1960 1 0.001690 0.003528 0.002644
1960 2 0.001706 0.002354 0.002044
1960 3 0.001257 0.002029 0.001649
1960 4 0.000844 0.001255 0.001051
1960 5 0.000873 0.001701 0.001293
1960 6 0.000443 0.000430 0.000437
<- read_table("Mx_1x1.txt", skip = 2, show_col_types = FALSE) %>%
lux rename(age=Age, year=Year, mx=Female) %>%
select(age, year, mx) %>%
filter(age != '110+') %>%
mutate(year = as.integer(year), age = as.integer(age), mx = as.numeric(mx))
lux
# A tibble: 6,930 × 3
age year mx
<int> <int> <dbl>
1 0 1960 0.0239
2 1 1960 0.00169
3 2 1960 0.00171
4 3 1960 0.00126
5 4 1960 0.000844
6 5 1960 0.000873
7 6 1960 0.000443
8 7 1960 0
9 8 1960 0.000951
10 9 1960 0
# ℹ 6,920 more rows
summary(lux)
age year mx
Min. : 0.0 Min. :1960 Min. :0.0000
1st Qu.: 27.0 1st Qu.:1975 1st Qu.:0.0004
Median : 54.5 Median :1991 Median :0.0034
Mean : 54.5 Mean :1991 Mean :0.0920
3rd Qu.: 82.0 3rd Qu.:2007 3rd Qu.:0.0418
Max. :109.0 Max. :2022 Max. :6.0000
NA's :358
= pd.read_csv('Mx_1x1.txt', sep='\s+', skiprows=2)
lux
= lux.rename(columns={'Age': 'age', 'Year': 'year', 'Female': 'mx'})
lux = lux[['age', 'year', 'mx']]
lux = lux[lux['age'] != '110+'].copy()
lux 'mx'] == '.', 'mx'] = np.nan
lux.loc[lux[
'year'] = lux['year'].astype(int)
lux['age'] = lux['age'].astype(int)
lux['mx'] = lux['mx'].astype(float)
lux[
lux
age year mx
0 0 1960 0.023863
1 1 1960 0.001690
2 2 1960 0.001706
3 3 1960 0.001257
4 4 1960 0.000844
... ... ... ...
6987 105 2022 0.661481
6988 106 2022 5.419704
6989 107 2022 NaN
6990 108 2022 NaN
6991 109 2022 NaN
[6930 rows x 3 columns]
<- lux %>% group_by(age) %>% summarise(mx = mean(mx, na.rm = TRUE))
average_df ggplot(lux, aes(x = age, y = mx)) +
geom_line(aes(group = year, color = year), alpha = 0.3) +
geom_line(data = average_df, color = "black", linewidth = 1) +
labs(x = "Age", y = "Mortality Rate", color = "Year") +
theme_minimal()
= lux.groupby('age')[['mx']].mean().reset_index()
average_df =lux, x='age', y='mx', hue='year', palette='viridis', alpha=0.3)
sns.lineplot(data'age'], average_df['mx'], color='black', linewidth=2)
plt.plot(average_df['Age'); plt.ylabel('Mortality Rate'); plt.xlabel(
<- lux %>% filter(age <= 90) lux
<- average_df %>% filter(age <= 90)
average_df
ggplot(lux, aes(x = age, y = mx)) +
geom_line(aes(group = year, color = year), alpha = 0.3) +
geom_line(data = average_df, color = "black", linewidth = 1) +
labs(x = "Age", y = "Mortality Rate", color = "Year") +
theme_minimal()
= lux[lux['age'] <= 90].copy() lux
= average_df[average_df['age'] <= 90]
average_df
=lux, x='age', y='mx', hue='year', palette='viridis', alpha=0.3)
sns.lineplot(data'age'], average_df['mx'], color='black', linewidth=2)
plt.plot(average_df['Age'); plt.ylabel('Mortality Rate'); plt.xlabel(
$log_mx <- log(lux$mx)
lux<- lux[lux$log_mx != -Inf, ] lux
<- average_df %>%
average_df left_join(lux %>%
group_by(age) %>%
summarise(log_mx = mean(log_mx, na.rm = TRUE)),
by = "age")
ggplot(lux, aes(x = age, y = log_mx)) +
geom_line(aes(group = year, color = year), alpha = 0.3) +
geom_line(data = average_df, color = "black", linewidth = 1) +
labs(x = "Age", y = "Log-Mortality Rate", color = "Year") +
theme_minimal()
'log_mx'] = np.log(lux['mx']) lux[
/Users/plaub/miniconda3/envs/ai2024/lib/python3.11/site-packages/pandas/core/arraylike.py:399: RuntimeWarning: divide by zero encountered in log
result = getattr(ufunc, method)(*inputs, **kwargs)
= lux[lux['log_mx'] != -np.inf] lux
'log_mx'] = lux.groupby('age')[['log_mx']].mean()
average_df[
=lux, x='age', y='log_mx', hue='year', palette='viridis', alpha=0.3)
sns.lineplot(data'age'], average_df['log_mx'], color='black', linewidth=2)
plt.plot(average_df['Age'); plt.ylabel('Log-Mortality Rate'); plt.xlabel(
<- lux %>% filter(year == 2020)
lux_2020 <- lm(log_mx ~ age, data = lux_2020) model_lr
<- ggplot(lux_2020, aes(x = age, y = log_mx)) + theme_minimal() +
p geom_point(alpha = 0.75) + labs(x = "Age", y = "Log-Mortality")
<- data.frame(age = seq(min(lux$age), max(lux$age), by=0.1))
age_grid <- predict(model_lr, newdata = age_grid)
predictions
+ geom_line(data = age_grid, aes(x = age, y = predictions),
p color = "red", linewidth=2)
= lux[lux['year'] == 2020].copy()
lux_2020 = smf.ols('log_mx ~ age', data=lux_2020).fit() model_lr
'age'], lux_2020['log_mx'], alpha=0.75)
plt.scatter(lux_2020['age'], model_lr.predict(lux_2020[['age']]), color='red')
plt.plot(lux_2020['Age'); plt.ylabel('Log-Mortality'); plt.xlabel(
<- lm(log_mx ~ poly(age, 2), data = lux_2020) model_quad
<- predict(model_quad, newdata = age_grid)
predictions + geom_line(data = age_grid, aes(x = age, y = predictions),
p color = "red", linewidth=2)
= smf.ols('log_mx ~ np.power(age, 2) + age', data=lux_2020).fit() model_quad
'age'], lux_2020['log_mx'], alpha=0.75)
plt.scatter(lux_2020['age'], model_quad.predict(lux_2020), color='red', linewidth=2)
plt.plot(lux_2020['Age'); plt.ylabel('Log Mortality'); plt.xlabel(
<- lm(log_mx ~ cut(age, seq(0, 90, 10), right=F), data = lux_2020) model_step
<- ggplot(lux_2020, aes(x = age, y = log_mx)) + theme_minimal() +
p geom_point(alpha = 0.75) + labs(x = "Age", y = "Log-Mortality")
<- data.frame(age = seq(min(lux$age), max(lux$age), by=0.1))
age_grid <- predict(model_step, newdata = age_grid)
predictions
+ geom_point(data = age_grid, aes(x = age, y = predictions),
p color = "red", size=2)
'age_bin'] = pd.cut(lux_2020['age'], bins=np.arange(0, 101, 10), right=False)
lux_2020['age_bin_str'] = lux_2020['age_bin'].astype(str)
lux_2020[= smf.ols('log_mx ~ age_bin_str', data=lux_2020).fit() model_step
# Generate a DataFrame for predictions
# This involves creating a DataFrame where 'age' spans the unique bins used in the model
= np.linspace(0, 101, 1000)
age_grid
= pd.DataFrame({
df_plot 'age': age_grid,
'age_bin_str': pd.cut(age_grid, bins=np.arange(0, 101, 10), right=False).astype(str)}
)
# Predict using the model
'predictions'] = model_step.predict(df_plot)
df_plot[
# Plot
'age'], lux_2020['log_mx'], alpha=0.75)
plt.scatter(lux_2020['age'], df_plot['predictions'], color='red', linewidth=2)
plt.scatter(df_plot['Age')
plt.xlabel('Log-Mortality') plt.ylabel(
<- lm(log_mx ~ bs(age, degree=10), data=lux_2020) # Requires splines package model_spline
<- predict(model_spline, newdata = age_grid)
predictions + geom_line(data = age_grid, aes(x = age, y = predictions),
p color = "red", linewidth=2, alpha=0.75)
= dmatrix("bs(lux_2020['age'], degree=3, knots=[15,30,45])",
spline_x "lux_2020['age']": lux_2020['age']}, return_type='dataframe')
{= sm.GLM(lux_2020['log_mx'], spline_x).fit() model_spline
# Prepare data for plotting
= pd.DataFrame({'age': np.linspace(lux_2020['age'].min(), lux_2020['age'].max(), 100)})
x_range = dmatrix("bs(x_range, degree=3, knots=[15,30,45])", {"x_range": x_range['age']}, return_type='dataframe')
x_range_transformed
= model_spline.predict(x_range_transformed)
predictions_spline
'age'], lux_2020['log_mx'], alpha=0.75)
plt.scatter(lux_2020['age'], predictions_spline, color='red')
plt.plot(x_range['Age'); plt.ylabel('Log Mortality'); plt.xlabel(
Methods from this class (p. 8–9):
Take ACTL3141/ACTL5104 for mortality modelling, ACTL3143/ACTL5111 for actuarial AI
ggplot(lux_2020, aes(x = age, y = log_mx)) + theme_minimal() +
geom_point(aes(y = predict(model_lr)), color = "red", size = 2) +
geom_point(alpha = 0.75, size = 2) + labs(x = "Age", y = "Log-Mortality")
= smf.ols('log_mx ~ age', data=lux_2020).fit()
model 'age'], model.predict(lux_2020[['age']]), color='red')
plt.scatter(lux_2020['age'], lux_2020['log_mx'], alpha=0.75)
plt.scatter(lux_2020['Age'); plt.ylabel('Log-Mortality'); plt.xlabel(
ggplot(lux_2020, aes(x = age, y = log_mx)) + theme_minimal() +
geom_smooth(method = "lm", formula = y ~ x, color = "red", linewidth=2) +
geom_point(alpha = 0.75) + labs(x = "Age", y = "Log Mortality")
='age', y='log_mx', data=lux_2020, ci=95)
sns.regplot(x'Age'); plt.ylabel('Log-Mortality'); plt.xlabel(
<- data.frame(age = seq(25, 35, by = 0.5))
df_grid $log_mx <- predict(model_lr, newdata = df_grid) df_grid
<- lux_2020 %>% filter(age >= 25, age <= 35)
lux_young
ggplot(lux_young, aes(x = age, y = log_mx)) +
theme_minimal() +
geom_point(data = df_grid, aes(y = log_mx), color = "red", size = 2) +
geom_point(alpha = 0.75, size = 2) +
labs(x = "Age", y = "Log-Mortality")
= pd.DataFrame({"age": np.arange(25, 35, 0.5)})
age_grid = model.predict(age_grid) predictions
= lux_2020[(lux_2020['age'] >= 25) & (lux_2020['age'] <= 35)]
lux_young 'age'], lux_young['log_mx'], alpha=0.75)
plt.scatter(lux_young[='red', marker='o')
plt.scatter(age_grid, model.predict(age_grid), color'Age'); plt.ylabel('Log-Mortality'); plt.xlabel(
<- data.frame(age = seq(40, 130))
df_grid $log_mx <- predict(model_lr, newdata = df_grid) df_grid
<- lux_2020 %>% filter(age >= 40, age <= 130)
lux_old
ggplot(lux_old, aes(x = age, y = log_mx)) +
theme_minimal() +
geom_line(data = df_grid, aes(y = log_mx), color = "red", linewidth = 2) +
geom_point(alpha = 0.75) +
labs(x = "Age", y = "Log-Mortality")
= pd.DataFrame({"age": np.arange(40, 130)})
age_grid = model.predict(age_grid) predictions
= lux_2020[(lux_2020['age'] >= 40) & (lux_2020['age'] <= 130)]
lux_old ='red', marker='o')
plt.plot(age_grid, model.predict(age_grid), color'age'], lux_old['log_mx'], alpha=0.75)
plt.scatter(lux_old['Age'); plt.ylabel('Log-Mortality'); plt.xlabel(
= lux[c("age", "year", "log_mx")]
df_mlr head(df_mlr)
# A tibble: 6 × 3
age year log_mx
<int> <int> <dbl>
1 0 1960 -3.74
2 1 1960 -6.38
3 2 1960 -6.37
4 3 1960 -6.68
5 4 1960 -7.08
6 5 1960 -7.04
Fitting:
<- lm(log_mx ~ age + year, data = df_mlr) linear_model
Predicting:
<- data.frame(year = 2040, age = 20)
new_point predict(linear_model, newdata = new_point)
1
-8.66
coef(linear_model)
(Intercept) age year
34.58222358 0.07287039 -0.02191158
# This code chunk is courtesy of ChatGPT.
library(plotly)
<- lm(log_mx ~ age + year, data = lux)
model # Predict values over a grid to create the regression plane
# Create a sequence of ages and years that covers the range of your data
<- seq(min(lux$age), max(lux$age), length.out = 100)
age_range <- seq(min(lux$year), max(lux$year), length.out = 100)
year_range
# Create a grid of age and year values
<- expand.grid(age = age_range, year = year_range)
grid
# Predict log_mx for the grid
$log_mx <- predict(model, newdata = grid)
grid
# Plot
<- plot_ly(lux, x = ~age, y = ~year, z = ~log_mx, type = 'scatter3d', mode = 'markers', size = 0.5) %>%
fig add_trace(data = grid, x = ~age, y = ~year, z = ~log_mx, type = 'mesh3d', opacity = 0.5) %>%
layout(scene = list(xaxis = list(title = 'Age'),
yaxis = list(title = 'Year'),
zaxis = list(title = 'Log-Mortality')))
# Show the plot
fig
Extend the standard linear model
Y_i = \beta_0 + \beta_1 x_i + \varepsilon_i
to
Y_i = \beta_0 + \beta_1 x_i + \beta_2 x_i^2 + \cdots + \beta_d x_i^d + \varepsilon_i
<- data.frame(age = lux_2020$age, age2 = lux_2020$age^2, log_mx = lux_2020$log_mx)
df_pr head(df_pr)
age age2 log_mx
1 0 0 -5.363176
2 6 36 -8.111728
3 15 225 -6.949619
4 16 256 -8.040959
5 18 324 -7.389022
6 21 441 -8.159519
<- lm(log_mx ~ age + age2,
poly_model data = df_pr)
coef(poly_model)
(Intercept) age age2
-7.065977594 -0.066603952 0.001421058
<- data.frame(age = 20, age2 = 20)
bad_x predict(poly_model, newdata = bad_x)
1
-8.369635
ggplot(lux_2020, aes(x = age, y = log_mx)) + theme_minimal() +
geom_line(data = df_pr, aes(x = age, y = predict(poly_model)), color = "red", linewidth=2) +
geom_point(alpha = 0.75) + labs(x = "Age", y = "Log-Mortality")
We just tricked R into thinking that age2
is a separate variable!
This is a linear model of a nonlinearly transformed variable.
poly
function<- data.frame(age = lux_2020$age, log_mx = lux_2020$log_mx)
df_pr head(df_pr)
age log_mx
1 0 -5.363176
2 6 -8.111728
3 15 -6.949619
4 16 -8.040959
5 18 -7.389022
6 21 -8.159519
<- lm(log_mx ~ poly(age, 2),
poly_model data = df_pr)
coef(poly_model)
(Intercept) poly(age, 2)1 poly(age, 2)2
-5.787494 14.534731 6.376355
Now we can’t put in age^2
as a separate variable.
<- data.frame(age = 20)
new_input predict(poly_model, newdata = new_input)
1
-7.829633
ggplot(lux_2020, aes(x = age, y = log_mx)) + theme_minimal() +
geom_line(data = df_pr, aes(x = age, y = predict(poly_model)), color = "red", linewidth=2) +
geom_point(alpha = 0.75) + labs(x = "Age", y = "Log-Mortality")
The coefficients are different, but the predictions are the same!
ggplot(lux_2020, aes(x = age, y = log_mx)) + theme_minimal() +
stat_smooth(method = "lm", formula = y ~ poly(x, 2), color = "red", linewidth=2) +
geom_point(alpha = 0.75) + labs(x = "Age", y = "Log-Mortality")
head(lux$age)
[1] 0 1 2 3 4 5
<- model.matrix(~ poly(age, 2), data = lux)
age_poly head(age_poly)
(Intercept) poly(age, 2)1 poly(age, 2)2
1 1 -0.03020513 0.03969719
2 1 -0.02961226 0.03744658
3 1 -0.02901939 0.03524373
4 1 -0.02842652 0.03308866
5 1 -0.02783365 0.03098136
6 1 -0.02724077 0.02892183
<- model.matrix(~ poly(age, 2, raw=TRUE), data = lux)
age_poly head(age_poly)
(Intercept) poly(age, 2, raw = TRUE)1 poly(age, 2, raw = TRUE)2
1 1 0 0
2 1 1 1
3 1 2 4
4 1 3 9
5 1 4 16
6 1 5 25
raw=TRUE
)<- model.matrix(~ poly(age, 2, raw=TRUE), data = lux) age_poly
<- data.frame(age = lux$age, age_poly[, -1])
df_poly
<- df_poly %>%
df_poly_long pivot_longer(cols = -age, names_to = "Polynomial", values_to = "Value")
ggplot(df_poly_long, aes(x = age, y = Value, color = Polynomial)) +
geom_line(linewidth=2) +
theme_minimal() +
labs(title = "Polynomial Terms of Age",
x = "Age",
y = "Polynomial Value",
color = "Term")
<- model.matrix(~ poly(age, 4), data = lux) age_poly
<- data.frame(age = lux$age, age_poly[, -1])
df_poly
<- df_poly %>%
df_poly_long pivot_longer(cols = -age, names_to = "Polynomial", values_to = "Value")
ggplot(df_poly_long, aes(x = age, y = Value, color = Polynomial)) +
geom_line(linewidth=2) +
theme_minimal() +
labs(title = "Polynomial Terms of Age",
x = "Age",
y = "Polynomial Value",
color = "Term")
<- 1:20
orders for (i in seq_along(orders)) {
<- orders[i]
order
<- as.formula(glue("log_mx ~ poly(age, {order})"))
formula <- lm(formula, data = lux_2020)
model
$log_mx <- predict(model, newdata = age_grid)
age_grid
plot_regression(lux_2020, age_grid, "age", "log_mx",
glue("Polynomial Regression Order {order}"),
glue("./Animations/NonLinear/Poly{i}.svg"))
}
\mathbb{P}(y_i > 250|x_i) = \frac{\exp(\beta_0 + \beta_1x_i + \beta_2x_i^2 + \beta_3x_i^3 + \beta_4x_i^4)}{1+\exp(\beta_0 + \beta_1x_i + \beta_2x_i^2 + \beta_3x_i^3 + \beta_4x_i^4)}
Pros:
Cons:
Polynomial regression imposes a global structure on the nonlinear function; an alternative is to use step functions.
Break up range of x into k distinct regions c_0 < c_1 < \dots < c_k
Do a least squares fit on y_i = \beta_0 + \beta_1 I(c_1 \leq x_i \leq c_2) + \beta_2 I(c_2 \leq x_i < c_3) + \dots + \beta_{k-1} I(c_{k-1} \leq x_i \leq c_k)
Wage
dataI
and cut
head(model.matrix(~ age + I(age >= 3), data = lux))
(Intercept) age I(age >= 3)TRUE
1 1 0 0
2 1 1 0
3 1 2 0
4 1 3 1
5 1 4 1
6 1 5 1
head(cut(lux$age, c(0, 5, 100), right=FALSE))
[1] [0,5) [0,5) [0,5) [0,5) [0,5) [5,100)
Levels: [0,5) [5,100)
head(model.matrix(~ age + cut(age, c(0, 5, 100), right=F), data = lux))
(Intercept) age cut(age, c(0, 5, 100), right = F)[5,100)
1 1 0 0
2 1 1 0
3 1 2 0
4 1 3 0
5 1 4 0
6 1 5 1
<- lm(log_mx ~ cut(age, c(0, 5, 100), right=F), data = lux)
model_step coef(model_step)
(Intercept)
-6.555113
cut(age, c(0, 5, 100), right = F)[5,100)
1.300758
Fit the model:
y_i = \beta_0 + \beta_1 b_1(x_i) + \beta_2 b_2(x_i) + \dots + \beta_k b_k(x_i)
\boldsymbol{X} = \begin{bmatrix} 1 & x_{11} & x_{12} & \dots & x_{1p} \\ 1 & x_{21} & x_{22} & \dots & x_{2p} \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ 1 & x_{n1} & x_{n2} & \dots & x_{np} \end{bmatrix}
\boldsymbol{X} = \begin{bmatrix} 1 & b_1(x_1) & b_2(x_1) & \dots & b_k(x_1) \\ 1 & b_1(x_2) & b_2(x_2) & \dots & b_k(x_2) \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ 1 & b_1(x_n) & b_2(x_n) & \dots & b_k(x_n) \end{bmatrix}
Example: Fitting a piecewise cubic polynomial with one “knot” y_i = \begin{cases} \beta_{0,1} + \beta_{1,1} x_i + \beta_{2,1} x_i^2 + \beta_{3,1} x_i^3 & \text{ if } x_i < c \\ \beta_{0,2} + \beta_{1,2} x_i + \beta_{2,2} x_i^2 + \beta_{3,2} x_i^3 & \text{ if } x_i \geq c \end{cases}
c is a knot: a point of our choosing where the model changes from one to another
A piecewise polynomial function of degree d is a spline if the function is continuous up to the (d-1)th derivative at each knot.
<- lm(log_mx ~ bs(age, degree=1, knots=...), data = lux_2020) model
# Linear regression splines with different number of knots
<- 2:11
knots_seq for (i in seq_along(knots_seq)) {
<- knots_seq[i]
n_knots <- quantile(lux_2020$age, probs = seq(0, 1, length.out = n_knots + 1))[-c(1, n_knots + 2)]
knots <- as.formula(glue("log_mx ~ bs(age, degree=1, knots = c({toString(knots)}))"))
formula <- lm(formula, data = lux_2020)
model
$log_mx <- predict(model, newdata = age_grid)
age_grid
plot_regression(lux_2020, age_grid, "age", "log_mx",
glue("Linear Spline with {n_knots} Knots"),
glue("./Animations/NonLinear/SplineLin{i}.svg"),
knots = knots)
}
<- lm(log_mx ~ bs(age, degree=3, knots=...), data = lux_2020) model
# Cubic regression splines with different number of knots
for (i in seq_along(knots_seq)) {
<- knots_seq[i]
n_knots <- quantile(lux_2020$age, probs = seq(0, 1, length.out = n_knots + 1))[-c(1, n_knots + 2)]
knots <- as.formula(glue("log_mx ~ bs(age, degree=3, knots = c({toString(knots)}))"))
formula <- lm(formula, data = lux_2020)
model
$log_mx <- predict(model, newdata = age_grid)
age_grid
plot_regression(lux_2020, age_grid, "age", "log_mx",
glue("Cubic Spline with {n_knots} Knots"),
glue("./Animations/NonLinear/SplineCub{i}.svg"),
knots = knots)
}
The cubic is preferred as it is the smallest order where the knots are not visible without close inspection.
We can extend the idea of a cubic spline to a natural cubic spline. It is a spline where outside the boundary knots (extrapolation) the function is linear.
Find a function g which minimises
\frac{1}{n} \sum_{i=1}^n (y_i - g(x_i))^2 + \lambda \int_{x=-\infty}^{\infty} g''(x)^2 \, \mathrm{d}x
For mortality smoothing, US uses smoothing splines, UK uses regression splines.
<- smooth.spline(lux_2020$age, lux_2020$log_mx, df = df) model
# Smoothing splines with different df (degrees of freedom)
<- seq(2, 20, length.out = 10)
dfs
for (i in seq_along(dfs)) {
<- dfs[i]
df <- smooth.spline(lux_2020$age, lux_2020$log_mx, df = df)
model <- predict(model, age_grid$age)
predictions $log_mx <- predictions$y
age_grid
plot_regression(lux_2020, age_grid, "age", "log_mx",
glue("Smoothing Spline with df {df}"),
glue("./Animations/NonLinear/SmoothSpline{i}.svg"))
}
Wage
dataAlgorithm
Wage
datay_i = \beta_0 + f_1(x_{i,1}) + f_2(x_{i,2}) + ... + f_p(x_{i,p}) + \varepsilon_i
Wage
datawage
education
is qualitative. The others are fit with natural cubic splinesWage
datawage
education
is qualitative. The others are fit with smoothing splinesmgcv
)<- gam(log_mx ~ s(age) + s(year), data=lux) model_gam
plot(model_gam, select=1)
plot(model_gam, select=2)
gam
)library(gam)
<- lux %>% mutate(year = factor(year))
lux_factor <- gam(log_mx ~ s(age) + year, data=lux_factor)
model_gam plot(model_gam)
It won’t help with your assessment, it’s just very entertaining/interesting.
print(sessionInfo(), locale=FALSE, tzone=FALSE)
R version 4.4.0 (2024-04-24)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.5
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
attached base packages:
[1] splines stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] gam_1.22-3 foreach_1.5.2 glue_1.7.0 plotly_4.10.4
[5] mgcv_1.9-1 nlme_3.1-165 lubridate_1.9.3 forcats_1.0.0
[9] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.2 readr_2.1.5
[13] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] utf8_1.2.4 generics_0.1.3 stringi_1.8.4 lattice_0.22-6
[5] hms_1.1.3 digest_0.6.36 magrittr_2.0.3 evaluate_0.24.0
[9] grid_4.4.0 timechange_0.3.0 iterators_1.0.14 fastmap_1.2.0
[13] rprojroot_2.0.4 jsonlite_1.8.8 Matrix_1.7-0 httr_1.4.7
[17] fansi_1.0.6 crosstalk_1.2.1 viridisLite_0.4.2 scales_1.3.0
[21] codetools_0.2-20 lazyeval_0.2.2 cli_3.6.3 rlang_1.1.4
[25] munsell_0.5.1 withr_3.0.0 yaml_2.3.8 tools_4.4.0
[29] tzdb_0.4.0 colorspace_2.1-0 here_1.0.1 reticulate_1.38.0
[33] vctrs_0.6.5 R6_2.5.1 png_0.1-8 lifecycle_1.0.4
[37] htmlwidgets_1.6.4 pkgconfig_2.0.3 pillar_1.9.0 gtable_0.3.5
[41] data.table_1.15.4 Rcpp_1.0.12 xfun_0.45 tidyselect_1.2.1
[45] knitr_1.47 farver_2.1.2 htmltools_0.5.8.1 rmarkdown_2.27
[49] labeling_0.4.3 compiler_4.4.0
from watermark import watermark
print(watermark(python=True, packages="matplotlib,numpy,pandas,seaborn,scipy"))
Python implementation: CPython
Python version : 3.11.9
IPython version : 8.26.0
matplotlib: 3.9.0
numpy : 1.26.4
pandas : 2.2.2
seaborn : 0.13.2
scipy : 1.11.0