This notebook provides computational examples to accompany the Probability and inference chapter. See also: Mosquito bites and beer consumption.
using CairoMakie
using Distributions
using LaTeXStrings
using Turing
using Printf
using Random
using StatsBase
using HypothesisTests
using Optim
We use Julia's using keyword to import the necessary packages.
Understanding probability distributions requires grasping three fundamental functions: the probability density function (PDF) or probability mass function (PMF), the cumulative distribution function (CDF), and the quantile function. These examples demonstrate how these functions work together using the normal distribution (continuous) and Poisson distribution (discrete).
The visualizations show both forward operations (finding probabilities from values using the CDF) and inverse operations (finding values from probabilities using quantiles). This computational perspective reinforces the theoretical relationships discussed in the main chapter.
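Before plotting, a minimal sanity check (a sketch using the Distributions.jl functions imported above; the names d and p are just for illustration) shows how the quantile function inverts the CDF:
d = Normal(0, 1)
p = cdf(d, 1.0)                # forward: P(X ≤ 1) ≈ 0.841
quantile(d, p)                 # inverse: recovers 1.0 (up to floating-point error)
cdf(d, 1.0) - cdf(d, -1.0)     # P(-1 ≤ X ≤ 1) ≈ 0.683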
We start by defining some helper functions.
function add_pdf_area!(ax, dist, a, b; color=(:orange, 0.4), label=nothing)
x_fill = a:0.01:b
pdf_fill = pdf.(dist, x_fill)
band!(ax, x_fill, zeros(length(x_fill)), pdf_fill, color=color, label=label)
prob = cdf(dist, b) - cdf(dist, a)
return prob
end
function add_forward_cdf!(ax, dist, x_point; color=:red, x_min=-4)
y_point = cdf(dist, x_point)
scatter!(ax, [x_point], [y_point], color=color, markersize=8)
lines!(ax, [x_point, x_point], [0, y_point], color=color, linestyle=:dash)
lines!(ax, [x_min, x_point], [y_point, y_point], color=color, linestyle=:dash)
return y_point
end
function add_inverse_cdf!(ax, dist, p_target; color=:green, x_min=-4)
x_inv = quantile(dist, p_target)
y_actual = cdf(dist, x_inv)
scatter!(ax, [x_inv], [y_actual], color=color, markersize=8)
lines!(ax, [x_inv, x_inv], [0, y_actual], color=color, linestyle=:dash)
lines!(ax, [x_min, x_inv], [p_target, p_target], color=color, linestyle=:dash)
return x_inv, y_actual
end
These helpers annotate the plots. add_pdf_area! shades the area under the PDF curve between bounds \(a\) and \(b\); by convention, a function whose name ends in ! modifies its arguments (in this case, a plot axis), and the PDF is evaluated with broadcasting (the . operator). The shaded probability is the CDF difference \(P(a \leq X \leq b) = F(b) - F(a)\). add_forward_cdf! illustrates the forward CDF operation: given \(x\), find \(F(x) = P(X \leq x)\). add_inverse_cdf! illustrates the inverse operation: given a probability \(p\), find \(x\) such that \(F(x) = p\).
Next, we use Makie.jl to create the plot.
function plot_normal_pdf_cdf()
μ, σ = 0.0, 1.0
x_range = -4:0.01:4
normal_dist = Normal(μ, σ)
fig = Figure(size=(900, 400))
ax1 = Axis(fig[1, 1],
xlabel=L"x",
ylabel=L"\text{Density } $p(x)$",
title="Normal(0, 1) PDF")
lines!(ax1, x_range, pdf.(normal_dist, x_range),
color=:blue, linewidth=2, label=L"p(x)")
prob_area = add_pdf_area!(ax1, normal_dist, -1, 1,
label=L"P(-1 \leq X \leq 1)")
text!(ax1, -0.6, 0.125,
text=L"\text{Area} = %$(round(prob_area, digits=3))",
fontsize=14, color=:black)
axislegend(ax1, position=:rt)
ax2 = Axis(fig[1, 2],
xlabel=L"x",
ylabel=L"\text{Probability } $F(x)$",
title="Normal CDF: Forward and Inverse")
lines!(ax2, x_range, cdf.(normal_dist, x_range),
color=:blue, linewidth=2, label=L"$F(x)$")
y_point = add_forward_cdf!(ax2, normal_dist, 1.0)
text!(ax2, 1.2, y_point - 0.1,
text=L"F(1) = %$(round(y_point, digits=3))", color=:red)
x_inv, _ = add_inverse_cdf!(ax2, normal_dist, 0.25)
text!(ax2, x_inv - 0.8, 0.35,
text=L"F^{-1}(0.25) = %$(round(x_inv, digits=2))", color=:green)
axislegend(ax2, position=:rb)
return fig
end
fig_normal = plot_normal_pdf_cdf()
fig_normal
The Normal(0, 1) distribution object comes from Distributions.jl. The PDF curve uses vectorized evaluation over x_range, and the CDF panel shows the cumulative probabilities.
The normal distribution demonstrates these concepts for continuous variables, where probabilities correspond to areas under smooth curves. Next, we examine the Poisson distribution for discrete random variables, where probabilities correspond to point masses and the CDF
becomes a step function.
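The same identities hold in the discrete case, where the CDF accumulates point masses. A quick check with the Poisson(3) distribution used below (again a sketch; d is just an illustrative name):
d = Poisson(3)
pdf(d, 2)            # probability mass P(X = 2)
sum(pdf.(d, 0:2))    # P(X ≤ 2) by summing point masses
cdf(d, 2)            # the same value from the step-function CDF
quantile(d, 0.4)     # smallest integer x with F(x) ≥ 0.4, here 2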
We define some more helper functions for discrete distributions.
function plot_pmf_stems!(ax, dist, x_range; color=:blue, linewidth=3, markersize=8)
pmf_vals = pdf.(dist, x_range)
for (i, x) in enumerate(x_range)
lines!(ax, [x, x], [0, pmf_vals[i]], color=color, linewidth=linewidth)
scatter!(ax, [x], [pmf_vals[i]], color=color, markersize=markersize)
end
return pmf_vals
end
function highlight_pmf_mass!(ax, dist, x_range; color=:orange)
pmf_vals = pdf.(dist, x_range)
for (i, x) in enumerate(x_range)
lines!(ax, [x, x], [0, pmf_vals[i]], color=color, linewidth=5)
scatter!(ax, [x], [pmf_vals[i]], color=color, markersize=10)
end
return sum(pmf_vals)
end
function plot_discrete_cdf!(ax, dist, x_range; color=:blue, linewidth=2, markersize=6)
cdf_vals = cdf.(dist, x_range)
for i in 1:(length(x_range)-1)
lines!(ax, [x_range[i], x_range[i+1]], [cdf_vals[i], cdf_vals[i]],
color=color, linewidth=linewidth)
end
scatter!(ax, x_range, cdf_vals, color=color, markersize=markersize)
return cdf_vals
end
function add_discrete_forward_cdf!(ax, dist, x_point; color=:red, x_min=0)
y_point = cdf(dist, x_point)
scatter!(ax, [x_point], [y_point], color=color, markersize=10)
lines!(ax, [x_point, x_point], [0, y_point], color=color, linestyle=:dash)
lines!(ax, [x_min, x_point], [y_point, y_point], color=color, linestyle=:dash)
return y_point
end
function add_discrete_inverse_cdf!(ax, dist, p_target; color=:green, x_min=0)
x_inv = quantile(dist, p_target)
y_actual = cdf(dist, x_inv)
scatter!(ax, [x_inv], [y_actual], color=color, markersize=10)
lines!(ax, [x_min, x_inv], [p_target, p_target], color=color, linestyle=:dash)
return x_inv, y_actual
end
For a discrete distribution, pdf() returns the probability mass \(P(X = x)\), and the CDF is a step function that takes constant values between integers.
Now we create the plot for the Poisson distribution.
function plot_poisson_pmf_cdf()
λ = 3.0
x_range = 0:10
poisson_dist = Poisson(λ)
fig = Figure(size=(900, 400))
ax1 = Axis(fig[1, 1],
xlabel=L"x",
ylabel=L"P(X = x)",
title=L"\text{Poisson}(3) \text{ PMF}",
xticks=1:10)
plot_pmf_stems!(ax1, poisson_dist, x_range)
prob_mass = highlight_pmf_mass!(ax1, poisson_dist, 0:2)
text!(ax1, 6, 0.15,
text=L"P(X \leq 2) = %$(round(prob_mass, digits=3))",
fontsize=14, color=:black)
ax2 = Axis(fig[1, 2],
xlabel=L"x",
ylabel=L"\text{Probability } F(x)",
title=L"\text{Poisson CDF: Forward and Inverse}",
xticks=1:10)
plot_discrete_cdf!(ax2, poisson_dist, x_range)
y_point = add_discrete_forward_cdf!(ax2, poisson_dist, 4)
text!(ax2, 4.2, y_point - 0.1,
text=L"F(4) = %$(round(y_point, digits=3))", color=:red)
x_inv, _ = add_discrete_inverse_cdf!(ax2, poisson_dist, 0.4)
text!(ax2, x_inv - 1.5, 0.5,
text=L"F^{-1}(0.4) = %$(Int(x_inv))", color=:green)
return fig
end
fig_poisson = plot_poisson_pmf_cdf()
fig_poisson
Having explored the foundational concepts of probability distributions, we now turn to statistical inference—the process of learning about unknown parameters from observed data.
The coin flip example provides a pedagogically clean introduction to statistical inference. We observe a series of coin flips and want to infer the probability \(\theta\) of heads. This simple setup allows us to demonstrate maximum likelihood estimation, Bayesian inference, and the relationship between analytical and computational approaches.
The likelihood function shows how plausible different values of \(\theta\) are given our observed data. For the Binomial
model, this creates a clear peak at the observed proportion of heads.
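Written out, for \(y\) heads in \(n\) independent flips the likelihood is the binomial probability viewed as a function of \(\theta\):
\[ L(\theta) = P(y \mid \theta) = \binom{n}{y}\, \theta^{y} (1 - \theta)^{n - y}. \]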
function plot_binomial_likelihood(y, n)
θ_grid = 0.01:0.01:0.99
# Using Distributions.jl
likelihood_vals = [pdf(Binomial(n, θ), y) for θ in θ_grid]
fig = Figure(size=(800, 500))
ax = Axis(fig[1, 1],
xlabel=L"$\theta$ (probability of heads)",
ylabel="Likelihood",
title="Likelihood for $(y) heads in $(n) flips")
lines!(ax, θ_grid, likelihood_vals, label="Full likelihood", linewidth=2, color=:blue)
# Mark the MLE
mle = y / n
vlines!(ax, [mle], color=:black, linestyle=:dot, linewidth=2, label="MLE = $(round(mle, digits=2))")
axislegend(ax, position=:lt)
return fig
end
# Example with 7 heads in 10 flips
fig_likelihood = plot_binomial_likelihood(7, 10)
fig_likelihood
The likelihood peaks at \(\theta = 0.7\) when we observe 7 heads out of 10 flips. This demonstrates that the maximum likelihood estimate is simply the observed proportion.
Maximum likelihood estimation finds the parameter value that makes the observed data most probable. For the coin flip problem, we can solve this analytically by taking the derivative of the log-likelihood and setting it to zero. The following shows how the MLE
varies with different numbers of observed heads:
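For reference, the analytical argument is short: differentiate the log-likelihood and set the derivative to zero,
\[ \log L(\theta) = y \log \theta + (n - y)\log(1 - \theta) + \text{const}, \qquad \frac{y}{\hat{\theta}} - \frac{n - y}{1 - \hat{\theta}} = 0 \;\Rightarrow\; \hat{\theta} = \frac{y}{n}. \]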
function demonstrate_mle_estimation(y_values, n)
θ_grid = 0.01:0.01:0.99
fig = Figure(size=(800, 600))
for (i, y) in enumerate(y_values)
ax = Axis(fig[div(i - 1, 2)+1, ((i-1)%2)+1],
xlabel=L"\theta",
ylabel="Log-likelihood",
title="$(y)/$(n) heads")
log_likelihood = [y * log(θ) + (n - y) * log(1 - θ) for θ in θ_grid]
lines!(ax, θ_grid, log_likelihood, linewidth=2, color=:blue)
mle = y / n
vlines!(ax, [mle], color=:red, linestyle=:dash, linewidth=2,
label="MLE")
if i == 1
axislegend(ax, position=:cb)
end
end
return fig
end
# Show MLE for different outcomes
fig_mle = demonstrate_mle_estimation([3, 5, 7, 9], 10)
fig_mle
MLE for 3, 5, 7, and 9 heads out of 10 flips.
Each plot shows the log-likelihood function as a smooth curve with its maximum clearly indicated. Notice that the MLE
is always at the observed proportion \(\frac{y}{n}\), confirming our analytical derivation.
Bayesian inference treats the parameter \(\theta\) as a random variable with its own probability distribution. We start with a prior belief about \(\theta\), observe data, and update our beliefs to get the posterior distribution.
For the coin flip problem with a Beta
prior, we can compute the posterior analytically using conjugacy. The posterior parameters are simply the prior parameters plus the observed successes and failures. This example compares the exact analytical solution with MCMC
sampling.
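In symbols, with a \(\mathrm{Beta}(\alpha, \beta)\) prior and \(y\) successes in \(n\) trials,
\[ \theta \sim \mathrm{Beta}(\alpha, \beta), \quad y \mid \theta \sim \mathrm{Binomial}(n, \theta) \;\Rightarrow\; \theta \mid y \sim \mathrm{Beta}(\alpha + y,\; \beta + n - y), \]
so the uniform Beta(1, 1) prior combined with 7 heads in 10 flips gives a Beta(8, 4) posterior, which is exactly the analytical posterior reported in the output below.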
First we define the Turing.jl
model and the comparison function. See the official Turing.jl
documentation and tutorials.
# Turing model for coin flipping
@model function coin_flip_model(y, n, prior_α, prior_β)
θ ~ Beta(prior_α, prior_β)
y ~ Binomial(n, θ)
return θ
end
The probabilistic model is specified with Turing.jl's @model macro.
coin_flip_model (generic function with 2 methods)
Next, we define a function that computes both the analytical posterior and the MCMC fit.
function compare_analytical_mcmc(y, n; prior_α=1, prior_β=1, n_samples=5000)
# Analytical posterior
post_α = prior_α + y
post_β = prior_β + n - y
analytical_posterior = Beta(post_α, post_β)
# MCMC sampling
model = coin_flip_model(y, n, prior_α, prior_β)
chain = sample(model, NUTS(), n_samples, verbose=false, progress=false)
mcmc_samples = Array(chain[:θ])
# Return data for plotting instead of creating plots
return (analytical_posterior=analytical_posterior, mcmc_samples=mcmc_samples, n_samples=n_samples)
end
# Generate data by calling the same function twice
data_low = compare_analytical_mcmc(7, 10, n_samples=500)
data_high = compare_analytical_mcmc(7, 10, n_samples=50_000)
The analytical posterior follows from Beta-Binomial conjugacy; the MCMC samples are drawn with the No-U-Turn Sampler (NUTS) and extracted from the chain with Array(chain[:θ]).
(analytical_posterior = Distributions.Beta{Float64}(α=8.0, β=4.0), mcmc_samples = [0.8072931234757614; 0.47570867256497573; … ; 0.39836176331365647; 0.9166400407435039;;], n_samples = 50000)
And then we plot the comparison:
# Combine plots
fig_comparison = Figure(size=(1200, 500))
ax1 = Axis(fig_comparison[1, 1],
xlabel=L"$\theta$ (probability of heads)",
ylabel="Density",
title="MCMC with 500 samples")
ax2 = Axis(fig_comparison[1, 2],
xlabel=L"$\theta$ (probability of heads)",
title="MCMC with 50,000 samples")
# Link y-axes for comparison
linkaxes!(ax1, ax2)
# Plot using the same plotting logic for both panels
for (ax, data) in [(ax1, data_low), (ax2, data_high)]
# Plot analytical posterior
θ_grid = 0.01:0.01:0.99
analytical_density = pdf.(data.analytical_posterior, θ_grid)
lines!(ax, θ_grid, analytical_density, label="Analytical",
linewidth=3, color=:blue)
# Plot MCMC histogram
hist!(ax, vec(data.mcmc_samples), bins=40, normalization=:pdf,
color=(:red, 0.6), label="MCMC samples")
# Add summary statistics
analytical_mean = mean(data.analytical_posterior)
mcmc_mean = mean(data.mcmc_samples)
# Vertical lines for means
vlines!(ax, [analytical_mean], color=:blue, linestyle=:dash, alpha=0.8)
vlines!(ax, [mcmc_mean], color=:red, linestyle=:dot, alpha=0.8)
# Add vertical line at the tail cutoff (0.25) used below
vlines!(ax, [0.25], color=:gray, linestyle=:solid, alpha=0.6)
# Add text annotations with statistics
text!(ax, 0.35, maximum(analytical_density) * 0.95,
text=L"\text{Analytical: Mean} = %$(round(analytical_mean, digits=3))",
fontsize=10, color=:blue)
text!(ax, 0.35, maximum(analytical_density) * 0.85,
text=L"\text{MCMC: Mean} = %$(round(mcmc_mean, digits=3))",
fontsize=10, color=:red)
# Calculate P(theta < 0.25) - tail probability
tail = 0.25
analytical_tail_prob = cdf(data.analytical_posterior, tail)
analytical_tail_prob = @sprintf("%.2E", analytical_tail_prob)
mcmc_tail_prob = mean(data.mcmc_samples .< tail)
mcmc_tail_prob = @sprintf("%.2E", mcmc_tail_prob)
text!(ax, 0.05, maximum(analytical_density) * 0.55,
text=L"\text{Analytical}: $P(\theta < %$(tail))$ = %$(analytical_tail_prob)",
fontsize=10, color=:blue)
text!(ax, 0.05, maximum(analytical_density) * 0.45,
text=L"\text{MCMC}: $P(\theta < %$(tail))$ = %$(mcmc_tail_prob)",
fontsize=10, color=:red)
axislegend(ax, position=:rt)
end
fig_comparison
The tail probability is computed analytically from the posterior CDF and empirically as the fraction of MCMC samples below the cutoff.
The results show excellent agreement between analytical and MCMC
approaches. With sufficient samples, MCMC
accurately approximates the true posterior distribution. The comparison also illustrates how MCMC
precision varies across the distribution—central quantities like the mean are estimated more accurately than tail probabilities.
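A rough back-of-the-envelope calculation makes this concrete (a sketch that assumes roughly independent draws; autocorrelation in the chain would inflate both errors, and the names post, N, p_tail, se_mean, and se_tail are just for illustration):
post = Beta(8, 4)                          # analytical posterior from above
N = 50_000                                 # number of posterior draws
p_tail = cdf(post, 0.25)                   # tail probability P(θ < 0.25)
se_mean = std(post) / sqrt(N)              # approximate Monte Carlo error of the posterior mean
se_tail = sqrt(p_tail * (1 - p_tail) / N)  # approximate Monte Carlo error of the tail estimate
(se_mean / mean(post), se_tail / p_tail)   # relative errors: the tail estimate is far less precise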
The coin flip example demonstrated inference for a single parameter using a simple likelihood function. Real-world problems typically involve multiple parameters and more complex relationships between variables. Linear regression provides our first example of multivariate statistical inference while maintaining analytical tractability.
This example demonstrates the progression from curve fitting to probabilistic inference. We’ll examine the same problem through three lenses: minimizing prediction error, maximum likelihood estimation, and Bayesian inference. All three approaches yield the same parameter estimates for the linear model with normal errors, but provide different ways of quantifying uncertainty.
We start by creating synthetic data from a known linear relationship with added noise. This allows us to verify that our inference methods recover the true parameters.
# Set parameters for data generation
Random.seed!(42)
n_obs = 30
true_intercept = 2.0
true_slope = 1.5
true_sigma = 0.8
# Generate predictor variables
x_reg = sort(rand(Uniform(0, 5), n_obs))
# Generate response with linear relationship plus noise
y_reg = true_intercept .+ true_slope .* x_reg .+ rand(Normal(0, true_sigma), n_obs)
# Store true parameter values for comparison
true_params = (intercept=true_intercept, slope=true_slope, sigma=true_sigma)
Let’s visualize our synthetic dataset:
function plot_synthetic_data(x_data, y_data, true_params)
fig = Figure(size=(700, 500))
ax = Axis(fig[1, 1],
xlabel="x",
ylabel="y",
title="Synthetic Linear Regression Dataset")
# Plot data points
scatter!(ax, x_data, y_data, color=:blue, markersize=8, label="Observed data")
# Plot true relationship
x_line = range(minimum(x_data), maximum(x_data), length=100)
y_true = true_params.intercept .+ true_params.slope .* x_line
lines!(ax, x_line, y_true, color=:red, linewidth=3,
label=L"\text{True relationship: } $y = %$(true_params.intercept) + %$(true_params.slope) x$")
# Add noise band to show true sigma
y_upper = y_true .+ 2 * true_params.sigma
y_lower = y_true .- 2 * true_params.sigma
band!(ax, x_line, y_lower, y_upper, color=(:red, 0.2),
label=L"$\pm 2 \sigma$")
axislegend(ax, position=:lt)
return fig
end
fig_data = plot_synthetic_data(x_reg, y_reg, true_params)
fig_data
This visualization shows our synthetic dataset with the true linear relationship and noise level. The scattered points represent our “observations,” while the red line shows the true underlying relationship we’re trying to recover. The shaded band indicates the expected range of observations given the noise level (\(\pm 2\sigma\)).
Without probability theory, we can view regression as finding the line that best fits the data. This requires choosing a loss function to measure fit quality. The most common choice is mean squared error (MSE
), which penalizes large prediction errors more than small ones. This is more of a “machine learning” perspective on the problem, and is one to which we return in the ML chapter.
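Formally, we choose the intercept and slope to minimize
\[ \mathrm{MSE}(\alpha, \beta) = \frac{1}{n}\sum_{i=1}^{n} (y_i - \alpha - \beta x_i)^2, \]
which has the closed-form solution used in the code below:
\[ \hat{\beta} = \frac{\sum_i (x_i - \bar{x})(y_i - \bar{y})}{\sum_i (x_i - \bar{x})^2}, \qquad \hat{\alpha} = \bar{y} - \hat{\beta}\,\bar{x}. \]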
function fit_least_squares(x_data, y_data)
n = length(x_data)
x_bar = mean(x_data)
y_bar = mean(y_data)
# Analytical least squares formulas
slope = sum((x_data .- x_bar) .* (y_data .- y_bar)) / sum((x_data .- x_bar) .^ 2)
intercept = y_bar - slope * x_bar
# Calculate residuals and MSE
y_pred = intercept .+ slope .* x_data
residuals = y_data .- y_pred
mse = mean(residuals .^ 2)
return (intercept=intercept, slope=slope, mse=mse, residuals=residuals, predictions=y_pred)
end
function plot_least_squares_fit(x_data, y_data, fit_results)
fig = Figure(size=(800, 500))
ax = Axis(fig[1, 1],
xlabel="x",
ylabel="y",
title="Least Squares Curve Fitting")
# Plot data points
scatter!(ax, x_data, y_data, color=:blue, markersize=8, label="Data")
# Plot fitted line
x_line = range(minimum(x_data), maximum(x_data), length=100)
y_line = fit_results.intercept .+ fit_results.slope .* x_line
lines!(ax, x_line, y_line, color=:red, linewidth=2,
label=L"\text{Fitted line:} $y = %$(round(fit_results.intercept, digits=2)) + %$(round(fit_results.slope, digits=2)) x$")
# Show residuals as vertical lines
for i in eachindex(x_data)
lines!(ax, [x_data[i], x_data[i]], [y_data[i], fit_results.predictions[i]],
color=:gray, linestyle=:dash, alpha=0.7, linewidth=1)
end
# Add true line for comparison
y_true_line = true_params.intercept .+ true_params.slope .* x_line
lines!(ax, x_line, y_true_line, color=:green, linewidth=2, linestyle=:dot,
label=L"\text{True line: } $y = %$(true_params.intercept) + %$(true_params.slope)x$")
axislegend(ax, position=:lt)
return fig
end
# Fit the model and create visualization
ols_fit = fit_least_squares(x_reg, y_reg)
fig_curve_fit = plot_least_squares_fit(x_reg, y_reg, ols_fit)
fig_curve_fit
The least squares method finds parameter estimates close to the true values. The residual lines visualize the model’s prediction errors, with shorter lines indicating better fit.
The curve fitting approach works but provides no way to quantify uncertainty in our parameter estimates. By introducing a probabilistic model, we can use maximum likelihood estimation to both estimate parameters and assess their precision.
For linear regression, we assume the response follows a normal distribution around the linear predictor:
\[ y_i \sim N(\alpha + \beta x_i, \sigma^2) \]
as discussed in the theory section. Using the Optim
package, we can numerically maximize the likelihood function to find the MLE
estimates for \(\alpha\), \(\beta\), and \(\sigma\). In Optim
, all functions are minimized, so we minimize the negative log-likelihood instead.
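Under this model the quantity we minimize is
\[ -\log L(\alpha, \beta, \sigma) = \sum_{i=1}^{n} \left[ \log\!\left(\sigma\sqrt{2\pi}\right) + \frac{(y_i - \alpha - \beta x_i)^2}{2\sigma^2} \right], \]
which the function below accumulates term by term using logpdf.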
function negative_log_likelihood(params, x_data, y_data)
α, β, σ = params
if σ <= 0
return Inf
end
# Calculate predicted values
μ = α .+ β .* x_data
# Sum of negative log-likelihoods for each observation
nll = 0.0
for i in eachindex(y_data)
nll += -logpdf(Normal(μ[i], σ), y_data[i])
end
return nll
end
function find_mle_estimates(x_data, y_data; initial_guess=[0.0, 1.0, 1.0])
# Use optimization to find MLE
result = optimize(params -> negative_log_likelihood(params, x_data, y_data),
initial_guess, NelderMead())
if !Optim.converged(result)
@warn "MLE optimization did not converge"
end
mle_params = Optim.minimizer(result)
return (intercept=mle_params[1], slope=mle_params[2], sigma=mle_params[3])
end
# Find MLE estimates
mle_estimates = find_mle_estimates(x_reg, y_reg)
println("MLE Estimates:")
println(" Intercept: $(round(mle_estimates.intercept, digits=3))")
println(" Slope: $(round(mle_estimates.slope, digits=3))")
println(" Sigma: $(round(mle_estimates.sigma, digits=3))")
println()
println("True Parameters:")
println(" Intercept: $(true_params.intercept)")
println(" Slope: $(true_params.slope)")
println(" Sigma: $(true_params.sigma)")
println()
println("Least Squares Estimates (for comparison):")
println(" Intercept: $(round(ols_fit.intercept, digits=3))")
println(" Slope: $(round(ols_fit.slope, digits=3))")
The MLE is obtained by numerical optimization.
MLE Estimates:
Intercept: 1.92
Slope: 1.545
Sigma: 0.796
True Parameters:
Intercept: 2.0
Slope: 1.5
Sigma: 0.8
Least Squares Estimates (for comparison):
Intercept: 1.92
Slope: 1.545
The MLE
estimates for the intercept and slope match the least squares results exactly. This confirms the theoretical result that for linear regression with normal errors, maximum likelihood estimation is equivalent to minimizing mean squared error. The MLE
approach additionally estimates \(\sigma\), quantifying the noise level in the data.
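As a quick check (a sketch using the residuals stored in ols_fit above; sigma_from_residuals is just an illustrative name), for normal errors the MLE of \(\sigma\) is the root mean squared residual, with a \(1/n\) rather than \(1/(n-1)\) normalization, so it should be close to the value printed above:
# MLE of sigma equals the RMS of the least-squares residuals (1/n normalization)
sigma_from_residuals = sqrt(mean(ols_fit.residuals .^ 2))
round(sigma_from_residuals, digits=3)      # expected to be close to mle_estimates.sigma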
Bayesian inference treats all parameters as random variables with probability distributions. We specify prior beliefs, observe data, and compute posterior distributions that quantify parameter uncertainty.
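Written out, the model fitted below is
\[ \alpha \sim N(0, 5^2), \qquad \beta \sim N(0, 5^2), \qquad \sigma \sim \mathrm{Exponential}(1), \qquad y_i \sim N(\alpha + \beta x_i,\; \sigma^2), \]
where the squares appear because Normal in Distributions.jl is parameterized by the standard deviation.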
# Define Turing.jl model for Bayesian linear regression
@model function linear_regression_model(x, y)
n = length(y)
# Priors
α ~ Normal(0, 5)
β ~ Normal(0, 5)
σ ~ Exponential(1)
# Likelihood
μ = α .+ β .* x
for i in 1:n
y[i] ~ Normal(μ[i], σ)
end
end
function run_bayesian_regression(x_data, y_data; n_samples=5000)
# Create model instance
model = linear_regression_model(x_data, y_data)
# Sample from posterior using NUTS
chain = sample(model, NUTS(), n_samples, verbose=false, progress=false)
return chain
end
# Run Bayesian inference
bayesian_chain = run_bayesian_regression(x_reg, y_reg)
# Extract posterior samples
α_samples = vec(Array(bayesian_chain[:α]))
β_samples = vec(Array(bayesian_chain[:β]))
σ_samples = vec(Array(bayesian_chain[:σ]))
# Calculate posterior summaries
bayesian_estimates = (
intercept=(mean=mean(α_samples), std=std(α_samples)),
slope=(mean=mean(β_samples), std=std(β_samples)),
sigma=(mean=mean(σ_samples), std=std(σ_samples))
)
println("Bayesian Posterior Summaries:")
println(" Intercept: $(round(bayesian_estimates.intercept.mean, digits=3)) ± $(round(bayesian_estimates.intercept.std, digits=3))")
println(" Slope: $(round(bayesian_estimates.slope.mean, digits=3)) ± $(round(bayesian_estimates.slope.std, digits=3))")
println(" Sigma: $(round(bayesian_estimates.sigma.mean, digits=3)) ± $(round(bayesian_estimates.sigma.std, digits=3))")
The Bayesian regression model is specified with the @model macro and sampled with NUTS.
Bayesian Posterior Summaries:
Intercept: 1.912 ± 0.28
Slope: 1.546 ± 0.101
Sigma: 0.846 ± 0.12
We can also use Turing.jl
to find the MLE
by optimizing the model instead of sampling. This demonstrates how the same probabilistic model can be used for both optimization-based and sampling-based inference.
# Use the same Turing model but optimize instead of sample
function find_turing_mle(x_data, y_data)
model = linear_regression_model(x_data, y_data)
# Find MAP estimate using optimization
mle_result = optimize(model, MAP())
# Extract parameter estimates
estimates = (
intercept=mle_result.values[:α],
slope=mle_result.values[:β],
sigma=mle_result.values[:σ]
)
return estimates
end
# Find MLE using Turing optimization
turing_mle = find_turing_mle(x_reg, y_reg)
println("Turing MLE Estimates:")
println(" Intercept: $(round(turing_mle.intercept, digits=3))")
println(" Slope: $(round(turing_mle.slope, digits=3))")
println(" Sigma: $(round(turing_mle.sigma, digits=3))")
println()
println("Comparison with Manual MLE:")
println(" Intercept difference: $(round(abs(turing_mle.intercept - mle_estimates.intercept), digits=6))")
println(" Slope difference: $(round(abs(turing_mle.slope - mle_estimates.slope), digits=6))")
println(" Sigma difference: $(round(abs(turing_mle.sigma - mle_estimates.sigma), digits=6))")
We use MAP() (maximum a posteriori), which reduces to the MLE when the priors are flat; with the weakly informative priors used here it is only an approximation to the MLE.
Turing MLE Estimates:
Intercept: 1.916
Slope: 1.546
Sigma: 0.785
Comparison with Manual MLE:
Intercept difference: 0.004083
Slope difference: 0.001082
Sigma difference: 0.010205
This approach shows that Turing.jl provides a unified framework: the same model definition can be used for both optimization-based point estimation and MCMC sampling. The results closely match our manual MLE implementation, demonstrating the consistency between the two computational approaches; the small numerical differences arise because MAP() incorporates the weakly informative priors and because of optimization tolerances.
All three approaches provide similar point estimates, but differ in their treatment of uncertainty:
function create_regression_comparison_plot()
fig = Figure(size=(1200, 800))
# Data and fits subplot
ax1 = Axis(fig[1, 1:2],
xlabel="x",
ylabel="y",
title="Regression Results: All Methods")
# Plot data
scatter!(ax1, x_reg, y_reg, color=:black, markersize=6, label="Data")
# Plot true relationship
x_line = range(minimum(x_reg), maximum(x_reg), length=100)
y_true = true_params.intercept .+ true_params.slope .* x_line
lines!(ax1, x_line, y_true, color=:green, linewidth=3,
label="True: y = $(true_params.intercept) + $(true_params.slope)x")
# Plot least squares fit
y_ols = ols_fit.intercept .+ ols_fit.slope .* x_line
lines!(ax1, x_line, y_ols, color=:blue, linewidth=2,
label="Least squares")
# Plot MLE fit
y_mle = mle_estimates.intercept .+ mle_estimates.slope .* x_line
lines!(ax1, x_line, y_mle, color=:red, linewidth=2, linestyle=:dash,
label="MLE")
# Plot Bayesian posterior mean and credible interval
y_bayes = bayesian_estimates.intercept.mean .+ bayesian_estimates.slope.mean .* x_line
lines!(ax1, x_line, y_bayes, color=:purple, linewidth=2, linestyle=:dot,
label="Bayesian mean")
# Calculate 95% credible interval for predictions
n_pred_samples = min(1000, length(α_samples))
y_pred_samples = zeros(length(x_line), n_pred_samples)
for i in 1:n_pred_samples
y_pred_samples[:, i] = α_samples[i] .+ β_samples[i] .* x_line
end
# Calculate percentiles for credible band
y_lower = [quantile(y_pred_samples[i, :], 0.025) for i in 1:length(x_line)]
y_upper = [quantile(y_pred_samples[i, :], 0.975) for i in 1:length(x_line)]
# Add credible interval band
band!(ax1, x_line, y_lower, y_upper, color=(:purple, 0.2),
label="95% credible interval")
axislegend(ax1, position=:lt)
# Posterior distributions subplot
ax2 = Axis(fig[2, 1],
xlabel=L"Intercept $\alpha$",
ylabel="Density",
title="Parameter Uncertainty")
hist!(ax2, vec(α_samples), bins=30, normalization=:pdf,
color=(:purple, 0.6), label="Posterior")
vlines!(ax2, [true_params.intercept], color=:green, linewidth=2,
label="True value")
vlines!(ax2, [mle_estimates.intercept], color=:red, linewidth=2,
label="MLE")
axislegend(ax2, position=:rt)
ax3 = Axis(fig[2, 2],
xlabel=L"Slope $\beta$",
ylabel="Density")
hist!(ax3, vec(β_samples), bins=30, normalization=:pdf,
color=(:purple, 0.6))
vlines!(ax3, [true_params.slope], color=:green, linewidth=2)
vlines!(ax3, [mle_estimates.slope], color=:red, linewidth=2)
return fig
end
fig_comparison = create_regression_comparison_plot()
fig_comparison
This comparison reveals several important points. All three methods give nearly identical point estimates, but their computational cost differs: least squares has a closed-form solution, MLE requires numerical optimization, and MCMC sampling is the most expensive. The Bayesian approach excels when we need to propagate parameter uncertainty through subsequent calculations or decision processes. The credible interval demonstrates how parameter uncertainty translates into prediction uncertainty, a crucial consideration for risk assessment and decision-making under uncertainty.