# Dynamic Discrete Choice notebook¶

Trying to replicate Rust (1987) results using Rust (1987) and Hotz-Miller (1993) approaches

In :
using Statistics, LinearAlgebra, GLM
using Random, Distributions, StatsBase
using DataFrames, DataFramesMeta, ShiftedArrays
using Optim, NLopt, NLSolversBase
using Plots
using DelimitedFiles, CSVFiles
Threads.nthreads()

Out:
13

Create a function that computes logsumexp trying to avoid overflows

In :
function logsumexp(mat)
A = mean(mat);
out = log.(sum(exp.(mat .- A), dims = 1)) .+ A  # for matrices (not vectors), may need to specify dimension to sum over (usually dims = 1)
return out
end

# function logsumexp(mat)
#     A = mean(mat);
#     out = log.(sum(exp.(mat .- A))) .+ A  # for matrices (not vectors), may need to specify dimension to sum over (usually dims = 1)
#     return out
# end


Some cleaning code

In :
# df1 = readdlm("g870.csv", ',', Float64);
# df2 = readdlm("rt50.csv", ',', Float64);
# df3 = readdlm("t8h203.csv", ',', Float64);
# df8 = readdlm("a530875.csv", ',', Float64);

# df1 = reshape(df, 36, 15)'
# writedlm("g870.csv", df, ",")

# df2 = reshape(df2,60,4)'
# writedlm("rt50.csv", df2, ",")

# df3 = reshape(df3, 81, 48)'
# writedlm("t8h203.csv", df3, ",")

# df8 = reshape(df8, 128, 37)'
# writedlm("a530875.csv", df8, ",")

In :
# load the dataframe
# - df1.csv are groups 1-3 in Rust's paper
# - df2.csv is group 4
# - df3.csv are groups 1-4

df = load("df3.csv") |> DataFrame;

In :
# compute empirical distribution of state transitions, using the discretized states
# Note that the assumption in Rust's paper is that the transition is at most 2 states
# - first obtain the transition probability from choosing action 0 (no replace)
J = 2; # number of choices
B = 90; # number of states
β = .9999;  # discount factor
θ = [3; 10];  # initial guess of [θ_1, RC]
θ_3 = zeros(3, J)

trans0 = @linq df |>
where(:i.== 0) |>
transform(p = :delta_x .==0,  q = (:delta_x .== 1));
p = mean(trans0.p);
q = mean(trans0.q);
θ_3[:, 1] = [p; q; 1 - p - q];

# - obtain transition probabilities from choosing action 1 (replace)
trans1 = @linq df |>
where(:i.== 1) |>
transform(p = :delta_x .==0,  q = (:delta_x .== 1));
p1 = mean(trans1.p);
q1 = mean(trans1.q);
θ_3[:, 2] = [p1; q1; 1 - p1 - q1];


## Coding the NFP Algorithm¶

In :
# outer loop - partial likelihood taking the state transition parameters
# - θ_3 as given.
# θ: structural parameters of interest to search over. In Rust's case this is
# - θ_11 and RC
# θ_3: 4x1 vector of transition parameters, first two indicate 0 and +1 state transition for action i = 0
# - second two indictate the 0 and +1 state transitions for action i = 1
# data: data
# B: number of states (90)
# J: number of actions (2)
# β: discount factor
function compute_ll(θ, data, B, G, J, β)

# we only need the state and action from the dataset
x = data.x
i = data.i

# given θ, iterate the inner loop
EV = compute_EV(θ, B, G, J, β)
maxEV = maximum(EV)

# initialize Cond. choice prob.
CCP = zeros(J, B)

u = compute_utility(θ, B, J);

# use logit formula to calculate conditional choice prob
for y = 1:B
denom = sum(exp.(u[:, y] .+ β * EV .- maxEV), dims = 1)
CCP[1,y] = exp.(u[1, y] .+ β .* EV[1, y] .- maxEV)./denom[1,y]
end
# CCP for the second action is just "1 - CCP" of the first action given state x
CCP[2,:] = 1 .- CCP[1,:]

# compute the partial log likelihood
ll = 0
for n = 1:length(x)
x_t = x[n]
i_t = i[n]
ll = ll + log(CCP[i_t + 1, x_t + 1])
end
return -ll
end

# # outer loop - partial likelihood taking the state transition parameters
# # - θ_3 as given.
# # θ: structural parameters of interest to search over. In Rust's case this is
# # - θ_11 and RC
# # θ_3: 4x1 vector of transition parameters, first two indicate 0 and +1 state transition for action i = 0
# # - second two indictate the 0 and +1 state transitions for action i = 1
# # data: data
# # B: number of states (90)
# # J: number of actions (2)
# # β: discount factor
# function compute_ll(θ, θ_3, data, B, J, β)

#     # we only need the state and action from the dataset
#     x = data.x
#     i = data.i

#     # construct parameter vector to feed into inner loop
#     θ = [θ; θ_3]

#     # given θ, iterate the inner loop
#     EV = compute_EV(θ, B, J, β)
#     maxEV = maximum(EV)

#     # initialize Cond. choice prob.
#     CCP = zeros(J, B)

#     # use logit formula to calculate conditional choice prob
#     for x = 1:B
#         u = compute_utility(θ, x, J);
#         denom = sum(exp.(u .+ β * EV .- maxEV), dims = 1)
#         CCP[1,x] = exp.(u .+ β .* EV[1, x] .- maxEV)./denom[1,x]
#     end
#     # CCP for the second action is just "1 - CCP" of the first action given state x
#     CCP[2,:] = 1 .- CCP[1,:]

#     # compute the partial log likelihood
#     ll = 0
#     for n = 1:length(x)
#         x_t = x[n]
#         i_t = i[n]
#         ll = ll + log(CCP[i_t + 1, x_t + 1])
#     end
#     return -ll
# end


Note that because of discreteness of our state space, the equation of interest (4.14) can be simplified:

\begin{align*} EV(x, i) &= \int_y \log[ \sum_j \exp[u(y,j,\theta_1) + \beta EV(y, j)] \; p(dy|x, i, \theta_3) \\ &= p \int_x^{x + 5000} \log[ \sum_j \exp[u(y,j,\theta_1) + \beta EV(y, j)] \; dy \\ &\quad + q \int_{x + 5000}^{x + 10000} \log[ \sum_j \exp[u(y,j,\theta_1) + \beta EV(y, j)] \; dy \\ &\quad + (1 - p - q)\int_{x + 5000}^{\infty} \log[ \sum_j \exp[u(y,j,\theta_1) + \beta EV(y, j)] \; dy \end{align*}

But notice that the integral over $dy$ doesn't really matter because we have discretized the states for every 5000 miles. So we can replace $y$ with current state $x$ plus however many discrete states we will transition as implied by the transition probability and drop the integral

In :
# computes the inner fixed point for EV
# θ: vector of parameters, includes state transition parameters (unlike the outer loop θ)
# B: number of states (90) (note that they are indexed 1:90, but the realizations are 0:89)
# J: number of actions (2)
# β: discount factor

function compute_EV(θ, B, G, J, β)
tol = 100

# initial guess of EV
EV_old = zeros(J, B)

while tol > 1e-4 # good enough tolerance level, set 1e-6 if desired
EV_new = zeros(J, B)

for j = 1:J # iterate over actions
EV_new[j, :] = logsumexp(compute_utility(θ, B, J) + β * EV_old) * G[:, :, j]'
end
tol = maximum(abs.(EV_new - EV_old))
EV_old = EV_new
end
return EV_old

end

# function compute_EV(θ, B, J, β)
# computes the inner fixed point for EV
# θ: vector of parameters, includes state transition parameters (unlike the outer loop θ)
# B: number of states (90) (note that they are indexed 1:90, but the realizations are 0:89)
# J: number of actions (2)
# β: discount factor
#     tol = 100

#     θ_3 = zeros(2, J) # two parameters indexing transitions, J actions

#     θ_3[:, 1] = θ[3:4] # transition parameters for action 0
#     θ_3[:, 2] = θ[5:6] # transition parameters for action 1

#     # initial guess of EV
#     EV_old = zeros(J, B)

#     while tol > 1e-4 # good enough tolerance level, set 1e-6 if desired
#         EV_new = zeros(J, B)
#         for j = 1:J # iterate over actions

#             p = θ_3[1, j] # given action, obtain transition probabilities
#             q = θ_3[2, j]

#             for x = 1:B # iterate over states
#                 if j == 1
#                 # if don't replace engine, we can transition 0, 1 or 2 states
#                 # - integral over the state is dropped because of discreteness of states and transitions
#                 # - minimum function exist because we have a finite number of states
#                 # - multiplying by the transition probabilities is essentially integrating over p(dy|⋅) in equation 4.14
#                 # -- because of the discrete states/transition nature

#                 EV_new[j, x] =  p * logsumexp(compute_utility(θ, x-1, J) .+ β*EV_old[:,x]) +
#                                 q * logsumexp(compute_utility(θ, minimum([x, B-1]), J) .+ β*EV_old[:, minimum([x+1, B])]) +
#                                 (1 - p - q) * logsumexp(compute_utility(θ, minimum([x + 1, B - 1]), J) .+ β*EV_old[:, minimum([x+2, B])])
#                 else
#                 # if replace engine, state must transition back to 0, 1, or 2 next period
#                 EV_new[j, x] =  p * logsumexp(compute_utility(θ, 0, J) .+ β*EV_old[:,1]) +
#                                 q * logsumexp(compute_utility(θ, 1, J) .+ β*EV_old[:,2]) +
#                                 (1 - p - q) * logsumexp(compute_utility(θ, 2, J) .+ β*EV_old[:,3])
#                 end
#             end
#         end
#         tol = maximum(abs.(EV_new - EV_old))
#         EV_old = EV_new
#     end
#     return (EV_old)

# end

Out:
compute_EV (generic function with 1 method)

We let $c = 0.001 \theta_1 x$ in order to attempt to replicate Table IX

In :
# θ: params
# x: state, in {0 ,.., 89}
# J: number of choices
# out: J x 1 vector of utilities, each element corresponding to a specific choice
function compute_utility(θ, B, J)
θ_1 = θ  # since cost fn only has 1 param
R = θ
u = zeros(J,B)

# let c(x, θ_1) = 0.001θ_1x
for x = 0:B-1
for i = 0: J-1
u[i+1, x+1] = i * -R - (1 - i) * 0.001 * θ_1 * x
end
end

return u
end

# # θ: params
# # x: state, in {0 ,.., 89}
# # J: number of choices
# # out: J x 1 vector of utilities, each element corresponding to a specific choice
# function compute_utility(θ, x, J)
#     θ_1 = θ  # since cost fn only has 1 param
#     R = θ
#     u = zeros(J,1)

#     # let c(x, θ_1) = 0.001θ_1x
#     for i = 0: J-1
#         u[i+1] = i * -R - (1 - i) * 0.001 * θ_1 * x
#     end

#     return u
# end

Out:
compute_utility (generic function with 1 method)
In :
# create the transition matrices
function build_transition_mx(θ_3, B, J)
G = zeros(B, B, J);

for x = 1:B
G[x, 1:3, 2] = θ_3[:, 2]
for y = x:x+2
G[x, minimum([y, B]), 1] = G[x,minimum([y, B]),1] + θ_3[y - x + 1, 1]
end
end

return G
end

Out:
build_transition_mx (generic function with 1 method)

## Solving for structural parameters¶

We have already gotten the $\theta_3$ parameters from the data, so the next step is to take those as given and solve the partial log-likeihood \begin{align*} \sum_{t = 1}^T \log(Pr(i_t|x_t;\theta)) \end{align*}

Given the observed actions $i_t$ and states $x_t$ from the data. To do this, we compute the conditional choice probability implies by the NFP algorithm and then search over parameters $\theta_1, RC$ that return the CCP that maximizes the likelihood.

In :
# a general framework if you want to have multiple optimizatioθations
G = build_transition_mx(θ_3, B, J)

nruns = 1;
res = zeros(length(θ), nruns)
minf = zeros(nruns)

opt = Opt(:LN_NELDERMEAD, length(θ))
ftol_rel!(opt,1e-6)
maxeval!(opt, 10000)

for run = 1:nruns
if run >= 2
θ = res[:, run - 1]
end

min_objective!(opt, (vars, y) -> compute_ll(vars, df, B, G, J, β))
@time (minf[run],res[:, run],ret) = NLopt.optimize(opt, θ)
end

res

284.804959 seconds (75.51 M allocations: 456.961 GiB, 13.38% gc time)

Out:
2×1 Array{Float64,2}:
2.5557595103164203
9.817935469327495 

Compare the results to Table IX column 3, seems pretty close.

Now we want to optimize over the full likelihood function starting at our estimates of $\theta_1, RC, \theta_3$ in order to obtain an effcient estimate of these parameters (optional, can stop with the partial LL)

In :
function compute_ll_full(θ, data, B, J, β)
x = data.x
i = data.i
delta_x = data.delta_x

θ_1 = θ[1:2];
θ_3 = zeros(3, J) # two parameters indexing transitions, J actions

θ_3[:, 1] = [θ[3:4]; 1 - sum(θ[3:4])] # obtain state transitions given choice i = 0
θ_3[:, 2] = [θ[5:6]; 1 - sum(θ[5:6])] # " i = 1

# If optimizer wants negative probabilies, throwa large objective function value
if sum(θ_3 .< 0) >= 1
return 1e10
end

# build the transition matrix implied by the parameters
G = build_transition_mx(θ_3, B, J)

CCP = zeros(J, B)

# compute inner loop
EV = compute_EV(θ_1, B, G, J, β)
maxEV = maximum(EV)

u = compute_utility(θ, B, J);

# use logit formula to calculate conditional choice prob
for y = 1:B
denom = sum(exp.(u[:, y] .+ β * EV .- maxEV), dims = 1)
CCP[1,y] = exp.(u[1, y] .+ β .* EV[1, y] .- maxEV)./denom[1,y]
end
# CCP for the second action is just "1 - CCP" of the first action given state x
CCP[2,:] = 1 .- CCP[1,:]

# compute full log likelihood function
ll = 0
for n = 1:length(x)
x_t = x[n]
i_t = i[n]
x_tp1 = delta_x[n] # state transitions

# minimum is because we cap our state transitions at 2
ll = ll + log(CCP[i_t + 1, x_t + 1]) + log(θ_3[minimum([x_tp1, 2]) + 1, i_t + 1])
end
return -ll
end

# function compute_ll_full(θ, data, B, G, J, β)
#     x = data.x
#     i = data.i
#     delta_x = data.delta_x

#     θ_3 = zeros(3, J) # two parameters indexing transitions, J actions

#     θ_3[:, 1] = [θ[3:4]; 1 - sum(θ[3:4])] # obtain state transitions given choice i = 0
#     θ_3[:, 2] = [θ[5:6]; 1 - sum(θ[5:6])] # " i = 1

#     # If optimizer wants negative probabilies, throwa large objective function value
#     if sum(θ_3 .< 0) >= 1
#         return 1e10
#     end

#     CCP = zeros(J, B)

#     # compute inner loop
#     EV = compute_EV(θ, B, J, β)
#     maxEV = maximum(EV)

#     # obtain CCP
#     for x = 1:B
#         u = compute_utility(θ, x, J);
#         denom = sum(exp.(u .+ β * EV .- maxEV), dims = 1)
#         CCP[1,x] = exp.(u .+ β .* EV[1, x] .- maxEV)./denom[1,x]
#     end
#     CCP[2,:] = 1 .- CCP[1,:]

#     # compute full log likelihood function
#     ll = 0
#     for n = 1:length(x)
#         x_t = x[n]
#         i_t = i[n]
#         x_tp1 = delta_x[n] # state transitions

#         # minimum is because we cap our state transitions at 2
#         ll = ll + log(CCP[i_t + 1, x_t + 1]) + log(θ_3[minimum([x_tp1, 2]) + 1, i_t + 1])
#     end
#     return -ll
# end

In :
θ_3hat = reshape(θ_3[1:2,:], 4)
θ_new = [θ; θ_3hat]

opt2 = Opt(:LN_NELDERMEAD, length(θ_new))
ftol_rel!(opt2,1e-6)
maxeval!(opt2, 10000)

min_objective!(opt2, (vars, y) -> compute_ll_full(vars, df, B, J, β))
@time (minf2,minx2,ret2) = NLopt.optimize(opt2, θ_new)

842.770916 seconds (210.90 M allocations: 1.316 TiB, 13.73% gc time)

Out:
(6238.392464832454, [2.588360857386383, 9.91233716850671, 0.35672458975600285, 0.628741985413026, 0.6800480698441189, 0.31993723241917094], :FTOL_REACHED)

Pretty close to the actual Rust results. Note that the results from the partial likelihood are even closer. Differences from this data and Rust's data might be the reason why results are slightly different. Rust has slightly fewer observationss.

# Hotz - Miller Approach¶

## Step 1: Estimate Transition CDF from the data¶

We will estimate the transition CDF (note: not probability density) from the data simply by counting the number of observations with transitions.

Assume that if we do not see any people in a state/action pairing, then they stay in that state if $j = 0$, or go to state 0 if $j = 1$

In :
# transition density
f_hat = zeros(B, B, J)
tmp = by(df, :id, xp = :x => Base.Fix2(lead, 1))
df.xp = tmp.xp

for j = 1:J
df_j = @linq df |>
where(:i .== j-1)

df_j = @linq df_j |>
where(.!ismissing.(:xp))

for x = 0:B-1
N = sum(df_j.x  .== x);

# if don't replace, cannot transition backwards in state. If do replace, can transition backwards
for xp = (x*(j%2)):B-1
df_j2 = @linq df_j |>
where(:x .== x, :xp .== xp)
num = sum(length(df_j2.x))
if(N != 0)
f_hat[x + 1, xp + 1, j] = num/N;
elseif ((xp == x) & (j == 1))
f_hat[x + 1, xp + 1, j] = 1;
elseif ((xp == 0) & (j == 2))
f_hat[x + 1, xp + 1, j] = 1;
else
f_hat[x + 1, xp + 1, j] = 0;
end
end
end
end

In :
# empirical CCP, use a probit because 0 probability is bad
# df.x2 = df.x.^2
# df.x3 = df.x.^3

CCP_probit = glm(@formula(i ~ x), df, Binomial(), ProbitLink())

CCP_hat = zeros(J,B)

pred_ccp_x = DataFrame(x = 0:89, x2 = (0:89).^2, x3 = (0:89).^3)

CCP_hat[2, :] = predict(CCP_probit, pred_ccp_x);
CCP_hat[1, :] = 1 .- CCP_hat[2, :];


The next step is to simulate the choice-specific value function. That is, we want to simulate:

\begin{align*} V_s(i_t, x_t;\theta) &= E_{\epsilon_{i_t, t}}[V(i_t, x_t, \epsilon_t; \theta)] \\ &= -c(x_t, i_t,; \theta_1) + \beta E_{\epsilon}[u(x_{t+1}, i_{t+1}; \theta) + \epsilon_{i_{t+1}, t+1} + \beta E_\epsilon[\ldots]] \end{align*}

Note that EV type 1 assumptions allos us to simplify the expectation: \begin{align*} E[\epsilon| i_t, x_t] = \gamma - log(pr(i_t|x_t)) \end{align*} Where $\gamma = .5772$ is euler's constant. We draw $S$ simulations each of length $T$, and then compute the conditional choice probability implied by the simulated choice-specific value functions.

In :
S = 100; # number of sims
T = 100; # time length of sims

In :
function simulate_V(θ, f_hat, CCP_hat, B, J, S, T)

# we'll eventually take the mean across the third dimension of the simulated V's
V_sim = zeros(J, B, S)
u = compute_utility(θ, B, J)

Threads.@threads for s = 1:S
Random.seed!(29489 + s);
for j = 0:J - 1
for x = 0:B - 1
path = zeros(Int64, T, 2) # T x 2 matrix of the maths that state and choices will evolve
path[1, :] = [x j] # initial starting point
for t = 2:T
# obtain previous time period state and action
x_m1 = path[t - 1, 1]
j_m1 = path[t - 1, 2];

# simulate this period's state and action
xp = sample(0:B-1, Weights(f_hat[x_m1 + 1, :, j_m1 + 1]))
jp = sample(0:J-1, Weights(CCP_hat[:, xp + 1]))

# add to bath
path[t, :] = [xp jp]
end

# backwards iterate on path to compute V_sim
for t = T:-1:2
x_t = path[t, 1];
j_t = path[t, 2];

# recursively compute flow utility + E(ϵ) + β * future util
V_sim[j + 1,x + 1,s] = u[j_t + 1, x_t + 1] + .5772 - log(CCP_hat[j_t + 1, x_t + 1]) + β * V_sim[j + 1,x + 1,s]
end

# in period 1, we do not add E[ϵ]
V_sim[j+1, x+1, s] = u[j + 1, x + 1] + β * V_sim[j + 1,x + 1,s]

end
end
end

# take the mean over the simulations
V_sim = mean(V_sim, dims = 3)

return V_sim
end

Out:
simulate_V (generic function with 1 method)
In :
function Hotz_Miller_obj(θ, f_hat, CCP_hat, B, J, S, T, W)

# compute the simulated V given θ
V_sim = simulate_V(θ, f_hat, CCP_hat, B, J, S, T)

maxV = maximum(V_sim);

CCP = zeros(J, B)

for x = 1:B
denom = sum(exp.(V_sim[:, x] .- maxV), dims = 1)
CCP[1,x] = exp.(V_sim[1, x] .- maxV)./denom
end
# CCP for the second action is just "1 - CCP" of the first action given state x
CCP[2,:] = 1 .- CCP[1,:]

# find difference between computed CCP and "empirical" CCP
g = CCP[1,:] - CCP_hat[1,:]

# weighting matrix is some arbitrary PSD matrix
# I weight states with no observations much less
obj = g' * W * g

return obj
end

Out:
Hotz_Miller_obj (generic function with 1 method)
In :
opt3 = Opt(:LN_NELDERMEAD, length(θ))
ftol_rel!(opt3,1e-6)
maxeval!(opt3, 10000)

# W = Matrix{Float64}(I, B, B)
W = Diagonal([fill(1, 80); fill(0.02, B - 80)])

min_objective!(opt3, (vars, y) -> Hotz_Miller_obj(vars, f_hat, CCP_hat, B, J, S, T, W))
@time (minf3,minx3,ret3) = NLopt.optimize(opt3, θ)

 27.389042 seconds (778.74 M allocations: 92.264 GiB, 49.20% gc time)

Out:
(0.012872781641471749, [3.1542914210585877, 8.658485587802716], :FTOL_REACHED)

Note that the performance with Hotz-Miller is about 15x faster! The actual parameter estimates are a little different, but within a reasonable += 20% range

In [ ]:


In [ ]:


In [ ]:


In [ ]: