Dynamic Discrete Choice notebook

Replicating Rust (1987) using two approaches: the nested fixed point (NFP) algorithm of Rust (1987) and the conditional choice probability approach of Hotz and Miller (1993)

In [1]:
using Statistics, LinearAlgebra, GLM
using Random, Distributions, StatsBase
using DataFrames, DataFramesMeta, ShiftedArrays
using Optim, NLopt, NLSolversBase
using Plots
using DelimitedFiles, CSVFiles
Threads.nthreads()
Out[1]:
13

Create a function that computes logsumexp while avoiding numerical overflow

In [2]:
function logsumexp(mat)
    A = maximum(mat, dims = 1);  # column-wise shift so the largest exponent in each column is exp(0)
    out = log.(sum(exp.(mat .- A), dims = 1)) .+ A  # for J x B matrices this sums over actions (dims = 1)
    return out
end

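A quick check (my addition, not an original cell) that the shifted version stays finite where the naive computation overflows:

big_vals = fill(1000.0, 2, 1)        # one state, two actions, huge values
logsumexp(big_vals)                  # ≈ 1000.6931, finite
log.(sum(exp.(big_vals), dims = 1))  # Inf: exp(1000) overflows Float64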

One-time data-cleaning code that reshapes Rust's raw data files (commented out after the first run)

In [3]:
# df1 = readdlm("g870.csv", ',', Float64);
# df2 = readdlm("rt50.csv", ',', Float64);
# df3 = readdlm("t8h203.csv", ',', Float64);
# df8 = readdlm("a530875.csv", ',', Float64);

# df1 = reshape(df1, 36, 15)'
# writedlm("g870.csv", df1, ",")

# df2 = reshape(df2,60,4)'
# writedlm("rt50.csv", df2, ",")

# df3 = reshape(df3, 81, 48)'
# writedlm("t8h203.csv", df3, ",")

# df8 = reshape(df8, 128, 37)'
# writedlm("a530875.csv", df8, ",")
In [4]:
# load the dataframe
# - df1.csv are groups 1-3 in Rust's paper
# - df2.csv is group 4
# - df3.csv are groups 1-4

df = load("df3.csv") |> DataFrame;
In [5]:
# compute empirical distribution of state transitions, using the discretized states
# Note that the assumption in Rust's paper is that the transition is at most 2 states
# - first obtain the transition probability from choosing action 0 (no replace)
J = 2; # number of choices
B = 90; # number of states
β = .9999;  # discount factor
θ = [3; 10];  # initial guess of [θ_1, RC]
θ_3 = zeros(3, J)

trans0 = @linq df |>
    where(:i .== 0) |>
    transform(p = :delta_x .== 0, q = :delta_x .== 1);
p = mean(trans0.p);
q = mean(trans0.q);
θ_3[:, 1] = [p; q; 1 - p - q];

# - obtain transition probabilities from choosing action 1 (replace)
trans1 = @linq df |>
    where(:i .== 1) |>
    transform(p = :delta_x .== 0, q = :delta_x .== 1);
p1 = mean(trans1.p);
q1 = mean(trans1.q);
θ_3[:, 2] = [p1; q1; 1 - p1 - q1];
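
As a quick sanity check (my addition), each column of θ_3 should be a valid probability distribution over the {0, +1, +2} state transitions:

@assert all(θ_3 .>= 0)                           # no negative probabilities
@assert all(isapprox.(sum(θ_3, dims = 1), 1.0))  # each column sums to one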

Coding the NFP Algorithm

In [6]:
# outer loop - partial log-likelihood, taking the state transitions as given
# - (they enter through the pre-built transition matrices G)
# θ: structural parameters of interest to search over; in Rust's case these are
# - θ_1 and RC
# data: data
# B: number of states (90)
# G: B x B x J array of state-transition matrices, one per action
# J: number of actions (2)
# β: discount factor
function compute_ll(θ, data, B, G, J, β)
    
    # we only need the state and action from the dataset
    x = data.x
    i = data.i
    
    # given θ, iterate the inner loop
    EV = compute_EV(θ, B, G, J, β)
    maxEV = maximum(EV)

    # initialize Cond. choice prob.
    CCP = zeros(J, B)
    
    u = compute_utility(θ, B, J);
    
    # use logit formula to calculate conditional choice probabilities
    # - (vectorized: equivalent to looping state by state, but computes the
    # - J x B numerator matrix and the 1 x B normalizing constants once)
    num = exp.(u .+ β .* EV .- maxEV)
    denom = sum(num, dims = 1)
    CCP[1, :] = num[1, :] ./ vec(denom)
    # CCP for the second action is just "1 - CCP" of the first action given state x
    CCP[2, :] = 1 .- CCP[1, :]
    
    # compute the partial log likelihood
    ll = 0
    for n = 1:length(x)
        x_t = x[n]
        i_t = i[n]
        ll = ll + log(CCP[i_t + 1, x_t + 1])
    end
    return -ll
end

Note that because of the discreteness of our state space, the equation of interest (Rust's equation 4.14) can be simplified:

\begin{align*} EV(x, i) &= \int_y \log\Big[ \sum_j \exp[u(y,j,\theta_1) + \beta EV(y, j)] \Big] \, p(dy|x, i, \theta_3) \\ &= p \int_x^{x + 5000} \log\Big[ \sum_j \exp[u(y,j,\theta_1) + \beta EV(y, j)] \Big] \, dy \\ &\quad + q \int_{x + 5000}^{x + 10000} \log\Big[ \sum_j \exp[u(y,j,\theta_1) + \beta EV(y, j)] \Big] \, dy \\ &\quad + (1 - p - q)\int_{x + 10000}^{\infty} \log\Big[ \sum_j \exp[u(y,j,\theta_1) + \beta EV(y, j)] \Big] \, dy \end{align*}

But the integral over $dy$ doesn't really matter, because we have discretized the state space into 5000-mile bins. So we can replace $y$ with the current state $x$ plus however many discrete states we transition, as implied by the transition probabilities, and drop the integral.
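
Concretely, writing $m(y) = \log \sum_j \exp[u(y, j, \theta_1) + \beta EV(y, j)]$ for the logsumexp term, the discretized fixed point that the inner loop iterates on is (a restatement of the derivation above, with transitions capped at the last state $B - 1$):

\begin{align*} EV(x, 0) &= p\, m(x) + q\, m(x + 1) + (1 - p - q)\, m(x + 2) \\ EV(x, 1) &= p\, m(0) + q\, m(1) + (1 - p - q)\, m(2) \end{align*}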

In [7]:
# computes the inner fixed point for EV
# θ: structural parameters [θ_1, RC] (the transition parameters enter through G)
# B: number of states (90) (note that they are indexed 1:90, but the realizations are 0:89)
# G: B x B x J array of state-transition matrices, one per action
# J: number of actions (2)
# β: discount factor


function compute_EV(θ, B, G, J, β)
    tol = 100

    # flow utilities don't change across iterations, so compute them once
    u = compute_utility(θ, B, J)

    # initial guess of EV
    EV_old = zeros(J, B)

    while tol > 1e-4 # good enough tolerance level, set 1e-6 if desired
        EV_new = zeros(J, B)

        for j = 1:J # iterate over actions
            # logsumexp(⋅) returns the 1 x B vector m(y); right-multiplying by
            # G[:, :, j]' integrates over next states: EV_new[j, x] = Σ_y m(y) G[x, y, j]
            EV_new[j, :] = logsumexp(u + β * EV_old) * G[:, :, j]'
        end
        tol = maximum(abs.(EV_new - EV_old))
        EV_old = EV_new
    end
    return EV_old
end



Out[7]:
compute_EV (generic function with 1 method)

We let $c(x, \theta_1) = 0.001\, \theta_1 x$ in order to attempt to replicate Table IX.
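
Written out (a restatement of the code below), the flow utility is:

\begin{align*} u(x, i; \theta) = \begin{cases} -0.001\,\theta_1 x & \text{if } i = 0 \text{ (don't replace)} \\ -RC & \text{if } i = 1 \text{ (replace)} \end{cases} \end{align*}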

In [8]:
# θ: params [θ_1, RC]
# B: number of states; states take values in {0, ..., B-1}
# J: number of choices
# out: J x B matrix of utilities, u[i+1, x+1] = utility of choice i in state x
function compute_utility(θ, B, J)
    θ_1 = θ[1]  # since cost fn only has 1 param
    R = θ[2]
    u = zeros(J,B)
    
    # let c(x, θ_1) = 0.001θ_1x
    for x = 0:B-1
        for i = 0: J-1     
            u[i+1, x+1] = i * -R - (1 - i) * 0.001 * θ_1 * x
        end
    end
    
    return u
end



Out[8]:
compute_utility (generic function with 1 method)
In [9]:
# create the transition matrices implied by θ_3
# G[x, y, j] = Pr(next state y | current state x, action j - 1)
function build_transition_mx(θ_3, B, J)
    G = zeros(B, B, J);

    for x = 1:B
        # replacing (j = 2) resets the engine, so we transition to states 0, 1, or 2
        G[x, 1:3, 2] = θ_3[:, 2]
        # keeping (j = 1) moves 0, +1, or +2 states; any mass past the last
        # state accumulates at state B
        for y = x:x+2
            G[x, min(y, B), 1] = G[x, min(y, B), 1] + θ_3[y - x + 1, 1]
        end
    end

    return G
end
Out[9]:
build_transition_mx (generic function with 1 method)
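
A quick check (my addition; G_check is a throwaway name): each row of each action's transition matrix should sum to one, including the capped rows near state $B$.

G_check = build_transition_mx(θ_3, B, J)
@assert all(isapprox.(sum(G_check, dims = 2), 1.0))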

Solving for structural parameters

We have already estimated the $\theta_3$ parameters from the data, so the next step is to take those as given and maximize the partial log-likelihood \begin{align*} \sum_{t = 1}^T \log(\Pr(i_t|x_t;\theta)) \end{align*}

over the observed actions $i_t$ and states $x_t$. To do this, we compute the conditional choice probabilities implied by the NFP algorithm and search for the parameters $\theta_1, RC$ whose CCPs maximize the likelihood.

In [10]:
# a general framework in case you want multiple optimization runs, warm-starting each from the previous solution
G = build_transition_mx(θ_3, B, J)

nruns = 1;
res = zeros(length(θ), nruns)
minf = zeros(nruns)

opt = Opt(:LN_NELDERMEAD, length(θ))
ftol_rel!(opt,1e-6)
maxeval!(opt, 10000)

for run = 1:nruns
    if run >= 2
        θ = res[:, run - 1]
    end
    
    # second lambda argument is the gradient placeholder; unused by Nelder-Mead
    min_objective!(opt, (vars, y) -> compute_ll(vars, df, B, G, J, β))
    @time (minf[run],res[:, run],ret) = NLopt.optimize(opt, θ)
end

res
284.804959 seconds (75.51 M allocations: 456.961 GiB, 13.38% gc time)
Out[10]:
2×1 Array{Float64,2}:
 2.5557595103164203
 9.817935469327495 

Compare the results to Table IX, column 3; they are quite close.

Now we optimize the full likelihood function, starting at our estimates of $\theta_1, RC, \theta_3$, in order to obtain efficient estimates of these parameters (optional; one can stop at the partial LL).
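
For reference, the full log-likelihood adds the transition term to the partial log-likelihood above: \begin{align*} \sum_{t = 1}^T \Big[ \log \Pr(i_t|x_t;\theta) + \log p(x_{t+1}|x_t, i_t;\theta_3) \Big] \end{align*} This is the objective that compute_ll_full below evaluates (negated for minimization).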

In [11]:
function compute_ll_full(θ, data, B, J, β)
    x = data.x
    i = data.i
    delta_x = data.delta_x
    
    θ_1 = θ[1:2];     # structural parameters [θ_1, RC]
    θ_3 = zeros(3, J) # three transition probabilities per action
    
    θ_3[:, 1] = [θ[3:4]; 1 - sum(θ[3:4])] # obtain state transitions given choice i = 0
    θ_3[:, 2] = [θ[5:6]; 1 - sum(θ[5:6])] # " i = 1
    
    # If the optimizer proposes negative probabilities, return a large objective function value
    if sum(θ_3 .< 0) >= 1
        return 1e10
    end
    
    # build the transition matrix implied by the parameters
    G = build_transition_mx(θ_3, B, J)
    
    CCP = zeros(J, B)
    
    # compute inner loop
    EV = compute_EV(θ_1, B, G, J, β)
    maxEV = maximum(EV)
    
    u = compute_utility(θ_1, B, J);  # utility depends only on [θ_1, RC]
    
    # use logit formula to calculate conditional choice probabilities (vectorized,
    # - as in compute_ll above)
    num = exp.(u .+ β .* EV .- maxEV)
    denom = sum(num, dims = 1)
    CCP[1, :] = num[1, :] ./ vec(denom)
    # CCP for the second action is just "1 - CCP" of the first action given state x
    CCP[2, :] = 1 .- CCP[1, :]
    
    # compute full log likelihood function
    ll = 0
    for n = 1:length(x)
        x_t = x[n]
        i_t = i[n]
        x_tp1 = delta_x[n] # state transitions
        
        # minimum is because we cap our state transitions at 2
        ll = ll + log(CCP[i_t + 1, x_t + 1]) + log(θ_3[minimum([x_tp1, 2]) + 1, i_t + 1])
    end
    return -ll
end




In [12]:
θ_3hat = reshape(θ_3[1:2,:], 4)  # free transition parameters [p, q] for each action
θ_new = [θ; θ_3hat]              # full parameter vector [θ_1, RC, p0, q0, p1, q1]

opt2 = Opt(:LN_NELDERMEAD, length(θ_new))
ftol_rel!(opt2,1e-6)
maxeval!(opt2, 10000)

min_objective!(opt2, (vars, y) -> compute_ll_full(vars, df, B, J, β))
@time (minf2,minx2,ret2) = NLopt.optimize(opt2, θ_new)
842.770916 seconds (210.90 M allocations: 1.316 TiB, 13.73% gc time)
Out[12]:
(6238.392464832454, [2.588360857386383, 9.91233716850671, 0.35672458975600285, 0.628741985413026, 0.6800480698441189, 0.31993723241917094], :FTOL_REACHED)

Pretty close to the actual Rust results; note that the results from the partial likelihood are even closer. Differences between this dataset and Rust's might explain why the results differ slightly: Rust uses slightly fewer observations.

Hotz-Miller Approach

Step 1: Estimate transition probabilities from the data

We estimate the state-transition probabilities (a conditional probability mass function over next states, despite the f_hat name suggesting a density) directly from the data by counting observed transitions.

Assume that if we never observe a state/action pair, the bus stays in its state if $j = 0$, or returns to state 0 if $j = 1$.

In [13]:
# empirical transition probabilities:
# - f_hat[x + 1, xp + 1, j] = Pr(next state xp | state x, action j - 1)
f_hat = zeros(B, B, J)
tmp = by(df, :id, xp = :x => Base.Fix2(lead, 1))  # next-period state within each bus
df.xp = tmp.xp

for j = 1:J       
    df_j = @linq df |>
        where(:i .== j-1)
        
    df_j = @linq df_j |>
        where(.!ismissing.(:xp)) 
        
    for x = 0:B-1
        N = sum(df_j.x  .== x); 
        
        # if we don't replace (j = 1, so j % 2 == 1), the state cannot move backwards and xp starts at x;
        # if we replace (j = 2, so j % 2 == 0), it can, and xp starts at 0
        for xp = (x*(j%2)):B-1    
            df_j2 = @linq df_j |>
                where(:x .== x, :xp .== xp) 
            num = length(df_j2.x)  # number of observed x → xp transitions
            if(N != 0)
                f_hat[x + 1, xp + 1, j] = num/N;
            elseif ((xp == x) & (j == 1))
                f_hat[x + 1, xp + 1, j] = 1;
            elseif ((xp == 0) & (j == 2))
                f_hat[x + 1, xp + 1, j] = 1;
            else
                f_hat[x + 1, xp + 1, j] = 0;
            end
        end
    end
end
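
A quick diagnostic (my addition): every row of f_hat should be a probability distribution over next states, so the maximum deviation of any row sum from one should be essentially zero (up to any transitions excluded by the at-most-two-state assumption).

maximum(abs.(sum(f_hat, dims = 2) .- 1))
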
In [14]:
# empirical CCP; smooth with a probit so no state gets a choice probability of exactly 0 or 1
# df.x2 = df.x.^2
# df.x3 = df.x.^3

CCP_probit = glm(@formula(i ~ x), df, Binomial(), ProbitLink())

CCP_hat = zeros(J,B)

pred_ccp_x = DataFrame(x = 0:89, x2 = (0:89).^2, x3 = (0:89).^3)  # x2, x3 only matter if the polynomial terms above are re-enabled

CCP_hat[2, :] = predict(CCP_probit, pred_ccp_x);
CCP_hat[1, :] = 1 .- CCP_hat[2, :];

The next step is to simulate the choice-specific value function. That is, we want to simulate:

\begin{align*} V_s(i_t, x_t;\theta) &= E_{\epsilon_{i_t, t}}[V(i_t, x_t, \epsilon_t; \theta)] \\ &= -c(x_t, i_t; \theta_1) + \beta E_{\epsilon}[u(x_{t+1}, i_{t+1}; \theta) + \epsilon_{i_{t+1}, t+1} + \beta E_\epsilon[\ldots]] \end{align*}

Note that the type 1 extreme value assumption allows us to simplify the expectation: \begin{align*} E[\epsilon_{i_t, t}| i_t, x_t] = \gamma - \log(\Pr(i_t|x_t)) \end{align*} where $\gamma \approx 0.5772$ is Euler's constant. We draw $S$ simulated paths, each of length $T$, and then compute the conditional choice probabilities implied by the simulated choice-specific value functions.
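
This closed form can be spot-checked by simulation (my addition; a throwaway Monte Carlo sketch, not part of the estimation). Both of the last two lines should print approximately the same value (about 0.89 with these utilities):

# check E[ϵ_i | i chosen] = γ - log(Pr(i)) for type 1 extreme value shocks
Random.seed!(1234)
v = [1.0, 0.0]                      # deterministic utilities for two choices
draws = rand(Gumbel(0, 1), 2, 1_000_000)
chose1 = (v[1] .+ draws[1, :]) .> (v[2] .+ draws[2, :])
mean(draws[1, chose1])              # conditional mean of the shock
0.5772 - log(mean(chose1))          # ≈ the same number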

In [15]:
S = 100; # number of sims
T = 100; # time length of sims
In [16]:
function simulate_V(θ, f_hat, CCP_hat, B, J, S, T)
    
    # we'll eventually take the mean across the third dimension of the simulated V's
    V_sim = zeros(J, B, S)
    u = compute_utility(θ, B, J)
    
    Threads.@threads for s = 1:S
        Random.seed!(29489 + s);
        for j = 0:J - 1
            for x = 0:B - 1
                path = zeros(Int64, T, 2) # T x 2 matrix holding the simulated path of states and choices
                path[1, :] = [x j] # initial starting point
                for t = 2:T
                    # obtain previous time period state and action
                    x_m1 = path[t - 1, 1] 
                    j_m1 = path[t - 1, 2];
                    
                    # simulate this period's state and action
                    xp = sample(0:B-1, Weights(f_hat[x_m1 + 1, :, j_m1 + 1]))
                    jp = sample(0:J-1, Weights(CCP_hat[:, xp + 1]))
                    
                    # append to the path
                    path[t, :] = [xp jp]
                end 
                
                # backwards iterate on path to compute V_sim
                for t = T:-1:2
                    x_t = path[t, 1];
                    j_t = path[t, 2];
                    
                    # recursively compute flow utility + E(ϵ) + β * future util
                    V_sim[j + 1,x + 1,s] = u[j_t + 1, x_t + 1] + .5772 - log(CCP_hat[j_t + 1, x_t + 1]) + β * V_sim[j + 1,x + 1,s]
                end
                
                # in period 1, we do not add E[ϵ]
                V_sim[j+1, x+1, s] = u[j + 1, x + 1] + β * V_sim[j + 1,x + 1,s]
                
            end
        end
    end
    
    # take the mean over the simulations
    V_sim = mean(V_sim, dims = 3)
    
    return V_sim
end
Out[16]:
simulate_V (generic function with 1 method)
In [17]:
function Hotz_Miller_obj(θ, f_hat, CCP_hat, B, J, S, T, W)
    
    # compute the simulated V given θ
    V_sim = simulate_V(θ, f_hat, CCP_hat, B, J, S, T)
    
    maxV = maximum(V_sim);
    
    CCP = zeros(J, B)
    
    for x = 1:B
        denom = sum(exp.(V_sim[:, x] .- maxV), dims = 1)
        CCP[1,x] = exp.(V_sim[1, x] .- maxV)./denom[1]
    end
    # CCP for the second action is just "1 - CCP" of the first action given state x
    CCP[2,:] = 1 .- CCP[1,:]
    
    # find difference between computed CCP and "empirical" CCP
    g = CCP[1,:] - CCP_hat[1,:]
    
    # weighting matrix is some arbitrary PSD matrix
    # I weight states with no observations much less
    obj = g' * W * g
        
    return obj
end
Out[17]:
Hotz_Miller_obj (generic function with 1 method)
In [18]:
opt3 = Opt(:LN_NELDERMEAD, length(θ))
ftol_rel!(opt3,1e-6)
maxeval!(opt3, 10000)

# W = Matrix{Float64}(I, B, B)
W = Diagonal([fill(1, 80); fill(0.02, B - 80)])

min_objective!(opt3, (vars, y) -> Hotz_Miller_obj(vars, f_hat, CCP_hat, B, J, S, T, W))
@time (minf3,minx3,ret3) = NLopt.optimize(opt3, θ)
 27.389042 seconds (778.74 M allocations: 92.264 GiB, 49.20% gc time)
Out[18]:
(0.012872781641471749, [3.1542914210585877, 8.658485587802716], :FTOL_REACHED)

Note that Hotz-Miller is roughly 10x faster than the NFP partial-likelihood run (about 27 seconds versus 285). The parameter estimates differ somewhat (about +23% for $\theta_1$ and -12% for $RC$ relative to NFP), but remain in a reasonable range.
