## Thursday, January 31, 2013

### US Daily Gun Deaths R Animation - Sandy Hook

R script

# Listening to NPR about gun deaths in the US got me thinking.

# Find the article here:
# http://www.slate.com/articles/news_and_politics/crime/2012/12/gun_death_tally_every_american_gun_death_since_newtown_sandy_hook_shooting.html

# Let's create an animation of gun deaths since Dec 14, 2012
gun.deaths <- read.csv("http://slate-interactives-prod.elasticbeanstalk.com/gun-deaths/getcsv.php")
gun.deaths$victimID

# I first need to convert the dates into day numbers for later reference.
# First I will make a table of the dates, which includes a count of deaths per day.
deaths.tab = table(gun.deaths$date)
cum.deaths = deaths.tab
for (i in 1:(length(cum.deaths)-1)) cum.deaths[i+1] = cum.deaths[i]+deaths.tab[i+1]
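# As an aside, R's built-in cumsum() collapses the loop above into a single line
# (cum.deaths2 is just an illustrative name; the values match cum.deaths):
cum.deaths2 = cumsum(deaths.tab)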

plot(deaths.tab, main="Daily Total US Deaths by Gun", ylab="Death count")

# Number of days in our data (updated every time we run the code)
ndays = length(deaths.tab)

# This complicated bit of code will force the dates which are currently factor variables into string variables
gun.deaths$dates = t(data.frame(lapply(gun.deaths$date, as.character), stringsAsFactors=FALSE))
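# For what it's worth, a simpler equivalent is to apply as.character() directly to
# the factor, skipping the lapply/data.frame detour:
# gun.deaths$dates = as.character(gun.deaths$date)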

# Create an empty factor to be filled
gun.deaths$day = NA

# This loop runs through all of the days and checks whether each individual entry in the data frame is from that day.
# If it is, then it assigns that day number to the day entry.
for (i in 1:ndays) gun.deaths$day[gun.deaths$dates==names(deaths.tab)[i]] = i

# We will cut the data into different age categories.
# Some individuals have ages missing.  We will code these as 999.
gun.deaths$age[is.na(gun.deaths$age)] <- 999
gun.deaths$age.cat = ""
gun.deaths$age.cat[gun.deaths$age<12] = "children"
gun.deaths$age.cat[gun.deaths$age>11 & gun.deaths$age<20] = "teens"
gun.deaths$age.cat[gun.deaths$age>=20 & gun.deaths$age<30] = "adults20"
gun.deaths$age.cat[gun.deaths$age>=30 & gun.deaths$age<40] = "adults30"
gun.deaths$age.cat[gun.deaths$age>=40 & gun.deaths$age<65] = "madults"
gun.deaths$age.cat[gun.deaths$age>65 & gun.deaths$age<999] = "oadults"
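# An alternative sketch using cut(): note that the (64,998] bin also captures the
# boundary case age == 65, which the rules above leave unclassified, and the
# recoded 999s land in the final "missing" bin.
# gun.deaths$age.cat = as.character(cut(gun.deaths$age,
#   breaks = c(-Inf, 11, 19, 29, 39, 64, 998, Inf),
#   labels = c("children", "teens", "adults20", "adults30", "madults", "oadults", "missing")))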
# Adjust the latitude and longitude variables to account for a rescaling of the graph later,
# adding some noise to help identify multiple deaths in the same city.
nll = length(gun.deaths$lng)
gun.deaths$x = ((gun.deaths$lng+125)/(60))*ndays+rnorm(nll)*.006
# In this data the 20-something age group has the highest death count.
# Thus it will set the upper limit of the y-axis.
ymax = ceiling(sum(gun.deaths$age.cat=="adults20")/50)*50
gun.deaths$y = ((gun.deaths$lat-24)/(27))*ymax+rnorm(nll)*.06

# Generate subsets of the data.
children = gun.deaths[gun.deaths$age.cat == "children",]
teens    = gun.deaths[gun.deaths$age.cat == "teens",]
adults20 = gun.deaths[gun.deaths$age.cat == "adults20",]
adults30 = gun.deaths[gun.deaths$age.cat == "adults30",]
madults  = gun.deaths[gun.deaths$age.cat == "madults",]
oadults  = gun.deaths[gun.deaths$age.cat == "oadults",]

# Count cumulative deaths within each data subset.
children.tab = table(children$date)
cum.children = children.tab
for (i in 1:(length(cum.children)-1)) cum.children[i+1] = cum.children[i]+children.tab[i+1]

teens.tab = table(teens$date)
cum.teens = teens.tab
for (i in 1:(length(cum.teens)-1)) cum.teens[i+1] = cum.teens[i]+teens.tab[i+1]

adults20.tab = table(adults20$date)
cum.adults20 = adults20.tab
for (i in 1:(length(cum.adults20)-1)) cum.adults20[i+1] = cum.adults20[i]+adults20.tab[i+1]

adults30.tab = table(adults30$date)
cum.adults30 = adults30.tab
for (i in 1:(length(cum.adults30)-1)) cum.adults30[i+1] = cum.adults30[i]+adults30.tab[i+1]

madults.tab = table(madults$date)
cum.madults = madults.tab
for (i in 1:(length(cum.madults)-1)) cum.madults[i+1] = cum.madults[i]+madults.tab[i+1]

oadults.tab = table(oadults$date)
cum.oadults = oadults.tab
for (i in 1:(length(cum.oadults)-1)) cum.oadults[i+1] = cum.oadults[i]+oadults.tab[i+1]

# This counts the total deaths.
cum.total = cum.adults20+cum.adults30+cum.teens+cum.children+cum.madults+cum.oadults

# In order to get a map of the US we will need the maps library.
library(maps)

# Rescale the US map to fit our data.
map.us = map('state', plot=F)
map.us$x = ((map('state', plot=F)$x+125)/(60))*ndays
map.us$y = ((map('state', plot=F)$y-24)/(27)*ymax)

# Static Plot - Final image
i = ndays
dev.new(width=15, height=10)
plot(cum.adults20, type="n", ylim=c(0,ymax), ylab="Cumulative Deaths by Age Group",
     main="US Gun Deaths Since Dec 14, 2012", cex.main=1.5,
     xlab=paste(names(deaths.tab)[i], "day", toString(i), " ",
                toString(deaths.tab[i]), "Killed", " ",
                toString(cum.deaths[i]), "Dead"))
grid(lwd = 2) # grid only in y-direction

# Draw the US state map.
lines(map.us, type="l")

# Place dots for every death.
lines(adults20$x, adults20$y, type="p", col="blue", pch=16, cex=.5)
lines(adults30$x, adults30$y, type="p", col="purple", pch=16, cex=.5)
lines(madults$x, madults$y, type="p", col="orange", pch=16, cex=.5)
lines(oadults$x, oadults$y, type="p", col=colors()[641], pch=16, cex=.5)
lines(teens$x, teens$y, type="p", col="red", pch=16, cex=.5)
lines(children$x, children$y, type="p", col=colors()[258], pch=16, cex=.5)

# Draw the cumulative death curves, first as a light halo and then in color.
lines(cum.teens, type="l", col=gray(.9), lwd=2)
lines(cum.children, type="l", col=gray(.9), lwd=2)
lines(cum.adults20, type="l", col=gray(.9), lwd=2)
lines(cum.adults30, type="l", col=gray(.9), lwd=2)
lines(cum.madults, type="l", col=gray(.9), lwd=2)
lines(cum.oadults, type="l", col=gray(.9), lwd=2)
lines(cum.teens, type="l", col="red", lwd=1)
lines(cum.children, type="l", col=colors()[258], lwd=1)
lines(cum.adults20, type="l", col="blue", lwd=1)
lines(cum.adults30, type="l", col="purple", lwd=1)
lines(cum.madults, type="l", col="orange", lwd=1)
lines(cum.oadults, type="l", col=colors()[641], lwd=1)

# Mark the current day with a dashed vertical line.
lines(c(ndays,ndays), c(-15,ymax-27), lwd=2, lty=2)
legend("topright", "Today", cex=1.5, bty="n")
legend(0, ymax+20,
       expression(Children, Teens, "Adults 20s", "Adults 30s", "Adults 40-65", "Adults 65+"),
       lty = 1:1, col = c(colors()[258], "red", "blue", "purple", "orange", colors()[641]),
       adj = c(0, 0.6), ncol = 6, cex=.75, lwd=2)

# Sequential Graphic - Animation
# The animation package must be installed.
library(animation)

deaths.animation = function() {
  # Adding rep(ndays,10) causes the animation to pause for the equivalent of 10 days after ending.
  for (i in c(1:ndays, rep(ndays,10))) {
plot(cum.adults20, type="n", ylim=c(0,ymax), ylab="Cumulative Deaths by Age Group" , main="US gun Deaths Since Dec 14, 2012", cex.main=2, xlab=paste(names(deaths.tab)[i],"day",toString(i), " ", toString(deaths.tab[i]), "Killed", " ", toString(cum.deaths[i]), "Dead")) # Draw the US state map grid(lwd = 2) # grid only in y-direction # Place dots for every death lines(map.us, type="l") t.adults20 = adults20[adults20$day  t.adults30 = adults30[adults30$day t.madults = madults[madults$day  t.oadults = oadults[oadults$day t.teens = teens[teens$day  t.children = children[children$day lines(t.adults20$x, t.adults20$y, type="p", col="blue",pch=16, cex=1.5) lines(t.adults30$x, t.adults30$y, type="p", col="purple",pch=16, cex=1.5) lines(t.madults$x, t.madults$y, type="p", col="orange",pch=16, cex=1.5) lines(t.oadults$x, t.oadults$y, type="p", col=colors()[641],pch=16, cex=1.5) lines(t.teens$x, t.teens$y, type="p", col="red",pch=16, cex=1.5) lines(t.children$x, t.children$y, type="p", col=colors()[258],pch=16, cex=1.5) p.adults20 = adults20[adults20$day==i,]
p.adults30 = adults30[adults30$day==i,] p.madults = madults[madults$day==i,]
p.oadults = oadults[oadults$day==i,] p.teens = teens[teens$day==i,]
qui replace `ln_likehood' = ln(1-(`zA' + (1-`zA'[1])*normal(`xB'))) if $ML_y1==0
* This only works if we are only estimating a single coefficient in zA.
* If anybody knows how to generalize the above statement to allow for any number of parameters in zA to be estimated, please post it as a comment.
end

* Let's generate some sample data.
clear

* We have 500 panel-level observations.
set obs 500
gen id=_n

* We have some random continuous explanatory variables.
gen x1 = rnormal()
gen x2 = rnormal()

* Let's first just think about A being a single constant.
* So A=.5 is telling us that 50% of the decision to buy or rent this period is dependent upon the decision in the previous period.
gen A = .5

* The decision to rent at t=0 is unobserved, but we will say that it enters the probability of renting this period in the form of a binomial draw.
gen p=A*rbinomial(1,.5) + (1-A)*normal(.5*x1 - .5*x2)

* Now we simulate the decision to rent at time period t=1.
gen rent=rbinomial(1,p)

* We expand the data to make 50 time periods.
expand 50
bysort id: gen t=_n

* Now we generate time-varying explanatory variables.
replace x1 = rnormal() if t>1
replace x2 = rnormal() if t>1

* This part of the data generating process is slightly complicated because in each period the decision to rent or buy is dependent upon the previous period's decision.
forv i=2/50 {
  qui replace p=A*rent[_n-1] + (1-A)*normal(.5*x1 - .5*x2) if t==`i'
  * The actual decision to rent or not is a binomial draw with the probability of 1 equal to p.
  qui replace rent=rbinomial(1,p) if t==`i'
}
**** Done with simulation

* Now let's estimate.
gen rent_lag = rent[_n-1] if t>1
probit rent rent_lag x1 x2
* This is the benchmark.

ml model lf stickyprobit (probit: rent=x1 x2, noconstant) (zA:rent_lag, noconstant)
ml maximize, iterate(10)
* I set iterate to 10 because the ml model has difficulty converging on the solution.
* I think this is because as A gets close to .5 there is an increasing frequency with which infeasible values are evaluated (that is, ln(p) at p<0 or p>1).

* Because we know there is serial correlation in the errors, we cannot trust the standard errors from the maximum likelihood estimator.
* Thus we need to bootstrap, clustering at the panel (id) level.

* In order to do this we will need to write a short program.
cap program drop bsstickyprobit
program define bsstickyprobit
  ml model lf stickyprobit (probit: rent=x1 x2, noconstant) (zA:rent_lag, noconstant)
  ml maximize, iterate(10)
  * I will set iterate to 10 to speed up the bootstrapped standard errors.
end

bs, cluster(id): bsstickyprobit

/*
(running bsstickyprobit on estimation sample)

Bootstrap replications (50)
----+--- 1 ---+--- 2 ---+--- 3 ---+--- 4 ---+--- 5
..................................................    50

Bootstrap results                       Number of obs      =     24500
                                        Replications       =        50
                                        Wald chi2(2)       =     11.92
Log likelihood = -12748.652             Prob > chi2        =    0.0026

(Replications based on 500 clusters in id)
------------------------------------------------------------------------------
             |   Observed   Bootstrap                 Normal-based
        rent |      Coef.   Std. Err.      z    P>|z|     [95% Conf. Interval]
-------------+----------------------------------------------------------------
probit       |
          x1 |   .4972777    .145441     3.42   0.001     .2122186    .7823369
          x2 |  -.5287849   .1586004    -3.33   0.001    -.8396359   -.2179339
-------------+----------------------------------------------------------------
zA           |
    rent_lag |    .497552   .1232098     4.04   0.000     .2560652    .7390388
------------------------------------------------------------------------------
Warning: convergence not achieved
*/

* Finally, let's compare our bootstrapped errors with simulated errors.
cap program drop simstickyprobit
program define simstickyprobit, rclass
  clear
  set obs 150
  gen id=_n
  gen x1 = rnormal()
  gen x2 = rnormal()
  gen A = .5
  gen p=A*rbinomial(1,.5) + (1-A)*normal(.5*x1 - .5*x2)
  gen rent=rbinomial(1,p)
  expand 150
  bysort id: gen t=_n
  replace x1 = rnormal() if t>1
  replace x2 = rnormal() if t>1
  forv i=2/150 {
    qui replace p=A*rent[_n-1] + (1-A)*normal(.5*x1 - .5*x2) if t==`i'
    * The actual decision to rent or not is a binomial draw with the probability of 1 equal to p.
    qui replace rent=rbinomial(1,p) if t==`i'
  }
  **** Done with simulation
  gen rent_lag = rent[_n-1] if t>1
  ml model lf stickyprobit (probit: rent=x1 x2, noconstant) (zA:rent_lag, noconstant)
  ml maximize, iterate(20)
  return scalar b1=[probit]_b[x1]
  return scalar b2=[probit]_b[x2]
  return scalar A=[zA]_b[rent_lag]
end

simstickyprobit
return list
* The list of returned values looks good.

simulate A=r(A) b1=r(b1) b2=r(b2), rep(50): simstickyprobit
sum

/*
    Variable |       Obs        Mean    Std. Dev.       Min        Max
-------------+--------------------------------------------------------
           A |        50    .3727054    .1289269     .23698   .5096082
          b1 |        50    .3494328    .1601192   .1524611   .5577423
          b2 |        50    -.348098    .1563565  -.5363456   -.145098
*/

* We can see the standard deviation estimates from the clustered bootstrap are right on.
* It is important to note, however, that the estimator is strongly biased.

## Tuesday, January 22, 2013

### Use Expand to Manually Reshape Data

do file

* Imagine that you have a wide data set that you would like to convert to a long data set.
* Your data set is structured in the following manner.
clear
set obs 1000
gen id = _n
forv i=1/4 {
  gen var1_`i' = rnormal()
  gen var2_`i' = rnormal()
  gen var3_`i' = rnormal()
  gen var`=`i'+3' = rbinomial(1,.5)
}
order _all, alphabetic

* You should now have data in which three variables (var1, var2, var3) each have four different records,
* plus four time-invariant variables (var4-var7).

* We can use expand to reshape the data.
expand 4

* Now we have four instances of each of the original observations.
* We now want to create variables var1, var2, and var3 that take the appropriate value for each panel period.

* First let's sort and label our different panel periods, clumping all of the same ids together.
bysort id: gen year=_n

* Now we just copy our variables so that they only occur in the appropriate time period.
* I will do this manually, though it would be very easy to do with macros (see the note after this code).
gen var1 = var1_1 if year == 1
replace var1 = var1_2 if year == 2
replace var1 = var1_3 if year == 3
replace var1 = var1_4 if year == 4

gen var2 = var2_1 if year == 1
replace var2 = var2_2 if year == 2
replace var2 = var2_3 if year == 3
replace var2 = var2_4 if year == 4

gen var3 = var3_1 if year == 1
replace var3 = var3_2 if year == 2
replace var3 = var3_3 if year == 3
replace var3 = var3_4 if year == 4

* Now we just need to drop the extra variables.
drop var?_?
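* (A note on the macro version mentioned above: two nested forvalues loops, one over
* the variable index 1/3 and one over year 1/4, with gen for the first year and
* replace for the rest, would collapse the twelve gen/replace lines into a few.)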
************************************************
* This can also be accomplished with the reshape command:
clear
set obs 1000
gen id = _n
forv i=1/4 {
  gen var1_`i' = rnormal()
  gen var2_`i' = rnormal()
  gen var3_`i' = rnormal()
  gen var`=`i'+3' = rbinomial(1,.5)
}
order _all, alphabetic

reshape long var1_ var2_ var3_, i(id)

* The tricky thing to remember with reshape is that it requires exact syntax for the variables to be converted.
* That is, reshape looks for a number at the end of each variable name, which it will turn into the j variable.

## Thursday, January 17, 2013

### Model Selection Tests

do file

* It is typically simple to test for the superiority of one model over another when the models are nested.
* In linear regression this can be accomplished through a Wald test.
* For example, say you have one hypothesis, H0: y = x1*B1, and another, H1: y = x1*B1 + x2*B2 + x3*B3.
* We can see that H0 is nested within H1; that is, H0 = H1 when B2=B3=0.

* Let's generate some sample data:
clear
set obs 10000
gen x1 = rnormal()
gen x2 = rnormal()
gen x3 = rnormal()
gen u = rnormal()*2
gen y1 = x1*2 + u

* Under this case the null is true.
reg y1 x?
test x2=x3=0
* This test should reject the null only about 5% of the time at alpha = .05.

gen y2 = x1*2 - x2*1 + x3*1 + u
reg y2 x?
* In Stata this test is very easy to perform.
test x2=x3=0
* This test should almost always reject the null since the effects of x2 and x3 on y are large.

* The same logic carries over to binary response models.  For example:
gen y_p1 = normal(x1*.5)
gen y_p2 = normal(x1*.5+x2*1-x3*.2)
gen Y1 = rbinomial(1,y_p1)
probit Y1 x1 x2 x3
test x2=x3=0

* When looking at nested maximum likelihood estimators, the likelihood ratio test can be used to test whether the difference in explanatory power of the two models is statistically significant.
* The statistic is 2*(ll0-ll1), where ll0 is the log likelihood of the unrestricted model and ll1 that of the restricted model, distributed chi-squared with degrees of freedom equal to the difference in degrees of freedom of the two models.

* Let's see this in action:
probit Y1 x1 x2 x3
* Save the degrees of freedom and log likelihood.
global df0 = e(df_m)
global ll0 = e(ll)

* Now for the other model.
probit Y1 x1
global df1 = e(df_m)
global ll1 = e(ll)

* Now let's get the chi-squared statistic.
di "Chi-squared = " 2*(${ll0}-${ll1}) " with DF = " ${df0}-${df1}
di "P value = " 1-chi2(${df0}-${df1}, 2*(${ll0}-${ll1}))

* Notice how similar this LR test is to the previous Wald test in this case.

* Alternatively, when the models are actually different:
gen Y2 = rbinomial(1,y_p2)
probit Y2 x1 x2 x3
test x2=x3=0

probit Y2 x1 x2 x3
global df0 = e(df_m)
global ll0 = e(ll)
probit Y2 x1
global df1 = e(df_m)
global ll1 = e(ll)
di "Chi-squared = " 2*(${ll0}-${ll1}) " with DF = " ${df0}-${df1}
di "P value = " 1-chi2(${df0}-${df1}, 2*(${ll0}-${ll1}))
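* (As a shortcut, Stata's built-in lrtest command reproduces this calculation:
* fit each model, save it with estimates store, and then call lrtest on the pair.)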

* Notice that the chi-squared values for the Wald test and the LR test are quite different, yet the conclusion about rejecting the null is identical.

* When models are not nested the problem becomes a bit more challenging.

* The Vuong test is a useful test of the goodness of fit of non-nested models.

* Imagine the above data Y2, which was generated based on the assumptions of the probit model.

* We are not sure whether the probit or the logit is the better model.

* With the Vuong (1989) test of non-nested models we can construct a test statistic based on the log likelihoods of each individual observation.

* This is done by treating the difference in log likelihoods as a random variable and constructing a test statistic based on that difference (once again referencing Wooldridge class notes from EC 821B).
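* Concretely, letting d_i denote the per-observation difference in log likelihoods,
* the Vuong statistic is z = sqrt(n)*mean(d)/sd(d), which is asymptotically standard
* normal under the null that the two models fit equally well.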

* First we will estimate the probit.

probit Y2 x1 x2 x3

* We will predict the fitted values which are the predicted probabilities of a draw of 1.

predict Y2_probit

* The log likelihood can be found from this by taking the log of those probabilities when the draw is 1 and the log of 1 less those probabilities when the draw is 0.

gen ll_probit = Y2*log(Y2_probit) + (1-Y2)*log(1-Y2_probit)

* In order to test that this is actually the case:
sum ll_probit

di "The log likelihood from the MLE should be equal to " r(N)*r(mean)

* Bingo!

* Now we will do the same with the logit.

logit Y2 x1 x2 x3

* We will predict the fitted values which are the predicted probabilities of a draw of 1.

predict Y2_logit

* The log likelihood can be found from this by taking the log of those probabilities when the draw is 1 and the log of 1 less those probabilities when the draw is 0.

gen ll_logit = Y2*log(Y2_logit) + (1-Y2)*log(1-Y2_logit)

* In order to test that this is actually the case:
sum ll_logit

di "The log likelihood from the MLE should be equal to " r(N)*r(mean)

* We next construct a variable called dif, which measures the observation-level differences in the log likelihoods.

gen dif = ll_probit-ll_logit

* Now this variable should only be statistically different from 0 about 5% of the time (at the 5% level) if the two models have equal explanatory power.

reg dif
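* (Regressing dif on nothing but a constant is a one-sample t-test that mean(dif) = 0;
* the t statistic on the constant is exactly the Vuong z statistic described above.)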

* The constant on dif seems to be statistically significant only a small proportion of the time, indicating, as discussed previously, that the difference between a probit and a logit model is extremely small.

* It is possible to compare many other non-nested models in this way.

* Binary response models are particularly convenient examples because the log-likelihood statistic is so easy to construct.

* However, log likelihoods are generally easy to recover, or are necessary to construct anyway, for many maximum likelihood procedures.

## Tuesday, January 15, 2013

### Creating an easy pie chart from data vectors

R script

# Pie charts can be a memorable way of representing information.

# There are a number of online examples showing how to create pie charts in R.

# For example: http://www.statmethods.net/graphs/pie.html

# Imagine you have survey response data from 100 respondents and your survey answers range from A to D.

# This takes 100 draws from the LETTERS vector between A and D (squaring the uniform draw skews responses toward A).
response = LETTERS[ceiling(runif(100)^2*4)]

# To now see a pie chart of the results, you can simply tabulate the responses and pass the table to pie().
pie(table(response))

# We can see that about half the results are A followed by B then C then D.

# Note that without including percentages it is often difficult to judge the relative sizes of the individual slices.
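# One way to address this (a quick sketch; the object names are just illustrative)
# is to paste the percentages into the slice labels before calling pie():
response.tab = table(response)
response.pct = round(100 * response.tab / sum(response.tab))
pie(response.tab, labels = paste0(names(response.tab), " (", response.pct, "%)"))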

## Thursday, January 10, 2013

### Efficiency of LAD vs OLS

R Script

# The following post follows class notes from Panel Data Analysis II by Jeffrey Wooldridge (though all errors are mine)

# When deciding on the best method to estimate a coefficient, LAD and OLS are two commonly considered options.

# Oftentimes these estimators are theoretically estimating the same coefficient.

# That is, in the linear relationship y=xB + u with u~symmetrically distributed and independent of x with E(u|x)=0 and med(u|x)=0, both OLS and LAD are consistent estimators of B.

# However, the relative efficiency of the two estimators depends on the distribution of the errors.

# Assuming u is independent of x, the asymptotic variance of the LAD estimator, Avar(n^.5 * (B[LAD]-B)), is 1/(4*f(0)^2) * E(x'x)^-1,

# with f(0) being the pdf of the error distribution at the quantile being estimated (here the median, 0).

# Meanwhile, the asymptotic variance of OLS, Avar(n^.5 * (B[OLS]-B)), is sigma2 * E(x'x)^-1,

# with sigma2 being the variance of the error distribution.

# Now we can evaluate the differences in asymptotic variances given different distributions of errors.

# For instance, assume the underlying distribution is standard normal

# sigma2=1

# f(0) = dnorm(0), so the ratio of the LAD asymptotic variance to the OLS asymptotic variance is:

1/(4*dnorm(0)^2)

# Yields 1.57 (pi/2, in fact).  That is, the asymptotic variance of LAD is 57% larger than that of OLS when the errors are normally distributed.
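# We can check this by simulation.  A minimal sketch, assuming the quantreg package
# is installed for the median (LAD) regression; the names and constants here are
# illustrative:
library(quantreg)
set.seed(101)
reps = 500
n = 200
b.ols = b.lad = numeric(reps)
for (r in 1:reps) {
  x = rnorm(n)
  y = 1 + 2*x + rnorm(n)  # normal errors
  b.ols[r] = coef(lm(y ~ x))[2]
  b.lad[r] = coef(rq(y ~ x, tau = .5))[2]
}
# The ratio of the sampling variances should be in the neighborhood of 1.57.
var(b.lad)/var(b.ols)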

# On the other hand, suppose we had a distribution of errors v = exp(|u|)*(2-|u|)*sign(u), with u distributed standard normal.

N = 1000000

u = rnorm(N)

v = exp(abs(u))*(2-abs(u))*sign(u)

# We can estimate the variance of the errors by sampling.

hist(v)
summary(v)

var(v)
# The error variance sigma2 is estimated at around 12.7.

# To calculate the density of v at the median (0), we can sum the pdf values of u at the points where v = 0,

# which are u = 2 and u = -2.  Because the normal is symmetric:

dnorm(-2)*2
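# A caveat worth flagging (a hedged addendum to the step above): a strict
# change-of-variables calculation also divides each pdf value by the Jacobian
# |dv/du| at those roots.  For u > 0, dv/du = exp(u)*(1-u), so |dv/du| = exp(2)
# at u = 2, which would give a density at the median of:
2*dnorm(2)/exp(2)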