## Tuesday, September 11, 2012

### Calculate an Empirical CDF for a random variable

# First lets define the function. ecdf (Empirical Cumulative Distribution Function)
ecdf <- function(x, bins=100) {

# We will sort the imput variable from smallest to largest
x_sort = sort(x)

# Now we will create a percentile variable that will rank x from slightly above zero to slightly below 1.
pct =  (1:length(x)/(length(x)+1))

# We will now ceate a number of groups equal to length(x)/bins
grps = ceiling(pct*bins)

# First let's create an empty vector of zeros with length = length(pct) = length(x)
avpct = 0*pct
# Next, let's replace avpct with the mean of pct for each group.
for (i in 1:bins) avpct[grps==i] = mean(pct[grps==i])

# Create a bunch of empty value to hold the CDF values
cdf_bins = cdf_pcts = cdf_values = 1:bins

# Now loop through each bin value
for (i in 1:bins) {
cdf_pcts[i] = mean(pct[grps==i])
cdf_values[i] = mean(x_sort[grps==i])
}

# Because there is no
data.frame(bins=cdf_bins, pcts=cdf_pcts, xvar=cdf_values)

}

# We will first generate some data, y is 4000 normal draws divided by a uniform(.1,1.1) draw.
y <- rnorm(4000)/(runif(4000)+.1)

cdf1 = ecdf(y, 100) # Calculate an Empirical CDF for variable y with 100 bins
cdf1

# We can see the ECDF is pretty steep around the mean 0.  This is because most of the observations fall between -10 and 10, yet a very few are either large (around 30) or small (around -30), forcing the x dimension to be large.
plot(cdf1\$xvar, cdf1\$pcts, type="l")

# In a future post I hope to use the empirical CDF information to calculate an inverse CDF that can be used to draw correlated non-normally distributed random variables.

#### 1 comment:

1. Haha, I just found out that R already has a ecdf function programmed up that does exactly this.

ecdf

Also, someone else already blogged a very similar post:

http://greentheo.scroggles.com/2007/08/22/r_code_for_generating_a_cumulative_distr/