Author: Tal Galili ( [email protected] )

Introduction

As mentioned in CRAN Task View: Probability Distributions

Empirical distribution : Base R provides functions for univariate analysis: (1) the empirical density (see density()), (2) the empirical cumulative distribution function (see ecdf()), (3) the empirical quantile (see quantile()) and (4) random sampling (see sample()).

This package aims to easily wrap these into a single function edfun (short for Empirical Distribution FUNctions). Also, since quantile is generally a slow function to perform, the default for creating a quantile function (inverse-CDF) is by approximating the function of predicting the data values (x) from their quantiles (CDF). This is done using the approxfun function. It takes a bit longer to create qfun, but it is MUCH faster to run than quantile (and is thus much better for simulations). Special care is taken for dealing with the support of the distribution (if it is known upfront).

The added speed allows to use these functions to run simulation studies for unusual distributions.

Installation

To install the stable version on CRAN:

# install.packages('edfun') # not on CRAN yet

To install the GitHub version:

# You'll need devtools
if (!require(devtools)) install.packages("devtools");

devtools::install_github('talgalili/edfun')

And then you may load the package using:

library("edfun")

Usage

Basic usage - the normal distribution

First load the library:

library(edfun)

set.seed(123)
x <- rnorm(1000)
x_dist <- edfun(x)

f <- x_dist$dfun
curve(f, -2,2)

f <- x_dist$pfun
curve(f, -2,2)

f <- x_dist$qfun
curve(f, 0,1)

new_x <- x_dist$rfun(1000)
hist(new_x)

The same can be done if we had the density of the distribution. In this case, x should be a sequence of values for which we wish to pre-compute the quantiles (inv-CDF).

x <- seq(-4,4, length.out = 1e3)
x_dist <- edfun(x, dfun = dnorm)

f <- x_dist$dfun
curve(f, -2,2)

f <- x_dist$pfun
curve(f, -2,2)

f <- x_dist$qfun
curve(f, 0,1)

new_x <- x_dist$rfun(1000)
hist(new_x)

If dfun is NULL then it is just returned as NULL. This is useful if using the functions ONLY for creating the CDF and inv-CDF and wanting to save computation time.

set.seed(123)
x <- rnorm(1000)
x_dist <- edfun(x, dfun = NULL)

x_dist$dfun

#> NULL

# but this still works just fine:
f <- x_dist$pfun
curve(f, -2,2)

f <- x_dist$qfun
curve(f, 0,1)

If it is known, we can specifically define the support:

x <- seq(-4,4, length.out = 1e3)
x_dist_no_support <- edfun(x, dfun = dnorm)
x_dist_known_support <- edfun(x, dfun = dnorm, support = c(-Inf, Inf))

x_dist_no_support$qfun(0)

#> [1] -4

x_dist_known_support$qfun(0)

#> [1] -Inf

Timing is basically the same when using the functions relying on density (i.e.: dfun) or on a sample only. But the accuracy of using the density is much better.

 set.seed(2016-08-18)
  x_simu <- rnorm(1e5)
  x_funs_simu <- edfun(x_simu, support = c(-Inf, Inf))
  
  
  x <- seq(-4,4, length.out = 1e5)
  x_funs <- edfun(x, dfun = dnorm, support = c(-Inf, Inf))
  x_funs$qfun(0) # -Inf
  
  # precision - qfun
  q_to_test <- seq(0.001,.999, length.out = 100)
  edfun_out <- x_funs$qfun(q_to_test) # -4
  edfun_simu_out <- x_funs_simu$qfun(q_to_test) # -4
  real_out <- qnorm(q_to_test)
  
  mean(abs(edfun_out-real_out))
  mean(abs(edfun_simu_out-real_out)) # quite terrible compared to when knowing dfun
  
  library(microbenchmark)
  microbenchmark(dfun_known = x_funs$qfun(q_to_test),
                 dfun_NOT_known = x_funs_simu$qfun(q_to_test))
  # same time for the q function!
  
  
  # precision - pfun
  q_to_test <- seq(-3,3, length.out = 100)
  edfun_out <- x_funs$pfun(q_to_test) # -4
  edfun_simu_out <- x_funs_simu$pfun(q_to_test) # -4
  real_out <- pnorm(q_to_test)
  
  mean(abs(edfun_out-real_out))
  mean(abs(edfun_simu_out-real_out)) # quite terrible compared to when knowing dfun
  
  library(microbenchmark)
  microbenchmark(dfun_known = x_funs$pfun(q_to_test),
                 dfun_NOT_known = x_funs_simu$pfun(q_to_test))
  # same time for the p function!

  
  # timing for the rfun
  library(microbenchmark)
  microbenchmark(dfun_known = x_funs$rfun(100),
                 dfun_NOT_known = x_funs_simu$rfun(100))
  # this rfun is slower...

The double exponential distribution

First load the library:

library(edfun)

# credit: library(smoothmest)
ddoublex <- function (x, mu = 0, lambda = 1) {
  exp(-abs(x - mu)/lambda)/(2 * lambda)  
}

curve(ddoublex, -4,4) # the peak in 0 should go to Inf, but it doesn't because of the limits of `curve`

x <- seq(-4,4, length.out = 1e3)
x_dist <- edfun(x, dfun = ddoublex)

f <- x_dist$dfun
curve(f, -4,4)

f <- x_dist$pfun
curve(f, -4,4)

f <- x_dist$qfun
curve(f, 0,1)

new_x <- x_dist$rfun(1000)
hist(new_x)

A mixture of two normal distributions

First load the library:

library(edfun)

# http://stackoverflow.com/questions/20452650/writing-a-bimodal-normal-distribution-function-in-r?rq=1
# random sample:
# mixtools::rnormmix # random sample of a mixture of distributions
# http://stats.stackexchange.com/questions/70855/generating-random-variables-from-a-mixture-of-normal-distributions
# nor1mix::rnorMix
# https://en.wikipedia.org/wiki/Mixture_distribution
# density:
dmixnorm <- function(x, p1 = 0.5, m1=0, m2=0, s1=1, s2=1) {
  p1 * dnorm(x, m1, s1) + (1-p1) * dnorm(x, m2, s2)
}
# a convex mixture of densities is a density:
#  https://en.wikipedia.org/wiki/Mixture_distribution#Properties

# yay - it works.
dmixnorm_1 <- function(x) dmixnorm(x, .75, 0,5,  1,1)
curve(dmixnorm_1, -4,9)

x <- seq(-4,9, length.out = 1e3)
x_dist <- edfun(x, dfun = dmixnorm_1)

f <- x_dist$dfun
curve(f, -4,9)

f <- x_dist$pfun
curve(f, -4,9)

f <- x_dist$qfun
curve(f, 0,1)

new_x <- x_dist$rfun(1000)
hist(new_x)

Speed comparisons

# A nice tutorial on the distr family of packages
# https://cran.r-project.org/web/packages/distrDoc/vignettes/distr.pdf

library(distr)

ddoublex <- function (x, mu = 0, lambda = 1) {
  exp(-abs(x - mu)/lambda)/(2 * lambda)  
}

my_dist <- AbscontDistribution(d=ddoublex)
curve(d(my_dist)(x),-2,2)
curve(p(my_dist)(x),-2,2)
curve(q(my_dist)(x),0,1)
hist(r(my_dist)(1000))

library(edfun)

x <- seq(-4,4, length.out = 1e3)
x_dist <- edfun(x, dfun = ddoublex)
f <- x_dist$pfun
curve(f, -4,4)

library(microbenchmark)
microbenchmark(distr = p(my_dist)(0), edfun = x_dist$pfun(0))

# > microbenchmark(distr = p(my_dist)(0), edfun = x_dist$pfun(0))
# Unit: microseconds
#   expr   min    lq    mean median    uq    max neval cld
#  distr 5.588 6.518 8.56095  6.829 7.450 63.010   100   b
#  edfun 1.552 1.863 2.70703  2.173 2.484 15.831   100  a 

library(microbenchmark)
microbenchmark(distr = q(my_dist)(0.5), edfun = x_dist$qfun(0.5))

# Unit: microseconds
#   expr    min     lq     mean median     uq     max neval cld
#  distr 26.694 27.935 33.55964 28.867 30.108 104.601   100   b
#  edfun  1.552  1.863  2.63255  2.174  2.639  18.314   100  a

Comparing to the spd R package

set.seed(123)
x <- rnorm(1000)
library(edfun)
x_dist <- edfun(x)

f <- x_dist$dfun
curve(f, -2,2)


library(spd)
fit <- spdfit(x)

library(microbenchmark)
microbenchmark(edfun = x_dist$dfun(0), spd = dspd(0, fit))
# Unit: microseconds
#   expr       min        lq        mean   median         uq       max neval cld
#  edfun     2.173     3.104     6.54647     8.07     8.8465    18.313   100  a 
#    spd 15690.458 18262.183 20566.08416 18914.78 19625.1010 85231.180   100   b

microbenchmark(edfun = x_dist$pfun(0.5), spd = pspd(0.5, fit))
# Unit: microseconds
#   expr     min       lq      mean  median       uq     max neval cld
#  edfun   1.552   2.1730   4.12237   3.725   4.1905  66.424   100  a 
#    spd 234.344 246.4495 260.13406 251.570 267.5560 421.818   100   b


microbenchmark(edfun = x_dist$qfun(0), spd = qspd(0, fit))
# Unit: microseconds
#   expr     min       lq      mean  median       uq      max neval cld
#  edfun   1.553   2.4840   3.70962   4.035   4.3460   18.314   100  a 
#    spd 255.140 266.6245 292.00170 271.435 292.3865 1354.847   100   b

Contact

You are welcome to:

submit suggestions and bug-reports at: https://github.com/talgalili/edfun/issues
send a pull request on: https://github.com/talgalili/edfun/
compose a friendly e-mail to: [email protected]

Latest news

You can see the most recent changes to the package in the NEWS.md file

Session info

sessionInfo()

#> R version 4.4.2 (2024-10-31)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.2 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] knitr_1.49  edfun_0.3.0
#> 
#> loaded via a namespace (and not attached):
#>  [1] digest_0.6.37     R6_2.6.0          fastmap_1.2.0     xfun_0.50        
#>  [5] maketools_1.3.2   cachem_1.1.0      htmltools_0.5.8.1 rmarkdown_2.29   
#>  [9] buildtools_1.0.0  lifecycle_1.0.4   cli_3.6.4         sass_0.4.9       
#> [13] jquerylib_0.1.4   compiler_4.4.2    sys_3.4.3         tools_4.4.2      
#> [17] evaluate_1.0.3    bslib_0.9.0       yaml_2.3.10       jsonlite_1.8.9   
#> [21] rlang_1.1.5

Introduction to edfun

Introduction

Installation

Usage

Basic usage - the normal distribution

The double exponential distribution

A mixture of two normal distributions

Speed comparisons

Contact

Latest news

Session info