Author: Tal Galili ( [email protected] )
As mentioned in CRAN Task View: Probability Distributions
Empirical distribution : Base R provides functions for univariate analysis: (1) the empirical density (see density()), (2) the empirical cumulative distribution function (see ecdf()), (3) the empirical quantile (see quantile()) and (4) random sampling (see sample()).
This package aims to easily wrap these into a single function
edfun
(short for Empirical Distribution FUNctions). Also,
since quantile is generally a slow function to perform, the default for
creating a quantile function (inverse-CDF) is by approximating the
function of predicting the data values (x) from their quantiles (CDF).
This is done using the approxfun
function. It takes a bit
longer to create qfun, but it is MUCH faster to run than quantile (and
is thus much better for simulations). Special care is taken for dealing
with the support of the distribution (if it is known upfront).
The added speed allows to use these functions to run simulation studies for unusual distributions.
To install the stable version on CRAN:
To install the GitHub version:
# You'll need devtools
if (!require(devtools)) install.packages("devtools");
devtools::install_github('talgalili/edfun')
And then you may load the package using:
First load the library:
The same can be done if we had the density of the distribution. In
this case, x
should be a sequence of values for which we
wish to pre-compute the quantiles (inv-CDF).
If dfun
is NULL
then it is just returned as
NULL
. This is useful if using the functions ONLY for
creating the CDF and inv-CDF and wanting to save computation time.
#> NULL
If it is known, we can specifically define the support:
x <- seq(-4,4, length.out = 1e3)
x_dist_no_support <- edfun(x, dfun = dnorm)
x_dist_known_support <- edfun(x, dfun = dnorm, support = c(-Inf, Inf))
x_dist_no_support$qfun(0)
#> [1] -4
#> [1] -Inf
Timing is basically the same when using the functions relying on
density (i.e.: dfun
) or on a sample only. But the accuracy
of using the density is much better.
set.seed(2016-08-18)
x_simu <- rnorm(1e5)
x_funs_simu <- edfun(x_simu, support = c(-Inf, Inf))
x <- seq(-4,4, length.out = 1e5)
x_funs <- edfun(x, dfun = dnorm, support = c(-Inf, Inf))
x_funs$qfun(0) # -Inf
# precision - qfun
q_to_test <- seq(0.001,.999, length.out = 100)
edfun_out <- x_funs$qfun(q_to_test) # -4
edfun_simu_out <- x_funs_simu$qfun(q_to_test) # -4
real_out <- qnorm(q_to_test)
mean(abs(edfun_out-real_out))
mean(abs(edfun_simu_out-real_out)) # quite terrible compared to when knowing dfun
library(microbenchmark)
microbenchmark(dfun_known = x_funs$qfun(q_to_test),
dfun_NOT_known = x_funs_simu$qfun(q_to_test))
# same time for the q function!
# precision - pfun
q_to_test <- seq(-3,3, length.out = 100)
edfun_out <- x_funs$pfun(q_to_test) # -4
edfun_simu_out <- x_funs_simu$pfun(q_to_test) # -4
real_out <- pnorm(q_to_test)
mean(abs(edfun_out-real_out))
mean(abs(edfun_simu_out-real_out)) # quite terrible compared to when knowing dfun
library(microbenchmark)
microbenchmark(dfun_known = x_funs$pfun(q_to_test),
dfun_NOT_known = x_funs_simu$pfun(q_to_test))
# same time for the p function!
# timing for the rfun
library(microbenchmark)
microbenchmark(dfun_known = x_funs$rfun(100),
dfun_NOT_known = x_funs_simu$rfun(100))
# this rfun is slower...
First load the library:
# credit: library(smoothmest)
ddoublex <- function (x, mu = 0, lambda = 1) {
exp(-abs(x - mu)/lambda)/(2 * lambda)
}
curve(ddoublex, -4,4) # the peak in 0 should go to Inf, but it doesn't because of the limits of `curve`
First load the library:
# http://stackoverflow.com/questions/20452650/writing-a-bimodal-normal-distribution-function-in-r?rq=1
# random sample:
# mixtools::rnormmix # random sample of a mixture of distributions
# http://stats.stackexchange.com/questions/70855/generating-random-variables-from-a-mixture-of-normal-distributions
# nor1mix::rnorMix
# https://en.wikipedia.org/wiki/Mixture_distribution
# density:
dmixnorm <- function(x, p1 = 0.5, m1=0, m2=0, s1=1, s2=1) {
p1 * dnorm(x, m1, s1) + (1-p1) * dnorm(x, m2, s2)
}
# a convex mixture of densities is a density:
# https://en.wikipedia.org/wiki/Mixture_distribution#Properties
# yay - it works.
dmixnorm_1 <- function(x) dmixnorm(x, .75, 0,5, 1,1)
curve(dmixnorm_1, -4,9)
# A nice tutorial on the distr family of packages
# https://cran.r-project.org/web/packages/distrDoc/vignettes/distr.pdf
library(distr)
ddoublex <- function (x, mu = 0, lambda = 1) {
exp(-abs(x - mu)/lambda)/(2 * lambda)
}
my_dist <- AbscontDistribution(d=ddoublex)
curve(d(my_dist)(x),-2,2)
curve(p(my_dist)(x),-2,2)
curve(q(my_dist)(x),0,1)
hist(r(my_dist)(1000))
library(edfun)
x <- seq(-4,4, length.out = 1e3)
x_dist <- edfun(x, dfun = ddoublex)
f <- x_dist$pfun
curve(f, -4,4)
library(microbenchmark)
microbenchmark(distr = p(my_dist)(0), edfun = x_dist$pfun(0))
# > microbenchmark(distr = p(my_dist)(0), edfun = x_dist$pfun(0))
# Unit: microseconds
# expr min lq mean median uq max neval cld
# distr 5.588 6.518 8.56095 6.829 7.450 63.010 100 b
# edfun 1.552 1.863 2.70703 2.173 2.484 15.831 100 a
library(microbenchmark)
microbenchmark(distr = q(my_dist)(0.5), edfun = x_dist$qfun(0.5))
# Unit: microseconds
# expr min lq mean median uq max neval cld
# distr 26.694 27.935 33.55964 28.867 30.108 104.601 100 b
# edfun 1.552 1.863 2.63255 2.174 2.639 18.314 100 a
Comparing to the spd
R package
set.seed(123)
x <- rnorm(1000)
library(edfun)
x_dist <- edfun(x)
f <- x_dist$dfun
curve(f, -2,2)
library(spd)
fit <- spdfit(x)
library(microbenchmark)
microbenchmark(edfun = x_dist$dfun(0), spd = dspd(0, fit))
# Unit: microseconds
# expr min lq mean median uq max neval cld
# edfun 2.173 3.104 6.54647 8.07 8.8465 18.313 100 a
# spd 15690.458 18262.183 20566.08416 18914.78 19625.1010 85231.180 100 b
microbenchmark(edfun = x_dist$pfun(0.5), spd = pspd(0.5, fit))
# Unit: microseconds
# expr min lq mean median uq max neval cld
# edfun 1.552 2.1730 4.12237 3.725 4.1905 66.424 100 a
# spd 234.344 246.4495 260.13406 251.570 267.5560 421.818 100 b
microbenchmark(edfun = x_dist$qfun(0), spd = qspd(0, fit))
# Unit: microseconds
# expr min lq mean median uq max neval cld
# edfun 1.553 2.4840 3.70962 4.035 4.3460 18.314 100 a
# spd 255.140 266.6245 292.00170 271.435 292.3865 1354.847 100 b
You are welcome to:
You can see the most recent changes to the package in the NEWS.md file
#> R version 4.4.2 (2024-10-31)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.1 LTS
#>
#> Matrix products: default
#> BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: Etc/UTC
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats graphics grDevices utils datasets methods base
#>
#> other attached packages:
#> [1] knitr_1.49 edfun_0.3.0
#>
#> loaded via a namespace (and not attached):
#> [1] digest_0.6.37 R6_2.5.1 fastmap_1.2.0 xfun_0.49
#> [5] maketools_1.3.1 cachem_1.1.0 htmltools_0.5.8.1 rmarkdown_2.29
#> [9] buildtools_1.0.0 lifecycle_1.0.4 cli_3.6.3 sass_0.4.9
#> [13] jquerylib_0.1.4 compiler_4.4.2 sys_3.4.3 tools_4.4.2
#> [17] evaluate_1.0.1 bslib_0.8.0 yaml_2.3.10 jsonlite_1.8.9
#> [21] rlang_1.1.4