#include <Rcpp.h>
// [[Rcpp::depends(dqrng, BH)]]
#include <dqrng_distribution.h>
#include <xoshiro.h>
// [[Rcpp::plugins(cpp11)]]
template<typename RNG>
double sum_rng(int n) {
auto rng = dqrng::generator<RNG>(42);
::uniform_distribution dist;
dqrngdouble result = 0.0;
for (int i = 0; i < n; ++i) {
+= dist(*rng);
result }
return result;
}
// [[Rcpp::export]]
double sum_128plus(int n) {
return sum_rng<dqrng::xoroshiro128plus>(n);
}
// [[Rcpp::export]]
double sum_256plus(int n) {
return sum_rng<dqrng::xoshiro256plus>(n);
}
// [[Rcpp::export]]
double sum_128starstar(int n) {
return sum_rng<dqrng::xoroshiro128starstar>(n);
}
// [[Rcpp::export]]
double sum_256starstar(int n) {
return sum_rng<dqrng::xoshiro256starstar>(n);
}
// [[Rcpp::export]]
double sum_128plusplus(int n) {
return sum_rng<dqrng::xoroshiro128plusplus>(n);
}
// [[Rcpp::export]]
double sum_256plusplus(int n) {
return sum_rng<dqrng::xoshiro256plusplus>(n);
}
Currently the dqrng package supports only xoroshiro128+ and xoshiro256+ from https://prng.di.unimi.it/ (see also Blackman and Vigna 2021). These RNGs should only be used for creating floating point numbers, which was the case for dqrng originally. However, dqsample
and dqrrademacher
make use of the full bit pattern. So it would be better to support the **
and/or ++
variants for both RNGs and make one of them the default. This would be a breaking change, of course. In #57 I have added these additional 4 RNGs to xoshiro.h
so now is the time to do some benchmarking first by generating some random numbers:
<- 1e5
N <- bench::mark(
bm sum_128plus(N),
sum_128starstar(N),
sum_128plusplus(N),
sum_256plus(N),
sum_256starstar(N),
sum_256plusplus(N),
check = FALSE,
min_time = 1
)::kable(bm[, 1:6]) knitr
expression | min | median | itr/sec | mem_alloc | gc/sec |
---|---|---|---|---|---|
sum_128plus(N) | 486µs | 504µs | 1868.621 | 2.49KB | 0 |
sum_128starstar(N) | 486µs | 501µs | 1877.453 | 2.49KB | 0 |
sum_128plusplus(N) | 486µs | 500µs | 1876.092 | 2.49KB | 0 |
sum_256plus(N) | 487µs | 500µs | 1865.577 | 2.49KB | 0 |
sum_256starstar(N) | 486µs | 503µs | 1851.012 | 2.49KB | 0 |
sum_256plusplus(N) | 486µs | 504µs | 1852.690 | 2.49KB | 0 |
plot(bm)
The current default xoroshiro128+ is fastest in this comparison with the other generators being very similar. Let’s try a more realistic usecase: generating many uniformaly distributed random numbers:
#include <Rcpp.h>
// [[Rcpp::depends(dqrng, BH)]]
#include <dqrng_distribution.h>
#include <xoshiro.h>
// [[Rcpp::plugins(cpp11)]]
template<typename RNG>
::NumericVector runif_rng(int n) {
Rcppauto rng = dqrng::generator<RNG>(42);
::uniform_distribution dist;
dqrng::NumericVector result(Rcpp::no_init(n));
Rcppstd::generate(result.begin(), result.end(), [rng, dist] () {return dist(*rng);});
return result;
}
// [[Rcpp::export]]
::NumericVector runif_128plus(int n) {
Rcppreturn runif_rng<dqrng::xoroshiro128plus>(n);
}
// [[Rcpp::export]]
::NumericVector runif_256plus(int n) {
Rcppreturn runif_rng<dqrng::xoshiro256plus>(n);
}
// [[Rcpp::export]]
::NumericVector runif_128starstar(int n) {
Rcppreturn runif_rng<dqrng::xoroshiro128starstar>(n);
}
// [[Rcpp::export]]
::NumericVector runif_256starstar(int n) {
Rcppreturn runif_rng<dqrng::xoshiro256starstar>(n);
}
// [[Rcpp::export]]
::NumericVector runif_128plusplus(int n) {
Rcppreturn runif_rng<dqrng::xoroshiro128plusplus>(n);
}
// [[Rcpp::export]]
::NumericVector runif_256plusplus(int n) {
Rcppreturn runif_rng<dqrng::xoshiro256plusplus>(n);
}
<- 1e5
N <- bench::mark(
bm runif(N),
runif_128plus(N),
runif_128starstar(N),
runif_128plusplus(N),
runif_256plus(N),
runif_256starstar(N),
runif_256plusplus(N),
check = FALSE,
min_time = 1
)::kable(bm[, 1:6]) knitr
expression | min | median | itr/sec | mem_alloc | gc/sec |
---|---|---|---|---|---|
runif(N) | 2.97ms | 3.51ms | 278.908 | 786KB | 4.147331 |
runif_128plus(N) | 419.7µs | 777.16µs | 1284.184 | 784KB | 22.993440 |
runif_128starstar(N) | 420.05µs | 777.19µs | 1274.454 | 784KB | 21.034679 |
runif_128plusplus(N) | 419.83µs | 813.89µs | 1125.404 | 784KB | 18.206746 |
runif_256plus(N) | 489.21µs | 857.94µs | 1086.515 | 784KB | 18.089734 |
runif_256starstar(N) | 489.12µs | 851.94µs | 1168.742 | 784KB | 19.178194 |
runif_256plusplus(N) | 489.42µs | 862.5µs | 1102.899 | 784KB | 18.210929 |
plot(bm)
Here all six generators are very similar, with all of them clearly faster than R’s built in runif
. How about sampling with replacement, which is also mostly governed by the speed of generating random numbers:
#include <Rcpp.h>
// [[Rcpp::depends(dqrng, BH)]]
#include <dqrng_sample.h>
#include <xoshiro.h>
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
::IntegerVector sample_128plus(int m, int n) {
Rcppauto rng = dqrng::generator<dqrng::xoroshiro128plus>(42);
return dqrng::sample::sample<Rcpp::IntegerVector, uint32_t>(*rng, uint32_t(m), uint32_t(n), true, 0);
}
// [[Rcpp::export]]
::IntegerVector sample_128starstar(int m, int n) {
Rcppauto rng = dqrng::generator<dqrng::xoroshiro128starstar>(42);
return dqrng::sample::sample<Rcpp::IntegerVector, uint32_t>(*rng, uint32_t(m), uint32_t(n), true, 0);
}
// [[Rcpp::export]]
::IntegerVector sample_128plusplus(int m, int n) {
Rcppauto rng = dqrng::generator<dqrng::xoroshiro128plusplus>(42);
return dqrng::sample::sample<Rcpp::IntegerVector, uint32_t>(*rng, uint32_t(m), uint32_t(n), true, 0);
}
// [[Rcpp::export]]
::IntegerVector sample_256plus(int m, int n) {
Rcppauto rng = dqrng::generator<dqrng::xoshiro256plus>(42);
return dqrng::sample::sample<Rcpp::IntegerVector, uint32_t>(*rng, uint32_t(m), uint32_t(n), true, 0);
}
// [[Rcpp::export]]
::IntegerVector sample_256starstar(int m, int n) {
Rcppauto rng = dqrng::generator<dqrng::xoshiro256starstar>(42);
return dqrng::sample::sample<Rcpp::IntegerVector, uint32_t>(*rng, uint32_t(m), uint32_t(n), true, 0);
}
// [[Rcpp::export]]
::IntegerVector sample_256plusplus(int m, int n) {
Rcppauto rng = dqrng::generator<dqrng::xoshiro256plusplus>(42);
return dqrng::sample::sample<Rcpp::IntegerVector, uint32_t>(*rng, uint32_t(m), uint32_t(n), true, 0);
}
<- 1e5
N <- 1e3
M <- bench::mark(
bm sample.int(M, N, replace = TRUE),
sample_128plus(M, N),
sample_128starstar(M, N),
sample_128plusplus(M, N),
sample_256plus(M, N),
sample_256starstar(M, N),
sample_256plusplus(M, N),
check = FALSE,
min_time = 1
)::kable(bm[, 1:6]) knitr
expression | min | median | itr/sec | mem_alloc | gc/sec |
---|---|---|---|---|---|
sample.int(M, N, replace = TRUE) | 3.25ms | 3.61ms | 269.5642 | 401KB | 2.034447 |
sample_128plus(M, N) | 408.16µs | 633.07µs | 1532.6744 | 393KB | 14.303494 |
sample_128starstar(M, N) | 373.28µs | 556.72µs | 1733.5808 | 393KB | 14.147238 |
sample_128plusplus(M, N) | 424.63µs | 647.42µs | 1516.8105 | 393KB | 12.909025 |
sample_256plus(M, N) | 391.63µs | 575.64µs | 1686.6577 | 393KB | 15.283629 |
sample_256starstar(M, N) | 406.82µs | 595.84µs | 1622.1948 | 393KB | 12.809865 |
sample_256plusplus(M, N) | 372.11µs | 554.85µs | 1760.1333 | 393KB | 14.141986 |
plot(bm)
Again nothing really conclusive. All six RNGs are similar and much faster than R’s build in sample.int
.
The speed comparisons between these generators is inconclusive to me. The xoroshiro128 seem to be slightly faster than the xoshiro256 variants. So I am leaning towards one of those as the new default while still making the corresponding xoshiro256 variant available in case a longer period or a higher degree of parallelisation is needed. Comparing the ++
and **
variants, I am leaning towards ++
, but that is not set in stone.