Choosing a new default RNG for dqrng

R
RNG
package
Published

September 9, 2023

Currently the dqrng package supports only xoroshiro128+ and xoshiro256+ from https://prng.di.unimi.it/ (see also Blackman and Vigna 2021). These RNGs should only be used for creating floating point numbers, which was the case for dqrng originally. However, dqsample and dqrrademacher make use of the full bit pattern. So it would be better to support the ** and/or ++ variants for both RNGs and make one of them the default. This would be a breaking change, of course. In #57 I have added these additional 4 RNGs to xoshiro.h so now is the time to do some benchmarking first by generating some random numbers:

#include <Rcpp.h>
// [[Rcpp::depends(dqrng, BH)]]
#include <dqrng_distribution.h>
#include <xoshiro.h>
// [[Rcpp::plugins(cpp11)]]

template<typename RNG>
double sum_rng(int n) {
  auto rng = dqrng::generator<RNG>(42);
  dqrng::uniform_distribution dist;
  double result = 0.0;
  for (int i = 0; i < n; ++i) {
    result += dist(*rng);
  }
  return result;
}

// [[Rcpp::export]]
double sum_128plus(int n) {
  return sum_rng<dqrng::xoroshiro128plus>(n);
}
// [[Rcpp::export]]
double sum_256plus(int n) {
  return sum_rng<dqrng::xoshiro256plus>(n);
}
// [[Rcpp::export]]
double sum_128starstar(int n) {
  return sum_rng<dqrng::xoroshiro128starstar>(n);
}
// [[Rcpp::export]]
double sum_256starstar(int n) {
  return sum_rng<dqrng::xoshiro256starstar>(n);
}
// [[Rcpp::export]]
double sum_128plusplus(int n) {
  return sum_rng<dqrng::xoroshiro128plusplus>(n);
}
// [[Rcpp::export]]
double sum_256plusplus(int n) {
  return sum_rng<dqrng::xoshiro256plusplus>(n);
}
N <- 1e5
bm <- bench::mark(
  sum_128plus(N),
  sum_128starstar(N),
  sum_128plusplus(N),
  sum_256plus(N),
  sum_256starstar(N),
  sum_256plusplus(N),
  check = FALSE,
  min_time = 1
)
knitr::kable(bm[, 1:6])
expression min median itr/sec mem_alloc gc/sec
sum_128plus(N) 486µs 504µs 1868.621 2.49KB 0
sum_128starstar(N) 486µs 501µs 1877.453 2.49KB 0
sum_128plusplus(N) 486µs 500µs 1876.092 2.49KB 0
sum_256plus(N) 487µs 500µs 1865.577 2.49KB 0
sum_256starstar(N) 486µs 503µs 1851.012 2.49KB 0
sum_256plusplus(N) 486µs 504µs 1852.690 2.49KB 0
plot(bm)

The current default xoroshiro128+ is fastest in this comparison with the other generators being very similar. Let’s try a more realistic usecase: generating many uniformaly distributed random numbers:

#include <Rcpp.h>
// [[Rcpp::depends(dqrng, BH)]]
#include <dqrng_distribution.h>
#include <xoshiro.h>
// [[Rcpp::plugins(cpp11)]]

template<typename RNG>
Rcpp::NumericVector runif_rng(int n) {
  auto rng = dqrng::generator<RNG>(42);
  dqrng::uniform_distribution dist;
  Rcpp::NumericVector result(Rcpp::no_init(n));
  std::generate(result.begin(), result.end(), [rng, dist] () {return dist(*rng);});
  return result;
}

// [[Rcpp::export]]
Rcpp::NumericVector runif_128plus(int n) {
  return runif_rng<dqrng::xoroshiro128plus>(n);
}
// [[Rcpp::export]]
Rcpp::NumericVector runif_256plus(int n) {
  return runif_rng<dqrng::xoshiro256plus>(n);
}
// [[Rcpp::export]]
Rcpp::NumericVector runif_128starstar(int n) {
  return runif_rng<dqrng::xoroshiro128starstar>(n);
}
// [[Rcpp::export]]
Rcpp::NumericVector runif_256starstar(int n) {
  return runif_rng<dqrng::xoshiro256starstar>(n);
}
// [[Rcpp::export]]
Rcpp::NumericVector runif_128plusplus(int n) {
  return runif_rng<dqrng::xoroshiro128plusplus>(n);
}
// [[Rcpp::export]]
Rcpp::NumericVector runif_256plusplus(int n) {
  return runif_rng<dqrng::xoshiro256plusplus>(n);
}
N <- 1e5
bm <- bench::mark(
  runif(N),
  runif_128plus(N),
  runif_128starstar(N),
  runif_128plusplus(N),
  runif_256plus(N),
  runif_256starstar(N),
  runif_256plusplus(N),
  check = FALSE,
  min_time = 1
)
knitr::kable(bm[, 1:6])
expression min median itr/sec mem_alloc gc/sec
runif(N) 2.97ms 3.51ms 278.908 786KB 4.147331
runif_128plus(N) 419.7µs 777.16µs 1284.184 784KB 22.993440
runif_128starstar(N) 420.05µs 777.19µs 1274.454 784KB 21.034679
runif_128plusplus(N) 419.83µs 813.89µs 1125.404 784KB 18.206746
runif_256plus(N) 489.21µs 857.94µs 1086.515 784KB 18.089734
runif_256starstar(N) 489.12µs 851.94µs 1168.742 784KB 19.178194
runif_256plusplus(N) 489.42µs 862.5µs 1102.899 784KB 18.210929
plot(bm)

Here all six generators are very similar, with all of them clearly faster than R’s built in runif. How about sampling with replacement, which is also mostly governed by the speed of generating random numbers:

#include <Rcpp.h>
// [[Rcpp::depends(dqrng, BH)]]
#include <dqrng_sample.h>
#include <xoshiro.h>
// [[Rcpp::plugins(cpp11)]]

// [[Rcpp::export]]
Rcpp::IntegerVector sample_128plus(int m, int n) {
  auto rng = dqrng::generator<dqrng::xoroshiro128plus>(42);
  return dqrng::sample::sample<Rcpp::IntegerVector, uint32_t>(*rng, uint32_t(m), uint32_t(n), true, 0);
}
// [[Rcpp::export]]
Rcpp::IntegerVector sample_128starstar(int m, int n) {
  auto rng = dqrng::generator<dqrng::xoroshiro128starstar>(42);
  return dqrng::sample::sample<Rcpp::IntegerVector, uint32_t>(*rng, uint32_t(m), uint32_t(n), true, 0);
}
// [[Rcpp::export]]
Rcpp::IntegerVector sample_128plusplus(int m, int n) {
  auto rng = dqrng::generator<dqrng::xoroshiro128plusplus>(42);
  return dqrng::sample::sample<Rcpp::IntegerVector, uint32_t>(*rng, uint32_t(m), uint32_t(n), true, 0);
}
// [[Rcpp::export]]
Rcpp::IntegerVector sample_256plus(int m, int n) {
  auto rng = dqrng::generator<dqrng::xoshiro256plus>(42);
  return dqrng::sample::sample<Rcpp::IntegerVector, uint32_t>(*rng, uint32_t(m), uint32_t(n), true, 0);
}
// [[Rcpp::export]]
Rcpp::IntegerVector sample_256starstar(int m, int n) {
  auto rng = dqrng::generator<dqrng::xoshiro256starstar>(42);
  return dqrng::sample::sample<Rcpp::IntegerVector, uint32_t>(*rng, uint32_t(m), uint32_t(n), true, 0);
}
// [[Rcpp::export]]
Rcpp::IntegerVector sample_256plusplus(int m, int n) {
  auto rng = dqrng::generator<dqrng::xoshiro256plusplus>(42);
  return dqrng::sample::sample<Rcpp::IntegerVector, uint32_t>(*rng, uint32_t(m), uint32_t(n), true, 0);
}
N <- 1e5
M <- 1e3
bm <- bench::mark(
  sample.int(M, N, replace = TRUE),
  sample_128plus(M, N),
  sample_128starstar(M, N),
  sample_128plusplus(M, N),
  sample_256plus(M, N),
  sample_256starstar(M, N),
  sample_256plusplus(M, N),
  check = FALSE,
  min_time = 1
)
knitr::kable(bm[, 1:6])
expression min median itr/sec mem_alloc gc/sec
sample.int(M, N, replace = TRUE) 3.25ms 3.61ms 269.5642 401KB 2.034447
sample_128plus(M, N) 408.16µs 633.07µs 1532.6744 393KB 14.303494
sample_128starstar(M, N) 373.28µs 556.72µs 1733.5808 393KB 14.147238
sample_128plusplus(M, N) 424.63µs 647.42µs 1516.8105 393KB 12.909025
sample_256plus(M, N) 391.63µs 575.64µs 1686.6577 393KB 15.283629
sample_256starstar(M, N) 406.82µs 595.84µs 1622.1948 393KB 12.809865
sample_256plusplus(M, N) 372.11µs 554.85µs 1760.1333 393KB 14.141986
plot(bm)

Again nothing really conclusive. All six RNGs are similar and much faster than R’s build in sample.int.

The speed comparisons between these generators is inconclusive to me. The xoroshiro128 seem to be slightly faster than the xoshiro256 variants. So I am leaning towards one of those as the new default while still making the corresponding xoshiro256 variant available in case a longer period or a higher degree of parallelisation is needed. Comparing the ++ and ** variants, I am leaning towards ++, but that is not set in stone.

References

Blackman, David, and Sebastiano Vigna. 2021. “Scrambled Linear Pseudorandom Number Generators.” ACM Transactions on Mathematical Software 47 (4): 1–32. https://doi.org/10.1145/3460772.