Lecture 20
bench
library(dplyr)

d = tibble(
  x = runif(10000),
  y = runif(10000)
)

(b = bench::mark(
  d[d$x > 0.5, ],
  d[which(d$x > 0.5), ],
  subset(d, x > 0.5),
  filter(d, x > 0.5)
))
# A tibble: 4 × 6
expression min median `itr/sec` mem_alloc `gc/sec`
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
1 d[d$x > 0.5, ] 49.4µs 57.4µs 16967. 240.14KB 46.5
2 d[which(d$x > 0.5), ] 90µs 102.6µs 9559. 272.03KB 50.8
3 subset(d, x > 0.5) 87.6µs 104.1µs 9350. 298.36KB 49.3
4 filter(d, x > 0.5) 290.9µs 318µs 3000. 1.48MB 48.7
d = tibble(
  x = runif(1e6),
  y = runif(1e6)
)

(b = bench::mark(
  d[d$x > 0.5, ],
  d[which(d$x > 0.5), ],
  subset(d, x > 0.5),
  filter(d, x > 0.5)
))
# A tibble: 4 × 6
expression min median `itr/sec` mem_alloc `gc/sec`
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
1 d[d$x > 0.5, ] 3.64ms 4.19ms 240. 13.4MB 160.
2 d[which(d$x > 0.5), ] 8.45ms 8.85ms 113. 24.8MB 142.
3 subset(d, x > 0.5) 9.15ms 9.87ms 102. 24.8MB 117.
4 filter(d, x > 0.5) 4.9ms 5.5ms 181. 24.8MB 228.
bench - relative results
# A tibble: 4 × 6
expression min median `itr/sec` mem_alloc `gc/sec`
<bch:expr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 d[d$x > 0.5, ] 1 1 2.36 1 1.37
2 d[which(d$x > 0.5), ] 2.32 2.11 1.11 1.86 1.21
3 subset(d, x > 0.5) 2.51 2.36 1 1.86 1
4 filter(d, x > 0.5) 1.35 1.31 1.78 1.86 1.95
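These relative values can be obtained directly from the benchmark object; a minimal sketch, assuming b is the bench::mark result from the previous slide:

# Rescale each numeric column so the smallest value is 1
summary(b, relative = TRUE)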
parallel
Part of the base packages in R
Tools for the forking of R processes (some functions do not work on Windows)
Core functions:
detectCores
pvec
mclapply
mcparallel & mccollect
detectCores
Surprisingly, detects the number of cores of the current system.
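A quick check (the result depends entirely on the machine running it):

library(parallel)
detectCores()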
pvec
Parallelization of a vectorized function call
user system elapsed
0.088 0.011 0.099
user system elapsed
0.164 0.118 0.224
user system elapsed
0.091 0.166 0.165
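The timings above come from parallelizing a vectorized call with pvec; a minimal sketch of that kind of comparison (the function, vector size, and core count are illustrative assumptions):

library(parallel)
x = runif(1e7)

system.time(sqrt(x))                      # serial, vectorized call
system.time(pvec(x, sqrt, mc.cores = 4))  # split x across 4 forked processes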
bench::system_time
cores | 10^6 | 10^7 | 10^8 |
---|---|---|---|
1 | 0.004 | 0.057 | 0.324 |
4 | 0.038 | 0.149 | 1.738 |
6 | 0.031 | 0.143 | 1.336 |
8 | 0.042 | 0.137 | 1.438 |
10 | 0.032 | 0.168 | 1.406 |
mclapply
Parallelized version of lapply
user system elapsed
0.262 0.004 0.265
user system elapsed
0.327 0.092 0.268
user system elapsed
0.335 0.100 0.174
user system elapsed
0.338 0.150 0.163
user system elapsed
0.368 0.157 0.169
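The timings above compare different core counts; a minimal sketch of that kind of mclapply call (the work function and core counts are illustrative assumptions):

library(parallel)

system.time( lapply(1:10, function(i) mean(rnorm(1e6))) )                  # serial
system.time( mclapply(1:10, function(i) mean(rnorm(1e6)), mc.cores = 4) )  # 4 forked workers
system.time( mclapply(1:10, function(i) mean(rnorm(1e6)), mc.cores = 8) )  # 8 forked workers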
mcparallel
Asynchronous evaluation of an R expression in a separate process
mccollect
Checks mcparallel objects for completion
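A minimal sketch of pairing the two (the expressions being evaluated are illustrative assumptions):

library(parallel)

# Start two jobs in separate forked processes; both calls return immediately
m1 = mcparallel(mean(rnorm(1e6)))
m2 = mcparallel(quantile(rnorm(1e6)))

# ... other work can happen here while the jobs run ...

# Block until both jobs finish and gather their results as a list
# (mccollect(jobs, wait = FALSE) instead polls for completed jobs)
mccollect(list(m1, m2))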
doMC & foreach
Packages by Revolution Analytics that provide the foreach function, which is a parallelizable for loop (and then some).
Core functions:
registerDoMC
foreach, %dopar%, %do%
registerDoMC
Primarily used to set the number of cores used by foreach; by default it uses options("cores") or half the number of cores found by detectCores from the parallel package.
foreach
A slightly more powerful version of base for loops (think for with an lapply flavor). Combined with %do% or %dopar% for single or multicore execution.
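A minimal sketch of the %do% form (the loop body is an illustrative assumption):

library(foreach)

# Unlike a base for loop, foreach returns the per-iteration results (a list by default)
foreach(i = 1:3) %do% {
  sqrt(i)
}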
foreach - iterators
foreach can iterate across more than one value, but it doesn’t do length coercion
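For example (values chosen for illustration), iteration stops at the length of the shortest argument rather than recycling:

library(foreach)

# Two iterators of equal length - pairs (1,4), (2,5), (3,6)
foreach(i = 1:3, j = 4:6) %do% {
  i + j
}

# Unequal lengths - only two iterations, j is not recycled
foreach(i = 1:4, j = c(10, 20)) %do% {
  i + j
}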
foreach - combining results
foreach - parallelization
Swapping out %do% for %dopar% will use the parallel backend.
user system elapsed
0.298 0.028 0.110
user system elapsed
0.302 0.039 0.078
user system elapsed
0.336 0.051 0.067
user system elapsed
0.000 0.000 3.011
user system elapsed
0.045 0.006 3.074
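A minimal sketch putting the last two slides together, assuming the doMC backend and an illustrative loop body (.combine controls how the per-iteration results are merged):

library(doMC)
registerDoMC(cores = 4)  # register a parallel backend for %dopar%

# .combine = c collapses the per-iteration results into a vector
foreach(i = 1:8, .combine = c) %dopar% {
  mean(rnorm(1e5, mean = i))
}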
Bootstrapping is a resampling scheme where the original data is repeatedly reconstructed by taking samples of size n (with replacement) from the original data, and using each resample to repeat an analysis procedure of interest. Below is an example of fitting a local regression (loess) to some synthetic data; we will construct a bootstrap prediction interval for this model.
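The original data and code are not reproduced here; the following is a minimal sketch of the idea, assuming synthetic data from a noisy sine curve and 100 bootstrap resamples run across forked processes:

library(parallel)

set.seed(123)
n = 250
d = data.frame(x = runif(n, 0, 10))
d$y = sin(d$x) + rnorm(n, sd = 0.3)

x_grid = seq(0, 10, length.out = 200)

# One bootstrap iteration: resample rows, refit loess, predict on a fixed grid
boot_fit = function(i) {
  d_boot = d[sample(n, n, replace = TRUE), ]
  m = loess(y ~ x, data = d_boot)
  predict(m, newdata = data.frame(x = x_grid))
}

preds = do.call(rbind, mclapply(1:100, boot_fit, mc.cores = 4))

# Pointwise 95% bootstrap interval for the fitted curve
interval = apply(preds, 2, quantile, probs = c(0.025, 0.975), na.rm = TRUE)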
Optimal use of parallelization / multiple cores is hard; there isn’t one best solution
Don’t underestimate the overhead cost
Experimentation is key
Measure it or it didn’t happen
Be aware of the trade-off between developer time and run time
An awful lot of statistics is at its core linear algebra.
For example:
\[ \hat{\beta} = (X^T X)^{-1} X^Ty \]
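A minimal sketch of computing this in R (the simulated data are assumptions); both forms ultimately call into the BLAS/LAPACK routines discussed below:

set.seed(1)
n = 1000; p = 5
X = cbind(1, matrix(rnorm(n * (p - 1)), n))
y = X %*% rnorm(p) + rnorm(n)

# Normal equations: (X'X)^{-1} X'y
beta_hat = solve(crossprod(X), crossprod(X, y))

# QR-based solve - numerically more stable, and what lm() uses
beta_qr = qr.solve(X, y)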
Principal component analysis
Find \(T = XW\) where \(W\) is a matrix whose columns are the eigenvectors of \(X^TX\).
Often solved via SVD - Let \(X = U\Sigma W^T\) then \(T = U\Sigma\).
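A minimal sketch of the SVD route (the data matrix is an assumption; columns are centered first):

X = scale(matrix(rnorm(100 * 4), 100, 4), center = TRUE, scale = FALSE)

s = svd(X)              # X = U Sigma W^T, with W = s$v
T1 = s$u %*% diag(s$d)  # scores T = U Sigma
T2 = X %*% s$v          # equivalently T = X W

all.equal(T1, T2)       # the two give the same scores up to numerical error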
Not unique to statistics - these are the types of problems that come up across all areas of numerical computing.
Numerical linear algebra \(\ne\) mathematical linear algebra
Efficiency and stability of numerical algorithms matter
Don’t reinvent the wheel - use common core linear algebra tools (well-defined API)
Low level algorithms for common linear algebra operations
BLAS - Basic Linear Algebra Subprograms
Copying, scaling, multiplying vectors and matrices
Origins go back to 1979, written in Fortran
LAPACK - Linear Algebra Package
Higher level functionality building on BLAS.
Linear solvers, eigenvalues, and matrix decompositions
Origins go back to 1992, mostly Fortran (expanded on LINPACK, EISPACK)
Most default BLAS and LAPACK implementations (like R’s defaults) are somewhat dated
Written in Fortran and designed for a single CPU core
Certain (potentially non-optimal) hard-coded defaults (e.g. block size).
Multithreaded alternatives:
ATLAS - Automatically Tuned Linear Algebra Software
OpenBLAS - fork of GotoBLAS from TACC at UTexas
Intel MKL - Math Kernel Library, part of Intel’s commercial compiler tools
cuBLAS / Magma - GPU libraries from Nvidia and UTK respectively
Accelerate / vecLib - Apple’s framework for GPU and multicore computing
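One way to see which BLAS / LAPACK a given R build is linked against (the exact output depends on the system and R version):

sessionInfo()  # reports the BLAS and LAPACK libraries in use (R >= 3.4)
La_version()   # version of the LAPACK linked into this build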
n | 1 core | 2 cores | 4 cores | 8 cores | 16 cores |
---|---|---|---|---|---|
100 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 |
500 | 0.004 | 0.003 | 0.002 | 0.002 | 0.004 |
1000 | 0.028 | 0.016 | 0.010 | 0.007 | 0.009 |
2000 | 0.207 | 0.110 | 0.058 | 0.035 | 0.039 |
3000 | 0.679 | 0.352 | 0.183 | 0.103 | 0.081 |
4000 | 1.587 | 0.816 | 0.418 | 0.227 | 0.145 |
5000 | 3.104 | 1.583 | 0.807 | 0.453 | 0.266 |
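If the timings above are seconds for an n x n matrix product (an assumption about the benchmark), they can be reproduced with something along these lines; how many threads the BLAS uses is typically controlled outside of R, e.g. via an environment variable such as OPENBLAS_NUM_THREADS:

n = 5000
X = matrix(runif(n^2), nrow = n)

# A multithreaded BLAS parallelizes this call automatically
system.time(X %*% X)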
Sta 523 - Fall 2023