RcppParallel version much faster when using only one thread

I'm testing package RcppParallel to compute inner products for on-disk data (accessed via memory-mapping -- similar to package bigmemory).

A "minimal" reproducing example:

// [[Rcpp::depends(RcppParallel, BH, bigstatsr)]]

#include <bigstatsr/BMCodeAcc.h>

#include <RcppParallel.h>

using namespace RcppParallel;



struct Sum : public Worker {



  SubBMCode256Acc macc;

  double xySum;

  std::size_t j0, j;



  // constructors

  Sum(SubBMCode256Acc macc) :

    macc(macc),     xySum(0), j0(0), j(0) {}

  Sum(const Sum& sum, std::size_t j0, std::size_t j) :

    macc(sum.macc), xySum(0), j0(j0), j(j) {}

  Sum(const Sum& sum, Split) :

    macc(sum.macc), xySum(0), j0(sum.j0), j(sum.j) {}



  // accumulate just the element of the range I've been asked to

  void operator()(std::size_t begin, std::size_t end) {

    for (std::size_t i = begin; i < end; i++) {

      xySum += macc(i, j) * macc(i, j0);

    }

  }

  // join results

  void join(const Sum& rhs) {

    xySum += rhs.xySum;

  }

};



// [[Rcpp::export]]

NumericVector parallelVectorSum(Environment BM) {
  // Returns a vector with one entry per column j of the matrix behind `BM`:
  // the inner product of column j with column 0, each computed by a
  // parallelReduce over the rows.
  // `BM` must provide the "address" and "code256" fields read below.

  XPtr<FBM> xpBM = BM["address"];
  const std::size_t n = xpBM->nrow();
  const std::size_t m = xpBM->ncol();
  SubBMCode256Acc macc(xpBM, seq_len(n) - 1, seq_len(m) - 1, BM["code256"]);

  // About sqrt(n) rows per task. The conversion from double is explicit, and
  // the value is clamped so an empty matrix never yields a grain size of 0.
  std::size_t grain = static_cast<std::size_t>(std::sqrt(static_cast<double>(n)));
  if (grain < 1) grain = 1;

  NumericVector res(m);

  // One worker is re-targeted for every column instead of copy-constructing a
  // fresh Sum (and all of its members) per iteration.
  Sum sum(macc);
  for (std::size_t j = 0; j < m; j++) {
    sum.j0 = 0;
    sum.j = j;
    sum.xySum = 0;  // reset the accumulator left over from the previous column
    parallelReduce(0, n, sum, grain);
    res[j] = sum.xySum;
  }

  return res;
}





/*** R

RcppParallel::setThreadOptions(2)

library(bigsnpr)

snp <- snp_attachExtdata()

G <- snp$genotypes

test0 <- parallelVectorSum(G)



G2 <- big_copy(G, ind.row = rep(rows_along(G), 500))

dim(G2)

RcppParallel::setThreadOptions(1)

system.time(test1 <- parallelVectorSum(G2))

testthat::expect_identical(test1, 500 * test0)

RcppParallel::setThreadOptions(2)

system.time(test2 <- parallelVectorSum(G2))

testthat::expect_identical(test2, 500 * test0)

*/

Output:

> Rcpp::sourceCpp('tmp-tests/test-rcpp-parallel.cpp')



> RcppParallel::setThreadOptions(2)



> library(bigsnpr)



> snp <- snp_attachExtdata()



> G <- snp$genotypes



> test0 <- parallelVectorSum(G)



> G2 <- big_copy(G, ind.row = rep(rows_along(G), 500))



> dim(G2)

[1] 258500   4542



> RcppParallel::setThreadOptions(1)



> system.time(test1 <- parallelVectorSum(G2))  # 100 / 3

   user  system elapsed 

  3.621   0.423   4.045 



> testthat::expect_identical(test1, 500 * test0)



> RcppParallel::setThreadOptions(2)



> system.time(test2 <- parallelVectorSum(G2))  # 177 / 39

   user  system elapsed 

 39.958  42.590  53.516 



> testthat::expect_identical(test2, 500 * test0)

Using one thread takes 4 seconds, whereas using 2 threads takes 53 seconds. I'm a bit lost as to what could possibly be causing this large difference. Any ideas?

PS1: I've run this on two different computers (no other processes running).

PS2: I know I should probably parallelize over j instead. I've tested it; it works well. Yet, in the real problem I have, iterations over j are not independent, so it would be much easier to parallelize over i.

edited Nov 25 '18 at 21:18

asked Nov 25 '18 at 20:54

F. Privé

6,9902943

add a comment |

I'm testing package RcppParallel to compute inner products for on-disk data (accessed via memory-mapping -- similar to package bigmemory).

A "minimal" reproducing example:

// [[Rcpp::depends(RcppParallel, BH, bigstatsr)]]

#include <bigstatsr/BMCodeAcc.h>

#include <RcppParallel.h>

using namespace RcppParallel;



struct Sum : public Worker {



  SubBMCode256Acc macc;

  double xySum;

  std::size_t j0, j;



  // constructors

  Sum(SubBMCode256Acc macc) :

    macc(macc),     xySum(0), j0(0), j(0) {}

  Sum(const Sum& sum, std::size_t j0, std::size_t j) :

    macc(sum.macc), xySum(0), j0(j0), j(j) {}

  Sum(const Sum& sum, Split) :

    macc(sum.macc), xySum(0), j0(sum.j0), j(sum.j) {}



  // accumulate just the element of the range I've been asked to

  void operator()(std::size_t begin, std::size_t end) {

    for (std::size_t i = begin; i < end; i++) {

      xySum += macc(i, j) * macc(i, j0);

    }

  }

  // join results

  void join(const Sum& rhs) {

    xySum += rhs.xySum;

  }

};



// [[Rcpp::export]]

NumericVector parallelVectorSum(Environment BM) {
  // Returns a vector with one entry per column j of the matrix behind `BM`:
  // the inner product of column j with column 0, each computed by a
  // parallelReduce over the rows.
  // `BM` must provide the "address" and "code256" fields read below.

  XPtr<FBM> xpBM = BM["address"];
  const std::size_t n = xpBM->nrow();
  const std::size_t m = xpBM->ncol();
  SubBMCode256Acc macc(xpBM, seq_len(n) - 1, seq_len(m) - 1, BM["code256"]);

  // About sqrt(n) rows per task. The conversion from double is explicit, and
  // the value is clamped so an empty matrix never yields a grain size of 0.
  std::size_t grain = static_cast<std::size_t>(std::sqrt(static_cast<double>(n)));
  if (grain < 1) grain = 1;

  NumericVector res(m);

  // One worker is re-targeted for every column instead of copy-constructing a
  // fresh Sum (and all of its members) per iteration.
  Sum sum(macc);
  for (std::size_t j = 0; j < m; j++) {
    sum.j0 = 0;
    sum.j = j;
    sum.xySum = 0;  // reset the accumulator left over from the previous column
    parallelReduce(0, n, sum, grain);
    res[j] = sum.xySum;
  }

  return res;
}





/*** R

RcppParallel::setThreadOptions(2)

library(bigsnpr)

snp <- snp_attachExtdata()

G <- snp$genotypes

test0 <- parallelVectorSum(G)



G2 <- big_copy(G, ind.row = rep(rows_along(G), 500))

dim(G2)

RcppParallel::setThreadOptions(1)

system.time(test1 <- parallelVectorSum(G2))

testthat::expect_identical(test1, 500 * test0)

RcppParallel::setThreadOptions(2)

system.time(test2 <- parallelVectorSum(G2))

testthat::expect_identical(test2, 500 * test0)

*/

Output:

> Rcpp::sourceCpp('tmp-tests/test-rcpp-parallel.cpp')



> RcppParallel::setThreadOptions(2)



> library(bigsnpr)



> snp <- snp_attachExtdata()



> G <- snp$genotypes



> test0 <- parallelVectorSum(G)



> G2 <- big_copy(G, ind.row = rep(rows_along(G), 500))



> dim(G2)

[1] 258500   4542



> RcppParallel::setThreadOptions(1)



> system.time(test1 <- parallelVectorSum(G2))  # 100 / 3

   user  system elapsed 

  3.621   0.423   4.045 



> testthat::expect_identical(test1, 500 * test0)



> RcppParallel::setThreadOptions(2)



> system.time(test2 <- parallelVectorSum(G2))  # 177 / 39

   user  system elapsed 

 39.958  42.590  53.516 



> testthat::expect_identical(test2, 500 * test0)

Using one thread takes 4 seconds, whereas using 2 threads takes 53 seconds. I'm a bit lost as to what could possibly be causing this large difference. Any ideas?

PS1: I've run this on two different computers (no other processes running).

edited Nov 25 '18 at 21:18

asked Nov 25 '18 at 20:54

F. Privé

6,9902943

add a comment |

I'm testing package RcppParallel to compute inner products for on-disk data (accessed via memory-mapping -- similar to package bigmemory).

A "minimal" reproducing example:

// [[Rcpp::depends(RcppParallel, BH, bigstatsr)]]

#include <bigstatsr/BMCodeAcc.h>

#include <RcppParallel.h>

using namespace RcppParallel;



struct Sum : public Worker {



  SubBMCode256Acc macc;

  double xySum;

  std::size_t j0, j;



  // constructors

  Sum(SubBMCode256Acc macc) :

    macc(macc),     xySum(0), j0(0), j(0) {}

  Sum(const Sum& sum, std::size_t j0, std::size_t j) :

    macc(sum.macc), xySum(0), j0(j0), j(j) {}

  Sum(const Sum& sum, Split) :

    macc(sum.macc), xySum(0), j0(sum.j0), j(sum.j) {}



  // accumulate just the element of the range I've been asked to

  void operator()(std::size_t begin, std::size_t end) {

    for (std::size_t i = begin; i < end; i++) {

      xySum += macc(i, j) * macc(i, j0);

    }

  }

  // join results

  void join(const Sum& rhs) {

    xySum += rhs.xySum;

  }

};



// [[Rcpp::export]]

NumericVector parallelVectorSum(Environment BM) {
  // Returns a vector with one entry per column j of the matrix behind `BM`:
  // the inner product of column j with column 0, each computed by a
  // parallelReduce over the rows.
  // `BM` must provide the "address" and "code256" fields read below.

  XPtr<FBM> xpBM = BM["address"];
  const std::size_t n = xpBM->nrow();
  const std::size_t m = xpBM->ncol();
  SubBMCode256Acc macc(xpBM, seq_len(n) - 1, seq_len(m) - 1, BM["code256"]);

  // About sqrt(n) rows per task. The conversion from double is explicit, and
  // the value is clamped so an empty matrix never yields a grain size of 0.
  std::size_t grain = static_cast<std::size_t>(std::sqrt(static_cast<double>(n)));
  if (grain < 1) grain = 1;

  NumericVector res(m);

  // One worker is re-targeted for every column instead of copy-constructing a
  // fresh Sum (and all of its members) per iteration.
  Sum sum(macc);
  for (std::size_t j = 0; j < m; j++) {
    sum.j0 = 0;
    sum.j = j;
    sum.xySum = 0;  // reset the accumulator left over from the previous column
    parallelReduce(0, n, sum, grain);
    res[j] = sum.xySum;
  }

  return res;
}





/*** R

RcppParallel::setThreadOptions(2)

library(bigsnpr)

snp <- snp_attachExtdata()

G <- snp$genotypes

test0 <- parallelVectorSum(G)



G2 <- big_copy(G, ind.row = rep(rows_along(G), 500))

dim(G2)

RcppParallel::setThreadOptions(1)

system.time(test1 <- parallelVectorSum(G2))

testthat::expect_identical(test1, 500 * test0)

RcppParallel::setThreadOptions(2)

system.time(test2 <- parallelVectorSum(G2))

testthat::expect_identical(test2, 500 * test0)

*/

Output:

> Rcpp::sourceCpp('tmp-tests/test-rcpp-parallel.cpp')



> RcppParallel::setThreadOptions(2)



> library(bigsnpr)



> snp <- snp_attachExtdata()



> G <- snp$genotypes



> test0 <- parallelVectorSum(G)



> G2 <- big_copy(G, ind.row = rep(rows_along(G), 500))



> dim(G2)

[1] 258500   4542



> RcppParallel::setThreadOptions(1)



> system.time(test1 <- parallelVectorSum(G2))  # 100 / 3

   user  system elapsed 

  3.621   0.423   4.045 



> testthat::expect_identical(test1, 500 * test0)



> RcppParallel::setThreadOptions(2)



> system.time(test2 <- parallelVectorSum(G2))  # 177 / 39

   user  system elapsed 

 39.958  42.590  53.516 



> testthat::expect_identical(test2, 500 * test0)

Using one thread takes 4 seconds, whereas using 2 threads takes 53 seconds. I'm a bit lost as to what could possibly be causing this large difference. Any ideas?

PS1: I've run this on two different computers (no other processes running).

edited Nov 25 '18 at 21:18

asked Nov 25 '18 at 20:54

F. Privé

6,9902943

I'm testing package RcppParallel to compute inner products for on-disk data (accessed via memory-mapping -- similar to package bigmemory).

A "minimal" reproducing example:

// [[Rcpp::depends(RcppParallel, BH, bigstatsr)]]

#include <bigstatsr/BMCodeAcc.h>

#include <RcppParallel.h>

using namespace RcppParallel;



struct Sum : public Worker {



  SubBMCode256Acc macc;

  double xySum;

  std::size_t j0, j;



  // constructors

  Sum(SubBMCode256Acc macc) :

    macc(macc),     xySum(0), j0(0), j(0) {}

  Sum(const Sum& sum, std::size_t j0, std::size_t j) :

    macc(sum.macc), xySum(0), j0(j0), j(j) {}

  Sum(const Sum& sum, Split) :

    macc(sum.macc), xySum(0), j0(sum.j0), j(sum.j) {}



  // accumulate just the element of the range I've been asked to

  void operator()(std::size_t begin, std::size_t end) {

    for (std::size_t i = begin; i < end; i++) {

      xySum += macc(i, j) * macc(i, j0);

    }

  }

  // join results

  void join(const Sum& rhs) {

    xySum += rhs.xySum;

  }

};



// [[Rcpp::export]]

NumericVector parallelVectorSum(Environment BM) {
  // Returns a vector with one entry per column j of the matrix behind `BM`:
  // the inner product of column j with column 0, each computed by a
  // parallelReduce over the rows.
  // `BM` must provide the "address" and "code256" fields read below.

  XPtr<FBM> xpBM = BM["address"];
  const std::size_t n = xpBM->nrow();
  const std::size_t m = xpBM->ncol();
  SubBMCode256Acc macc(xpBM, seq_len(n) - 1, seq_len(m) - 1, BM["code256"]);

  // About sqrt(n) rows per task. The conversion from double is explicit, and
  // the value is clamped so an empty matrix never yields a grain size of 0.
  std::size_t grain = static_cast<std::size_t>(std::sqrt(static_cast<double>(n)));
  if (grain < 1) grain = 1;

  NumericVector res(m);

  // One worker is re-targeted for every column instead of copy-constructing a
  // fresh Sum (and all of its members) per iteration.
  Sum sum(macc);
  for (std::size_t j = 0; j < m; j++) {
    sum.j0 = 0;
    sum.j = j;
    sum.xySum = 0;  // reset the accumulator left over from the previous column
    parallelReduce(0, n, sum, grain);
    res[j] = sum.xySum;
  }

  return res;
}





/*** R

RcppParallel::setThreadOptions(2)

library(bigsnpr)

snp <- snp_attachExtdata()

G <- snp$genotypes

test0 <- parallelVectorSum(G)



G2 <- big_copy(G, ind.row = rep(rows_along(G), 500))

dim(G2)

RcppParallel::setThreadOptions(1)

system.time(test1 <- parallelVectorSum(G2))

testthat::expect_identical(test1, 500 * test0)

RcppParallel::setThreadOptions(2)

system.time(test2 <- parallelVectorSum(G2))

testthat::expect_identical(test2, 500 * test0)

*/

Output:

> Rcpp::sourceCpp('tmp-tests/test-rcpp-parallel.cpp')



> RcppParallel::setThreadOptions(2)



> library(bigsnpr)



> snp <- snp_attachExtdata()



> G <- snp$genotypes



> test0 <- parallelVectorSum(G)



> G2 <- big_copy(G, ind.row = rep(rows_along(G), 500))



> dim(G2)

[1] 258500   4542



> RcppParallel::setThreadOptions(1)



> system.time(test1 <- parallelVectorSum(G2))  # 100 / 3

   user  system elapsed 

  3.621   0.423   4.045 



> testthat::expect_identical(test1, 500 * test0)



> RcppParallel::setThreadOptions(2)



> system.time(test2 <- parallelVectorSum(G2))  # 177 / 39

   user  system elapsed 

 39.958  42.590  53.516 



> testthat::expect_identical(test2, 500 * test0)

Using one thread takes 4 seconds, whereas using 2 threads takes 53 seconds. I'm a bit lost as to what could possibly be causing this large difference. Any ideas?

PS1: I've run this on two different computers (no other processes running).

r parallel-processing rcppparallel

edited Nov 25 '18 at 21:18

asked Nov 25 '18 at 20:54

F. Privé

6,9902943

edited Nov 25 '18 at 21:18

asked Nov 25 '18 at 20:54

F. Privé

6,9902943

edited Nov 25 '18 at 21:18

asked Nov 25 '18 at 20:54

F. Privé

6,9902943

asked Nov 25 '18 at 20:54

F. Privé

6,9902943

asked Nov 25 '18 at 20:54

F. Privé

6,9902943

add a comment |

0

active

oldest

votes

Your Answer

// Nested loader callbacks: "editor" -> "externalEditor" -> "snippets"; the
// innermost callback calls StackExchange.snippets.init().
// NOTE(review): presumably each callback runs once the named module is
// available, and "code-snippets" names a further dependency -- confirm
// against the StackExchange loader.
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

// Page-ready hook: sets up the tag renderer with empty tag lists, then creates
// the answer editor (after the "snippets" module when snippets are enabled).
StackExchange.ready(function() {
    var channelOptions = {
        tags: "".split(" "),
        id: "1"
    };
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function() {
        // Have to fire editor after snippets, if snippets enabled
        if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
                createEditor();
            });
        }
        else {
            createEditor();
        }
    });

    // Configures the answer editor via StackExchange.prepareEditor.
    // The two HTML strings below had lost their backslashes (the `\u003c` /
    // `\u003e` escapes and the escaped inner quotes), which made the literals
    // syntactically invalid; they are restored here.
    function createEditor() {
        StackExchange.prepareEditor({
            heartbeatType: 'answer',
            autoActivateHeartbeat: false,
            convertImagesToLinks: true,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: 10,
            bindNavPrevention: true,
            postfix: "",
            imageUploader: {
                brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
                contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
                allowUrls: true
            },
            onDemand: true,
            discardSelector: ".discard-answer",
            immediatelyShowMarkdownHelp: true
        });
    }
});

draft saved

draft discarded

Sign up or log in

// Page-ready hook: passes the '#login-link' selector to
// StackExchange.helpers.onClickDraftSave.
// NOTE(review): presumably saves the answer draft when that link is clicked -- confirm.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

// Page-ready hook: calls StackExchange.openid.initPostLogin with the
// '.new-post-login' selector, the URL-encoded address of this question's
// "#new-answer" anchor, and the 'question_page' tag.
// NOTE(review): presumably the URL is the post-login return target -- confirm.
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53471869%2frcppparallel-version-much-faster-when-using-only-one-thread%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

0

active

oldest

votes

0

active

oldest

votes

draft saved

draft discarded

Thanks for contributing an answer to Stack Overflow!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

// Page-ready hook: passes the '#login-link' selector to
// StackExchange.helpers.onClickDraftSave.
// NOTE(review): presumably saves the answer draft when that link is clicked -- confirm.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

// Page-ready hook: passes the '#login-link' selector to
// StackExchange.helpers.onClickDraftSave.
// NOTE(review): presumably saves the answer draft when that link is clicked -- confirm.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

// Page-ready hook: passes the '#login-link' selector to
// StackExchange.helpers.onClickDraftSave.
// NOTE(review): presumably saves the answer draft when that link is clicked -- confirm.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

// Page-ready hook: passes the '#login-link' selector to
// StackExchange.helpers.onClickDraftSave.
// NOTE(review): presumably saves the answer draft when that link is clicked -- confirm.
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is for reference only. If you need detailed information, please check here.

DGFsTj5T HYFrM9tuuAH0TRnOtse

搜尋此網誌

Tukukkk