Zoltan2_Directory_Comm.cpp
1// @HEADER
2// *****************************************************************************
3// Zoltan2: A package of combinatorial algorithms for scientific computing
4//
5// Copyright 2012 NTESS and the Zoltan2 contributors.
6// SPDX-License-Identifier: BSD-3-Clause
7// *****************************************************************************
8// @HEADER
9
10#include "Zoltan2_Directory_Comm.hpp"
11#include <stdexcept>
12#include <memory>
13
14namespace Zoltan2 {
15
16void Zoltan2_Directory_Plan::getInvertedValues(Zoltan2_Directory_Plan * from) {
17 total_recv_size = 0;
18 for (int i = 0; i < from->nsends + from->self_msg; i++) {
19 total_recv_size += from->lengths_to[i];
20 }
21
22 max_send_size = 0;
23 for (int i = 0; i < from->nrecvs; i++) {
24 if (from->lengths_from[i] > max_send_size) {
25 max_send_size = from->lengths_from[i];
26 }
27 }
28
29 nvals = from->nvals_recv;
30 nvals_recv = from->nvals;
31 lengths_to = from->lengths_from;
32 procs_to = from->procs_from;
33 indices_to = from->indices_from;
34
35 starts_to = from->starts_from;
36 lengths_from = from->lengths_to;
37 procs_from = from->procs_to;
38 indices_from = from->indices_to;
39
40 starts_from = from->starts_to;
41 nrecvs = from->nsends;
42 nsends = from->nrecvs;
43 self_msg = from->self_msg;
44 comm = from->comm;
45}
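// The assignments above build the mirror image of 'from': everything 'from'
// sends, this plan receives, and vice versa, while total_recv_size and
// max_send_size are recomputed for the flipped direction. create_reverse_plan()
// below uses exactly this so that do_reverse() can return data along the same
// links that do_forward() used:
//
//   plan_forward->plan_reverse = new Zoltan2_Directory_Plan;
//   plan_forward->plan_reverse->getInvertedValues(plan_forward);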
46
47
48void Zoltan2_Directory_Plan::print(const std::string& headerMessage) const {
49
50 #define PRINT_VECTOR(v) \
51 if(v != Teuchos::null) { \
52 std::cout << " " << #v << " "; \
53 for(Teuchos::ArrayRCP<int>::size_type n = 0; n < v.size(); ++n) { \
54 std::cout << v[n] << " "; \
55 } \
56 std::cout << std::endl; \
57 }
58
59 #define PRINT_VAL(val) std::cout << " " << #val << ": " << val << std::endl;
60
61 for(int proc = 0; proc < comm->getSize(); ++proc) {
62 comm->barrier();
63 if(proc == comm->getRank()) {
64
65 std::cout << "Rank " << proc << " " << headerMessage << std::endl;
66
82
91 }
92 }
93 comm->barrier();
94}
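// print() emits one rank's state at a time: every rank passes through the same
// barriers, and only the rank whose turn it is writes, so the output appears in
// rank order rather than interleaved. The PRINT_VAL / PRINT_VECTOR macros above
// simply stream a member's name and contents to std::cout.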
95
96Zoltan2_Directory_Comm::Zoltan2_Directory_Comm(
97 int nvals, /* number of values I currently own */
98 const Teuchos::ArrayRCP<int> &assign, /* processor assignment for all my values */
99 Teuchos::RCP<const Teuchos::Comm<int> > comm, /* communicator */
100 int tag) : /* message tag I can use */
101 comm_(comm),
102 plan_forward(NULL)
103{
104 if (comm == Teuchos::null){
105 throw std::logic_error("Invalid communicator: MPI_COMM_NULL.");
106 }
107
108 int my_proc = comm->getRank(); /* my processor tag in communicator */
109 int nprocs = comm->getSize(); /* number of processors in communicator */
110
111 /* First check to see if items are grouped by processor with no gaps. */
112 /* If so, indices_to should be NULL (= identity) */
113
114 /* Make data structures that will allow me to traverse arrays quickly. */
115 /* Begin by determining number of objects I'm sending to each processor. */
116 Teuchos::ArrayRCP<int> starts(new int[nprocs + 1], 0, nprocs + 1, true);
117 for(int n = 0; n < starts.size(); ++n) {
118 starts[n] = 0;
119 }
120
121 /* Note: Negative assign value means ignore item. */
122 /* Non-trailing negatives mean data not packed so need send_buf. */
123 /* Could (but don't) allow negatives between processor blocks w/o buf. */
124 int nactive = 0; /* number of values to remap */
125
126 int no_send_buff = 1; /* is data nicely grouped by processor? */
127
128 int prev_proc = nprocs; /* processor on previous loop pass */
129
130 for (int i = 0; i < nvals; i++) {
131 int proc = assign[i];
132 if (no_send_buff && proc != prev_proc) { /* Checks if blocked by proc */
133 if (proc >= 0 && (starts[proc] || prev_proc < 0)) {
134 no_send_buff = 0;
135 }
136 else {
137 prev_proc = proc;
138 }
139 }
140 if (proc >= 0) {
141 ++starts[proc];
142 ++nactive;
143 }
144 }
145
146 int self_msg = (starts[my_proc] != 0); /* do I have data for myself? */
147
148 Teuchos::ArrayRCP<int> lengths_to; /* lengths I'll send to */
149 Teuchos::ArrayRCP<int> procs_to; /* processors I'll send to */
150 Teuchos::ArrayRCP<int> starts_to; /* where in list my sends begin */
151 Teuchos::ArrayRCP<int> indices_to; /* local_id values I'll be sending */
152
153 int max_send_size = 0; /* size of longest message I send */
154 int nsends = 0; /* # procs I'll send to (including self) */
155 int nrecvs = 0; /* # procs I'll recv from (including self) */
156
157 if (no_send_buff) {
158 /* Grouped by processor. Array indices_to can be NULL (= identity) */
159 nsends = 0;
160 for (int i = 0; i < nprocs; i++) {
161 if (starts[i] != 0) ++nsends;
162 }
163
164 lengths_to.resize(nsends);
165 starts_to.resize(nsends);
166 procs_to.resize(nsends);
167
168 int index = 0; /* index into list of objects */
169 /* Note that procs_to is in the order the data was passed in. */
170 for (int i = 0; i < nsends; i++) {
171 starts_to[i] = index;
172 int proc = assign[index];
173 procs_to[i] = proc;
174 index += starts[proc];
175 }
176
177 /* Now sort the outgoing procs. */
178 /* This keeps recvs deterministic if I ever invert communication */
179 /* It also allows for better balance of traffic in comm_do */
180 sort_ints(procs_to, starts_to);
181
182 max_send_size = 0;
183 for (int i = 0; i < nsends; i++) {
184 int proc = procs_to[i];
185 lengths_to[i] = starts[proc];
186 if (proc != my_proc && lengths_to[i] > max_send_size) {
187 max_send_size = lengths_to[i];
188 }
189 }
190 }
191 else { /* Not grouped by processor. More complex data structures. */
192 /* Sum starts values to be offsets into indices_to array. */
193 nsends = (starts[0] != 0);
194 for (int i = 1; i < nprocs; i++) {
195 if (starts[i] != 0)
196 ++nsends;
197 starts[i] += starts[i - 1];
198 }
199
200 for (int i = nprocs - 1; i; i--) {
201 starts[i] = starts[i - 1];
202 }
203
204 starts[0] = 0;
205
206 indices_to = (nactive > 0) ?
207 Teuchos::arcp(new int[nactive], 0, nactive, true) : Teuchos::null;
208
209 for (int i = 0; i < nvals; i++) {
210 int proc = assign[i];
211 if (proc >= 0) {
212 indices_to[starts[proc]] = i;
213 ++starts[proc];
214 }
215 }
216
217 /* Indices_to array now has the data in clumps for each processor. */
218 /* Now reconstruct starts array to index into indices_to. */
219 for (int i = nprocs - 1; i; i--) {
220 starts[i] = starts[i - 1];
221 }
222 starts[0] = 0;
223 starts[nprocs] = nactive;
224
225 /* Construct lengths_to, starts_to and procs_to arrays. */
226 /* Note: If indices_to is needed, procs are in increasing order */
227 lengths_to.resize(nsends);
228 starts_to.resize(nsends);
229 procs_to.resize(nsends);
230
231 int j = 0;
232 max_send_size = 0;
233 for (int i = 0; i < nprocs; i++) {
234 if (starts[i + 1] != starts[i]) {
235 starts_to[j] = starts[i];
236 lengths_to[j] = starts[i + 1] - starts[i];
237 if (i != my_proc && lengths_to[j] > max_send_size) {
238 max_send_size = lengths_to[j];
239 }
240 procs_to[j] = i;
241 j++;
242 }
243 }
244 }
245
246 /* Now change nsends to count only non-self messages */
247 nsends -= self_msg;
248
249 /* Determine how many messages & what length I'll receive. */
250 Teuchos::ArrayRCP<int> lengths_from; /* lengths of messages I'll receive */
251 Teuchos::ArrayRCP<int> procs_from; /* processors I'll receive from */
252 int out_of_mem = 0; // TODO refactor this bit
253
254 int comm_flag = invert_map(lengths_to, procs_to, nsends, self_msg,
255 lengths_from, procs_from, &nrecvs, my_proc, nprocs,
256 out_of_mem, tag, comm);
257
258 /* pointers for where to put recv data */
259 Teuchos::ArrayRCP<int> starts_from(
260 new int[nrecvs + self_msg], 0, nrecvs + self_msg, true);
261 int j = 0;
262 for (int i = 0; i < nrecvs + self_msg; i++) {
263 starts_from[i] = j;
264 j += lengths_from[i];
265 }
266
267 if (comm_flag != 0) {
268 throw std::logic_error("Failed to construct Zoltan2_Directory_Comm");
269 }
270
271 int total_recv_size = 0; /* total size of messages I recv */
272 for (int i = 0; i < nrecvs + self_msg; i++) {
273 total_recv_size += lengths_from[i];
274 }
275
276 plan_forward = new Zoltan2_Directory_Plan;
277 plan_forward->lengths_to = lengths_to;
278 plan_forward->starts_to = starts_to;
279 plan_forward->procs_to = procs_to;
280 plan_forward->indices_to = indices_to;
281 plan_forward->lengths_from = lengths_from;
282 plan_forward->starts_from = starts_from;
283 plan_forward->procs_from = procs_from;
284 plan_forward->nvals = nvals;
285 plan_forward->nvals_recv = total_recv_size;
286 plan_forward->nrecvs = nrecvs;
287 plan_forward->nsends = nsends;
288 plan_forward->self_msg = self_msg;
289 plan_forward->max_send_size = max_send_size;
290 plan_forward->total_recv_size = total_recv_size;
291 plan_forward->maxed_recvs = 0;
292 plan_forward->comm = comm;
293
294 if (MPI_RECV_LIMIT > 0) {
295
296 throw std::logic_error("Zoltan2_Directory_Comm.cpp untested refactored code (1)"); // needs unit testing
297
298 /* If we have a limit to the number of posted receives we are allowed,
299 ** and our plan has exceeded that, then switch to an MPI_Alltoallv so
300 ** that we will have fewer receives posted when we do the communication.
301 */
302 int global_nrecvs;
303 Teuchos::reduceAll(*comm, Teuchos::REDUCE_MAX, 1, &nrecvs, &global_nrecvs);
304 if (global_nrecvs > MPI_RECV_LIMIT){
305 plan_forward->maxed_recvs = 1;
306 }
307 }
308
309 if (plan_forward->maxed_recvs == 0) {
310 // See notes in header for MPI_Request
311 plan_forward->request.resize(plan_forward->nrecvs);
312 }
313
314 nrec = total_recv_size;
315}
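// A minimal usage sketch for the constructor above (illustrative only: 'comm',
// 'tag', 'nbytes', 'send' and 'recv' are assumed to be prepared by the caller,
// and the assignments shown are arbitrary):
//
//   int nvals = 2;                              // items this rank currently owns
//   Teuchos::ArrayRCP<int> assign(new int[nvals], 0, nvals, true);
//   assign[0] = 0; assign[1] = 1;               // destination rank for each item
//   Zoltan2_Directory_Comm directoryComm(nvals, assign, comm, tag);
//   // recv must hold everything this rank receives (nrec items of nbytes each).
//   directoryComm.do_forward(tag + 1, send, nbytes, recv);
//
// The plan is built once and can then move any fixed-size payload with
// do_forward(); per-item sizes can be supplied later through resize().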
316
317Zoltan2_Directory_Comm::~Zoltan2_Directory_Comm()
318{
319 delete plan_forward;
320}
321
322int Zoltan2_Directory_Comm::invert_map(
323 const Teuchos::ArrayRCP<int> &lengths_to, /* number of items I'm sending */
324 const Teuchos::ArrayRCP<int> &procs_to, /* procs I send to */
325 int nsends, /* number of messages I'll send */
326 int self_msg, /* do I copy data to myself? */
327 Teuchos::ArrayRCP<int> &lengths_from, /* number of items I'm receiving */
328 Teuchos::ArrayRCP<int> &procs_from, /* procs I recv lengths from */
329 int *pnrecvs, /* number of messages I receive */
330 int my_proc, /* my processor number */
331 int nprocs, /* total number of processors */
332 int /* out_of_mem */, /* tell everyone I'm out of memory? */
333 int tag, /* message tag I can use */
334 Teuchos::RCP<const Teuchos::Comm<int> > comm) /* communicator */
335{
336 Teuchos::ArrayRCP<int> msg_count(new int[nprocs], 0, nprocs, true);
337 Teuchos::ArrayRCP<int> counts(new int[nprocs], 0, nprocs, true);
338 for(int i = 0; i < nprocs; ++i) {
339 msg_count[i] = 0;
340 counts[i] = 1;
341 }
342
343 for (int i = 0; i < nsends + self_msg; i++) {
344 msg_count[procs_to[i]] = 1;
345 }
346
347 /*
348 * KDDKDD: Replaced MPI_Reduce_scatter with MPI_Reduce and MPI_Scatter
349 * KDDKDD: to avoid reported problems with MPICH 1.5.2.1.
350 * KDDKDD: Some sort of MPI_TYPE_INDEXED error.
351 * KDDKDD: Bug fix suggested by Clark Dohrmann and Rob Hoekstra.
352 * KDDKDD: July 20, 2004
353
354 MPI_Reduce_scatter((void *) msg_count, (void *) &nrecvs, counts, MPI_INT,
355 MPI_SUM, comm);
356 */
357 Teuchos::reduceAll<int>(*comm, Teuchos::REDUCE_SUM, nprocs,
358 msg_count.getRawPtr(), counts.getRawPtr());
359
360 int nrecvs = 0; /* number of messages I'll receive */
361
362 Teuchos::scatter<int, int>(&(counts[0]), 1, &nrecvs, 1, 0, *comm);
363
364 int max_nrecvs = 0;
365 if (my_proc == 0) {
366 for (int i=0; i < nprocs; i++) {
367 if (counts[i] > max_nrecvs) {
368 max_nrecvs = counts[i];
369 }
370 }
371 }
372
373 Teuchos::broadcast(*comm, 0, &max_nrecvs);
374
375 if(nrecvs > 0) {
376 lengths_from.resize(nrecvs); /* number of items I'm receiving */
377 procs_from.resize(nrecvs); /* processors I'll receive from */
378 }
379
380 if (MPI_RECV_LIMIT == 0 || max_nrecvs <= MPI_RECV_LIMIT) {
381 // See notes in header for MPI_Request
382 Teuchos::ArrayRCP<Teuchos::RCP<Teuchos::CommRequest<int> > > requests(nrecvs);
383
384 /* Note: I'm counting on having a unique tag or some of my incoming */
385 /* messages might get confused with others. */
386 for (int i=0; i < nrecvs; i++) {
387#ifdef HAVE_MPI // Teuchos::ireceive not implemented for Serial - Serial is just for debugging
388 Teuchos::ArrayRCP<int> single_elem(&lengths_from[i], 0, 1, false);
389 requests[i] = Teuchos::ireceive(single_elem, MPI_ANY_SOURCE, tag, *comm);
390#endif
391 }
392
393 for (int i=0; i < nsends+self_msg; i++) {
394#ifdef HAVE_MPI // Teuchos::send not implemented for Serial - Serial is just for debugging
395 Teuchos::send(&lengths_to[i], 1, procs_to[i], tag, *comm);
396#endif
397 }
398
399 for (int i=0; i < nrecvs; i++) {
400#ifdef HAVE_MPI
401 procs_from[i] = requests[i]->wait()->getSourceRank();
402#else
403 // above Teuchos MPI calls not supported for Serial so manually do the transfer.
404 // We don't really need Serial for this class but it helps with debugging to have a serial test that can run.
405 lengths_from[i] = lengths_to[i];
406#endif
407 }
408
409 }
410 else { /* some large HPC machines have a limit on number of posted receives */
411 Teuchos::ArrayRCP<int> sendbuf(new int[nprocs], 0, nprocs, true);
412 Teuchos::ArrayRCP<int> recvbuf(new int[nprocs], 0, nprocs, true);
413
414 for (int i=0; i < nsends + self_msg; i++) {
415 sendbuf[procs_to[i]] = lengths_to[i];
416 }
417
418 throw std::logic_error("Zoltan2_Directory_Comm.cpp untested refactored code (2)"); // needs unit testing
419 // Did not refactor this - need Teuchos form but this is not tested code.
420 // MPI_Alltoall(&(sendbuf[0]), 1, MPI_INT, &(recvbuf[0]), 1, MPI_INT, getRawComm());
421
422 for (int i=0, j=0; i < nprocs; i++) {
423 if (recvbuf[i] > 0){
424 lengths_from[j] = recvbuf[i];
425 procs_from[j] = i;
426 if (++j == nrecvs) {
427 break;
428 }
429 }
430 }
431 }
432
433 /* Sort recv lists to keep execution deterministic (e.g. for debugging) */
434 sort_ints(procs_from, lengths_from);
435
436 *pnrecvs = nrecvs - self_msg; /* Only return number of true messages */
437
438 return 0;
439}
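// invert_map() derives "who sends to me, and how much" from each rank's purely
// local "who I send to, and how much": a global sum of per-destination message
// counts (scattered back so every rank learns how many messages to expect),
// followed by point-to-point exchanges of the individual lengths, whose source
// ranks identify the senders. The Alltoall fallback exists for machines that
// cap the number of posted receives.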
440
441int Zoltan2_Directory_Comm::sort_ints(
442 Teuchos::ArrayRCP<int> &vals_sort, /* values to be sorted */
443 Teuchos::ArrayRCP<int> &vals_other) /* other array to be reordered w/ sort */
444{
445 // TODO: Check - perhaps we can skip all of these for efficiency
446 if (vals_sort == Teuchos::null || vals_sort.size() == 0) {
447 return 1;
448 }
449 if (vals_other == Teuchos::null || vals_other.size() == 0) {
450 return 1;
451 }
452 if (vals_sort == Teuchos::null || vals_sort.size() == 1) {
453 return 0; /* fastest way to sort 1 item is to return */
454 }
455
456 /* find largest value (sort sometimes used for non processor lists) */
457 int already_sorted = 1; /* flag indicating whether vals_sort is
458 already sorted; can exit early and skip
459 memory allocations if it is. */
460 int top = vals_sort[0]; /* largest integer to sort, smallest is assumed 0 */
461 for (Teuchos::ArrayRCP<int>::size_type i = 1; i < vals_sort.size(); i++) {
462 if (vals_sort[i-1] > vals_sort[i]) {
463 already_sorted = 0;
464 }
465 if (top < vals_sort[i]) {
466 top = vals_sort[i];
467 }
468 }
469
470 if (already_sorted) {
471 return 0;
472 }
473
474 Teuchos::ArrayRCP<int> store(new int[top+2], 0, top+2, true);
475 for(int n = 0; n < store.size(); ++n) {
476 store[n] = 0;
477 }
478
479 Teuchos::ArrayRCP<int> copy_sort(new int[vals_sort.size()], 0, vals_sort.size(), true);
480 for(Teuchos::ArrayRCP<int>::size_type n = 0; n < copy_sort.size(); ++n) {
481 copy_sort[n] = vals_sort[n]; // TODO - use deepCopy method?
482 }
483
484 Teuchos::ArrayRCP<int> copy_other(new int[vals_other.size()], 0, vals_other.size(), true);
485 for(Teuchos::ArrayRCP<int>::size_type n = 0; n < copy_other.size(); ++n) {
486 copy_other[n] = vals_other[n]; // TODO - use deepCopy method?
487 }
488
489 // TODO: May want to modernize this ptr handling - however I didn't want
490 // to introduce inefficiencies so for now have kept the original structure
491 int *p = &(store[1]);
492 for (Teuchos::ArrayRCP<int>::size_type i = 0; i < vals_sort.size(); i++) {
493 p[copy_sort[i]]++; /* count number of occurrences */
494 }
495
496 for (int i = 1; i < top+1; i++) {
497 p[i] += p[i-1]; /* compute partial sums */
498 }
499 /* assert: p[top] = nvals */
500 p = &(store[0]); /* effectively shifts down by one */
501 for (Teuchos::ArrayRCP<int>::size_type i = 0; i < vals_sort.size(); i++) {
502 vals_sort[p[copy_sort[i]]] = copy_sort[i];
503 vals_other[p[copy_sort[i]]] = copy_other[i];
504 ++p[copy_sort[i]];
505 }
506
507 return 0;
508}
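// sort_ints() is a counting sort keyed on the non-negative values in vals_sort:
// it tallies occurrences, converts the tallies into prefix sums, and scatters
// both arrays into sorted position, reordering vals_other in lockstep. The cost
// is O(n + top), which is acceptable here because the keys are (usually)
// processor ranks.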
509
510int Zoltan2_Directory_Comm::do_forward(
511 int tag, /* message tag for communicating */
512 const Teuchos::ArrayRCP<char> &send_data, /* array of data I currently own */
513 int nbytes, /* msg size */
514 Teuchos::ArrayRCP<char> &recv_data) /* array of data I'll own after comm */
515{
516 int status = 0;
517
518 if (!plan_forward->maxed_recvs) {
519 status = do_post (plan_forward, tag, send_data, nbytes, recv_data);
520 if (status == 0) {
521 status = do_wait (plan_forward, tag, send_data, nbytes, recv_data);
522 }
523 }
524 else {
525 status = do_all_to_all(plan_forward, send_data, nbytes, recv_data);
526 }
527
528 return status;
529}
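// do_forward() is deliberately split into do_post() (post receives, then send)
// and do_wait() (block until the receives complete and unpack), so the two
// phases could in principle be overlapped with other work. When the plan was
// flagged as exceeding the platform's posted-receive limit, the collective
// do_all_to_all() path is used instead.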
530
531int Zoltan2_Directory_Comm::do_post(
532 Zoltan2_Directory_Plan *plan, /* communication data structure */
533 int tag, /* message tag for communicating */
534 const Teuchos::ArrayRCP<char> &send_data, /* array of data I currently own */
535 int nbytes, /* msg size */
536 Teuchos::ArrayRCP<char> &recv_data) /* array of data I'll own after comm */
537{
538 /* Check input parameters */
539 if (!plan) {
540 throw std::logic_error("Communication plan = NULL");
541 }
542
543 /* If not point-to-point, currently we do synchronous communication */
544 if (plan->maxed_recvs) {
545 throw std::logic_error("Zoltan2_Directory_Comm.cpp untested refactored code (3)"); // needs unit testing
546 return do_all_to_all(plan, send_data, nbytes, recv_data);
547 }
548
549 int my_proc = plan->comm->getRank(); /* processor ID */
550 if ((plan->nsends + plan->self_msg) && !send_data.size()) {
551 throw std::logic_error("Zoltan2_Directory_Comm.cpp untested refactored code (4)"); // needs unit testing
552 size_t sum = 0;
553 if (plan->sizes_to.size()) { /* Not an error if all sizes_to == 0 */
554 for (int i = 0; i < (plan->nsends + plan->self_msg); i++) {
555 sum += plan->sizes_to[i];
556 }
557 }
558 if (!plan->sizes_to.size() || (plan->sizes_to.size() && sum)) {
559 throw std::logic_error("nsends not zero, but send_data = NULL");
560 }
561 }
562 if ((plan->nrecvs + plan->self_msg) && recv_data == Teuchos::null) {
563 throw std::logic_error("Zoltan2_Directory_Comm.cpp untested refactored code (5)"); // needs unit testing
564 size_t sum = 0;
565 if (plan->sizes_from != Teuchos::null) /* Not an error if all sizes_from == 0 */
566 for (int i = 0; i < (plan->nrecvs + plan->self_msg); i++)
567 sum += plan->sizes_from[i];
568 if (!plan->sizes_from.size() || (plan->sizes_from.size() && sum)) {
569 throw std::logic_error("nrecvs not zero, but recv_data = NULL");
570 }
571 }
572
573 /* Post irecvs */
574 if (plan->indices_from == Teuchos::null) {
575 /* Data can go directly into user space. */
576 plan->recv_buff = recv_data;
577 }
578 else { /* Need to buffer receive to reorder */
579 size_t rsize = (size_t) (plan->total_recv_size) * (size_t) nbytes;
580 plan->recv_buff = Teuchos::arcp(new char[rsize], 0, rsize, true);
581 }
582
583 size_t self_recv_address = 0; /* where in recv_data self info starts */
584 if (!plan->using_sizes) { /* All data the same size */
585 int k = 0;
586 for (int i = 0; i < plan->nrecvs + plan->self_msg; i++) {
587 if (plan->procs_from[i] != my_proc) {
588 Teuchos::ArrayRCP<char> subview(
589 &plan->getRecvBuff().getRawPtr()[plan->starts_from[i] * nbytes],
590 0, plan->lengths_from[i] * nbytes, false);
591 plan->request[k] = Teuchos::ireceive(subview, plan->procs_from[i], tag, *comm_);
592 k++;
593 }
594 else {
595 self_recv_address = (size_t)(plan->starts_from[i]) * (size_t)nbytes;
596 }
597 }
598 }
599 else { /* Data of varying sizes */
600 int k = 0;
601 for (int i = 0; i < plan->nrecvs + plan->self_msg; i++) {
602 if (plan->procs_from[i] != my_proc) {
603 if (plan->sizes_from[i]) {
604 Teuchos::ArrayRCP<char> subview(
605 &plan->getRecvBuff().getRawPtr()[(size_t)(plan->starts_from_ptr[i])
606 * (size_t)nbytes],
607 0, plan->sizes_from[i] * nbytes, false);
608 plan->request[k] = Teuchos::ireceive(subview, plan->procs_from[i], tag, *comm_);
609 }
610 else {
611 plan->request[k] = Teuchos::null;
612 }
613 k++;
614 }
615 else {
616 self_recv_address =
617 (size_t)(plan->starts_from_ptr[i]) * (size_t)nbytes;
618 }
619 }
620 }
621
622 Teuchos::ArrayRCP<char> send_buff;
623 if(plan->indices_to != Teuchos::null) {
624 send_buff = Teuchos::arcp(new char[plan->max_send_size * nbytes], 0, plan->max_send_size * nbytes, true);
625 }
626
627 /* Barrier to ensure irecvs are posted before doing any sends. */
628 /* Simultaneously see if anyone out of memory */
629 int out_of_mem = 0;
630 // WARNING - do not delete this without adding a proper barrier as a replacement.
631 // I'm refactoring memory handling, so the new version probably won't use
632 // out_of_mem, but we must still preserve a barrier here or we get
633 // intermittent failures.
634 // I'll keep the memory reduce for now since we may end up with memory
635 // handling anyway.
636 int global_out_of_mem;
637 Teuchos::reduceAll(*plan->comm, Teuchos::REDUCE_SUM, 1, &out_of_mem,
638 &global_out_of_mem);
639
640 /* Send out data */
641
642 /* Scan through procs_to list to start w/ higher numbered procs */
643 /* This should balance message traffic. */
644
645 int nblocks = plan->nsends + plan->self_msg; /* # procs needing my data */
646 int proc_index = 0; /* loop counter over procs to send to */
647 while (proc_index < nblocks && plan->procs_to[proc_index] < my_proc) {
648 proc_index++;
649 }
650 if (proc_index == nblocks) {
651 proc_index = 0;
652 }
653
654 if (!plan->using_sizes) { /* Data all of same size */
655 if (plan->indices_to == Teuchos::null) { /* data already blocked by processor. */
656 int self_num = 0; /* where in send list my_proc appears */
657 for (int i = proc_index, j = 0; j < nblocks; j++) {
658 if (plan->procs_to[i] != my_proc) {
659 Teuchos::ArrayRCP<char> subview(
660 &send_data[plan->starts_to[i] * nbytes],
661 0, plan->lengths_to[i] * nbytes, false);
662 Teuchos::readySend(subview.getRawPtr(), static_cast<int>(subview.size()), plan->procs_to[i], tag, *comm_);
663 }
664 else {
665 self_num = i;
666 }
667 if (++i == nblocks) {
668 i = 0;
669 }
670 }
671
672 if (plan->self_msg) { /* Copy data to self. */
673 /* I use array+offset instead of &(array[offset]) because of
674 a bug with PGI v9 */
675 /* I use memmove because I'm not sure that the pointer are not
676 overlapped. */
677
678 // TODO: Refactor to make C++ and remove getRawPtr, etc
679 memmove(
680 plan->getRecvBuff().getRawPtr()+self_recv_address,
681 send_data.getRawPtr()+(size_t)(plan->starts_to[self_num])*(size_t)nbytes,
682 (size_t) (plan->lengths_to[self_num]) * (size_t) nbytes);
683 }
684 }
685 else { /* Not blocked by processor. Need to buffer. */
686 int self_index = 0; /* send offset for data I'm keeping */
687 int self_num = 0; /* where in send list my_proc appears */
688 for (int i = proc_index, jj = 0; jj < nblocks; jj++) {
689 if (plan->procs_to[i] != my_proc) {
690 /* Need to pack message first. */
691 size_t offset = 0; /* offset into array I'm copying into */
692 int j = plan->starts_to[i];
693 for (int k = 0; k < plan->lengths_to[i]; k++) {
694 memcpy(&send_buff[offset],
695 &send_data[(size_t)(plan->indices_to[j++]) * (size_t)nbytes], nbytes);
696 offset += nbytes;
697 }
698 Teuchos::readySend(&send_buff[0], plan->lengths_to[i] * nbytes,
699 plan->procs_to[i], tag, *comm_);
700 }
701 else {
702 self_num = i;
703 self_index = plan->starts_to[i];
704 }
705 if (++i == nblocks)
706 i = 0;
707 }
708
709 if (plan->self_msg) { /* Copy data to self. */
710 for (int k = 0; k < plan->lengths_to[self_num]; k++) {
711 memcpy(&(plan->getRecvBuff())[self_recv_address],
712 &send_data[
713 (size_t)(plan->indices_to[self_index++]) * (size_t)nbytes],
714 nbytes);
715 self_recv_address += nbytes;
716 }
717 }
718 }
719 }
720 else { /* Data of differing sizes */
721 if (plan->indices_to == Teuchos::null) { /* data already blocked by processor. */
722 int self_num = 0; /* where in send list my_proc appears */
723 for (int i = proc_index, j = 0; j < nblocks; j++) {
724 if (plan->procs_to[i] != my_proc) {
725 if (plan->sizes_to[i]) {
726 Teuchos::readySend(&send_data[plan->starts_to_ptr[i] * nbytes], plan->sizes_to[i] * nbytes,
727 plan->procs_to[i], tag, *comm_);
728 }
729 }
730 else
731 self_num = i;
732
733 if (++i == nblocks)
734 i = 0;
735 }
736 if (plan->self_msg) { /* Copy data to self. */
737 if (plan->sizes_to[self_num]) {
738 char* lrecv = &plan->getRecvBuff().getRawPtr()[self_recv_address];
739 const char* lsend =
740 &send_data.getRawPtr()[(size_t)(plan->starts_to_ptr[self_num]) * (size_t)nbytes];
741 int sindex = plan->sizes_to[self_num], idx;
742 for (idx=0; idx<nbytes; idx++) {
743 memcpy(lrecv, lsend, sindex);
744 lrecv += sindex;
745 lsend += sindex;
746 }
747 }
748 }
749 }
750 else { /* Not blocked by processor. Need to buffer. */
751 int self_num = 0; /* where in send list my_proc appears */
752 for (int i = proc_index, jj = 0; jj < nblocks; jj++) {
753 if (plan->procs_to[i] != my_proc) {
754 size_t offset = 0; /* offset into array I'm copying into */
755 int j = plan->starts_to[i];
756 for (int k = 0; k < plan->lengths_to[i]; k++) {
757 if (plan->sizes[plan->indices_to[j]]) {
758 memcpy(&send_buff[offset],
759 &send_data[(size_t)(plan->indices_to_ptr[j]) * (size_t)nbytes],
760 (size_t)(plan->sizes[plan->indices_to[j]]) * (size_t)nbytes);
761 offset +=
762 (size_t)(plan->sizes[plan->indices_to[j]]) * (size_t)nbytes;
763 }
764 j++;
765 }
766 if (plan->sizes_to[i]) {
767 Teuchos::readySend(&send_buff[0], plan->sizes_to[i] * nbytes,
768 plan->procs_to[i], tag, *comm_);
769 }
770 }
771 else
772 self_num = i;
773
774 if (++i == nblocks)
775 i = 0;
776 }
777 if (plan->self_msg) { /* Copy data to self. */
778 if (plan->sizes_to[self_num]) {
779 int j = plan->starts_to[self_num];
780 for (int k = 0; k < plan->lengths_to[self_num]; k++) {
781 int kk = plan->indices_to_ptr[j];
782 char* lrecv = &(plan->getRecvBuff())[self_recv_address];
783 size_t send_idx = (size_t)kk * (size_t)nbytes;
784 const char* lsend = &send_data[send_idx];
785 int sindex = plan->sizes[plan->indices_to[j]], idx;
786 for (idx=0; idx<nbytes; idx++) {
787 memcpy(lrecv, lsend, sindex);
788 lrecv += sindex;
789 lsend += sindex;
790 }
791 self_recv_address += (size_t)(plan->sizes[plan->indices_to[j]])
792 * (size_t) nbytes;
793 j++;
794 }
795 }
796 }
797 }
798 }
799
800 return 0;
801}
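// do_post() above proceeds in three steps: (1) post one irecv per sending rank
// (or just record the offset for the self-message), (2) a global reduce that
// doubles as the barrier guaranteeing all receives are posted before any sends
// start, and (3) ready-mode sends, beginning with the ranks numbered above my
// own so that traffic is spread out rather than everyone targeting rank 0
// first. Data is packed into send_buff only when indices_to says the items are
// not already contiguous per destination.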
802
803int Zoltan2_Directory_Comm::do_wait(
804 Zoltan2_Directory_Plan *plan, /* communication data structure */
805 int /* tag */, /* message tag for communicating */
806 const Teuchos::ArrayRCP<char> &/* send_data */, /* array of data I currently own */
807 int nbytes, /* msg size */
808 Teuchos::ArrayRCP<char> &recv_data) /* array of data I'll own after comm */
809{
810 /* If not point-to-point, currently we do synchronous communication */
811 if (plan->maxed_recvs){
812 /* Do nothing */
813 return 0;
814 }
815
816 int my_proc = plan->comm->getRank(); /* processor ID */
817
818 /* Wait for messages to arrive & unpack them if necessary. */
819 /* Note: since request is in plan, could wait in later routine. */
820 if (plan->indices_from == Teuchos::null) { /* No copying required */
821 if (plan->nrecvs > 0) {
822 Teuchos::waitAll(*comm_, plan->request());
823 }
824 }
825 else { /* Need to copy into recv_data. */
826 int self_num; /* where in recv list my_proc appears */
827 size_t offsetDst = 0;
828 if (plan->self_msg) { /* Unpack own data before waiting */
829 for (self_num = 0; self_num < plan->nrecvs + plan->self_msg;
830 self_num++) {
831 if (plan->procs_from[self_num] == my_proc) {
832 break;
833 }
834 }
835
836 if(plan->sizes_from.size()) {
837 // NEW METHOD for variable sized data
838 // This will NOT put the data in order, but during the directory read
839 // of the buffer which follows, this data gets sorted anyway since each
840 // element has an index to tell where it belongs. I am not sure yet if
841 // we must sort here, or if it's fine to allow the sort to happen through
842 // the index value. This needs further discussion.
843 memcpy(&recv_data[offsetDst * (size_t)nbytes],
844 &(plan->getRecvBuff())[plan->starts_from_ptr[self_num] * (size_t)nbytes],
845 plan->sizes_from[self_num] * (size_t)nbytes);
846 offsetDst += plan->sizes_from[self_num];
847 }
848 else {
849 int k = plan->starts_from[self_num];
850 for (int j = plan->lengths_from[self_num]; j; j--) {
851 memcpy(&recv_data[(size_t)(plan->indices_from[k]) * (size_t)nbytes],
852 &(plan->getRecvBuff())[(size_t)k * (size_t)nbytes], nbytes);
853 k++;
854 }
855 }
856 }
857 else {
858 self_num = plan->nrecvs;
859 }
860
861 for (int jj = 0; jj < plan->nrecvs; jj++) {
862 // TODO: Refactored directory to use Teuchos comm but we have no Teuchos::waitAny
863 // Short term fix: we just call wait() on each request in serial.
864 // When Teuchos::waitAny becomes available we can switch back to the
865 // old form, which is commented out below.
866 plan->request[jj]->wait();
867 int index = jj;
868
869 // Old form with MPI_Waitany
870 // MPI_Status status; /* return from Waitany */
871 // int index;
872 // MPI_Waitany(plan->nrecvs, &plan->request[0], &index, &status);
873
874 if (index >= self_num) {
875 index++;
876 }
877
878 if(plan->sizes_from.size()) {
879 // NEW METHOD for variable sized data
880 // This will NOT put the data in order, but during the directory read
881 // of the buffer which follows, this data gets sorted anyway since each
882 // element has an index to tell where it belongs. I am not sure yet if
883 // we must sort here, or if it's fine to allow the sort to happen through
884 // the index value. This needs further discussion.
885 memcpy(&recv_data[offsetDst * (size_t)nbytes],
886 &plan->getRecvBuff().getRawPtr()[plan->starts_from_ptr[index] * (size_t)nbytes],
887 plan->sizes_from[index] * (size_t)nbytes);
888 offsetDst += plan->sizes_from[index];
889 }
890 else {
891 int k = plan->starts_from[index];
892 for (int j = plan->lengths_from[index]; j; j--) {
893 memcpy(&recv_data.getRawPtr()[(size_t)(plan->indices_from[k]) * (size_t)nbytes],
894 &plan->getRecvBuff().getRawPtr()[(size_t)k * (size_t)nbytes], nbytes);
895 k++;
896 }
897 }
898 }
899 }
900
901 return 0;
902}
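// do_wait() only has real unpacking to do when indices_from is set: messages
// land in plan->recv_buff in arrival order and are then copied item by item to
// the positions recorded in indices_from (or, for variable-sized data, appended
// in arrival order and left for the directory layer to reorder by each item's
// embedded index, as the comments above note).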
903
904/* Do_Post would require posting more receives than allowed on this platform.
905* We use MPI_AlltoAllv instead, which is probably implemented such that each
906* process does one receive at a time.
907*/
908
909int Zoltan2_Directory_Comm::do_all_to_all(
910 Zoltan2_Directory_Plan *plan, /* communication data structure */
911 const Teuchos::ArrayRCP<char> &send_data, /* array of data I currently own */
912 int nbytes, /* msg size */
913 Teuchos::ArrayRCP<char> &recv_data) /* array of data I'll own after comm */
914{
915 throw std::logic_error("Zoltan2_Directory_Comm.cpp untested refactored code (6)"); // needs unit testing
916
917 int sm = (plan->self_msg > 0) ? 1 : 0;
918
919 int nSendMsgs = plan->nsends + sm;
920 int nRecvMsgs = plan->nrecvs + sm;
921
922 int nSendItems = 0;
923 for (int i=0; i <nSendMsgs; i++) {
924 nSendItems += plan->lengths_to[i];
925 }
926 int nRecvItems = 0;
927 for (int i=0; i <nRecvMsgs; i++) {
928 nRecvItems += plan->lengths_from[i];
929 }
930
931 int nprocs = plan->comm->getSize();
932
933 Teuchos::ArrayRCP<int> outbufCounts(new int[nprocs], 0, nprocs, true);
934 Teuchos::ArrayRCP<int> outbufOffsets(new int[nprocs], 0, nprocs, true);
935 Teuchos::ArrayRCP<int> inbufCounts(new int[nprocs], 0, nprocs, true);
936 Teuchos::ArrayRCP<int> inbufOffsets(new int[nprocs], 0, nprocs, true);
937
938 /* The *_to fields of the plan refer to the items in the send_data buffer,
939 * and how to pull out the correct items for each receiver. The
940 * *_from fields of the plan refer to the recv_data buffer. Items
941 * arrive in process rank order, and these fields tell us where to
942 * put them in the recv_data buffer.
943 */
944
945 /* CREATE SEND BUFFER */
946
947 int sorted = 0;
948 if (plan->indices_to == Teuchos::null){
949 sorted = 1;
950 for (int i=1; i< nSendMsgs; i++){
951 if (plan->starts_to[i] < plan->starts_to[i-1]){
952 sorted = 0;
953 break;
954 }
955 }
956 }
957
958 Teuchos::ArrayRCP<char> outbuf;
959 Teuchos::ArrayRCP<char> inbuf;
960 Teuchos::ArrayRCP<char> buf;
961
962 if (plan->sizes_to.size()){
963 /*
964 * Each message contains items for a process, and each item may be
965 * a different size.
966 */
967
968 int outbufLen = 0;
969 for (int i = 0; i < nSendMsgs; i++){
970 outbufLen += plan->sizes_to[i];
971 }
972
973 if (plan->indices_to != Teuchos::null) {
974 /*
975 * items are not grouped by message
976 */
977 buf.resize(outbufLen*nbytes);
978 outbuf.resize(outbufLen*nbytes);
979 char * pBufPtr = &(outbuf[0]);
980 int i = 0;
981 int k = 0;
982 for (int p = 0; p < nprocs; p++) {
983
984 int length = 0;
985
986 if (i < nSendMsgs){
987 if (plan->procs_to[i] == p){ /* procs_to is sorted */
988
989 for (int j=0; j < plan->lengths_to[i]; j++,k++){
990 int itemSize = plan->sizes[plan->indices_to[k]] * nbytes;
991 int offset = plan->indices_to_ptr[k] * nbytes;
992
993 memcpy(pBufPtr, &(send_data[0]) + offset, itemSize);
994
995 pBufPtr += itemSize;
996 length += itemSize;
997 }
998 i++;
999 }
1000 }
1001
1002 outbufCounts[p] = length;
1003 if (p){
1004 outbufOffsets[p] = outbufOffsets[p-1] + outbufCounts[p-1];
1005 }
1006 }
1007 }
1008 else{
1009 /*
1010 * items are stored contiguously for each message
1011 */
1012
1013 if (!sorted || (plan->nvals > nSendItems) ){
1014 buf.resize(outbufLen*nbytes);
1015 outbuf.resize(outbufLen*nbytes);
1016 }
1017 else{
1018 /* All items in send_data are being sent, and they are sorted
1019 * in process rank order.
1020 */
1021 // TODO: Optimize - original just set the ptr... outbuf was never allocated
1022 // in this branch, so copying into it element by element would be out of
1023 // bounds; alias send_data instead (as the fixed-size block below does).
1024 outbuf = send_data;
1025 }
1026
1027 char * pBufPtr = &(outbuf[0]);
1028
1029 int i = 0;
1030 for (int p = 0; p < nprocs; p++) {
1031
1032 int length = 0;
1033
1034 if (i < nSendMsgs){
1035 if (plan->procs_to[i] == p){ /* procs_to is sorted */
1036 length = plan->sizes_to[i] * nbytes;
1037 int offset = plan->starts_to_ptr[i] * nbytes;
1038
1039 if ((!sorted || (plan->nvals > nSendItems)) && length){
1040 memcpy(pBufPtr, &(send_data[0]) + offset, length);
1041 pBufPtr += length;
1042 }
1043 i++;
1044 }
1045 }
1046
1047 outbufCounts[p] = length;
1048 if (p){
1049 outbufOffsets[p] = outbufOffsets[p-1] + outbufCounts[p-1];
1050 }
1051 }
1052 }
1053 }
1054 else if (plan->indices_to != Teuchos::null) {
1055 /*
1056 * item sizes are constant, however the items belonging in a given
1057 * message may not be contiguous in send_data
1058 */
1059 buf.resize(nSendItems*nbytes);
1060 outbuf.resize(nSendItems*nbytes);
1061 char * pBufPtr = &(outbuf[0]);
1062 int i = 0;
1063 int k = 0;
1064 for (int p = 0; p < nprocs; p++){
1065
1066 int length = 0;
1067
1068 if (i < nSendMsgs){
1069 if (plan->procs_to[i] == p){ /* procs_to is sorted */
1070 for (int j=0; j < plan->lengths_to[i]; j++,k++) {
1071 int offset = plan->indices_to[k] * nbytes;
1072 memcpy(pBufPtr, &(send_data[0]) + offset, nbytes);
1073 pBufPtr += nbytes;
1074 }
1075 length = plan->lengths_to[i] * nbytes;
1076 i++;
1077 }
1078 }
1079
1080 outbufCounts[p] = length;
1081 if (p){
1082 outbufOffsets[p] = outbufOffsets[p-1] + outbufCounts[p-1];
1083 }
1084 }
1085 }
1086 else{
1087
1088 /* item sizes are constant, and items belonging to a
1089 * given message are always stored contiguously in send_data
1090 */
1091
1092 if (!sorted || (plan->nvals > nSendItems)){
1093 buf.resize(nSendItems*nbytes);
1094 outbuf.resize(nSendItems*nbytes);
1095 }
1096 else{
1097 /* send_data is sorted by process, and we don't skip
1098 * any of the data in the buffer, so we can use send_data
1099 * in the alltoall call
1100 */
1101 // TODO: Optimize - original just set ptr
1102 outbuf = send_data;
1103 }
1104
1105 char * pBufPtr = &(outbuf[0]);
1106
1107 int i = 0;
1108 for (int p=0; p < nprocs; p++) {
1109
1110 int length = 0;
1111
1112 if (i < nSendMsgs){
1113 if (plan->procs_to[i] == p){ /* procs_to is sorted */
1114 int offset = plan->starts_to[i] * nbytes;
1115 length = plan->lengths_to[i] * nbytes;
1116
1117 if ((!sorted || (plan->nvals > nSendItems)) && length){
1118 memcpy(pBufPtr, &(send_data[0]) + offset, length);
1119 pBufPtr += length;
1120 }
1121 i++;
1122 }
1123 }
1124
1125 outbufCounts[p] = length;
1126 if (p){
1127 outbufOffsets[p] = outbufOffsets[p-1] + outbufCounts[p-1];
1128 }
1129 }
1130 }
1131
1132 /* CREATE RECEIVE BUFFER */
1133
1134 sorted = 0;
1135 int i;
1136 if (plan->indices_from == Teuchos::null) {
1137 sorted = 1;
1138 for (i=1; i< nRecvMsgs; i++) {
1139 if (plan->starts_from[i] < plan->starts_from[i-1]){
1140 sorted = 0;
1141 break;
1142 }
1143 }
1144 }
1145
1146 if (sorted){
1147 /* Caller already expects received data to be ordered by
1148 * the sending process rank.
1149 */
1150
1151 // TODO: Optimize - original just set ptr
1152 // (outbuf was already prepared above; only the receive buffer is aliased here)
1153 inbuf = recv_data;
1154 }
1155 else {
1156 inbuf.resize(plan->total_recv_size * nbytes);
1157 }
1158 i = 0; /* restart the receive-message scan for the counts/offsets pass */
1159 for (int p = 0; p < nprocs; p++) {
1160 int length = 0;
1161
1162 if (i < nRecvMsgs){
1163 if (plan->procs_from[i] == p){
1164
1165 if (!plan->using_sizes){
1166 length = plan->lengths_from[i] * nbytes;
1167 }
1168 else{
1169 length = plan->sizes_from[i] * nbytes;
1170 }
1171 i++;
1172 }
1173 }
1174
1175 inbufCounts[p] = length;
1176 if (p){
1177 inbufOffsets[p] = inbufOffsets[p-1] + inbufCounts[p-1];
1178 }
1179 }
1180
1181 /* EXCHANGE DATA */
1182 // Did not refactor this - need Teuchos form but not tested code
1183 // MPI_Alltoallv(&(outbuf[0]), &(outbufCounts[0]), &(outbufOffsets[0]), MPI_CHAR,
1184 //   &(inbuf[0]), &(inbufCounts[0]), &(inbufOffsets[0]), MPI_CHAR, getRawComm());
1185
1186 /* WRITE RECEIVED DATA INTO USER'S BUFFER WHERE IT'S EXPECTED */
1187
1188 if (!sorted){
1189
1190 char * pBufPtr = &(inbuf[0]);
1191
1192 if (!plan->using_sizes){
1193
1194 /* each item in each message is nbytes long */
1195
1196 if (plan->indices_from == Teuchos::null) {
1197 for (i=0; i < nRecvMsgs; i++){
1198 int offset = plan->starts_from[i] * nbytes;
1199 int length = plan->lengths_from[i] * nbytes;
1200 memcpy(&(recv_data[0]) + offset, pBufPtr, length);
1201 pBufPtr += length;
1202 }
1203 }
1204 else{
1205 int k = 0;
1206 for (i=0; i < nRecvMsgs; i++) {
1207
1208 for (int j=0; j < plan->lengths_from[i]; j++,k++){
1209 int offset = plan->indices_from[k] * nbytes;
1210 memcpy(&(recv_data[0]) + offset, pBufPtr, nbytes);
1211 pBufPtr += nbytes;
1212 }
1213 }
1214 }
1215 }
1216 else{ /* (sizes!=NULL) && (indices_from!=NULL) not allowed by Zoltan_Comm_Resize */
1217
1218 /* items can be different sizes */
1219
1220 for (i=0; i < nRecvMsgs; i++){
1221 int offset = plan->starts_from_ptr[i] * nbytes;
1222 int length = plan->sizes_from[i] * nbytes;
1223 memcpy(&(recv_data[0]) + offset, pBufPtr, length);
1224 pBufPtr += length;
1225 }
1226 }
1227 }
1228
1229 return 0;
1230}
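// do_all_to_all() packs one contiguous outgoing buffer ordered by destination
// rank (with per-rank counts and offsets), expects the incoming buffer in
// source-rank order, and finally scatters it into the caller's recv_data using
// starts_from / indices_from. The actual Alltoallv exchange is still a TODO
// above, which is why this path currently throws until it gets unit tests.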
1231
1232int Zoltan2_Directory_Comm::do_reverse(
1233 int tag, /* message tag for communicating */
1234 const Teuchos::ArrayRCP<char> &send_data, /* array of data I currently own */
1235 int nbytes, /* msg size */
1236 const Teuchos::ArrayRCP<int> &sizes,
1237 Teuchos::ArrayRCP<char> &recv_data) /* array of data I'll own after reverse comm */
1238{
1239 /* create plan->plan_reverse
1240 */
1241 int status = create_reverse_plan(tag, sizes);
1242
1243 // NEW METHOD
1244 // Set up recv_data with the proper size
1245 // This information is only available after create_reverse_plan above has been
1246 // called, so we can set up the return data with the proper buffer size now.
1247 // However, should we do this here?
1248 size_t new_size = plan_forward->plan_reverse->total_recv_size*nbytes;
1249 if(new_size > 0) {
1250 // Have to be careful with new[0]: a zero-length allocation would, for example,
1251 // turn a valid NULL ArrayRCP::getRawPtr() read into a debug assert failure.
1252 recv_data = Teuchos::arcp(new char[new_size], 0, new_size, true);
1253 }
1254
1255 if (status == 0) {
1256
1257 if (plan_forward->plan_reverse->maxed_recvs) {
1258
1259 throw std::logic_error("Zoltan2_Directory_Comm.cpp untested refactored code (7)"); // needs unit testing
1260
1261 /* use MPI_Alltoallv to implement plan->plan_reverse, because comm_do_post
1262 * would post more receives that allowed on this machine
1263 */
1264
1265 status = do_all_to_all(plan_forward->plan_reverse, send_data,
1266 nbytes, recv_data);
1267 }
1268 else {
1269 /* use post/wait which is faster when each sends to few
1270 */
1271 status = do_post(plan_forward->plan_reverse, tag, send_data,
1272 nbytes, recv_data);
1273
1274 if (status == 0) {
1275 status = do_wait (plan_forward->plan_reverse, tag, send_data,
1276 nbytes, recv_data);
1277 }
1278 }
1279 }
1280
1281 free_reverse_plan(plan_forward);
1282
1283 return status;
1284}
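// do_reverse() ships data back along the forward plan's links: it builds the
// inverted plan, sizes recv_data from that plan's total receive size, reuses
// the same post/wait (or all-to-all) machinery, and then frees the reverse
// plan. The sizes argument allows the returned items to have varying lengths,
// in which case resize() computes the per-message byte counts first.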
1285
1286void Zoltan2_Directory_Comm::free_reverse_plan(Zoltan2_Directory_Plan *plan)
1287{
1288 if(!plan) {
1289 throw std::logic_error("Plan is NULL!");
1290 }
1291 delete plan->plan_reverse;
1292 plan->plan_reverse = NULL;
1293}
1294
1295int Zoltan2_Directory_Comm::create_reverse_plan(
1296 int tag,
1297 const Teuchos::ArrayRCP<int> &sizes)/* variable size of objects (if not size 0) */
1298{
1299 /* Check input parameters */
1300 if (!plan_forward){
1301 throw std::logic_error("memory error");
1302 }
1303
1304 /* Let Zoltan_Comm_Do check the remaining parameters. */
1305 plan_forward->plan_reverse = new Zoltan2_Directory_Plan;
1306 plan_forward->plan_reverse->getInvertedValues(plan_forward);
1307
1308 if (MPI_RECV_LIMIT > 0){
1309 /* If we have a limit to the number of posted receives we are allowed,
1310 ** and our plan has exceeded that, then switch to an MPI_Alltoallv so
1311 ** that we will have fewer receives posted when we do the communication.
1312 */
1313 int global_nsends;
1314 Teuchos::reduceAll<int>(*plan_forward->comm, Teuchos::REDUCE_SUM, 1,
1315 &plan_forward->nsends, &global_nsends);
1316 if (global_nsends > MPI_RECV_LIMIT){
1317 plan_forward->plan_reverse->maxed_recvs = 1;
1318 }
1319 }
1320
1321 if (plan_forward->plan_reverse->maxed_recvs == 0) {
1322 // See notes in header for MPI_Request
1323 // plan_forward->plan_reverse->request = Teuchos::arcp(new MPI_Request[plan_forward->plan_reverse->nrecvs], 0, plan_forward->plan_reverse->nrecvs, true);
1324 // plan_forward->plan_reverse->status = Teuchos::arcp(new MPI_Status[plan_forward->plan_reverse->nrecvs], 0, plan_forward->plan_reverse->nrecvs, true);
1325 plan_forward->plan_reverse->request.resize(plan_forward->plan_reverse->nrecvs);
1326 }
1327
1328 int sum_recv_sizes;
1329 int comm_flag = resize( plan_forward->plan_reverse,
1330 sizes, tag, &sum_recv_sizes);
1331
1332 if (comm_flag != 0) {
1333 return(comm_flag);
1334 }
1335
1336 if (sum_recv_sizes != plan_forward->plan_reverse->total_recv_size){
1337 /* Sanity check */
1338 return 1;
1339 }
1340
1341 return 0;
1342}
1343
1344int Zoltan2_Directory_Comm::resize(
1345 const Teuchos::ArrayRCP<int> &sizes, /* size of each item I'm sending */
1346 int tag, /* message tag I can use */
1347 int *sum_recv_sizes) /* sum of the sizes of the items I'll receive */
1348{
1349 return resize(plan_forward, sizes, tag, sum_recv_sizes);
1350}
1351
1352int Zoltan2_Directory_Comm::resize(
1353 Zoltan2_Directory_Plan *plan, /* communication plan object */
1354 const Teuchos::ArrayRCP<int> &sizes, /* size of each item I'm sending */
1355 int tag, /* message tag I can use */
1356 int *sum_recv_sizes) /* sum of the sizes of the items I'll receive */
1357{
1358 /* If sizes vary, then I need to compute and communicate message lengths */
1359 /* First check if sizes array is NULL on all procs. */
1360 int my_proc = plan->comm->getRank(); /* my processor ID */
1361 int has_sizes = (sizes.size() != 0);
1362 int var_sizes; /* items have variable sizes? */
1363
1364 Teuchos::reduceAll(*comm_, Teuchos::REDUCE_BOR, 1, &has_sizes, &var_sizes);
1365
1366 if (var_sizes && plan->indices_from != Teuchos::null) {
1367 // NEW METHOD
1368 // Allow this to run now - the below implementation is working but perhaps
1369 // not done correctly for other usage cases I have not considered yet.
1370 // throw std::logic_error("Non-blocked, variable-sized recvs not supported");
1371 }
1372
1373 int nsends = plan->nsends; /* number of msgs I'll send */
1374 int nrecvs = plan->nrecvs; /* number of msgs I'll recv */
1375 int self_msg = plan->self_msg;
1376
1377 Teuchos::ArrayRCP<int> sizes_to;
1378 Teuchos::ArrayRCP<int> sizes_from;
1379 Teuchos::ArrayRCP<int> starts_to_ptr;
1380 Teuchos::ArrayRCP<int> starts_from_ptr;
1381 Teuchos::ArrayRCP<int> indices_to_ptr;
1382 Teuchos::ArrayRCP<int> indices_from_ptr;
1383
1384 if (!var_sizes) { /* Easy case. Size = length */
1385 plan->total_recv_size = 0;
1386 for (int i = 0; i < nrecvs + self_msg; i++) {
1387 plan->total_recv_size += plan->lengths_from[i];
1388 }
1389
1390 plan->max_send_size = 0;
1391 for (int i = 0; i < nsends + self_msg; i++) {
1392 if (plan->procs_to[i] != my_proc &&
1393 plan->lengths_to[i] > plan->max_send_size) {
1394 plan->max_send_size = plan->lengths_to[i];
1395 }
1396 }
1397 }
1398 else { /* Need to actually compute message sizes */
1399
1400 // TODO Investigate purpose of the +1 in the old code. Is that used?
1401
1402 // OLD CODE
1403 // plan->sizes.resize(plan->nvals + 1);
1404 // for (int i = 0; i < plan->nvals; i++) {
1405 // plan->sizes[i] = sizes[i];
1406 // }
1407
1408 // NEW CODE
1409 plan->sizes = sizes; // can we just copy?
1410 plan->using_sizes = true;
1411
1412 if(nsends + self_msg > 0) {
1413 sizes_to = Teuchos::arcp(
1414 new int[nsends + self_msg], 0, nsends + self_msg, true);
1415 for(int n = 0; n < sizes_to.size(); ++n) {
1416 sizes_to[n] = 0;
1417 }
1418 }
1419 if(nrecvs + self_msg > 0) {
1420 sizes_from = Teuchos::arcp(
1421 new int[nrecvs + self_msg], 0, nrecvs + self_msg, true);
1422 }
1423
1424 /* Several cases:
1425 1. indices_to == NULL
1426 => starts_to != NULL, need to allocate, set starts_to_ptr
1427 2. indices_to != NULL (=> starts_to == NULL)
1428 need to allocate, set indices_to_ptr
1429 3,4. mirror cases for _from
1430 */
1431 if(nsends + self_msg > 0) {
1432 starts_to_ptr = Teuchos::arcp(
1433 new int[nsends + self_msg], 0, nsends + self_msg, true);
1434 }
1435 if (plan->indices_to == Teuchos::null) {
1436 /* Simpler case; sends already blocked by processor */
1437 Teuchos::ArrayRCP<int> index;
1438 Teuchos::ArrayRCP<int> sort_val;
1439 if(nsends + self_msg > 0) {
1440 index = Teuchos::arcp(
1441 new int[nsends + self_msg], 0, nsends + self_msg, true);
1442 sort_val = Teuchos::arcp(
1443 new int[nsends + self_msg], 0, nsends + self_msg, true);
1444 }
1445 for (int i = 0; i < nsends + self_msg; i++) {
1446 int j = plan->starts_to[i];
1447
1448 for (int k = 0; k < plan->lengths_to[i]; k++) {
1449 sizes_to[i] += sizes[j++];
1450 }
1451 if (sizes_to[i] > plan->max_send_size &&
1452 plan->procs_to[i] != my_proc)
1453 plan->max_send_size = sizes_to[i];
1454 }
1455 for (int i = 0; i < nsends + self_msg; i++) {
1456 sort_val[i] = plan->starts_to[i];
1457 index[i] = i;
1458 }
1459 sort_ints(sort_val, index);
1460 int sum = 0;
1461 for (int i = 0; i < nsends + self_msg; i++) {
1462 starts_to_ptr[index[i]] = sum;
1463 sum += sizes_to[index[i]];
1464 }
1465 }
1466 else { /* Harder case, sends not blocked */
1467 Teuchos::ArrayRCP<int> offset;
1468 if(plan->nvals > 0) {
1469 offset = Teuchos::arcp(new int[plan->nvals], 0, plan->nvals, true);
1470 }
1471 indices_to_ptr.resize(plan->nvals);
1472
1473 /* Compute address for every item in send array */
1474 int sum = 0;
1475 for (int i = 0; i < plan->nvals; i++) {
1476 offset[i] = sum;
1477 sum += sizes[i];
1478 }
1479
1480 sum = 0;
1481 plan->max_send_size = 0;
1482 for (int i = 0; i < nsends + self_msg; i++) {
1483 starts_to_ptr[i] = sum;
1484 int j = plan->starts_to[i];
1485 for (int k = 0; k < plan->lengths_to[i]; k++) {
1486 indices_to_ptr[j] = offset[plan->indices_to[j]];
1487 sizes_to[i] += sizes[plan->indices_to[j++]];
1488 }
1489 if (sizes_to[i] > plan->max_send_size &&
1490 plan->procs_to[i] != my_proc)
1491 plan->max_send_size = sizes_to[i];
1492 sum += sizes_to[i];
1493 }
1494 }
1495
1496 /* Note: This routine only gets message sizes, not object sizes. */
1497 /* Anything requiring item sizes requires more code */
1498 /* Should such functionality reside here? */
1499 exchange_sizes(sizes_to, plan->procs_to, nsends, self_msg,
1500 sizes_from, plan->procs_from, nrecvs,
1501 &plan->total_recv_size, my_proc, tag, plan->comm);
1502
1503 if(nrecvs + self_msg > 0) {
1504 starts_from_ptr = Teuchos::arcp(
1505 new int[nrecvs + self_msg], 0, nrecvs + self_msg, true);
1506 }
1507
1508 if (plan->indices_from == Teuchos::null) {
1509 /* Simpler case; recvs already blocked by processor */
1510 Teuchos::ArrayRCP<int> index;
1511 Teuchos::ArrayRCP<int> sort_val;
1512 if(nrecvs + self_msg > 0) {
1513 index = Teuchos::arcp(
1514 new int[nrecvs + self_msg], 0, nrecvs + self_msg, true);
1515 sort_val = Teuchos::arcp<int>(
1516 new int[nrecvs + self_msg], 0, nrecvs + self_msg, true);
1517 }
1518
1519 for (int i = 0; i < nrecvs + self_msg; i++) {
1520 sort_val[i] = plan->starts_from[i];
1521 index[i] = i;
1522 }
1523 sort_ints(sort_val, index);
1524
1525 int sum = 0;
1526 for (int i = 0; i < nrecvs + self_msg; i++) {
1527 starts_from_ptr[index[i]] = sum;
1528 sum += sizes_from[index[i]];
1529 }
1530 }
1531 else {
1532 // OLD COMMENT left here for reference but to be deleted
1533 /* Harder case, recvs not blocked */
1534 /* Not currently supported */
1535 /* Can't do w/o individual item sizes */
1536 /* Currently checked for at top of file */
1537
1538 // NEW METHOD
1539 // Note this is currently just a duplicate of above block which is working
1540 // since I've set up do_wait to just copy the entire block. However I am
1541 // not sure yet how to organize this implementation and suspect this may
1542 // not be correct anyways - though it seems to work. After do_wait copies
1543 // the data (out of order) the directory will handle resorting it since
1544 // each element has index to indicate where it goes in the array. In the
1545 // original do_wait implementation it seemed it was setup to place the
1546 // elements in order, even though they would be sorted later, so that
1547 // part I need to discuss further
1548 Teuchos::ArrayRCP<int> index;
1549 Teuchos::ArrayRCP<int> sort_val;
1550 if(nrecvs + self_msg > 0) {
1551 index = Teuchos::arcp(
1552 new int[nrecvs + self_msg], 0, nrecvs + self_msg, true);
1553 sort_val = Teuchos::arcp(
1554 new int[nrecvs + self_msg], 0, nrecvs + self_msg, true);
1555 }
1556
1557 for (int i = 0; i < nrecvs + self_msg; i++) {
1558 sort_val[i] = plan->starts_from[i];
1559 index[i] = i;
1560 }
1561 sort_ints(sort_val, index);
1562
1563 int sum = 0;
1564 for (int i = 0; i < nrecvs + self_msg; i++) {
1565 starts_from_ptr[index[i]] = sum;
1566 sum += sizes_from[index[i]];
1567 }
1568 }
1569 }
1570 plan->sizes_to = sizes_to;
1571 plan->sizes_from = sizes_from;
1572 plan->starts_to_ptr = starts_to_ptr;
1573 plan->starts_from_ptr = starts_from_ptr;
1574 plan->indices_to_ptr = indices_to_ptr;
1575 plan->indices_from_ptr = indices_from_ptr;
1576
1577 if (sum_recv_sizes) {
1578 *sum_recv_sizes = plan->total_recv_size;
1579 }
1580
1581 return 0;
1582}
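// resize() upgrades a plan built for fixed-size items to variable-sized ones:
// per-message byte counts (sizes_to / sizes_from) are accumulated from the
// per-item sizes, byte offsets (starts_to_ptr / starts_from_ptr /
// indices_to_ptr) are derived so packing and unpacking can address the right
// spot in the buffers, and exchange_sizes() below tells each receiver how many
// bytes to expect from every sender.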
1583
1584int Zoltan2_Directory_Comm::exchange_sizes(
1585 const Teuchos::ArrayRCP<int> &sizes_to, /* value I need to exchange (size of true msg) */
1586 const Teuchos::ArrayRCP<int> &procs_to, /* procs I send to */
1587 int nsends, /* number of messages I'll send */
1588 int self_msg, /* do I copy data to myself? */
1589 Teuchos::ArrayRCP<int> &sizes_from, /* (returned) size of all my receives */
1590 const Teuchos::ArrayRCP<int> &procs_from, /* procs I recv from */
1591 int nrecvs, /* number of messages I receive */
1592 int *total_recv_size, /* (returned) sum of all incoming sizes */
1593 int my_proc, /* my processor number */
1594 int tag, /* message tag I can use */
1595 Teuchos::RCP<const Teuchos::Comm<int> > /* comm */) { /* communicator */
1596
1597 /* If sizes vary, then I need to communicate message lengths */
1598 int self_index_to = -1; /* location of self in procs_to */
1599 for (int i = 0; i < nsends + self_msg; i++) {
1600 if (procs_to[i] != my_proc) {
1601#ifdef HAVE_MPI // Teuchos::send not implemented for Serial - Serial is just for debugging
1602 Teuchos::send(*comm_, 1, &sizes_to[i], procs_to[i]);
1603#endif
1604 }
1605 else {
1606 self_index_to = i;
1607 }
1608 }
1609
1610 *total_recv_size = 0;
1611
1612 for (int i = 0; i < nrecvs + self_msg; i++) {
1613 if (procs_from[i] != my_proc) {
1614#ifdef HAVE_MPI // Teuchos::receive not implemented for Serial - Serial is just for debugging
1615 Teuchos::receive(*comm_, procs_from[i], 1, &sizes_from[i]);
1616#endif
1617 }
1618 else {
1619 sizes_from[i] = sizes_to[self_index_to];
1620 }
1621 *total_recv_size += sizes_from[i];
1622 }
1623 return 0;
1624}
1625
1626} // end namespace Zoltan2