/*
 * 
 * This source code is part of 
 *   MARBLE (MoleculAR simulation package for BiomoLEcules)
 * 
 * Written by Mitsunori Ikeguchi
 * Copyright (c) 2012 Yokohama City University
 *  
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 */

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

#include "md_system.h"

#ifdef MPI_SDMD

#include "parallel.h"
#include "sdmd.h"

#ifdef KCOMP
#include <mpi-ext.h>
#endif

/* #include "mpi_debug.c"  */
/* #include "/home1/ike/test/test/test.c" */

/*
 * Early SDMD setup: obtain the 3-D PE grid from SDMD_get_npx() using the
 * system's linked-cell and boundary data, then pass that grid to
 * mpi_cart3D_setup().
 */
void SDMD_setup0(MD_SYSTEM *sys)
{
  int pe_grid[3];

  SDMD_get_npx(&sys->linked_cell, &sys->boundary, pe_grid);
  mpi_cart3D_setup(pe_grid);
}


/*
 * Main SDMD initialization for one MD_SYSTEM.
 *
 * Runs the numbered steps below strictly in order: miscellaneous setup,
 * cell-grid determination, atom-to-cell assignment, cell-to-PE assignment,
 * assignment of internal terms (bond/angle/dihedral/...) to PEs, cell-pair
 * assignment, load-balancer setup, transfer-list construction, nonbond
 * pair-list construction and finally the initial force/energy evaluation.
 *
 * The per-atom arrays lc->atom_cell (and lc->atom_req when
 * TR_LIST_ATOM_REQ is defined) are allocated here with emalloc().
 * The energy summation at the end is done on the master rank only.
 */
void SDMD_setup(MD_SYSTEM *sys)
{
  char *func="SDMD_setup";
  LINKED_CELL *lc;
  NONBOND_LIST *nl;
  ATOM_DATA *ad;
  BOUNDARY *bc;
  BOND_DATA *bd;
  ANGLE_DATA *and;
  DIHEDRAL_DATA *dd;
  EWALD *ew;
  RMOL_DATA *md;
  int i;

  /* shorthand pointers into the sub-structures of sys */
  lc = &sys->linked_cell; nl = &sys->nonbond; ad = &sys->atom; bc = &sys->boundary;
  bd = &sys->bond; and = &sys->angle; dd = &sys->dihed;
  ew = &sys->ewald;
  md = &sys->rigid_mol;
  
  /* step 0. misc. setups.. */

  sync_xv(sys);
  
  MD_SYSTEM_degree_of_freedom(sys);
  /* reset all SDMD timers */
  for (i=0;i<SDMD_N_TIME;i++) lc->time[i]=0.0;
  lc->atom_cell=emalloc(func,sizeof(int)*ad->natom);
#ifdef TR_LIST_ATOM_REQ 
  lc->atom_req =emalloc(func,sizeof(unsigned char)*ad->natom);
#endif  

  /*ATOM_DATA_print_hydrogen_group(ad);*/
  
  /* step 1. determination of cell size */
  SDMD_cell_setup(lc, nl, ad, bc);
  LINKED_CELL_setup(lc, nl, ad, bc);

  /* step 2. assign atoms to cell */
  LINKED_CELL_assign_atom(lc, ad, bc);

  /* step 3. assign cell to PE */
  SDMD_assign_cells_to_PE(lc);
  SDMD_assign_node_atom(lc, ad);
  SDMD_assign_node_rmol(md, ad);
  SDMD_assign_node_rattle(&sys->rattle, ad);

  /*SDMD_EW_assign_tr(ew, lc, nl, bc);*/

  /*  debug
  for (i=0;i<ad->natom;i++) {
    printf("atom_cell[%d]=%d\n",i,lc->atom_cell[i]);
  }
  */

  /* step 4. assign bond, angle, dihedral to cell or PE */
  SDMD_clear_cell_req(lc,ad);
  SDMD_assign_internal_to_PE(lc, ad, bd, and, dd, ew);

  /*SDMD_assign_cmap_to_PE_static(lc, dd, ad); */

  /* debug 
  { int j,ok,k;
  for (i=0;i<bd->n_bond;i++) {
    if (bd->bonds[i].flag & BOND_OTHER_NODE) continue;
    if (!(bd->bonds[i].flag & RATTLE_FLAG)) continue;
    ok = 0;
    for (j=ad->node_fatom_h;j>=0;j=ad->node_fatom_n[j]) {
      if (bd->bonds[i].atom1 == j) ok++;
      if (bd->bonds[i].atom2 == j) ok++;
    }
    if (ok!=2) {
      printf("not ok:%d, %d %d(%d) %d(%d)\n",ok, mpi.rank,
	     bd->bonds[i].atom1, lc->cell[lc->atom_cell[bd->bonds[i].atom1]].pe,
	     bd->bonds[i].atom2, lc->cell[lc->atom_cell[bd->bonds[i].atom2]].pe);
    }
  }}
  */
  
  /* step 5. assign cellpair to PE in a simple way */
  LINKED_CELL_make_cell_pairlist(lc, bc);
  SDMD_assign_cellpair_to_PE(lc);
  
  /* step 6. load balancer setup: n_cell and n_cell_pair are required */
  SDMD_load_balancer_setup(lc, nl);

  /* step 7. construct cell data transfer list */
  SDMD_alloc_tr_list(lc, ad);

  /* the transfer list is built differently depending on lc->tr_mode */
  if (lc->tr_mode == TR_MODE_ES) {
    SDMD_setup_tr_list_xyz_ES(lc, ad);
    SDMD_make_tr_list_xyz(lc, ad);
  } else if (lc->tr_mode == TR_MODE_MP) {
    SDMD_setup_tr_list_xyz_MP(lc, ad);
    SDMD_make_tr_list_xyz(lc, ad);
  } else {
    SDMD_make_tr_list_by_cell_req(lc, ad);
  }

  /* step 8. construct atom pair list */
  /* ATOM_DATA_check_hydrogen_dist(ad, 1.5); */
  LINKED_CELL_calc_tr_x(lc, ad, bc);

  /*
  SDMD_alloc_nonbond_list(lc, nl, ad, bc);
  SDMD_make_nonbond_list(lc, nl, ad, bc, 0);
  */
  LINKED_CELL_alloc_nonbond_list(lc, nl, ad, bc);
  LINKED_CELL_make_nonbond_list(lc, nl, ad, bc, 0);

  /* step 8.1. remake tr_list */
  
#ifdef TR_LIST_ATOM_REQ
  SDMD_make_tr_list_by_atom_req(lc, ad);
#endif  

  /*
  SDMD_dist_x_xyz(lc, ad);
  SDMD_dist_x(lc, ad);
  */

  /* step 9. calc initial energy force */
  SDMD_calc_force(sys);

  /*
  SDMD_check_force(lc, ad);
  marble_exit(1);
  */
  
  if (sys->remove_momentum)
    MD_SYSTEM_remove_momentum(sys);
  
  /* time-zero initialization of the constraint machinery, if enabled */
  if (sys->rigid_mol_flag)
    RMOL_DATA_init_time0(md,ad);
  if (sys->rattle.flag)
    RATTLE_init_time0(&sys->rattle,ad);

  if (sys->Ex_System_P_flag)
    MD_SYSTEM_calc_kene_full(sys);
  else
    MD_SYSTEM_calc_kene(sys);
    /* SDMD_calc_kene(sys); */
  
  /* only the master rank sums up energies and resets the statistics */
  if (mpi.master) {
    MD_SYSTEM_calc_Pint(sys);
    MD_SYSTEM_sum_potential(sys);
    MD_SYSTEM_sum_total_energy(sys);
    MD_SYSTEM_clear_statistics(sys);
  }
}

/*
 * Print the PE grid and settle the number of linked cells per axis
 * (lc->n_grid[]).
 *
 * If a grid was requested (lc->request_grid set), it is only validated:
 * each n_grid[i] must be divisible by mpi.npx[i], otherwise the run is
 * aborted.  Otherwise the grid is derived from bc->reclen[], nl->cell_div
 * and lc->neighbor[] so that every PE owns the same whole number (at least
 * one) of cells per axis, and lc->request_grid is set to 1.
 *
 * ad is not used.
 */
void SDMD_cell_setup(LINKED_CELL *lc, NONBOND_LIST *nl,
		     ATOM_DATA *ad,   BOUNDARY *bc)
{
  int axis;
  double pe_len, cell_len;

  lprintf("SDMD:        Number of PE in 3D   = %d %d %d\n", 
	  mpi.npx[0], mpi.npx[1], mpi.npx[2]);
#ifdef KCOMP
  if (mpi.dim == 3 && mpi.np_in_node != 1) {
    lprintf("SDMD_K:      Number of nodes      = %d %d %d\n", 
	    mpi.npx[0]/mpi.npx_in_node[0], 
	    mpi.npx[1]/mpi.npx_in_node[1],
	    mpi.npx[2]/mpi.npx_in_node[2]);
    lprintf("SDMD_K:      Number of PE in node = %d %d %d\n", 
	    mpi.npx_in_node[0], 
	    mpi.npx_in_node[1],
	    mpi.npx_in_node[2]);
  }
#endif    

  if (!lc->request_grid) {
    /* No grid requested: derive lc->n_grid from the per-PE length. */
    for (axis = 0; axis < 3; axis++) {
      pe_len   = bc->reclen[axis] / mpi.npx[axis];
      cell_len = nl->cell_div / lc->neighbor[axis];
      lc->n_grid[axis] = pe_len / cell_len;
      if (lc->n_grid[axis] <= 0) {
	lc->n_grid[axis] = 1;
      }
      lc->n_grid[axis] *= mpi.npx[axis];
    }
    lc->request_grid = 1;
    return;
  }

  /* Requested grid: every axis must split evenly over the PEs. */
  for (axis = 0; axis < 3; axis++) {
    if (lc->n_grid[axis] % mpi.npx[axis] == 0) continue;
    lprintf("ERROR: n_cell (%d) must be divisible by n_pe (%d) in dimension %d.\n",
	    lc->n_grid[axis],mpi.npx[axis], axis);
    marble_exit(1);
  }
}

void SDMD_assign_cells_to_PE(LINKED_CELL *lc)
{
  int ipx, ipy, ipz, n_grid_pe[3];
  int i, j, k, l, prev;
  int ixm[3], ixp[3], pem, pep;
  CELL_PE *cell_pe;
  CELL    *cell;
  char *func="SDMD_assign_cells_to_PE";

  for (i=0;i<3;i++) {
    n_grid_pe[i] = lc->n_grid[i] / mpi.npx[i];
  }

  for (i=0;i<lc->n_cell;i++) {
    ipx = lc->cell[i].ix / n_grid_pe[0];
    ipy = lc->cell[i].iy / n_grid_pe[1];
    ipz = lc->cell[i].iz / n_grid_pe[2];
    mpi_xyz2rank(ipx, ipy, ipz, &(lc->cell[i].pe));
  }

  /* The codes from here to end of this function are the same as the old version. */
  lc->node_head = -1;
  for (i=0;i<lc->n_cell;i++) {
    if (lc->cell[i].pe == mpi.rank) {
      lc->cell[i].node_next = -1;
      if (lc->node_head == -1) {
	lc->node_head=i;
      } else {
	lc->cell[prev].node_next = i;
      }
      prev = i;
    }
  }

  /* initialization of cell_pe */
  if (lc->cell_pe == NULL) {
    lc->cell_pe = emalloc(func, sizeof(CELL_PE)*mpi.n_pe);
    for (i=0;i<mpi.n_pe;i++) {
      lc->cell_pe[i].n_cell=0;
    }
  } else {
    for (i=0;i<mpi.n_pe;i++) {
      if (lc->cell_pe[i].n_cell > 0) {
	free(lc->cell_pe[i].cell);
      }
      lc->cell_pe[i].n_cell=0;
    }
  }

  for (i=0;i<lc->n_cell;i++)
    lc->cell_pe[lc->cell[i].pe].n_cell++;
  for (i=0;i<mpi.n_pe;i++) {
    cell_pe = &lc->cell_pe[i];
    cell_pe->cell=emalloc(func, sizeof(int)*cell_pe->n_cell);
    cell_pe->n_cell=0;
  }
  for (i=0;i<lc->n_cell;i++) {
    cell_pe = &lc->cell_pe[lc->cell[i].pe];
    cell_pe->cell[cell_pe->n_cell]=i;
    cell_pe->n_cell++;
  }
  
  for (i=0;i<mpi.n_pe;i++) {
    cell_pe = &lc->cell_pe[i];
    for (j=0;j<3;j++) {
      cell_pe->min[j] = lc->n_grid[j];
      cell_pe->max[j] = -1;
    }
    for (j=0;j<cell_pe->n_cell;j++) {
      cell=&lc->cell[cell_pe->cell[j]];
      if (cell_pe->min[0] > cell->ix) cell_pe->min[0]=cell->ix;
      if (cell_pe->min[1] > cell->iy) cell_pe->min[1]=cell->iy;
      if (cell_pe->min[2] > cell->iz) cell_pe->min[2]=cell->iz;
      if (cell_pe->max[0] < cell->ix) cell_pe->max[0]=cell->ix;
      if (cell_pe->max[1] < cell->iy) cell_pe->max[1]=cell->iy;
      if (cell_pe->max[2] < cell->iz) cell_pe->max[2]=cell->iz;
    }
  }

  /* generate neighbor_pe data */
  for (i=0;i<mpi.n_pe;i++) {
    cell_pe = &lc->cell_pe[i];
    if (cell_pe->n_cell > 0) {
      for (j=0;j<3;j++) {
	for (k=0;k<2;k++) {
	  for (l=0;l<2;l++) {
	    cell_pe->neighbor_pe[k][l][j]=-1;
	  }
	}
      }

      for (j=0;j<3;j++) {
	ixm[0]=cell_pe->min[0];
	ixm[1]=cell_pe->min[1];
	ixm[2]=cell_pe->min[2];
	ixp[0]=cell_pe->max[0];
	ixp[1]=cell_pe->max[1];
	ixp[2]=cell_pe->max[2];

	for (k=1;k<=lc->neighbor[j];k++) {
	  ixm[j]=cell_pe->min[j]-k;
	  if (ixm[j] < 0)  ixm[j]+=lc->n_grid[j];
	  ixp[j]=cell_pe->max[j]+k;
	  if (ixp[j] >= lc->n_grid[j]) ixp[j]-=lc->n_grid[j];
	  pem=lc->cell[CELL_INDEX(lc,ixm[0],ixm[1],ixm[2])].pe;
	  pep=lc->cell[CELL_INDEX(lc,ixp[0],ixp[1],ixp[2])].pe;

	  for (l=0;l<2;l++) {
	    /*if (pem==i) break;*/
	    if (cell_pe->neighbor_pe[l][0][j] == -1) {
	      cell_pe->neighbor_pe[l][0][j]=pem;
	      break;
	    }
	    if (cell_pe->neighbor_pe[l][0][j] == pem)
	      break;
	  }
	  for (l=0;l<2;l++) {
	    /*if (pep==i) break;*/
	    if (cell_pe->neighbor_pe[l][1][j] == -1) {
	      cell_pe->neighbor_pe[l][1][j]=pep;
	      break;
	    }
	    if (cell_pe->neighbor_pe[l][1][j] == pep)
	      break;
	  }
	}
      }
    }
  }
}

/*
 * Topology-unaware choice of the 3-D PE grid npx[].
 *
 * If lc->request_npx is set, lc->req_npx[] is used after checking that its
 * product equals mpi.n_pe.  Otherwise mpi.n_pe is factorized into primes
 * and the factors are handed out, largest first, to whichever axis
 * currently has the largest remaining bc->reclen[i] / npx[i].
 *
 * Aborts via marble_exit(1) on an inconsistent request or if the divisor
 * table would overflow.
 */
static void SDMD_get_npx_by_reclen(LINKED_CELL *lc, BOUNDARY *bc, int npx[3])
{
  double reclen[3];
  int idiv, max_i, i;
  int divisor[100], n_divisor, cdiv, n_pe;

  if (lc->request_npx) {
    if (mpi.n_pe != lc->req_npx[0]*lc->req_npx[1]*lc->req_npx[2]) {
      lprintf("ERROR: requested pe size (%d,%d,%d) must be equal to n_pe (%d).\n",
	      lc->req_npx[0],lc->req_npx[1],lc->req_npx[2], mpi.n_pe);
      marble_exit(1);
    }
    for (i = 0; i < 3; i++) {
      npx[i] = lc->req_npx[i];
    }
    return;
  }

  /* Calculation of divisors (prime factors in ascending order) */
  n_pe = mpi.n_pe;
  n_divisor = 0;
  cdiv = 2;
  while (n_pe > 1) {
    if (n_pe % cdiv == 0) {
      divisor[n_divisor] = cdiv;
      if (++n_divisor >= 100) {
	/* stop here: continuing would write past the end of divisor[] */
	lprintf("ERROR: n_pe (%d) is too large\n", mpi.n_pe);
	marble_exit(1);
      }
      n_pe /= cdiv;
    } else {
      cdiv++;
    }
  }
  /* Calculation of divisors is done. */

  /* Calculation of npx[i] */
  for (i = 0; i < 3; i++) {
    reclen[i] = bc->reclen[i];
    npx[i] = 1;
  }
  for (idiv=n_divisor-1;idiv>=0;idiv--) {
    /* give the next factor to the axis with the longest remaining length */
    max_i=0;
    for (i = 1; i < 3; i++) {
      if (reclen[max_i] < reclen[i]) {
	max_i = i;
      }
    }
    npx[max_i] *= divisor[idiv];
    reclen[max_i] /= divisor[idiv];
  }
  /* Calculation of n_pe3d is done. */
}

/*
 * Determine the 3-D PE grid npx[] for this run.
 *
 * Without KCOMP, or with KCOMP on a 1-D node topology, the grid comes from
 * SDMD_get_npx_by_reclen().
 *
 * With KCOMP on a 3-D node topology the grid is nodex[] * mpi.npx_in_node[],
 * where nodex[] is the node shape reported by FJMPI_Topology_get_shape()
 * and the per-node PE layout comes from, in order of precedence:
 *   1. lc->req_npx[] (must be a multiple of nodex[] on every axis),
 *   2. the environment variable MBL_PE_NODE in the form "AxBxC",
 *   3. an automatic split of the prime factors of the PEs-per-node count,
 *      constrained by the requested cell grid if lc->request_grid is set.
 * In that case mpi.dim, mpi.np_in_node, mpi.npx_in_node[] and mpi.npx[]
 * are also set, and the rank <-> (x,y,z) mapping is checked against the
 * node topology.
 *
 * All detected inconsistencies abort the run via marble_exit(1).
 */
void SDMD_get_npx(LINKED_CELL *lc, BOUNDARY *bc, int npx[3])
{
#ifdef KCOMP
  int dim, n_node, nodex[3], ipx[3], nnx[3], nx[3], rank, rank2, ok;
  int idiv, min_i, min, i;
  int divisor[100], n_divisor, cdiv, n_pe;
  char *pe_node;

  FJMPI_Topology_get_dimension(&dim);
  if (dim == 1) {
    mpi.dim = 1;
  } else if (dim == 3) {
    mpi.dim = 3;
  } else {
    lprintf("ERROR: node dimension must be 1 or 3.\n");
    marble_exit(1);
  }

  if (mpi.dim != 3) {
    /*mpi.dim == 1;*/
    /* normal use */
    SDMD_get_npx_by_reclen(lc, bc, npx);
    return;
  }

  FJMPI_Topology_get_shape(&nodex[0], &nodex[1], &nodex[2]);
  if (lc->request_npx) {
    if (lc->req_npx[0] % nodex[0] != 0 ||
	lc->req_npx[1] % nodex[1] != 0 ||
	lc->req_npx[2] % nodex[2] != 0) {
      lprintf("ERROR: Requested cell dimensions (%d %d %d) must be the integral multiple of node dimensions (%d %d %d).\n", lc->req_npx[0], lc->req_npx[1], lc->req_npx[2], nodex[0], nodex[1], nodex[2]);
      marble_exit(1);
    }
    mpi.npx_in_node[0] = lc->req_npx[0]/nodex[0];
    mpi.npx_in_node[1] = lc->req_npx[1]/nodex[1];
    mpi.npx_in_node[2] = lc->req_npx[2]/nodex[2];
  } else if ((pe_node = getenv("MBL_PE_NODE")) != NULL) {
    /* All three fields must parse and be positive: they are used as
       divisors further down. */
    if (sscanf(pe_node, "%dx%dx%d",&mpi.npx_in_node[0],&mpi.npx_in_node[1],&mpi.npx_in_node[2]) != 3 ||
	mpi.npx_in_node[0] <= 0 ||
	mpi.npx_in_node[1] <= 0 ||
	mpi.npx_in_node[2] <= 0) {
      lprintf("ERROR: Value of environment variable MBL_PE_NODE (%s) is invalid.\n",  pe_node);
      marble_exit(1);
    }
    npx[0] = nodex[0] * mpi.npx_in_node[0];
    npx[1] = nodex[1] * mpi.npx_in_node[1];
    npx[2] = nodex[2] * mpi.npx_in_node[2];

    if (npx[0]*npx[1]*npx[2] != mpi.n_pe) {
      lprintf("ERROR: Value of environment variable MBL_PE_NODE (%s) is invalid.\n",  pe_node);
      marble_exit(1);
    }
  } else {
    n_node = nodex[0] * nodex[1] * nodex[2];
    if (mpi.n_pe % n_node != 0) {
      lprintf("ERROR: Number of processes (%d) must be the integral multiple of number of nodes (%d).\n", 
	      mpi.n_pe, n_node);
      marble_exit(1);
    }
    mpi.np_in_node = mpi.n_pe/n_node;

    /* prime factors of the PEs-per-node count, ascending */
    n_pe = mpi.np_in_node;
    n_divisor = 0;
    cdiv = 2;
    while (n_pe > 1) {
      if (n_pe % cdiv == 0) {
	divisor[n_divisor] = cdiv;
	if (++n_divisor >= 100) {
	  lprintf("ERROR: np_in_node (%d) is too large\n", mpi.np_in_node);
	  marble_exit(1);
	}
	n_pe /= cdiv;
      } else {
	cdiv++;
      }
    }
    /* lprintf("%d %d\n", n_divisor, mpi.np_in_node); */
    /* Calculation of divisors is done. */

    /* Calculation of npx_in_node[i] */
    for (i = 0; i < 3; i++) {
      mpi.npx_in_node[i] = 1;
    }

    if (lc->request_grid) {
      int n_cell[3], order[3], npx_tmp[3];

      n_cell[0] = lc->n_grid[0];
      n_cell[1] = lc->n_grid[1];
      n_cell[2] = lc->n_grid[2];
      if (n_cell[0] % nodex[0] != 0 ||
	  n_cell[1] % nodex[1] != 0 ||
	  n_cell[2] % nodex[2] != 0) {
	lprintf("ERROR: requested cell dimension (%d %d %d) must be the integral multiple of node dimension (%d %d %d)\n", n_cell[0], n_cell[1], n_cell[2], nodex[0], nodex[1], nodex[2]);
	marble_exit(1);
      }
      n_cell[0] /= nodex[0];
      n_cell[1] /= nodex[1];
      n_cell[2] /= nodex[2];
      npx_tmp[0] = nodex[0];
      npx_tmp[1] = nodex[1];
      npx_tmp[2] = nodex[2];
      for (idiv=n_divisor-1;idiv>=0;idiv--) {
	/* order[] lists the axes by increasing current PE count */
	if (npx_tmp[0] <= npx_tmp[1]) {
	  if (npx_tmp[1] <= npx_tmp[2]) {
	    order[0] = 0; order[1] = 1;  order[2] = 2;
	  } else if (npx_tmp[0] <= npx_tmp[2]) {
	    order[0] = 0; order[1] = 2;  order[2] = 1;
	  } else {
	    order[0] = 2; order[1] = 0;  order[2] = 1;
	  }
	} else {
	  if (npx_tmp[0] <= npx_tmp[2]) {
	    order[0] = 1; order[1] = 0;  order[2] = 2;
	  } else if (npx_tmp[1] <= npx_tmp[2]) {
	    order[0] = 1; order[1] = 2;  order[2] = 0;
	  } else {
	    order[0] = 2; order[1] = 1;  order[2] = 0;
	  }
	}
	/* take the first axis in that order whose remaining cell count
	   is divisible by this factor */
	ok = 0;
	for (i = 0; i < 3; i++) {
	  if (n_cell[order[i]] % divisor[idiv] == 0) {
	    n_cell[order[i]] /= divisor[idiv];
	    npx_tmp[order[i]] *= divisor[idiv];
	    mpi.npx_in_node[order[i]] *= divisor[idiv];
	    ok = 1;
	    break;
	  }
	}
	if (!ok) {
	  lprintf("ERROR: requested cell dimension (%d %d %d) could not be assigned to nodes (%d %d %d) and n_pe in a node (%d).\n", 
		  lc->n_grid[0], lc->n_grid[1], lc->n_grid[2], 
		  nodex[0], nodex[1], nodex[2], mpi.np_in_node);
	  marble_exit(1);
	}
      }
    } else {
      int npx_tmp[3];

      npx_tmp[0] = nodex[0];
      npx_tmp[1] = nodex[1];
      npx_tmp[2] = nodex[2];
      for (idiv=n_divisor-1;idiv>=0;idiv--) {
	/* give the next factor to the axis with the fewest PEs so far */
	min_i=0;
	min = npx_tmp[0];
	for (i = 1; i < 3; i++) {
	  if (npx_tmp[i] < min) {
	    min_i = i;
	    min = npx_tmp[i];
	  }
	}
	mpi.npx_in_node[min_i] *= divisor[idiv];
	npx_tmp[min_i] *= divisor[idiv];
      }
    }
    /*lprintf("%d %d %d\n", mpi.npx_in_node[0], mpi.npx_in_node[1], mpi.npx_in_node[2]);*/
  }
  mpi.np_in_node = mpi.npx_in_node[0] * mpi.npx_in_node[1] * mpi.npx_in_node[2];
  npx[0] = nodex[0] * mpi.npx_in_node[0];
  npx[1] = nodex[1] * mpi.npx_in_node[1];
  npx[2] = nodex[2] * mpi.npx_in_node[2];

  /*lprintf("%d %d %d\n", npx[0], npx[1], npx[2]);*/
  mpi.npx[0] = npx[0];
  mpi.npx[1] = npx[1];
  mpi.npx[2] = npx[2];

  /* check routine */
  if (lc->request_grid) {
    if (lc->n_grid[0] % npx[0] != 0 ||
	lc->n_grid[1] % npx[1] != 0 ||
	lc->n_grid[2] % npx[2] != 0) {
      lprintf("ERROR: requested n_cell (%dx%dx%d) must be divisible by n_pe (%dx%dx%d) in each dimension.\n",
	      lc->n_grid[0], lc->n_grid[1], lc->n_grid[2], npx[0], npx[1], npx[2]);
      lprintf("node: (%dx%dx%d)  pe_in_node: (%dx%dx%d)\n", nodex[0], nodex[1], nodex[2], 
	      mpi.npx_in_node[0], mpi.npx_in_node[1], mpi.npx_in_node[2]);
      marble_exit(1);
    }
  }

  /* every rank must sit on the node its grid position implies, and the
     rank <-> xyz mapping must round-trip */
  ok = 1;
  for (rank=0;rank<mpi.n_pe;rank++) {
    mpi_rank2xyz(rank, &ipx[0], &ipx[1], &ipx[2]);
    /*lprintf("rank (%d:(%d %d %d))\n", rank, ipx[0],ipx[1],ipx[2]);*/
    nnx[0] = ipx[0] / mpi.npx_in_node[0];
    nnx[1] = ipx[1] / mpi.npx_in_node[1];
    nnx[2] = ipx[2] / mpi.npx_in_node[2];
    FJMPI_Topology_rank2xyz(rank, &nx[0], &nx[1], &nx[2]);
    if (nnx[0] != nx[0] ||
	nnx[1] != nx[1] ||
	nnx[2] != nx[2]) {
      lprintf("ERROR: inconsistent assignment of rank (%d:(%d %d %d)) to node (%d %d %d).\n",
	      rank, ipx[0],ipx[1],ipx[2], nx[0], nx[1], nx[2]);
      ok = 0;
    }
    mpi_xyz2rank(ipx[0], ipx[1], ipx[2], &rank2);
    /*lprintf("rank (%d:(%d %d %d)) rank2 (%d)\n", rank, ipx[0],ipx[1],ipx[2], rank2);*/
    if (rank != rank2) {
      lprintf("ERROR: inconsistent assignment of rank (%d:(%d %d %d)) to nodes. (rank2:%d)\n",
	      rank, ipx[0],ipx[1],ipx[2], rank2);
      ok = 0;
    }
  }
  if (!ok)
    marble_exit(1);
#else   /* KCOMP */
  SDMD_get_npx_by_reclen(lc, bc, npx);
#endif  /* KCOMP */
}


#if 0
/*
 * Older variant of SDMD_assign_cells_to_PE (this definition sits inside an
 * "#if 0" region and is not compiled).
 *
 * Instead of a fixed block decomposition, the owner of each cell is chosen
 * by SDMD_assign_cells_sparse() when there are at least as many PEs as
 * grid cells, and by SDMD_assign_cells_bisection() otherwise; the PE
 * numbering is then reordered by SDMD_sort_pe_for_cells().  After that it
 * rebuilds the node-local cell list, lc->cell_pe[] (cell lists and
 * min/max grid bounds per PE) and the neighbor_pe tables.
 */
void SDMD_assign_cells_to_PE(LINKED_CELL *lc)
{
  int i, j, k, l, prev;
  int ixm[3], ixp[3], pem, pep;
  CELL_PE *cell_pe;
  CELL    *cell;
  char *func="SDMD_assign_cells_to_PE";

  if (mpi.n_pe >= lc->n_grid[0]*lc->n_grid[1]*lc->n_grid[2]) {
    SDMD_assign_cells_sparse(lc);
    lprintf("Cells Assigned Sparsely\n");
    /*
    SDMD_assign_cells_round_robin(lc);
    lprintf("Cells Assigned by Round Robin\n");
    */
  } else {
    SDMD_assign_cells_bisection(lc);
    lprintf("Cells Assigned by Recursive Bisection\n");
  }

  SDMD_sort_pe_for_cells(lc);

  /* chain the cells owned by this rank; prev is only read once
     node_head has been set, i.e. after its first assignment */
  lc->node_head = -1;
  for (i=0;i<lc->n_cell;i++) {
    if (lc->cell[i].pe == mpi.rank) {
      lc->cell[i].node_next = -1;
      if (lc->node_head == -1) {
	lc->node_head=i;
      } else {
	lc->cell[prev].node_next = i;
      }
      prev = i;
    }
  }

  /*
  for (i=0;i<lc->n_cell;i++) {
    lprintf("%d %d %d: %d\n",
	    lc->cell[i].ix,lc->cell[i].iy,lc->cell[i].iz, lc->cell[i].pe);
	    
  }
  marble_exit(1);
  */

  /* cell_pe is initialized */

  if (lc->cell_pe == NULL) {
    lc->cell_pe = emalloc(func, sizeof(CELL_PE)*mpi.n_pe);
    for (i=0;i<mpi.n_pe;i++) {
      lc->cell_pe[i].n_cell=0;
    }
  } else {
    /* re-entry: release the cell lists of the previous assignment */
    for (i=0;i<mpi.n_pe;i++) {
      if (lc->cell_pe[i].n_cell > 0) {
	free(lc->cell_pe[i].cell);
      }
      lc->cell_pe[i].n_cell=0;
    }
  }
  /* count cells per PE, allocate, then fill (n_cell is reused as cursor) */
  for (i=0;i<lc->n_cell;i++)
    lc->cell_pe[lc->cell[i].pe].n_cell++;
  for (i=0;i<mpi.n_pe;i++) {
    cell_pe = &lc->cell_pe[i];
    cell_pe->cell=emalloc(func, sizeof(int)*cell_pe->n_cell);
    cell_pe->n_cell=0;
  }
  for (i=0;i<lc->n_cell;i++) {
    cell_pe = &lc->cell_pe[lc->cell[i].pe];
    cell_pe->cell[cell_pe->n_cell]=i;
    cell_pe->n_cell++;
  }
  /* grid-index bounding box of the cells of each PE */
  for (i=0;i<mpi.n_pe;i++) {
    cell_pe = &lc->cell_pe[i];
    for (j=0;j<3;j++) {
      cell_pe->min[j] = lc->n_grid[j];
      cell_pe->max[j] = -1;
    }
    for (j=0;j<cell_pe->n_cell;j++) {
      cell=&lc->cell[cell_pe->cell[j]];
      if (cell_pe->min[0] > cell->ix) cell_pe->min[0]=cell->ix;
      if (cell_pe->min[1] > cell->iy) cell_pe->min[1]=cell->iy;
      if (cell_pe->min[2] > cell->iz) cell_pe->min[2]=cell->iz;
      if (cell_pe->max[0] < cell->ix) cell_pe->max[0]=cell->ix;
      if (cell_pe->max[1] < cell->iy) cell_pe->max[1]=cell->iy;
      if (cell_pe->max[2] < cell->iz) cell_pe->max[2]=cell->iz;
    }
  }

  /* generate neighbor_pe data */
  for (i=0;i<mpi.n_pe;i++) {
    cell_pe = &lc->cell_pe[i];
    if (cell_pe->n_cell > 0) {
      for (j=0;j<3;j++) {
	for (k=0;k<2;k++) {
	  for (l=0;l<2;l++) {
	    cell_pe->neighbor_pe[k][l][j]=-1;
	  }
	}
      }

      for (j=0;j<3;j++) {
	ixm[0]=cell_pe->min[0];
	ixm[1]=cell_pe->min[1];
	ixm[2]=cell_pe->min[2];
	ixp[0]=cell_pe->max[0];
	ixp[1]=cell_pe->max[1];
	ixp[2]=cell_pe->max[2];

	/* step k cells outwards along axis j on both sides, wrapping
	   around the grid, and look up the owners found there */
	for (k=1;k<=lc->neighbor[j];k++) {
	  ixm[j]=cell_pe->min[j]-k;
	  if (ixm[j] < 0)  ixm[j]+=lc->n_grid[j];
	  ixp[j]=cell_pe->max[j]+k;
	  if (ixp[j] >= lc->n_grid[j]) ixp[j]-=lc->n_grid[j];
	  pem=lc->cell[CELL_INDEX(lc,ixm[0],ixm[1],ixm[2])].pe;
	  pep=lc->cell[CELL_INDEX(lc,ixp[0],ixp[1],ixp[2])].pe;

	  /* store pem in the first free slot unless already present;
	     at most two distinct PEs are kept per side and axis */
	  for (l=0;l<2;l++) {
	    /*if (pem==i) break;*/
	    if (cell_pe->neighbor_pe[l][0][j] == -1) {
	      cell_pe->neighbor_pe[l][0][j]=pem;
	      break;
	    }
	    if (cell_pe->neighbor_pe[l][0][j] == pem)
	      break;
	  }
	  /* same for the plus side */
	  for (l=0;l<2;l++) {
	    /*if (pep==i) break;*/
	    if (cell_pe->neighbor_pe[l][1][j] == -1) {
	      cell_pe->neighbor_pe[l][1][j]=pep;
	      break;
	    }
	    if (cell_pe->neighbor_pe[l][1][j] == pep)
	      break;
	  }
	}
      }
    }
  }

  /*
  for (i=0;i<mpi.n_pe;i++) {
    cell_pe = &lc->cell_pe[i];
    if (cell_pe->n_cell > 0) {
      lprintf("rank %d (%d %d %d)-(%d %d %d):", i, cell_pe->min[0], cell_pe->min[1], cell_pe->min[2], cell_pe->max[0], cell_pe->max[1], cell_pe->max[2]);
      for (j=0;j<3;j++) {
	for (k=0;k<2;k++) {
	  for (l=0;l<2;l++) {
	    lprintf(" %d", cell_pe->neighbor_pe[l][k][j]);
	  }
	}
      }
      lprintf("\n");
    }
  }
  marble_exit(1);
  */
}

/*
 * Spread the cells over the PEs: cell n gets the PE whose number is the
 * nbit-bit reversal of a running counter (starting at 1), skipping values
 * that are not valid ranks.  nbit is the smallest value with
 * (1 << nbit) >= mpi.n_pe.  Stops once lc->n_cell cells have been
 * assigned or the counter reaches 1 << nbit.
 */
void SDMD_assign_cells_sparse(LINKED_CELL *lc)
{
  int nbit, limit, counter, bit, reversed, assigned;

  nbit = 0;
  while (mpi.n_pe > (1 << nbit)) {
    nbit++;
  }
  limit = 1 << nbit;

  assigned = 0;
  for (counter = 1; counter <= limit; counter++) {
    /* mirror the low nbit bits of counter */
    reversed = 0;
    for (bit = 0; bit < nbit; bit++) {
      if (counter & (1 << bit)) {
	reversed |= 1 << (nbit-bit-1);
      }
    }

    if (reversed < mpi.n_pe) {
      lc->cell[assigned].pe = reversed;
      assigned++;
      if (assigned >= lc->n_cell) break;
    }
  }
}


/*
 * Assign cells to PEs cyclically: cell index modulo mpi.n_pe.
 */
void SDMD_assign_cells_round_robin(LINKED_CELL *lc)
{
  int icell = 0;

  while (icell < lc->n_cell) {
    lc->cell[icell].pe = icell % mpi.n_pe;
    icell++;
  }
}

/*
 * Entry point of the recursive bisection: start with all PEs
 * (0 .. mpi.n_pe-1) and the whole grid (0 .. n_grid[i]-1 on each axis),
 * beginning with direction 0.
 */
void SDMD_assign_cells_bisection(LINKED_CELL *lc)
{
  int pe_range[2], whole[3][2];
  int axis;

  for (axis = 0; axis < 3; axis++) {
    whole[axis][0] = 0;
    whole[axis][1] = lc->n_grid[axis]-1;
  }
  pe_range[0] = 0;
  pe_range[1] = mpi.n_pe-1;

  SDMD_assign_cells_bisection_rec(lc, pe_range, whole, 0);
}


/*
 * Recursive step of the bisection cell assignment.
 *
 *   pe[2]        inclusive range of PE numbers to distribute
 *   region[3][2] inclusive grid-index range per axis to be covered
 *   idir         axis tried first when looking for the best cut
 *
 * When the PE range has shrunk to a single PE, every cell of the region is
 * given to it.  Otherwise the PE range is split into pe1 (lower part) and
 * pe2 (upper part, n_pe/2 PEs), and the region is cut along the axis where
 * the achievable cell ratio is closest to n_pe1/n_pe; both halves are then
 * processed recursively, starting with the axis after the cut axis.
 */
void SDMD_assign_cells_bisection_rec(LINKED_CELL *lc,
				     int pe[2], int region[3][2], int idir)
{
  int region1[3][2], region2[3][2], pe1[2], pe2[2], n, i, j;
  int n_pe, n_pe1, n_pe2;
  int n1, min_dir, min_n1, dir;
  double ratio_pe1, diff_ratio1, diff_ratio11, min;
  
  /* leaf: one PE left, it owns the whole region */
  if (pe[0]==pe[1]) {
    int ix, iy, iz;
    for (ix=region[0][0];ix<=region[0][1];ix++)
      for (iy=region[1][0];iy<=region[1][1];iy++)
	for (iz=region[2][0];iz<=region[2][1];iz++) {
	  lc->cell[CELL_INDEX(lc,ix,iy,iz)].pe = pe[0];
	  /* lc->cell_index[ix][iy][iz].pe = pe[0]; */
	  /* lprintf("%d %d %d: %d\n", ix, iy, iz, pe[0]); */
	}
    return;
  }
  
  /* split the PE range: the upper part gets n_pe/2 PEs, the lower part
     the rest */
  n_pe = pe[1] - pe[0] + 1;
  pe2[1] = pe[1];
  pe2[0] = pe2[1] + 1 - n_pe/2;
  
  pe1[0] = pe[0];
  pe1[1] = pe2[0] - 1;

  n_pe1 = pe1[1] - pe1[0] + 1;
  n_pe2 = pe2[1] - pe2[0] + 1;
  ratio_pe1 = (double) n_pe1 / n_pe;

  /* lprintf("%d %d %f\n", n_pe1, n_pe2, ratio_pe1); */

  /* try the three axes (starting at idir) and keep the cut whose cell
     fraction n1/n is closest to ratio_pe1; on ties the first axis tried
     wins because the comparison below is strict */
  min = 1000.0;
  for (i=0;i<3;i++) {
    dir = (idir + i) % 3;
    n  = region[dir][1]-region[dir][0] + 1;
    n1 = n*ratio_pe1;
    if (n1<0)  n1 = 0;
    if (n1>=n) n1 = n-1;
    
    /* compare rounding down (n1) with rounding up (n1+1) */
    diff_ratio1  = fabs((double) n1/n - ratio_pe1);
    diff_ratio11 = fabs((double) (n1+1)/n - ratio_pe1);
    if (diff_ratio1 > diff_ratio11) {
      n1++;
      diff_ratio1  = diff_ratio11;
    }
    if (min > diff_ratio1) {
      min = diff_ratio1;
      min_dir = dir;
      min_n1 = n1;
    }
  }
  
  n  = region[min_dir][1]-region[min_dir][0] + 1;
  /* lprintf("%d %d %d\n", min_dir, min_n1, n-min_n1); */
  
  /* region1 = first min_n1 layers along min_dir, region2 = the rest */
  for (i=0;i<3;i++) {
    for (j=0;j<2;j++) {
      region1[i][j]=region2[i][j]=region[i][j];
    }
  }
  region1[min_dir][1]=region[min_dir][0]-1 + min_n1;
  region2[min_dir][0]=region1[min_dir][1]+1;

  /* an empty half is skipped, so its PEs receive no cells */
  if (min_n1 > 0)
    SDMD_assign_cells_bisection_rec(lc,pe1,region1,(min_dir+1)%3);
  if (n-min_n1 > 0)
    SDMD_assign_cells_bisection_rec(lc,pe2,region2,(min_dir+1)%3);
}

/*
 * Per-PE summary used when renumbering PEs:
 *   pe          PE number
 *   n_cell      number of cells owned by that PE
 *   ix, iy, iz  sums of the grid indices of those cells
 */
typedef struct {
  int pe;
  int n_cell;
  int ix, iy, iz;
} PE_CELL;

/*
 * Ordering of PE_CELL entries: PEs without cells sort after all others
 * (two empty PEs compare equal); non-empty PEs are ordered by their mean
 * cell index in x, then y, then z, and finally by PE number.
 * Returns -1, 0 or 1.
 */
int SDMD_pe_cell_cmp(PE_CELL *p1, PE_CELL *p2)
{
  double mean1[3], mean2[3];
  int axis;

  if (p1->n_cell == 0 || p2->n_cell == 0) {
    if (p1->n_cell == 0 && p2->n_cell == 0) return 0;
    return (p1->n_cell == 0) ? 1 : -1;
  }

  mean1[0] = (double) (p1->ix) / p1->n_cell;
  mean1[1] = (double) (p1->iy) / p1->n_cell;
  mean1[2] = (double) (p1->iz) / p1->n_cell;

  mean2[0] = (double) (p2->ix) / p2->n_cell;
  mean2[1] = (double) (p2->iy) / p2->n_cell;
  mean2[2] = (double) (p2->iz) / p2->n_cell;

  for (axis = 0; axis < 3; axis++) {
    if (mean1[axis] < mean2[axis]) return -1;
    if (mean1[axis] > mean2[axis]) return  1;
  }

  if (p1->pe != p2->pe) {
    return (p1->pe < p2->pe) ? -1 : 1;
  }
  return 0;
}

/*
 * Renumber the PEs that own cells.
 *
 * A PE_CELL summary (cell count and summed grid indices) is built for each
 * PE and sorted with SDMD_pe_cell_cmp().  The j-th entry of the sorted
 * table is then given the number of the j-th PE that owned cells before
 * sorting, and every lc->cell[].pe is rewritten accordingly.
 */
void SDMD_sort_pe_for_cells(LINKED_CELL *lc)
{
  char *func = "SDMD_sort_pe_for_cells";
  PE_CELL *sorted, *by_old;
  int ipe, icell, owner, rank_in_sorted;

  sorted = emalloc(func, sizeof(PE_CELL)*mpi.n_pe);
  by_old = emalloc(func, sizeof(PE_CELL)*mpi.n_pe);

  /* accumulate cell count and index sums per PE */
  for (ipe = 0; ipe < mpi.n_pe; ipe++) {
    sorted[ipe].pe = ipe;
    sorted[ipe].n_cell = 0;
    sorted[ipe].ix = 0;
    sorted[ipe].iy = 0;
    sorted[ipe].iz = 0;
  }
  for (icell = 0; icell < lc->n_cell; icell++) {
    owner = lc->cell[icell].pe;
    sorted[owner].n_cell++;
    sorted[owner].ix += lc->cell[icell].ix;
    sorted[owner].iy += lc->cell[icell].iy;
    sorted[owner].iz += lc->cell[icell].iz;
  }

  /* keep an unsorted copy indexed by the old PE number */
  memcpy(by_old,sorted,sizeof(PE_CELL)*mpi.n_pe);
  qsort(sorted, mpi.n_pe, sizeof(PE_CELL),
	(int (*) (const void *, const void *)) SDMD_pe_cell_cmp);

  /*
    after this loop,
     sorted[filled new].pe = old
     by_old[old].pe = new
  */
  rank_in_sorted = 0;
  for (ipe = 0; ipe < mpi.n_pe; ipe++) {
    if (by_old[ipe].n_cell != 0) {
      by_old[sorted[rank_in_sorted].pe].pe = ipe;
      rank_in_sorted++;
    }
  }

  for (icell = 0; icell < lc->n_cell; icell++) {
    owner = lc->cell[icell].pe;
    lc->cell[icell].pe = by_old[owner].pe;
  }

  free(sorted);
  free(by_old);
}
#endif /* 0 */

/*
 * Build the linked list of atoms handled by this node
 * (ad->node_atom_h / ad->node_atom_n[]).
 *
 * The cells owned by this node are walked through lc->node_head, and the
 * atoms of each cell through cell.head / lc->next_atom[].  Atoms flagged
 * ATOM_CHILD are skipped at that level; they are appended right after
 * their ATOM_PARENT by following ad->ex[].child_list.
 *
 * If node_atom_n and node_fatom_n are the same array, node_fatom_h simply
 * mirrors node_atom_h; otherwise ATOM_DATA_set_node_fatom() is called.
 */
void SDMD_assign_node_atom(LINKED_CELL *lc, ATOM_DATA *ad)
{
  int icell, iatom, child;
  int tail = -1;   /* last atom appended so far, -1 while the list is empty */

  /* gather atom indices belonging to this node */
  for (icell = lc->node_head; icell >= 0; icell = lc->cell[icell].node_next) {
    for (iatom = lc->cell[icell].head; iatom >= 0; iatom = lc->next_atom[iatom]) {

      /* Children are appended by their parent below. */
      if (!(ad->ex[iatom].flag & ATOM_CHILD)) {

	/* ATOM_PARENT or other */
	if (tail >= 0) {
	  ad->node_atom_n[tail] = iatom;
	} else {
	  ad->node_atom_h = iatom;
	}
	tail = iatom;

	if (ad->ex[iatom].flag & ATOM_PARENT) {
	  child = ad->ex[iatom].child_list;
	  while (child >= 0) {
	    ad->node_atom_n[tail] = child;
	    tail = child;
	    child = ad->ex[child].child_list;
	  }
	}
      }
    }
  }

  /* terminate the list (or mark it empty) */
  if (tail >= 0) {
    ad->node_atom_n[tail] = -1;
  } else {
    ad->node_atom_h = -1;
  }

  if (ad->node_atom_n != ad->node_fatom_n) {
    /* There are some rigid molecules. */
    ATOM_DATA_set_node_fatom(ad);
  } else {
    /* No rigid molecules. All atoms are flexible. */
    ad->node_fatom_h = ad->node_atom_h;
  }
}

/*
 * Build the linked list of rigid molecules handled by this node
 * (md->node_rmol_h / md->mol[].node_rmol_n).
 *
 * The node's atom list is scanned; every atom flagged both ATOM_RIGID and
 * ATOM_PARENT contributes the rigid molecule index stored in
 * ad->ex[i].parent.  The list is left empty (-1) when md->n_mol is 0.
 *
 * An index outside [0, md->n_mol) is an internal error: it is reported and
 * the run is aborted, because using it would index md->mol[] out of
 * bounds.
 */
void SDMD_assign_node_rmol(RMOL_DATA *md, ATOM_DATA *ad)
{
  int i, cur, i_rmol;

  md->node_rmol_h = cur = -1;

  if (md->n_mol == 0) return;
    
  for (i=ad->node_atom_h;i>=0;i=ad->node_atom_n[i]) {
    if ((ad->ex[i].flag & ATOM_RIGID) && (ad->ex[i].flag & ATOM_PARENT)) {
      i_rmol = ad->ex[i].parent;
      if (i_rmol<0||i_rmol>=md->n_mol) {
	printf("ERROR: (internal) assign_node_rmol %d %d\n",i_rmol,md->n_mol);
	/* do not fall through to md->mol[i_rmol] with a bad index */
	marble_exit(1);
      }
      /* append i_rmol to the node-local list */
      if (cur < 0) {
	md->node_rmol_h = i_rmol;
      } else {
	md->mol[cur].node_rmol_n = i_rmol;
      }
      md->mol[i_rmol].node_rmol_n = -1;
      cur = i_rmol;
    }
  }
}

/*
 * Build the linked list of RATTLE groups handled by this node
 * (rt->node_grp_h / rt->grp[].node_grp_n) and then call RATTLE_calc_rg()
 * and RATTLE_calc_vg().  Does nothing when rt->flag is not set.
 *
 * The node's atom list is scanned; every atom flagged both ATOM_RATTLE and
 * ATOM_PARENT contributes the group index stored in ad->ex[i].parent.
 *
 * An index outside [0, rt->n_grp) is an internal error: it is reported and
 * the run is aborted, because using it would index rt->grp[] out of
 * bounds.
 */
void SDMD_assign_node_rattle(RATTLE *rt, ATOM_DATA *ad)
{
  int i, cur, igrp;

  if (!rt->flag) return;
  
  rt->node_grp_h = cur = -1;
    
  for (i=ad->node_atom_h;i>=0;i=ad->node_atom_n[i]) {
    if ((ad->ex[i].flag & ATOM_RATTLE) && (ad->ex[i].flag & ATOM_PARENT)) {
      igrp = ad->ex[i].parent;
      if (igrp<0||igrp>=rt->n_grp) {
	printf("ERROR: (internal) assign_node_rattle %d %d\n",igrp,rt->n_grp);
	/* do not fall through to rt->grp[igrp] with a bad index */
	marble_exit(1);
      }
      /* append igrp to the node-local list */
      if (cur < 0) {
	rt->node_grp_h = igrp;
      } else {
	rt->grp[cur].node_grp_n = igrp;
      }
      rt->grp[igrp].node_grp_n = -1;
      cur = igrp;
    }
  }

  RATTLE_calc_rg(rt,ad);
  RATTLE_calc_vg(rt,ad);
}


/*
 * Reset the request word of every cell to 0.  When TR_LIST_ATOM_REQ is
 * defined, the ATOM_REQ bits of every per-atom request byte are cleared
 * as well.
 */
void SDMD_clear_cell_req(LINKED_CELL *lc, ATOM_DATA *ad)
{
  int n;

  n = 0;
  while (n < lc->n_cell) {
    lc->cell[n].req = 0;
    n++;
  }

#ifdef TR_LIST_ATOM_REQ
  n = 0;
  while (n < ad->natom) {
    /* ad->ex[n].flag &= ~ATOM_REQ; */
    lc->atom_req[n] &= ~ATOM_REQ;
    n++;
  }
#endif  
}

/*
 * Clear only the CELL_REQ_INTERNAL bits of every cell's request word,
 * leaving the other bits untouched.  When TR_LIST_ATOM_REQ is defined,
 * the ATOM_REQ bits of every per-atom request byte are cleared as well.
 */
void SDMD_clear_cell_req_internal(LINKED_CELL *lc, ATOM_DATA *ad)
{
  int n;

  n = 0;
  while (n < lc->n_cell) {
    lc->cell[n].req &= ~CELL_REQ_INTERNAL;
    n++;
  }

#ifdef TR_LIST_ATOM_REQ
  n = 0;
  while (n < ad->natom) {
    /* ad->ex[n].flag &= ~ATOM_REQ; */
    lc->atom_req[n] &= ~ATOM_REQ;
    n++;
  }
#endif  
}
  
/*
 * Mark every cell owned by this node with CELL_REQ_HOME, then run the
 * per-term assignment routines in a fixed order: bonds, angles,
 * dihedrals, cmap terms and finally the Ewald correction terms.
 */
void SDMD_assign_internal_to_PE(LINKED_CELL *lc, ATOM_DATA *ad, BOND_DATA *bd,
				ANGLE_DATA *and, DIHEDRAL_DATA *dd, EWALD *ew)
{
  int icell = lc->node_head;

  /* walk the node-local cell list */
  while (icell >= 0) {
    lc->cell[icell].req |= CELL_REQ_HOME;
    icell = lc->cell[icell].node_next;
  }

  SDMD_assign_bond_to_PE(lc, bd, ad);
  /* add_dtime(&lc->time[SDMD_TIME_AS_BOND]); */

  SDMD_assign_angle_to_PE(lc, and, ad);
  /* add_dtime(&lc->time[SDMD_TIME_AS_ANGLE]); */

  SDMD_assign_dihedral_to_PE(lc, dd, ad);
  SDMD_assign_cmap_to_PE(lc, dd, ad);
  /* add_dtime(&lc->time[SDMD_TIME_AS_DIHED]); */

  SDMD_EW_assign_cor_to_PE(ew,lc, ad);
  /* add_dtime(&lc->time[SDMD_TIME_AS_EWALD]); */
}

/*
 * Decide for every bond whether this node evaluates it and build the two
 * node-local bond lists.
 *
 * Bonds flagged RATTLE_FLAG go to the node owning the cell of the
 * ATOM_PARENT atom of atom1 (atom1 itself if it is the parent) and are
 * chained through bd->rhead / rnext.
 *
 * All other bonds go to the node owning the cell that SDMD_cellcmp()
 * ranks first among the cells of atom1 and atom2; they are chained through
 * bd->head / next, and both cells get CELL_REQ_BOND (and, with
 * TR_LIST_ATOM_REQ, both atoms get ATOM_REQ_INT).
 *
 * Bonds handled elsewhere are flagged BOND_OTHER_NODE; for bonds handled
 * here that flag is cleared.
 */
void SDMD_assign_bond_to_PE(LINKED_CELL *lc, BOND_DATA *bd, ATOM_DATA *ad)
{
  int ib, key_atom, cell_a, cell_b, first, second;
  int tail = -1, rtail = -1;   /* last entries of the two lists */

  bd->head = bd->rhead = -1;

  for (ib = 0; ib < bd->n_bond; ib++) {

    if (bd->bonds[ib].flag & RATTLE_FLAG) {
      /* the case that atom1 is ATOM_PARENT must be included */
      if (ad->ex[bd->bonds[ib].atom1].flag & ATOM_PARENT) {
	key_atom = bd->bonds[ib].atom1;
      } else {
	key_atom = ad->ex[bd->bonds[ib].atom1].parent;
      }

      first = lc->atom_cell[key_atom];
      if (lc->cell[first].pe != mpi.rank) {
	bd->bonds[ib].flag |= BOND_OTHER_NODE;
	continue;
      }

      bd->bonds[ib].flag &= ~BOND_OTHER_NODE;
      bd->bonds[ib].rnext = -1;
      if (rtail < 0) {
	bd->rhead = ib;
      } else {
	bd->bonds[rtail].rnext = ib;
      }
      rtail = ib;
      continue;
    }

    /* in the case of normal bonds */
    cell_a = lc->atom_cell[bd->bonds[ib].atom1];
    cell_b = lc->atom_cell[bd->bonds[ib].atom2];

    /* neither end lives here: nothing to do on this node */
    if (lc->cell[cell_a].pe != mpi.rank &&
	lc->cell[cell_b].pe != mpi.rank) {
      bd->bonds[ib].flag |= BOND_OTHER_NODE;
      continue;
    }

    if (SDMD_cellcmp(lc,cell_a,cell_b)) {
      first = cell_a; second = cell_b;
    } else {
      first = cell_b; second = cell_a;
    }

    if (lc->cell[first].pe != mpi.rank) {
      bd->bonds[ib].flag |= BOND_OTHER_NODE;
      continue;
    }

    bd->bonds[ib].flag &= ~BOND_OTHER_NODE;
    bd->bonds[ib].next = -1;
    if (tail < 0) {
      bd->head = ib;
    } else {
      bd->bonds[tail].next = ib;
    }
    tail = ib;

    lc->cell[first].req  |= CELL_REQ_BOND;
    lc->cell[second].req |= CELL_REQ_BOND;

#ifdef TR_LIST_ATOM_REQ
    lc->atom_req[bd->bonds[ib].atom1] |= ATOM_REQ_INT;
    lc->atom_req[bd->bonds[ib].atom2] |= ATOM_REQ_INT;
#endif	
  }
}

/*
 * Select the angles handled by this rank (mpi.rank) and chain them into the
 * list that starts at and->head (linked through .next).
 *
 * An angle is taken when the cell that SDMD_cellcmp() ranks first among the
 * cells of its three atoms belongs to this rank.  The three cells are then
 * tagged with CELL_REQ_ANGLE.  Angles that are not taken get
 * ANGLE_OTHER_NODE set; taken angles have it cleared.
 */
void SDMD_assign_angle_to_PE(LINKED_CELL *lc, ANGLE_DATA *and, ATOM_DATA *ad)
{
  int ia, a1, a2, a3, c1, c2, c3, owner;
  int tail = -1;   /* last element of the chain */

  and->head = -1;

  for (ia = 0; ia < and->n_angle; ia++) {
    a1 = and->angles[ia].atom1;
    a2 = and->angles[ia].atom2;
    a3 = and->angles[ia].atom3;
    c1 = lc->atom_cell[a1];
    c2 = lc->atom_cell[a2];
    c3 = lc->atom_cell[a3];

    /* no atom of this angle lives on this rank */
    if (lc->cell[c1].pe != mpi.rank &&
        lc->cell[c2].pe != mpi.rank &&
        lc->cell[c3].pe != mpi.rank) {
      and->angles[ia].flag |= ANGLE_OTHER_NODE;
      continue;
    }

    /* cell that decides which rank computes the angle */
    if (SDMD_cellcmp(lc, c1, c2) && SDMD_cellcmp(lc, c1, c3))
      owner = c1;
    else if (SDMD_cellcmp(lc, c2, c3))
      owner = c2;
    else
      owner = c3;

    if (lc->cell[owner].pe != mpi.rank) {
      and->angles[ia].flag |= ANGLE_OTHER_NODE;
      continue;
    }

    and->angles[ia].flag &= ~ANGLE_OTHER_NODE;
    and->angles[ia].next = -1;
    if (and->head < 0)
      and->head = ia;
    else
      and->angles[tail].next = ia;
    tail = ia;

    lc->cell[c1].req |= CELL_REQ_ANGLE;
    lc->cell[c2].req |= CELL_REQ_ANGLE;
    lc->cell[c3].req |= CELL_REQ_ANGLE;

#ifdef TR_LIST_ATOM_REQ
    lc->atom_req[a1] |= ATOM_REQ_INT;
    lc->atom_req[a2] |= ATOM_REQ_INT;
    lc->atom_req[a3] |= ATOM_REQ_INT;
#endif
  }
}

/*
 * Select the dihedrals handled by this rank (mpi.rank) and chain them into
 * the list that starts at dd->head (linked through .next).
 *
 * atom3 and atom4 may be stored negated; their absolute value is used as the
 * atom index.  A dihedral is taken when the cell that SDMD_cellcmp() ranks
 * first among the cells of its four atoms belongs to this rank.  The four
 * cells are then tagged with CELL_REQ_DIHED.  Dihedrals that are not taken
 * get DIHED_OTHER_NODE set; taken ones have it cleared.
 */
void SDMD_assign_dihedral_to_PE(LINKED_CELL *lc, DIHEDRAL_DATA *dd, ATOM_DATA *ad)
{
  int id, a1, a2, a3, a4, c1, c2, c3, c4, owner;
  int tail = -1;   /* last element of the chain */

  dd->head = -1;

  for (id = 0; id < dd->n_dihedral; id++) {
    a1 = dd->dihedrals[id].atom1;
    a2 = dd->dihedrals[id].atom2;
    a3 = dd->dihedrals[id].atom3;
    a4 = dd->dihedrals[id].atom4;
    if (a3 < 0) a3 = -a3;
    if (a4 < 0) a4 = -a4;

    c1 = lc->atom_cell[a1];
    c2 = lc->atom_cell[a2];
    c3 = lc->atom_cell[a3];
    c4 = lc->atom_cell[a4];

    /* no atom of this dihedral lives on this rank */
    if (lc->cell[c1].pe != mpi.rank &&
        lc->cell[c2].pe != mpi.rank &&
        lc->cell[c3].pe != mpi.rank &&
        lc->cell[c4].pe != mpi.rank) {
      dd->dihedrals[id].flag |= DIHED_OTHER_NODE;
      continue;
    }

    /* cell that decides which rank computes the dihedral */
    if (SDMD_cellcmp(lc, c1, c2) &&
        SDMD_cellcmp(lc, c1, c3) &&
        SDMD_cellcmp(lc, c1, c4))
      owner = c1;
    else if (SDMD_cellcmp(lc, c2, c3) &&
             SDMD_cellcmp(lc, c2, c4))
      owner = c2;
    else if (SDMD_cellcmp(lc, c3, c4))
      owner = c3;
    else
      owner = c4;

    if (lc->cell[owner].pe != mpi.rank) {
      dd->dihedrals[id].flag |= DIHED_OTHER_NODE;
      continue;
    }

    dd->dihedrals[id].flag &= ~DIHED_OTHER_NODE;
    dd->dihedrals[id].next = -1;
    if (dd->head < 0)
      dd->head = id;
    else
      dd->dihedrals[tail].next = id;
    tail = id;

    lc->cell[c1].req |= CELL_REQ_DIHED;
    lc->cell[c2].req |= CELL_REQ_DIHED;
    lc->cell[c3].req |= CELL_REQ_DIHED;
    lc->cell[c4].req |= CELL_REQ_DIHED;

#ifdef TR_LIST_ATOM_REQ
    lc->atom_req[a1] |= ATOM_REQ_INT;
    lc->atom_req[a2] |= ATOM_REQ_INT;
    lc->atom_req[a3] |= ATOM_REQ_INT;
    lc->atom_req[a4] |= ATOM_REQ_INT;
#endif
  }
}


/*
 * Select the CMAP terms handled by this rank (mpi.rank) and chain them into
 * the list that starts at dd->head_cmap (linked through .next).
 *
 * Each term has eight atoms.  The eight cells are insertion-sorted with
 * SDMD_cellcmp(); the term is taken when the cell that ends up first belongs
 * to this rank.  All eight cells are then tagged with CELL_REQ_DIHED.  Terms
 * that are not taken get DIHED_OTHER_NODE set; taken ones have it cleared.
 */
void SDMD_assign_cmap_to_PE(LINKED_CELL *lc, DIHEDRAL_DATA *dd, ATOM_DATA *ad)
{
  int ic, n, slot, key, touches_me;
  int cells[8];
  int tail = -1;   /* last element of the chain */

  dd->head_cmap = -1;

  for (ic = 0; ic < dd->n_cmap; ic++) {
    /* look up the cell of every atom and note whether any is local */
    touches_me = 0;
    for (n = 0; n < 8; n++) {
      cells[n] = lc->atom_cell[dd->cmap[ic].atom[n]];
      if (lc->cell[cells[n]].pe == mpi.rank)
        touches_me = 1;
    }
    if (!touches_me) {
      dd->cmap[ic].flag |= DIHED_OTHER_NODE;
      continue;
    }

    /* insertion sort: an element moves left past every entry it compares
       "first" against, including equal ones */
    for (n = 1; n < 8; n++) {
      key = cells[n];
      for (slot = n; slot > 0 && SDMD_cellcmp(lc, key, cells[slot-1]); slot--)
        cells[slot] = cells[slot-1];
      cells[slot] = key;
    }

    if (lc->cell[cells[0]].pe != mpi.rank) {
      dd->cmap[ic].flag |= DIHED_OTHER_NODE;
      continue;
    }

    dd->cmap[ic].flag &= ~DIHED_OTHER_NODE;
    dd->cmap[ic].next = -1;
    if (dd->head_cmap < 0)
      dd->head_cmap = ic;
    else
      dd->cmap[tail].next = ic;
    tail = ic;

    for (n = 0; n < 8; n++) {
      lc->cell[cells[n]].req |= CELL_REQ_DIHED;
#ifdef TR_LIST_ATOM_REQ
      lc->atom_req[dd->cmap[ic].atom[n]] |= ATOM_REQ_INT;
#endif
    }
  }
}

/*
void SDMD_assign_cmap_to_PE_static(LINKED_CELL *lc, DIHEDRAL_DATA *dd, ATOM_DATA *ad)
{
  int i;
  for (i=0;i<dd->n_cmap;i++) {
    SDMD_setup_ex_tr_atom(lc, 8, dd->cmap[i].atom);
  }
}
*/


/*
 * Ordering predicate on cells used to pick the cell that decides which rank
 * computes a bonded term.
 *
 * Returns 1 when icell and jcell are the same cell.  Otherwise the grid
 * index differences (icell - jcell) are examined in x, y, z order; each
 * difference is shifted by +n_grid when below -n_grid_h and by -n_grid when
 * above +n_grid_h.  The first non-zero difference decides: negative -> 1,
 * positive -> 0.  If all three differences are zero the result is 1.
 */
int SDMD_cellcmp(LINKED_CELL *lc,int icell,int jcell)
{
  int diff[3], axis;

  if (icell == jcell) return 1;

  diff[0] = lc->cell[icell].ix - lc->cell[jcell].ix;
  diff[1] = lc->cell[icell].iy - lc->cell[jcell].iy;
  diff[2] = lc->cell[icell].iz - lc->cell[jcell].iz;

  for (axis = 0; axis < 3; axis++) {
    if (diff[axis] < -lc->n_grid_h[axis]) diff[axis] += lc->n_grid[axis];
    if (diff[axis] >  lc->n_grid_h[axis]) diff[axis] -= lc->n_grid[axis];

    if (diff[axis] != 0)
      return (diff[axis] < 0) ? 1 : 0;
  }
  return 1;
}

/*
 * Half shell: every cell pair is given to the rank that owns its first
 * cell (cell_pair[].i).
 */
void SDMD_assign_cellpair_HS(LINKED_CELL *lc)
{
  int ip;
  CELL_PAIR *pair;

  for (ip = 0; ip < lc->n_cell_pair; ip++) {
    pair = &lc->cell_pair[ip];
    pair->pe = lc->cell[pair->i].pe;
  }
}

/*
 * Neutral territory: every cell pair is given to the rank that owns the cell
 * whose x and y grid indices come from one cell of the pair and whose z index
 * comes from the other.
 *
 * The first cell's indices are first shifted by the periodic image offset
 * selected by cell_pair[].offset (0..26, encoding a shift of -1/0/+1 box
 * lengths along x, y and z).  The two cells are then compared
 * lexicographically on (x, y, z): the one that compares lower supplies x and
 * y, the other supplies z.  The resulting index is wrapped back into the grid
 * once per axis before the owner is looked up.
 *
 * Fix: the y comparison used to test "iy < jy" twice, so a pair with equal x
 * and iy > jy fell through to the z comparison instead of selecting the
 * second cell.
 */
void SDMD_assign_cellpair_NT(LINKED_CELL *lc)
{
  int i, ix, iy, iz, jx, jy, jz, px, py, pz, flag;
  int offset[27][3];
  CELL *icell, *jcell;
  CELL_PAIR *cp;

  /* table of periodic image shifts, in units of grid cells */
  for (ix=-1;ix<=1;ix++) {
    for (iy=-1;iy<=1;iy++) {
      for (iz=-1;iz<=1;iz++) {
        i = (iz+1) + 3*((iy+1) + 3*(ix+1));
        offset[i][0] = ix * lc->n_grid[0];
        offset[i][1] = iy * lc->n_grid[1];
        offset[i][2] = iz * lc->n_grid[2];
      }
    }
  }

  for (i=0;i<lc->n_cell_pair;i++) {
    cp = &lc->cell_pair[i];
    icell = &lc->cell[cp->i];
    jcell = &lc->cell[cp->j];

    ix = icell->ix + offset[cp->offset][0];
    iy = icell->iy + offset[cp->offset][1];
    iz = icell->iz + offset[cp->offset][2];

    jx = jcell->ix;
    jy = jcell->iy;
    jz = jcell->iz;

    /* flag != 0 when the (shifted) first cell compares lower on (x, y, z) */
    if (ix != jx) {
      flag = (ix < jx);
    } else if (iy != jy) {
      flag = (iy < jy);
    } else {
      flag = (iz < jz);
    }

    if (flag) {
      px = ix;
      py = iy;
      pz = jz;
    } else {
      px = jx;
      py = jy;
      pz = iz;
    }

    /* wrap back into the grid (at most one box length out of range) */
    if (px < 0)              px += lc->n_grid[0];
    if (px >= lc->n_grid[0]) px -= lc->n_grid[0];

    if (py < 0)              py += lc->n_grid[1];
    if (py >= lc->n_grid[1]) py -= lc->n_grid[1];

    if (pz < 0)              pz += lc->n_grid[2];
    if (pz >= lc->n_grid[2]) pz -= lc->n_grid[2];

    cp->pe = lc->cell[CELL_INDEX(lc, px,py,pz)].pe;
  }
}

/*
 * Eighth shell: every cell pair is given to the rank that owns the cell whose
 * grid index is, per axis, the smaller of the two cells' indices.
 *
 * The first cell's indices are shifted by the periodic image offset selected
 * by cell_pair[].offset (0..26, encoding a shift of -1/0/+1 box lengths along
 * x, y and z) before the comparison; the result is wrapped back into the grid
 * once per axis.
 */
void SDMD_assign_cellpair_ES(LINKED_CELL *lc)
{
  int n, px, py, pz, sx, sy, sz;
  int shift[27][3];
  CELL *first, *second;
  CELL_PAIR *pair;

  /* entry n corresponds to (x,y,z) image (n/9 - 1, (n/3)%3 - 1, n%3 - 1) */
  for (n = 0; n < 27; n++) {
    shift[n][0] = (n/9 - 1)     * lc->n_grid[0];
    shift[n][1] = ((n/3)%3 - 1) * lc->n_grid[1];
    shift[n][2] = (n%3 - 1)     * lc->n_grid[2];
  }

  for (n = 0; n < lc->n_cell_pair; n++) {
    pair   = &lc->cell_pair[n];
    first  = &lc->cell[pair->i];
    second = &lc->cell[pair->j];

    sx = first->ix + shift[pair->offset][0];
    sy = first->iy + shift[pair->offset][1];
    sz = first->iz + shift[pair->offset][2];

    /* per-axis minimum of the two cells */
    px = (sx < second->ix) ? sx : second->ix;
    py = (sy < second->iy) ? sy : second->iy;
    pz = (sz < second->iz) ? sz : second->iz;

    if (px < 0)              px += lc->n_grid[0];
    if (px >= lc->n_grid[0]) px -= lc->n_grid[0];

    if (py < 0)              py += lc->n_grid[1];
    if (py >= lc->n_grid[1]) py -= lc->n_grid[1];

    if (pz < 0)              pz += lc->n_grid[2];
    if (pz >= lc->n_grid[2]) pz -= lc->n_grid[2];

    pair->pe = lc->cell[CELL_INDEX(lc, px,py,pz)].pe;
  }
}

/*
 * Mid point: every cell pair is given to the rank that owns the cell found
 * at floor(((i+0.49) + (j+0.49)) * 0.5) along each axis, where i and j are
 * the grid indices of the two cells.
 *
 * The first cell's indices are shifted by the periodic image offset selected
 * by cell_pair[].offset (0..26, encoding a shift of -1/0/+1 box lengths along
 * x, y and z) before averaging; the result is wrapped back into the grid once
 * per axis.
 */
void SDMD_assign_cellpair_MP(LINKED_CELL *lc)
{
  int n, axis, mid[3], a[3], b[3];
  int shift[27][3];
  CELL *first, *second;
  CELL_PAIR *pair;

  /* entry n corresponds to (x,y,z) image (n/9 - 1, (n/3)%3 - 1, n%3 - 1) */
  for (n = 0; n < 27; n++) {
    shift[n][0] = (n/9 - 1)     * lc->n_grid[0];
    shift[n][1] = ((n/3)%3 - 1) * lc->n_grid[1];
    shift[n][2] = (n%3 - 1)     * lc->n_grid[2];
  }

  for (n = 0; n < lc->n_cell_pair; n++) {
    pair   = &lc->cell_pair[n];
    first  = &lc->cell[pair->i];
    second = &lc->cell[pair->j];

    a[0] = first->ix + shift[pair->offset][0];
    a[1] = first->iy + shift[pair->offset][1];
    a[2] = first->iz + shift[pair->offset][2];

    b[0] = second->ix;
    b[1] = second->iy;
    b[2] = second->iz;

    for (axis = 0; axis < 3; axis++) {
      mid[axis] = (int) floor(((a[axis]+0.49) + (b[axis]+0.49))*0.5);

      if (mid[axis] < 0)                 mid[axis] += lc->n_grid[axis];
      if (mid[axis] >= lc->n_grid[axis]) mid[axis] -= lc->n_grid[axis];
    }

    pair->pe = lc->cell[CELL_INDEX(lc, mid[0],mid[1],mid[2])].pe;
  }
}


/*
 * Assign every cell pair to a rank and collect the pairs of this rank.
 *
 * The assignment rule is the mid point rule for TR_MODE_MP and the eighth
 * shell rule for every other lc->tr_mode.  Afterwards:
 *   - lc->cell_pair_req and each lc->td[t].cell_pair_req are allocated with
 *     room for n_cell_pair ints and their counters are reset to 0;
 *   - the pairs whose .pe equals mpi.rank are chained from lc->pair_head
 *     (linked through .next), recorded in lc->cell_pair_req, and both of
 *     their cells are tagged with CELL_REQ_NONBOND.
 */
void SDMD_assign_cellpair_to_PE(LINKED_CELL *lc)
{
  int ip, t;
  int tail = -1;   /* last element of the pair chain */
  CELL_PAIR *pair;

  if (lc->tr_mode == TR_MODE_ES || lc->tr_mode != TR_MODE_MP)
    SDMD_assign_cellpair_ES(lc);
  else
    SDMD_assign_cellpair_MP(lc);

  lc->cell_pair_req = emalloc("SDMD_assign_cellpair_to_PE", sizeof(int)*lc->n_cell_pair);
  lc->n_cell_pair_req = 0;
  for (t = 0; t < mpi.n_threads; t++) {
    lc->td[t].cell_pair_req = emalloc("SDMD_assign_cellpair_to_PE", sizeof(int)*lc->n_cell_pair);
    lc->td[t].n_cell_pair_req = 0;
  }

  lc->pair_head = -1;

  for (ip = 0; ip < lc->n_cell_pair; ip++) {
    pair = &lc->cell_pair[ip];
    if (pair->pe != mpi.rank)
      continue;

    lc->cell[pair->i].req |= CELL_REQ_NONBOND;
    lc->cell[pair->j].req |= CELL_REQ_NONBOND;

    pair->next = -1;
    if (lc->pair_head == -1)
      lc->pair_head = ip;
    else
      lc->cell_pair[tail].next = ip;
    tail = ip;

    lc->cell_pair_req[lc->n_cell_pair_req++] = ip;
  }
}

/*
 * Flatten the atoms of all cells on the list starting at lc->req_head
 * (linked through cell[].req_next) into lc->atom_cell_req, walking each
 * cell's atom chain (cell[].head / lc->next_atom), and store the total in
 * lc->n_atom_cell_req.
 */
void SDMD_assign_atom_req(LINKED_CELL *lc)
{
  int cell_id, atom_id;
  int n_stored = 0;

  for (cell_id = lc->req_head; cell_id >= 0; cell_id = lc->cell[cell_id].req_next) {
    for (atom_id = lc->cell[cell_id].head; atom_id >= 0; atom_id = lc->next_atom[atom_id]) {
      lc->atom_cell_req[n_stored++] = atom_id;
    }
  }
  lc->n_atom_cell_req = n_stored;
}


/*
 * Put a TR_LIST_XYZ into its empty state: no partner rank (-1), zero atoms
 * and cells, and the atom-id / cell-id arrays initialised through RIA_init()
 * with the arguments 1000 and 10 respectively.
 */
void SDMD_tr_list_xyz_init(TR_LIST_XYZ *p)
{
  p->pe     = -1;
  p->n_atom = 0;
  p->n_cell = 0;

  RIA_init(&p->aid, 1000);
  RIA_init(&p->cid, 10);
}

/*
 * Grow the send/receive index arrays of lc->tr_list[pe] by one
 * TR_LIST_ALLOC_UNIT when n has reached the allocated size.  Only a single
 * unit is added per call.  The force-direction aliases are refreshed because
 * erealloc() may move the arrays: recv_f shares send_x and send_f shares
 * recv_x.
 */
void SDMD_check_alloc_tr_list(LINKED_CELL *lc, int pe, int n)
{
  char *func="SDMD_check_alloc_tr_list";
  TR_LIST *entry = &lc->tr_list[pe];

  if (n < entry->n_alloc)
    return;

  entry->n_alloc += TR_LIST_ALLOC_UNIT;
  entry->recv_x = erealloc(func, entry->recv_x, sizeof(int)*entry->n_alloc);
  entry->send_x = erealloc(func, entry->send_x, sizeof(int)*entry->n_alloc);
  entry->recv_f = entry->send_x;
  entry->send_f = entry->recv_x;
}

/*
 * Allocate the per-rank transfer lists, the force buffer and the six
 * directional TR_LIST_XYZ pairs.
 *
 * lc->tr_list gets one TR_LIST per rank.  For every rank other than
 * mpi.rank, recv_x and send_x are allocated with
 * natom / n_pe * lc->tr_list_factor entries, capped at natom.  The entry for
 * mpi.rank itself holds no arrays (NULL pointers).
 *
 * Fix: the entry for mpi.rank now also gets n_alloc = 0.  It used to be left
 * uninitialised, so SDMD_check_alloc_tr_list() on that entry would have
 * compared against an indeterminate value.
 *
 * In every entry recv_f aliases send_x and send_f aliases recv_x.
 *
 * For the directional lists, the "send x" list shares storage with the
 * "recv f" list and the "recv x" list with the "send f" list; each is an
 * array of two TR_LIST_XYZ initialised by SDMD_tr_list_xyz_init().
 */
void SDMD_alloc_tr_list(LINKED_CELL *lc, ATOM_DATA *ad)
{
  int i, n_alloc;
  char *func="SDMD_alloc_tr_list";

  lc->tr_list = emalloc(func,sizeof(TR_LIST)*mpi.n_pe);
  for (i=0;i<mpi.n_pe;i++) {
    if (i!=mpi.rank) {
      n_alloc = ad->natom / mpi.n_pe * lc->tr_list_factor;
      if (n_alloc > ad->natom) n_alloc = ad->natom;
      lc->tr_list[i].n_alloc = n_alloc;
      lc->tr_list[i].recv_x = emalloc(func,sizeof(int)*n_alloc);
      lc->tr_list[i].send_x = emalloc(func,sizeof(int)*n_alloc);
    } else {
      /* nothing is exchanged with ourselves: empty, but well-defined */
      lc->tr_list[i].n_alloc = 0;
      lc->tr_list[i].recv_x = lc->tr_list[i].send_x = NULL;
    }
    lc->tr_list[i].recv_f = lc->tr_list[i].send_x;
    lc->tr_list[i].send_f = lc->tr_list[i].recv_x;
  }
  lc->f = emalloc(func, sizeof(VEC)*ad->natom);

  /* tr_list_xyz: two entries per axis and direction */
  lc->sxl_x = lc->rfl_x = emalloc(func, sizeof(TR_LIST_XYZ)*2);
  lc->sxl_y = lc->rfl_y = emalloc(func, sizeof(TR_LIST_XYZ)*2);
  lc->sxl_z = lc->rfl_z = emalloc(func, sizeof(TR_LIST_XYZ)*2);
  lc->rxl_x = lc->sfl_x = emalloc(func, sizeof(TR_LIST_XYZ)*2);
  lc->rxl_y = lc->sfl_y = emalloc(func, sizeof(TR_LIST_XYZ)*2);
  lc->rxl_z = lc->sfl_z = emalloc(func, sizeof(TR_LIST_XYZ)*2);

  for (i=0;i<2;i++) {
    SDMD_tr_list_xyz_init(&lc->sxl_x[i]);
    SDMD_tr_list_xyz_init(&lc->sxl_y[i]);
    SDMD_tr_list_xyz_init(&lc->sxl_z[i]);

    SDMD_tr_list_xyz_init(&lc->rxl_x[i]);
    SDMD_tr_list_xyz_init(&lc->rxl_y[i]);
    SDMD_tr_list_xyz_init(&lc->rxl_z[i]);
  }
  /* end of tr_list_xyz */
}

/*
 * Debug dump: print, for every rank, each atom index this rank receives
 * from it (recv_x) and each atom index this rank sends to it (send_x).
 * The block under "#if 0" is a disabled summary variant.
 */
void SDMD_print_tr_list(LINKED_CELL *lc, ATOM_DATA *ad)
{
#if 0
  int pe, count_r, count_s, n_recv, n_send;
  count_r = count_s = n_recv = n_send = 0;
  for (pe=0;pe<mpi.n_pe;pe++) {
    n_recv += lc->tr_list[pe].n_recv_x;
    n_send += lc->tr_list[pe].n_send_x;
    if (lc->tr_list[pe].n_recv_x > 0) {
      count_r++;
    }
    if (lc->tr_list[pe].n_send_x > 0) {
      count_s++;
    }
  }
  printf("rank %02d : %d %d %d %d\n", mpi.rank, count_r, count_s, n_recv, n_send);
  marble_exit(1);
#endif

  int other, n;
  TR_LIST *entry;

  for (other = 0; other < mpi.n_pe; other++) {
    entry = &lc->tr_list[other];

    n = 0;
    while (n < entry->n_recv_x) {
      printf("rank %d -> %d : recv %d %d\n", other, mpi.rank, n, entry->recv_x[n]);
      n++;
    }

    n = 0;
    while (n < entry->n_send_x) {
      printf("rank %d -> %d : send %d %d\n", mpi.rank, other, n, entry->send_x[n]);
      n++;
    }
  }
}


void SDMD_setup_tr_list_xyz_ES(LINKED_CELL *lc, ATOM_DATA *ad)
{
  int i, j, k, l, ll, lll, m;
  int local_pe, pe[10], ipe;
  CELL_PE *cell_pe, *remote_cell_pe, *remote2_cell_pe;
  TR_LIST_XYZ *tr;

  local_pe = mpi.rank;
  cell_pe = &lc->cell_pe[local_pe];
  /* set pe to which data are sent or from which data are received */
  for (l=0;l<2;l++) {
    lc->sxl_x[l].pe = cell_pe->neighbor_pe[l][0][0];
    lc->sxl_y[l].pe = cell_pe->neighbor_pe[l][0][1];
    lc->sxl_z[l].pe = cell_pe->neighbor_pe[l][0][2];

    lc->rxl_x[l].pe = cell_pe->neighbor_pe[l][1][0];
    lc->rxl_y[l].pe = cell_pe->neighbor_pe[l][1][1];
    lc->rxl_z[l].pe = cell_pe->neighbor_pe[l][1][2];

  }
  
  /* x */
  /* send */
  for (l=0;l<2;l++) {
    tr = &lc->sxl_x[l];
    if (tr->pe < 0 || tr->pe == mpi.rank) {
      tr->n_cell = 0;
      continue;
    }
    tr->n_cell = cell_pe->n_cell;
    RIA_alloc(&tr->cid, tr->n_cell);
    for (k=0;k<cell_pe->n_cell;k++) {
      RIA_set(&tr->cid, k, cell_pe->cell[k]);
    }
  }
  /* recv */
  for (l=0;l<2;l++) {
    tr = &lc->rxl_x[l];
    if (tr->pe < 0 || tr->pe == mpi.rank) {
      tr->n_cell = 0;
      continue;
    }
    remote_cell_pe = &lc->cell_pe[tr->pe];
    tr->n_cell = remote_cell_pe->n_cell;
    RIA_alloc(&tr->cid, tr->n_cell);
    for (k=0;k<remote_cell_pe->n_cell;k++) {
      RIA_set(&tr->cid, k, remote_cell_pe->cell[k]);
    }
  }
  /* y */
  /* send */
  for (l=0;l<2;l++) {
    tr = &lc->sxl_y[l];
    if (tr->pe < 0 || tr->pe == mpi.rank) {
      tr->n_cell = 0;
      continue;
    }
    pe[0] = local_pe;
    ipe = 1;
    for (ll=0;ll<2;ll++) {
      pe[ipe]=cell_pe->neighbor_pe[ll][1][0];
      if (pe[ipe] < 0) break;
      ipe++;
    }
    tr->n_cell = 0;
    for (m=0;m<ipe;m++) {
      remote_cell_pe = &lc->cell_pe[pe[m]];
      tr->n_cell += remote_cell_pe->n_cell;
    }
    RIA_alloc(&tr->cid, tr->n_cell);
    j=0;
    for (m=0;m<ipe;m++) {
      remote_cell_pe = &lc->cell_pe[pe[m]];
      for (k=0;k<remote_cell_pe->n_cell;k++) {
	RIA_set(&tr->cid, j, remote_cell_pe->cell[k]);
	j++;
      }
    }
  }
  /* recv */
  for (l=0;l<2;l++) {
    tr = &lc->rxl_y[l];
    if (tr->pe < 0 || tr->pe == mpi.rank) {
      tr->n_cell = 0;
      continue;
    }
    pe[0] = tr->pe;
    remote_cell_pe = &lc->cell_pe[tr->pe];
    ipe = 1;
    for (ll=0;ll<2;ll++) {
      pe[ipe]=remote_cell_pe->neighbor_pe[ll][1][0];
      if (pe[ipe] < 0) break;
      ipe++;
    }
    tr->n_cell = 0;
    for (m=0;m<ipe;m++) {
      remote_cell_pe = &lc->cell_pe[pe[m]];
      tr->n_cell += remote_cell_pe->n_cell;
    }
    RIA_alloc(&tr->cid, tr->n_cell);
    j=0;
    for (m=0;m<ipe;m++) {
      remote_cell_pe = &lc->cell_pe[pe[m]];
      for (k=0;k<remote_cell_pe->n_cell;k++) {
	RIA_set(&tr->cid, j, remote_cell_pe->cell[k]);
	j++;
      }
    }
  }
  /* z */
  /* send */
  for (l=0;l<2;l++) {
    tr = &lc->sxl_z[l];
    if (tr->pe < 0 || tr->pe == mpi.rank) {
      tr->n_cell = 0;
      continue;
    }
    pe[0] = local_pe;
    ipe = 1;
    /* x direction */
    for (lll=0;lll<2;lll++) {
      pe[ipe]=cell_pe->neighbor_pe[lll][1][0];
      if (pe[ipe] < 0) break;
      ipe++;
    }
    /* y direction */
    for (ll=0;ll<2;ll++) {
      pe[ipe]=cell_pe->neighbor_pe[ll][1][1];
      if (pe[ipe] < 0) break;
      ipe++;
      remote_cell_pe = &lc->cell_pe[pe[ipe-1]];
      /* y-x direction */
      for (lll=0;lll<2;lll++) {
	pe[ipe]=remote_cell_pe->neighbor_pe[lll][1][0];
	if (pe[ipe] < 0) break;
	ipe++;
      }
    }
    tr->n_cell = 0;
    for (m=0;m<ipe;m++) {
      remote_cell_pe = &lc->cell_pe[pe[m]];
      tr->n_cell += remote_cell_pe->n_cell;
    }
    RIA_alloc(&tr->cid, tr->n_cell);
    j=0;
    for (m=0;m<ipe;m++) {
      remote_cell_pe = &lc->cell_pe[pe[m]];
      for (k=0;k<remote_cell_pe->n_cell;k++) {
	RIA_set(&tr->cid, j, remote_cell_pe->cell[k]);
	j++;
      }
    }
  }
  /* recv */
  for (l=0;l<2;l++) {
    tr = &lc->rxl_z[l];
    if (tr->pe < 0 || tr->pe == mpi.rank) {
      tr->n_cell = 0;
      continue;
    }
    pe[0] = tr->pe;
    ipe = 1;
    remote2_cell_pe = &lc->cell_pe[pe[0]];
    /* x direction */
    for (lll=0;lll<2;lll++) { 
      pe[ipe]=remote2_cell_pe->neighbor_pe[lll][1][0];
      if (pe[ipe] < 0) break;
      ipe++;
    }
    /* y direction */
    for (ll=0;ll<2;ll++) {
      pe[ipe]=remote2_cell_pe->neighbor_pe[ll][1][1];
      if (pe[ipe] < 0) break;
      ipe++;
      remote_cell_pe = &lc->cell_pe[pe[ipe-1]];
      /* y-x direction */
      for (lll=0;lll<2;lll++) { 
	pe[ipe]=remote_cell_pe->neighbor_pe[lll][1][0];
	if (pe[ipe] < 0) break;
	ipe++;
      }
    }
    tr->n_cell = 0;
    for (m=0;m<ipe;m++) {
      remote_cell_pe = &lc->cell_pe[pe[m]];
      tr->n_cell += remote_cell_pe->n_cell;
    }
    RIA_alloc(&tr->cid, tr->n_cell);
    j=0;
    for (m=0;m<ipe;m++) {
      remote_cell_pe = &lc->cell_pe[pe[m]];
      for (k=0;k<remote_cell_pe->n_cell;k++) {
	RIA_set(&tr->cid, j, remote_cell_pe->cell[k]);
	j++;
      }
    }
  }

  /*
  printf("rank %d (%d %d %d)-(%d %d %d): ", mpi.rank, cell_pe->min[0], cell_pe->min[1], cell_pe->min[2], cell_pe->max[0], cell_pe->max[1], cell_pe->max[2]);
  for (ll=0;ll<2;ll++) {
    for (l=0;l<3;l++) {
      for (lll=0;lll<2;lll++) {
	if      (l==0 && ll==0) tr = &lc->sxl_x[lll];
	else if (l==1 && ll==0) tr = &lc->sxl_y[lll];
	else if (l==2 && ll==0) tr = &lc->sxl_z[lll];
	if      (l==0 && ll==1) tr = &lc->rxl_x[lll];
	else if (l==1 && ll==1) tr = &lc->rxl_y[lll];
	else if (l==2 && ll==1) tr = &lc->rxl_z[lll];
	printf(" %d %d (", tr->pe, tr->n_cell);
	for (m=0;m<tr->n_cell;m++)
	  printf(" %d", tr->cid.data[m]);
	printf(")");
      }
    }
  }
  printf("\n");
  marble_exit(1);
  */
}

/*
 * Test whether index i lies in the window [min, max] on a periodic axis of
 * length width.
 *
 * Both i and max are taken relative to min; the relative i is then wrapped
 * once into [0, width) (one +width if negative, one -width if >= width).
 * The relative max is deliberately not wrapped (wrapping it was found to be
 * wrong for npx = 1).  Returns 1 when the wrapped i is <= the relative max,
 * 0 otherwise.
 *
 * Defined without `inline`: a plain `inline` definition provides no external
 * definition under C99/C11 rules, which makes the link fail whenever the
 * compiler does not inline a call.
 */
int SDMD_periodic_range(int i, int min, int max, int width)
{
  max -= min;
  i -= min;
  if (i < 0)      i += width;
  if (i >= width) i -= width;

  return (i <= max);
}


void SDMD_setup_tr_list_xyz_MP(LINKED_CELL *lc, ATOM_DATA *ad)
{
  int i, j, k, l, ll, lll, m, count;
  int local_pe, pe[10], ipe, jpe;
  CELL_PE *cell_pe, *remote_cell_pe, *remote2_cell_pe;
  TR_LIST_XYZ *tr;

  local_pe = mpi.rank;
  cell_pe = &lc->cell_pe[local_pe];
  /* set pe to which data are sent or from which data are received */
  for (l=0;l<2;l++) {
    lc->sxl_x[l].pe = cell_pe->neighbor_pe[0][l][0];
    lc->sxl_y[l].pe = cell_pe->neighbor_pe[0][l][1];
    lc->sxl_z[l].pe = cell_pe->neighbor_pe[0][l][2];

    lc->rxl_x[l].pe = cell_pe->neighbor_pe[0][l][0];
    lc->rxl_y[l].pe = cell_pe->neighbor_pe[0][l][1];
    lc->rxl_z[l].pe = cell_pe->neighbor_pe[0][l][2];
  }
  
  /* x */
  /* send */
  for (l=0;l<2;l++) {
    tr = &lc->sxl_x[l];
    if ((lc->neighbor[0]==1 && l==1) || tr->pe == mpi.rank) {
      tr->n_cell = 0;
      continue;
    }
    count = 0;
    for (k=0;k<cell_pe->n_cell;k++) {
      j=cell_pe->cell[k];
      if ((l==0 && lc->cell[j].ix == cell_pe->min[0]) ||
	  (l==1 && lc->cell[j].ix == cell_pe->max[0])) {
	count++;
      }
    }
    tr->n_cell = count;
    RIA_alloc(&tr->cid, count);
    count=0;
    for (k=0;k<cell_pe->n_cell;k++) {
      j=cell_pe->cell[k];
      if ((l==0 && lc->cell[j].ix == cell_pe->min[0]) ||
	  (l==1 && lc->cell[j].ix == cell_pe->max[0])) {
	RIA_set(&tr->cid, count, j);
	count++;
      }
    }
  }
  /* recv */
  for (l=0;l<2;l++) {
    tr = &lc->rxl_x[l];
    if ((lc->neighbor[0]==1 && l==0) || tr->pe == mpi.rank) {
      tr->n_cell = 0;
      continue;
    }
    remote_cell_pe = &lc->cell_pe[tr->pe];
    count = 0;
    for (k=0;k<remote_cell_pe->n_cell;k++) {
      j=remote_cell_pe->cell[k];
      if ((l==0 && lc->cell[j].ix == remote_cell_pe->max[0]) ||
	  (l==1 && lc->cell[j].ix == remote_cell_pe->min[0])) {
	count++;
      }
    }
    tr->n_cell = count;
    RIA_alloc(&tr->cid, count);
    count=0;
    for (k=0;k<remote_cell_pe->n_cell;k++) {
      j=remote_cell_pe->cell[k];
      if ((l==0 && lc->cell[j].ix == remote_cell_pe->max[0]) ||
	  (l==1 && lc->cell[j].ix == remote_cell_pe->min[0])) {
	RIA_set(&tr->cid, count, j);
	count++;
      }
    }
  }


  /* y */
  /* send */
  for (l=0;l<2;l++) {
    tr = &lc->sxl_y[l];
    if ((lc->neighbor[1]==1 && l==1) || tr->pe == mpi.rank) {
      tr->n_cell = 0;
      continue;
    }
    pe[0] = local_pe;
    ipe = 1;
    for (ll=0;ll<2;ll++) {  /* x direction */
      if (lc->neighbor[0]==1 && ll==0) continue;
      pe[ipe]=cell_pe->neighbor_pe[0][ll][0];
      ipe++;
    }
    /* To avoid double counts */
    if (pe[1]==pe[2]) {
      ipe--;
      if (pe[0]==pe[1]) 
	ipe--;
    }
    count = 0;
    for (m=0;m<ipe;m++) {
      remote_cell_pe = &lc->cell_pe[pe[m]];
      for (k=0;k<remote_cell_pe->n_cell;k++) {
	j=remote_cell_pe->cell[k];
	if (SDMD_periodic_range(lc->cell[j].ix, 
				cell_pe->min[0] - 1,
				cell_pe->max[0] + 1,
				lc->n_grid[0]) &&
	    ((l==0 && lc->cell[j].iy == remote_cell_pe->min[1]) ||
	     (l==1 && lc->cell[j].iy == remote_cell_pe->max[1]))) {
	  /*lprintf("%d %d (%d %d %d)\n", m, k, lc->cell[j].ix, lc->cell[j].iy, lc->cell[j].iz);*/
	  count++;
	}
      }
    }
    tr->n_cell = count;
    RIA_alloc(&tr->cid, count);
    count=0;
    for (m=0;m<ipe;m++) {
      remote_cell_pe = &lc->cell_pe[pe[m]];
      for (k=0;k<remote_cell_pe->n_cell;k++) {
	j=remote_cell_pe->cell[k];
	if (SDMD_periodic_range(lc->cell[j].ix, 
				cell_pe->min[0] - 1,
				cell_pe->max[0] + 1,
				lc->n_grid[0]) &&
	    ((l==0 && lc->cell[j].iy == remote_cell_pe->min[1]) ||
	     (l==1 && lc->cell[j].iy == remote_cell_pe->max[1]))) {
	  RIA_set(&tr->cid, count, j);
	  count++;
	}
      }
    }
  }
  /* recv */
  for (l=0;l<2;l++) {
    tr = &lc->rxl_y[l];
    if ((lc->neighbor[1]==1 && l==0) || tr->pe == mpi.rank) {
      tr->n_cell = 0;
      continue;
    }
    /* setting pe array */
    pe[0] = tr->pe;
    remote_cell_pe = &lc->cell_pe[tr->pe];
    ipe = 1;
    for (ll=0;ll<2;ll++) { /* x direction */
      if (lc->neighbor[0]==1 && ll==0) continue;
      pe[ipe]=remote_cell_pe->neighbor_pe[0][ll][0];
      ipe++;
    }
    /* To avoid double counts */
    if (pe[1]==pe[2]) {
      ipe--;
      if (pe[0]==pe[1]) 
	ipe--;
    }

    /* count number of cells */
    count = 0;
    for (m=0;m<ipe;m++) {
      remote_cell_pe = &lc->cell_pe[pe[m]];
      for (k=0;k<remote_cell_pe->n_cell;k++) {
	j=remote_cell_pe->cell[k];
	if (SDMD_periodic_range(lc->cell[j].ix, 
				cell_pe->min[0] - 1,
				cell_pe->max[0] + 1,
				lc->n_grid[0]) &&
	    ((l==0 && lc->cell[j].iy == remote_cell_pe->max[1]) ||
	     (l==1 && lc->cell[j].iy == remote_cell_pe->min[1]))) {
	  count++;
	}
      }
    }
    tr->n_cell=count;
    RIA_alloc(&tr->cid, tr->n_cell);
    count=0;
    for (m=0;m<ipe;m++) {
      remote_cell_pe = &lc->cell_pe[pe[m]];
      for (k=0;k<remote_cell_pe->n_cell;k++) {
	j=remote_cell_pe->cell[k];
	if (SDMD_periodic_range(lc->cell[j].ix, 
				cell_pe->min[0] - 1,
				cell_pe->max[0] + 1,
				lc->n_grid[0]) &&
	    ((l==0 && lc->cell[j].iy == remote_cell_pe->max[1]) ||
	     (l==1 && lc->cell[j].iy == remote_cell_pe->min[1]))) {
	  RIA_set(&tr->cid, count, j);
	  count++;
	}
      }
    }
  }


  /* z */
  /* send */
  for (l=0;l<2;l++) {
    tr = &lc->sxl_z[l];
    if ((lc->neighbor[2]==1 && l==1) || tr->pe == mpi.rank) {
      tr->n_cell = 0;
      continue;
    }
    pe[0] = local_pe;
    ipe = 1;
    /* x direction */
    for (lll=0;lll<2;lll++) {
      if (lc->neighbor[0]==1 && lll==0) continue;
      pe[ipe]=cell_pe->neighbor_pe[0][lll][0];
      for (jpe=0;jpe<ipe;jpe++) {  /* check double count */
	if (pe[jpe]==pe[ipe]) break;
      }
      if (jpe==ipe) /* not hit */
	ipe++;
    }
    /* y direction */
    for (ll=0;ll<2;ll++) {
      if (lc->neighbor[1]==1 && ll==0) continue;
      pe[ipe]=cell_pe->neighbor_pe[0][ll][1];
      for (jpe=0;jpe<ipe;jpe++) {  /* check double count */
	if (pe[jpe]==pe[ipe]) break;
      }
      if (jpe==ipe) /* not hit */
	ipe++;
      remote_cell_pe = &lc->cell_pe[pe[ipe-1]];
      /* y-x direction */
      for (lll=0;lll<2;lll++) {
	if (lc->neighbor[0]==1 && lll==0) continue;
	pe[ipe]=remote_cell_pe->neighbor_pe[0][lll][0];
	for (jpe=0;jpe<ipe;jpe++) {  /* check double count */
	  if (pe[jpe]==pe[ipe]) break;
	}
	if (jpe==ipe) /* not hit */
	  ipe++;
      }
    }

    count=0;
    for (m=0;m<ipe;m++) {
      remote_cell_pe = &lc->cell_pe[pe[m]];
      for (k=0;k<remote_cell_pe->n_cell;k++) {
	j=remote_cell_pe->cell[k];
	if (SDMD_periodic_range(lc->cell[j].ix, 
				cell_pe->min[0] - 1,
				cell_pe->max[0] + 1,
				lc->n_grid[0]) &&
	    SDMD_periodic_range(lc->cell[j].iy, 
				cell_pe->min[1] - 1,
				cell_pe->max[1] + 1,
				lc->n_grid[1]) &&
	    ((l==0 && lc->cell[j].iz == remote_cell_pe->min[2]) ||
	     (l==1 && lc->cell[j].iz == remote_cell_pe->max[2]))) {
	  count++;
	}
      }
    }
    tr->n_cell = count;
    RIA_alloc(&tr->cid, tr->n_cell);
    count=0;
    for (m=0;m<ipe;m++) {
      remote_cell_pe = &lc->cell_pe[pe[m]];
      for (k=0;k<remote_cell_pe->n_cell;k++) {
	j=remote_cell_pe->cell[k];
	if (SDMD_periodic_range(lc->cell[j].ix, 
				cell_pe->min[0] - 1,
				cell_pe->max[0] + 1,
				lc->n_grid[0]) &&
	    SDMD_periodic_range(lc->cell[j].iy, 
				cell_pe->min[1] - 1,
				cell_pe->max[1] + 1,
				lc->n_grid[1]) &&
	    ((l==0 && lc->cell[j].iz == remote_cell_pe->min[2]) ||
	     (l==1 && lc->cell[j].iz == remote_cell_pe->max[2]))) {
	  RIA_set(&tr->cid, count, j);
	  count++;
	}
      }
    }
  }


  /* recv */
  for (l=0;l<2;l++) {
    tr = &lc->rxl_z[l];
    if ((lc->neighbor[2]==1 && l==0) || tr->pe == mpi.rank) {
      tr->n_cell = 0;
      continue;
    }
    pe[0] = tr->pe;
    ipe = 1;
    remote2_cell_pe = &lc->cell_pe[pe[0]];
    /* x direction */
    for (lll=0;lll<2;lll++) { 
      if (lc->neighbor[0]==1 && lll==0) continue;
      pe[ipe]=remote2_cell_pe->neighbor_pe[0][lll][0];
      for (jpe=0;jpe<ipe;jpe++) {  /* check double count */
	if (pe[jpe]==pe[ipe]) break;
      }
      if (jpe==ipe) /* not hit */
	ipe++;
    }
    /* y direction */
    for (ll=0;ll<2;ll++) {
      if (lc->neighbor[1]==1 && ll==0) continue;
      pe[ipe]=remote2_cell_pe->neighbor_pe[0][ll][1];
      for (jpe=0;jpe<ipe;jpe++) {  /* check double count */
	if (pe[jpe]==pe[ipe]) break;
      }
      if (jpe==ipe) /* not hit */
	ipe++;
      remote_cell_pe = &lc->cell_pe[pe[ipe-1]];
      /* y-x direction */
      for (lll=0;lll<2;lll++) { 
	if (lc->neighbor[0]==1 && lll==0) continue;
	pe[ipe]=remote_cell_pe->neighbor_pe[0][lll][0];
	for (jpe=0;jpe<ipe;jpe++) {  /* check double count */
	  if (pe[jpe]==pe[ipe]) break;
	}
	if (jpe==ipe) /* not hit */
	  ipe++;
      }
    }

    /* debug 
    if (local_pe == 1) {
      int iii, ix, iy, iz, peiii;
      for (iii=0;iii<ipe;iii++) {
	peiii = pe[iii];
	mpi_rank2xyz(peiii,&ix,&iy,&iz);
	printf("%d<-%d :%d (%d,%d,%d)\n",local_pe,tr->pe,peiii,ix,iy,iz);
      }
    }
    */

    count=0;
    for (m=0;m<ipe;m++) {
      remote_cell_pe = &lc->cell_pe[pe[m]];
      for (k=0;k<remote_cell_pe->n_cell;k++) {
	j=remote_cell_pe->cell[k];

	/* debug 
	if (local_pe == 1 && pe[m] == 4) {
	  printf("1, 4: %d (%d %d %d) pe = %d \n", j, lc->cell[j].ix, lc->cell[j].iy, lc->cell[j].iz, lc->cell[j].pe);
	  if (j==74) {
	    printf("%d %d %d %d\n", j, cell_pe->min[0] - 1, cell_pe->max[0] + 1, lc->n_grid[0]);
	    printf("%d %d %d %d\n", j, cell_pe->min[1] - 1, cell_pe->max[1] + 1, lc->n_grid[1]);
	    printf("l=%d, %d %d %d\n", l, remote_cell_pe->max[2], remote_cell_pe->min[2]);
	    printf("%d %d %d\n",
		   SDMD_periodic_range(lc->cell[j].ix, 
				       cell_pe->min[0] - 1,
				       cell_pe->max[0] + 1,
				       lc->n_grid[0]),
		   SDMD_periodic_range(lc->cell[j].iy, 
				       cell_pe->min[1] - 1,
				       cell_pe->max[1] + 1,
				       lc->n_grid[1]),
		   ((l==0 && lc->cell[j].iz == remote_cell_pe->max[2]) ||
		    (l==1 && lc->cell[j].iz == remote_cell_pe->min[2])));
		   
	  }
	}
	*/
	if (SDMD_periodic_range(lc->cell[j].ix, 
				cell_pe->min[0] - 1,
				cell_pe->max[0] + 1,
				lc->n_grid[0]) &&
	    SDMD_periodic_range(lc->cell[j].iy, 
				cell_pe->min[1] - 1,
				cell_pe->max[1] + 1,
				lc->n_grid[1]) &&
	    ((l==0 && lc->cell[j].iz == remote_cell_pe->max[2]) ||
	     (l==1 && lc->cell[j].iz == remote_cell_pe->min[2]))) {
	  /* debug
	  if (local_pe == 1 && pe[m] == 4) {
	    printf("1, 4: %d OK \n", j);
	  }
	  */
	  count++;
	}
      }
    }
    tr->n_cell = count;
    RIA_alloc(&tr->cid, tr->n_cell);
    count=0;
    for (m=0;m<ipe;m++) {
      remote_cell_pe = &lc->cell_pe[pe[m]];
      for (k=0;k<remote_cell_pe->n_cell;k++) {
	j=remote_cell_pe->cell[k];
	if (SDMD_periodic_range(lc->cell[j].ix, 
				cell_pe->min[0] - 1,
				cell_pe->max[0] + 1,
				lc->n_grid[0]) &&
	    SDMD_periodic_range(lc->cell[j].iy, 
				cell_pe->min[1] - 1,
				cell_pe->max[1] + 1,
				lc->n_grid[1]) &&
	    ((l==0 && lc->cell[j].iz == remote_cell_pe->max[2]) ||
	     (l==1 && lc->cell[j].iz == remote_cell_pe->min[2]))) {
	  RIA_set(&tr->cid, count, j);
	  count++;
	}
      }
    }
  }


  /*
  if (mpi.rank == 0) {
  printf("rank %d (%d %d %d)-(%d %d %d):\n", mpi.rank, cell_pe->min[0], cell_pe->min[1], cell_pe->min[2], cell_pe->max[0], cell_pe->max[1], cell_pe->max[2]);
  for (ll=0;ll<2;ll++) {
    for (l=0;l<3;l++) {
      for (lll=0;lll<2;lll++) {
	if      (l==0 && ll==0) tr = &lc->sxl_x[lll];
	else if (l==1 && ll==0) tr = &lc->sxl_y[lll];
	else if (l==2 && ll==0) tr = &lc->sxl_z[lll];
	if      (l==0 && ll==1) tr = &lc->rxl_x[lll];
	else if (l==1 && ll==1) tr = &lc->rxl_y[lll];
	else if (l==2 && ll==1) tr = &lc->rxl_z[lll];
	printf("sr%d xyz%d rl%d %d %d (", ll, l, lll, tr->pe, tr->n_cell);
	for (m=0;m<tr->n_cell;m++)
	  printf(" %d", tr->cid.data[m]);
	printf(")\n");
      }
    }
  }
  printf("\n");
  }
  marble_exit(1);
  */


}


/*
 * Build the atom-level transfer lists for the axis-by-axis ("xyz")
 * scheme and then rebuild the ordinary per-PE transfer lists for the
 * cells that are not handled by that scheme.
 *
 *   lc : linked-cell data; the cell lists (cid / n_cell) of
 *        sxl_x/y/z[2] and rxl_x/y/z[2] must already be filled in.
 *   ad : atom data, passed through to the tr_list helpers.
 *
 * The sxl_* lists are the ones used for MPI_Isend and the rxl_* lists
 * the ones used for MPI_Irecv in SDMD_dist_x_xyz().
 */
void SDMD_make_tr_list_xyz(LINKED_CELL *lc, ATOM_DATA *ad)
{
  int i, j, k, l, ll, lll, m, prev;
  TR_LIST_XYZ *tr;

  /* Pass 1: for every list (axis l, send/recv ll, entry lll) expand the
     cell list into an atom list. */
  for (l=0;l<3;l++) {
    for (ll=0;ll<2;ll++) {
      for (lll=0;lll<2;lll++) {
	/* ll==0 selects the send lists, ll==1 the receive lists */
	if      (l==0 && ll==0) tr = &lc->sxl_x[lll];
	else if (l==1 && ll==0) tr = &lc->sxl_y[lll];
	else if (l==2 && ll==0) tr = &lc->sxl_z[lll];
	if      (l==0 && ll==1) tr = &lc->rxl_x[lll];
	else if (l==1 && ll==1) tr = &lc->rxl_y[lll];
	else if (l==2 && ll==1) tr = &lc->rxl_z[lll];

	tr->n_atom = 0;

	/* total number of atoms in the cells of this list */
	for (k=0;k<tr->n_cell;k++) {
	  tr->n_atom += lc->cell[tr->cid.data[k]].n_atom;
	}
	/* nothing to transfer: tr->aid is left untouched */
	if (tr->n_atom==0) continue;
	RIA_alloc(&tr->aid, tr->n_atom);
	m=0;
	/* store the atoms cell by cell, following each cell's atom chain */
	for (k=0;k<tr->n_cell;k++) {
	  j=tr->cid.data[k];
	  for (i=lc->cell[j].head;i>=0;i=lc->next_atom[i]) {
	    RIA_set(&tr->aid, m, i);
	    m++;
	  }
	}
      }
    }
  }

  /* Pass 2: temporarily hide the cells received through the xyz scheme
     from the request flags: req is parked in req_xyz and cleared. */
  for (i=0;i<lc->n_cell;i++) lc->cell[i].req_xyz = 0;
  for (l=0;l<3;l++) {
    for (lll=0;lll<2;lll++) {
      if      (l==0) tr = &lc->rxl_x[lll];
      else if (l==1) tr = &lc->rxl_y[lll];
      else if (l==2) tr = &lc->rxl_z[lll];
      for (k=0;k<tr->n_cell;k++) {
	j=tr->cid.data[k];
	/*lprintf("[%d] %d %d %d: %d %d\n", l, lc->cell[j].ix, lc->cell[j].iy, lc->cell[j].iz, lc->cell[j].req, lc->cell[j].req_xyz);*/
	lc->cell[j].req_xyz = lc->cell[j].req;
	lc->cell[j].req = 0;
      }
    }
  }
  /*
  for (i=0;i<lc->n_cell;i++) {
    lprintf("%d %d %d: %d %d\n", lc->cell[i].ix, lc->cell[i].iy, lc->cell[i].iz, lc->cell[i].req, lc->cell[i].req_xyz);
  }
  marble_exit(1);
  */
  /* runs with the xyz-received cells masked out of req */
  SDMD_make_tr_list_by_cell_req(lc, ad);

  /* Pass 3: restore req and turn req_xyz into a bit mask:
     bit 1 = cell is received via the xyz scheme,
     bit 2 = cell is sent via the xyz scheme. */
  for (l=0;l<3;l++) {
    for (lll=0;lll<2;lll++) {
      if      (l==0) tr = &lc->rxl_x[lll];
      else if (l==1) tr = &lc->rxl_y[lll];
      else if (l==2) tr = &lc->rxl_z[lll];
      
      for (k=0;k<tr->n_cell;k++) {
	j=tr->cid.data[k];
	lc->cell[j].req = lc->cell[j].req_xyz;
	lc->cell[j].req_xyz = 1;
      }

      if      (l==0) tr = &lc->sxl_x[lll];
      else if (l==1) tr = &lc->sxl_y[lll];
      else if (l==2) tr = &lc->sxl_z[lll];

      for (k=0;k<tr->n_cell;k++) {
	j=tr->cid.data[k];
	lc->cell[j].req_xyz |= 2;
      }
    }
  }

  /* Rebuild the chain of requested cells (req_head / req_next) in
     ascending cell order.  prev is only read after the first requested
     cell has been found, i.e. after it has been assigned. */
  lc->req_head = -1;
  for (i=0;i<lc->n_cell;i++) {
    if (lc->cell[i].req) {
      lc->cell[i].req_next = -1;
      if (lc->req_head == -1) {
	lc->req_head=i;
      } else {
	lc->cell[prev].req_next = i;
      }
      prev = i;
    }
  }

  /* start the per-PE lists from scratch and add the extra-transfer atoms */
  SDMD_clear_tr_list(lc);
  SDMD_add_tr_list(lc, ad);

  /*SDMD_print_tr_list(lc, ad);*/

  SDMD_assign_atom_req(lc);
}


/*
 * Rebuild the chain of requested cells from the per-cell req flags,
 * reset the per-PE transfer lists and check that every requested cell
 * belongs to this rank.
 *
 * In the active (#else) branch no transfer entries are generated here:
 * a requested cell owned by another PE is reported with printf and the
 * run is aborted through marble_abort(1).  The disabled (#if 0) branch
 * is an alternative that fills recv_x from the requested cells.
 *
 * NOTE(review): j, k, ipe and kpe are only used by the disabled branch;
 * send_to and recv_from are not used at all.
 */
void SDMD_make_tr_list_by_cell_req(LINKED_CELL *lc, ATOM_DATA *ad)
{
  int i, j, k, send_to, recv_from, ipe, kpe;
  int prev;

  /* Link all cells with a non-zero req flag through req_head/req_next,
     in ascending cell order.  prev is assigned before its first read. */
  lc->req_head = -1;
  for (i=0;i<lc->n_cell;i++) {
    if (lc->cell[i].req) {
      lc->cell[i].req_next = -1;
      if (lc->req_head == -1) {
	lc->req_head=i;
      } else {
	lc->cell[prev].req_next = i;
      }
      prev = i;
    }
  }

  SDMD_clear_tr_list(lc);

#if 0
  /* disabled variant: request every atom of a requested remote cell */
  for (i=lc->req_head;i>=0;i=lc->cell[i].req_next) {

      ipe=lc->cell[i].pe;
      for (j=lc->cell[i].head;j>=0;j=lc->next_atom[j]) {
	if (ad->ex[j].flag & ATOM_CHILD) {
	  /* a child atom is requested from the PE of its parent's cell */
	  k = ad->ex[j].parent;
	  kpe = lc->cell[lc->atom_cell[k]].pe;
	  /* debug
	  if (k<0||k>=ad->natom) printf("gege %d\n", k);
	  if (lc->atom_cell[k]<0||lc->atom_cell[k]>=lc->n_cell)
	    printf("gege2 %d %d %d\n", k, lc->atom_cell[k],lc->n_cell);
	  */
	  if (kpe != mpi.rank) {
	    lc->tr_list[kpe].recv_x[lc->tr_list[kpe].n_recv_x]=j;
	    lc->tr_list[kpe].n_recv_x++;

	    if (lc->tr_list[kpe].n_recv_x >= lc->tr_list[kpe].n_alloc) {
	      lprintf("ERROR: tr_list overflow\n");
	      fprintf(stderr,"ERROR: tr_list overflow\n");
	      marble_abort(1);
	    }

	  }
	} else {
	  /* j is not ATOM_CHILD: PARENT or other */
	  if (ipe != mpi.rank) {
	    lc->tr_list[ipe].recv_x[lc->tr_list[ipe].n_recv_x]=j;
	    lc->tr_list[ipe].n_recv_x++;

	    if (lc->tr_list[ipe].n_recv_x >= lc->tr_list[ipe].n_alloc) {
	      lprintf("ERROR: tr_list overflow\n");
	      fprintf(stderr,"ERROR: tr_list overflow\n");
	      marble_abort(1);
	    }

	  }
	}
      }
      /*  } */
  }
  
  SDMD_make_send_list(lc, ad);
  SDMD_add_tr_list(lc, ad);
#else
  /* active variant: every requested cell must be owned by this rank */
  if (lc->req_head>=0) {
    int err = 0;
    for (i=lc->req_head;i>=0;i=lc->cell[i].req_next) {
      if (mpi.rank != lc->cell[i].pe) {
	printf("ERROR: rank(%d), cell_req %d rank %d\n", mpi.rank, i, lc->cell[i].pe);
	err=1;
      }
    }
    /* report all offending cells first, then abort once */
    if (err)
      marble_abort(1);
  }
#endif

  /*SDMD_print_tr_list(lc, ad);*/
  SDMD_assign_atom_req(lc);
}

/*
 * Rebuild the coordinate receive lists from the per-atom request flags.
 *
 * Every atom whose lc->atom_req entry has ATOM_REQ set and whose owning
 * PE (ATOM_CPU) is not this rank is appended to that PE's recv_x list.
 * The run is aborted as soon as a list reaches its allocated size.
 * Afterwards the extra-transfer atoms are added and the matching send
 * lists are exchanged.
 */
void SDMD_make_tr_list_by_atom_req(LINKED_CELL *lc, ATOM_DATA *ad)
{
  int ipe, iatom, owner, filled;

  /* only the coordinate counters are reset here */
  ipe = 0;
  while (ipe < mpi.n_pe) {
    lc->tr_list[ipe].n_recv_x = 0;
    lc->tr_list[ipe].n_send_x = 0;
    ipe++;
  }

  for (iatom=0; iatom<ad->natom; iatom++) {
    if (!(lc->atom_req[iatom] & ATOM_REQ))
      continue;

    owner = ATOM_CPU(lc,ad,iatom);
    if (owner == mpi.rank)
      continue;

    /* append, then verify that there is still room for the next entry */
    filled = lc->tr_list[owner].n_recv_x;
    lc->tr_list[owner].recv_x[filled] = iatom;
    filled++;
    lc->tr_list[owner].n_recv_x = filled;

    if (filled >= lc->tr_list[owner].n_alloc) {
      lprintf("ERROR: tr_list overflow\n");
      fprintf(stderr,"ERROR: tr_list overflow\n");
      marble_abort(1);
    }
  }

  SDMD_add_tr_list(lc, ad);
  SDMD_make_send_list(lc, ad);
}

/*
 * Empty the transfer lists of every PE: the four entry counters
 * (coordinate send/recv and force send/recv) are set to zero.
 * The list storage itself is not touched.
 */
void SDMD_clear_tr_list(LINKED_CELL *lc)
{
  int ipe = mpi.n_pe;

  while (--ipe >= 0) {
    lc->tr_list[ipe].n_send_x = 0;
    lc->tr_list[ipe].n_recv_x = 0;
    lc->tr_list[ipe].n_send_f = 0;
    lc->tr_list[ipe].n_recv_f = 0;
  }
}

/*
 * Add the atoms of the registered extra-transfer groups (lc->ex_tr_atom,
 * see SDMD_setup_ex_tr_atom) to the coordinate receive lists.
 *
 * For every group whose first atom is owned by this rank, each further
 * atom of the group that is owned by another PE is appended to that
 * PE's recv_x list, unless it is already listed or its cell is received
 * through the xyz scheme (req_xyz bit 1).
 *
 * If at least one group is registered, tr_dist_xf_flag is set and the
 * send lists are re-exchanged with SDMD_make_send_list().
 *
 * NOTE(review): icell is unused and func is only referenced by the
 * disabled (#if 0) send-list branch.
 */
void SDMD_add_tr_list(LINKED_CELL *lc, ATOM_DATA *ad)
{
  EX_TR_ATOM *p;
  int i,j,pe,iatom, icell, ok;
  char *func = "SDMD_add_tr_list";

  /* Send list updates are added on 2012/8/2 */

  for (p=lc->ex_tr_atom;p!=NULL;p=p->next) {
    /* the owner of the first atom decides whether this rank handles the group */
    iatom = p->atom_list[0];
    pe = ATOM_CPU(lc,ad,iatom);
    /*printf("rank[%d] 1 %d %d\n", mpi.rank, iatom, pe);*/
    if (pe==mpi.rank) {
      /* recv list */
      for (i=1;i<p->n_atom;i++) {
	iatom = p->atom_list[i];
	pe = ATOM_CPU(lc,ad,iatom);
	if (pe!=mpi.rank) {
	  /*printf("rank[%d] 2recv %d %d\n", mpi.rank, iatom, pe);*/
	  if (lc->cell[lc->atom_cell[iatom]].req_xyz & 1) {
	    /* iatom is transferred using xyz scheme */
	    continue;
	  }
	  /* linear search: skip atoms that are already requested from pe */
	  ok = 0;
	  for (j=0;j<lc->tr_list[pe].n_recv_x;j++) {
	    if (lc->tr_list[pe].recv_x[j] == iatom) {
	      ok = 1;
	      break;
	    }
	  }
	  if (ok) continue;

	  /* iatom must be added in the list */
	  /* here j == n_recv_x, i.e. the index of the new entry */
	  SDMD_check_alloc_tr_list(lc, pe, j);

	  lc->tr_list[pe].recv_x[j] = iatom;
	  lc->tr_list[pe].n_recv_x++;
	}
      }
#if 0
    } else {
      /* send list */
      for (i=1;i<p->n_atom;i++) {
	iatom = p->atom_list[i];
	pe = ATOM_CPU(lc,ad,iatom);
	if (pe==mpi.rank) {
	  printf("rank[%d] 3send %d %d\n", mpi.rank, iatom, pe);
	  if (lc->cell[lc->atom_cell[iatom]].req_xyz & 2) {
	    /* iatom is transferred using xyz scheme */
	    printf("rank[%d] 4send %d %d\n", mpi.rank, iatom, pe);
	    continue;
	  }
	  ok = 0;
	  for (j=0;j<lc->tr_list[pe].n_send_x;j++) {
	    if (lc->tr_list[pe].send_x[j] == iatom) {
	      ok = 1;
	      printf("rank[%d] 5send %d %d\n", mpi.rank, iatom, pe);
	      break;
	    }
	  }
	  if (ok) continue;

	  /* iatom must be added in the list */
	  if (j >= lc->tr_list[pe].n_alloc) {
	    lc->tr_list[pe].n_alloc += TR_LIST_ALLOC_UNIT;
	    lc->tr_list[pe].recv_x = erealloc(func, lc->tr_list[pe].recv_x, sizeof(int)*lc->tr_list[pe].n_alloc);
	    lc->tr_list[pe].send_x = erealloc(func, lc->tr_list[pe].send_x, sizeof(int)*lc->tr_list[pe].n_alloc);
	    lc->tr_list[pe].recv_f = lc->tr_list[pe].send_x;
	    lc->tr_list[pe].send_f = lc->tr_list[pe].recv_x;
	  }
	  lc->tr_list[pe].send_x[j] = iatom;
	  lc->tr_list[pe].n_send_x++;
	}
      }
#endif
    }
  }
  /* the receive lists may have grown: let the owners know what to send */
  if (lc->ex_tr_atom) {
    lc->tr_dist_xf_flag = 1;
    SDMD_make_send_list(lc, ad);
  }
}

/*
 * Register a group of atoms for extra transfer.
 *
 * A new EX_TR_ATOM record holding a private copy of atom_list[0..n_atom-1]
 * is pushed onto the front of the lc->ex_tr_atom list.
 */
void SDMD_setup_ex_tr_atom(LINKED_CELL *lc, int n_atom, int *atom_list)
{
  char *fname = "SDMD_setup_ex_tr_atom";
  EX_TR_ATOM *entry;
  int k;

  entry = emalloc(fname, sizeof(EX_TR_ATOM));

  /* new record becomes the head of the list */
  entry->next = lc->ex_tr_atom;
  lc->ex_tr_atom = entry;

  entry->n_atom = n_atom;
  entry->atom_list = emalloc(fname, sizeof(int)*n_atom);

  k = 0;
  while (k < n_atom) {
    entry->atom_list[k] = atom_list[k];
    k++;
  }
}

/*
 * Derive the send lists from the receive lists of the other PEs.
 *
 * Every PE tells each other PE which atoms it wants to receive from it
 * (its recv_x list for that PE); the receiver stores that list as its
 * send_x list for the requesting PE.  The exchange is done pairwise in
 * mpi.n_pe-1 steps: in step i this rank sends to rank+i and receives
 * from rank-i (modulo mpi.n_pe).
 *
 * Finally the force counters are set as the mirror image of the
 * coordinate counters (n_send_f = n_recv_x, n_recv_f = n_send_x).
 *
 *   lc : linked-cell data holding tr_list[] for all PEs.
 *   ad : unused; kept for interface compatibility.
 *
 * NOTE(review): the received count is not checked against the allocated
 * size of send_x before the list is received into it -- confirm that the
 * allocation is large enough at the call sites.
 */
void SDMD_make_send_list(LINKED_CELL *lc, ATOM_DATA *ad)
{
  int i, send_to, recv_from;
  MPI_Request req;
  MPI_Status stat;

  for (i=1;i<mpi.n_pe;i++) {
    send_to = (mpi.rank+i)%mpi.n_pe;
    recv_from = (mpi.rank-i+mpi.n_pe)%mpi.n_pe;

    /* Step 1: exchange the list lengths.  Exactly ONE int is transferred,
       tagged SDMD_TAG_MAKE_TR_LIST.  (The count and tag arguments used to
       be swapped, so SDMD_TAG_MAKE_TR_LIST ints were sent from / received
       into a single int field.) */
    MPI_Irecv(&lc->tr_list[recv_from].n_send_x, 1,
	      MPI_INT, recv_from, SDMD_TAG_MAKE_TR_LIST, mpi.comm, &req);
    MPI_Send(&lc->tr_list[send_to].n_recv_x, 1,
	     MPI_INT, send_to, SDMD_TAG_MAKE_TR_LIST, mpi.comm);
    MPI_Wait(&req, &stat);

    /* Step 2: exchange the atom indices themselves; empty lists are
       skipped on both sides, which stays consistent because both sides
       know the length from step 1. */
    if (lc->tr_list[recv_from].n_send_x)
      MPI_Irecv(lc->tr_list[recv_from].send_x,lc->tr_list[recv_from].n_send_x,
		MPI_INT, recv_from, SDMD_TAG_MAKE_TR_LIST2, mpi.comm, &req);
    if (lc->tr_list[send_to].n_recv_x)
      MPI_Send(lc->tr_list[send_to].recv_x, lc->tr_list[send_to].n_recv_x,
	       MPI_INT, send_to, SDMD_TAG_MAKE_TR_LIST2, mpi.comm);
    if (lc->tr_list[recv_from].n_send_x)
      MPI_Wait(&req, &stat);
  }
  
  /* forces travel in the opposite direction of coordinates */
  for (i=0;i<mpi.n_pe;i++) {
    lc->tr_list[i].n_send_f = lc->tr_list[i].n_recv_x;
    lc->tr_list[i].n_recv_f = lc->tr_list[i].n_send_x;
  }
}

/*
 * Distribute atom coordinates with the axis-by-axis ("xyz") scheme.
 *
 * The exchange is done in three stages, first with the x lists, then the
 * y lists, then the z lists.  In each stage the two send lists
 * (sxl_*[0], sxl_*[1]) are packed and posted with MPI_Isend, the two
 * receive lists (rxl_*[0], rxl_*[1]) are posted with MPI_Irecv, all
 * requests are completed with MPI_Waitall and the received values are
 * stored into ad->x.  Lists without atoms are skipped.
 *
 * In TR_MODE_MP the two entries use different tags (SDMD_TAG_DIST_X and
 * SDMD_TAG_DIST_X2, crossed between send and receive); otherwise every
 * message is tagged SDMD_TAG_DIST_X.
 */
void SDMD_dist_x_xyz(LINKED_CELL *lc, ATOM_DATA *ad)
{
  TR_LIST_XYZ *out_axis[3], *in_axis[3];
  TR_LIST_XYZ *out, *in;
  MPI_Request req[4];
  MPI_Status stat[4];
  double *send_buf[2], *recv_buf[2];
  double *p;
  int n_send[2], n_recv[2];
  int send_tag[2], recv_tag[2];
  int axis, side, k, atom, n_req;

  /* per-axis tables of the send and receive lists */
  out_axis[0] = lc->sxl_x;  in_axis[0] = lc->rxl_x;
  out_axis[1] = lc->sxl_y;  in_axis[1] = lc->rxl_y;
  out_axis[2] = lc->sxl_z;  in_axis[2] = lc->rxl_z;

  send_tag[0] = recv_tag[1] = SDMD_TAG_DIST_X;
  send_tag[1] = recv_tag[0] =
    (lc->tr_mode == TR_MODE_MP) ? SDMD_TAG_DIST_X2 : SDMD_TAG_DIST_X;

  /* buffers are sized for the longest list over the three axes */
  n_send[0] = n_send[1] = 0;
  n_recv[0] = n_recv[1] = 0;
  for (axis=0; axis<3; axis++) {
    for (side=0; side<2; side++) {
      if (out_axis[axis][side].n_atom > n_send[side])
        n_send[side] = out_axis[axis][side].n_atom;
      if (in_axis[axis][side].n_atom > n_recv[side])
        n_recv[side] = in_axis[axis][side].n_atom;
    }
  }
  for (side=0; side<2; side++) {
    send_buf[side] = get_double_buf(n_send[side]*3);
    recv_buf[side] = get_double_buf(n_recv[side]*3);
  }

  for (axis=0; axis<3; axis++) {
    n_req = 0;

    for (side=0; side<2; side++) {
      out = &out_axis[axis][side];
      in  = &in_axis[axis][side];

      /* pack the coordinates of the atoms to be sent */
      p = send_buf[side];
      for (k=0; k<out->n_atom; k++) {
        atom = out->aid.data[k];
        *p++ = ad->x[atom].x;
        *p++ = ad->x[atom].y;
        *p++ = ad->x[atom].z;
      }
      if (out->n_atom > 0) {
        MPI_Isend(send_buf[side], out->n_atom*3,
                  MPI_DOUBLE, out->pe, send_tag[side], mpi.comm, &req[n_req]);
        n_req++;
      }
      if (in->n_atom > 0) {
        MPI_Irecv(recv_buf[side], in->n_atom*3,
                  MPI_DOUBLE, in->pe, recv_tag[side], mpi.comm, &req[n_req]);
        n_req++;
      }
    }

    MPI_Waitall(n_req, req, stat);

    /* store what has been received before the next axis is processed */
    for (side=0; side<2; side++) {
      in = &in_axis[axis][side];
      p = recv_buf[side];
      for (k=0; k<in->n_atom; k++) {
        atom = in->aid.data[k];
        ad->x[atom].x = *p++;
        ad->x[atom].y = *p++;
        ad->x[atom].z = *p++;
      }
    }
  }

  for (side=0; side<2; side++) {
    free_buf(send_buf[side]);
    free_buf(recv_buf[side]);
  }
}

/*
 * Collect forces with the axis-by-axis ("xyz") scheme.
 *
 * Counterpart of SDMD_dist_x_xyz() for forces: the stages run in the
 * opposite axis order (z, then y, then x) using the sfl_* / rfl_* lists,
 * and the received values are ADDED to ad->f instead of overwriting it.
 * Lists without atoms are skipped.
 *
 * In TR_MODE_MP the two entries use different tags (SDMD_TAG_DIST_F and
 * SDMD_TAG_DIST_F2, crossed between send and receive); otherwise every
 * message is tagged SDMD_TAG_DIST_F.
 */
void SDMD_dist_f_xyz(LINKED_CELL *lc, ATOM_DATA *ad)
{
  TR_LIST_XYZ *out_axis[3], *in_axis[3];
  TR_LIST_XYZ *out, *in;
  MPI_Request req[4];
  MPI_Status stat[4];
  double *send_buf[2], *recv_buf[2];
  double *p;
  int n_send[2], n_recv[2];
  int send_tag[2], recv_tag[2];
  int axis, side, k, atom, n_req;

  /* per-axis tables of the send and receive lists */
  out_axis[0] = lc->sfl_x;  in_axis[0] = lc->rfl_x;
  out_axis[1] = lc->sfl_y;  in_axis[1] = lc->rfl_y;
  out_axis[2] = lc->sfl_z;  in_axis[2] = lc->rfl_z;

  send_tag[0] = recv_tag[1] = SDMD_TAG_DIST_F;
  send_tag[1] = recv_tag[0] =
    (lc->tr_mode == TR_MODE_MP) ? SDMD_TAG_DIST_F2 : SDMD_TAG_DIST_F;

  /* buffers are sized for the longest list over the three axes */
  n_send[0] = n_send[1] = 0;
  n_recv[0] = n_recv[1] = 0;
  for (axis=0; axis<3; axis++) {
    for (side=0; side<2; side++) {
      if (out_axis[axis][side].n_atom > n_send[side])
        n_send[side] = out_axis[axis][side].n_atom;
      if (in_axis[axis][side].n_atom > n_recv[side])
        n_recv[side] = in_axis[axis][side].n_atom;
    }
  }
  for (side=0; side<2; side++) {
    send_buf[side] = get_double_buf(n_send[side]*3);
    recv_buf[side] = get_double_buf(n_recv[side]*3);
  }

  /* reverse axis order with respect to the coordinate distribution */
  for (axis=2; axis>=0; axis--) {
    n_req = 0;

    for (side=0; side<2; side++) {
      out = &out_axis[axis][side];
      in  = &in_axis[axis][side];

      /* pack the forces of the atoms to be sent */
      p = send_buf[side];
      for (k=0; k<out->n_atom; k++) {
        atom = out->aid.data[k];
        *p++ = ad->f[atom].x;
        *p++ = ad->f[atom].y;
        *p++ = ad->f[atom].z;
      }
      if (out->n_atom > 0) {
        MPI_Isend(send_buf[side], out->n_atom*3,
                  MPI_DOUBLE, out->pe, send_tag[side], mpi.comm, &req[n_req]);
        n_req++;
      }
      if (in->n_atom > 0) {
        MPI_Irecv(recv_buf[side], in->n_atom*3,
                  MPI_DOUBLE, in->pe, recv_tag[side], mpi.comm, &req[n_req]);
        n_req++;
      }
    }

    MPI_Waitall(n_req, req, stat);

    /* accumulate the received contributions */
    for (side=0; side<2; side++) {
      in = &in_axis[axis][side];
      p = recv_buf[side];
      for (k=0; k<in->n_atom; k++) {
        atom = in->aid.data[k];
        ad->f[atom].x += *p++;
        ad->f[atom].y += *p++;
        ad->f[atom].z += *p++;
      }
    }
  }

  for (side=0; side<2; side++) {
    free_buf(send_buf[side]);
    free_buf(recv_buf[side]);
  }
}

/*
 * Distribute atom coordinates according to the per-PE transfer lists.
 *
 * In step s (1 .. mpi.n_pe-1) this rank sends the coordinates of the
 * atoms in tr_list[rank+s].send_x to rank+s and receives the atoms in
 * tr_list[rank-s].recv_x from rank-s (ranks modulo mpi.n_pe).  Empty
 * lists are skipped.  Waiting time is accounted through SDMD_MPI_Wait().
 */
void SDMD_dist_x(LINKED_CELL *lc, ATOM_DATA *ad)
{
  MPI_Request rreq, sreq;
  MPI_Status stat;
  double *send_buf, *recv_buf, *p;
  int step, k, atom, send_to, recv_from, n_out, n_in;

  send_buf = get_double_buf(ad->natom*3);
  recv_buf = get_double_buf(ad->natom*3);

  for (step=1; step<mpi.n_pe; step++) {
    send_to   = (mpi.rank+step)%mpi.n_pe;
    recv_from = (mpi.rank-step+mpi.n_pe)%mpi.n_pe;
    n_out = lc->tr_list[send_to].n_send_x;
    n_in  = lc->tr_list[recv_from].n_recv_x;

    /* pack outgoing coordinates */
    p = send_buf;
    for (k=0; k<n_out; k++) {
      atom = lc->tr_list[send_to].send_x[k];
      *p++ = ad->x[atom].x;
      *p++ = ad->x[atom].y;
      *p++ = ad->x[atom].z;
    }

    if (n_in) {
      MPI_Irecv(recv_buf, n_in*3,
                MPI_DOUBLE, recv_from, SDMD_TAG_DIST_X, mpi.comm, &rreq);
    }
    if (n_out) {
      MPI_Isend(send_buf, n_out*3,
                MPI_DOUBLE, send_to, SDMD_TAG_DIST_X, mpi.comm, &sreq);
    }
    if (n_in) {
      SDMD_MPI_Wait(lc, &rreq, &stat);
    }

    /* unpack incoming coordinates */
    p = recv_buf;
    for (k=0; k<n_in; k++) {
      atom = lc->tr_list[recv_from].recv_x[k];
      ad->x[atom].x = *p++;
      ad->x[atom].y = *p++;
      ad->x[atom].z = *p++;
    }

    /* the send buffer is reused in the next step */
    if (n_out) {
      SDMD_MPI_Wait(lc, &sreq, &stat);
    }
  }

  free_buf(send_buf);
  free_buf(recv_buf);
}

/*
 * Distribute atom velocities.  ad->x is temporarily redirected to ad->v
 * so that SDMD_dist_x() can be reused, and is restored afterwards.
 */
void SDMD_dist_v(LINKED_CELL *lc, ATOM_DATA *ad)
{
  VEC *saved_x = ad->x;

  ad->x = ad->v;
  SDMD_dist_x(lc, ad);
  ad->x = saved_x;
}

/*
 * Collect forces according to the per-PE transfer lists.
 *
 * The forces of the atoms in tr_list[].send_f are sent to the other PEs
 * and the contributions received for the atoms in tr_list[].recv_f are
 * summed up in the scratch array lc->f.  lc->f is cleared for the atoms
 * of the node-atom list before the exchange and added to ad->f for the
 * same atoms afterwards.
 */
void SDMD_dist_f(LINKED_CELL *lc, ATOM_DATA *ad)
{
  MPI_Request rreq, sreq;
  MPI_Status stat;
  double *send_buf, *recv_buf, *p;
  int step, k, atom, send_to, recv_from, n_out, n_in;

  send_buf = get_double_buf(ad->natom*3);
  recv_buf = get_double_buf(ad->natom*3);

  /* clear the accumulator for the atoms of the node-atom list */
  for (atom=ad->node_atom_h; atom>=0; atom=ad->node_atom_n[atom]) {
    lc->f[atom].x = 0.0;
    lc->f[atom].y = 0.0;
    lc->f[atom].z = 0.0;
  }

  for (step=1; step<mpi.n_pe; step++) {
    send_to   = (mpi.rank+step)%mpi.n_pe;
    recv_from = (mpi.rank-step+mpi.n_pe)%mpi.n_pe;
    n_out = lc->tr_list[send_to].n_send_f;
    n_in  = lc->tr_list[recv_from].n_recv_f;

    /* pack outgoing forces */
    p = send_buf;
    for (k=0; k<n_out; k++) {
      atom = lc->tr_list[send_to].send_f[k];
      *p++ = ad->f[atom].x;
      *p++ = ad->f[atom].y;
      *p++ = ad->f[atom].z;
    }

    if (n_in) {
      MPI_Irecv(recv_buf, n_in*3,
                MPI_DOUBLE, recv_from, SDMD_TAG_DIST_F, mpi.comm, &rreq);
    }
    if (n_out) {
      MPI_Isend(send_buf, n_out*3,
                MPI_DOUBLE, send_to, SDMD_TAG_DIST_F, mpi.comm, &sreq);
    }
    if (n_in) {
      SDMD_MPI_Wait(lc, &rreq, &stat);
    }

    /* accumulate incoming contributions in the scratch array */
    p = recv_buf;
    for (k=0; k<n_in; k++) {
      atom = lc->tr_list[recv_from].recv_f[k];
      lc->f[atom].x += *p++;
      lc->f[atom].y += *p++;
      lc->f[atom].z += *p++;
    }

    if (n_out) {
      SDMD_MPI_Wait(lc, &sreq, &stat);
    }
  }

  /* fold the collected contributions into the force array */
  for (atom=ad->node_atom_h; atom>=0; atom=ad->node_atom_n[atom]) {
    ad->f[atom].x += lc->f[atom].x;
    ad->f[atom].y += lc->f[atom].y;
    ad->f[atom].z += lc->f[atom].z;
  }

  free_buf(send_buf);
  free_buf(recv_buf);
}

/* number of doubles transferred per rigid molecule:
   rg (3) + vg (3) + q (4) + l (3) */
#define N_VAL_RMOL 13

/*
 * Distribute rigid-molecule data (rg, vg, q, l) between PEs.
 *
 * Same ring exchange as SDMD_dist_x(), but the entries of
 * tr_list[].send_x / recv_x are used as indices into md->mol and
 * N_VAL_RMOL doubles are transferred per entry.
 */
void SDMD_dist_rmol(LINKED_CELL *lc, RMOL_DATA *md)
{
  MPI_Request rreq, sreq;
  MPI_Status stat;
  double *send_buf, *recv_buf, *rec;
  int step, k, imol, send_to, recv_from, n_out, n_in;

  send_buf = get_double_buf(md->n_mol*N_VAL_RMOL);
  recv_buf = get_double_buf(md->n_mol*N_VAL_RMOL);

  for (step=1; step<mpi.n_pe; step++) {
    send_to   = (mpi.rank+step)%mpi.n_pe;
    recv_from = (mpi.rank-step+mpi.n_pe)%mpi.n_pe;
    n_out = lc->tr_list[send_to].n_send_x;
    n_in  = lc->tr_list[recv_from].n_recv_x;

    /* pack one record of N_VAL_RMOL doubles per molecule */
    for (k=0; k<n_out; k++) {
      imol = lc->tr_list[send_to].send_x[k];
      rec  = &send_buf[k*N_VAL_RMOL];

      rec[0]  = md->mol[imol].rg.x;
      rec[1]  = md->mol[imol].rg.y;
      rec[2]  = md->mol[imol].rg.z;
      rec[3]  = md->mol[imol].vg.x;
      rec[4]  = md->mol[imol].vg.y;
      rec[5]  = md->mol[imol].vg.z;

      rec[6]  = md->mol[imol].q.qw;
      rec[7]  = md->mol[imol].q.qx;
      rec[8]  = md->mol[imol].q.qy;
      rec[9]  = md->mol[imol].q.qz;

      rec[10] = md->mol[imol].l.x;
      rec[11] = md->mol[imol].l.y;
      rec[12] = md->mol[imol].l.z;
    }

    if (n_in) {
      MPI_Irecv(recv_buf, n_in*N_VAL_RMOL,
                MPI_DOUBLE, recv_from, SDMD_TAG_DIST_RMOL, mpi.comm, &rreq);
    }
    if (n_out) {
      MPI_Isend(send_buf, n_out*N_VAL_RMOL,
                MPI_DOUBLE, send_to, SDMD_TAG_DIST_RMOL, mpi.comm, &sreq);
    }
    if (n_in) {
      SDMD_MPI_Wait(lc, &rreq, &stat);
    }

    /* unpack in the same field order */
    for (k=0; k<n_in; k++) {
      imol = lc->tr_list[recv_from].recv_x[k];
      rec  = &recv_buf[k*N_VAL_RMOL];

      md->mol[imol].rg.x = rec[0];
      md->mol[imol].rg.y = rec[1];
      md->mol[imol].rg.z = rec[2];
      md->mol[imol].vg.x = rec[3];
      md->mol[imol].vg.y = rec[4];
      md->mol[imol].vg.z = rec[5];

      md->mol[imol].q.qw = rec[6];
      md->mol[imol].q.qx = rec[7];
      md->mol[imol].q.qy = rec[8];
      md->mol[imol].q.qz = rec[9];

      md->mol[imol].l.x  = rec[10];
      md->mol[imol].l.y  = rec[11];
      md->mol[imol].l.z  = rec[12];
    }

    if (n_out) {
      SDMD_MPI_Wait(lc, &sreq, &stat);
    }
  }

  free_buf(send_buf);
  free_buf(recv_buf);
}

/*
 * MPI_Wait() wrapper that adds the wall-clock time spent waiting
 * (measured with MPI_Wtime) to lc->idle_time.
 */
void SDMD_MPI_Wait(LINKED_CELL *lc, MPI_Request *req, MPI_Status *stat)
{
  double t_begin;

  t_begin = MPI_Wtime();
  MPI_Wait(req, stat);
  lc->idle_time += MPI_Wtime() - t_begin;
}

/*
 * Gather the coordinates of all atoms on the master PE.
 *
 * Every non-master PE walks its own cells (node_head / node_next chain),
 * packs the coordinates of each non-child atom followed by the atoms on
 * its child_list chain (for ATOM_PARENT atoms) and sends the buffer to
 * mpi.master_pe.  The master receives one buffer per PE and unpacks it by
 * walking, in ascending cell index, the cells assigned to that PE in the
 * same atom order.
 */
void SDMD_gather_x(LINKED_CELL *lc, ATOM_DATA *ad)
{
  MPI_Status stat;
  double *buf;
  int src, icell, iatom, child, pos;

  if (!mpi.master) {
    /* sender side */
    buf = get_double_buf(ad->natom*3);
    pos = 0;

    for (icell=lc->node_head; icell>=0; icell=lc->cell[icell].node_next) {
      for (iatom=lc->cell[icell].head; iatom>=0; iatom=lc->next_atom[iatom]) {
        /* children are emitted together with their parent */
        if (ad->ex[iatom].flag & ATOM_CHILD) continue;

        buf[pos]   = ad->x[iatom].x;
        buf[pos+1] = ad->x[iatom].y;
        buf[pos+2] = ad->x[iatom].z;
        pos += 3;

        if (!(ad->ex[iatom].flag & ATOM_PARENT)) continue;

        for (child=ad->ex[iatom].child_list; child>=0;
             child=ad->ex[child].child_list) {
          buf[pos]   = ad->x[child].x;
          buf[pos+1] = ad->x[child].y;
          buf[pos+2] = ad->x[child].z;
          pos += 3;
        }
      }
    }

    MPI_Send(buf, pos, MPI_DOUBLE, mpi.master_pe, SDMD_TAG_GATHER_X, mpi.comm);
    free_buf(buf);
    return;
  }

  /* master side: one message from every other PE, in rank order */
  buf = get_double_buf(ad->natom*3);

  for (src=0; src<mpi.n_pe; src++) {
    if (src == mpi.rank) continue;

    MPI_Recv(buf, ad->natom*3, MPI_DOUBLE, src, SDMD_TAG_GATHER_X, mpi.comm, &stat);
    pos = 0;

    for (icell=0; icell<lc->n_cell; icell++) {
      if (lc->cell[icell].pe != src) continue;

      for (iatom=lc->cell[icell].head; iatom>=0; iatom=lc->next_atom[iatom]) {
        if (ad->ex[iatom].flag & ATOM_CHILD) continue;

        ad->x[iatom].x = buf[pos];
        ad->x[iatom].y = buf[pos+1];
        ad->x[iatom].z = buf[pos+2];
        pos += 3;

        if (!(ad->ex[iatom].flag & ATOM_PARENT)) continue;

        for (child=ad->ex[iatom].child_list; child>=0;
             child=ad->ex[child].child_list) {
          ad->x[child].x = buf[pos];
          ad->x[child].y = buf[pos+1];
          ad->x[child].z = buf[pos+2];
          pos += 3;
        }
      }
    }
  }

  free_buf(buf);
}

/*
 * Gather atom velocities on the master PE.  ad->x is temporarily
 * redirected to ad->v so that SDMD_gather_x() can be reused, and is
 * restored afterwards.
 */
void SDMD_gather_v(LINKED_CELL *lc, ATOM_DATA *ad)
{
  VEC *saved_x = ad->x;

  ad->x = ad->v;
  SDMD_gather_x(lc, ad);
  ad->x = saved_x;
}

/*
 * Gather atom forces on the master PE.
 *
 * Works like SDMD_gather_v(): ad->x is temporarily redirected to ad->f
 * so that SDMD_gather_x() packs and unpacks the force array, and the
 * original coordinate pointer is restored afterwards.
 */
void SDMD_gather_f(LINKED_CELL *lc, ATOM_DATA *ad)
{
  VEC *tmp;
  /* Save the coordinate pointer.  (This used to save ad->f, so ad->x was
     left pointing at the force array after the call.) */
  tmp = ad->x;
  ad->x = ad->f;
  SDMD_gather_x(lc, ad);
  ad->x = tmp;
}

/*
 * Gather coordinates and velocities on the master PE.  If rigid
 * molecules are present (md->n_mol > 0) their data is gathered as well
 * and, on the master only, RMOL_DATA_mol_to_room_all() is applied.
 */
void SDMD_gather_all(LINKED_CELL *lc, ATOM_DATA *ad, RMOL_DATA *md)
{
  SDMD_gather_x(lc, ad);
  SDMD_gather_v(lc, ad);

  if (md->n_mol <= 0)
    return;

  RMOL_DATA_gather(md, lc, ad);
  if (mpi.master) {
    RMOL_DATA_mol_to_room_all(md, ad);
  }
}

#if 0
/* nonbond list */
/*
 * (Currently compiled out by the enclosing #if 0.)
 *
 * Make sure the nonbonded pair list can hold the pairs of the current
 * configuration: the pairs are counted with SDMD_make_nonbond_list() in
 * count-only mode and, if the present allocation is smaller than 1.2
 * times that count, the list is (re)allocated to that size.
 */
void SDMD_alloc_nonbond_list(LINKED_CELL *lc, NONBOND_LIST *nl,
			     ATOM_DATA *ad, BOUNDARY *bc)
{
  char *func="SDMD_alloc_nonbond_list";
  int n_pairs, n_wanted;

  n_pairs = SDMD_make_nonbond_list(lc, nl, ad, bc, 1);
  n_wanted = n_pairs * 1.2;   /* 20% head room, truncated to int */

  if (nl->n_alloc >= n_wanted)
    return;

#ifdef J_LIST
  if (nl->n_alloc == 0)
    nl->j_list = emalloc(func, sizeof(int)*n_wanted);
  else
    nl->j_list = erealloc(func, nl->j_list, sizeof(int)*n_wanted);
#else
  if (nl->n_alloc == 0)
    nl->ij_list = emalloc(func, sizeof(int)*2*n_wanted);
  else
    nl->ij_list = erealloc(func, nl->ij_list, sizeof(int)*2*n_wanted);
#endif

  nl->n_alloc = n_wanted;
  lprintf("NONBOND PAIR LIST: N_LIST %d, ALLOCATED %d\n", n_pairs, nl->n_alloc);
}

/*
 * (Currently compiled out by the enclosing #if 0.)
 *
 * Build, or only count, the nonbonded atom pair list for the cell pairs
 * on this PE's pair chain (lc->pair_head / cell_pair[].next).
 *
 *   only_count != 0 : nothing is stored, only the number of list
 *                     entries is determined.
 *   only_count == 0 : entries are stored in nl->j_list (J_LIST) or
 *                     nl->ij_list.
 *
 * Returns the number of list entries, or -1 if the allocated list
 * (nl->n_alloc) is too small while storing.  Note that nl->n_list is
 * not updated in the -1 case.
 */
int SDMD_make_nonbond_list(LINKED_CELL *lc, NONBOND_LIST *nl,
			   ATOM_DATA *ad, BOUNDARY *bc, int only_count)
{
  int nlist;
  int i,j;
  double dx, dy, dz;
  double offset_x, offset_y, offset_z;
  int icp, icell, jcell;
  double cutoff2;
  int n_atom, npart, ipart, mod, iatom, jatom, min_iatom, max_iatom;
  double cp_start_time;
  double xi,yi,zi;
#ifdef J_LIST
  int *pcount;
#endif  

  /* squared list cutoff, compared against squared distances below */
  cutoff2 = nl->rl_list * nl->rl_list;
  
  /* ex[].id is used as a marker: ex[j].id == i means that j is on the
     excluded-atom list of the atom i currently being processed */
  for (i=0;i<ad->natom;i++) {
    ad->ex[i].id = -1;
  }

  nlist=0;
  /*
  for (icp = 0; icp < lc->n_cell_pair; icp++) {
    if (lc->cell_pair[icp].pe != mpi.rank) continue;
  */
  for (icp=lc->pair_head;icp>=0;icp=lc->cell_pair[icp].next) {
    
    /* the time spent per cell pair is accumulated in cell_pair[].time */
    cp_start_time = MPI_Wtime();

    icell = lc->cell_pair[icp].i;
    jcell = lc->cell_pair[icp].j;
    /* offset vector of this cell pair, added to the i-atom position */
    offset_x = bc->offset_v[lc->cell_pair[icp].offset].x;
    offset_y = bc->offset_v[lc->cell_pair[icp].offset].y;
    offset_z = bc->offset_v[lc->cell_pair[icp].offset].z;
    lc->cell_pair[icp].alist_start=nlist;

    /* for partition */
    /* The atoms of icell are split into npart contiguous slices; this
       cell pair only handles slice ipart, i.e. the positions
       min_iatom..max_iatom in the cell's atom chain.  The first
       (n_atom % npart) slices hold one atom more than the others. */
    n_atom = lc->cell[icell].n_atom;
    npart = lc->cell_pair[icp].npart;
    ipart = lc->cell_pair[icp].ipart;
    mod = n_atom % npart;
    if (mod <= ipart)
      min_iatom = n_atom / npart * ipart + mod;
    else
      min_iatom = (n_atom / npart + 1 ) * ipart;
    if (mod <= ipart+1)
      max_iatom = n_atom / npart * (ipart+1) + mod - 1;
    else
      max_iatom = (n_atom / npart + 1) * (ipart+1) - 1;
    /* end of partition */

    for (i=lc->cell[icell].head, iatom=0; i>=0; i=lc->next_atom[i], iatom++) {
      if (iatom < min_iatom) continue;
      if (iatom > max_iatom) break;

#ifdef J_LIST
      /* J_LIST layout: each i atom contributes a two-entry header (its
	 folded id and the number of partners, filled in through pcount)
	 followed by the folded ids of its partners */
      if (!only_count) {
	if (nlist >= nl->n_alloc) return -1;
	/* nl->j_list[nlist++] = i; */
	nl->j_list[nlist++] = lc->cell[icell].fold_id + iatom;
	pcount = &(nl->j_list[nlist++]);
	*pcount = 0;
      } else {
	nlist += 2;
      }
#endif
      xi = ad->fold_x[i].x + offset_x;
      yi = ad->fold_x[i].y + offset_y;
      zi = ad->fold_x[i].z + offset_z;
      
      /* mark the excluded atoms of i */
      for (j=0;j<ad->ex[i].n_exatom;j++) {
	ad->ex[ad->ex[i].exatom[j]].id = i;
      }
      
      for (j=lc->cell[jcell].head, jatom = 0; j>=0; j=lc->next_atom[j], jatom++) {
	/* within one cell every pair is taken once (i<j only) */
	if (icell == jcell && i>=j) continue;
	/* j is excluded from i */
	if (ad->ex[j].id == i) continue;

	/* i, j are a pair of nonbonded atoms  */
	
	dx = xi - ad->fold_x[j].x;
	dy = yi - ad->fold_x[j].y;
	dz = zi - ad->fold_x[j].z;

	if (Length2(dx, dy, dz) <= cutoff2) {
	  if (!only_count) {
	    if (nlist >= nl->n_alloc) return -1;

#ifdef J_LIST
	    /* nl->j_list[nlist] = j; */
	    nl->j_list[nlist] = lc->cell[jcell].fold_id + jatom;
	    (*pcount)++;
#else
	    nl->ij_list[nlist][0] = i;
	    nl->ij_list[nlist][1] = j;
#endif	    

#ifdef TR_LIST_ATOM_REQ
	    /*
	    ad->ex[i].flag |= ATOM_REQ_NB;
	    ad->ex[j].flag |= ATOM_REQ_NB;
	    */
	    /* both atoms of a stored pair are flagged as required */
	    lc->atom_req[i] |= ATOM_REQ_NB;
	    lc->atom_req[j] |= ATOM_REQ_NB;
#endif
	  }
	  nlist++;
	}
      }
    }
    /* list entries alist_start..alist_end belong to this cell pair */
    lc->cell_pair[icp].alist_end=nlist-1;

    /*
    lprintf("(%d %d %d)-(%d %d %d) %d %f %f %f\n",
	    lc->cell[icell].ix,lc->cell[icell].iy,lc->cell[icell].iz,
	    lc->cell[jcell].ix,lc->cell[jcell].iy,lc->cell[jcell].iz,
	    lc->cell_pair[icp].alist_end-lc->cell_pair[icp].alist_start+1,
	    offset_x, offset_y, offset_z);
    */
    
    lc->cell_pair[icp].time += MPI_Wtime() - cp_start_time;
  }
  nl->n_list = nlist;
  /* plprintf("PAIR LIST = %d\n", nl->n_list); */
  /*
  {
    int sum;
    MPI_Reduce(&nlist, &sum, 1, MPI_INT, MPI_SUM, 0, mpi.comm);
    if (mpi.rank==0) {
      lprintf("list: %d\n",sum);
    }
  }
  */
  return nlist;
}
#endif

/* resetup for NPT simulation */
/*
 * Check whether the linked-cell setup has to be redone.
 *
 * For each of the three directions the value
 * reclen[i]/n_grid[i]*neighbor[i] is compared with the nonbonded list
 * cutoff nl->rl_list.  If it is smaller in any direction, the three
 * values and the cutoff are logged and 1 is returned; otherwise 0.
 */
int SDMD_check_resetup(MD_SYSTEM *sys)
{
  NONBOND_LIST *nl = &sys->nonbond;
  BOUNDARY *bc = &sys->boundary;
  LINKED_CELL *lc = &sys->linked_cell;
  int axis;
  int resetup = 0;

  for (axis=0; axis<3; axis++) {
    if (bc->reclen[axis]/lc->n_grid[axis]*lc->neighbor[axis] < nl->rl_list) {
      resetup = 1;
    }
  }

  if (!resetup)
    return 0;

  lprintf("RESETUP is required: Reciprocal Length (%f,%f,%f)\n",
          bc->reclen[0]/lc->n_grid[0]*lc->neighbor[0],
          bc->reclen[1]/lc->n_grid[1]*lc->neighbor[1],
          bc->reclen[2]/lc->n_grid[2]*lc->neighbor[2]);
  lprintf("                     NB List Cutoff    (%f)\n",nl->rl_list);

  return resetup;
}

/*
 * Redo the spatial-decomposition setup for an already running system
 * (see SDMD_check_resetup()).
 *
 * All data is first gathered on the master and synchronized with
 * sync_xv(); then the linked cells, the cell-to-PE and cell-pair-to-PE
 * assignments, the load balancer, the transfer lists and the nonbonded
 * pair list are rebuilt in the order given by the step comments below.
 *
 * NOTE(review): func is unused in this function and still carries the
 * name "SDMD_setup".
 */
void SDMD_resetup(MD_SYSTEM *sys)
{
  char *func="SDMD_setup";
  LINKED_CELL *lc;
  NONBOND_LIST *nl;
  ATOM_DATA *ad;
  BOUNDARY *bc;
  BOND_DATA *bd;
  ANGLE_DATA *and;
  DIHEDRAL_DATA *dd;
  EWALD *ew;
  RMOL_DATA *md;
  int i;

  lc = &sys->linked_cell; nl = &sys->nonbond; ad = &sys->atom; bc = &sys->boundary;
  bd = &sys->bond; and = &sys->angle; dd = &sys->dihed;
  ew = &sys->ewald;
  md = &sys->rigid_mol;
  
  /* step 0. misc. setups.. */
  /* collect the current state on the master before the cells are rebuilt */
  SDMD_gather_all(lc, ad, md);
  sync_xv(sys);
  
  /* reset the timing counters */
  for (i=0;i<SDMD_N_TIME;i++) lc->time[i]=0.0;
  
  /* step 1. determination of cell size */
  LINKED_CELL_setup(lc, nl, ad, bc);
  
  /* step 2. assign atoms to cell */
  LINKED_CELL_assign_atom(lc, ad, bc);

  /* step 3. assign cell to PE */
  SDMD_assign_cells_to_PE(lc);
  SDMD_assign_node_atom(lc, ad);
  SDMD_assign_node_rmol(md, ad);
  SDMD_assign_node_rattle(&sys->rattle, ad);

  /*SDMD_EW_assign_tr(ew, lc, nl, bc);*/

  /* step 4. assign bond, angle, dihedral to cell or PE */
  SDMD_clear_cell_req(lc,ad);
  SDMD_assign_internal_to_PE(lc, ad, bd, and, dd, ew);

  /* step 5. assign cellpair to PE in a simple way */
  LINKED_CELL_make_cell_pairlist(lc, bc);
  SDMD_assign_cellpair_to_PE(lc);
  
  /* step 6. load balancer setup: n_cell and n_cell_pair are required */
  SDMD_load_balancer_setup(lc, nl);

  /* restart the load balancer step counter if load balancing is enabled */
  if (lc->load_balancer_step > 0)
    lc->load_balancer_exec_step = 1;
  else
    lc->load_balancer_exec_step = 0;

  /* step 7. construct cell data transfer list */
  SDMD_make_tr_list_by_cell_req(lc, ad);

  /* step 8. construct atom pair list */
  LINKED_CELL_calc_tr_x(lc, ad, bc);

  /*
  if (SDMD_make_nonbond_list(lc, nl, ad, bc, 0) < 0) {
    SDMD_alloc_nonbond_list(lc, nl, ad, bc);
    SDMD_make_nonbond_list(lc, nl, ad, bc, 0);
  }
  */
  /* a negative return value means the list did not fit: reallocate and
     build it again */
  if (LINKED_CELL_make_nonbond_list(lc, nl, ad, bc, 0) < 0) {
    LINKED_CELL_alloc_nonbond_list(lc, nl, ad, bc);
    LINKED_CELL_make_nonbond_list(lc, nl, ad, bc, 0);
  }

  /* step 8.1. remake tr_list */
  
#ifdef TR_LIST_ATOM_REQ
  SDMD_make_tr_list_by_atom_req(lc, ad);
#endif

  /*
  {
    static int i=0;
    if (i==1)
      marble_exit(1);
    i++;
    }*/
}

#else
/* Only definition compiled when MPI_SDMD is not defined; presumably a
   placeholder so that the translation unit is not empty -- TODO confirm. */
static int dummy;
#endif  /* MPI_SDMD */
