/*
 * 
 * This source code is part of 
 *   MARBLE (MoleculAR simulation package for BiomoLEcules)
 * 
 * Written by Mitsunori Ikeguchi
 * Copyright (c) 2012 Yokohama City University
 *  
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 */

#include <stdio.h>
#include <math.h>
#include <stdlib.h>

#include "misc.h"
#include "atom.h"
#include "boundary.h"
#include "linked_cell.h"
#include "nonbond.h"

#ifdef _OPENMP /**/
#include "omp.h"
#endif

/* for debug 
LINKED_CELL *_lc;
*/

/*
 * Put a LINKED_CELL structure into its default state.
 *
 * The cell and next_atom pointers are cleared, the allocation counters are
 * zeroed, the grid is set to 1x1x1 (half grid 0), no explicit grid is
 * requested, and the neighbor range is 2 on every axis.  With MPI_SDMD the
 * additional load-balancing / transfer fields receive their defaults.
 */
void LINKED_CELL_init(LINKED_CELL *lc)
{
  int axis;

  /* nothing is allocated yet */
  lc->cell = NULL;
  lc->next_atom = NULL;
  lc->n_alloc_cell = 0;
  lc->n_alloc_cell_pair = 0;

  /* per-axis defaults */
  lc->request_grid = 0;
  for (axis = 0; axis < 3; axis++) {
    lc->n_grid[axis]   = 1;
    lc->n_grid_h[axis] = 0;
    lc->neighbor[axis] = 2;
  }

  /* debug hook (disabled): _lc = lc; */

#ifdef MPI_SDMD
  /* pointers owned by the SDMD code */
  lc->ex_tr_atom = NULL;
  lc->cell_pe = NULL;

  /* load balancing defaults */
  lc->n_part = 1;
  lc->start_overload = 1.02;
  lc->max_overload = 1.2;
  lc->check_time_overflow = 1;
  lc->load_balancer_step = 10000;
  lc->load_balancer_flag = 0;

  /* transfer defaults */
  lc->tr_list_factor = 1;
  lc->tr_mode = TR_MODE_MP;
  lc->tr_dist_xf_flag = 0;

  lc->request_npx = 0;
#endif
}

/*
 * Choose the cell grid, validate it against the boundary type, and allocate
 * the linked-cell arrays and the per-thread work areas.
 *
 * If lc->request_grid is set, the grid already stored in lc->n_grid is used
 * and the neighbor range of each axis is derived from nl->cell_div and the
 * resulting cell width.  Otherwise the grid is derived from the current
 * neighbor range, with at least one cell per axis.  A 1x1x1 grid forces the
 * neighbor range to 1.  Under PERIODIC_BOUNDARY the neighbor range must be 2
 * and every axis must have more than one cell; otherwise marble_exit(1) is
 * called.
 */
void LINKED_CELL_setup(LINKED_CELL *lc, NONBOND_LIST *nl,
		       ATOM_DATA *ad,   BOUNDARY *bc)
{
  char *func="LINKED_CELL_setup";
  int n_grid[3];
  int axis, it;
  double width;

  for (axis = 0; axis < 3; axis++) {
    if (lc->request_grid) {
      /* grid given: compute how many cells cell_div spans */
      n_grid[axis] = lc->n_grid[axis];
      width = bc->reclen[axis] / n_grid[axis];
      lc->neighbor[axis] = 1 + (int) (nl->cell_div / width);
    } else {
      /* neighbor range given: compute how many cells fit on this axis */
      width = nl->cell_div / lc->neighbor[axis];
      n_grid[axis] = (int) (bc->reclen[axis] / width);
      if (n_grid[axis] < 1)
	n_grid[axis] = 1;
    }
  }

  /* a single cell only needs itself as neighbor */
  if (n_grid[0] == 1 && n_grid[1] == 1 && n_grid[2] == 1) {
    for (axis = 0; axis < 3; axis++)
      lc->neighbor[axis] = 1;
  }

  lprintf("Linked_Cell: Number of Cells      = %d %d %d\n", n_grid[0], n_grid[1], n_grid[2]);
  lprintf("Linked_Cell: Number of Neighbors  = %d %d %d\n", lc->neighbor[0], lc->neighbor[1], lc->neighbor[2]);

  if (bc->type == PERIODIC_BOUNDARY) {
    if (!(lc->neighbor[0] == 2 && lc->neighbor[1] == 2 && lc->neighbor[2] == 2)) {
      lprintf("Currently, number of neighbors must be 2.\n");
      marble_exit(1);
    }
    if (!(n_grid[0] > 1 && n_grid[1] > 1 && n_grid[2] > 1)) {
      lprintf("ERROR: Minimum image convention requires that n_grid is more than 1.\n");
      marble_exit(1);
    }
  }

  LINKED_CELL_alloc(lc, n_grid, ad->natom);

  ATOM_DATA_allocate_for_periodic_boundary(ad);

  BOUNDARY_set_offset_v(bc);
  /* the cell pair list is not built here (LINKED_CELL_make_cell_pairlist) */
  nl->lc = lc;

  /* one work area per thread: a force buffer and a growable j list */
  lc->td = emalloc(func, sizeof(THREAD_DATA)*mpi.n_threads);
  for (it = 0; it < mpi.n_threads; it++) {
    lc->td[it].fold_f = emalloc(func, sizeof(VEC)*ad->natom);
    RIA_init(&lc->td[it].j_list, 1000);
  }
}


/*
 * Recompute the neighbor range of every axis from the list cutoff:
 * neighbor = (int)(rl_list / cell width) + 1, where the cell width is
 * bc->reclen[axis] divided by the current number of cells on that axis.
 */
void LINKED_CELL_set_neighbor(LINKED_CELL *lc, BOUNDARY *bc, NONBOND_LIST *nl)
{
  int axis;

  for (axis = 0; axis < 3; ++axis) {
    lc->neighbor[axis] =
      1 + (int) (nl->rl_list / (bc->reclen[axis]/lc->n_grid[axis]));
  }
}

/*
 * Store the grid dimensions in lc and make sure the per-cell and per-atom
 * arrays are available.
 *
 *   lc      linked-cell structure to fill in
 *   n_grid  number of cells along each of the three axes
 *   n_atom  number of atoms; used to size the per-atom arrays
 *
 * The cell array is allocated on first use and only grown afterwards
 * (tracked by lc->n_alloc_cell).  next_atom / prev_atom are allocated once,
 * when lc->next_atom is still NULL.  Every cell receives its (ix, iy, iz)
 * grid coordinates, with iz varying fastest.
 */
void LINKED_CELL_alloc(LINKED_CELL *lc, int n_grid[3], int n_atom)
{
  int i;
  int ix, iy, iz;
  char *func = "LINKED_CELL_alloc";

  for (i=0;i<3;i++) {
    lc->n_grid[i] = n_grid[i];
    lc->n_grid_h[i] = n_grid[i]/2;
  }

  lc->n_cell = lc->n_grid[0]*lc->n_grid[1]*lc->n_grid[2];

  /* allocate the cell array the first time, grow it when the grid got larger */
  if (lc->n_alloc_cell == 0) {
    lc->n_alloc_cell = lc->n_cell;
    lc->cell = emalloc(func, sizeof(CELL)*lc->n_cell);
  } else if (lc->n_alloc_cell < lc->n_cell) {
    lc->n_alloc_cell = lc->n_cell;
    lc->cell = erealloc(func,lc->cell,sizeof(CELL)*lc->n_cell);
  }

  /* per-atom forward/backward links; sized once and not resized afterwards */
  if (lc->next_atom == NULL) {
    lc->next_atom  = emalloc(func, sizeof(int)*n_atom);
    /* for SDMD */
    lc->prev_atom  = emalloc(func, sizeof(int)*n_atom);
  }

  /* number the cells in x-major order: index = (ix*n_grid[1] + iy)*n_grid[2] + iz */
  i=0;
  for (ix = 0; ix < n_grid[0]; ix++) {
    for (iy = 0; iy < n_grid[1]; iy++) {
      for (iz = 0; iz < n_grid[2]; iz++) {
	lc->cell[i].ix = ix;
	lc->cell[i].iy = iy;
	lc->cell[i].iz = iz;
	i++;
      }
    }
  }

  /* NOTE(review): this buffer is allocated on every call; if this function is
     called more than once and nothing else releases atom_cell_req, the
     previous buffer is leaked -- confirm against the callers. */
  lc->atom_cell_req = emalloc(func, sizeof(int)*n_atom);
  lc->n_atom_cell_req = 0;
}

/*
 * Release the cell array and the per-atom link arrays of lc and return the
 * structure to a state from which LINKED_CELL_alloc can allocate again.
 *
 * lc->n_alloc_cell is reset together with lc->cell: LINKED_CELL_alloc only
 * allocates the cell array when n_alloc_cell is 0 or too small, so leaving a
 * stale count behind would make a later allocation of an equal or smaller
 * grid write through a NULL lc->cell.
 */
void LINKED_CELL_free(LINKED_CELL *lc)
{
  /* prev_atom is allocated together with next_atom in LINKED_CELL_alloc and
     is not cleared by LINKED_CELL_init, so it is only released when
     next_atom shows that the pair has been allocated. */
  if (lc->next_atom != NULL) {
    free(lc->prev_atom);
    lc->prev_atom = NULL;
  }

  free(lc->cell);
  free(lc->next_atom);

  lc->cell = NULL;
  lc->next_atom = NULL;
  lc->n_alloc_cell = 0;
}

/*
 * Sort all atoms into cells and rebuild the per-cell doubly linked lists.
 *
 * Each atom position is taken relative to bc->min, converted to fractional
 * coordinates with bc->recip, and mapped to a grid cell.  Under
 * PERIODIC_BOUNDARY the fractional coordinates are wrapped with floor();
 * otherwise they are clamped to [0, 1].  The atom is appended to the tail of
 * its cell's list.  With HG_MODE, atoms flagged ATOM_CHILD are skipped in the
 * main loop and instead appended right after their parent, in the parent's
 * cell.  Finally lc->max_n_atom is set to the largest cell population.
 */
void LINKED_CELL_assign_atom(LINKED_CELL *lc, ATOM_DATA *ad, BOUNDARY *bc)
{
  int ic, ia, tail;
  int gx, gy, gz, icell;
  int periodic;
#ifdef HG_MODE
  int child;
#endif
  VEC rel, frac;

  /* start from empty cells */
  for (ic = 0; ic < lc->n_cell; ic++) {
    lc->cell[ic].n_atom = 0;
    lc->cell[ic].tail = -1;
    lc->cell[ic].head = -1;
  }

  periodic = (bc->type == PERIODIC_BOUNDARY);

  for (ia = 0; ia < ad->natom; ia++) {
#ifdef HG_MODE
    if (ad->ex[ia].flag & ATOM_CHILD) continue;
#endif
    rel.x = ad->x[ia].x - bc->min[0];
    rel.y = ad->x[ia].y - bc->min[1];
    rel.z = ad->x[ia].z - bc->min[2];
    frac.x = VEC_MUL_MAT_X(rel,bc->recip);
    frac.y = VEC_MUL_MAT_Y(rel,bc->recip);
    frac.z = VEC_MUL_MAT_Z(rel,bc->recip);

    if (periodic) {
      /* wrap into the unit cell */
      frac.x -= floor(frac.x);
      frac.y -= floor(frac.y);
      frac.z -= floor(frac.z);
    } else {
      /* clamp to the box */
      if (frac.x > 1.0) frac.x = 1.0;
      else if (frac.x < 0.0) frac.x = 0.0;
      if (frac.y > 1.0) frac.y = 1.0;
      else if (frac.y < 0.0) frac.y = 0.0;
      if (frac.z > 1.0) frac.z = 1.0;
      else if (frac.z < 0.0) frac.z = 0.0;
    }

    gx = (int) (frac.x * lc->n_grid[0]);
    gy = (int) (frac.y * lc->n_grid[1]);
    gz = (int) (frac.z * lc->n_grid[2]);

    /* a fraction of exactly 1 (rounding, or clamping without periodic
       boundaries) would land one past the last cell */
    if (gx == lc->n_grid[0]) gx = lc->n_grid[0] - 1;
    if (gy == lc->n_grid[1]) gy = lc->n_grid[1] - 1;
    if (gz == lc->n_grid[2]) gz = lc->n_grid[2] - 1;

    icell=CELL_INDEX(lc,gx,gy,gz);

    /* append atom ia to the tail of the cell's list */
    tail = lc->cell[icell].tail;
    lc->cell[icell].n_atom++;
    lc->prev_atom[ia] = tail;
    lc->next_atom[ia] = -1;
    lc->cell[icell].tail = ia;
    if (tail >= 0)
      lc->next_atom[tail] = ia;
    if (lc->cell[icell].head == -1)
      lc->cell[icell].head = ia;
#ifdef MPI_SDMD
    lc->atom_cell[ia] = icell;
#endif
#ifdef HG_MODE
    /* every child chained to this parent goes into the parent's cell;
       the head is already set by the parent, so it is not touched here */
    for (child = ad->ex[ia].child_list; child >= 0; child = ad->ex[child].child_list) {
      tail = lc->cell[icell].tail;
      lc->cell[icell].n_atom++;
      lc->prev_atom[child] = tail;
      lc->next_atom[child] = -1;
      lc->cell[icell].tail = child;
      if (tail >= 0)
	lc->next_atom[tail] = child;
#ifdef MPI_SDMD
      lc->atom_cell[child] = icell;
#endif
    }
#endif
  }

  /* largest number of atoms found in any single cell */
  lc->max_n_atom = 0;
  for (ic = 0; ic < lc->n_cell; ic++) {
    if (lc->max_n_atom < lc->cell[ic].n_atom)
      lc->max_n_atom = lc->cell[ic].n_atom;
  }
}

/*
 * Build the cell-ordered ("fold") atom arrays.
 *
 * Cells are visited in order -- all cells, or with MPI_SDMD only the cells
 * chained through lc->req_head / cell[].req_next -- and a running counter k
 * is advanced once for every atom written.  cell[j].fold_id records the value
 * of k at which the atoms of cell j start, and for each atom ad->fold_x[k],
 * ad->fold_id[k] and ad->fold_q[k] receive its position, vdw_type and charge.
 *
 * Under PERIODIC_BOUNDARY the position is converted to fractional coordinates
 * with bc->recip, the integer part (floor) is stored in ad->tr_x[i], and the
 * remaining fraction is converted back with bc->boxv to give the folded
 * position.  For other boundary types tr_x[i] is zero and the position is
 * copied unchanged.
 *
 * With HG_MODE, atoms flagged ATOM_CHILD are skipped by the main loop and are
 * instead written right after their parent; in the periodic case they are
 * shifted by the parent's tr_x rather than folded on their own.
 */
void LINKED_CELL_calc_tr_x(LINKED_CELL *lc, ATOM_DATA *ad, BOUNDARY *bc)
{
  int i, j, ic;
  VEC frac;
  int k;

  /* k: next free slot in the fold_* arrays */
  k = 0;
  if (bc->type == PERIODIC_BOUNDARY) {
    /* the loop header depends on MPI_SDMD; the loop body below is shared */
#ifdef MPI_SDMD
    /*
    for (j=0;j<lc->n_cell;j++) {
      if (!lc->cell[j].req) continue;
    */
    for (j=lc->req_head;j>=0;j=lc->cell[j].req_next) {
#else	
    for (j=0;j<lc->n_cell;j++) {
#endif
      /* first fold index used by the atoms of this cell */
      lc->cell[j].fold_id = k;
      for (i=lc->cell[j].head; i>=0; i=lc->next_atom[i]) {
#ifdef HG_MODE
	/* children are handled together with their parent below */
	if (ad->ex[i].flag & ATOM_CHILD) continue;
#endif
	frac.x = VEC_MUL_MAT_X(ad->x[i],bc->recip);
	frac.y = VEC_MUL_MAT_Y(ad->x[i],bc->recip);
	frac.z = VEC_MUL_MAT_Z(ad->x[i],bc->recip);
	
	/* integer number of box translations separating the atom from the unit cell */
	ad->tr_x[i].x = floor(frac.x);
	ad->tr_x[i].y = floor(frac.y);
	ad->tr_x[i].z = floor(frac.z);

	/* keep only the fractional part ... */
	frac.x -= ad->tr_x[i].x;
	frac.y -= ad->tr_x[i].y;
	frac.z -= ad->tr_x[i].z;

	/* ... and convert it back to a position with bc->boxv */
	ad->fold_x[k].x = VEC_MUL_MAT_X(frac,bc->boxv);
	ad->fold_x[k].y = VEC_MUL_MAT_Y(frac,bc->boxv);
	ad->fold_x[k].z = VEC_MUL_MAT_Z(frac,bc->boxv);
	  
	ad->fold_id[k]  = ad->vdw_type[i];
	ad->fold_q[k]   = ad->q[i];
	  
	k++;

#ifdef HG_MODE
    /* all child atoms connecting to the parent atom belong to the same cell. */
	/* children are shifted by the parent's translation tr_x[i], not their own */
	for (ic=ad->ex[i].child_list;ic>=0;ic=ad->ex[ic].child_list) {
	  ad->fold_x[k].x = ad->x[ic].x - VEC_MUL_MAT_X(ad->tr_x[i],bc->boxv);
	  ad->fold_x[k].y = ad->x[ic].y - VEC_MUL_MAT_Y(ad->tr_x[i],bc->boxv);
	  ad->fold_x[k].z = ad->x[ic].z - VEC_MUL_MAT_Z(ad->tr_x[i],bc->boxv);

	  ad->fold_id[k]  = ad->vdw_type[ic];
	  ad->fold_q[k]   = ad->q[ic];
	  k++;
	}
#endif
      }
    }
  } else {
    /* non-periodic case: same traversal, positions are copied without folding */
#ifdef MPI_SDMD
    /*
    for (j=0;j<lc->n_cell;j++) {
      if (!lc->cell[j].req) continue;
    */
    for (j=lc->req_head;j>=0;j=lc->cell[j].req_next) {
#else      
    for (j=0;j<lc->n_cell;j++) {
#endif
      lc->cell[j].fold_id = k;
      for (i=lc->cell[j].head; i>=0; i=lc->next_atom[i]) {
#ifdef HG_MODE
	if (ad->ex[i].flag & ATOM_CHILD) continue;
#endif
	/* no box translation without periodic boundaries */
	ad->tr_x[i].x = ad->tr_x[i].y = ad->tr_x[i].z = 0.0;
	
	ad->fold_x[k].x = ad->x[i].x;
	ad->fold_x[k].y = ad->x[i].y;
	ad->fold_x[k].z = ad->x[i].z;
	  
	ad->fold_id[k]  = ad->vdw_type[i];
	ad->fold_q[k]   = ad->q[i];
	  
	k++;

#ifdef HG_MODE
    /* all child atoms connecting to the parent atom belong to the same cell. */
	for (ic=ad->ex[i].child_list;ic>=0;ic=ad->ex[ic].child_list) {
	  ad->fold_x[k].x = ad->x[ic].x;
	  ad->fold_x[k].y = ad->x[ic].y;
	  ad->fold_x[k].z = ad->x[ic].z;

	  ad->fold_id[k]  = ad->vdw_type[ic];
	  ad->fold_q[k]   = ad->q[ic];
	  k++;
	}
#endif
      }
    }
  }
}

/*
 * Enumerate every pair of neighboring cells once and store it in
 * lc->cell_pair.
 *
 * For each cell, only the "upper half" of its neighborhood is visited (x from
 * ix upward; y below iy is skipped when x == ix; z below iz is skipped when
 * x == ix and y == iy), so a pair of cells is listed only once and the cell
 * is paired with itself.  Under PERIODIC_BOUNDARY, neighbor coordinates that
 * leave the grid are wrapped once and the direction of the wrap (-1, 0, +1
 * per axis) is encoded into cell_pair[].offset as a base-3 number; for other
 * boundary types such neighbors are skipped.
 *
 * The pair array is sized for the full half neighborhood of every cell and
 * grown with erealloc when needed.  With MPI_SDMD each pair is emitted
 * lc->n_part times (ipart = 0..npart-1) and the routine exits through
 * marble_exit(1) if the array would overflow.  On return lc->n_cell_pair is
 * the number of entries actually written.
 */
void LINKED_CELL_make_cell_pairlist(LINKED_CELL *lc, BOUNDARY *bc)
{
  int cx, cy, cz;                 /* grid coordinates of the first cell   */
  int sx, sy, sz;                 /* neighbor coordinates before wrapping */
  int wx, wy, wz;                 /* neighbor coordinates after wrapping  */
  int shift_x, shift_y, shift_z;  /* wrap direction: -1, 0 or +1          */
  int n_pair, icell, jcell, image;
  int shell, periodic;
#ifdef MPI_SDMD
  int ipart, npart;
#endif
  char *func = "LINKED_CELL_make_cell_pairlist";

  /* upper bound: half of the neighbor shell plus the cell itself, per cell */
  shell = (lc->neighbor[0]*2+1)*(lc->neighbor[1]*2+1)*(lc->neighbor[2]*2+1);
  lc->n_cell_pair = lc->n_cell*((shell-1)/2+1);

  if (lc->n_alloc_cell_pair == 0) {
    lc->n_alloc_cell_pair = lc->n_cell_pair;
    lc->cell_pair = emalloc(func,sizeof(CELL_PAIR)*lc->n_cell_pair);
  } else if (lc->n_cell_pair > lc->n_alloc_cell_pair) {
    lc->n_alloc_cell_pair = lc->n_cell_pair;
    lc->cell_pair = erealloc(func,lc->cell_pair,sizeof(CELL_PAIR)*lc->n_cell_pair);
  }

  periodic = (bc->type == PERIODIC_BOUNDARY);
  n_pair = 0;

  for (icell = 0; icell < lc->n_cell; icell++) {
    cx = lc->cell[icell].ix;
    cy = lc->cell[icell].iy;
    cz = lc->cell[icell].iz;

    for (sx = cx; sx <= cx+lc->neighbor[0]; sx++) {
      wx = sx;
      shift_x = 0;
      if (sx >= lc->n_grid[0]) {
	if (!periodic) continue;
	wx = sx - lc->n_grid[0];
	shift_x = -1;
      }

      for (sy = cy - lc->neighbor[1]; sy <= cy+lc->neighbor[1]; sy++) {
	/* same x layer: only y >= cy */
	if (sx == cx && sy < cy) continue;

	wy = sy;
	shift_y = 0;
	if (sy < 0) {
	  if (!periodic) continue;
	  wy = sy + lc->n_grid[1];
	  shift_y = 1;
	} else if (sy >= lc->n_grid[1]) {
	  if (!periodic) continue;
	  wy = sy - lc->n_grid[1];
	  shift_y = -1;
	}

	for (sz = cz - lc->neighbor[2]; sz <= cz+lc->neighbor[2]; sz++) {
	  /* same x layer and y row: only z >= cz */
	  if (sx == cx && sy == cy && sz < cz) continue;

	  wz = sz;
	  shift_z = 0;
	  if (sz < 0) {
	    if (!periodic) continue;
	    wz = sz + lc->n_grid[2];
	    shift_z = 1;
	  } else if (sz >= lc->n_grid[2]) {
	    if (!periodic) continue;
	    wz = sz - lc->n_grid[2];
	    shift_z = -1;
	  }

	  jcell = CELL_INDEX(lc,wx,wy,wz);
	  /* base-3 code of the three wrap directions, each shifted to 0..2 */
	  image = (shift_z+1)+3*(shift_y+1+3*(shift_x+1));

#ifdef MPI_SDMD
	  npart = lc->n_part;
	  for (ipart = 0; ipart < npart; ipart++) {
	    if (n_pair >= lc->n_cell_pair) {
	      lprintf("icp exceeded!!!\n");
	      marble_exit(1);
	    }
	    lc->cell_pair[n_pair].i = icell;
	    lc->cell_pair[n_pair].j = jcell;
	    lc->cell_pair[n_pair].offset = image;
	    lc->cell_pair[n_pair].ipart = ipart;
	    lc->cell_pair[n_pair].npart = npart;
	    n_pair++;
	  }
#else
	  lc->cell_pair[n_pair].i = icell;
	  lc->cell_pair[n_pair].j = jcell;
	  lc->cell_pair[n_pair].offset = image;
	  n_pair++;
#endif
	}
      }
    }
  }

  lprintf("icp=%d, n_cell_pair=%d\n",n_pair, lc->n_cell_pair);
  lc->n_cell_pair = n_pair;
}

/* for OMP */

/*
 * Count the nonbond list, distribute the requested cell pairs over the
 * threads, and size each thread's j list accordingly.
 *
 * Steps:
 *  1. LINKED_CELL_make_nonbond_list(..., only_count = 1) fills n_list and
 *     n_i_list of every requested cell pair.
 *  2. The per-thread totals are cleared.
 *  3. Cell pairs are handed out one by one, in lc->cell_pair_req order, to
 *     the thread whose accumulated n_list is currently smallest (the first
 *     such thread on ties).  When both OVERLAP and _OPENMP are defined and
 *     more than one thread exists, thread 0 receives no cell pairs.
 *  4. RIA_alloc is called on each thread's j_list with that thread's n_list.
 *
 * A pass that sorted lc->cell_pair_req by descending n_list before step 3
 * used to be present in disabled form; it has been dropped together with the
 * locals that only it used.
 */
void LINKED_CELL_alloc_nonbond_list(LINKED_CELL *lc, NONBOND_LIST *nl,
				    ATOM_DATA *ad, BOUNDARY *bc)
{
  int i, it, icp;
  int first_thread;     /* lowest thread index that may receive work */
  int best, best_load;  /* least loaded thread and its n_list */
  CELL_PAIR *cp;

  /* count only: fills cp->n_list / cp->n_i_list */
  LINKED_CELL_make_nonbond_list(lc, nl, ad, bc, 1);

  for (it=0;it<mpi.n_threads;it++) {
    lc->td[it].n_list = 0;
    lc->td[it].n_i_list = 0;
    lc->td[it].n_cell_pair_req = 0;
  }

#if defined(OVERLAP) && defined(_OPENMP)
  first_thread = (mpi.n_threads > 1) ? 1 : 0;
#else
  first_thread = 0;
#endif

  for (i=0;i<lc->n_cell_pair_req;i++) {
    icp = lc->cell_pair_req[i];
    cp = &lc->cell_pair[icp];

    /* pick the thread with the smallest accumulated list length */
    best = first_thread;
    best_load = lc->td[first_thread].n_list;
    for (it=first_thread+1;it<mpi.n_threads;it++) {
      if (best_load > lc->td[it].n_list) {
	best = it;
	best_load = lc->td[it].n_list;
      }
    }

    lc->td[best].n_list += cp->n_list;
    lc->td[best].n_i_list += cp->n_i_list;
    lc->td[best].cell_pair_req[lc->td[best].n_cell_pair_req] = icp;
    lc->td[best].n_cell_pair_req++;
  }

  for (it=0;it<mpi.n_threads;it++) {
    RIA_alloc(&lc->td[it].j_list, lc->td[it].n_list);
  }
}
  
/*
 * Counting pass of the nonbond list construction.
 *
 * For every requested cell pair (lc->cell_pair_req) the atoms of cell i are
 * tested against the atoms of cell j using the folded coordinates
 * ad->fold_x, with the image offset of the pair added to the i atom.  A pair
 * of atoms counts when it is not in the exclusion list of the i atom and its
 * squared distance is not larger than nl->rl_list squared; within a single
 * cell only pairs with i < j are considered.
 *
 * Results per cell pair:
 *   cp->n_i_list  number of i atoms that have at least one counted partner
 *   cp->n_list    counted partners plus two extra slots for each such i atom
 * With MPI_SDMD the elapsed wall time is added to cp->time.
 */
void LINKED_CELL_make_nonbond_list_only_count(LINKED_CELL *lc, NONBOND_LIST *nl,
					      ATOM_DATA *ad, BOUNDARY *bc)
{
  int ireq;

#ifdef _OPENMP /**/
#pragma omp parallel for schedule(dynamic)
#endif  
  for (ireq=0; ireq<lc->n_cell_pair_req; ireq++) {
    double t_begin = MPI_Wtime();
    CELL_PAIR *cp = &lc->cell_pair[lc->cell_pair_req[ireq]];
    int icell = cp->i;
    int jcell = cp->j;
    double shift_x = bc->offset_v[cp->offset].x;
    double shift_y = bc->offset_v[cp->offset].y;
    double shift_z = bc->offset_v[cp->offset].z;
    double cutoff2 = nl->rl_list * nl->rl_list;
    double xi, yi, zi;
    double dx, dy, dz;
    int i, j, k;
    int iatom, jatom, ifold, jfold;
    int ex_lo, ex_hi, excluded;
    int n_hit;
    int n_list = 0;
    int n_i_list = 0;

    for (i=lc->cell[icell].head, iatom=0; i>=0; i=lc->next_atom[i], iatom++) {

      /* position of atom i in the fold arrays: cell start + rank in the list */
      ifold = lc->cell[icell].fold_id + iatom;

      xi = ad->fold_x[ifold].x + shift_x;
      yi = ad->fold_x[ifold].y + shift_y;
      zi = ad->fold_x[ifold].z + shift_z;

      /* index range covered by the exclusion list of atom i; lets most
	 partners skip the linear search below */
      ex_lo = ad->natom;
      ex_hi = -1;
      for (k=0; k<ad->ex[i].n_exatom; k++) {
	if (ad->ex[i].exatom[k] < ex_lo) ex_lo = ad->ex[i].exatom[k];
	if (ad->ex[i].exatom[k] > ex_hi) ex_hi = ad->ex[i].exatom[k];
      }

      n_hit = 0;
      for (j=lc->cell[jcell].head, jatom=0; j>=0; j=lc->next_atom[j], jatom++) {
	/* inside one cell, take every pair only once */
	if (icell == jcell && j <= i) continue;

	excluded = 0;
	if (ex_lo <= j && j <= ex_hi) {
	  for (k=0; k<ad->ex[i].n_exatom; k++) {
	    if (ad->ex[i].exatom[k] == j) {
	      excluded = 1;
	      break;
	    }
	  }
	}
	if (excluded) continue;

	jfold = lc->cell[jcell].fold_id + jatom;
	dx = xi - ad->fold_x[jfold].x;
	dy = yi - ad->fold_x[jfold].y;
	dz = zi - ad->fold_x[jfold].z;

	if (Length2(dx, dy, dz) <= cutoff2)
	  n_hit++;
      }

      /* an i atom with partners needs two header slots plus one per partner */
      if (n_hit > 0) {
	n_list += n_hit + 2;
	n_i_list++;
      }
    }

    cp->n_i_list = n_i_list;
    cp->n_list = n_list;

#ifdef MPI_SDMD
    cp->time += MPI_Wtime() - t_begin;
#endif  /* MPI_SDMD */
  }

  return;
}

/*
 * Build the per-thread nonbond lists (or only count them).
 *
 * only_count == 1: delegates to LINKED_CELL_make_nonbond_list_only_count and
 * returns 0 without touching the lists.
 *
 * Otherwise every thread (index from omp_get_thread_num() under OpenMP, else
 * 0) walks the cell pairs assigned to it in lc->td[thread].cell_pair_req and
 * writes a packed list into td->j_list with RIA_set.  For each i atom that has
 * at least one partner within nl->rl_list the list holds
 *
 *     [ fold index of i, number of partners, fold index of partner 1, ... ]
 *
 * i atoms without partners leave no entry.  cp->alist_start / cp->alist_end
 * receive the first and last list position used by a cell pair, and
 * td->n_list / td->n_i_list the list length and the number of i entries.
 * Partners listed in the exclusion list of the i atom are skipped, and within
 * a single cell only pairs with i < j are taken.  With MPI_SDMD the elapsed
 * wall time of each cell pair is added to cp->time.
 *
 * Always returns 0.
 *
 * NOTE(review): lc->td is indexed by the OpenMP thread number but is sized by
 * mpi.n_threads in LINKED_CELL_setup -- this assumes the OpenMP team is not
 * larger than mpi.n_threads; confirm.
 */
int LINKED_CELL_make_nonbond_list(LINKED_CELL *lc, NONBOND_LIST *nl,
				  ATOM_DATA *ad, BOUNDARY *bc, int only_count)
{
  int itmp,jtmp;
  if (only_count == 1) {
    LINKED_CELL_make_nonbond_list_only_count(lc, nl,ad, bc);
    return 0;
  }

#if 0
  for (itmp = 0; itmp < mpi.n_threads; itmp++) {
    printf("%d\n", lc->td[itmp].n_cell_pair_req);
  }
  printf("kita\n");
#endif

  /* under OpenMP the braces below open a parallel region; the declarations
     that follow are therefore local to each thread */
#ifdef _OPENMP /**/
#pragma omp parallel
 {
#endif
  int ii, i_thread;
  int i,j, jj;
  double dx, dy, dz;
  double offset_x, offset_y, offset_z;
  int icp, icell, jcell;
  double cutoff2;
  double cp_start_time;
  double xi,yi,zi;
  int ifold, jfold;
  int iatom, jatom;
  int n_atom, ipos;
  int n_list, n_i_list;
  int minex, maxex, flagex;
  CELL_PAIR *cp;
  THREAD_DATA *td;

#ifdef _OPENMP /**/
  i_thread = omp_get_thread_num();
#else
  i_thread = 0;
#endif
  
  /* n_list: next free position in this thread's j_list */
  n_list = 0;
  n_i_list = 0;
  td = &lc->td[i_thread];

  for (ii=0;ii<td->n_cell_pair_req;ii++) {
    cp_start_time = MPI_Wtime();

    icp = td->cell_pair_req[ii];
    cutoff2 = nl->rl_list * nl->rl_list;

    cp = &lc->cell_pair[icp];
    icell = cp->i;
    jcell = cp->j;
    /* image shift applied to the i atoms of this cell pair */
    offset_x = bc->offset_v[cp->offset].x;
    offset_y = bc->offset_v[cp->offset].y;
    offset_z = bc->offset_v[cp->offset].z;

    cp->alist_start = n_list;

    for (i=lc->cell[icell].head, iatom=0; i>=0; i=lc->next_atom[i], iatom++) {

      /* fold index of atom i: start of its cell plus its rank in the cell list */
      ifold = lc->cell[icell].fold_id + iatom;
      /* reserve two header slots (i index, partner count); they are given
	 back below if no partner is found */
      ipos = n_list;
      n_list += 2;

      xi = ad->fold_x[ifold].x + offset_x;
      yi = ad->fold_x[ifold].y + offset_y;
      zi = ad->fold_x[ifold].z + offset_z;

      /* index range spanned by the exclusion list of atom i, used to skip
	 the linear search for most partners */
      minex = ad->natom;
      maxex = -1;
      for (jj=0;jj<ad->ex[i].n_exatom;jj++) {
	if (minex > ad->ex[i].exatom[jj]) minex=ad->ex[i].exatom[jj];
	if (maxex < ad->ex[i].exatom[jj]) maxex=ad->ex[i].exatom[jj];
      }
      
      for (j=lc->cell[jcell].head, jatom = 0; j>=0; j=lc->next_atom[j], jatom++) {
	/* inside one cell, take every pair only once */
	if (icell == jcell && i>=j) continue;

	if (j>=minex && j<=maxex) {
	  flagex=0;
	  for (jj=0;jj<ad->ex[i].n_exatom;jj++) {
	    if (ad->ex[i].exatom[jj] == j) {
	      flagex=1;
	      break;
	    }
	  }
	  if (flagex) continue;
	}

	/* i, j are a pair of nonbonded atoms  */

	jfold = lc->cell[jcell].fold_id + jatom;
	dx = xi - ad->fold_x[jfold].x;
	dy = yi - ad->fold_x[jfold].y;
	dz = zi - ad->fold_x[jfold].z;

	if (Length2(dx, dy, dz) <= cutoff2) {

	  RIA_set(&td->j_list, n_list, jfold);

	  n_list++;
	}
      }

      if (n_list - ipos == 2) {
	/* no partner found: release the two reserved header slots */
	n_list -= 2;
      } else {
	n_i_list++;

	/* fill the header: fold index of i, then the number of partners */
	RIA_set(&td->j_list, ipos, ifold);
	RIA_set(&td->j_list, ipos+1, n_list- ipos - 2);
      }
    }
    /* last list position used by this cell pair (start-1 if it is empty) */
    cp->alist_end = n_list-1;
#ifdef MPI_SDMD
    cp->time += MPI_Wtime() - cp_start_time;
#endif  /* MPI_SDMD */

  }
  td->n_list = n_list;
  td->n_i_list = n_i_list;
#ifdef _OPENMP
 }
#endif

#if 0
 {
   int it, n_list;
   n_list = 0;
   for (it=0;it<mpi.n_threads;it++) {
     RIA_alloc(&lc->td[it].j_list, lc->td[it].n_list);
     n_list += lc->td[it].n_list - lc->td[it].n_i_list * 2;
   }
   /*printf("rank %d: n_list = %d\n", mpi.rank, n_list);*/
   {
     int n_list_all;
     MPI_Reduce(&n_list, &n_list_all, 1, MPI_INT, MPI_SUM, 0, mpi.comm);
     lprintf("n_list = %d\n", n_list_all);
   }
 }
#endif
  
  return 0;
}

/*
 * Evaluate the nonbonded energies and forces over the cell-pair lists with
 * smoothing.
 *
 *   vdw, elec, hbond   outputs; zeroed first, then accumulated.  *hbond is
 *                      only added to when HBOND is defined.
 *
 * Positions are first copied into ad->fold_x, indexed by atom number, with
 * the box translation ad->tr_x removed under PERIODIC_BOUNDARY.  Then, for
 * every cell pair (all of them, or with MPI_SDMD the ones chained through
 * lc->pair_head / cell_pair[].next), the atom pairs nl->ij_list[k] for k in
 * [alist_start, alist_end] are evaluated with the image offset of the cell
 * pair.  Pairs with squared distance >= nl->rl_off2 are skipped.
 *
 * The van der Waals (or, with HBOND, hydrogen-bond) term is multiplied by a
 * switching function S that is 1 up to nl->rl_on; its form beyond that
 * depends on CHARMM_SMOOTH.  The electrostatic term is multiplied by
 * Se = (1 - r/rl_off)^2.  Forces are added to ad->f, the six virial
 * components to ad->virial, and, if ad->atom_ene_sample_flag is set, half of
 * each pair energy to the per-atom, per-group tables in ad->atom_ene.  With
 * MPI_SDMD the elapsed wall time is added to cell_pair[].time.
 *
 * NOTE(review): the pairs are read from nl->ij_list, while
 * LINKED_CELL_make_nonbond_list in this file writes td->j_list; where
 * ij_list is filled is not visible here -- confirm before relying on this
 * routine.
 */
void LINKED_CELL_nonbond_energy_force_smooth(LINKED_CELL *lc, NONBOND_LIST *nl,
					     ATOM_DATA *ad, BOUNDARY *bc,
					     double *vdw, double *elec, double *hbond)
{
  int i,j,k;
  int vdw_index;
  int start, end;
  double dx, dy, dz;
  double len, len2;
  double rlen, rlen2, rlen6, rlen12;
  double vdw12, vdw6, hb12, hb10, force, ene_tmp, elec_tmp;
  /* for atom_ene */
  int group_i, group_j;
  double ene_t2, elec_t2;
  
  VEC offset_v;
  int icp;
  
#ifdef MPI_SDMD
  double cp_start_time;
#endif  

#ifdef CHARMM_SMOOTH
  double S, dS, rl_on2, rl_off2, len2_rl_off, rl_tmp, rl_diff3;
  double Se, dSe;

  /* constants of the switching function written in squared distances;
     rl_diff3 is stored as a reciprocal here and multiplied below */
  rl_on2  = nl->rl_on * nl->rl_on;
  rl_off2 = nl->rl_off * nl->rl_off;
  rl_diff3 = 1.0/pow(rl_off2-rl_on2, 3.0);
  rl_tmp = rl_off2 - 3.0*rl_on2;
  
#else  
  /* for smoothing */
  double S, dS, len_rl_on, rl_tmp, rl_diff3;
  double Se, dSe;
  
  /* constants of the switching function written in distances;
     rl_diff3 is a divisor in this variant */
  rl_tmp = 3.0 * nl->rl_off - nl->rl_on;
  rl_diff3 = pow(nl->rl_off - nl->rl_on, 3.0);
#endif  
  
  *vdw = *elec = *hbond = 0.0;

  /* refresh fold_x (indexed by atom number) from the current positions */
#ifdef MPI_SDMD
  if (bc->type == PERIODIC_BOUNDARY) {
      /*
    for (j=0;j<lc->n_cell;j++) {
      if (lc->cell[j].req) {
      */
    for (j=lc->req_head;j>=0;j=lc->cell[j].req_next) {
      for (i=lc->cell[j].head;i>=0;i=lc->next_atom[i]) {
	ad->fold_x[i].x = ad->x[i].x - VEC_MUL_MAT_X(ad->tr_x[i],bc->boxv);
	ad->fold_x[i].y = ad->x[i].y - VEC_MUL_MAT_Y(ad->tr_x[i],bc->boxv);
	ad->fold_x[i].z = ad->x[i].z - VEC_MUL_MAT_Z(ad->tr_x[i],bc->boxv);
      }
    }
  } else {
      /*
    for (j=0;j<lc->n_cell;j++) {
      if (lc->cell[j].req) {
      */
    for (j=lc->req_head;j>=0;j=lc->cell[j].req_next) {
      for (i=lc->cell[j].head;i>=0;i=lc->next_atom[i]) {
	ad->fold_x[i].x = ad->x[i].x;
	ad->fold_x[i].y = ad->x[i].y;
	ad->fold_x[i].z = ad->x[i].z;
      }
    }
  }
#else  
  if (bc->type == PERIODIC_BOUNDARY) {
    for (i=0;i<ad->natom;i++) {
      ad->fold_x[i].x = ad->x[i].x - VEC_MUL_MAT_X(ad->tr_x[i],bc->boxv);
      ad->fold_x[i].y = ad->x[i].y - VEC_MUL_MAT_Y(ad->tr_x[i],bc->boxv);
      ad->fold_x[i].z = ad->x[i].z - VEC_MUL_MAT_Z(ad->tr_x[i],bc->boxv);
    }
  } else {
    for (i=0;i<ad->natom;i++) {
      ad->fold_x[i].x = ad->x[i].x;
      ad->fold_x[i].y = ad->x[i].y;
      ad->fold_x[i].z = ad->x[i].z;
    }
  }
#endif

  /* loop over cell pairs; the loop header depends on MPI_SDMD, the body is shared */
#ifdef MPI_SDMD
  for (icp=lc->pair_head;icp>=0;icp=lc->cell_pair[icp].next) {
    /* if (lc->cell_pair[icp].pe != mpi.rank) continue; */
    cp_start_time = MPI_Wtime();
#else    
  for (icp=0;icp<lc->n_cell_pair;icp++) {
#endif
    offset_v=bc->offset_v[lc->cell_pair[icp].offset];
    start=lc->cell_pair[icp].alist_start;
    end=lc->cell_pair[icp].alist_end;

    for (k=start;k<=end;k++) {
      i = nl->ij_list[k][0];
      j = nl->ij_list[k][1];
      
      dx = ad->fold_x[i].x - ad->fold_x[j].x + offset_v.x;
      dy = ad->fold_x[i].y - ad->fold_x[j].y + offset_v.y;
      dz = ad->fold_x[i].z - ad->fold_x[j].z + offset_v.z;

      len2 = dx * dx + dy * dy + dz * dz;
      /* outside the cutoff: no contribution */
      if (len2 >= nl->rl_off2) continue;

      /*
      len6 = len2 * len2 * len2;
      len12 = len6 * len6;
      */
      /* inverse powers of the distance */
      rlen2 = 1.0/len2;
      rlen6  = rlen2*rlen2*rlen2;
      rlen12 = rlen6*rlen6;
      len    = sqrt(len2);
      rlen   = 1.0/len;

      /* S: switching factor for the vdw/hbond term, dS: its derivative in len */
#ifdef CHARMM_SMOOTH
      if (len > nl->rl_on) {
	len2_rl_off = rl_off2 - len2;
	S = len2_rl_off*len2_rl_off*(rl_tmp+2.0*len2)*rl_diff3;
	dS = 12.0*len*len2_rl_off*(rl_on2-len2)*rl_diff3;
      } else {
	S = 1.0; dS = 0.0;
      }
#else      
      if (len > nl->rl_on) {
	len_rl_on = len - nl->rl_on;
	S = 1.0-len_rl_on * len_rl_on * (rl_tmp - 2*len) / rl_diff3;
	dS = -6.0 * len_rl_on * (nl->rl_off - len) / rl_diff3;
      } else {
	S = 1.0; dS = 0.0;
      }
#endif      
      /* shifted force */
      /* Se = (1 - len/rl_off)^2, dSe = -2 (1 - len/rl_off) / rl_off */
      dSe = 1.0 - len/nl->rl_off;
      Se = dSe*dSe; dSe*=-2.0/nl->rl_off;
      /*
      dSe = 0.0;
      Se = 1.0; */
      /* debug Se = S; dSe=dS; */
    
      /* parameter table index for this pair of vdw types */
      vdw_index = ad->index[ad->vdw_type[i]+ad->vdw_type[j]*ad->ntype];
#ifdef HBOND
      /* with HBOND, a negative index selects the 12-10 hydrogen-bond term below */
      if (vdw_index >= 0 /* || len2 > ad->hbond_criteria2 */) {
#endif	
	vdw12 = ad->vdw12[vdw_index] * rlen12;
	vdw6 = ad->vdw6[vdw_index]   * rlen6;
	/*
	 *vdw += vdw12 - vdw6;
	 force = 12.0 * vdw12 - 6.0 * vdw6;
	*/
	*vdw += (vdw12 - vdw6)*S;
	ene_tmp = vdw12 - vdw6;
	/*
	if (len >= 1.833844 && len <= 1.833845)
	  printf("%d %d %d %.16e %.16e %.16e %.16e\n", vdw_index,
		 ad->vdw_type[i],ad->vdw_type[j],
		 ad->vdw12[vdw_index], ad->vdw6[vdw_index],
		 ad->eps[vdw_index], ad->rmin[vdw_index]);
	*/
	/* printf("%.16f %.16e\n",sqrt(len2), ene_tmp*S); */
	force = 12.0 * vdw12 - 6.0 * vdw6;
#ifdef HBOND
      } else {
	/* map the negative table entry to an index into hb12 / hb10 */
	vdw_index = - vdw_index - 2;
	hb12 = ad->hb12[vdw_index] * rlen12;
	hb10 = ad->hb10[vdw_index] * rlen6 * rlen2 * rlen2;
	/*
	 *hbond += hb12 - hb10;
	 force = 12.0 * hb12 - 10.0 * hb10;
	*/
	*hbond += (hb12 - hb10)*S;
	ene_tmp = hb12 - hb10;
	force = 12.0 * hb12 - 10.0 * hb10;
      }
#endif      
      elec_tmp = ad->q[i] * ad->q[j] * rlen;
      /*
       *elec += elec_t;
       force = (force + elec_t) / len2;
      */
      *elec += elec_tmp*Se;
      /* scalar that multiplies (dx,dy,dz): the smoothed pair terms divided by
	 len2, minus the energy times the smoothing derivative divided by len */
      force = (force*S + elec_tmp*Se) * rlen2 - (ene_tmp*dS + elec_tmp*dSe) * rlen; 
    
      if (ad->atom_ene_sample_flag) {
	/* half of the pair energy is credited to each atom, against the
	   group of the other atom */
	ene_t2 = ene_tmp*0.5*S;
	/* elec_t2 = elec_tmp*0.5*S; */
	elec_t2 = elec_tmp*0.5*Se;
	group_i = ad->atom_ene_group[i];
	group_j = ad->atom_ene_group[j];
	ad->atom_ene[i][group_j][ATOM_ENE_VDW] += ene_t2;
	ad->atom_ene[j][group_i][ATOM_ENE_VDW] += ene_t2;
	ad->atom_ene[i][group_j][ATOM_ENE_ELEC] += elec_t2;
	ad->atom_ene[j][group_i][ATOM_ENE_ELEC] += elec_t2;
      }
    
      /* equal and opposite forces on i and j */
      ad->f[i].x += force * dx;
      ad->f[i].y += force * dy;
      ad->f[i].z += force * dz;
    
      ad->f[j].x -= force * dx;
      ad->f[j].y -= force * dy;
      ad->f[j].z -= force * dz;
      
    /* virial */
      /* components stored in the order xx, yy, zz, xy, xz, yz */
      ad->virial[0] += force * dx * dx;
      ad->virial[1] += force * dy * dy;
      ad->virial[2] += force * dz * dz;
      ad->virial[3] += force * dx * dy;
      ad->virial[4] += force * dx * dz;
      ad->virial[5] += force * dy * dz;
    }
#ifdef MPI_SDMD
    lc->cell_pair[icp].time += MPI_Wtime() - cp_start_time;
#endif    
  }
}

#ifdef MPI_SDMD
/*
 * Move atom iatom from the list of cell oldcell to the tail of the list of
 * cell newcell, keeping the head/tail pointers, the next/prev links and the
 * atom counts of both cells consistent, and record the new cell in
 * lc->atom_cell[iatom].
 */
void LINKED_CELL_migration(LINKED_CELL *lc, int iatom,
			   int oldcell, int newcell)
{
  int after  = lc->next_atom[iatom];
  int before = lc->prev_atom[iatom];
  int last;

  /* --- unlink iatom from the old cell --- */
  if (after >= 0)
    lc->prev_atom[after] = before;
  else
    lc->cell[oldcell].tail = before;   /* iatom was the last atom */

  if (before >= 0)
    lc->next_atom[before] = after;
  else
    lc->cell[oldcell].head = after;    /* iatom was the first atom */

  lc->cell[oldcell].n_atom--;

  /* --- append iatom to the new cell --- */
  last = lc->cell[newcell].tail;
  lc->cell[newcell].n_atom++;
  lc->prev_atom[iatom] = last;
  lc->next_atom[iatom] = -1;
  lc->cell[newcell].tail = iatom;
  if (last >= 0)
    lc->next_atom[last] = iatom;

  /* the new cell was empty */
  if (lc->cell[newcell].head == -1)
    lc->cell[newcell].head = iatom;

  lc->atom_cell[iatom] = newcell;
}

#endif
