/*
 * 
 * This source code is part of 
 *   MARBLE (MoleculAR simulation package for BiomoLEcules)
 * 
 * Written by Mitsunori Ikeguchi
 * Copyright (c) 2012 Yokohama City University
 *  
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 */

/*
 * Compile-time selection checks.
 * The body below is specialised by the NB_VDW and NB_ELEC macros; stop the
 * build with an explicit message when either one does not compare equal to
 * one of the listed constants.
 */
#if (NB_VDW != NV_PSW) && (NB_VDW != NV_FSW)
#error "NB_VDW must be one of NV_PSW, NV_FSW"
#endif

#if (NB_ELEC != NE_FSH) && (NB_ELEC != NE_EWALD) && (NB_ELEC != NE_PSW)
#error "NB_ELEC must be one of NE_FSH, NE_EWALD, NE_PSW"
#endif

{
  /*
   * Non-bonded pair kernel body, specialised at compile time by NB_VDW and
   * NB_ELEC.  It uses the following names from the enclosing scope:
   * ad, lc, bc, nl, ew, mpi, vdw, edir (and hbond when HBOND is defined).
   *
   * Steps:
   *   1. Copy atom coordinates into ad->fold_x in cell-list traversal order
   *      (shifted by tr_x * boxv for periodic boundaries).
   *   2. In an (optionally OpenMP) parallel region, every thread clears its
   *      own accumulators in lc->td[thread], then walks its list of cell
   *      pairs and accumulates forces, energies and virial.
   *   3. Sum the per-thread results into *vdw, *edir, ad->virial and ad->f.
   */
  int ii, jj, kk, ll, ic;

  /* ---- 1. build the folded coordinate array ---- */
  kk=0;
  if (bc->type == PERIODIC_BOUNDARY) {
    for (jj=0;jj<lc->n_cell;jj++) {
#ifdef MPI_SDMD
      if (lc->cell[jj].req) {
#endif
        for (ii=lc->cell[jj].head;ii>=0;ii=lc->next_atom[ii]) {
#ifdef HG_MODE
          /* child atoms are emitted right after their parent (loop below) */
          if (ad->ex[ii].flag & ATOM_CHILD) continue;
#endif
          ad->fold_x[kk].x = ad->x[ii].x - VEC_MUL_MAT_X(ad->tr_x[ii],bc->boxv);
          ad->fold_x[kk].y = ad->x[ii].y - VEC_MUL_MAT_Y(ad->tr_x[ii],bc->boxv);
          ad->fold_x[kk].z = ad->x[ii].z - VEC_MUL_MAT_Z(ad->tr_x[ii],bc->boxv);
          kk++;

#ifdef HG_MODE
          /* all child atoms connecting to the parent atom belong to the same cell. */
          /* children are shifted with the parent's tr_x[ii], not their own */
          for (ic=ad->ex[ii].child_list;ic>=0;ic=ad->ex[ic].child_list) {
            ad->fold_x[kk].x = ad->x[ic].x - VEC_MUL_MAT_X(ad->tr_x[ii],bc->boxv);
            ad->fold_x[kk].y = ad->x[ic].y - VEC_MUL_MAT_Y(ad->tr_x[ii],bc->boxv);
            ad->fold_x[kk].z = ad->x[ic].z - VEC_MUL_MAT_Z(ad->tr_x[ii],bc->boxv);
            kk++;
          }
#endif
        }
#ifdef MPI_SDMD
      }
#endif
    }
  } else {
    /* non-periodic: plain copy in the same traversal order */
    for (jj=0;jj<lc->n_cell;jj++) {
#ifdef MPI_SDMD
      if (lc->cell[jj].req) {
#endif
        for (ii=lc->cell[jj].head;ii>=0;ii=lc->next_atom[ii]) {
#ifdef HG_MODE
          if (ad->ex[ii].flag & ATOM_CHILD) continue;
#endif
          ad->fold_x[kk].x = ad->x[ii].x;
          ad->fold_x[kk].y = ad->x[ii].y;
          ad->fold_x[kk].z = ad->x[ii].z;
          kk++;

#ifdef HG_MODE
          /* all child atoms connecting to the parent atom belong to the same cell. */
          for (ic=ad->ex[ii].child_list;ic>=0;ic=ad->ex[ic].child_list) {
            ad->fold_x[kk].x = ad->x[ic].x;
            ad->fold_x[kk].y = ad->x[ic].y;
            ad->fold_x[kk].z = ad->x[ic].z;
            kk++;
          }
#endif

        }
#ifdef MPI_SDMD
      }
#endif
    }
  }

  /* ---- 2. per-thread pair loop ---- */
#ifdef _OPENMP
#pragma omp parallel private(ii,jj,kk)
#endif
 {
  VEC offset_v;
  int icp;
  int i,j,k;
  int vdw_index;
  int start, end;
  double dx, dy, dz;
  double len, len2;
  double rlen, rlen2, rlen6, rlen12;
  double vdw12, vdw6, force, vdw_tmp, elec_tmp;
  double vdw_total, elec_total;
  double rl_on2, rl_off2;
#ifdef HBOND
  double hb12, hb10;
#endif
#ifdef ATOM_ENE
  int group_i, group_j;
  /* was "ene_t2, ene_t2": duplicate declaration, and elec_t2 (used below)
     was never declared */
  double ene_t2, elec_t2;
#endif
#if NB_ELEC == NE_EWALD
  double ewald_dir, ew_force;
#ifdef ERFC_TABLE
  double x, h, val_erfc, dval_erfc, erfc2, erfc3;
  double beta, rdx_erfc_table, dx_erfc_table;
  double (*erfc_table)[4];
  int ix;
#endif
#endif
#if NB_ELEC == NE_FSH
  double Se, dSe;
  double rl_roff;
#endif
#if NB_ELEC == NE_PSW
  double Se, dSe;
#endif
#if NB_VDW == NV_PSW
  double S, dS, len2_rl_off, len2_rl_on, rl_tmp, rl_diff3;
#endif
#if NB_VDW == NV_FSW
  double rl_on3, rl_on6, rl_off3, rl_off6, rl_coef3, rl_coef6;
  double rl_roff3, rl_roff6, rl_ronoff3, rl_ronoff6;
  double rlen3, tmp3, tmp6;
#endif
#ifdef MPI_SDMD
  double cp_start_time;
#endif
  int nj,kj;
  double xi, yi, zi, fix, fiy, fiz;
  double virial[6];
  int fold_id_i, fold_id_j;
  int i_thread;
  CELL_PAIR *cp;
  THREAD_DATA *td;
  VEC *fold_f;

#ifdef _OPENMP
  i_thread = omp_get_thread_num();
#if NB_ELEC == NE_EWALD
#ifdef OVERLAP
  /* thread 0 runs SDMD_EW_pme2 first, then joins the pair loop below */
  if (i_thread == 0)
    SDMD_EW_pme2(ew, lc, ad, bc);
#endif
#endif
#else
  i_thread = 0;
#endif
  td = &lc->td[i_thread];
  fold_f = td->fold_f;

  /*
   * Clear this thread's force buffer.  The set of cells visited here must
   * match the set summed in the reduction at the end of this block, so the
   * req test is applied only under MPI_SDMD, exactly as it is there.
   */
  kk=0;
  for (jj=0;jj<lc->n_cell;jj++) {
#ifdef MPI_SDMD
    if (lc->cell[jj].req)
#endif
      for (ii=lc->cell[jj].head;ii>=0;ii=lc->next_atom[ii]) {
        fold_f[kk].x = fold_f[kk].y = fold_f[kk].z = 0.0;
        kk++;
      }
  }
  for (ii=0;ii<6;ii++)
    td->virial[ii]=0.0;
  td->vdw = 0.0;
  td->elec = 0.0;

#if NB_ELEC == NE_EWALD
#ifdef MPI_SDMD
  /*SDMD_EW_dtime();*/
#endif
#ifdef ERFC_TABLE
  /* these locals exist only when ERFC_TABLE is defined; the non-table path
     reads ew->beta directly */
  beta = ew->beta;
  dx_erfc_table = ew->dx_erfc_table;
  rdx_erfc_table = 1.0/dx_erfc_table;
  erfc_table = ew->erfc_table;
#endif
#endif
#if NB_ELEC == NE_FSH
  rl_roff = 1.0/nl->rl_off;
#endif

  /* squared switching-on and cutoff distances */
  rl_on2  = nl->rl_on * nl->rl_on;
  rl_off2 = nl->rl_off * nl->rl_off;

#if NB_VDW == NV_PSW
  rl_diff3 = 1.0/pow(rl_off2-rl_on2, 3.0);
  rl_tmp = rl_off2 - 3.0*rl_on2;
#endif
#if NB_VDW == NV_FSW
  rl_off3 = nl->rl_off * nl->rl_off * nl->rl_off;
  rl_off6 = rl_off3 * rl_off3;
  rl_on3  = nl->rl_on  * nl->rl_on  * nl->rl_on;
  rl_on6  = rl_on3  * rl_on3;
  rl_coef3 = rl_off3 / (rl_off3 - rl_on3);
  rl_coef6 = rl_off6 / (rl_off6 - rl_on6);
  rl_roff3 = 1.0/rl_off3;
  rl_roff6 = 1.0/rl_off6;
  rl_ronoff3 = 1.0/(rl_on3 * rl_off3);
  rl_ronoff6 = 1.0/(rl_on6 * rl_off6);
#endif
  /* NOTE(review): NV_PSW2 is not accepted by the NB_VDW check at the top of
     this file and len_rl_on (used in the NV_PSW2 branch further down) is not
     declared in this block, so the NV_PSW2 branches look unreachable -
     confirm before relying on them. */
#if NB_VDW == NV_PSW2
  rl_tmp = 3.0 * nl->rl_off - nl->rl_on;
  rl_diff3 = pow(nl->rl_off - nl->rl_on, 3.0);
#endif

  /* loop over the cell pairs assigned to this thread */
  for (ii=0;ii<td->n_cell_pair_req;ii++) {

#ifdef MPI_SDMD
    cp_start_time = MPI_Wtime();
#endif

    icp = td->cell_pair_req[ii];
    cp = &lc->cell_pair[icp];

    vdw_total = elec_total = 0.0;
    for (i=0;i<6;i++) virial[i] = 0.0;

    fold_id_i = lc->cell[cp->i].fold_id;
    fold_id_j = lc->cell[cp->j].fold_id;

    /* offset vector added to the i-atom position for this cell pair */
    offset_v=bc->offset_v[lc->cell_pair[icp].offset];

    /*
     * td->j_list.data[alist_start..alist_end] is read as consecutive
     * records: atom index i, partner count nj, then nj partner indices j.
     */
    end=cp->alist_end;
    k=cp->alist_start;
    while (k<=end) {
     i  = td->j_list.data[k++];
     nj = td->j_list.data[k++];
     xi = ad->fold_x[i].x + offset_v.x;
     yi = ad->fold_x[i].y + offset_v.y;
     zi = ad->fold_x[i].z + offset_v.z;
     fix = fiy = fiz = 0.0;

     for (kj = 0; kj < nj; kj++) {
      j = td->j_list.data[k++];

      dx =  xi - ad->fold_x[j].x ;
      dy =  yi - ad->fold_x[j].y ;
      dz =  zi - ad->fold_x[j].z ;

      len2 = dx * dx + dy * dy + dz * dz;


      /* pairs at or beyond the cutoff contribute nothing */
      if (len2 >= rl_off2) continue;

      len = sqrt(len2);

#if NB_VDW == NV_PSW
      /* switching factor S and its derivative term dS between rl_on and rl_off */
      if (len2 > rl_on2) {
        len2_rl_off = rl_off2 - len2;
        len2_rl_on  = rl_on2  - len2;
        S = len2_rl_off*len2_rl_off*(len2_rl_off-3.0*len2_rl_on)*rl_diff3;
        dS = 12.0*len*len2_rl_off*len2_rl_on*rl_diff3;
      } else {
        S = 1.0; dS = 0.0;
      }
#endif
#if NB_VDW == NV_PSW2
      if (len2 > rl_on2) {
        len_rl_on = len - nl->rl_on;
        S = 1.0-len_rl_on * len_rl_on * (rl_tmp - 2*len) / rl_diff3;
        dS = -6.0 * len_rl_on * (nl->rl_off - len) / rl_diff3;
      } else {
        S = 1.0; dS = 0.0;
      }
#endif
#if NB_ELEC == NE_FSH
      /* Se = (1 - len/rl_off)^2, dSe = -2/rl_off * (1 - len/rl_off) */
      dSe = 1.0 - len*rl_roff;
      Se = dSe*dSe;
      dSe*=-2.0*rl_roff;
#endif
#if NB_ELEC == NE_PSW
      /* electrostatics reuse the vdW switching factor (S/dS are only
         declared when NB_VDW == NV_PSW) */
      dSe = dS;
      Se = S;
#endif

      rlen2 = 1.0 / len2;
      rlen6 = rlen2 * rlen2 * rlen2;
      rlen12 = rlen6 * rlen6;
      rlen = rlen2*len;   /* = 1/len */

      /*
      vdw_index = ad->index[ad->vdw_type[i]+ad->vdw_type[j]*ad->ntype];
      */
      vdw_index = ad->index[ad->fold_id[i]+ad->fold_id[j]*ad->ntype];
#ifdef HBOND
      /* a negative index selects the hb12/hb10 tables instead of vdw12/vdw6 */
      if (vdw_index >= 0) {
#endif
#if NB_VDW == NV_FSW
        vdw12 = ad->vdw12[vdw_index];
        vdw6  = ad->vdw6[vdw_index];

        if (len2 > rl_on2) {
          rlen3 = rlen2 * rlen;
          tmp6 = rlen6 - rl_roff6;
          tmp3 = rlen3 - rl_roff3;
          vdw12 *= rl_coef6 * tmp6;
          vdw6  *= rl_coef3 * tmp3;
          vdw_tmp = vdw12 * tmp6 - vdw6 * tmp3;
          vdw_total += vdw_tmp;
          force = 12.0 * vdw12 * rlen6 - 6.0 * vdw6 * rlen3;
        } else {
          vdw_tmp = vdw12 * (rlen12 - rl_ronoff6) - vdw6 * (rlen6 - rl_ronoff3);
          vdw_total += vdw_tmp;
          force = 12.0 * vdw12 * rlen12 - 6.0 * vdw6 * rlen6;
        }
#else
        vdw12 = ad->vdw12[vdw_index] * rlen12;
        vdw6 = ad->vdw6[vdw_index] * rlen6;
        vdw_tmp = vdw12 - vdw6;
        vdw_total += vdw_tmp * S;
        force = 12.0 * vdw12 - 6.0 * vdw6;
#endif  /* NV_FSW */
#ifdef HBOND
      } else {
        vdw_index = - vdw_index - 2;
        hb12 = ad->hb12[vdw_index] * rlen12;
        hb10 = ad->hb10[vdw_index] * rlen6 * rlen2 * rlen2;
        /* NOTE(review): *hbond is updated by every thread without
           synchronisation - confirm this is safe when _OPENMP is defined */
        *hbond += (hb12 - hb10)*S;
        vdw_tmp = hb12 - hb10;
        force = 12.0 * hb12 - 10.0 * hb10;
      }
#endif /* HBOND */

#if NB_ELEC == NE_EWALD
#ifdef ERFC_TABLE
      /* cubic interpolation of the table at x = beta*len: the four
         coefficients of row ix are evaluated at offset h from the row start */
      x = beta*len;
      ix = (int) (x * rdx_erfc_table);
      h = x - ix * dx_erfc_table;

      val_erfc  = erfc_table[ix][0];
      dval_erfc = erfc_table[ix][1];
      erfc2     = erfc_table[ix][2]*h;
      erfc3     = erfc_table[ix][3]*h*h;
      val_erfc  += (dval_erfc+erfc2+erfc3)*h;
      dval_erfc += 2.0*erfc2 + 3.0*erfc3;

      /*
      val_erfc = ew->erfc_table[ix][0] + (ew->erfc_table[ix][1] +
                (ew->erfc_table[ix][2] + ew->erfc_table[ix][3] * h) * h ) * h;
      dval_erfc = ew->erfc_table[ix][1] +
                (2.0*ew->erfc_table[ix][2] + 3.0*ew->erfc_table[ix][3] * h) * h;
      */

      /* ewald_dir = ad->q[i]*ad->q[j]*val_erfc*rlen; */
      ewald_dir = ad->fold_q[i]*ad->fold_q[j]*val_erfc*rlen;
      ew_force  = ewald_dir - ad->fold_q[i]*ad->fold_q[j]*beta*dval_erfc;

#else  /* ERFC_TABLE */
      /* ewald_dir = ad->q[i]*ad->q[j]*erfc(ew->beta*len) * rlen; */
      ewald_dir = ad->fold_q[i]*ad->fold_q[j]*erfc(ew->beta*len) * rlen;
      ew_force  = ewald_dir + ad->fold_q[i]*ad->fold_q[j]*ew->dir_const*exp(-ew->beta*ew->beta*len2);
#endif  /* ERFC_TABLE */
      elec_total += ewald_dir;
#if NB_VDW == NV_FSW
      force = (force + ew_force) * rlen2;
#else   /* NV_FSW  */
      force = (force*S + ew_force) * rlen2 - vdw_tmp*dS*rlen;
#endif  /* NV_FSW  */
#endif  /* NE_EWALD */


#if (NB_ELEC == NE_FSH) || (NB_ELEC == NE_PSW)
      /* elec_tmp = ad->q[i] * ad->q[j] * rlen; */
      elec_tmp = ad->fold_q[i] * ad->fold_q[j] * rlen;
      elec_total += elec_tmp*Se;
#if NB_VDW == NV_FSW
      force = (force + elec_tmp*Se) * rlen2 - elec_tmp*dSe * rlen;
#else
      force = (force*S + elec_tmp*Se) * rlen2 - (vdw_tmp*dS + elec_tmp*dSe) * rlen;
#endif  /* NV_FSW */
#endif  /* NE_FSH */

#ifdef ATOM_ENE
      /* split each pair energy evenly between the two atoms */
      /* NOTE(review): ad->atom_ene is written by every thread without
         synchronisation, and S/Se/elec_tmp are only set for some NB_VDW /
         NB_ELEC combinations - confirm which builds enable ATOM_ENE */
      if (ad->atom_ene_sample_flag) {
        ene_t2 = vdw_tmp*0.5*S;
        elec_t2 = elec_tmp*0.5*Se;
        group_i = ad->atom_ene_group[i];
        group_j = ad->atom_ene_group[j];
        ad->atom_ene[i][group_j][ATOM_ENE_VDW] += ene_t2;
        ad->atom_ene[j][group_i][ATOM_ENE_VDW] += ene_t2;
        ad->atom_ene[i][group_j][ATOM_ENE_ELEC] += elec_t2;
        ad->atom_ene[j][group_i][ATOM_ENE_ELEC] += elec_t2;
      }
#endif
      /* force on i is accumulated locally and stored once after the j loop */
      fix += force * dx;
      fiy += force * dy;
      fiz += force * dz;

      /*
      ad->f[j].x -= force * dx;
      ad->f[j].y -= force * dy;
      ad->f[j].z -= force * dz;
      */
      fold_f[j].x -= force * dx;
      fold_f[j].y -= force * dy;
      fold_f[j].z -= force * dz;

      /* virial */
      virial[0] += force * dx * dx;
      virial[1] += force * dy * dy;
      virial[2] += force * dz * dz;

      virial[3] += force * dx * dy;
      virial[4] += force * dx * dz;
      virial[5] += force * dy * dz;
     }
     /*
     ad->f[i].x += fix;
     ad->f[i].y += fiy;
     ad->f[i].z += fiz;
     */
     fold_f[i].x += fix;
     fold_f[i].y += fiy;
     fold_f[i].z += fiz;
    }

    /* fold this cell pair's totals into the thread accumulators */
    for (i=0;i<6;i++) td->virial[i] += virial[i];
    td->vdw   += vdw_total;
    td->elec  += elec_total;
#ifdef MPI_SDMD
    /* wall-clock time spent on this cell pair */
    lc->cell_pair[icp].time += MPI_Wtime() - cp_start_time;
#endif  /* MPI_SDMD */
  }
 }

  /* ---- 3. reduce the per-thread results ---- */
  *vdw = *edir = 0.0;
  for (ll=0;ll<mpi.n_threads;ll++) {

    *vdw  += lc->td[ll].vdw;
    *edir += lc->td[ll].elec;
    for (ii=0;ii<6;ii++) ad->virial[ii] += lc->td[ll].virial[ii];
    /* fold_f slot kk is added to the kk-th atom met in cell-list traversal */
    kk=0;
    for (jj=0;jj<lc->n_cell;jj++) {
#ifdef MPI_SDMD
      if (lc->cell[jj].req)
#endif
        for (ii=lc->cell[jj].head;ii>=0;ii=lc->next_atom[ii]) {
          ad->f[ii].x += lc->td[ll].fold_f[kk].x;
          ad->f[ii].y += lc->td[ll].fold_f[kk].y;
          ad->f[ii].z += lc->td[ll].fold_f[kk].z;
          kk++;
        }
    }
  }

#if (NB_ELEC == NE_EWALD) && defined(MPI_SDMD)
  /*SDMD_EW_add_dtime(&ew->time[0]); */
#endif   /* NE_EWALD */


}

/* Drop the selection macros at the end of the body; presumably so that the
   includer can define them again for another variant - confirm against the
   including file. */
#undef NB_VDW
#undef NB_ELEC
