/*
 * 
 * This source code is part of 
 *   MARBLE (MoleculAR simulation package for BiomoLEcules)
 * 
 * Written by Mitsunori Ikeguchi
 * Copyright (c) 2012 Yokohama City University
 *  
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 */

#define EWALD_C

#include <stdio.h>
#include <math.h>
#include <stdlib.h>

#include "misc.h"
#include "atom.h"
#include "boundary.h"
#include "linked_cell.h"
#include "nonbond.h"
#include "ewald.h"

#ifdef _OPENMP
#include "omp.h"
#endif

/* NOTE(review): <math.h> is already included above and declares erfc() on
   C99 toolchains; this explicit prototype is presumably kept for older
   compilers -- confirm before removing. */
double erfc(double);
/* Forward declaration; the definition is not in the visible part of this file. */
void SDMD_EW_pme2(EWALD *ew, LINKED_CELL *lc, ATOM_DATA *ad, BOUNDARY *bc);

/* PME_STORE selects between the "#if PME_STORE" and "#else" code paths below
   (per-atom spline weight storage versus a single scratch buffer). */
#define PME_STORE 0
/* NOTE(review): FORCE_SMOOTH is not referenced in the visible part of this
   file; presumably consumed by an included header -- confirm. */
#define FORCE_SMOOTH 1

/*
 * Put the option flags of an EWALD object into their default state:
 *   pressure_flag      = 0
 *   opt_infl           = 0
 *   self_energy_v_flag = 1
 * No other member of *ew is written.
 */
void EWALD_init(EWALD *ew)
{
  ew->self_energy_v_flag = 1;
  ew->opt_infl = 0;
  ew->pressure_flag = 0;
}

/*
 * Set up *ew for the conventional (non-PME) Ewald sum.
 *
 *   ew       : Ewald work area to fill in; flag is set to FLAG_EWALD
 *   ad       : atom data; ad->natom sizes the work arrays and ad->q[]
 *              supplies the charges for the self energy
 *   beta     : stored in ew->beta and used for dir_const and self_energy
 *   mmax     : "max of m number"; exp_mx holds (mmax+1) entries per atom,
 *              exp_my and exp_mz hold (2*mmax+1) entries per atom
 *   diel_sur : dielectric constant of the surroundings, stored in
 *              ew->diel_sur and reported in the log
 *
 * Side effects: allocates sx, exp_mx, exp_my, exp_mz and exp_mr with
 * emalloc(); sets dir_const = 2/sqrt(pi)*beta and
 * self_energy = -(beta/sqrt(pi)) * sum_i q_i^2; writes a summary via
 * lprintf().
 */
void EWALD_normal_init(EWALD *ew, ATOM_DATA *ad,
		       double beta, int mmax, double diel_sur)
{
  int i;

  ew->flag = FLAG_EWALD;
  ew->diel_sur = diel_sur;
  ew->beta = beta;
  ew->mmax = mmax;
  ew->dir_const = 2.0/sqrt(M_PI)*beta;

  /* per-atom work arrays */
  ew->sx = emalloc("sx in EWALD_INIT", sizeof(VEC)*ad->natom);

  ew->exp_mx = emalloc("exp_mx in EWALD_INIT",
                       sizeof(MD_COMPLEX) * ad->natom * (mmax+1));
  ew->exp_my = emalloc("exp_my in EWALD_INIT",
                       sizeof(MD_COMPLEX) * ad->natom * (2*mmax+1));
  ew->exp_mz = emalloc("exp_mz in EWALD_INIT",
                       sizeof(MD_COMPLEX) * ad->natom * (2*mmax+1));
  ew->exp_mr = emalloc("exp_mr in EWALD_INIT",
                       sizeof(MD_COMPLEX) * ad->natom);

  /* calculation of self energy */
  ew->self_energy = 0.0;
  for (i=0;i<ad->natom;i++) {
    ew->self_energy -= ad->q[i] * ad->q[i];
  }
  ew->self_energy *= ew->beta / sqrt(M_PI);
  /* end of calculation of self energy */

  lprintf("Ewald Set up: beta = %f, max of m number = %d\n", ew->beta,
	 ew->mmax);
  lprintf("  Surroundings:");
  /* exact comparisons: only these literal values get a descriptive label */
  if (ew->diel_sur == 0.0) {
    lprintf("metal\n");
  } else if (ew->diel_sur == 1.0) {
    lprintf("vacuum\n");
  } else if (ew->diel_sur == 80.0) {
    lprintf("water (diel=80)\n");
  } else {
    lprintf("diel=%.1f\n",ew->diel_sur);
  }
}

#if PME_STORE

/*
 * Accessors for the flat spline-weight arrays used when PME_STORE is
 * non-zero: element j of row i, with a row stride of ew->n_spline.
 * In the PME_STORE code below i is the atom index.  The macros expect a
 * variable named "ew" to be in scope at the point of use.
 */
#define Mnx(i,j)     ew->Mnx[(j)+(i)*ew->n_spline]
#define Mny(i,j)     ew->Mny[(j)+(i)*ew->n_spline]
#define Mnz(i,j)     ew->Mnz[(j)+(i)*ew->n_spline]
#define Mn1x(i,j)    ew->Mn1x[(j)+(i)*ew->n_spline]
#define Mn1y(i,j)    ew->Mn1y[(j)+(i)*ew->n_spline]
#define Mn1z(i,j)    ew->Mn1z[(j)+(i)*ew->n_spline]

#endif

#if 0

/*
 * Initialise *ew for Particle Mesh Ewald.
 * (This function sits inside an "#if 0" region and is currently not compiled.)
 *
 *   bc              : must be of type PERIODIC_BOUNDARY, otherwise the run
 *                     is aborted through marble_exit(1)
 *   nl              : nl->rl_off is the distance used in the
 *                     erfc(beta*r)/r tolerance relation
 *   ad              : natom and q[] are used for the self energy
 *   tol_beta        : the tolerance when tol_beta_flag is non-zero,
 *                     otherwise beta itself
 *   gridx/y/z       : PME grid dimensions, stored in ew->pme_grid[]
 *   diel_sur        : stored in ew->diel_sur
 *   n_spline        : spline order, stored in ew->n_spline
 *   erfc_resolution : handed to EW_set_erfc_table() when ERFC_TABLE is defined
 *
 * NOTE(review): the last lprintf() reads ew->n_erfc_table even when
 * ERFC_TABLE is not defined, in which case this function never sets it
 * -- confirm.
 */
void EW_pme_init(EWALD *ew, BOUNDARY *bc, NONBOND_LIST *nl, ATOM_DATA *ad, 
		 double tol_beta, int tol_beta_flag,
		 int gridx, int gridy, int gridz,
		 double diel_sur, int n_spline, int erfc_resolution)
{
  int i;
  double ewbeta_lo, ewbeta_hi;

  if (bc->type != PERIODIC_BOUNDARY) {
    lprintf("ERROR: Ewald: No periodic boundary condition found.\n");
    marble_exit(1);
  }
  
  ew->flag = FLAG_PME;
  ew->diel_sur = diel_sur;

  if (tol_beta_flag) {
    /* Solve erfc(beta*rl_off)/rl_off = tolerance for beta:
       double beta until the value drops below the tolerance, then run
       100 bisection steps on [0, beta]. */
    ew->tolerance = tol_beta;
    ew->beta = 1.0;
    while (erfc(ew->beta*nl->rl_off)/nl->rl_off >= ew->tolerance) ew->beta *= 2.0;
    ewbeta_lo = 0.0;
    ewbeta_hi = ew->beta;
    for (i=0;i<100;i++) {
      ew->beta = 0.5*(ewbeta_lo+ewbeta_hi);
      if (erfc(ew->beta*nl->rl_off)/nl->rl_off >= ew->tolerance) {
	ewbeta_lo = ew->beta;
      } else {
	ewbeta_hi = ew->beta;
      }
    }
  } else {
    /* beta given directly; report the tolerance it corresponds to */
    ew->beta = tol_beta;
    ew->tolerance = erfc(ew->beta*nl->rl_off)/nl->rl_off;
  }
  ew->dir_const = 2.0/sqrt(M_PI)*ew->beta;
  
  ew->pme_grid[0] = gridx;
  ew->pme_grid[1] = gridy;
  ew->pme_grid[2] = gridz;
  ew->n_spline = n_spline;

  /* ew->f = emalloc("f in PME_INIT", sizeof(VEC)*ad->natom); */
  
  /* calculation of self energy */
  ew->self_energy = 0.0;
  for (i=0;i<ad->natom;i++) {
    ew->self_energy -= ad->q[i] * ad->q[i];
  }
  ew->self_energy *= ew->beta / sqrt(M_PI);
  /* end of calculation of self energy */

  /* grid-sized arrays: Q is complex, B and C are real */
  ew->Q = emalloc("Q in PME_INIT", sizeof(FFTW_COMPLEX)*
		  ew->pme_grid[0]*ew->pme_grid[1]*ew->pme_grid[2]);
  ew->B = emalloc("B in PME_INIT", sizeof(double)*
		  ew->pme_grid[0]*ew->pme_grid[1]*ew->pme_grid[2]);
  ew->C = emalloc("C in PME_INIT", sizeof(double)*
		  ew->pme_grid[0]*ew->pme_grid[1]*ew->pme_grid[2]);

  ew->Qindex = alloc_3d_index(ew->Q, sizeof(FFTW_COMPLEX), ew->pme_grid);
  ew->Bindex = alloc_3d_index(ew->B, sizeof(double), ew->pme_grid);
  ew->Cindex = alloc_3d_index(ew->C, sizeof(double), ew->pme_grid);

#if PME_STORE
  /* one row of n_spline weights per atom */
  ew->Mnx = emalloc("Mnx in PME_INIT", sizeof(double)*(ew->n_spline)*ad->natom);
  ew->Mny = emalloc("Mny in PME_INIT", sizeof(double)*(ew->n_spline)*ad->natom);
  ew->Mnz = emalloc("Mnz in PME_INIT", sizeof(double)*(ew->n_spline)*ad->natom);
  ew->Mn1x = emalloc("Mn1x in PME_INIT", sizeof(double)*(ew->n_spline)*ad->natom);
  ew->Mn1y = emalloc("Mn1y in PME_INIT", sizeof(double)*(ew->n_spline)*ad->natom);
  ew->Mn1z = emalloc("Mn1z in PME_INIT", sizeof(double)*(ew->n_spline)*ad->natom);

#else
  /* a single scratch row of n_spline+1 weights */
  ew->Mnx = emalloc("Mnx in PME_INIT", sizeof(double)*(ew->n_spline+1));
  ew->Mny = emalloc("Mny in PME_INIT", sizeof(double)*(ew->n_spline+1));
  ew->Mnz = emalloc("Mnz in PME_INIT", sizeof(double)*(ew->n_spline+1));
  ew->Mn1x = emalloc("Mn1x in PME_INIT", sizeof(double)*(ew->n_spline+1));
  ew->Mn1y = emalloc("Mn1y in PME_INIT", sizeof(double)*(ew->n_spline+1));
  ew->Mn1z = emalloc("Mn1z in PME_INIT", sizeof(double)*(ew->n_spline+1));
#endif  
  
  ew->for_plan
    = fftw3d_create_plan(ew->pme_grid[0],ew->pme_grid[1],ew->pme_grid[2],
			 FFTW_FORWARD, FFTW_MEASURE | FFTW_IN_PLACE);
  
  ew->back_plan
    = fftw3d_create_plan(ew->pme_grid[0],ew->pme_grid[1],ew->pme_grid[2],
			 FFTW_BACKWARD, FFTW_MEASURE | FFTW_IN_PLACE);

  EW_calc_b_array(ew);
  EW_calc_c_array(ew, bc);
#ifdef ERFC_TABLE  
  EW_set_erfc_table(ew, nl->rl_off, erfc_resolution);
#endif

  lprintf("Particle Mesh Ewald:\n");
  lprintf("  Ewald Tolerance  : %.2e\n", ew->tolerance);
  lprintf("  Ewald Coefficient: %f\n", ew->beta);
  lprintf("  PME Grid         : %d %d %d\n",
	  ew->pme_grid[0],ew->pme_grid[1],ew->pme_grid[2]);
  lprintf("  Spline Order     : %d\n", ew->n_spline);
  lprintf("  Resolution of erfc : %d\n", erfc_resolution);
  lprintf("  Table for erfc   : %d points for %.2f Angstrom\n", ew->n_erfc_table, nl->rl_off+1.0);
  
}

#endif /* if 0 */

#ifdef ERFC_TABLE
/*
 * (Re)build the cubic-spline table of erfc(x) stored in ew->erfc_table.
 *
 *   cutoff          : the table gets beta*(cutoff+1.0)*erfc_resolution
 *                     points (truncated to an integer)
 *   erfc_resolution : table points per unit of x; the spacing is
 *                     1.0/erfc_resolution
 *
 * Column 0 of each row is filled with erfc(x); the analytic first
 * derivative -2/sqrt(pi)*exp(-x*x) is stored in column 1 of the first and
 * last rows as the boundary condition for EW_cubic_spline(), which then
 * fills in the remaining columns.  A table already attached to *ew is
 * released first.
 */
void EW_set_erfc_table(EWALD *ew, double cutoff, int erfc_resolution)
{
  const double slope0 = -2.0/sqrt(M_PI);   /* d/dx erfc(x) at x = 0 */
  double x_end;
  int row;

  ew->n_erfc_table = ew->beta*(cutoff+1.0)*erfc_resolution;
  ew->dx_erfc_table = 1.0/erfc_resolution;
  ew->max_erfc_table = ew->dx_erfc_table * ew->n_erfc_table;

  /* free(NULL) is a no-op, so no explicit NULL test is required */
  free(ew->erfc_table);

  ew->erfc_table = emalloc("set_erfc_table",
                           sizeof(double)*4*ew->n_erfc_table);

  /* tabulate the function values */
  row = 0;
  while (row < ew->n_erfc_table) {
    ew->erfc_table[row][0] = erfc(ew->dx_erfc_table * row);
    row++;
  }

  /* clamp the spline with the exact slopes at both ends */
  ew->erfc_table[0][1] = slope0;
  x_end = (ew->n_erfc_table - 1) * ew->dx_erfc_table;
  ew->erfc_table[ew->n_erfc_table - 1][1] = slope0 * exp(-x_end*x_end);

  EW_cubic_spline(ew->erfc_table, ew->n_erfc_table, ew->dx_erfc_table);
}

/*
 * Compute the coefficients of a clamped cubic spline on a uniform grid.
 *
 * input:  n : number of nodes (at least 2; smaller values are ignored)
 *         dx: spacing between nodes
 *         y[0..n-1][0]: function values at the nodes
 *         y[0][1], y[n-1][1]: first derivatives at the two end points
 * output: y[i][1..3] for i = 0..n-2: coefficients such that on interval i
 *           f(x_i + h) = y[i][0] + y[i][1]*h + y[i][2]*h*h + y[i][3]*h*h*h
 *         for 0 <= h <= dx.
 *
 * The last row is not a polynomial segment: y[n-1][1] keeps the input
 * derivative, y[n-1][2] holds the second derivative at the last node and
 * y[n-1][3] is left untouched.
 *
 * When n < 2 there is no interval to fit; the function returns without
 * touching y (the recurrences below would otherwise index y[1] and
 * y[n-2] outside the array).
 */
void EW_cubic_spline(double (*y)[4], int n, double dx)
{
  double p, A;
  int i;
  double qn, un;

  if (n < 2) {
    return;
  }

  /* y[i][2], y[i][3]: temporary
     h[i] := y[i][2], u[i] := y[i][3] */

  /* first derivative at end point */
  y[0][2] = -0.5;
  y[0][3] = (3.0/dx)*((y[1][0]-y[0][0])/dx-y[0][1]);

  /* forward substitution */
  for (i=1;i<n-1;i++) {
    p = y[i-1][2]+4.0;
    y[i][2] = -1.0/p;
    A = (y[i+1][0]-2.0*y[i][0]+y[i-1][0])/dx;
    y[i][3] = ((6.0*A/dx) - y[i-1][3]) / p;
  }

  /* first derivative at end point */
  qn = 0.5;
  un = (3.0/dx)*(y[n-1][1]-(y[n-1][0]-y[n-2][0])/dx);
  y[n-1][2] = (un-qn*y[n-2][3])/(qn*y[n-2][2]+1.0);

  /* backsubstitution: y[i][2] becomes the second derivative at node i */
  for (i=n-2;i>=0;i--) {
    y[i][2] = y[i][2]*y[i+1][2]+y[i][3];
  }

  /* calculation of y[i][1], y[i][2], y[i][3];
     y[i+1][2] is still the raw second derivative when row i is converted */
  for (i=0;i<n-1;i++) {
    y[i][3] = (y[i+1][2]-y[i][2])/(6.0*dx);
    y[i][2] /= 2.0;
    y[i][1] = (y[i+1][0]-y[i][0])/dx -y[i][2]*dx -y[i][3]*dx*dx;
  }
}
#endif  /* #ifdef ERFC_TABLE */

#if 1
/*
 * Direct-space energy/force entry point built from the shared template in
 * nonbond_direct.h.  The header is included once per supported
 * nl->vdw_method (NV_PSW, NV_FSW) with NB_VDW and NB_ELEC defined to pick
 * the variant; NB_ELEC is NE_EWALD in both cases.  Each inclusion supplies
 * the statement controlled by the preceding "if" / "else if".
 *
 * NOTE(review): NB_VDW and NB_ELEC are defined twice without an #undef in
 * this file, so nonbond_direct.h presumably undefines them itself, and
 * presumably accumulates into *vdw, *edir and *hbond -- confirm against
 * the header.  No branch is taken for any other vdw_method value.
 */
void EW_direct_energy_force(EWALD *ew, LINKED_CELL *lc, NONBOND_LIST *nl,
			    ATOM_DATA *ad, BOUNDARY *bc,
			    double *vdw, double *edir, double *hbond)
{
  /* Hide MPI_SDMD from the included template; it is restored below. */
#ifdef MPI_SDMD
#define MPI_SDMD_TMP
#undef MPI_SDMD
#endif  
  if (nl->vdw_method == NV_PSW)
#define NB_VDW   NV_PSW
#define NB_ELEC  NE_EWALD
#include "nonbond_direct.h"
  else if (nl->vdw_method == NV_FSW)
#define NB_VDW   NV_FSW
#define NB_ELEC  NE_EWALD
#include "nonbond_direct.h"
    
  /* Re-establish MPI_SDMD if it was defined on entry. */
#ifdef MPI_SDMD_TMP
#define MPI_SDMD
#undef MPI_SDMD_TMP
#endif  
}

#else

/*
 * Hand-written direct-space loop (alternative to the template-based
 * version; only compiled when the enclosing "#if 1" is switched to 0).
 *
 * For every pair in nl->ij_list, grouped by cell pair so that the periodic
 * image offset bc->offset_v[] can be applied, and closer than rl_off:
 *   - the 12-6 (or, with HBOND, 12-10) term is accumulated into *vdw
 *     (or *hbond), multiplied by the switching function S that acts
 *     between rl_on and rl_off;
 *   - q_i*q_j*erfc(beta*r)/r is accumulated into *edir (not switched);
 *   - forces go to ad->f[] and the pair virial to ad->virial[0..5] in the
 *     order xx, yy, zz, xy, xz, yz.
 * When ad->atom_ene_sample_flag is set, half of each pair energy is
 * credited to each atom in ad->atom_ene.
 *
 * *vdw, *edir and *hbond are reset to zero on entry.
 */
void EW_direct_energy_force(EWALD *ew, LINKED_CELL *lc, NONBOND_LIST *nl,
			    ATOM_DATA *ad, BOUNDARY *bc,
			    double *vdw, double *edir, double *hbond)
{
  int i,j,k,start,end;
  int vdw_index;
  double dx, dy, dz;
  double len, len2, len6, len12;
  double vdw12, vdw6, hb12, hb10, force, ene_tmp, elec_t2, ene_t2;
  double ewald_dir, ew_force;
#ifdef ERFC_TABLE
  double x, h, val_erfc, dval_erfc;
  int ix;
#endif
  VEC offset_v;
  int icp;
  /* for smoothing */
  double S, dS, len_rl_on, rl_tmp, rl_diff3;
  /* for atom_ene */
  int group_i, group_j;
  
  rl_tmp = 3.0 * nl->rl_off - nl->rl_on;
  rl_diff3 = pow(nl->rl_off - nl->rl_on, 3.0);

  *vdw = *edir = *hbond = 0.0;
  
  /* coordinates folded back by the stored translation tr_x */
  for (i=0;i<ad->natom;i++) {
    ad->fold_x[i].x = ad->x[i].x - VEC_MUL_MAT_X(ad->tr_x[i],bc->boxv);
    ad->fold_x[i].y = ad->x[i].y - VEC_MUL_MAT_Y(ad->tr_x[i],bc->boxv);
    ad->fold_x[i].z = ad->x[i].z - VEC_MUL_MAT_Z(ad->tr_x[i],bc->boxv);
  }
  
  for (icp=0;icp<lc->n_cell_pair;icp++) {
    offset_v=bc->offset_v[lc->cell_pair[icp].offset];
    start=lc->cell_pair[icp].alist_start;
    end=lc->cell_pair[icp].alist_end;
    for (k=start;k<=end;k++) {
  
      i = nl->ij_list[k][0];
      j = nl->ij_list[k][1];

      dx = ad->fold_x[i].x - ad->fold_x[j].x + offset_v.x;
      dy = ad->fold_x[i].y - ad->fold_x[j].y + offset_v.y;
      dz = ad->fold_x[i].z - ad->fold_x[j].z + offset_v.z;

      len2 = dx * dx + dy * dy + dz * dz;
      if (len2 >= nl->rl_off2) continue;

      len6 = len2 * len2 * len2;
      len12 = len6 * len6;
      len = sqrt(len2);

      /* switching function S and its derivative dS on (rl_on, rl_off) */
      if (len > nl->rl_on) {
	len_rl_on = len - nl->rl_on;
	S = 1.0-len_rl_on * len_rl_on * (rl_tmp - 2*len) / rl_diff3;
	dS = -6.0 * len_rl_on * (nl->rl_off - len) / rl_diff3;
      } else {
	S = 1.0; dS = 0.0;
      }

      vdw_index = ad->index[ad->vdw_type[i]+ad->vdw_type[j]*ad->ntype];
#ifdef HBOND    
      if (vdw_index >= 0) {
#endif      
	vdw12 = ad->vdw12[vdw_index] / len12;
	vdw6 = ad->vdw6[vdw_index] / len6;
	*vdw += (vdw12 - vdw6)*S;
	ene_tmp = vdw12 - vdw6;
	force = 12.0 * vdw12 - 6.0 * vdw6;
#ifdef HBOND      
      } else {
	/* negative index encodes a 12-10 hydrogen-bond parameter slot */
	vdw_index = - vdw_index - 2;
	hb12 = ad->hb12[vdw_index] / len12;
	hb10 = ad->hb10[vdw_index] / (len6 * len2 * len2);
	*hbond += (hb12 - hb10)*S;
	ene_tmp = hb12 - hb10;
	force = 12.0 * hb12 - 10.0 * hb10;
      }
#endif    

#ifdef ERFC_TABLE
      /* erfc and its derivative from the cubic-spline table */
      x = ew->beta*len;
      ix = (int) (x / ew->dx_erfc_table);
      h = x - ix * ew->dx_erfc_table;

      val_erfc = ew->erfc_table[ix][0] + (ew->erfc_table[ix][1] + (ew->erfc_table[ix][2] + ew->erfc_table[ix][3] * h) * h ) * h;
      dval_erfc = ew->erfc_table[ix][1] + (2.0*ew->erfc_table[ix][2] + 3.0*ew->erfc_table[ix][3] * h) * h;
    
      ewald_dir = ad->q[i]*ad->q[j]*val_erfc/len;
      ew_force = ewald_dir - ad->q[i]*ad->q[j]*ew->beta*dval_erfc;
#else  /* #ifdef ERFC_TABLE */
      ewald_dir = ad->q[i]*ad->q[j]*erfc(ew->beta*len) / len;
      ew_force  = ewald_dir +
	ad->q[i]*ad->q[j]*ew->dir_const*exp(-ew->beta*ew->beta*len2);
#endif /* #ifdef ERFC_TABLE */
    
#if 0  /* EWALD SMOOTH */      
      *edir += ewald_dir*S;
      force = (force + ew_force) / len2 * S - (ene_tmp + ewald_dir) * dS / len;
#else
      /* only the vdW/hbond part is switched; the Ewald term is left as is */
      *edir += ewald_dir;
      force = (force*S + ew_force) / len2 - ene_tmp*dS/len;
#endif

      if (ad->atom_ene_sample_flag) {
	ene_t2 = ene_tmp*0.5*S;
	elec_t2 = ewald_dir*0.5;
	group_i = ad->atom_ene_group[i];
	group_j = ad->atom_ene_group[j];
	ad->atom_ene[i][group_j][ATOM_ENE_VDW] += ene_t2;
	ad->atom_ene[j][group_i][ATOM_ENE_VDW] += ene_t2;
	ad->atom_ene[i][group_j][ATOM_ENE_ELEC] += elec_t2;
	ad->atom_ene[j][group_i][ATOM_ENE_ELEC] += elec_t2;
      }

      ad->f[i].x += force * dx;
      ad->f[i].y += force * dy;
      ad->f[i].z += force * dz;
    
      ad->f[j].x -= force * dx;
      ad->f[j].y -= force * dy;
      ad->f[j].z -= force * dz;

      /* virial */
      ad->virial[0] += force * dx * dx;
      ad->virial[1] += force * dy * dy;
      ad->virial[2] += force * dz * dz;
    
      ad->virial[3] += force * dx * dy;
      ad->virial[4] += force * dx * dz;
      ad->virial[5] += force * dy * dz;
    }
  }
}
#endif

#if 0

/*
 * Accessors for the per-atom tables of complex exponentials (n is the row,
 * used below with the atom index; m is the reciprocal index).
 *   EXP_MX : m in 0..mmax, row length mmax+1
 *   EXP_MY, EXP_MZ : m in -mmax..mmax, stored with an offset of mmax,
 *                    row length 2*mmax+1
 *   EXP_MR : one entry per row
 * The macros expect a variable named "ew" to be in scope.
 */
#define EXP_MX(n,m)    ew->exp_mx[(n)*(ew->mmax+1)+(m)]
#define EXP_MY(n,m)    ew->exp_my[(n)*(2*ew->mmax+1)+(m)+ew->mmax]
#define EXP_MZ(n,m)    ew->exp_mz[(n)*(2*ew->mmax+1)+(m)+ew->mmax]
#define EXP_MR(n)      ew->exp_mr[n]

/*
 * Reciprocal-space part of the conventional Ewald sum for an orthorhombic
 * box.  (Located inside an "#if 0" region; currently not compiled.)
 *
 *   ew   : beta, mmax and the work arrays sx / exp_mx / exp_my / exp_mz /
 *          exp_mr are used
 *   box  : box edge lengths
 *   min  : box origin subtracted from the coordinates
 *   ad   : positions x[], charges q[]; forces f[] and virial[] are updated
 *   elec : the reciprocal energy is ADDED to *elec (it is not reset here)
 *
 * Only mx >= 0 is visited; terms with mx > 0 are weighted by 2 to account
 * for the mirrored vector (-mx,-my,-mz).  Vectors with
 * mx*mx+my*my+mz*mz > mmax*mmax are skipped.
 *
 * The virial is stored as xx, yy, zz, xy, xz, yz in virial[0..5]; for each
 * reciprocal vector with components k_a = m_a/box[a] the contribution is
 *   e * (delta_ab - 2*(1/|k|^2 + pi^2/beta^2) * k_a * k_b).
 */
void EW_rec_energy_force(EWALD *ew, double box[3], double min[3],
			 ATOM_DATA *ad, double *elec)
{
  int mx, my, mz;
  MD_COMPLEX s, t;
  double c, cc, mlen2, factor, coef, e, g;
  int i,k,mmax2;

  /* fractional coordinates */
  for (i=0;i<ad->natom;i++) {
    ew->sx[i].x = (ad->x[i].x - min[0]) / box[0];
    ew->sx[i].y = (ad->x[i].y - min[1]) / box[1];
    ew->sx[i].z = (ad->x[i].z - min[2]) / box[2];
  }

  /* calculation of EXP_MX,EXP_MY, EXP_MZ */
  /* direct calculation for MX, MY, MZ = 0, -1, 1 */
  for (i=0;i<ad->natom;i++) {
    EXP_MX(i,0).r = 1.0;  EXP_MX(i,0).i = 0.0;
    EXP_MY(i,0).r = 1.0;  EXP_MY(i,0).i = 0.0;
    EXP_MZ(i,0).r = 1.0;  EXP_MZ(i,0).i = 0.0;

    EXP_MX(i,1).r = cos(2.0*M_PI*ew->sx[i].x);
    EXP_MX(i,1).i = sin(2.0*M_PI*ew->sx[i].x);
    
    EXP_MY(i,1).r = cos(2.0*M_PI*ew->sx[i].y);
    EXP_MY(i,1).i = sin(2.0*M_PI*ew->sx[i].y);
    
    EXP_MZ(i,1).r = cos(2.0*M_PI*ew->sx[i].z);
    EXP_MZ(i,1).i = sin(2.0*M_PI*ew->sx[i].z);
    
    EXP_MY(i,-1).r =  EXP_MY(i,1).r;    EXP_MY(i,-1).i = -EXP_MY(i,1).i;
    EXP_MZ(i,-1).r =  EXP_MZ(i,1).r;    EXP_MZ(i,-1).i = -EXP_MZ(i,1).i;
    /* higher orders by repeated multiplication; negative m by conjugation */
    for (k = 2; k <= ew->mmax; k++) {
      MUL_COMPLEX(EXP_MX(i,k), EXP_MX(i,k-1), EXP_MX(i,1));
      MUL_COMPLEX(EXP_MY(i,k), EXP_MY(i,k-1), EXP_MY(i,1));
      MUL_COMPLEX(EXP_MZ(i,k), EXP_MZ(i,k-1), EXP_MZ(i,1));

      EXP_MY(i,-k).r = EXP_MY(i,k).r;    EXP_MY(i,-k).i = -EXP_MY(i,k).i;
      EXP_MZ(i,-k).r = EXP_MZ(i,k).r;    EXP_MZ(i,-k).i = -EXP_MZ(i,k).i;
    }
  }
  
  cc = 1.0/(2.0*M_PI*box[0]*box[1]*box[2]);
  /* *elec = 0.0; */
  mmax2 = ew->mmax*ew->mmax;
  for (mx=0; mx<=ew->mmax; mx++) {
    if (mx == 0) factor = 1.0;
    else         factor = 2.0;
    for (my=-ew->mmax; my<=ew->mmax; my++) {
      for (mz=-ew->mmax; mz<=ew->mmax; mz++) {
        if (mx==0&&my==0&&mz==0) continue;
        if (mx*mx+my*my+mz*mz > mmax2) continue;
        
        mlen2 = (mx/box[0])*(mx/box[0])+(my/box[1])*(my/box[1])
          + (mz/box[2])*(mz/box[2]);
        c = factor*cc*exp(-M_PI*M_PI*mlen2/(ew->beta*ew->beta))/mlen2;

        /* structure factor s = sum_i q_i exp(2 pi i m.s_i); the per-atom
           phase is kept in EXP_MR for the force loop below */
        s.r = s.i = 0.0;
        for (i=0;i<ad->natom;i++) {
          MUL_COMPLEX(t,EXP_MX(i,mx),EXP_MY(i,my));
          MUL_COMPLEX(EXP_MR(i),t,EXP_MZ(i,mz));
          s.r += ad->q[i] * EXP_MR(i).r;
          s.i += ad->q[i] * EXP_MR(i).i;
        }
        e =  c * (s.r * s.r + s.i * s.i);
        *elec += e;

        /* virial: diagonal terms, then the mixed xy, xz, yz terms */
        g = 2.0*(1.0/mlen2+M_PI*M_PI/(ew->beta*ew->beta));
	ad->virial[0] += e*(1.0-g*mx*mx/(box[0]*box[0]));
	ad->virial[1] += e*(1.0-g*my*my/(box[1]*box[1]));
	ad->virial[2] += e*(1.0-g*mz*mz/(box[2]*box[2]));
	
	ad->virial[3] += e*(-g*mx*my/(box[0]*box[1]));
	ad->virial[4] += e*(-g*mx*mz/(box[0]*box[2]));
	ad->virial[5] += e*(-g*my*mz/(box[1]*box[2]));
	
        for (i=0;i<ad->natom;i++) {
          t.r = ad->q[i] * EXP_MR(i).r;
          t.i = ad->q[i] * EXP_MR(i).i;
          coef = c * 4.0*M_PI * (-s.r * t.i + s.i * t.r);
          ad->f[i].x -= coef * mx / box[0];
          ad->f[i].y -= coef * my / box[1];
          ad->f[i].z -= coef * mz / box[2];
        }
      }
    }
  }

#if 0   /* DEBUG: brute-force reference sum over the full (m1,m2,m3) range */
  *elec = 0.0;
  {
    int m1, m2, m3;
    for (m1=-ew->mmax; m1<=ew->mmax; m1++) {
      for (m2=-ew->mmax; m2<=ew->mmax; m2++) {
        for (m3=-ew->mmax; m3<=ew->mmax; m3++) {
          if (m1==0&&m2==0&&m3==0) continue;
          if (m1*m1+m2*m2+m3*m3 > mmax2) continue;
    
          mlen2 = (m1/box[0])*(m1/box[0])+(m2/box[1])*(m2/box[1])
            + (m3/box[2])*(m3/box[2]);
          c = cc*exp(-M_PI*M_PI*mlen2/(ew->beta*ew->beta))/mlen2;

          s.r = s.i = 0.0;
          for (i=0;i<ad->natom;i++) {
            s.r += ad->q[i] * cos(2.0*M_PI*(m1*ew->sx[i].x+m2*ew->sx[i].y+m3*ew->sx[i].z));
            s.i += ad->q[i] * sin(2.0*M_PI*(m1*ew->sx[i].x+m2*ew->sx[i].y+m3*ew->sx[i].z));
          }
          *elec += c * (s.r * s.r + s.i * s.i);
          for (i=0;i<ad->natom;i++) {
            t.r = ad->q[i] * cos(2.0*M_PI*(m1*ew->sx[i].x+m2*ew->sx[i].y+m3*ew->sx[i].z));
            t.i = ad->q[i] * sin(2.0*M_PI*(m1*ew->sx[i].x+m2*ew->sx[i].y+m3*ew->sx[i].z));
            ad->f[i].x -= c * 4.0*M_PI * m1 / box[0] * (-s.r * t.i + s.i * t.r);
            ad->f[i].y -= c * 4.0*M_PI * m2 / box[1] * (-s.r * t.i + s.i * t.r);
            ad->f[i].z -= c * 4.0*M_PI * m3 / box[2] * (-s.r * t.i + s.i * t.r);
          }
        }
      }
    }
  }
#endif  /* 0 ** DEBUG */
}

/*
 * Correction term over the pairs in nl->ij_list.
 * (Located inside an "#if 0" region; currently not compiled.)
 *
 * For every listed pair whose minimum-image distance r satisfies
 * rs_on2 < r*r < rl_off2, the quantity -q_i*q_j*(1-erfc(beta*r))/r,
 * multiplied by the switching function S, is ADDED to *elec, and the
 * matching force and virial are accumulated in ad->f[] and ad->virial[].
 * S rises from 0 to 1 between rs_on and rs_off and falls from 1 to 0
 * between rl_on and rl_off.
 *
 * The minimum image is taken per axis with the orthorhombic edge lengths
 * in box[].
 *
 * NOTE(review): with the "TEST" branch disabled the force omits the dS
 * term although the energy is multiplied by S -- presumably intentional,
 * confirm.  The vdW/hbond section is disabled, which leaves vdw_index,
 * len6, len12, vdw12, vdw6, hb12, hb10 and elec_t unused.
 */
void EW_nb_cor_energy_force(EWALD *ew, double box[3],
			    ATOM_DATA *ad, NONBOND_LIST *nl,
			    double *elec)
{
  int i,j,k;
  int vdw_index;
  double dx, dy, dz;
  double len, len2, len6, len12;
  double vdw12, vdw6, hb12, hb10, force, elec_t, ene_t;
  double ewald_dir, ew_force;
#ifdef ERFC_TABLE
  double x, h, val_erfc, dval_erfc;
  int ix;
#endif  
  /* for smoothing */
  double S, dS, len_rl_on, len_rs_on, rl_tmp, rs_tmp, rl_diff3, rs_diff3;

  rl_tmp = 3.0 * nl->rl_off - nl->rl_on;
  rs_tmp = 3.0 * nl->rs_off - nl->rs_on;
  rl_diff3 = pow(nl->rl_off - nl->rl_on, 3.0);
  rs_diff3 = pow(nl->rs_off - nl->rs_on, 3.0);

  /* *elec = 0.0; */
  /* *vdw = *hbond = 0.0; */
  for (k=0;k<nl->n_list;k++) {
    i = nl->ij_list[k][0];
    j = nl->ij_list[k][1];

    dx = ad->x[i].x - ad->x[j].x;
    dy = ad->x[i].y - ad->x[j].y;
    dz = ad->x[i].z - ad->x[j].z;

    /* minimum image: shift by at most one box length per axis */
    if (dx > box[0]*0.5) dx -= box[0];
    if (dy > box[1]*0.5) dy -= box[1];
    if (dz > box[2]*0.5) dz -= box[2];
    
    if (dx < -box[0]*0.5) dx += box[0];
    if (dy < -box[1]*0.5) dy += box[1];
    if (dz < -box[2]*0.5) dz += box[2];
    
    len2 = dx * dx + dy * dy + dz * dz;
    if (len2 >= nl->rl_off2 || len2 <= nl->rs_on2) continue;
    /*
    len6 = len2 * len2 * len2;
    len12 = len6 * len6;
    */
    len = sqrt(len2);
    
    /* switching function: outer ramp, inner ramp, or plateau */
    if (len > nl->rl_on) {
      len_rl_on = len - nl->rl_on;
      S = 1.0-len_rl_on * len_rl_on * (rl_tmp - 2*len) / rl_diff3;
      dS = -6.0 * len_rl_on * (nl->rl_off - len) / rl_diff3;
    } else if (len < nl->rs_off) {
      len_rs_on = len - nl->rs_on;
      S = len_rs_on * len_rs_on * (rs_tmp - 2*len) / rs_diff3;
      dS = 6.0 * len_rs_on * (nl->rs_off - len) / rs_diff3;
    } else {
      S = 1.0; dS = 0.0;
    }

    ene_t = force = 0.0;
#if 0    
    vdw_index = ad->index[ad->vdw_type[i]+ad->vdw_type[j]*ad->ntype];
    if (vdw_index >= 0 /* || len2 > ad->hbond_criteria2 */) {
      vdw12 = ad->vdw12[vdw_index] / len12;
      vdw6 = ad->vdw6[vdw_index] / len6;
      *vdw += (vdw12 - vdw6)*S;
      ene_t = vdw12 - vdw6;
      force = 12.0 * vdw12 - 6.0 * vdw6;
    } else {
      vdw_index = - vdw_index - 2;
      hb12 = ad->hb12[vdw_index] / len12;
      hb10 = ad->hb10[vdw_index] / (len6 * len2 * len2);
      *hbond += (hb12 - hb10)*S;
      ene_t = hb12 - hb10;
      force = 12.0 * hb12 - 10.0 * hb10;
    }
#endif

#ifdef ERFC_TABLE
    /* erfc and its derivative from the cubic-spline table */
    x = ew->beta*len;
    ix = (int) (x / ew->dx_erfc_table);
    h = x - ix * ew->dx_erfc_table;

    val_erfc = ew->erfc_table[ix][0] + (ew->erfc_table[ix][1] + (ew->erfc_table[ix][2] + ew->erfc_table[ix][3] * h) * h ) * h;
    dval_erfc = ew->erfc_table[ix][1] + (2.0*ew->erfc_table[ix][2] + 3.0*ew->erfc_table[ix][3] * h) * h;
    
    ewald_dir = -ad->q[i]*ad->q[j]*(1.0-val_erfc)/len;
    ew_force = ewald_dir - ad->q[i]*ad->q[j]*ew->beta*dval_erfc;
    /*
    printf("%e %e\n",ew_force, ewald_dir +
      ad->q[i]*ad->q[j]*ew->dir_const*exp(-ew->beta*ew->beta*len2));
    */

#else    
    ewald_dir = -ad->q[i]*ad->q[j]*(1.0-erfc(ew->beta*len)) / len;
    ew_force  = ewald_dir +
      ad->q[i]*ad->q[j]*ew->dir_const*exp(-ew->beta*ew->beta*len2);
#endif    
    
    *elec += ewald_dir*S;
#if 0   /* TEST */
    force = (force + ew_force) / len2 * S - (ene_t + ewald_dir) * dS / len;
#else
    force = (force + ew_force) / len2 * S;
#endif    
    /*
    *elec += ewald_dir;
    force = (force + ew_force) / len2;
    */
    ad->f[i].x += force * dx;
    ad->f[i].y += force * dy;
    ad->f[i].z += force * dz;
    
    ad->f[j].x -= force * dx;
    ad->f[j].y -= force * dy;
    ad->f[j].z -= force * dz;

    /* virial */
    ad->virial[0] += force * dx * dx;
    ad->virial[1] += force * dy * dy;
    ad->virial[2] += force * dz * dz;
    ad->virial[3] += force * dx * dy;
    ad->virial[4] += force * dx * dz;
    ad->virial[5] += force * dy * dz;
  }
}


/*
 * Ewald correction terms: excluded pairs, surface (dipole) term and self
 * energy.  (Located inside an "#if 0" region; currently not compiled.)
 * Everything is ADDED to *elec; forces go to ad->f[] and the virial to
 * ad->virial[0..5].
 *
 *  1. For every pair (i,j) in the exclusion lists ad->ex[] with i <= j,
 *     -q_i*q_j*(1-erfc(beta*r))/r is added, using the plain coordinate
 *     difference (no periodic image is applied here).
 *  2. If ew->diel_sur > 0, the term factor*|D|^2 with
 *     factor = 2*pi/((2*diel_sur+1)*V) and D = sum_i q_i*x_i is added.
 *  3. ew->self_energy is added.
 *
 * NOTE(review): a pair with i == j in an exclusion list would give r = 0
 * and a division by zero; presumably the lists never contain the atom
 * itself -- confirm.
 */
void EW_cor_energy_force(EWALD *ew, BOUNDARY *bc, ATOM_DATA *ad, double *elec)
{
  int i, j, iex;
  double ewald_cor, force;
  double dx, dy, dz;
  double len2, len;
  double factor;

  for (i=0;i<ad->natom;i++) {
    for (iex=0;iex<ad->ex[i].n_exatom;iex++) {
      j = ad->ex[i].exatom[iex];
      /* handle each pair once */
      if (i>j) continue;
      
      dx = ad->x[i].x - ad->x[j].x;
      dy = ad->x[i].y - ad->x[j].y;
      dz = ad->x[i].z - ad->x[j].z;
      len2 = dx * dx + dy * dy + dz * dz;
      len = sqrt(len2);
      
      ewald_cor = - ad->q[i]*ad->q[j]*(1.0-erfc(ew->beta*len)) / len;
      force  = (ewald_cor +
		ad->q[i]*ad->q[j]*ew->dir_const*exp(-ew->beta*ew->beta*len2))
	       / len2;
    
      *elec += ewald_cor;
      
    /* per-atom bookkeeping: half of the pair energy to each atom, filed
       under the last energy group (ewald_cor is not used again below) */
    if (ad->atom_ene_sample_flag) {
      ewald_cor *= 0.5;
      ad->atom_ene[i][ad->n_atom_ene_group-1][ATOM_ENE_ELEC] += ewald_cor;
      ad->atom_ene[j][ad->n_atom_ene_group-1][ATOM_ENE_ELEC] += ewald_cor;
    }
    
      ad->f[i].x += force * dx;
      ad->f[i].y += force * dy;
      ad->f[i].z += force * dz;
    
      ad->f[j].x -= force * dx;
      ad->f[j].y -= force * dy;
      ad->f[j].z -= force * dz;
      
      /* virial */
      ad->virial[0] += force * dx * dx;
      ad->virial[1] += force * dy * dy;
      ad->virial[2] += force * dz * dz;
      
      ad->virial[3] += force * dx * dy;
      ad->virial[4] += force * dx * dz;
      ad->virial[5] += force * dy * dz;
      
    }
  }

#if 1
  /* surface term; dx,dy,dz are reused to hold the total dipole sum q_i*x_i */
  if (ew->diel_sur > 0.0) {
    dx = dy = dz = 0.0;
    factor = 2.0*M_PI/((2.0*ew->diel_sur+1.0)*bc->V);
    
    for (i=0;i<ad->natom;i++) {
      dx += ad->q[i]*ad->x[i].x;
      dy += ad->q[i]*ad->x[i].y;
      dz += ad->q[i]*ad->x[i].z;
    }
    *elec += factor*(dx*dx+dy*dy+dz*dz);
    ad->virial[0] += factor*(-dx*dx+dy*dy+dz*dz);
    ad->virial[1] += factor*(dx*dx-dy*dy+dz*dz);
    ad->virial[2] += factor*(dx*dx+dy*dy-dz*dz);
    ad->virial[3] += - 2.0 * factor*dx*dy;
    ad->virial[4] += - 2.0 * factor*dx*dz;
    ad->virial[5] += - 2.0 * factor*dy*dz;
    for (i=0;i<ad->natom;i++) {
      force = 2.0 * factor * ad->q[i];
      ad->f[i].x -= force * dx;
      ad->f[i].y -= force * dy;
      ad->f[i].z -= force * dz;
    }
  }
#endif /* 1 */  
  *elec += ew->self_energy;
}



#if PME_STORE

/*
 * PME reciprocal energy and forces, variant that keeps the spline weights
 * of every atom (Mnx/Mn1x accessor macros).  Compiled only when PME_STORE
 * is non-zero, and additionally located inside an "#if 0" region.
 *
 * Outline, following the step comments in the body:
 *   1. spread the charges onto the grid Q with spline weights
 *   2. transform Q with ew->back_plan
 *   3. recompute the C array when ew->pressure_flag is set
 *   4. accumulate the energy 0.5*B*C*|Q|^2 and multiply Q by B*C
 *   6. transform Q with ew->for_plan and interpolate the forces
 * The energy is ADDED to *elec.
 *
 * NOTE(review): this variant does not compile as written.  It uses m2, mx,
 * my, mz and twopi_beta2 (virial section of step 4) and min[] and box[]
 * (step 6) without declaring them, and nothing assigns mx, my, mz.
 * NOTE(review): the Mn1 arrays are filled only when the recursion reaches
 * n == n_spline-1, which never happens for n_spline <= 3 because the loop
 * starts at n = 3 -- confirm the minimum supported spline order.
 * QARR/BARR/CARR are not defined in the visible part of this file;
 * presumably grid accessors from ewald.h -- confirm.
 */
void EW_pme_energy_force(EWALD *ew, ATOM_DATA *ad, BOUNDARY *bc, double *elec)
{
  int i, n, k;
  int ix, iy, iz;
  double ux, uy, uz;
  int kx, ky, kz;
  int nx, ny, nz;
  int ikx, iky, ikz;
  FFTW_COMPLEX s;
  double energy, b, c, e;
  double mdx,mdy,mdz;
  
  /* Step 1. calculation of Q */
  for (ix=0;ix<ew->pme_grid[0];ix++) {
    for (iy=0;iy<ew->pme_grid[1];iy++) {
      for (iz=0;iz<ew->pme_grid[2];iz++) {
	QARR(ix,iy,iz).re = QARR(ix,iy,iz).im = 0.0;
      }
    }
  }
  for (i=0;i<ad->natom;i++) {
    /*
    ux = ew->pme_grid[0] * (ad->x[i].x - min[0])/ box[0];
    uy = ew->pme_grid[1] * (ad->x[i].y - min[1])/ box[1];
    uz = ew->pme_grid[2] * (ad->x[i].z - min[2])/ box[2];
    */
    /* coordinates in grid units via the bc->recip matrix */
    ux = ew->pme_grid[0] * VEC_MUL_MAT_X(ad->x[i],bc->recip);
    uy = ew->pme_grid[1] * VEC_MUL_MAT_Y(ad->x[i],bc->recip);
    uz = ew->pme_grid[2] * VEC_MUL_MAT_Z(ad->x[i],bc->recip);
    
    kx = (int) floor(ux);
    ky = (int) floor(uy);
    kz = (int) floor(uz);
    
    /* order-2 weights; raised to order n_spline by the recursion below */
    Mnx(i,0) = ux - kx;            /* value for kx */
    Mnx(i,1) = 1.0 - Mnx(i,0);     /* value for kx - 1 */
    
    Mny(i,0) = uy - ky;            /* value for ky */
    Mny(i,1) = 1.0 - Mny(i,0);     /* value for ky - 1 */

    Mnz(i,0) = uz - kz;            /* value for kz */
    Mnz(i,1) = 1.0 - Mnz(i,0);     /* value for kz - 1 */
    
    for (n=3;n<=ew->n_spline;n++) {
      Mnx(i,n-1) = Mnx(i,n-2) * (-ux+kx+1) / (n-1);   /* (n-(ux-kx+n-1)/(n-1) */
      Mny(i,n-1) = Mny(i,n-2) * (-uy+ky+1) / (n-1);
      Mnz(i,n-1) = Mnz(i,n-2) * (-uz+kz+1) / (n-1);
      
      /* in-place update, highest index first so Mn(k-1) is still order n-1 */
      for (k=n-2;k>=1;k--) {
	Mnx(i,k) = (ux-kx+k)/(n-1)*Mnx(i,k) + (n-(ux-kx+k))/(n-1)*Mnx(i,k-1);
	Mny(i,k) = (uy-ky+k)/(n-1)*Mny(i,k) + (n-(uy-ky+k))/(n-1)*Mny(i,k-1);
	Mnz(i,k) = (uz-kz+k)/(n-1)*Mnz(i,k) + (n-(uz-kz+k))/(n-1)*Mnz(i,k-1);
      }
      Mnx(i,0) = (ux-kx)/(n-1)*Mnx(i,0);
      Mny(i,0) = (uy-ky)/(n-1)*Mny(i,0);
      Mnz(i,0) = (uz-kz)/(n-1)*Mnz(i,0);
      
      /* keep the order n_spline-1 weights; their differences are used
         for the force in step 6 */
      if (n == ew->n_spline-1) {
	for (k=0;k<=n-1;k++) {
	  Mn1x(i,k) = Mnx(i,k);
	  Mn1y(i,k) = Mny(i,k);
	  Mn1z(i,k) = Mnz(i,k);
	}
	Mn1x(i,n) = Mn1y(i,n) = Mn1z(i,n) = 0.0;
      }
    }

    /* spread the charge over n_spline^3 grid points with periodic wrap */
    for (nx=0;nx<ew->n_spline;nx++) {
      ikx = kx - nx;
      if (ikx < 0)                 ikx +=  ew->pme_grid[0];
      if (ikx >= ew->pme_grid[0])  ikx -=  ew->pme_grid[0];
      
      for (ny=0;ny<ew->n_spline;ny++) {
	iky = ky - ny;
	if (iky < 0)                 iky +=  ew->pme_grid[1];
	if (iky >= ew->pme_grid[1])  iky -=  ew->pme_grid[1];
	
	for (nz=0;nz<ew->n_spline;nz++) {
	  ikz = kz - nz;
	  if (ikz < 0)                 ikz +=  ew->pme_grid[2];
	  if (ikz >= ew->pme_grid[2])  ikz -=  ew->pme_grid[2];
	  
	  QARR(ikx,iky,ikz).re += ad->q[i]*Mnx(i,nx)*Mny(i,ny)*Mnz(i,nz);

	  /* DEBUG 
	  printf("Q: %d %d %d %f\n", ikx, iky, ikz, QARR(ikx,iky,ikz).re); */
	}
      }
    }
  }
  
  /* Step 2. inverse FFT */
  fftwnd(ew->back_plan, 1, ew->Q, 1, 0, 0, 0, 0);

  /* Step 3. calculation of C array */
  if (ew->pressure_flag) {
    EW_calc_c_array(ew, bc);
  }

  /* Step 4. calculation of Energy */
  energy = 0.0;
  for (ikx=0;ikx<ew->pme_grid[0];ikx++) {
    for (iky=0;iky<ew->pme_grid[1];iky++) {
      for (ikz=0;ikz<ew->pme_grid[2];ikz++) {
	s = QARR(ikx,iky,ikz);
	b = BARR(ikx,iky,ikz);
	c = CARR(ikx,iky,ikz);
	e = 0.5*b*c*(s.re*s.re+s.im*s.im);
	energy += e;
	/* NOTE(review): m2, mx, my, mz, twopi_beta2 are undeclared here */
	m2 = mx*mx+my*my+mz*mz;
	if (m2 != 0.0) {
	  ad->virial[0] += e*(1.0-2.0*(1.0/m2-twopi_beta2)*mx*mx);
	  ad->virial[1] += e*(1.0-2.0*(1.0/m2-twopi_beta2)*my*my);
	  ad->virial[2] += e*(1.0-2.0*(1.0/m2-twopi_beta2)*mz*mz);
	}
	QARR(ikx,iky,ikz).re *= b*c;
	QARR(ikx,iky,ikz).im *= b*c;
	/* DEBUG 
	   printf("%f %f %f %f\n",s.re,s.im,CARR(ikx,iky,ikz),BARR(ikx,iky,ikz)); */
      }
    }
  }
  *elec += energy;

  /* Step 5. calculation of virial tensor */

  /* Step 6. calculation of Force */
  fftwnd(ew->for_plan, 1, ew->Q, 1, 0, 0, 0, 0);

  for (i=0;i<ad->natom;i++) {
    /* NOTE(review): min[] and box[] are undeclared here; step 1 uses
       bc->recip for the same conversion */
    ux = ew->pme_grid[0] * (ad->x[i].x - min[0]) / box[0];
    uy = ew->pme_grid[1] * (ad->x[i].y - min[1]) / box[1];
    uz = ew->pme_grid[2] * (ad->x[i].z - min[2]) / box[2];
    
    kx = (int) floor(ux);
    ky = (int) floor(uy);
    kz = (int) floor(uz);
    
    for (nx=0;nx<ew->n_spline;nx++) {
      ikx = kx - nx;
      if (ikx < 0)                 ikx +=  ew->pme_grid[0];
      if (ikx >= ew->pme_grid[0])  ikx -=  ew->pme_grid[0];
      
      /* weight derivative as a difference of the order n_spline-1 weights */
      if (nx == 0) {
	mdx = Mn1x(i,0);
      } else {
	mdx = Mn1x(i,nx) - Mn1x(i,nx-1);
      }
      
      for (ny=0;ny<ew->n_spline;ny++) {
	iky = ky - ny;
	if (iky < 0)                 iky +=  ew->pme_grid[1];
	if (iky >= ew->pme_grid[1])  iky -=  ew->pme_grid[1];
	
	if (ny == 0) {
	  mdy = Mn1y(i,0);
	} else {
	  mdy = Mn1y(i,ny)-Mn1y(i,ny-1);
	}
	
	for (nz=0;nz<ew->n_spline;nz++) {
	  ikz = kz - nz;
	  if (ikz < 0)                 ikz +=  ew->pme_grid[2];
	  if (ikz >= ew->pme_grid[2])  ikz -=  ew->pme_grid[2];

	  if (nz == 0) {
	    mdz = Mn1z(i,0);
	  } else {
	    mdz = Mn1z(i,nz) - Mn1z(i,nz-1);
	  }
	    
	  ad->f[i].x -= ew->pme_grid[0]/box[0]*ad->q[i]*mdx*Mny(i,ny)*Mnz(i,nz)
	    * QARR(ikx,iky,ikz).re;
	  ad->f[i].y -= ew->pme_grid[1]/box[1]*ad->q[i]*Mnx(i,nx)*mdy*Mnz(i,nz)
	    * QARR(ikx,iky,ikz).re;
	  ad->f[i].z -= ew->pme_grid[2]/box[2]*ad->q[i]*Mnx(i,nx)*Mny(i,ny)*mdz
	    * QARR(ikx,iky,ikz).re;

	  /* DEBUG 
	  printf("Q: %d %d %d %f\n", ikx, iky, ikz, QARR(ikx,iky,ikz).re); */
	}
      }
    }
  }

  /* DEBUG */
#if 0    
  {
    FILE *fp;
    int j;
    static int step = 0;

    step++;
    if (step == 100) {
    fp = fopen("tmp","w");
    for (i=0;i<ad->natom;i++) {
      for (j=0;j<ew->n_spline;j++) {
	fprintf(fp,"%d %d %25.17e %25.17e\n",i,j,Mnx(i,j),Mn1x(i,j));
      }
    }
    fclose(fp);
    /* exit(1); */
    }
  
  }
#endif /* 0 ** DEBUG */
}

#else  /** #if PME_STORE **/

/*
 * EW_pme_energy_force
 *
 * Reciprocal-space particle-mesh Ewald energy, virial and forces.  This is
 * the variant in the #else branch of "#if PME_STORE": the spline weights
 * are recomputed per atom into the scratch arrays ew->Mnx/Mny/Mnz (and
 * ew->Mn1x/Mn1y/Mn1z) instead of being kept for all atoms.
 *
 *   ew   : PME data.  Reads pme_grid[], n_spline, beta, pressure_flag and
 *          the FFTW plans back_plan / for_plan; overwrites the Q grid and
 *          the per-atom spline scratch arrays.
 *   ad   : atom data.  Reads x[] and q[]; adds to f[] and virial[0..5];
 *          when atom_ene_sample_flag is set, also adds to
 *          atom_ene[i][n_atom_ene_group-1][ATOM_ENE_ELEC].
 *   bc   : boundary data.  recip[][] is used to turn a position into the
 *          scaled coordinate that is multiplied by the grid size, and to
 *          turn integer wave indices into the vector (mx,my,mz).
 *   elec : the energy computed here is ADDED to *elec.
 *
 * Steps: (1) spread charges on Q, (2) transform with back_plan,
 * (3) optionally rebuild the C array, (4,5) sum energy and virial and
 * scale Q by B*C, (6) transform with for_plan and interpolate forces.
 *
 * Grid indices are wrapped by adding or subtracting the grid size once,
 * so kx-nx (etc.) must lie within one grid period of the valid range.
 *
 * NOTE: the force step needs the weights of spline order n_spline-1.
 * They are captured at the start of the last recursion pass, which works
 * for every n_spline >= 3.  For n_spline < 3 the recursion loop does not
 * run and Mn1x/Mn1y/Mn1z are left untouched.
 */
void EW_pme_energy_force(EWALD *ew, ATOM_DATA *ad, BOUNDARY *bc, double *elec)
{
  int i, n, k;
  int ix, iy, iz;
  double ux, uy, uz;
  int kx, ky, kz;
  int nx, ny, nz;
  int ikx, iky, ikz;
  int imx, imy, imz;
  FFTW_COMPLEX s;
  double energy, b, c, e;
  double mdx,mdy,mdz;
  double twopi_beta2;
  double mx, my, mz, m2;
  double fx, fy, fz, qQ;

  /* despite its name this holds -pi^2/beta^2 */
  twopi_beta2 = -M_PI*M_PI/(ew->beta*ew->beta);

  /* Step 1. calculation of Q */
  for (ix=0;ix<ew->pme_grid[0];ix++) {
    for (iy=0;iy<ew->pme_grid[1];iy++) {
      for (iz=0;iz<ew->pme_grid[2];iz++) {
        QARR(ix,iy,iz).re = QARR(ix,iy,iz).im = 0.0;
      }
    }
  }
  for (i=0;i<ad->natom;i++) {
    /* scaled position in grid units */
    ux = ew->pme_grid[0] * VEC_MUL_MAT_X(ad->x[i],bc->recip);
    uy = ew->pme_grid[1] * VEC_MUL_MAT_Y(ad->x[i],bc->recip);
    uz = ew->pme_grid[2] * VEC_MUL_MAT_Z(ad->x[i],bc->recip);

    kx = (int) floor(ux);
    ky = (int) floor(uy);
    kz = (int) floor(uz);

    /* order-2 weights; entry [k] belongs to grid point (kx - k) */
    ew->Mnx[0] = ux - kx;            /* value for kx */
    ew->Mnx[1] = 1.0 - ew->Mnx[0];   /* value for kx - 1 */

    ew->Mny[0] = uy - ky;            /* value for ky */
    ew->Mny[1] = 1.0 - ew->Mny[0];   /* value for ky - 1 */

    ew->Mnz[0] = uz - kz;            /* value for kz */
    ew->Mnz[1] = 1.0 - ew->Mnz[0];   /* value for kz - 1 */

    /* raise the order in place from 2 up to n_spline; k runs downwards
       so that entry [k-1] still holds the previous order when read */
    for (n=3;n<=ew->n_spline;n++) {
      ew->Mnx[n-1] = ew->Mnx[n-2]* (-ux+kx+1) / (n-1);   /* (n-(ux-kx+n-1))/(n-1) */
      ew->Mny[n-1] = ew->Mny[n-2]* (-uy+ky+1) / (n-1);
      ew->Mnz[n-1] = ew->Mnz[n-2]* (-uz+kz+1) / (n-1);

      for (k=n-2;k>=1;k--) {
        ew->Mnx[k] = (ux-kx+k)/(n-1)*ew->Mnx[k] + (n-(ux-kx+k))/(n-1)*ew->Mnx[k-1];
        ew->Mny[k] = (uy-ky+k)/(n-1)*ew->Mny[k] + (n-(uy-ky+k))/(n-1)*ew->Mny[k-1];
        ew->Mnz[k] = (uz-kz+k)/(n-1)*ew->Mnz[k] + (n-(uz-kz+k))/(n-1)*ew->Mnz[k-1];
      }
      ew->Mnx[0] = (ux-kx)/(n-1)*ew->Mnx[0];
      ew->Mny[0] = (uy-ky)/(n-1)*ew->Mny[0];
      ew->Mnz[0] = (uz-kz)/(n-1)*ew->Mnz[0];
    }

    /* spread the charge over the n_spline^3 neighbouring grid points */
    for (nx=0;nx<ew->n_spline;nx++) {
      ikx = kx - nx;
      if (ikx < 0)                 ikx +=  ew->pme_grid[0];
      if (ikx >= ew->pme_grid[0])  ikx -=  ew->pme_grid[0];

      for (ny=0;ny<ew->n_spline;ny++) {
        iky = ky - ny;
        if (iky < 0)                 iky +=  ew->pme_grid[1];
        if (iky >= ew->pme_grid[1])  iky -=  ew->pme_grid[1];

        for (nz=0;nz<ew->n_spline;nz++) {
          ikz = kz - nz;
          if (ikz < 0)                 ikz +=  ew->pme_grid[2];
          if (ikz >= ew->pme_grid[2])  ikz -=  ew->pme_grid[2];

          QARR(ikx,iky,ikz).re += ad->q[i]*ew->Mnx[nx]*ew->Mny[ny]*ew->Mnz[nz];
        }
      }
    }
  }

  /* Step 2. inverse FFT */
  fftwnd(ew->back_plan, 1, ew->Q, 1, 0, 0, 0, 0);

  /* Step 3. calculation of C array */
  /* only rebuilt when pressure_flag is set; otherwise the values already
     stored in the C array are used as they are */
  if (ew->pressure_flag) {
    EW_calc_c_array(ew, bc);
  }

  /* Step 4. calculation of Energy */
  /* Step 5. calculation of virial tensor */
  energy = 0.0;
  for (ikx=0;ikx<ew->pme_grid[0];ikx++) {
    /* fold the grid index into a signed wave index */
    if (ikx <= ew->pme_grid[0]/2)
      imx = ikx;
    else
      imx = ikx-ew->pme_grid[0];
    for (iky=0;iky<ew->pme_grid[1];iky++) {
      if (iky <= ew->pme_grid[1]/2)
        imy = iky;
      else
        imy = iky-ew->pme_grid[1];
      for (ikz=0;ikz<ew->pme_grid[2];ikz++) {
        if (ikz <= ew->pme_grid[2]/2)
          imz = ikz;
        else
          imz = ikz-ew->pme_grid[2];

        s = QARR(ikx,iky,ikz);
        b = BARR(ikx,iky,ikz);
        c = CARR(ikx,iky,ikz);

        /* the whole grid is summed, every point with the factor 0.5 */
        e = 0.5*b*c*(s.re*s.re+s.im*s.im);
        energy += e;
        mx=imx*bc->recip[0][0]+imy*bc->recip[0][1]+imz*bc->recip[0][2];
        my=imx*bc->recip[1][0]+imy*bc->recip[1][1]+imz*bc->recip[1][2];
        mz=imx*bc->recip[2][0]+imy*bc->recip[2][1]+imz*bc->recip[2][2];

        m2 = mx*mx+my*my+mz*mz;
        if (m2 != 0.0) {
          /* virial[0..2]: xx, yy, zz;  virial[3..5]: xy, xz, yz */
          ad->virial[0] += e*(1.0-2.0*(1.0/m2-twopi_beta2)*mx*mx);
          ad->virial[1] += e*(1.0-2.0*(1.0/m2-twopi_beta2)*my*my);
          ad->virial[2] += e*(1.0-2.0*(1.0/m2-twopi_beta2)*mz*mz);

          ad->virial[3] += e*(-2.0*(1.0/m2-twopi_beta2)*mx*my);
          ad->virial[4] += e*(-2.0*(1.0/m2-twopi_beta2)*mx*mz);
          ad->virial[5] += e*(-2.0*(1.0/m2-twopi_beta2)*my*mz);
        }

        /* scale Q by B*C in place; it is transformed again in step 6 */
        QARR(ikx,iky,ikz).re *= b*c;
        QARR(ikx,iky,ikz).im *= b*c;
      }
    }
  }
  *elec += energy;

  /* Step 6. calculation of Force */
  fftwnd(ew->for_plan, 1, ew->Q, 1, 0, 0, 0, 0);

  for (i=0;i<ad->natom;i++) {
    ux = ew->pme_grid[0] * VEC_MUL_MAT_X(ad->x[i],bc->recip);
    uy = ew->pme_grid[1] * VEC_MUL_MAT_Y(ad->x[i],bc->recip);
    uz = ew->pme_grid[2] * VEC_MUL_MAT_Z(ad->x[i],bc->recip);

    kx = (int) floor(ux);
    ky = (int) floor(uy);
    kz = (int) floor(uz);

    ew->Mnx[0] = ux - kx;            /* value for kx */
    ew->Mnx[1] = 1.0 - ew->Mnx[0];   /* value for kx - 1 */

    ew->Mny[0] = uy - ky;            /* value for ky */
    ew->Mny[1] = 1.0 - ew->Mny[0];   /* value for ky - 1 */

    ew->Mnz[0] = uz - kz;            /* value for kz */
    ew->Mnz[1] = 1.0 - ew->Mnz[0];   /* value for kz - 1 */

    for (n=3;n<=ew->n_spline;n++) {
      if (n == ew->n_spline) {
        /* Before the last pass Mn? still holds order n-1 in entries
           0..n-2.  Keep a copy for the derivative; entry n-1 is zero.
           Taking the copy here (rather than at the end of pass
           n_spline-1) also covers n_spline == 3, where no such earlier
           pass exists. */
        for (k=0;k<=n-2;k++) {
          ew->Mn1x[k] = ew->Mnx[k];
          ew->Mn1y[k] = ew->Mny[k];
          ew->Mn1z[k] = ew->Mnz[k];
        }
        ew->Mn1x[n-1] = ew->Mn1y[n-1] = ew->Mn1z[n-1] = 0.0;
      }

      ew->Mnx[n-1] = ew->Mnx[n-2]* (-ux+kx+1) / (n-1);   /* (n-(ux-kx+n-1))/(n-1) */
      ew->Mny[n-1] = ew->Mny[n-2]* (-uy+ky+1) / (n-1);
      ew->Mnz[n-1] = ew->Mnz[n-2]* (-uz+kz+1) / (n-1);

      for (k=n-2;k>=1;k--) {
        ew->Mnx[k] = (ux-kx+k)/(n-1)*ew->Mnx[k] + (n-(ux-kx+k))/(n-1)*ew->Mnx[k-1];
        ew->Mny[k] = (uy-ky+k)/(n-1)*ew->Mny[k] + (n-(uy-ky+k))/(n-1)*ew->Mny[k-1];
        ew->Mnz[k] = (uz-kz+k)/(n-1)*ew->Mnz[k] + (n-(uz-kz+k))/(n-1)*ew->Mnz[k-1];
      }
      ew->Mnx[0] = (ux-kx)/(n-1)*ew->Mnx[0];
      ew->Mny[0] = (uy-ky)/(n-1)*ew->Mny[0];
      ew->Mnz[0] = (uz-kz)/(n-1)*ew->Mnz[0];
    }

    for (nx=0;nx<ew->n_spline;nx++) {
      ikx = kx - nx;
      if (ikx < 0)                 ikx +=  ew->pme_grid[0];
      if (ikx >= ew->pme_grid[0])  ikx -=  ew->pme_grid[0];

      /* derivative of the weight: difference of the order n_spline-1
         weights at this and the previous offset */
      if (nx == 0) {
        mdx = ew->Mn1x[0];
      } else {
        mdx = ew->Mn1x[nx] - ew->Mn1x[nx-1];
      }

      for (ny=0;ny<ew->n_spline;ny++) {
        iky = ky - ny;
        if (iky < 0)                 iky +=  ew->pme_grid[1];
        if (iky >= ew->pme_grid[1])  iky -=  ew->pme_grid[1];

        if (ny == 0) {
          mdy = ew->Mn1y[0];
        } else {
          mdy = ew->Mn1y[ny]-ew->Mn1y[ny-1];
        }

        for (nz=0;nz<ew->n_spline;nz++) {
          ikz = kz - nz;
          if (ikz < 0)                 ikz +=  ew->pme_grid[2];
          if (ikz >= ew->pme_grid[2])  ikz -=  ew->pme_grid[2];

          if (nz == 0) {
            mdz = ew->Mn1z[0];
          } else {
            mdz = ew->Mn1z[nz] - ew->Mn1z[nz-1];
          }

          /* atomic energy calculation */
          if (ad->atom_ene_sample_flag) {
            ad->atom_ene[i][ad->n_atom_ene_group-1][ATOM_ENE_ELEC] +=
              0.5*ad->q[i]*ew->Mnx[nx]*ew->Mny[ny]*ew->Mnz[nz]*QARR(ikx,iky,ikz).re;
          }

          qQ=ad->q[i]*QARR(ikx,iky,ikz).re;

          /* gradient with respect to the scaled coordinates ... */
          fx=ew->pme_grid[0]*mdx*ew->Mny[ny]*ew->Mnz[nz]*qQ;
          fy=ew->pme_grid[1]*ew->Mnx[nx]*mdy*ew->Mnz[nz]*qQ;
          fz=ew->pme_grid[2]*ew->Mnx[nx]*ew->Mny[ny]*mdz*qQ;

          /* ... converted through recip[][] and subtracted from f[] */
          ad->f[i].x -= fx*bc->recip[0][0]+fy*bc->recip[0][1]+fz*bc->recip[0][2];
          ad->f[i].y -= fx*bc->recip[1][0]+fy*bc->recip[1][1]+fz*bc->recip[1][2];
          ad->f[i].z -= fx*bc->recip[2][0]+fy*bc->recip[2][1]+fz*bc->recip[2][2];
        }
      }
    }
  }
}

#endif  /** #if PME_STORE **/


/*
 * EW_calc_c_array
 *
 * Fills the C array of the PME grid.  The (0,0,0) element is set to zero;
 * every other element gets
 *
 *     exp(-pi^2 * m^2 / beta^2) / (pi * V * m^2)
 *
 * where m is built from the signed wave indices and bc->recip[][],
 * V is bc->V and beta is ew->beta.
 */
void EW_calc_c_array(EWALD *ew, BOUNDARY *bc)
{
  int gix, giy, giz;              /* grid indices                      */
  int wave_x, wave_y, wave_z;     /* grid indices folded to signed     */
  double vec_x, vec_y, vec_z;     /* wave vector components            */
  double vec_sq;
  double inv_pi_vol, neg_pi2_over_beta2;

  inv_pi_vol = 1.0/(M_PI*bc->V);
  neg_pi2_over_beta2 = -M_PI*M_PI/(ew->beta*ew->beta);

  /* the zero wave vector carries no contribution */
  CARR(0,0,0) = 0.0;

  for (gix = 0; gix < ew->pme_grid[0]; gix++) {
    wave_x = (gix <= ew->pme_grid[0]/2) ? gix : gix - ew->pme_grid[0];

    for (giy = 0; giy < ew->pme_grid[1]; giy++) {
      wave_y = (giy <= ew->pme_grid[1]/2) ? giy : giy - ew->pme_grid[1];

      for (giz = 0; giz < ew->pme_grid[2]; giz++) {
        wave_z = (giz <= ew->pme_grid[2]/2) ? giz : giz - ew->pme_grid[2];

        /* the origin was handled above (it would divide by zero here) */
        if (gix != 0 || giy != 0 || giz != 0) {
          vec_x = wave_x*bc->recip[0][0]+wave_y*bc->recip[0][1]+wave_z*bc->recip[0][2];
          vec_y = wave_x*bc->recip[1][0]+wave_y*bc->recip[1][1]+wave_z*bc->recip[1][2];
          vec_z = wave_x*bc->recip[2][0]+wave_y*bc->recip[2][1]+wave_z*bc->recip[2][2];
          vec_sq = vec_x*vec_x+vec_y*vec_y+vec_z*vec_z;

          CARR(gix,giy,giz) = inv_pi_vol * exp(neg_pi2_over_beta2*vec_sq)/vec_sq;
        }
      }
    }
  }
}

/*
 * EW_calc_b_array
 *
 * Fills the B array of the PME grid as a product of three per-axis
 * factors:  BARR(ix,iy,iz) = b0[ix] * b1[iy] * b2[iz].
 *
 * For axis a and index m the factor is |t / sum|^2 with
 *     t   = exp(2 pi i (n_spline-1) m / K)
 *     sum = SUM_{k=0}^{n_spline-2} Mn[k+1] * exp(2 pi i m k / K)
 * where K = ew->pme_grid[a] and Mn[] are the order-n_spline spline values
 * at the integers.  When both parts of the sum are below EPS in magnitude
 * the factor is set to 0.
 *
 * All temporary buffers are allocated and released inside this function.
 */
void EW_calc_b_array(EWALD *ew)
{
  double *axis_factor[3];
  double *spline;
  MD_COMPLEX phase, ratio, denom;
  double angle, denom_norm2;
  int order, knot, axis, wave;
  int gix, giy, giz;

  axis_factor[0] = emalloc("b1 in calc_b_array", sizeof(double)*ew->pme_grid[0]);
  axis_factor[1] = emalloc("b2 in calc_b_array", sizeof(double)*ew->pme_grid[1]);
  axis_factor[2] = emalloc("b3 in calc_b_array", sizeof(double)*ew->pme_grid[2]);
  spline         = emalloc("Mn in calc_b_array", sizeof(double)*(ew->n_spline+1));

  /* start from order 2: values at the integers 0, 1, 2 */
  spline[0] = 0.0;
  spline[1] = 1.0;
  spline[2] = 0.0;

  /* raise the order one step at a time, updating in place; the knot
     index runs downwards so spline[knot-1] is still the previous order */
  order = 3;
  while (order <= ew->n_spline) {
    spline[order] = 0.0;
    for (knot = order-1; knot >= 1; knot--) {
      spline[knot] = spline[knot] * (double) knot / (order-1)
        + spline[knot-1] * (double) (order-knot)/(order-1);
    }
    spline[0] = 0.0;
    order++;
  }

  for (axis = 0; axis < 3; axis++) {
    for (wave = 0; wave < ew->pme_grid[axis]; wave++) {

      /* denominator: discrete transform of the spline values */
      denom.r = denom.i = 0.0;
      for (knot = 0; knot < ew->n_spline-1; knot++) {
        angle = 2.0*M_PI*wave*knot/ew->pme_grid[axis];
        denom.r += spline[knot+1] * cos(angle);
        denom.i += spline[knot+1] * sin(angle);
      }

      if (fabs(denom.r) < EPS && fabs(denom.i) < EPS) {
        /* vanishing denominator: drop this wave index */
        axis_factor[axis][wave] = 0.0;
      } else {
        angle = 2.0*M_PI*(ew->n_spline-1)*wave/ew->pme_grid[axis];
        phase.r = cos(angle);
        phase.i = sin(angle);

        /* ratio = phase / denom, then take its squared modulus */
        denom_norm2 = denom.r*denom.r+denom.i*denom.i;
        ratio.r = (phase.r * denom.r + phase.i * denom.i) / denom_norm2;
        ratio.i = (phase.i * denom.r - phase.r * denom.i) / denom_norm2;
        axis_factor[axis][wave] = ratio.r * ratio.r + ratio.i * ratio.i;
      }
    }
  }

  /* the full array is the outer product of the three axis factors */
  for (gix = 0; gix < ew->pme_grid[0]; gix++) {
    for (giy = 0; giy < ew->pme_grid[1]; giy++) {
      for (giz = 0; giz < ew->pme_grid[2]; giz++) {
        BARR(gix,giy,giz) = axis_factor[0][gix] * axis_factor[1][giy] * axis_factor[2][giz];
      }
    }
  }

  free(axis_factor[0]);
  free(axis_factor[1]);
  free(axis_factor[2]);
  free(spline);
}

#endif /* if 0 */
