/*
 * 
 * This source code is part of 
 *   MARBLE (MoleculAR simulation package for BiomoLEcules)
 * 
 * Written by Mitsunori Ikeguchi
 * Copyright (c) 2012 Yokohama City University
 *  
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 */

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

#include "md_system.h"

#ifdef MPI_SDMD

#include "parallel.h"
#include "sdmd.h"

#ifdef _OPENMP
#include "omp.h"
#endif

#define SDMD_EW_MEASURE_TIME

#ifndef SDMD_EW_MEASURE_TIME
#define SDMD_EW_dtime()
#define SDMD_EW_add_dtime(x)
#endif

double erfc(double);

/*
 * SDMD_EW_pme_init: fill in the EWALD structure for particle mesh Ewald.
 *
 *   ew              structure to initialise (ew->opt_infl and
 *                   ew->self_energy_v_flag are read here, so they must
 *                   already be set by the caller)
 *   bc              boundary; must be PERIODIC_BOUNDARY, otherwise the run
 *                   is aborted
 *   nl              nonbond list; nl->rl_off is the distance at which the
 *                   direct-space tolerance erfc(beta*r)/r is evaluated
 *   ad              atom data (charges, exclusion lists, natom)
 *   tol_beta        if tol_beta_flag != 0: requested tolerance, beta is
 *                   found by bisection; otherwise: beta itself and the
 *                   tolerance is derived from it
 *   grid, d_grid    if grid[0] < 0 the grid is chosen automatically from
 *                   the box diagonal and the spacing d_grid; otherwise
 *                   grid[0..2] is used as given
 *   diel_sur        stored in ew->diel_sur
 *   n_spline        B-spline order
 *   erfc_resolution passed to EW_set_erfc_table when ERFC_TABLE is defined
 *   pme_n_pe        not used by this routine
 *
 * All buffers are obtained with emalloc and are not released here.
 */
void SDMD_EW_pme_init(EWALD *ew, BOUNDARY *bc, NONBOND_LIST *nl, ATOM_DATA *ad, 
		      double tol_beta, int tol_beta_flag, 
		      int grid[3], double d_grid, 
		      double diel_sur, int n_spline, int erfc_resolution,
		      int pme_n_pe)
{
  int i, tmp;
  double ewbeta_lo, ewbeta_hi;
  double sum_q;
  char *func = "SDMD_EW_pme_init";

  (void) pme_n_pe;  /* kept in the signature for callers; unused here */

  if (bc->type != PERIODIC_BOUNDARY) {
    lprintf("ERROR: Ewald: No periodic boundary condition found.\n");
    marble_exit(1);
  }

  ew->flag = FLAG_PME;
  ew->diel_sur = diel_sur;
  
  if (tol_beta_flag) {
    /* The doubling loop below only terminates when erfc(beta*r)/r can
       drop below the tolerance, which requires tolerance > 0 and r > 0. */
    if (!(tol_beta > 0.0) || !(nl->rl_off > 0.0)) {
      lprintf("ERROR: Ewald: tolerance (%e) and cutoff (%f) must be positive.\n",
	      tol_beta, (double) nl->rl_off);
      marble_exit(1);
    }
    ew->tolerance = tol_beta;

    /* bracket beta from above by doubling ... */
    ew->beta = 1.0;
    while (erfc(ew->beta*nl->rl_off)/nl->rl_off >= ew->tolerance) ew->beta *= 2.0;

    /* ... then bisect between 0 and that upper bound */
    ewbeta_lo = 0.0;
    ewbeta_hi = ew->beta;
    for (i=0;i<100;i++) {
      ew->beta = 0.5*(ewbeta_lo+ewbeta_hi);
      if (erfc(ew->beta*nl->rl_off)/nl->rl_off >= ew->tolerance) {
	ewbeta_lo = ew->beta;
      } else {
	ewbeta_hi = ew->beta;
      }
    }
  } else {
    ew->beta = tol_beta;
    ew->tolerance = erfc(ew->beta*nl->rl_off)/nl->rl_off;
  }
  ew->dir_const = 2.0/sqrt(M_PI)*ew->beta;

  /* calculation of self energy:
       self_energy   = -beta/sqrt(pi) * sum_i q_i^2
       self_energy_v = -pi/2 * (sum_i q_i)^2 / beta^2   (divided by the
                       volume where it is applied)                      */
  sum_q = 0.0;
  ew->self_energy = 0.0;
  for (i=0;i<ad->natom;i++) {
    ew->self_energy -= ad->q[i] * ad->q[i];
    sum_q += ad->q[i];
  }
  ew->self_energy *= ew->beta / sqrt(M_PI);
  ew->self_energy_v = -0.5 * M_PI * sum_q * sum_q / (ew->beta*ew->beta);
  /* end of calculation of self energy */

  /* spline work arrays: one extra slot beyond the spline order */
  ew->n_spline = n_spline;
  ew->n_spline_h = (n_spline + 1) / 2;
  ew->Mnx  = emalloc(func, sizeof(double)*(ew->n_spline+1));
  ew->Mny  = emalloc(func, sizeof(double)*(ew->n_spline+1));
  ew->Mnz  = emalloc(func, sizeof(double)*(ew->n_spline+1));
  ew->Mn1x = emalloc(func, sizeof(double)*(ew->n_spline+1));
  ew->Mn1y = emalloc(func, sizeof(double)*(ew->n_spline+1));
  ew->Mn1z = emalloc(func, sizeof(double)*(ew->n_spline+1));
#ifdef ERFC_TABLE  
  EW_set_erfc_table(ew, nl->rl_off, erfc_resolution);
#endif

  if (grid[0] < 0) {
    int dd;

    if (!(d_grid > 0.0)) {
      lprintf("ERROR: PME: grid spacing (%f) must be positive.\n", d_grid);
      marble_exit(1);
    }
    /* request grid calc: per-process point count is rounded up until it
       has no prime factor larger than 7, then multiplied by the number
       of processes along that dimension */
    for (i=0;i<3;i++) {
      dd = (int) ceil(bc->boxv[i][i] / (mpi.npx[i] * d_grid));
      while (!SDMD_EW_grid_divisor(dd, 7)) {
	dd++;
      }
      ew->pme_grid[i] =  dd * mpi.npx[i];
    }
  } else {
    for (i=0;i<3;i++) {
      ew->pme_grid[i] = grid[i];
    }
  }

  ew->fft3d = emalloc(func, sizeof(FFT3D));
  if (FFT3D_setup(ew->fft3d, ew->pme_grid, 0)) {
    marble_exit(1);
  }

  /* the local slab of the real-space grid is whatever FFT3D_setup chose */
  ew->n_pme_local = 1;
  for (i=0;i<3;i++) {
    ew->pme_grid_h[i] = ew->pme_grid[i]/2;
    ew->pme_local[i] = ew->fft3d->lnx[i];
    ew->n_pme_local *= ew->pme_local[i];
    ew->local_start[i] = ew->fft3d->start_x[i];
    ew->local_end[i] = ew->local_start[i] + ew->pme_local[i] - 1;
  }
  
  ew->qr = emalloc(func, sizeof(fftw_real) * ew->fft3d->n_data);
  ew->B  = emalloc(func, sizeof(double) * ew->fft3d->n_tdata);
  ew->C  = emalloc(func, sizeof(double) * ew->fft3d->n_tdata);

  ew->fqc = emalloc(func, sizeof(fftw_complex) * ew->fft3d->n_tdata);

  SDMD_EW_calc_b_array(ew);
  SDMD_EW_calc_c_array(ew, bc);

#ifdef _OPENMP
  /* per-thread scratch: private charge grid, spline arrays and forces */
  ew->td = emalloc(func, sizeof(EW_TD)*mpi.n_threads);
  for (i=0;i<mpi.n_threads;i++) {
    ew->td[i].qr = emalloc(func, sizeof(fftw_real) * ew->fft3d->n_data);
    ew->td[i].Mnx  = emalloc(func, sizeof(double)*(ew->n_spline+1));
    ew->td[i].Mny  = emalloc(func, sizeof(double)*(ew->n_spline+1));
    ew->td[i].Mnz  = emalloc(func, sizeof(double)*(ew->n_spline+1));
    ew->td[i].Mn1x = emalloc(func, sizeof(double)*(ew->n_spline+1));
    ew->td[i].Mn1y = emalloc(func, sizeof(double)*(ew->n_spline+1));
    ew->td[i].Mn1z = emalloc(func, sizeof(double)*(ew->n_spline+1));
    ew->td[i].recf = emalloc(func, sizeof(VEC)*ad->natom);
  }
#endif
  
  /* the correction list can never hold more entries than the total
     number of exclusion-list entries */
  tmp = 0;
  for (i=0;i<ad->natom;i++) {
    tmp += ad->ex[i].n_exatom;
  }
  ew->cor_list = emalloc(func, sizeof(EW_COR)*tmp);

  lprintf("Particle Mesh Ewald:\n");
  lprintf("  Ewald Tolerance  : %.2e\n", ew->tolerance);
  lprintf("  Ewald Coefficient: %f\n", ew->beta);
  lprintf("  PME Grid         : %d %d %d\n",
	  ew->pme_grid[0],ew->pme_grid[1],ew->pme_grid[2]);
  lprintf("  PME Delta Grid   : %.2f %.2f %.2f\n",
	  bc->boxv[0][0]/ew->pme_grid[0],
	  bc->boxv[1][1]/ew->pme_grid[1],
	  bc->boxv[2][2]/ew->pme_grid[2]);
  lprintf("  Spline Order     : %d\n", ew->n_spline);
  if (ew->diel_sur > 0.0) {
    lprintf("  Surroundings     : ");
    if (ew->diel_sur == 1.0) {
      lprintf("vacuum\n");
    } else if (ew->diel_sur == 80.0) {
      lprintf("water (diel=80)\n");
    } else {
      lprintf("diel=%.1f\n",ew->diel_sur);
    }
  }
  lprintf("  Optimized Influence Function : %s\n", ew->opt_infl ? "on" : "off");
  {
    /* charges are stored pre-multiplied by a force-field dependent
       constant; divide it out to report the net charge */
    double charge_const;
    if (ad->mdat_format == MDAT_CHARMM)
      charge_const = sqrt(332.0716);
    else
      charge_const = 18.2223;  /* amber */
    
    lprintf("  Total Charges : %.2f\n", sum_q/charge_const);
    if (fabs(sum_q/charge_const) > 0.01) {
      lprintf("  Assuming uniform neutralizing plasma : %s\n", ew->self_energy_v_flag ? "on" : "off");
    }
  }
  lprintf("  Resolution of erfc : %d\n", erfc_resolution);
  /* NOTE(review): n_erfc_table is printed even when ERFC_TABLE is not
     defined; confirm it is initialised elsewhere in that configuration. */
  lprintf("  Table for erfc   : %d points for %.2f Angstrom\n", ew->n_erfc_table, nl->rl_off+1.0);
  lprintf("\n");
  lflush();
 
}

/*
 * SDMD_EW_grid_divisor: test whether `grid` factorises completely into
 * trial divisors 2, 3, ..., max.
 *
 * Returns 1 if repeated division by those divisors reduces `grid` to 1
 * (values <= 1 are accepted immediately), and 0 as soon as a divisor
 * larger than `max` would be needed.  Trial division starts at 2, so a
 * factor of 2 is always accepted regardless of `max`.
 *
 * The individual factors are not needed by any caller, so they are not
 * recorded.
 */
int SDMD_EW_grid_divisor(int grid, int max)
{
  int gg = grid;
  int cdiv = 2;

  while (gg > 1) {
    if (gg % cdiv == 0) {
      gg /= cdiv;
    } else {
      cdiv++;
      if (cdiv > max)
	return 0;
    }
  }
  return 1;
}


/*
 * SDMD_EW_assign_cor_to_PE: build this process's list of excluded atom
 * pairs (ew->cor_list / ew->n_cor_list).
 *
 * Every atom in a cell owned by this rank is scanned together with its
 * exclusion list.  A pair whose two atoms both live on this rank is
 * taken once (i < j).  Of the two cells involved, SDMD_cellcmp decides
 * which one is "first"; the pair is recorded only when that first cell
 * belongs to this rank.  Both cells are flagged with CELL_REQ_EW_COR.
 */
void SDMD_EW_assign_cor_to_PE(EWALD *ew, LINKED_CELL *lc, ATOM_DATA *ad)
{
  int n_pairs = 0;
  int ic;

  if (ew->flag == FLAG_NO_EWALD) return;

  for (ic = 0; ic < lc->n_cell; ic++) {
    int ia;

    /* only cells owned by this process contribute */
    if (lc->cell[ic].pe != mpi.rank) continue;

    for (ia = lc->cell[ic].head; ia >= 0; ia = lc->next_atom[ia]) {
      int k;

      for (k = 0; k < ad->ex[ia].n_exatom; k++) {
	int ja = ad->ex[ia].exatom[k];
	int jc = lc->atom_cell[ja];
	int first_cell, second_cell;

	/* same-rank pairs are visited from both sides: keep i < j only */
	if (lc->cell[jc].pe == mpi.rank && ia >= ja) continue;

	if (SDMD_cellcmp(lc, ic, jc)) {
	  first_cell = ic;
	  second_cell = jc;
	} else {
	  first_cell = jc;
	  second_cell = ic;
	}

	/* the owner of the first cell is responsible for the pair */
	if (lc->cell[first_cell].pe != mpi.rank) continue;

	ew->cor_list[n_pairs].i = ia;
	ew->cor_list[n_pairs].j = ja;
	n_pairs++;
	lc->cell[first_cell].req |= CELL_REQ_EW_COR;
	lc->cell[second_cell].req |= CELL_REQ_EW_COR;

#ifdef TR_LIST_ATOM_REQ
	lc->atom_req[ia] |= ATOM_REQ_INT;
	lc->atom_req[ja] |= ATOM_REQ_INT;
#endif
      }
    }
  }

  ew->n_cor_list = n_pairs;
}




/*
 * SDMD_EW_direct_energy_force: direct-space nonbonded energy/force entry
 * point used with Ewald electrostatics.
 *
 * The body is not written out here.  "nonbond_direct.h" is textually
 * included once per supported van der Waals method, with NB_VDW and
 * NB_ELEC defined immediately before each inclusion to select the
 * variant.  Only NV_PSW and NV_FSW are dispatched; there is no final
 * else, so (assuming each inclusion expands to a single statement) any
 * other nl->vdw_method does nothing.
 *
 * NOTE(review): NB_VDW / NB_ELEC are defined a second time below without
 * an #undef in this file, so nonbond_direct.h presumably undefines them
 * itself -- confirm in that header.  The parameters (ew, lc, nl, ad, bc,
 * vdw, edir, erec, hbond) are consumed by the included code, not here.
 */
void SDMD_EW_direct_energy_force(EWALD *ew, LINKED_CELL *lc, NONBOND_LIST *nl,
				 ATOM_DATA *ad, BOUNDARY *bc,
				 double *vdw, double *edir, double *erec, double *hbond)
{
  /* variant 1: NV_PSW van der Waals + Ewald electrostatics */
  if (nl->vdw_method == NV_PSW)
#define NB_VDW   NV_PSW
#define NB_ELEC  NE_EWALD
#include "nonbond_direct.h"
  /* variant 2: NV_FSW van der Waals + Ewald electrostatics */
  else if (nl->vdw_method == NV_FSW)
#define NB_VDW   NV_FSW
#define NB_ELEC  NE_EWALD
#include "nonbond_direct.h"
}

/******************************************************************/
/***                     Particle Mesh Ewald                    ***/
/******************************************************************/

/*
 * SDMD_EW_pme: run the complete reciprocal-space pass in one call.
 *
 * Stages, in order: charge spreading, forward FFT, energy evaluation
 * (accumulated into *elec), backward FFT, force interpolation.  The wall
 * time of each stage is added to the matching slot of ew->time.
 */
void SDMD_EW_pme(EWALD *ew, LINKED_CELL *lc, ATOM_DATA *ad, BOUNDARY *bc, double *elec)
{
  /* start the stopwatch; every add_dtime call below also restarts it */
  SDMD_EW_dtime();

  /* 1. spread charges onto the local grid */
  SDMD_EW_charge_grid(ew, lc, ad, bc);
  SDMD_EW_add_dtime(&ew->time[SDMD_EW_TIME_CHARGE_GRID]);

  /* 2. real -> reciprocal space */
  SDMD_EW_fft(ew);
  SDMD_EW_add_dtime(&ew->time[SDMD_EW_TIME_FFT]);

  /* 3. reciprocal energy */
  SDMD_EW_energy(ew, ad, bc, elec);
  SDMD_EW_add_dtime(&ew->time[SDMD_EW_TIME_ENERGY]);

  /* 4. reciprocal -> real space */
  SDMD_EW_fft_back(ew);
  SDMD_EW_add_dtime(&ew->time[SDMD_EW_TIME_FFTB]);

  /* 5. interpolate forces back onto the atoms */
  SDMD_EW_force(ew, lc, ad, bc);
  SDMD_EW_add_dtime(&ew->time[SDMD_EW_TIME_FORCE]);
}

/*
 * SDMD_EW_pme1: first part of the split reciprocal-space pass --
 * charge spreading only, timed into SDMD_EW_TIME_CHARGE_GRID.
 */
void SDMD_EW_pme1(EWALD *ew, LINKED_CELL *lc, ATOM_DATA *ad, BOUNDARY *bc)
{
  SDMD_EW_dtime();

  SDMD_EW_charge_grid(ew, lc, ad, bc);
  SDMD_EW_add_dtime(&ew->time[SDMD_EW_TIME_CHARGE_GRID]);
}

/*
 * SDMD_EW_pme2: second part of the split reciprocal-space pass --
 * forward FFT, energy evaluation and backward FFT.
 *
 * The energy is stored in ew->erec (reset to zero first) rather than
 * being returned through an argument.  lc is not used here.
 */
void SDMD_EW_pme2(EWALD *ew, LINKED_CELL *lc, ATOM_DATA *ad, BOUNDARY *bc)
{
  SDMD_EW_dtime();

  /* forward transform */
  SDMD_EW_fft(ew);
  SDMD_EW_add_dtime(&ew->time[SDMD_EW_TIME_FFT]);

  /* energy: SDMD_EW_energy accumulates, so clear the target first */
  ew->erec = 0.0;
  SDMD_EW_energy(ew, ad, bc, &ew->erec);
  SDMD_EW_add_dtime(&ew->time[SDMD_EW_TIME_ENERGY]);

  /* backward transform */
  SDMD_EW_fft_back(ew);
  SDMD_EW_add_dtime(&ew->time[SDMD_EW_TIME_FFTB]);
}

/*
 * SDMD_EW_pme3: last part of the split reciprocal-space pass --
 * force interpolation only, timed into SDMD_EW_TIME_FORCE.
 */
void SDMD_EW_pme3(EWALD *ew, LINKED_CELL *lc, ATOM_DATA *ad, BOUNDARY *bc)
{
  SDMD_EW_dtime();

  SDMD_EW_force(ew, lc, ad, bc);
  SDMD_EW_add_dtime(&ew->time[SDMD_EW_TIME_FORCE]);
}

/*
 * SDMD_EW_charge_grid: spread atomic charges onto the locally owned part
 * of the PME grid (ew->qr) using B-spline weights of order ew->n_spline.
 *
 * Atoms are taken from lc->atom_cell_req.  For each atom the scaled
 * fractional coordinate u = frac(x * recip) * pme_grid is computed, the
 * spline weights Mn*[0..n_spline-1] are built by recursion starting from
 * order 2, and q * Mnx * Mny * Mnz is added to every grid point
 * (k - n + n_spline_h, wrapped periodically) that lies inside
 * [local_start, local_end] in all three dimensions.
 *
 * With OpenMP each thread accumulates into its own ew->td[thread].qr and
 * the per-thread grids are summed into ew->qr afterwards.  The summation
 * runs over mpi.n_threads buffers.
 * NOTE(review): this assumes the OpenMP team size equals mpi.n_threads --
 * confirm where n_threads is set.
 */
void SDMD_EW_charge_grid(EWALD *ew, LINKED_CELL *lc, ATOM_DATA *ad, BOUNDARY *bc)
{
  
#ifdef _OPENMP
#pragma omp parallel
{  
  int i_th;
#endif
  int ok;
  int i, j, n, k;
  int kx, ky, kz;
  int nx, ny, nz;
  int ikx, iky, ikz;
  double ux, uy, uz;
  double frx, fry, frz;
  double invn1;
  int n_atom_cell_req, * restrict atom_cell_req;
  fftw_real * restrict qr;
  double * restrict Mnx, * restrict Mny, * restrict Mnz;
  double (* restrict xx)[3], (* restrict recip)[3], * restrict q;
  int * restrict pme_grid, * restrict local_start, * restrict local_end;
  int * restrict pme_local;
  int n_spline, n_spline_h;

#ifdef _OPENMP
  /* thread-private grid and spline scratch */
  i_th = omp_get_thread_num();
  qr = ew->td[i_th].qr;
  Mnx = ew->td[i_th].Mnx;
  Mny = ew->td[i_th].Mny;
  Mnz = ew->td[i_th].Mnz;
#else
  qr = ew->qr;
  Mnx = ew->Mnx;
  Mny = ew->Mny;
  Mnz = ew->Mnz;
#endif  
  n_atom_cell_req = lc->n_atom_cell_req;
  atom_cell_req = lc->atom_cell_req;
  xx = (double (*)[3]) ad->x;
  q  = ad->q;
  recip = (double (*)[3]) bc->recip;
  pme_grid = ew->pme_grid;
  local_start = ew->local_start;
  local_end = ew->local_end;
  pme_local = ew->pme_local;
  n_spline = ew->n_spline;
  n_spline_h = ew->n_spline_h;
  
  /* Step 1. calculation of Q */
  memset(qr, 0, sizeof(fftw_real) * ew->fft3d->n_data);

#ifdef _OPENMP
#pragma omp for schedule(guided)
#endif  
  for (j=0;j<n_atom_cell_req;j++) {
    i = atom_cell_req[j];
      
    /* fractional coordinates, folded into [0,1) and scaled to grid units */
    ux = V3_MUL_MAT_X(xx[i],recip);
    uy = V3_MUL_MAT_Y(xx[i],recip);
    uz = V3_MUL_MAT_Z(xx[i],recip);

    frx = ux-floor(ux);
    fry = uy-floor(uy);
    frz = uz-floor(uz);
      
    ux = frx * pme_grid[0];
    uy = fry * pme_grid[1];
    uz = frz * pme_grid[2];
    
    kx = (int) floor(ux);
    ky = (int) floor(uy);
    kz = (int) floor(uz);

    /* check routine: skip the atom unless its spline stencil touches the
       local slab in every dimension.  ok must be cleared before each
       dimension, otherwise a hit in an earlier dimension would make the
       later test pass unconditionally. */
    ok = 0;
    for (nx=0;nx<n_spline;nx++) {
      ikx = kx - nx + n_spline_h;
      if (ikx < 0)             ikx +=  pme_grid[0];
      if (ikx >= pme_grid[0])  ikx -=  pme_grid[0];
      if (ikx >= local_start[0] && ikx <= local_end[0]) {
	ok = 1;
	break;
      }
    }
    if (!ok) continue;

    ok = 0;
    for (ny=0;ny<n_spline;ny++) {
      iky = ky - ny + n_spline_h;
      if (iky < 0)             iky +=  pme_grid[1];
      if (iky >= pme_grid[1])  iky -=  pme_grid[1];
      
      if (iky >= local_start[1] && iky <= local_end[1]) {
	ok = 1;
	break;
      }
    }
    if (!ok) continue;
      
    ok = 0;
    for (nz=0;nz<n_spline;nz++) {
      ikz = kz - nz + n_spline_h;
      if (ikz < 0)             ikz +=  pme_grid[2];
      if (ikz >= pme_grid[2])  ikz -=  pme_grid[2];

      if (ikz >= local_start[2] && ikz <= local_end[2]) {
	ok = 1;
	break;
      }
    }
    if (!ok) continue;
    /* end of check routine */

    /* Spline order 2 */
    Mnx[0] = ux - kx;            /* value for kx */
    Mnx[1] = 1.0 - Mnx[0];   /* value for kx - 1 */
    
    Mny[0] = uy - ky;            /* value for ky */
    Mny[1] = 1.0 - Mny[0];   /* value for ky - 1 */

    Mnz[0] = uz - kz;            /* value for kz */
    Mnz[1] = 1.0 - Mnz[0];   /* value for kz - 1 */
    
    /* raise the order in place; entries are updated from the top down so
       that each step still sees the previous order's values */
    for (n=3;n<=n_spline;n++) {
      invn1 = 1.0/(n-1);

      Mnx[n-1] = Mnx[n-2]* (-ux+kx+1) * invn1;   /* (n-(ux-kx+n-1))/(n-1) */
      Mny[n-1] = Mny[n-2]* (-uy+ky+1) * invn1;
      Mnz[n-1] = Mnz[n-2]* (-uz+kz+1) * invn1;
      
      for (k=n-2;k>=1;k--) {
	Mnx[k] = ((ux-kx+k)*Mnx[k] + (n-(ux-kx+k))*Mnx[k-1])*invn1;
	Mny[k] = ((uy-ky+k)*Mny[k] + (n-(uy-ky+k))*Mny[k-1])*invn1;
	Mnz[k] = ((uz-kz+k)*Mnz[k] + (n-(uz-kz+k))*Mnz[k-1])*invn1;
      }
      Mnx[0] = (ux-kx)*Mnx[0]*invn1;
      Mny[0] = (uy-ky)*Mny[0]*invn1;
      Mnz[0] = (uz-kz)*Mnz[0]*invn1;
    }

    /* deposit onto the locally owned grid points (x fastest) */
    for (nz=0;nz<n_spline;nz++) {
      ikz = kz - nz + n_spline_h;
      if (ikz < 0)             ikz +=  pme_grid[2];
      if (ikz >= pme_grid[2])  ikz -=  pme_grid[2];
      
      if (ikz < local_start[2] || ikz > local_end[2]) continue;
      ikz -= local_start[2];

      
      for (ny=0;ny<n_spline;ny++) {
	iky = ky - ny + n_spline_h;
	if (iky < 0)             iky +=  pme_grid[1];
	if (iky >= pme_grid[1])  iky -=  pme_grid[1];

	if (iky < local_start[1] || iky > local_end[1]) continue;
	iky -= local_start[1];

	for (nx=0;nx<n_spline;nx++) {
	  ikx = kx - nx + n_spline_h;
	  if (ikx < 0)             ikx +=  pme_grid[0];
	  if (ikx >= pme_grid[0])  ikx -=  pme_grid[0];
	  
	  if (ikx < local_start[0] || ikx > local_end[0]) continue;
	  ikx -= local_start[0];

          qr[(ikx)+pme_local[0]*((iky)+pme_local[1]*(ikz))] += 
	    q[i]*Mnx[nx]*Mny[ny]*Mnz[nz];
	}
      }
    }
  }

#ifdef _OPENMP
 } /* end of parallel */
 {
   /* reduce the per-thread grids into ew->qr */
   int i, j;
   fftw_real * restrict qr, * restrict tdqr;
   int n_data;
   
   n_data = ew->fft3d->n_data;
   qr = ew->qr;

   memcpy(qr, ew->td[0].qr, sizeof(fftw_real)*n_data);

   for (i=1;i<mpi.n_threads;i++) {
     tdqr = ew->td[i].qr;
     for (j=0;j<n_data;j++) {
       qr[j] += tdqr[j];
     }
   }
 }
#endif

}

/*
 * SDMD_EW_fft: forward 3D FFT of the charge grid.
 *
 * Input is ew->qr, output is written to ew->fqc; the transform itself is
 * delegated to FFT3D_forward with the plan stored in ew->fft3d.
 */
void SDMD_EW_fft(EWALD *ew)
{
  fftw_real *charge_grid = ew->qr;
  fftw_complex *spectrum = ew->fqc;

  FFT3D_forward(ew->fft3d, charge_grid, spectrum);
}

/*
 * SDMD_EW_energy: reciprocal-space energy and virial from the
 * transformed charge grid.
 *
 * For every locally stored reciprocal-space element j:
 *     e = 0.5 * B[j] * C[j] * |fqc[j]|^2
 * is summed into *elec (accumulated, not assigned) and contributes to
 * ad->virial[0..5] (xx, yy, zz, xy, xz, yz) whenever m != 0.  fqc[j] is
 * then multiplied by B[j]*C[j] in place, ready for the backward FFT.
 *
 * Elements are visited as (ikx,iky) columns from fft3d->ikxy_tdata, each
 * with fft3d->nx[2] consecutive z entries.  When ew->pressure_flag is
 * set the C array is rebuilt first from the current box.
 */
void SDMD_EW_energy(EWALD *ew, ATOM_DATA *ad, BOUNDARY *bc, double *elec)
{
  fftw_complex * restrict fqc = ew->fqc;
  double * restrict B = ew->B;
  double * restrict C = ew->C;
  double * restrict virial = ad->virial;
  double (* restrict recip)[3] = bc->recip;
  int * restrict dims = ew->pme_grid;
  int * restrict half = ew->pme_grid_h;
  int * restrict nx = ew->fft3d->nx;
  int (* restrict columns)[2] = ew->fft3d->ikxy_tdata;
  const int n_columns = ew->fft3d->n_ikxy_tdata;
  const double twopi_beta2 = -M_PI*M_PI/(ew->beta*ew->beta);
  double e_total = 0.0;
  int col, idx;

  if (ew->pressure_flag) {
    SDMD_EW_calc_c_array(ew, bc);
  }

  idx = 0;
  for (col = 0; col < n_columns; col++) {
    const int ikx = columns[col][0];
    const int iky = columns[col][1];
    /* map grid indices above the half point to negative frequencies */
    const int imx = (ikx <= half[0]) ? ikx : ikx - dims[0];
    const int imy = (iky <= half[1]) ? iky : iky - dims[1];
    int ikz;

    for (ikz = 0; ikz < nx[2]; ikz++, idx++) {
      const int imz = (ikz <= half[2]) ? ikz : ikz - dims[2];
      const double s_re = c_re(fqc[idx]);
      const double s_im = c_im(fqc[idx]);
      const double b = B[idx];
      const double c = C[idx];
      const double e = 0.5*b*c*(s_re*s_re+s_im*s_im);
      double mx, my, mz, m2;

      e_total += e;

      mx = imx*recip[0][0]+imy*recip[0][1]+imz*recip[0][2];
      my = imx*recip[1][0]+imy*recip[1][1]+imz*recip[1][2];
      mz = imx*recip[2][0]+imy*recip[2][1]+imz*recip[2][2];
      m2 = mx*mx+my*my+mz*mz;

      if (m2 != 0.0) {
	const double inv_m2 = 1.0/m2;

	/* diagonal terms */
	virial[0] += e*(1.0-2.0*(inv_m2-twopi_beta2)*mx*mx);
	virial[1] += e*(1.0-2.0*(inv_m2-twopi_beta2)*my*my);
	virial[2] += e*(1.0-2.0*(inv_m2-twopi_beta2)*mz*mz);

	/* off-diagonal terms */
	virial[3] += e*(-2.0*(inv_m2-twopi_beta2)*mx*my);
	virial[4] += e*(-2.0*(inv_m2-twopi_beta2)*mx*mz);
	virial[5] += e*(-2.0*(inv_m2-twopi_beta2)*my*mz);
      }

      /* scale the spectrum for the backward transform */
      c_re(fqc[idx]) *= b*c;
      c_im(fqc[idx]) *= b*c;
    }
  }

  *elec += e_total;
}

/*
 * SDMD_EW_fft_back: backward 3D FFT.
 *
 * Transforms ew->fqc back into ew->qr via FFT3D_backward, using the plan
 * stored in ew->fft3d.
 */
void SDMD_EW_fft_back(EWALD *ew)
{
  fftw_real *real_grid = ew->qr;
  fftw_complex *spectrum = ew->fqc;

  FFT3D_backward(ew->fft3d, real_grid, spectrum);
}

/*
 * SDMD_EW_force: interpolate reciprocal-space forces from the
 * back-transformed grid ew->qr onto the atoms in lc->atom_cell_req.
 *
 * For each atom the B-spline weights Mn* of order n_spline and the
 * weights Mn1* of order n_spline-1 are built; the derivative of the
 * order-n_spline weight at stencil slot k is Mn1[k] - Mn1[k-1] (with
 * Mn1[-1] taken as 0).  Only grid points inside the local slab
 * [local_start, local_end] contribute.  The result is subtracted from
 * the force array after transformation with bc->recip.
 *
 * Without OpenMP forces go directly into ad->f.  With OpenMP each thread
 * writes into its own ew->td[thread].recf (zeroed for all requested
 * atoms first) and the buffers of mpi.n_threads threads are added to
 * ad->f afterwards.
 * NOTE(review): this assumes the OpenMP team size equals mpi.n_threads --
 * confirm where n_threads is set.
 *
 * n_spline == 2 is not supported: no order-1 weights are generated.
 */
void SDMD_EW_force(EWALD *ew, LINKED_CELL *lc, ATOM_DATA *ad, BOUNDARY *bc)
{
#ifdef _OPENMP
#pragma omp parallel
{  
  int i_th;
#endif
  int ok;
  int i, j, n, k;
  int kx, ky, kz;
  int nx, ny, nz;
  int ikx, iky, ikz;
  double ux, uy, uz;
  double frx, fry, frz;
  double mdx,mdy,mdz;
  double fx, fy, fz, qQ;
  double invn1;
  int n_atom_cell_req, * restrict atom_cell_req;
  fftw_real * restrict qr;
  double * restrict Mnx, * restrict Mny, * restrict Mnz;
  double * restrict Mn1x, * restrict Mn1y, * restrict Mn1z;
  double (* restrict xx)[3], (* restrict recip)[3], * restrict q;
  double (* restrict ff)[3];
  int * restrict pme_grid, * restrict local_start, * restrict local_end;
  int * restrict pme_local;
  int n_spline, n_spline_h;

#ifdef _OPENMP
  /* thread-private spline scratch and force buffer */
  i_th = omp_get_thread_num();
  Mnx = ew->td[i_th].Mnx;
  Mny = ew->td[i_th].Mny;
  Mnz = ew->td[i_th].Mnz;
  Mn1x = ew->td[i_th].Mn1x;
  Mn1y = ew->td[i_th].Mn1y;
  Mn1z = ew->td[i_th].Mn1z;
  ff = (double (*)[3]) ew->td[i_th].recf;
#else
  Mnx = ew->Mnx;
  Mny = ew->Mny;
  Mnz = ew->Mnz;
  Mn1x = ew->Mn1x;
  Mn1y = ew->Mn1y;
  Mn1z = ew->Mn1z;
  ff = (double (*)[3]) ad->f;
#endif  
  qr = ew->qr;
  n_atom_cell_req = lc->n_atom_cell_req;
  atom_cell_req = lc->atom_cell_req;
  xx = (double (*)[3]) ad->x;
  q  = ad->q;
  recip = (double (*)[3]) bc->recip;
  pme_grid = ew->pme_grid;
  local_start = ew->local_start;
  local_end = ew->local_end;
  pme_local = ew->pme_local;
  n_spline = ew->n_spline;
  n_spline_h = ew->n_spline_h;

#ifdef _OPENMP
  /* every thread clears its own buffer for all requested atoms, because
     the final reduction reads all of them from every thread */
  for (j=0;j<n_atom_cell_req;j++) {
    i = atom_cell_req[j];
    ff[i][0] = ff[i][1] = ff[i][2] = 0.0;
  }
#endif
#ifdef _OPENMP
#pragma omp for schedule(guided)
#endif  
  for (j=0;j<n_atom_cell_req;j++) {
    i = atom_cell_req[j];

    /* fractional coordinates, folded into [0,1) and scaled to grid units */
    ux = V3_MUL_MAT_X(xx[i],recip);
    uy = V3_MUL_MAT_Y(xx[i],recip);
    uz = V3_MUL_MAT_Z(xx[i],recip);

    frx = ux-floor(ux);
    fry = uy-floor(uy);
    frz = uz-floor(uz);
      
    ux = frx * pme_grid[0];
    uy = fry * pme_grid[1];
    uz = frz * pme_grid[2];

    kx = (int) floor(ux);
    ky = (int) floor(uy);
    kz = (int) floor(uz);

    /* check routine: skip the atom unless its spline stencil touches the
       local slab in every dimension.  ok must be cleared before each
       dimension, otherwise a hit in an earlier dimension would make the
       later test pass unconditionally. */
    ok = 0;
    for (nx=0;nx<n_spline;nx++) {
      ikx = kx - nx + n_spline_h;
      if (ikx < 0)             ikx +=  pme_grid[0];
      if (ikx >= pme_grid[0])  ikx -=  pme_grid[0];
      if (ikx >= local_start[0] && ikx <= local_end[0]) {
	ok = 1;
	break;
      }
    }
    if (!ok) continue;

    ok = 0;
    for (ny=0;ny<n_spline;ny++) {
      iky = ky - ny + n_spline_h;
      if (iky < 0)             iky +=  pme_grid[1];
      if (iky >= pme_grid[1])  iky -=  pme_grid[1];
      
      if (iky >= local_start[1] && iky <= local_end[1]) {
	ok = 1;
	break;
      }
    }
    if (!ok) continue;
      
    ok = 0;
    for (nz=0;nz<n_spline;nz++) {
      ikz = kz - nz + n_spline_h;
      if (ikz < 0)             ikz +=  pme_grid[2];
      if (ikz >= pme_grid[2])  ikz -=  pme_grid[2];

      if (ikz >= local_start[2] && ikz <= local_end[2]) {
	ok = 1;
	break;
      }
    }
    if (!ok) continue;
    /* end of check routine */
    
    /* Spline order 2 */
    Mnx[0] = ux - kx;            /* value for kx */
    Mnx[1] = 1.0 - Mnx[0];   /* value for kx - 1 */
    
    Mny[0] = uy - ky;            /* value for ky */
    Mny[1] = 1.0 - Mny[0];   /* value for ky - 1 */

    Mnz[0] = uz - kz;            /* value for kz */
    Mnz[1] = 1.0 - Mnz[0];   /* value for kz - 1 */

    if (n_spline == 3) {
      /* The order-(n_spline-1) snapshot is normally taken inside the
	 recursion below when n == n_spline-1, but the recursion starts
	 at n == 3 and therefore never visits n == 2.  For cubic order 3
	 the snapshot is exactly the order-2 weights just computed. */
      Mn1x[0] = Mnx[0];  Mn1x[1] = Mnx[1];  Mn1x[2] = 0.0;
      Mn1y[0] = Mny[0];  Mn1y[1] = Mny[1];  Mn1y[2] = 0.0;
      Mn1z[0] = Mnz[0];  Mn1z[1] = Mnz[1];  Mn1z[2] = 0.0;
    }
    
    /* raise the order in place; entries are updated from the top down so
       that each step still sees the previous order's values */
    for (n=3;n<=n_spline;n++) {
      invn1 = 1.0/(n-1);
      
      Mnx[n-1] = Mnx[n-2]* (-ux+kx+1) * invn1;   /* (n-(ux-kx+n-1))/(n-1) */
      Mny[n-1] = Mny[n-2]* (-uy+ky+1) * invn1;
      Mnz[n-1] = Mnz[n-2]* (-uz+kz+1) * invn1;
      
      for (k=n-2;k>=1;k--) {
	Mnx[k] = ((ux-kx+k)*Mnx[k] + (n-(ux-kx+k))*Mnx[k-1])*invn1;
	Mny[k] = ((uy-ky+k)*Mny[k] + (n-(uy-ky+k))*Mny[k-1])*invn1;
	Mnz[k] = ((uz-kz+k)*Mnz[k] + (n-(uz-kz+k))*Mnz[k-1])*invn1;
      }
      Mnx[0] = (ux-kx)*Mnx[0]*invn1;
      Mny[0] = (uy-ky)*Mny[0]*invn1;
      Mnz[0] = (uz-kz)*Mnz[0]*invn1;
      
      /* keep the weights of order n_spline-1 for the derivative */
      if (n == n_spline-1) {
	for (k=0;k<=n-1;k++) {
	  Mn1x[k] = Mnx[k];
	  Mn1y[k] = Mny[k];
	  Mn1z[k] = Mnz[k];
	}
	Mn1x[n] = Mn1y[n] = Mn1z[n] = 0.0;
      }
    }

    for (nz=0;nz<n_spline;nz++) {
      ikz = kz - nz + n_spline_h;
      if (ikz < 0)             ikz +=  pme_grid[2];
      if (ikz >= pme_grid[2])  ikz -=  pme_grid[2];

      if (ikz < local_start[2] || ikz > local_end[2]) continue;
      ikz -= local_start[2];

      /* spline derivative along z at this stencil slot */
      if (nz == 0) {
	mdz = Mn1z[0];
      } else {
	mdz = Mn1z[nz] - Mn1z[nz-1];
      }
      
      for (ny=0;ny<n_spline;ny++) {
	iky = ky - ny + n_spline_h;
	if (iky < 0)             iky +=  pme_grid[1];
	if (iky >= pme_grid[1])  iky -=  pme_grid[1];
	if (iky < local_start[1] || iky > local_end[1]) continue;
	iky -= local_start[1];

	if (ny == 0) {
	  mdy = Mn1y[0];
	} else {
	  mdy = Mn1y[ny]-Mn1y[ny-1];
	}
	

	for (nx=0;nx<n_spline;nx++) {
	  ikx = kx - nx + n_spline_h;
	  if (ikx < 0)             ikx +=  pme_grid[0];
	  if (ikx >= pme_grid[0])  ikx -=  pme_grid[0];
	  if (ikx < local_start[0] || ikx > local_end[0]) continue;
	  ikx -= local_start[0];
      
	  if (nx == 0) {
	    mdx = Mn1x[0];
	  } else {
	    mdx = Mn1x[nx] - Mn1x[nx-1];
	  }

	  /* gradient in grid (fractional) coordinates ... */
	  qQ=q[i]*qr[(ikx)+pme_local[0]*((iky)+pme_local[1]*(ikz))];
	  fx=pme_grid[0]*mdx*Mny[ny]*Mnz[nz]*qQ;
	  fy=pme_grid[1]*Mnx[nx]*mdy*Mnz[nz]*qQ;
	  fz=pme_grid[2]*Mnx[nx]*Mny[ny]*mdz*qQ;
	  
	  /* ... converted with recip and subtracted from the force */
	  ff[i][0] -= fx*recip[0][0]+fy*recip[0][1]+fz*recip[0][2];
	  ff[i][1] -= fx*recip[1][0]+fy*recip[1][1]+fz*recip[1][2];
	  ff[i][2] -= fx*recip[2][0]+fy*recip[2][1]+fz*recip[2][2];
	}
      }
    }
  }
#ifdef _OPENMP
 } /* end of parallel */
 {
   /* add every thread's private force buffer to the real force array */
   int i, j, k;
   double (* restrict ff)[3], (* restrict tdff)[3];
   int n_atom_cell_req, * restrict atom_cell_req;

   n_atom_cell_req = lc->n_atom_cell_req;
   atom_cell_req = lc->atom_cell_req;
   
   ff = (double (*)[3]) ad->f;
   for (k=0;k<mpi.n_threads;k++) {
     tdff = (double (*)[3]) ew->td[k].recf;
     for (j=0;j<n_atom_cell_req;j++) {
       i = atom_cell_req[j];
       ff[i][0] += tdff[i][0];
       ff[i][1] += tdff[i][1];
       ff[i][2] += tdff[i][2];
     }
   }
 }
#endif

}

/*
 * SDMD_EW_cor_energy_force: Ewald correction terms that are not part of
 * the grid calculation.  Everything is accumulated (+=) into *elec,
 * ad->f and ad->virial.
 *
 *  1. Excluded pairs: for each (i,j) in ew->cor_list,
 *       ewald_cor = -q_i q_j (1 - erfc(beta r)) / r
 *     is added to the energy, with equal and opposite forces on i and j
 *     and the matching virial.  The separation is the plain coordinate
 *     difference; no periodic image shift is applied in this routine, and
 *     r == 0 would divide by zero.
 *  2. Surrounding dielectric (only when ew->diel_sur > 0): the dipole
 *     sum_i q_i x_i over this node's atoms is summed across all ranks
 *     with MPI_Allreduce; the energy factor*|D|^2 and its virial are
 *     added on the master rank only, the forces on every rank for its
 *     own atoms.
 *  3. Self energy: added on the master rank only; the additional
 *     ew->self_energy_v / V term (energy and diagonal virial) only when
 *     ew->self_energy_v_flag is set.
 */
void SDMD_EW_cor_energy_force(EWALD *ew, BOUNDARY *bc,
			      ATOM_DATA *ad, double *elec)
{
  int n, i, j, iex;
  double ewald_cor, force;
  double dx, dy, dz;
  double len2, len;
  double factor;

  /*
  for (i=0;i<ad->natom;i++) {

    for (iex=0;iex<ad->ex[i].n_exatom;iex++) {
      if (ew->cor_pe[i][iex] != mpi.rank) continue;
      j = ad->ex[i].exatom[iex];
      if (i>j) continue;
  */
#if 1
  /* 1. excluded-pair correction */
  for (n=0;n<ew->n_cor_list;n++) {
    i = ew->cor_list[n].i;
    j = ew->cor_list[n].j;
      
      dx = ad->x[i].x - ad->x[j].x;
      dy = ad->x[i].y - ad->x[j].y;
      dz = ad->x[i].z - ad->x[j].z;
      len2 = dx * dx + dy * dy + dz * dz;
      len = sqrt(len2);
      
      ewald_cor = - ad->q[i]*ad->q[j]*(1.0-erfc(ew->beta*len)) / len;
      /* force holds the radial force divided by r, so that
         force * (dx,dy,dz) is the force vector on atom i */
      force  = (ewald_cor +
		ad->q[i]*ad->q[j]*ew->dir_const*exp(-ew->beta*ew->beta*len2))
	       / len2;
    
      *elec += ewald_cor;

    /*
    if (ad->atom_ene_sample_flag) {
      ewald_cor *= 0.5;
      ad->atom_ene[i][ad->n_atom_ene_group-1][ATOM_ENE_ELEC] += ewald_cor;
      ad->atom_ene[j][ad->n_atom_ene_group-1][ATOM_ENE_ELEC] += ewald_cor;
    }
    */
    
      ad->f[i].x += force * dx;
      ad->f[i].y += force * dy;
      ad->f[i].z += force * dz;
    
      ad->f[j].x -= force * dx;
      ad->f[j].y -= force * dy;
      ad->f[j].z -= force * dz;
      
      /* virial */
      ad->virial[0] += force * dx * dx;
      ad->virial[1] += force * dy * dy;
      ad->virial[2] += force * dz * dz;
      ad->virial[3] += force * dx * dy;
      ad->virial[4] += force * dx * dz;
      ad->virial[5] += force * dy * dz;
      /* } */
  }

  /* 2. surrounding-dielectric (dipole) term */
  if (ew->diel_sur > 0.0) {
    /* j, frac and offset are only referenced by the disabled code below */
    int j;
    VEC frac, offset;
    double dd[3], dd2[3], ene;
    
    /* dx,dy,dz are reused here as the components of the dipole sum */
    dx = dy = dz = 0.0;
    factor = 2.0*M_PI/((2.0*ew->diel_sur+1.0)*bc->V);
    
    /* for (i=0;i<ad->natom;i++) { */
    /* walk the linked list of atoms held by this node */
    for (i = ad->node_atom_h; i>=0; i=ad->node_atom_n[i]) {

      dx += ad->q[i]*ad->x[i].x;
      dy += ad->q[i]*ad->x[i].y;
      dz += ad->q[i]*ad->x[i].z;

      /*
      if (ad->ex[i].flag & ATOM_PARENT) {
	frac.x = VEC_MUL_MAT_X(ad->x[i],bc->recip);
	frac.y = VEC_MUL_MAT_Y(ad->x[i],bc->recip);
	frac.z = VEC_MUL_MAT_Z(ad->x[i],bc->recip);

	if (bc->origin_flag == 1) {
	  frac.x += 0.5;
	  frac.y += 0.5;
	  frac.z += 0.5;
	}
	
	frac.x = floor(frac.x);
	frac.y = floor(frac.y);
	frac.z = floor(frac.z);
	
	offset.x = VEC_MUL_MAT_X(frac,bc->boxv);
	offset.y = VEC_MUL_MAT_Y(frac,bc->boxv);
	offset.z = VEC_MUL_MAT_Z(frac,bc->boxv);
	
	for (j=i;j>=0;j=ad->ex[j].child_list) {
	  dx += ad->q[j]*(ad->x[j].x - offset.x);
	  dy += ad->q[j]*(ad->x[j].y - offset.y);
	  dz += ad->q[j]*(ad->x[j].z - offset.z);
	}
      }
      */
    }

    /* global dipole: sum of the per-rank partial sums */
    dd[0] = dx; dd[1] = dy; dd[2] = dz;
    MPI_Allreduce(dd, dd2, 3, MPI_DOUBLE, MPI_SUM, mpi.comm);
    dx = dd2[0]; dy = dd2[1]; dz = dd2[2];

    /* energy and virial depend only on the global dipole, so they are
       added on a single rank */
    if (mpi.master) {
      ene = factor*(dx*dx+dy*dy+dz*dz);
      *elec += ene;
      ad->virial[0] += ene - 2.0 * factor*dx*dx;
      ad->virial[1] += ene - 2.0 * factor*dy*dy;
      ad->virial[2] += ene - 2.0 * factor*dz*dz;
      ad->virial[3] += - 2.0 * factor*dx*dy;
      ad->virial[4] += - 2.0 * factor*dx*dz;
      ad->virial[5] += - 2.0 * factor*dy*dz;
    }
    
    /* for (i=0;i<ad->natom;i++) { */
    for (i = ad->node_atom_h; i>=0; i=ad->node_atom_n[i]) {
      force = 2.0 * factor * ad->q[i];
      ad->f[i].x -= force * dx;
      ad->f[i].y -= force * dy;
      ad->f[i].z -= force * dz;
    }
  }
#endif
  
  /* SDMD self_energy term */
  if (mpi.master) {
    *elec += ew->self_energy;

    if (ew->self_energy_v_flag) {
      double self_energy_v;
      
      /* stored without the volume factor; divide by the current volume */
      self_energy_v = ew->self_energy_v / bc->V;
      *elec += self_energy_v;

      ad->virial[0] += self_energy_v;
      ad->virial[1] += self_energy_v;
      ad->virial[2] += self_energy_v;
    }
  }

}

/*
 * SDMD_EW_calc_b_array: fill ew->B, one value per locally stored
 * reciprocal-space element, as the product b[0][ikx]*b[1][iky]*b[2][ikz]
 * of three one-dimensional factors.
 *
 * For each dimension i and index m the factor is
 *     b[i][m] = lambda^2 * |t|^2 / |sum|^2 ,
 *     sum = sum_{k=0}^{n_spline-2} Mn[k+1] * exp(2*pi*I*m*k / K_i) ,
 * where Mn[k] are the B-spline values of order n_spline at integer
 * arguments and t = exp(2*pi*I*(n_spline-1)*m / K_i).  b[i][m] is set to
 * 0 when both components of sum are below EPS.  lambda is 1 unless
 * ew->opt_infl is set, in which case it is g_sum/g_sum2 from the
 * truncated series below.
 *
 * Side effect: ew->Mnx is used as scratch for Mn and is overwritten.
 * The temporary b[] arrays are allocated and freed here.
 */
void SDMD_EW_calc_b_array(EWALD *ew)
{
  double * restrict b[3];
  double * restrict Mn;
  double * restrict B;
  int * restrict pme_grid;
  int * restrict nx;
  int (* restrict ikxy_tdata)[2];
  int n_ikxy_tdata;
  int n_spline;
  fftw_complex t, sum;
  int n, k, i, j, m;
  int ikx, iky, ikz;
  double lambda;
  
  ikxy_tdata = ew->fft3d->ikxy_tdata;
  n_ikxy_tdata = ew->fft3d->n_ikxy_tdata;
  nx = ew->fft3d->nx;
  n_spline = ew->n_spline;
  pme_grid = ew->pme_grid;
  B = ew->B;
  
  b[0] = emalloc("b1 in SDMD_EW_calc_b_array", sizeof(double) * pme_grid[0]);
  b[1] = emalloc("b2 in SDMD_EW_calc_b_array", sizeof(double) * pme_grid[1]);
  b[2] = emalloc("b3 in SDMD_EW_calc_b_array", sizeof(double) * pme_grid[2]);
  /* borrow the x spline buffer (n_spline+1 entries) as scratch */
  Mn   = ew->Mnx;

  /* case of n == 2 */
  Mn[0] = 0.0;
  Mn[1] = 1.0;
  Mn[2] = 0.0;
  
  /* raise the order one step at a time; k runs downwards so that
     Mn[k-1] still holds the previous order when Mn[k] is updated */
  for (n=3;n<=n_spline;n++) {
    Mn[n] = 0.0;
    for (k=n-1;k>=1;k--) {
      Mn[k] = Mn[k] * (double) k / (n-1) + Mn[k-1] * (double) (n-k)/(n-1);
    }
    Mn[0] = 0.0;
  }
  
  /* DEBUG 
  for (k=0;k<=ew->n_spline;k++) {
    printf("Mn[%d] = %f\n", k, Mn[k]);
  }
  */
  
  for (i=0;i<3;i++) {
    for (m=0;m<pme_grid[i];m++) {
      /* discrete Fourier sum of the spline values at index m */
      c_re(sum) = c_im(sum) = 0.0;
      for (k=0;k<n_spline-1;k++) {
	c_re(sum) += Mn[k+1] * cos(2.0*M_PI*m*k/pme_grid[i]);
	c_im(sum) += Mn[k+1] * sin(2.0*M_PI*m*k/pme_grid[i]);
	/* DEBUG
	printf("sum(%d,%d,%d) = (%f,%f)\n", i, m, k,sum.r,sum.i); */
      }
      if (!ew->opt_infl) {
	lambda = 1.0;
      } else {
	/* optimized influence function: ratio of two series in k,
	   each truncated at k_max terms on either side */
	double g_sum, g_sum2, x;
	/* note: this k shadows the outer loop variable k */
	int k, mm, pme_grid_h, k_max;
	k_max = 50;
	pme_grid_h = pme_grid[i]/2;
	/* fold m into the signed range before scaling */
	if (m < pme_grid_h)
	  mm = m;
	else
	  mm = m - pme_grid[i];
	x = M_PI * mm / pme_grid[i];
	g_sum = g_sum2 = 1.0;
	/* for m == 0 both sums stay at 1, so lambda == 1 */
	if (m!=0) {
	  for (k=1;k<=k_max;k++) {
	    g_sum  += pow(x/(x + M_PI*k), ew->n_spline);
	    g_sum  += pow(x/(x - M_PI*k), ew->n_spline);
	    g_sum2 += pow(x/(x + M_PI*k), ew->n_spline*2);
	    g_sum2 += pow(x/(x - M_PI*k), ew->n_spline*2);
	  }
	}
	lambda = g_sum / g_sum2;
      }
      
      /* DEBUG 
      printf("sum(%d,%d) = (%f,%f)\n", i, m, sum.r,sum.i); */
      
      /* avoid dividing by a vanishing sum */
      if (fabs(c_re(sum)) < EPS && fabs(c_im(sum)) < EPS) {
	b[i][m] = 0.0;
      } else {
	c_re(t) = cos(2.0*M_PI*(n_spline-1)*m/pme_grid[i]);
	c_im(t) = sin(2.0*M_PI*(n_spline-1)*m/pme_grid[i]);
	b[i][m] = lambda * lambda * 
	  (c_re(t) * c_re(t) + c_im(t) * c_im(t))
	  / (c_re(sum) * c_re(sum) + c_im(sum) * c_im(sum));
      }
      /* DEBUG 
      printf("b[%d][%d] = %f\n", i, m, b[i][m]);
      */
    }
  }


  /* combine the three 1D factors: (ikx,iky) columns from ikxy_tdata,
     each followed by nx[2] consecutive z entries */
  j=0;
  for (i=0;i<n_ikxy_tdata;i++) {
    ikx = ikxy_tdata[i][0];
    iky = ikxy_tdata[i][1];
    for (ikz=0;ikz<nx[2];ikz++,j++) {
      B[j] = b[0][ikx] * b[1][iky] * b[2][ikz];
      /* DEBUG
      printf("B(%d %d %d) = %e %e %e %e\n", ikx,iky,ikz, B[j], b[0][ikx], b[1][iky], b[2][ikz]);
      */
    }
  }

  free(b[0]);
  free(b[1]);
  free(b[2]);
}


/*
 * SDMD_EW_calc_c_array: fill ew->C, one value per locally stored
 * reciprocal-space element:
 *
 *     C = exp(-pi^2 m^2 / beta^2) / (pi V m^2),   C = 0 at the origin,
 *
 * where m = (imx,imy,imz) * recip and the integer indices above half the
 * grid size are mapped to negative values.  Elements are visited as
 * (ikx,iky) columns from fft3d->ikxy_tdata with fft3d->nx[2] z entries
 * each.  Depends on the box through bc->recip and bc->V.
 */
void SDMD_EW_calc_c_array(EWALD *ew, BOUNDARY *bc)
{
  int * restrict dims = ew->pme_grid;
  int * restrict half = ew->pme_grid_h;
  int * restrict nx = ew->fft3d->nx;
  int (* restrict columns)[2] = ew->fft3d->ikxy_tdata;
  const int n_columns = ew->fft3d->n_ikxy_tdata;
  double (* restrict recip)[3] = bc->recip;
  double * restrict C = ew->C;
  const double pi_v = 1.0/(M_PI*bc->V);
  const double twopi_beta2 = -M_PI*M_PI/(ew->beta*ew->beta);
  int col, idx;

  idx = 0;
  for (col = 0; col < n_columns; col++) {
    const int ikx = columns[col][0];
    const int iky = columns[col][1];
    const int imx = (ikx <= half[0]) ? ikx : ikx - dims[0];
    const int imy = (iky <= half[1]) ? iky : iky - dims[1];
    int ikz;

    for (ikz = 0; ikz < nx[2]; ikz++, idx++) {
      int imz;
      double mx, my, mz, m2;

      /* the m = 0 term is excluded from the reciprocal sum */
      if (ikx == 0 && iky == 0 && ikz == 0) {
	C[idx] = 0;
	continue;
      }

      imz = (ikz <= half[2]) ? ikz : ikz - dims[2];

      mx = imx*recip[0][0]+imy*recip[0][1]+imz*recip[0][2];
      my = imx*recip[1][0]+imy*recip[1][1]+imz*recip[1][2];
      mz = imx*recip[2][0]+imy*recip[2][1]+imz*recip[2][2];
      m2 = mx*mx+my*my+mz*mz;

      C[idx] = pi_v * exp(twopi_beta2*m2)/m2;
    }
  }
}

#ifdef SDMD_EW_dtime
#undef SDMD_EW_dtime
#endif

#ifdef SDMD_EW_add_dtime
#undef SDMD_EW_add_dtime
#endif


double SDMD_EW_check_time;

/*
 * SDMD_EW_clear_time: reset all SDMD_EW_N_TIME stage timers in ew->time
 * to zero.
 */
void SDMD_EW_clear_time(EWALD *ew)
{
  int slot = 0;

  while (slot < SDMD_EW_N_TIME) {
    ew->time[slot] = 0.0;
    slot++;
  }
}

/*
 * SDMD_EW_dtime: start the stopwatch by storing the current MPI wall
 * clock time in the file-level variable SDMD_EW_check_time.
 */
void SDMD_EW_dtime()
{
  double now = MPI_Wtime();

  SDMD_EW_check_time = now;
}

/*
 * SDMD_EW_add_dtime: add the wall time elapsed since the last checkpoint
 * to *d and make "now" the new checkpoint, so consecutive calls measure
 * consecutive stages.
 */
void SDMD_EW_add_dtime(double *d)
{
  const double now = MPI_Wtime();

  *d += now - SDMD_EW_check_time;
  SDMD_EW_check_time = now;
}

/*
 * SDMD_EW_print_time: log the accumulated PME stage timings.
 *
 * Does nothing when Ewald is disabled.  Each figure is passed through
 * ave_min_max, which fills a three-element array that is printed as
 * average/min/max.  The total is the sum of the timer slots from index 1
 * up to SDMD_EW_N_TIME-1.
 */
void SDMD_EW_print_time(EWALD *ew)
{
  double stat[3];
  double total = 0.0;
  int slot;

  if (ew->flag == FLAG_NO_EWALD) return;

  for (slot = 1; slot < SDMD_EW_N_TIME; slot++) {
    total += ew->time[slot];
  }

  lprintf("** PME Time (ave/min/max)**\n");

  ave_min_max(total, stat);
  lprintf("PME Total Recip      %10.2f/%10.2f/%10.2f sec\n", stat[0],stat[1],stat[2]);

  ave_min_max(ew->time[SDMD_EW_TIME_CHARGE_GRID], stat);
  lprintf("    Charge Grid      %10.2f/%10.2f/%10.2f sec\n", stat[0],stat[1],stat[2]);

  ave_min_max(ew->time[SDMD_EW_TIME_FFT], stat);
  lprintf("    FFT Forward      %10.2f/%10.2f/%10.2f sec\n", stat[0],stat[1],stat[2]);

  ave_min_max(ew->time[SDMD_EW_TIME_ENERGY], stat);
  lprintf("    Calc Energy      %10.2f/%10.2f/%10.2f sec\n", stat[0],stat[1],stat[2]);

  ave_min_max(ew->time[SDMD_EW_TIME_FFTB], stat);
  lprintf("    FFT Backward     %10.2f/%10.2f/%10.2f sec\n", stat[0],stat[1],stat[2]);

  ave_min_max(ew->time[SDMD_EW_TIME_FORCE], stat);
  lprintf("    Calc Forces      %10.2f/%10.2f/%10.2f sec\n", stat[0],stat[1],stat[2]);

}


#else
static int dummy;
#endif  /* MPI_SDMD */
