/*
 * 
 * This source code is part of 
 *   MARBLE (MoleculAR simulation package for BiomoLEcules)
 * 
 * Written by Mitsunori Ikeguchi
 * Copyright (c) 2012 Yokohama City University
 *  
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 */

#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

#include "md_system.h"

#ifdef MPI_SDMD

#include "parallel.h"
#include "sdmd.h"

/* #include "mpi_debug.c" */

#ifdef KCOMP_FIPP
#include <fj_tool/fipp.h>
#endif

#ifdef KCOMP_FAPP
#include <fj_tool/fapp.h>
#endif

#ifdef KCOMP_FPCOLL
#include <fj_tool/fjsamp.h>
#endif

/*
 * Run the space-decomposition MD time loop for max_step steps.
 *
 * sys      : simulation state; its atom, linked_cell and energy members
 *            are read and updated throughout.
 * max_step : number of time steps to integrate.
 *
 * Outline: clear the timers, print the run header, call SDMD_setup(),
 * then for every step perform the first velocity update, the coordinate
 * update, migration or coordinate exchange, the force calculation, the
 * second velocity update, the kinetic energy evaluation and the output.
 * After the loop, statistics are collected, data are gathered with
 * SDMD_gather_all() and the timing table is printed.
 */
void SDMD_main_loop(MD_SYSTEM *sys, int max_step)
{
  int i, step;
  double start_time;
  double dt, dt_ps;
  ATOM_DATA *ad;
  LINKED_CELL *lc;
  double loop_start_time, loop_end_time;
  
  /* start_time marks function entry; it is used for the SETUP timer
     and for the total time passed to SDMD_print_time(). */
  start_time = get_user_time();
  dt = sys->dt;
  dt_ps = sys->dt_ps;
  ad = &sys->atom;
  lc = &sys->linked_cell;

  SDMD_clear_time(sys);
  
  SDMD_info(sys, max_step);

  SDMD_setup(sys);
  if (mpi.master)
    MD_SYSTEM_print_md_status(sys,0);


  /* Initial load balancer state: 1 (initialization) when a balancing
     interval is configured, 0 (do nothing) otherwise.
     See SDMD_load_balancer() for the state values. */
  if (lc->load_balancer_step > 0)
    lc->load_balancer_exec_step = 1;
  else
    lc->load_balancer_exec_step = 0;

  /* Optional profiler hooks, selected at compile time. */
#ifdef KCOMP_FIPP
  fipp_start();
#endif
#ifdef KCOMP_FAPP
  fapp_start("section", 1,1);
#endif
#ifdef KCOMP_FPCOLL
  fpcoll_start();
#endif

  /* Synchronize all ranks, reset the timers once more and store the
     time elapsed since function entry as the SETUP time. */
  MPI_Barrier(mpi.comm);
  SDMD_clear_time(sys);
  loop_start_time = get_user_time();
  lc->time[SDMD_TIME_SETUP] = loop_start_time - start_time;

  for (step = 0; step < max_step; step++) {

    /* for debug 
    if (step == 300) {
      start_time = get_user_time();
      SDMD_clear_time(sys);
    } */
    sys->current_step = step;
    sys->current_time += dt_ps;
    EP_gradual_change(&sys->extra_pot, sys, step+1);
    PTC_gradual_change(sys, step);

    /* Every load_balancer_step steps, request a new balancing cycle
       (state 5), but only when the balancer is currently idle (state 0). */
    if (lc->load_balancer_step > 0 &&
	step % lc->load_balancer_step == 0) {
      if (!lc->load_balancer_exec_step)
	lc->load_balancer_exec_step = 5;
    }

    if (sys->remove_momentum_step > 0 &&
	(step+1) % sys->remove_momentum_step == 0)
      MD_SYSTEM_remove_momentum(sys);

    PTC_update1(sys, dt);
    /* add_dtime(&lc->time[SDMD_TIME_EX_SYSTEM]); */
    
    /* First velocity update, using the forces currently held in ad->f. */
    SDMD_integrate_vel1(sys, dt, ad->f);
    /* add_dtime(&lc->time[SDMD_TIME_INT_V]); */
    
    /* Coordinate update; a separate routine is used when
       Ex_System_P_flag is set. */
    if (sys->Ex_System_P_flag)
      PTC_Ex_System_PT_update_x(sys, dt);
    else
      SDMD_integrate_coord(sys, dt);
    /* add_dtime(&lc->time[SDMD_TIME_INT_X]); */
    
    /* On migration steps run the full migration; otherwise only
       distribute the coordinates and account the time to COMM_X. */
    if (SDMD_check_migration(sys,step)) {
      SDMD_migration(sys, step, sys->dt_ps);
    } else {
      dtime();
      SDMD_dist_x_xyz(&sys->linked_cell, &sys->atom);
      if (sys->linked_cell.tr_dist_xf_flag)
	SDMD_dist_x(&sys->linked_cell, &sys->atom);
      add_dtime(&lc->time[SDMD_TIME_COMM_X]);
    }

    SDMD_calc_force(sys);
    
    /* Second velocity update with the newly computed forces. */
    SDMD_integrate_vel2(sys, dt, ad->f);
    /* add_dtime(&lc->time[SDMD_TIME_INT_V]);*/
    
    if (sys->Ex_System_P_flag)
      MD_SYSTEM_calc_kene_full(sys);
    else
      MD_SYSTEM_calc_kene(sys);
      /* SDMD_calc_kene(sys); */
      
    /* add_dtime(&lc->time[SDMD_TIME_KENE]); */
    
    PTC_update2(sys, dt);
    /* add_dtime(&lc->time[SDMD_TIME_EX_SYSTEM]); */

    SDMD_output(sys, step);
    /* add_dtime(&lc->time[SDMD_TIME_OUTPUT]); */
    /* SDMD_check(sys); */
  }
  /* Wait for all ranks before taking the loop end time. */
  MPI_Barrier(mpi.comm);
  loop_end_time = get_user_time();
  dtime();

#ifdef KCOMP_FIPP
  fipp_stop();
#endif
#ifdef KCOMP_FAPP
  fapp_stop("section", 1,1);
#endif
#ifdef KCOMP_FPCOLL
  fpcoll_stop();
#endif
#if defined(KCOMP_FIPP) || defined(KCOMP_FAPP) || defined(KCOMP_FPCOLL)
  if (mpi.master)
    lprintf("main_loop_time = %f\n", loop_end_time - loop_start_time);
#endif

  MD_SYSTEM_collect_statistics(sys);
  SDMD_gather_all(&sys->linked_cell, &sys->atom, &sys->rigid_mol);

  /* The work done since the dtime() call above is accounted to AFTER_LOOP. */
  MPI_Barrier(mpi.comm);
  add_dtime(&lc->time[SDMD_TIME_AFTER_LOOP]);
  SDMD_print_time(sys, get_user_time()-start_time, loop_end_time-loop_start_time);
}

/*
 * Print the header of a space-decomposition MD run: time step,
 * number of steps (and the corresponding length in ps), the property
 * output file when enabled, and the trajectory output information.
 */
void SDMD_info(MD_SYSTEM *sys, int max_step)
{
  lprintf("Space Decomposition Parallel Molecular Dynamics\n");
  lprintf("  Time Step  = %5.2f fs\n", 1000 * sys->dt_ps);
  lprintf("  Max  Step  = %5d (%.2f ps)\n", max_step, max_step * sys->dt_ps);

  /* The property file line is shown only when property output is on. */
  if (sys->prop_out_step > 0) {
    lprintf("  Property Output   = %s (interval = %d step(s))\n",
            sys->prop_out_fname, sys->prop_out_step);
  }

  MD_SYSTEM_print_trj_info(sys);
  lprintf("\n");
}

/*
 * Reset every entry of the linked-cell timer table to zero and
 * clear the Ewald timers as well.
 */
void SDMD_clear_time(MD_SYSTEM *sys)
{
  LINKED_CELL *cells = &sys->linked_cell;
  int slot = 0;

  while (slot < SDMD_N_TIME) {
    cells->time[slot] = 0.0;
    slot++;
  }

  SDMD_EW_clear_time(&sys->ewald);
}

/*
 * Print the per-category timing table of a run.
 *
 * sys   : simulation state; sys->linked_cell.time holds the timers
 *         accumulated on this rank.
 * total : total elapsed time of the run on this rank [sec].
 * loop  : elapsed time of the main loop [sec].
 *
 * The OTHER entry is computed here as total minus the sum of all the
 * other timers.  Minimum, maximum and sum over all ranks are obtained
 * with MPI_Allreduce; the printed average is the sum divided by
 * mpi.n_pe.  Categories that do not apply to the current run
 * (NONBOND vs. the Ewald entries, FMM) are skipped.
 *
 * When compiled with TIMING, the master additionally prints the raw
 * timer table of every rank.  Timers of the other ranks are received
 * into a local scratch buffer so that the master's own timers in
 * lc->time are never overwritten (the master's row is therefore
 * correct regardless of the value of mpi.master_pe).
 */
void SDMD_print_time(MD_SYSTEM *sys, double total, double loop)
{
  int i, id;
  double min[SDMD_N_TIME], max[SDMD_N_TIME], ave[SDMD_N_TIME];
  LINKED_CELL *lc;
  /* Printing order and label of each timer; terminated by id < 0. */
  static const struct {
    int id;
    const char *name;
  } label[] = {
    { SDMD_TIME_BOND,      "BOND" },
    { SDMD_TIME_ANGLE,     "ANGLE" },
    { SDMD_TIME_DIHED,     "DIHEDRAL" },
    { SDMD_TIME_BOUNDARY,  "BOUNDARY" },
    { SDMD_TIME_EP,        "EX POT" },
    { SDMD_TIME_NONBOND,   "NONBOND" },
#if defined(OVERLAP) && defined(_OPENMP)
    { SDMD_TIME_EW_DIR,    "EWALD DIR & FFT" },
#else
    { SDMD_TIME_EW_DIR,    "EWALD DIRECT" },
#endif
    { SDMD_TIME_EW_REC,    "EWALD REC" },
    { SDMD_TIME_EW_COR,    "EWALD CORR" },
    { SDMD_TIME_FMM,       "FMM" },
    { SDMD_TIME_NBLIST,    "NBLIST" },
    { SDMD_TIME_FRCMAN,    "FORCE MANIP" },
    { SDMD_TIME_MIGRATION, "MIGRATION" },
    { SDMD_TIME_LB,        "LOAD BALANCER" },
    { SDMD_TIME_COMM_X,    "COMM X" },
    { SDMD_TIME_COMM_F,    "COMM F" },
    { SDMD_TIME_COMM_E,    "COMM E" },
    { SDMD_TIME_RATTLE,    "RATTLE" },
    { SDMD_TIME_IDLE,      "IDLE" },
    { SDMD_TIME_OTHER,     "OTHER" },
    { SDMD_TIME_SETUP,     "SETUP" },
    { SDMD_TIME_AFTER_LOOP,"AFTER LOOP" },
    { -1,                  "" } };
  
  lc = &sys->linked_cell;

  /* OTHER = total - (sum of every explicitly measured category). */
  lc->time[SDMD_TIME_OTHER] = total;
  for (i=0;i<SDMD_N_TIME;i++) {
    if (i!=SDMD_TIME_OTHER)
      lc->time[SDMD_TIME_OTHER]-=lc->time[i];
  }

  MPI_Allreduce(lc->time, min, SDMD_N_TIME, MPI_DOUBLE, MPI_MIN, mpi.comm);
  MPI_Allreduce(lc->time, max, SDMD_N_TIME, MPI_DOUBLE, MPI_MAX, mpi.comm);
  /* ave[] holds the sum over ranks; it is divided by n_pe when printed. */
  MPI_Allreduce(lc->time, ave, SDMD_N_TIME, MPI_DOUBLE, MPI_SUM, mpi.comm);
  
  lprintf("----------  CPU TIME  -------------\n");
  for (i=0;label[i].id>=0;i++) {
    id = label[i].id;
    /* NONBOND is reported only without Ewald, the EWALD entries only
       with Ewald, and FMM only when FMM is enabled. */
    if (id == SDMD_TIME_NONBOND &&
	sys->ewald.flag != FLAG_NO_EWALD) continue;
    if ((id == SDMD_TIME_EW_DIR ||
	 id == SDMD_TIME_EW_REC ||
	 id == SDMD_TIME_EW_COR) &&
	sys->ewald.flag == FLAG_NO_EWALD) continue;
    if (id == SDMD_TIME_FMM &&
	!sys->fmm_flag) continue;
    
    lprintf("%-15s%10.2f sec (%10.2f - %10.2f)\n" , label[i].name,
	    ave[id]/mpi.n_pe, min[id], max[id]);
  }
  lprintf("-----------------------------------\n");
  lprintf("TOTAL        %10.2f sec (LOOP %.2f sec)\n\n", total, loop);
  
#ifdef TIMING
  if (mpi.master) {
    MPI_Status stat;
    double pe_time[SDMD_N_TIME];  /* scratch: timers of the rank being printed */
    int j;
    for (i=0;i<mpi.n_pe;i++) {
      if (i!=mpi.master_pe) {
	MPI_Recv(pe_time, SDMD_N_TIME, MPI_DOUBLE, i, 10, mpi.comm, &stat);
      } else {
	for (j=0;j<SDMD_N_TIME;j++) pe_time[j] = lc->time[j];
      }
      lprintf("Timing: %d", i);
      for (j=0;j<SDMD_N_TIME;j++) {
	lprintf(" %e",pe_time[j]);
      }
      lprintf("\n");
    }
  } else {
    MPI_Send(lc->time, SDMD_N_TIME, MPI_DOUBLE, mpi.master_pe, 10, mpi.comm);
  }
#endif
  
  SDMD_EW_print_time(&sys->ewald);
}

/*
 * Measure how long this rank waits in a barrier and add that time
 * to both the IDLE timer and lc->idle_time.  dtime() is called at
 * the end so that the wait is not counted by the next add_dtime().
 */
void SDMD_check_idle_time(LINKED_CELL *lc)
{
  double t_enter, waited;

  t_enter = MPI_Wtime();
  MPI_Barrier(mpi.comm);
  waited = MPI_Wtime() - t_enter;

  lc->time[SDMD_TIME_IDLE] += waited;
  lc->idle_time += waited;

  dtime();
}

/*
 * First half-step velocity update.
 *
 * Stores f as the current force array of the atom data, then adds
 * 0.5*dt*f/w to the velocity of every atom in the node-local list
 * starting at node_fatom_h.  Rigid molecules and RATTLE constraints
 * are advanced by their own routines when enabled; the RATTLE part
 * is accounted to the RATTLE timer.
 */
void SDMD_integrate_vel1(MD_SYSTEM *sys, double dt, VEC *f)
{
  ATOM_DATA *atoms = &sys->atom;
  double half_dt = 0.5*dt;
  int ia;

  atoms->f = f;

  /* walk the linked list of atoms integrated on this node */
  ia = atoms->node_fatom_h;
  while (ia >= 0) {
    atoms->v[ia].x += half_dt * f[ia].x / atoms->w[ia];
    atoms->v[ia].y += half_dt * f[ia].y / atoms->w[ia];
    atoms->v[ia].z += half_dt * f[ia].z / atoms->w[ia];
    ia = atoms->node_fatom_n[ia];
  }

  if (sys->rigid_mol_flag)
    RMOL_DATA_time_integration_v1(&sys->rigid_mol, atoms, dt);

  if (sys->rattle.flag) {
    dtime();
    RATTLE_time_integration_v1(&sys->rattle, &sys->bond, atoms, dt);
    add_dtime(&sys->linked_cell.time[SDMD_TIME_RATTLE]);
  }
}

/*
 * Second half-step velocity update.
 *
 * Same structure as the first half step: f becomes the current force
 * array, every atom in the node-local list gets 0.5*dt*f/w added to
 * its velocity, and the rigid molecule and RATTLE "v2" routines are
 * called when enabled (RATTLE is accounted to the RATTLE timer).
 */
void SDMD_integrate_vel2(MD_SYSTEM *sys, double dt, VEC *f)
{
  ATOM_DATA *atoms = &sys->atom;
  double half_dt = 0.5*dt;
  int ia;

  atoms->f = f;

  /* walk the linked list of atoms integrated on this node */
  ia = atoms->node_fatom_h;
  while (ia >= 0) {
    atoms->v[ia].x += half_dt * f[ia].x / atoms->w[ia];
    atoms->v[ia].y += half_dt * f[ia].y / atoms->w[ia];
    atoms->v[ia].z += half_dt * f[ia].z / atoms->w[ia];
    ia = atoms->node_fatom_n[ia];
  }

  if (sys->rigid_mol_flag)
    RMOL_DATA_time_integration_v2(&sys->rigid_mol, atoms, dt);

  if (sys->rattle.flag) {
    dtime();
    RATTLE_time_integration_v2(&sys->rattle, &sys->bond, atoms, dt);
    add_dtime(&sys->linked_cell.time[SDMD_TIME_RATTLE]);
  }
}

/*
 * Coordinate update: x += dt*v for every atom in the node-local
 * list starting at node_fatom_h.  Rigid molecules and RATTLE
 * constraints are advanced by their own position routines when
 * enabled; the RATTLE part is accounted to the RATTLE timer.
 */
void SDMD_integrate_coord(MD_SYSTEM *sys, double dt)
{
  ATOM_DATA *atoms = &sys->atom;
  int ia;

  ia = atoms->node_fatom_h;
  while (ia >= 0) {
    atoms->x[ia].x += dt * atoms->v[ia].x;
    atoms->x[ia].y += dt * atoms->v[ia].y;
    atoms->x[ia].z += dt * atoms->v[ia].z;
    ia = atoms->node_fatom_n[ia];
  }

  if (sys->rigid_mol_flag)
    RMOL_DATA_time_integration_p(&sys->rigid_mol, atoms, dt);

  if (sys->rattle.flag) {
    dtime();
    RATTLE_time_integration_p(&sys->rattle, &sys->bond, atoms, dt);
    add_dtime(&sys->linked_cell.time[SDMD_TIME_RATTLE]);
  }
}


/*
 * Per-step output.
 *
 * All ranks take part in gathering coordinates (and velocities when
 * the trajectory type includes TCT_V) on trajectory output steps and
 * all ranks call MD_SYSTEM_prop_out() on property output steps.
 * Writing the trajectory, summing the energies, sampling statistics
 * and printing the MD status are done by the master only.
 *
 * step is the zero-based index of the step just completed; all
 * intervals are tested against step+1.
 */
void SDMD_output(MD_SYSTEM *sys, int step)
{
  if (sys->trj_out_step > 0 && (step+1) % sys->trj_out_step == 0) {
    SDMD_gather_x(&sys->linked_cell, &sys->atom);
    if (sys->trj_out_type & TCT_V) {
      RMOL_DATA_set_atom_velocity(&sys->rigid_mol, &sys->atom);
      SDMD_gather_v(&sys->linked_cell, &sys->atom);
    }
    if (mpi.master) {
      MD_SYSTEM_trj_out(sys);
    }
  }

  if (mpi.master) {
    MD_SYSTEM_sum_potential(sys);
    MD_SYSTEM_sum_total_energy(sys);
    MD_SYSTEM_calc_Pint(sys);

    if (sys->sample_step > 0 && (step+1) % sys->sample_step == 0) {
      MD_SYSTEM_sample_statistics(sys);
    }

    /* The status is also printed whenever the total energy is huge. */
    if (sys->total_ene >= 1.0e10 ||
        (sys->print_out_step > 0 && (step+1) % sys->print_out_step == 0)) {
      MD_SYSTEM_print_md_status(sys,step+1);
    }
  }

  if (sys->prop_out_step > 0 && (step+1) % sys->prop_out_step == 0) {
    MD_SYSTEM_prop_out(sys);
  }
}

/*
 * Compute all forces and energies for the current coordinates.
 *
 * Order of work: clear forces, energies and (optionally) per-atom
 * energies; bonded terms (bond, angle, dihedral, 1-4 nonbond, CMAP);
 * virial from the forces accumulated so far; boundary term; extra
 * potential; nonbonded term (plain cutoff or Ewald/PME depending on
 * sys->ewald.flag); force communication; force manipulation and
 * virial correction; global reduction of energies and virial; and
 * finally the potential energy sum.
 *
 * Each section is accounted to the matching lc->time[] entry via
 * add_dtime().
 */
void SDMD_calc_force(MD_SYSTEM *sys)
{
  double total_time = 0.0, start_time, end_time, check;
  int i;
  LINKED_CELL *lc;

  dtime();

  lc = &sys->linked_cell;
  SDMD_clear_force(sys);

  for (i=0;i<N_ENE;i++) sys->ene[i]=0.0;

  if (sys->atom.atom_ene_sample_flag) {
    ATOM_DATA_atom_ene_clear(&sys->atom);
  }

  /*add_dtime(&lc->time[SDMD_TIME_CLFRC]); */
  /* Restart the interval timer: the clearing above is not accounted
     to any category. */
  dtime();

#if 1

  /* Bonded interactions */
  bond_energy_force(&(sys->bond), &(sys->atom), &(sys->ene[BOND_ENE]));
  add_dtime(&lc->time[SDMD_TIME_BOND]);

  angle_energy_force(&(sys->angle), &(sys->atom),
		     &(sys->ene[ANGLE_ENE]),&(sys->ene[UB_ENE]));
  add_dtime(&lc->time[SDMD_TIME_ANGLE]);


  /* Dihedral, 1-4 nonbond and CMAP terms are all accounted to DIHED. */
  dihedral_energy_force(&(sys->dihed), &(sys->atom),
			&(sys->ene[DIHED_ENE]), &(sys->ene[IMPR_ENE]));
  
  nonbond14_energy_force(&(sys->dihed), &(sys->nonbond), &(sys->atom),
                         &(sys->ene[VDW14_ENE]),&(sys->ene[ELEC14_ENE]));

  DD_CMAP_energy_force(&(sys->dihed), &(sys->atom), &(sys->ene[CMAP_ENE]));
  add_dtime(&lc->time[SDMD_TIME_DIHED]);
  
  /* Accumulate the virial from the forces computed so far; its time
     is included in the BOUNDARY entry below. */
  SDMD_calc_virial(sys);
  /* add_dtime(&lc->time[SDMD_TIME_VIRIAL]); */

  BOUNDARY_energy_force(&(sys->boundary), &(sys->atom),
			&(sys->ene[BOUNDARY_ENE]));
  add_dtime(&lc->time[SDMD_TIME_BOUNDARY]);

#endif



#if 1

  EP_energy_force(&(sys->extra_pot), sys, &(sys->ene[EXTRA_ENE]));
  add_dtime(&lc->time[SDMD_TIME_EP]);


  if (sys->ewald.flag == FLAG_NO_EWALD) {
    /* Nonbonded interactions without Ewald summation */
    nonbond_energy_force(&(sys->nonbond), &(sys->atom), &(sys->boundary),
			 &(sys->ene[VDW_ENE]),&(sys->ene[ELEC_ENE]),
			 &(sys->ene[HBOND_ENE]));
    /*
    SDMD_nonbond_energy_force_smooth(&(sys->linked_cell),&(sys->nonbond),
				     &(sys->atom), &(sys->boundary),
			     &(sys->ene[VDW_ENE]),&(sys->ene[ELEC_ENE]),
			     &(sys->ene[HBOND_ENE]));
    */
    
    add_dtime(&lc->time[SDMD_TIME_NONBOND]);
  } else {
    
    /* EWALD */
#if defined(OVERLAP) && defined(_OPENMP)
    /* Overlapped variant: the PME calculation is split into
       SDMD_EW_pme1() and SDMD_EW_pme3() with the direct-space part
       in between.  The reciprocal energy is taken from
       sys->ewald.erec after SDMD_EW_pme3(). */
    SDMD_EW_cor_energy_force(&(sys->ewald), &(sys->boundary), &(sys->atom),
			   &(sys->ene[EWCOR_ENE]));
    add_dtime(&lc->time[SDMD_TIME_EW_COR]);

    SDMD_EW_pme1(&(sys->ewald), &(sys->linked_cell),
		 &(sys->atom), &(sys->boundary));
    add_dtime(&lc->time[SDMD_TIME_EW_REC]); 

    SDMD_EW_direct_energy_force(&(sys->ewald), &(sys->linked_cell),
				&(sys->nonbond), &(sys->atom), &(sys->boundary),
				&(sys->ene[VDW_ENE]),&(sys->ene[ELEC_ENE]),
				&(sys->ene[EWREC_ENE]),	&(sys->ene[HBOND_ENE]));

    add_dtime(&lc->time[SDMD_TIME_EW_DIR]);

    SDMD_EW_pme3(&(sys->ewald), &(sys->linked_cell),
		 &(sys->atom), &(sys->boundary));
    sys->ene[EWREC_ENE] = sys->ewald.erec;  

    add_dtime(&lc->time[SDMD_TIME_EW_REC]); 
#else
    /* Sequential variant: correction, PME, barrier (accounted to
       IDLE), then the direct-space part. */
    SDMD_EW_cor_energy_force(&(sys->ewald), &(sys->boundary), &(sys->atom),
			   &(sys->ene[EWCOR_ENE]));
    add_dtime(&lc->time[SDMD_TIME_EW_COR]);

    SDMD_EW_pme(&(sys->ewald), &(sys->linked_cell), &(sys->atom), &(sys->boundary),
		&(sys->ene[EWREC_ENE]));
    add_dtime(&lc->time[SDMD_TIME_EW_REC]); 

    MPI_Barrier(mpi.comm);
    add_dtime(&lc->time[SDMD_TIME_IDLE]); 


    SDMD_EW_direct_energy_force(&(sys->ewald), &(sys->linked_cell),
				&(sys->nonbond), &(sys->atom), &(sys->boundary),
				&(sys->ene[VDW_ENE]),&(sys->ene[ELEC_ENE]),
				&(sys->ene[EWREC_ENE]),	&(sys->ene[HBOND_ENE]));
    add_dtime(&lc->time[SDMD_TIME_EW_DIR]);
#endif
  }

  /*
  if (sys->fmm_flag) {
    calc_fmm(&(sys->atom));
    FMM_energy_force(&(sys->atom),&(sys->ene[FMM_ENE]));
    add_dtime(&lc->time[SDMD_TIME_FMM]);
  }
  */
#endif

  /*SDMD_check_idle_time(lc);*/

  /* Force communication between ranks, accounted to COMM_F. */
  if (sys->linked_cell.tr_mode == TR_MODE_ES ||
      sys->linked_cell.tr_mode == TR_MODE_MP) {
    SDMD_dist_f_xyz(&sys->linked_cell, &sys->atom);
  }
  if (sys->linked_cell.tr_dist_xf_flag)
    SDMD_dist_f(&sys->linked_cell, &sys->atom);
  add_dtime(&lc->time[SDMD_TIME_COMM_F]);

  /*
  {ATOM_DATA *ad; ad = &sys->atom;
  if (ATOM_CPU(&sys->linked_cell,&sys->atom,4) == mpi.rank)
    printf("4: %f %f %f\n", ad->f[4].x, ad->f[4].y, ad->f[4].z);
  if (ATOM_CPU(&sys->linked_cell,&sys->atom,5486) == mpi.rank)
    printf("5486: %f %f %f\n", ad->f[5486].x, ad->f[5486].y, ad->f[5486].z);
  }
  */

  /* Force manipulation and virial correction, accounted to FRCMAN. */
  if (sys->remove_total_force)
    MD_SYSTEM_remove_total_force(sys);

  SDMD_correct_virial(sys);
  add_dtime(&lc->time[SDMD_TIME_FRCMAN]);

  /* Sum energies and virial over all ranks, accounted to COMM_E. */
  SDMD_reduce_energy(sys);
  add_dtime(&lc->time[SDMD_TIME_COMM_E]);
  
  MD_SYSTEM_sum_potential(sys);

  /*
  if (sys->scale_system_method == MOL_BASED_SCALE) {
    ATOM_DATA_molecular_virial(&sys->atom);
  }

  if (sys->atom.atom_ene_sample_flag) {
    ATOM_DATA_atom_ene_sample(&sys->atom);
  }
  */
}

/*
 * Zero the force of every atom this rank may accumulate into, and
 * reset the first six virial components.
 *
 * Atoms are reached in two ways:
 *   - through the atom list of every cell whose req or req_xyz flag
 *     is set;
 *   - when tr_dist_xf_flag is set, through the send_f list of every
 *     rank ("extra transfer" atoms).
 */
void SDMD_clear_force(MD_SYSTEM *sys)
{
  ATOM_DATA *atoms = &sys->atom;
  LINKED_CELL *cells = &sys->linked_cell;
  int icell, iatom, ipe, n, c;

  for (icell = 0; icell < cells->n_cell; icell++) {
    if (!(cells->cell[icell].req || cells->cell[icell].req_xyz))
      continue;

    iatom = cells->cell[icell].head;
    while (iatom >= 0) {
      atoms->f[iatom].z = 0.0;
      atoms->f[iatom].y = 0.0;
      atoms->f[iatom].x = 0.0;
      iatom = cells->next_atom[iatom];
    }
  }

  /* for extra transfer */
  if (cells->tr_dist_xf_flag) {
    for (ipe = 0; ipe < mpi.n_pe; ipe++) {
      for (n = 0; n < cells->tr_list[ipe].n_send_f; n++) {
        iatom = cells->tr_list[ipe].send_f[n];
        atoms->f[iatom].z = 0.0;
        atoms->f[iatom].y = 0.0;
        atoms->f[iatom].x = 0.0;
      }
    }
  }

  for (c = 0; c < 6; c++)
    atoms->virial[c] = 0.0;
}

/*
 * Add the force/coordinate products of all atoms in the requested
 * cells (the list starting at lc->req_head) to the virial:
 *   virial[0..2] : f.x*x.x, f.y*x.y, f.z*x.z
 *   virial[3..5] : f.x*x.y, f.x*x.z, f.y*x.z
 */
void SDMD_calc_virial(MD_SYSTEM *sys)
{
  ATOM_DATA *atoms = &sys->atom;
  LINKED_CELL *cells = &sys->linked_cell;
  int icell, ia;

  icell = cells->req_head;
  while (icell >= 0) {
    ia = cells->cell[icell].head;
    while (ia >= 0) {
      atoms->virial[0] += atoms->f[ia].x * atoms->x[ia].x;
      atoms->virial[1] += atoms->f[ia].y * atoms->x[ia].y;
      atoms->virial[2] += atoms->f[ia].z * atoms->x[ia].z;
      atoms->virial[3] += atoms->f[ia].x * atoms->x[ia].y;
      atoms->virial[4] += atoms->f[ia].x * atoms->x[ia].z;
      atoms->virial[5] += atoms->f[ia].y * atoms->x[ia].z;
      ia = cells->next_atom[ia];
    }
    icell = cells->cell[icell].req_next;
  }
}

/*
 * Complete the nine-component virial by copying components 3..5
 * into components 6..8, then apply the fixed-atom, rigid-molecule
 * and RATTLE virial corrections when those features are in use.
 */
void SDMD_correct_virial(MD_SYSTEM *sys)
{
  ATOM_DATA *atoms = &sys->atom;

  atoms->virial[6] = atoms->virial[3];
  atoms->virial[7] = atoms->virial[4];
  atoms->virial[8] = atoms->virial[5];

  if (sys->n_fixed_atom > 0) {
    MD_SYSTEM_fixed_atom_correct_virial(sys);
  }
  if (sys->rigid_mol_flag) {
    RMOL_DATA_correct_virial(&sys->rigid_mol, atoms);
  }
  if (sys->rattle.flag) {
    RATTLE_correct_virial(&sys->rattle, atoms);
  }
}

/*
 * Sum the energy table and the nine virial components over all
 * ranks; every rank ends up with the global sums stored in
 * sys->ene and sys->atom.virial.
 */
void SDMD_reduce_energy(MD_SYSTEM *sys)
{
  double ene_sum[N_ENE];
  double virial_sum[9];
  int k;

  MPI_Allreduce(sys->ene, ene_sum, N_ENE, MPI_DOUBLE, MPI_SUM, mpi.comm);
  k = 0;
  while (k < N_ENE) {
    sys->ene[k] = ene_sum[k];
    k++;
  }

  MPI_Allreduce(sys->atom.virial, virial_sum, 9, MPI_DOUBLE, MPI_SUM, mpi.comm);
  k = 0;
  while (k < 9) {
    sys->atom.virial[k] = virial_sum[k];
    k++;
  }
}

/*
 * Disabled (compiled out by "#if 0"): parallel kinetic energy and
 * temperature calculation.  The main loop calls MD_SYSTEM_calc_kene()
 * or MD_SYSTEM_calc_kene_full() instead.
 */
#if 0
void SDMD_calc_kene(MD_SYSTEM *sys)
{
  int i,iex;
  double kene = 0.0;
  double kmolv = 0.0, kmolr = 0.0, tmp;
  double tmp_arr[MAX_EX_SYSTEM+3], tmp_arr2[MAX_EX_SYSTEM+3];
  ATOM_DATA *ad;

  ad = &(sys->atom);

  sys->kene_tr = sys->kene_rot = 0.0;
  for (iex=0;iex<sys->n_ex_system;iex++) {
    sys->kene_arr[iex] = 0.0;
  }
  
  /* Accumulate w*|v|^2 of the node-local atoms, both in total and per
     extended-system group. */
  /* for (i=0;i<sys->n_flex_atom;i++) { */
  for (i = ad->node_fatom_h; i>=0; i=ad->node_fatom_n[i]) {
    tmp = ad->w[i] * Length2(ad->v[i].x, ad->v[i].y, ad->v[i].z);
    sys->kene_tr += tmp;
    
    iex = ATOM_FLAG_EX_SYSTEM(ad->ex[i].flag);
    sys->kene_arr[iex] += tmp;
    
  }
  if (sys->rigid_mol_flag)
    RMOL_DATA_kene(&sys->rigid_mol, &sys->kene_tr, &sys->kene_rot, sys->kene_arr);
  if (sys->rattle.flag)
    RATTLE_kene(&sys->rattle, &sys->atom,
		&sys->kene_tr, &sys->kene_rot, sys->kene_arr);
  
  /* Apply the factor 1/2 of the kinetic energy. */
  sys->kene_tr  *= 0.5;
  sys->kene_rot *= 0.5;
  for (i=0;i<sys->n_ex_system;i++)
    sys->kene_arr[i] *= 0.5;

  /* Pack all partial sums into one buffer so that a single
     MPI_Allreduce sums them over the ranks. */
  tmp_arr[0]=sys->kene_tr;
  tmp_arr[1]=sys->kene_rot;
  for (i=0;i<sys->n_ex_system;i++)
    tmp_arr[i+2]=sys->kene_arr[i];
  MPI_Allreduce(tmp_arr, tmp_arr2, 2+sys->n_ex_system, MPI_DOUBLE, MPI_SUM, mpi.comm);

  sys->kene_tr  =tmp_arr2[0];
  sys->kene_rot =tmp_arr2[1];
  for (i=0;i<sys->n_ex_system;i++)
    sys->kene_arr[i]=tmp_arr2[i+2];
  
  sys->kene = sys->kene_tr + sys->kene_rot;
  sys->temperature = 2.0*sys->kene/(sys->degree_of_freedom*K*KCAL);

  /* Per-group temperatures; groups without degrees of freedom get 0. */
  for (i=0;i<sys->n_ex_system;i++) {
    if (sys->degree_of_freedom_arr[i]>0)
      sys->temperature_arr[i] = 2.0*sys->kene_arr[i]/(sys->degree_of_freedom_arr[i]*K*KCAL);
    else
      sys->temperature_arr[i] = 0.0;
  }
}
#endif

/*
 * Print the total momentum (sum of w*v over atoms 0..natom-1 as
 * stored on this rank).  Always returns 0.
 */
int SDMD_calc_momentum(MD_SYSTEM *sys)
{
  ATOM_DATA *atoms = &sys->atom;
  double mom[4] = { 0.0, 0.0, 0.0, 0.0 };  /* px, py, pz, total mass */
  int ia;

  for (ia = 0; ia < atoms->natom; ia++) {
    mom[0] += atoms->v[ia].x * atoms->w[ia];
    mom[1] += atoms->v[ia].y * atoms->w[ia];
    mom[2] += atoms->v[ia].z * atoms->w[ia];
    /* the total mass is accumulated but not reported */
    mom[3] += atoms->w[ia];
  }

  lprintf("Total Momentum = (%f,%f,%f)\n",mom[0],mom[1],mom[2]);
  return 0;
}


/* migration routines */
/*
 * Tell whether atom migration has to be performed after this step.
 *
 * With a positive nonbond list update interval, migration happens
 * when (step+1) is a multiple of that interval.  With a non-positive
 * interval, migration happens on every step.
 *
 * Returns 1 when migration is due, 0 otherwise.
 */
int SDMD_check_migration(MD_SYSTEM *sys,int step)
{
  const NONBOND_LIST *nblist = &sys->nonbond;

  if (nblist->update_step <= 0)
    return 1;

  return ((step+1) % nblist->update_step == 0) ? 1 : 0;
}

/*
 * Move atoms (and rigid molecules) to the cells, and hence ranks,
 * that now contain them, then rebuild everything that depends on the
 * cell assignment.
 *
 * sys   : simulation state.
 * step  : current step index, passed on to the load balancer.
 * dt_ps : time step in ps, passed on to the load balancer.
 *
 * Outline:
 *   1. Each rank determines the new cell of its own atoms and lists
 *      the atoms whose cell changed as (atom, new_cell) pairs.
 *   2. The lists of all ranks are combined with MPI_Allgatherv so that
 *      every rank applies the same cell changes.
 *   3. Temporary transfer lists are built and used to move coordinates
 *      and velocities of atoms that changed rank, then again for
 *      rigid molecules.
 *   4. Node-local lists, internal interaction assignment, load
 *      balancing, transfer lists and the nonbond list are rebuilt.
 *
 * When Ex_System_P_flag is set and SDMD_check_resetup() reports that a
 * resetup is needed, SDMD_resetup() is called and the function returns
 * immediately.
 */
void SDMD_migration(MD_SYSTEM *sys, int step, double dt_ps)
{
  int i, j, n, jatom, jmol, old_cell, new_cell, old_pe, new_pe, ix, iy, iz;
  LINKED_CELL *lc;
  ATOM_DATA *ad;
  BOUNDARY *bc;
  BOND_DATA *bd;
  ANGLE_DATA *and;
  DIHEDRAL_DATA *dd;
  NONBOND_LIST *nl;
  EWALD *ew;
  RMOL_DATA *md;
  MPI_Status stat;
  int *recv_count, *displ, *send_buf, *recv_buf;
  int nhydr;
  VEC frac, xx;

  dtime();
  
  bc = &sys->boundary;
  lc = &sys->linked_cell; nl = &sys->nonbond; ad = &sys->atom;
  bd = &sys->bond; and = &sys->angle; dd = &sys->dihed;
  ew = &sys->ewald; md = &sys->rigid_mol;
   
  if (sys->Ex_System_P_flag) {
    BOUDNARY_bcast_boxv(bc);
    if (SDMD_check_resetup(sys)) {
      SDMD_resetup(sys);
      return;
    }
  }

  /* NOTE(review): send_buf and recv_buf hold two ints per migrating
     atom but are requested with ad->natom entries; this presumably
     relies on fewer than natom/2 atoms changing cell in one call --
     TODO confirm against get_int_buf(). */
  send_buf=get_int_buf(ad->natom);
  recv_count=get_int_buf(mpi.n_pe);
  displ=get_int_buf(mpi.n_pe);
  recv_buf=get_int_buf(ad->natom);

  for (i=0;i<mpi.n_pe;i++) {
    lc->tr_list[i].n_recv_x = 0;
    lc->tr_list[i].n_send_x = 0;
  }

  /* Step 1: find the new cell of every atom owned by this rank. */
  n=0;
  for (i=ad->node_atom_h;i>=0;i=ad->node_atom_n[i]) {
#ifdef HG_MODE
    /* child atoms follow their parent; see below */
    if (ad->ex[i].flag & ATOM_CHILD) continue;
#endif
    xx.x = ad->x[i].x - bc->min[0];
    xx.y = ad->x[i].y - bc->min[1];
    xx.z = ad->x[i].z - bc->min[2];
	
    /* fractional coordinates with respect to the box */
    frac.x = VEC_MUL_MAT_X(xx,bc->recip);
    frac.y = VEC_MUL_MAT_Y(xx,bc->recip);
    frac.z = VEC_MUL_MAT_Z(xx,bc->recip);
      
    /* periodic: wrap into [0,1); otherwise: clamp to [0,1] */
    if (bc->type == PERIODIC_BOUNDARY) {
      frac.x -= floor(frac.x);
      frac.y -= floor(frac.y);
      frac.z -= floor(frac.z);
    } else {
      if (frac.x > 1.0) frac.x = 1.0;
      if (frac.y > 1.0) frac.y = 1.0;
      if (frac.z > 1.0) frac.z = 1.0;
      if (frac.x < 0.0) frac.x = 0.0;
      if (frac.y < 0.0) frac.y = 0.0;
      if (frac.z < 0.0) frac.z = 0.0;
    }

    ix = frac.x * lc->n_grid[0];
    iy = frac.y * lc->n_grid[1];
    iz = frac.z * lc->n_grid[2];

    /* for the rare case of a rounding error (fraction equal to 1) */
    if (ix == lc->n_grid[0]) ix--;
    if (iy == lc->n_grid[1]) iy--;
    if (iz == lc->n_grid[2]) iz--;

    old_cell = lc->atom_cell[i];
    new_cell = CELL_INDEX(lc, ix, iy, iz);

    if (old_cell!=new_cell) {
      send_buf[n++] = i;
      send_buf[n++] = new_cell;
    }

#ifdef HG_MODE
    /* all child atoms connecting to the parent atom belong to the same cell. */
    for (j=ad->ex[i].child_list;j>=0;j=ad->ex[j].child_list) {
      old_cell = lc->atom_cell[j];
      if (old_cell!=new_cell) {
	send_buf[n++] = j;
	send_buf[n++] = new_cell;
      }
    }
#endif
  }

  /* Step 2: combine the (atom, new_cell) pairs of all ranks. */
  MPI_Allgather(&n,1,MPI_INT,recv_count,1,MPI_INT,mpi.comm);
  displ[0]=0;
  for (i=1;i<mpi.n_pe;i++) displ[i]=displ[i-1]+recv_count[i-1];
  MPI_Allgatherv(send_buf,n,MPI_INT,recv_buf,recv_count,displ,MPI_INT,mpi.comm);
  /* n now becomes the total number of ints received from all ranks */
  for (n=0,i=0;i<mpi.n_pe;i++) n+=recv_count[i];
  
  /* Step 3a: apply the cell changes and build the transfer lists for
     atoms whose owning rank changed. */
  for (j=0;j<n;j+=2) {
    jatom = recv_buf[j];
    
    old_cell = lc->atom_cell[jatom];
    new_cell = recv_buf[j+1];
    old_pe = lc->cell[old_cell].pe;
    new_pe = lc->cell[new_cell].pe;
    LINKED_CELL_migration(lc,jatom, old_cell, new_cell);
    
    recv_buf[j+1] = old_cell;  /* This is for rigid mol transfers */

    /* rigid-molecule atoms are handled in the second pass below and
       child atoms are transferred together with their parent */
    if ((ad->ex[jatom].flag & ATOM_RIGID) ||
	(ad->ex[jatom].flag & ATOM_CHILD)) continue; 
    
    if (old_pe != new_pe) {
      if (old_pe == mpi.rank) {
	SDMD_check_alloc_tr_list(lc, new_pe, lc->tr_list[new_pe].n_send_x);
	
	lc->tr_list[new_pe].send_x[lc->tr_list[new_pe].n_send_x]=jatom;
	lc->tr_list[new_pe].n_send_x++;

	/* In the case of the parent atom, children atoms are also transferred. */
	if (ad->ex[jatom].flag & ATOM_PARENT) {
	  for (i=ad->ex[jatom].child_list;i>=0;i=ad->ex[i].child_list) {
	    SDMD_check_alloc_tr_list(lc, new_pe, lc->tr_list[new_pe].n_send_x);

	    lc->tr_list[new_pe].send_x[lc->tr_list[new_pe].n_send_x]=i;
	    lc->tr_list[new_pe].n_send_x++;
	  }
	}
      } else if (new_pe == mpi.rank) {

	SDMD_check_alloc_tr_list(lc, old_pe, lc->tr_list[old_pe].n_recv_x);
	lc->tr_list[old_pe].recv_x[lc->tr_list[old_pe].n_recv_x]=jatom;
	lc->tr_list[old_pe].n_recv_x++;

	/* In the case of the parent atom, children atoms are also transferred. */
	if (ad->ex[jatom].flag & ATOM_PARENT) {
	  for (i=ad->ex[jatom].child_list;i>=0;i=ad->ex[i].child_list) {

	    SDMD_check_alloc_tr_list(lc, old_pe, lc->tr_list[old_pe].n_recv_x);
	    lc->tr_list[old_pe].recv_x[lc->tr_list[old_pe].n_recv_x]=i;
	    lc->tr_list[old_pe].n_recv_x++;
	  }
	}
	
      }
    }
  }
  free_buf(send_buf);
  free_buf(recv_count);
  free_buf(displ);

  /* Notice: In this time, tr_list is made for migration. */
  SDMD_dist_x(lc, ad);
  SDMD_dist_v(lc, ad);

  /* Step 3b: reuse the transfer lists for rigid molecules.  Only the
     atom flagged both ATOM_RIGID and ATOM_PARENT is considered; the
     entry stored in the list is its ex[].parent value (jmol). */
  for (i=0;i<mpi.n_pe;i++) {
    lc->tr_list[i].n_recv_x = 0;
    lc->tr_list[i].n_send_x = 0;
  }
  
  for (j=0;j<n;j+=2) {
    jatom = recv_buf[j];
    
    if (!((ad->ex[jatom].flag & ATOM_RIGID) &&
	  (ad->ex[jatom].flag & ATOM_PARENT))) continue;
    
    /* the cell change was already applied above, so the current cell
       is the new one and recv_buf[j+1] holds the old one */
    new_cell = lc->atom_cell[jatom];
    old_cell = recv_buf[j+1];
    
    old_pe = lc->cell[old_cell].pe;
    new_pe = lc->cell[new_cell].pe;

    jmol = ad->ex[jatom].parent;

    if (old_pe != new_pe) {
      if (old_pe == mpi.rank) {
	SDMD_check_alloc_tr_list(lc, new_pe, lc->tr_list[new_pe].n_send_x);
	lc->tr_list[new_pe].send_x[lc->tr_list[new_pe].n_send_x]=jmol;
	lc->tr_list[new_pe].n_send_x++;
      } else if (new_pe == mpi.rank) {
	SDMD_check_alloc_tr_list(lc, old_pe, lc->tr_list[old_pe].n_recv_x);
	lc->tr_list[old_pe].recv_x[lc->tr_list[old_pe].n_recv_x]=jmol;
	lc->tr_list[old_pe].n_recv_x++;
      }
    }
  }
  free_buf(recv_buf);

  SDMD_dist_rmol(lc, md);
  
  /* Step 4: rebuild the node-local lists and interaction assignment. */
  SDMD_assign_node_atom(lc, ad);
  SDMD_assign_node_rmol(md, ad);
  SDMD_assign_node_rattle(&sys->rattle, ad);

  RMOL_DATA_mol_to_room(md, ad);

  SDMD_clear_cell_req_internal(lc,ad);

  SDMD_assign_internal_to_PE(lc, ad, bd, and, dd, ew);

  add_dtime(&lc->time[SDMD_TIME_MIGRATION]);

  SDMD_load_balancer(lc, step, dt_ps);

  add_dtime(&lc->time[SDMD_TIME_LB]);

  /* Rebuild the regular transfer lists and distribute coordinates. */
  if (lc->tr_mode == TR_MODE_ES ||
      lc->tr_mode == TR_MODE_MP) {
    SDMD_make_tr_list_xyz(lc, ad); 
  } else {
    SDMD_make_tr_list_by_cell_req(lc, ad);
  }

  if (lc->tr_mode == TR_MODE_ES ||
      lc->tr_mode == TR_MODE_MP) {
    SDMD_dist_x_xyz(lc, ad);
  }
  if (sys->linked_cell.tr_dist_xf_flag)
    SDMD_dist_x(lc, ad);

  /* calculate tr_x */
  LINKED_CELL_calc_tr_x(lc, ad, bc);

  add_dtime(&lc->time[SDMD_TIME_MIGRATION]);

  /*
  if (SDMD_make_nonbond_list(lc, nl, ad, bc, 0) < 0) {
    SDMD_alloc_nonbond_list(lc, nl, ad, bc);
    SDMD_make_nonbond_list(lc, nl, ad, bc, 0);
  }
  */
#if 1
  /* A negative return value triggers a (re)allocation of the list
     followed by a second attempt. */
  if (LINKED_CELL_make_nonbond_list(lc, nl, ad, bc, 0) < 0) {
    LINKED_CELL_alloc_nonbond_list(lc, nl, ad, bc);
    LINKED_CELL_make_nonbond_list(lc, nl, ad, bc, 0);
  }
#endif

#ifdef TR_LIST_ATOM_REQ
  SDMD_make_tr_list_by_atom_req(lc, ad);
#endif

  add_dtime(&lc->time[SDMD_TIME_NBLIST]);

  /*
  MPI_Barrier(mpi.comm);
  add_dtime(&lc->time[SDMD_TIME_RATTLE]);
  */
}

/*
 * Prepare the load balancer.
 *
 * lc : linked-cell data; lc->lb, the cell-pair timers, lc->idle_time
 *      and lc->lb_cycle are (re)initialized.
 * nl : nonbond list; its update_step determines the measurement cycle.
 *
 * On the master rank the LOAD_BALANCER work arrays are allocated on
 * the first call and enlarged on later calls when the number of cells
 * or cell pairs has grown.  The allocated sizes are remembered in
 * function-local statics.
 *
 * lc->lb_cycle is the number of load balancer calls that make up one
 * measurement period (about 100 MD steps).  A non-positive
 * update_step means "migrate on every step" (see
 * SDMD_check_migration()), so it is treated as an interval of 1 here
 * instead of dividing by zero.
 */
void SDMD_load_balancer_setup(LINKED_CELL *lc, NONBOND_LIST *nl)
{
  int i;
  int update_interval;
  static int n_alloc_cell = 0, n_alloc_cell_pair = 0;
  LOAD_BALANCER *lb;
  char *func = "SDMD_load_balancer_setup";

  if (mpi.master) {
    if (n_alloc_cell==0) {
      /* first call: allocate everything */
      lc->lb = lb = emalloc(func, sizeof(LOAD_BALANCER));
      lb->pe = emalloc(func, sizeof(LB_PE)*mpi.n_pe);
      for (i=0;i<mpi.n_pe;i++)
	lb->pe[i].cell = emalloc(func, sizeof(LB_CELL)*lc->n_cell);
      lb->cell_pair = emalloc(func, sizeof(LB_CELL_PAIR)*lc->n_cell_pair);
      
      n_alloc_cell=lc->n_cell;
      n_alloc_cell_pair = lc->n_cell_pair;
    } else {
      /* second or later call: grow the arrays only when needed */
      lb = lc->lb;
      if (n_alloc_cell<lc->n_cell) {
	for (i=0;i<mpi.n_pe;i++)
	  lb->pe[i].cell = erealloc(func, lb->pe[i].cell, sizeof(LB_CELL)*lc->n_cell);
	n_alloc_cell = lc->n_cell;
      }
      if (n_alloc_cell_pair<lc->n_cell_pair) {
	lb->cell_pair = erealloc(func, lb->cell_pair, sizeof(LB_CELL_PAIR)*lc->n_cell_pair);
	n_alloc_cell_pair = lc->n_cell_pair;
      }
    }
  }
  
  for (i=0;i<lc->n_cell_pair;i++) {
    lc->cell_pair[i].time = 0.0;
  }
  lc->idle_time = 0.0;

  /* Guard against update_step <= 0 (migration on every step), which
     would otherwise be an integer division by zero. */
  update_interval = (nl->update_step > 0) ? nl->update_step : 1;
  lc->lb_cycle = 100/update_interval;
  /* lc->lb_cycle = 5; */
}

/*
 * Advance the load balancer state machine by one call.
 *
 * lc->load_balancer_exec_step holds the state:
 *   0 : idle, nothing is done;
 *   1 : initialization, start the first measurement (next state 2);
 *   2 : after lb_cycle calls, measure and balance with mode 0
 *       (next state 3);
 *   3 : after lb_cycle calls, measure and balance with mode 1
 *       (next state 4);
 *   4 : after lb_cycle calls, measure only, then return to idle;
 *   5 : restart during the simulation, start a measurement
 *       (next state 3).
 * Any other value is ignored.
 *
 * Balancing in states 2 and 3 is performed only when
 * lc->load_balancer_flag is set.  Every state transition restarts the
 * measurement (timers cleared, call counter reset, step remembered).
 */
void SDMD_load_balancer(LINKED_CELL *lc, int step, double dt_ps)
{
  static int n_calls=0, step_at_start, n_extra=0;
  static double t_start;
  const int state = lc->load_balancer_exec_step;
  int next_state;

  if (state < 1 || state > 5)
    return;  /* idle or unknown state: do nothing */

  if (state == 1 || state == 5) {
    /* only (re)start a measurement */
    next_state = (state == 1) ? 2 : 3;
  } else {
    /* states 2..4: wait until the measurement period is complete */
    if (++n_calls < lc->lb_cycle)
      return;

    SDMD_load_balancer_measure_time(lc, t_start, step-step_at_start, dt_ps);

    if (state == 4) {
      /* final measurement; n_extra allows repeating this state */
      if (n_extra<0) {
        next_state = 4;
        n_extra++;
      } else {
        next_state = 0;
        n_extra=0;
      }
    } else {
      if (lc->load_balancer_flag) {
        SDMD_load_balancer_gather(lc);
        SDMD_load_balancer_exec(lc, (state == 3) ? 1 : 0);
      }
      next_state = state + 1;
    }
  }

  /* common tail: begin a fresh measurement period */
  step_at_start = step;
  SDMD_load_balancer_clear_time(lc, &t_start);
  n_calls = 0;
  lc->load_balancer_exec_step = next_state;
}

/*
 * Start a new load balancer measurement: zero all cell-pair timers
 * and the idle time, synchronize the ranks and store the current
 * wall-clock time in *check.
 */
void SDMD_load_balancer_clear_time(LINKED_CELL *lc, double *check)
{
  int ipair = 0;

  while (ipair < lc->n_cell_pair) {
    lc->cell_pair[ipair].time = 0.0;
    ipair++;
  }
  lc->idle_time = 0.0;

  MPI_Barrier(mpi.comm);
  *check = MPI_Wtime();
}

/*
 * Finish a load balancer measurement.
 *
 * After a barrier, lc->total_time is set to the wall-clock time
 * elapsed since check, and lc->nb_time to the sum of the timers of
 * the cell pairs in the list starting at lc->pair_head.  With OpenMP
 * each of those timers is first divided by the number of threads.
 * Unless dstep is zero, the time per step and the resulting ns/day
 * are printed.
 */
void SDMD_load_balancer_measure_time(LINKED_CELL *lc, double check,
                                     int dstep, double dt_ps)
{
  int ipair;

  MPI_Barrier(mpi.comm);
  lc->total_time = MPI_Wtime()-check;

  lc->nb_time = 0.0;
  ipair = lc->pair_head;
  while (ipair >= 0) {
#ifdef _OPENMP
    lc->cell_pair[ipair].time /= mpi.n_threads;
#endif
    lc->nb_time += lc->cell_pair[ipair].time;
    ipair = lc->cell_pair[ipair].next;
  }

  if (dstep != 0) {
    lprintf("Benchmark Time [sec/step] = %f, ns/day = %f\n",
            lc->total_time/dstep, dstep*dt_ps*(60*60*24)/lc->total_time/1000);
  }
}

/*
 * Sanity-check a measured time and repair it in place.
 *
 * Nothing is done unless lc->check_time_overflow is set.  Values
 * below -2100 or above 2100 are assumed to come from a 32-bit
 * microsecond counter overflow and are shifted by 2^31 microseconds
 * until they fall inside that range.  A value that is still negative
 * afterwards is reset to zero.  Every correction is reported with the
 * given comment as the name of the measurement.
 */
void SDMD_check_time_overflow(LINKED_CELL *lc, double *time, char *comment)
{
  /* 2^31 microseconds expressed in seconds */
  const double wrap_sec = (double) ((unsigned long) 1 << 31) * 1.0e-6;

  if (!lc->check_time_overflow)
    return;

  for (; *time < -2100.0; ) {
    lprintf("Warning: The measured time %f of %s is largely negative!\n", *time, comment);
    *time += wrap_sec;
    lprintf("Warning: The time is corrected to %f assuming 32 bit overflow.\n",
            *time);
  }

  for (; *time > 2100.0; ) {
    lprintf("Warning: The measured time %f of %s is largely positive!\n", *time, comment);
    *time -= wrap_sec;
    lprintf("Warning: The time is corrected to %f assuming 32 bit overflow.\n", *time);
  }

  if (*time < 0.0) {
    lprintf("Warning: The measured time %f of %s is negative! Reseted to zero.\n",
            *time, comment);
    *time = 0.0;
  }
}

/*
 * Gather the load-balancer input data on the master rank.
 *
 * Non-master ranks send, in this order:
 *   tag 5: the measured times of the cell pairs they own (walking the
 *          list that starts at lc->pair_head),
 *   tag 6: the req flags of all lc->n_cell cells,
 *   tag 7: two doubles, { background time, nonbond time }, where the
 *          background time is total_time - idle_time - nb_time.
 *
 * The master receives those messages from every other rank and fills
 *   lc->cell_pair[i].time / lb->cell_pair[i].{time,icp}  for all pairs,
 *   lb->pe[rank].cell[i].req, lb->pe[rank].bg_time, lb->pe[rank].nb_time
 * for all ranks, passing each time through SDMD_check_time_overflow().
 *
 * The received pair times are matched to cell pairs by scanning the pairs
 * in index order per owner rank, so the senders must emit them in that
 * same order.
 */
void SDMD_load_balancer_gather(LINKED_CELL *lc)
{
  int i, rank;
  LOAD_BALANCER *lb;
  MPI_Status stat;
  double time[3];
  double *send_buf, *recv_buf;
  int *recv_count, *dspl, *send_ibuf, *recv_ibuf;
  
  lb = lc->lb;
  if (mpi.master) {
    /* receive cell_pair_time */
    recv_buf=get_double_buf(lc->n_cell_pair);
    recv_count=get_int_buf(mpi.n_pe);
    dspl=get_int_buf(mpi.n_pe);

    /* number of pairs owned by each rank and the resulting offsets */
    for (i=0;i<mpi.n_pe;i++) recv_count[i]=0;
    for (i=0;i<lc->n_cell_pair;i++) {
      recv_count[lc->cell_pair[i].pe]++;
    }
    dspl[0]=0;
    for (i=1;i<mpi.n_pe;i++) dspl[i]=dspl[i-1]+recv_count[i-1];
    for (i=0;i<mpi.n_pe;i++) {
      if (i!=mpi.rank) {
        MPI_Recv(&recv_buf[dspl[i]],recv_count[i],MPI_DOUBLE, i, 5, mpi.comm, &stat);
      }
    }

    /* recv_count is reused as a per-rank read cursor into recv_buf */
    for (i=0;i<mpi.n_pe;i++) recv_count[i]=0;
    for (i=0;i<lc->n_cell_pair;i++) {
      rank=lc->cell_pair[i].pe;
      if (rank!=mpi.rank) {
        lc->cell_pair[i].time = recv_buf[dspl[rank]+recv_count[rank]];
        lb->cell_pair[i].time = lc->cell_pair[i].time;
        recv_count[rank]++;
      } else {
        /* pairs owned by the master already hold their measured time */
        lb->cell_pair[i].time = lc->cell_pair[i].time;
      }
      lb->cell_pair[i].icp = i;
      
      /* for time measurement error */
      SDMD_check_time_overflow(lc, &(lb->cell_pair[i].time), "cp");
      /* end of correction of time measurement error */
    }
    free_buf(recv_buf);
    free_buf(recv_count);
    free_buf(dspl);
    
    /* receive cell data */
    recv_ibuf = get_int_buf(lc->n_cell);
    for (rank=0;rank<mpi.n_pe;rank++) {
      if (rank==mpi.rank) {
        for (i=0;i<lc->n_cell;i++) {
          lb->pe[rank].cell[i].req = lc->cell[i].req;
        }
        lb->pe[rank].bg_time = lc->total_time - lc->idle_time - lc->nb_time;
        lb->pe[rank].nb_time = lc->nb_time;
      } else {
        MPI_Recv(recv_ibuf, lc->n_cell, MPI_INT, rank, 6, mpi.comm, &stat);
        for (i=0;i<lc->n_cell;i++) {
          lb->pe[rank].cell[i].req = recv_ibuf[i];
        }
        MPI_Recv(time, 2, MPI_DOUBLE, rank, 7, mpi.comm, &stat);
        lb->pe[rank].bg_time = time[0];
        lb->pe[rank].nb_time = time[1];
      }
      SDMD_check_time_overflow(lc,&(lb->pe[rank].bg_time), "bg");
      SDMD_check_time_overflow(lc,&(lb->pe[rank].nb_time), "nb");
    }
    free_buf(recv_ibuf);
    
  } else {
    /* send cell pair data */
    int n_send;
    send_buf=get_double_buf(lc->n_cell_pair);
    n_send=0;
    for (i=lc->pair_head;i>=0;i=lc->cell_pair[i].next) {
      send_buf[n_send]=lc->cell_pair[i].time;
      n_send++;
    }
    MPI_Send(send_buf, n_send, MPI_DOUBLE, mpi.master_pe, 5, mpi.comm);
    /* MPI_Send is blocking, so the buffer can be released as soon as it
       returns; previously this buffer was never given back. */
    free_buf(send_buf);

    /* send cell data */
    send_ibuf=get_int_buf(lc->n_cell);
    for (i=0;i<lc->n_cell;i++) {
      send_ibuf[i] = lc->cell[i].req;
    }
    MPI_Send(send_ibuf, lc->n_cell, MPI_INT, mpi.master_pe, 6, mpi.comm);
    time[0]=lc->total_time - lc->idle_time - lc->nb_time;
    time[1]=lc->nb_time;
    MPI_Send(time, 2, MPI_DOUBLE, mpi.master_pe, 7, mpi.comm);
    free_buf(send_ibuf);
  }
}

/*
 * Comparison callback ordering LB_CELL_PAIR entries by decreasing time:
 * returns -1 when cp1 is the more expensive pair, 1 when cp2 is, and 0
 * when neither time is greater than the other.
 */
int SDMD_cell_pair_cmp(LB_CELL_PAIR *cp1,const LB_CELL_PAIR *cp2)
{
  return (cp1->time < cp2->time) - (cp1->time > cp2->time);
}

/*
 * Run one load-balancing pass and install the resulting cell-pair
 * ownership on every rank.  Must be called by all ranks of mpi.comm
 * (it contains a broadcast).
 *
 *   method : 0 = first redistribute all cell pairs from scratch with
 *            SDMD_load_balancer_all(), then refine; any other value =
 *            refine the current assignment only.
 *
 * On the master the measured per-rank loads (bg_time + nb_time) are
 * summarized, the assignment is (re)computed, and the predicted loads are
 * summarized again.  The owner rank of every cell pair is then broadcast
 * from the master.  Finally each rank rebuilds, from the new ownership,
 *   - the CELL_REQ_NONBOND flag of its cells,
 *   - the linked list of owned pairs (lc->pair_head / cell_pair[].next),
 *   - the index array lc->cell_pair_req / lc->n_cell_pair_req.
 *
 * In the two log lines a rank index of -1 means that no rank satisfied
 * the min/max search (e.g. all loads are zero).
 */
void SDMD_load_balancer_exec(LINKED_CELL *lc, int method)
{
  LOAD_BALANCER *lb;
  int *tr_buf, i, rank;
  int prev = -1;
  double ave_load;
  int max_pe = -1, min_pe = -1;
  double max_load, min_load;
  int ret_multirefine;

  tr_buf=get_int_buf(lc->n_cell_pair);
  lb=lc->lb;
  if (mpi.master) {
    /* calculate ave_load */
    ave_load = 0.0;
    for (rank=0;rank<mpi.n_pe;rank++) {
      ave_load += lb->pe[rank].bg_time + lb->pe[rank].nb_time;
    }
    ave_load /= mpi.n_pe;

    /* lightest and heaviest rank according to the measured times */
    max_load=0.0; min_load=1.0e10;
    for (i=0;i<mpi.n_pe;i++) {
      if (lb->pe[i].bg_time+lb->pe[i].nb_time > max_load) {
        max_load = lb->pe[i].bg_time+lb->pe[i].nb_time;
        max_pe = i;
      }
      if (lb->pe[i].bg_time+lb->pe[i].nb_time < min_load) {
        min_load = lb->pe[i].bg_time+lb->pe[i].nb_time;
        min_pe = i;
      }
    }
    lprintf("load_balance(measured):   min %f(%d), max %f(%d), ave %f\n",
            min_load, min_pe, max_load, max_pe, ave_load);

    if (method == 0) {
      SDMD_load_balancer_all(lc, 1.2, ave_load);
    }
    
    ret_multirefine = SDMD_load_balancer_multirefine(lc, lc->start_overload,
                                                     lc->max_overload,
                                                     ave_load);

    if (!ret_multirefine) {
      lprintf("Load Balancing Failure. Conditions in previous steps are used.\n");
    }
    
    /* lightest and heaviest rank according to the predicted loads */
    max_load=0.0; min_load=1.0e10;
    max_pe = min_pe = -1;
    for (i=0;i<mpi.n_pe;i++) {
      if (lb->pe[i].cur_load > max_load) {
        max_load = lb->pe[i].cur_load;
        max_pe = i;
      }
      if (lb->pe[i].cur_load < min_load) {
        min_load = lb->pe[i].cur_load;
        min_pe = i;
      }
    }
    lprintf("load_balance(prediction): min %f(%d), max %f(%d), ave %f\n",
            min_load, min_pe, max_load, max_pe, ave_load);

    /* send pe data */
    for (i=0;i<lc->n_cell_pair;i++) {
      tr_buf[i] = lc->cell_pair[i].pe;
    }
    MPI_Bcast(tr_buf, lc->n_cell_pair, MPI_INT, mpi.master_pe, mpi.comm);
  } else {
    /* other nodes */
    MPI_Bcast(tr_buf, lc->n_cell_pair, MPI_INT, mpi.master_pe, mpi.comm);
    for (i=0;i<lc->n_cell_pair;i++) {
      lc->cell_pair[i].pe = tr_buf[i];
    }
  }

  /* nonbond requirements are rebuilt from the new ownership below */
  for (i=0;i<lc->n_cell;i++) {
    lc->cell[i].req &= ~CELL_REQ_NONBOND;
  }
  
  lc->pair_head = -1;
  lc->n_cell_pair_req=0;

  for (i=0;i<lc->n_cell_pair;i++) {
    if (lc->cell_pair[i].pe == mpi.rank) {
      lc->cell[lc->cell_pair[i].i].req |= CELL_REQ_NONBOND;
      lc->cell[lc->cell_pair[i].j].req |= CELL_REQ_NONBOND;

      /* append pair i to the list of pairs owned by this rank;
         prev is only dereferenced once a head exists */
      lc->cell_pair[i].next = -1;
      if (lc->pair_head == -1) {
        lc->pair_head=i;
      } else {
        lc->cell_pair[prev].next = i;
      }
      prev = i;

      lc->cell_pair_req[lc->n_cell_pair_req]=i;
      lc->n_cell_pair_req++;
      
    }
  }
  free_buf(tr_buf);
}

/*
 * Score how well the cell pair (ic, jc) fits on `rank`; a larger value
 * means a better fit.  Three scoring schemes exist, selected at compile
 * time; the middle one is the active one:
 *
 *   3  rank already requires both cells
 *   2  rank already requires exactly one of them
 *   1  rank requires neither
 *
 * "Requires" means that lb->pe[rank].cell[].req is non-zero.
 */
int SDMD_load_balancer_affinity(LINKED_CELL *lc, LOAD_BALANCER *lb,
                                int rank, int ic, int jc)
{
  int score;
#if 0
  /* disabled: home cells (level 2) outrank merely required cells (level 1) */
  static const int score_of[3][3] = { {1, 2, 3}, {2, 4, 5}, {3, 5, 6} };
  int i_level, j_level;

  i_level = (lb->pe[rank].cell[ic].req & CELL_REQ_HOME) ? 2
          : (lb->pe[rank].cell[ic].req ? 1 : 0);
  j_level = (lb->pe[rank].cell[jc].req & CELL_REQ_HOME) ? 2
          : (lb->pe[rank].cell[jc].req ? 1 : 0);
  score = score_of[i_level][j_level];
#elif 1
  /* active: one point per cell the rank already requires, on top of 1 */
  const int i_held = (lb->pe[rank].cell[ic].req != 0);
  const int j_held = (lb->pe[rank].cell[jc].req != 0);

  score = 1 + i_held + j_held;
#else
  /* for SC ...  (disabled: also rewards cells owned within the same
     group of four consecutive ranks) */
  const int i_held = (lb->pe[rank].cell[ic].req != 0);
  const int j_held = (lb->pe[rank].cell[jc].req != 0);
  const int i_near = (lc->cell[ic].pe / 4 == rank / 4);
  const int j_near = (lc->cell[jc].pe / 4 == rank / 4);

  if (i_held && j_held)
    score = 6;
  else if (i_held)
    score = j_near ? 5 : 4;
  else if (j_held)
    score = i_near ? 5 : 4;
  else
    score = 1 + i_near + j_near;
#endif
  return score;
}

/*
 * Assign every cell pair to a rank from scratch (greedy).
 *
 * Each rank starts with its background time as load and with no
 * CELL_REQ_NONBOND flags.  The cell pairs are sorted with
 * SDMD_cell_pair_cmp (most expensive first) and placed one by one on the
 * rank with the highest score, where the score is the affinity of the
 * pair to the rank plus 6 if the pair fits under ave_load * over_load.
 * Ties go to the rank with the smaller current load (the lowest rank
 * index wins if the loads are equal too).
 *
 * Results: lc->cell_pair[].pe, lb->pe[].cur_load and the
 * CELL_REQ_NONBOND bits in lb->pe[].cell[].req.
 */
void SDMD_load_balancer_all(LINKED_CELL *lc, double over_load, double ave_load)
{
  LOAD_BALANCER *lb = lc->lb;
  int n, pe;

  /* initial state of every PE: background load only, no nonbond cells */
  for (pe = 0; pe < mpi.n_pe; pe++) {
    int c;

    lb->pe[pe].cur_load = lb->pe[pe].bg_time;
    for (c = 0; c < lc->n_cell; c++)
      lb->pe[pe].cell[c].req &= ~CELL_REQ_NONBOND;
  }

  qsort(lb->cell_pair, lc->n_cell_pair, sizeof(LB_CELL_PAIR), 
        (int (*) (const void *, const void *)) SDMD_cell_pair_cmp);

  for (n = 0; n < lc->n_cell_pair; n++) {
    const int icp = lb->cell_pair[n].icp;
    const double cost = lb->cell_pair[n].time;
    const int ic = lc->cell_pair[icp].i;
    const int jc = lc->cell_pair[icp].j;
    int best_pe = -1;
    int best_score = 0;
    double best_load = 1.0e10;

    for (pe = 0; pe < mpi.n_pe; pe++) {
      int score = SDMD_load_balancer_affinity(lc, lb, pe, ic, jc);

      /* fitting under the overload limit outweighs any affinity */
      if (lb->pe[pe].cur_load + cost <= ave_load * over_load)
        score += 6;

      if (score > best_score ||
          (score == best_score && lb->pe[pe].cur_load < best_load)) {
        best_pe = pe;
        best_score = score;
        best_load = lb->pe[pe].cur_load;
      }
    }

    if (best_pe < 0) {
      lprintf("Internal Error! cur_rank < 0\n");
      marble_abort(1);
    }

    /* commit the placement */
    lc->cell_pair[icp].pe = best_pe;
    lb->pe[best_pe].cur_load += cost;
    lb->pe[best_pe].cell[ic].req |= CELL_REQ_NONBOND;
    lb->pe[best_pe].cell[jc].req |= CELL_REQ_NONBOND;
  }
}

/*
 * Call SDMD_load_balancer_refine() with a gradually relaxed overload
 * factor: start at min_over_load and add 0.01 after every failed attempt.
 *
 * Returns 1 as soon as one attempt succeeds, or 0 when an attempt fails
 * with a factor that already exceeds max_over_load.
 */
int SDMD_load_balancer_multirefine(LINKED_CELL *lc, double min_over_load,
                                   double max_over_load,
                                   double ave_load)
{
  double factor = min_over_load;

  for (;;) {
    if (SDMD_load_balancer_refine(lc, factor, ave_load))
      break;

    if (factor > max_over_load) {
      lprintf("LOAD BALANCE: overload becomes more than %f\n", max_over_load);
      return 0;
    }
    factor += 0.01;
  }

  lprintf("LOAD BALANCE: Refining overload %f\n", factor);
  return 1;
}

/*
 * Refine the current cell-pair assignment until no rank is loaded above
 * ave_load * over_load.
 *
 * The predicted load of a rank is its bg_time plus the times of the cell
 * pairs it owns.  Repeatedly, the heaviest rank gives one of its cell
 * pairs to another rank that stays under the limit after receiving it.
 * Among all (pair, receiver) candidates the one with the highest affinity
 * is taken; ties are broken in favour of the cheaper pair.  Pairs with a
 * time of exactly 0 are never moved.
 *
 * Returns 1 on success (lc->cell_pair[].pe holds the refined assignment,
 * lb->pe[].cur_load the predicted loads).  Returns 0 if no movable pair is
 * found or the iteration limit is hit; in that case lc->cell_pair[].pe is
 * restored to the assignment it had on entry.
 *
 * Note: lb->cell_pair[].icp is overwritten here (used as backup storage
 * for the owner rank).
 */
int SDMD_load_balancer_refine(LINKED_CELL *lc, double over_load, double ave_load)
{
  int i, rank, max_pe, cur_cell_pair, cur_affinity;
  int ic, jc, new_pe, affinity;
  unsigned long count;
  double max_load, cp_time, cur_load;
  LOAD_BALANCER *lb;

  lb = lc->lb;
  /* calculation of nb_time for each PE */
  for (rank=0;rank<mpi.n_pe;rank++) lb->pe[rank].nb_time=0.0;
  for (i=0;i<lc->n_cell_pair;i++) {
    rank=lc->cell_pair[i].pe;
    lb->pe[rank].nb_time += lc->cell_pair[i].time;
  }
  
  /* initialization of initial load including nb_time for each PE */
  for (rank=0;rank<mpi.n_pe;rank++) {
    lb->pe[rank].cur_load = lb->pe[rank].bg_time + lb->pe[rank].nb_time;
  }
  
  /* initialization of cell req flags */
  for (rank=0;rank<mpi.n_pe;rank++) {
    for (i=0;i<lc->n_cell;i++) {
      lb->pe[rank].cell[i].nb=0;
      lb->pe[rank].cell[i].req &= ~CELL_REQ_NONBOND;
    }
  }
  /* nb counts how many owned pairs touch a cell, so that the NONBOND flag
     can be dropped when the last such pair leaves the rank */
  for (i=0;i<lc->n_cell_pair;i++) {
    ic=lc->cell_pair[i].i;
    jc=lc->cell_pair[i].j;
    rank=lc->cell_pair[i].pe;
    lb->cell_pair[i].icp = rank;  /* for backup */
    lb->pe[rank].cell[ic].nb++;
    lb->pe[rank].cell[ic].req |= CELL_REQ_NONBOND;
    lb->pe[rank].cell[jc].nb++;
    lb->pe[rank].cell[jc].req |= CELL_REQ_NONBOND;
  }

  count = 0;
  while (1) {
    count++;
    /* find the heaviest PE */
    max_load = 0.0;
    max_pe = -1;
    for (rank=0;rank<mpi.n_pe;rank++) {
      if (max_load < lb->pe[rank].cur_load) {
        max_load = lb->pe[rank].cur_load;
        max_pe = rank;
      }
    }

    if (max_load <= ave_load * over_load) return 1;

    /* search the best (cell pair, receiving PE) combination; if max_pe is
       still -1 here no pair matches and the failure path below is taken */
    cur_load = 1.0e10;
    cur_cell_pair = -1;
    cur_affinity = 0;
    new_pe = -1;
    for (i=0;i<lc->n_cell_pair;i++) {
      if (lc->cell_pair[i].pe != max_pe) continue;
      
      cp_time=lc->cell_pair[i].time;
      if (cp_time == 0.0) continue;
      
      ic=lc->cell_pair[i].i;
      jc=lc->cell_pair[i].j;
        
      for (rank=0;rank<mpi.n_pe;rank++) {
        if (rank == max_pe) continue;
        if (lb->pe[rank].cur_load + cp_time > ave_load * over_load) continue;
        
        /* calculation of affinity */
        affinity = SDMD_load_balancer_affinity(lc, lb, rank, ic, jc);

        /* select or not */
        if (cur_affinity < affinity) {
          cur_cell_pair = i;
          cur_affinity = affinity;
          cur_load = cp_time;
          new_pe = rank;
        } else if (cur_affinity == affinity) {
          if (cur_load > cp_time) {
            cur_cell_pair = i;
            cur_load = cp_time;
            new_pe = rank;
          }
        }
      }
    }
    
    if (cur_cell_pair<0 ||
        count >= 1000000000) {
      /* can't find cell pair */
      /* restore backup data */
      for (i=0;i<lc->n_cell_pair;i++) {
        lc->cell_pair[i].pe = lb->cell_pair[i].icp;
      }
      /* count is unsigned long, so it must be printed with %lu */
      lprintf("No solution in overload %f. count = %lu\n", over_load, count);
      return 0;   /* fail return */
    }

    /* move the selected pair from max_pe to new_pe */
    lc->cell_pair[cur_cell_pair].pe = new_pe;
    lb->pe[max_pe].cur_load -= cur_load;
    lb->pe[new_pe].cur_load += cur_load;
    
    ic = lc->cell_pair[cur_cell_pair].i;
    jc = lc->cell_pair[cur_cell_pair].j;
    lb->pe[max_pe].cell[ic].nb--;
    lb->pe[max_pe].cell[jc].nb--;
    
    if (lb->pe[max_pe].cell[ic].nb == 0)
      lb->pe[max_pe].cell[ic].req &= ~CELL_REQ_NONBOND;
    if (lb->pe[max_pe].cell[jc].nb == 0)
      lb->pe[max_pe].cell[jc].req &= ~CELL_REQ_NONBOND;
    
    lb->pe[new_pe].cell[ic].nb++;
    lb->pe[new_pe].cell[jc].nb++;
    lb->pe[new_pe].cell[ic].req |= CELL_REQ_NONBOND;
    lb->pe[new_pe].cell[jc].req |= CELL_REQ_NONBOND;
  }
}

/*
 * Non-list version of the smoothed nonbond routine.
 *
 * For every cell pair owned by this rank (list starting at
 * lc->pair_head) the atoms of the i-cell partition are scanned against
 * all atoms of the j-cell, skipping excluded pairs and pairs whose squared
 * distance is not below nl->rl_off2, and the surviving j indices are
 * written to a scratch pair list.  The time spent on each cell pair is
 * added to lc->cell_pair[icp].time.
 *
 * The actual energy/force evaluation is compiled out (#if 0), so in the
 * current state *vdw, *elec and *hbond are only reset to 0 and no forces
 * are accumulated.
 *
 * Side effects: rewrites lc->cell[].fold_id, ad->fold_x, ad->fold_id and
 * ad->ex[].id.
 *
 * NOTE(review): the scratch pair list has a fixed capacity of 10000
 * entries and is not bounds-checked; this assumes no cell holds more
 * atoms than that.
 */
void SDMD_nonbond_energy_force_smooth(LINKED_CELL *lc, NONBOND_LIST *nl,
                                      ATOM_DATA *ad, BOUNDARY *bc,
                                      double *vdw, double *elec, double *hbond)
{
  int i,j,k;
  int vdw_index;
  int start, end;
  double dx, dy, dz;
  double len, len2, len6, len12;
  double vdw12, vdw6, hb12, hb10, force, ene_tmp, elec_tmp;
  VEC offset_v;
  int icp;
  int icell, jcell;

  /* for smoothing */
  double S, dS, len_rl_on, rl_tmp, rl_diff3;
  double Se, dSe;

  double cp_start_time;
  
  int n_atom, npart, ipart, mod;
  int min_iatom, max_iatom, iatom, min_jatom, max_jatom, jatom;
  int ip;
  static int *pairlist=NULL;
  VEC *x_iatom, *x_jatom;
  
  rl_tmp = 3.0 * nl->rl_off - nl->rl_on;
  rl_diff3 = pow(nl->rl_off - nl->rl_on, 3.0);
  
  *vdw = *elec = *hbond = 0.0;
  S=1.0; dS=0.0; Se=1.0; dSe=0.0;
  
  /* scratch list, allocated once and kept for later calls */
  if (!pairlist) pairlist=emalloc("",sizeof(int)*10000);

  /* pack the folded coordinates cell by cell so that the atoms of one
     cell are contiguous in ad->fold_x / ad->fold_id */
  ip=0;
  for (j=0;j<lc->n_cell;j++) {
    lc->cell[j].fold_id = ip;
    for (i=lc->cell[j].head;i>=0;i=lc->next_atom[i]) {
      
      ad->fold_x[ip].x = ad->x[i].x - VEC_MUL_MAT_X(ad->tr_x[i],bc->boxv);
      ad->fold_x[ip].y = ad->x[i].y - VEC_MUL_MAT_Y(ad->tr_x[i],bc->boxv);
      ad->fold_x[ip].z = ad->x[i].z - VEC_MUL_MAT_Z(ad->tr_x[i],bc->boxv);
      
      ad->fold_id[ip++] = i;
      ad->ex[i].id = -1;
    }
  }

  for (icp=lc->pair_head;icp>=0;icp=lc->cell_pair[icp].next) {

    cp_start_time = MPI_Wtime();
    
    offset_v=bc->offset_v[lc->cell_pair[icp].offset];
    
    icell = lc->cell_pair[icp].i;
    jcell = lc->cell_pair[icp].j;

    /* for partition: this cell pair handles slice ipart of npart nearly
       equal slices of the i-cell atoms */
    n_atom = lc->cell[icell].n_atom;
    npart = lc->cell_pair[icp].npart;
    ipart = lc->cell_pair[icp].ipart;
    mod = n_atom % npart;
    if (mod <= ipart)
      min_iatom = n_atom / npart * ipart + mod;
    else
      min_iatom = (n_atom / npart + 1 ) * ipart;
    if (mod <= ipart+1)
      max_iatom = n_atom / npart * (ipart+1) + mod - 1;
    else
      max_iatom = (n_atom / npart + 1) * (ipart+1) - 1;
    /* end of partition */

    min_iatom += lc->cell[icell].fold_id;
    max_iatom += lc->cell[icell].fold_id;
    
    min_jatom = lc->cell[jcell].fold_id;
    max_jatom = lc->cell[jcell].fold_id + lc->cell[jcell].n_atom-1;

    x_iatom = &(ad->fold_x[min_iatom]);
    for (iatom = min_iatom; iatom <= max_iatom; iatom++, x_iatom++) {
      i=ad->fold_id[iatom];
      /* tag the atoms excluded from interacting with atom i */
      for (j=0;j<ad->ex[i].n_exatom;j++) {
        ad->ex[ad->ex[i].exatom[j]].id = i;
      }
      ip=0;
      x_jatom = &(ad->fold_x[min_jatom]);
      for (jatom=min_jatom;jatom<=max_jatom;jatom++, x_jatom++) {
        j=ad->fold_id[jatom];
        /* within one cell count each pair only once */
        if (icell == jcell && i>=j) continue;
        if (ad->ex[j].id == i) continue;
        
        dx = x_iatom->x - x_jatom->x + offset_v.x;
        dy = x_iatom->y - x_jatom->y + offset_v.y;
        dz = x_iatom->z - x_jatom->z + offset_v.z;

        len2 = dx * dx + dy * dy + dz * dz;
        if (len2 >= nl->rl_off2) continue;
        pairlist[ip++]=jatom;
      }
        
#if 0   
        len6 = len2 * len2 * len2;
        len12 = len6 * len6;
        
        vdw_index = ad->index[ad->vdw_type[i]+ad->vdw_type[j]*ad->ntype];
        vdw12 = ad->vdw12[vdw_index] / len12;
        vdw6 = ad->vdw6[vdw_index] / len6;
        ene_tmp = vdw12 - vdw6;
        *vdw += ene_tmp;
        force = 12.0 * vdw12 - 6.0 * vdw6;

        force = ene_tmp = 0.0;
        
        len = sqrt(len2);
        elec_tmp = ad->q[i] * ad->q[j]/len;
        *elec += elec_tmp;
        force = (force+elec_tmp)/len2-(ene_tmp+elec_tmp)/len;
    
        ad->f[i].x += force * dx;
        ad->f[i].y += force * dy;
        ad->f[i].z += force * dz;
    
        ad->f[j].x -= force * dx;
        ad->f[j].y -= force * dy;
        ad->f[j].z -= force * dz;
      
        /* virial */
        ad->virial[0] += force * dx * dx;
        ad->virial[1] += force * dy * dy;
        ad->virial[2] += force * dz * dz;
#endif
    }

    /* Charge the elapsed time to this cell pair.  This has to happen
       inside the cell-pair loop: after the loop icp is negative and
       indexing lc->cell_pair with it would write out of bounds. */
    lc->cell_pair[icp].time += MPI_Wtime() - cp_start_time;
  }
}

/*
 * Debug helper: after SDMD_gather_f(), the master logs two sums over the
 * force array -- the plain sum over all ad->natom atoms and an
 * alternating sum (even atom indices added, odd ones subtracted).
 * Other ranks return right after the gather.
 */
void SDMD_check_force(LINKED_CELL *lc, ATOM_DATA *ad)
{
  VEC total, alt;
  int n;

  SDMD_gather_f(lc, ad);

  if (mpi.master) {
    total.x = total.y = total.z = 0.0;
    alt.x = alt.y = alt.z = 0.0;

    for (n = 0; n < ad->natom; n++) {
      total.x += ad->f[n].x;
      total.y += ad->f[n].y;
      total.z += ad->f[n].z;

      if (n % 2 != 0) {
        /* odd index */
        alt.x -= ad->f[n].x;
        alt.y -= ad->f[n].y;
        alt.z -= ad->f[n].z;
      } else {
        /* even index */
        alt.x += ad->f[n].x;
        alt.y += ad->f[n].y;
        alt.z += ad->f[n].z;
      }
    }

    lprintf("(%f %f %f), (%f %f %f)\n", total.x, total.y, total.z, alt.x, alt.y, alt.z);
  }
}

#if 0
/*
 * Disabled debugging aid (compiled out by the surrounding #if 0).
 *
 * Probes once, without blocking, for a pending message from any source
 * with any tag on mpi.comm.  If one is pending, prints its source, tag,
 * error field and size in bytes, the number of earlier calls, and the
 * n_recv_x entry of lc->tr_list for the sending rank.  The message itself
 * is not received.
 */
SDMD_check(LINKED_CELL *lc)
{
  int flag, size;
  MPI_Status stat;
  /* number of calls made so far */
  static int count = 0;

  /* do { */
    MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, mpi.comm, &flag, &stat);

    if (flag) { 
      MPI_Get_count(&stat, MPI_BYTE, &size);
      printf("Probe %d<-%d tag:%d err:%d size:%d count:%d\n", mpi.rank,
	   stat.MPI_SOURCE, stat.MPI_TAG, stat.MPI_ERROR, size, count);
      printf("tr_list[%d].n_recv_x = %d\n", stat.MPI_SOURCE,
	   lc->tr_list[stat.MPI_SOURCE].n_recv_x);
      /*
      MPI_Recv(buf, size, MPI_BYTE, stat.MPI_SOURCE, stat.MPI_TAG,
      mpi.comm, &stat); */
    }
  /* } while (flag); */
  count++;
  /* marble_exit(1); */
}
#endif

#else
/* Only definition of this file when MPI_SDMD is not defined; presumably a
   placeholder so that the file is not left without any definition. */
static int dummy;
#endif  /* MPI_SDMD */
