/*
 * 
 * This source code is part of 
 *   MARBLE (MoleculAR simulation package for BiomoLEcules)
 * 
 * Written by Mitsunori Ikeguchi
 * Copyright (c) 2012 Yokohama City University
 *  
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#include <mpi.h>

#include "misc.h"

#include "mpi_global.h"

#ifdef FFT_FLOAT
#define MPI_FFT_REAL MPI_FLOAT
#else
#define MPI_FFT_REAL MPI_DOUBLE
#endif

/*#define FFT_OMP */
/*#define FFT_OMP2 */

#ifdef _OPENMP
#include "omp.h"
#endif

#include "fft3d.h"

/*
 * Initialise the descriptor `ft` for a 3-D FFT on an nx[0] x nx[1] x nx[2]
 * grid distributed over the process grid taken from the global `mpi` object
 * (mpi.npx, mpi.px, mpi.cart_rank, mpi.cart_comm).
 *
 * For every dimension i the routine
 *   - stores the global size, process count, local size (nx/npx), own
 *     process coordinate and starting index in ft,
 *   - allocates and fills the count / displacement tables used by the
 *     MPI_Alltoall(v) exchanges in FFT3D_forward / FFT3D_backward,
 *   - splits mpi.cart_comm so that processes sharing the other two
 *     coordinates end up in the same communicator (xyz->comm),
 *   - creates forward and backward FFTW plans working from ft->fft_data
 *     to ft->fft_tdata.
 *
 * Parameters:
 *   ft        descriptor to fill in
 *   nx        global grid size per dimension; each nx[i] must be a
 *             multiple of mpi.npx[i]
 *   transpose stored in ft->transpose.  When zero, ft->n_tdata and the
 *             ft->ikxy_tdata table describe the z lines held by this
 *             process; otherwise ft->n_tdata equals ft->n_data.
 *
 * Returns 0 on success, 1 (after printing a message) when some nx[i] is
 * not a multiple of the process count in that dimension.
 */
int FFT3D_setup(FFT3D *ft, int nx[3], int transpose)
{
  int i, j, ipx[3], split_key;
  int lnx3, max_n_data, n_end;
  FFT3DXYZ *xyz;
  char *func = "FFT3D_setup";
  fftw_complex *fft_data, *fft_tdata;

  ft->transpose = transpose;

  for (i=0;i<3;i++) {
    ft->nx[i] = nx[i];
    ft->npx[i] = mpi.npx[i];
    if (ft->nx[i] % ft->npx[i] != 0) {
      printf("ERROR: nx %d must be dividable by npx %d.\n", ft->nx[i], ft->npx[i]);
      return 1;
    }
    ft->lnx[i] = ft->nx[i] / ft->npx[i];
    ft->px[i] = mpi.px[i];
    ft->start_x[i] = ft->px[i]*ft->lnx[i];
  }
  ft->pe = mpi.cart_rank;

  /* number of grid points in the local block */
  lnx3 = ft->lnx[0]*ft->lnx[1]*ft->lnx[2];

  max_n_data = 0;

  for (i=0;i<3;i++) {
    xyz = &ft->xyz[i];
    xyz->n_pe = ft->npx[i];
    xyz->pe_list  = FFT3D_emalloc(func, sizeof(int)*xyz->n_pe);
    xyz->sf_count = FFT3D_emalloc(func, sizeof(int)*xyz->n_pe);
    xyz->sf_displ = FFT3D_emalloc(func, sizeof(int)*xyz->n_pe);
    xyz->rf_count = FFT3D_emalloc(func, sizeof(int)*xyz->n_pe);
    xyz->rf_displ = FFT3D_emalloc(func, sizeof(int)*xyz->n_pe);
    /* the "back" exchange is the mirror image of the "forward" one, so
       its tables simply alias the forward tables with send/recv swapped */
    xyz->sb_count = xyz->rf_count;
    xyz->sb_displ = xyz->rf_displ;
    xyz->rb_count = xyz->sf_count;
    xyz->rb_displ = xyz->sf_displ;
    xyz->sf_count_c = FFT3D_emalloc(func, sizeof(int)*xyz->n_pe);
    xyz->sf_displ_c = FFT3D_emalloc(func, sizeof(int)*xyz->n_pe);
    xyz->rf_count_c = FFT3D_emalloc(func, sizeof(int)*xyz->n_pe);
    xyz->rf_displ_c = FFT3D_emalloc(func, sizeof(int)*xyz->n_pe);
    xyz->sb_count_c = xyz->rf_count_c;
    xyz->sb_displ_c = xyz->rf_displ_c;
    xyz->rb_count_c = xyz->sf_count_c;
    xyz->rb_displ_c = xyz->sf_displ_c;

    /* ranks of the processes that differ from this one only in
       coordinate i (x-fastest linearisation of the process grid) */
    for (j=0;j<3;j++) {
      ipx[j]=mpi.px[j];
    }
    for (j=0;j<xyz->n_pe;j++) {
      ipx[i]=j;
      /* index by j: the list has n_pe entries, one per peer.  Indexing
         by the dimension i filled a single slot and could write past
         the end of the array when n_pe <= i. */
      xyz->pe_list[j]=ipx[0]+ft->npx[0]*(ipx[1]+ft->npx[1]*ipx[2]);
    }
    /* colour = linear index with coordinate i zeroed, so every process
       on the same line along dimension i gets the same colour */
    ipx[i]=0;
    split_key = ipx[0]+ft->npx[0]*(ipx[1]+ft->npx[1]*ipx[2]);
    MPI_Comm_split(mpi.cart_comm, split_key, mpi.cart_rank, &xyz->comm);

    /* 1-D lines along dimension i in the local block, and their
       distribution over the n_pe processes of this communicator */
    xyz->n_all_fft = lnx3 / ft->lnx[i];
    xyz->n_local_fft     = xyz->n_all_fft / ft->npx[i];
    xyz->n_local_fft_div = xyz->n_local_fft;
    xyz->n_local_fft_mod = xyz->n_all_fft % ft->npx[i];

    if (xyz->n_local_fft_mod == 0) {
      /* even distribution: plain MPI_Alltoall with a fixed count */
      xyz->alltoall_flag = 1;
      xyz->count0 = xyz->n_local_fft_div * ft->lnx[i];
      xyz->count0_c = xyz->count0*2;
    } else {
      xyz->alltoall_flag = 0;
    }

    /* the first n_local_fft_mod processes take one extra line */
    if (ft->px[i] < xyz->n_local_fft_mod) {
      xyz->n_local_fft++;
    }

    for (j=0;j<xyz->n_pe;j++) {
      /* elements sent to peer j: its share of lines times local length */
      xyz->sf_count[j] = xyz->n_local_fft_div*ft->lnx[i];
      if (j < xyz->n_local_fft_mod)
        xyz->sf_count[j] += ft->lnx[i];

      /* elements received from each peer: own line count times lnx */
      xyz->rf_count[j] = xyz->n_local_fft * ft->lnx[i];

      if (j==0) {
        xyz->sf_displ[j]=0;
        xyz->rf_displ[j]=0;
      } else {
        xyz->sf_displ[j]=xyz->sf_displ[j-1]+xyz->sf_count[j-1];
        xyz->rf_displ[j]=xyz->rf_displ[j-1]+xyz->rf_count[j-1];
      }

      /* "_c" tables: same exchange for complex data, counted in reals */
      xyz->sf_count_c[j] = xyz->sf_count[j]*2;
      xyz->sf_displ_c[j] = xyz->sf_displ[j]*2;
      xyz->rf_count_c[j] = xyz->rf_count[j]*2;
      xyz->rf_displ_c[j] = xyz->rf_displ[j]*2;
    }

    /* work buffers must hold the largest send or receive extent */
    n_end = xyz->sf_count_c[xyz->n_pe-1]+xyz->sf_displ_c[xyz->n_pe-1];
    if (max_n_data < n_end) {
      max_n_data = n_end;
    }
    n_end = xyz->rf_count_c[xyz->n_pe-1]+xyz->rf_displ_c[xyz->n_pe-1];
    if (max_n_data < n_end) {
      max_n_data = n_end;
    }
  }

  ft->s_data   = FFT3D_emalloc(func, sizeof(fftw_real)*max_n_data);
  ft->r_data   = FFT3D_emalloc(func, sizeof(fftw_real)*max_n_data);
  ft->fft_data  = FFT3D_emalloc(func, sizeof(fftw_real)*max_n_data);
  ft->fft_tdata = FFT3D_emalloc(func, sizeof(fftw_real)*max_n_data);
  fft_data  = (fftw_complex *) ft->fft_data;
  fft_tdata = (fftw_complex *) ft->fft_tdata;

  ft->n_data = lnx3;
  if (ft->transpose) {
    ft->n_tdata = ft->n_data;
  } else {
    int ikxy_start, ikxy;

    /* without the final transpose the result stays as whole z lines;
       record which (x,y) grid index each local line belongs to */
    ft->n_tdata = ft->xyz[2].n_local_fft * ft->nx[2];
    ft->n_ikxy_tdata = ft->xyz[2].n_local_fft;
    ft->ikxy_tdata = FFT3D_emalloc(func, sizeof(int) * ft->xyz[2].n_local_fft * 2);

    /* index of this process's first line among the lines of its z
       communicator (processes below n_local_fft_mod hold one extra) */
    if (ft->px[2] < ft->xyz[2].n_local_fft_mod) {
      ikxy_start = ft->px[2] * (ft->xyz[2].n_local_fft_div + 1);
    } else {
      ikxy_start = ft->px[2] * ft->xyz[2].n_local_fft_div + ft->xyz[2].n_local_fft_mod;
    }
    for (i=0;i<ft->n_ikxy_tdata;i++) {
      ikxy = ikxy_start + i;
      ft->ikxy_tdata[i][0] = (ikxy % ft->lnx[0]) + ft->start_x[0];
      ft->ikxy_tdata[i][1] = (ikxy / ft->lnx[0]) + ft->start_x[1];
    }
  }

  /* create fftw plans */
#define FFT3D_PLAN_FLAG FFTW_ESTIMATE

#ifdef FFT_OMP

  /* one plan per line so that lines can be transformed independently
     inside an OpenMP work-sharing loop */
  for (i=0;i<3;i++) {
    xyz = &ft->xyz[i];
    xyz->f_plan = FFT3D_emalloc(func, sizeof(fftw_plan)*xyz->n_local_fft);
    xyz->b_plan = FFT3D_emalloc(func, sizeof(fftw_plan)*xyz->n_local_fft);
    for (j=0;j<xyz->n_local_fft;j++) {
      xyz->f_plan[j] = fftw_plan_dft_1d(ft->nx[i],
                                        &fft_data[j*ft->nx[i]],
                                        &fft_tdata[j*ft->nx[i]],
                                        FFTW_FORWARD, FFT3D_PLAN_FLAG);
      xyz->b_plan[j] = fftw_plan_dft_1d(ft->nx[i],
                                        &fft_data[j*ft->nx[i]],
                                        &fft_tdata[j*ft->nx[i]],
                                        FFTW_BACKWARD, FFT3D_PLAN_FLAG);
    }
  }

#else /* def FFT_OMP */

#ifdef FFT_OMP2
  fftw_init_threads();
  fftw_plan_with_nthreads(omp_get_max_threads());
#endif

  /* one batched plan per direction: n_local_fft contiguous lines of
     length nx[i], unit stride, consecutive lines nx[i] apart */
  for (i=0;i<3;i++) {
    xyz = &ft->xyz[i];
    xyz->f_plan = fftw_plan_many_dft(1, &ft->nx[i], xyz->n_local_fft,
                                     fft_data,  NULL, 1, ft->nx[i],
                                     fft_tdata, NULL, 1, ft->nx[i],
                                     FFTW_FORWARD, FFT3D_PLAN_FLAG);
    xyz->b_plan = fftw_plan_many_dft(1, &ft->nx[i], xyz->n_local_fft,
                                     fft_data,  NULL, 1, ft->nx[i],
                                     fft_tdata, NULL, 1, ft->nx[i],
                                     FFTW_BACKWARD, FFT3D_PLAN_FLAG);
  }

#endif /* FFT_OMP */

  return 0;
}

/*
 * Allocate `size` bytes with fftw_malloc on behalf of `routine`.
 *
 * A request for zero bytes yields NULL without calling the allocator.
 * When the allocator fails, malloc_error_exit(routine, size) is invoked
 * (presumably it terminates the program -- defined elsewhere); should it
 * return, NULL is handed back to the caller.
 */
void *FFT3D_emalloc(char *routine, size_t size)
{
  void *block = NULL;

  if (size != 0) {
    block = fftw_malloc(size);
    if (block == NULL) {
      malloc_error_exit(routine, size);
    }
  }

  return block;
}

#ifdef FFT_OMP

/*
 * Forward 3-D FFT, variant with one FFTW plan per line (FFT_OMP).
 *
 * Parameters:
 *   ft    descriptor prepared by FFT3D_setup
 *   data  real input, the local block already laid out for the exchange
 *         along x; used only as the MPI send buffer of the first exchange
 *   tdata complex output.  With ft->transpose set it is written as a local
 *         block indexed (ix,iy,iz) with extents lnx[0],lnx[1]; otherwise
 *         it receives ft->n_tdata values copied straight from the z FFT
 *         output (whole z lines).
 *
 * Each direction follows the same scheme: exchange so that this process
 * holds complete lines, pack them contiguously into ft->fft_data, run the
 * per-line plans into ft->fft_tdata, unpack and exchange back.  No scaling
 * is applied in this routine.
 *
 * ARR3D(a,i,j,k,n1,n2), c_re, c_im and c_assgn(dst,src) come from the
 * project headers; ARR3D is assumed to index with i fastest -- confirm in
 * fft3d.h.
 *
 * The work-buffer pointers are deliberately NOT restrict-qualified:
 * r_data and ft_r_data view the same storage, and the FFT buffers are
 * also accessed by fftw_execute through pointers stored in the plans.
 *
 * NOTE(review): MPI is called from inside `omp single`, i.e. from whichever
 * thread arrives first; this presumably needs MPI initialised with at
 * least MPI_THREAD_SERIALIZED -- confirm against the MPI_Init call site.
 */
void FFT3D_forward(FFT3D *ft, fftw_real *data, fftw_complex *tdata)
{
#ifdef _OPENMP
#pragma omp parallel
{
#endif
  /* declared inside the parallel region, hence private to each thread */
  int ix, iy, iz, i, p;
  FFT3DXYZ *xyz;
  fftw_complex *s_data, *r_data, *fft_data, *fft_tdata;
  fftw_real *ft_r_data;
  int n_local_fft, n_pe;
  int * restrict lnx;   /* only read here */

  s_data    = (fftw_complex*) ft->s_data;
  r_data    = (fftw_complex*) ft->r_data;
  fft_data  = (fftw_complex*) ft->fft_data;
  fft_tdata = (fftw_complex*) ft->fft_tdata;

  ft_r_data = ft->r_data;   /* real-valued view of the receive buffer */

  lnx = ft->lnx;

  /* ---------------- x direction ---------------- */
  xyz = &ft->xyz[0];
  n_local_fft = xyz->n_local_fft;
  n_pe = xyz->n_pe;

  /* data is already ordered to be sent to x direction */
#ifdef _OPENMP
#pragma omp single
#endif
  if (xyz->alltoall_flag)
    MPI_Alltoall(data,       xyz->count0, MPI_FFT_REAL,
                 ft->r_data, xyz->count0, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(data,       xyz->sf_count, xyz->sf_displ, MPI_FFT_REAL,
                  ft->r_data, xyz->rf_count, xyz->rf_displ, MPI_FFT_REAL, xyz->comm);

  /* real -> complex, gather the n_pe pieces of each line, transform it */
#ifdef _OPENMP
#pragma omp for
#endif
  for (i=0;i<n_local_fft;i++) {
    for (p=0;p<n_pe;p++) {
      for (ix=0;ix<lnx[0];ix++) {
        c_re(ARR3D(fft_data,ix,p,i,lnx[0],n_pe)) =
          ARR3D(ft_r_data, ix, i, p, lnx[0], n_local_fft);
        c_im(ARR3D(fft_data,ix,p,i,lnx[0],n_pe)) = 0.0;
      }
    }
    fftw_execute(xyz->f_plan[i]);
  }

  /* regroup the transformed lines by destination process */
#ifdef _OPENMP
#pragma omp for
#endif
  for (i=0;i<n_local_fft;i++) {
    for (p=0;p<n_pe;p++) {
      for (ix=0;ix<lnx[0];ix++) {
        c_assgn(ARR3D(s_data,   ix,i,p,lnx[0],n_local_fft),
                ARR3D(fft_tdata,ix,p,i,lnx[0],n_pe));
      }
    }
  }

#ifdef _OPENMP
#pragma omp single
#endif
  if (xyz->alltoall_flag)
    MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                 r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(s_data, xyz->sb_count_c, xyz->sb_displ_c, MPI_FFT_REAL,
                  r_data, xyz->rb_count_c, xyz->rb_displ_c, MPI_FFT_REAL, xyz->comm);

  /* ---------------- y direction ---------------- */
  xyz = &ft->xyz[1];
  n_local_fft = xyz->n_local_fft;
  n_pe = xyz->n_pe;

  /* local reorder (ix,iy,iz) -> (iy,iz,ix) so that y runs fastest */
#ifdef _OPENMP
#pragma omp for
#endif
  for (ix=0;ix<lnx[0];ix++) {
    for (iz=0;iz<lnx[2];iz++) {
      for (iy=0;iy<lnx[1];iy++) {
        c_assgn(ARR3D(s_data,iy,iz,ix, lnx[1],lnx[2]),
                ARR3D(r_data,ix,iy,iz, lnx[0],lnx[1]));
      }
    }
  }

#ifdef _OPENMP
#pragma omp single
#endif
  if (xyz->alltoall_flag)
    MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                 r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(s_data, xyz->sf_count_c, xyz->sf_displ_c, MPI_FFT_REAL,
                  r_data, xyz->rf_count_c, xyz->rf_displ_c, MPI_FFT_REAL, xyz->comm);

#ifdef _OPENMP
#pragma omp for
#endif
  for (i=0;i<n_local_fft;i++) {
    for (p=0;p<n_pe;p++) {
      for (iy=0;iy<lnx[1];iy++) {
        c_assgn(ARR3D(fft_data,iy,p,i,lnx[1],n_pe),
                ARR3D(r_data,  iy,i,p,lnx[1],n_local_fft));
      }
    }
    fftw_execute(xyz->f_plan[i]);
  }

#ifdef _OPENMP
#pragma omp for
#endif
  for (p=0;p<n_pe;p++) {
    for (i=0;i<n_local_fft;i++) {
      for (iy=0;iy<lnx[1];iy++) {
        c_assgn(ARR3D(s_data,   iy,i,p,lnx[1],n_local_fft),
                ARR3D(fft_tdata,iy,p,i,lnx[1],n_pe));
      }
    }
  }

#ifdef _OPENMP
#pragma omp single
#endif
  if (xyz->alltoall_flag)
    MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                 r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(s_data, xyz->sb_count_c, xyz->sb_displ_c, MPI_FFT_REAL,
                  r_data, xyz->rb_count_c, xyz->rb_displ_c, MPI_FFT_REAL, xyz->comm);

  /* ---------------- z direction ---------------- */
  xyz = &ft->xyz[2];
  n_local_fft = xyz->n_local_fft;
  n_pe = xyz->n_pe;

  /* local reorder (iy,iz,ix) -> (iz,ix,iy) so that z runs fastest */
#ifdef _OPENMP
#pragma omp for
#endif
  for (iy=0;iy<lnx[1];iy++) {
    for (ix=0;ix<lnx[0];ix++) {
      for (iz=0;iz<lnx[2];iz++) {
        c_assgn(ARR3D(s_data,iz,ix,iy, lnx[2],lnx[0]),
                ARR3D(r_data,iy,iz,ix, lnx[1],lnx[2]));
      }
    }
  }

#ifdef _OPENMP
#pragma omp single
#endif
  if (xyz->alltoall_flag)
    MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                 r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(s_data, xyz->sf_count_c, xyz->sf_displ_c, MPI_FFT_REAL,
                  r_data, xyz->rf_count_c, xyz->rf_displ_c, MPI_FFT_REAL, xyz->comm);

  /* pack and transform the z lines (needed with and without transpose) */
#ifdef _OPENMP
#pragma omp for
#endif
  for (i=0;i<n_local_fft;i++) {
    for (p=0;p<n_pe;p++) {
      for (iz=0;iz<lnx[2];iz++) {
        c_assgn(ARR3D(fft_data,iz,p,i,lnx[2],n_pe),
                ARR3D(r_data,  iz,i,p,lnx[2],n_local_fft));
      }
    }
    fftw_execute(xyz->f_plan[i]);
  }

  if (!ft->transpose) {
    /* result stays as complete z lines */
#ifdef _OPENMP
#pragma omp single
#endif
    memcpy(tdata, fft_tdata, ft->n_tdata * sizeof(fftw_complex));

  } else {
    /* transpose: send the lines back and restore the (ix,iy,iz) block */
#ifdef _OPENMP
#pragma omp for
#endif
    for (i=0;i<n_local_fft;i++) {
      for (p=0;p<n_pe;p++) {
        for (iz=0;iz<lnx[2];iz++) {
          c_assgn(ARR3D(s_data,   iz,i,p,lnx[2],n_local_fft),
                  ARR3D(fft_tdata,iz,p,i,lnx[2],n_pe));
        }
      }
    }

#ifdef _OPENMP
#pragma omp single
#endif
    if (xyz->alltoall_flag)
      MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                   r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
    else
      MPI_Alltoallv(s_data, xyz->sb_count_c, xyz->sb_displ_c, MPI_FFT_REAL,
                    r_data, xyz->rb_count_c, xyz->rb_displ_c, MPI_FFT_REAL, xyz->comm);

#ifdef _OPENMP
#pragma omp for
#endif
    for (iz=0;iz<lnx[2];iz++) {
      for (iy=0;iy<lnx[1];iy++) {
        for (ix=0;ix<lnx[0];ix++) {
          c_assgn(ARR3D(tdata, ix,iy,iz, lnx[0],lnx[1]),
                  ARR3D(r_data,iz,ix,iy, lnx[2],lnx[0]));
        }
      }
    }

  }
#ifdef _OPENMP
}
#endif
}

/*
 * Backward 3-D FFT, variant with one FFTW plan per line (FFT_OMP).
 * Runs the steps of FFT3D_forward in reverse order: z, then y, then x.
 *
 * Parameters:
 *   ft    descriptor prepared by FFT3D_setup
 *   data  real output; it is the MPI receive buffer of the final exchange
 *         along x and is not touched otherwise
 *   tdata complex input.  With ft->transpose set it is read as a local
 *         block indexed (ix,iy,iz) with extents lnx[0],lnx[1]; otherwise
 *         ft->n_tdata values (whole z lines) are copied from it.
 *
 * Only the real part of the last (x) transform is kept.  No scaling is
 * applied in this routine.
 *
 * ARR3D, c_re and c_assgn(dst,src) come from the project headers; ARR3D
 * is assumed to index with its first index fastest -- confirm in fft3d.h.
 *
 * ft_s_data is deliberately NOT restrict-qualified: it views the same
 * storage that is written through s_data earlier in the routine.
 *
 * NOTE(review): MPI is called from inside `omp single`; this presumably
 * needs at least MPI_THREAD_SERIALIZED -- confirm at the MPI_Init site.
 */
void FFT3D_backward(FFT3D *ft, fftw_real *data, fftw_complex *tdata)
{
#ifdef _OPENMP
#pragma omp parallel
{
#endif
  /* declared inside the parallel region, hence private to each thread */
  int ix, iy, iz, i, p;
  FFT3DXYZ *xyz;
  fftw_complex *s_data, *r_data, *fft_data, *fft_tdata;
  fftw_real *ft_s_data;
  int n_local_fft, n_pe;
  int * restrict lnx;   /* only read here */

  s_data    = (fftw_complex*) ft->s_data;
  r_data    = (fftw_complex*) ft->r_data;
  fft_data  = (fftw_complex*) ft->fft_data;
  fft_tdata = (fftw_complex*) ft->fft_tdata;

  ft_s_data = ft->s_data;   /* real-valued view of the send buffer */

  lnx = ft->lnx;

  /* ---------------- z direction ---------------- */
  xyz = &ft->xyz[2];
  n_local_fft = xyz->n_local_fft;
  n_pe = xyz->n_pe;

  if (!ft->transpose) {
    /* input already consists of complete z lines */
#ifdef _OPENMP
#pragma omp single
#endif
    memcpy(fft_data, tdata, ft->n_tdata * sizeof(fftw_complex));

#ifdef _OPENMP
#pragma omp for
#endif
    for (i=0;i<n_local_fft;i++) {
      fftw_execute(xyz->b_plan[i]);
      for (p=0;p<n_pe;p++) {
        for (iz=0;iz<lnx[2];iz++) {
          c_assgn(ARR3D(s_data,   iz,i,p,lnx[2],n_local_fft),
                  ARR3D(fft_tdata,iz,p,i,lnx[2],n_pe));
        }
      }
    }

  } else {
    /* transpose: reorder (ix,iy,iz) -> (iz,ix,iy) and collect z lines */
#ifdef _OPENMP
#pragma omp for
#endif
    for (iy=0;iy<lnx[1];iy++) {
      for (ix=0;ix<lnx[0];ix++) {
        for (iz=0;iz<lnx[2];iz++) {
          c_assgn(ARR3D(s_data, iz,ix,iy, lnx[2],lnx[0]),
                  ARR3D(tdata,  ix,iy,iz, lnx[0],lnx[1]));
        }
      }
    }

#ifdef _OPENMP
#pragma omp single
#endif
    if (xyz->alltoall_flag)
      MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                   r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
    else
      MPI_Alltoallv(s_data, xyz->sf_count_c, xyz->sf_displ_c, MPI_FFT_REAL,
                    r_data, xyz->rf_count_c, xyz->rf_displ_c, MPI_FFT_REAL, xyz->comm);

#ifdef _OPENMP
#pragma omp for
#endif
    for (i=0;i<n_local_fft;i++) {
      for (p=0;p<n_pe;p++) {
        for (iz=0;iz<lnx[2];iz++) {
          c_assgn(ARR3D(fft_data,iz,p,i,lnx[2],n_pe),
                  ARR3D(r_data,  iz,i,p,lnx[2],n_local_fft));
        }
      }
      fftw_execute(xyz->b_plan[i]);
    }

#ifdef _OPENMP
#pragma omp for
#endif
    for (i=0;i<n_local_fft;i++) {
      for (p=0;p<n_pe;p++) {
        for (iz=0;iz<lnx[2];iz++) {
          c_assgn(ARR3D(s_data,   iz,i,p,lnx[2],n_local_fft),
                  ARR3D(fft_tdata,iz,p,i,lnx[2],n_pe));
        }
      }
    }

  }

  /* both paths finish by returning the z lines to their owners */
#ifdef _OPENMP
#pragma omp single
#endif
  if (xyz->alltoall_flag)
    MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                 r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(s_data, xyz->sb_count_c, xyz->sb_displ_c, MPI_FFT_REAL,
                  r_data, xyz->rb_count_c, xyz->rb_displ_c, MPI_FFT_REAL, xyz->comm);

  /* ---------------- y direction ---------------- */
  xyz = &ft->xyz[1];
  n_local_fft = xyz->n_local_fft;
  n_pe = xyz->n_pe;

  /* local reorder (iz,ix,iy) -> (iy,iz,ix) so that y runs fastest */
#ifdef _OPENMP
#pragma omp for
#endif
  for (ix=0;ix<lnx[0];ix++) {
    for (iz=0;iz<lnx[2];iz++) {
      for (iy=0;iy<lnx[1];iy++) {
        c_assgn(ARR3D(s_data,iy,iz,ix, lnx[1],lnx[2]),
                ARR3D(r_data,iz,ix,iy, lnx[2],lnx[0]));
      }
    }
  }

#ifdef _OPENMP
#pragma omp single
#endif
  if (xyz->alltoall_flag)
    MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                 r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(s_data, xyz->sf_count_c, xyz->sf_displ_c, MPI_FFT_REAL,
                  r_data, xyz->rf_count_c, xyz->rf_displ_c, MPI_FFT_REAL, xyz->comm);

#ifdef _OPENMP
#pragma omp for
#endif
  for (i=0;i<n_local_fft;i++) {
    for (p=0;p<n_pe;p++) {
      for (iy=0;iy<lnx[1];iy++) {
        c_assgn(ARR3D(fft_data,iy,p,i,lnx[1],n_pe),
                ARR3D(r_data,  iy,i,p,lnx[1],n_local_fft));
      }
    }
    fftw_execute(xyz->b_plan[i]);
  }

#ifdef _OPENMP
#pragma omp for
#endif
  for (i=0;i<n_local_fft;i++) {
    for (p=0;p<n_pe;p++) {
      for (iy=0;iy<lnx[1];iy++) {
        c_assgn(ARR3D(s_data,   iy,i,p,lnx[1],n_local_fft),
                ARR3D(fft_tdata,iy,p,i,lnx[1],n_pe));
      }
    }
  }

#ifdef _OPENMP
#pragma omp single
#endif
  if (xyz->alltoall_flag)
    MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                 r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(s_data, xyz->sb_count_c, xyz->sb_displ_c, MPI_FFT_REAL,
                  r_data, xyz->rb_count_c, xyz->rb_displ_c, MPI_FFT_REAL, xyz->comm);

  /* ---------------- x direction ---------------- */
  xyz = &ft->xyz[0];
  n_local_fft = xyz->n_local_fft;
  n_pe = xyz->n_pe;

  /* local reorder (iy,iz,ix) -> (ix,iy,iz) so that x runs fastest */
#ifdef _OPENMP
#pragma omp for
#endif
  for (iz=0;iz<lnx[2];iz++) {
    for (iy=0;iy<lnx[1];iy++) {
      for (ix=0;ix<lnx[0];ix++) {
        c_assgn(ARR3D(s_data,ix,iy,iz, lnx[0],lnx[1]),
                ARR3D(r_data,iy,iz,ix, lnx[1],lnx[2]));
      }
    }
  }

#ifdef _OPENMP
#pragma omp single
#endif
  if (xyz->alltoall_flag)
    MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                 r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(s_data, xyz->sf_count_c, xyz->sf_displ_c, MPI_FFT_REAL,
                  r_data, xyz->rf_count_c, xyz->rf_displ_c, MPI_FFT_REAL, xyz->comm);

#ifdef _OPENMP
#pragma omp for
#endif
  for (i=0;i<n_local_fft;i++) {
    for (p=0;p<n_pe;p++) {
      for (ix=0;ix<lnx[0];ix++) {
        c_assgn(ARR3D(fft_data,ix,p,i,lnx[0],n_pe),
                ARR3D(r_data,  ix,i,p,lnx[0],n_local_fft));
      }
    }
    fftw_execute(xyz->b_plan[i]);
  }

  /* keep only the real part and regroup by destination process */
#ifdef _OPENMP
#pragma omp for
#endif
  for (i=0;i<n_local_fft;i++) {
    for (p=0;p<n_pe;p++) {
      for (ix=0;ix<lnx[0];ix++) {
        ARR3D(ft_s_data, ix,i,p,lnx[0],n_local_fft) =
          c_re(ARR3D(fft_tdata, ix,p,i,lnx[0],n_pe));
      }
    }
  }

#ifdef _OPENMP
#pragma omp single
#endif
  if (xyz->alltoall_flag)
    MPI_Alltoall(ft->s_data, xyz->count0, MPI_FFT_REAL,
                 data,       xyz->count0, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(ft->s_data, xyz->sb_count, xyz->sb_displ, MPI_FFT_REAL,
                  data,       xyz->rb_count, xyz->rb_displ, MPI_FFT_REAL, xyz->comm);

#ifdef _OPENMP
}
#endif
}


#else   /*  def FFT_OMP */


/*
 * Forward 3-D FFT, variant with one batched FFTW plan per direction.
 *
 * Parameters:
 *   ft    descriptor prepared by FFT3D_setup
 *   data  real input, the local block already laid out for the exchange
 *         along x; used only as the MPI send buffer of the first exchange
 *   tdata complex output.  With ft->transpose set it is written as a local
 *         block indexed (ix,iy,iz) with extents lnx[0],lnx[1]; otherwise
 *         it receives ft->n_tdata values copied straight from the z FFT
 *         output (whole z lines).
 *
 * Each direction follows the same scheme: exchange so that this process
 * holds complete lines, pack them contiguously into ft->fft_data, run the
 * plan into ft->fft_tdata, unpack and exchange back.  No scaling is
 * applied in this routine.
 *
 * ARR3D(a,i,j,k,n1,n2), c_re, c_im and c_assgn(dst,src) come from the
 * project headers; ARR3D is assumed to index with i fastest -- confirm in
 * fft3d.h.
 *
 * The work-buffer pointers are deliberately NOT restrict-qualified:
 * r_data and ft_r_data view the same storage, MPI fills that storage
 * through ft->r_data, and the FFT buffers are also accessed by
 * fftw_execute through pointers stored in the plans.
 */
void FFT3D_forward(FFT3D *ft, fftw_real * restrict data, fftw_complex * restrict tdata)
{
  int ix, iy, iz, i, p;
  FFT3DXYZ *xyz;
  fftw_complex *s_data, *r_data, *fft_data, *fft_tdata;
  fftw_real *ft_r_data;
  int n_local_fft, n_pe;
  int * restrict lnx;   /* only read here */

  s_data    = (fftw_complex*) ft->s_data;
  r_data    = (fftw_complex*) ft->r_data;
  fft_data  = (fftw_complex*) ft->fft_data;
  fft_tdata = (fftw_complex*) ft->fft_tdata;

  ft_r_data = ft->r_data;   /* real-valued view of the receive buffer */

  lnx = ft->lnx;

  /* ---------------- x direction ---------------- */
  xyz = &ft->xyz[0];
  n_local_fft = xyz->n_local_fft;
  n_pe = xyz->n_pe;

  /* data is already ordered to be sent to x direction */
  if (xyz->alltoall_flag)
    MPI_Alltoall(data,       xyz->count0, MPI_FFT_REAL,
                 ft->r_data, xyz->count0, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(data,       xyz->sf_count, xyz->sf_displ, MPI_FFT_REAL,
                  ft->r_data, xyz->rf_count, xyz->rf_displ, MPI_FFT_REAL, xyz->comm);

  /* real -> complex while gathering the n_pe pieces of each line */
  for (i=0;i<n_local_fft;i++) {
    for (p=0;p<n_pe;p++) {
      for (ix=0;ix<lnx[0];ix++) {
        c_re(ARR3D(fft_data,ix,p,i,lnx[0],n_pe)) =
          ARR3D(ft_r_data, ix, i, p, lnx[0], n_local_fft);
        c_im(ARR3D(fft_data,ix,p,i,lnx[0],n_pe)) = 0.0;
      }
    }
  }

  fftw_execute(xyz->f_plan);

  /* regroup the transformed lines by destination process */
  for (i=0;i<n_local_fft;i++) {
    for (p=0;p<n_pe;p++) {
      for (ix=0;ix<lnx[0];ix++) {
        c_assgn(ARR3D(s_data,   ix,i,p,lnx[0],n_local_fft),
                ARR3D(fft_tdata,ix,p,i,lnx[0],n_pe));
      }
    }
  }

  if (xyz->alltoall_flag)
    MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                 r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(s_data, xyz->sb_count_c, xyz->sb_displ_c, MPI_FFT_REAL,
                  r_data, xyz->rb_count_c, xyz->rb_displ_c, MPI_FFT_REAL, xyz->comm);

  /* ---------------- y direction ---------------- */
  xyz = &ft->xyz[1];
  n_local_fft = xyz->n_local_fft;
  n_pe = xyz->n_pe;

  /* local reorder (ix,iy,iz) -> (iy,iz,ix) so that y runs fastest */
  for (ix=0;ix<lnx[0];ix++) {
    for (iz=0;iz<lnx[2];iz++) {
      for (iy=0;iy<lnx[1];iy++) {
        c_assgn(ARR3D(s_data,iy,iz,ix, lnx[1],lnx[2]),
                ARR3D(r_data,ix,iy,iz, lnx[0],lnx[1]));
      }
    }
  }

  if (xyz->alltoall_flag)
    MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                 r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(s_data, xyz->sf_count_c, xyz->sf_displ_c, MPI_FFT_REAL,
                  r_data, xyz->rf_count_c, xyz->rf_displ_c, MPI_FFT_REAL, xyz->comm);

  for (i=0;i<n_local_fft;i++) {
    for (p=0;p<n_pe;p++) {
      for (iy=0;iy<lnx[1];iy++) {
        c_assgn(ARR3D(fft_data,iy,p,i,lnx[1],n_pe),
                ARR3D(r_data,  iy,i,p,lnx[1],n_local_fft));
      }
    }
  }

  fftw_execute(xyz->f_plan);

  for (p=0;p<n_pe;p++) {
    for (i=0;i<n_local_fft;i++) {
      for (iy=0;iy<lnx[1];iy++) {
        c_assgn(ARR3D(s_data,   iy,i,p,lnx[1],n_local_fft),
                ARR3D(fft_tdata,iy,p,i,lnx[1],n_pe));
      }
    }
  }

  if (xyz->alltoall_flag)
    MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                 r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(s_data, xyz->sb_count_c, xyz->sb_displ_c, MPI_FFT_REAL,
                  r_data, xyz->rb_count_c, xyz->rb_displ_c, MPI_FFT_REAL, xyz->comm);

  /* ---------------- z direction ---------------- */
  xyz = &ft->xyz[2];
  n_local_fft = xyz->n_local_fft;
  n_pe = xyz->n_pe;

  /* local reorder (iy,iz,ix) -> (iz,ix,iy) so that z runs fastest */
  for (iy=0;iy<lnx[1];iy++) {
    for (ix=0;ix<lnx[0];ix++) {
      for (iz=0;iz<lnx[2];iz++) {
        c_assgn(ARR3D(s_data,iz,ix,iy, lnx[2],lnx[0]),
                ARR3D(r_data,iy,iz,ix, lnx[1],lnx[2]));
      }
    }
  }

  if (xyz->alltoall_flag)
    MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                 r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(s_data, xyz->sf_count_c, xyz->sf_displ_c, MPI_FFT_REAL,
                  r_data, xyz->rf_count_c, xyz->rf_displ_c, MPI_FFT_REAL, xyz->comm);

  /* pack and transform the z lines (needed with and without transpose) */
  for (i=0;i<n_local_fft;i++) {
    for (p=0;p<n_pe;p++) {
      for (iz=0;iz<lnx[2];iz++) {
        c_assgn(ARR3D(fft_data,iz,p,i,lnx[2],n_pe),
                ARR3D(r_data,  iz,i,p,lnx[2],n_local_fft));
      }
    }
  }

  fftw_execute(xyz->f_plan);

  if (!ft->transpose) {
    /* result stays as complete z lines */
    memcpy(tdata, fft_tdata, ft->n_tdata * sizeof(fftw_complex));

  } else {
    /* transpose: send the lines back and restore the (ix,iy,iz) block */
    for (i=0;i<n_local_fft;i++) {
      for (p=0;p<n_pe;p++) {
        for (iz=0;iz<lnx[2];iz++) {
          c_assgn(ARR3D(s_data,   iz,i,p,lnx[2],n_local_fft),
                  ARR3D(fft_tdata,iz,p,i,lnx[2],n_pe));
        }
      }
    }

    if (xyz->alltoall_flag)
      MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                   r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
    else
      MPI_Alltoallv(s_data, xyz->sb_count_c, xyz->sb_displ_c, MPI_FFT_REAL,
                    r_data, xyz->rb_count_c, xyz->rb_displ_c, MPI_FFT_REAL, xyz->comm);

    for (iz=0;iz<lnx[2];iz++) {
      for (iy=0;iy<lnx[1];iy++) {
        for (ix=0;ix<lnx[0];ix++) {
          c_assgn(ARR3D(tdata, ix,iy,iz, lnx[0],lnx[1]),
                  ARR3D(r_data,iz,ix,iy, lnx[2],lnx[0]));
        }
      }
    }
  }
}

/*
 * Backward 3-D FFT, variant with one batched FFTW plan per direction.
 * Runs the steps of FFT3D_forward in reverse order: z, then y, then x.
 *
 * Parameters:
 *   ft    descriptor prepared by FFT3D_setup
 *   data  real output; it is the MPI receive buffer of the final exchange
 *         along x and is not touched otherwise
 *   tdata complex input.  With ft->transpose set it is read as a local
 *         block indexed (ix,iy,iz) with extents lnx[0],lnx[1]; otherwise
 *         ft->n_tdata values (whole z lines) are copied from it.
 *
 * Only the real part of the last (x) transform is kept.  No scaling is
 * applied in this routine.
 *
 * ARR3D, c_re and c_assgn(dst,src) come from the project headers; ARR3D
 * is assumed to index with its first index fastest -- confirm in fft3d.h.
 *
 * The work-buffer pointers are deliberately NOT restrict-qualified:
 * s_data and ft_s_data view the same storage, and the FFT buffers are
 * also accessed by fftw_execute through pointers stored in the plans.
 */
void FFT3D_backward(FFT3D *ft, fftw_real * restrict data, fftw_complex * restrict tdata)
{
  int ix, iy, iz, i, p;
  FFT3DXYZ *xyz;
  fftw_complex *s_data, *r_data, *fft_data, *fft_tdata;
  fftw_real *ft_s_data;
  int n_local_fft, n_pe;
  int * restrict lnx;   /* only read here */

  s_data    = (fftw_complex*) ft->s_data;
  r_data    = (fftw_complex*) ft->r_data;
  fft_data  = (fftw_complex*) ft->fft_data;
  fft_tdata = (fftw_complex*) ft->fft_tdata;

  ft_s_data = ft->s_data;   /* real-valued view of the send buffer */

  lnx = ft->lnx;

  /* ---------------- z direction ---------------- */
  xyz = &ft->xyz[2];
  n_local_fft = xyz->n_local_fft;
  n_pe = xyz->n_pe;

  if (!ft->transpose) {
    /* input already consists of complete z lines */
    memcpy(fft_data, tdata, ft->n_tdata * sizeof(fftw_complex));

  } else {
    /* transpose: reorder (ix,iy,iz) -> (iz,ix,iy) and collect z lines */
    for (iy=0;iy<lnx[1];iy++) {
      for (ix=0;ix<lnx[0];ix++) {
        for (iz=0;iz<lnx[2];iz++) {
          c_assgn(ARR3D(s_data, iz,ix,iy, lnx[2],lnx[0]),
                  ARR3D(tdata,  ix,iy,iz, lnx[0],lnx[1]));
        }
      }
    }

    if (xyz->alltoall_flag)
      MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                   r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
    else
      MPI_Alltoallv(s_data, xyz->sf_count_c, xyz->sf_displ_c, MPI_FFT_REAL,
                    r_data, xyz->rf_count_c, xyz->rf_displ_c, MPI_FFT_REAL, xyz->comm);

    for (i=0;i<n_local_fft;i++) {
      for (p=0;p<n_pe;p++) {
        for (iz=0;iz<lnx[2];iz++) {
          c_assgn(ARR3D(fft_data,iz,p,i,lnx[2],n_pe),
                  ARR3D(r_data,  iz,i,p,lnx[2],n_local_fft));
        }
      }
    }
  }

  /* from here on both paths are identical: transform the z lines,
     regroup them by destination process and send them back */
  fftw_execute(xyz->b_plan);

  for (i=0;i<n_local_fft;i++) {
    for (p=0;p<n_pe;p++) {
      for (iz=0;iz<lnx[2];iz++) {
        c_assgn(ARR3D(s_data,   iz,i,p,lnx[2],n_local_fft),
                ARR3D(fft_tdata,iz,p,i,lnx[2],n_pe));
      }
    }
  }

  if (xyz->alltoall_flag)
    MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                 r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(s_data, xyz->sb_count_c, xyz->sb_displ_c, MPI_FFT_REAL,
                  r_data, xyz->rb_count_c, xyz->rb_displ_c, MPI_FFT_REAL, xyz->comm);

  /* ---------------- y direction ---------------- */
  xyz = &ft->xyz[1];
  n_local_fft = xyz->n_local_fft;
  n_pe = xyz->n_pe;

  /* local reorder (iz,ix,iy) -> (iy,iz,ix) so that y runs fastest */
  for (ix=0;ix<lnx[0];ix++) {
    for (iz=0;iz<lnx[2];iz++) {
      for (iy=0;iy<lnx[1];iy++) {
        c_assgn(ARR3D(s_data,iy,iz,ix, lnx[1],lnx[2]),
                ARR3D(r_data,iz,ix,iy, lnx[2],lnx[0]));
      }
    }
  }

  if (xyz->alltoall_flag)
    MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                 r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(s_data, xyz->sf_count_c, xyz->sf_displ_c, MPI_FFT_REAL,
                  r_data, xyz->rf_count_c, xyz->rf_displ_c, MPI_FFT_REAL, xyz->comm);

  for (i=0;i<n_local_fft;i++) {
    for (p=0;p<n_pe;p++) {
      for (iy=0;iy<lnx[1];iy++) {
        c_assgn(ARR3D(fft_data,iy,p,i,lnx[1],n_pe),
                ARR3D(r_data,  iy,i,p,lnx[1],n_local_fft));
      }
    }
  }

  fftw_execute(xyz->b_plan);

  for (i=0;i<n_local_fft;i++) {
    for (p=0;p<n_pe;p++) {
      for (iy=0;iy<lnx[1];iy++) {
        c_assgn(ARR3D(s_data,   iy,i,p,lnx[1],n_local_fft),
                ARR3D(fft_tdata,iy,p,i,lnx[1],n_pe));
      }
    }
  }

  if (xyz->alltoall_flag)
    MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                 r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(s_data, xyz->sb_count_c, xyz->sb_displ_c, MPI_FFT_REAL,
                  r_data, xyz->rb_count_c, xyz->rb_displ_c, MPI_FFT_REAL, xyz->comm);

  /* ---------------- x direction ---------------- */
  xyz = &ft->xyz[0];
  n_local_fft = xyz->n_local_fft;
  n_pe = xyz->n_pe;

  /* local reorder (iy,iz,ix) -> (ix,iy,iz) so that x runs fastest */
  for (iz=0;iz<lnx[2];iz++) {
    for (iy=0;iy<lnx[1];iy++) {
      for (ix=0;ix<lnx[0];ix++) {
        c_assgn(ARR3D(s_data,ix,iy,iz, lnx[0],lnx[1]),
                ARR3D(r_data,iy,iz,ix, lnx[1],lnx[2]));
      }
    }
  }

  if (xyz->alltoall_flag)
    MPI_Alltoall(s_data, xyz->count0_c, MPI_FFT_REAL,
                 r_data, xyz->count0_c, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(s_data, xyz->sf_count_c, xyz->sf_displ_c, MPI_FFT_REAL,
                  r_data, xyz->rf_count_c, xyz->rf_displ_c, MPI_FFT_REAL, xyz->comm);

  for (i=0;i<n_local_fft;i++) {
    for (p=0;p<n_pe;p++) {
      for (ix=0;ix<lnx[0];ix++) {
        c_assgn(ARR3D(fft_data,ix,p,i,lnx[0],n_pe),
                ARR3D(r_data,  ix,i,p,lnx[0],n_local_fft));
      }
    }
  }

  fftw_execute(xyz->b_plan);

  /* keep only the real part and regroup by destination process */
  for (i=0;i<n_local_fft;i++) {
    for (p=0;p<n_pe;p++) {
      for (ix=0;ix<lnx[0];ix++) {
        ARR3D(ft_s_data, ix,i,p,lnx[0],n_local_fft) =
          c_re(ARR3D(fft_tdata, ix,p,i,lnx[0],n_pe));
      }
    }
  }

  if (xyz->alltoall_flag)
    MPI_Alltoall(ft->s_data, xyz->count0, MPI_FFT_REAL,
                 data,       xyz->count0, MPI_FFT_REAL, xyz->comm);
  else
    MPI_Alltoallv(ft->s_data, xyz->sb_count, xyz->sb_displ, MPI_FFT_REAL,
                  data,       xyz->rb_count, xyz->rb_displ, MPI_FFT_REAL, xyz->comm);

}

#endif  /* def FFT_OMP */
