!{\src2tex{textfont=tt}}
!!****f* ABINIT/sg_fftrisc_htor
!! NAME
!! sg_fftrisc_htor
!!
!! FUNCTION
!! Carry out Fourier transforms between real and reciprocal (G) space,
!! for wavefunctions, contained in a sphere in reciprocal space,
!! in both directions. Also accomplish some post-processing.
!!
!! COPYRIGHT
!! Copyright (C) 1999-2007 ABINIT group (XG).
!! This file is distributed under the terms of the
!! GNU General Public License, see ~abinit/COPYING
!! or http://www.gnu.org/copyleft/gpl.txt .
!! For the initials of contributors, see ~abinit/doc/developers/contributors.txt .
!!
!! NOTES
!! Specifically uses rather sophisticated algorithms, based on S Goedecker
!! routines, specialized for superscalar RISC architecture.
!! Zero padding : saves 7/12 execution time
!! Bi-dimensional data locality in most of the routine : cache reuse
!! For k-point (0 0 0) : takes advantage of symmetry of data.
!! Note however that no blocking is used, in both 1D z-transform
!! or subsequent 2D transform. This should be improved.
!!
!! INPUTS
!!  cplex= if 1 , denpot is real, if 2 , denpot is complex
!!     (cplex=2 only allowed for option=2 when istwf_k=1)
!!     one can also use cplex=0 if option=0 or option=3
!!  fofgin(2,npwin)=holds input wavefunction in G vector basis sphere.
!!  gboundin(2*mgfft+8,2)=sphere boundary info for reciprocal to real space
!!  gboundout(2*mgfft+8,2)=sphere boundary info for real to reciprocal space
!!  istwf_k=option parameter that describes the storage of wfs
!!  kg_kin(3,npwin)=reduced planewave coordinates, input
!!  kg_kout(3,npwout)=reduced planewave coordinates, output
!!  npwin=number of elements in fofgin array (for option 0, 1 and 2)
!!  npwout=number of elements in fofgout array (for option 2 and 3)
!!  mgfft=maximum size of 1D FFTs
!!  ngfft(18)=contain all needed information about 3D FFT, see ~ABINIT/doc/input_variables/vargs.htm#ngfft
!!  n4,n5,n6=ngfft(4),ngfft(5),ngfft(6), dimensions of fofr.
!!  option= if 0: do direct FFT
!!          if 1: do direct FFT, then sum the density
!!          if 2: do direct FFT, multiply by the potential, then do reverse FFT
!!          if 3: do reverse FFT only
!!  weight=weight to be used for the accumulation of the density in real space
!!          (needed only when option=1)
!!
!! OUTPUT
!!  (see side effects)
!!
!! OPTIONS
!!  The different options are:
!!  - reciprocal to real space and output the result (when option=0),
!!  - reciprocal to real space and accumulate the density (when option=1) or
!!  - reciprocal to real space, apply the local potential to the wavefunction
!!    in real space and produce the result in reciprocal space (when option=2)
!!  - real space to reciprocal space (when option=3).
!!  option=0 IS NOT ALLOWED when istwf_k>2
!!  option=3 IS NOT ALLOWED when istwf_k>=2
!!
!! SIDE EFFECTS
!!  for option==0, fofgin(2,npwin)=holds input wavefunction in G sphere;
!!                 fofr(2,n4,n5,n6) contains the Fourier Transform of fofgin;
!!                 no use of denpot, fofgout and npwout.
!!  for option==1, fofgin(2,npwin)=holds input wavefunction in G sphere;
!!                 denpot(cplex*n4,n5,n6) contains the input density at input,
!!                 and the updated density at output;
!!                 fofr(2,n4,n5,n6) contains the Fourier transform of fofgin,
!!                 except in the case of the hp library subroutine;
!!                 no use of fofgout and npwout.
!!  for option==2, fofgin(2,npwin)=holds input wavefunction in G sphere;
!!                 denpot(cplex*n4,n5,n6) contains the input local potential;
!!                 fofgout(2,npwout) contains the output function;
!!                 fofr(2,n4,n5,n6) contains the Fourier transform of fofgin,
!!                 except in the case of the hp library subroutine.
!!  for option==3, fofr(2,n4,n5,n6) contains the real space wavefunction;
!!                 fofgout(2,npwout) contains its Fourier transform;
!!                 no use of fofgin and npwin.
!!
!! TODO
!! Complete input and output list.
!!
!! PARENTS
!!      cgwf_htor
!!
!! CHILDREN
!!      indfftrisc,mpi_alltoall,sg_ctrig,sg_fftpx,sg_ffty,zdscal
!!
!! SOURCE

#if defined HAVE_CONFIG_H
#include "config.h"
#endif

subroutine sg_fftrisc_htor(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,&
& istwf_k,kg_kin,kg_kout,mgfft,ngfft,npwin,npwout,n4,n5,n6,option,weight,mpi_enreg, maxgbin, maxgbout,&
& gnpw, blkgb, packin, unpackin, packout, unpackout)

 use defs_basis
 use defs_datatypes

!This section has been created automatically by the script Abilint (TD). Do not modify these by hand.
#ifdef HAVE_FORTRAN_INTERFACES
 use interfaces_12ffts
#endif
!End of the abilint section

 implicit none

#if defined MPI
 include 'mpif.h'
#endif

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: cplex,istwf_k,mgfft,n4,n5,n6,npwin,npwout,option,maxgbin,maxgbout
 real(dp),intent(in) :: weight
!arrays
 integer,intent(in) :: gboundin(2*mgfft+8,2),gboundout(2*mgfft+8,2)
 type(MPI_type),intent(in) :: mpi_enreg
 integer,intent(in) :: kg_kin(3,npwin),kg_kout(3,npwout),ngfft(18),gnpw,blkgb
 integer,intent(in) :: packin(gnpw),packout(blkgb,ngfft(3)),unpackin(blkgb,ngfft(3)),unpackout(gnpw)
! real(dp),intent(in) :: fofgin(2,npwin)
 real(dp),intent(in) :: fofgin(2,mpi_enreg%mgblk*mpi_enreg%gngroup)
 real(dp),intent(inout) :: denpot(cplex*n4,n5,n6),fofr(2,n4,n5,n6)
! real(dp),intent(out) :: fofgout(2,npwout)
 real(dp),intent(out) :: fofgout(2,mpi_enreg%mgblk*mpi_enreg%gngroup)

!Local variables-------------------------------
!scalars
 integer,parameter :: mfac=11,mg=2048
 integer,save :: ic1,ic2,ic3,ic4,ic5,ic6,n1_save=0,n2_save=0,n3_save=0
 integer,save :: n4_save=0,n5_save=0,n6_save=0
 integer :: fftcache,g2max,g2min,i1,i1inv,i1max,i2,i3,i3inv,ier,ifftig,ig,igb,ipw,jgb
 integer :: igb_inv,igbmax,igbmin,ii,ii1,ii2,ii3,index,isign,lot,lotin,lotout,mgb,n1,ind
 integer :: n1half1,n1halfm,n1i,n2,n2half1,n3,n4half1,n5half1,nfftot
 integer :: nlot,ierr,i3min,i3max,i3step,wk1dtype,i,igbind,iproc,iiproc,bufoffset,maxgb
 integer :: ipwmax,ipwmin,jpw,ngb
 real(dp) :: ai,ar,fraction,norm,phai,phar,ris,sqr,wkim,wkre,ri,t1,t0
 character(len=500) :: message
!arrays
 integer,save :: aft1(mfac),aft2(mfac),aft3(mfac),aft4(mfac),aft5(mfac)
 integer,save :: aft6(mfac),bef1(mfac),bef2(mfac),bef3(mfac),bef4(mfac)
 integer,save :: bef5(mfac),bef6(mfac),ind1(mg),ind2(mg),ind3(mg),ind4(mg)
 integer,save :: ind5(mg),ind6(mg),now1(mfac),now2(mfac),now3(mfac),now4(mfac)
 integer,save :: now5(mfac),now6(mfac)
 integer :: gbound_dum(4)
 integer,allocatable :: indpw_k(:,:),ngbproc(:), scounts(:), rcounts(:), sdispls(:), rdispls(:)
 real(dp),save :: trig1(2,mg),trig2(2,mg),trig3(2,mg),trig4(2,mg),trig5(2,mg)
 real(dp),save :: trig6(2,mg),tsec(2)
 real(dp),allocatable :: pha1(:,:),pha2(:,:),pha3(:,:),wk1d_a(:,:,:,:),a2abuf(:),b2abuf(:)
 real(dp),allocatable :: wk1d_b(:,:,:,:),wk2d_a(:,:,:,:),wk2d_b(:,:,:,:)
 real(dp),allocatable :: wk2d_c(:,:,:,:),wk2d_d(:,:,:,:),work1(:,:,:,:),buffer(:)
 real(dp), external :: DDOT

#ifdef VMS
!DEC$ ATTRIBUTES ALIAS:'ZDSCAL' :: zdscal
#endif

 n1=ngfft(1) ; n2=ngfft(2) ; n3=ngfft(3) ; nfftot=n1*n2*n3
 fftcache=ngfft(8)

!#ifdef MPI
! gnpw=mpi_enreg%gmax-mpi_enreg%gmin+1
!#else
! gnpw=npwout
!#endif

#ifdef MPI_TRACE
 t0 = MPI_WTIME()
#endif

 ! the indices are not going to change during the calculation ... we could save
 ! them!
 allocate(indpw_k(4,npwin))
! allocate(indpw_kout(4,npwout))
 call indfftrisc(gboundin(3:3+2*mgfft+4,1),indpw_k,kg_kin,mgfft,ngb,ngfft,npwin)
! call indfftrisc(gboundout(3,1),indpw_kout,kg_kout,mgfft,ngbout,ngfft,npwout)

 !Define the dimension of the first work arrays, for 1D transforms along z ,
 !taking into account the need to avoid the cache trashing
! mgb=max(ngb,ngbout)
 mgb=ngb
 if(mod(mgb,2)/=1)mgb=mgb+1

!Compute auxiliary arrays needed for FFTs
 if(n1/=n1_save)then
  call sg_ctrig(n1,trig1,aft1,bef1,now1,1.0_dp,ic1,ind1,mfac,mg)
  call sg_ctrig(n1,trig4,aft4,bef4,now4,-1.0_dp,ic4,ind4,mfac,mg)
  n1_save=n1
 end if
 if(n2/=n2_save)then
  call sg_ctrig(n2,trig2,aft2,bef2,now2,1.0_dp,ic2,ind2,mfac,mg)
  call sg_ctrig(n2,trig5,aft5,bef5,now5,-1.0_dp,ic5,ind5,mfac,mg)
  n2_save=n2
 end if
 if(n3/=n3_save)then
  call sg_ctrig(n3,trig3,aft3,bef3,now3,1.0_dp,ic3,ind3,mfac,mg)
  call sg_ctrig(n3,trig6,aft6,bef6,now6,-1.0_dp,ic6,ind6,mfac,mg)
  n3_save=n3
 end if

#ifdef MPI_TRACE
 t1 = MPI_WTIME()
 if (mpi_enreg%me == 0) write(*,*) 'ctrig + indfftrisc', t1-t0
 t0 = MPI_WTIME()
#endif

 ! Note that the z transform will appear as a y transform
 allocate (wk1d_a(2,mgb,n3,1),wk1d_b(2,mgb,n3,1))

 ! this is necessary - the indices in wk1d_a which are not used
 ! (which are not in indpw_k) must be 0!!!!
 wk1d_a(:,:,:,:)=0.0_dp

 ! TODO: replace lotin/lotout with lot
 ! lotin is the number of xy-lines, each processor holds for the z-transform of
 ! the input wave function
 lotin=ngb/mpi_enreg%gngroup+1
! allocate(buffer(2*n3*lotin*mpi_enreg%gngroup))
! allocate(ngbproc(mpi_enreg%gngroup))
 allocate(scounts(mpi_enreg%gngroup))
 allocate(rcounts(mpi_enreg%gngroup))
 allocate(sdispls(mpi_enreg%gngroup))
 allocate(rdispls(mpi_enreg%gngroup))
! igbmin=mpi_enreg%gindex*lotin+1
! igbmax=min((mpi_enreg%gindex+1)*lotin,ngb)

 ! get number of xy-lines for each processor
 ! this whole loop is just to get the datasize for the ALLTOALL
 ! it is saved in maxgb afterwards ...
! ngbproc(:) = 0
! maxgb=0

#ifdef MPI_TRACE
 t1 = MPI_WTIME()
 if (mpi_enreg%me == 0) write(*,*) 'allocations + zeroing', t1-t0
 t0 = MPI_WTIME()
#endif

! ! the iiproc loop is to get the maximum of ngbproc across all processors!
! ! (this saves an ALLREDUCE :)
! do iiproc=0,mpi_enreg%gngroup-1
!  ipwmin=iiproc*mpi_enreg%mgblk+1
!  ipwmax=min((iiproc+1)*mpi_enreg%mgblk,npwin)
!  do iproc=0,mpi_enreg%gngroup-1
!   ! for each processor lower and upper bounds of igb
!   igbmin=iproc*lotin+1
!   igbmax=min((iproc+1)*lotin,ngb)
!   do igb=igbmin,igbmax ! for every line iproc needs
!    do ipw=ipwmin,ipwmax ! every npw iiproc has
!     if(indpw_k(4,ipw)==igb) then ! if iproc needs it from iiproc -> count it
!      ngbproc(iproc+1)=ngbproc(iproc+1)+1
!     end if
!    end do
!   end do
!   ! if iiproc sends to me (I have to receive from him): save it as rcount
!   if (iproc==mpi_enreg%gindex) rcounts(iiproc+1)=ngbproc(iproc+1)
!  end do
!  ! if this ngbproc belongs to (I send as iiproc): me save it as scount
!  if (iiproc==mpi_enreg%gindex) scounts(:)=ngbproc(:)
!  ! the maximum number of gb's over all iiproc - size for alltoall :)
!  if (maxgb<maxval(ngbproc)) maxgb=maxval(ngbproc)
!!  if (mpi_enreg%me==0) write (*,*) 'i', iiproc, '-', ngbproc(:)
! end do
! ! don't send (copy) to myself!
! rcounts(mpi_enreg%gindex+1)=0
! scounts(mpi_enreg%gindex+1)=0
!! if(mpi_enreg%gindex==0) write(*,*) 'scounts', mpi_enreg%me, scounts(:)
!! if(mpi_enreg%gindex==0) write(*,*) 'rcounts', mpi_enreg%me, rcounts(:)
 maxgb=maxgbin;

 allocate(a2abuf(2*maxgb*mpi_enreg%gngroup))
 allocate(b2abuf(2*maxgb*mpi_enreg%gngroup))

! allocate(packin(gnpw))
! ! fill packin array
! ipwmin=mpi_enreg%gmin
! ipwmax=mpi_enreg%gmax
! do iproc=0,mpi_enreg%gngroup-1
!  ! for each processor lower and upper bounds of igb
!  igbmin=iproc*lotin+1
!  igbmax=min((iproc+1)*lotin,ngb)
!  index=iproc*2*maxgb+1 ! index of data in a2abuf of iproc
!  do igb=igbmin,igbmax ! for every line that iproc needs
!   do ipw=ipwmin,ipwmax ! for every ipw I have
!    if(indpw_k(4,ipw)==igb) then ! if iproc needs it -> pack it
!     ! the gmin..gmax are as 1..gmax-gmin+1 in the fofgin array (it's distributed :)
!     jpw=ipw-mpi_enreg%gmin+1
!!     if (iproc==mpi_enreg%gindex) then ! if I am iproc - copy it directly
!!      i3=indpw_k(3,ipw)
!!      wk1d_a(1,igb,i3,1)=fofgin(1,jpw)
!!      wk1d_a(2,igb,i3,1)=fofgin(2,jpw)
!!     else ! pack the packet for iproc
!!      a2abuf(index)=fofgin(1,jpw)
!!      a2abuf(index+1)=fofgin(2,jpw)
!      packin(jpw)=index;
!      index=index+2
!!     end if
!    end if
!   end do
!  end do
!  ! this could be enhanced to save memory per node ...
!  ! currently we use just the maximum stride (for ease)
!!  sdispls(iproc+1)=iproc*maxgb
!!  rdispls(iproc+1)=iproc*maxgb
! end do

 ! fill a2abuf
 do jpw=1,gnpw
  index=packin(jpw)

  a2abuf(index)=fofgin(1,jpw)
  a2abuf(index+1)=fofgin(2,jpw)
 end do

#ifdef MPI_TRACE
 t1 = MPI_WTIME()
 if (mpi_enreg%me == 0) write(*,*) 'alloc + pack a2abuf', t1-t0
 t0 = MPI_WTIME()
#endif

#ifdef MPI
          call MPI_ALLTOALL(a2abuf, maxgb, MPI_DOUBLE_COMPLEX, b2abuf, maxgb, &
&          MPI_DOUBLE_COMPLEX, mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
!         call MPI_ALLTOALLV(a2abuf, scounts, sdispls, MPI_DOUBLE_COMPLEX, b2abuf, rcounts, rdispls, &
!&         MPI_DOUBLE_COMPLEX, mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif

#ifdef MPI_TRACE
 t1 = MPI_WTIME()
 if (mpi_enreg%me == 0) write(*,*) 'allreduce', t1-t0
 t0 = MPI_WTIME()
#endif

igbmin=mpi_enreg%gindex*lotin+1
igbmax=min((mpi_enreg%gindex+1)*lotin,ngb)
! do iproc=0,mpi_enreg%gngroup-1
!!  if (iproc==mpi_enreg%gindex) cycle ! if I am iproc, its already in wk1d_a()
!  ! for each processor lower and upper bounds of igb
!  ipwmin=iproc*mpi_enreg%mgblk+1
!  ipwmax=min((iproc+1)*mpi_enreg%mgblk,npwin)
!  index=iproc*2*maxgb+1 ! index of data in a2abuf of iproc
!  do igb=igbmin,igbmax ! for every line that I need
!   do ipw=ipwmin,ipwmax ! for every ipw iproc had (and sent me)
!    if(indpw_k(4,ipw)==igb) then ! if I need it -> unpack it
!     i3=indpw_k(3,ipw)
!     wk1d_a(1,igb,i3,1)=b2abuf(index)
!     wk1d_a(2,igb,i3,1)=b2abuf(index+1)
!     index=index+2
!    end if
!   end do
!  end do
! end do
 do igb=igbmin,igbmax
  do i3=1,n3
   jgb=igb-igbmin+1
   index=unpackin(jgb,i3)

   ! if i3 is not used (is this likely?)
   if(index/=0) then
    wk1d_a(1,igb,i3,1)=b2abuf(index)
    wk1d_a(2,igb,i3,1)=b2abuf(index+1)
   end if
   ! here maybe else wk1d_a(...) = 0 to omit the zeroing at the beginning?
  end do
 end do
 deallocate(a2abuf,b2abuf)

#ifdef MPI_TRACE
 t1 = MPI_WTIME()
 if (mpi_enreg%me == 0) write(*,*) 'unpack', t1-t0
 t0 = MPI_WTIME()
#endif

 call sg_ffty(fftcache,mfac,mg,mgb,n3,1,igbmin,igbmax,1,1,wk1d_a,wk1d_b, &
 &    trig3,aft3,now3,bef3,1.0_dp,ind3,ic3)

#ifdef MPI_TRACE
 t1 = MPI_WTIME()
 if (mpi_enreg%me == 0) write(*,*) 'sg_ffty (z planes)', t1-t0
 t0 = MPI_WTIME()
#endif

 ! get number of z-lines for each processor
 ! it is saved in maxgb afterwards ...
 i3step = n3/mpi_enreg%gngroup+1
 ! we don't need to check the indpw_k() array because we just transpose i3
 ! we need to store a block of i3step*lotin
 maxgb=i3step*lotin
 allocate(a2abuf(2*maxgb*mpi_enreg%gngroup))
 allocate(b2abuf(2*maxgb*mpi_enreg%gngroup))
 ! fill the all2all communication buffer a2abuf
 i3step = n3/mpi_enreg%gngroup+1
 igbmin=mpi_enreg%gindex*lotin+1
 Igbmax=min((mpi_enreg%gindex+1)*lotin,ngb)
 do iproc=0,mpi_enreg%gngroup-1
  ! if I am iproc, I have it already in wk1d_b()
  if (iproc==mpi_enreg%gindex) cycle
  ! for each processor lower and upper bounds of i3
  i3min=iproc*i3step+1
  i3max=min((iproc+1)*i3step,n3)
  index=iproc*2*maxgb+1 ! index of data in a2abuf of iproc
  do igb=igbmin,igbmax ! for every xy-line that I have
   do i3=i3min,i3max ! for every z-plane iproc needs
    ! pack data in a2abuf
    a2abuf(index)=wk1d_b(1,igb,i3,1)
    a2abuf(index+1)=wk1d_b(2,igb,i3,1)
    index=index+2
   end do
  end do
  ! initialize displacements
  sdispls(iproc+1)=iproc*maxgb
  rdispls(iproc+1)=iproc*maxgb
 end do
 ! this is only to avoid local copying in ALLTOALL (because there is no
 ! MPI_IN_PLACE - which would really be helpful here)
 ! send the same amount of data to all others
 rcounts(:)=maxgb
 scounts(:)=maxgb
 ! don't send (copy) to myself!
 rcounts(mpi_enreg%gindex+1)=0
 scounts(mpi_enreg%gindex+1)=0

#ifdef MPI_TRACE
 t1 = MPI_WTIME()
 if (mpi_enreg%me == 0) write(*,*) 'pack a2abuf', t1-t0
 t0 = MPI_WTIME()
#endif


#ifdef MPI
          call MPI_ALLTOALL(a2abuf, maxgb, MPI_DOUBLE_COMPLEX, b2abuf, maxgb, &
&          MPI_DOUBLE_COMPLEX, mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
!         call MPI_ALLTOALLV(a2abuf, scounts, sdispls, MPI_DOUBLE_COMPLEX, b2abuf, rcounts, rdispls, &
!&         MPI_DOUBLE_COMPLEX, mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif

#ifdef MPI_TRACE
 t1 = MPI_WTIME()
 if (mpi_enreg%me == 0) write(*,*) 'alltoall', t1-t0
 t0 = MPI_WTIME()
#endif

 ! fill the all2all communication buffer a2abuf
 i3step = n3/mpi_enreg%gngroup+1
 i3min=mpi_enreg%gindex*i3step+1
 i3max=min((mpi_enreg%gindex+1)*i3step,n3)
 do iproc=0,mpi_enreg%gngroup-1
  ! if I am iproc, I have it already in wk1d_b()
  if (iproc==mpi_enreg%gindex) cycle
  ! for each processor lower and upper bounds of i3
  igbmin=iproc*lotin+1
  igbmax=min((iproc+1)*lotin,ngb)
  index=iproc*2*maxgb+1 ! index of data in a2abuf of iproc
  do igb=igbmin,igbmax ! for every xy-line that iproc has (and sent me)
   do i3=i3min,i3max ! for every z-plane I need
     ! unpack data to wk1d_b
     wk1d_b(1,igb,i3,1)=b2abuf(index)
     wk1d_b(2,igb,i3,1)=b2abuf(index+1)
     index=index+2
   end do
  end do
 end do
 deallocate(a2abuf,b2abuf)

#ifdef MPI_TRACE
 t1 = MPI_WTIME()
 if (mpi_enreg%me == 0) write(*,*) 'unpack b2abuf', t1-t0
 t0 = MPI_WTIME()
#endif

 ! Allocate two 2-dimensional work arrays
 allocate(wk2d_a(2,n4,n5,1),wk2d_b(2,n4,n5,1))
  do i3=i3min,i3max
  ! Zero the values on the current plane
  wk2d_a(:,1:n1,1:n2,1)=0.0_dp

  ! Copy the data in the current plane
  ! TODO: we should avoid this? Do this during a2a unpack :)
  do igb=1,ngb
   i1=indpw_k(1,igb) ; i2=indpw_k(2,igb)
   wk2d_a(1,i1,i2,1)=wk1d_b(1,igb,i3,1)
   wk2d_a(2,i1,i2,1)=wk1d_b(2,igb,i3,1)
  end do
  ! Perform x transform, taking into account arrays of zeros
  g2min=gboundin(3,1) ; g2max=gboundin(4,1)
  if ( g2min+n2 >= g2max+2 ) then
   do i2=g2max+2,g2min+n2
    wk2d_b(:,1:n1,i2,1)=0.0_dp
   end do
  end if
  gbound_dum(1)=1 ; gbound_dum(2)=1
  gbound_dum(3)=g2min ; gbound_dum(4)=g2max
  call sg_fftpx(fftcache,mfac,mg,0,n4,n5,1,n2,1,wk2d_a,wk2d_b,&
  &    trig1,aft1,now1,bef1,1.0_dp,ind1,ic1,gbound_dum)
  ! Perform y transform
  n1i=1
  call sg_ffty(fftcache,mfac,mg,n4,n5,1,n1i,n1,1,1,wk2d_b,wk2d_a, &
  &    trig2,aft2,now2,bef2,1.0_dp,ind2,ic2)
  ! The wave function is now in real space, for the current plane

  ! Apply local potential
  do i2=1,n2
   do i1=1,n1
    wk2d_a(1,i1,i2,1)=denpot(i1,i2,i3)*wk2d_a(1,i1,i2,1)
    wk2d_a(2,i1,i2,1)=denpot(i1,i2,i3)*wk2d_a(2,i1,i2,1)
   end do
  end do

  ! Perform y transform
  n1i=1
  call sg_ffty(fftcache,mfac,mg,n4,n5,1,n1i,n1,1,1,wk2d_a,wk2d_b, &
  &    trig5,aft5,now5,bef5,-1.0_dp,ind5,ic5)
  ! Perform x transform, taking into account arrays of zeros
  gbound_dum(1)=1 ; gbound_dum(2)=1
  gbound_dum(3)=gboundout(3,1) ; gbound_dum(4)=gboundout(4,1)
  call sg_fftpx(fftcache,mfac,mg,0,n4,n5,1,n2,1,wk2d_b,wk2d_a,&
  &    trig4,aft4,now4,bef4,-1.0_dp,ind4,ic4,gbound_dum)
  ! Copy the data from the current plane to wk1d_b
  ! TODO: this should be done directly during packing :)
  do igb=1,ngb
   i1=indpw_k(1,igb) ; i2=indpw_k(2,igb)
   wk1d_b(1,igb,i3,1)=wk2d_a(1,i1,i2,1)
   wk1d_b(2,igb,i3,1)=wk2d_a(2,i1,i2,1)
  end do

  ! End loop on planes
  end do
! end do
 deallocate(wk2d_a,wk2d_b)

#ifdef MPI_TRACE
 t1 = MPI_WTIME()
 if (mpi_enreg%me == 0) write(*,*) 'xy trans, apply, xy trans', t1-t0
 t0 = MPI_WTIME()
#endif

 i3step = n3/mpi_enreg%gngroup+1
 ! we don't need to check the indpw_k() array because we just transpose i3
 ! we need to store a block of i3step*lotin
 maxgb=i3step*lotin
 allocate(a2abuf(2*maxgb*mpi_enreg%gngroup))
 allocate(b2abuf(2*maxgb*mpi_enreg%gngroup))
 ! fill the all2all communication buffer a2abuf
 i3step = n3/mpi_enreg%gngroup+1
 i3min=mpi_enreg%gindex*i3step+1
 i3max=min((mpi_enreg%gindex+1)*i3step,n3)
 do iproc=0,mpi_enreg%gngroup-1
  ! if I am iproc, I have it already in wk1d_b()
  if (iproc==mpi_enreg%gindex) cycle
  ! for each processor lower and upper bounds of igb
  igbmin=iproc*lotin+1
  igbmax=min((iproc+1)*lotin,ngb)
  index=iproc*2*maxgb+1 ! index of data in a2abuf of iproc
  do igb=igbmin,igbmax ! for every xy-line that iproc needs
   do i3=i3min,i3max ! for every z-plane I have
     ! pack data in a2abuf
     a2abuf(index)=wk1d_b(1,igb,i3,1)
     a2abuf(index+1)=wk1d_b(2,igb,i3,1)
     index=index+2
   end do
  end do
  ! initialize displacements
  sdispls(iproc+1)=iproc*maxgb
  rdispls(iproc+1)=iproc*maxgb
 end do
 ! this is only to avoid local copying in ALLTOALL (because there is no
 ! MPI_IN_PLACE - which would really be helpful here)
 ! send the same amount of data to all others
 rcounts(:)=maxgb
 scounts(:)=maxgb
 ! don't send (copy) to myself!
 rcounts(mpi_enreg%gindex+1)=0
 scounts(mpi_enreg%gindex+1)=0

#ifdef MPI_TRACE
 t1 = MPI_WTIME()
 if (mpi_enreg%me == 0) write(*,*) 'pack a2abuf', t1-t0
 t0 = MPI_WTIME()
#endif

#ifdef MPI
          call MPI_ALLTOALL(a2abuf, maxgb, MPI_DOUBLE_COMPLEX, b2abuf, maxgb, &
&          MPI_DOUBLE_COMPLEX, mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
!         call MPI_ALLTOALLV(a2abuf, scounts, sdispls, MPI_DOUBLE_COMPLEX, b2abuf, rcounts, rdispls, &
!&         MPI_DOUBLE_COMPLEX, mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif

#ifdef MPI_TRACE
 t1 = MPI_WTIME()
 if (mpi_enreg%me == 0) write(*,*) 'alltoall', t1-t0
 t0 = MPI_WTIME()
#endif

 ! fill the all2all communication buffer a2abuf
 lotout=ngb/mpi_enreg%gngroup+1
 igbmin=mpi_enreg%gindex*lotout+1
 igbmax=min((mpi_enreg%gindex+1)*lotout,ngb)
 do iproc=0,mpi_enreg%gngroup-1
  ! if I am iproc, I have it already in wk1d_b()
  if (iproc==mpi_enreg%gindex) cycle
  ! for each processor lower and upper bounds of i3
  i3min=iproc*i3step+1
  i3max=min((iproc+1)*i3step,n3)
  index=iproc*2*maxgb+1 ! index of data in a2abuf of iproc
  do igb=igbmin,igbmax ! for every xy-line that I need
   do i3=i3min,i3max ! for every z-plane iproc has (and sent)
    ! unpack data to wk1d_b
    wk1d_b(1,igb,i3,1)=b2abuf(index)
    wk1d_b(2,igb,i3,1)=b2abuf(index+1)
    index=index+2
   end do
  end do
 end do
 deallocate(a2abuf,b2abuf)

#ifdef MPI_TRACE
 t1 = MPI_WTIME()
 if (mpi_enreg%me == 0) write(*,*) 'unpack b2abuf', t1-t0
 t0 = MPI_WTIME()
#endif

 call sg_ffty(fftcache,mfac,mg,mgb,n3,1,igbmin,igbmax,1,1,wk1d_b,wk1d_a, &
 &    trig6,aft6,now6,bef6,-1.0_dp,ind6,ic6)

#ifdef MPI_TRACE
 t1 = MPI_WTIME()
 if (mpi_enreg%me == 0) write(*,*) 'sg_ffty in z', t1-t0
 t0 = MPI_WTIME()
#endif

! ! get number of xy-lines for each processor
! ! this whole loop is just to get the datasize for the ALLTOALL
! ! it is saved in maxgb afterwards ...
! ngbproc(:) = 0
! maxgb = 0
! do iiproc=0,mpi_enreg%gngroup-1
!  ! for each processor lower and upper bounds of igb
!  igbmin=iiproc*lotout+1
!  igbmax=min((iiproc+1)*lotout,ngb)
!  do iproc=0,mpi_enreg%gngroup-1
!   ipwmin=iproc*mpi_enreg%mgblk+1
!   ipwmax=min((iproc+1)*mpi_enreg%mgblk,npwout)
!   do igb=igbmin,igbmax ! for every line iiproc has
!    do ipw=ipwmin,ipwmax ! every npw iproc needs
!     if(indpw_k(4,ipw)==igb) then ! if iproc needs it -> count it
!      ngbproc(iproc+1)=ngbproc(iproc+1)+1
!     end if
!    end do
!   end do
!   ! if iiproc sends to me (I have to receive from him): save it as rcount
!   if (iproc==mpi_enreg%gindex) rcounts(iiproc+1)=ngbproc(iproc+1)
!  end do
!  ! if this ngbproc belongs to me (I send as iiproc):  save it as scount
!  if (iiproc==mpi_enreg%gindex) scounts(:)=ngbproc(:)
!  ! the maximum number of gb's over all iiproc - size for alltoall :)
!  if (maxgb<maxval(ngbproc)) maxgb=maxval(ngbproc)
! end do
! ! don't send (copy) to myself!
! rcounts(mpi_enreg%gindex+1)=0
! scounts(mpi_enreg%gindex+1)=0
!! write(*,*) 'scounts', mpi_enreg%me, scounts(:)
!! write(*,*) 'rcounts', mpi_enreg%me, rcounts(:)

 maxgb=maxgbout
 allocate(a2abuf(2*maxgb*mpi_enreg%gngroup))
 allocate(b2abuf(2*maxgb*mpi_enreg%gngroup))
! ! fill a2abuf
 igbmin=mpi_enreg%gindex*lotout+1
 igbmax=min((mpi_enreg%gindex+1)*lotout,ngb)
! do iproc=0,mpi_enreg%gngroup-1
!  ! for each processor lower and upper bounds of ipw
!  ipwmin=iproc*mpi_enreg%mgblk+1
!  ipwmax=min((iproc+1)*mpi_enreg%mgblk,npwout)
!  index=iproc*2*maxgb+1 ! index of data in a2abuf of iproc
!  do igb=igbmin,igbmax ! for every line I have
!   do ipw=ipwmin,ipwmax ! for every ipw iproc needs
!    if(indpw_k(4,ipw)==igb) then ! if iproc needs it -> pack it
!     i3=indpw_k(3,ipw)
!!     if (iproc==mpi_enreg%gindex) then ! if I am iproc - copy it directly
!!      jpw=ipw-ipwmin+1
!!      fofgout(1,jpw)=wk1d_a(1,igb,i3,1)
!!      fofgout(2,jpw)=wk1d_a(2,igb,i3,1)
!!     else ! pack the packet for iproc
!      a2abuf(index)=wk1d_a(1,igb,i3,1)
!      a2abuf(index+1)=wk1d_a(2,igb,i3,1)
!      index=index+2
!!     end if
!    end if
!   end do
!  end do
!  ! initialize displacements
!  sdispls(iproc+1)=iproc*maxgb
!  rdispls(iproc+1)=iproc*maxgb
! end do

 do igb=igbmin,igbmax
  do i3=1,n3
   jgb=igb-igbmin+1
   index=packout(jgb,i3)

   if(index/=0) then
    a2abuf(index)=wk1d_a(1,igb,i3,1)
    a2abuf(index+1)=wk1d_a(2,igb,i3,1)
   end if
  end do
 end do

#ifdef MPI_TRACE
 t1 = MPI_WTIME()
 if (mpi_enreg%me == 0) write(*,*) 'pack a2abuf', t1-t0
 t0 = MPI_WTIME()
#endif

#ifdef MPI
          call MPI_ALLTOALL(a2abuf, maxgb, MPI_DOUBLE_COMPLEX, b2abuf, maxgb, &
&          MPI_DOUBLE_COMPLEX, mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
!         call MPI_ALLTOALLV(a2abuf, scounts, sdispls, MPI_DOUBLE_COMPLEX, b2abuf, rcounts, rdispls, &
!&         MPI_DOUBLE_COMPLEX, mpi_enreg%gmpicomm(mpi_enreg%ggroup), ierr)
#endif

#ifdef MPI_TRACE
 t1 = MPI_WTIME()
 if (mpi_enreg%me == 0) write(*,*) 'alltoall', t1-t0
 t0 = MPI_WTIME()
#endif

! ipwmin=mpi_enreg%gmin
! ipwmax=mpi_enreg%gmax
! do iproc=0,mpi_enreg%gngroup-1
!  ! if I am iproc - I have it already in fofgout()
!  if (iproc==mpi_enreg%gindex) cycle
!  ! for each processor lower and upper bounds of igb
!  igbmin=iproc*lotout+1
!  igbmax=min((iproc+1)*lotout,ngb)
!  index=iproc*2*maxgb+1 ! index of data in a2abuf of iproc
!  do igb=igbmin,igbmax ! for every line that iproc has (and sent)
!   do ipw=ipwmin,ipwmax ! for every ipw I need
!    if(indpw_k(4,ipw)==igb) then ! if I need it -> unpack it
!     jpw=ipw-ipwmin+1
!     fofgout(1,jpw)=b2abuf(index)
!     fofgout(2,jpw)=b2abuf(index+1)
!     index=index+2
!    end if
!   end do
!  end do
! end do
 ! fill fofgout
 do jpw=1,gnpw
  index=unpackout(jpw)

  fofgout(1,jpw)=b2abuf(index)
  fofgout(2,jpw)=b2abuf(index+1)
 end do

 deallocate(a2abuf,b2abuf)

#ifdef MPI_TRACE
 t1 = MPI_WTIME()
 if (mpi_enreg%me == 0) write(*,*) 'unpack b2abuf', t1-t0
 t0 = MPI_WTIME()
#endif

 norm=1._dp/dble(nfftot)
 call ZDSCAL(gnpw, norm, fofgout, 1)

 deallocate(wk1d_a,wk1d_b)
 deallocate(indpw_k)
! deallocate(indpw_kout)
! deallocate(ngbproc)
 deallocate(scounts)
 deallocate(rcounts)
 deallocate(sdispls)
 deallocate(rdispls)

#ifdef MPI_TRACE
 t1 = MPI_WTIME()
 if (mpi_enreg%me == 0) write(*,*) '******** cleanup ******', t1-t0
 t0 = MPI_WTIME()
#endif

! DEBUG
! write(6,*)' sg_fftrisc : exit '
! stop
! ENDDEBUG

end subroutine sg_fftrisc_htor
!!***
