module m_phase1

  implicit none

contains

  subroutine phase1( nx, ny, ival1, oval0, oval1, oval2, wnx, wny, spgix, spgiy )

  use Math_Const
  use ffts

  implicit none

  integer, intent(in) :: nx, ny
  real, dimension(nx,ny), intent(in) :: ival1
  real, dimension(nx,ny), intent(out) :: oval0, oval1, oval2
  integer, intent(in) :: wnx, wny
  integer, intent(in) :: spgix, spgiy

  integer, parameter :: ifft=5
  integer :: i, j
  real :: t1, t2, t3
  integer, dimension(ifft) :: pxfact, pyfact, pxsfact, pysfact
  complex, dimension(nx,ny) :: sp1, sp2, spi1, spi2
  complex, dimension(nx/spgix,ny/spgiy) :: cpval1, cpval2, sps1, sps2, spis1, spis2
  complex, allocatable, dimension(:,:) :: omegaxbr, omegaybr, omegaxbi, omegaybi
  complex, allocatable, dimension(:,:) :: omegaxnr, omegaynr, omegaxni, omegayni
  complex, allocatable, dimension(:,:) :: omegaxbrs, omegaybrs, omegaxbis, omegaybis
  complex, allocatable, dimension(:,:) :: omegaxnrs, omegaynrs, omegaxnis, omegaynis

  call rotate_array_f()
  call prim_calc( nx, pxfact(1:4), pxfact(5) )
  call prim_calc( ny, pyfact(1:4), pyfact(5) )
  call prim_calc( nx/spgix, pxsfact(1:4), pxsfact(5) )
  call prim_calc( ny/spgiy, pysfact(1:4), pysfact(5) )

  allocate(omegaxnr(0:nx-1,0:nx-1))
  allocate(omegaynr(0:ny-1,0:ny-1))
  allocate(omegaxni(0:nx-1,0:nx-1))
  allocate(omegayni(0:ny-1,0:ny-1))
  allocate(omegaxbr(0:pxfact(5)-1,0:pxfact(5)-1))
  allocate(omegaybr(0:pyfact(5)-1,0:pyfact(5)-1))
  allocate(omegaxbi(0:pxfact(5)-1,0:pxfact(5)-1))
  allocate(omegaybi(0:pyfact(5)-1,0:pyfact(5)-1))
  allocate(omegaxnrs(0:nx/spgix-1,0:nx/spgix-1))
  allocate(omegaynrs(0:ny/spgiy-1,0:ny/spgiy-1))
  allocate(omegaxnis(0:nx/spgix-1,0:nx/spgix-1))
  allocate(omegaynis(0:ny/spgiy-1,0:ny/spgiy-1))
  allocate(omegaxbrs(0:pxsfact(5)-1,0:pxsfact(5)-1))
  allocate(omegaybrs(0:pysfact(5)-1,0:pysfact(5)-1))
  allocate(omegaxbis(0:pxsfact(5)-1,0:pxsfact(5)-1))
  allocate(omegaybis(0:pysfact(5)-1,0:pysfact(5)-1))

  call rotate_calc( nx, 'r', pxfact,  &
  &               omegaxbr(0:pxfact(5)-1,0:pxfact(5)-1),  &
  &               omegaxnr(0:nx-1,0:nx-1) )
  call rotate_calc( nx, 'i', pxfact,  &
  &               omegaxbi(0:pxfact(5)-1,0:pxfact(5)-1),  &
  &               omegaxni(0:nx-1,0:nx-1) )
  call rotate_calc( ny, 'r', pyfact,  &
  &               omegaybr(0:pyfact(5)-1,0:pyfact(5)-1),  &
  &               omegaynr(0:ny-1,0:ny-1) )
  call rotate_calc( ny, 'i', pyfact,  &
  &               omegaybi(0:pyfact(5)-1,0:pyfact(5)-1),  &
  &               omegayni(0:ny-1,0:ny-1) )

  call rotate_calc( nx/spgix, 'r', pxsfact,  &
  &               omegaxbrs(0:pxsfact(5)-1,0:pxsfact(5)-1),  &
  &               omegaxnrs(0:nx/spgix-1,0:nx/spgix-1) )
  call rotate_calc( nx/spgix, 'i', pxsfact,  &
  &               omegaxbis(0:pxsfact(5)-1,0:pxsfact(5)-1),  &
  &               omegaxnis(0:nx/spgix-1,0:nx/spgix-1) )
  call rotate_calc( ny/spgiy, 'r', pysfact,  &
  &               omegaybrs(0:pysfact(5)-1,0:pysfact(5)-1),  &
  &               omegaynrs(0:ny/spgiy-1,0:ny/spgiy-1) )
  call rotate_calc( ny/spgiy, 'i', pysfact,  &
  &               omegaybis(0:pysfact(5)-1,0:pysfact(5)-1),  &
  &               omegaynis(0:ny/spgiy-1,0:ny/spgiy-1) )

  do j=1,ny/spgiy
     do i=1,nx/spgix
        cpval1(i,j)=ival1((i-1)*spgix+1,(j-1)*spgiy+1)
     end do
  end do

  call cpu_time(t1)

  call ffttp_2d( nx/spgix, ny/spgiy,  &
  &              cpval1(1:nx/spgix,1:ny/spgiy),  &
  &              cpval2(1:nx/spgix,1:ny/spgiy),  &
  &              'r', 'o', prim_factx=pxsfact, prim_facty=pysfact,  &
  &              omegax_fix=omegaxbrs, omegaxn_fix=omegaxnrs,  &
  &              omegay_fix=omegaybrs, omegayn_fix=omegaynrs )

  call cpu_time(t2)

  write(*,*) "cpu_time is ", t2-t1, "[sec]."

  sp1=0.0
  sp2=0.0

!-- WN0
  sp1(1,1)=cpval2(1,1)
  sp1(nx,ny)=cpval2(nx/spgix,ny/spgiy)

  sp2(1,1)=cpval2(1,1)
  sp2(nx,ny)=cpval2(nx/spgix,ny/spgiy)

  do j=2,wny+1
     sp2(1,j)=cpval2(1,j)
     sp2(1,ny-j+2)=cpval2(1,ny/spgiy-j+2)
  end do

  do i=2,wnx+1
     sp2(i,1)=cpval2(i,1)
     sp2(nx-i+2,1)=cpval2(nx/spgix-i+2,1)
  end do

  do j=2,wny+1
     do i=2,wnx+1
        sp2(i,j)=cpval2(i,j)
        sp2(nx-i+2,ny-j+2)=cpval2(nx/spgix-i+2,ny/spgiy-j+2)
        sp2(i,ny-j+2)=cpval2(i,ny/spgiy-j+2)
        sp2(nx-i+2,j)=cpval2(nx/spgix-i+2,j)
     end do
  end do

!  call ffttp_2d( nx, ny, cpval2(1:nx,1:ny), cpval1(1:nx,1:ny),  &
!  &              'i', 'o', prim_factx=pxfact, prim_facty=pyfact,  &
!  &              omegax_fix=omegaxbi, omegaxn_fix=omegaxni,  &
!  &              omegay_fix=omegaybi, omegayn_fix=omegayni )

  call ffttp_2d( nx, ny, sp1(1:nx,1:ny), spi1(1:nx,1:ny),  &
  &              'i', 'o', prim_factx=pxfact, prim_facty=pyfact,  &
  &              omegax_fix=omegaxbi, omegaxn_fix=omegaxni,  &
  &              omegay_fix=omegaybi, omegayn_fix=omegayni )

  call ffttp_2d( nx, ny, sp2(1:nx,1:ny), spi2(1:nx,1:ny),  &
  &              'i', 'o', prim_factx=pxfact, prim_facty=pyfact,  &
  &              omegax_fix=omegaxbi, omegaxn_fix=omegaxni,  &
  &              omegay_fix=omegaybi, omegayn_fix=omegayni )

  do j=1,ny
     do i=1,nx
        oval0(i,j)=real(ival1(i,j))
        oval1(i,j)=real(spi1(i,j))
        oval2(i,j)=real(spi2(i,j))
     end do
  end do

  end subroutine phase1

end module m_phase1

program advanced2

!-- advanced1 Ʊץ. , FFT ѴΥǡǤտǴְ.
!-- ¾ n ʬ MPI ץˤץ.
!-- ꥸʥǡκ,  root ץǹԤ.

  use m_phase1
  use mpi
  use dcl
  use dcl_automatic

  implicit none

  integer, parameter :: nx=1400, ny=800
  integer, parameter :: spgix=10, spgiy=10
  integer, parameter :: npx=2, npy=2
  integer, parameter :: root=0

  integer :: i, j, icounter, ip, jp
  integer :: wnx, wny
  integer :: MY_RANK, PETOT, IERROR   ! For MPI variables
  real :: x(nx), y(ny)
  real, allocatable, dimension(:,:) :: valp1
  real, allocatable, dimension(:) :: totsp1, totsp2
  real, dimension(nx,ny) :: ival1, oval0, oval1, oval2

!-- Initializing MPI

  call MPI_INIT( IERROR )

!-- Getting total process number and oneself process ID.

  call MPI_COMM_RANK( MPI_COMM_WORLD, MY_RANK, IERROR )
  call MPI_COMM_SIZE( MPI_COMM_WORLD, PETOT, IERROR )

  allocate(totsp1(nx*ny))
  allocate(totsp2((nx/npx)*(ny/npy)))

  if(MY_RANK==root)then

     wnx=7
     wny=4

     do j=1,ny
        do i=1,nx
!           ival1(i,j)=exp(-(((real(i)-0.5*real(nx))/real(nx))**2  &
!  &                        +((real(j)-0.5*real(ny))/real(ny))**2))
           ival1(i,j)=exp(-(((real(i))/real(nx))**2  &
  &                        +((real(j))/real(ny))**2))
        end do
     end do

     x=(/((real(i)/real(nx)),i=1,nx)/)
     y=(/((real(j)/real(ny)),j=1,ny)/)

!-- rearranging from data array to comm array

     icounter=0

     do jp=1,npy
        do ip=1,npx
           do j=1,ny/npy
              do i=1,nx/npx
                 icounter=icounter+1
                 totsp1(icounter)=ival1((ip-1)*nx/npx+i,(jp-1)*ny/npy+j)
              end do
           end do
        end do
     end do

  end if

  call MPI_BARRIER( MPI_COMM_WORLD, IERROR )

!-- splitting the whole data to partial data

  call MPI_SCATTER( totsp1, (nx/npx)*(ny/npy), MPI_REAL,  &
  &                 totsp2, (nx/npx)*(ny/npy), MPI_REAL,  &
  &                 root, MPI_COMM_WORLD, IERROR )

  call MPI_BARRIER( MPI_COMM_WORLD, IERROR )

!-- rearranging from comm array to data array

  if(MY_RANK==root)then
     totsp1=0.0
     ival1=0.0
  end if

  allocate(valp1(nx/npx,ny/npy))
  icounter=0

  do j=1,ny/npy
     do i=1,nx/npx
        icounter=icounter+1
        valp1(i,j)=totsp2(icounter)
     end do
  end do

  icounter=0

  do j=1,ny/npy
     do i=1,nx/npx
        icounter=icounter+1
        totsp2(icounter)=valp1(i,j)
     end do
  end do

  call MPI_BARRIER( MPI_COMM_WORLD, IERROR )

!-- gathering the partial data to the whole data

  call MPI_GATHER( totsp2, (nx/npx)*(ny/npy), MPI_REAL,  &
  &                totsp1, (nx/npx)*(ny/npy), MPI_REAL,  &
  &                root, MPI_COMM_WORLD, IERROR )

  call MPI_BARRIER( MPI_COMM_WORLD, IERROR )

  if(MY_RANK==root)then
!-- rearranging from comm array to data array

     icounter=0

     do jp=1,npy
        do ip=1,npx
           do j=1,ny/npy
              do i=1,nx/npx
                 icounter=icounter+1
                 ival1((ip-1)*nx/npx+i,(jp-1)*ny/npy+j)=totsp1(icounter)
              end do
           end do
        end do
     end do

     call phase1( nx, ny, ival1, oval0, oval1, oval2, wnx, wny, spgix, spgiy )

!------------- DCL
     call SGISET('IFONT',2 )
     CALL GLRSET( 'RMISS', -999.0 )
     CALL GLLSET( 'LMISS', .TRUE. )
     call UZFACT(0.75)

     call color_setting( 80, (/0.0, 1.0/), min_tab=15999,  &
  &                      max_tab=95999, col_min=10, col_max=99 )

     call DclOpenGraphics(1)

     call Dcl_2D_cont_shade( 'Full',  &
  &                          x, y, ival1, oval0,  &
  &                          (/0.0, 1.0/), (/0.0, 1.0e-4/),  &
  &                          (/'X (km)', 'Y (km)'/),  &
  &                          (/'(f6.1)', '(f6.1)'/),  &
  &                          c_num=(/10, 80/), no_tone=.true. )

     call Dcl_2D_cont_shade( 'WN0',  &
  &                          x, y, ival1, oval1,  &
  &                          (/0.0, 1.0/), (/0.0, 1.0e-4/),  &
  &                          (/'X (km)', 'Y (km)'/),  &
  &                          (/'(f6.1)', '(f6.1)'/),  &
  &                          c_num=(/10, 80/), no_tone=.true. )

     call Dcl_2D_cont_shade( 'WN1',  &
  &                          x, y, ival1, oval2,  &
  &                          (/0.0, 1.0/), (/0.0, 1.0e-4/),  &
  &                          (/'X (km)', 'Y (km)'/),  &
  &                          (/'(f6.1)', '(f6.1)'/),  &
  &                          c_num=(/10, 80/), no_tone=.true. )

     call DclCloseGraphics

  end if

!-- finishing MPI process

  call MPI_FINALIZE( IERROR )

end program advanced2
