module solver
    use const
    use var
    use comm
    use io
implicit real*8 (a-h,o-z)

contains

subroutine init_solver
    ! -------------------------
    ! evp elliptic solver data
    ! used for surface pressure
    ! -------------------------
!    open(99,file='prepglo/data/evp'//grd,form='unformatted')
    if(lsolver.eq.1) then
	if(myid.eq.0) write(*,*)'evp solver'
!	read(99) al,ar,ab,at,ac,rinv,rinv1,ie
!	close(99)
	elseif(lsolver.eq.2) then
	if(myid.eq.0) write(*,*)'bi-cgstab solver'
        call rdnc_evp
!	read(99) al,ar,ab,at,ac,cl,cb,cc,cr,ct
!	close(99)
    else
	if(myid.eq.0)write(*,*)'lsolver=',lsolver,' (incorrect solver)'
	call mpi_finalize(ierr)
	stop
    endif
end subroutine



! *****************
! elliptic solver *
! *****************
! ----------------------------------------------------------------------
subroutine p_bicgstab_le(ab,al,ac,ar,at,b,x,cb,cl,cc,cr,ct,r,rh,p,v,s,t,ph,sh)
    parameter(i0j0=i0*j0)
    real*8 x,r,rh,p,v,s,t,ph,sh
    real*8 rho,rhn,alpha,beta,w,res,tmp,tp,tol
    dimension ab(i2,j2),al(i2,j2),ac(i2,j2),ar(i2,j2),at(i2,0:j2), &
    b(i2,j2),x(i0,j0),r(i0,j0),rh(i0,j0),p(i0,j0),v(i0,j0),  &
    s(i0,j0),t(i0,j0),ph(i0,j0),sh(i0,j0), &
    cb(i2,j2),cl(i2,j2),cc(i2,j2),cr(i2,j2),ct(i2,j2)

    iter=0
    10   rho=1.d0
    alpha=1.d0
    w=1.d0
    call p_sqprod(ab,al,ac,ar,at,x,r)

    do 50 j=2,j1
    do 50 i=2,i1
    50   r(i,j)=b(i-1,j-1)-r(i,j)

    call dcopy(i0j0,r,1,rh,1)
    call p_inprod(r,r,tp)

    res=sqrt(tp)
    if (res .lt. 1e-7) then
	call mpi_sendrecv(x(2,1),1,m_v8lon,m_w,1,x(i0,1),1,m_v8lon,m_e,1,m_cart,istat,ierr)
	call mpi_sendrecv(x(i1,1),1,m_v8lon,m_e,1,x(1,1),1,m_v8lon,m_w,1,m_cart,istat,ierr)
	call mpi_sendrecv(x(1,2),1,m_v8lat,m_s,1,x(1,j0),1,m_v8lat,m_n,1,m_cart,istat,ierr)
	call mpi_sendrecv(x(1,j1),1,m_v8lat,m_n,1,x(1,1),1,m_v8lat,m_s,1,m_cart,istat,ierr)
	return
    endif

    100  iter=iter+1
    call p_inprod(r,rh,rhn)
    beta=(rhn/rho)*(alpha/w)

    ! print*,iter,'beta=',beta

    call daxpy(i0j0,-1.d0*w,v,1,p,1)
    call dscal(i0j0,beta,p,1)
    call daxpy(i0j0,1.d0,r,1,p,1)

    ! do 200 j=2,j1
    ! do 200 i=2,i1
    ! 200 ph(i,j)=p(i,j)

    ! call p_sqelim(cb,cl,cc,cr,ct,p,ph)
    ! call p_lusqelim(cb,cl,cc,cr,ct,p,ph)
    call p_sipsqelim(cb,cl,cc,cr,ct,p,ph)
    call p_sqprod(ab,al,ac,ar,at,ph,v)
    call p_inprod(rh,v,tmp)
    alpha=rhn/tmp

    ! print*,iter,'alpha=', alpha

    call dcopy(i0j0,r,1,s,1)
    call daxpy(i0j0,-1.d0*alpha,v,1,s,1)
    ! call dcopy(i0j0,s,1,sh,1)

    ! do 300 j=2,j1
    ! do 300 i=2,i1
    ! 300 sh(i,j)=s(i,j)

    call p_sipsqelim(cb,cl,cc,cr,ct,s,sh)
    ! call p_lusqelim(cb,cl,cc,cr,ct,s,sh)
    ! call p_sqelim(cb,cl,cc,cr,ct,s,sh)
    call p_sqprod(ab,al,ac,ar,at,sh,t)
    call p_inprod(t,t,tmp)
    call p_inprod(t,s,w)
    w=w/tmp

    ! print*,iter,'wn=',w

    tmp=0.d0

    call daxpy(i0j0,alpha,ph,1,x,1)
    call daxpy(i0j0,w,sh,1,x,1)
    call dcopy(i0j0,s,1,r,1)
    call daxpy(i0j0,-1.d0*w,t,1,r,1)

    call p_inprod(r,r,tp)

    rho=rhn
    ! if (myid .eq. 0) print*,iter,sqrt(tp)
    ! if (sqrt(tp)/res .gt. 1e-3 .and. mod(iter,100) .ne. 0) goto 100
    ! if (myid .eq. 0) print*, iter,sqrt(tp)
    !if (sqrt(tp) .gt. 1e-7 .and. iter .lt. 10000) goto 10
    if (sqrt(tp).gt.1.e-9.or.sqrt(tp)/res .gt. 1e-3.and. mod(iter,1000) .ne. 0) goto 100
    call mpi_barrier(m_cart,ierr)
    call mpi_sendrecv(x(2,1),1,m_v8lon,m_w,1,x(i0,1),1,m_v8lon,m_e,1,m_cart,istat,ierr)
    call mpi_sendrecv(x(i1,1),1,m_v8lon,m_e,1,x(1,1),1,m_v8lon,m_w,1,m_cart,istat,ierr)
    call mpi_sendrecv(x(1,2),1,m_v8lat,m_s,1,x(1,j0),1,m_v8lat,m_n,1,m_cart,istat,ierr)
    call mpi_sendrecv(x(1,j1),1,m_v8lat,m_n,1,x(1,1),1,m_v8lat,m_s,1,m_cart,istat,ierr)

    continue
end subroutine


! ----------------------------------------------------------------------
subroutine p_sqprod(ab,al,ac,ar,at,x,b)
    ! real*8 ab,al,ac,ar,at,x,b
    real*8 x,b
    dimension ab(i2,j2),al(i2,j2),ac(i2,j2),ar(i2,j2),at(i2,0:j2),x(i0,j0),b(i0,j0)

    call mpi_barrier(m_cart,ierr)
    call mpi_sendrecv(x(2,1),1,m_v8lon,m_w,1,x(i0,1),1,m_v8lon,m_e,1,m_cart,istat,ierr)
    call mpi_sendrecv(x(i1,1),1,m_v8lon,m_e,1,x(1,1),1,m_v8lon,m_w,1,m_cart,istat,ierr)
    call mpi_sendrecv(x(1,2),1,m_v8lat,m_s,1,x(1,j0),1,m_v8lat,m_n,1,m_cart,istat,ierr)
    call mpi_sendrecv(x(1,j1),1,m_v8lat,m_n,1,x(1,1),1,m_v8lat,m_s,1,m_cart,istat,ierr)

    do 100 j=2,j1
    do 100 i=2,i1
    it=i-1
    jt=j-1
    100  b(i,j)=ab(it,jt)*x(i,jt)+al(it,jt)*x(it,j)+ac(it,jt)*x(i,j)+ar(it,jt)*x(i+1,j)+at(it,jt)*x(i,j+1)
end subroutine


! ----------------------------------------------------------------------
subroutine p_inprod(a1,a2,pd)
    real*8 a1,a2,tmp,pd
    dimension a1(i0,j0),a2(i0,j0)

    tmp=0.d0
    do 100 j=2,j1
    do 100 i=2,i1
    100  tmp=tmp+a1(i,j)*a2(i,j)
    call mpi_barrier(m_cart,ierr)
    call mpi_allreduce(tmp,pd,1,mpi_real8,mpi_sum,m_cart,ierr)

    return
end subroutine


! --------------------------
subroutine p_sqelim(cb,cl,cc,cr,ct,b,bh)
    parameter(i3=i2-1,j3=j2-1,i0j0=i0*j0)
    ! real*8 cb,cl,cc,cr,ct,
    real*8 b,bh
    dimension cb(i2,j2),cl(i2,j2),cc(i2,j2),cr(i2,j2),ct(i2,j2),b(i0,j0),bh(i0,j0)

    call dcopy(i0j0,b,1,bh,1)
    do 100 j=2,j2
    jt=j-1
    do 150 i=2,i2
    it=i-1
    bh(i,j)=bh(i,j)*cc(it,jt)
    bh(i+1,j)=bh(i+1,j)-cr(it,jt)*bh(i,j)
    150  bh(i,j+1)=bh(i,j+1)-ct(it,jt)*bh(i,j)
    bh(i1,j)=bh(i1,j)*cc(i2,jt)
    100  bh(i1,j+1)=bh(i1,j+1)-ct(i2,j)*bh(i1,j)

    do 200 i=2,i2
    it=i-1
    bh(i,j1)=bh(i,j1)*cc(it,j2)
    200  bh(i+1,j1)=bh(i+1,j1)-cr(it,j2)*bh(i,j1)
    bh(i1,j1)=bh(i1,j1)*cc(i2,j2)

    do 300 j=j1,3,-1
    jt=j-1
    do 350 i=i1,3,-1
    it=i-1
    bh(i,j)=bh(i,j)*cc(it,jt)
    bh(i-1,j)=bh(i-1,j)-cr(it,jt)*bh(i,j)
    350  bh(i,j-1)=bh(i,j-1)-ct(it,jt)*bh(i,j)
    bh(2,j)=bh(2,j)*cc(1,jt)
    300  bh(2,j-1)=bh(2,j-1)-ct(1,jt)*bh(2,j)

    do 400 i=i1,3,-1
    it=i-1
    bh(i,2)=bh(i,2)*cc(it,1)
    400  bh(i-1,2)=bh(i-1,2)-cr(it,1)*bh(i,2)
    bh(2,2)=bh(2,2)*cc(1,1)
end subroutine


! ---------------------------------------------------------------------
subroutine p_lusqelim(cb,cl,cc,cr,ct,b,bh)
    parameter(i3=i2-1,j3=j2-1,i0j0=i0*j0)
    ! real*8 cb,cl,cc,cr,ct,b,bh
    real*8 b,bh
    dimension cb(i2,j2),cl(i2,j2),cc(i2,j2),cr(i2,j2),ct(i2,j2),b(i0,j0),bh(i0,j0)

    call dcopy(i0j0,b,1,bh,1)
    do 100 j=2,j2
    jt=j-1
    do 150 i=2,i2
    it=i-1
    bh(i+1,j)=bh(i+1,j)-cl(it,jt)*bh(i,j)
    150  bh(i,j+1)=bh(i,j+1)-cb(it,jt)*bh(i,j)
    100  bh(i1,j+1)=bh(i1,j+1)-cb(i2,j)*bh(i1,j)

    do 200 i=2,i2
    it=i-1
    200  bh(i+1,j1)=bh(i+1,j1)-cl(it,j2)*bh(i,j1)
    bh(i1,j1)=bh(i1,j1)

    do 300 j=j1,3,-1
    jt=j-1
    do 350 i=i1,3,-1
    it=i-1
    bh(i,j)=bh(i,j)*cc(it,jt)
    bh(i-1,j)=bh(i-1,j)-cr(it,jt)*bh(i,j)
    350  bh(i,j-1)=bh(i,j-1)-ct(it,jt)*bh(i,j)
    bh(2,j)=bh(2,j)*cc(1,jt)
    300  bh(2,j-1)=bh(2,j-1)-ct(1,jt)*bh(2,j)

    do 400 i=i1,3,-1
    it=i-1
    bh(i,2)=bh(i,2)*cc(it,1)
    400  bh(i-1,2)=bh(i-1,2)-cr(it,1)*bh(i,2)
    bh(2,2)=bh(2,2)*cc(1,1)
end subroutine

! ---------------------------------------------------------------------
subroutine p_sipsqelim(cb,cl,cc,cr,ct,b,bh)
    parameter(i3=i2-1,j3=j2-1,i0j0=i0*j0)
    ! real*8 cb,cl,cc,cr,ct,b,bh
    real*8 b,bh
    dimension cb(i2,j2),cl(i2,j2),cc(i2,j2),cr(i2,j2),ct(i2,j2),b(i0,j0),bh(i0,j0)

    call dcopy(i0j0,b,1,bh,1)
    do 100 j=2,j2
    jt=j-1
    do 150 i=2,i2
    it=i-1
    bh(i,j)=bh(i,j)*cc(it,jt)
    bh(i+1,j)=bh(i+1,j)-cl(it,jt)*bh(i,j)
    150  bh(i,j+1)=bh(i,j+1)-cb(it,jt)*bh(i,j)
    bh(i1,j)=bh(i1,j)*cc(i2,jt)
    100  bh(i1,j+1)=bh(i1,j+1)-cb(i2,jt)*bh(i1,j)

    do 200 i=2,i2
    it=i-1
    bh(i,j1)=bh(i,j1)*cc(it,j2)
    200  bh(i+1,j1)=bh(i+1,j1)-cl(it,j2)*bh(i,j1)
    bh(i1,j1)=bh(i1,j1)*cc(i2,j2)

    do 300 j=j1,3,-1
    jt=j-1
    do 350 i=i1,3,-1
    it=i-1
    bh(i-1,j)=bh(i-1,j)-cr(it,jt)*bh(i,j)
    350  bh(i,j-1)=bh(i,j-1)-ct(it,jt)*bh(i,j)
    300  bh(2,j-1)=bh(2,j-1)-ct(1,jt)*bh(2,j)

    do 400 i=i1,3,-1
    400  bh(i-1,2)=bh(i-1,2)-cr(i-1,1)*bh(i,2)
end subroutine

! ----------------------------------------------------------------------
subroutine rep2(ax,ay,bb,cx,cy,rinv,rinv1,dum0,dum1,dum2,dumg,f,x,ie)
    ! ----------------------------------------------------------------------
    !ibm  real*8 rinv,rinv1,dum0,dum1,dum2,x,h,row
    parameter(nbg0=nb0*ngy0,nbg1=nbg0-1,ibir=i2t/ngy0)
    real*8 x,rinv,rinv1,dum1,dum2,dum0,dumg,dd
    dimension ax(i2,j2),ay(i2,j2),bb(i2,j2),cx(i2,j2),cy(i2,0:j2),f(i2,j2),rinv(ibir,i0,nbg0),rinv1(ibir,i0,nbg1),ie(nb0),dum0(i0,nb0),dum1(ibir),dum2(i0),x(i0,j0),dumg(i2t)

    character grd*7
    common/grd/grd


    do 160 ng=0,ngy0-1
    jsp=1
    do 150 nb=1,nb0
    nbg=ng*nb0+nb
    if (mylat .eq. ng) then
	jfp=ie(nb)-2
	do 106 j=jsp,jfp
	do 105 i=1,i2
	105  x(i+1,j+2)=(f(i,j)-ax(i,j)*x(i,j+1)-ay(i,j)*x(i+1,j)-bb(i,j)*x(i+1,j+1)-cx(i,j)*x(i+2,j+1))/cy(i,j)
	call mpi_sendrecv(x(2,j+2),1,mpi_real8,m_w,1,x(i0,j+2),1,mpi_real8,m_e,1,m_cart,istat,ierr)
	call mpi_sendrecv(x(i1,j+2),1,mpi_real8,m_e,1,x(1,j+2),1,mpi_real8,m_w,1,m_cart,istat,ierr)
	106  continue
	if (nbg.eq.nbg0) go to 160
	j=ie(nb)-1
	do 115 i=1,i2
	115  dum2(i)=f(i,j)-ax(i,j)*x(i,j+1)-ay(i,j)*x(i+1,j)-bb(i,j)*x(i+1,j+1)-cx(i,j)*x(i+2,j+1)-cy(i,j)*x(i+1,j+2)
	j=ie(nb)
	call mpi_allgather(dum2,i2,mpi_real8,dumg,i2,mpi_real8,m_comm_lon,ierr)
	do 116 n=1,i0
	dum2(n)=x(n,j)
	116  dum0(n,nb)=x(n,j)
	dd=1.d0
    else
	if (nbg .eq. nbg0) go to 160
	dd=0.d0
    endif
    call mpi_scatter(dumg,ibir,mpi_real8,dum1,ibir,mpi_real8,ng,m_comm_lat,ierr)
    call dgemv('t',ibir,i0,-1.d0,rinv1(1,1,nbg),ibir,dum1,1,dd,dum2,1)
    if (nb .eq. nb0) go to 150
    call mpi_reduce(dum2,x(1,ie(nb)),i0,mpi_real8,mpi_sum,ng,m_comm_lat,ierr)
    jsp=ie(nb)

    150  continue
    call mpi_reduce(dum2,x(1,1),i0,mpi_real8,mpi_sum,ng+1,m_comm_lat,ierr)
    160  continue

    !  wenien
    !       open(160,file='opt/rep160'//grd,form='unformatted')
    !       write(160) x,f
    !       close(160)


    do 300 ng=ngy0-1,0,-1
    do 250 nbs=1,nb0
    nb=nb0-nbs+1
    nbg=nb0*ng+nb

    jsp=1
    if (nb.ne.1) jsp=ie(nb-1)
    if (mylat .eq. ng) then
	if (nbg.eq.nbg0) go to 201
	j=ie(nb)
	do 200 n=1,i0
	200  x(n,j)=dum0(n,nb)
	201  j=ie(nb)-1
	do 210 i=1,i2
	210  dum2(i)=f(i,j)-ax(i,j)*x(i,j+1)-ay(i,j)*x(i+1,j)-bb(i,j)*x(i+1,j+1)-cx(i,j)*x(i+2,j+1)-cy(i,j)*x(i+1,j+2)
	call mpi_allgather(dum2,i2,mpi_real8,dumg,i2,mpi_real8,m_comm_lon,ierr)
    endif

    call mpi_scatter(dumg,ibir,mpi_real8,dum1,ibir,mpi_real8,ng,m_comm_lat,ierr)
    call dgemv('t',ibir,i0,1.d0,rinv(1,1,nbg),ibir,dum1,1,0.d0,dum2,1)
    call mpi_reduce(dum2,dum0(1,nb),i0,mpi_real8,mpi_sum,ng,m_comm_lat,ierr)

    if (mylat .eq. ng) then
	do 220 n=1,i0
	220  x(n,jsp+1)=x(n,jsp+1)+dum0(n,nb)
    endif
    if (nbg.eq.1) go to 300
    if (mylat .eq. ng) then
	do 230 m=1,i2
	230  dum2(m)=dum0(m+1,nb)*cy(m,jsp-1)
	call mpi_allgather(dum2,i2,mpi_real8,dumg,i2,mpi_real8,m_comm_lon,ierr)
    endif
    call mpi_scatter(dumg,ibir,mpi_real8,dum1,ibir,mpi_real8,ng,m_comm_lat,ierr)
    call dgemv('t',ibir,i0,1.d0,rinv1(1,1,nbg-1),ibir,dum1,1,0.d0,dum2,1)
    call mpi_reduce(dum2,dum0(1,nb),i0,mpi_real8,mpi_sum,ng,m_comm_lat,ierr)
    if (mylat .eq. ng) then
	do 245 n=1,i0
	245  dum0(n,nb)=x(n,jsp)+dum0(n,nb)
    endif
    250  continue
    if (mylat .eq. ng) call mpi_send(x(1,2),i0,mpi_real8,m_s,1,m_cart,ierr)
    if (mylat .eq. ng-1) call mpi_recv(x(1,j0),i0,mpi_real8,m_n,1,m_cart,istat,ierr)
    300  continue
    if (mylat .eq. 0) then
	do 350 i=1,i0
	350  dum0(i,1)=0.d0
    endif

    jsp=1
    do 400 nb=1,nb0
    jfp=ie(nb)-2
    if (nb.eq.nb0) jfp=ie(nb)-2
    do 410 i=1,i0
    410  x(i,jsp)=dum0(i,nb)
    do 420 j=jsp,jfp
    do 430 i=1,i2
    430  x(i+1,j+2)=(f(i,j)-ax(i,j)*x(i,j+1)-ay(i,j)*x(i+1,j)-bb(i,j)*x(i+1,j+1)-cx(i,j)*x(i+2,j+1))/cy(i,j)
    call mpi_sendrecv(x(2,j+2),1,mpi_real8,m_w,1,x(i0,j+2),1,mpi_real8,m_e,1,m_cart,istat,ierr)
    call mpi_sendrecv(x(i1,j+2),1,mpi_real8,m_e,1,x(1,j+2),1,mpi_real8,m_w,1,m_cart,istat,ierr)
    420  continue
    400  jsp=ie(nb)
end subroutine


! ----------------------------------------------------------------------
subroutine rep(ax,ay,bb,cx,cy,rinv,rinv1,dum0,dum1,dum2,dumg,f,x,ie)
    parameter(nbg0=nb0*ngy0,nbg1=nbg0-1,ig0=i2*ngx0+2,ig2=ig0-2,ib=ig2/ngy0)
    real*8 x,rinv,rinv1,dum1,dum2,dum0,dumg,dd
    ! real*8 ax,ay,bb,cx,cy,f
    dimension ax(i2,j2),ay(i2,j2),bb(i2,j2),cx(i2,j2),cy(i2,0:j2),f(i2,j2),rinv(ib,i0,nbg0),rinv1(ib,i0,nbg1),ie(nb0),dum0(i0,nb0),dum1(ib),dum2(i0),x(i0,j0),dumg(i2t)

    real*8 timer
    common/timer/timer(10)

    timer(7)=mpi_wtime()

    do 160 ng=0,ngy0-1
    jsp=1
    do 150 nb=1,nb0
    nbg=ng*nb0+nb
    if (mylat .eq. ng) then
	jfp=ie(nb)-2
	do 106 j=jsp,jfp
	do 105 i=1,i2
	105  x(i+1,j+2)=(f(i,j)-ax(i,j)*x(i,j+1)-ay(i,j)*x(i+1,j)-bb(i,j)*x(i+1,j+1)-cx(i,j)*x(i+2,j+1))/cy(i,j)
	timer(8)=mpi_wtime()
	call mpi_sendrecv(x(2,j+2),1,mpi_real8,m_w,1,x(i0,j+2),1,mpi_real8,m_e,1,m_cart,istat,ierr)
	call mpi_sendrecv(x(i1,j+2),1,mpi_real8,m_e,1,x(1,j+2),1,mpi_real8,m_w,1,m_cart,istat,ierr)

	106  timer(5)=timer(5)+mpi_wtime()-timer(8)
	if (nbg.eq.nbg0) go to 160
	j=ie(nb)-1
	do 115 i=1,i2
	115  dum2(i)=f(i,j)-ax(i,j)*x(i,j+1)-ay(i,j)*x(i+1,j)-bb(i,j)*x(i+1,j+1)-cx(i,j)*x(i+2,j+1)-cy(i,j)*x(i+1,j+2)
	j=ie(nb)
	timer(8)=mpi_wtime()
	call mpi_allgather(dum2,i2,mpi_real8,dumg,i2,mpi_real8,m_comm_lon,ierr)
	timer(5)=timer(5)+mpi_wtime()-timer(8)
	do 116 n=1,i0
	dum2(n)=x(n,j)
	116  dum0(n,nb)=x(n,j)
	dd=1.d0
    else
	if (nbg .eq. nbg0) go to 160
	dd=0.d0
    endif
    timer(8)=mpi_wtime()
    call mpi_scatter(dumg,ib,mpi_real8,dum1,ib,mpi_real8,ng,m_comm_lat,ierr)
    timer(5)=timer(5)+mpi_wtime()-timer(8)
    call dgemv('t',ib,i0,-1.d0,rinv1(1,1,nbg),ib,dum1,1,dd,dum2,1)
    if (nb .eq. nb0) go to 150
    timer(8)=mpi_wtime()
    call mpi_reduce(dum2,x(1,ie(nb)),i0,mpi_real8,mpi_sum,ng,m_comm_lat,ierr)
    timer(5)=timer(5)+mpi_wtime()-timer(8)
    jsp=ie(nb)

    150  continue
    timer(8)=mpi_wtime()
    call mpi_reduce(dum2,x(1,1),i0,mpi_real8,mpi_sum,ng+1,m_comm_lat,ierr)
    timer(5)=timer(5)+mpi_wtime()-timer(8)
    160  continue

    do 300 ng=ngy0-1,0,-1
    do 250 nbs=1,nb0
    nb=nb0-nbs+1
    nbg=nb0*ng+nb

    jsp=1
    if (nb.ne.1) jsp=ie(nb-1)
    if (mylat .eq. ng) then
	if (nbg.eq.nbg0) go to 201
	j=ie(nb)
	do 200 n=1,i0
	200  x(n,j)=dum0(n,nb)
	201  j=ie(nb)-1
	do 210 i=1,i2
	210  dum2(i)=f(i,j)-ax(i,j)*x(i,j+1)-ay(i,j)*x(i+1,j)-bb(i,j)*x(i+1,j+1)-cx(i,j)*x(i+2,j+1)-cy(i,j)*x(i+1,j+2)
	timer(8)=mpi_wtime()
	call mpi_allgather(dum2,i2,mpi_real8,dumg,i2,mpi_real8,m_comm_lon,ierr)
	timer(5)=timer(5)+mpi_wtime()-timer(8)
    endif

    timer(8)=mpi_wtime()
    call mpi_scatter(dumg,ib,mpi_real8,dum1,ib,mpi_real8,ng,m_comm_lat,ierr)
    timer(5)=timer(5)+mpi_wtime()-timer(8)
    call dgemv('t',ib,i0,1.d0,rinv(1,1,nbg),ib,dum1,1,0.d0,dum2,1)
    timer(8)=mpi_wtime()
    call mpi_reduce(dum2,dum0(1,nb),i0,mpi_real8,mpi_sum,ng,m_comm_lat,ierr)
    timer(5)=timer(5)+mpi_wtime()-timer(8)

    if (mylat .eq. ng) then
	do 220 n=1,i0
	220  x(n,jsp+1)=x(n,jsp+1)+dum0(n,nb)
    endif
    if (nbg.eq.1) go to 300
    if (mylat .eq. ng) then
	do 230 m=1,i2
	230  dum2(m)=dum0(m+1,nb)*cy(m,jsp-1)
	timer(8)=mpi_wtime()
	call mpi_allgather(dum2,i2,mpi_real8,dumg,i2,mpi_real8,m_comm_lon,ierr)
	timer(5)=timer(5)+mpi_wtime()-timer(8)
    endif
    timer(8)=mpi_wtime()
    call mpi_scatter(dumg,ib,mpi_real8,dum1,ib,mpi_real8,ng,m_comm_lat,ierr)
    timer(5)=timer(5)+mpi_wtime()-timer(8)
    call dgemv('t',ib,i0,1.d0,rinv1(1,1,nbg-1),ib,dum1,1,0.d0,dum2,1)
    timer(8)=mpi_wtime()
    call mpi_reduce(dum2,dum0(1,nb),i0,mpi_real8,mpi_sum,ng,m_comm_lat,ierr)
    timer(5)=timer(5)+mpi_wtime()-timer(8)
    if (mylat .eq. ng) then
	do 245 n=1,i0
	245  dum0(n,nb)=x(n,jsp)+dum0(n,nb)
    endif
    250  continue
    if (mylat .eq. ng) call mpi_send(x(1,2),i0,mpi_real8,m_s,1,m_cart,ierr)
    if (mylat .eq. ng-1) call mpi_recv(x(1,j0),i0,mpi_real8,m_n,1,m_cart,istat,ierr)
    300  continue
    if (mylat .eq. 0) then
	do 350 i=1,i0
	350  dum0(i,1)=0.d0
    endif

    jsp=1
    do 400 nb=1,nb0
    jfp=ie(nb)-2
    if (nb.eq.nb0) jfp=ie(nb)-2
    do 410 i=1,i0
    410  x(i,jsp)=dum0(i,nb)
    do 420 j=jsp,jfp
    do 430 i=1,i2
    430  x(i+1,j+2)=(f(i,j)-ax(i,j)*x(i,j+1)-ay(i,j)*x(i+1,j)-bb(i,j)*x(i+1,j+1)-cx(i,j)*x(i+2,j+1))/cy(i,j)
    timer(8)=mpi_wtime()
    call mpi_sendrecv(x(2,j+2),1,mpi_real8,m_w,1,x(i0,j+2),1,mpi_real8,m_e,1,m_cart,istat,ierr)
    call mpi_sendrecv(x(i1,j+2),1,mpi_real8,m_e,1,x(1,j+2),1,mpi_real8,m_w,1,m_cart,istat,ierr)
    420  timer(5)=timer(5)+mpi_wtime()-timer(8)
    400  jsp=ie(nb)
end subroutine


end module