# Time domain 15 degree (t',x',tau') migration of zero-offset section 
#
#Usage: <Headin T15mig n1=nt= d1=dt=  n2=nx=  d2=dx= vc=/vel=
#        trick=.14 taper=no stepdown=1 npad=1 >Headout
#%
#  the retarded coordinate transform:
#       t'      =       t + tau
#       x'      =       x
#       tau'    =       tau       ( tau = 2*z/v -- psudo-depth )
#
#n1=nt,n2=nx    trace length, number of traces
#d1=dt,d2=dx    sampling intervals
#vc=            constant velocity(v=vc); must specify either vc or vel
#vel=           headfile specifying velocity profile(s) of length nt
#                 when v is v(t) or v(t,x)
#trick=.14      1/6. trick in more accurate finite difference approximation 
#                 of second x derivative;  trick<0 when dipfilter needed
#taper=yes      taper side traces to reduce boundary reflections
#stepdown=1     increment of downward step
#  
# author:       Zhiming Li, 8/17/84
#
#       Modified:       
#               stew        --- transfered to convex and took out ap code 
#               li 12/6/85  --- transpose data and velocity array       
#               stew 4/18/86 -- vectorized rewrite based on SEP-38 article
#               stew 4/25/86 -- convert interval vels to tau step internally
#               stew 5/6/86  -- hand roll rhs and triply outer loops for speed
#               stew 5/26/86 -- somewhat faster recoding
#               marta 6/17/86---changed rite to npad+1 instead of npad
#               stew&marta 9/30/86---fixed alp(ntau,nx) alloc and declaration
#                                 ---changed min0(ittop,(it-1)/(stepdown-1))
#                                    to min0(ittop,(it-1)/(stepdown-1)-1)
#               marta 10/1/86--- fixed bug that left beta undefined
#               stew 7/27/89 --- For CM2 Fortran 8x
#               stew 8/6/89 --- Pull tridiagonal code inline so I can use
#                               CMF$LAYOUT directives.
#
# ----------
#  Keyword:  15-degree time-domain migration
# ----------
integer nt,nx,ntp1,stepdown,npad
integer nxpow2,nxlog2,ntaupow2
real eps,dt,dx,beta,vc
character *10 taper
from either: integer n1:nt, n2:nx, stepdown=1
from either: real d1:dt, d2:dx
from par: real beta=.14, vc=0.0, eps=.0001
from par: character taper='no'
from par: integer npad=1
        npad=1
        nx=nx+npad*2
        ntp1=nt+1
        ntau=(nt-1)/stepdown
        nxpow2=1
        nxlog2=0
        while(nxpow2< nx) {
           nxlog2=nxlog2+1
           nxpow2=nxpow2*2
        }
        ntaupow2=1
        while(ntaupow2<ntau) {
           ntaupow2=ntaupow2*2
        }

allocate:  real vfe(ntaupow2*nxpow2+nt-ntau)
allocate:  real pfe(ntp1,nxpow2)
allocate:  integer factor1(nxlog2+1),factor2(nxlog2+1)

subroutine parmig(eps,dt,dx,beta,nt,nx,vc,ntp1,taper,stepdown,ntau,npad,nxpow2,nxlog2,ntaupow2,pfe,vfe,factor1,factor2)
       integer nt,nx,ntp1,ix,it,itau,infd,outfd,velfd
       integer nxpow2,nxlog2,ntaupow2
       integer stepdown,ntau,ittop,min0,npad,i
       real eps
       real psec(ntp1,nxpow2)
%cmf$layout psec(:news,:news)
       real p1(ntaupow2,nxpow2),p2(ntaupow2,nxpow2)
%cmf$layout p1(1:news,8:news)
%cmf$layout p2(1:news,8:news)
       real p3(ntaupow2,nxpow2),p4(ntaupow2,nxpow2)
%cmf$layout p3(1:news,8:news)
%cmf$layout p4(1:news,8:news)
       real alp(ntaupow2,nxpow2),a(ntaupow2,nxpow2)
%cmf$layout alp(1:news,8:news)
%cmf$layout a(1:news,8:news)
       real b(ntaupow2,nxpow2),c(ntaupow2,nxpow2)
%cmf$layout b(1:news,8:news)
%cmf$layout c(1:news,8:news)
       real r1(ntaupow2,nxpow2),r2(ntaupow2,nxpow2)
%cmf$layout r1(1:news,8:news)
%cmf$layout r2(1:news,8:news)
       real efac(nxpow2)
%cmf$layout efac(:news)
       real sc,beta,dt,dx,vc,factor,exp,float
       integer factor1(*),factor2(*)
%cmf$layout factor1(:serial)
%cmf$layout factor2(:serial)
       real vfe(ntaupow2,nxpow2),pfe(ntp1,nxpow2)
%cmf$layout pfe(:serial,:serial)
%cmf$layout vfe(:serial,:serial)
       integer reed,rite,icnt,itemp,iprev,inow,auxin,input,output
       integer n1vel,n2vel,lseek
       integer endvec(2),jt,irem
       integer iadr1,iadr2,iadalp
       integer loopcnt
       integer lbound,p1addr,vp
       real tepsilon
       logical rpt
       character*(*) taper
%       include '/usr/include/cm/paris-configuration-fort.h'
#%       include '/usr/include/cm/CMF_defs.h'
#%       include 'T15mig.ifc'
%       interface
%       integer function cmf_get_field_id(a)
%       real a(128,256)
%cmf$layout a(1:news,8:news)
%       end interface
%       interface
%       integer function cmf_get_vp_set_id(a)
%       real a(128,256)
%cmf$layout a(1:news,8:news)
%       end interface
%       interface
%       subroutine trifact(a,b,c,ns,neq,f1,f2,l2,ep,lp)
%       real a(128,256),b(128,256),c(128,256)
%       integer ns,neq,l2,lp
%       real ep
%       integer f1(*),f2(*)
%cmf$layout a(:news,:news)
%cmf$layout b(:news,:news)
%cmf$layout c(:news,:news)
%cmf$layout f1(:serial)
%cmf$layout f2(:serial)
%       end interface
define inext iprev

       infd=input()
       outfd=output()
 
       print *,'reading input section'
# read input section
       do ix=1,npad {
          do it=1,nt {
             pfe(it,ix)=0.0
       }  }
       do ix=nx-npad+1,nx {
          do it=1,nt {
             pfe(it,ix)=0.0
       }  }
       do ix=1+npad,nx-npad {
           icnt=reed(infd,pfe(1,ix),nt*4)
           pfe(ntp1,ix)=0.0
       }
       endvec(1)=ntp1
       endvec(2)=nx
       psec=0.0
%      call CMF_to_cm(psec,pfe,endvec)
 
# taper sides to reduce boundary artifacts

       if(taper(:1) == 'y') {
            efac=0.0
            do ix=1+npad,nx/32 {
                factor=float(ix-1-nx/32)/float(nx/32)
                efac(ix)=factor
                efac(nx+1-ix)=factor
                }
            efac=exp(-efac**2)
            psec=psec*spread(efac,1,ntp1)
        }


# read migration velocities
       print *,'setting up velocity field'
       sc = - (1./32.)*(dt/dx)**2
       alp = 0.0
       if(vc == 0.0) {
           velfd = auxin('vel')
           n1vel = nt
           n2vel = nx
           icnt= auxpar('n1','d',n1vel,'vel')
           icnt= auxpar('n2','d',n2vel,'vel')
           if(n1vel < nt) call erexit('trace and velocity profile length mismatch')
           if(n2vel != 1 && n2vel < nx) call erexit('too few velocity profiles')
           do ix=npad,nx-npad {
               icnt=lseek(velfd,(ix-1)*n1vel*4,0) # goto beginning of vel. profile
               icnt=reed(velfd,vfe(1,ix),nt*4) # read first nt velocities
               if(icnt == 0) {  # assume v(tau) and propagate
                    do itau=1,ntau
                        vfe(itau,ix) = vfe(itau,ix-1)
               } else {         # process v(tau,x)
                   do it=1,nt
                       pfe(it,ix)=sc*pfe(it,ix)**2
                   if(stepdown > 1) call vpack(vfe(1,ix),nt,vfe(1,ix),ntau,stepdown)
               }
           }
           endvec(1)=ntau
           endvec(2)=nx
%          call CMF_to_cm(alp,vfe,endvec)
      } else {  # constant velocity
           sc = sc * stepdown * vc**2
           alp=sc
           }
#
# set up padding: replicate first and last traces,velocities for zero slope
#

       psec(:,1:npad)=spread(psec(:,npad+1),2,npad)
       psec(:,nx+1-npad:nx)=spread(psec(:,nx-npad),2,npad)
       alp(:,1:npad)=spread(alp(:,npad+1),2,npad)
       alp(:,nx+1-npad:nx)=spread(alp(:,nx-npad),2,npad)
#
# set up and prefactor tridiagonal systems
#
       print *,'factoring tridiagonal system'
       a=beta+alp
       a(:,nx+1:nxpow2)=0.0
       b=1.0-2.0*a
       c=a
#
       a(:,nx)=-1.0
       b(:,nx)=+1.0
#
       b(:,1)=+1.0
       c(:,1)=-1.0
#
%       vp=CMF_get_vp_set_id(a)
       call trifact(a,b,c,ntaupow2,nxpow2,factor1,factor2,nxlog2,eps,loopcnt)
#%       p1addr=CMF_get_field_id(p1)
#%       call cm_set_vp_set(vp)
#       rpt=.true.
#       loopcnt=1
#       lbound=1
#%       do while (rpt .and. (loopcnt.le.nxlog2))
#       p2=-1.0/b
#       p3=a
#       p4=c
#       p1=a*eoshift(p2,dim=2,shift=-lbound)
#       tepsilon=maxval(abs(p1))
#%       factor1(loopcnt)=cm_allocate_heap_field(64)
#%       factor2(loopcnt)=cm_add_offset_to_field_id(
#%     1         factor1(loopcnt),32)
#%       call cm_f_move_1L(factor1(loopcnt),p1addr,23,8)
#       b=b+p1*cshift(p4,dim=2,shift=-lbound)
#       a=p1*cshift(p3,dim=2,shift=-lbound)
#       p1=c*eoshift(p2,dim=2,shift=lbound)
#       tepsilon=max(tepsilon,maxval(abs(p1)))
#%       call cm_f_move_1L(factor2(loopcnt),p1addr,23,8)
#       b=b+p1*cshift(p3,dim=2,shift=lbound)
#       c=p1+cshift(p4,dim=2,shift=lbound)
#       lbound=lbound*2
#       if(tepsilon.le.eps) rpt=.false.
#       loopcnt=loopcnt+1
#%       end do
#        p1=1.0/b
#%       factor1(loopcnt)=cm_allocate_heap_field(32)
#%       call cm_f_move_1L(factor1(loopcnt),p1addr,23,8)
#        loopcnt=loopcnt-1
       print *,nxpow2,ntaupow2,nxlog2,loopcnt
%       call cm_reset_timer(0)
%       call cm_start_timer(0)
#
# simplify rhs tridiagonal coefficients
#
       alp=beta-alp
                   
# initialize parallel subdiagonals
       print *,'starting migration'
       p1=0
       p2=0
       p3=0
       p4=0
       inow=1
       iprev=2
%       iadr1=CMF_get_field_id(r1)
%       iadr2=CMF_get_field_id(r2)
%       iadalp=CMF_get_field_id(alp)
#
# now for the "parallel" migration
#
#      P1 P4
#      P2 P3
#
#      (1+(beta+alpha)T) (P4+P2) = (1+(beta-alpha)T) (P1+P3)
#
       do it=nt,1+stepdown,-1 {  # work from bottom up
#
           ittop = ntp1-it
           if(stepdown>1)
               ittop = min0(ittop,(it-1)/(stepdown-1)-1)
#
           p1=cshift(p3,1,-1)
           p1(1,:)=psec(it,:)
#          R1=P1+P3
##         r2=r1-CSHIFT(r1,2,1)
#%           call cm_f_news_sub_always_3_1L(iadr2,iadr1,iadr1,2,
#%     1          cm_upward,23,8)
##         r2=r2-CSHIFT(r2,2,-1)
#%           call cm_f_news_sub_always_2_1L(iadr2,iadr2,2,
#%     1          cm_downward,23,8)
##         r2=r1-alp*r2
#%           call cm_f_mult_subf_1L(iadr2,iadalp,iadr2,iadr1,23,8)
#          r2(:,1)=0.0
#          r2(:,nx)=0.0
#
#          call factapp(r2,ntaupow2,nxpow2,factor1,factor2,nxlog2,loopcnt)
#%          call cm_set_vp_set(vp)
#          lbound=1
#           do i=1,loopcnt {
#          p3=r2
#          r1=cshift(p3,dim=2,shift=-lbound)
#%          call cm_f_mult_add_1L(iadr2,factor1(i),iadr1,iadr2,23,8)
#           r1=cshift(p3,dim=2,shift=lbound)
#%          call cm_f_mult_add_1L(iadr2,factor2(i),iadr1,iadr2,23,8)
#          lbound=lbound*2
#          }
#%          call cm_f_multiply_2_1L(iadr2,factor1(loopcnt+1),23,8)
#          p4=r2-p2
           if(stepdown == 1) {
               if(it == 2) {
                   psec(it:nt,:)=p4(1:ntau,:)
                   }
           } else {
               itau=(it-2)/(stepdown-1)
               irem=it-2-itau*(stepdown-1)
               if(itau>0 && itau <= ntau) {
                    jt=it+itau-1
                    psec(jt,:)=p4(itau,:)
               }
               if(irem.eq.0) {
                    itau=itau-1
                    if(itau>0 && itau <= ntau) {
                        jt=it+itau-1
                        psec(jt,:)=p4(itau,:)
                    }
               }
           }
           p3=p4
           p2=p1
#
           itemp=iprev
           iprev=inow
           inow=itemp
      }

# all done migrating; write output section

%      call cm_stop_timer(1)
      print *,'writing migrated section'
      endvec(1)=ntp1
      endvec(2)=nx
%      call CMF_from_cm(pfe,psec,endvec)
      do ix=npad+1,nx-npad {
           icnt=rite(outfd,pfe(1,ix),nt*4)
      }
#
      print *,'done'
      return
      end

subroutine vpack(vin,nt,vout,ntau,stepdown)
integer nt,ntau,stepdown
real vin(stepdown,ntau),vout(ntau)
integer itau,it

%C$DIR SCALAR
do it=2,stepdown {
%C$DIR NO_RECURRENCE
    do itau=1,ntau {
        vin(1,itau) = vin(1,itau)+vin(it,itau)
    }
}

do itau=1,ntau
    vout(itau) = vin(1,itau)
return
end
