1   !C
  2   !C***
  3   !C*** hecmw_matvec_33
  4   !C***
  5   !C
  6   subroutine hecmw_matvec_33 (hecMESH, hecMAT, X, Y, COMMtime)
  7     use hecmw_util
  8     implicit none
  9     type (hecmwST_local_mesh), intent(in) :: hecMESH
 10     type (hecmwST_matrix), intent(in), target :: hecMAT
 11     real(kind=kreal), intent(in) :: X(:)
 12     real(kind=kreal), intent(out) :: Y(:)
 13     real(kind=kreal), intent(inout), optional :: COMMtime
 14 
 15     real(kind=kreal) :: Tcomm
 16     real(kind=kreal), allocatable :: WK(:)
 17 
 18     Tcomm = 0.d0
 19 
 20     if (mpcmatvec_flg) then
 21       allocate(WK(hecMAT%NP * hecMAT%NDOF))
 22       call hecmw_TtmatTvec_33(hecMESH, hecMAT, X, Y, WK, Tcomm)
 23       deallocate(WK)
 24     else
 25       call hecmw_matvec_33_inner(hecMESH, hecMAT, X, Y, Tcomm)
 26     endif
 27 
 28     if (present(COMMtime)) COMMtime = COMMtime + Tcomm
 29   end subroutine hecmw_matvec_33
 30 
 31   !C
 32   !C***
 33   !C*** hecmw_matvec_33_inner ( private subroutine )
 34   !C***
 35   !C
 36   subroutine hecmw_matvec_33_inner (hecMESH, hecMAT, X, Y, COMMtime)
 37     use hecmw_util
 38     use m_hecmw_comm_f
 39     use hecmw_matrix_contact
 40     use hecmw_matrix_misc
 41     use hecmw_jad_type
 42     use hecmw_tuning_fx
 43     !$ use omp_lib
 44 
 45     implicit none
 46     type (hecmwST_local_mesh), intent(in) :: hecMESH
 47     type (hecmwST_matrix), intent(in), target :: hecMAT
 48     real(kind=kreal), intent(in) :: X(:)
 49     real(kind=kreal), intent(out) :: Y(:)
 50     real(kind=kreal), intent(inout), optional :: COMMtime
 51 
 52     real(kind=kreal) :: START_TIME, END_TIME, Tcomm
 53     integer(kind=kint) :: i, j, jS, jE, in
 54     real(kind=kreal) :: YV1, YV2, YV3, X1, X2, X3
 55 
 56     integer(kind=kint) :: N, NP
 57     integer(kind=kint), pointer :: indexL(:), itemL(:), indexU(:), itemU(:)
 58     real(kind=kreal), pointer :: AL(:), AU(:), D(:)
 59 
 60     ! for communication hiding
 61     integer(kind=kint) :: ireq
 62 
 63     ! added for turning >>>
 64     integer, parameter :: numOfBlockPerThread = 100
 65     logical, save :: isFirst = .true.
 66     integer, save :: numOfThread = 1
 67     integer, save, allocatable :: startPos(:), endPos(:)
 68     integer(kind=kint), save :: sectorCacheSize0, sectorCacheSize1
 69     integer(kind=kint) :: threadNum, blockNum, numOfBlock
 70     integer(kind=kint) :: numOfElement, elementCount, blockIndex
 71     real(kind=kreal) :: numOfElementPerBlock
 72     ! <<< added for turning
 73 
 74     IF (hecmw_JAD_IS_INITIALIZED().ne.0) THEN
 75       Tcomm = 0.d0
 76       START_TIME = hecmw_Wtime()
 77       call hecmw_JAD_MATVEC(hecMESH, hecMAT, X, Y, Tcomm)
 78       END_TIME = hecmw_Wtime()
 79       time_Ax = time_Ax + END_TIME - START_TIME - Tcomm
 80       if (present(COMMtime)) COMMtime = COMMtime + Tcomm
 81     ELSE
 82 
 83       N = hecMAT%N
 84       NP = hecMAT%NP
 85       indexL => hecMAT%indexL
 86       indexU => hecMAT%indexU
 87       itemL => hecMAT%itemL
 88       itemU => hecMAT%itemU
 89       AL => hecMAT%AL
 90       AU => hecMAT%AU
 91       D => hecMAT%D
 92 
 93       ! added for turning >>>
 94       if (.not. isFirst) then
 95         numOfBlock = numOfThread * numOfBlockPerThread
 96         if (endPos(numOfBlock-1) .ne. N-1) then
 97           deallocate(startPos, endPos)
 98           isFirst = .true.
 99         endif
100       endif
101       if (isFirst) then
102         !$ numOfThread = omp_get_max_threads()
103         numOfBlock = numOfThread * numOfBlockPerThread
104         allocate (startPos(0 : numOfBlock - 1), endPos(0 : numOfBlock - 1))
105         numOfElement = N + indexL(N) + indexU(N)
106         numOfElementPerBlock = dble(numOfElement) / numOfBlock
107         blockNum = 0
108         elementCount = 0
109         startPos(blockNum) = 1
110         do i= 1, N
111           elementCount = elementCount + 1
112           elementCount = elementCount + (indexL(i) - indexL(i-1))
113           elementCount = elementCount + (indexU(i) - indexU(i-1))
114           if (elementCount > (blockNum + 1) * numOfElementPerBlock) then
115             endPos(blockNum) = i
116             ! write(9000+hecMESH%my_rank,*) mod(blockNum, numOfThread), &
117             !      startPos(blockNum), endPos(blockNum)
118             blockNum = blockNum + 1
119             startPos(blockNum) = i + 1
120             if (blockNum == (numOfBlock - 1)) exit
121           endif
122         enddo
123         endPos(blockNum) = N
124         ! write(9000+hecMESH%my_rank,*) mod(blockNum, numOfThread), &
125         !      startPos(blockNum), endPos(blockNum)
126         ! for irregular data
127         do i= blockNum+1, numOfBlock-1
128           startPos(i) = N
129           endPos(i) = N-1
130           ! write(9000+hecMESH%my_rank,*) mod(i, numOfThread), &
131           !      startPos(i), endPos(i)
132         end do
133 
134         call hecmw_tuning_fx_calc_sector_cache(NP, 3, &
135              sectorCacheSize0, sectorCacheSize1)
136 
137         isFirst = .false.
138       endif
139       ! <<< added for turning
140 
141       START_TIME= HECMW_WTIME()
142       ! if (async_matvec_flg) then
143       !   call hecmw_update_3_R_async (hecMESH, X, NP, ireq)
144       ! else
145       call hecmw_update_3_R (hecMESH, X, NP)
146       ! endif
147       END_TIME= HECMW_WTIME()
148       if (present(COMMtime)) COMMtime = COMMtime + END_TIME - START_TIME
149 
150       START_TIME = hecmw_Wtime()
151 
152       !call fapp_start("loopInMatvec33", 1, 0)
153       !call start_collection("loopInMatvec33")
154 
155 !OCL CACHE_SECTOR_SIZE(sectorCacheSize0,sectorCacheSize1)
156 !OCL CACHE_SUBSECTOR_ASSIGN(X)
157 
158 !$OMP PARALLEL DEFAULT(NONE) &
159 !$OMP&PRIVATE(i,X1,X2,X3,YV1,YV2,YV3,jS,jE,j,in,threadNum,blockNum,blockIndex) &
160 !$OMP&SHARED(D,AL,AU,indexL,itemL,indexU,itemU,X,Y,startPos,endPos,numOfThread,N,async_matvec_flg)
161       threadNum = 0
162       !$ threadNum = omp_get_thread_num()
163       do blockNum = 0 , numOfBlockPerThread - 1
164         blockIndex = blockNum * numOfThread  + threadNum
165         do i = startPos(blockIndex), endPos(blockIndex)
166           X1= X(3*i-2)
167           X2= X(3*i-1)
168           X3= X(3*i  )
169           YV1= D(9*i-8)*X1 + D(9*i-7)*X2 + D(9*i-6)*X3
170           YV2= D(9*i-5)*X1 + D(9*i-4)*X2 + D(9*i-3)*X3
171           YV3= D(9*i-2)*X1 + D(9*i-1)*X2 + D(9*i  )*X3
172 
173           jS= indexL(i-1) + 1
174           jE= indexL(i  )
175           do j= jS, jE
176             in  = itemL(j)
177             X1= X(3*in-2)
178             X2= X(3*in-1)
179             X3= X(3*in  )
180             YV1= YV1 + AL(9*j-8)*X1 + AL(9*j-7)*X2 + AL(9*j-6)*X3
181             YV2= YV2 + AL(9*j-5)*X1 + AL(9*j-4)*X2 + AL(9*j-3)*X3
182             YV3= YV3 + AL(9*j-2)*X1 + AL(9*j-1)*X2 + AL(9*j  )*X3
183           enddo
184           jS= indexU(i-1) + 1
185           jE= indexU(i  )
186           do j= jS, jE
187             in  = itemU(j)
188             ! if (async_matvec_flg .and. in > N) cycle
189             X1= X(3*in-2)
190             X2= X(3*in-1)
191             X3= X(3*in  )
192             YV1= YV1 + AU(9*j-8)*X1 + AU(9*j-7)*X2 + AU(9*j-6)*X3
193             YV2= YV2 + AU(9*j-5)*X1 + AU(9*j-4)*X2 + AU(9*j-3)*X3
194             YV3= YV3 + AU(9*j-2)*X1 + AU(9*j-1)*X2 + AU(9*j  )*X3
195           enddo
196           Y(3*i-2)= YV1
197           Y(3*i-1)= YV2
198           Y(3*i  )= YV3
199         enddo
200       enddo
201 !$OMP END PARALLEL
202 
203 !OCL END_CACHE_SUBSECTOR
204 !OCL END_CACHE_SECTOR_SIZE
205 
206       !call stop_collection("loopInMatvec33")
207       !call fapp_stop("loopInMatvec33", 1, 0)
208 
209       END_TIME = hecmw_Wtime()
210       time_Ax = time_Ax + END_TIME - START_TIME
211 
212       ! if (async_matvec_flg) then
213       !   START_TIME= HECMW_WTIME()
214       !   call hecmw_update_3_R_wait (hecMESH, ireq)
215       !   END_TIME= HECMW_WTIME()
216       !   if (present(COMMtime)) COMMtime = COMMtime + END_TIME - START_TIME
217 
218       !   START_TIME = hecmw_Wtime()
219 
220       !   do i = 1, N
221       !     jS= index_o(i-1) + 1
222       !     jE= index_o(i  )
223       !     if (jS > jE) cycle
224       !     YV1= 0.d0
225       !     YV2= 0.d0
226       !     YV3= 0.d0
227       !     do j=jS, jE
228       !       in = item_o(j)
229       !       X1= X(3*(N+in)-2)
230       !       X2= X(3*(N+in)-1)
231       !       X3= X(3*(N+in)  )
232       !       YV1= YV1 + A_o(9*j-8)*X1 + A_o(9*j-7)*X2 + A_o(9*j-6)*X3
233       !       YV2= YV2 + A_o(9*j-5)*X1 + A_o(9*j-4)*X2 + A_o(9*j-3)*X3
234       !       YV3= YV3 + A_o(9*j-2)*X1 + A_o(9*j-1)*X2 + A_o(9*j  )*X3
235       !     enddo
236       !     Y(3*i-2)= Y(3*i-2)+YV1
237       !     Y(3*i-1)= Y(3*i-1)+YV2
238       !     Y(3*i  )= Y(3*i  )+YV3
239       !   enddo
240 
241       !   END_TIME = hecmw_Wtime()
242       !   time_Ax = time_Ax + END_TIME - START_TIME
243       ! endif
244 
245     ENDIF
246 
247     if (hecMAT%cmat%n_val > 0) then
248       call hecmw_cmat_multvec_add( hecMAT%cmat, X, Y, NP * hecMAT%NDOF )
249     end if
250 
251   end subroutine hecmw_matvec_33_inner