1 !C
2 !C***
3 !C*** hecmw_matvec_33
4 !C***
5 !C
6 subroutine hecmw_matvec_33 (hecMESH, hecMAT, X, Y, COMMtime)
7 use hecmw_util
8 implicit none
9 type (hecmwST_local_mesh), intent(in) :: hecMESH
10 type (hecmwST_matrix), intent(in), target :: hecMAT
11 real(kind=kreal), intent(in) :: X(:)
12 real(kind=kreal), intent(out) :: Y(:)
13 real(kind=kreal), intent(inout), optional :: COMMtime
14
15 real(kind=kreal) :: Tcomm
16 real(kind=kreal), allocatable :: WK(:)
17
18 Tcomm = 0.d0
19
20 if (mpcmatvec_flg) then
21 allocate(WK(hecMAT%NP * hecMAT%NDOF))
22 call hecmw_TtmatTvec_33(hecMESH, hecMAT, X, Y, WK, Tcomm)
23 deallocate(WK)
24 else
25 call hecmw_matvec_33_inner(hecMESH, hecMAT, X, Y, Tcomm)
26 endif
27
28 if (present(COMMtime)) COMMtime = COMMtime + Tcomm
29 end subroutine hecmw_matvec_33
30
31 !C
32 !C***
33 !C*** hecmw_matvec_33_inner ( private subroutine )
34 !C***
35 !C
36 subroutine hecmw_matvec_33_inner (hecMESH, hecMAT, X, Y, COMMtime)
37 use hecmw_util
38 use m_hecmw_comm_f
39 use hecmw_matrix_contact
40 use hecmw_matrix_misc
41 use hecmw_jad_type
42 use hecmw_tuning_fx
43 !$ use omp_lib
44
45 implicit none
46 type (hecmwST_local_mesh), intent(in) :: hecMESH
47 type (hecmwST_matrix), intent(in), target :: hecMAT
48 real(kind=kreal), intent(in) :: X(:)
49 real(kind=kreal), intent(out) :: Y(:)
50 real(kind=kreal), intent(inout), optional :: COMMtime
51
52 real(kind=kreal) :: START_TIME, END_TIME, Tcomm
53 integer(kind=kint) :: i, j, jS, jE, in
54 real(kind=kreal) :: YV1, YV2, YV3, X1, X2, X3
55
56 integer(kind=kint) :: N, NP
57 integer(kind=kint), pointer :: indexL(:), itemL(:), indexU(:), itemU(:)
58 real(kind=kreal), pointer :: AL(:), AU(:), D(:)
59
60 ! for communication hiding
61 integer(kind=kint) :: ireq
62
63 ! added for turning >>>
64 integer, parameter :: numOfBlockPerThread = 100
65 logical, save :: isFirst = .true.
66 integer, save :: numOfThread = 1
67 integer, save, allocatable :: startPos(:), endPos(:)
68 integer(kind=kint), save :: sectorCacheSize0, sectorCacheSize1
69 integer(kind=kint) :: threadNum, blockNum, numOfBlock
70 integer(kind=kint) :: numOfElement, elementCount, blockIndex
71 real(kind=kreal) :: numOfElementPerBlock
72 ! <<< added for turning
73
74 IF (hecmw_JAD_IS_INITIALIZED().ne.0) THEN
75 Tcomm = 0.d0
76 START_TIME = hecmw_Wtime()
77 call hecmw_JAD_MATVEC(hecMESH, hecMAT, X, Y, Tcomm)
78 END_TIME = hecmw_Wtime()
79 time_Ax = time_Ax + END_TIME - START_TIME - Tcomm
80 if (present(COMMtime)) COMMtime = COMMtime + Tcomm
81 ELSE
82
83 N = hecMAT%N
84 NP = hecMAT%NP
85 indexL => hecMAT%indexL
86 indexU => hecMAT%indexU
87 itemL => hecMAT%itemL
88 itemU => hecMAT%itemU
89 AL => hecMAT%AL
90 AU => hecMAT%AU
91 D => hecMAT%D
92
93 ! added for turning >>>
94 if (.not. isFirst) then
95 numOfBlock = numOfThread * numOfBlockPerThread
96 if (endPos(numOfBlock-1) .ne. N-1) then
97 deallocate(startPos, endPos)
98 isFirst = .true.
99 endif
100 endif
101 if (isFirst) then
102 !$ numOfThread = omp_get_max_threads()
103 numOfBlock = numOfThread * numOfBlockPerThread
104 allocate (startPos(0 : numOfBlock - 1), endPos(0 : numOfBlock - 1))
105 numOfElement = N + indexL(N) + indexU(N)
106 numOfElementPerBlock = dble(numOfElement) / numOfBlock
107 blockNum = 0
108 elementCount = 0
109 startPos(blockNum) = 1
110 do i= 1, N
111 elementCount = elementCount + 1
112 elementCount = elementCount + (indexL(i) - indexL(i-1))
113 elementCount = elementCount + (indexU(i) - indexU(i-1))
114 if (elementCount > (blockNum + 1) * numOfElementPerBlock) then
115 endPos(blockNum) = i
116 ! write(9000+hecMESH%my_rank,*) mod(blockNum, numOfThread), &
117 ! startPos(blockNum), endPos(blockNum)
118 blockNum = blockNum + 1
119 startPos(blockNum) = i + 1
120 if (blockNum == (numOfBlock - 1)) exit
121 endif
122 enddo
123 endPos(blockNum) = N
124 ! write(9000+hecMESH%my_rank,*) mod(blockNum, numOfThread), &
125 ! startPos(blockNum), endPos(blockNum)
126 ! for irregular data
127 do i= blockNum+1, numOfBlock-1
128 startPos(i) = N
129 endPos(i) = N-1
130 ! write(9000+hecMESH%my_rank,*) mod(i, numOfThread), &
131 ! startPos(i), endPos(i)
132 end do
133
134 call hecmw_tuning_fx_calc_sector_cache(NP, 3, &
135 sectorCacheSize0, sectorCacheSize1)
136
137 isFirst = .false.
138 endif
139 ! <<< added for turning
140
141 START_TIME= HECMW_WTIME()
142 ! if (async_matvec_flg) then
143 ! call hecmw_update_3_R_async (hecMESH, X, NP, ireq)
144 ! else
145 call hecmw_update_3_R (hecMESH, X, NP)
146 ! endif
147 END_TIME= HECMW_WTIME()
148 if (present(COMMtime)) COMMtime = COMMtime + END_TIME - START_TIME
149
150 START_TIME = hecmw_Wtime()
151
152 !call fapp_start("loopInMatvec33", 1, 0)
153 !call start_collection("loopInMatvec33")
154
155 !OCL CACHE_SECTOR_SIZE(sectorCacheSize0,sectorCacheSize1)
156 !OCL CACHE_SUBSECTOR_ASSIGN(X)
157
158 !$OMP PARALLEL DEFAULT(NONE) &
159 !$OMP&PRIVATE(i,X1,X2,X3,YV1,YV2,YV3,jS,jE,j,in,threadNum,blockNum,blockIndex) &
160 !$OMP&SHARED(D,AL,AU,indexL,itemL,indexU,itemU,X,Y,startPos,endPos,numOfThread,N,async_matvec_flg)
161 threadNum = 0
162 !$ threadNum = omp_get_thread_num()
163 do blockNum = 0 , numOfBlockPerThread - 1
164 blockIndex = blockNum * numOfThread + threadNum
165 do i = startPos(blockIndex), endPos(blockIndex)
166 X1= X(3*i-2)
167 X2= X(3*i-1)
168 X3= X(3*i )
169 YV1= D(9*i-8)*X1 + D(9*i-7)*X2 + D(9*i-6)*X3
170 YV2= D(9*i-5)*X1 + D(9*i-4)*X2 + D(9*i-3)*X3
171 YV3= D(9*i-2)*X1 + D(9*i-1)*X2 + D(9*i )*X3
172
173 jS= indexL(i-1) + 1
174 jE= indexL(i )
175 do j= jS, jE
176 in = itemL(j)
177 X1= X(3*in-2)
178 X2= X(3*in-1)
179 X3= X(3*in )
180 YV1= YV1 + AL(9*j-8)*X1 + AL(9*j-7)*X2 + AL(9*j-6)*X3
181 YV2= YV2 + AL(9*j-5)*X1 + AL(9*j-4)*X2 + AL(9*j-3)*X3
182 YV3= YV3 + AL(9*j-2)*X1 + AL(9*j-1)*X2 + AL(9*j )*X3
183 enddo
184 jS= indexU(i-1) + 1
185 jE= indexU(i )
186 do j= jS, jE
187 in = itemU(j)
188 ! if (async_matvec_flg .and. in > N) cycle
189 X1= X(3*in-2)
190 X2= X(3*in-1)
191 X3= X(3*in )
192 YV1= YV1 + AU(9*j-8)*X1 + AU(9*j-7)*X2 + AU(9*j-6)*X3
193 YV2= YV2 + AU(9*j-5)*X1 + AU(9*j-4)*X2 + AU(9*j-3)*X3
194 YV3= YV3 + AU(9*j-2)*X1 + AU(9*j-1)*X2 + AU(9*j )*X3
195 enddo
196 Y(3*i-2)= YV1
197 Y(3*i-1)= YV2
198 Y(3*i )= YV3
199 enddo
200 enddo
201 !$OMP END PARALLEL
202
203 !OCL END_CACHE_SUBSECTOR
204 !OCL END_CACHE_SECTOR_SIZE
205
206 !call stop_collection("loopInMatvec33")
207 !call fapp_stop("loopInMatvec33", 1, 0)
208
209 END_TIME = hecmw_Wtime()
210 time_Ax = time_Ax + END_TIME - START_TIME
211
212 ! if (async_matvec_flg) then
213 ! START_TIME= HECMW_WTIME()
214 ! call hecmw_update_3_R_wait (hecMESH, ireq)
215 ! END_TIME= HECMW_WTIME()
216 ! if (present(COMMtime)) COMMtime = COMMtime + END_TIME - START_TIME
217
218 ! START_TIME = hecmw_Wtime()
219
220 ! do i = 1, N
221 ! jS= index_o(i-1) + 1
222 ! jE= index_o(i )
223 ! if (jS > jE) cycle
224 ! YV1= 0.d0
225 ! YV2= 0.d0
226 ! YV3= 0.d0
227 ! do j=jS, jE
228 ! in = item_o(j)
229 ! X1= X(3*(N+in)-2)
230 ! X2= X(3*(N+in)-1)
231 ! X3= X(3*(N+in) )
232 ! YV1= YV1 + A_o(9*j-8)*X1 + A_o(9*j-7)*X2 + A_o(9*j-6)*X3
233 ! YV2= YV2 + A_o(9*j-5)*X1 + A_o(9*j-4)*X2 + A_o(9*j-3)*X3
234 ! YV3= YV3 + A_o(9*j-2)*X1 + A_o(9*j-1)*X2 + A_o(9*j )*X3
235 ! enddo
236 ! Y(3*i-2)= Y(3*i-2)+YV1
237 ! Y(3*i-1)= Y(3*i-1)+YV2
238 ! Y(3*i )= Y(3*i )+YV3
239 ! enddo
240
241 ! END_TIME = hecmw_Wtime()
242 ! time_Ax = time_Ax + END_TIME - START_TIME
243 ! endif
244
245 ENDIF
246
247 if (hecMAT%cmat%n_val > 0) then
248 call hecmw_cmat_multvec_add( hecMAT%cmat, X, Y, NP * hecMAT%NDOF )
249 end if
250
251 end subroutine hecmw_matvec_33_inner