1 | !C
|
2 | !C***
|
3 | !C*** hecmw_matvec_33
|
4 | !C***
|
5 | !C
|
6 | subroutine hecmw_matvec_33 (hecMESH, hecMAT, X, Y, COMMtime)
|
7 | use hecmw_util
|
8 | implicit none
|
9 | type (hecmwST_local_mesh), intent(in) :: hecMESH
|
10 | type (hecmwST_matrix), intent(in), target :: hecMAT
|
11 | real(kind=kreal), intent(in) :: X(:)
|
12 | real(kind=kreal), intent(out) :: Y(:)
|
13 | real(kind=kreal), intent(inout), optional :: COMMtime
|
14 |
|
15 | real(kind=kreal) :: Tcomm
|
16 | real(kind=kreal), allocatable :: WK(:)
|
17 |
|
18 | Tcomm = 0.d0
|
19 |
|
20 | if (mpcmatvec_flg) then
|
21 | allocate(WK(hecMAT%NP * hecMAT%NDOF))
|
22 | call hecmw_TtmatTvec_33(hecMESH, hecMAT, X, Y, WK, Tcomm)
|
23 | deallocate(WK)
|
24 | else
|
25 | call hecmw_matvec_33_inner(hecMESH, hecMAT, X, Y, Tcomm)
|
26 | endif
|
27 |
|
28 | if (present(COMMtime)) COMMtime = COMMtime + Tcomm
|
29 | end subroutine hecmw_matvec_33
|
30 |
|
31 | !C
|
32 | !C***
|
33 | !C*** hecmw_matvec_33_inner ( private subroutine )
|
34 | !C***
|
35 | !C
|
36 | subroutine hecmw_matvec_33_inner (hecMESH, hecMAT, X, Y, COMMtime)
|
37 | use hecmw_util
|
38 | use m_hecmw_comm_f
|
39 | use hecmw_matrix_contact
|
40 | use hecmw_matrix_misc
|
41 | use hecmw_jad_type
|
42 | use hecmw_tuning_fx
|
43 | !$ use omp_lib
|
44 |
|
45 | implicit none
|
46 | type (hecmwST_local_mesh), intent(in) :: hecMESH
|
47 | type (hecmwST_matrix), intent(in), target :: hecMAT
|
48 | real(kind=kreal), intent(in) :: X(:)
|
49 | real(kind=kreal), intent(out) :: Y(:)
|
50 | real(kind=kreal), intent(inout), optional :: COMMtime
|
51 |
|
52 | real(kind=kreal) :: START_TIME, END_TIME, Tcomm
|
53 | integer(kind=kint) :: i, j, jS, jE, in
|
54 | real(kind=kreal) :: YV1, YV2, YV3, X1, X2, X3
|
55 |
|
56 | integer(kind=kint) :: N, NP
|
57 | integer(kind=kint), pointer :: indexL(:), itemL(:), indexU(:), itemU(:)
|
58 | real(kind=kreal), pointer :: AL(:), AU(:), D(:)
|
59 |
|
60 | ! for communication hiding
|
61 | integer(kind=kint) :: ireq
|
62 |
|
63 | ! added for turning >>>
|
64 | integer, parameter :: numOfBlockPerThread = 100
|
65 | logical, save :: isFirst = .true.
|
66 | integer, save :: numOfThread = 1
|
67 | integer, save, allocatable :: startPos(:), endPos(:)
|
68 | integer(kind=kint), save :: sectorCacheSize0, sectorCacheSize1
|
69 | integer(kind=kint) :: threadNum, blockNum, numOfBlock
|
70 | integer(kind=kint) :: numOfElement, elementCount, blockIndex
|
71 | real(kind=kreal) :: numOfElementPerBlock
|
72 | ! <<< added for turning
|
73 |
|
74 | IF (hecmw_JAD_IS_INITIALIZED().ne.0) THEN
|
75 | Tcomm = 0.d0
|
76 | START_TIME = hecmw_Wtime()
|
77 | call hecmw_JAD_MATVEC(hecMESH, hecMAT, X, Y, Tcomm)
|
78 | END_TIME = hecmw_Wtime()
|
79 | time_Ax = time_Ax + END_TIME - START_TIME - Tcomm
|
80 | if (present(COMMtime)) COMMtime = COMMtime + Tcomm
|
81 | ELSE
|
82 |
|
83 | N = hecMAT%N
|
84 | NP = hecMAT%NP
|
85 | indexL => hecMAT%indexL
|
86 | indexU => hecMAT%indexU
|
87 | itemL => hecMAT%itemL
|
88 | itemU => hecMAT%itemU
|
89 | AL => hecMAT%AL
|
90 | AU => hecMAT%AU
|
91 | D => hecMAT%D
|
92 |
|
93 | ! added for turning >>>
|
94 | if (.not. isFirst) then
|
95 | numOfBlock = numOfThread * numOfBlockPerThread
|
96 | if (endPos(numOfBlock-1) .ne. N-1) then
|
97 | deallocate(startPos, endPos)
|
98 | isFirst = .true.
|
99 | endif
|
100 | endif
|
101 | if (isFirst) then
|
102 | !$ numOfThread = omp_get_max_threads()
|
103 | numOfBlock = numOfThread * numOfBlockPerThread
|
104 | allocate (startPos(0 : numOfBlock - 1), endPos(0 : numOfBlock - 1))
|
105 | numOfElement = N + indexL(N) + indexU(N)
|
106 | numOfElementPerBlock = dble(numOfElement) / numOfBlock
|
107 | blockNum = 0
|
108 | elementCount = 0
|
109 | startPos(blockNum) = 1
|
110 | do i= 1, N
|
111 | elementCount = elementCount + 1
|
112 | elementCount = elementCount + (indexL(i) - indexL(i-1))
|
113 | elementCount = elementCount + (indexU(i) - indexU(i-1))
|
114 | if (elementCount > (blockNum + 1) * numOfElementPerBlock) then
|
115 | endPos(blockNum) = i
|
116 | ! write(9000+hecMESH%my_rank,*) mod(blockNum, numOfThread), &
|
117 | ! startPos(blockNum), endPos(blockNum)
|
118 | blockNum = blockNum + 1
|
119 | startPos(blockNum) = i + 1
|
120 | if (blockNum == (numOfBlock - 1)) exit
|
121 | endif
|
122 | enddo
|
123 | endPos(blockNum) = N
|
124 | ! write(9000+hecMESH%my_rank,*) mod(blockNum, numOfThread), &
|
125 | ! startPos(blockNum), endPos(blockNum)
|
126 | ! for irregular data
|
127 | do i= blockNum+1, numOfBlock-1
|
128 | startPos(i) = N
|
129 | endPos(i) = N-1
|
130 | ! write(9000+hecMESH%my_rank,*) mod(i, numOfThread), &
|
131 | ! startPos(i), endPos(i)
|
132 | end do
|
133 |
|
134 | call hecmw_tuning_fx_calc_sector_cache(NP, 3, &
|
135 | sectorCacheSize0, sectorCacheSize1)
|
136 |
|
137 | isFirst = .false.
|
138 | endif
|
139 | ! <<< added for turning
|
140 |
|
141 | START_TIME= HECMW_WTIME()
|
142 | ! if (async_matvec_flg) then
|
143 | ! call hecmw_update_3_R_async (hecMESH, X, NP, ireq)
|
144 | ! else
|
145 | call hecmw_update_3_R (hecMESH, X, NP)
|
146 | ! endif
|
147 | END_TIME= HECMW_WTIME()
|
148 | if (present(COMMtime)) COMMtime = COMMtime + END_TIME - START_TIME
|
149 |
|
150 | START_TIME = hecmw_Wtime()
|
151 |
|
152 | !call fapp_start("loopInMatvec33", 1, 0)
|
153 | !call start_collection("loopInMatvec33")
|
154 |
|
155 | !OCL CACHE_SECTOR_SIZE(sectorCacheSize0,sectorCacheSize1)
|
156 | !OCL CACHE_SUBSECTOR_ASSIGN(X)
|
157 |
|
158 | !$OMP PARALLEL DEFAULT(NONE) &
|
159 | !$OMP&PRIVATE(i,X1,X2,X3,YV1,YV2,YV3,jS,jE,j,in,threadNum,blockNum,blockIndex) &
|
160 | !$OMP&SHARED(D,AL,AU,indexL,itemL,indexU,itemU,X,Y,startPos,endPos,numOfThread,N,async_matvec_flg)
|
161 | threadNum = 0
|
162 | !$ threadNum = omp_get_thread_num()
|
163 | do blockNum = 0 , numOfBlockPerThread - 1
|
164 | blockIndex = blockNum * numOfThread + threadNum
|
165 | do i = startPos(blockIndex), endPos(blockIndex)
|
166 | X1= X(3*i-2)
|
167 | X2= X(3*i-1)
|
168 | X3= X(3*i )
|
169 | YV1= D(9*i-8)*X1 + D(9*i-7)*X2 + D(9*i-6)*X3
|
170 | YV2= D(9*i-5)*X1 + D(9*i-4)*X2 + D(9*i-3)*X3
|
171 | YV3= D(9*i-2)*X1 + D(9*i-1)*X2 + D(9*i )*X3
|
172 |
|
173 | jS= indexL(i-1) + 1
|
174 | jE= indexL(i )
|
175 | do j= jS, jE
|
176 | in = itemL(j)
|
177 | X1= X(3*in-2)
|
178 | X2= X(3*in-1)
|
179 | X3= X(3*in )
|
180 | YV1= YV1 + AL(9*j-8)*X1 + AL(9*j-7)*X2 + AL(9*j-6)*X3
|
181 | YV2= YV2 + AL(9*j-5)*X1 + AL(9*j-4)*X2 + AL(9*j-3)*X3
|
182 | YV3= YV3 + AL(9*j-2)*X1 + AL(9*j-1)*X2 + AL(9*j )*X3
|
183 | enddo
|
184 | jS= indexU(i-1) + 1
|
185 | jE= indexU(i )
|
186 | do j= jS, jE
|
187 | in = itemU(j)
|
188 | ! if (async_matvec_flg .and. in > N) cycle
|
189 | X1= X(3*in-2)
|
190 | X2= X(3*in-1)
|
191 | X3= X(3*in )
|
192 | YV1= YV1 + AU(9*j-8)*X1 + AU(9*j-7)*X2 + AU(9*j-6)*X3
|
193 | YV2= YV2 + AU(9*j-5)*X1 + AU(9*j-4)*X2 + AU(9*j-3)*X3
|
194 | YV3= YV3 + AU(9*j-2)*X1 + AU(9*j-1)*X2 + AU(9*j )*X3
|
195 | enddo
|
196 | Y(3*i-2)= YV1
|
197 | Y(3*i-1)= YV2
|
198 | Y(3*i )= YV3
|
199 | enddo
|
200 | enddo
|
201 | !$OMP END PARALLEL
|
202 |
|
203 | !OCL END_CACHE_SUBSECTOR
|
204 | !OCL END_CACHE_SECTOR_SIZE
|
205 |
|
206 | !call stop_collection("loopInMatvec33")
|
207 | !call fapp_stop("loopInMatvec33", 1, 0)
|
208 |
|
209 | END_TIME = hecmw_Wtime()
|
210 | time_Ax = time_Ax + END_TIME - START_TIME
|
211 |
|
212 | ! if (async_matvec_flg) then
|
213 | ! START_TIME= HECMW_WTIME()
|
214 | ! call hecmw_update_3_R_wait (hecMESH, ireq)
|
215 | ! END_TIME= HECMW_WTIME()
|
216 | ! if (present(COMMtime)) COMMtime = COMMtime + END_TIME - START_TIME
|
217 |
|
218 | ! START_TIME = hecmw_Wtime()
|
219 |
|
220 | ! do i = 1, N
|
221 | ! jS= index_o(i-1) + 1
|
222 | ! jE= index_o(i )
|
223 | ! if (jS > jE) cycle
|
224 | ! YV1= 0.d0
|
225 | ! YV2= 0.d0
|
226 | ! YV3= 0.d0
|
227 | ! do j=jS, jE
|
228 | ! in = item_o(j)
|
229 | ! X1= X(3*(N+in)-2)
|
230 | ! X2= X(3*(N+in)-1)
|
231 | ! X3= X(3*(N+in) )
|
232 | ! YV1= YV1 + A_o(9*j-8)*X1 + A_o(9*j-7)*X2 + A_o(9*j-6)*X3
|
233 | ! YV2= YV2 + A_o(9*j-5)*X1 + A_o(9*j-4)*X2 + A_o(9*j-3)*X3
|
234 | ! YV3= YV3 + A_o(9*j-2)*X1 + A_o(9*j-1)*X2 + A_o(9*j )*X3
|
235 | ! enddo
|
236 | ! Y(3*i-2)= Y(3*i-2)+YV1
|
237 | ! Y(3*i-1)= Y(3*i-1)+YV2
|
238 | ! Y(3*i )= Y(3*i )+YV3
|
239 | ! enddo
|
240 |
|
241 | ! END_TIME = hecmw_Wtime()
|
242 | ! time_Ax = time_Ax + END_TIME - START_TIME
|
243 | ! endif
|
244 |
|
245 | ENDIF
|
246 |
|
247 | if (hecMAT%cmat%n_val > 0) then
|
248 | call hecmw_cmat_multvec_add( hecMAT%cmat, X, Y, NP * hecMAT%NDOF )
|
249 | end if
|
250 |
|
251 | end subroutine hecmw_matvec_33_inner
|