1 | subroutine hecmw_precond_SSOR_33_apply(ZP)
|
2 | use hecmw_tuning_fx
|
3 | implicit none
|
4 | real(kind=kreal), intent(inout) :: ZP(:)
|
5 | integer(kind=kint) :: ic, i, iold, j, isL, ieL, isU, ieU, k
|
6 | real(kind=kreal) :: SW1, SW2, SW3, X1, X2, X3
|
7 |
|
8 | ! added for turning >>>
|
9 | integer(kind=kint), parameter :: numOfBlockPerThread = 100
|
10 | integer(kind=kint), save :: numOfThread = 1, numOfBlock
|
11 | integer(kind=kint), save, allocatable :: icToBlockIndex(:)
|
12 | integer(kind=kint), save, allocatable :: blockIndexToColorIndex(:)
|
13 | integer(kind=kint), save :: sectorCacheSize0, sectorCacheSize1
|
14 | integer(kind=kint) :: blockIndex, elementCount, numOfElement, ii
|
15 | real(kind=kreal) :: numOfElementPerBlock
|
16 | integer(kind=kint) :: my_rank
|
17 |
|
18 | if (isFirst .eqv. .true.) then
|
19 | !$ numOfThread = omp_get_max_threads()
|
20 | numOfBlock = numOfThread * numOfBlockPerThread
|
21 | if (allocated(icToBlockIndex)) deallocate(icToBlockIndex)
|
22 | if (allocated(blockIndexToColorIndex)) deallocate(blockIndexToColorIndex)
|
23 | allocate (icToBlockIndex(0:NColor), &
|
24 | blockIndexToColorIndex(0:numOfBlock + NColor))
|
25 | numOfElement = N + indexL(N) + indexU(N)
|
26 | numOfElementPerBlock = dble(numOfElement) / numOfBlock
|
27 | blockIndex = 0
|
28 | icToBlockIndex = -1
|
29 | icToBlockIndex(0) = 0
|
30 | blockIndexToColorIndex = -1
|
31 | blockIndexToColorIndex(0) = 0
|
32 | my_rank = hecmw_comm_get_rank()
|
33 | ! write(9000+my_rank,*) &
|
34 | ! '# numOfElementPerBlock =', numOfElementPerBlock
|
35 | ! write(9000+my_rank,*) &
|
36 | ! '# ic, blockIndex, colorIndex, elementCount'
|
37 | do ic = 1, NColor
|
38 | elementCount = 0
|
39 | ii = 1
|
40 | do i = COLORindex(ic-1)+1, COLORindex(ic)
|
41 | elementCount = elementCount + 1
|
42 | elementCount = elementCount + (indexL(i) - indexL(i-1))
|
43 | elementCount = elementCount + (indexU(i) - indexU(i-1))
|
44 | if (elementCount > ii * numOfElementPerBlock &
|
45 | .or. i == COLORindex(ic)) then
|
46 | ii = ii + 1
|
47 | blockIndex = blockIndex + 1
|
48 | blockIndexToColorIndex(blockIndex) = i
|
49 | ! write(9000+my_rank,*) ic, blockIndex, &
|
50 | ! blockIndexToColorIndex(blockIndex), elementCount
|
51 | endif
|
52 | enddo
|
53 | icToBlockIndex(ic) = blockIndex
|
54 | enddo
|
55 | numOfBlock = blockIndex
|
56 |
|
57 | call hecmw_tuning_fx_calc_sector_cache( N, 3, &
|
58 | sectorCacheSize0, sectorCacheSize1 )
|
59 |
|
60 | isFirst = .false.
|
61 | endif
|
62 | ! <<< added for turning
|
63 |
|
64 | !call start_collection("loopInPrecond33")
|
65 |
|
66 | !OCL CACHE_SECTOR_SIZE(sectorCacheSize0,sectorCacheSize1)
|
67 | !OCL CACHE_SUBSECTOR_ASSIGN(ZP)
|
68 |
|
69 | !$omp parallel default(none) &
|
70 | !$omp&shared(NColor,indexL,itemL,indexU,itemU,AL,AU,D,ALU,perm,&
|
71 | !$omp& NContact,indexCL,itemCL,indexCU,itemCU,CAL,CAU,&
|
72 | !$omp& ZP,icToBlockIndex,blockIndexToColorIndex) &
|
73 | !$omp&private(SW1,SW2,SW3,X1,X2,X3,ic,i,iold,isL,ieL,isU,ieU,j,k,blockIndex)
|
74 |
|
75 | !C-- FORWARD
|
76 | do ic=1,NColor
|
77 | !$omp do schedule (static, 1)
|
78 | do blockIndex = icToBlockIndex(ic-1)+1, icToBlockIndex(ic)
|
79 | do i = blockIndexToColorIndex(blockIndex-1)+1, &
|
80 | blockIndexToColorIndex(blockIndex)
|
81 | ! do i = startPos(threadNum, ic), endPos(threadNum, ic)
|
82 | iold = perm(i)
|
83 | SW1= ZP(3*iold-2)
|
84 | SW2= ZP(3*iold-1)
|
85 | SW3= ZP(3*iold )
|
86 | isL= indexL(i-1)+1
|
87 | ieL= indexL(i)
|
88 | do j= isL, ieL
|
89 | !k= perm(itemL(j))
|
90 | k= itemL(j)
|
91 | X1= ZP(3*k-2)
|
92 | X2= ZP(3*k-1)
|
93 | X3= ZP(3*k )
|
94 | SW1= SW1 - AL(9*j-8)*X1 - AL(9*j-7)*X2 - AL(9*j-6)*X3
|
95 | SW2= SW2 - AL(9*j-5)*X1 - AL(9*j-4)*X2 - AL(9*j-3)*X3
|
96 | SW3= SW3 - AL(9*j-2)*X1 - AL(9*j-1)*X2 - AL(9*j )*X3
|
97 | enddo ! j
|
98 |
|
99 | if (NContact.ne.0) then
|
100 | isL= indexCL(i-1)+1
|
101 | ieL= indexCL(i)
|
102 | do j= isL, ieL
|
103 | !k= perm(itemCL(j))
|
104 | k= itemCL(j)
|
105 | X1= ZP(3*k-2)
|
106 | X2= ZP(3*k-1)
|
107 | X3= ZP(3*k )
|
108 | SW1= SW1 - CAL(9*j-8)*X1 - CAL(9*j-7)*X2 - CAL(9*j-6)*X3
|
109 | SW2= SW2 - CAL(9*j-5)*X1 - CAL(9*j-4)*X2 - CAL(9*j-3)*X3
|
110 | SW3= SW3 - CAL(9*j-2)*X1 - CAL(9*j-1)*X2 - CAL(9*j )*X3
|
111 | enddo ! j
|
112 | endif
|
113 |
|
114 | X1= SW1
|
115 | X2= SW2
|
116 | X3= SW3
|
117 | X2= X2 - ALU(9*i-5)*X1
|
118 | X3= X3 - ALU(9*i-2)*X1 - ALU(9*i-1)*X2
|
119 | X3= ALU(9*i )* X3
|
120 | X2= ALU(9*i-4)*( X2 - ALU(9*i-3)*X3 )
|
121 | X1= ALU(9*i-8)*( X1 - ALU(9*i-6)*X3 - ALU(9*i-7)*X2)
|
122 | ZP(3*iold-2)= X1
|
123 | ZP(3*iold-1)= X2
|
124 | ZP(3*iold )= X3
|
125 | enddo ! i
|
126 | enddo ! blockIndex
|
127 | !$omp end do
|
128 | enddo ! ic
|
129 |
|
130 | !C-- BACKWARD
|
131 | do ic=NColor, 1, -1
|
132 | !$omp do schedule (static, 1)
|
133 | do blockIndex = icToBlockIndex(ic), icToBlockIndex(ic-1)+1, -1
|
134 | do i = blockIndexToColorIndex(blockIndex), &
|
135 | blockIndexToColorIndex(blockIndex-1)+1, -1
|
136 | ! do blockIndex = icToBlockIndex(ic-1)+1, icToBlockIndex(ic)
|
137 | ! do i = blockIndexToColorIndex(blockIndex-1)+1, &
|
138 | ! blockIndexToColorIndex(blockIndex)
|
139 | ! do i = endPos(threadNum, ic), startPos(threadNum, ic), -1
|
140 | SW1= 0.d0
|
141 | SW2= 0.d0
|
142 | SW3= 0.d0
|
143 | isU= indexU(i-1) + 1
|
144 | ieU= indexU(i)
|
145 | do j= ieU, isU, -1
|
146 | !k= perm(itemU(j))
|
147 | k= itemU(j)
|
148 | X1= ZP(3*k-2)
|
149 | X2= ZP(3*k-1)
|
150 | X3= ZP(3*k )
|
151 | SW1= SW1 + AU(9*j-8)*X1 + AU(9*j-7)*X2 + AU(9*j-6)*X3
|
152 | SW2= SW2 + AU(9*j-5)*X1 + AU(9*j-4)*X2 + AU(9*j-3)*X3
|
153 | SW3= SW3 + AU(9*j-2)*X1 + AU(9*j-1)*X2 + AU(9*j )*X3
|
154 | enddo ! j
|
155 |
|
156 | if (NContact.gt.0) then
|
157 | isU= indexCU(i-1) + 1
|
158 | ieU= indexCU(i)
|
159 | do j= ieU, isU, -1
|
160 | !k= perm(itemCU(j))
|
161 | k= itemCU(j)
|
162 | X1= ZP(3*k-2)
|
163 | X2= ZP(3*k-1)
|
164 | X3= ZP(3*k )
|
165 | SW1= SW1 + CAU(9*j-8)*X1 + CAU(9*j-7)*X2 + CAU(9*j-6)*X3
|
166 | SW2= SW2 + CAU(9*j-5)*X1 + CAU(9*j-4)*X2 + CAU(9*j-3)*X3
|
167 | SW3= SW3 + CAU(9*j-2)*X1 + CAU(9*j-1)*X2 + CAU(9*j )*X3
|
168 | enddo ! j
|
169 | endif
|
170 |
|
171 | X1= SW1
|
172 | X2= SW2
|
173 | X3= SW3
|
174 | X2= X2 - ALU(9*i-5)*X1
|
175 | X3= X3 - ALU(9*i-2)*X1 - ALU(9*i-1)*X2
|
176 | X3= ALU(9*i )* X3
|
177 | X2= ALU(9*i-4)*( X2 - ALU(9*i-3)*X3 )
|
178 | X1= ALU(9*i-8)*( X1 - ALU(9*i-6)*X3 - ALU(9*i-7)*X2)
|
179 | iold = perm(i)
|
180 | ZP(3*iold-2)= ZP(3*iold-2) - X1
|
181 | ZP(3*iold-1)= ZP(3*iold-1) - X2
|
182 | ZP(3*iold )= ZP(3*iold ) - X3
|
183 | enddo ! i
|
184 | enddo ! blockIndex
|
185 | !$omp end do
|
186 | enddo ! ic
|
187 | !$omp end parallel
|
188 |
|
189 | !OCL END_CACHE_SUBSECTOR
|
190 | !OCL END_CACHE_SECTOR_SIZE
|
191 |
|
192 | !call stop_collection("loopInPrecond33")
|
193 |
|
194 | end subroutine hecmw_precond_SSOR_33_apply
|