| 1 | subroutine hecmw_precond_SSOR_33_apply(ZP)
|
| 2 | use hecmw_tuning_fx
|
| 3 | implicit none
|
| 4 | real(kind=kreal), intent(inout) :: ZP(:)
|
| 5 | integer(kind=kint) :: ic, i, iold, j, isL, ieL, isU, ieU, k
|
| 6 | real(kind=kreal) :: SW1, SW2, SW3, X1, X2, X3
|
| 7 |
|
| 8 | ! added for turning >>>
|
| 9 | integer(kind=kint), parameter :: numOfBlockPerThread = 100
|
| 10 | integer(kind=kint), save :: numOfThread = 1, numOfBlock
|
| 11 | integer(kind=kint), save, allocatable :: icToBlockIndex(:)
|
| 12 | integer(kind=kint), save, allocatable :: blockIndexToColorIndex(:)
|
| 13 | integer(kind=kint), save :: sectorCacheSize0, sectorCacheSize1
|
| 14 | integer(kind=kint) :: blockIndex, elementCount, numOfElement, ii
|
| 15 | real(kind=kreal) :: numOfElementPerBlock
|
| 16 | integer(kind=kint) :: my_rank
|
| 17 |
|
| 18 | if (isFirst .eqv. .true.) then
|
| 19 | !$ numOfThread = omp_get_max_threads()
|
| 20 | numOfBlock = numOfThread * numOfBlockPerThread
|
| 21 | if (allocated(icToBlockIndex)) deallocate(icToBlockIndex)
|
| 22 | if (allocated(blockIndexToColorIndex)) deallocate(blockIndexToColorIndex)
|
| 23 | allocate (icToBlockIndex(0:NColor), &
|
| 24 | blockIndexToColorIndex(0:numOfBlock + NColor))
|
| 25 | numOfElement = N + indexL(N) + indexU(N)
|
| 26 | numOfElementPerBlock = dble(numOfElement) / numOfBlock
|
| 27 | blockIndex = 0
|
| 28 | icToBlockIndex = -1
|
| 29 | icToBlockIndex(0) = 0
|
| 30 | blockIndexToColorIndex = -1
|
| 31 | blockIndexToColorIndex(0) = 0
|
| 32 | my_rank = hecmw_comm_get_rank()
|
| 33 | ! write(9000+my_rank,*) &
|
| 34 | ! '# numOfElementPerBlock =', numOfElementPerBlock
|
| 35 | ! write(9000+my_rank,*) &
|
| 36 | ! '# ic, blockIndex, colorIndex, elementCount'
|
| 37 | do ic = 1, NColor
|
| 38 | elementCount = 0
|
| 39 | ii = 1
|
| 40 | do i = COLORindex(ic-1)+1, COLORindex(ic)
|
| 41 | elementCount = elementCount + 1
|
| 42 | elementCount = elementCount + (indexL(i) - indexL(i-1))
|
| 43 | elementCount = elementCount + (indexU(i) - indexU(i-1))
|
| 44 | if (elementCount > ii * numOfElementPerBlock &
|
| 45 | .or. i == COLORindex(ic)) then
|
| 46 | ii = ii + 1
|
| 47 | blockIndex = blockIndex + 1
|
| 48 | blockIndexToColorIndex(blockIndex) = i
|
| 49 | ! write(9000+my_rank,*) ic, blockIndex, &
|
| 50 | ! blockIndexToColorIndex(blockIndex), elementCount
|
| 51 | endif
|
| 52 | enddo
|
| 53 | icToBlockIndex(ic) = blockIndex
|
| 54 | enddo
|
| 55 | numOfBlock = blockIndex
|
| 56 |
|
| 57 | call hecmw_tuning_fx_calc_sector_cache( N, 3, &
|
| 58 | sectorCacheSize0, sectorCacheSize1 )
|
| 59 |
|
| 60 | isFirst = .false.
|
| 61 | endif
|
| 62 | ! <<< added for turning
|
| 63 |
|
| 64 | !call start_collection("loopInPrecond33")
|
| 65 |
|
| 66 | !OCL CACHE_SECTOR_SIZE(sectorCacheSize0,sectorCacheSize1)
|
| 67 | !OCL CACHE_SUBSECTOR_ASSIGN(ZP)
|
| 68 |
|
| 69 | !$omp parallel default(none) &
|
| 70 | !$omp&shared(NColor,indexL,itemL,indexU,itemU,AL,AU,D,ALU,perm,&
|
| 71 | !$omp& NContact,indexCL,itemCL,indexCU,itemCU,CAL,CAU,&
|
| 72 | !$omp& ZP,icToBlockIndex,blockIndexToColorIndex) &
|
| 73 | !$omp&private(SW1,SW2,SW3,X1,X2,X3,ic,i,iold,isL,ieL,isU,ieU,j,k,blockIndex)
|
| 74 |
|
| 75 | !C-- FORWARD
|
| 76 | do ic=1,NColor
|
| 77 | !$omp do schedule (static, 1)
|
| 78 | do blockIndex = icToBlockIndex(ic-1)+1, icToBlockIndex(ic)
|
| 79 | do i = blockIndexToColorIndex(blockIndex-1)+1, &
|
| 80 | blockIndexToColorIndex(blockIndex)
|
| 81 | ! do i = startPos(threadNum, ic), endPos(threadNum, ic)
|
| 82 | iold = perm(i)
|
| 83 | SW1= ZP(3*iold-2)
|
| 84 | SW2= ZP(3*iold-1)
|
| 85 | SW3= ZP(3*iold )
|
| 86 | isL= indexL(i-1)+1
|
| 87 | ieL= indexL(i)
|
| 88 | do j= isL, ieL
|
| 89 | !k= perm(itemL(j))
|
| 90 | k= itemL(j)
|
| 91 | X1= ZP(3*k-2)
|
| 92 | X2= ZP(3*k-1)
|
| 93 | X3= ZP(3*k )
|
| 94 | SW1= SW1 - AL(9*j-8)*X1 - AL(9*j-7)*X2 - AL(9*j-6)*X3
|
| 95 | SW2= SW2 - AL(9*j-5)*X1 - AL(9*j-4)*X2 - AL(9*j-3)*X3
|
| 96 | SW3= SW3 - AL(9*j-2)*X1 - AL(9*j-1)*X2 - AL(9*j )*X3
|
| 97 | enddo ! j
|
| 98 |
|
| 99 | if (NContact.ne.0) then
|
| 100 | isL= indexCL(i-1)+1
|
| 101 | ieL= indexCL(i)
|
| 102 | do j= isL, ieL
|
| 103 | !k= perm(itemCL(j))
|
| 104 | k= itemCL(j)
|
| 105 | X1= ZP(3*k-2)
|
| 106 | X2= ZP(3*k-1)
|
| 107 | X3= ZP(3*k )
|
| 108 | SW1= SW1 - CAL(9*j-8)*X1 - CAL(9*j-7)*X2 - CAL(9*j-6)*X3
|
| 109 | SW2= SW2 - CAL(9*j-5)*X1 - CAL(9*j-4)*X2 - CAL(9*j-3)*X3
|
| 110 | SW3= SW3 - CAL(9*j-2)*X1 - CAL(9*j-1)*X2 - CAL(9*j )*X3
|
| 111 | enddo ! j
|
| 112 | endif
|
| 113 |
|
| 114 | X1= SW1
|
| 115 | X2= SW2
|
| 116 | X3= SW3
|
| 117 | X2= X2 - ALU(9*i-5)*X1
|
| 118 | X3= X3 - ALU(9*i-2)*X1 - ALU(9*i-1)*X2
|
| 119 | X3= ALU(9*i )* X3
|
| 120 | X2= ALU(9*i-4)*( X2 - ALU(9*i-3)*X3 )
|
| 121 | X1= ALU(9*i-8)*( X1 - ALU(9*i-6)*X3 - ALU(9*i-7)*X2)
|
| 122 | ZP(3*iold-2)= X1
|
| 123 | ZP(3*iold-1)= X2
|
| 124 | ZP(3*iold )= X3
|
| 125 | enddo ! i
|
| 126 | enddo ! blockIndex
|
| 127 | !$omp end do
|
| 128 | enddo ! ic
|
| 129 |
|
| 130 | !C-- BACKWARD
|
| 131 | do ic=NColor, 1, -1
|
| 132 | !$omp do schedule (static, 1)
|
| 133 | do blockIndex = icToBlockIndex(ic), icToBlockIndex(ic-1)+1, -1
|
| 134 | do i = blockIndexToColorIndex(blockIndex), &
|
| 135 | blockIndexToColorIndex(blockIndex-1)+1, -1
|
| 136 | ! do blockIndex = icToBlockIndex(ic-1)+1, icToBlockIndex(ic)
|
| 137 | ! do i = blockIndexToColorIndex(blockIndex-1)+1, &
|
| 138 | ! blockIndexToColorIndex(blockIndex)
|
| 139 | ! do i = endPos(threadNum, ic), startPos(threadNum, ic), -1
|
| 140 | SW1= 0.d0
|
| 141 | SW2= 0.d0
|
| 142 | SW3= 0.d0
|
| 143 | isU= indexU(i-1) + 1
|
| 144 | ieU= indexU(i)
|
| 145 | do j= ieU, isU, -1
|
| 146 | !k= perm(itemU(j))
|
| 147 | k= itemU(j)
|
| 148 | X1= ZP(3*k-2)
|
| 149 | X2= ZP(3*k-1)
|
| 150 | X3= ZP(3*k )
|
| 151 | SW1= SW1 + AU(9*j-8)*X1 + AU(9*j-7)*X2 + AU(9*j-6)*X3
|
| 152 | SW2= SW2 + AU(9*j-5)*X1 + AU(9*j-4)*X2 + AU(9*j-3)*X3
|
| 153 | SW3= SW3 + AU(9*j-2)*X1 + AU(9*j-1)*X2 + AU(9*j )*X3
|
| 154 | enddo ! j
|
| 155 |
|
| 156 | if (NContact.gt.0) then
|
| 157 | isU= indexCU(i-1) + 1
|
| 158 | ieU= indexCU(i)
|
| 159 | do j= ieU, isU, -1
|
| 160 | !k= perm(itemCU(j))
|
| 161 | k= itemCU(j)
|
| 162 | X1= ZP(3*k-2)
|
| 163 | X2= ZP(3*k-1)
|
| 164 | X3= ZP(3*k )
|
| 165 | SW1= SW1 + CAU(9*j-8)*X1 + CAU(9*j-7)*X2 + CAU(9*j-6)*X3
|
| 166 | SW2= SW2 + CAU(9*j-5)*X1 + CAU(9*j-4)*X2 + CAU(9*j-3)*X3
|
| 167 | SW3= SW3 + CAU(9*j-2)*X1 + CAU(9*j-1)*X2 + CAU(9*j )*X3
|
| 168 | enddo ! j
|
| 169 | endif
|
| 170 |
|
| 171 | X1= SW1
|
| 172 | X2= SW2
|
| 173 | X3= SW3
|
| 174 | X2= X2 - ALU(9*i-5)*X1
|
| 175 | X3= X3 - ALU(9*i-2)*X1 - ALU(9*i-1)*X2
|
| 176 | X3= ALU(9*i )* X3
|
| 177 | X2= ALU(9*i-4)*( X2 - ALU(9*i-3)*X3 )
|
| 178 | X1= ALU(9*i-8)*( X1 - ALU(9*i-6)*X3 - ALU(9*i-7)*X2)
|
| 179 | iold = perm(i)
|
| 180 | ZP(3*iold-2)= ZP(3*iold-2) - X1
|
| 181 | ZP(3*iold-1)= ZP(3*iold-1) - X2
|
| 182 | ZP(3*iold )= ZP(3*iold ) - X3
|
| 183 | enddo ! i
|
| 184 | enddo ! blockIndex
|
| 185 | !$omp end do
|
| 186 | enddo ! ic
|
| 187 | !$omp end parallel
|
| 188 |
|
| 189 | !OCL END_CACHE_SUBSECTOR
|
| 190 | !OCL END_CACHE_SECTOR_SIZE
|
| 191 |
|
| 192 | !call stop_collection("loopInPrecond33")
|
| 193 |
|
| 194 | end subroutine hecmw_precond_SSOR_33_apply
|