| 126 | | inline void step(double *src, double *dst, int offset, int startX, int endX) |
| 127 | | { |
| 128 | | int x = startX; |
| 129 | | Scalar scalarUpdater; |
| 130 | | |
| 131 | | if ((x & 1) == 1) { |
| 132 | | scalarUpdater.step(src, dst, offset, x, x + 1); |
| 133 | | x += 1; |
| 134 | | } |
| 135 | | |
| 136 | | __m128d oneFifth = _mm_set_pd(1.0/3.0, 1.0/3.0); |
| 137 | | __m128d buff0 = _mm_loadu_pd(src + x - 1); |
| 138 | | __m128d same0 = _mm_load_pd(src + x + 0); |
| 139 | | |
| 140 | | int paddedEndX = endX - 7; |
| 141 | | for (; x < paddedEndX; x += 8) { |
| 142 | | // load center row |
| 143 | | __m128d same1 = _mm_load_pd(src + x + 2); |
| 144 | | __m128d same2 = _mm_load_pd(src + x + 4); |
| 145 | | __m128d same3 = _mm_load_pd(src + x + 6); |
| 146 | | __m128d same4 = _mm_load_pd(src + x + 8); |
| 147 | | |
| 148 | | // shuffle values obtain left/right neighbors |
| 149 | | __m128d buff1 = _mm_shuffle_pd(same0, same1, (1 << 0) | (0 << 2)); |
| 150 | | __m128d buff2 = _mm_shuffle_pd(same1, same2, (1 << 0) | (0 << 2)); |
| 151 | | __m128d buff3 = _mm_shuffle_pd(same2, same3, (1 << 0) | (0 << 2)); |
| 152 | | __m128d buff4 = _mm_shuffle_pd(same3, same4, (1 << 0) | (0 << 2)); |
| 153 | | |
| 154 | | // add center row with left... |
| 155 | | same0 = _mm_add_pd(same0, buff0); |
| 156 | | same1 = _mm_add_pd(same1, buff1); |
| 157 | | same2 = _mm_add_pd(same2, buff2); |
| 158 | | same3 = _mm_add_pd(same3, buff3); |
| 159 | | |
| 160 | | // ...and right neighbors |
| 161 | | same0 = _mm_add_pd(same0, buff1); |
| 162 | | same1 = _mm_add_pd(same1, buff2); |
| 163 | | same2 = _mm_add_pd(same2, buff3); |
| 164 | | same3 = _mm_add_pd(same3, buff4); |
| 165 | | |
| 166 | | // load top row |
| 167 | | __m128d temp0 = _mm_load_pd(src - offset + x + 0); |
| 168 | | __m128d temp1 = _mm_load_pd(src - offset + x + 2); |
| 169 | | __m128d temp2 = _mm_load_pd(src - offset + x + 4); |
| 170 | | __m128d temp3 = _mm_load_pd(src - offset + x + 6); |
| 171 | | // add top row |
| 172 | | same0 = _mm_add_pd(same0, temp0); |
| 173 | | same1 = _mm_add_pd(same1, temp1); |
| 174 | | same2 = _mm_add_pd(same2, temp2); |
| 175 | | same3 = _mm_add_pd(same3, temp3); |
| 176 | | |
| 177 | | // load bottom row |
| 178 | | buff0 = _mm_load_pd(src + offset + x + 0); |
| 179 | | buff1 = _mm_load_pd(src + offset + x + 2); |
| 180 | | buff2 = _mm_load_pd(src + offset + x + 4); |
| 181 | | buff3 = _mm_load_pd(src + offset + x + 6); |
| 182 | | // add bottom row |
| 183 | | same0 = _mm_add_pd(same0, buff0); |
| 184 | | same1 = _mm_add_pd(same1, buff1); |
| 185 | | same2 = _mm_add_pd(same2, buff2); |
| 186 | | same3 = _mm_add_pd(same3, buff3); |
| 187 | | |
| 188 | | // scale down... |
| 189 | | same0 = _mm_mul_pd(same0, oneFifth); |
| 190 | | same1 = _mm_mul_pd(same1, oneFifth); |
| 191 | | same2 = _mm_mul_pd(same2, oneFifth); |
| 192 | | same3 = _mm_mul_pd(same3, oneFifth); |
| 193 | | |
| 194 | | // ...and store |
| 195 | | _mm_store_pd(dst + 0, same0); |
| 196 | | _mm_store_pd(dst + 2, same1); |
| 197 | | _mm_store_pd(dst + 4, same2); |
| 198 | | _mm_store_pd(dst + 6, same3); |
| 199 | | |
| 200 | | same0 = same4; |
| 201 | | buff0 = buff4; |
| 202 | | } |
| 203 | | |
| 204 | | scalarUpdater.step(src, dst, offset, x, endX); |
| 205 | | } |
| 206 | | |
/**
 * Number of floating point operations this updater reports per cell.
 */
int flops()
{
    const int flopsPerCell = 5;
    return flopsPerCell;
}
| 211 | | }; |
| 212 | | |
| 213 | | class VectorizedSSEMelbourneShuffleC |
| 214 | | { |
| 215 | | public: |
| 216 | | inline void step(double *src, double *dst, int offset, int startX, int endX) |
| 217 | | { |
| 218 | | int x = startX; |
| 219 | | Scalar scalarUpdater; |
| 220 | | |
| 221 | | if ((x & 1) == 1) { |
| 222 | | scalarUpdater.step(src, dst, offset, x, x + 1); |
| 223 | | x += 1; |
| 224 | | } |
| 225 | | |
| 226 | | __m128d oneFifth = _mm_set_pd(1.0/3.0, 1.0/3.0); |
| 227 | | __m128d buff0 = _mm_loadu_pd(src + x - 1); |
| 228 | | __m128d same0 = _mm_load_pd(src + x + 0); |
| 229 | | |
| 230 | | int paddedEndX = endX - 7; |
| 231 | | for (; x < paddedEndX; x += 8) { |
| 232 | | |
| 233 | | __m128d same1 = _mm_load_pd(src + x + 2); |
| 234 | | __m128d buff1 = _mm_shuffle_pd(same0, same1, (1 << 0) | (0 << 2)); |
| 235 | | same0 = _mm_add_pd(same0, buff0); |
| 236 | | same0 = _mm_add_pd(same0, buff1); |
| 237 | | __m128d temp0 = _mm_load_pd(src - offset + x + 0); |
| 238 | | same0 = _mm_add_pd(same0, temp0); |
| 239 | | temp0 = _mm_load_pd(src + offset + x + 0); |
| 240 | | same0 = _mm_add_pd(same0, temp0); |
| 241 | | same0 = _mm_mul_pd(same0, oneFifth); |
| 242 | | _mm_store_pd(dst + 0, same0); |
| 243 | | |
| 244 | | __m128d same2 = _mm_load_pd(src + x + 4); |
| 245 | | __m128d buff2 = _mm_shuffle_pd(same1, same2, (1 << 0) | (0 << 2)); |
| 246 | | same1 = _mm_add_pd(same1, buff1); |
| 247 | | same1 = _mm_add_pd(same1, buff2); |
| 248 | | __m128d temp1 = _mm_load_pd(src - offset + x + 2); |
| 249 | | same1 = _mm_add_pd(same1, temp1); |
| 250 | | temp1 = _mm_load_pd(src + offset + x + 2); |
| 251 | | same1 = _mm_add_pd(same1, temp1); |
| 252 | | same1 = _mm_mul_pd(same1, oneFifth); |
| 253 | | _mm_store_pd(dst + 2, same1); |
| 254 | | |
| 255 | | __m128d same3 = _mm_load_pd(src + x + 6); |
| 256 | | __m128d buff3 = _mm_shuffle_pd(same2, same3, (1 << 0) | (0 << 2)); |
| 257 | | same2 = _mm_add_pd(same2, buff2); |
| 258 | | same2 = _mm_add_pd(same2, buff3); |
| 259 | | __m128d temp2 = _mm_load_pd(src - offset + x + 4); |
| 260 | | same2 = _mm_add_pd(same2, temp2); |
| 261 | | temp2 = _mm_load_pd(src + offset + x + 4); |
| 262 | | same2 = _mm_add_pd(same2, temp2); |
| 263 | | same2 = _mm_mul_pd(same2, oneFifth); |
| 264 | | _mm_store_pd(dst + 4, same2); |
| 265 | | |
| 266 | | __m128d same4 = _mm_load_pd(src + x + 8); |
| 267 | | __m128d buff4 = _mm_shuffle_pd(same3, same4, (1 << 0) | (0 << 2)); |
| 268 | | same3 = _mm_add_pd(same3, buff3); |
| 269 | | same3 = _mm_add_pd(same3, buff4); |
| 270 | | __m128d temp3 = _mm_load_pd(src - offset + x + 6); |
| 271 | | same3 = _mm_add_pd(same3, temp3); |
| 272 | | temp3 = _mm_load_pd(src + offset + x + 6); |
| 273 | | same3 = _mm_add_pd(same3, temp3); |
| 274 | | same3 = _mm_mul_pd(same3, oneFifth); |
| 275 | | _mm_store_pd(dst + 6, same3); |
| 276 | | |
| 277 | | same0 = same4; |
| 278 | | buff0 = buff4; |
| 279 | | } |
| 280 | | |
| 281 | | scalarUpdater.step(src, dst, offset, x, endX); |
| 282 | | } |
| 283 | | |
| 284 | | int flops() |
| 285 | | { |
| 286 | | return 5; |
| 287 | | } |
| 288 | | }; |
| 289 | | |
| 290 | | |
| 291 | | template<typename UPDATER> |
| 292 | | class Benchmark |
| 293 | | { |
| 294 | | public: |
| 295 | | void run(Coord<2> dim, int repeats) |
| 296 | | { |
| 297 | | GridType a(dim); |
| 298 | | GridType b(dim); |
| | 136 | typedef Grid<double, typename Topologies::Cube<DIM>::Topology> GridType; |
| | 137 | |
| | 138 | |
| | 139 | void run(Coord<DIM> dim, int repeats) |
| | 140 | { |
| | 141 | Coord<DIM> coeffDim = dim; |
| | 142 | coeffDim.c[DIM - 1] *= UPDATER::coefficients(); |
| | 143 | GridType coeff(coeffDim, 0.1); |
| | 144 | GridType a(dim, 1.0); |
| | 145 | GridType b(dim, 1.0); |
/**
 * Plain scalar reference implementation of a 7-point stencil with one
 * coefficient per cell and neighbor.
 */
class Scalar3D
{
public:
    /**
     * Number of coefficient rows step() expects in its coeff argument.
     */
    static int coefficients()
    {
        return 7;
    }

    /**
     * Writes dst[x] for every x in [startX, endX) as the weighted sum of
     * src[x] and its six neighbors at distances 1, offsetY and offsetZ.
     *
     * Coefficient rows are matched to neighbors as follows:
     * 0: x - offsetZ, 1: x - offsetY, 2: x - 1, 3: x,
     * 4: x + 1, 5: x + offsetY, 6: x + offsetZ.
     */
    inline void step(double *coeff[7], double *src, double *dst, int offsetY, int offsetZ, int startX, int endX)
    {
        int x = startX;

        while (x < endX) {
            const int zLo = x - offsetZ;
            const int yLo = x - offsetY;
            const int xLo = x - 1;
            const int xHi = x + 1;
            const int yHi = x + offsetY;
            const int zHi = x + offsetZ;

            // kept as one expression so the evaluation order of the sum
            // stays exactly as before
            dst[x] =
                coeff[0][x] * src[zLo] +
                coeff[1][x] * src[yLo] +
                coeff[2][x] * src[xLo] +
                coeff[3][x] * src[x] +
                coeff[4][x] * src[xHi] +
                coeff[5][x] * src[yHi] +
                coeff[6][x] * src[zHi];

            ++x;
        }
    }

    /**
     * Floating point operations per cell: 7 multiplications, 6 additions.
     */
    int flops()
    {
        return 13;
    }
};
| | 248 | |
| | 249 | class Vectorized3D |
| | 250 | { |
| | 251 | public: |
| | 252 | static int coefficients() |
| | 253 | { |
| | 254 | return 7; |
| | 255 | } |
| | 256 | |
| | 257 | inline void step(double **coeff, double *src, double *dst, int offsetY, int offsetZ, int startX, int endX) |
| | 258 | { |
| | 259 | int x = startX; |
| | 260 | Scalar3D scalarUpdater; |
| | 261 | |
| | 262 | if ((x & 1) == 1) { |
| | 263 | scalarUpdater.step(coeff, src, dst, offsetY, offsetZ, x, x + 1); |
| | 264 | x += 1; |
| | 265 | } |
| | 266 | |
| | 267 | __m128d same0 = _mm_load_pd(src + x + 0); |
| | 268 | __m128d neig0 = _mm_loadu_pd(src + x + 1); |
| | 269 | |
| | 270 | int paddedEndX = endX - 7; |
| | 271 | for (; x < paddedEndX; x += 8) { |
| | 272 | __m128d same1 = _mm_load_pd(src + x + 2); |
| | 273 | __m128d same2 = _mm_load_pd(src + x + 4); |
| | 274 | __m128d same3 = _mm_load_pd(src + x + 6); |
| | 275 | __m128d same4 = _mm_load_pd(src + x + 8); |
| | 276 | |
| | 277 | __m128d neig1 = _mm_shuffle_pd(same0, same1, (1 << 0) | (0 << 2)); |
| | 278 | __m128d neig2 = _mm_shuffle_pd(same1, same2, (1 << 0) | (0 << 2)); |
| | 279 | __m128d neig3 = _mm_shuffle_pd(same2, same3, (1 << 0) | (0 << 2)); |
| | 280 | __m128d neig4 = _mm_shuffle_pd(same3, same4, (1 << 0) | (0 << 2)); |
| | 281 | |
| | 282 | same0 = _mm_mul_pd(same0, _mm_load_pd(&coeff[3][x + 0])); |
| | 283 | same1 = _mm_mul_pd(same1, _mm_load_pd(&coeff[3][x + 2])); |
| | 284 | same2 = _mm_mul_pd(same2, _mm_load_pd(&coeff[3][x + 4])); |
| | 285 | same3 = _mm_mul_pd(same3, _mm_load_pd(&coeff[3][x + 6])); |
| | 286 | |
| | 287 | __m128d temp1 = _mm_mul_pd(neig0, _mm_load_pd(&coeff[2][x + 0])); |
| | 288 | __m128d temp2 = _mm_mul_pd(neig1, _mm_load_pd(&coeff[2][x + 2])); |
| | 289 | __m128d temp3 = _mm_mul_pd(neig2, _mm_load_pd(&coeff[2][x + 4])); |
| | 290 | __m128d temp4 = _mm_mul_pd(neig3, _mm_load_pd(&coeff[2][x + 6])); |
| | 291 | |
| | 292 | same0 = _mm_add_pd(same0, temp1); |
| | 293 | same1 = _mm_add_pd(same1, temp2); |
| | 294 | same2 = _mm_add_pd(same2, temp3); |
| | 295 | same3 = _mm_add_pd(same3, temp4); |
| | 296 | |
| | 297 | temp1 = _mm_mul_pd(neig1, _mm_load_pd(&coeff[4][x + 0])); |
| | 298 | temp2 = _mm_mul_pd(neig2, _mm_load_pd(&coeff[4][x + 2])); |
| | 299 | temp3 = _mm_mul_pd(neig3, _mm_load_pd(&coeff[4][x + 4])); |
| | 300 | temp4 = _mm_mul_pd(neig4, _mm_load_pd(&coeff[4][x + 6])); |
| | 301 | |
| | 302 | same0 = _mm_add_pd(same0, temp1); |
| | 303 | same1 = _mm_add_pd(same1, temp2); |
| | 304 | same2 = _mm_add_pd(same2, temp3); |
| | 305 | same3 = _mm_add_pd(same3, temp4); |
| | 306 | |
| | 307 | neig0 = _mm_load_pd(src + x - offsetZ + 0); |
| | 308 | neig1 = _mm_load_pd(src + x - offsetZ + 2); |
| | 309 | neig2 = _mm_load_pd(src + x - offsetZ + 4); |
| | 310 | neig3 = _mm_load_pd(src + x - offsetZ + 6); |
| | 311 | |
| | 312 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&coeff[0][x + 0])); |
| | 313 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&coeff[0][x + 2])); |
| | 314 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&coeff[0][x + 4])); |
| | 315 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&coeff[0][x + 6])); |
| | 316 | |
| | 317 | same0 = _mm_add_pd(same0, temp1); |
| | 318 | same1 = _mm_add_pd(same1, temp2); |
| | 319 | same2 = _mm_add_pd(same2, temp3); |
| | 320 | same3 = _mm_add_pd(same3, temp4); |
| | 321 | |
| | 322 | neig0 = _mm_load_pd(src + x - offsetY + 0); |
| | 323 | neig1 = _mm_load_pd(src + x - offsetY + 2); |
| | 324 | neig2 = _mm_load_pd(src + x - offsetY + 4); |
| | 325 | neig3 = _mm_load_pd(src + x - offsetY + 6); |
| | 326 | |
| | 327 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&coeff[1][x + 0])); |
| | 328 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&coeff[1][x + 2])); |
| | 329 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&coeff[1][x + 4])); |
| | 330 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&coeff[1][x + 6])); |
| | 331 | |
| | 332 | same0 = _mm_add_pd(same0, temp1); |
| | 333 | same1 = _mm_add_pd(same1, temp2); |
| | 334 | same2 = _mm_add_pd(same2, temp3); |
| | 335 | same3 = _mm_add_pd(same3, temp4); |
| | 336 | |
| | 337 | neig0 = _mm_load_pd(src + x + offsetY + 0); |
| | 338 | neig1 = _mm_load_pd(src + x + offsetY + 2); |
| | 339 | neig2 = _mm_load_pd(src + x + offsetY + 4); |
| | 340 | neig3 = _mm_load_pd(src + x + offsetY + 6); |
| | 341 | |
| | 342 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&coeff[5][x + 0])); |
| | 343 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&coeff[5][x + 2])); |
| | 344 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&coeff[5][x + 4])); |
| | 345 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&coeff[5][x + 6])); |
| | 346 | |
| | 347 | same0 = _mm_add_pd(same0, temp1); |
| | 348 | same1 = _mm_add_pd(same1, temp2); |
| | 349 | same2 = _mm_add_pd(same2, temp3); |
| | 350 | same3 = _mm_add_pd(same3, temp4); |
| | 351 | |
| | 352 | neig0 = _mm_load_pd(src + x + offsetZ + 0); |
| | 353 | neig1 = _mm_load_pd(src + x + offsetZ + 2); |
| | 354 | neig2 = _mm_load_pd(src + x + offsetZ + 4); |
| | 355 | neig3 = _mm_load_pd(src + x + offsetZ + 6); |
| | 356 | |
| | 357 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&coeff[5][x + 0])); |
| | 358 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&coeff[5][x + 2])); |
| | 359 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&coeff[5][x + 4])); |
| | 360 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&coeff[5][x + 6])); |
| | 361 | |
| | 362 | same0 = _mm_add_pd(same0, temp1); |
| | 363 | same1 = _mm_add_pd(same1, temp2); |
| | 364 | same2 = _mm_add_pd(same2, temp3); |
| | 365 | same3 = _mm_add_pd(same3, temp4); |
| | 366 | |
| | 367 | _mm_store_pd(dst + 0, same0); |
| | 368 | _mm_store_pd(dst + 2, same1); |
| | 369 | _mm_store_pd(dst + 4, same2); |
| | 370 | _mm_store_pd(dst + 6, same3); |
| | 371 | |
| | 372 | same0 = same4; |
| | 373 | neig0 = neig4; |
| | 374 | |
| | 375 | // dst[x] = |
| | 376 | // coeff[0][x] * src[x - offsetZ] + |
| | 377 | // coeff[1][x] * src[x - offsetY] + |
| | 378 | // coeff[2][x] * src[x - 1] + |
| | 379 | // coeff[3][x] * src[x] + |
| | 380 | // coeff[4][x] * src[x + 1] + |
| | 381 | // coeff[5][x] * src[x + offsetY] + |
| | 382 | // coeff[6][x] * src[x + offsetZ]; |
| | 383 | } |
| | 384 | |
| | 385 | scalarUpdater.step(coeff, src, dst, offsetY, offsetZ, x, endX); |
| | 386 | } |
| | 387 | |
| | 388 | int flops() |
| | 389 | { |
| | 390 | return 13; |
| | 391 | } |
| | 392 | }; |
| | 393 | |
| | 394 | |