| | 165 | // double *source[9] = { |
| | 166 | // &oldGrid->at(Coord<DIM>(0, y - 1, z + 1)), |
| | 167 | // &oldGrid->at(Coord<DIM>(0, y - 1, z )), |
| | 168 | // &oldGrid->at(Coord<DIM>(0, y - 1, z - 1)), |
| | 169 | // &oldGrid->at(Coord<DIM>(0, y, z + 1)), |
| | 170 | // &oldGrid->at(Coord<DIM>(0, y, z )), |
| | 171 | // &oldGrid->at(Coord<DIM>(0, y, z - 1)), |
| | 172 | // &oldGrid->at(Coord<DIM>(0, y + 1, z + 1)), |
| | 173 | // &oldGrid->at(Coord<DIM>(0, y + 1, z )), |
| | 174 | // &oldGrid->at(Coord<DIM>(0, y + 1, z - 1)) |
| | 175 | // }; |
| | 176 | // updater.update(source, &newGrid->at(c), 1, dim.c[0] - 1); |
| | 235 | |
| | 236 | #define TN 0 |
| | 237 | #define T 1 |
| | 238 | #define TS 2 |
| | 239 | #define N 3 |
| | 240 | #define C 4 |
| | 241 | #define S 5 |
| | 242 | #define BN 6 |
| | 243 | #define B 7 |
| | 244 | #define BS 8 |
| | 245 | |
| | 246 | class Jacobi3D |
| | 247 | { |
| | 248 | public: |
| | 249 | static int coefficients() |
| | 250 | { |
| | 251 | return 1; |
| | 252 | } |
| | 253 | |
| | 254 | inline void update(double **src, double *dst, int startX, int endX) |
| | 255 | { |
| | 256 | int x = startX; |
| | 257 | |
| | 258 | if ((x & 1) == 1) { |
| | 259 | updateScalar(src, dst, x, x + 1); |
| | 260 | x += 1; |
| | 261 | } |
| | 262 | |
| | 263 | __m128d oneSeventh = _mm_set_pd(1.0/7.0, 1.0/7.0); |
| | 264 | __m128d buff0 = _mm_loadu_pd(src[C] + x - 1); |
| | 265 | __m128d same0 = _mm_load_pd(src[C] + x + 0); |
| | 266 | |
| | 267 | int paddedEndX = endX - 7; |
| | 268 | for (; x < paddedEndX; x += 8) { |
| | 269 | // load center row |
| | 270 | __m128d same1 = _mm_load_pd(src[0] + x + 2); |
| | 271 | __m128d same2 = _mm_load_pd(src[0] + x + 4); |
| | 272 | __m128d same3 = _mm_load_pd(src[0] + x + 6); |
| | 273 | __m128d same4 = _mm_load_pd(src[0] + x + 8); |
| | 274 | |
| | 275 | // shuffle values obtain left/right neighbors |
| | 276 | __m128d buff1 = _mm_shuffle_pd(same0, same1, (1 << 0) | (0 << 2)); |
| | 277 | __m128d buff2 = _mm_shuffle_pd(same1, same2, (1 << 0) | (0 << 2)); |
| | 278 | __m128d buff3 = _mm_shuffle_pd(same2, same3, (1 << 0) | (0 << 2)); |
| | 279 | __m128d buff4 = _mm_shuffle_pd(same3, same4, (1 << 0) | (0 << 2)); |
| | 280 | |
| | 281 | // load top row |
| | 282 | __m128d temp0 = _mm_load_pd(src[T] + x + 0); |
| | 283 | __m128d temp1 = _mm_load_pd(src[T] + x + 2); |
| | 284 | __m128d temp2 = _mm_load_pd(src[T] + x + 4); |
| | 285 | __m128d temp3 = _mm_load_pd(src[T] + x + 6); |
| | 286 | |
| | 287 | // add center row with left... |
| | 288 | same0 = _mm_add_pd(same0, buff0); |
| | 289 | same1 = _mm_add_pd(same1, buff1); |
| | 290 | same2 = _mm_add_pd(same2, buff2); |
| | 291 | same3 = _mm_add_pd(same3, buff3); |
| | 292 | |
| | 293 | // ...and right neighbors |
| | 294 | same0 = _mm_add_pd(same0, buff1); |
| | 295 | same1 = _mm_add_pd(same1, buff2); |
| | 296 | same2 = _mm_add_pd(same2, buff3); |
| | 297 | same3 = _mm_add_pd(same3, buff4); |
| | 298 | |
| | 299 | // load bottom row |
| | 300 | buff0 = _mm_load_pd(src[B] + x + 0); |
| | 301 | buff1 = _mm_load_pd(src[B] + x + 2); |
| | 302 | buff2 = _mm_load_pd(src[B] + x + 4); |
| | 303 | buff3 = _mm_load_pd(src[B] + x + 6); |
| | 304 | |
| | 305 | // add top row |
| | 306 | same0 = _mm_add_pd(same0, temp0); |
| | 307 | same1 = _mm_add_pd(same1, temp1); |
| | 308 | same2 = _mm_add_pd(same2, temp2); |
| | 309 | same3 = _mm_add_pd(same3, temp3); |
| | 310 | |
| | 311 | // load north row |
| | 312 | temp0 = _mm_load_pd(src[N] + x + 0); |
| | 313 | temp1 = _mm_load_pd(src[N] + x + 2); |
| | 314 | temp2 = _mm_load_pd(src[N] + x + 4); |
| | 315 | temp3 = _mm_load_pd(src[N] + x + 6); |
| | 316 | |
| | 317 | // add bottom row |
| | 318 | same0 = _mm_add_pd(same0, buff0); |
| | 319 | same1 = _mm_add_pd(same1, buff1); |
| | 320 | same2 = _mm_add_pd(same2, buff2); |
| | 321 | same3 = _mm_add_pd(same3, buff3); |
| | 322 | |
| | 323 | // load south row |
| | 324 | buff0 = _mm_load_pd(src[S] + x + 0); |
| | 325 | buff1 = _mm_load_pd(src[S] + x + 2); |
| | 326 | buff2 = _mm_load_pd(src[S] + x + 4); |
| | 327 | buff3 = _mm_load_pd(src[S] + x + 6); |
| | 328 | |
| | 329 | // add north row |
| | 330 | same0 = _mm_add_pd(same0, temp0); |
| | 331 | same1 = _mm_add_pd(same1, temp1); |
| | 332 | same2 = _mm_add_pd(same2, temp2); |
| | 333 | same3 = _mm_add_pd(same3, temp3); |
| | 334 | |
| | 335 | // add south row |
| | 336 | same0 = _mm_add_pd(same0, buff0); |
| | 337 | same1 = _mm_add_pd(same1, buff1); |
| | 338 | same2 = _mm_add_pd(same2, buff2); |
| | 339 | same3 = _mm_add_pd(same3, buff3); |
| | 340 | |
| | 341 | // scale down... |
| | 342 | same0 = _mm_mul_pd(same0, oneSeventh); |
| | 343 | same1 = _mm_mul_pd(same1, oneSeventh); |
| | 344 | same2 = _mm_mul_pd(same2, oneSeventh); |
| | 345 | same3 = _mm_mul_pd(same3, oneSeventh); |
| | 346 | |
| | 347 | // ...and store |
| | 348 | _mm_store_pd(dst + x + 0, same0); |
| | 349 | _mm_store_pd(dst + x + 2, same1); |
| | 350 | _mm_store_pd(dst + x + 4, same2); |
| | 351 | _mm_store_pd(dst + x + 6, same3); |
| | 352 | |
| | 353 | same0 = same4; |
| | 354 | buff0 = buff4; |
| | 355 | } |
| | 356 | |
| | 357 | updateScalar(src, dst, x, endX); |
| | 358 | } |
| | 359 | |
| | 360 | inline void updateScalar(double *src[9], double *dst, int startX, int endX) |
| | 361 | { |
| | 362 | for (int x = startX; x < endX; ++x) { |
| | 363 | dst[x] = |
| | 364 | (src[S][x] + |
| | 365 | src[T][x] + |
| | 366 | src[C][x - 1] + |
| | 367 | src[C][x] + |
| | 368 | src[C][x + 1] + |
| | 369 | src[B][x] + |
| | 370 | src[N][x]) * (1.0 / 7.0); |
| | 371 | } |
| | 372 | } |
| | 373 | |
| | 374 | int flops() |
| | 375 | { |
| | 376 | return 8; |
| | 377 | } |
| | 378 | }; |
| | 379 | |