/**
 * Scalar stencil kernel operating on 13 coefficient arrays.
 *
 * For every index x the result is the sum of
 *  - 11 products coeff[k][x] * src[x + d], k = 0..10, where d runs over
 *    the nine combinations of {-offsetZ, 0, +offsetZ} and
 *    {-offsetY, 0, +offsetY} plus the two direct neighbours x - 1 and
 *    x + 1, and
 *  - the plain values of coeff[11] and coeff[12] at those nine
 *    combinations (the x - 1 / x + 1 neighbours are not part of it).
 */
class ExtendedScalar3D
{
public:
    /**
     * Number of coefficient arrays step() reads.
     */
    static int coefficients()
    {
        return 13;
    }

    /**
     * Computes dst[x] for all x in [startX, endX).
     *
     * coeff[0..10] are read at index x only; coeff[11], coeff[12] and src
     * are read at displaced indices, so those arrays have to be valid for
     * x +/- (offsetZ + offsetY). Terms are summed strictly left to right
     * in the order given below.
     */
    inline void step(double *coeff[13], double *src, double *dst, int offsetY, int offsetZ, int startX, int endX)
    {
        // Index displacements shared by all cells of this call.
        const int mZmY = -offsetZ - offsetY;
        const int mZ   = -offsetZ;
        const int mZpY = -offsetZ + offsetY;
        const int mY   = -offsetY;
        const int pY   =  offsetY;
        const int pZmY =  offsetZ - offsetY;
        const int pZ   =  offsetZ;
        const int pZpY =  offsetZ + offsetY;

        for (int x = startX; x < endX; ++x) {
            const double *cell = src + x;

            // Weighted part: one coefficient array per neighbour.
            double sum = coeff[0][x] * cell[mZmY];
            sum += coeff[ 1][x] * cell[mZ];
            sum += coeff[ 2][x] * cell[mZpY];
            sum += coeff[ 3][x] * cell[mY];
            sum += coeff[ 4][x] * cell[-1];
            sum += coeff[ 5][x] * cell[0];
            sum += coeff[ 6][x] * cell[1];
            sum += coeff[ 7][x] * cell[pY];
            sum += coeff[ 8][x] * cell[pZmY];
            sum += coeff[ 9][x] * cell[pZ];
            sum += coeff[10][x] * cell[pZpY];

            // Unweighted part: two fields which are simply added up.
            sum = addNinePoint(sum, coeff[11] + x, offsetY, offsetZ);
            sum = addNinePoint(sum, coeff[12] + x, offsetY, offsetZ);

            dst[x] = sum;
        }
    }

    /**
     * Operation count reported for this kernel.
     *
     * NOTE(review): the expression evaluated in step() consists of 11
     * multiplications and 28 additions (39 operations) -- confirm
     * whether 40 is intended.
     */
    int flops()
    {
        return 40;
    }

private:
    /**
     * Adds the nine values of field at the combinations of
     * {-offsetZ, 0, +offsetZ} and {-offsetY, 0, +offsetY} to sum, one
     * after the other, and returns the new sum.
     */
    static inline double addNinePoint(double sum, const double *field, int offsetY, int offsetZ)
    {
        sum += field[-offsetZ - offsetY];
        sum += field[-offsetZ];
        sum += field[-offsetZ + offsetY];
        sum += field[-offsetY];
        sum += field[0];
        sum += field[offsetY];
        sum += field[offsetZ - offsetY];
        sum += field[offsetZ];
        sum += field[offsetZ + offsetY];
        return sum;
    }
};
| | 443 | |
| | 444 | class ExtendedVectorized3D |
| | 445 | { |
| | 446 | public: |
| | 447 | static int coefficients() |
| | 448 | { |
| | 449 | return 13; |
| | 450 | } |
| | 451 | |
| | 452 | inline void step(double *coeff[13], double *src, double *dst, int offsetY, int offsetZ, int startX, int endX) |
| | 453 | { |
| | 454 | int x = startX; |
| | 455 | ExtendedScalar3D scalarUpdater; |
| | 456 | |
| | 457 | if ((x & 1) == 1) { |
| | 458 | scalarUpdater.step(coeff, src, dst, offsetY, offsetZ, x, x + 1); |
| | 459 | x += 1; |
| | 460 | } |
| | 461 | |
| | 462 | __m128d same0 = _mm_load_pd(src + x + 0); |
| | 463 | __m128d neig0 = _mm_loadu_pd(src + x + 1); |
| | 464 | |
| | 465 | int paddedEndX = endX - 7; |
| | 466 | for (; x < paddedEndX; x += 8) { |
| | 467 | __m128d same1 = _mm_load_pd(src + x + 2); |
| | 468 | __m128d same2 = _mm_load_pd(src + x + 4); |
| | 469 | __m128d same3 = _mm_load_pd(src + x + 6); |
| | 470 | __m128d same4 = _mm_load_pd(src + x + 8); |
| | 471 | |
| | 472 | __m128d neig1 = _mm_shuffle_pd(same0, same1, (1 << 0) | (0 << 2)); |
| | 473 | __m128d neig2 = _mm_shuffle_pd(same1, same2, (1 << 0) | (0 << 2)); |
| | 474 | __m128d neig3 = _mm_shuffle_pd(same2, same3, (1 << 0) | (0 << 2)); |
| | 475 | __m128d neig4 = _mm_shuffle_pd(same3, same4, (1 << 0) | (0 << 2)); |
| | 476 | |
| | 477 | same0 = _mm_mul_pd(same0, _mm_load_pd(&coeff[3][x + 0])); |
| | 478 | same1 = _mm_mul_pd(same1, _mm_load_pd(&coeff[3][x + 2])); |
| | 479 | same2 = _mm_mul_pd(same2, _mm_load_pd(&coeff[3][x + 4])); |
| | 480 | same3 = _mm_mul_pd(same3, _mm_load_pd(&coeff[3][x + 6])); |
| | 481 | |
| | 482 | __m128d temp1 = _mm_mul_pd(neig0, _mm_load_pd(&coeff[2][x + 0])); |
| | 483 | __m128d temp2 = _mm_mul_pd(neig1, _mm_load_pd(&coeff[2][x + 2])); |
| | 484 | __m128d temp3 = _mm_mul_pd(neig2, _mm_load_pd(&coeff[2][x + 4])); |
| | 485 | __m128d temp4 = _mm_mul_pd(neig3, _mm_load_pd(&coeff[2][x + 6])); |
| | 486 | |
| | 487 | same0 = _mm_add_pd(same0, temp1); |
| | 488 | same1 = _mm_add_pd(same1, temp2); |
| | 489 | same2 = _mm_add_pd(same2, temp3); |
| | 490 | same3 = _mm_add_pd(same3, temp4); |
| | 491 | |
| | 492 | temp1 = _mm_mul_pd(neig1, _mm_load_pd(&coeff[4][x + 0])); |
| | 493 | temp2 = _mm_mul_pd(neig2, _mm_load_pd(&coeff[4][x + 2])); |
| | 494 | temp3 = _mm_mul_pd(neig3, _mm_load_pd(&coeff[4][x + 4])); |
| | 495 | temp4 = _mm_mul_pd(neig4, _mm_load_pd(&coeff[4][x + 6])); |
| | 496 | |
| | 497 | same0 = _mm_add_pd(same0, temp1); |
| | 498 | same1 = _mm_add_pd(same1, temp2); |
| | 499 | same2 = _mm_add_pd(same2, temp3); |
| | 500 | same3 = _mm_add_pd(same3, temp4); |
| | 501 | |
| | 502 | neig0 = _mm_load_pd(src + x - offsetZ + 0); |
| | 503 | neig1 = _mm_load_pd(src + x - offsetZ + 2); |
| | 504 | neig2 = _mm_load_pd(src + x - offsetZ + 4); |
| | 505 | neig3 = _mm_load_pd(src + x - offsetZ + 6); |
| | 506 | |
| | 507 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&coeff[0][x + 0])); |
| | 508 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&coeff[0][x + 2])); |
| | 509 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&coeff[0][x + 4])); |
| | 510 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&coeff[0][x + 6])); |
| | 511 | |
| | 512 | same0 = _mm_add_pd(same0, temp1); |
| | 513 | same1 = _mm_add_pd(same1, temp2); |
| | 514 | same2 = _mm_add_pd(same2, temp3); |
| | 515 | same3 = _mm_add_pd(same3, temp4); |
| | 516 | |
| | 517 | neig0 = _mm_load_pd(src + x - offsetY + 0); |
| | 518 | neig1 = _mm_load_pd(src + x - offsetY + 2); |
| | 519 | neig2 = _mm_load_pd(src + x - offsetY + 4); |
| | 520 | neig3 = _mm_load_pd(src + x - offsetY + 6); |
| | 521 | |
| | 522 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&coeff[1][x + 0])); |
| | 523 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&coeff[1][x + 2])); |
| | 524 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&coeff[1][x + 4])); |
| | 525 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&coeff[1][x + 6])); |
| | 526 | |
| | 527 | same0 = _mm_add_pd(same0, temp1); |
| | 528 | same1 = _mm_add_pd(same1, temp2); |
| | 529 | same2 = _mm_add_pd(same2, temp3); |
| | 530 | same3 = _mm_add_pd(same3, temp4); |
| | 531 | |
| | 532 | neig0 = _mm_load_pd(src + x + offsetY + 0); |
| | 533 | neig1 = _mm_load_pd(src + x + offsetY + 2); |
| | 534 | neig2 = _mm_load_pd(src + x + offsetY + 4); |
| | 535 | neig3 = _mm_load_pd(src + x + offsetY + 6); |
| | 536 | |
| | 537 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&coeff[5][x + 0])); |
| | 538 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&coeff[5][x + 2])); |
| | 539 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&coeff[5][x + 4])); |
| | 540 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&coeff[5][x + 6])); |
| | 541 | |
| | 542 | same0 = _mm_add_pd(same0, temp1); |
| | 543 | same1 = _mm_add_pd(same1, temp2); |
| | 544 | same2 = _mm_add_pd(same2, temp3); |
| | 545 | same3 = _mm_add_pd(same3, temp4); |
| | 546 | |
| | 547 | //xxxxxxxxxxxxx |
| | 548 | neig0 = _mm_load_pd(src + x + offsetZ + 0); |
| | 549 | neig1 = _mm_load_pd(src + x + offsetZ + 2); |
| | 550 | neig2 = _mm_load_pd(src + x + offsetZ + 4); |
| | 551 | neig3 = _mm_load_pd(src + x + offsetZ + 6); |
| | 552 | |
| | 553 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&coeff[6][x + 0])); |
| | 554 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&coeff[6][x + 2])); |
| | 555 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&coeff[6][x + 4])); |
| | 556 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&coeff[6][x + 6])); |
| | 557 | |
| | 558 | same0 = _mm_add_pd(same0, temp1); |
| | 559 | same1 = _mm_add_pd(same1, temp2); |
| | 560 | same2 = _mm_add_pd(same2, temp3); |
| | 561 | same3 = _mm_add_pd(same3, temp4); |
| | 562 | |
| | 563 | //xxxxxxxxxxxxx |
| | 564 | neig0 = _mm_load_pd(src + x - offsetZ - offsetY + 0); |
| | 565 | neig1 = _mm_load_pd(src + x - offsetZ - offsetY + 2); |
| | 566 | neig2 = _mm_load_pd(src + x - offsetZ - offsetY + 4); |
| | 567 | neig3 = _mm_load_pd(src + x - offsetZ - offsetY + 6); |
| | 568 | |
| | 569 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&coeff[7][x + 0])); |
| | 570 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&coeff[7][x + 2])); |
| | 571 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&coeff[7][x + 4])); |
| | 572 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&coeff[7][x + 6])); |
| | 573 | |
| | 574 | same0 = _mm_add_pd(same0, temp1); |
| | 575 | same1 = _mm_add_pd(same1, temp2); |
| | 576 | same2 = _mm_add_pd(same2, temp3); |
| | 577 | same3 = _mm_add_pd(same3, temp4); |
| | 578 | |
| | 579 | //xxxxxxxxxxxxx |
| | 580 | neig0 = _mm_load_pd(src + x - offsetZ + offsetY + 0); |
| | 581 | neig1 = _mm_load_pd(src + x - offsetZ + offsetY + 2); |
| | 582 | neig2 = _mm_load_pd(src + x - offsetZ + offsetY + 4); |
| | 583 | neig3 = _mm_load_pd(src + x - offsetZ + offsetY + 6); |
| | 584 | |
| | 585 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&coeff[8][x + 0])); |
| | 586 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&coeff[8][x + 2])); |
| | 587 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&coeff[8][x + 4])); |
| | 588 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&coeff[8][x + 6])); |
| | 589 | |
| | 590 | same0 = _mm_add_pd(same0, temp1); |
| | 591 | same1 = _mm_add_pd(same1, temp2); |
| | 592 | same2 = _mm_add_pd(same2, temp3); |
| | 593 | same3 = _mm_add_pd(same3, temp4); |
| | 594 | |
| | 595 | //xxxxxxxxxxxxx |
| | 596 | neig0 = _mm_load_pd(src + x + offsetZ - offsetY + 0); |
| | 597 | neig1 = _mm_load_pd(src + x + offsetZ - offsetY + 2); |
| | 598 | neig2 = _mm_load_pd(src + x + offsetZ - offsetY + 4); |
| | 599 | neig3 = _mm_load_pd(src + x + offsetZ - offsetY + 6); |
| | 600 | |
| | 601 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&coeff[9][x + 0])); |
| | 602 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&coeff[9][x + 2])); |
| | 603 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&coeff[9][x + 4])); |
| | 604 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&coeff[9][x + 6])); |
| | 605 | |
| | 606 | same0 = _mm_add_pd(same0, temp1); |
| | 607 | same1 = _mm_add_pd(same1, temp2); |
| | 608 | same2 = _mm_add_pd(same2, temp3); |
| | 609 | same3 = _mm_add_pd(same3, temp4); |
| | 610 | |
| | 611 | //xxxxxxxxxxxxx |
| | 612 | neig0 = _mm_load_pd(src + x + offsetZ + offsetY + 0); |
| | 613 | neig1 = _mm_load_pd(src + x + offsetZ + offsetY + 2); |
| | 614 | neig2 = _mm_load_pd(src + x + offsetZ + offsetY + 4); |
| | 615 | neig3 = _mm_load_pd(src + x + offsetZ + offsetY + 6); |
| | 616 | |
| | 617 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&coeff[10][x + 0])); |
| | 618 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&coeff[10][x + 2])); |
| | 619 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&coeff[10][x + 4])); |
| | 620 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&coeff[10][x + 6])); |
| | 621 | |
| | 622 | same0 = _mm_add_pd(same0, temp1); |
| | 623 | same1 = _mm_add_pd(same1, temp2); |
| | 624 | same2 = _mm_add_pd(same2, temp3); |
| | 625 | same3 = _mm_add_pd(same3, temp4); |
| | 626 | |
| | 627 | //xxxxxxxxxxxxx |
| | 628 | neig0 = _mm_load_pd(coeff[11] + x - offsetZ - offsetY + 0); |
| | 629 | neig1 = _mm_load_pd(coeff[11] + x - offsetZ - offsetY + 2); |
| | 630 | neig2 = _mm_load_pd(coeff[11] + x - offsetZ - offsetY + 4); |
| | 631 | neig3 = _mm_load_pd(coeff[11] + x - offsetZ - offsetY + 6); |
| | 632 | |
| | 633 | same0 = _mm_add_pd(same0, neig0); |
| | 634 | same1 = _mm_add_pd(same1, neig1); |
| | 635 | same2 = _mm_add_pd(same2, neig2); |
| | 636 | same3 = _mm_add_pd(same3, neig3); |
| | 637 | |
| | 638 | //xxxxxxxxxxxxx |
| | 639 | neig0 = _mm_load_pd(coeff[11] + x - offsetZ + 0); |
| | 640 | neig1 = _mm_load_pd(coeff[11] + x - offsetZ + 2); |
| | 641 | neig2 = _mm_load_pd(coeff[11] + x - offsetZ + 4); |
| | 642 | neig3 = _mm_load_pd(coeff[11] + x - offsetZ + 6); |
| | 643 | |
| | 644 | same0 = _mm_add_pd(same0, neig0); |
| | 645 | same1 = _mm_add_pd(same1, neig1); |
| | 646 | same2 = _mm_add_pd(same2, neig2); |
| | 647 | same3 = _mm_add_pd(same3, neig3); |
| | 648 | |
| | 649 | //xxxxxxxxxxxxx |
| | 650 | neig0 = _mm_load_pd(coeff[11] + x - offsetZ + offsetY + 0); |
| | 651 | neig1 = _mm_load_pd(coeff[11] + x - offsetZ + offsetY + 2); |
| | 652 | neig2 = _mm_load_pd(coeff[11] + x - offsetZ + offsetY + 4); |
| | 653 | neig3 = _mm_load_pd(coeff[11] + x - offsetZ + offsetY + 6); |
| | 654 | |
| | 655 | same0 = _mm_add_pd(same0, neig0); |
| | 656 | same1 = _mm_add_pd(same1, neig1); |
| | 657 | same2 = _mm_add_pd(same2, neig2); |
| | 658 | same3 = _mm_add_pd(same3, neig3); |
| | 659 | |
| | 660 | //xxxxxxxxxxxxx |
| | 661 | neig0 = _mm_load_pd(coeff[11] + x - offsetY + 0); |
| | 662 | neig1 = _mm_load_pd(coeff[11] + x - offsetY + 2); |
| | 663 | neig2 = _mm_load_pd(coeff[11] + x - offsetY + 4); |
| | 664 | neig3 = _mm_load_pd(coeff[11] + x - offsetY + 6); |
| | 665 | |
| | 666 | same0 = _mm_add_pd(same0, neig0); |
| | 667 | same1 = _mm_add_pd(same1, neig1); |
| | 668 | same2 = _mm_add_pd(same2, neig2); |
| | 669 | same3 = _mm_add_pd(same3, neig3); |
| | 670 | |
| | 671 | //xxxxxxxxxxxxx |
| | 672 | neig0 = _mm_load_pd(coeff[11] + offsetY + x + 0); |
| | 673 | neig1 = _mm_load_pd(coeff[11] + offsetY + x + 2); |
| | 674 | neig2 = _mm_load_pd(coeff[11] + offsetY + x + 4); |
| | 675 | neig3 = _mm_load_pd(coeff[11] + offsetY + x + 6); |
| | 676 | |
| | 677 | same0 = _mm_add_pd(same0, neig0); |
| | 678 | same1 = _mm_add_pd(same1, neig1); |
| | 679 | same2 = _mm_add_pd(same2, neig2); |
| | 680 | same3 = _mm_add_pd(same3, neig3); |
| | 681 | |
| | 682 | //xxxxxxxxxxxxx |
| | 683 | neig0 = _mm_load_pd(coeff[11] + x + 0); |
| | 684 | neig1 = _mm_load_pd(coeff[11] + x + 2); |
| | 685 | neig2 = _mm_load_pd(coeff[11] + x + 4); |
| | 686 | neig3 = _mm_load_pd(coeff[11] + x + 6); |
| | 687 | |
| | 688 | same0 = _mm_add_pd(same0, neig0); |
| | 689 | same1 = _mm_add_pd(same1, neig1); |
| | 690 | same2 = _mm_add_pd(same2, neig2); |
| | 691 | same3 = _mm_add_pd(same3, neig3); |
| | 692 | |
| | 693 | //xxxxxxxxxxxxx |
| | 694 | neig0 = _mm_load_pd(coeff[11] + x + offsetZ - offsetY + 0); |
| | 695 | neig1 = _mm_load_pd(coeff[11] + x + offsetZ - offsetY + 2); |
| | 696 | neig2 = _mm_load_pd(coeff[11] + x + offsetZ - offsetY + 4); |
| | 697 | neig3 = _mm_load_pd(coeff[11] + x + offsetZ - offsetY + 6); |
| | 698 | |
| | 699 | same0 = _mm_add_pd(same0, neig0); |
| | 700 | same1 = _mm_add_pd(same1, neig1); |
| | 701 | same2 = _mm_add_pd(same2, neig2); |
| | 702 | same3 = _mm_add_pd(same3, neig3); |
| | 703 | |
| | 704 | //xxxxxxxxxxxxx |
| | 705 | neig0 = _mm_load_pd(coeff[11] + x + offsetZ + 0); |
| | 706 | neig1 = _mm_load_pd(coeff[11] + x + offsetZ + 2); |
| | 707 | neig2 = _mm_load_pd(coeff[11] + x + offsetZ + 4); |
| | 708 | neig3 = _mm_load_pd(coeff[11] + x + offsetZ + 6); |
| | 709 | |
| | 710 | same0 = _mm_add_pd(same0, neig0); |
| | 711 | same1 = _mm_add_pd(same1, neig1); |
| | 712 | same2 = _mm_add_pd(same2, neig2); |
| | 713 | same3 = _mm_add_pd(same3, neig3); |
| | 714 | |
| | 715 | //xxxxxxxxxxxxx |
| | 716 | neig0 = _mm_load_pd(coeff[11] + x + offsetZ + offsetY + 0); |
| | 717 | neig1 = _mm_load_pd(coeff[11] + x + offsetZ + offsetY + 2); |
| | 718 | neig2 = _mm_load_pd(coeff[11] + x + offsetZ + offsetY + 4); |
| | 719 | neig3 = _mm_load_pd(coeff[11] + x + offsetZ + offsetY + 6); |
| | 720 | |
| | 721 | same0 = _mm_add_pd(same0, neig0); |
| | 722 | same1 = _mm_add_pd(same1, neig1); |
| | 723 | same2 = _mm_add_pd(same2, neig2); |
| | 724 | same3 = _mm_add_pd(same3, neig3); |
| | 725 | |
| | 726 | //xxxxxxxxxxxxx |
| | 727 | neig0 = _mm_load_pd(coeff[12] + x - offsetZ - offsetY + 0); |
| | 728 | neig1 = _mm_load_pd(coeff[12] + x - offsetZ - offsetY + 2); |
| | 729 | neig2 = _mm_load_pd(coeff[12] + x - offsetZ - offsetY + 4); |
| | 730 | neig3 = _mm_load_pd(coeff[12] + x - offsetZ - offsetY + 6); |
| | 731 | |
| | 732 | same0 = _mm_add_pd(same0, neig0); |
| | 733 | same1 = _mm_add_pd(same1, neig1); |
| | 734 | same2 = _mm_add_pd(same2, neig2); |
| | 735 | same3 = _mm_add_pd(same3, neig3); |
| | 736 | |
| | 737 | //xxxxxxxxxxxxx |
| | 738 | neig0 = _mm_load_pd(coeff[12] + x - offsetZ + 0); |
| | 739 | neig1 = _mm_load_pd(coeff[12] + x - offsetZ + 2); |
| | 740 | neig2 = _mm_load_pd(coeff[12] + x - offsetZ + 4); |
| | 741 | neig3 = _mm_load_pd(coeff[12] + x - offsetZ + 6); |
| | 742 | |
| | 743 | same0 = _mm_add_pd(same0, neig0); |
| | 744 | same1 = _mm_add_pd(same1, neig1); |
| | 745 | same2 = _mm_add_pd(same2, neig2); |
| | 746 | same3 = _mm_add_pd(same3, neig3); |
| | 747 | |
| | 748 | //xxxxxxxxxxxxx |
| | 749 | neig0 = _mm_load_pd(coeff[12] + x - offsetZ + offsetY + 0); |
| | 750 | neig1 = _mm_load_pd(coeff[12] + x - offsetZ + offsetY + 2); |
| | 751 | neig2 = _mm_load_pd(coeff[12] + x - offsetZ + offsetY + 4); |
| | 752 | neig3 = _mm_load_pd(coeff[12] + x - offsetZ + offsetY + 6); |
| | 753 | |
| | 754 | same0 = _mm_add_pd(same0, neig0); |
| | 755 | same1 = _mm_add_pd(same1, neig1); |
| | 756 | same2 = _mm_add_pd(same2, neig2); |
| | 757 | same3 = _mm_add_pd(same3, neig3); |
| | 758 | |
| | 759 | //xxxxxxxxxxxxx |
| | 760 | neig0 = _mm_load_pd(coeff[12] + x - offsetY + 0); |
| | 761 | neig1 = _mm_load_pd(coeff[12] + x - offsetY + 2); |
| | 762 | neig2 = _mm_load_pd(coeff[12] + x - offsetY + 4); |
| | 763 | neig3 = _mm_load_pd(coeff[12] + x - offsetY + 6); |
| | 764 | |
| | 765 | same0 = _mm_add_pd(same0, neig0); |
| | 766 | same1 = _mm_add_pd(same1, neig1); |
| | 767 | same2 = _mm_add_pd(same2, neig2); |
| | 768 | same3 = _mm_add_pd(same3, neig3); |
| | 769 | |
| | 770 | //xxxxxxxxxxxxx |
| | 771 | neig0 = _mm_load_pd(coeff[12] + offsetY + x + 0); |
| | 772 | neig1 = _mm_load_pd(coeff[12] + offsetY + x + 2); |
| | 773 | neig2 = _mm_load_pd(coeff[12] + offsetY + x + 4); |
| | 774 | neig3 = _mm_load_pd(coeff[12] + offsetY + x + 6); |
| | 775 | |
| | 776 | same0 = _mm_add_pd(same0, neig0); |
| | 777 | same1 = _mm_add_pd(same1, neig1); |
| | 778 | same2 = _mm_add_pd(same2, neig2); |
| | 779 | same3 = _mm_add_pd(same3, neig3); |
| | 780 | |
| | 781 | //xxxxxxxxxxxxx |
| | 782 | neig0 = _mm_load_pd(coeff[12] + x + 0); |
| | 783 | neig1 = _mm_load_pd(coeff[12] + x + 2); |
| | 784 | neig2 = _mm_load_pd(coeff[12] + x + 4); |
| | 785 | neig3 = _mm_load_pd(coeff[12] + x + 6); |
| | 786 | |
| | 787 | same0 = _mm_add_pd(same0, neig0); |
| | 788 | same1 = _mm_add_pd(same1, neig1); |
| | 789 | same2 = _mm_add_pd(same2, neig2); |
| | 790 | same3 = _mm_add_pd(same3, neig3); |
| | 791 | |
| | 792 | //xxxxxxxxxxxxx |
| | 793 | neig0 = _mm_load_pd(coeff[12] + x + offsetZ - offsetY + 0); |
| | 794 | neig1 = _mm_load_pd(coeff[12] + x + offsetZ - offsetY + 2); |
| | 795 | neig2 = _mm_load_pd(coeff[12] + x + offsetZ - offsetY + 4); |
| | 796 | neig3 = _mm_load_pd(coeff[12] + x + offsetZ - offsetY + 6); |
| | 797 | |
| | 798 | same0 = _mm_add_pd(same0, neig0); |
| | 799 | same1 = _mm_add_pd(same1, neig1); |
| | 800 | same2 = _mm_add_pd(same2, neig2); |
| | 801 | same3 = _mm_add_pd(same3, neig3); |
| | 802 | |
| | 803 | //xxxxxxxxxxxxx |
| | 804 | neig0 = _mm_load_pd(coeff[12] + x + offsetZ + 0); |
| | 805 | neig1 = _mm_load_pd(coeff[12] + x + offsetZ + 2); |
| | 806 | neig2 = _mm_load_pd(coeff[12] + x + offsetZ + 4); |
| | 807 | neig3 = _mm_load_pd(coeff[12] + x + offsetZ + 6); |
| | 808 | |
| | 809 | same0 = _mm_add_pd(same0, neig0); |
| | 810 | same1 = _mm_add_pd(same1, neig1); |
| | 811 | same2 = _mm_add_pd(same2, neig2); |
| | 812 | same3 = _mm_add_pd(same3, neig3); |
| | 813 | |
| | 814 | //xxxxxxxxxxxxx |
| | 815 | neig0 = _mm_load_pd(coeff[12] + x + offsetZ + offsetY + 0); |
| | 816 | neig1 = _mm_load_pd(coeff[12] + x + offsetZ + offsetY + 2); |
| | 817 | neig2 = _mm_load_pd(coeff[12] + x + offsetZ + offsetY + 4); |
| | 818 | neig3 = _mm_load_pd(coeff[12] + x + offsetZ + offsetY + 6); |
| | 819 | |
| | 820 | same0 = _mm_add_pd(same0, neig0); |
| | 821 | same1 = _mm_add_pd(same1, neig1); |
| | 822 | same2 = _mm_add_pd(same2, neig2); |
| | 823 | same3 = _mm_add_pd(same3, neig3); |
| | 824 | |
| | 825 | //yyyyyyyyyyyyy |
| | 826 | _mm_store_pd(dst + 0, same0); |
| | 827 | _mm_store_pd(dst + 2, same1); |
| | 828 | _mm_store_pd(dst + 4, same2); |
| | 829 | _mm_store_pd(dst + 6, same3); |
| | 830 | |
| | 831 | same0 = same4; |
| | 832 | neig0 = neig4; |
| | 833 | |
| | 834 | // dst[x] = |
| | 835 | // coeff[0][x] * src[x - offsetZ] + |
| | 836 | // coeff[1][x] * src[x - offsetY] + |
| | 837 | // coeff[2][x] * src[x - 1] + |
| | 838 | // coeff[3][x] * src[x] + |
| | 839 | // coeff[4][x] * src[x + 1] + |
| | 840 | // coeff[5][x] * src[x + offsetY] + |
| | 841 | // coeff[6][x] * src[x + offsetZ]; |
| | 842 | } |
| | 843 | |
| | 844 | scalarUpdater.step(coeff, src, dst, offsetY, offsetZ, x, endX); |
| | 845 | } |
| | 846 | |
| | 847 | int flops() |
| | 848 | { |
| | 849 | return 40; |
| | 850 | } |
| | 851 | }; |
| | 852 | |