/**
 * Stencil kernel which updates a run of cells along the x axis of a
 * 3D grid, using per-cell coefficients. The bulk of the run is
 * processed with SSE2 intrinsics (two doubles per register, eight
 * cells per loop iteration); leading/trailing cells are handled by
 * stepScalar().
 *
 * The neighborhood object handed to step()/stepScalar() has to offer
 * two member templates, src<DX, DY, DZ>(x) and
 * coeff<DX, DY, DZ, INDEX>(x). Both must return references to
 * doubles, as the code takes their addresses and loads two
 * consecutive values from there. Presumably (DX, DY, DZ) is an offset
 * relative to cell x and INDEX selects one of the coefficients()
 * coefficient arrays -- TODO confirm against the neighborhood class.
 *
 * The aligned loads/stores (_mm_load_pd/_mm_store_pd) are only issued
 * for even x, so src, coeff and dst must be 16-byte aligned at even
 * x.
 *
 * NOTE(review): the vectorized path and stepScalar() do not evaluate
 * the same formula. The vectorized path pairs coefficient 3 with the
 * centre cell (2/4 with x-1/x+1, 0/6 with z-1/z+1, 1/5 with y-1/y+1,
 * 7..10 with the y/z diagonals) and adds nine entries each of
 * coefficients 11 and 12. stepScalar() pairs coefficient 5 with the
 * centre cell and adds eleven entries each of coefficients 11 and 12
 * (including the <-1, 0, 0> and <1, 0, 0> ones). Which one is
 * intended cannot be told from this class alone; both are kept as
 * they were.
 */
template<int DIM_X, int DIM_Y, int DIM_Z>
class ExtendedVectorized3DNextGen
{
public:
    /**
     * Number of coefficient arrays read by this kernel (INDEX 0..12).
     */
    static int coefficients()
    {
        return 13;
    }

    /**
     * Updates dst[x] for all x in [startX, endX).
     *
     * @param hood neighborhood object providing src<>() and coeff<>()
     * @param dst destination row, indexed directly by x
     * @param unusedOffsetY ignored
     * @param unusedOffsetZ ignored
     * @param startX first cell to update
     * @param endX one past the last cell to update
     */
    template<class NEIGHBORHOOD>
    inline void step(const NEIGHBORHOOD& hood, double *dst, int unusedOffsetY, int unusedOffsetZ, int startX, int endX)
    {
        (void)unusedOffsetY;
        (void)unusedOffsetZ;

        // Nothing to do for an empty range. Without this guard an odd
        // startX would still cause one cell to be written.
        if (startX >= endX) {
            return;
        }

        const int offsetY = DIM_X;
        const int offsetZ = DIM_X * DIM_Y;

        int x = startX;

        // Peel off one cell so that the vectorized loop starts at an
        // even x (required by the aligned loads/stores below).
        if ((x & 1) == 1) {
            stepScalar(hood, dst, offsetY, offsetZ, x, x + 1);
            x += 1;
        }

        // The loop consumes 8 cells per iteration, so it may only run
        // while x + 7 < endX.
        const int paddedEndX = endX - 7;

        if (x < paddedEndX) {
            // sameK holds cells (x + 2K, x + 2K + 1), neigK holds
            // their x-1 neighbours (x + 2K - 1, x + 2K). neig(K+1)
            // doubles as the x+1 neighbours of sameK. same0/neig0 are
            // carried over from the previous iteration, hence they
            // are preloaded here. x - 1 is odd, thus the unaligned
            // load.
            __m128d same0 = _mm_load_pd(&hood.template src<0, 0, 0>(x));
            __m128d neig0 = _mm_loadu_pd(&hood.template src<-1, 0, 0>(x));

            for (; x < paddedEndX; x += 8) {
                const __m128d same1 = _mm_load_pd(&hood.template src<2, 0, 0>(x));
                const __m128d same2 = _mm_load_pd(&hood.template src<4, 0, 0>(x));
                const __m128d same3 = _mm_load_pd(&hood.template src<6, 0, 0>(x));
                const __m128d same4 = _mm_load_pd(&hood.template src<8, 0, 0>(x));

                const __m128d neig1 = straddle(same0, same1);
                const __m128d neig2 = straddle(same1, same2);
                const __m128d neig3 = straddle(same2, same3);
                const __m128d neig4 = straddle(same3, same4);

                // centre cell, weighted with coefficient 3
                __m128d sum0 = _mm_mul_pd(same0, _mm_load_pd(&hood.template coeff<0, 0, 0, 3>(x)));
                __m128d sum1 = _mm_mul_pd(same1, _mm_load_pd(&hood.template coeff<2, 0, 0, 3>(x)));
                __m128d sum2 = _mm_mul_pd(same2, _mm_load_pd(&hood.template coeff<4, 0, 0, 3>(x)));
                __m128d sum3 = _mm_mul_pd(same3, _mm_load_pd(&hood.template coeff<6, 0, 0, 3>(x)));

                // x-1 neighbours (coefficient 2), x+1 neighbours (coefficient 4)
                addWeightedRegisters<2>(hood, x, neig0, neig1, neig2, neig3, sum0, sum1, sum2, sum3);
                addWeightedRegisters<4>(hood, x, neig1, neig2, neig3, neig4, sum0, sum1, sum2, sum3);

                // neighbours in the y/z plane: <DY, DZ, coefficient>
                addWeightedSources< 0, -1,  0>(hood, x, sum0, sum1, sum2, sum3);
                addWeightedSources<-1,  0,  1>(hood, x, sum0, sum1, sum2, sum3);
                addWeightedSources< 1,  0,  5>(hood, x, sum0, sum1, sum2, sum3);
                addWeightedSources< 0,  1,  6>(hood, x, sum0, sum1, sum2, sum3);
                addWeightedSources<-1, -1,  7>(hood, x, sum0, sum1, sum2, sum3);
                addWeightedSources< 1, -1,  8>(hood, x, sum0, sum1, sum2, sum3);
                addWeightedSources<-1,  1,  9>(hood, x, sum0, sum1, sum2, sum3);
                addWeightedSources< 1,  1, 10>(hood, x, sum0, sum1, sum2, sum3);

                // purely additive terms: coefficients 11 and 12
                addCoefficientPlane<11>(hood, x, sum0, sum1, sum2, sum3);
                addCoefficientPlane<12>(hood, x, sum0, sum1, sum2, sum3);

                _mm_store_pd(dst + x + 0, sum0);
                _mm_store_pd(dst + x + 2, sum1);
                _mm_store_pd(dst + x + 4, sum2);
                _mm_store_pd(dst + x + 6, sum3);

                // carry the overlap into the next iteration
                same0 = same4;
                neig0 = neig4;
            }
        }

        // remaining cells (less than 8)
        stepScalar(hood, dst, offsetY, offsetZ, x, endX);
    }

    /**
     * Scalar update of dst[x] for all x in [startX, endX). offsetY and
     * offsetZ are accepted but not read; all addressing is done by the
     * neighborhood object.
     */
    template<class NEIGHBORHOOD>
    inline void stepScalar(const NEIGHBORHOOD& hood, double *dst, int offsetY, int offsetZ, int startX, int endX)
    {
        (void)offsetY;
        (void)offsetZ;

        for (int x = startX; x < endX; ++x) {
            dst[x] =
                hood.template coeff<0,  0,  0,  0>(x) * hood.template src< 0, -1, -1>(x) +
                hood.template coeff<0,  0,  0,  1>(x) * hood.template src< 0,  0, -1>(x) +
                hood.template coeff<0,  0,  0,  2>(x) * hood.template src< 0,  1, -1>(x) +
                hood.template coeff<0,  0,  0,  3>(x) * hood.template src< 0, -1,  0>(x) +
                hood.template coeff<0,  0,  0,  4>(x) * hood.template src<-1,  0,  0>(x) +
                hood.template coeff<0,  0,  0,  5>(x) * hood.template src< 0,  0,  0>(x) +
                hood.template coeff<0,  0,  0,  6>(x) * hood.template src< 1,  0,  0>(x) +
                hood.template coeff<0,  0,  0,  7>(x) * hood.template src< 0,  1,  0>(x) +
                hood.template coeff<0,  0,  0,  8>(x) * hood.template src< 0, -1,  1>(x) +
                hood.template coeff<0,  0,  0,  9>(x) * hood.template src< 0,  0,  1>(x) +
                hood.template coeff<0,  0,  0, 10>(x) * hood.template src< 0,  1,  1>(x) +

                hood.template coeff< 0, -1, -1, 11>(x) +
                hood.template coeff< 0,  0, -1, 11>(x) +
                hood.template coeff< 0,  1, -1, 11>(x) +
                hood.template coeff< 0, -1,  0, 11>(x) +
                hood.template coeff<-1,  0,  0, 11>(x) +
                hood.template coeff< 0,  0,  0, 11>(x) +
                hood.template coeff< 1,  0,  0, 11>(x) +
                hood.template coeff< 0,  1,  0, 11>(x) +
                hood.template coeff< 0, -1,  1, 11>(x) +
                hood.template coeff< 0,  0,  1, 11>(x) +
                hood.template coeff< 0,  1,  1, 11>(x) +

                hood.template coeff< 0, -1, -1, 12>(x) +
                hood.template coeff< 0,  0, -1, 12>(x) +
                hood.template coeff< 0,  1, -1, 12>(x) +
                hood.template coeff< 0, -1,  0, 12>(x) +
                hood.template coeff<-1,  0,  0, 12>(x) +
                hood.template coeff< 0,  0,  0, 12>(x) +
                hood.template coeff< 1,  0,  0, 12>(x) +
                hood.template coeff< 0,  1,  0, 12>(x) +
                hood.template coeff< 0, -1,  1, 12>(x) +
                hood.template coeff< 0,  0,  1, 12>(x) +
                hood.template coeff< 0,  1,  1, 12>(x);
        }
    }

    /**
     * Floating point operations accounted per cell update.
     */
    int flops()
    {
        return 40;
    }

private:
    /**
     * Returns (left[1], right[0]), i.e. the two values straddling the
     * boundary between two adjacent registers.
     */
    static inline __m128d straddle(const __m128d& left, const __m128d& right)
    {
        // selector bit 0 set: upper half of left; bit 1 clear: lower
        // half of right
        return _mm_shuffle_pd(left, right, 1);
    }

    /**
     * sumK += valK * coeff<2K, 0, 0, INDEX>(x) for K = 0..3, with the
     * values already residing in registers.
     */
    template<int INDEX, class NEIGHBORHOOD>
    static inline void addWeightedRegisters(
        const NEIGHBORHOOD& hood, int x,
        const __m128d& val0, const __m128d& val1, const __m128d& val2, const __m128d& val3,
        __m128d& sum0, __m128d& sum1, __m128d& sum2, __m128d& sum3)
    {
        const __m128d prod0 = _mm_mul_pd(val0, _mm_load_pd(&hood.template coeff<0, 0, 0, INDEX>(x)));
        const __m128d prod1 = _mm_mul_pd(val1, _mm_load_pd(&hood.template coeff<2, 0, 0, INDEX>(x)));
        const __m128d prod2 = _mm_mul_pd(val2, _mm_load_pd(&hood.template coeff<4, 0, 0, INDEX>(x)));
        const __m128d prod3 = _mm_mul_pd(val3, _mm_load_pd(&hood.template coeff<6, 0, 0, INDEX>(x)));

        sum0 = _mm_add_pd(sum0, prod0);
        sum1 = _mm_add_pd(sum1, prod1);
        sum2 = _mm_add_pd(sum2, prod2);
        sum3 = _mm_add_pd(sum3, prod3);
    }

    /**
     * sumK += src<2K, DY, DZ>(x) * coeff<2K, 0, 0, INDEX>(x) for
     * K = 0..3.
     */
    template<int DY, int DZ, int INDEX, class NEIGHBORHOOD>
    static inline void addWeightedSources(
        const NEIGHBORHOOD& hood, int x,
        __m128d& sum0, __m128d& sum1, __m128d& sum2, __m128d& sum3)
    {
        const __m128d val0 = _mm_load_pd(&hood.template src<0, DY, DZ>(x));
        const __m128d val1 = _mm_load_pd(&hood.template src<2, DY, DZ>(x));
        const __m128d val2 = _mm_load_pd(&hood.template src<4, DY, DZ>(x));
        const __m128d val3 = _mm_load_pd(&hood.template src<6, DY, DZ>(x));

        addWeightedRegisters<INDEX>(hood, x, val0, val1, val2, val3, sum0, sum1, sum2, sum3);
    }

    /**
     * sumK += coeff<2K, DY, DZ, INDEX>(x) for K = 0..3.
     */
    template<int DY, int DZ, int INDEX, class NEIGHBORHOOD>
    static inline void addCoefficients(
        const NEIGHBORHOOD& hood, int x,
        __m128d& sum0, __m128d& sum1, __m128d& sum2, __m128d& sum3)
    {
        sum0 = _mm_add_pd(sum0, _mm_load_pd(&hood.template coeff<0, DY, DZ, INDEX>(x)));
        sum1 = _mm_add_pd(sum1, _mm_load_pd(&hood.template coeff<2, DY, DZ, INDEX>(x)));
        sum2 = _mm_add_pd(sum2, _mm_load_pd(&hood.template coeff<4, DY, DZ, INDEX>(x)));
        sum3 = _mm_add_pd(sum3, _mm_load_pd(&hood.template coeff<6, DY, DZ, INDEX>(x)));
    }

    /**
     * Adds the nine (DY, DZ) entries of coefficient INDEX to the
     * accumulators. The order of the additions is significant for the
     * floating point result and must be kept.
     */
    template<int INDEX, class NEIGHBORHOOD>
    static inline void addCoefficientPlane(
        const NEIGHBORHOOD& hood, int x,
        __m128d& sum0, __m128d& sum1, __m128d& sum2, __m128d& sum3)
    {
        addCoefficients<-1, -1, INDEX>(hood, x, sum0, sum1, sum2, sum3);
        addCoefficients< 0, -1, INDEX>(hood, x, sum0, sum1, sum2, sum3);
        addCoefficients< 1, -1, INDEX>(hood, x, sum0, sum1, sum2, sum3);
        addCoefficients<-1,  0, INDEX>(hood, x, sum0, sum1, sum2, sum3);
        addCoefficients< 1,  0, INDEX>(hood, x, sum0, sum1, sum2, sum3);
        addCoefficients< 0,  0, INDEX>(hood, x, sum0, sum1, sum2, sum3);
        addCoefficients<-1,  1, INDEX>(hood, x, sum0, sum1, sum2, sum3);
        addCoefficients< 0,  1, INDEX>(hood, x, sum0, sum1, sum2, sum3);
        addCoefficients< 1,  1, INDEX>(hood, x, sum0, sum1, sum2, sum3);
    }
};
| | 1938 | |