| | 1971 | class ExtendedVectorized3DDefiant |
| | 1972 | { |
| | 1973 | public: |
| | 1974 | static int coefficients() |
| | 1975 | { |
| | 1976 | return 13; |
| | 1977 | } |
| | 1978 | |
| | 1979 | template<class NEIGHBORHOOD> |
| | 1980 | inline void step(const NEIGHBORHOOD& hood, double *dst, int startX, int endX) |
| | 1981 | { |
| | 1982 | int x = startX; |
| | 1983 | |
| | 1984 | if ((x & 1) == 1) { |
| | 1985 | stepScalar(hood, dst, x, x + 1); |
| | 1986 | x += 1; |
| | 1987 | } |
| | 1988 | |
| | 1989 | __m128d same0 = _mm_load_pd(&(hood[FC<0, 0, 0>()].srcB(x))); |
| | 1990 | __m128d neig0 = _mm_loadu_pd(&(hood[FC<1, 0, 0>()].srcB(x))); |
| | 1991 | |
| | 1992 | int paddedEndX = endX - 7; |
| | 1993 | for (; x < paddedEndX; x += 8) { |
| | 1994 | __m128d same1 = _mm_load_pd(&hood[FC<2, 0, 0>()].srcB(x)); |
| | 1995 | __m128d same2 = _mm_load_pd(&hood[FC<4, 0, 0>()].srcB(x)); |
| | 1996 | __m128d same3 = _mm_load_pd(&hood[FC<6, 0, 0>()].srcB(x)); |
| | 1997 | __m128d same4 = _mm_load_pd(&hood[FC<8, 0, 0>()].srcB(x)); |
| | 1998 | |
| | 1999 | __m128d neig1 = _mm_shuffle_pd(same0, same1, (1 << 0) | (0 << 2)); |
| | 2000 | __m128d neig2 = _mm_shuffle_pd(same1, same2, (1 << 0) | (0 << 2)); |
| | 2001 | __m128d neig3 = _mm_shuffle_pd(same2, same3, (1 << 0) | (0 << 2)); |
| | 2002 | __m128d neig4 = _mm_shuffle_pd(same3, same4, (1 << 0) | (0 << 2)); |
| | 2003 | |
| | 2004 | same0 = _mm_mul_pd(same0, _mm_load_pd(&hood[FC<0, 0, 0>()].coeffB(FC< 3>(), x))); |
| | 2005 | same1 = _mm_mul_pd(same1, _mm_load_pd(&hood[FC<2, 0, 0>()].coeffB(FC< 3>(), x))); |
| | 2006 | same2 = _mm_mul_pd(same2, _mm_load_pd(&hood[FC<4, 0, 0>()].coeffB(FC< 3>(), x))); |
| | 2007 | same3 = _mm_mul_pd(same3, _mm_load_pd(&hood[FC<6, 0, 0>()].coeffB(FC< 3>(), x))); |
| | 2008 | |
| | 2009 | __m128d temp1 = _mm_mul_pd(neig0, _mm_load_pd(&hood[FC<0, 0, 0>()].coeffB(FC< 2>(), x))); |
| | 2010 | __m128d temp2 = _mm_mul_pd(neig1, _mm_load_pd(&hood[FC<2, 0, 0>()].coeffB(FC< 2>(), x))); |
| | 2011 | __m128d temp3 = _mm_mul_pd(neig2, _mm_load_pd(&hood[FC<4, 0, 0>()].coeffB(FC< 2>(), x))); |
| | 2012 | __m128d temp4 = _mm_mul_pd(neig3, _mm_load_pd(&hood[FC<6, 0, 0>()].coeffB(FC< 2>(), x))); |
| | 2013 | |
| | 2014 | same0 = _mm_add_pd(same0, temp1); |
| | 2015 | same1 = _mm_add_pd(same1, temp2); |
| | 2016 | same2 = _mm_add_pd(same2, temp3); |
| | 2017 | same3 = _mm_add_pd(same3, temp4); |
| | 2018 | |
| | 2019 | temp1 = _mm_mul_pd(neig1, _mm_load_pd(&hood[FC<0, 0, 0>()].coeffB(FC< 4>(), x))); |
| | 2020 | temp2 = _mm_mul_pd(neig2, _mm_load_pd(&hood[FC<2, 0, 0>()].coeffB(FC< 4>(), x))); |
| | 2021 | temp3 = _mm_mul_pd(neig3, _mm_load_pd(&hood[FC<4, 0, 0>()].coeffB(FC< 4>(), x))); |
| | 2022 | temp4 = _mm_mul_pd(neig4, _mm_load_pd(&hood[FC<6, 0, 0>()].coeffB(FC< 4>(), x))); |
| | 2023 | |
| | 2024 | same0 = _mm_add_pd(same0, temp1); |
| | 2025 | same1 = _mm_add_pd(same1, temp2); |
| | 2026 | same2 = _mm_add_pd(same2, temp3); |
| | 2027 | same3 = _mm_add_pd(same3, temp4); |
| | 2028 | |
| | 2029 | neig0 = _mm_load_pd(&hood[FC<0, 0, -1>()].srcB(x)); |
| | 2030 | neig1 = _mm_load_pd(&hood[FC<2, 0, -1>()].srcB(x)); |
| | 2031 | neig2 = _mm_load_pd(&hood[FC<4, 0, -1>()].srcB(x)); |
| | 2032 | neig3 = _mm_load_pd(&hood[FC<6, 0, -1>()].srcB(x)); |
| | 2033 | |
| | 2034 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&hood[FC<0, 0, 0>()].coeffB(FC< 0>(), x))); |
| | 2035 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&hood[FC<2, 0, 0>()].coeffB(FC< 0>(), x))); |
| | 2036 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&hood[FC<4, 0, 0>()].coeffB(FC< 0>(), x))); |
| | 2037 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&hood[FC<6, 0, 0>()].coeffB(FC< 0>(), x))); |
| | 2038 | |
| | 2039 | same0 = _mm_add_pd(same0, temp1); |
| | 2040 | same1 = _mm_add_pd(same1, temp2); |
| | 2041 | same2 = _mm_add_pd(same2, temp3); |
| | 2042 | same3 = _mm_add_pd(same3, temp4); |
| | 2043 | |
| | 2044 | neig0 = _mm_load_pd(&hood[FC<0, -1, 0>()].srcB(x)); |
| | 2045 | neig1 = _mm_load_pd(&hood[FC<2, -1, 0>()].srcB(x)); |
| | 2046 | neig2 = _mm_load_pd(&hood[FC<4, -1, 0>()].srcB(x)); |
| | 2047 | neig3 = _mm_load_pd(&hood[FC<6, -1, 0>()].srcB(x)); |
| | 2048 | |
| | 2049 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&hood[FC<0, 0, 0>()].coeffB(FC< 1>(), x))); |
| | 2050 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&hood[FC<2, 0, 0>()].coeffB(FC< 1>(), x))); |
| | 2051 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&hood[FC<4, 0, 0>()].coeffB(FC< 1>(), x))); |
| | 2052 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&hood[FC<6, 0, 0>()].coeffB(FC< 1>(), x))); |
| | 2053 | |
| | 2054 | same0 = _mm_add_pd(same0, temp1); |
| | 2055 | same1 = _mm_add_pd(same1, temp2); |
| | 2056 | same2 = _mm_add_pd(same2, temp3); |
| | 2057 | same3 = _mm_add_pd(same3, temp4); |
| | 2058 | |
| | 2059 | neig0 = _mm_load_pd(&hood[FC<0, 1, 0>()].srcB(x)); |
| | 2060 | neig1 = _mm_load_pd(&hood[FC<2, 1, 0>()].srcB(x)); |
| | 2061 | neig2 = _mm_load_pd(&hood[FC<4, 1, 0>()].srcB(x)); |
| | 2062 | neig3 = _mm_load_pd(&hood[FC<6, 1, 0>()].srcB(x)); |
| | 2063 | |
| | 2064 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&hood[FC<0, 0, 0>()].coeffB(FC< 5>(), x))); |
| | 2065 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&hood[FC<2, 0, 0>()].coeffB(FC< 5>(), x))); |
| | 2066 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&hood[FC<4, 0, 0>()].coeffB(FC< 5>(), x))); |
| | 2067 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&hood[FC<6, 0, 0>()].coeffB(FC< 5>(), x))); |
| | 2068 | |
| | 2069 | same0 = _mm_add_pd(same0, temp1); |
| | 2070 | same1 = _mm_add_pd(same1, temp2); |
| | 2071 | same2 = _mm_add_pd(same2, temp3); |
| | 2072 | same3 = _mm_add_pd(same3, temp4); |
| | 2073 | |
| | 2074 | //xxxxxxxxxxxxx |
| | 2075 | neig0 = _mm_load_pd(&hood[FC<0, 0, 1>()].srcB(x)); |
| | 2076 | neig1 = _mm_load_pd(&hood[FC<2, 0, 1>()].srcB(x)); |
| | 2077 | neig2 = _mm_load_pd(&hood[FC<4, 0, 1>()].srcB(x)); |
| | 2078 | neig3 = _mm_load_pd(&hood[FC<6, 0, 1>()].srcB(x)); |
| | 2079 | |
| | 2080 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&hood[FC<0, 0, 0>()].coeffB(FC< 6>(), x))); |
| | 2081 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&hood[FC<2, 0, 0>()].coeffB(FC< 6>(), x))); |
| | 2082 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&hood[FC<4, 0, 0>()].coeffB(FC< 6>(), x))); |
| | 2083 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&hood[FC<6, 0, 0>()].coeffB(FC< 6>(), x))); |
| | 2084 | |
| | 2085 | same0 = _mm_add_pd(same0, temp1); |
| | 2086 | same1 = _mm_add_pd(same1, temp2); |
| | 2087 | same2 = _mm_add_pd(same2, temp3); |
| | 2088 | same3 = _mm_add_pd(same3, temp4); |
| | 2089 | |
| | 2090 | //xxxxxxxxxxxxx |
| | 2091 | neig0 = _mm_load_pd(&hood[FC<0, -1, -1>()].srcB(x)); |
| | 2092 | neig1 = _mm_load_pd(&hood[FC<2, -1, -1>()].srcB(x)); |
| | 2093 | neig2 = _mm_load_pd(&hood[FC<4, -1, -1>()].srcB(x)); |
| | 2094 | neig3 = _mm_load_pd(&hood[FC<6, -1, -1>()].srcB(x)); |
| | 2095 | |
| | 2096 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&hood[FC<0, 0, 0>()].coeffB(FC< 7>(), x))); |
| | 2097 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&hood[FC<2, 0, 0>()].coeffB(FC< 7>(), x))); |
| | 2098 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&hood[FC<4, 0, 0>()].coeffB(FC< 7>(), x))); |
| | 2099 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&hood[FC<6, 0, 0>()].coeffB(FC< 7>(), x))); |
| | 2100 | |
| | 2101 | same0 = _mm_add_pd(same0, temp1); |
| | 2102 | same1 = _mm_add_pd(same1, temp2); |
| | 2103 | same2 = _mm_add_pd(same2, temp3); |
| | 2104 | same3 = _mm_add_pd(same3, temp4); |
| | 2105 | |
| | 2106 | //xxxxxxxxxxxxx |
| | 2107 | neig0 = _mm_load_pd(&hood[FC<0, 1, -1>()].srcB(x)); |
| | 2108 | neig1 = _mm_load_pd(&hood[FC<2, 1, -1>()].srcB(x)); |
| | 2109 | neig2 = _mm_load_pd(&hood[FC<4, 1, -1>()].srcB(x)); |
| | 2110 | neig3 = _mm_load_pd(&hood[FC<6, 1, -1>()].srcB(x)); |
| | 2111 | |
| | 2112 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&hood[FC<0, 0, 0>()].coeffB(FC< 8>(), x))); |
| | 2113 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&hood[FC<2, 0, 0>()].coeffB(FC< 8>(), x))); |
| | 2114 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&hood[FC<4, 0, 0>()].coeffB(FC< 8>(), x))); |
| | 2115 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&hood[FC<6, 0, 0>()].coeffB(FC< 8>(), x))); |
| | 2116 | |
| | 2117 | same0 = _mm_add_pd(same0, temp1); |
| | 2118 | same1 = _mm_add_pd(same1, temp2); |
| | 2119 | same2 = _mm_add_pd(same2, temp3); |
| | 2120 | same3 = _mm_add_pd(same3, temp4); |
| | 2121 | |
| | 2122 | //xxxxxxxxxxxxx |
| | 2123 | neig0 = _mm_load_pd(&hood[FC<0, -1, 1>()].srcB(x)); |
| | 2124 | neig1 = _mm_load_pd(&hood[FC<2, -1, 1>()].srcB(x)); |
| | 2125 | neig2 = _mm_load_pd(&hood[FC<4, -1, 1>()].srcB(x)); |
| | 2126 | neig3 = _mm_load_pd(&hood[FC<6, -1, 1>()].srcB(x)); |
| | 2127 | |
| | 2128 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&hood[FC<0, 0, 0>()].coeffB(FC< 9>(), x))); |
| | 2129 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&hood[FC<2, 0, 0>()].coeffB(FC< 9>(), x))); |
| | 2130 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&hood[FC<4, 0, 0>()].coeffB(FC< 9>(), x))); |
| | 2131 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&hood[FC<6, 0, 0>()].coeffB(FC< 9>(), x))); |
| | 2132 | |
| | 2133 | same0 = _mm_add_pd(same0, temp1); |
| | 2134 | same1 = _mm_add_pd(same1, temp2); |
| | 2135 | same2 = _mm_add_pd(same2, temp3); |
| | 2136 | same3 = _mm_add_pd(same3, temp4); |
| | 2137 | |
| | 2138 | //xxxxxxxxxxxxx |
| | 2139 | neig0 = _mm_load_pd(&hood[FC<0, 1, 1>()].srcB(x)); |
| | 2140 | neig1 = _mm_load_pd(&hood[FC<2, 1, 1>()].srcB(x)); |
| | 2141 | neig2 = _mm_load_pd(&hood[FC<4, 1, 1>()].srcB(x)); |
| | 2142 | neig3 = _mm_load_pd(&hood[FC<6, 1, 1>()].srcB(x)); |
| | 2143 | |
| | 2144 | temp1 = _mm_mul_pd(neig0, _mm_load_pd(&hood[FC<0, 0, 0>()].coeffB(FC<10>(), x))); |
| | 2145 | temp2 = _mm_mul_pd(neig1, _mm_load_pd(&hood[FC<2, 0, 0>()].coeffB(FC<10>(), x))); |
| | 2146 | temp3 = _mm_mul_pd(neig2, _mm_load_pd(&hood[FC<4, 0, 0>()].coeffB(FC<10>(), x))); |
| | 2147 | temp4 = _mm_mul_pd(neig3, _mm_load_pd(&hood[FC<6, 0, 0>()].coeffB(FC<10>(), x))); |
| | 2148 | |
| | 2149 | same0 = _mm_add_pd(same0, temp1); |
| | 2150 | same1 = _mm_add_pd(same1, temp2); |
| | 2151 | same2 = _mm_add_pd(same2, temp3); |
| | 2152 | same3 = _mm_add_pd(same3, temp4); |
| | 2153 | |
| | 2154 | //xxxxxxxxxxxxx |
| | 2155 | neig0 = _mm_load_pd(&hood[FC<0, -1, -1>()].coeffB(FC<11>(), x)); |
| | 2156 | neig1 = _mm_load_pd(&hood[FC<2, -1, -1>()].coeffB(FC<11>(), x)); |
| | 2157 | neig2 = _mm_load_pd(&hood[FC<4, -1, -1>()].coeffB(FC<11>(), x)); |
| | 2158 | neig3 = _mm_load_pd(&hood[FC<6, -1, -1>()].coeffB(FC<11>(), x)); |
| | 2159 | |
| | 2160 | same0 = _mm_add_pd(same0, neig0); |
| | 2161 | same1 = _mm_add_pd(same1, neig1); |
| | 2162 | same2 = _mm_add_pd(same2, neig2); |
| | 2163 | same3 = _mm_add_pd(same3, neig3); |
| | 2164 | |
| | 2165 | //xxxxxxxxxxxxx |
| | 2166 | neig0 = _mm_load_pd(&hood[FC<0, 0, -1>()].coeffB(FC<11>(), x)); |
| | 2167 | neig1 = _mm_load_pd(&hood[FC<2, 0, -1>()].coeffB(FC<11>(), x)); |
| | 2168 | neig2 = _mm_load_pd(&hood[FC<4, 0, -1>()].coeffB(FC<11>(), x)); |
| | 2169 | neig3 = _mm_load_pd(&hood[FC<6, 0, -1>()].coeffB(FC<11>(), x)); |
| | 2170 | |
| | 2171 | same0 = _mm_add_pd(same0, neig0); |
| | 2172 | same1 = _mm_add_pd(same1, neig1); |
| | 2173 | same2 = _mm_add_pd(same2, neig2); |
| | 2174 | same3 = _mm_add_pd(same3, neig3); |
| | 2175 | |
| | 2176 | //xxxxxxxxxxxxx |
| | 2177 | neig0 = _mm_load_pd(&hood[FC<0, 1, -1>()].coeffB(FC<11>(), x)); |
| | 2178 | neig1 = _mm_load_pd(&hood[FC<2, 1, -1>()].coeffB(FC<11>(), x)); |
| | 2179 | neig2 = _mm_load_pd(&hood[FC<4, 1, -1>()].coeffB(FC<11>(), x)); |
| | 2180 | neig3 = _mm_load_pd(&hood[FC<6, 1, -1>()].coeffB(FC<11>(), x)); |
| | 2181 | |
| | 2182 | same0 = _mm_add_pd(same0, neig0); |
| | 2183 | same1 = _mm_add_pd(same1, neig1); |
| | 2184 | same2 = _mm_add_pd(same2, neig2); |
| | 2185 | same3 = _mm_add_pd(same3, neig3); |
| | 2186 | |
| | 2187 | //xxxxxxxxxxxxx |
| | 2188 | neig0 = _mm_load_pd(&hood[FC<0, -1, 0>()].coeffB(FC<11>(), x)); |
| | 2189 | neig1 = _mm_load_pd(&hood[FC<2, -1, 0>()].coeffB(FC<11>(), x)); |
| | 2190 | neig2 = _mm_load_pd(&hood[FC<4, -1, 0>()].coeffB(FC<11>(), x)); |
| | 2191 | neig3 = _mm_load_pd(&hood[FC<6, -1, 0>()].coeffB(FC<11>(), x)); |
| | 2192 | |
| | 2193 | same0 = _mm_add_pd(same0, neig0); |
| | 2194 | same1 = _mm_add_pd(same1, neig1); |
| | 2195 | same2 = _mm_add_pd(same2, neig2); |
| | 2196 | same3 = _mm_add_pd(same3, neig3); |
| | 2197 | |
| | 2198 | //xxxxxxxxxxxxx |
| | 2199 | neig0 = _mm_load_pd(&hood[FC<0, 1, 0>()].coeffB(FC<11>(), x)); |
| | 2200 | neig1 = _mm_load_pd(&hood[FC<2, 1, 0>()].coeffB(FC<11>(), x)); |
| | 2201 | neig2 = _mm_load_pd(&hood[FC<4, 1, 0>()].coeffB(FC<11>(), x)); |
| | 2202 | neig3 = _mm_load_pd(&hood[FC<6, 1, 0>()].coeffB(FC<11>(), x)); |
| | 2203 | |
| | 2204 | same0 = _mm_add_pd(same0, neig0); |
| | 2205 | same1 = _mm_add_pd(same1, neig1); |
| | 2206 | same2 = _mm_add_pd(same2, neig2); |
| | 2207 | same3 = _mm_add_pd(same3, neig3); |
| | 2208 | |
| | 2209 | //xxxxxxxxxxxxx |
| | 2210 | neig0 = _mm_load_pd(&hood[FC<0, 0, 0>()].coeffB(FC<11>(), x)); |
| | 2211 | neig1 = _mm_load_pd(&hood[FC<2, 0, 0>()].coeffB(FC<11>(), x)); |
| | 2212 | neig2 = _mm_load_pd(&hood[FC<4, 0, 0>()].coeffB(FC<11>(), x)); |
| | 2213 | neig3 = _mm_load_pd(&hood[FC<6, 0, 0>()].coeffB(FC<11>(), x)); |
| | 2214 | |
| | 2215 | same0 = _mm_add_pd(same0, neig0); |
| | 2216 | same1 = _mm_add_pd(same1, neig1); |
| | 2217 | same2 = _mm_add_pd(same2, neig2); |
| | 2218 | same3 = _mm_add_pd(same3, neig3); |
| | 2219 | |
| | 2220 | //xxxxxxxxxxxxx |
| | 2221 | neig0 = _mm_load_pd(&hood[FC<0, -1, 1>()].coeffB(FC<11>(), x)); |
| | 2222 | neig1 = _mm_load_pd(&hood[FC<2, -1, 1>()].coeffB(FC<11>(), x)); |
| | 2223 | neig2 = _mm_load_pd(&hood[FC<4, -1, 1>()].coeffB(FC<11>(), x)); |
| | 2224 | neig3 = _mm_load_pd(&hood[FC<6, -1, 1>()].coeffB(FC<11>(), x)); |
| | 2225 | |
| | 2226 | same0 = _mm_add_pd(same0, neig0); |
| | 2227 | same1 = _mm_add_pd(same1, neig1); |
| | 2228 | same2 = _mm_add_pd(same2, neig2); |
| | 2229 | same3 = _mm_add_pd(same3, neig3); |
| | 2230 | |
| | 2231 | //xxxxxxxxxxxxx |
| | 2232 | neig0 = _mm_load_pd(&hood[FC<0, 0, 1>()].coeffB(FC<11>(), x)); |
| | 2233 | neig1 = _mm_load_pd(&hood[FC<2, 0, 1>()].coeffB(FC<11>(), x)); |
| | 2234 | neig2 = _mm_load_pd(&hood[FC<4, 0, 1>()].coeffB(FC<11>(), x)); |
| | 2235 | neig3 = _mm_load_pd(&hood[FC<6, 0, 1>()].coeffB(FC<11>(), x)); |
| | 2236 | |
| | 2237 | same0 = _mm_add_pd(same0, neig0); |
| | 2238 | same1 = _mm_add_pd(same1, neig1); |
| | 2239 | same2 = _mm_add_pd(same2, neig2); |
| | 2240 | same3 = _mm_add_pd(same3, neig3); |
| | 2241 | |
| | 2242 | //xxxxxxxxxxxxx |
| | 2243 | neig0 = _mm_load_pd(&hood[FC<0, 1, 1>()].coeffB(FC<11>(), x)); |
| | 2244 | neig1 = _mm_load_pd(&hood[FC<2, 1, 1>()].coeffB(FC<11>(), x)); |
| | 2245 | neig2 = _mm_load_pd(&hood[FC<4, 1, 1>()].coeffB(FC<11>(), x)); |
| | 2246 | neig3 = _mm_load_pd(&hood[FC<6, 1, 1>()].coeffB(FC<11>(), x)); |
| | 2247 | |
| | 2248 | same0 = _mm_add_pd(same0, neig0); |
| | 2249 | same1 = _mm_add_pd(same1, neig1); |
| | 2250 | same2 = _mm_add_pd(same2, neig2); |
| | 2251 | same3 = _mm_add_pd(same3, neig3); |
| | 2252 | |
| | 2253 | //xxxxxxxxxxxxx |
| | 2254 | neig0 = _mm_load_pd(&hood[FC<0, -1, -1>()].coeffB(FC<12>(), x)); |
| | 2255 | neig1 = _mm_load_pd(&hood[FC<2, -1, -1>()].coeffB(FC<12>(), x)); |
| | 2256 | neig2 = _mm_load_pd(&hood[FC<4, -1, -1>()].coeffB(FC<12>(), x)); |
| | 2257 | neig3 = _mm_load_pd(&hood[FC<6, -1, -1>()].coeffB(FC<12>(), x)); |
| | 2258 | |
| | 2259 | same0 = _mm_add_pd(same0, neig0); |
| | 2260 | same1 = _mm_add_pd(same1, neig1); |
| | 2261 | same2 = _mm_add_pd(same2, neig2); |
| | 2262 | same3 = _mm_add_pd(same3, neig3); |
| | 2263 | |
| | 2264 | //xxxxxxxxxxxxx |
| | 2265 | neig0 = _mm_load_pd(&hood[FC<0, 0, -1>()].coeffB(FC<12>(), x)); |
| | 2266 | neig1 = _mm_load_pd(&hood[FC<2, 0, -1>()].coeffB(FC<12>(), x)); |
| | 2267 | neig2 = _mm_load_pd(&hood[FC<4, 0, -1>()].coeffB(FC<12>(), x)); |
| | 2268 | neig3 = _mm_load_pd(&hood[FC<6, 0, -1>()].coeffB(FC<12>(), x)); |
| | 2269 | |
| | 2270 | same0 = _mm_add_pd(same0, neig0); |
| | 2271 | same1 = _mm_add_pd(same1, neig1); |
| | 2272 | same2 = _mm_add_pd(same2, neig2); |
| | 2273 | same3 = _mm_add_pd(same3, neig3); |
| | 2274 | |
| | 2275 | //xxxxxxxxxxxxx |
| | 2276 | neig0 = _mm_load_pd(&hood[FC<0, 1, -1>()].coeffB(FC<12>(), x)); |
| | 2277 | neig1 = _mm_load_pd(&hood[FC<2, 1, -1>()].coeffB(FC<12>(), x)); |
| | 2278 | neig2 = _mm_load_pd(&hood[FC<4, 1, -1>()].coeffB(FC<12>(), x)); |
| | 2279 | neig3 = _mm_load_pd(&hood[FC<6, 1, -1>()].coeffB(FC<12>(), x)); |
| | 2280 | |
| | 2281 | same0 = _mm_add_pd(same0, neig0); |
| | 2282 | same1 = _mm_add_pd(same1, neig1); |
| | 2283 | same2 = _mm_add_pd(same2, neig2); |
| | 2284 | same3 = _mm_add_pd(same3, neig3); |
| | 2285 | |
| | 2286 | //xxxxxxxxxxxxx |
| | 2287 | neig0 = _mm_load_pd(&hood[FC<0, -1, 0>()].coeffB(FC<12>(), x)); |
| | 2288 | neig1 = _mm_load_pd(&hood[FC<2, -1, 0>()].coeffB(FC<12>(), x)); |
| | 2289 | neig2 = _mm_load_pd(&hood[FC<4, -1, 0>()].coeffB(FC<12>(), x)); |
| | 2290 | neig3 = _mm_load_pd(&hood[FC<6, -1, 0>()].coeffB(FC<12>(), x)); |
| | 2291 | |
| | 2292 | same0 = _mm_add_pd(same0, neig0); |
| | 2293 | same1 = _mm_add_pd(same1, neig1); |
| | 2294 | same2 = _mm_add_pd(same2, neig2); |
| | 2295 | same3 = _mm_add_pd(same3, neig3); |
| | 2296 | |
| | 2297 | //xxxxxxxxxxxxx |
| | 2298 | neig0 = _mm_load_pd(&hood[FC<0, 1, 0>()].coeffB(FC<12>(), x)); |
| | 2299 | neig1 = _mm_load_pd(&hood[FC<2, 1, 0>()].coeffB(FC<12>(), x)); |
| | 2300 | neig2 = _mm_load_pd(&hood[FC<4, 1, 0>()].coeffB(FC<12>(), x)); |
| | 2301 | neig3 = _mm_load_pd(&hood[FC<6, 1, 0>()].coeffB(FC<12>(), x)); |
| | 2302 | |
| | 2303 | same0 = _mm_add_pd(same0, neig0); |
| | 2304 | same1 = _mm_add_pd(same1, neig1); |
| | 2305 | same2 = _mm_add_pd(same2, neig2); |
| | 2306 | same3 = _mm_add_pd(same3, neig3); |
| | 2307 | |
| | 2308 | //xxxxxxxxxxxxx |
| | 2309 | neig0 = _mm_load_pd(&hood[FC<0, 0, 0>()].coeffB(FC<12>(), x)); |
| | 2310 | neig1 = _mm_load_pd(&hood[FC<2, 0, 0>()].coeffB(FC<12>(), x)); |
| | 2311 | neig2 = _mm_load_pd(&hood[FC<4, 0, 0>()].coeffB(FC<12>(), x)); |
| | 2312 | neig3 = _mm_load_pd(&hood[FC<6, 0, 0>()].coeffB(FC<12>(), x)); |
| | 2313 | |
| | 2314 | same0 = _mm_add_pd(same0, neig0); |
| | 2315 | same1 = _mm_add_pd(same1, neig1); |
| | 2316 | same2 = _mm_add_pd(same2, neig2); |
| | 2317 | same3 = _mm_add_pd(same3, neig3); |
| | 2318 | |
| | 2319 | //xxxxxxxxxxxxx |
| | 2320 | neig0 = _mm_load_pd(&hood[FC<0, -1, 1>()].coeffB(FC<12>(), x)); |
| | 2321 | neig1 = _mm_load_pd(&hood[FC<2, -1, 1>()].coeffB(FC<12>(), x)); |
| | 2322 | neig2 = _mm_load_pd(&hood[FC<4, -1, 1>()].coeffB(FC<12>(), x)); |
| | 2323 | neig3 = _mm_load_pd(&hood[FC<6, -1, 1>()].coeffB(FC<12>(), x)); |
| | 2324 | |
| | 2325 | same0 = _mm_add_pd(same0, neig0); |
| | 2326 | same1 = _mm_add_pd(same1, neig1); |
| | 2327 | same2 = _mm_add_pd(same2, neig2); |
| | 2328 | same3 = _mm_add_pd(same3, neig3); |
| | 2329 | |
| | 2330 | //xxxxxxxxxxxxx |
| | 2331 | neig0 = _mm_load_pd(&hood[FC<0, 0, 1>()].coeffB(FC<12>(), x)); |
| | 2332 | neig1 = _mm_load_pd(&hood[FC<2, 0, 1>()].coeffB(FC<12>(), x)); |
| | 2333 | neig2 = _mm_load_pd(&hood[FC<4, 0, 1>()].coeffB(FC<12>(), x)); |
| | 2334 | neig3 = _mm_load_pd(&hood[FC<6, 0, 1>()].coeffB(FC<12>(), x)); |
| | 2335 | |
| | 2336 | same0 = _mm_add_pd(same0, neig0); |
| | 2337 | same1 = _mm_add_pd(same1, neig1); |
| | 2338 | same2 = _mm_add_pd(same2, neig2); |
| | 2339 | same3 = _mm_add_pd(same3, neig3); |
| | 2340 | |
| | 2341 | //xxxxxxxxxxxxx |
| | 2342 | neig0 = _mm_load_pd(&hood[FC<0, 1, 1>()].coeffB(FC<12>(), x)); |
| | 2343 | neig1 = _mm_load_pd(&hood[FC<2, 1, 1>()].coeffB(FC<12>(), x)); |
| | 2344 | neig2 = _mm_load_pd(&hood[FC<4, 1, 1>()].coeffB(FC<12>(), x)); |
| | 2345 | neig3 = _mm_load_pd(&hood[FC<6, 1, 1>()].coeffB(FC<12>(), x)); |
| | 2346 | |
| | 2347 | same0 = _mm_add_pd(same0, neig0); |
| | 2348 | same1 = _mm_add_pd(same1, neig1); |
| | 2349 | same2 = _mm_add_pd(same2, neig2); |
| | 2350 | same3 = _mm_add_pd(same3, neig3); |
| | 2351 | |
| | 2352 | //yyyyyyyyyyyyy |
| | 2353 | _mm_store_pd(dst + x + 0, same0); |
| | 2354 | _mm_store_pd(dst + x + 2, same1); |
| | 2355 | _mm_store_pd(dst + x + 4, same2); |
| | 2356 | _mm_store_pd(dst + x + 6, same3); |
| | 2357 | |
| | 2358 | same0 = same4; |
| | 2359 | neig0 = neig4; |
| | 2360 | |
| | 2361 | // dst[x] = |
| | 2362 | // coeff[0][x] * src[x - offsetZ] + |
| | 2363 | // coeff[1][x] * src[x - offsetY] + |
| | 2364 | // coeff[2][x] * src[x - 1] + |
| | 2365 | // coeff[3][x] * src[x] + |
| | 2366 | // coeff[4][x] * src[x + 1] + |
| | 2367 | // coeff[5][x] * src[x + offsetY] + |
| | 2368 | // coeff[6][x] * src[x + offsetZ]; |
| | 2369 | } |
| | 2370 | |
| | 2371 | stepScalar(hood, dst, x, endX); |
| | 2372 | } |
| | 2373 | |
| | 2374 | template<class NEIGHBORHOOD> |
| | 2375 | inline void stepScalar(const NEIGHBORHOOD& hood, double *dst, int startX, int endX) |
| | 2376 | { |
| | 2377 | for (int x = startX; x < endX; ++x) { |
| | 2378 | dst[x] = |
| | 2379 | hood[FC<0, 0, 0>()].coeffB(FC< 0>(), x) * hood[FC< 0, -1, -1>()].srcB(x) + |
| | 2380 | hood[FC<0, 0, 0>()].coeffB(FC< 1>(), x) * hood[FC< 0, 0, -1>()].srcB(x) + |
| | 2381 | hood[FC<0, 0, 0>()].coeffB(FC< 2>(), x) * hood[FC< 0, 1, -1>()].srcB(x) + |
| | 2382 | hood[FC<0, 0, 0>()].coeffB(FC< 3>(), x) * hood[FC< 0, -1, 0>()].srcB(x) + |
| | 2383 | hood[FC<0, 0, 0>()].coeffB(FC< 4>(), x) * hood[FC<-1, 0, 0>()].srcB(x) + |
| | 2384 | hood[FC<0, 0, 0>()].coeffB(FC< 5>(), x) * hood[FC< 0, 0, 0>()].srcB(x) + |
| | 2385 | hood[FC<0, 0, 0>()].coeffB(FC< 6>(), x) * hood[FC< 1, 0, 0>()].srcB(x) + |
| | 2386 | hood[FC<0, 0, 0>()].coeffB(FC< 7>(), x) * hood[FC< 0, 1, 0>()].srcB(x) + |
| | 2387 | hood[FC<0, 0, 0>()].coeffB(FC< 8>(), x) * hood[FC< 0, -1, 1>()].srcB(x) + |
| | 2388 | hood[FC<0, 0, 0>()].coeffB(FC< 9>(), x) * hood[FC< 0, 0, 1>()].srcB(x) + |
| | 2389 | hood[FC<0, 0, 0>()].coeffB(FC<10>(), x) * hood[FC< 0, 1, 1>()].srcB(x) + |
| | 2390 | |
| | 2391 | hood[FC<0, -1, -1>()].coeffB(FC<11>(), x) + |
| | 2392 | hood[FC<0, 0, -1>()].coeffB(FC<11>(), x) + |
| | 2393 | hood[FC<0, 1, -1>()].coeffB(FC<11>(), x) + |
| | 2394 | hood[FC<0, -1, 0>()].coeffB(FC<11>(), x) + |
| | 2395 | hood[FC<-1, 0, 0>()].coeffB(FC<11>(), x) + |
| | 2396 | hood[FC<0, 0, 0>()].coeffB(FC<11>(), x) + |
| | 2397 | hood[FC<1, 0, 0>()].coeffB(FC<11>(), x) + |
| | 2398 | hood[FC<0, 1, 0>()].coeffB(FC<11>(), x) + |
| | 2399 | hood[FC<0, -1, 1>()].coeffB(FC<11>(), x) + |
| | 2400 | hood[FC<0, 0, 1>()].coeffB(FC<11>(), x) + |
| | 2401 | hood[FC<0, 1, 1>()].coeffB(FC<11>(), x) + |
| | 2402 | |
| | 2403 | hood[FC<0, -1, -1>()].coeffB(FC<12>(), x) + |
| | 2404 | hood[FC<0, 0, -1>()].coeffB(FC<12>(), x) + |
| | 2405 | hood[FC<0, 1, -1>()].coeffB(FC<12>(), x) + |
| | 2406 | hood[FC<0, -1, 0>()].coeffB(FC<12>(), x) + |
| | 2407 | hood[FC<-1, 0, 0>()].coeffB(FC<12>(), x) + |
| | 2408 | hood[FC<0, 0, 0>()].coeffB(FC<12>(), x) + |
| | 2409 | hood[FC<1, 0, 0>()].coeffB(FC<12>(), x) + |
| | 2410 | hood[FC<0, 1, 0>()].coeffB(FC<12>(), x) + |
| | 2411 | hood[FC<0, -1, 1>()].coeffB(FC<12>(), x) + |
| | 2412 | hood[FC<0, 0, 1>()].coeffB(FC<12>(), x) + |
| | 2413 | hood[FC<0, 1, 1>()].coeffB(FC<12>(), x); |
| | 2414 | } |
| | 2415 | } |
| | 2416 | |
| | 2417 | int flops() |
| | 2418 | { |
| | 2419 | return 40; |
| | 2420 | } |
| | 2421 | }; |