| | 9 | #include <libgeodecomp/misc/grid.h> |
| | 10 | |
| | 11 | using namespace LibGeoDecomp; |
| | 12 | |
| | 13 | typedef Grid<double> GridType; |
| | 14 | |
/**
 * Plain, non-vectorized implementation of the 5-point averaging
 * stencil. It also serves as the fallback for cells which the SSE
 * kernels in this file do not handle in their vector loop.
 */
class Scalar
{
public:
    /**
     * Updates the cells with indices in [startX, endX) of a single row.
     * Each target cell receives the sum of the source cell, its two
     * direct neighbors within the row and the two cells located
     * offset elements before/after it, multiplied by 0.2.
     *
     * @param src    pointer to the first cell of the source row
     * @param dst    pointer to the first cell of the target row
     * @param offset distance (in elements) between vertically
     *               adjacent cells
     * @param startX index of the first cell to update
     * @param endX   index one past the last cell to update
     */
    inline void step(double *src, double *dst, int offset, int startX, int endX)
    {
        int cell = startX;

        while (cell < endX) {
            const double upper  = src[cell - offset];
            const double left   = src[cell - 1];
            const double center = src[cell];
            const double right  = src[cell + 1];
            const double lower  = src[cell + offset];

            // keep this exact order of additions: it determines the
            // floating point rounding of the result
            dst[cell] = (upper + left + center + right + lower) * 0.2;
            ++cell;
        }
    }

    /**
     * @return number of floating point operations spent per updated
     *         cell (4 additions + 1 multiplication)
     */
    int flops()
    {
        return 5;
    }
};
| | 30 | |
| | 31 | class VectorizedSSEMelbourneShuffle |
| | 32 | { |
| | 33 | public: |
| | 34 | inline void step(double *src, double *dst, int offset, int startX, int endX) |
| | 35 | { |
| | 36 | int x = startX; |
| | 37 | Scalar scalarUpdater; |
| | 38 | |
| | 39 | if ((x & 1) == 1) { |
| | 40 | scalarUpdater.step(src, dst, offset, x, x + 1); |
| | 41 | x += 1; |
| | 42 | } |
| | 43 | |
| | 44 | __m128d oneFifth = _mm_set_pd(1.0/3.0, 1.0/3.0); |
| | 45 | __m128d buff0 = _mm_loadu_pd(src + x - 1); |
| | 46 | __m128d same0 = _mm_load_pd(src + x + 0); |
| | 47 | |
| | 48 | int paddedEndX = endX - 7; |
| | 49 | for (; x < paddedEndX; x += 8) { |
| | 50 | // load center row |
| | 51 | __m128d same1 = _mm_load_pd(src + x + 2); |
| | 52 | __m128d same2 = _mm_load_pd(src + x + 4); |
| | 53 | __m128d same3 = _mm_load_pd(src + x + 6); |
| | 54 | __m128d same4 = _mm_load_pd(src + x + 8); |
| | 55 | |
| | 56 | // shuffle values obtain left/right neighbors |
| | 57 | __m128d buff1 = _mm_shuffle_pd(same0, same1, (1 << 0) | (0 << 2)); |
| | 58 | __m128d buff2 = _mm_shuffle_pd(same1, same2, (1 << 0) | (0 << 2)); |
| | 59 | __m128d buff3 = _mm_shuffle_pd(same2, same3, (1 << 0) | (0 << 2)); |
| | 60 | __m128d buff4 = _mm_shuffle_pd(same3, same4, (1 << 0) | (0 << 2)); |
| | 61 | |
| | 62 | // load top row |
| | 63 | __m128d temp0 = _mm_load_pd(src - offset + x + 0); |
| | 64 | __m128d temp1 = _mm_load_pd(src - offset + x + 2); |
| | 65 | __m128d temp2 = _mm_load_pd(src - offset + x + 4); |
| | 66 | __m128d temp3 = _mm_load_pd(src - offset + x + 6); |
| | 67 | |
| | 68 | // add center row with left... |
| | 69 | same0 = _mm_add_pd(same0, buff0); |
| | 70 | same1 = _mm_add_pd(same1, buff1); |
| | 71 | same2 = _mm_add_pd(same2, buff2); |
| | 72 | same3 = _mm_add_pd(same3, buff3); |
| | 73 | |
| | 74 | // ...and right neighbors |
| | 75 | same0 = _mm_add_pd(same0, buff1); |
| | 76 | same1 = _mm_add_pd(same1, buff2); |
| | 77 | same2 = _mm_add_pd(same2, buff3); |
| | 78 | same3 = _mm_add_pd(same3, buff4); |
| | 79 | |
| | 80 | // load bottom row |
| | 81 | buff0 = _mm_load_pd(src + offset + x + 0); |
| | 82 | buff1 = _mm_load_pd(src + offset + x + 2); |
| | 83 | buff2 = _mm_load_pd(src + offset + x + 4); |
| | 84 | buff3 = _mm_load_pd(src + offset + x + 6); |
| | 85 | |
| | 86 | // add top row |
| | 87 | same0 = _mm_add_pd(same0, temp0); |
| | 88 | same1 = _mm_add_pd(same1, temp1); |
| | 89 | same2 = _mm_add_pd(same2, temp2); |
| | 90 | same3 = _mm_add_pd(same3, temp3); |
| | 91 | |
| | 92 | // add bottom row |
| | 93 | same0 = _mm_add_pd(same0, buff0); |
| | 94 | same1 = _mm_add_pd(same1, buff1); |
| | 95 | same2 = _mm_add_pd(same2, buff2); |
| | 96 | same3 = _mm_add_pd(same3, buff3); |
| | 97 | |
| | 98 | // scale down... |
| | 99 | same0 = _mm_mul_pd(same0, oneFifth); |
| | 100 | same1 = _mm_mul_pd(same1, oneFifth); |
| | 101 | same2 = _mm_mul_pd(same2, oneFifth); |
| | 102 | same3 = _mm_mul_pd(same3, oneFifth); |
| | 103 | |
| | 104 | // ...and store |
| | 105 | _mm_store_pd(dst + 0, same0); |
| | 106 | _mm_store_pd(dst + 2, same1); |
| | 107 | _mm_store_pd(dst + 4, same2); |
| | 108 | _mm_store_pd(dst + 6, same3); |
| | 109 | |
| | 110 | same0 = same4; |
| | 111 | buff0 = buff4; |
| | 112 | } |
| | 113 | |
| | 114 | scalarUpdater.step(src, dst, offset, x, endX); |
| | 115 | } |
| | 116 | |
| | 117 | int flops() |
| | 118 | { |
| | 119 | return 5; |
| | 120 | } |
| | 121 | }; |
| | 122 | |
| | 123 | class VectorizedSSEMelbourneShuffleB |
| | 124 | { |
| | 125 | public: |
| | 126 | inline void step(double *src, double *dst, int offset, int startX, int endX) |
| | 127 | { |
| | 128 | int x = startX; |
| | 129 | Scalar scalarUpdater; |
| | 130 | |
| | 131 | if ((x & 1) == 1) { |
| | 132 | scalarUpdater.step(src, dst, offset, x, x + 1); |
| | 133 | x += 1; |
| | 134 | } |
| | 135 | |
| | 136 | __m128d oneFifth = _mm_set_pd(1.0/3.0, 1.0/3.0); |
| | 137 | __m128d buff0 = _mm_loadu_pd(src + x - 1); |
| | 138 | __m128d same0 = _mm_load_pd(src + x + 0); |
| | 139 | |
| | 140 | int paddedEndX = endX - 7; |
| | 141 | for (; x < paddedEndX; x += 8) { |
| | 142 | // load center row |
| | 143 | __m128d same1 = _mm_load_pd(src + x + 2); |
| | 144 | __m128d same2 = _mm_load_pd(src + x + 4); |
| | 145 | __m128d same3 = _mm_load_pd(src + x + 6); |
| | 146 | __m128d same4 = _mm_load_pd(src + x + 8); |
| | 147 | |
| | 148 | // shuffle values obtain left/right neighbors |
| | 149 | __m128d buff1 = _mm_shuffle_pd(same0, same1, (1 << 0) | (0 << 2)); |
| | 150 | __m128d buff2 = _mm_shuffle_pd(same1, same2, (1 << 0) | (0 << 2)); |
| | 151 | __m128d buff3 = _mm_shuffle_pd(same2, same3, (1 << 0) | (0 << 2)); |
| | 152 | __m128d buff4 = _mm_shuffle_pd(same3, same4, (1 << 0) | (0 << 2)); |
| | 153 | |
| | 154 | // add center row with left... |
| | 155 | same0 = _mm_add_pd(same0, buff0); |
| | 156 | same1 = _mm_add_pd(same1, buff1); |
| | 157 | same2 = _mm_add_pd(same2, buff2); |
| | 158 | same3 = _mm_add_pd(same3, buff3); |
| | 159 | |
| | 160 | // ...and right neighbors |
| | 161 | same0 = _mm_add_pd(same0, buff1); |
| | 162 | same1 = _mm_add_pd(same1, buff2); |
| | 163 | same2 = _mm_add_pd(same2, buff3); |
| | 164 | same3 = _mm_add_pd(same3, buff4); |
| | 165 | |
| | 166 | // load top row |
| | 167 | __m128d temp0 = _mm_load_pd(src - offset + x + 0); |
| | 168 | __m128d temp1 = _mm_load_pd(src - offset + x + 2); |
| | 169 | __m128d temp2 = _mm_load_pd(src - offset + x + 4); |
| | 170 | __m128d temp3 = _mm_load_pd(src - offset + x + 6); |
| | 171 | // add top row |
| | 172 | same0 = _mm_add_pd(same0, temp0); |
| | 173 | same1 = _mm_add_pd(same1, temp1); |
| | 174 | same2 = _mm_add_pd(same2, temp2); |
| | 175 | same3 = _mm_add_pd(same3, temp3); |
| | 176 | |
| | 177 | // load bottom row |
| | 178 | buff0 = _mm_load_pd(src + offset + x + 0); |
| | 179 | buff1 = _mm_load_pd(src + offset + x + 2); |
| | 180 | buff2 = _mm_load_pd(src + offset + x + 4); |
| | 181 | buff3 = _mm_load_pd(src + offset + x + 6); |
| | 182 | // add bottom row |
| | 183 | same0 = _mm_add_pd(same0, buff0); |
| | 184 | same1 = _mm_add_pd(same1, buff1); |
| | 185 | same2 = _mm_add_pd(same2, buff2); |
| | 186 | same3 = _mm_add_pd(same3, buff3); |
| | 187 | |
| | 188 | // scale down... |
| | 189 | same0 = _mm_mul_pd(same0, oneFifth); |
| | 190 | same1 = _mm_mul_pd(same1, oneFifth); |
| | 191 | same2 = _mm_mul_pd(same2, oneFifth); |
| | 192 | same3 = _mm_mul_pd(same3, oneFifth); |
| | 193 | |
| | 194 | // ...and store |
| | 195 | _mm_store_pd(dst + 0, same0); |
| | 196 | _mm_store_pd(dst + 2, same1); |
| | 197 | _mm_store_pd(dst + 4, same2); |
| | 198 | _mm_store_pd(dst + 6, same3); |
| | 199 | |
| | 200 | same0 = same4; |
| | 201 | buff0 = buff4; |
| | 202 | } |
| | 203 | |
| | 204 | scalarUpdater.step(src, dst, offset, x, endX); |
| | 205 | } |
| | 206 | |
| | 207 | int flops() |
| | 208 | { |
| | 209 | return 5; |
| | 210 | } |
| | 211 | }; |
| | 212 | |
| | 213 | class VectorizedSSEMelbourneShuffleC |
| | 214 | { |
| | 215 | public: |
| | 216 | inline void step(double *src, double *dst, int offset, int startX, int endX) |
| | 217 | { |
| | 218 | int x = startX; |
| | 219 | Scalar scalarUpdater; |
| | 220 | |
| | 221 | if ((x & 1) == 1) { |
| | 222 | scalarUpdater.step(src, dst, offset, x, x + 1); |
| | 223 | x += 1; |
| | 224 | } |
| | 225 | |
| | 226 | __m128d oneFifth = _mm_set_pd(1.0/3.0, 1.0/3.0); |
| | 227 | __m128d buff0 = _mm_loadu_pd(src + x - 1); |
| | 228 | __m128d same0 = _mm_load_pd(src + x + 0); |
| | 229 | |
| | 230 | int paddedEndX = endX - 7; |
| | 231 | for (; x < paddedEndX; x += 8) { |
| | 232 | |
| | 233 | __m128d same1 = _mm_load_pd(src + x + 2); |
| | 234 | __m128d buff1 = _mm_shuffle_pd(same0, same1, (1 << 0) | (0 << 2)); |
| | 235 | same0 = _mm_add_pd(same0, buff0); |
| | 236 | same0 = _mm_add_pd(same0, buff1); |
| | 237 | __m128d temp0 = _mm_load_pd(src - offset + x + 0); |
| | 238 | same0 = _mm_add_pd(same0, temp0); |
| | 239 | temp0 = _mm_load_pd(src + offset + x + 0); |
| | 240 | same0 = _mm_add_pd(same0, temp0); |
| | 241 | same0 = _mm_mul_pd(same0, oneFifth); |
| | 242 | _mm_store_pd(dst + 0, same0); |
| | 243 | |
| | 244 | __m128d same2 = _mm_load_pd(src + x + 4); |
| | 245 | __m128d buff2 = _mm_shuffle_pd(same1, same2, (1 << 0) | (0 << 2)); |
| | 246 | same1 = _mm_add_pd(same1, buff1); |
| | 247 | same1 = _mm_add_pd(same1, buff2); |
| | 248 | __m128d temp1 = _mm_load_pd(src - offset + x + 2); |
| | 249 | same1 = _mm_add_pd(same1, temp1); |
| | 250 | temp1 = _mm_load_pd(src + offset + x + 2); |
| | 251 | same1 = _mm_add_pd(same1, temp1); |
| | 252 | same1 = _mm_mul_pd(same1, oneFifth); |
| | 253 | _mm_store_pd(dst + 2, same1); |
| | 254 | |
| | 255 | __m128d same3 = _mm_load_pd(src + x + 6); |
| | 256 | __m128d buff3 = _mm_shuffle_pd(same2, same3, (1 << 0) | (0 << 2)); |
| | 257 | same2 = _mm_add_pd(same2, buff2); |
| | 258 | same2 = _mm_add_pd(same2, buff3); |
| | 259 | __m128d temp2 = _mm_load_pd(src - offset + x + 4); |
| | 260 | same2 = _mm_add_pd(same2, temp2); |
| | 261 | temp2 = _mm_load_pd(src + offset + x + 4); |
| | 262 | same2 = _mm_add_pd(same2, temp2); |
| | 263 | same2 = _mm_mul_pd(same2, oneFifth); |
| | 264 | _mm_store_pd(dst + 4, same2); |
| | 265 | |
| | 266 | __m128d same4 = _mm_load_pd(src + x + 8); |
| | 267 | __m128d buff4 = _mm_shuffle_pd(same3, same4, (1 << 0) | (0 << 2)); |
| | 268 | same3 = _mm_add_pd(same3, buff3); |
| | 269 | same3 = _mm_add_pd(same3, buff4); |
| | 270 | __m128d temp3 = _mm_load_pd(src - offset + x + 6); |
| | 271 | same3 = _mm_add_pd(same3, temp3); |
| | 272 | temp3 = _mm_load_pd(src + offset + x + 6); |
| | 273 | same3 = _mm_add_pd(same3, temp3); |
| | 274 | same3 = _mm_mul_pd(same3, oneFifth); |
| | 275 | _mm_store_pd(dst + 6, same3); |
| | 276 | |
| | 277 | same0 = same4; |
| | 278 | buff0 = buff4; |
| | 279 | } |
| | 280 | |
| | 281 | scalarUpdater.step(src, dst, offset, x, endX); |
| | 282 | } |
| | 283 | |
| | 284 | int flops() |
| | 285 | { |
| | 286 | return 5; |
| | 287 | } |
| | 288 | }; |
| | 289 | |
| | 290 | |
| | 291 | template<typename UPDATER> |
| | 292 | class Benchmark |
| | 293 | { |
| | 294 | public: |
| | 295 | void run(Coord<2> dim, int repeats) |
| | 296 | { |
| | 297 | GridType a(dim); |
| | 298 | GridType b(dim); |
| | 299 | |
| | 300 | GridType *oldGrid = &a; |
| | 301 | GridType *newGrid = &b; |
| | 302 | |
| | 303 | int height = dim.y(); |
| | 304 | int width = dim.x(); |
| | 305 | |
| | 306 | UPDATER updater; |
| | 307 | |
| | 308 | long long tStart = getUTtime(); |
| | 309 | |
| | 310 | for (int t = 0; t < repeats; ++t) { |
| | 311 | for (int y = 1; y < height - 1; ++y) { |
| | 312 | Coord<2> c(0, y); |
| | 313 | updater.step(&oldGrid->at(c), &newGrid->at(c), width, 1, width - 1); |
| | 314 | } |
| | 315 | |
| | 316 | std::swap(newGrid, oldGrid); |
| | 317 | } |
| | 318 | |
| | 319 | long long tEnd = getUTtime(); |
| | 320 | evaluate(dim, repeats, tEnd - tStart); |
| | 321 | } |
| | 322 | |
| | 323 | void exercise() |
| | 324 | { |
| | 325 | std::cout << "# " << typeid(UPDATER).name() << "\n"; |
| | 326 | int lastDim = 0; |
| | 327 | for (int i = 4; i <= 4096; i *= 2) { |
| | 328 | int intermediateSteps = 8; |
| | 329 | for (int j = 0; j < intermediateSteps; ++j) { |
| | 330 | int d = i * std::pow(2, j * (1.0 / intermediateSteps)); |
| | 331 | if (d % 2) { |
| | 332 | d += 1; |
| | 333 | } |
| | 334 | |
| | 335 | if (d > lastDim) { |
| | 336 | lastDim = d; |
| | 337 | Coord<2> dim(d, d); |
| | 338 | int repeats = std::max(1, 500000000 / dim.prod()); |
| | 339 | run(dim, repeats); |
| | 340 | } |
| | 341 | } |
| | 342 | } |
| | 343 | std::cout << "\n"; |
| | 344 | } |
| | 345 | |
| | 346 | private: |
| | 347 | long long getUTtime() |
| | 348 | { |
| | 349 | timeval t; |
| | 350 | gettimeofday(&t, 0); |
| | 351 | return (long long)t.tv_sec * 1000000 + t.tv_usec; |
| | 352 | } |
| | 353 | |
| | 354 | void evaluate(Coord<2> dim, int repeats, long long uTime) |
| | 355 | { |
| | 356 | double seconds = 1.0 * uTime / 1000 / 1000; |
| | 357 | double gflops = 1.0 * UPDATER().flops() * (dim.x() - 2) * (dim.y() - 2) * |
| | 358 | repeats / 1000 / 1000 / 1000 / seconds; |
| | 359 | std::cout << dim.x() << " " << dim.y() << " " << gflops << "\n"; |
| | 360 | } |
| | 361 | |
| | 362 | |
| | 363 | }; |