mdds
Loading...
Searching...
No Matches
soa/block_util.hpp
1/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
3// SPDX-FileCopyrightText: 2021 - 2025 Kohei Yoshida
4//
5// SPDX-License-Identifier: MIT
6
7#pragma once
8
9#include "mdds/global.hpp"
10#include "../types.hpp"
11
12#if defined(__SSE2__)
13#include <emmintrin.h>
14#endif
15#if defined(__AVX2__)
16#include <immintrin.h>
17#endif
18
19namespace mdds { namespace mtv { namespace soa { namespace detail {
20
21template<typename Blks, lu_factor_t F>
23{
24 void operator()(Blks& /*block_store*/, int64_t /*start_block_index*/, int64_t /*delta*/) const
25 {
26 static_assert(
27 mdds::detail::invalid_static_int<F>, "The loop-unrolling factor must be one of 0, 4, 8, 16, or 32.");
28 }
29};
30
31template<typename Blks>
32struct adjust_block_positions<Blks, lu_factor_t::none>
33{
34 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
35 {
36 int64_t n = block_store.positions.size();
37
38 if (start_block_index >= n)
39 return;
40
41#if MDDS_USE_OPENMP
42#pragma omp parallel for
43#endif
44 for (int64_t i = start_block_index; i < n; ++i)
45 block_store.positions[i] += delta;
46 }
47};
48
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu4>
{
    // Adds 'delta' to every block position at or after start_block_index,
    // with the main loop manually unrolled by a factor of 4.
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 4.
        int64_t len = n - start_block_index;
        int64_t rem = len & 3; // % 4
        len -= rem;
        len += start_block_index; // 'len' is now the end index of the unrolled section.
#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 4)
        {
            block_store.positions[i + 0] += delta;
            block_store.positions[i + 1] += delta;
            block_store.positions[i + 2] += delta;
            block_store.positions[i + 3] += delta;
        }

        // Process the remaining 0-3 elements one at a time.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
80
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu8>
{
    // Adds 'delta' to every block position at or after start_block_index,
    // with the main loop manually unrolled by a factor of 8.
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 8.
        int64_t len = n - start_block_index;
        int64_t rem = len & 7; // % 8
        len -= rem;
        len += start_block_index; // 'len' is now the end index of the unrolled section.
#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 8)
        {
            block_store.positions[i + 0] += delta;
            block_store.positions[i + 1] += delta;
            block_store.positions[i + 2] += delta;
            block_store.positions[i + 3] += delta;
            block_store.positions[i + 4] += delta;
            block_store.positions[i + 5] += delta;
            block_store.positions[i + 6] += delta;
            block_store.positions[i + 7] += delta;
        }

        // Process the remaining 0-7 elements one at a time.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
116
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu16>
{
    // Adds 'delta' to every block position at or after start_block_index,
    // with the main loop manually unrolled by a factor of 16.
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 16.
        int64_t len = n - start_block_index;
        int64_t rem = len & 15; // % 16
        len -= rem;
        len += start_block_index; // 'len' is now the end index of the unrolled section.
#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 16)
        {
            block_store.positions[i + 0] += delta;
            block_store.positions[i + 1] += delta;
            block_store.positions[i + 2] += delta;
            block_store.positions[i + 3] += delta;
            block_store.positions[i + 4] += delta;
            block_store.positions[i + 5] += delta;
            block_store.positions[i + 6] += delta;
            block_store.positions[i + 7] += delta;
            block_store.positions[i + 8] += delta;
            block_store.positions[i + 9] += delta;
            block_store.positions[i + 10] += delta;
            block_store.positions[i + 11] += delta;
            block_store.positions[i + 12] += delta;
            block_store.positions[i + 13] += delta;
            block_store.positions[i + 14] += delta;
            block_store.positions[i + 15] += delta;
        }

        // Process the remaining 0-15 elements one at a time.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
160
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu32>
{
    // Adds 'delta' to every block position at or after start_block_index,
    // with the main loop manually unrolled by a factor of 32.
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 32.
        int64_t len = n - start_block_index;
        int64_t rem = len & 31; // % 32
        len -= rem;
        len += start_block_index; // 'len' is now the end index of the unrolled section.
#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 32)
        {
            block_store.positions[i + 0] += delta;
            block_store.positions[i + 1] += delta;
            block_store.positions[i + 2] += delta;
            block_store.positions[i + 3] += delta;
            block_store.positions[i + 4] += delta;
            block_store.positions[i + 5] += delta;
            block_store.positions[i + 6] += delta;
            block_store.positions[i + 7] += delta;
            block_store.positions[i + 8] += delta;
            block_store.positions[i + 9] += delta;
            block_store.positions[i + 10] += delta;
            block_store.positions[i + 11] += delta;
            block_store.positions[i + 12] += delta;
            block_store.positions[i + 13] += delta;
            block_store.positions[i + 14] += delta;
            block_store.positions[i + 15] += delta;
            block_store.positions[i + 16] += delta;
            block_store.positions[i + 17] += delta;
            block_store.positions[i + 18] += delta;
            block_store.positions[i + 19] += delta;
            block_store.positions[i + 20] += delta;
            block_store.positions[i + 21] += delta;
            block_store.positions[i + 22] += delta;
            block_store.positions[i + 23] += delta;
            block_store.positions[i + 24] += delta;
            block_store.positions[i + 25] += delta;
            block_store.positions[i + 26] += delta;
            block_store.positions[i + 27] += delta;
            block_store.positions[i + 28] += delta;
            block_store.positions[i + 29] += delta;
            block_store.positions[i + 30] += delta;
            block_store.positions[i + 31] += delta;
        }

        // Process the remaining 0-31 elements one at a time.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
220
221#if defined(__SSE2__)
222
223template<typename Blks>
224struct adjust_block_positions<Blks, lu_factor_t::sse2_x64>
225{
226 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
227 {
228 static_assert(
229 sizeof(typename decltype(block_store.positions)::value_type) == 8,
230 "This code works only when the position values are 64-bit wide.");
231
232 int64_t n = block_store.positions.size();
233
234 if (start_block_index >= n)
235 return;
236
237 // Ensure that the section length is divisible by 2.
238 int64_t len = n - start_block_index;
239 bool odd = len & 1;
240 if (odd)
241 len -= 1;
242
243 len += start_block_index;
244
245 __m128i right = _mm_set_epi64x(delta, delta);
246
247#if MDDS_USE_OPENMP
248#pragma omp parallel for
249#endif
250 for (int64_t i = start_block_index; i < len; i += 2)
251 {
252 __m128i* dst = (__m128i*)&block_store.positions[i];
253 _mm_storeu_si128(dst, _mm_add_epi64(_mm_loadu_si128(dst), right));
254 }
255
256 if (odd)
257 block_store.positions[len] += delta;
258 }
259};
260
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu4>
{
    // SSE2 variant with the vector loop unrolled 4x: each iteration
    // processes 4 x 2 = 8 64-bit positions.
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 8.
        int64_t len = n - start_block_index;
        int64_t rem = len & 7; // % 8
        len -= rem;
        len += start_block_index; // 'len' is now the end index of the vectorized section.

        // Both 64-bit lanes carry the same delta.
        __m128i right = _mm_set_epi64x(delta, delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 8)
        {
            // Unaligned loads/stores: no 16-byte alignment guarantee on positions.
            __m128i* dst0 = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));

            __m128i* dst2 = (__m128i*)&block_store.positions[i + 2];
            _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));

            __m128i* dst4 = (__m128i*)&block_store.positions[i + 4];
            _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));

            __m128i* dst6 = (__m128i*)&block_store.positions[i + 6];
            _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));
        }

        // Process the remaining 0-7 elements one at a time.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
306
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu8>
{
    // SSE2 variant with the vector loop unrolled 8x: each iteration
    // processes 8 x 2 = 16 64-bit positions.
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 16.
        int64_t len = n - start_block_index;
        int64_t rem = len & 15; // % 16
        len -= rem;
        len += start_block_index; // 'len' is now the end index of the vectorized section.

        // Both 64-bit lanes carry the same delta.
        __m128i right = _mm_set_epi64x(delta, delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 16)
        {
            // Unaligned loads/stores: no 16-byte alignment guarantee on positions.
            __m128i* dst0 = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));

            __m128i* dst2 = (__m128i*)&block_store.positions[i + 2];
            _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));

            __m128i* dst4 = (__m128i*)&block_store.positions[i + 4];
            _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));

            __m128i* dst6 = (__m128i*)&block_store.positions[i + 6];
            _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));

            __m128i* dst8 = (__m128i*)&block_store.positions[i + 8];
            _mm_storeu_si128(dst8, _mm_add_epi64(_mm_loadu_si128(dst8), right));

            __m128i* dst10 = (__m128i*)&block_store.positions[i + 10];
            _mm_storeu_si128(dst10, _mm_add_epi64(_mm_loadu_si128(dst10), right));

            __m128i* dst12 = (__m128i*)&block_store.positions[i + 12];
            _mm_storeu_si128(dst12, _mm_add_epi64(_mm_loadu_si128(dst12), right));

            __m128i* dst14 = (__m128i*)&block_store.positions[i + 14];
            _mm_storeu_si128(dst14, _mm_add_epi64(_mm_loadu_si128(dst14), right));
        }

        // Process the remaining 0-15 elements one at a time.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
364
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu16>
{
    // SSE2 variant with the vector loop unrolled 16x: each iteration
    // processes 16 x 2 = 32 64-bit positions.
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 32.
        int64_t len = n - start_block_index;
        int64_t rem = len & 31; // % 32
        len -= rem;
        len += start_block_index; // 'len' is now the end index of the vectorized section.

        // Both 64-bit lanes carry the same delta.
        __m128i right = _mm_set_epi64x(delta, delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 32)
        {
            // Unaligned loads/stores: no 16-byte alignment guarantee on positions.
            __m128i* dst0 = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));

            __m128i* dst2 = (__m128i*)&block_store.positions[i + 2];
            _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));

            __m128i* dst4 = (__m128i*)&block_store.positions[i + 4];
            _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));

            __m128i* dst6 = (__m128i*)&block_store.positions[i + 6];
            _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));

            __m128i* dst8 = (__m128i*)&block_store.positions[i + 8];
            _mm_storeu_si128(dst8, _mm_add_epi64(_mm_loadu_si128(dst8), right));

            __m128i* dst10 = (__m128i*)&block_store.positions[i + 10];
            _mm_storeu_si128(dst10, _mm_add_epi64(_mm_loadu_si128(dst10), right));

            __m128i* dst12 = (__m128i*)&block_store.positions[i + 12];
            _mm_storeu_si128(dst12, _mm_add_epi64(_mm_loadu_si128(dst12), right));

            __m128i* dst14 = (__m128i*)&block_store.positions[i + 14];
            _mm_storeu_si128(dst14, _mm_add_epi64(_mm_loadu_si128(dst14), right));

            __m128i* dst16 = (__m128i*)&block_store.positions[i + 16];
            _mm_storeu_si128(dst16, _mm_add_epi64(_mm_loadu_si128(dst16), right));

            __m128i* dst18 = (__m128i*)&block_store.positions[i + 18];
            _mm_storeu_si128(dst18, _mm_add_epi64(_mm_loadu_si128(dst18), right));

            __m128i* dst20 = (__m128i*)&block_store.positions[i + 20];
            _mm_storeu_si128(dst20, _mm_add_epi64(_mm_loadu_si128(dst20), right));

            __m128i* dst22 = (__m128i*)&block_store.positions[i + 22];
            _mm_storeu_si128(dst22, _mm_add_epi64(_mm_loadu_si128(dst22), right));

            __m128i* dst24 = (__m128i*)&block_store.positions[i + 24];
            _mm_storeu_si128(dst24, _mm_add_epi64(_mm_loadu_si128(dst24), right));

            __m128i* dst26 = (__m128i*)&block_store.positions[i + 26];
            _mm_storeu_si128(dst26, _mm_add_epi64(_mm_loadu_si128(dst26), right));

            __m128i* dst28 = (__m128i*)&block_store.positions[i + 28];
            _mm_storeu_si128(dst28, _mm_add_epi64(_mm_loadu_si128(dst28), right));

            __m128i* dst30 = (__m128i*)&block_store.positions[i + 30];
            _mm_storeu_si128(dst30, _mm_add_epi64(_mm_loadu_si128(dst30), right));
        }

        // Process the remaining 0-31 elements one at a time.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
446
447#endif // __SSE2__
448
449#if defined(__AVX2__)
450
451template<typename Blks>
452struct adjust_block_positions<Blks, lu_factor_t::avx2_x64>
453{
454 void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
455 {
456 static_assert(
457 sizeof(typename decltype(block_store.positions)::value_type) == 8,
458 "This code works only when the position values are 64-bit wide.");
459
460 int64_t n = block_store.positions.size();
461
462 if (start_block_index >= n)
463 return;
464
465 // Ensure that the section length is divisible by 4.
466 int64_t len = n - start_block_index;
467 int64_t rem = len & 3; // % 4
468 len -= rem;
469 len += start_block_index;
470
471 __m256i right = _mm256_set1_epi64x(delta);
472
473#if MDDS_USE_OPENMP
474#pragma omp parallel for
475#endif
476 for (int64_t i = start_block_index; i < len; i += 4)
477 {
478 __m256i* dst = (__m256i*)&block_store.positions[i];
479 _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));
480 }
481
482 rem += len;
483 for (int64_t i = len; i < rem; ++i)
484 block_store.positions[i] += delta;
485 }
486};
487
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::avx2_x64_lu4>
{
    // AVX2 variant with the vector loop unrolled 4x: each iteration
    // processes 4 x 4 = 16 64-bit positions.
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 16.
        int64_t len = n - start_block_index;
        int64_t rem = len & 15; // % 16
        len -= rem;
        len += start_block_index; // 'len' is now the end index of the vectorized section.

        // All four 64-bit lanes carry the same delta.
        __m256i right = _mm256_set1_epi64x(delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 16)
        {
            // Unaligned loads/stores: no 32-byte alignment guarantee on positions.
            __m256i* dst = (__m256i*)&block_store.positions[i];
            _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));

            __m256i* dst4 = (__m256i*)&block_store.positions[i + 4];
            _mm256_storeu_si256(dst4, _mm256_add_epi64(_mm256_loadu_si256(dst4), right));

            __m256i* dst8 = (__m256i*)&block_store.positions[i + 8];
            _mm256_storeu_si256(dst8, _mm256_add_epi64(_mm256_loadu_si256(dst8), right));

            __m256i* dst12 = (__m256i*)&block_store.positions[i + 12];
            _mm256_storeu_si256(dst12, _mm256_add_epi64(_mm256_loadu_si256(dst12), right));
        }

        // Process the remaining 0-15 elements one at a time.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
533
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::avx2_x64_lu8>
{
    // AVX2 variant with the vector loop unrolled 8x: each iteration
    // processes 8 x 4 = 32 64-bit positions.
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 32.
        int64_t len = n - start_block_index;
        int64_t rem = len & 31; // % 32
        len -= rem;
        len += start_block_index; // 'len' is now the end index of the vectorized section.

        // All four 64-bit lanes carry the same delta.
        __m256i right = _mm256_set1_epi64x(delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 32)
        {
            // Unaligned loads/stores: no 32-byte alignment guarantee on positions.
            __m256i* dst = (__m256i*)&block_store.positions[i];
            _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));

            __m256i* dst4 = (__m256i*)&block_store.positions[i + 4];
            _mm256_storeu_si256(dst4, _mm256_add_epi64(_mm256_loadu_si256(dst4), right));

            __m256i* dst8 = (__m256i*)&block_store.positions[i + 8];
            _mm256_storeu_si256(dst8, _mm256_add_epi64(_mm256_loadu_si256(dst8), right));

            __m256i* dst12 = (__m256i*)&block_store.positions[i + 12];
            _mm256_storeu_si256(dst12, _mm256_add_epi64(_mm256_loadu_si256(dst12), right));

            __m256i* dst16 = (__m256i*)&block_store.positions[i + 16];
            _mm256_storeu_si256(dst16, _mm256_add_epi64(_mm256_loadu_si256(dst16), right));

            __m256i* dst20 = (__m256i*)&block_store.positions[i + 20];
            _mm256_storeu_si256(dst20, _mm256_add_epi64(_mm256_loadu_si256(dst20), right));

            __m256i* dst24 = (__m256i*)&block_store.positions[i + 24];
            _mm256_storeu_si256(dst24, _mm256_add_epi64(_mm256_loadu_si256(dst24), right));

            __m256i* dst28 = (__m256i*)&block_store.positions[i + 28];
            _mm256_storeu_si256(dst28, _mm256_add_epi64(_mm256_loadu_si256(dst28), right));
        }

        // Process the remaining 0-31 elements one at a time.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
591
592#endif // __AVX2__
593
594}}}} // namespace mdds::mtv::soa::detail
595
596/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
Definition soa/block_util.hpp:23