-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathneon.cpp
More file actions
554 lines (506 loc) · 22.9 KB
/
neon.cpp
File metadata and controls
554 lines (506 loc) · 22.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
//////////////////////////////////
// Introduction
//////////////////////////////////
//
// Welcome! This program is designed as a self-contained introduction to learn a bit about
// the performance problems in pixel slinging, and how SIMD (in our case: NEON on ARM) can help
// to fling your pixels faster.
//
// SIMD is an acronym for "Single Instruction Multiple Data". The idea is simple:
// instead of performing an operation on one value at a time, the CPU can
// operate on many values in parallel using wide vector registers that can store large amounts of data.
//
// For example, instead of adding:
//
// c[i] = a[i] + b[i]
//
// one element at a time in a loop, SIMD can let us add multiple elements at the same time,
// while using less instructions, and making better use of memory bandwidth too.
//
// So let's go ahead and learn some stuff! To best kick the tyres, you want to try build it
// for an ARM target (e.g. ARM macbook, or under a Yocto SDK).
//
// g++ -O2 -o test neon.cpp -DENABLE_ARM_NEON=1
//
// Try it out, read, experiment, ask questions, and have fun! ;-)
//
// STATUTORY DISCLAIMER:
// * You should profile before optimizing.
// * When optimizing like this, check what the generated code *IS* rather than assuming or guessing.
// I'm omitting both of those steps here for the sake of brevity, and focusing on demonstrating SSE/NEON.
// I will also note that I am *NOT* an expert. I just find this stuff fun, and want to share that fun.
//
// First, let's get some includes and helper types out of the way, so we can get down to business.
#include <cassert>
#include <cstdio>
#include <chrono>
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <functional>
#if defined(ENABLE_ARM_NEON)
#include <arm_neon.h>
#endif
//////////////////////////////////
// HELPERS
//////////////////////////////////
// Don't focus too much on these types. They're just to help write the example code that we care about.
// They're not really to be used in production: these were chopped out and simplified from real code
// to just provide what we need. Focus on the really interesting part (NEON), not the boring stuff.
//
// Just skip over them, and go to the more interesting part below: Search for imageContainsGray.
// A simple image type. This will serve as a wrapper around a simple RGB16 (uint16_t) array
// clang-format: off
class Image16
{
public:
Image16() = default;
Image16(uint32_t width, uint32_t height) :
stride(width),
width(width),
height(height),
m_data(width * height)
{
assert(width > 0);
assert(height > 0);
}
uint16_t* scanline(uint32_t y)
{
assert(y < height);
return &m_data[stride * y];
}
const uint16_t* scanline(uint32_t y) const
{
assert(y < height);
return &m_data[stride * y];
}
// Bad form, but these are just public to avoid having to deal with getters for them.
uint32_t stride = 0;
uint32_t width = 0;
uint32_t height = 0;
private:
std::vector<uint16_t> m_data;
};
// Now a simple rectangle type to define a subsection of our image. No functions, just keeping it nice and simple.
struct Rect
{
uint32_t x = 0;
uint32_t y = 0;
uint32_t width = 0;
uint32_t height = 0;
};
//////////////////////////////////
// The meat!
//////////////////////////////////
// And now, let's talk about what we actually want to accomplish.
//
// Graphics programming is all about pixels (and GPUs, but not today).
// In the context of software rendering, pixels are stored in a 2D buffer of some kind, where
// the contents of the buffer is defined by the type of image data you're working with.
// For the sake of keeping things simple, I've gone ahead and brought in a simple Image16 type,
// which will store straightforward RGB data in 16 bits.
//
// The buffer is accessed by "scanline", which is a row of pixels, left to right,
// with y 0 being at the start of the array. So m_data[0][0] is the first pixel, m_data[0][1] the second, and so on.
//
// Let's imagine that we want to read over a provided image, and check what type of image it is.
// If it's all black/white, or if it contains any other colors. Strange task, I know, but bear with me..
// The bottom of this file has a bunch of tests/benchmarks, so when we're ready, we'll just
// be able to run them to ensure that things seem correct, and also get an idea of how fast/slow it is.
// If you want to add your own implementation for fun, just call it from main().
//
// Anyway. Now we know what we want to do - how can we accomplish this?
// We can just iterate each pixel of the image, check it against white and black. Let's go ahead and do just that...
bool imageContainsGrayInRectV0(const Image16& image, Rect rect)
{
const uint32_t height = image.height;
const uint32_t width = image.width;
const uint32_t y0 = std::max(uint32_t(0), rect.y);
const uint32_t y1 = std::min(height, rect.y + rect.height);
const uint32_t x0 = std::max(uint32_t(0), rect.x);
const uint32_t x1 = std::min(width, rect.x + rect.width);
// Like I said, straightforward. We're just going over each scanline (y coordinate), then each x
// coordinate, and checking whether they're white (0xffff) or black (0x0000).
for (uint32_t y = y0; y < y1; ++y) {
const uint16_t* p = image.scanline(y);
for (uint32_t x = x0; x < x1; ++x) {
if (p[x] != 0x0000 && p[x] != 0xffff) {
return true;
}
}
}
return false;
}
// Let's bask in the glory of the simple for loop, and then take a moment to run our benchmarks!
// === imageContainsGrayInRectV0 ===
// all black 266.9 us OK
// all white 269.0 us OK
// all gray 0.0 us OK
// gray @ top-left 0.0 us OK
// gray @ top-right 0.3 us OK
// gray @ bottom-left 266.8 us OK
// gray @ bottom-right 267.2 us OK
// gray @ center 133.6 us OK
//
// Hmmm. They do all pass, but the data isn't exactly pretty. Why's it so slow?
//
// First, we are reading small chunks of data. It looks like a single pixel at a time, though
// under the hood, the CPU fetches more at a time (by cache line; perhaps 64 bytes).
// Even then, this still isn't very efficient: given 2 byte pixels, that would mean a cache miss,
// and new fetch, every 32 pixels. Memory has a lot more throughput than we are making use of.
//
// Secondly, we're branching *for each pixel* In the case of a 1024x1024 image, that's
// 1,048,576 loop iterations, and each iteration is doing no real work.
// We're bottlenecked on overhead :( - we can do better, surely, but how?
//
// Let's try unroll the loop! If we loop on blocks of 8 pixels at a time,
// we should be doing 8x less loop iterations, which could be quite a benefit.
//
// Ah, however.. To actually do that, we'll need to redesign our single simple loop into
// three loops. The reason for this is simple: the rect edges (left: x, right: x + width)
// might not land cleanly on a divisible-by-8 block.
//
// So to work around that, we subdivide the part of the scanlines that we want to
// look at into three pieces. A leading piece, that deals with all pixels BEFORE the LEFTMOST
// block, then all our blocks of 8 pixels, and then a trailing piece that deals with all the pixels AFTER
// the rightmost block.
//
// Then we have those three loops I mentioned:
// - leading pixels
// - block
// - trailing pixels
//
// In the code below, x08 is our LEFT block edge. We round it UP to the next 8 pixel block,
// so that it doesn't start before the LEFT hand edge of the provided rectangle.
// x18 is our RIGHT block edge. We round it DOWN, so it's inside the RIGHT hand edge.
//
// Example:
// x0=3 (our leftmost edge)
// x1=21 (our rightmost edge)
// x08 = (3+7) & ~7 = 8 (leftmost block, rounded UP)
// x18 = 21 & ~7 = 16 (rightmost block, rounded DOWN)
//
// 0 8 16 24 (blocks of 8)
// | | | |
// v v v v
// +----+----+----+----+----+----+ (scanline)
// ^ ^
// x0=3 x1=21
// ^ ^
// x08 = 8 x18 = 16
//
// So in this case, we'll have a leading loop covering x0..x08 (3..7).
// Then we'll have a block loop handling x08..x18 (8..16).
// Then we'll have a trailing loop handling x18..x1 (17..20).
//
// Another way of thinking of this is that we subdivide each scanline into three pieces:
// [---] leading pixels (x0..x08)
// [===============] 8px blocks (x08..x18)
// [--] trailing pixels (x18..x1)
//
// That's the idea - let's go and actually implement it.
// No SIMD yet - but maybe it'll be good enough...?
bool imageContainsGrayInRectV1(const Image16& image, Rect rect)
{
const uint32_t height = image.height;
const uint32_t width = image.width;
const uint32_t y0 = std::max(uint32_t(0), rect.y);
const uint32_t y1 = std::min(height, rect.y + rect.height);
const uint32_t x0 = std::max(uint32_t(0), rect.x);
const uint32_t x1 = std::min(width, rect.x + rect.width);
// Round x0 up to the nearest multiple of 8 (no-op if already aligned).
// Adding 7 ensures any unaligned value overflows into the next 8-byte
// boundary, then the mask clears the lower 3 bits to snap to it.
const uint32_t x08 = (x0 + 7) & (~0x7);
const uint32_t x18 = x1 & (~0x7);
for (uint32_t y = y0; y < y1; ++y) {
const uint16_t* p = image.scanline(y);
// Leading pixels (left edge)
for (uint32_t x = x0; x < x08 && x < x1; ++x) {
if (p[x] != 0x0000 && p[x] != 0xffff) {
return true;
}
}
// Load and test 8 pixels at a time
for (uint32_t x = x08; x < x18; x += 8) {
if ((p[x + 0] != 0x0000 && p[x + 0] != 0xffff)
|| (p[x + 1] != 0x0000 && p[x + 1] != 0xffff)
|| (p[x + 2] != 0x0000 && p[x + 2] != 0xffff)
|| (p[x + 3] != 0x0000 && p[x + 3] != 0xffff)
|| (p[x + 4] != 0x0000 && p[x + 4] != 0xffff)
|| (p[x + 5] != 0x0000 && p[x + 5] != 0xffff)
|| (p[x + 6] != 0x0000 && p[x + 6] != 0xffff)
|| (p[x + 7] != 0x0000 && p[x + 7] != 0xffff)) {
return true;
}
}
// Trailing pixels (right edge)
for (uint32_t x = x18; x < x1; ++x) {
if (p[x] != 0x0000 && p[x] != 0xffff) {
return true;
}
}
}
return false;
}
// Phew. That was a bit of a slog - did it pay off?
//
// === imageContainsGrayInRectV0 ===
// all black 266.9 us OK
// all white 269.0 us OK
// all gray 0.0 us OK
// gray @ top-left 0.0 us OK
// gray @ top-right 0.3 us OK
// gray @ bottom-left 266.8 us OK
// gray @ bottom-right 267.2 us OK
// gray @ center 133.6 us OK
//
// === imageContainsGrayInRectV1 ===
// all black 233.2 us OK
// all white 199.5 us OK
// all gray 0.0 us OK
// gray @ top-left 0.0 us OK
// gray @ top-right 0.2 us OK
// gray @ bottom-left 199.6 us OK
// gray @ bottom-right 200.0 us OK
// gray @ center 100.4 us OK
//
// Ok... That's clearly better than V0, but it's still nothing to write home about.
// gcc does a better job than clang with this (results not shown), but it's still meh.
// Can we do better still? Yes!
//
// The key insight is, we don't have to do this pixel by pixel!
// Before I go any further, let's have a quick terminology primer:
//
// Vector: A CPU register that holds multiple values at once.
// Unlike an array or std::vector, it's fixed in size, and lives in a register on the CPU.
// Lane: Each individual element inside a vector.
//
// A vector can have different data sizes (e.g. uint8, uint16, uint32), as well
// as different amounts of lanes. For example, a uint16x8_t has 8 lanes, each 16 bits wide.
//
// Intrinsic: A C/C++ function implemented by the compiler that maps more or less directly
// to a specific CPU instruction, generally with a name that only a mother could love, like vld1q_u16().
//
// That's enough terminology to be dangerous. Let's continue.
// NEON (ARM's SIMD) lets you operate on _blocks of data_ in special vector registers,
// that can fit up to 128 bits (more, if you use ARM's Scalable Vector Extensions - SVE,
// but I'm yet to be lucky enough to try it). So with a uint16_t pixel,
// that means we can test up to 8 pixels at once (128 / 16 = 8).
//
// This definitely does not come for free, though. You need to think up a way to do what you want
// with blocks, and that might require bending your brain around the problem space a little.
// Before we dive into the full implementation, let's take a closer look at how it actually works.
//
// Scalar: one comparison at a time.
//
// uint16_t pixel = 0x00FF;
// bool isBlack = (pixel == 0x0000); // false
//
// NEON: eight comparisons (lanes) at once.
//
// // load up each pixel into a lane
// uint16x8_t pixels = { 0x0000, 0xFFFF, 0x8000, 0x0000, 0xFFFF, 0x0000, 0x8000, 0xFFFF };
// // load a constant into each lane (8 lanes in total, same as above)
// uint16x8_t black = vdupq_n_u16(0x0000);
// // set each lane to 0xFFFF if equal, 0x0000 if not
// uint16x8_t result = vceqq_u16(pixels, black);
// // result: { 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0x0000, 0xFFFF, 0x0000, 0x0000 }
//
// So now each pixel gets its own independent result. One instruction, many results. That's the core idea.
//
// You can then combine results with bitwise ops, just like scalar code, using intrinsics like
// vceqq_u16, vorrq_u16, etc:
//
// uint16x8_t white = vdupq_n_u16(0xFFFF);
// uint16x8_t isWhite = vceqq_u16(pixels, white); // or pixels == white, if you prefer
// uint16x8_t isBlackOrWhite = vorrq_u16(result, isWhite); // isBlackOrWhite = result | isWhite
// // isBlackOrWhite: { 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF }
// // black white GRAY! black white black GRAY! white
//
// Now you've got the basic idea, let's go ahead and try make it work.
//
// We'll keep the same (leading, block, trailing) three loop structure we had in V1, but this time,
// we'll use NEON in the inner block loop.
bool imageContainsGrayInRectV2(const Image16& image, Rect rect)
{
#if !defined(ENABLE_ARM_NEON)
// For the sake of simplicity, V2's whole implementation is going to need NEON,
// but if you're working across platforms, you might not have NEON everywhere,
// so in the real world, you'd use a trick of some form - maybe like this -
// to fall back to a less optimal implementation while keeping things still working.
// (This is a bit of a distraction, but it's useful to understand, so I figure it's
// worth mentioning.)
return imageContainsGrayInRectV1(image, rect);
#else
const uint32_t height = image.height;
const uint32_t width = image.width;
const uint32_t y0 = std::max(uint32_t(0), rect.y);
const uint32_t y1 = std::min(height, rect.y + rect.height);
const uint32_t x0 = std::max(uint32_t(0), rect.x);
const uint32_t x1 = std::min(width, rect.x + rect.width);
// We want constants for black/white to test against.
// These duplicate the value across multiple lanes, i.e:
// white = { 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF };
const uint16x8_t black = vdupq_n_u16(0x0000);
const uint16x8_t white = vdupq_n_u16(0xffff);
// As before, round x08 UP so it's inside the left rect edge. Round x18 DOWN. Multiples of 8.
const uint32_t x08 = (x0 + 7) & (~0x7);
const uint32_t x18 = x1 & (~0x7);
for (uint32_t y = y0; y < y1; ++y) {
const uint16_t* p = image.scanline(y);
// Leading pixels (left edge)
for (uint32_t x = x0; x < x08 && x < x1; ++x) {
if (p[x] != 0x0000 && p[x] != 0xffff)
return true;
}
// Now we'll compare the inner part of the scanline in blocks of 8 pixels at a time.
for (uint32_t x = x08; x < x18; x += 8) {
// Load the block.
const uint16x8_t pix16 = vld1q_u16(&p[x]);
// vceqq_u16 (under the hood; we're using the == overload to make it more readable)
// will give us bits set to 1 if equal, else 0, just like a scalar comparison.
//
// if you care about portability, you might want to avoid the operator overloads, and
// use the intrinsic forms directly, but particularly in more dense NEON code, I find
// that I strongly prefer the overloads.
//
// most of the time, they work all of the time. when they don't, you'll typically
// end up with a cryptic compile error (e.g. I've had trouble on Apple clang).
const uint16x8_t isBlack = pix16 == black;
const uint16x8_t isWhite = pix16 == white;
// More specifically, we now want to not know "isBlack" or "isWhite", but
// "are any pixels in the block not black or white"...
// Given none of our black/white bits overlap, we can or them together safely here.
// vorrq_u16 (we're using the | overload for readability) will do that for us.
const uint16x8_t isBlackOrWhite = isBlack | isWhite;
// At this point we have 8 separate comparison results (lanes). One per pixel in the block.
// Each result is either 0xffff, if the pixel was black or white, or 0x0000 if not.
// But we want a single answer across all those pixels...
//
// We reinterpret the pixel block as 2xuint64s, and OR them together.
// If anything was NOT black or white, at least one lane will be 0x0000.
// That will propagate to the final result, and not all bits will be set.
const uint64x2_t sum64 = vreinterpretq_u64_u32(isBlackOrWhite);
const uint64_t andBits = vgetq_lane_u64(sum64, 0) & vgetq_lane_u64(sum64, 1);
if (andBits != 0xffffffffffffffff) {
return true;
}
}
// Trailing pixels (right edge)
for (uint32_t x = x18; x < x1; ++x) {
if (p[x] != 0x0000 && p[x] != 0xffff) {
return true;
}
}
}
return false;
#endif
}
// Now... After all that, let's take another look at our benchmarks?
//
// === imageContainsGrayInRectV0 ===
// all black 266.9 us OK
// all white 269.0 us OK
// all gray 0.0 us OK
// gray @ top-left 0.0 us OK
// gray @ top-right 0.3 us OK
// gray @ bottom-left 266.8 us OK
// gray @ bottom-right 267.2 us OK
// gray @ center 133.6 us OK
//
// === imageContainsGrayInRectV1 ===
// all black 233.2 us OK
// all white 199.5 us OK
// all gray 0.0 us OK
// gray @ top-left 0.0 us OK
// gray @ top-right 0.2 us OK
// gray @ bottom-left 199.6 us OK
// gray @ bottom-right 200.0 us OK
// gray @ center 100.4 us OK
//
// === imageContainsGrayInRectV2 ===
// all black 44.2 us OK
// all white 45.7 us OK
// all gray 0.0 us OK
// gray @ top-left 0.0 us OK
// gray @ top-right 0.1 us OK
// gray @ bottom-left 46.2 us OK
// gray @ bottom-right 45.7 us OK
// gray @ center 23.1 us OK
//
// Ohhh. That's much better! Nice! Job done, let's never touch this again, huh? ;-)
//////////////////////////////////
// Tests/benchmarks
//////////////////////////////////
// Just some helpers to define some tests/benchmarks below.
struct TestCase
{
const char* name;
Image16 image;
bool expected;
};
void fillImage(Image16& img, uint16_t value)
{
for (uint32_t y = 0; y < img.height; ++y) {
uint16_t* row = img.scanline(y);
for (uint32_t x = 0; x < img.width; ++x) {
row[x] = value;
}
}
}
Image16 makeFilled(uint32_t width, uint32_t height, uint16_t color)
{
Image16 img(width, height);
fillImage(img, color);
return img;
}
Image16 makeWhiteWithGrayAt(uint32_t width, uint32_t height, uint32_t gx, uint32_t gy)
{
Image16 img = makeFilled(width, height, 0xffff);
img.scanline(gy)[gx] = 0x8000;
return img;
}
using BenchFn = std::function<bool(const Image16&, Rect)>;
void bench(const char* fnName, const BenchFn& fn, TestCase* cases, size_t n, int iters = 2000)
{
printf("=== %s ===\n", fnName);
for (size_t i = 0; i < n; ++i) {
Rect r{0, 0, cases[i].image.width, cases[i].image.height};
bool result = false;
// run a few untimed iterations to warm up caches.
// without this, results will often get skewed negatively
// by being run on a cold cache, e.g. your first benchmark might be too slow.
for (int j = 0; j < 1000; ++j) {
volatile bool sink = fn(cases[i].image, r);
(void)sink;
}
auto t0 = std::chrono::high_resolution_clock::now();
for (int j = 0; j < iters; ++j) {
result = fn(cases[i].image, r);
}
auto t1 = std::chrono::high_resolution_clock::now();
double us = std::chrono::duration<double, std::micro>(t1 - t0).count() / iters;
bool ok = (result == cases[i].expected);
printf(" %-30s %8.1f us %s\n", cases[i].name, us, ok ? "OK" : "FAIL");
}
printf("\n");
}
int main()
{
TestCase cases[] = {
{"all black", makeFilled(1024, 1024, 0x0000), false},
{"all white", makeFilled(1024, 1024, 0xffff), false},
{"all gray ", makeFilled(1024, 1024, 0x8000), true},
{"gray @ top-left", makeWhiteWithGrayAt(1024, 1024, 0, 0), true},
{"gray @ top-right", makeWhiteWithGrayAt(1024, 1024, 1023, 0), true},
{"gray @ bottom-left", makeWhiteWithGrayAt(1024, 1024, 0, 1023), true},
{"gray @ bottom-right", makeWhiteWithGrayAt(1024, 1024, 1023, 1023), true},
{"gray @ center", makeWhiteWithGrayAt(1024, 1024, 512, 512), true},
};
constexpr size_t N = sizeof(cases) / sizeof(cases[0]);
bench("imageContainsGrayInRectV0", imageContainsGrayInRectV0, cases, N);
bench("imageContainsGrayInRectV1", imageContainsGrayInRectV1, cases, N);
bench("imageContainsGrayInRectV2", imageContainsGrayInRectV2, cases, N);
return 0;
}