顺便贴上CUDA版的优化代码:
// Number of threads per block that binarizeKernel assumes when it turns
// (blockIdx.x, threadIdx.x) into a flat pixel index.
#define THREAD_N 256
// Binarization threshold. binarizeKernel compares (max + min) of a pixel's
// three color channels against THRESHOLD * 2, i.e. it thresholds the
// (max + min) / 2 average without performing the division.
#define THRESHOLD 127
/**
 * Binarizes an image of packed 4-byte pixels, one pixel per thread.
 *
 * For each pixel the first three bytes are read (named b, g, r here), the
 * smallest and largest of the three are summed, and the output byte is set
 * to 255 when that sum exceeds THRESHOLD * 2, otherwise to 0. The fourth
 * byte of each pixel is ignored.
 *
 * Launch requirements:
 *  - 1D grid with exactly THREAD_N threads per block (the index computation
 *    uses THREAD_N, not blockDim.x).
 *  - gridDim.x * THREAD_N must equal the number of pixels: the kernel has no
 *    length parameter and therefore performs no bounds check.
 *  - `in` must be 4-byte aligned (always true for cudaMalloc'd buffers).
 *
 * @param in   Device pointer to the input pixels, 4 bytes per pixel.
 * @param out  Device pointer to the output mask, 1 byte per pixel.
 */
__global__ static void binarizeKernel(unsigned char *in, unsigned char *out)
{
    // Widen before multiplying so the flat index cannot wrap in 32-bit
    // arithmetic on very large grids.
    const size_t offset = (size_t)blockIdx.x * THREAD_N + threadIdx.x;

    // Load the whole pixel with a single 4-byte read. uchar4 is 4 bytes on
    // every platform, unlike `unsigned long`, which is 8 bytes on LP64
    // systems and would make each thread step over two pixels.
    const uchar4 px = reinterpret_cast<const uchar4 *>(in)[offset];
    const int b = px.x;
    const int g = px.y;
    const int r = px.z;

    // CUDA's device min/max replace the MSVC-only __min/__max macros.
    const int lo = min(r, min(g, b));
    const int hi = max(r, max(g, b));

    // Comparing the sum against 2 * THRESHOLD avoids an integer division.
    out[offset] = ((hi + lo) > THRESHOLD * 2) ? 255 : 0;
}
优化后性能提高超过1倍。