#include <cmath>
#include <string.h>
// Four consecutive 8-bit components, x first in memory and w last.
// The filter code in this file uses it as a byte-wise view of one packed pixel.
typedef struct
{
    unsigned char x, y, z, w;
} uchar4;
// Fixed-point format used by the integer filter: a value v stands for
// v / INT_SCALE, i.e. INT_SCALE_SHF fractional bits.
#define INT_SCALE_SHF 8
#define INT_SCALE (1 << INT_SCALE_SHF)
// w0, w1, w2, and w3 are the four cubic B-spline basis functions
// Fixed-point cubic B-spline weight for the first of the four samples:
// (1 - t)^3 / 6 with t = a / INT_SCALE.
//   a        fractional position scaled by INT_SCALE; the polynomial is only
//            meaningful for 0 <= a <= INT_SCALE.
//   returns  the weight scaled by INT_SCALE, truncated (not rounded).
// static: this is a file-local helper; internal linkage avoids name clashes
// with other translation units and keeps the code linkable when built as
// C99 or later, where a plain inline definition emits no external symbol.
static __inline long w0_int(long a)
{
    // Horner form of -a^3 + 3*S*a^2 - 3*S^2*a + S^3, with S = INT_SCALE.
    long cubic = a*(a*(-a + 3*INT_SCALE) - 3*INT_SCALE*INT_SCALE) + INT_SCALE*INT_SCALE*INT_SCALE;

    // cubic carries a factor S^3; dropping S^2 leaves the result scaled by S.
    return (cubic/6)>>(INT_SCALE_SHF*2);
}
// Fixed-point cubic B-spline weight for the second of the four samples:
// (3t^3 - 6t^2 + 4) / 6 with t = a / INT_SCALE.
//   a        fractional position scaled by INT_SCALE; the polynomial is only
//            meaningful for 0 <= a <= INT_SCALE.
//   returns  the weight scaled by INT_SCALE, truncated (not rounded).
// static: this is a file-local helper; internal linkage avoids name clashes
// with other translation units and keeps the code linkable when built as
// C99 or later, where a plain inline definition emits no external symbol.
static __inline long w1_int(long a)
{
    // 3*a^3 - 6*S*a^2 + 4*S^3, with S = INT_SCALE.
    long cubic = a*a*(3*a - 6*INT_SCALE) + 4*INT_SCALE*INT_SCALE*INT_SCALE;

    // cubic carries a factor S^3; dropping S^2 leaves the result scaled by S.
    return (cubic/6)>>(INT_SCALE_SHF*2);
}
// Fixed-point cubic B-spline weight for the third of the four samples:
// (-3t^3 + 3t^2 + 3t + 1) / 6 with t = a / INT_SCALE.
//   a        fractional position scaled by INT_SCALE; the polynomial is only
//            meaningful for 0 <= a <= INT_SCALE.
//   returns  the weight scaled by INT_SCALE, truncated (not rounded).
// static: this is a file-local helper; internal linkage avoids name clashes
// with other translation units and keeps the code linkable when built as
// C99 or later, where a plain inline definition emits no external symbol.
static __inline long w2_int(long a)
{
    // Horner form of -3*a^3 + 3*S*a^2 + 3*S^2*a + S^3, with S = INT_SCALE.
    long cubic = a*(a*(-3*a + 3*INT_SCALE) + 3*INT_SCALE*INT_SCALE) + INT_SCALE*INT_SCALE*INT_SCALE;

    // cubic carries a factor S^3; dropping S^2 leaves the result scaled by S.
    return (cubic/6)>>(INT_SCALE_SHF*2);
}
// Fixed-point cubic B-spline weight for the last of the four samples:
// t^3 / 6 with t = a / INT_SCALE.
//   a        fractional position scaled by INT_SCALE; the polynomial is only
//            meaningful for 0 <= a <= INT_SCALE.
//   returns  the weight scaled by INT_SCALE, truncated (not rounded).
// static: this is a file-local helper; internal linkage avoids name clashes
// with other translation units and keeps the code linkable when built as
// C99 or later, where a plain inline definition emits no external symbol.
static __inline long w3_int(long a)
{
    long cubic = a*a*a;

    // cubic carries a factor INT_SCALE^3; dropping INT_SCALE^2 leaves the
    // result scaled by INT_SCALE.
    return (cubic/6)>>(INT_SCALE_SHF*2);
}
// Fetch one element of a row-major image, clamping the coordinates to the
// image so that out-of-range lookups repeat the nearest edge element.
//   image    first element of the image.
//   x, y     column and row to read; may lie outside the image.
//   p        elements per row; used both as the row stride and as the
//            horizontal clamp limit.
//   h        number of rows.
//   returns  image[y * p + x] after clamping, or 0 when the image is empty
//            (p == 0 or h == 0), where there is no edge element to repeat.
static __inline unsigned long texPick(unsigned long *image, int x, int y, unsigned long p, unsigned long h)
{
    unsigned long col;
    unsigned long row;

    // Without this guard p - 1 / h - 1 below would wrap around and index
    // far outside the buffer.
    if (p == 0 || h == 0)
    {
        return 0;
    }

    // Clamp negatives first so the unsigned comparisons below are safe.
    col = (x < 0) ? 0UL : (unsigned long)x;
    if (col >= p)
    {
        col = p - 1;
    }

    row = (y < 0) ? 0UL : (unsigned long)y;
    if (row >= h)
    {
        row = h - 1;
    }

    return image[row * p + col];
}
// Blend four packed pixels with the cubic B-spline weights for position x.
//   x        fractional position in fixed point (0..INT_SCALE stands for
//            0.0..1.0); passed to w0_int..w3_int.
//   c0..c3   the four samples, in order. Three 8-bit channels are filtered:
//            bits 0-7, 8-15 and 16-23 of each value.
//   returns  the filtered channels in the same bit positions, with bits
//            24-31 set to 0xff and any higher bits zero.
//
// Channels are extracted and packed with shifts instead of viewing the
// values through a uchar4 pointer. The bit positions match what the byte
// view gives on a little-endian machine, but this form does not read
// sizeof(unsigned long) bytes out of a 4-byte struct and does not depend on
// aliasing or byte order.
static __inline unsigned long cubicFilter_int(long x, unsigned long c0, unsigned long c1, unsigned long c2, unsigned long c3)
{
    long w0x = w0_int(x);
    long w1x = w1_int(x);
    long w2x = w2_int(x);
    long w3x = w3_int(x);
    long shortfall = INT_SCALE - (w0x + w1x + w2x + w3x);
    unsigned long packed = 0xff000000UL;
    int shift;

    // Each weight is truncated, so together they can add up to slightly less
    // than INT_SCALE (e.g. 42 + 170 + 42 + 0 = 254 at x == 0), which would
    // darken a uniformly coloured image. Give the missing amount to the
    // nearer of the two middle samples so the weights sum to exactly
    // INT_SCALE.
    if (2 * x < INT_SCALE)
    {
        w1x += shortfall;
    }
    else
    {
        w2x += shortfall;
    }

    for (shift = 0; shift < 24; shift += 8)
    {
        long acc = (long)((c0 >> shift) & 0xff) * w0x
                 + (long)((c1 >> shift) & 0xff) * w1x
                 + (long)((c2 >> shift) & 0xff) * w2x
                 + (long)((c3 >> shift) & 0xff) * w3x
                 + INT_SCALE / 2;   // round to nearest

        // Clamp to 0..255. With x inside 0..INT_SCALE this never triggers;
        // it keeps an out-of-range x from wrapping the channel value.
        if (acc < 0)
        {
            acc = 0;
        }
        acc >>= INT_SCALE_SHF;
        if (acc > 0xff)
        {
            acc = 0xff;
        }

        packed |= (unsigned long)acc << shift;
    }

    return packed;
}
// Sample the image at (x, y) with a separable cubic B-spline filter, using
// 16 texPick lookups: four rows are filtered horizontally, then the four row
// results are filtered vertically.
//   image    source image, passed through to texPick.
//   x, y     sample position. 0.5 is subtracted from each first, so a
//            position of (i + 0.5) is centred on column/row i.
//   pitch4   elements per source row (texPick's stride and horizontal limit).
//   height   number of source rows.
//   returns  the filtered pixel as packed by cubicFilter_int.
static __inline unsigned long tex2DBicubic(unsigned long *image, float x, float y, unsigned long pitch4, unsigned long height)
{
    unsigned long rows[4];
    int px;
    int py;
    long fx;
    long fy;
    int j;

    x -= 0.5f;
    y -= 0.5f;

    // Integer part must be a floor, not a truncation toward zero: for
    // coordinates in (-1, 0) a plain (int) cast yields 0 and a negative
    // fraction, which is outside the range the weight functions are valid for.
    px = (int)x;
    if ((float)px > x)
    {
        px--;
    }
    py = (int)y;
    if ((float)py > y)
    {
        py--;
    }

    // Fraction in fixed point, rounded to nearest; ends up in 0..INT_SCALE.
    fx = (long)((x - px) * INT_SCALE + 0.500001f);
    fy = (long)((y - py) * INT_SCALE + 0.500001f);

    // Rows py-1 .. py+2, each filtered across columns px-1 .. px+2.
    for (j = 0; j < 4; j++)
    {
        int sy = py - 1 + j;

        rows[j] = cubicFilter_int(fx,
                                  texPick(image, px - 1, sy, pitch4, height),
                                  texPick(image, px,     sy, pitch4, height),
                                  texPick(image, px + 1, sy, pitch4, height),
                                  texPick(image, px + 2, sy, pitch4, height));
    }

    return cubicFilter_int(fy, rows[0], rows[1], rows[2], rows[3]);
}
// Thread entry point: fills a band of output rows by sampling the source
// image with tex2DBicubic.
//
// param points to an array of unsigned long slots:
//   [0] source image pointer          [1] output pitch (divided by 4 to index
//   [2] output width                      output elements; presumably bytes)
//   [3] output height                 [4] number of rows in this band
//   [5] first row of this band        [6] output image pointer
//   [7] bit pattern of a float scale  [8] source pitch (divided by 4 likewise)
//   [9] source height
// Slot [7] must hold a non-zero scale: output coordinates are divided by it
// to obtain source coordinates.
// NOTE(review): pointers travel in unsigned long slots, which only works
// where unsigned long is at least as wide as a pointer — confirm for 64-bit
// builds.
//
// Always returns 0.
UINT AFX_CDECL CPUBicubicThread(LPVOID param)
{
    unsigned long *p = (unsigned long *)param;
    unsigned long *image = (unsigned long *)p[0];
    unsigned long pitch4 = p[1] / 4;
    unsigned long width = p[2];
    unsigned long height = p[3];
    unsigned long y_amount = p[4];
    unsigned long y_start = p[5];
    unsigned long *image_out = (unsigned long *)p[6];
    unsigned long pitch4_org = p[8] / 4;
    unsigned long height_org = p[9];
    unsigned long y_end = y_start + y_amount;
    unsigned long y;
    float scale;
    long x;

    // Copy the float's bytes out of the slot rather than reading the
    // unsigned long object through a float lvalue.
    memcpy(&scale, &p[7], sizeof scale);

    // Never run past the bottom of the output image.
    if (y_end > height)
    {
        y_end = height;
    }

    image_out += pitch4 * y_start;
    for (y = y_start; y < y_end; y++)
    {
        // The source row coordinate is the same for the whole output row.
        // NOTE(review): tex2DBicubic subtracts 0.5 from its coordinates but
        // no 0.5 is added here, which looks like a half-pixel offset in the
        // mapping — confirm whether that is intended.
        float v = y / scale;

        for (x = 0; (unsigned long)x < width; x++)
        {
            float u = x / scale;

            image_out[x] = tex2DBicubic(image, u, v, pitch4_org, height_org);
        }
        image_out += pitch4;
    }
    return 0;
}
双立方(bicubic)插值,没什么可解释的,看看 Photoshop 的缩放就知道了。用的是比较常规的方法:目标图像的每个点由原图像相应坐标附近的 16 个点插值得到。主函数我写成了线程函数(CPUBicubicThread),就是为了调动几个核一起跑。关键函数是 tex2DBicubic。
在E7200双核CPU上能跑出每秒处理1128万像素的速度。
计算过程我全部改成整型计算了。如果用float的话会慢约50%。
200字以内,仅用于支线交流,主线讨论请采用回复功能。