上面演示的是四则运算,编译器自然有充足的弹性进行优化。那如果是像exp、log这样的math.h函数,编译器怎么优化呢?比如下面的代码:
// realspeed.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include <stdio.h>
#include <windows.h>
#include <math.h>
// Element count shared by all six test arrays (1048576 = 2^20).
#define veclen 1048576
// Single-precision data: vec1 is the logf() input and vec3 receives the results.
// vec2 is filled during initialization; its only other appearance is in a
// commented-out multiply inside main().
float vec1[veclen];
float vec2[veclen];
float vec3[veclen];
// Double-precision counterparts: dvec1 is the log() input and dvec3 receives the
// results. dvec2 is likewise only initialized and referenced by commented-out code.
double dvec1[veclen];
double dvec2[veclen];
double dvec3[veclen];
// Benchmark entry point: fills the global arrays, then times ten passes of
// logf() over the float array and ten passes of log() over the double array,
// printing the QueryPerformanceCounter difference for every pass.
// Always returns 0.
int main()
{
// Counter readings taken immediately before and after each timed pass.
ULONGLONG tk1, tk2;
// Set every input element to 2.0 and every output element to 0.0 before timing.
for (int i = 0; i < veclen; i++)
{
vec1[i] = vec2[i] = 2.0f;
vec3[i] = 0.0f;
dvec1[i] = dvec2[i] = 2.0;
dvec3[i] = 0.0;
}
printf("float:\n");
// Ten timed passes over the single-precision data.
for (int i = 0; i < 10; i++)
{
// The cast passes the ULONGLONG as the LARGE_INTEGER out-parameter; the
// call's return value is not checked.
QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
// This inner i shadows the pass counter declared by the enclosing for.
for (int i = 0; i < veclen; i++)
{
// Commented-out element-wise multiply (the arithmetic variant of this test):
//vec3[i] = vec1[i] * vec2[i];
vec3[i] = logf(vec1[i]);
}
QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
// Prints the raw counter difference; no conversion to time units is done here.
// NOTE(review): %lld is a signed conversion while tk2 - tk1 is ULONGLONG —
// confirm whether %llu was intended.
printf("ticks: %lld\n", tk2 - tk1);
}
printf("double:\n");
// Ten timed passes over the double-precision data, same structure as above.
for (int i = 0; i < 10; i++)
{
QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
// Inner i again shadows the pass counter.
for (int i = 0; i < veclen; i++)
{
// Commented-out element-wise multiply (the arithmetic variant of this test):
//dvec3[i] = dvec1[i] * dvec2[i];
dvec3[i] = log(dvec1[i]);
}
QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
// Same raw-tick output and the same signed/unsigned format caveat as the float section.
printf("ticks: %lld\n", tk2 - tk1);
}
return 0;
}
我们看一下Release反汇编就知道了。Release反汇编如下:
// realspeed.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include <stdio.h>
#include <windows.h>
#include <math.h>
#define veclen 1048576
float vec1[veclen];
float vec2[veclen];
float vec3[veclen];
double dvec1[veclen];
double dvec2[veclen];
double dvec3[veclen];
int main()
{
00007FF78CE81070 mov qword ptr [rsp+18h],rbx
00007FF78CE81075 push rbp
00007FF78CE81076 push rsi
00007FF78CE81077 push rdi
00007FF78CE81078 push r12
00007FF78CE8107A push r13
00007FF78CE8107C push r14
00007FF78CE8107E push r15
00007FF78CE81080 sub rsp,20h
ULONGLONG tk1, tk2;
for (int i = 0; i < veclen; i++)
{
vec1[i] = vec2[i] = 2.0f;
00007FF78CE81084 mov eax,40000000h
00007FF78CE81089 lea rdi,[vec2 (07FF78CE84630h)]
00007FF78CE81090 mov ecx,100000h
00007FF78CE81095 lea r15,[vec1 (07FF78DA84630h)]
00007FF78CE8109C rep stos dword ptr [rdi]
00007FF78CE8109E mov rdi,r15
vec3[i] = 0.0f;
00007FF78CE810A1 lea r14,[vec3 (07FF78E684630h)]
00007FF78CE810A8 mov ecx,100000h
dvec1[i] = dvec2[i] = 2.0;
00007FF78CE810AD lea r13,[dvec1 (07FF78D284630h)]
00007FF78CE810B4 rep stos dword ptr [rdi]
00007FF78CE810B6 xor eax,eax
dvec3[i] = 0.0;
00007FF78CE810B8 lea r12,[dvec3 (07FF78EA84630h)]
00007FF78CE810BF mov rdi,r14
00007FF78CE810C2 mov ecx,100000h
00007FF78CE810C7 rep stos dword ptr [rdi]
00007FF78CE810C9 mov rax,4000000000000000h
00007FF78CE810D3 lea rdi,[dvec2 (07FF78DE84630h)]
00007FF78CE810DA mov ecx,100000h
00007FF78CE810DF rep stos qword ptr [rdi]
00007FF78CE810E2 mov rdi,r13
00007FF78CE810E5 mov ecx,100000h
00007FF78CE810EA rep stos qword ptr [rdi]
00007FF78CE810ED xor eax,eax
00007FF78CE810EF mov rdi,r12
00007FF78CE810F2 mov ecx,100000h
00007FF78CE810F7 rep stos qword ptr [rdi]
}
printf("float:\n");
00007FF78CE810FA lea rcx,[string "float:\n" (07FF78CE83210h)]
00007FF78CE81101 call printf (07FF78CE81010h)
00007FF78CE81106 mov ebp,0Ah
00007FF78CE8110B mov esi,ebp
00007FF78CE8110D nop dword ptr [rax]
for (int i = 0; i < 10; i++)
{
QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
00007FF78CE81110 lea rcx,[tk1]
00007FF78CE81115 call qword ptr [__imp_QueryPerformanceCounter (07FF78CE83000h)]
for (int i = 0; i < veclen; i++)
00007FF78CE8111B xor edi,edi
00007FF78CE8111D mov ebx,40000h
{
//vec3[i] = vec1[i] * vec2[i];
vec3[i] = logf(vec1[i]);
00007FF78CE81122 movups xmm0,xmmword ptr [rdi+r15]
00007FF78CE81127 call __vdecl_logf4 (07FF78CE81EF0h)
00007FF78CE8112C movups xmmword ptr [rdi+r14],xmm0
00007FF78CE81131 lea rdi,[rdi+10h]
00007FF78CE81135 sub rbx,1
00007FF78CE81139 jne main+0B2h (07FF78CE81122h)
}
QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
00007FF78CE8113B lea rcx,[tk2]
00007FF78CE81140 call qword ptr [__imp_QueryPerformanceCounter (07FF78CE83000h)]
printf("ticks: %lld\n", tk2 - tk1);
00007FF78CE81146 mov rdx,qword ptr [tk2]
00007FF78CE8114B lea rcx,[string "ticks: %lld\n" (07FF78CE83218h)]
00007FF78CE81152 sub rdx,qword ptr [tk1]
00007FF78CE81157 call printf (07FF78CE81010h)
00007FF78CE8115C sub rsi,1
00007FF78CE81160 jne main+0A0h (07FF78CE81110h)
}
printf("double:\n");
00007FF78CE81162 lea rcx,[string "double:\n" (07FF78CE83228h)]
}
printf("double:\n");
00007FF78CE81169 call printf (07FF78CE81010h)
00007FF78CE8116E xchg ax,ax
for (int i = 0; i < 10; i++)
{
QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
00007FF78CE81170 lea rcx,[tk1]
00007FF78CE81175 call qword ptr [__imp_QueryPerformanceCounter (07FF78CE83000h)]
for (int i = 0; i < veclen; i++)
00007FF78CE8117B xor edi,edi
00007FF78CE8117D mov ebx,80000h
{
//dvec3[i] = dvec1[i] * dvec2[i];
dvec3[i] = log(dvec1[i]);
00007FF78CE81182 movups xmm0,xmmword ptr [rdi+r13]
00007FF78CE81187 call __vdecl_log2 (07FF78CE81EE0h)
00007FF78CE8118C movups xmmword ptr [rdi+r12],xmm0
00007FF78CE81191 lea rdi,[rdi+10h]
00007FF78CE81195 sub rbx,1
00007FF78CE81199 jne main+112h (07FF78CE81182h)
}
QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
00007FF78CE8119B lea rcx,[tk2]
00007FF78CE811A0 call qword ptr [__imp_QueryPerformanceCounter (07FF78CE83000h)]
printf("ticks: %lld\n", tk2 - tk1);
00007FF78CE811A6 mov rdx,qword ptr [tk2]
00007FF78CE811AB lea rcx,[string "ticks: %lld\n" (07FF78CE83218h)]
00007FF78CE811B2 sub rdx,qword ptr [tk1]
00007FF78CE811B7 call printf (07FF78CE81010h)
00007FF78CE811BC sub rbp,1
00007FF78CE811C0 jne main+100h (07FF78CE81170h)
}
return 0;
00007FF78CE811C2 xor eax,eax
}
00007FF78CE811C4 mov rbx,qword ptr [rsp+70h]
00007FF78CE811C9 add rsp,20h
00007FF78CE811CD pop r15
00007FF78CE811CF pop r14
00007FF78CE811D1 pop r13
00007FF78CE811D3 pop r12
00007FF78CE811D5 pop rdi
00007FF78CE811D6 pop rsi
00007FF78CE811D7 pop rbp
00007FF78CE811D8 ret
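可以看到,编译器并没有逐个元素地调用logf和log函数,而是调用了__vdecl_logf4和__vdecl_log2函数。从反汇编中的循环次数(float为40000h,double为80000h,分别是数组长度100000h的1/4和1/2)以及每次迭代10h字节的地址步进可以看出,__vdecl_logf4每次调用处理4个float,__vdecl_log2每次调用处理2个double。因此,即使是使用了math.h中的数学函数,仍然可以实现矢量运算优化。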
时段 | 个数 |
---|---|
{{f.startingTime}}点 - {{f.endTime}}点 | {{f.fileCount}} |