上面演示的是四则运算,编译器自然有充足的弹性进行优化。那如果是像exp、log这样的math.h函数,编译器怎么优化呢?比如下面的代码:
<code class="language-cpp">// realspeed.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"
#include <stdio.h>
#include <windows.h>
#include <math.h>

#define veclen 1048576

float vec1[veclen];
float vec2[veclen];
float vec3[veclen];
double dvec1[veclen];
double dvec2[veclen];
double dvec3[veclen];

int main()
{
    ULONGLONG tk1, tk2;
    for (int i = 0; i < veclen; i++)
    {
        vec1[i] = vec2[i] = 2.0f;
        vec3[i] = 0.0f;
        dvec1[i] = dvec2[i] = 2.0;
        dvec3[i] = 0.0;
    }
    printf("float:\n");
    for (int i = 0; i < 10; i++)
    {
        QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
        for (int i = 0; i < veclen; i++)
        {
            //vec3[i] = vec1[i] * vec2[i];
            vec3[i] = logf(vec1[i]);
        }
        QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
        printf("ticks: %lld\n", tk2 - tk1);
    }
    printf("double:\n");
    for (int i = 0; i < 10; i++)
    {
        QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
        for (int i = 0; i < veclen; i++)
        {
            //dvec3[i] = dvec1[i] * dvec2[i];
            dvec3[i] = log(dvec1[i]);
        }
        QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
        printf("ticks: %lld\n", tk2 - tk1);
    }
    return 0;
}
</code>
我们看一下Release反汇编就知道了。Release反汇编如下:
<code class="language-txt">// realspeed.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"
#include <stdio.h>
#include <windows.h>
#include <math.h>

#define veclen 1048576

float vec1[veclen];
float vec2[veclen];
float vec3[veclen];
double dvec1[veclen];
double dvec2[veclen];
double dvec3[veclen];

int main()
{
00007FF78CE81070 mov qword ptr [rsp+18h],rbx
00007FF78CE81075 push rbp
00007FF78CE81076 push rsi
00007FF78CE81077 push rdi
00007FF78CE81078 push r12
00007FF78CE8107A push r13
00007FF78CE8107C push r14
00007FF78CE8107E push r15
00007FF78CE81080 sub rsp,20h
    ULONGLONG tk1, tk2;
    for (int i = 0; i < veclen; i++)
    {
        vec1[i] = vec2[i] = 2.0f;
00007FF78CE81084 mov eax,40000000h
00007FF78CE81089 lea rdi,[vec2 (07FF78CE84630h)]
00007FF78CE81090 mov ecx,100000h
00007FF78CE81095 lea r15,[vec1 (07FF78DA84630h)]
00007FF78CE8109C rep stos dword ptr [rdi]
00007FF78CE8109E mov rdi,r15
        vec3[i] = 0.0f;
00007FF78CE810A1 lea r14,[vec3 (07FF78E684630h)]
00007FF78CE810A8 mov ecx,100000h
        dvec1[i] = dvec2[i] = 2.0;
00007FF78CE810AD lea r13,[dvec1 (07FF78D284630h)]
00007FF78CE810B4 rep stos dword ptr [rdi]
00007FF78CE810B6 xor eax,eax
        dvec3[i] = 0.0;
00007FF78CE810B8 lea r12,[dvec3 (07FF78EA84630h)]
00007FF78CE810BF mov rdi,r14
00007FF78CE810C2 mov ecx,100000h
00007FF78CE810C7 rep stos dword ptr [rdi]
00007FF78CE810C9 mov rax,4000000000000000h
00007FF78CE810D3 lea rdi,[dvec2 (07FF78DE84630h)]
00007FF78CE810DA mov ecx,100000h
00007FF78CE810DF rep stos qword ptr [rdi]
00007FF78CE810E2 mov rdi,r13
00007FF78CE810E5 mov ecx,100000h
00007FF78CE810EA rep stos qword ptr [rdi]
00007FF78CE810ED xor eax,eax
00007FF78CE810EF mov rdi,r12
00007FF78CE810F2 mov ecx,100000h
00007FF78CE810F7 rep stos qword ptr [rdi]
    }
    printf("float:\n");
00007FF78CE810FA lea rcx,[string "float:\n" (07FF78CE83210h)]
00007FF78CE81101 call printf (07FF78CE81010h)
00007FF78CE81106 mov ebp,0Ah
00007FF78CE8110B mov esi,ebp
00007FF78CE8110D nop dword ptr [rax]
    for (int i = 0; i < 10; i++)
    {
        QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
00007FF78CE81110 lea rcx,[tk1]
00007FF78CE81115 call qword ptr [__imp_QueryPerformanceCounter (07FF78CE83000h)]
        for (int i = 0; i < veclen; i++)
00007FF78CE8111B xor edi,edi
00007FF78CE8111D mov ebx,40000h
        {
            //vec3[i] = vec1[i] * vec2[i];
            vec3[i] = logf(vec1[i]);
00007FF78CE81122 movups xmm0,xmmword ptr [rdi+r15]
00007FF78CE81127 call __vdecl_logf4 (07FF78CE81EF0h)
00007FF78CE8112C movups xmmword ptr [rdi+r14],xmm0
00007FF78CE81131 lea rdi,[rdi+10h]
00007FF78CE81135 sub rbx,1
00007FF78CE81139 jne main+0B2h (07FF78CE81122h)
        }
        QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
00007FF78CE8113B lea rcx,[tk2]
00007FF78CE81140 call qword ptr [__imp_QueryPerformanceCounter (07FF78CE83000h)]
        printf("ticks: %lld\n", tk2 - tk1);
00007FF78CE81146 mov rdx,qword ptr [tk2]
00007FF78CE8114B lea rcx,[string "ticks: %lld\n" (07FF78CE83218h)]
00007FF78CE81152 sub rdx,qword ptr [tk1]
00007FF78CE81157 call printf (07FF78CE81010h)
00007FF78CE8115C sub rsi,1
00007FF78CE81160 jne main+0A0h (07FF78CE81110h)
    }
    printf("double:\n");
00007FF78CE81162 lea rcx,[string "double:\n" (07FF78CE83228h)]
    }
    printf("double:\n");
00007FF78CE81169 call printf (07FF78CE81010h)
00007FF78CE8116E xchg ax,ax
    for (int i = 0; i < 10; i++)
    {
        QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
00007FF78CE81170 lea rcx,[tk1]
00007FF78CE81175 call qword ptr [__imp_QueryPerformanceCounter (07FF78CE83000h)]
        for (int i = 0; i < veclen; i++)
00007FF78CE8117B xor edi,edi
00007FF78CE8117D mov ebx,80000h
        {
            //dvec3[i] = dvec1[i] * dvec2[i];
            dvec3[i] = log(dvec1[i]);
00007FF78CE81182 movups xmm0,xmmword ptr [rdi+r13]
00007FF78CE81187 call __vdecl_log2 (07FF78CE81EE0h)
00007FF78CE8118C movups xmmword ptr [rdi+r12],xmm0
00007FF78CE81191 lea rdi,[rdi+10h]
00007FF78CE81195 sub rbx,1
00007FF78CE81199 jne main+112h (07FF78CE81182h)
        }
        QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
00007FF78CE8119B lea rcx,[tk2]
00007FF78CE811A0 call qword ptr [__imp_QueryPerformanceCounter (07FF78CE83000h)]
        printf("ticks: %lld\n", tk2 - tk1);
00007FF78CE811A6 mov rdx,qword ptr [tk2]
00007FF78CE811AB lea rcx,[string "ticks: %lld\n" (07FF78CE83218h)]
00007FF78CE811B2 sub rdx,qword ptr [tk1]
00007FF78CE811B7 call printf (07FF78CE81010h)
00007FF78CE811BC sub rbp,1
00007FF78CE811C0 jne main+100h (07FF78CE81170h)
    }
    return 0;
00007FF78CE811C2 xor eax,eax
}
00007FF78CE811C4 mov rbx,qword ptr [rsp+70h]
00007FF78CE811C9 add rsp,20h
00007FF78CE811CD pop r15
00007FF78CE811CF pop r14
00007FF78CE811D1 pop r13
00007FF78CE811D3 pop r12
00007FF78CE811D5 pop rdi
00007FF78CE811D6 pop rsi
00007FF78CE811D7 pop rbp
00007FF78CE811D8 ret
</code>
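可以看到,编译器并没有逐个元素地调用logf和log函数,而是调用了__vdecl_logf4和__vdecl_log2函数。从反汇编可以看出,这两个函数都通过xmm0寄存器传入和返回128位数据,每次循环地址前进10h(16字节):__vdecl_logf4一次计算4个float,__vdecl_log2一次计算2个double,所以两个内层循环的次数分别变成了40000h(即1048576/4)和80000h(即1048576/2)。因此,即使是使用了math.h中的数学函数,仍然可以实现矢量运算优化。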
时段 | 个数 |
---|---|
{{f.startingTime}}点 - {{f.endTime}}点 | {{f.fileCount}} |