我记得世纪之初的时候,某本古老的书上有这么一句话,大概是这个意思,无论是float还是double,在CPU内部都是转换为80位浮点数运算的,因此float和double其实是一样快的。
但是时代变化太快,这句话现在还对不对呢?写了个程序验证一下。使用的是Visual C++ 2015 Update 2,编译为x64架构。为了避免调试器的干扰,直接使用Ctrl+F5运行。
程序如下:
<code class="language-cpp">// realspeed.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include <stdio.h>
#include <windows.h>
// Number of elements in every test array (1048576 = 2^20).
#define veclen 1048576
// Single-precision data: vec1 and vec2 are the multiplicands, vec3 receives
// the element-wise product.
float vec1[veclen];
float vec2[veclen];
float vec3[veclen];
// Double-precision data with the same roles: dvec1 * dvec2 -> dvec3.
double dvec1[veclen];
double dvec2[veclen];
double dvec3[veclen];
// Benchmark entry point.
// Fills the global arrays, then times an element-wise multiply over veclen
// elements: ten runs on the float arrays followed by ten runs on the double
// arrays. Each run prints the difference between two QueryPerformanceCounter
// readings. Always returns 0.
int main()
{
// Counter readings taken immediately before (tk1) and after (tk2) a timed loop.
ULONGLONG tk1, tk2;
// Set every multiplicand to 2.0 and every result element to 0 before timing.
for (int i = 0; i < veclen; i++)
{
vec1[i] = vec2[i] = 2.0f;
vec3[i] = 0.0f;
dvec1[i] = dvec2[i] = 2.0;
dvec3[i] = 0.0;
}
printf("float:\n");
// Ten timed runs of the single-precision multiply.
for (int i = 0; i < 10; i++)
{
// NOTE(review): tk1/tk2 are ULONGLONG but are passed through a
// LARGE_INTEGER* cast; declaring them as LARGE_INTEGER and reading
// .QuadPart would avoid the cast - confirm against the Windows SDK headers.
QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
// NOTE(review): this inner i shadows the run counter i of the enclosing loop.
for (int i = 0; i < veclen; i++)
{
vec3[i] = vec1[i] * vec2[i];
}
QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
// The printed value is a raw tick difference; this function never queries
// the counter frequency, so it is not converted to a time unit.
// NOTE(review): tk2 - tk1 has type ULONGLONG (presumably unsigned 64-bit)
// while the format is %lld; %llu would match - confirm.
printf("ticks: %lld\n", tk2 - tk1);
}
printf("double:\n");
// Ten timed runs of the double-precision multiply, same structure as above
// (the same review notes on the cast, the shadowed i and %lld apply).
for (int i = 0; i < 10; i++)
{
QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
for (int i = 0; i < veclen; i++)
{
dvec3[i] = dvec1[i] * dvec2[i];
}
QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
printf("ticks: %lld\n", tk2 - tk1);
}
return 0;
}
</code>
Debug下Ctrl+F5直接运行:
Release下Ctrl+F5直接运行:
可以看到,在Release编译下,float比double快得多,而在Debug编译下则几乎没有差别。这是为什么呢?在这里我们设置了个断点,查看一下反汇编——
Debug下的反汇编:
<code class="language-txt">// realspeed.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include <stdio.h>
#include <windows.h>
#define veclen 1048576
float vec1[veclen];
float vec2[veclen];
float vec3[veclen];
double dvec1[veclen];
double dvec2[veclen];
double dvec3[veclen];
int main()
{
00007FF65F6717D0 push rbp
00007FF65F6717D2 push rdi
00007FF65F6717D3 sub rsp,1C8h
00007FF65F6717DA lea rbp,[rsp+20h]
00007FF65F6717DF mov rdi,rsp
00007FF65F6717E2 mov ecx,72h
00007FF65F6717E7 mov eax,0CCCCCCCCh
// realspeed.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include <stdio.h>
#include <windows.h>
#define veclen 1048576
float vec1[veclen];
float vec2[veclen];
float vec3[veclen];
double dvec1[veclen];
double dvec2[veclen];
double dvec3[veclen];
int main()
{
00007FF65F6717EC rep stos dword ptr [rdi]
ULONGLONG tk1, tk2;
for (int i = 0; i < veclen; i++)
00007FF65F6717EE mov dword ptr [rbp+44h],0
00007FF65F6717F5 jmp main+2Fh (07FF65F6717FFh)
00007FF65F6717F7 mov eax,dword ptr [rbp+44h]
00007FF65F6717FA inc eax
00007FF65F6717FC mov dword ptr [rbp+44h],eax
00007FF65F6717FF cmp dword ptr [rbp+44h],100000h
00007FF65F671806 jge main+0C7h (07FF65F671897h)
{
vec1[i] = vec2[i] = 2.0f;
00007FF65F67180C movsxd rax,dword ptr [rbp+44h]
00007FF65F671810 lea rcx,[vec2 (07FF65FA7C170h)]
00007FF65F671817 movss xmm0,dword ptr [__real@40000000 (07FF65F679D1Ch)]
00007FF65F67181F movss dword ptr [rcx+rax*4],xmm0
00007FF65F671824 movsxd rax,dword ptr [rbp+44h]
00007FF65F671828 lea rcx,[vec1 (07FF65F67C170h)]
00007FF65F67182F movss xmm0,dword ptr [__real@40000000 (07FF65F679D1Ch)]
00007FF65F671837 movss dword ptr [rcx+rax*4],xmm0
vec3[i] = 0.0f;
00007FF65F67183C movsxd rax,dword ptr [rbp+44h]
00007FF65F671840 lea rcx,[vec3 (07FF65FE7C170h)]
00007FF65F671847 xorps xmm0,xmm0
00007FF65F67184A movss dword ptr [rcx+rax*4],xmm0
dvec1[i] = dvec2[i] = 2.0;
00007FF65F67184F movsxd rax,dword ptr [rbp+44h]
00007FF65F671853 lea rcx,[dvec2 (07FF660A7C170h)]
00007FF65F67185A movsd xmm0,mmword ptr [__real@4000000000000000 (07FF65F679D20h)]
00007FF65F671862 movsd mmword ptr [rcx+rax*8],xmm0
00007FF65F671867 movsxd rax,dword ptr [rbp+44h]
00007FF65F67186B lea rcx,[dvec1 (07FF66027C170h)]
00007FF65F671872 movsd xmm0,mmword ptr [__real@4000000000000000 (07FF65F679D20h)]
00007FF65F67187A movsd mmword ptr [rcx+rax*8],xmm0
dvec3[i] = 0.0;
00007FF65F67187F movsxd rax,dword ptr [rbp+44h]
00007FF65F671883 lea rcx,[dvec3 (07FF66127C170h)]
00007FF65F67188A xorps xmm0,xmm0
00007FF65F67188D movsd mmword ptr [rcx+rax*8],xmm0
}
00007FF65F671892 jmp main+27h (07FF65F6717F7h)
printf("float:\n");
00007FF65F671897 lea rcx,[string "float:\n" (07FF65F679CF0h)]
00007FF65F67189E call printf (07FF65F6711CCh)
for (int i = 0; i < 10; i++)
00007FF65F6718A3 mov dword ptr [rbp+64h],0
00007FF65F6718AA jmp main+0E4h (07FF65F6718B4h)
00007FF65F6718AC mov eax,dword ptr [rbp+64h]
00007FF65F6718AF inc eax
00007FF65F6718B1 mov dword ptr [rbp+64h],eax
00007FF65F6718B4 cmp dword ptr [rbp+64h],0Ah
00007FF65F6718B8 jge main+186h (07FF65F671956h)
{
QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
00007FF65F6718BE lea rcx,[tk1]
00007FF65F6718C2 call qword ptr [__imp_QueryPerformanceCounter (07FF6621B3000h)]
for (int i = 0; i < veclen; i++)
00007FF65F6718C8 mov dword ptr [rbp+84h],0
00007FF65F6718D2 jmp main+112h (07FF65F6718E2h)
00007FF65F6718D4 mov eax,dword ptr [rbp+84h]
00007FF65F6718DA inc eax
00007FF65F6718DC mov dword ptr [rbp+84h],eax
00007FF65F6718E2 cmp dword ptr [rbp+84h],100000h
00007FF65F6718EC jge main+15Ah (07FF65F67192Ah)
{
vec3[i] = vec1[i] * vec2[i];
00007FF65F6718EE movsxd rax,dword ptr [rbp+84h]
00007FF65F6718F5 lea rcx,[vec1 (07FF65F67C170h)]
00007FF65F6718FC movsxd rdx,dword ptr [rbp+84h]
00007FF65F671903 lea r8,[vec2 (07FF65FA7C170h)]
00007FF65F67190A movss xmm0,dword ptr [rcx+rax*4]
00007FF65F67190F mulss xmm0,dword ptr [r8+rdx*4]
00007FF65F671915 movsxd rax,dword ptr [rbp+84h]
00007FF65F67191C lea rcx,[vec3 (07FF65FE7C170h)]
00007FF65F671923 movss dword ptr [rcx+rax*4],xmm0
}
00007FF65F671928 jmp main+104h (07FF65F6718D4h)
QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
00007FF65F67192A lea rcx,[tk2]
00007FF65F67192E call qword ptr [__imp_QueryPerformanceCounter (07FF6621B3000h)]
printf("ticks: %lld\n", tk2 - tk1);
00007FF65F671934 mov rax,qword ptr [tk1]
00007FF65F671938 mov rcx,qword ptr [tk2]
00007FF65F67193C sub rcx,rax
00007FF65F67193F mov rax,rcx
00007FF65F671942 mov rdx,rax
00007FF65F671945 lea rcx,[string "ticks: %lld\n" (07FF65F679D00h)]
00007FF65F67194C call printf (07FF65F6711CCh)
}
00007FF65F671951 jmp main+0DCh (07FF65F6718ACh)
printf("double:\n");
00007FF65F671956 lea rcx,[string "double:\n" (07FF65F679D10h)]
00007FF65F67195D call printf (07FF65F6711CCh)
for (int i = 0; i < 10; i++)
00007FF65F671962 mov dword ptr [rbp+0A4h],0
00007FF65F67196C jmp main+1ACh (07FF65F67197Ch)
00007FF65F67196E mov eax,dword ptr [rbp+0A4h]
00007FF65F671974 inc eax
00007FF65F671976 mov dword ptr [rbp+0A4h],eax
00007FF65F67197C cmp dword ptr [rbp+0A4h],0Ah
00007FF65F671983 jge main+251h (07FF65F671A21h)
{
QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
00007FF65F671989 lea rcx,[tk1]
00007FF65F67198D call qword ptr [__imp_QueryPerformanceCounter (07FF6621B3000h)]
for (int i = 0; i < veclen; i++)
00007FF65F671993 mov dword ptr [rbp+0C4h],0
00007FF65F67199D jmp main+1DDh (07FF65F6719ADh)
00007FF65F67199F mov eax,dword ptr [rbp+0C4h]
00007FF65F6719A5 inc eax
00007FF65F6719A7 mov dword ptr [rbp+0C4h],eax
00007FF65F6719AD cmp dword ptr [rbp+0C4h],100000h
00007FF65F6719B7 jge main+225h (07FF65F6719F5h)
{
dvec3[i] = dvec1[i] * dvec2[i];
00007FF65F6719B9 movsxd rax,dword ptr [rbp+0C4h]
00007FF65F6719C0 lea rcx,[dvec1 (07FF66027C170h)]
00007FF65F6719C7 movsxd rdx,dword ptr [rbp+0C4h]
00007FF65F6719CE lea r8,[dvec2 (07FF660A7C170h)]
00007FF65F6719D5 movsd xmm0,mmword ptr [rcx+rax*8]
00007FF65F6719DA mulsd xmm0,mmword ptr [r8+rdx*8]
00007FF65F6719E0 movsxd rax,dword ptr [rbp+0C4h]
00007FF65F6719E7 lea rcx,[dvec3 (07FF66127C170h)]
00007FF65F6719EE movsd mmword ptr [rcx+rax*8],xmm0
}
00007FF65F6719F3 jmp main+1CFh (07FF65F67199Fh)
QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
00007FF65F6719F5 lea rcx,[tk2]
00007FF65F6719F9 call qword ptr [__imp_QueryPerformanceCounter (07FF6621B3000h)]
printf("ticks: %lld\n", tk2 - tk1);
00007FF65F6719FF mov rax,qword ptr [tk1]
00007FF65F671A03 mov rcx,qword ptr [tk2]
00007FF65F671A07 sub rcx,rax
00007FF65F671A0A mov rax,rcx
00007FF65F671A0D mov rdx,rax
00007FF65F671A10 lea rcx,[string "ticks: %lld\n" (07FF65F679D00h)]
00007FF65F671A17 call printf (07FF65F6711CCh)
}
00007FF65F671A1C jmp main+19Eh (07FF65F67196Eh)
return 0;
00007FF65F671A21 xor eax,eax
}
00007FF65F671A23 mov edi,eax
00007FF65F671A25 lea rcx,[rbp-20h]
00007FF65F671A29 lea rdx,[__xt_z+220h (07FF65F679CC0h)]
00007FF65F671A30 call _RTC_CheckStackVars (07FF65F671136h)
00007FF65F671A35 mov eax,edi
00007FF65F671A37 lea rsp,[rbp+1A8h]
00007FF65F671A3E pop rdi
00007FF65F671A3F pop rbp
00007FF65F671A40 ret
</code>
Release下的反汇编:
<code class="language-txt">// realspeed.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include <stdio.h>
#include <windows.h>
#define veclen 1048576
float vec1[veclen];
float vec2[veclen];
float vec3[veclen];
double dvec1[veclen];
double dvec2[veclen];
double dvec3[veclen];
int main()
{
00007FF6A9EF1070 mov qword ptr [rsp+18h],rbx
00007FF6A9EF1075 push rbp
00007FF6A9EF1076 push rsi
00007FF6A9EF1077 push rdi
00007FF6A9EF1078 push r12
00007FF6A9EF107A push r13
00007FF6A9EF107C push r14
00007FF6A9EF107E push r15
00007FF6A9EF1080 sub rsp,20h
ULONGLONG tk1, tk2;
for (int i = 0; i < veclen; i++)
{
vec1[i] = vec2[i] = 2.0f;
00007FF6A9EF1084 mov eax,40000000h
00007FF6A9EF1089 lea r12,[vec2 (07FF6A9EF3620h)]
00007FF6A9EF1090 mov rdi,r12
00007FF6A9EF1093 lea r13,[vec1 (07FF6AAAF3620h)]
00007FF6A9EF109A mov ecx,100000h
vec3[i] = 0.0f;
00007FF6A9EF109F lea r15,[vec3 (07FF6AB6F3620h)]
00007FF6A9EF10A6 rep stos dword ptr [rdi]
00007FF6A9EF10A8 mov rdi,r13
dvec1[i] = dvec2[i] = 2.0;
00007FF6A9EF10AB lea r14,[dvec2 (07FF6AAEF3620h)]
00007FF6A9EF10B2 mov ecx,100000h
00007FF6A9EF10B7 lea rbp,[dvec1 (07FF6AA2F3620h)]
00007FF6A9EF10BE rep stos dword ptr [rdi]
00007FF6A9EF10C0 xor eax,eax
dvec3[i] = 0.0;
00007FF6A9EF10C2 lea rsi,[dvec3 (07FF6ABAF3620h)]
00007FF6A9EF10C9 mov rdi,r15
00007FF6A9EF10CC mov ecx,100000h
00007FF6A9EF10D1 rep stos dword ptr [rdi]
00007FF6A9EF10D3 mov rax,4000000000000000h
00007FF6A9EF10DD mov rdi,r14
00007FF6A9EF10E0 mov ecx,100000h
00007FF6A9EF10E5 rep stos qword ptr [rdi]
00007FF6A9EF10E8 mov rdi,rbp
00007FF6A9EF10EB mov ecx,100000h
00007FF6A9EF10F0 rep stos qword ptr [rdi]
00007FF6A9EF10F3 xor eax,eax
00007FF6A9EF10F5 mov rdi,rsi
00007FF6A9EF10F8 mov ecx,100000h
00007FF6A9EF10FD rep stos qword ptr [rdi]
}
printf("float:\n");
00007FF6A9EF1100 lea rcx,[string "float:\n" (07FF6A9EF2210h)]
00007FF6A9EF1107 call printf (07FF6A9EF1010h)
00007FF6A9EF110C mov ebx,0Ah
00007FF6A9EF1111 mov edi,ebx
for (int i = 0; i < 10; i++)
{
QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
00007FF6A9EF1113 lea rcx,[tk1]
00007FF6A9EF1118 call qword ptr [__imp_QueryPerformanceCounter (07FF6A9EF2000h)]
for (int i = 0; i < veclen; i++)
00007FF6A9EF111E xor eax,eax
00007FF6A9EF1120 mov ecx,20000h
00007FF6A9EF1125 nop word ptr [rax+rax]
{
vec3[i] = vec1[i] * vec2[i];
00007FF6A9EF1130 movups xmm0,xmmword ptr [rax+r13]
00007FF6A9EF1135 movups xmm1,xmmword ptr [rax+r12]
00007FF6A9EF113A lea rax,[rax+20h]
00007FF6A9EF113E mulps xmm1,xmm0
00007FF6A9EF1141 movups xmm0,xmmword ptr [rax+r13-10h]
00007FF6A9EF1147 movups xmmword ptr [rax+r15-20h],xmm1
00007FF6A9EF114D movups xmm1,xmmword ptr [rax+r12-10h]
00007FF6A9EF1153 mulps xmm1,xmm0
00007FF6A9EF1156 movups xmmword ptr [rax+r15-10h],xmm1
00007FF6A9EF115C sub rcx,1
00007FF6A9EF1160 jne main+0C0h (07FF6A9EF1130h)
}
QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
00007FF6A9EF1162 lea rcx,[tk2]
00007FF6A9EF1167 call qword ptr [__imp_QueryPerformanceCounter (07FF6A9EF2000h)]
printf("ticks: %lld\n", tk2 - tk1);
00007FF6A9EF116D mov rdx,qword ptr [tk2]
printf("ticks: %lld\n", tk2 - tk1);
00007FF6A9EF1172 lea rcx,[string "ticks: %lld\n" (07FF6A9EF2218h)]
00007FF6A9EF1179 sub rdx,qword ptr [tk1]
00007FF6A9EF117E call printf (07FF6A9EF1010h)
00007FF6A9EF1183 sub rdi,1
00007FF6A9EF1187 jne main+0A3h (07FF6A9EF1113h)
}
printf("double:\n");
00007FF6A9EF1189 lea rcx,[string "double:\n" (07FF6A9EF2228h)]
00007FF6A9EF1190 call printf (07FF6A9EF1010h)
for (int i = 0; i < 10; i++)
{
QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
00007FF6A9EF1195 lea rcx,[tk1]
00007FF6A9EF119A call qword ptr [__imp_QueryPerformanceCounter (07FF6A9EF2000h)]
for (int i = 0; i < veclen; i++)
00007FF6A9EF11A0 xor eax,eax
00007FF6A9EF11A2 mov ecx,40000h
00007FF6A9EF11A7 nop word ptr [rax+rax]
{
dvec3[i] = dvec1[i] * dvec2[i];
00007FF6A9EF11B0 movups xmm0,xmmword ptr [rax+rbp]
00007FF6A9EF11B4 movups xmm1,xmmword ptr [rax+r14]
00007FF6A9EF11B9 lea rax,[rax+20h]
00007FF6A9EF11BD mulpd xmm1,xmm0
00007FF6A9EF11C1 movups xmm0,xmmword ptr [rax+r14-10h]
00007FF6A9EF11C7 movups xmmword ptr [rax+rsi-20h],xmm1
00007FF6A9EF11CC movups xmm1,xmmword ptr [rax+rbp-10h]
00007FF6A9EF11D1 mulpd xmm1,xmm0
{
dvec3[i] = dvec1[i] * dvec2[i];
00007FF6A9EF11D5 movups xmmword ptr [rax+rsi-10h],xmm1
00007FF6A9EF11DA sub rcx,1
00007FF6A9EF11DE jne main+140h (07FF6A9EF11B0h)
}
QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
00007FF6A9EF11E0 lea rcx,[tk2]
00007FF6A9EF11E5 call qword ptr [__imp_QueryPerformanceCounter (07FF6A9EF2000h)]
printf("ticks: %lld\n", tk2 - tk1);
00007FF6A9EF11EB mov rdx,qword ptr [tk2]
00007FF6A9EF11F0 lea rcx,[string "ticks: %lld\n" (07FF6A9EF2218h)]
00007FF6A9EF11F7 sub rdx,qword ptr [tk1]
00007FF6A9EF11FC call printf (07FF6A9EF1010h)
00007FF6A9EF1201 sub rbx,1
00007FF6A9EF1205 jne main+125h (07FF6A9EF1195h)
}
return 0;
00007FF6A9EF1207 xor eax,eax
}
00007FF6A9EF1209 mov rbx,qword ptr [rsp+70h]
00007FF6A9EF120E add rsp,20h
00007FF6A9EF1212 pop r15
00007FF6A9EF1214 pop r14
00007FF6A9EF1216 pop r13
00007FF6A9EF1218 pop r12
00007FF6A9EF121A pop rdi
00007FF6A9EF121B pop rsi
00007FF6A9EF121C pop rbp
00007FF6A9EF121D ret
</code>
可以看到,x87 FPU的年代早已过去,编译器并没有使用x87 FPU指令,而是使用了SSE/SSE2指令。
Debug编译下,为了调试方便,将每一个循环都完整表现出来了(循环计数为100000h,即1048576),并且使用了movss/mulss和movsd/mulsd这两组标量指令,速度当然差不多。
而Release编译下,则将循环计数精简为20000h(131072=1048576/8)和40000h(262144=1048576/4),并且使用了movups/mulps和movups/mulpd这两组矢量指令,每次循环内执行2条乘法指令,总计执行40000h(262144=1048576/4)和80000h(524288=1048576/2)条乘法指令。由于SSE寄存器是固定的128位宽,每次只能放置4个32位宽的float或2个64位宽的double数据,因此使用float时,乘法指令的执行次数只有标量运算时的1/4,而使用double时,则是标量运算时的1/2。此外,在元素个数相同的情况下,double数组占用的内存是float数组的两倍,需要读写的数据量也相应翻倍。
结论就是:对于标量运算,float和double没有显著差别,而对于矢量运算,float比double要快。
因此,在计算量庞大的图形运算中,通常使用float而不是double以提高运算速度。
200字以内,仅用于支线交流,主线讨论请采用回复功能。