加载中
加载中
表情图片
评为精选
鼓励
加载中...
分享
加载中...
文件下载
加载中...
修改排序
加载中...
float和double一样快吗?
acmilan2016/06/11软件综合 IP:四川

我记得世纪之初的时候,某本古老的书上有这么一句话,大概是这个意思:无论是float还是double,在CPU内部(x87 FPU)都是转换为80位扩展精度浮点数运算的,因此float和double其实是一样快的。

但是时代变化太快,这句话现在还对不对呢?写了个程序验证一下。使用的是Visual C++ 2015 Update 2,编译为x64架构。为了避免调试器的干扰,直接使用Ctrl+F5运行。

程序如下:

Other
// realspeed.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"
#include <stdio.h>
#include <windows.h>

#define veclen 1048576

float vec1[veclen];
float vec2[veclen];
float vec3[veclen];
double dvec1[veclen];
double dvec2[veclen];
double dvec3[veclen];

int main()
{
    ULONGLONG tk1, tk2;
    for (int i = 0; i < veclen; i++)
    {
        vec1[i] = vec2[i] = 2.0f;
        vec3[i] = 0.0f;
        dvec1[i] = dvec2[i] = 2.0;
        dvec3[i] = 0.0;
    }
    printf("float:\n");
    for (int i = 0; i < 10; i++)
    {
        QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
        for (int i = 0; i < veclen; i++)
        {
            vec3[i] = vec1[i] * vec2[i];
        }
        QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
        printf("ticks: %lld\n", tk2 - tk1);
    }
    printf("double:\n");
    for (int i = 0; i < 10; i++)
    {
        QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
        for (int i = 0; i < veclen; i++)
        {
            dvec3[i] = dvec1[i] * dvec2[i];
        }
        QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
        printf("ticks: %lld\n", tk2 - tk1);
    }
    return 0;
}

Debug下Ctrl+F5直接运行:

debug.png

Release下Ctrl+F5直接运行:

release.png

可以看到,在Release编译下,float比double快得多,而在Debug编译下则几乎没有差别。这是为什么呢?在这里我们设置了个断点,查看一下反汇编——

Debug下的反汇编:

Other
// realspeed.cpp : 定义控制台应用程序的入口点。 // #include "stdafx.h" #include <stdio.h> #include <windows.h> #define veclen 1048576 float vec1[veclen]; float vec2[veclen]; float vec3[veclen]; double dvec1[veclen]; double dvec2[veclen]; double dvec3[veclen]; int main() { 00007FF65F6717D0 push rbp 00007FF65F6717D2 push rdi 00007FF65F6717D3 sub rsp,1C8h 00007FF65F6717DA lea rbp,[rsp+20h] 00007FF65F6717DF mov rdi,rsp 00007FF65F6717E2 mov ecx,72h 00007FF65F6717E7 mov eax,0CCCCCCCCh // realspeed.cpp : 定义控制台应用程序的入口点。 // #include "stdafx.h" #include <stdio.h> #include <windows.h> #define veclen 1048576 float vec1[veclen]; float vec2[veclen]; float vec3[veclen]; double dvec1[veclen]; double dvec2[veclen]; double dvec3[veclen]; int main() { 00007FF65F6717EC rep stos dword ptr [rdi] ULONGLONG tk1, tk2; for (int i = 0; i < veclen; i++) 00007FF65F6717EE mov dword ptr [rbp+44h],0 00007FF65F6717F5 jmp main+2Fh (07FF65F6717FFh) 00007FF65F6717F7 mov eax,dword ptr [rbp+44h] 00007FF65F6717FA inc eax 00007FF65F6717FC mov dword ptr [rbp+44h],eax 00007FF65F6717FF cmp dword ptr [rbp+44h],100000h 00007FF65F671806 jge main+0C7h (07FF65F671897h) { vec1[i] = vec2[i] = 2.0f; 00007FF65F67180C movsxd rax,dword ptr [rbp+44h] 00007FF65F671810 lea rcx,[vec2 (07FF65FA7C170h)] 00007FF65F671817 movss xmm0,dword ptr [__real@40000000 (07FF65F679D1Ch)] 00007FF65F67181F movss dword ptr [rcx+rax*4],xmm0 00007FF65F671824 movsxd rax,dword ptr [rbp+44h] 00007FF65F671828 lea rcx,[vec1 (07FF65F67C170h)] 00007FF65F67182F movss xmm0,dword ptr [__real@40000000 (07FF65F679D1Ch)] 00007FF65F671837 movss dword ptr [rcx+rax*4],xmm0 vec3[i] = 0.0f; 00007FF65F67183C movsxd rax,dword ptr [rbp+44h] 00007FF65F671840 lea rcx,[vec3 (07FF65FE7C170h)] 00007FF65F671847 xorps xmm0,xmm0 00007FF65F67184A movss dword ptr [rcx+rax*4],xmm0 dvec1[i] = dvec2[i] = 2.0; 00007FF65F67184F movsxd rax,dword ptr [rbp+44h] 00007FF65F671853 lea rcx,[dvec2 (07FF660A7C170h)] 00007FF65F67185A movsd xmm0,mmword ptr [__real@4000000000000000 (07FF65F679D20h)] 
00007FF65F671862 movsd mmword ptr [rcx+rax*8],xmm0 00007FF65F671867 movsxd rax,dword ptr [rbp+44h] 00007FF65F67186B lea rcx,[dvec1 (07FF66027C170h)] 00007FF65F671872 movsd xmm0,mmword ptr [__real@4000000000000000 (07FF65F679D20h)] 00007FF65F67187A movsd mmword ptr [rcx+rax*8],xmm0 dvec3[i] = 0.0; 00007FF65F67187F movsxd rax,dword ptr [rbp+44h] 00007FF65F671883 lea rcx,[dvec3 (07FF66127C170h)] 00007FF65F67188A xorps xmm0,xmm0 00007FF65F67188D movsd mmword ptr [rcx+rax*8],xmm0 } 00007FF65F671892 jmp main+27h (07FF65F6717F7h) printf("float:\n"); 00007FF65F671897 lea rcx,[string "float:\n" (07FF65F679CF0h)] 00007FF65F67189E call printf (07FF65F6711CCh) for (int i = 0; i < 10; i++) 00007FF65F6718A3 mov dword ptr [rbp+64h],0 00007FF65F6718AA jmp main+0E4h (07FF65F6718B4h) 00007FF65F6718AC mov eax,dword ptr [rbp+64h] 00007FF65F6718AF inc eax 00007FF65F6718B1 mov dword ptr [rbp+64h],eax 00007FF65F6718B4 cmp dword ptr [rbp+64h],0Ah 00007FF65F6718B8 jge main+186h (07FF65F671956h) { QueryPerformanceCounter((LARGE_INTEGER*)&tk1); 00007FF65F6718BE lea rcx,[tk1] 00007FF65F6718C2 call qword ptr [__imp_QueryPerformanceCounter (07FF6621B3000h)] for (int i = 0; i < veclen; i++) 00007FF65F6718C8 mov dword ptr [rbp+84h],0 00007FF65F6718D2 jmp main+112h (07FF65F6718E2h) 00007FF65F6718D4 mov eax,dword ptr [rbp+84h] 00007FF65F6718DA inc eax 00007FF65F6718DC mov dword ptr [rbp+84h],eax 00007FF65F6718E2 cmp dword ptr [rbp+84h],100000h 00007FF65F6718EC jge main+15Ah (07FF65F67192Ah) { vec3[i] = vec1[i] * vec2[i]; 00007FF65F6718EE movsxd rax,dword ptr [rbp+84h] 00007FF65F6718F5 lea rcx,[vec1 (07FF65F67C170h)] 00007FF65F6718FC movsxd rdx,dword ptr [rbp+84h] 00007FF65F671903 lea r8,[vec2 (07FF65FA7C170h)] 00007FF65F67190A movss xmm0,dword ptr [rcx+rax*4] 00007FF65F67190F mulss xmm0,dword ptr [r8+rdx*4] 00007FF65F671915 movsxd rax,dword ptr [rbp+84h] 00007FF65F67191C lea rcx,[vec3 (07FF65FE7C170h)] 00007FF65F671923 movss dword ptr [rcx+rax*4],xmm0 } 00007FF65F671928 jmp main+104h 
(07FF65F6718D4h) QueryPerformanceCounter((LARGE_INTEGER*)&tk2); 00007FF65F67192A lea rcx,[tk2] 00007FF65F67192E call qword ptr [__imp_QueryPerformanceCounter (07FF6621B3000h)] printf("ticks: %lld\n", tk2 - tk1); 00007FF65F671934 mov rax,qword ptr [tk1] 00007FF65F671938 mov rcx,qword ptr [tk2] 00007FF65F67193C sub rcx,rax 00007FF65F67193F mov rax,rcx 00007FF65F671942 mov rdx,rax 00007FF65F671945 lea rcx,[string "ticks: %lld\n" (07FF65F679D00h)] 00007FF65F67194C call printf (07FF65F6711CCh) } 00007FF65F671951 jmp main+0DCh (07FF65F6718ACh) printf("double:\n"); 00007FF65F671956 lea rcx,[string "double:\n" (07FF65F679D10h)] 00007FF65F67195D call printf (07FF65F6711CCh) for (int i = 0; i < 10; i++) 00007FF65F671962 mov dword ptr [rbp+0A4h],0 00007FF65F67196C jmp main+1ACh (07FF65F67197Ch) 00007FF65F67196E mov eax,dword ptr [rbp+0A4h] 00007FF65F671974 inc eax 00007FF65F671976 mov dword ptr [rbp+0A4h],eax 00007FF65F67197C cmp dword ptr [rbp+0A4h],0Ah 00007FF65F671983 jge main+251h (07FF65F671A21h) { QueryPerformanceCounter((LARGE_INTEGER*)&tk1); 00007FF65F671989 lea rcx,[tk1] 00007FF65F67198D call qword ptr [__imp_QueryPerformanceCounter (07FF6621B3000h)] for (int i = 0; i < veclen; i++) 00007FF65F671993 mov dword ptr [rbp+0C4h],0 00007FF65F67199D jmp main+1DDh (07FF65F6719ADh) 00007FF65F67199F mov eax,dword ptr [rbp+0C4h] 00007FF65F6719A5 inc eax 00007FF65F6719A7 mov dword ptr [rbp+0C4h],eax 00007FF65F6719AD cmp dword ptr [rbp+0C4h],100000h 00007FF65F6719B7 jge main+225h (07FF65F6719F5h) { dvec3[i] = dvec1[i] * dvec2[i]; 00007FF65F6719B9 movsxd rax,dword ptr [rbp+0C4h] 00007FF65F6719C0 lea rcx,[dvec1 (07FF66027C170h)] 00007FF65F6719C7 movsxd rdx,dword ptr [rbp+0C4h] 00007FF65F6719CE lea r8,[dvec2 (07FF660A7C170h)] 00007FF65F6719D5 movsd xmm0,mmword ptr [rcx+rax*8] 00007FF65F6719DA mulsd xmm0,mmword ptr [r8+rdx*8] 00007FF65F6719E0 movsxd rax,dword ptr [rbp+0C4h] 00007FF65F6719E7 lea rcx,[dvec3 (07FF66127C170h)] 00007FF65F6719EE movsd mmword ptr [rcx+rax*8],xmm0 } 
00007FF65F6719F3 jmp main+1CFh (07FF65F67199Fh) QueryPerformanceCounter((LARGE_INTEGER*)&tk2); 00007FF65F6719F5 lea rcx,[tk2] 00007FF65F6719F9 call qword ptr [__imp_QueryPerformanceCounter (07FF6621B3000h)] printf("ticks: %lld\n", tk2 - tk1); 00007FF65F6719FF mov rax,qword ptr [tk1] 00007FF65F671A03 mov rcx,qword ptr [tk2] 00007FF65F671A07 sub rcx,rax 00007FF65F671A0A mov rax,rcx 00007FF65F671A0D mov rdx,rax 00007FF65F671A10 lea rcx,[string "ticks: %lld\n" (07FF65F679D00h)] 00007FF65F671A17 call printf (07FF65F6711CCh) } 00007FF65F671A1C jmp main+19Eh (07FF65F67196Eh) return 0; 00007FF65F671A21 xor eax,eax } 00007FF65F671A23 mov edi,eax 00007FF65F671A25 lea rcx,[rbp-20h] 00007FF65F671A29 lea rdx,[__xt_z+220h (07FF65F679CC0h)] 00007FF65F671A30 call _RTC_CheckStackVars (07FF65F671136h) 00007FF65F671A35 mov eax,edi 00007FF65F671A37 lea rsp,[rbp+1A8h] 00007FF65F671A3E pop rdi 00007FF65F671A3F pop rbp 00007FF65F671A40 ret </windows.h></stdio.h></windows.h></stdio.h>

Release下的反汇编:

Other
// realspeed.cpp : 定义控制台应用程序的入口点。 // #include "stdafx.h" #include <stdio.h> #include <windows.h> #define veclen 1048576 float vec1[veclen]; float vec2[veclen]; float vec3[veclen]; double dvec1[veclen]; double dvec2[veclen]; double dvec3[veclen]; int main() { 00007FF6A9EF1070 mov qword ptr [rsp+18h],rbx 00007FF6A9EF1075 push rbp 00007FF6A9EF1076 push rsi 00007FF6A9EF1077 push rdi 00007FF6A9EF1078 push r12 00007FF6A9EF107A push r13 00007FF6A9EF107C push r14 00007FF6A9EF107E push r15 00007FF6A9EF1080 sub rsp,20h ULONGLONG tk1, tk2; for (int i = 0; i < veclen; i++) { vec1[i] = vec2[i] = 2.0f; 00007FF6A9EF1084 mov eax,40000000h 00007FF6A9EF1089 lea r12,[vec2 (07FF6A9EF3620h)] 00007FF6A9EF1090 mov rdi,r12 00007FF6A9EF1093 lea r13,[vec1 (07FF6AAAF3620h)] 00007FF6A9EF109A mov ecx,100000h vec3[i] = 0.0f; 00007FF6A9EF109F lea r15,[vec3 (07FF6AB6F3620h)] 00007FF6A9EF10A6 rep stos dword ptr [rdi] 00007FF6A9EF10A8 mov rdi,r13 dvec1[i] = dvec2[i] = 2.0; 00007FF6A9EF10AB lea r14,[dvec2 (07FF6AAEF3620h)] 00007FF6A9EF10B2 mov ecx,100000h 00007FF6A9EF10B7 lea rbp,[dvec1 (07FF6AA2F3620h)] 00007FF6A9EF10BE rep stos dword ptr [rdi] 00007FF6A9EF10C0 xor eax,eax dvec3[i] = 0.0; 00007FF6A9EF10C2 lea rsi,[dvec3 (07FF6ABAF3620h)] 00007FF6A9EF10C9 mov rdi,r15 00007FF6A9EF10CC mov ecx,100000h 00007FF6A9EF10D1 rep stos dword ptr [rdi] 00007FF6A9EF10D3 mov rax,4000000000000000h 00007FF6A9EF10DD mov rdi,r14 00007FF6A9EF10E0 mov ecx,100000h 00007FF6A9EF10E5 rep stos qword ptr [rdi] 00007FF6A9EF10E8 mov rdi,rbp 00007FF6A9EF10EB mov ecx,100000h 00007FF6A9EF10F0 rep stos qword ptr [rdi] 00007FF6A9EF10F3 xor eax,eax 00007FF6A9EF10F5 mov rdi,rsi 00007FF6A9EF10F8 mov ecx,100000h 00007FF6A9EF10FD rep stos qword ptr [rdi] } printf("float:\n"); 00007FF6A9EF1100 lea rcx,[string "float:\n" (07FF6A9EF2210h)] 00007FF6A9EF1107 call printf (07FF6A9EF1010h) 00007FF6A9EF110C mov ebx,0Ah 00007FF6A9EF1111 mov edi,ebx for (int i = 0; i < 10; i++) { QueryPerformanceCounter((LARGE_INTEGER*)&tk1); 00007FF6A9EF1113 lea 
rcx,[tk1] 00007FF6A9EF1118 call qword ptr [__imp_QueryPerformanceCounter (07FF6A9EF2000h)] for (int i = 0; i < veclen; i++) 00007FF6A9EF111E xor eax,eax 00007FF6A9EF1120 mov ecx,20000h 00007FF6A9EF1125 nop word ptr [rax+rax] { vec3[i] = vec1[i] * vec2[i]; 00007FF6A9EF1130 movups xmm0,xmmword ptr [rax+r13] 00007FF6A9EF1135 movups xmm1,xmmword ptr [rax+r12] 00007FF6A9EF113A lea rax,[rax+20h] 00007FF6A9EF113E mulps xmm1,xmm0 00007FF6A9EF1141 movups xmm0,xmmword ptr [rax+r13-10h] 00007FF6A9EF1147 movups xmmword ptr [rax+r15-20h],xmm1 00007FF6A9EF114D movups xmm1,xmmword ptr [rax+r12-10h] 00007FF6A9EF1153 mulps xmm1,xmm0 00007FF6A9EF1156 movups xmmword ptr [rax+r15-10h],xmm1 00007FF6A9EF115C sub rcx,1 00007FF6A9EF1160 jne main+0C0h (07FF6A9EF1130h) } QueryPerformanceCounter((LARGE_INTEGER*)&tk2); 00007FF6A9EF1162 lea rcx,[tk2] 00007FF6A9EF1167 call qword ptr [__imp_QueryPerformanceCounter (07FF6A9EF2000h)] printf("ticks: %lld\n", tk2 - tk1); 00007FF6A9EF116D mov rdx,qword ptr [tk2] printf("ticks: %lld\n", tk2 - tk1); 00007FF6A9EF1172 lea rcx,[string "ticks: %lld\n" (07FF6A9EF2218h)] 00007FF6A9EF1179 sub rdx,qword ptr [tk1] 00007FF6A9EF117E call printf (07FF6A9EF1010h) 00007FF6A9EF1183 sub rdi,1 00007FF6A9EF1187 jne main+0A3h (07FF6A9EF1113h) } printf("double:\n"); 00007FF6A9EF1189 lea rcx,[string "double:\n" (07FF6A9EF2228h)] 00007FF6A9EF1190 call printf (07FF6A9EF1010h) for (int i = 0; i < 10; i++) { QueryPerformanceCounter((LARGE_INTEGER*)&tk1); 00007FF6A9EF1195 lea rcx,[tk1] 00007FF6A9EF119A call qword ptr [__imp_QueryPerformanceCounter (07FF6A9EF2000h)] for (int i = 0; i < veclen; i++) 00007FF6A9EF11A0 xor eax,eax 00007FF6A9EF11A2 mov ecx,40000h 00007FF6A9EF11A7 nop word ptr [rax+rax] { dvec3[i] = dvec1[i] * dvec2[i]; 00007FF6A9EF11B0 movups xmm0,xmmword ptr [rax+rbp] 00007FF6A9EF11B4 movups xmm1,xmmword ptr [rax+r14] 00007FF6A9EF11B9 lea rax,[rax+20h] 00007FF6A9EF11BD mulpd xmm1,xmm0 00007FF6A9EF11C1 movups xmm0,xmmword ptr [rax+r14-10h] 00007FF6A9EF11C7 movups 
xmmword ptr [rax+rsi-20h],xmm1 00007FF6A9EF11CC movups xmm1,xmmword ptr [rax+rbp-10h] 00007FF6A9EF11D1 mulpd xmm1,xmm0 { dvec3[i] = dvec1[i] * dvec2[i]; 00007FF6A9EF11D5 movups xmmword ptr [rax+rsi-10h],xmm1 00007FF6A9EF11DA sub rcx,1 00007FF6A9EF11DE jne main+140h (07FF6A9EF11B0h) } QueryPerformanceCounter((LARGE_INTEGER*)&tk2); 00007FF6A9EF11E0 lea rcx,[tk2] 00007FF6A9EF11E5 call qword ptr [__imp_QueryPerformanceCounter (07FF6A9EF2000h)] printf("ticks: %lld\n", tk2 - tk1); 00007FF6A9EF11EB mov rdx,qword ptr [tk2] 00007FF6A9EF11F0 lea rcx,[string "ticks: %lld\n" (07FF6A9EF2218h)] 00007FF6A9EF11F7 sub rdx,qword ptr [tk1] 00007FF6A9EF11FC call printf (07FF6A9EF1010h) 00007FF6A9EF1201 sub rbx,1 00007FF6A9EF1205 jne main+125h (07FF6A9EF1195h) } return 0; 00007FF6A9EF1207 xor eax,eax } 00007FF6A9EF1209 mov rbx,qword ptr [rsp+70h] 00007FF6A9EF120E add rsp,20h 00007FF6A9EF1212 pop r15 00007FF6A9EF1214 pop r14 00007FF6A9EF1216 pop r13 00007FF6A9EF1218 pop r12 00007FF6A9EF121A pop rdi 00007FF6A9EF121B pop rsi 00007FF6A9EF121C pop rbp 00007FF6A9EF121D ret </windows.h></stdio.h>

可以看到,现在早已过了x87 FPU的年代,编译器并没有使用FPU指令,而是使用的SSE指令。

Debug编译下,为了调试方便,编译器没有进行优化,循环按源代码原样逐个元素执行(循环计数为100000h,即1048576),循环变量每次都要从栈上读出再写回,并且使用了movss/mulss和movsd/mulsd这两组标量指令,每条指令只处理一个元素,因此float和double的速度当然差不多。

而Release编译下,编译器对循环进行了向量化和2倍展开:循环计数精简为20000h(131072=1048576/8)和40000h(262144=1048576/4),并且使用了movups/mulps和movups/mulpd这两组矢量指令,每次循环内进行2次矢量乘法,总计进行40000h(262144=1048576/4)和80000h(524288=1048576/2)次乘法。由于SSE寄存器是固定的128位宽,每次只能放置4个32位宽的float或2个64位宽的double数据,因此使用float时,乘法指令的执行次数只有标量版本的1/4,而使用double时,则是标量版本的1/2。另外,double数组占用的内存是float数组的2倍(每个数组8MB对4MB),需要读写的数据量也相应翻倍。

结论就是:对于标量运算,float和double没有显著差别;而对于矢量运算,同样宽度的寄存器一次能处理的float数量是double的2倍,因此float比double要快。

因此,在计算量庞大的图形运算中,通常使用float而不是double以提高运算速度。

[修改于 9年3个月前 - 2016/06/11 12:32:47]

来自:计算机科学 / 软件综合
7
 
新版本公告
~~空空如也
acmilan 作者
9年3个月前 修改于 9年3个月前 IP:四川
821328

上面演示的是四则运算,编译器自然有充足的弹性进行优化。那如果是像exp、log这样的math.h函数,编译器怎么优化呢?比如下面的代码:

Other
// realspeed.cpp : 定义控制台应用程序的入口点。
//

#include "stdafx.h"
#include <stdio.h>
#include <windows.h>
#include <math.h>

#define veclen 1048576

float vec1[veclen];
float vec2[veclen];
float vec3[veclen];
double dvec1[veclen];
double dvec2[veclen];
double dvec3[veclen];

int main()
{
    ULONGLONG tk1, tk2;
    for (int i = 0; i < veclen; i++)
    {
        vec1[i] = vec2[i] = 2.0f;
        vec3[i] = 0.0f;
        dvec1[i] = dvec2[i] = 2.0;
        dvec3[i] = 0.0;
    }
    printf("float:\n");
    for (int i = 0; i < 10; i++)
    {
        QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
        for (int i = 0; i < veclen; i++)
        {
            //vec3[i] = vec1[i] * vec2[i];
            vec3[i] = logf(vec1[i]);
        }
        QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
        printf("ticks: %lld\n", tk2 - tk1);
    }
    printf("double:\n");
    for (int i = 0; i < 10; i++)
    {
        QueryPerformanceCounter((LARGE_INTEGER*)&tk1);
        for (int i = 0; i < veclen; i++)
        {
            //dvec3[i] = dvec1[i] * dvec2[i];
            dvec3[i] = log(dvec1[i]);
        }
        QueryPerformanceCounter((LARGE_INTEGER*)&tk2);
        printf("ticks: %lld\n", tk2 - tk1);
    }
    return 0;
}

我们看一下Release反汇编就知道了。Release反汇编如下:

Other
// realspeed.cpp : 定义控制台应用程序的入口点。 // #include "stdafx.h" #include <stdio.h> #include <windows.h> #include <math.h> #define veclen 1048576 float vec1[veclen]; float vec2[veclen]; float vec3[veclen]; double dvec1[veclen]; double dvec2[veclen]; double dvec3[veclen]; int main() { 00007FF78CE81070 mov qword ptr [rsp+18h],rbx 00007FF78CE81075 push rbp 00007FF78CE81076 push rsi 00007FF78CE81077 push rdi 00007FF78CE81078 push r12 00007FF78CE8107A push r13 00007FF78CE8107C push r14 00007FF78CE8107E push r15 00007FF78CE81080 sub rsp,20h ULONGLONG tk1, tk2; for (int i = 0; i < veclen; i++) { vec1[i] = vec2[i] = 2.0f; 00007FF78CE81084 mov eax,40000000h 00007FF78CE81089 lea rdi,[vec2 (07FF78CE84630h)] 00007FF78CE81090 mov ecx,100000h 00007FF78CE81095 lea r15,[vec1 (07FF78DA84630h)] 00007FF78CE8109C rep stos dword ptr [rdi] 00007FF78CE8109E mov rdi,r15 vec3[i] = 0.0f; 00007FF78CE810A1 lea r14,[vec3 (07FF78E684630h)] 00007FF78CE810A8 mov ecx,100000h dvec1[i] = dvec2[i] = 2.0; 00007FF78CE810AD lea r13,[dvec1 (07FF78D284630h)] 00007FF78CE810B4 rep stos dword ptr [rdi] 00007FF78CE810B6 xor eax,eax dvec3[i] = 0.0; 00007FF78CE810B8 lea r12,[dvec3 (07FF78EA84630h)] 00007FF78CE810BF mov rdi,r14 00007FF78CE810C2 mov ecx,100000h 00007FF78CE810C7 rep stos dword ptr [rdi] 00007FF78CE810C9 mov rax,4000000000000000h 00007FF78CE810D3 lea rdi,[dvec2 (07FF78DE84630h)] 00007FF78CE810DA mov ecx,100000h 00007FF78CE810DF rep stos qword ptr [rdi] 00007FF78CE810E2 mov rdi,r13 00007FF78CE810E5 mov ecx,100000h 00007FF78CE810EA rep stos qword ptr [rdi] 00007FF78CE810ED xor eax,eax 00007FF78CE810EF mov rdi,r12 00007FF78CE810F2 mov ecx,100000h 00007FF78CE810F7 rep stos qword ptr [rdi] } printf("float:\n"); 00007FF78CE810FA lea rcx,[string "float:\n" (07FF78CE83210h)] 00007FF78CE81101 call printf (07FF78CE81010h) 00007FF78CE81106 mov ebp,0Ah 00007FF78CE8110B mov esi,ebp 00007FF78CE8110D nop dword ptr [rax] for (int i = 0; i < 10; i++) { QueryPerformanceCounter((LARGE_INTEGER*)&tk1); 00007FF78CE81110 lea 
rcx,[tk1] 00007FF78CE81115 call qword ptr [__imp_QueryPerformanceCounter (07FF78CE83000h)] for (int i = 0; i < veclen; i++) 00007FF78CE8111B xor edi,edi 00007FF78CE8111D mov ebx,40000h { //vec3[i] = vec1[i] * vec2[i]; vec3[i] = logf(vec1[i]); 00007FF78CE81122 movups xmm0,xmmword ptr [rdi+r15] 00007FF78CE81127 call __vdecl_logf4 (07FF78CE81EF0h) 00007FF78CE8112C movups xmmword ptr [rdi+r14],xmm0 00007FF78CE81131 lea rdi,[rdi+10h] 00007FF78CE81135 sub rbx,1 00007FF78CE81139 jne main+0B2h (07FF78CE81122h) } QueryPerformanceCounter((LARGE_INTEGER*)&tk2); 00007FF78CE8113B lea rcx,[tk2] 00007FF78CE81140 call qword ptr [__imp_QueryPerformanceCounter (07FF78CE83000h)] printf("ticks: %lld\n", tk2 - tk1); 00007FF78CE81146 mov rdx,qword ptr [tk2] 00007FF78CE8114B lea rcx,[string "ticks: %lld\n" (07FF78CE83218h)] 00007FF78CE81152 sub rdx,qword ptr [tk1] 00007FF78CE81157 call printf (07FF78CE81010h) 00007FF78CE8115C sub rsi,1 00007FF78CE81160 jne main+0A0h (07FF78CE81110h) } printf("double:\n"); 00007FF78CE81162 lea rcx,[string "double:\n" (07FF78CE83228h)] } printf("double:\n"); 00007FF78CE81169 call printf (07FF78CE81010h) 00007FF78CE8116E xchg ax,ax for (int i = 0; i < 10; i++) { QueryPerformanceCounter((LARGE_INTEGER*)&tk1); 00007FF78CE81170 lea rcx,[tk1] 00007FF78CE81175 call qword ptr [__imp_QueryPerformanceCounter (07FF78CE83000h)] for (int i = 0; i < veclen; i++) 00007FF78CE8117B xor edi,edi 00007FF78CE8117D mov ebx,80000h { //dvec3[i] = dvec1[i] * dvec2[i]; dvec3[i] = log(dvec1[i]); 00007FF78CE81182 movups xmm0,xmmword ptr [rdi+r13] 00007FF78CE81187 call __vdecl_log2 (07FF78CE81EE0h) 00007FF78CE8118C movups xmmword ptr [rdi+r12],xmm0 00007FF78CE81191 lea rdi,[rdi+10h] 00007FF78CE81195 sub rbx,1 00007FF78CE81199 jne main+112h (07FF78CE81182h) } QueryPerformanceCounter((LARGE_INTEGER*)&tk2); 00007FF78CE8119B lea rcx,[tk2] 00007FF78CE811A0 call qword ptr [__imp_QueryPerformanceCounter (07FF78CE83000h)] printf("ticks: %lld\n", tk2 - tk1); 00007FF78CE811A6 mov rdx,qword ptr 
[tk2] 00007FF78CE811AB lea rcx,[string "ticks: %lld\n" (07FF78CE83218h)] 00007FF78CE811B2 sub rdx,qword ptr [tk1] 00007FF78CE811B7 call printf (07FF78CE81010h) 00007FF78CE811BC sub rbp,1 00007FF78CE811C0 jne main+100h (07FF78CE81170h) } return 0; 00007FF78CE811C2 xor eax,eax } 00007FF78CE811C4 mov rbx,qword ptr [rsp+70h] 00007FF78CE811C9 add rsp,20h 00007FF78CE811CD pop r15 00007FF78CE811CF pop r14 00007FF78CE811D1 pop r13 00007FF78CE811D3 pop r12 00007FF78CE811D5 pop rdi 00007FF78CE811D6 pop rsi 00007FF78CE811D7 pop rbp 00007FF78CE811D8 ret </math.h></windows.h></stdio.h>

可以看到,编译器并没有调用logf和log函数,而是调用了__vdecl_logf4和__vdecl_log2函数。从循环计数(40000h和80000h)以及每次16字节的步进可以看出,前者一次计算4个float的对数,后者一次计算2个double的对数(名字末尾的2表示元素个数,而不是以2为底的对数)。因此,即使是使用了math.h中的数学函数,仍然可以实现矢量运算优化。

引用
评论
加载评论中,请稍候...
200字以内,仅用于支线交流,主线讨论请采用回复功能。
折叠评论
acmilan作者
8年2个月前 修改于 8年2个月前 IP:四川
836577
引用 小俊:
这个测试还不算完整,只能证明没有vectorization的情况下,double和float的throughput是一样的,但不能证明它们的latency也是一样。
这里不做准确的延迟测试,只是为了表明float向量化之后的吞吐量增加到4倍,而double只增加到2倍,因此float更有利于大吞吐量但精度要求不高的运算。float和double在没有向量化之前速度相同,并不是要证明的重点,只是对现象的粗略表述。
引用
评论
加载评论中,请稍候...
200字以内,仅用于支线交流,主线讨论请采用回复功能。
折叠评论

想参与大家的讨论?现在就 登录 或者 注册

所属专业
上级专业
同级专业
acmilan
进士 学者 笔友
文章
461
回复
2934
学术分
4
2009/05/30注册,6年7个月前活动
暂无简介
主体类型:个人
所属领域:无
认证方式:邮箱
IP归属地:未同步
插入公式
评论控制
加载中...
文号:{{pid}}
投诉或举报
加载中...
{{tip}}
请选择违规类型:
{{reason.type}}

空空如也

笔记
{{note.content}}
{{n.user.username}}
{{fromNow(n.toc)}} {{n.status === noteStatus.disabled ? "已屏蔽" : ""}} {{n.status === noteStatus.unknown ? "正在审核" : ""}} {{n.status === noteStatus.deleted ? '已删除' : ''}}
  • 编辑
  • 删除
  • {{n.status === 'disabled' ? "解除屏蔽" : "屏蔽" }}
我也是有底线的