22.1 Vectorization

向量化3,例如循环用两个数组生成一个数组。循环体从输入数组中取值,处理后存储到另一个数组。重要的一点是操作了每一个元素。向量化—同时处理多个元素。

向量化并不是新的技术:本书的作者在1998年使用Cray Y-MP EL“lite”时从Cray Y-MP supercomputer line看到过。

例子:

  1. for (i = 0; i < 1024; i++)
  2. {
  3. C[i] = A[i]*B[i];
  4. }

这段代码从A和B中取出元素,相乘,并把结果保存到C。

如果每个元素为32位int型,那么可以从A中加载4个元素到128bits XMM寄存器,B加载到另一个XMM寄存器,通过执行PMULID(Multiply Packed Signed Dword Integers and Store Low Result)和PMULHW(Multiply Packed Signed Integers and Store High Result),一次可以得到4个64位结果。

循环次数从1024变成1024/4,当然更快。

一些简单的情况下某些编译器可以自动向量化,Intel C++5.

函数如下:

  1. int f (int sz, int *ar1, int *ar2, int *ar3)
  2. {
  3. for (int i=0; i<sz; i++)
  4. ar3[i]=ar1[i]+ar2[i];
  5. return 0;
  6. };

22.1.1 Intel C++

Intel C++ 11.1.051 win32下编译:

icl intel.cpp /QaxSSE2 /Faintel.asm /Ox

可以得到(IDA中):

  1. ; int __cdecl f(int, int *, int *, int *)
  2. public ?f@@YAHHPAH00@Z
  3. ?f@@YAHHPAH00@Z proc near
  4. var_10 = dword ptr -10h
  5. sz = dword ptr 4
  6. ar1 = dword ptr 8
  7. ar2 = dword ptr 0Ch
  8. ar3 = dword ptr 10h
  9. push edi
  10. push esi
  11. push ebx
  12. push esi
  13. mov edx, [esp+10h+sz]
  14. test edx, edx
  15. jle loc_15B
  16. mov eax, [esp+10h+ar3]
  17. cmp edx, 6
  18. jle loc_143
  19. cmp eax, [esp+10h+ar2]
  20. jbe short loc_36
  21. mov esi, [esp+10h+ar2]
  22. sub esi, eax
  23. lea ecx, ds:0[edx*4]
  24. neg esi
  25. cmp ecx, esi
  26. jbe short loc_55
  27. loc_36: ; CODE XREF: f(int,int *,int *,int *)+21
  28. cmp eax, [esp+10h+ar2]
  29. jnb loc_143
  30. mov esi, [esp+10h+ar2]
  31. sub esi, eax
  32. lea ecx, ds:0[edx*4]
  33. cmp esi, ecx
  34. jb loc_143
  35. loc_55: ; CODE XREF: f(int,int *,int *,int *)+34
  36. cmp eax, [esp+10h+ar1]
  37. jbe short loc_67
  38. mov esi, [esp+10h+ar1]
  39. sub esi, eax
  40. neg esi
  41. cmp ecx, esi
  42. jbe short loc_7F
  43. loc_67: ; CODE XREF: f(int,int *,int *,int *)+59
  44. cmp eax, [esp+10h+ar1]
  45. jnb loc_143
  46. mov esi, [esp+10h+ar1]
  47. sub esi, eax
  48. cmp esi, ecx
  49. jb loc_143
  50. loc_7F: ; CODE XREF: f(int,int *,int *,int *)+65
  51. mov edi, eax ; edi = ar1
  52. and edi, 0Fh ; is ar1 16-byte aligned?
  53. jz short loc_9A ; yes
  54. test edi, 3
  55. jnz loc_162
  56. neg edi
  57. add edi, 10h
  58. shr edi, 2
  59. loc_9A: ; CODE XREF: f(int,int *,int *,int *)+84
  60. lea ecx, [edi+4]
  61. cmp edx, ecx
  62. jl loc_162
  63. mov ecx, edx
  64. sub ecx, edi
  65. and ecx, 3
  66. neg ecx
  67. add ecx, edx
  68. test edi, edi
  69. jbe short loc_D6
  70. mov ebx, [esp+10h+ar2]
  71. mov [esp+10h+var_10], ecx
  72. mov ecx, [esp+10h+ar1]
  73. xor esi, esi
  74. loc_C1: ; CODE XREF: f(int,int *,int *,int *)+CD
  75. mov edx, [ecx+esi*4]
  76. add edx, [ebx+esi*4]
  77. mov [eax+esi*4], edx
  78. inc esi
  79. cmp esi, edi
  80. jb short loc_C1
  81. mov ecx, [esp+10h+var_10]
  82. mov edx, [esp+10h+sz]
  83. loc_D6: ; CODE XREF: f(int,int *,int *,int *)+B2
  84. mov esi, [esp+10h+ar2]
  85. lea esi, [esi+edi*4] ; is ar2+i*4 16-byte aligned?
  86. test esi, 0Fh
  87. jz short loc_109 ; yes!
  88. mov ebx, [esp+10h+ar1]
  89. mov esi, [esp+10h+ar2]
  90. loc_ED: ; CODE XREF: f(int,int *,int *,int *)+105
  91. movdqu xmm1, xmmword ptr [ebx+edi*4]
  92. movdqu xmm0, xmmword ptr [esi+edi*4] ; ar2+i*4 is not 16-byte aligned, so load
  93. it to xmm0
  94. paddd xmm1, xmm0
  95. movdqa xmmword ptr [eax+edi*4], xmm1
  96. add edi, 4
  97. cmp edi, ecx
  98. jb short loc_ED
  99. jmp short loc_127
  100. ; ---------------------------------------------------------------------------
  101. loc_109: ; CODE XREF: f(int,int *,int *,int *)+E3
  102. mov ebx, [esp+10h+ar1]
  103. mov esi, [esp+10h+ar2]
  104. loc_111: ; CODE XREF: f(int,int *,int *,int *)+125
  105. movdqu xmm0, xmmword ptr [ebx+edi*4]
  106. paddd xmm0, xmmword ptr [esi+edi*4]
  107. movdqa xmmword ptr [eax+edi*4], xmm0
  108. add edi, 4
  109. cmp edi, ecx
  110. jb short loc_111
  111. loc_127: ; CODE XREF: f(int,int *,int *,int *)+107
  112. ; f(int,int *,int *,int *)+164
  113. cmp ecx, edx
  114. jnb short loc_15B
  115. mov esi, [esp+10h+ar1]
  116. mov edi, [esp+10h+ar2]
  117. loc_133: ; CODE XREF: f(int,int *,int *,int *)+13F
  118. mov ebx, [esi+ecx*4]
  119. add ebx, [edi+ecx*4]
  120. mov [eax+ecx*4], ebx
  121. inc ecx
  122. cmp ecx, edx
  123. jb short loc_133
  124. jmp short loc_15B
  125. ; ---------------------------------------------------------------------------
  126. loc_143: ; CODE XREF: f(int,int *,int *,int *)+17
  127. ; f(int,int *,int *,int *)+3A ...
  128. mov esi, [esp+10h+ar1]
  129. mov edi, [esp+10h+ar2]
  130. xor ecx, ecx
  131. loc_14D: ; CODE XREF: f(int,int *,int *,int *)+159
  132. mov ebx, [esi+ecx*4]
  133. add ebx, [edi+ecx*4]
  134. mov [eax+ecx*4], ebx
  135. inc ecx
  136. cmp ecx, edx
  137. jb short loc_14D
  138. loc_15B: ; CODE XREF: f(int,int *,int *,int *)+A
  139. ; f(int,int *,int *,int *)+129 ...
  140. xor eax, eax
  141. pop ecx
  142. pop ebx
  143. pop esi
  144. pop edi
  145. retn
  146. ; ---------------------------------------------------------------------------
  147. loc_162: ; CODE XREF: f(int,int *,int *,int *)+8C
  148. ; f(int,int *,int *,int *)+9F
  149. xor ecx, ecx
  150. jmp short loc_127
  151. ?f@@YAHHPAH00@Z endp

SSE2相关指令是:

  1. MOVDQU (Move Unaligned Double Quadword)—仅仅从内存加载16个字节到XMM寄存器。
  2. PADDD (Add Packed Integers)—把源存储器与目的寄存器按双字对齐无符号整数普通相加,结果送入目的寄存器,内存变量必须对齐内存16字节.
  3. MOVDQA (Move Aligned Double Quadword)—把源存储器内容值送入目的寄存器,当有m128时,必须对齐内存16字节.

如果工作元素超过4对,并且指针ar3按照16字节对齐,SSE2指令将被执行: 如果ar2按照16字节对齐,则代码如下:

  1. movdqu xmm0, xmmword ptr [ebx+edi*4] ; ar1+i*4
  2. paddd xmm0, xmmword ptr [esi+edi*4] ; ar2+i*4
  3. movdqa xmmword ptr [eax+edi*4], xmm0 ; ar3+i*4

否则,ar2处的值将用MOVDQU加载到XMM0,它不需要对齐指针,代码如下:

  1. movdqu xmm1, xmmword ptr [ebx+edi*4] ; ar1+i*4
  2. movdqu xmm0, xmmword ptr [esi+edi*4] ; ar2+i*4 is not 16-byte aligned, so load it to xmm0
  3. paddd xmm1, xmm0
  4. movdqa xmmword ptr [eax+edi*4], xmm1 ; ar3+i*4

其他情况,将没有SSE2代码被执行。

22.1.2 GCC

gcc用-O3 选项同时打开SSE2支持: -msse2.

可以得到(GCC 4.4.1):

  1. ; f(int, int *, int *, int *)
  2. public _Z1fiPiS_S_
  3. _Z1fiPiS_S_ proc near
  4. var_18 = dword ptr -18h
  5. var_14 = dword ptr -14h
  6. var_10 = dword ptr -10h
  7. arg_0 = dword ptr 8
  8. arg_4 = dword ptr 0Ch
  9. arg_8 = dword ptr 10h
  10. arg_C = dword ptr 14h
  11. push ebp
  12. mov ebp, esp
  13. push edi
  14. push esi
  15. push ebx
  16. sub esp, 0Ch
  17. mov ecx, [ebp+arg_0]
  18. mov esi, [ebp+arg_4]
  19. mov edi, [ebp+arg_8]
  20. mov ebx, [ebp+arg_C]
  21. test ecx, ecx
  22. jle short loc_80484D8
  23. cmp ecx, 6
  24. lea eax, [ebx+10h]
  25. ja short loc_80484E8
  26. loc_80484C1: ; CODE XREF: f(int,int *,int *,int *)+4B
  27. ; f(int,int *,int *,int *)+61 ...
  28. xor eax, eax
  29. nop
  30. lea esi, [esi+0]
  31. loc_80484C8: ; CODE XREF: f(int,int *,int *,int *)+36
  32. mov edx, [edi+eax*4]
  33. add edx, [esi+eax*4]
  34. mov [ebx+eax*4], edx
  35. add eax, 1
  36. cmp eax, ecx
  37. jnz short loc_80484C8
  38. loc_80484D8: ; CODE XREF: f(int,int *,int *,int *)+17
  39. ; f(int,int *,int *,int *)+A5
  40. add esp, 0Ch
  41. xor eax, eax
  42. pop ebx
  43. pop esi
  44. pop edi
  45. pop ebp
  46. retn
  47. ; ---------------------------------------------------------------------------
  48. align 8
  49. loc_80484E8: ; CODE XREF: f(int,int *,int *,int *)+1F
  50. test bl, 0Fh
  51. jnz short loc_80484C1
  52. lea edx, [esi+10h]
  53. cmp ebx, edx
  54. jbe loc_8048578
  55. loc_80484F8: ; CODE XREF: f(int,int *,int *,int *)+E0
  56. lea edx, [edi+10h]
  57. cmp ebx, edx
  58. ja short loc_8048503
  59. cmp edi, eax
  60. jbe short loc_80484C1
  61. loc_8048503: ; CODE XREF: f(int,int *,int *,int *)+5D
  62. mov eax, ecx
  63. shr eax, 2
  64. mov [ebp+var_14], eax
  65. shl eax, 2
  66. test eax, eax
  67. mov [ebp+var_10], eax
  68. jz short loc_8048547
  69. mov [ebp+var_18], ecx
  70. mov ecx, [ebp+var_14]
  71. xor eax, eax
  72. xor edx, edx
  73. nop
  74. loc_8048520: ; CODE XREF: f(int,int *,int *,int *)+9B
  75. movdqu xmm1, xmmword ptr [edi+eax]
  76. movdqu xmm0, xmmword ptr [esi+eax]
  77. add edx, 1
  78. paddd xmm0, xmm1
  79. movdqa xmmword ptr [ebx+eax], xmm0
  80. add eax, 10h
  81. cmp edx, ecx
  82. jb short loc_8048520
  83. mov ecx, [ebp+var_18]
  84. mov eax, [ebp+var_10]
  85. cmp ecx, eax
  86. jz short loc_80484D8
  87. loc_8048547: ; CODE XREF: f(int,int *,int *,int *)+73
  88. lea edx, ds:0[eax*4]
  89. add esi, edx
  90. add edi, edx
  91. add ebx, edx
  92. lea esi, [esi+0]
  93. loc_8048558: ; CODE XREF: f(int,int *,int *,int *)+CC
  94. mov edx, [edi]
  95. add eax, 1
  96. add edi, 4
  97. add edx, [esi]
  98. add esi, 4
  99. mov [ebx], edx
  100. add ebx, 4
  101. cmp ecx, eax
  102. jg short loc_8048558
  103. add esp, 0Ch
  104. xor eax, eax
  105. pop ebx
  106. pop esi
  107. pop edi
  108. pop ebp
  109. retn
  110. ; ---------------------------------------------------------------------------
  111. loc_8048578: ; CODE XREF: f(int,int *,int *,int *)+52
  112. cmp eax, esi
  113. jnb loc_80484C1
  114. jmp loc_80484F8
  115. _Z1fiPiS_S_ endp

几乎一样,但没有Intel的细致。