23.1 x86-64

x86-64是对x86架构的64位扩展。 从逆向工程师的角度来看,最重要的区别是: 几乎所有的寄存器(除了FPU和SIMD)都扩展到了64位,并且名称带有R-前缀;此外还额外添加了8个寄存器。 现在所有的通用寄存器是:RAX、RBX、RCX、RDX、RBP、RSP、RSI、RDI、R8、R9、R10、R11、R12、R13、R14、R15。 当然,还是可以像以前一样访问寄存器原有的较低部分。比如,使用EAX就可以访问RAX的低32位部分。 新的R8-R15寄存器也有对应的低位:R8D-R15D(低32位)、R8W-R15W(低16位)、R8B-R15B(低8位)。 SIMD寄存器的数量从8个扩展到了16个:XMM0-XMM15。

在Win64下,函数调用约定略有不同,有点类似于fastcall(见47.3节)。最开始的4个参数存储在RCX、RDX、R8和R9寄存器里,其他的保存在栈上。调用者函数还必须在栈上分配32个字节,这样被调用者就可以把前4个参数保存在那里,然后再按照自己的需要去使用这些寄存器。一些较短的函数可以直接从寄存器里使用参数,但是大点的函数就需要把参数保存到栈上了。 System V AMD64 ABI(Linux、*BSD、Mac OS X)的调用约定也有点类似于fastcall:它为前6个参数使用了6个寄存器RDI、RSI、RDX、RCX、R8、R9,剩余的参数通过栈传递。 请参阅调用约定(47)一节。

为了保证兼容性,C的int类型依然是32位。 现在所有的指针都是64位的了。 这一点有时候很麻烦:因为现在我们需要2倍的空间来存储指针(包括在缓存中),尽管事实上x64 CPU只能寻址48位的外部内存。

由于现在寄存器数量翻倍了,编译器在寄存器分配上也就有了更大的余地。对我们来说,这意味着生成的代码中会有更少的本地变量。 例如,下面这个计算DES加密算法第一个S-box的函数,使用位切片(bitslice)DES方法(见22章),每次可以同时处理32/64/128/256个值(取决于DES_type的类型:uint32、uint64、SSE2或者AVX)。

  1. /*
  2. * Generated S-box files.
  3. *
  4. * This software may be modified, redistributed, and used for any purpose,
  5. * so long as its origin is acknowledged.
  6. *
  7. * Produced by Matthew Kwan - March 1998
  8. */
  9. #ifdef _WIN64 /* DES_type is the word type used for every argument and temporary of s1 */
  10. #define DES_type unsigned __int64
  11. #else /* _WIN64 not defined: use unsigned int instead */
  12. #define DES_type unsigned int
  13. #endif /* _WIN64 */
  14. void /* no return value: results are delivered through out1..out4 */
  15. s1 ( /* generated S-box function (see file header); uses only &, |, ^ and ~ on DES_type words */
  16. DES_type a1, /* a1..a6: the six input words */
  17. DES_type a2,
  18. DES_type a3,
  19. DES_type a4,
  20. DES_type a5,
  21. DES_type a6,
  22. DES_type *out1, /* out1..out4: each target is XOR-updated in place (^=), not overwritten */
  23. DES_type *out2,
  24. DES_type *out3,
  25. DES_type *out4
  26. ) {
  27. DES_type x1, x2, x3, x4, x5, x6, x7, x8; /* x1..x56: intermediates, each assigned exactly once below */
  28. DES_type x9, x10, x11, x12, x13, x14, x15, x16;
  29. DES_type x17, x18, x19, x20, x21, x22, x23, x24;
  30. DES_type x25, x26, x27, x28, x29, x30, x31, x32;
  31. DES_type x33, x34, x35, x36, x37, x38, x39, x40;
  32. DES_type x41, x42, x43, x44, x45, x46, x47, x48;
  33. DES_type x49, x50, x51, x52, x53, x54, x55, x56;
  34. x1 = a3 & ~a5;
  35. x2 = x1 ^ a4;
  36. x3 = a3 & ~a4;
  37. x4 = x3 | a5;
  38. x5 = a6 & x4;
  39. x6 = x2 ^ x5;
  40. x7 = a4 & ~a5;
  41. x8 = a3 ^ a4;
  42. x9 = a6 & ~x8;
  43. x10 = x7 ^ x9;
  44. x11 = a2 | x10;
  45. x12 = x6 ^ x11;
  46. x13 = a5 ^ x5;
  47. x14 = x13 & x8;
  48. x15 = a5 & ~a4;
  49. x16 = x3 ^ x14;
  50. x17 = a6 | x16;
  51. x18 = x15 ^ x17;
  52. x19 = a2 | x18;
  53. x20 = x14 ^ x19;
  54. x21 = a1 & x20;
  55. x22 = x12 ^ ~x21;
  56. *out2 ^= x22; /* first result produced; note the order is out2, out4, out1, out3 */
  57. x23 = x1 | x5;
  58. x24 = x23 ^ x8;
  59. x25 = x18 & ~x2;
  60. x26 = a2 & ~x25;
  61. x27 = x24 ^ x26;
  62. x28 = x6 | x7;
  63. x29 = x28 ^ x25;
  64. x30 = x9 ^ x24;
  65. x31 = x18 & ~x30;
  66. x32 = a2 & x31;
  67. x33 = x29 ^ x32;
  68. x34 = a1 & x33;
  69. x35 = x27 ^ x34;
  70. *out4 ^= x35; /* second result */
  71. x36 = a3 & x28;
  72. x37 = x18 & ~x36;
  73. x38 = a2 | x3;
  74. x39 = x37 ^ x38;
  75. x40 = a3 | x31;
  76. x41 = x24 & ~x37;
  77. x42 = x41 | x3;
  78. x43 = x42 & ~a2;
  79. x44 = x40 ^ x43;
  80. x45 = a1 & ~x44;
  81. x46 = x39 ^ ~x45;
  82. *out1 ^= x46; /* third result */
  83. x47 = x33 & ~x9;
  84. x48 = x47 ^ x39;
  85. x49 = x4 ^ x36;
  86. x50 = x49 & ~x5;
  87. x51 = x42 | x18;
  88. x52 = x51 ^ a5;
  89. x53 = a2 & ~x52;
  90. x54 = x50 ^ x53;
  91. x55 = a1 | x54;
  92. x56 = x48 ^ ~x55;
  93. *out3 ^= x56; /* fourth and last result */
  94. }

这个函数里有许多本地变量。当然,它们并不会全部存放在本地栈上。让我们用MSVC 2008的/Ox选项来编译一下:

清单23.1 使用MSVC 2008编译(32位)

  1. PUBLIC _s1 ; make the symbol _s1 visible outside this module
  2. ; Function compile flags: /Ogtpy
  3. _TEXT SEGMENT
  4. _x6$ = -20 ; size = 4; negative offsets: the five 4-byte locals (20 bytes, matching "sub esp, 20")
  5. _x3$ = -16 ; size = 4
  6. _x1$ = -12 ; size = 4
  7. _x8$ = -8 ; size = 4
  8. _x4$ = -4 ; size = 4
  9. _a1$ = 8 ; size = 4; positive offsets: the ten stack arguments, 4 bytes apart
  10. _a2$ = 12 ; size = 4
  11. _a3$ = 16 ; size = 4
  12. _x33$ = 20 ; size = 4; same offset as _x7$ and _a4$ (one slot, three names)
  13. _x7$ = 20 ; size = 4; same offset as _a4$
  14. _a4$ = 20 ; size = 4
  15. _a5$ = 24 ; size = 4
  16. tv326 = 28 ; size = 4; same offset as _a6$ (not referenced in _s1 below)
  17. _x36$ = 28 ; size = 4; same offset as _x28$ and _a6$
  18. _x28$ = 28 ; size = 4; same offset as _a6$
  19. _a6$ = 28 ; size = 4
  20. _out1$ = 32 ; size = 4
  21. _x24$ = 36 ; size = 4; same offset as _out2$
  22. _out2$ = 36 ; size = 4
  23. _out3$ = 40 ; size = 4
  24. _out4$ = 44 ; size = 4
  25. _s1 PROC ; 32-bit s1: every argument is read from the stack; names in comments are the C temporaries
  26. sub esp, 20 ; 00000014H - reserve the locals x6, x3, x1, x8, x4
  27. mov edx, DWORD PTR _a5$[esp+16] ; edx = a5 (the esp bias grows by 4 after each push below)
  28. push ebx ; save ebx, restored before ret
  29. mov ebx, DWORD PTR _a4$[esp+20] ; ebx = a4
  30. push ebp ; save ebp
  31. push esi ; save esi
  32. mov esi, DWORD PTR _a3$[esp+28] ; esi = a3
  33. push edi ; save edi
  34. mov edi, ebx
  35. not edi ; edi = ~a4
  36. mov ebp, edi ; ebp = ~a4
  37. and edi, DWORD PTR _a5$[esp+32] ; edi = x15 = a5 & ~a4
  38. mov ecx, edx
  39. not ecx ; ecx = ~a5
  40. and ebp, esi ; ebp = x3 = a3 & ~a4
  41. mov eax, ecx
  42. and eax, esi ; eax = x1 = a3 & ~a5
  43. and ecx, ebx ; ecx = x7 = a4 & ~a5
  44. mov DWORD PTR _x1$[esp+36], eax ; spill x1
  45. xor eax, ebx ; eax = x2 = x1 ^ a4
  46. mov esi, ebp
  47. or esi, edx ; esi = x4 = x3 | a5
  48. mov DWORD PTR _x4$[esp+36], esi ; spill x4
  49. and esi, DWORD PTR _a6$[esp+32] ; esi = x5 = a6 & x4
  50. mov DWORD PTR _x7$[esp+32], ecx ; spill x7 into the slot shared with a4 (a4 stays in ebx)
  51. mov edx, esi
  52. xor edx, eax ; edx = x6 = x2 ^ x5
  53. mov DWORD PTR _x6$[esp+36], edx ; spill x6
  54. mov edx, DWORD PTR _a3$[esp+32]
  55. xor edx, ebx ; edx = x8 = a3 ^ a4
  56. mov ebx, esi
  57. xor ebx, DWORD PTR _a5$[esp+32] ; ebx = x13 = a5 ^ x5
  58. mov DWORD PTR _x8$[esp+36], edx ; spill x8
  59. and ebx, edx ; ebx = x14 = x13 & x8
  60. mov ecx, edx ; ecx = x8
  61. mov edx, ebx
  62. xor edx, ebp ; edx = x16 = x3 ^ x14
  63. or edx, DWORD PTR _a6$[esp+32] ; edx = x17 = a6 | x16
  64. not ecx
  65. and ecx, DWORD PTR _a6$[esp+32] ; ecx = x9 = a6 & ~x8
  66. xor edx, edi ; edx = x18 = x15 ^ x17
  67. mov edi, edx
  68. or edi, DWORD PTR _a2$[esp+32] ; edi = x19 = a2 | x18
  69. mov DWORD PTR _x3$[esp+36], ebp ; spill x3
  70. mov ebp, DWORD PTR _a2$[esp+32] ; ebp = a2
  71. xor edi, ebx ; edi = x20 = x14 ^ x19
  72. and edi, DWORD PTR _a1$[esp+32] ; edi = x21 = a1 & x20
  73. mov ebx, ecx
  74. xor ebx, DWORD PTR _x7$[esp+32] ; ebx = x10 = x7 ^ x9
  75. not edi ; edi = ~x21
  76. or ebx, ebp ; ebx = x11 = a2 | x10
  77. xor edi, ebx ; edi = x11 ^ ~x21
  78. mov ebx, edi
  79. mov edi, DWORD PTR _out2$[esp+32] ; edi = out2 pointer
  80. xor ebx, DWORD PTR [edi] ; fold in the current *out2
  81. not eax ; eax = ~x2
  82. xor ebx, DWORD PTR _x6$[esp+36] ; ebx = *out2 ^ x22
  83. and eax, edx ; eax = x25 = x18 & ~x2
  84. mov DWORD PTR [edi], ebx ; *out2 ^= x22
  85. mov ebx, DWORD PTR _x7$[esp+32]
  86. or ebx, DWORD PTR _x6$[esp+36] ; ebx = x28 = x6 | x7
  87. mov edi, esi
  88. or edi, DWORD PTR _x1$[esp+36] ; edi = x23 = x1 | x5
  89. mov DWORD PTR _x28$[esp+32], ebx ; spill x28 into the slot shared with a6
  90. xor edi, DWORD PTR _x8$[esp+36] ; edi = x24 = x23 ^ x8
  91. mov DWORD PTR _x24$[esp+32], edi ; spill x24 into the slot shared with out2 (already consumed)
  92. xor edi, ecx ; edi = x30 = x9 ^ x24
  93. not edi
  94. and edi, edx ; edi = x31 = x18 & ~x30
  95. mov ebx, edi
  96. and ebx, ebp ; ebx = x32 = a2 & x31
  97. xor ebx, DWORD PTR _x28$[esp+32]
  98. xor ebx, eax ; ebx = x33 = x28 ^ x25 ^ x32
  99. not eax ; eax = ~x25
  100. mov DWORD PTR _x33$[esp+32], ebx ; spill x33 into the slot shared with x7/a4
  101. and ebx, DWORD PTR _a1$[esp+32] ; ebx = x34 = a1 & x33
  102. and eax, ebp ; eax = x26 = a2 & ~x25
  103. xor eax, ebx ; eax = x26 ^ x34
  104. mov ebx, DWORD PTR _out4$[esp+32] ; ebx = out4 pointer
  105. xor eax, DWORD PTR [ebx] ; fold in the current *out4
  106. xor eax, DWORD PTR _x24$[esp+32] ; eax = *out4 ^ x35
  107. mov DWORD PTR [ebx], eax ; *out4 ^= x35
  108. mov eax, DWORD PTR _x28$[esp+32]
  109. and eax, DWORD PTR _a3$[esp+32] ; eax = x36 = a3 & x28
  110. mov ebx, DWORD PTR _x3$[esp+36] ; ebx = x3
  111. or edi, DWORD PTR _a3$[esp+32] ; edi = x40 = a3 | x31
  112. mov DWORD PTR _x36$[esp+32], eax ; spill x36 (same slot that held x28)
  113. not eax
  114. and eax, edx ; eax = x37 = x18 & ~x36
  115. or ebx, ebp ; ebx = x38 = a2 | x3
  116. xor ebx, eax ; ebx = x39 = x37 ^ x38
  117. not eax ; eax = ~x37
  118. and eax, DWORD PTR _x24$[esp+32] ; eax = x41 = x24 & ~x37
  119. not ebp ; ebp = ~a2
  120. or eax, DWORD PTR _x3$[esp+36] ; eax = x42 = x41 | x3
  121. not esi ; esi = ~x5
  122. and ebp, eax ; ebp = x43 = x42 & ~a2
  123. or eax, edx ; eax = x51 = x42 | x18
  124. xor eax, DWORD PTR _a5$[esp+32] ; eax = x52 = x51 ^ a5
  125. mov edx, DWORD PTR _x36$[esp+32]
  126. xor edx, DWORD PTR _x4$[esp+36] ; edx = x49 = x4 ^ x36
  127. xor ebp, edi ; ebp = x44 = x40 ^ x43
  128. mov edi, DWORD PTR _out1$[esp+32] ; edi = out1 pointer
  129. not eax
  130. and eax, DWORD PTR _a2$[esp+32] ; eax = x53 = a2 & ~x52
  131. not ebp
  132. and ebp, DWORD PTR _a1$[esp+32] ; ebp = x45 = a1 & ~x44
  133. and edx, esi ; edx = x50 = x49 & ~x5
  134. xor eax, edx ; eax = x54 = x50 ^ x53
  135. or eax, DWORD PTR _a1$[esp+32] ; eax = x55 = a1 | x54
  136. not ebp ; ebp = ~x45
  137. xor ebp, DWORD PTR [edi] ; fold in the current *out1
  138. not ecx ; ecx = ~x9
  139. and ecx, DWORD PTR _x33$[esp+32] ; ecx = x47 = x33 & ~x9
  140. xor ebp, ebx ; ebp = *out1 ^ x46
  141. not eax ; eax = ~x55
  142. mov DWORD PTR [edi], ebp ; *out1 ^= x46
  143. xor eax, ecx ; eax = x47 ^ ~x55
  144. mov ecx, DWORD PTR _out3$[esp+32] ; ecx = out3 pointer
  145. xor eax, DWORD PTR [ecx] ; fold in the current *out3
  146. pop edi ; restore edi
  147. pop esi ; restore esi
  148. xor eax, ebx ; eax = *out3 ^ x56 (ebx still holds x39)
  149. pop ebp ; restore ebp
  150. mov DWORD PTR [ecx], eax ; *out3 ^= x56
  151. pop ebx ; restore ebx
  152. add esp, 20 ; 00000014H - release the five locals
  153. ret 0 ; return without removing any argument bytes from the stack
  154. _s1 ENDP

编译器在本地栈上分配了5个变量。 现在再让我们在MSVC 2008的64位环境中试一试:

清单23.2 使用MSVC 2008编译(64位)

  1. a1$ = 56 ; offsets apply to rsp after the six pushes in s1; this is where rcx was stored at entry ([rsp+8])
  2. a2$ = 64 ; where rdx was stored at entry ([rsp+16])
  3. a3$ = 72 ; not referenced in s1; [rsp+72] is reloaded into rbx near the end
  4. a4$ = 80 ; not referenced in s1; [rsp+80] is reloaded into rbp near the end
  5. x36$1$ = 88 ; same offset as a5$
  6. a5$ = 88
  7. a6$ = 96
  8. out1$ = 104
  9. out2$ = 112
  10. out3$ = 120
  11. out4$ = 128
  12. s1 PROC ; 64-bit s1: a1..a4 arrive in rcx, rdx, r8, r9; the remaining arguments are read from the stack
  13. $LN3:
  14. mov QWORD PTR [rsp+24], rbx ; save rbx above the return address (reloaded from [rsp+72] later)
  15. mov QWORD PTR [rsp+32], rbp ; save rbp (reloaded from [rsp+80] later)
  16. mov QWORD PTR [rsp+16], rdx ; store a2; read back below as a2$[rsp]
  17. mov QWORD PTR [rsp+8], rcx ; store a1; read back below as a1$[rsp]
  18. push rsi ; six pushes = 48 bytes, which is why a1$ is 56 rather than 8
  19. push rdi
  20. push r12
  21. push r13
  22. push r14
  23. push r15
  24. mov r15, QWORD PTR a5$[rsp] ; r15 = a5
  25. mov rcx, QWORD PTR a6$[rsp] ; rcx = a6
  26. mov rbp, r8 ; rbp = a3
  27. mov r10, r9 ; r10 = a4
  28. mov rax, r15
  29. mov rdx, rbp
  30. not rax ; rax = ~a5
  31. xor rdx, r9 ; rdx = x8 = a3 ^ a4
  32. not r10 ; r10 = ~a4
  33. mov r11, rax
  34. and rax, r9 ; rax = x7 = a4 & ~a5
  35. mov rsi, r10
  36. mov QWORD PTR x36$1$[rsp], rax ; the slot shared with a5 now holds x7 (a5 stays in r15)
  37. and r11, r8 ; r11 = x1 = a3 & ~a5
  38. and rsi, r8 ; rsi = x3 = a3 & ~a4
  39. and r10, r15 ; r10 = x15 = a5 & ~a4
  40. mov r13, rdx
  41. mov rbx, r11
  42. xor rbx, r9 ; rbx = x2 = x1 ^ a4
  43. mov r9, QWORD PTR a2$[rsp] ; r9 = a2
  44. mov r12, rsi
  45. or r12, r15 ; r12 = x4 = x3 | a5
  46. not r13
  47. and r13, rcx ; r13 = x9 = a6 & ~x8
  48. mov r14, r12
  49. and r14, rcx ; r14 = x5 = a6 & x4
  50. mov rax, r14
  51. mov r8, r14
  52. xor r8, rbx ; r8 = x6 = x2 ^ x5
  53. xor rax, r15 ; rax = x13 = a5 ^ x5
  54. not rbx ; rbx = ~x2
  55. and rax, rdx ; rax = x14 = x13 & x8
  56. mov rdi, rax
  57. xor rdi, rsi ; rdi = x16 = x3 ^ x14
  58. or rdi, rcx ; rdi = x17 = a6 | x16
  59. xor rdi, r10 ; rdi = x18 = x15 ^ x17
  60. and rbx, rdi ; rbx = x25 = x18 & ~x2
  61. mov rcx, rdi
  62. or rcx, r9 ; rcx = x19 = a2 | x18
  63. xor rcx, rax ; rcx = x20 = x14 ^ x19
  64. mov rax, r13
  65. xor rax, QWORD PTR x36$1$[rsp] ; rax = x10 = x7 ^ x9
  66. and rcx, QWORD PTR a1$[rsp] ; rcx = x21 = a1 & x20
  67. or rax, r9 ; rax = x11 = a2 | x10
  68. not rcx ; rcx = ~x21
  69. xor rcx, rax ; rcx = x11 ^ ~x21
  70. mov rax, QWORD PTR out2$[rsp] ; rax = out2 pointer
  71. xor rcx, QWORD PTR [rax] ; fold in the current *out2
  72. xor rcx, r8 ; rcx = *out2 ^ x22
  73. mov QWORD PTR [rax], rcx ; *out2 ^= x22
  74. mov rax, QWORD PTR x36$1$[rsp] ; rax = x7
  75. mov rcx, r14
  76. or rax, r8 ; rax = x28 = x6 | x7
  77. or rcx, r11 ; rcx = x23 = x1 | x5
  78. mov r11, r9 ; r11 = a2
  79. xor rcx, rdx ; rcx = x24 = x23 ^ x8
  80. mov QWORD PTR x36$1$[rsp], rax ; the shared slot now holds x28
  81. mov r8, rsi ; r8 = x3
  82. mov rdx, rcx
  83. xor rdx, r13 ; rdx = x30 = x9 ^ x24
  84. not rdx
  85. and rdx, rdi ; rdx = x31 = x18 & ~x30
  86. mov r10, rdx
  87. and r10, r9 ; r10 = x32 = a2 & x31
  88. xor r10, rax
  89. xor r10, rbx ; r10 = x33 = x28 ^ x25 ^ x32
  90. not rbx ; rbx = ~x25
  91. and rbx, r9 ; rbx = x26 = a2 & ~x25
  92. mov rax, r10
  93. and rax, QWORD PTR a1$[rsp] ; rax = x34 = a1 & x33
  94. xor rbx, rax ; rbx = x26 ^ x34
  95. mov rax, QWORD PTR out4$[rsp] ; rax = out4 pointer
  96. xor rbx, QWORD PTR [rax] ; fold in the current *out4
  97. xor rbx, rcx ; rbx = *out4 ^ x35
  98. mov QWORD PTR [rax], rbx ; *out4 ^= x35
  99. mov rbx, QWORD PTR x36$1$[rsp] ; rbx = x28
  100. and rbx, rbp ; rbx = x36 = a3 & x28 (x36 stays in a register)
  101. mov r9, rbx
  102. not r9
  103. and r9, rdi ; r9 = x37 = x18 & ~x36
  104. or r8, r11 ; r8 = x38 = a2 | x3
  105. mov rax, QWORD PTR out1$[rsp] ; rax = out1 pointer
  106. xor r8, r9 ; r8 = x39 = x37 ^ x38
  107. not r9 ; r9 = ~x37
  108. and r9, rcx ; r9 = x41 = x24 & ~x37
  109. or rdx, rbp ; rdx = x40 = a3 | x31
  110. mov rbp, QWORD PTR [rsp+80] ; restore rbp (saved at [rsp+32] before the pushes)
  111. or r9, rsi ; r9 = x42 = x41 | x3
  112. xor rbx, r12 ; rbx = x49 = x4 ^ x36
  113. mov rcx, r11
  114. not rcx ; rcx = ~a2
  115. not r14 ; r14 = ~x5
  116. not r13 ; r13 = ~x9
  117. and rcx, r9 ; rcx = x43 = x42 & ~a2
  118. or r9, rdi ; r9 = x51 = x42 | x18
  119. and rbx, r14 ; rbx = x50 = x49 & ~x5
  120. xor r9, r15 ; r9 = x52 = x51 ^ a5
  121. xor rcx, rdx ; rcx = x44 = x40 ^ x43
  122. mov rdx, QWORD PTR a1$[rsp] ; rdx = a1
  123. not r9 ; r9 = ~x52
  124. not rcx ; rcx = ~x44
  125. and r13, r10 ; r13 = x47 = x33 & ~x9
  126. and r9, r11 ; r9 = x53 = a2 & ~x52
  127. and rcx, rdx ; rcx = x45 = a1 & ~x44
  128. xor r9, rbx ; r9 = x54 = x50 ^ x53
  129. mov rbx, QWORD PTR [rsp+72] ; restore rbx (saved at [rsp+24] before the pushes)
  130. not rcx ; rcx = ~x45
  131. xor rcx, QWORD PTR [rax] ; fold in the current *out1
  132. or r9, rdx ; r9 = x55 = a1 | x54
  133. not r9 ; r9 = ~x55
  134. xor rcx, r8 ; rcx = *out1 ^ x46
  135. mov QWORD PTR [rax], rcx ; *out1 ^= x46
  136. mov rax, QWORD PTR out3$[rsp] ; rax = out3 pointer
  137. xor r9, r13 ; r9 = x47 ^ ~x55
  138. xor r9, QWORD PTR [rax] ; fold in the current *out3
  139. xor r9, r8 ; r9 = *out3 ^ x56 (r8 still holds x39)
  140. mov QWORD PTR [rax], r9 ; *out3 ^= x56
  141. pop r15 ; restore the six pushed registers in reverse order
  142. pop r14
  143. pop r13
  144. pop r12
  145. pop rdi
  146. pop rsi
  147. ret 0
  148. s1 ENDP

编译器在本地栈上没有分配任何空间,x36只是a5的同义词(二者共用同一个栈位置)。 顺带一提,我们可以在这儿看到,函数把RCX和RDX保存到了调用者分配的空间里,而R8和R9虽然从一开始就被使用,却并没有被保存。 另外,还有一些CPU拥有更多的通用寄存器(GPR),比如Itanium(有128个寄存器)。