1. 12 Apr, 2019 2 commits
    • Fix Clang Format · 382722b9
      FreddyFunk authored
    • common/swap: Improve codegen of the default swap fallbacks · 0d8ef2d3
      Lioncash authored
      Uses arithmetic that compilers can recognize more easily for
      optimization: rather than shifting the halves of the value and then
      swapping and combining them, we can swap the bytes in place.
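
      The C++ itself is not quoted here, but the change amounts to something
      like the sketch below; the names and exact form are illustrative
      assumptions, not the verbatim common/swap code.

          #include <cstdint>

          // Assumed old-style fallback: byte-swap each 16-bit half, then
          // shift and recombine the halves.
          constexpr std::uint16_t swap16_fallback(std::uint16_t value) {
              return static_cast<std::uint16_t>((value << 8) | (value >> 8));
          }

          constexpr std::uint32_t swap32_old(std::uint32_t value) {
              const std::uint32_t lo = swap16_fallback(static_cast<std::uint16_t>(value));
              const std::uint32_t hi = swap16_fallback(static_cast<std::uint16_t>(value >> 16));
              return (lo << 16) | hi;
          }

          // Assumed new-style fallback: mask each byte and move it directly
          // to its mirrored position, a pattern compilers readily collapse
          // into a single bswap.
          constexpr std::uint32_t swap32_new(std::uint32_t value) {
              return ((value & 0xFF000000u) >> 24) | ((value & 0x00FF0000u) >> 8) |
                     ((value & 0x0000FF00u) << 8) | ((value & 0x000000FFu) << 24);
          }

          static_assert(swap32_old(0x11223344u) == 0x44332211u, "halves swapped");
          static_assert(swap32_new(0x11223344u) == 0x44332211u, "bytes swapped in place");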
      
      For the original swap32 code on x86-64, clang 8.0 would generate:
      
          mov     ecx, edi
          rol     cx, 8
          shl     ecx, 16
          shr     edi, 16
          rol     di, 8
          movzx   eax, di
          or      eax, ecx
          ret
      
      while GCC 8.3 would generate the ideal:
      
          mov     eax, edi
          bswap   eax
          ret
      
      Now both compilers generate the same optimal output.
      
      MSVC used to generate the following with the old code:
      
          mov     eax, ecx
          rol     cx, 8
          shr     eax, 16
          rol     ax, 8
          movzx   ecx, cx
          movzx   eax, ax
          shl     ecx, 16
          or      eax, ecx
          ret     0
      
      Now MSVC also generates a similar, and equally optimal, result to clang/GCC's:
      
          bswap   ecx
          mov     eax, ecx
          ret     0
      
      ====
      
      In the swap64 case, for the original code, clang 8.0 would generate:
      
          mov     eax, edi
          bswap   eax
          shl     rax, 32
          shr     rdi, 32
          bswap   edi
          or      rax, rdi
          ret
      
      (almost there, but still missing the mark)
      
      while, again, GCC 8.3 would generate the more ideal:
      
          mov     rax, rdi
          bswap   rax
          ret
      
      Now clang generates the optimal sequence for this fallback as well.
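
      For reference, the 64-bit fallback that clang and GCC now collapse into
      a single bswap presumably has the same mask-and-shift shape; this is a
      sketch assuming that form, not a quote of the actual source.

          #include <cstdint>

          // Assumed shape of the new 64-bit fallback: every byte is masked
          // and shifted straight to its mirrored position in one expression.
          constexpr std::uint64_t swap64_new(std::uint64_t value) {
              return ((value & 0xFF00000000000000ull) >> 56) |
                     ((value & 0x00FF000000000000ull) >> 40) |
                     ((value & 0x0000FF0000000000ull) >> 24) |
                     ((value & 0x000000FF00000000ull) >> 8) |
                     ((value & 0x00000000FF000000ull) << 8) |
                     ((value & 0x0000000000FF0000ull) << 24) |
                     ((value & 0x000000000000FF00ull) << 40) |
                     ((value & 0x00000000000000FFull) << 56);
          }

          static_assert(swap64_new(0x0102030405060708ull) == 0x0807060504030201ull,
                        "bytes reversed");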
      
      This is a case where MSVC unfortunately falls short: despite the new
      code, it still generates a doozy of an output.
      
          mov     r8, rcx
          mov     r9, rcx
          mov     rax, 71776119061217280
          mov     rdx, r8
          and     r9, rax
          and     edx, 65280
          mov     rax, rcx
          shr     rax, 16
          or      r9, rax
          mov     rax, rcx
          shr     r9, 16
          mov     rcx, 280375465082880
          and     rax, rcx
          mov     rcx, 1095216660480
          or      r9, rax
          mov     rax, r8
          and     rax, rcx
          shr     r9, 16
          or      r9, rax
          mov     rcx, r8
          mov     rax, r8
          shr     r9, 8
          shl     rax, 16
          and     ecx, 16711680
          or      rdx, rax
          mov     eax, -16777216
          and     rax, r8
          shl     rdx, 16
          or      rdx, rcx
          shl     rdx, 16
          or      rax, rdx
          shl     rax, 8
          or      rax, r9
          ret     0
      
      which is pretty unfortunate.
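
      Since these are only the default fallbacks, one practical mitigation on
      MSVC would be its byte-swap intrinsic instead of relying on pattern
      matching; a minimal sketch of that kind of compiler-specific override,
      assuming the surrounding header allows such a branch:

          #if defined(_MSC_VER)
          #include <cstdint>
          #include <cstdlib> // _byteswap_uint64

          // Hypothetical override; the real header's structure may differ.
          inline std::uint64_t swap64(std::uint64_t value) {
              // MSVC lowers this intrinsic to a single bswap on x86-64.
              return _byteswap_uint64(value);
          }
          #endif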