SAFE public projects git trees. - safe/jmp/linux-2.6/blob - arch/x86/lib/memcpy_64.S

   1 /* Copyright 2002 Andi Kleen */
   2
   3 #include <linux/linkage.h>
   4
   5 #include <asm/cpufeature.h>
   6 #include <asm/dwarf2.h>
   7
   8 /*
   9  * memcpy - Copy a memory block.
  10  *
  11  * Input:
  12  *  rdi destination
  13  *  rsi source
  14  *  rdx count
  15  *
  16  * Output:
  17  * rax original destination
  18  */
  19
  20 /*
  21  * memcpy_c() - fast string ops (REP MOVSQ) based variant.
  22  *
  23  * This gets patched over the unrolled variant (below) via the
  24  * alternative instructions framework:
  25  */
  26         .section .altinstr_replacement, "ax", @progbits
  27 .Lmemcpy_c:
  28         movq %rdi, %rax
  29
  30         movl %edx, %ecx
  31         shrl $3, %ecx
  32         andl $7, %edx
  33         rep movsq
  34         movl %edx, %ecx
  35         rep movsb
  36         ret
  37 .Lmemcpy_e:
  38         .previous
  39
  40 ENTRY(__memcpy)
  41 ENTRY(memcpy)
  42         CFI_STARTPROC
  43
  44         /*
  45          * Put the number of full 64-byte blocks into %ecx.
  46          * Tail portion is handled at the end:
  47          */
  48         movq %rdi, %rax
  49         movl %edx, %ecx
  50         shrl   $6, %ecx
  51         jz .Lhandle_tail
  52
  53         .p2align 4
  54 .Lloop_64:
  55         /*
  56          * We decrement the loop index here - and the zero-flag is
  57          * checked at the end of the loop (instructions inbetween do
  58          * not change the zero flag):
  59          */
  60         decl %ecx
  61
  62         /*
  63          * Move in blocks of 4x16 bytes:
  64          */
  65         movq 0*8(%rsi),         %r11
  66         movq 1*8(%rsi),         %r8
  67         movq %r11,              0*8(%rdi)
  68         movq %r8,               1*8(%rdi)
  69
  70         movq 2*8(%rsi),         %r9
  71         movq 3*8(%rsi),         %r10
  72         movq %r9,               2*8(%rdi)
  73         movq %r10,              3*8(%rdi)
  74
  75         movq 4*8(%rsi),         %r11
  76         movq 5*8(%rsi),         %r8
  77         movq %r11,              4*8(%rdi)
  78         movq %r8,               5*8(%rdi)
  79
  80         movq 6*8(%rsi),         %r9
  81         movq 7*8(%rsi),         %r10
  82         movq %r9,               6*8(%rdi)
  83         movq %r10,              7*8(%rdi)
  84
  85         leaq 64(%rsi), %rsi
  86         leaq 64(%rdi), %rdi
  87
  88         jnz  .Lloop_64
  89
  90 .Lhandle_tail:
  91         movl %edx, %ecx
  92         andl  $63, %ecx
  93         shrl   $3, %ecx
  94         jz   .Lhandle_7
  95
  96         .p2align 4
  97 .Lloop_8:
  98         decl %ecx
  99         movq (%rsi),            %r8
 100         movq %r8,               (%rdi)
 101         leaq 8(%rdi),           %rdi
 102         leaq 8(%rsi),           %rsi
 103         jnz  .Lloop_8
 104
 105 .Lhandle_7:
 106         movl %edx, %ecx
 107         andl $7, %ecx
 108         jz .Lend
 109
 110         .p2align 4
 111 .Lloop_1:
 112         movb (%rsi), %r8b
 113         movb %r8b, (%rdi)
 114         incq %rdi
 115         incq %rsi
 116         decl %ecx
 117         jnz .Lloop_1
 118
 119 .Lend:
 120         ret
 121         CFI_ENDPROC
 122 ENDPROC(memcpy)
 123 ENDPROC(__memcpy)
 124
 125         /*
 126          * Some CPUs run faster using the string copy instructions.
 127          * It is also a lot simpler. Use this when possible:
 128          */
 129
 130         .section .altinstructions, "a"
 131         .align 8
 132         .quad memcpy
 133         .quad .Lmemcpy_c
 134         .byte X86_FEATURE_REP_GOOD
 135
 136         /*
 137          * Replace only beginning, memcpy is used to apply alternatives,
 138          * so it is silly to overwrite itself with nops - reboot is the
 139          * only outcome...
 140          */
 141         .byte .Lmemcpy_e - .Lmemcpy_c
 142         .byte .Lmemcpy_e - .Lmemcpy_c
 143         .previous