.globl clear_page
.p2align 4
clear_page:
- xorl %eax,%eax
- movl $4096/64,%ecx
- .p2align 4
-.Lloop:
- decl %ecx
-#define PUT(x) movq %rax,x*8(%rdi)
- movq %rax,(%rdi)
- PUT(1)
- PUT(2)
- PUT(3)
- PUT(4)
- PUT(5)
- PUT(6)
- PUT(7)
- leaq 64(%rdi),%rdi
- jnz .Lloop
- nop
- ret
-clear_page_end:
-
- /* C stepping K8 run faster using the string instructions.
- It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
- .section .altinstructions,"a"
- .align 8
- .quad clear_page
- .quad clear_page_c
- .byte X86_FEATURE_K8_C
- .byte clear_page_end-clear_page
- .byte clear_page_c_end-clear_page_c
- .previous
-
- .section .altinstr_replacement,"ax"
-clear_page_c:
movl $4096/8,%ecx
xorl %eax,%eax
rep
stosq
ret
-clear_page_c_end:
- .previous
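
For review, a rough C sketch of what the retained rep-stosq clear_page computes: %ecx is loaded with 4096/8 and stosq stores the zeroed %rax that many times. The function name below is illustrative, not part of the patch.

	#include <stdint.h>
	#include <stddef.h>

	/* Illustrative only: 512 qword stores of zero, as rep stosq performs. */
	static void clear_page_sketch(void *page)
	{
		uint64_t *p = page;
		size_t i;

		for (i = 0; i < 4096 / 8; i++)
			p[i] = 0;
	}
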
.globl copy_page
.p2align 4
copy_page:
- subq $3*8,%rsp
- movq %rbx,(%rsp)
- movq %r12,1*8(%rsp)
- movq %r13,2*8(%rsp)
-
- movl $(4096/64)-5,%ecx
- .p2align 4
-.Loop64:
- dec %rcx
-
- movq (%rsi), %rax
- movq 8 (%rsi), %rbx
- movq 16 (%rsi), %rdx
- movq 24 (%rsi), %r8
- movq 32 (%rsi), %r9
- movq 40 (%rsi), %r10
- movq 48 (%rsi), %r11
- movq 56 (%rsi), %r12
-
- prefetcht0 5*64(%rsi)
-
- movq %rax, (%rdi)
- movq %rbx, 8 (%rdi)
- movq %rdx, 16 (%rdi)
- movq %r8, 24 (%rdi)
- movq %r9, 32 (%rdi)
- movq %r10, 40 (%rdi)
- movq %r11, 48 (%rdi)
- movq %r12, 56 (%rdi)
-
- leaq 64 (%rsi), %rsi
- leaq 64 (%rdi), %rdi
-
- jnz .Loop64
-
- movl $5,%ecx
- .p2align 4
-.Loop2:
- decl %ecx
-
- movq (%rsi), %rax
- movq 8 (%rsi), %rbx
- movq 16 (%rsi), %rdx
- movq 24 (%rsi), %r8
- movq 32 (%rsi), %r9
- movq 40 (%rsi), %r10
- movq 48 (%rsi), %r11
- movq 56 (%rsi), %r12
-
- movq %rax, (%rdi)
- movq %rbx, 8 (%rdi)
- movq %rdx, 16 (%rdi)
- movq %r8, 24 (%rdi)
- movq %r9, 32 (%rdi)
- movq %r10, 40 (%rdi)
- movq %r11, 48 (%rdi)
- movq %r12, 56 (%rdi)
-
- leaq 64(%rdi),%rdi
- leaq 64(%rsi),%rsi
-
- jnz .Loop2
-
- movq (%rsp),%rbx
- movq 1*8(%rsp),%r12
- movq 2*8(%rsp),%r13
- addq $3*8,%rsp
- ret
-
- /* C stepping K8 run faster using the string copy instructions.
- It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
- .section .altinstructions,"a"
- .align 8
- .quad copy_page
- .quad copy_page_c
- .byte X86_FEATURE_K8_C
- .byte copy_page_c_end-copy_page_c
- .byte copy_page_c_end-copy_page_c
- .previous
-
- .section .altinstr_replacement,"ax"
-copy_page_c:
movl $4096/8,%ecx
rep
movsq
ret
-copy_page_c_end:
- .previous
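
Likewise, the retained rep-movsq copy_page moves 4096/8 qwords from %rsi to %rdi. A rough C sketch of that behaviour (name illustrative, not part of the patch):

	#include <stdint.h>
	#include <stddef.h>

	/* Illustrative only: 512 qword copies, as rep movsq performs. */
	static void copy_page_sketch(void *to, const void *from)
	{
		uint64_t *d = to;
		const uint64_t *s = from;
		size_t i;

		for (i = 0; i < 4096 / 8; i++)
			d[i] = s[i];
	}
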
*
* Output:
* rax original destination
+ *
+ * TODO: check best memcpy for PSC
*/
.globl __memcpy
.p2align 4
__memcpy:
memcpy:
- pushq %rbx
- movq %rdi,%rax
-
- movl %edx,%ecx
- shrl $6,%ecx
- jz .Lhandle_tail
-
- .p2align 4
-.Lloop_64:
- decl %ecx
-
- movq (%rsi),%r11
- movq 8(%rsi),%r8
-
- movq %r11,(%rdi)
- movq %r8,1*8(%rdi)
-
- movq 2*8(%rsi),%r9
- movq 3*8(%rsi),%r10
-
- movq %r9,2*8(%rdi)
- movq %r10,3*8(%rdi)
-
- movq 4*8(%rsi),%r11
- movq 5*8(%rsi),%r8
-
- movq %r11,4*8(%rdi)
- movq %r8,5*8(%rdi)
-
- movq 6*8(%rsi),%r9
- movq 7*8(%rsi),%r10
-
- movq %r9,6*8(%rdi)
- movq %r10,7*8(%rdi)
-
- leaq 64(%rsi),%rsi
- leaq 64(%rdi),%rdi
- jnz .Lloop_64
-
-.Lhandle_tail:
- movl %edx,%ecx
- andl $63,%ecx
- shrl $3,%ecx
- jz .Lhandle_7
- .p2align 4
-.Lloop_8:
- decl %ecx
- movq (%rsi),%r8
- movq %r8,(%rdi)
- leaq 8(%rdi),%rdi
- leaq 8(%rsi),%rsi
- jnz .Lloop_8
-
-.Lhandle_7:
- movl %edx,%ecx
- andl $7,%ecx
- jz .Lende
- .p2align 4
-.Lloop_1:
- movb (%rsi),%r8b
- movb %r8b,(%rdi)
- incq %rdi
- incq %rsi
- decl %ecx
- jnz .Lloop_1
-
-.Lende:
- popq %rbx
- ret
-.Lfinal:
-
- /* C stepping K8 run faster using the string copy instructions.
- It is also a lot simpler. Use this when possible */
-
- .section .altinstructions,"a"
- .align 8
- .quad memcpy
- .quad memcpy_c
- .byte X86_FEATURE_K8_C
- .byte .Lfinal-memcpy
- .byte memcpy_c_end-memcpy_c
- .previous
-
- .section .altinstr_replacement,"ax"
- /* rdi destination
- * rsi source
- * rdx count
- */
-memcpy_c:
	movq %rdi,%rax			/* return original destination */
	movl %edx,%ecx
	shrl $3,%ecx			/* qword count */
	andl $7,%edx			/* tail byte count */
	rep
	movsq
	movl %edx,%ecx
	rep
	movsb
	ret
-memcpy_c_end:
- .previous
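
The string-instruction memcpy kept above splits the count into len/8 qwords for rep movsq and len&7 trailing bytes for rep movsb, and returns the original destination in %rax. A rough C sketch of that behaviour (names illustrative, not part of the patch):

	#include <stddef.h>
	#include <string.h>

	/* Illustrative only: qword copies then a byte tail, returning dst. */
	static void *memcpy_sketch(void *dst, const void *src, size_t len)
	{
		unsigned char *d = dst;
		const unsigned char *s = src;
		size_t qwords = len >> 3;	/* shrl $3,%ecx */
		size_t tail = len & 7;		/* andl $7,%edx */

		while (qwords--) {
			memcpy(d, s, 8);	/* one movsq step */
			d += 8;
			s += 8;
		}
		while (tail--)
			*d++ = *s++;		/* movsb steps */
		return dst;			/* asm keeps dst in %rax */
	}
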
.p2align 4
memset:
__memset:
- movq %rdi,%r10
- movq %rdx,%r11
-
- /* expand byte value */
- movzbl %sil,%ecx
- movabs $0x0101010101010101,%rax
- mul %rcx /* with rax, clobbers rdx */
-
- /* align dst */
- movl %edi,%r9d
- andl $7,%r9d
- jnz .Lbad_alignment
-.Lafter_bad_alignment:
-
- movl %r11d,%ecx
- shrl $6,%ecx
- jz .Lhandle_tail
-
- .p2align 4
-.Lloop_64:
- decl %ecx
- movq %rax,(%rdi)
- movq %rax,8(%rdi)
- movq %rax,16(%rdi)
- movq %rax,24(%rdi)
- movq %rax,32(%rdi)
- movq %rax,40(%rdi)
- movq %rax,48(%rdi)
- movq %rax,56(%rdi)
- leaq 64(%rdi),%rdi
- jnz .Lloop_64
-
- /* Handle tail in loops. The loops should be faster than hard
- to predict jump tables. */
- .p2align 4
-.Lhandle_tail:
- movl %r11d,%ecx
- andl $63&(~7),%ecx
- jz .Lhandle_7
- shrl $3,%ecx
- .p2align 4
-.Lloop_8:
- decl %ecx
- movq %rax,(%rdi)
- leaq 8(%rdi),%rdi
- jnz .Lloop_8
-
-.Lhandle_7:
- movl %r11d,%ecx
- andl $7,%ecx
- jz .Lende
- .p2align 4
-.Lloop_1:
- decl %ecx
- movb %al,(%rdi)
- leaq 1(%rdi),%rdi
- jnz .Lloop_1
-
-.Lende:
- movq %r10,%rax
- ret
-
-.Lbad_alignment:
- cmpq $7,%r11
- jbe .Lhandle_7
- movq %rax,(%rdi) /* unaligned store */
- movq $8,%r8
- subq %r9,%r8
- addq %r8,%rdi
- subq %r8,%r11
- jmp .Lafter_bad_alignment
-
- /* C stepping K8 run faster using the string instructions.
- It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
- .section .altinstructions,"a"
- .align 8
- .quad memset
- .quad memset_c
- .byte X86_FEATURE_K8_C
- .byte memset_c_end-memset_c
- .byte memset_c_end-memset_c
- .previous
-
- .section .altinstr_replacement,"ax"
- /* rdi destination
- * rsi value
- * rdx count
- */
-memset_c:
	movq %rdi,%r9			/* save destination for return */
	movl %edx,%r8d
	andl $7,%r8d			/* tail byte count */
	movl %edx,%ecx
	shrl $3,%ecx			/* qword count */
	movzbl %sil,%esi		/* expand byte value */
	movabs $0x0101010101010101,%rax
	mulq %rsi			/* rax = fill byte replicated; clobbers rdx */
	rep stosq
	movl %r8d,%ecx
	rep stosb
	movq %r9,%rax			/* return original destination */
	ret
-memset_c_end:
- .previous
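
The string-instruction memset kept above replicates the fill byte across a qword by multiplying with 0x0101010101010101, stores len/8 qwords with rep stosq and the remaining len&7 bytes with rep stosb, and returns the original destination. A rough C sketch (name illustrative, not part of the patch):

	#include <stdint.h>
	#include <stddef.h>
	#include <string.h>

	/* Illustrative only: qword fill then a byte tail, returning dst. */
	static void *memset_sketch(void *dst, int c, size_t len)
	{
		uint64_t pattern = (uint8_t)c * 0x0101010101010101ULL;
		unsigned char *d = dst;
		size_t qwords = len >> 3;
		size_t tail = len & 7;

		while (qwords--) {
			memcpy(d, &pattern, 8);	/* one stosq step */
			d += 8;
		}
		while (tail--)
			*d++ = (unsigned char)c;	/* stosb steps */
		return dst;			/* asm saves dst in %r9 */
	}
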