CRIS v32: Update lib/checksum.S and lib/checksumcopy.S

author Jesper Nilsson <jesper.nilsson@axis.com>

Fri, 25 Jan 2008 16:54:14 +0000 (17:54 +0100)

committer Jesper Nilsson <jesper.nilsson@axis.com>

Fri, 8 Feb 2008 10:06:35 +0000 (11:06 +0100)
author Jesper Nilsson <jesper.nilsson@axis.com>
Fri, 25 Jan 2008 16:54:14 +0000 (17:54 +0100)
committer Jesper Nilsson <jesper.nilsson@axis.com>
Fri, 8 Feb 2008 10:06:35 +0000 (11:06 +0100)
diff --git a/arch/cris/arch-v32/lib/checksum.S b/arch/cris/arch-v32/lib/checksum.S

index 32e66181b826510103ef704267a0e665292669d2..87f3fd71ab10514c342b4dc9a971ceb67ea1349d 100644 (file)
--- a/arch/cris/arch-v32/lib/checksum.S
+++ b/arch/cris/arch-v32/lib/checksum.S
@@ -1,6 +1,6 @@
  /*
   * A fast checksum routine using movem
- * Copyright (c) 1998-2001, 2003 Axis Communications AB
+ * Copyright (c) 1998-2007 Axis Communications AB
   *
   * csum_partial(const unsigned char * buff, int len, unsigned int sum)
   */
@@ -12,30 +12,23 @@ csum_partial:
         ;; r11 - length
         ;; r12 - checksum
  
-       ;; check for breakeven length between movem and normal word looping versions
-       ;; we also do _NOT_ want to compute a checksum over more than the
-       ;; actual length when length < 40
-
-       cmpu.w  80,$r11
-       blo     _word_loop
-       nop
-
-       ;; need to save the registers we use below in the movem loop
-       ;; this overhead is why we have a check above for breakeven length
-       ;; only r0 - r8 have to be saved, the other ones are clobber-able
-       ;; according to the ABI
+       ;; Optimized for large packets
+       subq    10*4, $r11
+       blt     _word_loop
+       move.d  $r11, $acr
  
         subq    9*4,$sp
-       subq    10*4,$r11       ; update length for the first loop
+       clearf  c
         movem   $r8,[$sp]
  
         ;; do a movem checksum
  
  _mloop:        movem   [$r10+],$r9     ; read 10 longwords
-
+       ;; Loop count without touching the c flag.
+       addoq   -10*4, $acr, $acr
         ;; perform dword checksumming on the 10 longwords
  
-       add.d   $r0,$r12
+       addc    $r0,$r12
         addc    $r1,$r12
         addc    $r2,$r12
         addc    $r3,$r12
@@ -46,60 +39,41 @@ _mloop:     movem   [$r10+],$r9     ; read 10 longwords
         addc    $r8,$r12
         addc    $r9,$r12
  
-       ;; fold the carry into the checksum, to avoid having to loop the carry
-       ;; back into the top
-
-       addc    0,$r12
-       addc    0,$r12          ; do it again, since we might have generated a carry
-
-       subq    10*4,$r11
-       bge     _mloop
-       nop
-
-       addq    10*4,$r11       ; compensate for last loop underflowing length
+       ;; test $acr without trashing carry.
+       move.d  $acr, $acr
+       bpl     _mloop
+       ;; Copying acr to r11 is not really needed inside the mloop; the branch
+       ;; delay slot is just used to prepare r11 for the code after the mloop.
+       move.d  $acr, $r11
  
+       ;; fold the last carry into r12
+       addc    0, $r12
         movem   [$sp+],$r8      ; restore regs
  
  _word_loop:
-       ;; only fold if there is anything to fold.
-
-       cmpq    0,$r12
-       beq     _no_fold
-
-       ;; fold 32-bit checksum into a 16-bit checksum, to avoid carries below.
-       ;; r9 and r13 can be used as temporaries.
+       addq    10*4,$r11       ; compensate for last loop underflowing length
  
         moveq   -1,$r9          ; put 0xffff in r9, faster than move.d 0xffff,r9
         lsrq    16,$r9
  
         move.d  $r12,$r13
         lsrq    16,$r13         ; r13 = checksum >> 16
-       and.d   $r9,$r12                ; checksum = checksum & 0xffff
-       add.d   $r13,$r12               ; checksum += r13
-       move.d  $r12,$r13               ; do the same again, maybe we got a carry last add
-       lsrq    16,$r13
-       and.d   $r9,$r12
-       add.d   $r13,$r12
+       and.d   $r9,$r12        ; checksum = checksum & 0xffff
  
  _no_fold:
-       cmpq    2,$r11
+       subq    2,$r11
         blt     _no_words
-       nop
+       add.d   $r13,$r12       ; checksum += r13
  
         ;; checksum the rest of the words
-
-       subq    2,$r11
-
  _wloop:        subq    2,$r11
         bge     _wloop
         addu.w  [$r10+],$r12
  
-       addq    2,$r11
-
  _no_words:
+       addq    2,$r11
         ;; see if we have one odd byte more
-       cmpq    1,$r11
-       beq     _do_byte
+       bne     _do_byte
         nop
         ret
         move.d  $r12,$r10
diff --git a/arch/cris/arch-v32/lib/checksumcopy.S b/arch/cris/arch-v32/lib/checksumcopy.S

index 9303ccbadc6d91af82a03a84ae53eedbc37829ad..21aabe91489bf4d35c33d11d11c8d79f713d93cd 100644 (file)
--- a/arch/cris/arch-v32/lib/checksumcopy.S
+++ b/arch/cris/arch-v32/lib/checksumcopy.S
@@ -1,6 +1,6 @@
  /*
   * A fast checksum+copy routine using movem
- * Copyright (c) 1998, 2001, 2003 Axis Communications AB
+ * Copyright (c) 1998-2007 Axis Communications AB
   *
   * Authors:    Bjorn Wesen
   *
@@ -16,32 +16,23 @@ csum_partial_copy_nocheck:
         ;; r12 - length
         ;; r13 - checksum
  
-       ;; check for breakeven length between movem and normal word looping versions
-       ;; we also do _NOT_ want to compute a checksum over more than the
-       ;; actual length when length < 40
-
-       cmpu.w  80,$r12
-       blo     _word_loop
-       nop
-
-       ;; need to save the registers we use below in the movem loop
-       ;; this overhead is why we have a check above for breakeven length
-       ;; only r0 - r8 have to be saved, the other ones are clobber-able
-       ;; according to the ABI
+       ;; Optimized for large packets
+       subq    10*4, $r12
+       blt     _word_loop
+       move.d  $r12, $acr
  
         subq    9*4,$sp
-       subq    10*4,$r12       ; update length for the first loop
+       clearf  c
         movem   $r8,[$sp]
  
         ;; do a movem copy and checksum
-
  1:     ;; A failing userspace access (the read) will have this as PC.
  _mloop:        movem   [$r10+],$r9     ; read 10 longwords
+       addoq   -10*4, $acr, $acr ; loop counter in latency cycle
         movem   $r9,[$r11+]     ; write 10 longwords
  
         ;; perform dword checksumming on the 10 longwords
-
-       add.d   $r0,$r13
+       addc    $r0,$r13
         addc    $r1,$r13
         addc    $r2,$r13
         addc    $r3,$r13
@@ -52,47 +43,30 @@ _mloop:     movem   [$r10+],$r9     ; read 10 longwords
         addc    $r8,$r13
         addc    $r9,$r13
  
-       ;; fold the carry into the checksum, to avoid having to loop the carry
-       ;; back into the top
-
-       addc    0,$r13
-       addc    0,$r13          ; do it again, since we might have generated a carry
-
-       subq    10*4,$r12
-       bge     _mloop
-       nop
-
-       addq    10*4,$r12       ; compensate for last loop underflowing length
+       ;; test $acr, without trashing carry.
+       move.d  $acr, $acr
+       bpl     _mloop
+       ;; r12 <= acr  is needed after mloop and in the exception handlers.
+       move.d  $acr, $r12
  
+       ;; fold the last carry into r13
+       addc    0, $r13
         movem   [$sp+],$r8      ; restore regs
  
  _word_loop:
-       ;; only fold if there is anything to fold.
-
-       cmpq    0,$r13
-       beq     _no_fold
+       addq    10*4,$r12       ; compensate for last loop underflowing length
  
         ;; fold 32-bit checksum into a 16-bit checksum, to avoid carries below
         ;; r9 can be used as temporary.
-
         move.d  $r13,$r9
         lsrq    16,$r9          ; r0 = checksum >> 16
         and.d   0xffff,$r13     ; checksum = checksum & 0xffff
-       add.d   $r9,$r13        ; checksum += r0
-       move.d  $r13,$r9        ; do the same again, maybe we got a carry last add
-       lsrq    16,$r9
-       and.d   0xffff,$r13
-       add.d   $r9,$r13
  
-_no_fold:
-       cmpq    2,$r12
+       subq    2, $r12
         blt     _no_words
-       nop
+       add.d   $r9,$r13        ; checksum += r9
  
         ;; copy and checksum the rest of the words
-
-       subq    2,$r12
-
  2:     ;; A failing userspace access for the read below will have this as PC.
  _wloop:        move.w  [$r10+],$r9
         addu.w  $r9,$r13
@@ -100,12 +74,9 @@ _wloop:     move.w  [$r10+],$r9
         bge     _wloop
         move.w  $r9,[$r11+]
  
-       addq    2,$r12
-
  _no_words:
-       ;; see if we have one odd byte more
-       cmpq    1,$r12
-       beq     _do_byte
+       addq    2,$r12
+       bne     _do_byte
         nop
         ret
         move.d  $r13,$r10
author	Jesper Nilsson <jesper.nilsson@axis.com>
	Fri, 25 Jan 2008 16:54:14 +0000 (17:54 +0100)
committer	Jesper Nilsson <jesper.nilsson@axis.com>
	Fri, 8 Feb 2008 10:06:35 +0000 (11:06 +0100)
arch/cris/arch-v32/lib/checksum.S		patch \| blob \| history
arch/cris/arch-v32/lib/checksumcopy.S		patch \| blob \| history