bcopy.s

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*
    2  * Copyright (c) 2002-2004 Apple Computer, Inc. All rights reserved.
    3  *
    4  * @APPLE_LICENSE_HEADER_START@
    5  * 
    6  * The contents of this file constitute Original Code as defined in and
    7  * are subject to the Apple Public Source License Version 1.1 (the
    8  * "License").  You may not use this file except in compliance with the
    9  * License.  Please obtain a copy of the License at
   10  * http://www.apple.com/publicsource and read it before using this file.
   11  * 
   12  * This Original Code and all software distributed under the License are
   13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
   14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
   15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
   16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
   17  * License for the specific language governing rights and limitations
   18  * under the License.
   19  * 
   20  * @APPLE_LICENSE_HEADER_END@
   21  */
   22 ;
   23 ;                       Copy bytes of data around. Handles overlapped data.
   24 ;
   25 ;
   26 #include <ppc/asm.h>
   27 #include <ppc/proc_reg.h>
   28 #include <assym.s>
   29 
   30 ;       These routines use CR5 for certain flags:
   31 ;               CR5_lt (CR bit 20) is the non-cached flag: set by bcopy_nc, cleared by bcopy and memcpy
   32 #define noncache 20
   33 
   34 
   35 ;       The bcopy_phys variants use a stack frame so they can call bcopy as a subroutine.
   36 #define BCOPY_SF_SIZE   32      // total size
   37 #define BCOPY_SF_MSR    16      // we save caller's MSR here (possibly minus VEC and FP)
   38 
   39 
   40 #define kShort  32              // short operands are special cased
   41 
   42 
   43 ; void bcopy_physvir_32(from, to, nbytes)
   44 ;
   45 ; Attempt to copy physically addressed memory with translation on if conditions are met.
   46 ; Otherwise do a normal bcopy_phys.  This routine is used because some 32-bit processors 
   47 ; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
   48 ; for the passed phys addrs and do the copy with translation on.  
   49 ;
   50 ; Rules are: - neither source nor destination can cross a page. 
   51 ;            - Interrupts must be disabled when this routine is called.
   52 ;            - Translation must be on when called.
   53 ;
   54 ; To do the copy, we build a 128KB DBAT for both the source and sink.  If both are the same, only one
   55 ; is loaded.  We do not touch the IBATs, so there is no issue if either physical page
   56 ; address is the same as the virtual address of the instructions we are executing.
   57 ;
   58 ; At the end, we invalidate the used DBATs.
   59 ;
   60 ; Note that the address parameters are long longs.  We will transform these to 64-bit
   61 ; values.  Note that on 32-bit architectures this will ignore the high half of the
   62 ; passed in value.  This should be ok since we cannot have addresses bigger than 32 bits
   63 ; there anyhow.
   64 ;
   65 ; Note also that this routine is used only on 32-bit machines. If you're contemplating use
   66 ; on a 64-bit processor, use the physical memory window instead; please refer to copypv()
   67 ; for an example of how this is done.
   68 
   69                         .align  5
   70                         .globl  EXT(bcopy_physvir_32)
   71 
   72 LEXT(bcopy_physvir_32)
   73             mflr    r0                          ; get return address
   74             rlwinm      r3,r3,0,1,0                                     ; Duplicate high half of long long source paddr into top of reg
   75             mfsprg      r8,2                                            ; get processor feature flags
   76             stw     r0,8(r1)                    ; save return address (before the frame is pushed)
   77                         rlwimi  r3,r4,0,0,31                            ; Combine bottom of long long to full 64-bits (r3 = source)
   78             stwu    r1,-BCOPY_SF_SIZE(r1)       ; push on a stack frame so we can call bcopy
   79             mtcrf       0x02,r8                                         ; move pf64Bit to cr6 so we can test
   80             subi    r0,r7,1                     ; get length - 1
   81                         rlwinm  r4,r5,0,1,0                                     ; Duplicate high half of long long sink paddr into top of reg
   82                         add             r11,r3,r0                                       ; Point to last byte of source (r3 = from)
   83                         mr              r5,r7                                           ; Get the length into the right register
   84             rlwimi      r4,r6,0,0,31                            ; Combine bottom of long long to full 64-bits (r4 = sink)
   85 
   86 ; This test for page overflow may not work if the length is negative.  Negative lengths are invalid input
   87 ; to bcopy_physvir() on 32-bit machines, and will result in a panic.
   88             
   89                         add             r12,r4,r0                                       ; Point to last byte of sink (r4 = to)
   90                         xor             r7,r11,r3                                       ; Address bits that differ between first and last source byte
   91                         xor             r8,r12,r4                                       ; Address bits that differ between first and last sink byte
   92                         or              r0,r7,r8                                        ; Combine source and sink differences
   93                         
   94 //                      li              r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes
   95                         li              r9,((2<<3)|2)                           ; Set default attributes
   96                         rlwinm. r0,r0,0,0,19                            ; Any difference above the low 12 bits means an operand crossed a page
   97                         li              r7,2                                            ; Set validity flags
   98                         li              r8,2                                            ; Set validity flags
   99                         bne-    bcopy_phys1                                     ; Overflowed page, do normal physical copy...
  100 
  101                         rlwimi  r11,r9,0,15,31                          ; Set source lower DBAT value (attributes into bits 15-31)
  102                         rlwimi  r12,r9,0,15,31                          ; Set sink lower DBAT value (attributes into bits 15-31)
  103                         rlwimi  r7,r11,0,0,14                           ; Set source upper DBAT value (block address into bits 0-14)
  104                         rlwimi  r8,r12,0,0,14                           ; Set sink upper DBAT value (block address into bits 0-14)
  105                         cmplw   cr1,r11,r12                                     ; See if sink and source are same block
  106                         
  107                         sync
  108 
  109                         mtdbatl 0,r11                                           ; Set source lower DBAT (DBAT 0)
  110                         mtdbatu 0,r7                                            ; Set source upper DBAT (DBAT 0)
  111 
  112                         beq-    cr1,bcpvsame                            ; Source and sink are in same block
  113 
  114                         mtdbatl 1,r12                                           ; Set sink lower DBAT (DBAT 1)
  115                         mtdbatu 1,r8                                            ; Set sink upper DBAT (DBAT 1)
  116             
  117 bcpvsame:       
  118             sync                                ; wait for the BATs to stabilize
  119             isync
  120             
  121             bl      EXT(bcopy)                  ; BATs set up, args in r3-r5, so do the copy with DR on
  122 
  123             li          r0,0                                            ; Get set to invalidate upper half of BATs
  124                         sync                                                            ; Make sure all is well
  125                         mtdbatu 0,r0                                            ; Clear source upper DBAT (DBAT 0)
  126                         mtdbatu 1,r0                                            ; Clear sink upper DBAT (DBAT 1), cleared even if it was not loaded above
  127                         sync
  128                         isync                   
  129             
  130             lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address
  131             addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
  132             mtlr    r0
  133             blr
  134 
  135 
  136 ; void bcopy_phys(from, to, nbytes)
  137 ;
  138 ; Turns off data translation before the copy.  This one will not work in user state.
  139 ; This routine is used on 32 and 64-bit machines.
  140 ;
  141 ; Note that the address parameters are long longs.  We will transform these to 64-bit
  142 ; values.  Note that on 32-bit architectures this will ignore the high half of the
  143 ; passed in value.  This should be ok since we cannot have addresses bigger than 32 bits
  144 ; there anyhow.
  145 ;
  146 ; Also note that you probably will not be happy if either the sink or source spans across the
  147 ; boundary between RAM and I/O space.  Good chance of hanging the machine and this code 
  148 ; will not check, so be careful.
  149 ;
  150 ; NOTE: when called, translation must be on, and we must be in 32-bit mode.
  151 ;       Interrupts may or may not be disabled.
  152 
  153                         .align  5
  154                         .globl  EXT(bcopy_phys)
  155 
  156 LEXT(bcopy_phys)
  157             mflr    r0                          ; get return address
  158             rlwinm      r3,r3,0,1,0                                     ; Duplicate high half of long long source paddr into top of reg
  159             stw     r0,8(r1)                    ; save return address (before the frame is pushed)
  160             mfsprg      r8,2                                            ; get processor feature flags
  161             stwu    r1,-BCOPY_SF_SIZE(r1)       ; push on a stack frame so we can call bcopy
  162                         rlwimi  r3,r4,0,0,31                            ; Combine bottom of long long to full 64-bits (r3 = source)
  163                         rlwinm  r4,r5,0,1,0                                     ; Duplicate high half of long long sink paddr into top of reg
  164                         mtcrf   0x02,r8                                         ; move pf64Bit to cr6 so we can test
  165                         rlwimi  r4,r6,0,0,31                            ; Combine bottom of long long to full 64-bits (r4 = sink)
  166                         mr              r5,r7                                           ; Get the length into the right register
  167 
  168 bcopy_phys1:                                                                    ; enter from bcopy_physvir_32 (frame already pushed) with pf64Bit in cr6 and parms in r3-r5
  169                         mfmsr   r9                                                      ; Get the MSR
  170                         lis             r6,hi16(MASK(MSR_VEC))          ; Get vector enable            
  171             ori     r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR))       ; Add in FP and DR
  172             andc    r9,r9,r6                    ; unconditionally turn DR, VEC, and FP off
  173             bt++        pf64Bitb,bcopy_phys64           ; skip if 64-bit (only they take hint)
  174 
  175 ; 32-bit CPUs
  176 
  177                         mtmsr   r9                                                      ; turn DR, FP, and VEC off
  178                         isync                                                           ; Wait for it
  179                         
  180             bl      EXT(bcopy)                  ; do the copy with translation off and caching on
  181             
  182                         mfmsr   r9                                                      ; Get the MSR
  183             ori     r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on (but leave VEC and FP off)
  184             mtmsr   r9                          ; restore msr
  185             isync                               ; wait for it to happen
  186             lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address once translation is back on
  187             mtlr    r0
  188             addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
  189             blr
  190 
  191             
  192 ; 64-bit: turn DR off and SF on.
  193 
  194 bcopy_phys64:                                                                   ; r9 = MSR with DR, VEC, and FP off
  195             ori     r8,r9,lo16(MASK(MSR_DR))    ; make a copy with DR back on... this is what we return to caller
  196                         srdi    r2,r3,31                                        ; r2 = source >> 31: a 1 if source is in I/O memory
  197             li          r0,1                                            ; Note - we use this in a couple places below
  198                         srdi    r10,r4,31                                       ; r10 = sink >> 31: a 1 if sink is in I/O memory
  199             std     r8,BCOPY_SF_MSR(r1)         ; save caller's MSR so we remember whether EE was on
  200             rldimi      r9,r0,63,MSR_SF_BIT                     ; set SF on in MSR we will copy with
  201                         cmpldi  cr0,r2,1                                        ; Is source in I/O memory?
  202                         cmpldi  cr7,r10,1                                       ; Is sink in I/O memory?
  203             mtmsrd      r9                                                      ; turn 64-bit addressing on, data translation off
  204             isync                                                               ; wait for it to happen
  205                         cror    cr7_eq,cr0_eq,cr7_eq            ; See if either source or sink is in I/O area
  206             beq--   cr7,io_space_real_mode_copy ; an operand is in I/O space
  207             
  208             bl      EXT(bcopy)                  ; do copy with DR off and SF on, cache enabled
  209                         
  210 bcopy_phys64x:
  211                         mfmsr   r9                                                      ; Get the MSR we used to copy
  212             rldicl      r9,r9,0,MSR_SF_BIT+1            ; clear SF
  213             ori     r9,r9,lo16(MASK(MSR_DR))    ; turn translation back on
  214             mtmsrd  r9                          ; turn 64-bit mode off, translation back on
  215             isync                                                               ; wait for it to happen
  216             lwz     r0,BCOPY_SF_SIZE+8(r1)      ; get return address once translation is back on
  217             ld      r8,BCOPY_SF_MSR(r1)         ; get caller's MSR once translation is back on
  218             mtlr    r0
  219             mtmsrd  r8,1                        ; turn EE back on if necessary
  220             addi    r1,r1,BCOPY_SF_SIZE         ; pop off stack frame
  221             blr
  222 
  223 ;   We need to copy with DR off, but one of the operands is in I/O space.  To avoid wedging U3,
  224 ;   which cannot handle a cache burst in I/O space, we must turn caching off for the real memory access.
  225 ;   This can only be done by setting bits in HID4.  We cannot lose control and execute random code in
  226 ;   this state, so we have to disable interrupts as well.  This is an unpleasant hack.
  227 
  228 io_space_real_mode_copy:                        ; r0=1, r9=MSR we want to copy with
  229                         sldi    r11,r0,31-MSR_EE_BIT            ; Get a mask for the EE bit
  230                         sldi    r0,r0,32+8                                      ; Get the right bit to turn off caching
  231                         andc    r9,r9,r11                                       ; Turn off EE bit
  232                         mfspr   r2,hid4                                         ; Get HID4
  233                         mtmsrd  r9,1                        ; Force off EE
  234                         or              r2,r2,r0                                        ; Set bit to make real accesses cache-inhibited
  235                         sync                                                            ; Sync up
  236                         mtspr   hid4,r2                                         ; Make real accesses cache-inhibited
  237                         isync                                                           ; Toss prefetches
  238 
  239                         lis             r12,0xE000                                      ; Get the unlikeliest ESID possible
  240                         srdi    r12,r12,1                                       ; Make 0x7FFFFFFFF0000000
  241                         slbie   r12                                                     ; Make sure the ERAT is cleared 
  242                         
  243                         sync
  244                         isync
  245                         
  246             bl      EXT(bcopy_nc)               ; copy with SF on and EE, DR, VEC, and FP off, cache inhibited
  247             
  248                         li              r0,1                                            ; Get a 1
  249                         sldi    r0,r0,32+8                                      ; Get the same HID4 bit we set before the copy
  250                         mfspr   r2,hid4                                         ; Get HID4
  251                         andc    r2,r2,r0                                        ; Clear the bit that made real accesses cache-inhibited
  252                         sync                                                            ; Sync up
  253                         mtspr   hid4,r2                                         ; Make real accesses not cache-inhibited
  254                         isync                                                           ; Toss prefetches
  255         
  256                         lis             r12,0xE000                                      ; Get the unlikeliest ESID possible
  257                         srdi    r12,r12,1                                       ; Make 0x7FFFFFFFF0000000
  258                         slbie   r12                                                     ; Make sure the ERAT is cleared
  259             b       bcopy_phys64x               ; rejoin the common exit: restore MSR, pop frame, return
  260 
  261 
  262 ;
  263 ; shortcopy
  264 ;
  265 ; Special case short operands (<32 bytes), which are very common.  Note that the check for
  266 ; reverse vs normal moves isn't quite correct in 64-bit mode; in rare cases we will move in
  267 ; reverse when it wasn't necessary to do so.  This is OK, since performance of the two cases
  268 ; is similar.  We do get the direction right when it counts (ie, when the operands overlap.)
  269 ; Also note that we use the G3/G4 "backend" code, even on G5.  This is OK too, since G5 has
  270 ; plenty of load/store dispatch bandwidth in this case, the extra ops are hidden by latency,
  271 ; and using word instead of doubleword moves reduces the possibility of unaligned accesses,
  272 ; which cost about 20 cycles if they cross a 32-byte boundary on G5.  Finally, because we
  273 ; might do unaligned accesses this code cannot be called from bcopy_nc().
  274 ;           r4 = destination
  275 ;           r5 = length (<32)
  276 ;           r6 = source
  277 ;           r12 = (dest - source)
  278 
  279             .align  5
  280 shortcopy:
  281             cmplw   r12,r5                      ; must move reverse if (dest-source)<length (unsigned compare)
  282             mtcrf   2,r5                        ; move length to cr6 and cr7 one at a time...
  283             mtcrf   1,r5                        ; ...which is faster on G4 and G5
  284             bge++   backend                     ; handle forward moves (most common case)
  285             add     r6,r6,r5                    ; point one past end of operands in reverse moves
  286             add     r4,r4,r5                    ; same for the destination
  287             b       bbackend                    ; handle reverse moves
  288             
  289 ;       
  290 ; void bcopy(from, to, nbytes)
  291 ;
  292 ; NOTE: bcopy is called from copyin and copyout etc with the "thread_recover" ptr set.
  293 ; This means bcopy must not set up a stack frame or touch non-volatile registers, and also means that it
  294 ; cannot rely on turning off interrupts, because we expect to get DSIs and have execution aborted by a "longjmp"
  295 ; to the thread_recover routine.  What this means is that it would be hard to use vector or floating point
  296 ; registers to accelerate the copy.
  297 ;
  298 ; NOTE: this code can be called in any of three "modes":
  299 ;       - on 32-bit processors (32-byte cache line)
  300 ;       - on 64-bit processors running in 32-bit mode (128-byte cache line)
  301 ;       - on 64-bit processors running in 64-bit mode (128-byte cache line)
  302 
  303                         .align  5
  304                         .globl  EXT(bcopy)
  305             .globl  EXT(bcopy_nop_if_32bit)
  306 
  307 LEXT(bcopy)
  308                         cmplwi  cr1,r5,kShort               ; less than 32 bytes?
  309             sub.    r12,r4,r3                                   ; test for to==from in mode-independent way, start fwd/rev check
  310                         mr              r6,r3                                           ; Set source (must preserve r3 for memcopy return)
  311                         blt     cr1,shortcopy               ; special case short operands
  312                         crclr   noncache                                        ; Clear the noncache flag: operands are cached
  313 LEXT(bcopy_nop_if_32bit)
  314             bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
  315                         bne+    copyit32                                        ; handle 32-bit processor
  316             blr                                 ; to==from so nothing to do
  317         
  318 ;
  319 ; bcopy_nc(from, to, nbytes)
  320 ;
  321 ; bcopy_nc() operates on non-cached memory so we can not use any kind of cache instructions.
  322 ; Furthermore, we must avoid all unaligned accesses on 64-bit machines, since they take
  323 ; alignment exceptions.  Thus we cannot use "shortcopy", which could do unaligned lwz/stw.
  324 ; Like bcopy(), bcopy_nc() can be called both in 32- and 64-bit mode.
  325 
  326                         .align  5
  327                         .globl  EXT(bcopy_nc)
  328             .globl  EXT(bcopy_nc_nop_if_32bit)
  329 
  330 LEXT(bcopy_nc)
  331                         cmpwi   cr1,r5,0                                        ; Check if we have a 0 length
  332             sub.        r12,r4,r3                                       ; test for to==from in mode-independent way, start fwd/rev check
  333                         mr              r6,r3                                           ; Set source (must preserve r3 for memcopy return)
  334                         crset   noncache                                        ; Set the noncache flag: operands are non-cached
  335                         cror    cr0_eq,cr1_eq,cr0_eq        ; set cr0 beq if either length zero or to==from
  336 LEXT(bcopy_nc_nop_if_32bit)
  337             bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
  338                         bne+    copyit32                                        ; handle 32-bit processor
  339             blr                                 ; either zero length or to==from
  340 
  341 ;
  342 ; void* memcpy(to, from, nbytes)
  343 ; void* memmove(to, from, nbytes)
  344 ;
  345 ; memcpy() and memmove() are only called in 32-bit mode, albeit on both 32- and 64-bit processors.
  346 ; However, they would work correctly if called in 64-bit mode.
  347 
  348                         .align  5
  349                         .globl  EXT(memcpy)
  350                         .globl  EXT(memmove)
  351             .globl  EXT(memcpy_nop_if_32bit)
  352 
  353 LEXT(memcpy)
  354 LEXT(memmove)
  355                         cmplwi  cr1,r5,kShort               ; less than 32 bytes?
  356             sub.    r12,r3,r4                                   ; test for to==from in mode-independent way, start fwd/rev check
  357                         mr              r6,r4                                           ; Set source
  358                         mr              r4,r3                                           ; Set the "to" (must preserve r3 for return value)
  359                         blt     cr1,shortcopy               ; special case short operands
  360                         crclr   noncache                                        ; Clear the noncache flag: operands are cached
  361 LEXT(memcpy_nop_if_32bit)
  362             bne++   copyit64                    ; handle 64-bit processor (patched to NOP if 32-bit processor)
  363                         beqlr-                              ; exit if to==from, otherwise drop down into copyit32
  364 
  365 
  366 ;       Here to copy on 32-bit processors.
  367 ;
  368 ;                       When we move the memory, forward overlays must be handled.  We
  369 ;                       also can not use the cache instructions if we are from bcopy_nc.
  370 ;                       We need to preserve R3 because it needs to be returned for memcpy.
  371 ;                       We can be interrupted and lose control here.
  372 ;
  373 ;           When entered:
  374 ;               r4 = destination
  375 ;               r5 = length (>0)
  376 ;               r6 = source
  377 ;               r12 = (dest - source)
  378 ;               cr5 = noncache flag
  379 
  380 copyit32:                                       ; WARNING! can drop down to this label
  381             cmplw   cr1,r12,r5                  ; must move reverse if (dest-source)<length
  382             cntlzw  r11,r5                      ; get magnitude of length
  383             dcbt    0,r6                        ; start to touch in source
  384             lis     r10,hi16(0x80000000)        ; get 0x80000000
  385             neg     r9,r4                       ; start to get alignment for destination
  386             dcbtst  0,r4                        ; start to touch in destination
  387             sraw    r8,r10,r11                  ; get mask based on operand length, to limit alignment
  388             blt-    cr1,reverse32bit            ; reverse move required, else fall into forward32bit
  389                         
  390 ; Forward moves on 32-bit machines, also word aligned uncached ops on 64-bit machines.
  391 ; NOTE: we never do an unaligned access if the source and destination are "relatively"
  392 ; word aligned.  We depend on this in the uncached case on 64-bit processors.
  393 ;               r4 = destination
  394 ;               r5 = length (>0)
  395 ;               r6 = source
  396 ;               r8 = inverse of largest mask smaller than operand length
  397 ;               r9 = neg(dest), used to compute alignment
  398 ;               cr5 = noncache flag
  399 
  400 forward32bit:                                   ; enter from 64-bit CPUs with word aligned uncached operands
  401                         rlwinm  r7,r9,0,0x1F                            ; get bytes to 32-byte-align destination
  402                         andc.   r0,r7,r8                                        ; limit to the maximum front end move
  403             mtcrf   0x01,r0                     ; move length to cr6 and cr7 one cr at a time...
  404                         beq             alline                                          ; Already on a line...
  405                         
  406                         mtcrf   0x02,r0                                         ; ...since moving more than one is slower on G4 and G5
  407                         sub             r5,r5,r0                                        ; Set the length left to move
  408 
  409                         bf              31,alhalf                                       ; No single byte to do...
  410                         lbz             r7,0(r6)                                        ; Get the byte
  411                         addi    r6,r6,1                                         ; Point to the next
  412                         stb             r7,0(r4)                                        ; Save the single
  413                         addi    r4,r4,1                                         ; Bump sink
  414                         
  415 ;                       Sink is halfword aligned here
  416 
  417 alhalf:         bf              30,alword                                       ; No halfword to do...
  418                         lhz             r7,0(r6)                                        ; Get the halfword
  419                         addi    r6,r6,2                                         ; Point to the next
  420                         sth             r7,0(r4)                                        ; Save the halfword
  421                         addi    r4,r4,2                                         ; Bump sink
  422                         
  423 ;                       Sink is word aligned here
  424 
  425 alword:         bf              29,aldouble                                     ; No word to do...
  426                         lwz             r7,0(r6)                                        ; Get the word
  427                         addi    r6,r6,4                                         ; Point to the next
  428                         stw             r7,0(r4)                                        ; Save the word
  429                         addi    r4,r4,4                                         ; Bump sink
  430                         
  431 ;                       Sink is double aligned here
  432 
  433 aldouble:       bf              28,alquad                                       ; No double to do...
  434                         lwz             r7,0(r6)                                        ; Get the first word
  435                         lwz             r8,4(r6)                                        ; Get the second word
  436                         addi    r6,r6,8                                         ; Point to the next
  437                         stw             r7,0(r4)                                        ; Save the first word
  438                         stw             r8,4(r4)                                        ; Save the second word
  439                         addi    r4,r4,8                                         ; Bump sink
  440                         
  441 ;                       Sink is quadword aligned here
  442 
  443 alquad:         bf              27,alline                                       ; No quad to do...
  444                         lwz             r7,0(r6)                                        ; Get the first word
  445                         lwz             r8,4(r6)                                        ; Get the second word
  446                         lwz             r9,8(r6)                                        ; Get the third word
  447                         stw             r7,0(r4)                                        ; Save the first word
  448                         lwz             r11,12(r6)                                      ; Get the fourth word
  449                         addi    r6,r6,16                                        ; Point to the next
  450                         stw             r8,4(r4)                                        ; Save the second word
  451                         stw             r9,8(r4)                                        ; Save the third word
  452                         stw             r11,12(r4)                                      ; Save the fourth word
  453                         addi    r4,r4,16                                        ; Bump sink
  454                         
  455 ;                       Sink is line aligned here; r5 = length to move from this point
  456 
  457 alline:         rlwinm. r0,r5,27,5,31                           ; r0 = r5 >> 5 = number of full 32-byte lines to move (sets cr0)
  458             mtcrf   0x02,r5                     ; move low length bits to cr6 and cr7 one cr at a time (backend tests bits 27-31)...
  459                         mtcrf   0x01,r5                                         ; ...since moving more than one is slower on G4 and G5
  460                         beq-    backend                                         ; No full lines to move
  461             
  462             mtctr   r0                          ; set up loop count (one iteration per 32-byte line)
  463                         li              r0,96                                           ; Stride for touch ahead (offset used by dcbt below)
  464             b       nxtline                     ; Enter the aligned loop
  465                         
  466             .align  4
  467 nxtline:
  468             lwz         r2,0(r6)                                        ; Get the first word
  469                         lwz             r5,4(r6)                                        ; Get the second word (r5 is reusable: length bits already sit in cr6/cr7 and CTR)
  470                         lwz             r7,8(r6)                                        ; Get the third word
  471                         lwz             r8,12(r6)                                       ; Get the fourth word
  472                         lwz             r9,16(r6)                                       ; Get the fifth word
  473                         lwz             r10,20(r6)                                      ; Get the sixth word
  474                         lwz             r11,24(r6)                                      ; Get the seventh word
  475                         lwz             r12,28(r6)                                      ; Get the eighth word
  476                         bt-             noncache,skipz                          ; Skip dcbz/dcbt if the noncache flag is set
  477                         dcbz    0,r4                                            ; Blow away the whole line because we are replacing it
  478                         dcbt    r6,r0                                           ; Touch ahead a bit (source + r0)
  479 skipz:
  480                         addi    r6,r6,32                                        ; Point to the next
  481                         stw             r2,0(r4)                                        ; Save the first word
  482                         stw             r5,4(r4)                                        ; Save the second word
  483                         stw             r7,8(r4)                                        ; Save the third word
  484                         stw             r8,12(r4)                                       ; Save the fourth word
  485                         stw             r9,16(r4)                                       ; Save the fifth word
  486                         stw             r10,20(r4)                                      ; Save the sixth word
  487                         stw             r11,24(r4)                                      ; Save the seventh word
  488                         stw             r12,28(r4)                                      ; Save the eighth word
  489                         addi    r4,r4,32                                        ; Bump sink
  490                         bdnz+   nxtline                                         ; Do the next line, if any...
  491 
  492         
  493 ;                       Move backend quadword
  494 
  495 backend:                                        ; Join here from "shortcopy" for forward moves <32 bytes
  496             bf          27,noquad                                       ; No quad (16 bytes) to do...
  497                         lwz             r7,0(r6)                                        ; Get the first word
  498                         lwz             r8,4(r6)                                        ; Get the second word
  499                         lwz             r9,8(r6)                                        ; Get the third word
  500                         lwz             r11,12(r6)                                      ; Get the fourth word
  501                         stw             r7,0(r4)                                        ; Save the first word
  502                         addi    r6,r6,16                                        ; Point to the next
  503                         stw             r8,4(r4)                                        ; Save the second word
  504                         stw             r9,8(r4)                                        ; Save the third word
  505                         stw             r11,12(r4)                                      ; Save the fourth word
  506                         addi    r4,r4,16                                        ; Bump sink
  507                         
  508 ;                       Move backend double
  509 
  510 noquad:         bf              28,nodouble                                     ; No double (8 bytes) to do...
  511                         lwz             r7,0(r6)                                        ; Get the first word
  512                         lwz             r8,4(r6)                                        ; Get the second word
  513                         addi    r6,r6,8                                         ; Point to the next
  514                         stw             r7,0(r4)                                        ; Save the first word
  515                         stw             r8,4(r4)                                        ; Save the second word
  516                         addi    r4,r4,8                                         ; Bump sink
  517                         
  518 ;                       Move backend word
  519 
  520 nodouble:       bf              29,noword                                       ; No word (4 bytes) to do...
  521                         lwz             r7,0(r6)                                        ; Get the word
  522                         addi    r6,r6,4                                         ; Point to the next
  523                         stw             r7,0(r4)                                        ; Save the word
  524                         addi    r4,r4,4                                         ; Bump sink
  525                         
  526 ;                       Move backend halfword
  527 
  528 noword:         bf              30,nohalf                                       ; No halfword (2 bytes) to do...
  529                         lhz             r7,0(r6)                                        ; Get the halfword
  530                         addi    r6,r6,2                                         ; Point to the next
  531                         sth             r7,0(r4)                                        ; Save the halfword
  532                         addi    r4,r4,2                                         ; Bump sink
  533 
  534 ;                       Move backend byte
  535 
  536 nohalf:         bflr    31                          ; Return now if there is no final byte: we are all done
  537                         lbz             r7,0(r6)                                        ; Get the byte
  538                         stb             r7,0(r4)                                        ; Save the single
  539             blr                                 ; Return after the final byte
  540 
  541 
  542 ; Reverse moves on 32-bit machines, also reverse word aligned uncached moves on 64-bit machines.
  543 ; NOTE: we never do an unaligned access if the source and destination are "relatively"
  544 ; word aligned.  We depend on this in the uncached case on 64-bit processors.
  545 ; These are slower because we don't bother with dcbz.  Fortunately, reverse moves are uncommon.
  546 ;               r4 = destination
  547 ;               r5 = length (>0)
  548 ;               r6 = source
  549 ;               r8 = inverse of largest mask smaller than operand length
  550 ;               cr5 = noncache flag (but we don't dcbz anyway)
  551 
  552 reverse32bit:                                                                   ; here from 64-bit code with word aligned uncached operands
  553             add         r4,r5,r4                                        ; Point past the last sink byte
  554                         add             r6,r5,r6                                        ; Point past the last source byte 
  555                         rlwinm  r7,r4,0,0x1F                            ; r7 = (end of dest) & 0x1F: length to align dest on cache boundary, moving downward
  556                         li              r12,-1                                          ; Offset -1 makes sure we touch in the line holding the last byte
  557                         andc.   r0,r7,r8                                        ; Apply movement limit (r0 = r7 & ~r8, sets cr0)
  558                         dcbt    r12,r6                                          ; Touch in the last line of source
  559             mtcrf   0x01,r0                     ; move alignment length to cr7 and cr6 one cr at a time...
  560                         dcbtst  r12,r4                                          ; Touch in the last line of the sink
  561                         mtcrf   0x02,r0                                         ; ...since moving more than one is slower on G4 and G5
  562                         beq-    balline                                         ; Already on cache line boundary (or too short to bother)
  563                         
  564                         sub             r5,r5,r0                                        ; Precalculate move length left after alignment
  565                         
  566                         bf              31,balhalf                                      ; No single byte to do...
  567                         lbz             r7,-1(r6)                                       ; Get the byte
  568                         subi    r6,r6,1                                         ; Point to the next
  569                         stb             r7,-1(r4)                                       ; Save the single
  570                         subi    r4,r4,1                                         ; Bump sink
  571                         
  572 ;                       Sink is halfword aligned here
  573 
  574 balhalf:        bf              30,balword                                      ; No halfword to do...
  575                         lhz             r7,-2(r6)                                       ; Get the halfword
  576                         subi    r6,r6,2                                         ; Point to the next
  577                         sth             r7,-2(r4)                                       ; Save the halfword
  578                         subi    r4,r4,2                                         ; Bump sink
  579                         
  580 ;                       Sink is word aligned here
  581 
  582 balword:        bf              29,baldouble                            ; No word to do...
  583                         lwz             r7,-4(r6)                                       ; Get the word
  584                         subi    r6,r6,4                                         ; Point to the next
  585                         stw             r7,-4(r4)                                       ; Save the word
  586                         subi    r4,r4,4                                         ; Bump sink
  587                         
  588 ;                       Sink is double aligned here
  589 
  590 baldouble:      bf              28,balquad                                      ; No double to do...
  591                         lwz             r7,-8(r6)                                       ; Get the first word
  592                         lwz             r8,-4(r6)                                       ; Get the second word
  593                         subi    r6,r6,8                                         ; Point to the next
  594                         stw             r7,-8(r4)                                       ; Save the first word
  595                         stw             r8,-4(r4)                                       ; Save the second word
  596                         subi    r4,r4,8                                         ; Bump sink
  597                         
  598 ;                       Sink is quadword aligned here
  599 
  600 balquad:        bf              27,balline                                      ; No quad to do...
  601                         lwz             r7,-16(r6)                                      ; Get the first word
  602                         lwz             r8,-12(r6)                                      ; Get the second word
  603                         lwz             r9,-8(r6)                                       ; Get the third word
  604                         lwz             r11,-4(r6)                                      ; Get the fourth word
  605                         stw             r7,-16(r4)                                      ; Save the first word
  606                         subi    r6,r6,16                                        ; Point to the next
  607                         stw             r8,-12(r4)                                      ; Save the second word
  608                         stw             r9,-8(r4)                                       ; Save the third word
  609                         stw             r11,-4(r4)                                      ; Save the fourth word
  610                         subi    r4,r4,16                                        ; Bump sink
  611                         
  612 ;                       Sink is line aligned here
  613 
  614 balline:        rlwinm. r0,r5,27,5,31                           ; r0 = r5 >> 5 = number of full 32-byte lines to move (sets cr0)
  615             mtcrf   0x02,r5                     ; move remaining length to cr6 and cr7 one cr at a time...
  616                         mtcrf   0x01,r5                                         ; ...since moving more than one is slower on G4 and G5
  617                         beq-    bbackend                                        ; No full lines to move
  618             mtctr   r0                          ; set up loop count
  619             b       bnxtline                    ; Enter the aligned loop
  620                         
  621             .align  4
  622 bnxtline:
  623                         lwz             r7,-32(r6)                                      ; Get the first word
  624                         lwz             r5,-28(r6)                                      ; Get the second word (r5 is reusable: length bits already sit in cr6/cr7 and CTR)
  625                         lwz             r2,-24(r6)                                      ; Get the third word
  626                         lwz             r12,-20(r6)                                     ; Get the fourth word
  627                         lwz             r11,-16(r6)                                     ; Get the fifth word
  628                         lwz             r10,-12(r6)                                     ; Get the sixth word
  629                         lwz             r9,-8(r6)                                       ; Get the seventh word
  630                         lwz             r8,-4(r6)                                       ; Get the eighth word
  631                         subi    r6,r6,32                                        ; Point to the next
  632                         
  633                         stw             r7,-32(r4)                                      ; Save the first word
  634             stw         r5,-28(r4)                                      ; Save the second word
  635                         stw             r2,-24(r4)                                      ; Save the third word
  636                         stw             r12,-20(r4)                                     ; Save the fourth word
  637                         stw             r11,-16(r4)                                     ; Save the fifth word
  638                         stw             r10,-12(r4)                                     ; Save the sixth word
  639                         stw             r9,-8(r4)                                       ; Save the seventh word
  640                         stw             r8,-4(r4)                                       ; Save the eighth word
  641                         subi    r4,r4,32                                        ; Bump sink
  642                         
  643                         bdnz+   bnxtline                                        ; Do the next line, if any...
  644 
  645 ;
  646 ;                       Note: We touched these lines in at the beginning
  647 ;
  648         
  649 ;                       Move backend quadword
  650 
  651 bbackend:                                       ; Join here from "shortcopy" for reverse moves of <32 bytes
  652             bf          27,bnoquad                                      ; No quad (16 bytes) to do...
  653                         lwz             r7,-16(r6)                                      ; Get the first word
  654                         lwz             r8,-12(r6)                                      ; Get the second word
  655                         lwz             r9,-8(r6)                                       ; Get the third word
  656                         lwz             r11,-4(r6)                                      ; Get the fourth word
  657                         stw             r7,-16(r4)                                      ; Save the first word
  658                         subi    r6,r6,16                                        ; Point to the next
  659                         stw             r8,-12(r4)                                      ; Save the second word
  660                         stw             r9,-8(r4)                                       ; Save the third word
  661                         stw             r11,-4(r4)                                      ; Save the fourth word
  662                         subi    r4,r4,16                                        ; Bump sink
  663                         
  664 ;                       Move backend double
  665 
  666 bnoquad:        bf              28,bnodouble                            ; No double (8 bytes) to do...
  667                         lwz             r7,-8(r6)                                       ; Get the first word
  668                         lwz             r8,-4(r6)                                       ; Get the second word
  669                         subi    r6,r6,8                                         ; Point to the next
  670                         stw             r7,-8(r4)                                       ; Save the first word
  671                         stw             r8,-4(r4)                                       ; Save the second word
  672                         subi    r4,r4,8                                         ; Bump sink
  673                         
  674 ;                       Move backend word
  675 
  676 bnodouble:      bf              29,bnoword                                      ; No word (4 bytes) to do...
  677                         lwz             r7,-4(r6)                                       ; Get the word
  678                         subi    r6,r6,4                                         ; Point to the next
  679                         stw             r7,-4(r4)                                       ; Save the word
  680                         subi    r4,r4,4                                         ; Bump sink
  681                         
  682 ;                       Move backend halfword
  683 
  684 bnoword:        bf              30,bnohalf                                      ; No halfword (2 bytes) to do...
  685                         lhz             r7,-2(r6)                                       ; Get the halfword
  686                         subi    r6,r6,2                                         ; Point to the next
  687                         sth             r7,-2(r4)                                       ; Save the halfword
  688                         subi    r4,r4,2                                         ; Bump sink
  689 
  690 ;                       Move backend byte
  691 
  692 bnohalf:        bflr    31                          ; Return now if there is no final byte: we are all done
  693                         lbz             r7,-1(r6)                                       ; Get the byte
  694                         stb             r7,-1(r4)                                       ; Save the single
  695                         blr                                                     ; Return after the final byte
  696 
  697 
  698 // Here on 64-bit processors, which have a 128-byte cache line.  This can be
  699 // called either in 32 or 64-bit mode, which makes the test for reverse moves
  700 // a little tricky.  We've already filtered out the (source==dest) and (len==0)
  701 // special cases.
  702 //
  703 // When entered:
  704 //              r4 = destination (32 or 64-bit ptr)
  705 //              r5 = length (always 32 bits)
  706 //              r6 = source (32 or 64-bit ptr)
  707 //              r12 = (dest - source), reverse move required if (dest-source)<length
  708 //              cr5 = noncache flag
  709 
  710         .align  5
  711 copyit64:
  712         rlwinm  r7,r5,0,0,31        // truncate length to 32-bit, in case we're running in 64-bit mode
  713         cntlzw  r11,r5                          // get magnitude of length (count of leading zero bits)
  714         dcbt    0,r6                            // touch in 1st block of source
  715         dcbtst  0,r4                            // touch in 1st destination cache block
  716         subc    r7,r12,r7           // set Carry if (dest-source)>=length, in mode-independent way
  717         li      r0,0                // get a 0
  718         lis     r10,hi16(0x80000000)// get 0x80000000
  719         addze.  r0,r0               // r0 = 0 + Carry; set cr0 on carry bit (beq if reverse move required)
  720         neg     r9,r4               // start to get alignment for destination
  721         sraw    r8,r10,r11          // get mask based on operand length (0x80000000 shifted right algebraically by r11), to limit alignment
  722         bt--    noncache,c64uncached// skip if uncached
  723         beq--   c64rdouble          // handle cached reverse moves; otherwise fall through to c64double
  724                 
  725         
  726 // Forward, cached or doubleword aligned uncached.  This is the common case.
  727 // NOTE: we never do an unaligned access if the source and destination are "relatively"
  728 // doubleword aligned.  We depend on this in the uncached case.
  729 //      r4 = destination
  730 //      r5 = length (>0)
  731 //      r6 = source
  732 //      r8 = inverse of largest mask smaller than operand length
  733 //      r9 = neg(dest), used to compute alignment
  734 //      cr5 = noncache flag
  735 
  736 c64double:
  737         rlwinm  r7,r9,0,0x7F        // get #bytes to 128-byte align destination
  738         andc    r7,r7,r8            // limit by operand length
  739         andi.   r8,r7,7                         // r8 <- #bytes to doubleword align (sets cr0)
  740         srwi    r9,r7,3                         // r9 <- #doublewords to 128-byte align
  741         sub             r5,r5,r7                        // adjust length remaining
  742         cmpwi   cr1,r9,0                        // any doublewords to move to cache align?
  743         srwi    r10,r5,7                        // r10 <- 128-byte chunks to xfer after aligning dest
  744         cmpwi   cr7,r10,0                       // set cr7 on chunk count
  745         beq             c64double2                      // cr0 from andi.: dest already doubleword aligned
  746         mtctr   r8                              // CTR <- byte count for c64double1
  747         b               c64double1                      // enter the aligned loop
  748         
  749         .align  5                                       // align inner loops
  750 c64double1:                                                     // copy bytes until dest is doubleword aligned
  751         lbz             r0,0(r6)                        // load one source byte
  752         addi    r6,r6,1                         // advance source
  753         stb             r0,0(r4)                        // store it to dest
  754         addi    r4,r4,1                         // advance dest
  755         bdnz    c64double1                      // loop until CTR is 0
  756 
  757 c64double2:                                                     // r9/cr1=doublewords, r10/cr7=128-byte chunks
  758         beq             cr1,c64double4          // no doublewords to xfer in order to cache align
  759         mtctr   r9                              // CTR <- doubleword count for c64double3
  760         b               c64double3                      // enter the aligned loop
  761 
  762         .align  5                                       // align inner loops
  763 c64double3:                                                     // copy doublewords until dest is 128-byte aligned
  764         ld              r7,0(r6)                        // load one source doubleword
  765         addi    r6,r6,8                         // advance source
  766         std             r7,0(r4)                        // store it to dest
  767         addi    r4,r4,8                         // advance dest
  768         bdnz    c64double3                      // loop until CTR is 0, then fall through to c64double4
  769         
  770 // Here to xfer 128-byte chunks, if any.  Since we only have 8 GPRs for
  771 // data (64 bytes), we load/store each twice per 128-byte chunk.
  772 
  773 c64double4:                                                     // r10/cr7=128-byte chunks
  774         rlwinm  r0,r5,29,28,31          // r0 <- count of leftover doublewords, after moving chunks
  775         cmpwi   cr1,r0,0                        // set cr1 on leftover doublewords (tested later in c64double7)
  776         beq             cr7,c64double7          // no 128-byte chunks
  777         
  778         ; We must check for (source-dest)<128 in a mode-independent way.  If within 128 bytes,
  779         ; turn on "noncache" because we cannot use dcbz128 even if operands are cacheable.
  780         
  781         sub             r8,r6,r4                        // r8 <- (source - dest)
  782         rldicr. r0,r8,0,63-7        // zero low 7 bits and check for 0, mode independent
  783         cror    noncache,cr0_eq,noncache        // turn on "noncache" flag if (source-dest)<128
  784         mtctr   r10                             // CTR <- number of 128-byte chunks
  785         b               c64InnerLoop            // enter the aligned loop
  786                 
  787         .align  5                                       // align inner loop
  788 c64InnerLoop:                                           // loop copying 128-byte cache lines to 128-aligned destination
  789         ld              r0,0(r6)                        // start pipe: load 1st half-line
  790         ld              r2,8(r6)
  791         ld              r7,16(r6)
  792         ld              r8,24(r6)
  793         ld              r9,32(r6)
  794         ld              r10,40(r6)
  795         ld              r11,48(r6)
  796         ld              r12,56(r6)
  797         bt              noncache,c64InnerLoop1  // skip if uncached or overlap
  798         dcbz128 0,r4                            // avoid prefetch of next cache line (only when cached and not overlapping)
  799 c64InnerLoop1:
  800 
  801         std             r0,0(r4)                        // store 1st half of chunk
  802         std             r2,8(r4)
  803         std             r7,16(r4)
  804         std             r8,24(r4)
  805         std             r9,32(r4)
  806         std             r10,40(r4)
  807         std             r11,48(r4)
  808         std             r12,56(r4)
  809         
  810         ld              r0,64(r6)                       // load 2nd half of chunk
  811         ld              r2,72(r6)
  812         ld              r7,80(r6)
  813         ld              r8,88(r6)
  814         ld              r9,96(r6)
  815         ld              r10,104(r6)
  816         ld              r11,112(r6)
  817         ld              r12,120(r6)
  818         addi    r6,r6,128                       // advance to next source chunk
  819 
  820         std             r0,64(r4)                       // store 2nd half of chunk
  821         std             r2,72(r4)
  822         std             r7,80(r4)
  823         std             r8,88(r4)
  824         std             r9,96(r4)
  825         std             r10,104(r4)
  826         std             r11,112(r4)
  827         std             r12,120(r4)
  828         addi    r4,r4,128                       // advance to next dest chunk
  829 
  830         bdnz    c64InnerLoop            // loop if more chunks
  831         
  832 
  833 c64double7:                         // r5 <- leftover bytes, cr1 set on doubleword count
  834         rlwinm  r0,r5,29,28,31          // r0 <- count of leftover doublewords (0-15); recomputed because the chunk loop reuses r0
  835         andi.   r5,r5,7                         // r5/cr0 <- count of leftover bytes (0-7)
  836         beq             cr1,c64byte                     // no leftover doublewords (cr1 was set in c64double4)
  837         mtctr   r0                              // CTR <- leftover doubleword count
  838         b               c64double8                      // enter the aligned loop
  839         
  840         .align  5                                       // align inner loop
  841 c64double8:                                                     // loop copying leftover doublewords
  842         ld              r0,0(r6)                        // load one source doubleword
  843         addi    r6,r6,8                         // advance source
  844         std             r0,0(r4)                        // store it to dest
  845         addi    r4,r4,8                         // advance dest
  846         bdnz    c64double8                      // loop until CTR is 0, then fall through to c64byte
  847 
  848 
  849 // Forward byte loop.  Entered with r5 = byte count and cr0 already set on that count.
  850 
  851 c64byte:                                                        // r5/cr0 <- byte count (can be big if unaligned uncached)
  852                 beqlr                       // done if no leftover bytes
  853         mtctr   r5                              // CTR <- byte count
  854         b               c64byte1                        // enter the aligned loop
  855         
  856         .align  5                                       // align inner loop
  857 c64byte1:
  858         lbz             r0,0(r6)                        // load one source byte
  859         addi    r6,r6,1                         // advance source
  860         stb             r0,0(r4)                        // store it to dest
  861         addi    r4,r4,1                         // advance dest
  862         bdnz    c64byte1                        // loop until CTR is 0
  863 
  864         blr                                     // all bytes copied
  865 
  866 
  867 // Uncached copies.  We must avoid unaligned accesses, since they always take alignment
  868 // exceptions on uncached memory on 64-bit processors.  This may mean we copy long operands
  869 // a byte at a time, but that is still much faster than alignment exceptions.
  870 //      r4 = destination
  871 //      r5 = length (>0)
  872 //      r6 = source
  873 //      r8 = inverse of largest mask smaller than operand length
  874 //      r9 = neg(dest), used to compute alignment
  875 //      r12 = (dest-source), used to test relative alignment
  876 //      cr0 = beq if reverse move required
  877 //      cr5 = noncache flag
  878 
  879 c64uncached:
  880         rlwinm  r10,r12,0,29,31         // relatively doubleword aligned? (low 3 bits of dest-source)
  881         rlwinm  r11,r12,0,30,31         // relatively word aligned? (low 2 bits of dest-source)
  882         cmpwi   cr7,r10,0                       // set cr7 beq if doubleword aligned
  883         cmpwi   cr1,r11,0                       // set cr1 beq if word aligned
  884         beq--   c64reverseUncached              // cr0 beq on entry: reverse move required
  885         
  886         beq             cr7,c64double           // doubleword aligned
  887         beq             cr1,forward32bit    // word aligned, use G3/G4 code
  888         cmpwi   r5,0                            // set cr0 on byte count
  889         b               c64byte                         // unaligned operands
  890 
  891 c64reverseUncached:
  892         beq             cr7,c64rdouble          // doubleword aligned so can use LD/STD
  893         beq             cr1,reverse32bit        // word aligned, use G3/G4 code
  894         add             r6,r6,r5                        // point to (end+1) of source and dest
  895         add             r4,r4,r5
  896         cmpwi   r5,0                            // set cr0 on length
  897         b               c64rbyte                        // copy a byte at a time
  898         
  899         
  900 
  901 // Reverse doubleword copies.  This is used for all cached copies, and doubleword
  902 // aligned uncached copies.  Copies from the end toward the start, then falls through into c64rbyte for the last 0-7 bytes.
  903 //      r4 = destination
  904 //      r5 = length (>0)
  905 //      r6 = source
  906 //      r8 = inverse of largest mask of low-order 1s smaller than operand length
  907 //      cr5 = noncache flag (not tested anywhere in this routine)
  908 
  909 c64rdouble:                                                     // writes r0, r4-r12, ctr, cr0, cr1, cr7
  910         add             r6,r6,r5                        // point to (end+1) of source and dest
  911         add             r4,r4,r5                        // (every access below uses a negative offset from these)
  912         rlwinm  r7,r4,0,29,31           // r7 <- #bytes to doubleword align dest (low 3 bits of dest end+1)
  913         andc.   r7,r7,r8            // limit by operand length; cr0 beq if no alignment bytes to copy
  914         sub             r5,r5,r7                        // adjust length
  915         srwi    r8,r5,6                         // r8 <- 64-byte chunks to xfer
  916         cmpwi   cr1,r8,0                        // any chunks?
  917         beq             c64rd2                          // cr0 from andc.: dest already doubleword aligned (r7 == 0)
  918         mtctr   r7                                      // ctr <- number of alignment bytes
  919 
  920 c64rd1:                                                         // copy bytes until dest doubleword aligned
  921         lbzu    r0,-1(r6)                       // pre-decrement source, load byte
  922         stbu    r0,-1(r4)                       // pre-decrement dest, store byte
  923         bdnz    c64rd1
  924         
  925 c64rd2:                                                         // r8/cr1 = count of 64-byte chunks
  926         rlwinm  r0,r5,29,29,31          // r0 <- count of leftover doublewords ((r5 >> 3) & 7)
  927         andi.   r5,r5,7                         // r5/cr0 <- count of leftover bytes
  928         cmpwi   cr7,r0,0                        // leftover doublewords?
  929         beq             cr1,c64rd4                      // no chunks to xfer
  930         mtctr   r8                                      // ctr <- number of 64-byte chunks
  931         b               c64rd3                          // jump to the aligned loop (over any padding from .align)
  932         
  933         .align  5                                       // align inner loop
  934 c64rd3:                                                         // loop copying 64-byte chunks
  935         ld              r7,-8(r6)                       // load the first six doublewords, highest address first
  936         ld              r8,-16(r6)
  937         ld              r9,-24(r6)
  938         ld              r10,-32(r6)
  939         ld              r11,-40(r6)
  940         ld              r12,-48(r6)
  941         std             r7,-8(r4)                       // store r7/r8 now so they can be reused for the last two loads
  942         std             r8,-16(r4)
  943         ld              r7,-56(r6)
  944         ldu             r8,-64(r6)                      // last load also moves r6 back by 64
  945         std             r9,-24(r4)
  946         std             r10,-32(r4)
  947         std             r11,-40(r4)
  948         std             r12,-48(r4)
  949         std             r7,-56(r4)
  950         stdu    r8,-64(r4)                      // last store also moves r4 back by 64
  951         bdnz    c64rd3
  952 
  953 c64rd4:                                                         // r0/cr7 = leftover doublewords  r5/cr0 = leftover bytes
  954         beq             cr7,c64rbyte            // no leftover doublewords
  955         mtctr   r0                                      // ctr <- number of leftover doublewords
  956         
  957 c64rd5:                                                         // loop copying leftover doublewords
  958         ldu             r0,-8(r6)                       // pre-decrement source by 8, load doubleword
  959         stdu    r0,-8(r4)                       // pre-decrement dest by 8, store doubleword
  960         bdnz    c64rd5                          // on exit, falls through to c64rbyte with cr0 still from the andi. above
  961 
  962 
  963 // Reverse byte loop.  Entry: r4/r6 = (end+1) of the remaining dest/source bytes, r5 = byte count, cr0 = r5 compared with 0.
  964 
  965 c64rbyte:                                                       // r5/cr0 = byte count (can be big if unaligned uncached)
  966         beqlr                       // done if no leftover bytes
  967         mtctr   r5                                      // ctr <- number of bytes to copy
  968         
  969 c64rbyte1:
  970         lbzu    r0,-1(r6)                       // pre-decrement source, load byte
  971         stbu    r0,-1(r4)                       // pre-decrement dest, store byte
  972         bdnz    c64rbyte1                       // loop until ctr reaches 0
  973 
  974         blr                                                     // return to caller
  975
Cache object: 8cac0d89a8431e1ba635e861284bb6dc
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/osfmk/ppc/bcopy.s

FreeBSD/Linux Kernel Cross Reference
sys/osfmk/ppc/bcopy.s