bcopy.s

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*
    2  * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
    3  *
    4  * @APPLE_LICENSE_HEADER_START@
    5  * 
    6  * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
    7  * 
    8  * This file contains Original Code and/or Modifications of Original Code
    9  * as defined in and that are subject to the Apple Public Source License
   10  * Version 2.0 (the 'License'). You may not use this file except in
   11  * compliance with the License. Please obtain a copy of the License at
   12  * http://www.opensource.apple.com/apsl/ and read it before using this
   13  * file.
   14  * 
   15  * The Original Code and all software distributed under the License are
   16  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
   17  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
   18  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
   19  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
   20  * Please see the License for the specific language governing rights and
   21  * limitations under the License.
   22  * 
   23  * @APPLE_LICENSE_HEADER_END@
   24  */
   25 ;
   26 ;                       Copy bytes of data around.  Handles overlapped data.
   27 ;
   28 ;                       Change this to use Altivec later on, and maybe floating point.
   29 ;
   30 ;
   31 #include <ppc/asm.h>
   32 #include <ppc/proc_reg.h>
   33 #include <assym.s>
   34 
   35 ;               Use CR5_lt (CR bit 20) to indicate non-cached: cache instructions must not be used on the operands
   36 #define noncache 20
   37 
   38 ;               Use CR5_gt (CR bit 21) to indicate that we need to turn data translation back on
   39 #define fixxlate 21
   40 
   41 ;               Use CR5_eq (CR bit 22) to indicate that we need to invalidate bats (if 32-bit) or turn off
   42 ;               64-bit mode (if 64-bit) before returning to our caller.  We overload the
   43 ;               bit to reduce the number of conditional branches at bcopy exit.
   44 #define restorex 22
   45 
   46 ;               Use CR5_so (CR bit 23) to indicate that we need to restore real-mode cachability.
   47 ;               Only needed on 64-bit machines.
   48 #define flipcache 23
   49 
   50 ;
   51 ; bcopy_nc(from, to, nbytes)
   52 ;
   53 ; bcopy_nc operates on non-cached memory so we can not use any kind
   54 ; of cache instructions.
      ;
      ; The routine only sets the noncache flag (CR5_lt) and then branches to
      ; bcpswap, the common entry code inside bcopy, so the registers are passed
      ; through untouched.
   55 ;
   56 
   57                         .align  5
   58                         .globl  EXT(bcopy_nc)
   59 
   60 LEXT(bcopy_nc)
   61                         
   62                         crset   noncache                                        ; Set non-cached
   63                         b               bcpswap                                         ; Join bcopy just past its "crclr noncache"
   64 
   65 ;       
   66 ; void bcopy_physvir(from, to, nbytes)
   67 ; Attempt to copy physically addressed memory with translation on if conditions are met.
   68 ; Otherwise do a normal bcopy_phys.  This routine is used because some 32-bit processors 
   69 ; are very slow doing real-mode (translation off) copies, so we set up temporary BATs
   70 ; for the passed phys addrs and do the copy with translation on.  
   71 ;
   72 ; Rules are: neither source nor destination can cross a page. 
   73 ;
   74 ; Interrupts must be disabled throughout the copy when this is called.
   75 ; To do the copy with translation on, we build a
   76 ; 128KB DBAT for both the source and sink.  If both are in the same block, only one is
   77 ; loaded.  We do not touch the IBATs, so there is no issue if either physical page
   78 ; address is the same as the virtual address of the instructions we are executing.
   79 ;
   80 ; At the end, we invalidate the used DBATs.
   81 ;
   82 ; Note that the address parameters are long longs.  We will transform these to 64-bit
   83 ; values.  Note that on 32-bit architectures this will ignore the high half of the
   84 ; passed in value.  This should be ok since we can not have any bigger than 32 bit addresses
   85 ; there anyhow.
   86 ;
   87 ; Note, this one will not work in user state
      ;
      ; Registers as consumed by the code below:
      ;       r3:r4 = from (high:low), r5:r6 = to (high:low), r7 = nbytes
      ; After the merge: r3 = from (source), r4 = to (sink), r5 = nbytes.
      ; 64-bit processors, and copies where either operand crosses a 4KB page,
      ; are handed to bcopy_phys1 instead of using the BATs.
   88 ; 
   89 
   90                         .align  5
   91                         .globl  EXT(bcopy_physvir)
   92 
   93 LEXT(bcopy_physvir)
   94 
   95                         crclr   flipcache                                       ; (HACK) No cache flip needed
   96             mfsprg      r8,2                                            ; get processor feature flags
   97             rlwinm      r3,r3,0,1,0                                     ; Duplicate high half of long long paddr into top of reg
   98                         addic.  r0,r7,-1                                        ; Get length - 1 (CR0 is tested by the bltlr- below)
   99                         rlwimi  r3,r4,0,0,31                            ; Combine bottom of long long to full 64-bits
  100                         add             r11,r3,r0                                       ; Point to last byte of source (r3 = from)
  101                         rlwinm  r4,r5,0,1,0                                     ; Duplicate high half of long long paddr into top of reg
  102             mtcrf       0x02,r8                                         ; move pf64Bit to cr6 so we can test
  103             rlwimi      r4,r6,0,0,31                            ; Combine bottom of long long to full 64-bits
  104                         mr              r5,r7                                           ; Get the length into the right register
  105                         cmplw   cr1,r3,r4                                       ; Does source == sink?  (cr1 is recomputed on every path below before it is tested)
  106             bt++        pf64Bitb,bcopy_phys1            ; if 64-bit processor, use standard routine (no BATs)
  107                         add             r12,r4,r0                                       ; Point to last byte of sink (r4 = to)
  108                         bltlr-                                                          ; Bail if length is 0 or way too big
  109                         xor             r7,r11,r3                                       ; See if source went to next page
  110                         xor             r8,r12,r4                                       ; See if sink went to next page
  111                         or              r0,r7,r8                                        ; Combine wrap
  112                         
  113 //                      li              r9,((PTE_WIMG_CB_CACHED_COHERENT<<3)|2) ; Set default attributes
  114                         li              r9,((2<<3)|2)                           ; Set default attributes
  115                         rlwinm. r0,r0,0,0,19                            ; Did we overflow a page?  (keep only the bits above the 4KB page offset)
  116                         li              r7,2                                            ; Set validity flags
  117                         li              r8,2                                            ; Set validity flags
  118                         bne-    bcopy_phys1                                     ; Overflowed page, do normal physical copy...
  119 
  120                         crset   restorex                                        ; Remember to trash BATs on the way out
  121                         rlwimi  r11,r9,0,15,31                          ; Set source lower DBAT value (attributes into bits 15-31)
  122                         rlwimi  r12,r9,0,15,31                          ; Set sink lower DBAT value (attributes into bits 15-31)
  123                         rlwimi  r7,r11,0,0,14                           ; Set source upper DBAT value (block address bits 0-14)
  124                         rlwimi  r8,r12,0,0,14                           ; Set sink upper DBAT value (block address bits 0-14)
  125                         cmplw   cr1,r11,r12                                     ; See if sink and source are same block
  126                         
  127                         sync
  128 
  129                         mtdbatl 0,r11                                           ; Set source lower DBAT 
  130                         mtdbatu 0,r7                                            ; Set source upper DBAT
  131 
  132                         beq-    cr1,bcpvsame                            ; Source and sink are in same block
  133 
  134                         mtdbatl 1,r12                                           ; Set sink lower DBAT 
  135                         mtdbatu 1,r8                                            ; Set sink upper DBAT
  136 
  137 bcpvsame:       mr              r6,r3                                           ; Set source
  138                         crclr   noncache                                        ; Set cached
  139                         crclr   fixxlate                                        ; Set translation already ok
  140                         
  141                         b               copyit32                                        ; Go copy it...
  142 
  143 ;       
  144 ; void bcopy_phys(from, to, nbytes)
  145 ; Turns off data translation before the copy.  Note, this one will
  146 ; not work in user state.  This routine is used on 32 and 64-bit
  147 ; machines.
  148 ;
  149 ; Note that the address parameters are long longs.  We will transform these to 64-bit
  150 ; values.  Note that on 32-bit architectures this will ignore the high half of the
  151 ; passed in value.  This should be ok since we can not have any bigger than 32 bit addresses
  152 ; there anyhow.
  153 ;
  154 ; Also note that you probably will not be happy if either the sink or source spans across the
  155 ; boundary between RAM and I/O space.  Good chance of hanging the machine and this code 
  156 ; will not check, so be careful.
      ;
      ; Registers as consumed by the code below:
      ;       r3:r4 = from (high:low), r5:r6 = to (high:low), r7 = nbytes
      ; At bcopy_phys1: r3 = from, r4 = to, r5 = nbytes, cr6 holds pf64Bit.
      ; Returns immediately if to == from or nbytes == 0.
  157 ;
  158 
  159                         .align  5
  160                         .globl  EXT(bcopy_phys)
  161 
  162 LEXT(bcopy_phys)
  163                         crclr   flipcache                                       ; (HACK) No cache flip needed
  164             rlwinm      r3,r3,0,1,0                                     ; Duplicate high half of long long paddr into top of reg
  165             mfsprg      r8,2                                            ; get processor feature flags
  166                         rlwimi  r3,r4,0,0,31                            ; Combine bottom of long long to full 64-bits
  167                         rlwinm  r4,r5,0,1,0                                     ; Duplicate high half of long long paddr into top of reg
  168                         mtcrf   0x02,r8                                         ; move pf64Bit to cr6 so we can test
  169                         rlwimi  r4,r6,0,0,31                            ; Combine bottom of long long to full 64-bits
  170                         mr              r5,r7                                           ; Get the length into the right register
  171             
  172 bcopy_phys1:                                                                    ; enter from bcopy_physvir with pf64Bit already in cr6
  173                         mfmsr   r9                                                      ; Get the MSR
  174                         crclr   noncache                                        ; Set cached
  175             bt++        pf64Bitb,bcopy_phys64           ; skip if 64-bit (only they take hint)
  176 
  177 ; 32-bit CPUs
  178             
  179             sub.        r0,r3,r4                                        ; to==from?  (CR0 is tested by the first beqlr- below)
  180                         rlwinm  r8,r9,0,MSR_DR_BIT,MSR_DR_BIT   ; was translation on?  (r8 = MSR[DR] in place)
  181             cmpwi       cr1,r8,0                                        ; set cr1 beq if translation was off
  182                         oris    r8,r8,hi16(MASK(MSR_VEC))       ; Get vector enable
  183                         cmplwi  cr7,r5,0                                        ; Check if we have a 0 length
  184             beqlr-                                                              ; bail if to==from
  185                         ori             r8,r8,lo16(MASK(MSR_FP))        ; Get FP
  186                         mr              r6,r3                                           ; Set source
  187                         andc    r9,r9,r8                                        ; Turn off translation if it is on (should be) and FP, VEC
  188                         beqlr-  cr7                                                     ; Bail if length is 0
  189                         
  190                         crclr   restorex                                        ; Make sure we do not trash BATs on the way out
  191                         mtmsr   r9                                                      ; Set DR translation off
  192                         isync                                                           ; Wait for it
  193                         
  194                         crnot   fixxlate,cr1_eq                         ; Remember to turn on translation if it was
  195                         b               copyit32                                        ; Go copy it...
  196             
  197 ; 64-bit: turn DR off and SF on, remember if we need to restore on way out.
      ;
      ; Entered from bcopy_phys1 with r3 = from, r4 = to, r5 = nbytes, r9 = MSR.
      ; Returns immediately if to == from or nbytes == 0.  If either address has
      ; (address >> 31) == 1 it is treated as I/O memory: flipcache and noncache
      ; are set, EE is forced off (entry EE saved in the XER) and HID4 is changed
      ; to make real-mode accesses cache-inhibited before going to copyit64.
  198 
  199 bcopy_phys64:                                                                   ; r9 = MSR
  200 
  201                         srdi    r2,r3,31                                        ; (HACK) Get a 1 if source is in I/O memory
  202             srdi.       r0,r9,63-MSR_SF_BIT                     ; set cr0 beq on if SF was off when we were called
  203             rlwinm      r8,r9,MSR_DR_BIT+1,31,31        ; r8 <- DR bit right justified
  204             cmpld       cr1,r3,r4                                       ; to==from?
  205             li          r0,1                                            ; Note - we use this in a couple places below
  206                         lis             r6,hi16(MASK(MSR_VEC))          ; Get vector enable
  207             cmpwi       cr7,r5,0                                        ; length==0 ?
  208             ori         r6,r6,lo16(MASK(MSR_FP)|MASK(MSR_DR))   ; Add in FP and DR
  209             beqlr--     cr1                                                     ; bail if to==from
  210                         srdi    r10,r4,31                                       ; (HACK) Get a 1 if sink is in I/O memory
  211             rldimi      r9,r0,63,MSR_SF_BIT                     ; set SF on
  212             beqlr--     cr7                                                     ; bail if length==0
  213             andc        r9,r9,r6                                        ; turn DR, VEC, FP off
  214             cmpwi       cr1,r8,0                                        ; was DR on?
  215             crmove      restorex,cr0_eq                         ; if SF was off, remember to turn back off before we return
  216             mtmsrd      r9                                                      ; turn 64-bit addressing on, data translation off
  217                         cmpldi  cr0,r2,1                                        ; (HACK) Is source in I/O memory?
  218             isync                                                               ; wait for it to happen
  219                         mr              r6,r3                                           ; Set source
  220                         cmpldi  cr7,r10,1                                       ; (HACK) Is sink in I/O memory?
  221             crnot       fixxlate,cr1_eq                         ; if DR was on, remember to turn back on before we return
  222 
  223                         cror    flipcache,cr0_eq,cr7_eq         ; (HACK) See if either source or sink is in I/O area
  224 
  225                         rlwinm  r10,r9,MSR_EE_BIT+1,31,31       ; (HACK GLORIOUS HACK) Isolate the EE bit
  226                         sldi    r11,r0,31-MSR_EE_BIT            ; (HACK GLORIOUS HACK)) Get a mask for the EE bit
  227                         sldi    r0,r0,32+8                                      ; (HACK) Get the right bit to turn off caching
  228                         bf++    flipcache,copyit64                      ; (HACK) No need to mess with caching...
  229                         
  230 ;
  231 ;                       HACK GLORIOUS HACK - when we force off caching, we need to also force off
  232 ;                       interruptions.  We are out of CR bits, so we need to stash the entry EE
  233 ;                       somewheres.  It is in the XER....  We NEED to change this!!!!
  234 ;
  235 
  236                         mtxer   r10                                                     ; (HACK GLORIOUS HACK) Remember EE
  237                         andc    r9,r9,r11                                       ; (HACK GLORIOUS HACK) Turn off EE bit
  238                         mfspr   r2,hid4                                         ; (HACK) Get HID4
  239                         crset   noncache                                        ; (HACK) Set non-cached
  240                         mtmsrd  r9                                                      ; (HACK GLORIOUS HACK) Force off EE
  241                         or              r2,r2,r0                                        ; (HACK) Set bit to make real accesses cache-inhibited
  242                         sync                                                            ; (HACK) Sync up
  243                         li              r0,1                                            ; r0 = 1 again (it held the HID4 bit mask)
  244                         mtspr   hid4,r2                                         ; (HACK) Make real accesses cache-inhibited
  245                         isync                                                           ; (HACK) Toss prefetches
  246 
  247                         lis             r12,0xE000                                      ; (HACK) Get the unlikeliest ESID possible
  248                         srdi    r12,r12,1                                       ; (HACK) Make 0x7FFFFFFFF0000000
  249                         slbie   r12                                                     ; (HACK) Make sure the ERAT is cleared 
  250                         
  251                         sync                                                            ; (HACK)
  252                         isync                                                           ; (HACK)
  253                         
  254             b           copyit64
  255             
  256 
  257 ;       
  258 ; void bcopy(from, to, nbytes)
      ;
      ; r3 = from, r4 = to, r5 = nbytes.  Clears the noncache flag, then (at bcpswap,
      ; which is also the entry used by bcopy_nc) clears flipcache, restorex and
      ; fixxlate, returns immediately if to == from or nbytes == 0, and otherwise
      ; goes to copyit64 on 64-bit processors or copyit32 on 32-bit processors
      ; with r6 = source and r4 = sink.
  259 ;
  260 
  261                         .align  5
  262                         .globl  EXT(bcopy)
  263 
  264 LEXT(bcopy)
  265 
  266                         crclr   noncache                                        ; Set cached
  267 
  268 bcpswap:        
  269                         crclr   flipcache                                       ; (HACK) No cache flip needed
  270             mfsprg      r8,2                                            ; get processor feature flags
  271             sub.        r0,r4,r3                                        ; test for to==from in mode-independent way
  272             mtcrf       0x02,r8                                         ; move pf64Bit to cr6 so we can test
  273                         cmpwi   cr1,r5,0                                        ; Check if we have a 0 length
  274                         crclr   restorex                                        ; Make sure we do not trash BATs on the way out
  275                         mr              r6,r3                                           ; Set source
  276                         crclr   fixxlate                                        ; Set translation already ok
  277                         beqlr-                                                          ; Bail if "to" and "from" are the same  
  278                         beqlr-  cr1                                                     ; Bail if length is 0
  279             bt++        pf64Bitb,copyit64                       ; handle 64-bit processor
  280                         b               copyit32                                        ; Go copy it...
  281 
  282 ;
  283 ;                       When we move the memory, forward overlays must be handled.  We
  284 ;                       also can not use the cache instructions if we are from bcopy_nc.
  285 ;                       We need to preserve R3 because it needs to be returned for memcpy.
  286 ;                       We can be interrupted and lose control here.
  287 ;
  288 ;                       There is no stack, so in order to use vectors, we would
  289 ;                       need to take the vector exception. Any potential gains by using vectors 
  290 ;                       would be more than eaten up by this.
  291 ;
  292 ;                       NOTE: this code is called in three "modes":
  293 ;                               - on 32-bit processors (32-byte cache line)
  294 ;                               - on 64-bit processors running in 32-bit mode (128-byte cache line)
  295 ;                               - on 64-bit processors running in 64-bit mode (128-byte cache line)
  296 ;
  297 ;                       ALSO NOTE: bcopy is called from copyin and copyout etc
  298 ;                       with the "thread_recover" ptr set.  This means bcopy must not set up a
  299 ;                       stack frame or touch non-volatile registers, and also means that it
  300 ;                       cannot rely on turning off interrupts, because we expect to get DSIs
  301 ;                       and have execution aborted by a "longjmp" to the thread_recover
  302 ;                       routine.
  303 ;
  304         
  305                         .align  5
  306                         .globl  EXT(memcpy)
  307             ; NB: memcpy is only called in 32-bit mode, albeit on both 32- and 64-bit
  308             ; processors...
      ;
      ; Entry: r3 = "to", r4 = "from", r5 = length (note the argument order is the
      ; reverse of bcopy).  The code rearranges these to r6 = source, r4 = sink and
      ; leaves r3 untouched.  Returns immediately if to == from or length == 0.
      ; On 32-bit processors execution falls through into copyit32.
      ;
  309 LEXT(memcpy)
  310                         crclr   flipcache                                       ; (HACK) No cache flip needed
  311             mfsprg      r8,2                                            ; get processor feature flags
  312                         cmplw   cr1,r3,r4                                       ; "to" and "from" the same?
  313             mtcrf       0x02,r8                                         ; move pf64Bit to cr6 so we can test
  314                         mr              r6,r4                                           ; Set the "from"
  315                         mr.             r5,r5                                           ; Length zero?
  316                         crclr   noncache                                        ; Set cached
  317                         mr              r4,r3                                           ; Set the "to"
  318                         crclr   fixxlate                                        ; Set translation already ok
  319                         beqlr-  cr1                                                     ; "to" and "from" are the same
  320                         beqlr-                                                          ; Length is 0
  321                         crclr   restorex                                        ; Make sure we do not trash BATs on the way out
  322             bt++        pf64Bitb,copyit64                       ; handle 64-bit processors
  323                         
      ;
      ;                       copyit32: common 32-bit copy entry.
      ;                       In:  r4 = sink, r5 = length (non-zero on the paths visible here), r6 = source.
      ;                       Out (for the code that follows): r8 = (highest set bit of length) - 1,
      ;                       a mask that is always less than the length; noncache is also set
      ;                       if source and sink are less than 32 bytes apart.
      ;                       Branches to fwdovrlap when the sink starts inside the source area.
      ;
  324 copyit32:       sub             r12,r4,r6                                       ; Get potential overlap (negative if backward move)
  325                         lis             r8,0x7FFF                                       ; Start up a mask
  326                         srawi   r11,r12,31                                      ; Propagate the sign bit
  327                         dcbt    br0,r6                                          ; Touch in the first source line
  328                         cntlzw  r7,r5                                           ; Count leading zeros of the length (locates its highest set bit)
  329                         ori             r8,r8,0xFFFF                            ; Make limit 0x7FFFFFFF
  330                         xor             r9,r12,r11                                      ; If sink - source was negative, invert bits
  331                         srw             r8,r8,r7                                        ; Get move length limitation
  332                         sub             r9,r9,r11                                       ; If sink - source was negative, add 1 and get absolute value
  333                         cmplw   r12,r5                                          ; See if we actually forward overlap (unsigned: sink - source < length)
  334                         cmplwi  cr7,r9,32                                       ; See if at least a line between  source and sink
  335                         dcbtst  br0,r4                                          ; Touch in the first sink line
  336                         cmplwi  cr1,r5,32                                       ; Are we moving more than a line?
  337                         cror    noncache,noncache,cr7_lt        ; Set to not DCBZ output line if not enough space
  338                         blt-    fwdovrlap                                       ; This is a forward overlapping area, handle it...
  339 
  340 ;
  341 ;                       R4 = sink
  342 ;                       R5 = length
  343 ;                       R6 = source
  344 ;
  345                         
  346 ;
  347 ;                       Here we figure out how much we have to move to get the sink onto a
  348 ;                       cache boundary.  If we can, and there are still more than 32 bytes
  349 ;                       left to move, we can really speed things up by DCBZing the sink line.
  350 ;                       We can not do this if noncache is set because we will take an 
  351 ;                       alignment exception.
  352 
  353 G4word:                                                                                 ; enter from 64-bit case with word aligned uncached operands
      ;
      ;                       Front end: copy 1, 2, 4, 8 and 16 byte pieces as needed so that
      ;                       the sink (r4) reaches a 32-byte boundary.  r0 = bytes to move,
      ;                       limited by the mask in r8; its low five bits are placed in CR bits
      ;                       27-31 and each "bf" below skips the piece whose bit is clear.
      ;                       r5 is reduced by r0; r6 (source) and r4 (sink) advance together.
      ;
  354                         neg             r0,r4                                           ; Get the number of bytes to move to align to a line boundary
  355                         rlwinm. r0,r0,0,27,31                           ; Clean it up and test it (CR0 is tested by the beq below)
  356                         and             r0,r0,r8                                        ; limit to the maximum front end move
  357                         mtcrf   3,r0                                            ; Make branch mask for partial moves
  358                         sub             r5,r5,r0                                        ; Set the length left to move
  359                         beq             alline                                          ; Already on a line...
  360                         
  361                         bf              31,alhalf                                       ; No single byte to do...
  362                         lbz             r7,0(r6)                                        ; Get the byte
  363                         addi    r6,r6,1                                         ; Point to the next
  364                         stb             r7,0(r4)                                        ; Save the single
  365                         addi    r4,r4,1                                         ; Bump sink
  366                         
  367 ;                       Sink is halfword aligned here
  368 
  369 alhalf:         bf              30,alword                                       ; No halfword to do...
  370                         lhz             r7,0(r6)                                        ; Get the halfword
  371                         addi    r6,r6,2                                         ; Point to the next
  372                         sth             r7,0(r4)                                        ; Save the halfword
  373                         addi    r4,r4,2                                         ; Bump sink
  374                         
  375 ;                       Sink is word aligned here
  376 
  377 alword:         bf              29,aldouble                                     ; No word to do...
  378                         lwz             r7,0(r6)                                        ; Get the word
  379                         addi    r6,r6,4                                         ; Point to the next
  380                         stw             r7,0(r4)                                        ; Save the word
  381                         addi    r4,r4,4                                         ; Bump sink
  382                         
  383 ;                       Sink is double aligned here
  384 
  385 aldouble:       bf              28,alquad                                       ; No double to do...
  386                         lwz             r7,0(r6)                                        ; Get the first word
  387                         lwz             r8,4(r6)                                        ; Get the second word (r8 no longer holds the length mask)
  388                         addi    r6,r6,8                                         ; Point to the next
  389                         stw             r7,0(r4)                                        ; Save the first word
  390                         stw             r8,4(r4)                                        ; Save the second word
  391                         addi    r4,r4,8                                         ; Bump sink
  392                         
  393 ;                       Sink is quadword aligned here
  394 
  395 alquad:         bf              27,alline                                       ; No quad to do...
  396                         lwz             r7,0(r6)                                        ; Get the first word
  397                         lwz             r8,4(r6)                                        ; Get the second word
  398                         lwz             r9,8(r6)                                        ; Get the third word
  399                         stw             r7,0(r4)                                        ; Save the first word
  400                         lwz             r11,12(r6)                                      ; Get the fourth word
  401                         addi    r6,r6,16                                        ; Point to the next
  402                         stw             r8,4(r4)                                        ; Save the second word
  403                         stw             r9,8(r4)                                        ; Save the third word
  404                         stw             r11,12(r4)                                      ; Save the fourth word
  405                         addi    r4,r4,16                                        ; Bump sink
  406                         
  407 ;                       Sink is line aligned here
      ;
      ;                       Set up the full-line loop: r0 = number of 32-byte lines (r5 >> 5),
      ;                       r11 = bytes the loop will move (r5 with the low five bits cleared),
      ;                       r5 = residual bytes left for the back end, r10 = touch-ahead offset.
      ;                       CR bits 24-31 receive the low byte of the remaining length.
  408 
  409 alline:         rlwinm. r0,r5,27,5,31                           ; Get the number of full lines to move
  410                         mtcrf   3,r5                                            ; Make branch mask for backend partial moves
  411                         rlwinm  r11,r5,0,0,26                           ; Get number of bytes we are going to move
  412                         beq-    backend                                         ; No full lines to move
  413                         
  414                         sub             r5,r5,r11                                       ; Calculate the residual
  415                         li              r10,96                                          ; Stride for touch ahead
  416                         
  417 nxtline:        subic.  r0,r0,1                                         ; Account for the line now
  418 
  419                         bt-             noncache,skipz                          ; Skip if we are not cached...
  420                         dcbz    br0,r4                                          ; Blow away the whole line because we are replacing it
  421                         dcbt    r6,r10                                          ; Touch ahead a bit
  422                         
  423 skipz:          lwz             r7,0(r6)                                        ; Get the first word
  424                         lwz             r8,4(r6)                                        ; Get the second word
  425                         lwz             r9,8(r6)                                        ; Get the third word
  426                         stw             r7,0(r4)                                        ; Save the first word
  427                         lwz             r11,12(r6)                                      ; Get the fourth word
  428                         stw             r8,4(r4)                                        ; Save the second word
  429                         lwz             r7,16(r6)                                       ; Get the fifth word
  430                         stw             r9,8(r4)                                        ; Save the third word
  431                         lwz             r8,20(r6)                                       ; Get the sixth word
  432                         stw             r11,12(r4)                                      ; Save the fourth word
  433                         lwz             r9,24(r6)                                       ; Get the seventh word
  434                         stw             r7,16(r4)                                       ; Save the fifth word
  435                         lwz             r11,28(r6)                                      ; Get the eighth word
  436                         addi    r6,r6,32                                        ; Point to the next
  437                         stw             r8,20(r4)                                       ; Save the sixth word
  438                         stw             r9,24(r4)                                       ; Save the seventh word
  439                         stw             r11,28(r4)                                      ; Save the eighth word
  440                         addi    r4,r4,32                                        ; Bump sink
  441                         bgt+    nxtline                                         ; Do the next line, if any...
  442 
  443         
  444 ;                       Move backend quadword
      ;
      ;                       Forward backend: move whatever is left after the full lines.  Each
      ;                       step tests one CR bit (27 = 16 bytes, 28 = 8, 29 = 4, 30 = 2, 31 = 1),
      ;                       as loaded by the mtcrf at alline, so at most 31 bytes are moved here.
      ;                       Execution falls through into bcpydone when finished.
  445 
  446 backend:        bf              27,noquad                                       ; No quad to do...
  447                         lwz             r7,0(r6)                                        ; Get the first word
  448                         lwz             r8,4(r6)                                        ; Get the second word
  449                         lwz             r9,8(r6)                                        ; Get the third word
  450                         lwz             r11,12(r6)                                      ; Get the fourth word
  451                         stw             r7,0(r4)                                        ; Save the first word
  452                         addi    r6,r6,16                                        ; Point to the next
  453                         stw             r8,4(r4)                                        ; Save the second word
  454                         stw             r9,8(r4)                                        ; Save the third word
  455                         stw             r11,12(r4)                                      ; Save the fourth word
  456                         addi    r4,r4,16                                        ; Bump sink
  457                         
  458 ;                       Move backend double
  459 
  460 noquad:         bf              28,nodouble                                     ; No double to do...
  461                         lwz             r7,0(r6)                                        ; Get the first word
  462                         lwz             r8,4(r6)                                        ; Get the second word
  463                         addi    r6,r6,8                                         ; Point to the next
  464                         stw             r7,0(r4)                                        ; Save the first word
  465                         stw             r8,4(r4)                                        ; Save the second word
  466                         addi    r4,r4,8                                         ; Bump sink
  467                         
  468 ;                       Move backend word
  469 
  470 nodouble:       bf              29,noword                                       ; No word to do...
  471                         lwz             r7,0(r6)                                        ; Get the word
  472                         addi    r6,r6,4                                         ; Point to the next
  473                         stw             r7,0(r4)                                        ; Save the word
  474                         addi    r4,r4,4                                         ; Bump sink
  475                         
  476 ;                       Move backend halfword
  477 
  478 noword:         bf              30,nohalf                                       ; No halfword to do...
  479                         lhz             r7,0(r6)                                        ; Get the halfword
  480                         addi    r6,r6,2                                         ; Point to the next
  481                         sth             r7,0(r4)                                        ; Save the halfword
  482                         addi    r4,r4,2                                         ; Bump sink
  483 
  484 ;                       Move backend byte
  485 
  486 nohalf:         bf              31,bcpydone                                     ; Leave cuz we are all done...  
  487                         lbz             r7,0(r6)                                        ; Get the byte
  488                         stb             r7,0(r4)                                        ; Save the single (last byte, so the pointers are not advanced)
  489 
  490 bcpydone:       
      ;                       Common exit for both copy directions: the forward backend falls in
      ;                       here and the reverse backend branches here.  When the flipcache flag
      ;                       is set, undo the HID4 caching hack: clear the HID4 bit again, flush
      ;                       the ERAT with an slbie, and put the saved EE state back in the MSR.
      ;                       r9 holds the MSR image that bcpydone0 may modify further.
  491                         mfmsr   r9                                                      ; Get the MSR
  492                         bf++    flipcache,bcpydone0                     ; (HACK) No need to mess with caching...
  493 
  494                         li              r0,1                                            ; (HACK) Get a 1
  495                         mfxer   r10                                                     ; (HACK GLORIOUS HACK) Get the entry EE (presumably stashed in the XER by the entry code - confirm)
  496                         sldi    r0,r0,32+8                                      ; (HACK) Get the right bit to turn off caching
  497                         mfspr   r2,hid4                                         ; (HACK) Get HID4
  498                         rlwinm  r10,r10,31-MSR_EE_BIT,MSR_EE_BIT,MSR_EE_BIT     ; (HACK GLORIOUS HACK) Rotate the saved bit into the EE position and isolate it
  499                         andc    r2,r2,r0                                        ; (HACK) Clear the bit so real accesses are no longer cache-inhibited
  500                         or              r9,r9,r10                                       ; (HACK GLORIOUS HACK) Set the EE in MSR
  501                         sync                                                            ; (HACK) Sync up
  502                         mtspr   hid4,r2                                         ; (HACK) Make real accesses not cache-inhibited
  503                         isync                                                           ; (HACK) Toss prefetches
  504         
  505                         lis             r12,0xE000                                      ; (HACK) Get the unlikeliest ESID possible
  506                         srdi    r12,r12,1                                       ; (HACK) Make 0x7FFFFFFFF0000000
  507                         slbie   r12                                                     ; (HACK) Make sure the ERAT is cleared 
  508 
  509                         mtmsr   r9                                                      ; (HACK GLORIOUS HACK) Set EE properly
  510 
  511 bcpydone0:
      ;                       If the fixxlate flag is set, turn data translation back on and make
      ;                       sure FP and VEC are off in the MSR image held in r9.  The VEC|FP mask
      ;                       is built in r0 before the branch because bcpydone2 uses it as well.
  512                         lis             r0,hi16(MASK(MSR_VEC))          ; Get the vector bit
  513                         ori             r0,r0,lo16(MASK(MSR_FP))        ; Get the float bit
  514                         bf++    fixxlate,bcpydone1                      ; skip if we do not need to fix translation...
  515                         ori             r9,r9,lo16(MASK(MSR_DR))        ; Turn data translation on
  516                         andc    r9,r9,r0                                        ; Make sure that FP and VEC are off
  517                         mtmsr   r9                                                      ; Just do it
  518                         isync                                                           ; Hang in there
  519             
  520 bcpydone1:
      ;                       Return right away unless the restorex flag is set.  Otherwise reload
      ;                       the feature flags from SPRG2 and pick the 32-bit cleanup (invalidate
      ;                       DBAT 0 and 1, below) or the 64-bit cleanup (bcpydone2).
  521             bflr++      restorex                                        ; done if we do not have to fix up addressing
  522             mfsprg      r8,2                                            ; get the feature flags again
  523             mtcrf       0x02,r8                                         ; put pf64Bit where we can test it
  524             bt++        pf64Bitb,bcpydone2                      ; skip if 64-bit processor
  525             
  526             ; 32-bit processor, so clear out the BATs we set up for bcopy_physvir
  527             
  528             li          r0,0                                            ; Get set to invalidate upper half
  529                         sync                                                            ; Make sure all is well
  530                         mtdbatu 0,r0                                            ; Clear sink upper DBAT
  531                         mtdbatu 1,r0                                            ; Clear source upper DBAT
  532                         sync                                                            ; Make sure the DBAT updates are done
  533                         isync                   
  534                         blr                                                             ; Return to the caller
  535 
  536             ; 64-bit processor, so turn off 64-bit mode we turned on to do bcopy_phys
      ;                       r0 is expected to still hold the VEC|FP mask built at bcpydone0; the
      ;                       MSR is re-read here, FP/VEC and SF are cleared, and we return.
  537             
  538 bcpydone2:
  539             mfmsr       r9                                                      ; get MSR again
  540                         andc    r9,r9,r0                                        ; Make sure that FP and VEC are off
  541             rldicl      r9,r9,0,MSR_SF_BIT+1            ; clear SF (keeps only the bits below MSR_SF_BIT)
  542             mtmsrd      r9                                                      ; write the full 64-bit MSR
  543             isync
  544             blr                                                                 ; return to the caller
  545 
  546 
  547 ;
  548 ;                       0123456789ABCDEF0123456789ABCDEF
  549 ;                        0123456789ABCDEF0123456789ABCDEF
  550 ;                                                                                   F
  551 ;                                                                                 DE
  552 ;                                                                         9ABC
  553 ;                                                         12345678
  554 ;             123456789ABCDEF0  
  555 ;            0
  556 
  557 ;
  558 ;                       Here is where we handle a forward overlapping move.  These will be slow
  559 ;                       because we can not kill the cache of the destination until after we have
  560 ;                       loaded/saved the source area.  Also, reading memory backwards is slower
  561 ;                       when the cache line needs to be loaded: the critical doubleword (i.e.,
  562 ;                       the last one) is loaded first, then the fill wraps back to the first and
  563 ;                       continues in order.  That means that when we are at the second to last DW
  564 ;                       we have to wait until the whole line is in cache before we can proceed.
  565 ;
  566 
  567 G4reverseWord:                                                                  ; here from 64-bit code with word aligned uncached operands
      ;                       Reverse (high-to-low) copy for overlapping operands.  Both pointers
      ;                       are moved one past the end of their buffers, then bytes are moved
      ;                       backwards (1, 2, 4, 8, 16 as needed) until the sink pointer sits on a
      ;                       32-byte boundary.  r8 is a limit mask supplied by the code that
      ;                       branches here; r5 is reduced by the number of bytes moved.
  568 fwdovrlap:      add             r4,r5,r4                                        ; Point past the last sink byte
  569                         add             r6,r5,r6                                        ; Point past the last source byte 
  570                         and             r0,r4,r8                                        ; Apply movement limit
  571                         li              r12,-1                                          ; Make sure we touch in the actual line (offset -1 = last byte, not one past it)
  572                         mtcrf   3,r0                                            ; Figure out the best way to move backwards (low 8 bits of r0 -> CR fields 6 and 7)
  573                         dcbt    r12,r6                                          ; Touch in the last line of source
  574                         rlwinm. r0,r0,0,27,31                           ; Calculate the length to adjust to cache boundary (r0 & 31)
  575                         dcbtst  r12,r4                                          ; Touch in the last line of the sink
  576                         beq-    balline                                         ; Already on cache line boundary
  577                         
  578                         sub             r5,r5,r0                                        ; Precalculate move length left after alignment
  579                         
  580                         bf              31,balhalf                                      ; No single byte to do...
  581                         lbz             r7,-1(r6)                                       ; Get the byte
  582                         subi    r6,r6,1                                         ; Point to the next
  583                         stb             r7,-1(r4)                                       ; Save the single
  584                         subi    r4,r4,1                                         ; Bump sink
  585                         
  586 ;                       Sink is halfword aligned here
  587 
  588 balhalf:        bf              30,balword                                      ; No halfword to do...
  589                         lhz             r7,-2(r6)                                       ; Get the halfword
  590                         subi    r6,r6,2                                         ; Point to the next
  591                         sth             r7,-2(r4)                                       ; Save the halfword
  592                         subi    r4,r4,2                                         ; Bump sink
  593                         
  594 ;                       Sink is word aligned here
  595 
  596 balword:        bf              29,baldouble                            ; No word to do...
  597                         lwz             r7,-4(r6)                                       ; Get the word
  598                         subi    r6,r6,4                                         ; Point to the next
  599                         stw             r7,-4(r4)                                       ; Save the word
  600                         subi    r4,r4,4                                         ; Bump sink
  601                         
  602 ;                       Sink is double aligned here
  603 
  604 baldouble:      bf              28,balquad                                      ; No double to do...
  605                         lwz             r7,-8(r6)                                       ; Get the first word
  606                         lwz             r8,-4(r6)                                       ; Get the second word (r8 is free: the limit mask is no longer needed)
  607                         subi    r6,r6,8                                         ; Point to the next
  608                         stw             r7,-8(r4)                                       ; Save the first word
  609                         stw             r8,-4(r4)                                       ; Save the second word
  610                         subi    r4,r4,8                                         ; Bump sink
  611                         
  612 ;                       Sink is quadword aligned here
  613 
  614 balquad:        bf              27,balline                                      ; No quad to do...
  615                         lwz             r7,-16(r6)                                      ; Get the first word
  616                         lwz             r8,-12(r6)                                      ; Get the second word
  617                         lwz             r9,-8(r6)                                       ; Get the third word
  618                         lwz             r11,-4(r6)                                      ; Get the fourth word
  619                         stw             r7,-16(r4)                                      ; Save the first word
  620                         subi    r6,r6,16                                        ; Point to the next
  621                         stw             r8,-12(r4)                                      ; Save the second word
  622                         stw             r9,-8(r4)                                       ; Save the third word
  623                         stw             r11,-4(r4)                                      ; Save the fourth word
  624                         subi    r4,r4,16                                        ; Bump sink
  625                         
  626 ;                       Sink is line aligned here
      ;
      ;                       Main reverse loop: move (r5 / 32) full 32-byte lines from high to low
      ;                       addresses.  All eight words of a line are loaded before any is stored.
      ;                       The low bits of r5 are copied to CR fields 6 and 7 first, which is what
      ;                       lets the loop reuse r5 (and r2) as data registers.
  627 
  628 balline:        rlwinm. r0,r5,27,5,31                           ; Get the number of full lines to move (r5 >> 5); sets CR0
  629                         mtcrf   3,r5                                            ; Make branch mask for backend partial moves
  630                         beq-    bbackend                                        ; No full lines to move
  631 
  632 
  633 ;                       Registers in use: R0, R1,     R3, R4, R5, R6
  634 ;       Registers not in use:         R2,                 R7, R8, R9, R10, R11, R12 - Ok, we can make another free for 8 of them
  635                         
  636 bnxtline:       subic.  r0,r0,1                                         ; Account for the line now; CR0 is tested by the ble and bgt below
  637 
  638                         lwz             r7,-32(r6)                                      ; Get the first word
  639                         lwz             r5,-28(r6)                                      ; Get the second word (r5 is free: its low bits are already in the CR)
  640                         lwz             r2,-24(r6)                                      ; Get the third word
  641                         lwz             r12,-20(r6)                                     ; Get the fourth word
  642                         lwz             r11,-16(r6)                                     ; Get the fifth word
  643                         lwz             r10,-12(r6)                                     ; Get the sixth word
  644                         lwz             r9,-8(r6)                                       ; Get the seventh word
  645                         lwz             r8,-4(r6)                                       ; Get the eighth word
  646                         subi    r6,r6,32                                        ; Point to the next
  647                         
  648                         stw             r7,-32(r4)                                      ; Save the first word
  649                         ble-    bnotouch                                        ; Last time, skip touch of source...
  650                         dcbt    br0,r6                                          ; Touch in next source line
  651                         
  652 bnotouch:       stw             r5,-28(r4)                                      ; Save the second word
  653                         stw             r2,-24(r4)                                      ; Save the third word
  654                         stw             r12,-20(r4)                                     ; Save the fourth word
  655                         stw             r11,-16(r4)                                     ; Save the fifth word
  656                         stw             r10,-12(r4)                                     ; Save the sixth word
  657                         stw             r9,-8(r4)                                       ; Save the seventh word
  658                         stw             r8,-4(r4)                                       ; Save the eighth word
  659                         subi    r4,r4,32                                        ; Bump sink
  660                         
  661                         bgt+    bnxtline                                        ; Do the next line, if any...
  662 
  663 ;
  664 ;                       Note: We touched these lines in at the beginning
  665 ;
  666         
  667 ;                       Move backend quadword
      ;
      ;                       Reverse backend: move whatever is left below the last full line, still
      ;                       working from high to low addresses.  Each step tests one CR bit
      ;                       (27 = 16 bytes, 28 = 8, 29 = 4, 30 = 2, 31 = 1), as loaded by the mtcrf
      ;                       at balline, then control goes to the common exit at bcpydone.
  668 
  669 bbackend:       bf              27,bnoquad                                      ; No quad to do...
  670                         lwz             r7,-16(r6)                                      ; Get the first word
  671                         lwz             r8,-12(r6)                                      ; Get the second word
  672                         lwz             r9,-8(r6)                                       ; Get the third word
  673                         lwz             r11,-4(r6)                                      ; Get the fourth word
  674                         stw             r7,-16(r4)                                      ; Save the first word
  675                         subi    r6,r6,16                                        ; Point to the next
  676                         stw             r8,-12(r4)                                      ; Save the second word
  677                         stw             r9,-8(r4)                                       ; Save the third word
  678                         stw             r11,-4(r4)                                      ; Save the fourth word
  679                         subi    r4,r4,16                                        ; Bump sink
  680                         
  681 ;                       Move backend double
  682 
  683 bnoquad:        bf              28,bnodouble                            ; No double to do...
  684                         lwz             r7,-8(r6)                                       ; Get the first word
  685                         lwz             r8,-4(r6)                                       ; Get the second word
  686                         subi    r6,r6,8                                         ; Point to the next
  687                         stw             r7,-8(r4)                                       ; Save the first word
  688                         stw             r8,-4(r4)                                       ; Save the second word
  689                         subi    r4,r4,8                                         ; Bump sink
  690                         
  691 ;                       Move backend word
  692 
  693 bnodouble:      bf              29,bnoword                                      ; No word to do...
  694                         lwz             r7,-4(r6)                                       ; Get the word
  695                         subi    r6,r6,4                                         ; Point to the next
  696                         stw             r7,-4(r4)                                       ; Save the word
  697                         subi    r4,r4,4                                         ; Bump sink
  698                         
  699 ;                       Move backend halfword
  700 
  701 bnoword:        bf              30,bnohalf                                      ; No halfword to do...
  702                         lhz             r7,-2(r6)                                       ; Get the halfword
  703                         subi    r6,r6,2                                         ; Point to the next
  704                         sth             r7,-2(r4)                                       ; Save the halfword
  705                         subi    r4,r4,2                                         ; Bump sink
  706 
  707 ;                       Move backend byte
  708 
  709 bnohalf:        bf              31,bcpydone                                     ; Leave cuz we are all done...  
  710                         lbz             r7,-1(r6)                                       ; Get the byte
  711                         stb             r7,-1(r4)                                       ; Save the single (last byte, so the pointers are not adjusted)
  712                         
  713                         b               bcpydone                                        ; Go exit cuz we are all done...
  714 
  715 
  716 // Here on 64-bit processors, which have a 128-byte cache line.  This can be
  717 // called either in 32 or 64-bit mode, which makes the test for reverse moves
  718 // a little tricky.  We've already filtered out the (sou==dest) and (len==0)
  719 // special cases.
  720 //
  721 // When entered:
  722 //              r4 = destination (32 or 64-bit ptr)
  723 //              r5 = length (always 32 bits)
  724 //              r6 = source (32 or 64-bit ptr)
  725 //              cr5 = noncache, fixxlate, flipcache, and restorex flags set
      //
      // What this prologue computes:
      //              r7  = bytes needed to 128-byte align the dest, limited by the copy length
      //              cr1 = unsigned compare of (dest-source) with length; "less" means overlap,
      //                    so the copy must run in reverse
      // It then dispatches to c64uncached (noncache set), c64rdouble (cached reverse),
      // or falls through to c64double (cached forward).
      //
      // Mode test: add. doubles 0x40000000 to 0x80000000 and records the result in cr0.
      // That value is negative only when the compare is done on 32 bits, so the later
      // "bge" is taken when running in 64-bit mode.  Nothing between the add. and the
      // bge writes cr0.
  726 
  727         .align  5
  728 copyit64:
  729         lis             r2,0x4000                       // r2 = 0x00000000 40000000
  730         neg             r12,r4                          // start to compute #bytes to align dest
  731                 bt--    noncache,noncache1      // (HACK) Do not even try anything cached...
  732         dcbt    0,r6                            // touch in 1st block of source
  733 noncache1:     
  734         add.    r2,r2,r2                        // if 0x00000000 80000000 < 0, we are in 32-bit mode
  735         cntlzw  r9,r5                           // get highest power-of-2 in length
  736         rlwinm  r7,r12,0,25,31          // r7 <- bytes to 128-byte align dest
  737                 bt--    noncache,noncache2      // (HACK) Do not even try anything cached...
  738         dcbtst  0,r4                            // touch in 1st destination cache block
  739 noncache2:
  740         sraw    r2,r2,r9                        // get mask with 1s for leading 0s in length, plus 1 more 1-bit
  741         bge             copyit64a                       // skip if we are running in 64-bit mode
  742         rlwinm  r4,r4,0,0,31            // running in 32-bit mode, so truncate ptrs and lengths to 32 bits
  743         rlwinm  r5,r5,0,0,31
  744         rlwinm  r6,r6,0,0,31
  745 copyit64a:                                                      // now we can use 64-bit compares even if running in 32-bit mode
  746         sub             r8,r4,r6                        // get (dest-source)
  747         andc    r7,r7,r2                        // limit bytes to align by operand length
  748         cmpld   cr1,r8,r5                       // if (dest-source)<length, must move reverse
  749         bt--    noncache,c64uncached    // skip if uncached
  750         blt--   cr1,c64rdouble          // handle cached reverse moves        
  751         
  752         
  753 // Forward, cached or doubleword aligned uncached.  This is the common case.
  754 //   r4-r6 = dest, length, source (as above)
  755 //              r7 = #bytes 128-byte align dest (limited by copy length)
  756 //     cr5 = flags, as above
      //
      // Aligns the destination in two steps: first single bytes until it is doubleword
      // aligned (c64double1), then doublewords until it is 128-byte aligned (c64double3).
      // The counts for both loops, and the number of 128-byte chunks that follow, are
      // all computed up front; cr0, cr1 and cr7 carry the three "is it zero" results.
  757 
  758 c64double:
  759         andi.   r8,r7,7                         // r8 <- #bytes to doubleword align
  760         srwi    r9,r7,3                         // r9 <- #doublewords to 128-byte align
  761         sub             r5,r5,r7                        // adjust length remaining
  762         cmpwi   cr1,r9,0                        // any doublewords to move to cache align?
  763         srwi    r10,r5,7                        // r10 <- 128-byte chunks to xfer after aligning dest
  764         cmpwi   cr7,r10,0                       // set cr7 on chunk count
  765         beq             c64double2                      // dest already doubleword aligned
  766         mtctr   r8
  767         b               c64double1                      // jump over any padding the .align inserts
  768         
  769         .align  5                                       // align inner loops
  770 c64double1:                                                     // copy bytes until dest is doubleword aligned
  771         lbz             r0,0(r6)
  772         addi    r6,r6,1
  773         stb             r0,0(r4)
  774         addi    r4,r4,1
  775         bdnz    c64double1
  776 
  777 c64double2:                                                     // r9/cr1=doublewords, r10=128-byte chunks, cr7=beq if no 128-byte chunks
  778         beq             cr1,c64double4          // no doublewords to xfer in order to cache align
  779         mtctr   r9
  780         b               c64double3                      // jump over any padding the .align inserts
  781 
  782         .align  5                                       // align inner loops
  783 c64double3:                                                     // copy doublewords until dest is 128-byte aligned
  784         ld              r7,0(r6)
  785         addi    r6,r6,8
  786         std             r7,0(r4)
  787         addi    r4,r4,8
  788         bdnz    c64double3
  789         
  790 // Here to xfer 128-byte chunks, if any.  Because the IBM 970 cannot issue two stores/cycle,
  791 // we pipeline the inner loop so we can pair loads and stores.  Since we only have 8 GPRs for
  792 // data (64 bytes), we load/store each twice per 128-byte chunk.
      //
      // This part sets up the loop: it records in cr1 whether any doublewords will be left
      // over, turns on "noncache" if the source is less than 128 bytes above the dest (so the
      // loop will not use dcbz128), moves the chunk count to the CTR, loads the first 64 bytes
      // into r0, r2 and r7-r12, and enters the loop at c64InnerLoopEntryPt.
  793 
  794 c64double4:                                                     // r10/cr7=128-byte chunks
  795         rlwinm  r0,r5,29,28,31          // r0 <- count of leftover doublewords, after moving chunks
  796         cmpwi   cr1,r0,0                        // set cr1 on leftover doublewords (r0 itself is overwritten by the loads below)
  797         beq             cr7,c64double7          // no 128-byte chunks
  798         sub             r8,r6,r4                        // r8 <- (source - dest)
  799         li              r9,128                          // start at next cache line (we've already touched in 1st line)
  800         cmpldi  cr7,r8,128                      // if (source-dest)<128, cannot use dcbz128 because of overlap
  801         cror    noncache,cr7_lt,noncache        // turn on "noncache" flag if (source-dest)<128
  802                 bt--    noncache,noncache3      // (HACK) Skip cache touch if noncachable
  803         dcbt128 r9,r6,1                         // start forward stream
  804 noncache3:
  805         mtctr   r10                                     // CTR <- chunk count, freeing r10 for data
  806         
  807         ld              r0,0(r6)                        // start pipe: load 1st half-line
  808         ld              r2,8(r6)
  809         ld              r7,16(r6)
  810         ld              r8,24(r6)
  811         ld              r9,32(r6)
  812         ld              r10,40(r6)
  813         ld              r11,48(r6)
  814         ld              r12,56(r6)
  815                 b               c64InnerLoopEntryPt     // enter the loop with the 1st half-line already loaded
  816         
  817         .align  5                                       // align inner loop
  818 c64InnerLoop:                                           // loop copying 128-byte cache lines to 128-aligned destination
  819         std             r0,64(r4)                       // store 2nd half of chunk n
  820         ld              r0,0(r6)                        // load 1st half of chunk n+1
  821         std             r2,72(r4)
  822         ld              r2,8(r6)
  823         std             r7,80(r4)
  824         ld              r7,16(r6)
  825         std             r8,88(r4)
  826         ld              r8,24(r6)
  827         std             r9,96(r4)
  828         ld              r9,32(r6)
  829         std             r10,104(r4)
  830         ld              r10,40(r6)
  831         std             r11,112(r4)
  832         ld              r11,48(r6)
  833         std             r12,120(r4)
  834         ld              r12,56(r6)
  835         addi    r4,r4,128                       // advance to next dest chunk
  836 c64InnerLoopEntryPt:                            // initial entry into loop, with 1st halfline loaded        
  837         bt              noncache,c64InnerLoop1  // skip if uncached or overlap
  838         dcbz128 0,r4                            // avoid prefetch of next cache line
  839 c64InnerLoop1:
  840         std             r0,0(r4)                        // store 1st half of chunk n
  841         ld              r0,64(r6)                       // load 2nd half of chunk n
  842         std             r2,8(r4)
  843         ld              r2,72(r6)
  844         std             r7,16(r4)
  845         ld              r7,80(r6)
  846         std             r8,24(r4)
  847         ld              r8,88(r6)
  848         std             r9,32(r4)
  849         ld              r9,96(r6)
  850         std             r10,40(r4)
  851         ld              r10,104(r6)
  852         std             r11,48(r4)
  853         ld              r11,112(r6)
  854         std             r12,56(r4)
  855         ld              r12,120(r6)
  856         addi    r6,r6,128                       // advance to next source chunk if any
  857         bdnz    c64InnerLoop            // loop if more chunks
  858         
  859         std             r0,64(r4)                       // store 2nd half of last chunk
  860         std             r2,72(r4)
  861         std             r7,80(r4)
  862         std             r8,88(r4)
  863         std             r9,96(r4)
  864         std             r10,104(r4)
  865         std             r11,112(r4)
  866         std             r12,120(r4)
  867         addi    r4,r4,128                       // advance to next dest chunk
  868 
  869 c64double7:                         // r5 <- leftover bytes, cr1 set on doubleword count
      // Copy the doublewords left after the 128-byte chunks, then fall through to c64byte with
      // r5/cr0 describing the final 0-7 bytes.  cr1 is not set here: c64double4 sets it from the
      // same (r5>>3)&15 field before reaching this label.
  870         rlwinm  r0,r5,29,28,31          // r0 <- count of leftover doublewords (0-15); recomputed because the chunk loop uses r0 for data
  871         andi.   r5,r5,7                         // r5/cr0 <- count of leftover bytes (0-7)
  872         beq             cr1,c64byte                     // no leftover doublewords
  873         mtctr   r0                              // ctr <- doubleword count (nonzero here, per cr1)
  874         b               c64double8              // branch over the alignment padding
  875         
  876         .align  5                                       // align inner loop
  877 c64double8:                                                     // loop copying leftover doublewords
  878         ld              r0,0(r6)
  879         addi    r6,r6,8
  880         std             r0,0(r4)
  881         addi    r4,r4,8
  882         bdnz    c64double8              // none of the loop instructions touch cr0, so it still reflects r5
  883 
  884 
  885 // Forward byte loop.
      // Copies r5 bytes, one at a time in ascending address order, from r6 to r4.
      // On entry cr0 must already reflect a compare of r5 against zero; it is tested
      // immediately, so a zero count never reaches mtctr.
  886 
  887 c64byte:                                                        // r5/cr0 <- byte count (can be big if unaligned uncached)
  888                 beq             bcpydone                        // done if no leftover bytes
  889         mtctr   r5                              // ctr <- byte count (nonzero)
  890         b               c64byte1                // branch over the alignment padding
  891         
  892         .align  5                                       // align inner loop
  893 c64byte1:
  894         lbz             r0,0(r6)                        // r0 <- next source byte
  895         addi    r6,r6,1
  896         stb             r0,0(r4)                        // store it to dest
  897         addi    r4,r4,1
  898         bdnz    c64byte1
  899 
  900         b               bcpydone
  901 
  902 
  903 // Uncached copies.  We must avoid unaligned accesses, since they always take alignment
  904 // exceptions on uncached memory on 64-bit processors.  This may mean we copy long operands
  905 // a byte at a time, but that is still much faster than alignment exceptions.
  906 //   r4-r6 = dest, length, source (as above)
  907 //              r2 = mask of 1s for leading 0s in length, plus 1 extra 1
  908 //              r7 = #bytes to copy to 128-byte align dest (limited by operand length)
  909 //         cr1 = blt if reverse move required
      // The path is chosen from the relative alignment of source and dest (low bits of
      // source XOR dest): doubleword -> c64double, word -> G4word, otherwise the byte loop.
  910 
  911 c64uncached:
  912         xor             r0,r6,r4                        // get relative alignment
  913         rlwinm  r10,r0,0,29,31          // r10 <- low 3 bits: zero if relatively doubleword aligned
  914         rlwinm  r11,r0,0,30,31          // r11 <- low 2 bits: zero if relatively word aligned
  915         not             r8,r2                           // get mask to limit initial length of copy for G4word
  916         blt             cr1,c64reverseUncached  // reverse move required (cr1 from caller); cr1 is reused below
  917         
  918         cmpwi   cr0,r10,0                       // set cr0 beq if doubleword aligned
  919         cmpwi   cr1,r11,0                       // set cr1 beq if word aligned
  920         beq             cr0,c64double           // doubleword aligned
  921         beq             cr1,G4word                      // word aligned, use G3/G4 code
  922         cmpwi   r5,0                            // set cr0 on byte count, as c64byte expects
  923         b               c64byte                         // unaligned operands
  924 
  925 c64reverseUncached:
      // Reverse (descending) counterpart of the dispatch in c64uncached.  r10/r11 hold the
      // relative doubleword/word alignment bits computed there.
  926         cmpwi   cr0,r10,0                       // set cr0 beq if doubleword aligned
  927         cmpwi   cr1,r11,0                       // set cr1 beq if word aligned
  928         beq             cr0,c64rdouble          // doubleword aligned so can use LD/STD
  929         beq             cr1,G4reverseWord       // word aligned, use G3/G4 code
  930         add             r6,r6,r5                        // point to (end+1) of source and dest
  931         add             r4,r4,r5                        // (c64rbyte pre-decrements, so it expects end+1 pointers)
  932         cmpwi   r5,0                            // set cr0 on length, as c64rbyte expects
  933         b               c64rbyte                        // copy a byte at a time
  934         
  935         
  936 
  937 // Reverse doubleword copies.  This is used for all cached copies, and doubleword
  938 // aligned uncached copies.
  939 //              r4 = destination (32 or 64-bit ptr)
  940 //              r5 = length (always 32 bits)
  941 //              r6 = source (32 or 64-bit ptr)
  942 //              cr5 = noncache, fixxlate, and restorex flags set
      // Works downward from the end of both operands: bytes until the dest pointer is
      // doubleword aligned, then 64-byte chunks, then leftover doublewords, and finally
      // falls through to c64rbyte for the last 0-7 bytes.
  943 
  944 c64rdouble:
  945         add             r6,r6,r5                        // point to (end+1) of source and dest
  946         add             r4,r4,r5
  947         rlwinm. r7,r4,0,29,31           // r7 <- #bytes to doubleword align dest; cr0 beq if none needed
  948         cmplw   cr1,r7,r5                       // operand long enough to doubleword align?
  949         blt             cr1,c64rd0                      // yes
  950         mr              r7,r5                           // no: limit the alignment copy to the whole operand
  951 c64rd0:
  952         sub             r5,r5,r7                        // adjust length
  953         srwi    r8,r5,6                         // r8 <- 64-byte chunks to xfer
  954         cmpwi   cr1,r8,0                        // any chunks?
  955         beq             c64rd2                          // dest already doubleword aligned (cr0 still from the rlwinm. above)
      // NOTE(review): if r5 were 0 on entry with an unaligned end address, r7 would be clamped to 0
      // while cr0 stays "ne", so mtctr would load 0 -- presumably zero lengths are filtered earlier; confirm.
  956         mtctr   r7                              // ctr <- number of bytes to copy before doubleword moves
  957 
  958 c64rd1:                                                         // copy bytes until dest doubleword aligned
  959         lbzu    r0,-1(r6)                       // pre-decrement source and load a byte
  960         stbu    r0,-1(r4)                       // pre-decrement dest and store it
  961         bdnz    c64rd1
  962         
  963 c64rd2:                                                         // r8/cr1 <- count of 64-byte chunks
  964         rlwinm  r0,r5,29,29,31          // r0 <- (r5>>3)&7 = count of leftover doublewords
  965         andi.   r5,r5,7                         // r5/cr0 <- count of leftover bytes (cr0 is tested at c64rbyte)
  966         cmpwi   cr7,r0,0                        // leftover doublewords? (cr7 is tested at c64rd4)
  967         beq             cr1,c64rd4                      // no chunks to xfer
  968         li              r9,-128                         // start at next cache line
  969         mtctr   r8                              // ctr <- number of 64-byte chunks; r7-r9 are reused for data in the loop
  970         bt              noncache,c64rd3         // (HACK) Do not start a stream if noncachable...
  971         dcbt128 r9,r6,3                         // start reverse stream
  972         b               c64rd3                  // branch over the alignment padding
  973         
  974         .align  5                                       // align inner loop
  975 c64rd3:                                                         // loop copying 64-byte chunks
      // Six doublewords are loaded first; r7/r8 are reloaded for the last two once their first
      // values are stored.  The update forms (ldu/stdu) step r6 and r4 down by 64 per pass.
  976         ld              r7,-8(r6)
  977         ld              r8,-16(r6)
  978         ld              r9,-24(r6)
  979         ld              r10,-32(r6)
  980         ld              r11,-40(r6)
  981         ld              r12,-48(r6)
  982         std             r7,-8(r4)
  983         std             r8,-16(r4)
  984         ld              r7,-56(r6)
  985         ldu             r8,-64(r6)                      // last load of the chunk; r6 <- r6 - 64
  986         std             r9,-24(r4)
  987         std             r10,-32(r4)
  988         std             r11,-40(r4)
  989         std             r12,-48(r4)
  990         std             r7,-56(r4)
  991         stdu    r8,-64(r4)                      // last store of the chunk; r4 <- r4 - 64
  992         bdnz    c64rd3
  993 
  994 c64rd4:                                                         // r0/cr7 = leftover doublewords  r5/cr0 = leftover bytes
  995         beq             cr7,c64rbyte            // no leftover doublewords
  996         mtctr   r0                              // ctr <- leftover doubleword count (nonzero here, per cr7)
  997         
  998 c64rd5:                                                         // loop copying leftover doublewords
  999         ldu             r0,-8(r6)
 1000         stdu    r0,-8(r4)
 1001         bdnz    c64rd5
 1002 
 1003 
 1004 // Reverse byte loop.
      // Copies r5 bytes, one at a time in descending address order.  r6 and r4 must point to
      // (end+1) of the remaining source and dest, since each access pre-decrements.  On entry
      // cr0 must already reflect a compare of r5 against zero.
 1005 
 1006 c64rbyte:                                                       // r5/cr0 <- byte count (can be big if unaligned uncached)
 1007         beq             bcpydone                        // done if no leftover bytes
 1008         mtctr   r5                              // ctr <- byte count (nonzero)
 1009         
 1010 c64rbyte1:
 1011         lbzu    r0,-1(r6)                       // pre-decrement source and load a byte
 1012         stbu    r0,-1(r4)                       // pre-decrement dest and store it
 1013         bdnz    c64rbyte1
 1014 
 1015         b               bcpydone
 1016
Cache object: 9e2959bc32fd2bca28beec24dc69ed03
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/osfmk/ppc/bcopy.s

FreeBSD/Linux Kernel Cross Reference
sys/osfmk/ppc/bcopy.s