FreeBSD/Linux Kernel Cross Reference
sys/osfmk/ppc/bzero.s


/*
 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <ppc/asm.h>
#include <ppc/exception.h>
#include <assym.s>

        .text
        .align  2
        .globl  _memset
        .globl  _bzero
        .globl  _bzero_nc
        .globl  _bzero_phys


// ***********************
// * B Z E R O _ P H Y S *
// ***********************
//
// void bzero_phys(addr64_t phys_addr, uint32_t length);
//
// Takes a phys addr in (r3,r4), and length in r5.  We leave cache on.

        .align  5
LEXT(bzero_phys)
        mflr    r12                     // save return address
        rlwinm  r3,r3,0,1,0             // coalesce long-long in (r3,r4) into reg64_t in r3
        rlwimi  r3,r4,0,0,31
        mr      r4,r5                   // put length where bzero() expects it
        bl      EXT(ml_set_physical_get_ffs) // turn DR off, SF on, features in cr6, old MSR in r11
        bl      EXT(bzero)              // use normal bzero() routine
        mtlr    r12                     // restore return
        b       EXT(ml_restore)         // restore MSR, turning DR on and SF off

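// The rlwinm/rlwimi pair above merges the two 32-bit halves of the physical
// address into one 64-bit register.  A minimal C sketch of the same operation
// (illustrative only; 'hi' and 'lo' stand in for r3 and r4):
//
//      #include <stdint.h>
//
//      static inline uint64_t coalesce64(uint32_t hi, uint32_t lo) {
//          return ((uint64_t)hi << 32) | lo;   // hi:lo -> one 64-bit value
//      }
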

// *******************
// * B Z E R O _ N C *
// *******************
//
//      void bzero_nc(char *addr, unsigned int length);
//
// For use with uncached memory.  This routine does not appear to be used
// anywhere, so it is probably not performance-critical.  NB: we must avoid
// unaligned stores, because some machines (e.g., the 970) take alignment
// exceptions on _any_ unaligned op to uncached memory.  Of course, we must
// also avoid dcbz.

LEXT(bzero_nc)
        cmplwi  cr1,r4,20               // too short to bother with 16-byte loops?
        cmplwi  cr7,r4,0                // check for (len==0)
        li      r6,0                    // get a 0
        bge     cr1,bznc1               // skip if length >= 20
        mtctr   r4                      // set up byte loop
        beqlr-- cr7                     // done if len=0

// Short operands, loop over bytes.

bznc0:
        stb     r6,0(r3)
        addi    r3,r3,1
        bdnz    bznc0
        blr

// Handle operands long enough to do doubleword stores; we must doubleword
// align, to avoid alignment exceptions.

bznc1:
        neg     r7,r3                   // start to compute #bytes to align
        mfsprg  r10,2                   // get feature flags
        andi.   r0,r7,7                 // get #bytes to doubleword align
        mr      r5,r3                   // make copy of operand ptr as bzero_tail expects
        mtcrf   0x02,r10                // put pf64Bitb etc in cr6
        beq     bzero_tail              // already doubleword aligned
        sub     r4,r4,r0                // adjust count
        mtctr   r0                      // set up loop
bznc2:                                  // zero bytes until doubleword aligned
        stb     r6,0(r5)
        addi    r5,r5,1
        bdnz    bznc2
        b       bzero_tail              // join bzero, now that r5 is aligned

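// A hedged C sketch of bzero_nc's shape (illustrative only, not the kernel
// code): byte stores for short operands, otherwise byte stores up to an
// 8-byte boundary so no store is ever unaligned, then aligned doublewords.
// bzero_nc_sketch is a hypothetical name, and memcpy merely stands in for
// the aligned std stores; a compiler may emit something different.
//
//      #include <stdint.h>
//      #include <string.h>
//
//      static void bzero_nc_sketch(unsigned char *p, unsigned int len) {
//          if (len < 20) {                             // too short to bother aligning
//              while (len--) *p++ = 0;
//              return;
//          }
//          unsigned int align = (unsigned int)(-(uintptr_t)p & 7); // bytes to 8-byte align
//          len -= align;
//          while (align--) *p++ = 0;                   // align with byte stores only
//          uint64_t zero = 0;
//          for (; len >= 8; p += 8, len -= 8)
//              memcpy(p, &zero, 8);                    // aligned 8-byte stores
//          while (len--) *p++ = 0;                     // trailing bytes
//      }
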

// *************     ***************
// * B Z E R O * and * M E M S E T *
// *************     ***************
//
// void *memset(void *b, int c, size_t len);
// void  bzero(void *b, size_t len);
//
// These routines support G3, G4, and the 970, and run in both 32 and
// 64-bit mode.  Lengths (size_t) are always 32 bits.
//
// Register use:
//    r0 = temp
//    r2 = temp
//    r3 = original ptr, not changed since memset returns it
//    r4 = count of bytes to set
//    r5 = working operand ptr ("rp")
//    r6 = value to store (usually 0)
// r7-r9 = temps
//   r10 = feature flags
//   r11 = old MSR (if bzero_phys)
//   r12 = return address (if bzero_phys)
//   cr6 = feature flags (pf64Bit, pf128Byte, and pf32Byte)

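// A hedged C outline of the overall strategy (illustrative only): align to a
// cache-line boundary (the real code does this with wide stores), clear whole
// lines with dcbz, then finish the tail with ordinary stores.  The helpers
// cache_line_size() and dcbz_line() are stand-ins for the feature-flag test
// and the dcbz / dcbz128 instructions used below.
//
//      #include <stdint.h>
//
//      static uint32_t cache_line_size(void) { return 32; }   // 32 on G3/G4, 128 on 970
//      static void dcbz_line(unsigned char *p, uint32_t n) {
//          for (uint32_t i = 0; i < n; i++) p[i] = 0;         // dcbz zeroes the line in cache
//      }
//
//      static void bzero_sketch(unsigned char *p, uint32_t len) {
//          uint32_t line = cache_line_size();
//          if (len >= line) {
//              uint32_t align = (uint32_t)(-(uintptr_t)p & (line - 1));
//              len -= align;
//              while (align--) *p++ = 0;                      // align to a cache line
//              for (; len >= line; p += line, len -= line)
//                  dcbz_line(p, line);                        // zero a whole line at once
//          }
//          while (len--) *p++ = 0;                            // trailing bytes
//      }
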
        .align  5
LEXT(memset)                            // void *memset(void *b, int c, size_t len);
        andi.   r6,r4,0xFF              // copy value to working register, test for 0
        mr      r4,r5                   // move length to working register
        bne--   memset1                 // branch to memset1 if value is nonzero
LEXT(bzero)                             // void bzero(void *b, size_t len);
        dcbtst  0,r3                    // touch in 1st cache block
        mfsprg  r10,2                   // get features
        li      r6,0                    // get a 0
        neg     r7,r3                   // start to compute #bytes to align
        andi.   r0,r10,pf128Byte+pf32Byte // get cache line size
        mtcrf   0x02,r10                // put pf128Byte etc in cr6
        cmplw   r4,r0                   // operand length >= cache line size?
        mr      r5,r3                   // make copy of operand ptr (can't change r3)
        blt     bzero_tail              // too short for dcbz (or dcbz128)
        rlwinm  r0,r7,0,0x1F            // get #bytes to 32-byte align
        rlwinm  r9,r7,0,0x7F            // get #bytes to 128-byte align
        bt++    pf128Byteb,bzero_128    // take 128-byte path on 128-byte processors

// Operand length >= 32 and cache line size is 32.
//              r0 = #bytes to 32-byte align
//              r4 = length
//              r5 = ptr to operand
//              r6 = 0

        sub     r2,r4,r0                // adjust length
        cmpwi   cr1,r0,0                // already 32-byte aligned?
        srwi.   r8,r2,5                 // get #32-byte chunks
        beq     bzero_tail              // not long enough to dcbz
        mtctr   r8                      // set up loop count
        rlwinm  r4,r2,0,27,31           // mask down to leftover byte count
        beq     cr1,bz_dcbz32           // skip if already 32-byte aligned

// 32-byte align.  We just store 32 zero bytes rather than testing and using
// conditional branches; this is usually faster because there are no branch
// mispredicts.

        stw     r6,0(r5)                // zero next 32 bytes
        stw     r6,4(r5)
        stw     r6,8(r5)
        stw     r6,12(r5)
        stw     r6,16(r5)
        stw     r6,20(r5)
        stw     r6,24(r5)
        stw     r6,28(r5)
        add     r5,r5,r0                // now r5 is 32-byte aligned
        b       bz_dcbz32
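
// The alignment trick above in a hedged C sketch (illustrative only;
// align32_sketch is a hypothetical name): store a full 32 zero bytes
// unconditionally, then advance only to the next 32-byte boundary; the
// overlapping bytes are simply zeroed again by the dcbz loop, which is
// cheaper than a branchy byte-at-a-time alignment loop.
//
//      #include <stdint.h>
//
//      static unsigned char *align32_sketch(unsigned char *p) {
//          for (int i = 0; i < 32; i++) p[i] = 0;      // unconditional 32-byte store
//          return p + (uint32_t)(-(uintptr_t)p & 31);  // bump to 32-byte boundary
//      }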

// Loop doing 32-byte version of DCBZ instruction.

        .align  4                       // align the inner loop
bz_dcbz32:
        dcbz    0,r5                    // zero another 32 bytes
        addi    r5,r5,32
        bdnz    bz_dcbz32

// Store trailing bytes.  This routine is used both by bzero and memset.
//              r4 = #bytes to store (may be large if memset)
//              r5 = address
//              r6 = value to store (in all 8 bytes)
//             cr6 = pf64Bit etc flags

bzero_tail:
        srwi.   r0,r4,4                 // get #(16-byte-chunks)
        mtcrf   0x01,r4                 // remaining byte count to cr7
        beq     bzt3                    // no 16-byte chunks
        mtctr   r0                      // set up loop count
        bt++    pf64Bitb,bzt2           // skip if 64-bit processor
        b       bzt1
        .align  5
bzt1:                                   // loop over 16-byte chunks on 32-bit processor
        stw     r6,0(r5)
        stw     r6,4(r5)
        stw     r6,8(r5)
        stw     r6,12(r5)
        addi    r5,r5,16
        bdnz    bzt1
        b       bzt3
        .align  5
bzt2:                                   // loop over 16-byte chunks on 64-bit processor
        std     r6,0(r5)
        std     r6,8(r5)
        addi    r5,r5,16
        bdnz    bzt2
        bf      28,bzt4                 // 8-byte chunk?
        std     r6,0(r5)
        addi    r5,r5,8
        b       bzt4
bzt3:
        bf      28,bzt4                 // 8-byte chunk?
        stw     r6,0(r5)
        stw     r6,4(r5)
        addi    r5,r5,8
bzt4:
        bf      29,bzt5                 // word?
        stw     r6,0(r5)
        addi    r5,r5,4
bzt5:
        bf      30,bzt6                 // halfword?
        sth     r6,0(r5)
        addi    r5,r5,2
bzt6:
        bflr    31                      // byte?
        stb     r6,0(r5)
        blr

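// A hedged C sketch of the tail logic above (illustrative only): the mtcrf
// moves the low four bits of the remaining count into cr7, and each bf test
// corresponds to testing one bit of the count (bit 28 = 8, 29 = 4, 30 = 2,
// 31 = 1).  memcpy stands in for the std/stw/sth stores; copying any prefix
// of v works because r6 holds the fill byte replicated in all 8 byte lanes.
//
//      #include <stddef.h>
//      #include <stdint.h>
//      #include <string.h>
//
//      static void zero_tail_sketch(unsigned char *p, size_t len, uint64_t v) {
//          for (; len >= 16; p += 16, len -= 16) {      // 16-byte chunks
//              memcpy(p, &v, 8);
//              memcpy(p + 8, &v, 8);
//          }
//          if (len & 8) { memcpy(p, &v, 8); p += 8; }   // cr7 bit 28
//          if (len & 4) { memcpy(p, &v, 4); p += 4; }   // cr7 bit 29
//          if (len & 2) { memcpy(p, &v, 2); p += 2; }   // cr7 bit 30
//          if (len & 1) { *p = (unsigned char)v; }      // cr7 bit 31
//      }
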
// Operand length is >= 128 and cache line size is 128.  Because the line
// size is 128 bytes, we assume this is a 64-bit processor.
//              r4 = length
//              r5 = ptr to operand
//              r6 = 0
//              r7 = neg(r5)
//              r9 = #bytes to 128-byte align

        .align  5
bzero_128:
        sub     r2,r4,r9                // r2 <- length remaining after cache-line aligning
        rlwinm  r0,r7,0,0xF             // r0 <- #bytes to 16-byte align
        srwi.   r8,r2,7                 // r8 <- number of cache lines to 0
        std     r6,0(r5)                // always store 16 bytes to 16-byte align...
        std     r6,8(r5)                // ...even if too short for dcbz128
        add     r5,r5,r0                // 16-byte align ptr
        sub     r4,r4,r0                // adjust count
        beq     bzero_tail              // r8==0, not long enough to dcbz128
        sub.    r7,r9,r0                // get #bytes remaining to 128-byte align
        rlwinm  r4,r2,0,0x7F            // r4 <- length remaining after dcbz128'ing
        mtctr   r8                      // set up dcbz128 loop
        beq     bz_dcbz128              // already 128-byte aligned
        b       bz_align                // enter loop over 16-byte chunks

// 128-byte align by looping over 16-byte chunks.

        .align  5
bz_align:                               // loop over 16-byte chunks
        subic.  r7,r7,16                // more to go?
        std     r6,0(r5)
        std     r6,8(r5)
        addi    r5,r5,16
        bgt     bz_align

        b       bz_dcbz128              // enter dcbz128 loop

// Loop over 128-byte cache lines.
//              r4 = length remaining after cache lines (0..127)
//              r5 = ptr (128-byte aligned)
//              r6 = 0
//              ctr = count of cache lines to 0

        .align  5
bz_dcbz128:
        dcbz128 0,r5                    // zero a 128-byte cache line
        addi    r5,r5,128
        bdnz    bz_dcbz128

        b       bzero_tail              // handle leftovers


// Handle memset() for nonzero values.  This case is relatively infrequent;
// the large majority of memset() calls are for 0.
//              r3 = ptr
//              r4 = count
//              r6 = value in lower byte (nonzero)

memset1:
        cmplwi  r4,16                   // too short to bother aligning?
        rlwimi  r6,r6,8,16,23           // replicate value to low 2 bytes
        mr      r5,r3                   // make working copy of operand ptr
        rlwimi  r6,r6,16,0,15           // value now in all 4 bytes
        blt     bzero_tail              // length<16, we won't be using "std"
        mfsprg  r10,2                   // get feature flags
        neg     r7,r5                   // start to compute #bytes to align
        rlwinm  r6,r6,0,1,0             // value now in all 8 bytes (if 64-bit)
        andi.   r0,r7,7                 // r0 <- #bytes to doubleword align
        stw     r6,0(r5)                // store 8 bytes to avoid a loop
        stw     r6,4(r5)
        mtcrf   0x02,r10                // get pf64Bit flag etc in cr6
        sub     r4,r4,r0                // adjust count
        add     r5,r5,r0                // doubleword align ptr
        b       bzero_tail

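// A hedged C sketch of the byte replication above (illustrative only): the
// two rlwimi instructions fan the fill byte out to all four bytes of r6, and
// the rlwinm rotate trick extends it to all eight bytes on 64-bit parts.
//
//      #include <stdint.h>
//
//      static uint64_t replicate_byte(int c) {
//          uint32_t v = (uint32_t)c & 0xFF;
//          v |= v << 8;                            // value in low 2 bytes
//          v |= v << 16;                           // value in all 4 bytes
//          return ((uint64_t)v << 32) | v;         // value in all 8 bytes
//      }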
