
FreeBSD/Linux Kernel Cross Reference
sys/osfmk/ppc/bzero.s


/*
 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <ppc/asm.h>
#include <ppc/exception.h>
#include <assym.s>

        .text
        .align  2
        .globl  _memset
        .globl  _bzero
        .globl  _bzero_nc
        .globl  _bzero_phys


// ***********************
// * B Z E R O _ P H Y S *
// ***********************
//
// void bzero_phys(addr64_t phys_addr, uint32_t length);
//
// Takes a phys addr in (r3,r4), and length in r5.  We leave cache on.

        .align  5
LEXT(bzero_phys)
        mflr    r12                     // save return address
        rlwinm  r3,r3,0,1,0             // coalesce long-long in (r3,r4) into reg64_t in r3
        rlwimi  r3,r4,0,0,31            // insert low-order 32 bits from r4
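                                        // (how the merge works: rlwinm with the wrapping mask MB=1,ME=0
                                        //  copies the low word of r3 into the upper half of the 64-bit
                                        //  register, and rlwimi then overwrites the low word with r4,
                                        //  leaving the full 64-bit physical address in r3)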
        mr      r4,r5                   // put length where bzero() expects it
        bl      EXT(ml_set_physical_get_ffs)    // turn DR off, SF on, features in cr6, old MSR in r11
        bl      EXT(bzero)              // use normal bzero() routine
        mtlr    r12                     // restore return
        b       EXT(ml_restore)         // restore MSR, turning DR on and SF off


// *******************
// * B Z E R O _ N C *
// *******************
//
//      void bzero_nc(char *addr, unsigned int length);
//
// For use with uncached memory.  Doesn't seem to be used at all, so probably not
// performance critical.  NB: we must avoid unaligned stores, because some
// machines (eg, 970) take alignment exceptions on _any_ unaligned op to uncached
// memory.  Of course, we must also avoid dcbz.

LEXT(bzero_nc)
        cmplwi  cr1,r4,20               // too short to bother with 16-byte loops?
        cmplwi  cr7,r4,0                // check for (len==0)
        li      r6,0                    // get a 0
        bge     cr1,bznc1               // skip if length >=20
        mtctr   r4                      // set up byte loop
        beqlr-- cr7                     // done if len=0
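                                        // (the "++" / "--" suffixes used on branches in this file are
                                        //  static prediction hints: ++ marks a branch as expected
                                        //  taken, -- as expected not taken)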

// Short operands, loop over bytes.

bznc0:
        stb     r6,0(r3)
        addi    r3,r3,1
        bdnz    bznc0
        blr

// Handle operands long enough to do doubleword stores; we must doubleword
// align, to avoid alignment exceptions.

bznc1:
        neg     r7,r3                   // start to compute #bytes to align
        mfsprg  r10,2                   // get feature flags
        andi.   r0,r7,7                 // get #bytes to doubleword align
        mr      r5,r3                   // make copy of operand ptr as bzero_tail expects
        mtcrf   0x02,r10                // put pf64Bitb etc in cr6
        beq     bzero_tail              // already doubleword aligned
        sub     r4,r4,r0                // adjust count
        mtctr   r0                      // set up loop
bznc2:                                  // zero bytes until doubleword aligned
        stb     r6,0(r5)
        addi    r5,r5,1
        bdnz    bznc2
        b       bzero_tail              // join bzero, now that r5 is aligned


// *************     ***************
// * B Z E R O * and * M E M S E T *
// *************     ***************
//
// void *   memset(void *b, int c, size_t len);
// void     bzero(void *b, size_t len);
//
// These routines support G3, G4, and the 970, and run in both 32 and
// 64-bit mode.  Lengths (size_t) are always 32 bits.
//
// Register use:
//    r0 = temp
//    r2 = temp
//    r3 = original ptr, not changed since memset returns it
//    r4 = count of bytes to set
//    r5 = working operand ptr ("rp")
//    r6 = value to store (usually 0)
// r7-r9 = temps
//   r10 = feature flags
//   r11 = old MSR (if bzero_phys)
//   r12 = return address (if bzero_phys)
//   cr6 = feature flags (pf64Bit, pf128Byte, and pf32Byte)
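//
// (cr6 is loaded with "mtcrf 0x02,r10", which copies a single 4-bit CR field
// from the feature-flag word, so the bt++ tests on pf64Bitb and pf128Byteb
// below can read the flags without referencing r10 again.)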

        .align  5
LEXT(memset)                            // void *   memset(void *b, int c, size_t len);
        andi.   r6,r4,0xFF              // copy value to working register, test for 0
        mr      r4,r5                   // move length to working register
        bne--   memset1                 // skip if nonzero
LEXT(bzero)                             // void bzero(void *b, size_t len);
        dcbtst  0,r3                    // touch in 1st cache block
        mfsprg  r10,2                   // get features
        li      r6,0                    // get a 0
        neg     r7,r3                   // start to compute #bytes to align
        andi.   r0,r10,pf128Byte+pf32Byte // get cache line size
        mtcrf   0x02,r10                // put pf128Byte etc in cr6
        cmplw   r4,r0                   // operand length >= cache line size?
        mr      r5,r3                   // make copy of operand ptr (can't change r3)
        blt     bzero_tail              // too short for dcbz (or dcbz128)
        rlwinm  r0,r7,0,0x1F            // get #bytes to 32-byte align
        rlwinm  r9,r7,0,0x7F            // get #bytes to 128-byte align
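                                        // (r7 = -r3, so the low bits of r7 give the distance to the
                                        //  next aligned boundary: e.g. if r3 ends in 0x24, r0 = 0x1C
                                        //  and r9 = 0x5C, which round r3 up to 32- and 128-byte
                                        //  boundaries respectively)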
        bt++    pf128Byteb,bzero_128    // skip if 128-byte processor

// Operand length >=32 and cache line size is 32.
//              r0 = #bytes to 32-byte align
//              r4 = length
//              r5 = ptr to operand
//              r6 = 0

        sub     r2,r4,r0                // adjust length
        cmpwi   cr1,r0,0                // already 32-byte aligned?
        srwi.   r8,r2,5                 // get #32-byte chunks
        beq     bzero_tail              // not long enough to dcbz
        mtctr   r8                      // set up loop count
        rlwinm  r4,r2,0,27,31           // mask down to leftover byte count
        beq     cr1,bz_dcbz32           // skip if already 32-byte aligned

// 32-byte align.  We just store 32 0s, rather than test and use conditional
// branches.  This is usually faster, because there are no mispredicts.

        stw     r6,0(r5)                // zero next 32 bytes
        stw     r6,4(r5)
        stw     r6,8(r5)
        stw     r6,12(r5)
        stw     r6,16(r5)
        stw     r6,20(r5)
        stw     r6,24(r5)
        stw     r6,28(r5)
        add     r5,r5,r0                // now r5 is 32-byte aligned
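                                        // (the eight stw's start at the unaligned ptr, so any bytes
                                        //  between the 32-byte boundary and ptr+32 simply get zeroed
                                        //  again by the first dcbz; harmless, since the value is 0)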
        b       bz_dcbz32

// Loop doing 32-byte version of DCBZ instruction.

        .align  4                       // align the inner loop
bz_dcbz32:
        dcbz    0,r5                    // zero another 32 bytes
        addi    r5,r5,32
        bdnz    bz_dcbz32

// Store trailing bytes.  This routine is used both by bzero and memset.
//              r4 = #bytes to store (may be large if memset)
//              r5 = address
//              r6 = value to store (in all 8 bytes)
//     cr6 = pf64Bit etc flags

bzero_tail:
        srwi.   r0,r4,4                 // get #(16-byte-chunks)
        mtcrf   0x01,r4                 // remaining byte count to cr7
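                                        // (cr7 now holds the low 4 bits of the count, i.e. the 8-, 4-,
                                        //  2-, and 1-byte remainder flags; the bf 28/29/30/31 tests
                                        //  below store each power-of-two tail without a loop)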
        beq     bzt3                    // no 16-byte chunks
        mtctr   r0                      // set up loop count
        bt++    pf64Bitb,bzt2           // skip if 64-bit processor
        b       bzt1
        .align  5
bzt1:                                   // loop over 16-byte chunks on 32-bit processor
        stw     r6,0(r5)
        stw     r6,4(r5)
        stw     r6,8(r5)
        stw     r6,12(r5)
        addi    r5,r5,16
        bdnz    bzt1
        b       bzt3
        .align  5
bzt2:                                   // loop over 16-byte chunks on 64-bit processor
        std     r6,0(r5)
        std     r6,8(r5)
        addi    r5,r5,16
        bdnz    bzt2
        bf      28,bzt4                 // 8-byte chunk?
        std     r6,0(r5)
        addi    r5,r5,8
        b       bzt4
bzt3:
        bf      28,bzt4                 // 8-byte chunk?
        stw     r6,0(r5)
        stw     r6,4(r5)
        addi    r5,r5,8
bzt4:
        bf      29,bzt5                 // word?
        stw     r6,0(r5)
        addi    r5,r5,4
bzt5:
        bf      30,bzt6                 // halfword?
        sth     r6,0(r5)
        addi    r5,r5,2
bzt6:
        bflr    31                      // byte?
        stb     r6,0(r5)
        blr

// Operand length is >=128 and cache line size is 128. We assume that
// because the linesize is 128 bytes, this is a 64-bit processor.
//              r4 = length
//              r5 = ptr to operand
//              r6 = 0
//              r7 = neg(r5)
//              r9 = #bytes to 128-byte align

        .align  5
bzero_128:
        sub     r2,r4,r9                // r2 <- length remaining after cache-line aligning
        rlwinm  r0,r7,0,0xF             // r0 <- #bytes to 16-byte align
        srwi.   r8,r2,7                 // r8 <- number of cache lines to 0
        std     r6,0(r5)                // always store 16 bytes to 16-byte align...
        std     r6,8(r5)                // ...even if too short for dcbz128
        add     r5,r5,r0                // 16-byte align ptr
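                                        // (storing 16 bytes unconditionally is safe because length is
                                        //  >=128 here; bytes past the 16-byte boundary are zeroed again
                                        //  by the stores or dcbz128s that follow)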
        sub     r4,r4,r0                // adjust count
        beq     bzero_tail              // r8==0, not long enough to dcbz128
        sub.    r7,r9,r0                // get #bytes remaining to 128-byte align
        rlwinm  r4,r2,0,0x7F            // r4 <- length remaining after dcbz128'ing
        mtctr   r8                      // set up dcbz128 loop
        beq     bz_dcbz128              // already 128-byte aligned
        b       bz_align                // enter loop over 16-byte chunks

// 128-byte align by looping over 16-byte chunks.

        .align  5
bz_align:                               // loop over 16-byte chunks
        subic.  r7,r7,16                // more to go?
        std     r6,0(r5)
        std     r6,8(r5)
        addi    r5,r5,16
        bgt     bz_align

        b       bz_dcbz128              // enter dcbz128 loop

// Loop over 128-byte cache lines.
//              r4 = length remaining after cache lines (0..127)
//              r5 = ptr (128-byte aligned)
//              r6 = 0
//              ctr = count of cache lines to 0

        .align  5
bz_dcbz128:
        dcbz128 0,r5                    // zero a 128-byte cache line
        addi    r5,r5,128
        bdnz    bz_dcbz128

        b       bzero_tail              // handle leftovers


// Handle memset() for nonzero values.  This case is relatively infrequent;
// the large majority of memset() calls are for 0.
//              r3 = ptr
//              r4 = count
//              r6 = value in lower byte (nonzero)

memset1:
        cmplwi  r4,16                   // too short to bother aligning?
        rlwimi  r6,r6,8,16,23           // replicate value to low 2 bytes
        mr      r5,r3                   // make working copy of operand ptr
        rlwimi  r6,r6,16,0,15           // value now in all 4 bytes
        blt     bzero_tail              // length<16, we won't be using "std"
        mfsprg  r10,2                   // get feature flags
        neg     r7,r5                   // start to compute #bytes to align
        rlwinm  r6,r6,0,1,0             // value now in all 8 bytes (if 64-bit)
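                                        // (e.g. c = 0xAB: 0x000000AB -> 0x0000ABAB -> 0xABABABAB; the
                                        //  final rlwinm copies the low word into the high word, giving
                                        //  0xABABABABABABABAB for the doubleword stores)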
        andi.   r0,r7,7                 // r0 <- #bytes to doubleword align
        stw     r6,0(r5)                // store 8 bytes to avoid a loop
        stw     r6,4(r5)
        mtcrf   0x02,r10                // get pf64Bit flag etc in cr6
        sub     r4,r4,r0                // adjust count
        add     r5,r5,r0                // doubleword align ptr
        b       bzero_tail


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.