1 /* $NetBSD: in_cksum_arm.S,v 1.2 2003/09/23 10:01:36 scw Exp $ */
2
3 /*-
4 * Copyright 2003 Wasabi Systems, Inc.
5 * All rights reserved.
6 *
7 * Written by Steve C. Woodford for Wasabi Systems, Inc.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 * 3. All advertising materials mentioning features or use of this software
18 * must display the following acknowledgement:
19 * This product includes software developed for the NetBSD Project by
20 * Wasabi Systems, Inc.
21 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
22 * or promote products derived from this software without specific prior
23 * written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
29 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35 * POSSIBILITY OF SUCH DAMAGE.
36 *
37 */
38
39 /*
40 * Hand-optimised in_cksum() and do_cksum() implementations for ARM/armv5e
41 */
42
43 #include "opt_inet.h"
44
45 #include <machine/asm.h>
46 #include "assym.s"
47 __FBSDID("$FreeBSD$");
48
/*
 * Unified assembler syntax: the conditional load mnemonics used in this
 * file (ldrbge, ldrbgt, ldrdge, ...) put the condition code last, which
 * is the unified-syntax spelling.
 */
49 .syntax unified
50 /*
51 * int in_cksum(struct mbuf *m, int len)
52 *
53 * Entry:
54 * r0 m
55 * r1 len
56 *
57 * NOTE: Assumes 'm' is *never* NULL.
58 */
59 /* LINTSTUB: Func: int in_cksum(struct mbuf *, int) */
ENTRY(in_cksum)
	/*
	 * Register roles while walking the mbuf chain:
	 *	ip	next mbuf to visit
	 *	r8	running 32-bit sum (carries folded back in)
	 *	r9	bytes of 'len' still to be summed
	 *	r10	byte offset reached so far in the data stream
	 *	r11	(offset ^ data address) for the current mbuf; bit 0
	 *		set means the partial sum must be byte-rotated
	 * r4-r7 are saved because L_cksumdata clobbers r0-r7.
	 */
	stmdb	sp!, {r4-r11, lr}
	mov	ip, r0
	mov	r9, r1
	mov	r8, #0
	mov	r10, #0

.Lin_cksum_next_mbuf:
	ldr	r0, [ip, #(M_DATA)]
	ldr	r1, [ip, #(M_LEN)]
	ldr	ip, [ip, #(M_NEXT)]	/* Step on; this mbuf is fully read */

	/* Use no more of this mbuf than is left of 'len' */
	cmp	r1, r9
	movgt	r1, r9
	sub	r9, r9, r1

	eor	r11, r0, r10		/* Parity of offset vs. address */
	add	r10, r10, r1

	/* r2 = partial sum of this span; an empty span contributes 0 */
	movs	r2, r1
	blne	_ASM_LABEL(L_cksumdata)

	/* Swap byte lanes if the span started on the "other" parity */
	tst	r11, #1
	movne	r2, r2, ror #8

	/* Accumulate with end-around carry */
	adds	r8, r8, r2
	adc	r8, r8, #0

	teq	ip, #0			/* End of chain? */
	bne	.Lin_cksum_next_mbuf

	/* Fold the 32-bit sum to 16 bits and return its complement */
	mov	r1, #0xff00
	orr	r1, r1, #0x00ff		/* r1 = 0xffff */
	and	r0, r8, r1
	add	r0, r0, r8, lsr #16
	add	r0, r0, r0, lsr #16
	and	r0, r0, r1
	eor	r0, r0, r1
	ldmia	sp!, {r4-r11, pc}
END(in_cksum)
95
/*
 * do_cksum: sum one contiguous buffer.
 *
 * Entry:
 *	r0	Pointer to buffer
 *	r1	Buffer length in bytes
 *
 * Returns:
 *	r0	32-bit partial sum as produced by L_cksumdata (not folded
 *		to 16 bits, not complemented); 0 when r1 <= 0
 *
 * r4-r7 are saved and restored here because L_cksumdata clobbers r0-r7.
 */
ENTRY(do_cksum)
	stmfd	sp!, {r4-r7, lr}
	mov	r2, #0x00		/* Result if there is nothing to sum */
	cmp	r1, #0x00
	/*
	 * L_cksumdata must not be entered with an empty buffer: for a
	 * zero length it can still reach its byte-wise tail code, which
	 * loads the byte at [r0] unconditionally (e.g. whenever the buffer
	 * is not word aligned).  in_cksum guards its call the same way.
	 */
	blgt	_ASM_LABEL(L_cksumdata)
	mov	r0, r2
	ldmfd	sp!, {r4-r7, pc}
END(do_cksum)
102
103 /*
104 * The main in*_cksum() workhorse...
105 *
106 * Entry parameters:
107 * r0 Pointer to buffer
108 * r1 Buffer length
109 * lr Return address
110 *
111 * Returns:
112 * r2 Accumulated 32-bit sum
113 *
114 * Clobbers:
115 * r0-r7
116 */
117 /* LINTSTUB: Ignore */
118 ASENTRY_NP(L_cksumdata)
/*
 * Sums the r1 bytes at r0 into r2 using add-with-carry, folding the final
 * carry back in before every return.  Bytes that are not part of a whole
 * word are placed in the byte lane selected by the parity of their address.
 *
 * Callers are expected to pass r1 > 0: with r1 == 0 the short-buffer
 * branches below still reach .Lcksumdata_endgame, which loads the byte
 * at [r0] unconditionally.
 *
 * No stack is used; the routine returns through lr via RET/RETeq.
 */
119 #ifdef _ARM_ARCH_5E
120 pld [r0] /* Pre-fetch the start of the buffer */
121 #endif
122 mov r2, #0 /* r2 = running sum */
123
124 /* We first have to word-align the buffer. */
125 ands r7, r0, #0x03 /* r7 = offset of r0 within its word */
126 beq .Lcksumdata_wordaligned
127 rsb r7, r7, #0x04 /* r7 = bytes (1..3) up to the next word boundary */
128 cmp r1, r7 /* Enough bytes left to make it? */
129 blt .Lcksumdata_endgame
/*
 * The flags from this compare stay live down to the orr instructions
 * below (none of ldrb/mov/orr here updates them):
 *	lt -> 1 leading byte, eq -> 2 leading bytes, gt -> 3 leading bytes
 */
130 cmp r7, #0x02
131 ldrb r4, [r0], #0x01 /* Fetch 1st byte */
132 ldrbge r5, [r0], #0x01 /* Fetch 2nd byte */
133 movlt r5, #0x00
134 ldrbgt r6, [r0], #0x01 /* Fetch 3rd byte */
135 movle r6, #0x00
136 /* Combine the three bytes depending on endianness and alignment */
/* eq: buffer started on an even address; ne: on an odd address */
137 #ifdef __ARMEB__
138 orreq r2, r5, r4, lsl #8
139 orreq r2, r2, r6, lsl #24
140 orrne r2, r4, r5, lsl #8
141 orrne r2, r2, r6, lsl #16
142 #else
143 orreq r2, r4, r5, lsl #8
144 orreq r2, r2, r6, lsl #16
145 orrne r2, r5, r4, lsl #8
146 orrne r2, r2, r6, lsl #24
147 #endif
148 subs r1, r1, r7 /* Update length */
149 RETeq /* All done? */
150
151 /* Buffer is now word aligned */
152 .Lcksumdata_wordaligned:
153 #ifdef _ARM_ARCH_5E
154 cmp r1, #0x04 /* Less than 4 bytes left? */
155 blt .Lcksumdata_endgame /* Yup */
156
157 /* Now quad-align, if necessary */
158 ands r7, r0, #0x04 /* r7 = 0 if r0 is already 8-byte aligned */
159 ldrne r7, [r0], #0x04 /* else r7 = the odd word; it is summed later */
160 subne r1, r1, #0x04
161 subs r1, r1, #0x40 /* r1 is biased by -64 while in the big loop */
162 blt .Lcksumdata_bigloop_end /* Note: C flag clear if branch taken */
163
164 /*
165 * Buffer is now quad aligned. Sum 64 bytes at a time.
166 * Note: First ldrd is hoisted above the loop, together with
167 * setting r6 to zero to avoid stalling for results in the
168 * loop. (r7 is live, from above).
169 */
170 ldrd r4, [r0], #0x08
171 mov r6, #0x00
172 .Lcksumdata_bigloop:
173 pld [r0, #0x18]
/* r6/r7 here are left over from before the loop or the previous pass */
174 adds r2, r2, r6
175 adcs r2, r2, r7
176 ldrd r6, [r0], #0x08
177 adcs r2, r2, r4
178 adcs r2, r2, r5
179 ldrd r4, [r0], #0x08
180 adcs r2, r2, r6
181 adcs r2, r2, r7
182 ldrd r6, [r0], #0x08
183 adcs r2, r2, r4
184 adcs r2, r2, r5
185 ldrd r4, [r0], #0x08
186 adcs r2, r2, r6
187 adcs r2, r2, r7
188 pld [r0, #0x18]
189 ldrd r6, [r0], #0x08
190 adcs r2, r2, r4
191 adcs r2, r2, r5
192 ldrd r4, [r0], #0x08
193 adcs r2, r2, r6
194 adcs r2, r2, r7
195 ldrd r6, [r0], #0x08 /* Left in r6/r7 for the next pass or the exit */
196 adcs r2, r2, r4
197 adcs r2, r2, r5
198 adc r2, r2, #0x00 /* Fold the last carry before subs rewrites C */
199 subs r1, r1, #0x40
200 ldrdge r4, [r0], #0x08 /* Another 64 bytes: preload first pair */
201 bge .Lcksumdata_bigloop
202
203 adds r2, r2, r6 /* r6/r7 still need summing */
204 .Lcksumdata_bigloop_end:
205 adcs r2, r2, r7
206 adc r2, r2, #0x00
207
208 #else /* !_ARM_ARCH_5E */
209
210 subs r1, r1, #0x40 /* r1 is biased by -64 while in the big loop */
211 blt .Lcksumdata_bigloop_end
212
213 .Lcksumdata_bigloop:
/* 64 bytes per pass: four 4-word loads feeding one unbroken carry chain */
214 ldmia r0!, {r3, r4, r5, r6}
215 adds r2, r2, r3
216 adcs r2, r2, r4
217 adcs r2, r2, r5
218 ldmia r0!, {r3, r4, r5, r7}
219 adcs r2, r2, r6
220 adcs r2, r2, r3
221 adcs r2, r2, r4
222 adcs r2, r2, r5
223 ldmia r0!, {r3, r4, r5, r6}
224 adcs r2, r2, r7
225 adcs r2, r2, r3
226 adcs r2, r2, r4
227 adcs r2, r2, r5
228 ldmia r0!, {r3, r4, r5, r7}
229 adcs r2, r2, r6
230 adcs r2, r2, r3
231 adcs r2, r2, r4
232 adcs r2, r2, r5
233 adcs r2, r2, r7
234 adc r2, r2, #0x00 /* Fold the last carry before subs rewrites C */
235 subs r1, r1, #0x40
236 bge .Lcksumdata_bigloop
237 .Lcksumdata_bigloop_end:
238 #endif
239
240 adds r1, r1, #0x40 /* Undo the -64 bias: r1 = bytes left */
241 RETeq
242 cmp r1, #0x20 /* At least 32 bytes left? */
243
244 #ifdef _ARM_ARCH_5E
245 ldrdge r4, [r0], #0x08 /* Avoid stalling pld and result */
246 blt .Lcksumdata_less_than_32
247 pld [r0, #0x18]
248 ldrd r6, [r0], #0x08
249 adds r2, r2, r4
250 adcs r2, r2, r5
251 ldrd r4, [r0], #0x08
252 adcs r2, r2, r6
253 adcs r2, r2, r7
254 ldrd r6, [r0], #0x08
255 adcs r2, r2, r4
256 adcs r2, r2, r5
257 adcs r2, r2, r6 /* XXX: Unavoidable result stall */
258 adcs r2, r2, r7
259 #else
260 blt .Lcksumdata_less_than_32
261 ldmia r0!, {r3, r4, r5, r6}
262 adds r2, r2, r3
263 adcs r2, r2, r4
264 adcs r2, r2, r5
265 ldmia r0!, {r3, r4, r5, r7}
266 adcs r2, r2, r6
267 adcs r2, r2, r3
268 adcs r2, r2, r4
269 adcs r2, r2, r5
270 adcs r2, r2, r7
271 #endif
272 adc r2, r2, #0x00 /* Fold the carry of the 32-byte block */
273 subs r1, r1, #0x20
274 RETeq
275
276 .Lcksumdata_less_than_32:
277 /* There are less than 32 bytes left */
/*
 * Computed jump into the three 8-byte groups below.  Each group is three
 * instructions (12 bytes of code) and sums 8 bytes of data, so the code
 * offset to skip is 1.5 times the number of data bytes NOT present.
 * Reading pc as an operand yields the address of the first ldmia (this
 * instruction + 8); when nothing is skipped (r4 == 0) execution simply
 * falls through the nop.
 */
278 and r3, r1, #0x18 /* r3 = bytes held in whole 8-byte groups */
279 rsb r4, r3, #0x18 /* r4 = data bytes of the groups to skip */
280 sub r1, r1, r3 /* r1 = 0..7 bytes left after the groups */
281 adds r4, r4, r4, lsr #1 /* Side effect: Clear carry flag */
282 addne pc, pc, r4
283 nop
284
285 /*
286 * Note: We use ldm here, even on armv5e, since the combined issue/result
287 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
288 */
289 /* At least 24 bytes remaining... */
290 ldmia r0!, {r4, r5}
291 adcs r2, r2, r4
292 adcs r2, r2, r5
293
294 /* At least 16 bytes remaining... */
295 ldmia r0!, {r4, r5}
296 adcs r2, r2, r4
297 adcs r2, r2, r5
298
299 /* At least 8 bytes remaining... */
300 ldmia r0!, {r4, r5}
301 adcs r2, r2, r4
302 adcs r2, r2, r5
303
304 /* Less than 8 bytes remaining... */
305 adc r2, r2, #0x00 /* Fold the carry left by the groups above */
306 subs r1, r1, #0x04 /* A whole word left? */
307 blt .Lcksumdata_lessthan4
308
309 ldr r4, [r0], #0x04
310 sub r1, r1, #0x04 /* Keep the -4 bias that the adds below removes */
311 adds r2, r2, r4
312 adc r2, r2, #0x00
313
314 /* Deal with < 4 bytes remaining */
315 .Lcksumdata_lessthan4:
316 adds r1, r1, #0x04 /* r1 = 0..3 trailing bytes */
317 RETeq
318
319 /* Deal with 1 to 3 remaining bytes, possibly misaligned */
320 .Lcksumdata_endgame:
321 ldrb r3, [r0] /* Fetch first byte */
322 cmp r1, #0x02 /* lt: 1 byte, eq: 2 bytes, gt: 3 bytes */
323 ldrbge r4, [r0, #0x01] /* Fetch 2nd and 3rd as necessary */
324 movlt r4, #0x00
325 ldrbgt r5, [r0, #0x02]
326 movle r5, #0x00
327 /* Combine the three bytes depending on endianness and alignment */
328 tst r0, #0x01 /* eq: first byte at an even address, ne: odd */
329 #ifdef __ARMEB__
330 orreq r3, r4, r3, lsl #8
331 orreq r3, r3, r5, lsl #24
332 orrne r3, r3, r4, lsl #8
333 orrne r3, r3, r5, lsl #16
334 #else
335 orreq r3, r3, r4, lsl #8
336 orreq r3, r3, r5, lsl #16
337 orrne r3, r4, r3, lsl #8
338 orrne r3, r3, r5, lsl #24
339 #endif
340 adds r2, r2, r3
341 adc r2, r2, #0x00
342 RET
343 END(L_cksumdata)
344
Cache object: 6a6a3d32a03aa53ed084c6c96a6cdfbf
|