1 /* $NetBSD: cpu_in_cksum.c,v 1.2 2018/08/28 07:28:01 rin Exp $ */
2 /*-
3 * Copyright (c) 2008 Joerg Sonnenberger <joerg@NetBSD.org>.
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in
14 * the documentation and/or other materials provided with the
15 * distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
25 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
27 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 #include <sys/cdefs.h>
32 __KERNEL_RCSID(0, "$NetBSD: cpu_in_cksum.c,v 1.2 2018/08/28 07:28:01 rin Exp $");
33
34 #include <sys/param.h>
35 #include <sys/endian.h>
36 #include <sys/mbuf.h>
37 #ifdef _KERNEL
38 #include <sys/systm.h>
39 #else
40 #include <assert.h>
41 #include <stdbool.h>
42 #include <stdio.h>
43
44 #define KASSERT(x) assert(x)
45 #endif
46
47 #include <machine/limits.h>
48
49 #include <netinet/in.h>
50
51 #ifndef _KERNEL
52 int cpu_in_cksum(struct mbuf*, int, int, uint32_t);
53 #endif
54
55 /*
56 * Checksum routine for Internet Protocol family headers (Portable Version).
57 *
58 * This routine is very heavily used in the network
59 * code and should be modified for each CPU to be as fast as possible.
60 *
61 * A discussion of different implementation techniques can be found in
62 * RFC 1071.
63 *
64 * The default implementation for 32bit architectures is using
65 * a 32bit accumulator and operating on 16bit operands.
66 *
67 * The default implementation for 64bit architectures is using
68 * a 64bit accumulator and operating on 32bit operands.
69 *
70 * Both versions are unrolled to handle 32 Byte / 64 Byte fragments as core
71 * of the inner loop. After each iteration of the inner loop, a partial
72 * reduction is done to avoid carry in long packets.
73 */
74
75 #if ULONG_MAX == 0xffffffffUL
76 /* 32bit version */
/*
 * 32-bit flavour: sum 16-bit words into a 32-bit accumulator.
 *
 * Compute the Internet checksum (RFC 1071) over 'len' bytes of the
 * mbuf chain 'm', starting 'off' bytes into the chain, folding in
 * 'initial_sum' (typically a precomputed pseudo-header sum).
 *
 * Returns the 16-bit one's complement checksum, or -1 if the chain
 * runs out of data before 'off' + 'len' bytes have been consumed.
 */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint32_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
	/* Pre-fold the initial sum so it cannot immediately carry. */
	sum = (initial_sum >> 16) + (initial_sum & 0xffff);

	/* Skip the first 'off' bytes of the chain. */
	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			/* Offset lands inside this mbuf: start summing here. */
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	/* Sum each mbuf until 'len' bytes have been processed. */
	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
 post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/*
			 * Align on word boundary.  The lone byte is placed
			 * in the endian-appropriate half of a 16-bit word.
			 */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		/*
		 * If this fragment starts at an odd offset within the
		 * packet, its 16-bit words are shifted by one byte with
		 * respect to the packet, so the fragment's partial sum
		 * must be byte-swapped before folding it into 'sum'.
		 */
		needs_swap = started_on_odd;
		/* Unrolled core loop: 16 words (32 bytes) per iteration. */
		while (mlen >= 32) {
			__builtin_prefetch(data + 32);
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			partial += *(uint16_t *)(data + 16);
			partial += *(uint16_t *)(data + 18);
			partial += *(uint16_t *)(data + 20);
			partial += *(uint16_t *)(data + 22);
			partial += *(uint16_t *)(data + 24);
			partial += *(uint16_t *)(data + 26);
			partial += *(uint16_t *)(data + 28);
			partial += *(uint16_t *)(data + 30);
			data += 32;
			mlen -= 32;
			/*
			 * Once either of the two top bits is set, reduce
			 * 'partial' into 'sum' so that neither further
			 * additions nor the byte swap (partial << 8)
			 * can overflow the 32-bit accumulator.
			 */
			if (__predict_false(partial & 0xc0000000)) {
				if (needs_swap)
					partial = (partial << 8) + (partial >> 24);
				sum += (partial >> 16);
				sum += (partial & 0xffff);
				partial = 0;
			}
		}
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
		 */
		if (mlen & 16) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			partial += *(uint16_t *)(data + 8);
			partial += *(uint16_t *)(data + 10);
			partial += *(uint16_t *)(data + 12);
			partial += *(uint16_t *)(data + 14);
			data += 16;
		}
		if (mlen & 8) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			partial += *(uint16_t *)(data + 4);
			partial += *(uint16_t *)(data + 6);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint16_t *)data;
			partial += *(uint16_t *)(data + 2);
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
		if (mlen & 1) {
			/* Trailing odd byte: flips alignment for the next fragment. */
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 24);
		sum += (partial >> 16) + (partial & 0xffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 16) + (sum & 0xffff);
	}
	/* Fold down to 16 bits and return the one's complement. */
	final_acc = ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}
218
219 #else
220 /* 64bit version */
/*
 * 64-bit flavour: sum 32-bit words into a 64-bit accumulator.
 *
 * Compute the Internet checksum (RFC 1071) over 'len' bytes of the
 * mbuf chain 'm', starting 'off' bytes into the chain, folding in
 * 'initial_sum' (typically a precomputed pseudo-header sum).
 *
 * Returns the 16-bit one's complement checksum, or -1 if the chain
 * runs out of data before 'off' + 'len' bytes have been consumed.
 */
int
cpu_in_cksum(struct mbuf *m, int len, int off, uint32_t initial_sum)
{
	int mlen;
	uint64_t sum, partial;
	unsigned int final_acc;
	uint8_t *data;
	bool needs_swap, started_on_odd;

	KASSERT(len >= 0);
	KASSERT(off >= 0);

	needs_swap = false;
	started_on_odd = false;
	/* 64-bit accumulator: no pre-fold of the initial sum needed. */
	sum = initial_sum;

	/* Skip the first 'off' bytes of the chain. */
	for (;;) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		if (mlen > off) {
			/* Offset lands inside this mbuf: start summing here. */
			mlen -= off;
			data = mtod(m, uint8_t *) + off;
			goto post_initial_offset;
		}
		off -= mlen;
		if (len == 0)
			break;
		m = m->m_next;
	}

	/* Sum each mbuf until 'len' bytes have been processed. */
	for (; len > 0; m = m->m_next) {
		if (__predict_false(m == NULL)) {
			printf("in_cksum: out of data\n");
			return -1;
		}
		mlen = m->m_len;
		data = mtod(m, uint8_t *);
 post_initial_offset:
		if (mlen == 0)
			continue;
		if (mlen > len)
			mlen = len;
		len -= mlen;

		partial = 0;
		if ((uintptr_t)data & 1) {
			/*
			 * Align on word boundary.  The lone byte is placed
			 * in the endian-appropriate half of a 16-bit word.
			 */
			started_on_odd = !started_on_odd;
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial = *data << 8;
#else
			partial = *data;
#endif
			++data;
			--mlen;
		}
		/*
		 * If this fragment starts at an odd offset within the
		 * packet, its 16-bit words are shifted by one byte with
		 * respect to the packet, so the fragment's partial sum
		 * must be byte-swapped before folding it into 'sum'.
		 */
		needs_swap = started_on_odd;
		if ((uintptr_t)data & 2) {
			/*
			 * Align to a 4-byte boundary for the 32-bit loads
			 * below.  If fewer than 2 bytes remain, skip the
			 * 16-bit read and go handle the trailing byte.
			 */
			if (mlen < 2)
				goto trailing_bytes;
			partial += *(uint16_t *)data;
			data += 2;
			mlen -= 2;
		}
		/* Unrolled core loop: 16 dwords (64 bytes) per iteration. */
		while (mlen >= 64) {
			__builtin_prefetch(data + 32);
			__builtin_prefetch(data + 64);
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			partial += *(uint32_t *)(data + 32);
			partial += *(uint32_t *)(data + 36);
			partial += *(uint32_t *)(data + 40);
			partial += *(uint32_t *)(data + 44);
			partial += *(uint32_t *)(data + 48);
			partial += *(uint32_t *)(data + 52);
			partial += *(uint32_t *)(data + 56);
			partial += *(uint32_t *)(data + 60);
			data += 64;
			mlen -= 64;
			/*
			 * Once either of the two top bits is set, reduce
			 * 'partial' into 'sum' so that neither further
			 * additions nor the byte swap (partial << 8)
			 * can overflow the 64-bit accumulator.
			 */
			if (__predict_false(partial & (3ULL << 62))) {
				if (needs_swap)
					partial = (partial << 8) + (partial >> 56);
				sum += (partial >> 32);
				sum += (partial & 0xffffffff);
				partial = 0;
			}
		}
		/*
		 * mlen is not updated below as the remaining tests
		 * are using bit masks, which are not affected.
		 */
		if (mlen & 32) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			partial += *(uint32_t *)(data + 16);
			partial += *(uint32_t *)(data + 20);
			partial += *(uint32_t *)(data + 24);
			partial += *(uint32_t *)(data + 28);
			data += 32;
		}
		if (mlen & 16) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			partial += *(uint32_t *)(data + 8);
			partial += *(uint32_t *)(data + 12);
			data += 16;
		}
		if (mlen & 8) {
			partial += *(uint32_t *)data;
			partial += *(uint32_t *)(data + 4);
			data += 8;
		}
		if (mlen & 4) {
			partial += *(uint32_t *)data;
			data += 4;
		}
		if (mlen & 2) {
			partial += *(uint16_t *)data;
			data += 2;
		}
 trailing_bytes:
		if (mlen & 1) {
			/* Trailing odd byte: flips alignment for the next fragment. */
#if _BYTE_ORDER == _LITTLE_ENDIAN
			partial += *data;
#else
			partial += *data << 8;
#endif
			started_on_odd = !started_on_odd;
		}

		if (needs_swap)
			partial = (partial << 8) + (partial >> 56);
		sum += (partial >> 32) + (partial & 0xffffffff);
		/*
		 * Reduce sum to allow potential byte swap
		 * in the next iteration without carry.
		 */
		sum = (sum >> 32) + (sum & 0xffffffff);
	}
	/* Fold the four 16-bit lanes down to 16 bits; two passes absorb carries. */
	final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
	    ((sum >> 16) & 0xffff) + (sum & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	return ~final_acc & 0xffff;
}
377 #endif