1 /*
2 * Copyright (c) 1982, 1986, 1989, 1990, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * sendfile(2) and related extensions:
6 * Copyright (c) 1998, David Greenman. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
37 * $FreeBSD$
38 */
39
40 #include "opt_compat.h"
41 #include "opt_ktrace.h"
42
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/kernel.h>
46 #include <sys/sysproto.h>
47 #include <sys/malloc.h>
48 #include <sys/filedesc.h>
49 #include <sys/proc.h>
50 #include <sys/fcntl.h>
51 #include <sys/file.h>
52 #include <sys/mbuf.h>
53 #include <sys/protosw.h>
54 #include <sys/socket.h>
55 #include <sys/socketvar.h>
56 #include <sys/signalvar.h>
57 #include <sys/uio.h>
58 #include <sys/vnode.h>
59 #include <sys/lock.h>
60 #include <sys/mount.h>
61 #ifdef KTRACE
62 #include <sys/ktrace.h>
63 #endif
64 #include <vm/vm.h>
65 #include <vm/vm_prot.h>
66 #include <vm/vm_object.h>
67 #include <vm/vm_page.h>
68 #include <vm/vm_pager.h>
69 #include <vm/vm_pageout.h>
70 #include <vm/vm_kern.h>
71 #include <vm/vm_extern.h>
72 #include <machine/limits.h>
73
74 static void sf_buf_init(void *arg);
75 SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL)
76 static struct sf_buf *sf_buf_alloc(void);
77 static void sf_buf_ref(caddr_t addr, u_int size);
78 static void sf_buf_free(caddr_t addr, u_int size);
79
80 static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags));
81 static int recvit __P((struct proc *p, int s, struct msghdr *mp,
82 caddr_t namelenp));
83
84 static int accept1 __P((struct proc *p, struct accept_args *uap, int compat));
85 static int getsockname1 __P((struct proc *p, struct getsockname_args *uap,
86 int compat));
87 static int getpeername1 __P((struct proc *p, struct getpeername_args *uap,
88 int compat));
89
90 static SLIST_HEAD(, sf_buf) sf_freelist;
91 static vm_offset_t sf_base;
92 static struct sf_buf *sf_bufs;
93 static int sf_buf_alloc_want;
94
95 /*
96 * System call interface to the socket abstraction.
97 */
98 #if defined(COMPAT_43) || defined(COMPAT_SUNOS)
99 #define COMPAT_OLDSOCK
100 #endif
101
102 extern struct fileops socketops;
103
104 int
105 socket(p, uap)
106 struct proc *p;
107 register struct socket_args /* {
108 int domain;
109 int type;
110 int protocol;
111 } */ *uap;
112 {
113 struct filedesc *fdp = p->p_fd;
114 struct socket *so;
115 struct file *fp;
116 int fd, error;
117
118 error = falloc(p, &fp, &fd);
119 if (error)
120 return (error);
121 fp->f_flag = FREAD|FWRITE;
122 fp->f_type = DTYPE_SOCKET;
123 fp->f_ops = &socketops;
124 error = socreate(uap->domain, &so, uap->type, uap->protocol, p);
125 if (error) {
126 fdp->fd_ofiles[fd] = 0;
127 ffree(fp);
128 } else {
129 fp->f_data = (caddr_t)so;
130 p->p_retval[0] = fd;
131 }
132 return (error);
133 }
134
135 /* ARGSUSED */
136 int
137 bind(p, uap)
138 struct proc *p;
139 register struct bind_args /* {
140 int s;
141 caddr_t name;
142 int namelen;
143 } */ *uap;
144 {
145 struct file *fp;
146 struct sockaddr *sa;
147 int error;
148
149 error = getsock(p->p_fd, uap->s, &fp);
150 if (error)
151 return (error);
152 error = getsockaddr(&sa, uap->name, uap->namelen);
153 if (error)
154 return (error);
155 error = sobind((struct socket *)fp->f_data, sa, p);
156 FREE(sa, M_SONAME);
157 return (error);
158 }
159
160 /* ARGSUSED */
161 int
162 listen(p, uap)
163 struct proc *p;
164 register struct listen_args /* {
165 int s;
166 int backlog;
167 } */ *uap;
168 {
169 struct file *fp;
170 int error;
171
172 error = getsock(p->p_fd, uap->s, &fp);
173 if (error)
174 return (error);
175 return (solisten((struct socket *)fp->f_data, uap->backlog, p));
176 }
177
178 static int
179 accept1(p, uap, compat)
180 struct proc *p;
181 register struct accept_args /* {
182 int s;
183 caddr_t name;
184 int *anamelen;
185 } */ *uap;
186 int compat;
187 {
188 struct file *fp;
189 struct sockaddr *sa;
190 int namelen, error, s;
191 struct socket *head, *so;
192 int fd;
193 short fflag; /* type must match fp->f_flag */
194
195 if (uap->name) {
196 error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen,
197 sizeof (namelen));
198 if(error)
199 return (error);
200 }
201 error = getsock(p->p_fd, uap->s, &fp);
202 if (error)
203 return (error);
204 s = splnet();
205 head = (struct socket *)fp->f_data;
206 if ((head->so_options & SO_ACCEPTCONN) == 0) {
207 splx(s);
208 return (EINVAL);
209 }
210 if ((head->so_state & SS_NBIO) && head->so_comp.tqh_first == NULL) {
211 splx(s);
212 return (EWOULDBLOCK);
213 }
214 while (head->so_comp.tqh_first == NULL && head->so_error == 0) {
215 if (head->so_state & SS_CANTRCVMORE) {
216 head->so_error = ECONNABORTED;
217 break;
218 }
219 error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH,
220 "accept", 0);
221 if (error) {
222 splx(s);
223 return (error);
224 }
225 }
226 if (head->so_error) {
227 error = head->so_error;
228 head->so_error = 0;
229 splx(s);
230 return (error);
231 }
232
233 /*
234 * At this point we know that there is at least one connection
235 * ready to be accepted. Remove it from the queue prior to
236 * allocating the file descriptor for it since falloc() may
237 * block allowing another process to accept the connection
238 * instead.
239 */
240 so = head->so_comp.tqh_first;
241 TAILQ_REMOVE(&head->so_comp, so, so_list);
242 head->so_qlen--;
243
244 fflag = fp->f_flag;
245 error = falloc(p, &fp, &fd);
246 if (error) {
247 /*
248 * Probably ran out of file descriptors. Put the
249 * unaccepted connection back onto the queue and
250 * do another wakeup so some other process might
251 * have a chance at it.
252 */
253 TAILQ_INSERT_HEAD(&head->so_comp, so, so_list);
254 head->so_qlen++;
255 wakeup_one(&head->so_timeo);
256 splx(s);
257 return (error);
258 } else
259 p->p_retval[0] = fd;
260
261 so->so_state &= ~SS_COMP;
262 so->so_head = NULL;
263 if (head->so_sigio != NULL)
264 fsetown(fgetown(head->so_sigio), &so->so_sigio);
265
266 fp->f_type = DTYPE_SOCKET;
267 fp->f_flag = fflag;
268 fp->f_ops = &socketops;
269 fp->f_data = (caddr_t)so;
270 sa = 0;
271 (void) soaccept(so, &sa);
272 if (sa == 0) {
273 namelen = 0;
274 if (uap->name)
275 goto gotnoname;
276 return 0;
277 }
278 if (uap->name) {
279 /* check sa_len before it is destroyed */
280 if (namelen > sa->sa_len)
281 namelen = sa->sa_len;
282 #ifdef COMPAT_OLDSOCK
283 if (compat)
284 ((struct osockaddr *)sa)->sa_family =
285 sa->sa_family;
286 #endif
287 error = copyout(sa, (caddr_t)uap->name, (u_int)namelen);
288 if (!error)
289 gotnoname:
290 error = copyout((caddr_t)&namelen,
291 (caddr_t)uap->anamelen, sizeof (*uap->anamelen));
292 }
293 if (sa)
294 FREE(sa, M_SONAME);
295 splx(s);
296 return (error);
297 }
298
/*
 * accept(2): current-format entry point; calls accept1() with the
 * compat flag cleared.
 */
int
accept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int error;

	error = accept1(p, uap, 0);
	return (error);
}
307
308 #ifdef COMPAT_OLDSOCK
/*
 * Old-socket accept: same as accept() but calls accept1() with the
 * compat flag set.
 */
int
oaccept(p, uap)
	struct proc *p;
	struct accept_args *uap;
{
	int error;

	error = accept1(p, uap, 1);
	return (error);
}
317 #endif /* COMPAT_OLDSOCK */
318
319 /* ARGSUSED */
320 int
321 connect(p, uap)
322 struct proc *p;
323 register struct connect_args /* {
324 int s;
325 caddr_t name;
326 int namelen;
327 } */ *uap;
328 {
329 struct file *fp;
330 register struct socket *so;
331 struct sockaddr *sa;
332 int error, s;
333
334 error = getsock(p->p_fd, uap->s, &fp);
335 if (error)
336 return (error);
337 so = (struct socket *)fp->f_data;
338 if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING))
339 return (EALREADY);
340 error = getsockaddr(&sa, uap->name, uap->namelen);
341 if (error)
342 return (error);
343 error = soconnect(so, sa, p);
344 if (error)
345 goto bad;
346 if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
347 FREE(sa, M_SONAME);
348 return (EINPROGRESS);
349 }
350 s = splnet();
351 while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
352 error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH,
353 "connec", 0);
354 if (error)
355 break;
356 }
357 if (error == 0) {
358 error = so->so_error;
359 so->so_error = 0;
360 }
361 splx(s);
362 bad:
363 so->so_state &= ~SS_ISCONNECTING;
364 FREE(sa, M_SONAME);
365 if (error == ERESTART)
366 error = EINTR;
367 return (error);
368 }
369
370 int
371 socketpair(p, uap)
372 struct proc *p;
373 register struct socketpair_args /* {
374 int domain;
375 int type;
376 int protocol;
377 int *rsv;
378 } */ *uap;
379 {
380 register struct filedesc *fdp = p->p_fd;
381 struct file *fp1, *fp2;
382 struct socket *so1, *so2;
383 int fd, error, sv[2];
384
385 error = socreate(uap->domain, &so1, uap->type, uap->protocol, p);
386 if (error)
387 return (error);
388 error = socreate(uap->domain, &so2, uap->type, uap->protocol, p);
389 if (error)
390 goto free1;
391 error = falloc(p, &fp1, &fd);
392 if (error)
393 goto free2;
394 sv[0] = fd;
395 fp1->f_flag = FREAD|FWRITE;
396 fp1->f_type = DTYPE_SOCKET;
397 fp1->f_ops = &socketops;
398 fp1->f_data = (caddr_t)so1;
399 error = falloc(p, &fp2, &fd);
400 if (error)
401 goto free3;
402 fp2->f_flag = FREAD|FWRITE;
403 fp2->f_type = DTYPE_SOCKET;
404 fp2->f_ops = &socketops;
405 fp2->f_data = (caddr_t)so2;
406 sv[1] = fd;
407 error = soconnect2(so1, so2);
408 if (error)
409 goto free4;
410 if (uap->type == SOCK_DGRAM) {
411 /*
412 * Datagram socket connection is asymmetric.
413 */
414 error = soconnect2(so2, so1);
415 if (error)
416 goto free4;
417 }
418 error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int));
419 return (error);
420 free4:
421 ffree(fp2);
422 fdp->fd_ofiles[sv[1]] = 0;
423 free3:
424 ffree(fp1);
425 fdp->fd_ofiles[sv[0]] = 0;
426 free2:
427 (void)soclose(so2);
428 free1:
429 (void)soclose(so1);
430 return (error);
431 }
432
433 static int
434 sendit(p, s, mp, flags)
435 register struct proc *p;
436 int s;
437 register struct msghdr *mp;
438 int flags;
439 {
440 struct file *fp;
441 struct uio auio;
442 register struct iovec *iov;
443 register int i;
444 struct mbuf *control;
445 struct sockaddr *to;
446 int len, error;
447 struct socket *so;
448 #ifdef KTRACE
449 struct iovec *ktriov = NULL;
450 struct uio ktruio;
451 #endif
452
453 error = getsock(p->p_fd, s, &fp);
454 if (error)
455 return (error);
456 auio.uio_iov = mp->msg_iov;
457 auio.uio_iovcnt = mp->msg_iovlen;
458 auio.uio_segflg = UIO_USERSPACE;
459 auio.uio_rw = UIO_WRITE;
460 auio.uio_procp = p;
461 auio.uio_offset = 0; /* XXX */
462 auio.uio_resid = 0;
463 iov = mp->msg_iov;
464 for (i = 0; i < mp->msg_iovlen; i++, iov++) {
465 if ((auio.uio_resid += iov->iov_len) < 0)
466 return (EINVAL);
467 }
468 if (mp->msg_name) {
469 error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
470 if (error)
471 return (error);
472 } else
473 to = 0;
474 if (mp->msg_control) {
475 if (mp->msg_controllen < sizeof(struct cmsghdr)
476 #ifdef COMPAT_OLDSOCK
477 && mp->msg_flags != MSG_COMPAT
478 #endif
479 ) {
480 error = EINVAL;
481 goto bad;
482 }
483 error = sockargs(&control, mp->msg_control,
484 mp->msg_controllen, MT_CONTROL);
485 if (error)
486 goto bad;
487 #ifdef COMPAT_OLDSOCK
488 if (mp->msg_flags == MSG_COMPAT) {
489 register struct cmsghdr *cm;
490
491 M_PREPEND(control, sizeof(*cm), M_WAIT);
492 if (control == 0) {
493 error = ENOBUFS;
494 goto bad;
495 } else {
496 cm = mtod(control, struct cmsghdr *);
497 cm->cmsg_len = control->m_len;
498 cm->cmsg_level = SOL_SOCKET;
499 cm->cmsg_type = SCM_RIGHTS;
500 }
501 }
502 #endif
503 } else
504 control = 0;
505 #ifdef KTRACE
506 if (KTRPOINT(p, KTR_GENIO)) {
507 int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
508
509 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
510 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
511 ktruio = auio;
512 }
513 #endif
514 len = auio.uio_resid;
515 so = (struct socket *)fp->f_data;
516 error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control,
517 flags, p);
518 if (error) {
519 if (auio.uio_resid != len && (error == ERESTART ||
520 error == EINTR || error == EWOULDBLOCK))
521 error = 0;
522 if (error == EPIPE)
523 psignal(p, SIGPIPE);
524 }
525 if (error == 0)
526 p->p_retval[0] = len - auio.uio_resid;
527 #ifdef KTRACE
528 if (ktriov != NULL) {
529 if (error == 0) {
530 ktruio.uio_iov = ktriov;
531 ktruio.uio_resid = p->p_retval[0];
532 ktrgenio(p->p_tracep, s, UIO_WRITE, &ktruio, error);
533 }
534 FREE(ktriov, M_TEMP);
535 }
536 #endif
537 bad:
538 if (to)
539 FREE(to, M_SONAME);
540 return (error);
541 }
542
543 int
544 sendto(p, uap)
545 struct proc *p;
546 register struct sendto_args /* {
547 int s;
548 caddr_t buf;
549 size_t len;
550 int flags;
551 caddr_t to;
552 int tolen;
553 } */ *uap;
554 {
555 struct msghdr msg;
556 struct iovec aiov;
557
558 msg.msg_name = uap->to;
559 msg.msg_namelen = uap->tolen;
560 msg.msg_iov = &aiov;
561 msg.msg_iovlen = 1;
562 msg.msg_control = 0;
563 #ifdef COMPAT_OLDSOCK
564 msg.msg_flags = 0;
565 #endif
566 aiov.iov_base = uap->buf;
567 aiov.iov_len = uap->len;
568 return (sendit(p, uap->s, &msg, uap->flags));
569 }
570
571 #ifdef COMPAT_OLDSOCK
572 int
573 osend(p, uap)
574 struct proc *p;
575 register struct osend_args /* {
576 int s;
577 caddr_t buf;
578 int len;
579 int flags;
580 } */ *uap;
581 {
582 struct msghdr msg;
583 struct iovec aiov;
584
585 msg.msg_name = 0;
586 msg.msg_namelen = 0;
587 msg.msg_iov = &aiov;
588 msg.msg_iovlen = 1;
589 aiov.iov_base = uap->buf;
590 aiov.iov_len = uap->len;
591 msg.msg_control = 0;
592 msg.msg_flags = 0;
593 return (sendit(p, uap->s, &msg, uap->flags));
594 }
595
596 int
597 osendmsg(p, uap)
598 struct proc *p;
599 register struct osendmsg_args /* {
600 int s;
601 caddr_t msg;
602 int flags;
603 } */ *uap;
604 {
605 struct msghdr msg;
606 struct iovec aiov[UIO_SMALLIOV], *iov;
607 int error;
608
609 error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr));
610 if (error)
611 return (error);
612 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
613 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
614 return (EMSGSIZE);
615 MALLOC(iov, struct iovec *,
616 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
617 M_WAITOK);
618 } else
619 iov = aiov;
620 error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
621 (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
622 if (error)
623 goto done;
624 msg.msg_flags = MSG_COMPAT;
625 msg.msg_iov = iov;
626 error = sendit(p, uap->s, &msg, uap->flags);
627 done:
628 if (iov != aiov)
629 FREE(iov, M_IOV);
630 return (error);
631 }
632 #endif
633
634 int
635 sendmsg(p, uap)
636 struct proc *p;
637 register struct sendmsg_args /* {
638 int s;
639 caddr_t msg;
640 int flags;
641 } */ *uap;
642 {
643 struct msghdr msg;
644 struct iovec aiov[UIO_SMALLIOV], *iov;
645 int error;
646
647 error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg));
648 if (error)
649 return (error);
650 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
651 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
652 return (EMSGSIZE);
653 MALLOC(iov, struct iovec *,
654 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
655 M_WAITOK);
656 } else
657 iov = aiov;
658 if (msg.msg_iovlen &&
659 (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
660 (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
661 goto done;
662 msg.msg_iov = iov;
663 #ifdef COMPAT_OLDSOCK
664 msg.msg_flags = 0;
665 #endif
666 error = sendit(p, uap->s, &msg, uap->flags);
667 done:
668 if (iov != aiov)
669 FREE(iov, M_IOV);
670 return (error);
671 }
672
673 static int
674 recvit(p, s, mp, namelenp)
675 register struct proc *p;
676 int s;
677 register struct msghdr *mp;
678 caddr_t namelenp;
679 {
680 struct file *fp;
681 struct uio auio;
682 register struct iovec *iov;
683 register int i;
684 int len, error;
685 struct mbuf *m, *control = 0;
686 caddr_t ctlbuf;
687 struct socket *so;
688 struct sockaddr *fromsa = 0;
689 #ifdef KTRACE
690 struct iovec *ktriov = NULL;
691 struct uio ktruio;
692 #endif
693
694 error = getsock(p->p_fd, s, &fp);
695 if (error)
696 return (error);
697 auio.uio_iov = mp->msg_iov;
698 auio.uio_iovcnt = mp->msg_iovlen;
699 auio.uio_segflg = UIO_USERSPACE;
700 auio.uio_rw = UIO_READ;
701 auio.uio_procp = p;
702 auio.uio_offset = 0; /* XXX */
703 auio.uio_resid = 0;
704 iov = mp->msg_iov;
705 for (i = 0; i < mp->msg_iovlen; i++, iov++) {
706 if ((auio.uio_resid += iov->iov_len) < 0)
707 return (EINVAL);
708 }
709 #ifdef KTRACE
710 if (KTRPOINT(p, KTR_GENIO)) {
711 int iovlen = auio.uio_iovcnt * sizeof (struct iovec);
712
713 MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK);
714 bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen);
715 ktruio = auio;
716 }
717 #endif
718 len = auio.uio_resid;
719 so = (struct socket *)fp->f_data;
720 error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
721 (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
722 &mp->msg_flags);
723 if (error) {
724 if (auio.uio_resid != len && (error == ERESTART ||
725 error == EINTR || error == EWOULDBLOCK))
726 error = 0;
727 }
728 #ifdef KTRACE
729 if (ktriov != NULL) {
730 if (error == 0) {
731 ktruio.uio_iov = ktriov;
732 ktruio.uio_resid = len - auio.uio_resid;
733 ktrgenio(p->p_tracep, s, UIO_READ, &ktruio, error);
734 }
735 FREE(ktriov, M_TEMP);
736 }
737 #endif
738 if (error)
739 goto out;
740 p->p_retval[0] = len - auio.uio_resid;
741 if (mp->msg_name) {
742 len = mp->msg_namelen;
743 if (len <= 0 || fromsa == 0)
744 len = 0;
745 else {
746 #ifndef MIN
747 #define MIN(a,b) ((a)>(b)?(b):(a))
748 #endif
749 /* save sa_len before it is destroyed by MSG_COMPAT */
750 len = MIN(len, fromsa->sa_len);
751 #ifdef COMPAT_OLDSOCK
752 if (mp->msg_flags & MSG_COMPAT)
753 ((struct osockaddr *)fromsa)->sa_family =
754 fromsa->sa_family;
755 #endif
756 error = copyout(fromsa,
757 (caddr_t)mp->msg_name, (unsigned)len);
758 if (error)
759 goto out;
760 }
761 mp->msg_namelen = len;
762 if (namelenp &&
763 (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) {
764 #ifdef COMPAT_OLDSOCK
765 if (mp->msg_flags & MSG_COMPAT)
766 error = 0; /* old recvfrom didn't check */
767 else
768 #endif
769 goto out;
770 }
771 }
772 if (mp->msg_control) {
773 #ifdef COMPAT_OLDSOCK
774 /*
775 * We assume that old recvmsg calls won't receive access
776 * rights and other control info, esp. as control info
777 * is always optional and those options didn't exist in 4.3.
778 * If we receive rights, trim the cmsghdr; anything else
779 * is tossed.
780 */
781 if (control && mp->msg_flags & MSG_COMPAT) {
782 if (mtod(control, struct cmsghdr *)->cmsg_level !=
783 SOL_SOCKET ||
784 mtod(control, struct cmsghdr *)->cmsg_type !=
785 SCM_RIGHTS) {
786 mp->msg_controllen = 0;
787 goto out;
788 }
789 control->m_len -= sizeof (struct cmsghdr);
790 control->m_data += sizeof (struct cmsghdr);
791 }
792 #endif
793 len = mp->msg_controllen;
794 m = control;
795 mp->msg_controllen = 0;
796 ctlbuf = (caddr_t) mp->msg_control;
797
798 while (m && len > 0) {
799 unsigned int tocopy;
800
801 if (len >= m->m_len)
802 tocopy = m->m_len;
803 else {
804 mp->msg_flags |= MSG_CTRUNC;
805 tocopy = len;
806 }
807
808 if (error = copyout((caddr_t)mtod(m, caddr_t),
809 ctlbuf, tocopy))
810 goto out;
811
812 ctlbuf += tocopy;
813 len -= tocopy;
814 m = m->m_next;
815 }
816 mp->msg_controllen = ctlbuf - mp->msg_control;
817 }
818 out:
819 if (fromsa)
820 FREE(fromsa, M_SONAME);
821 if (control)
822 m_freem(control);
823 return (error);
824 }
825
826 int
827 recvfrom(p, uap)
828 struct proc *p;
829 register struct recvfrom_args /* {
830 int s;
831 caddr_t buf;
832 size_t len;
833 int flags;
834 caddr_t from;
835 int *fromlenaddr;
836 } */ *uap;
837 {
838 struct msghdr msg;
839 struct iovec aiov;
840 int error;
841
842 if (uap->fromlenaddr) {
843 error = copyin((caddr_t)uap->fromlenaddr,
844 (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen));
845 if (error)
846 return (error);
847 } else
848 msg.msg_namelen = 0;
849 msg.msg_name = uap->from;
850 msg.msg_iov = &aiov;
851 msg.msg_iovlen = 1;
852 aiov.iov_base = uap->buf;
853 aiov.iov_len = uap->len;
854 msg.msg_control = 0;
855 msg.msg_flags = uap->flags;
856 return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr));
857 }
858
859 #ifdef COMPAT_OLDSOCK
860 int
861 orecvfrom(p, uap)
862 struct proc *p;
863 struct recvfrom_args *uap;
864 {
865
866 uap->flags |= MSG_COMPAT;
867 return (recvfrom(p, uap));
868 }
869 #endif
870
871
872 #ifdef COMPAT_OLDSOCK
873 int
874 orecv(p, uap)
875 struct proc *p;
876 register struct orecv_args /* {
877 int s;
878 caddr_t buf;
879 int len;
880 int flags;
881 } */ *uap;
882 {
883 struct msghdr msg;
884 struct iovec aiov;
885
886 msg.msg_name = 0;
887 msg.msg_namelen = 0;
888 msg.msg_iov = &aiov;
889 msg.msg_iovlen = 1;
890 aiov.iov_base = uap->buf;
891 aiov.iov_len = uap->len;
892 msg.msg_control = 0;
893 msg.msg_flags = uap->flags;
894 return (recvit(p, uap->s, &msg, (caddr_t)0));
895 }
896
897 /*
898 * Old recvmsg. This code takes advantage of the fact that the old msghdr
899 * overlays the new one, missing only the flags, and with the (old) access
900 * rights where the control fields are now.
901 */
902 int
903 orecvmsg(p, uap)
904 struct proc *p;
905 register struct orecvmsg_args /* {
906 int s;
907 struct omsghdr *msg;
908 int flags;
909 } */ *uap;
910 {
911 struct msghdr msg;
912 struct iovec aiov[UIO_SMALLIOV], *iov;
913 int error;
914
915 error = copyin((caddr_t)uap->msg, (caddr_t)&msg,
916 sizeof (struct omsghdr));
917 if (error)
918 return (error);
919 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
920 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
921 return (EMSGSIZE);
922 MALLOC(iov, struct iovec *,
923 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
924 M_WAITOK);
925 } else
926 iov = aiov;
927 msg.msg_flags = uap->flags | MSG_COMPAT;
928 error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
929 (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
930 if (error)
931 goto done;
932 msg.msg_iov = iov;
933 error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen);
934
935 if (msg.msg_controllen && error == 0)
936 error = copyout((caddr_t)&msg.msg_controllen,
937 (caddr_t)&uap->msg->msg_accrightslen, sizeof (int));
938 done:
939 if (iov != aiov)
940 FREE(iov, M_IOV);
941 return (error);
942 }
943 #endif
944
945 int
946 recvmsg(p, uap)
947 struct proc *p;
948 register struct recvmsg_args /* {
949 int s;
950 struct msghdr *msg;
951 int flags;
952 } */ *uap;
953 {
954 struct msghdr msg;
955 struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
956 register int error;
957
958 error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg));
959 if (error)
960 return (error);
961 if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
962 if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
963 return (EMSGSIZE);
964 MALLOC(iov, struct iovec *,
965 sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
966 M_WAITOK);
967 } else
968 iov = aiov;
969 #ifdef COMPAT_OLDSOCK
970 msg.msg_flags = uap->flags &~ MSG_COMPAT;
971 #else
972 msg.msg_flags = uap->flags;
973 #endif
974 uiov = msg.msg_iov;
975 msg.msg_iov = iov;
976 error = copyin((caddr_t)uiov, (caddr_t)iov,
977 (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
978 if (error)
979 goto done;
980 error = recvit(p, uap->s, &msg, (caddr_t)0);
981 if (!error) {
982 msg.msg_iov = uiov;
983 error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg));
984 }
985 done:
986 if (iov != aiov)
987 FREE(iov, M_IOV);
988 return (error);
989 }
990
991 /* ARGSUSED */
992 int
993 shutdown(p, uap)
994 struct proc *p;
995 register struct shutdown_args /* {
996 int s;
997 int how;
998 } */ *uap;
999 {
1000 struct file *fp;
1001 int error;
1002
1003 error = getsock(p->p_fd, uap->s, &fp);
1004 if (error)
1005 return (error);
1006 return (soshutdown((struct socket *)fp->f_data, uap->how));
1007 }
1008
1009 /* ARGSUSED */
1010 int
1011 setsockopt(p, uap)
1012 struct proc *p;
1013 register struct setsockopt_args /* {
1014 int s;
1015 int level;
1016 int name;
1017 caddr_t val;
1018 int valsize;
1019 } */ *uap;
1020 {
1021 struct file *fp;
1022 struct sockopt sopt;
1023 int error;
1024
1025 if (uap->val == 0 && uap->valsize != 0)
1026 return (EFAULT);
1027 if (uap->valsize < 0)
1028 return (EINVAL);
1029
1030 error = getsock(p->p_fd, uap->s, &fp);
1031 if (error)
1032 return (error);
1033
1034 sopt.sopt_dir = SOPT_SET;
1035 sopt.sopt_level = uap->level;
1036 sopt.sopt_name = uap->name;
1037 sopt.sopt_val = uap->val;
1038 sopt.sopt_valsize = uap->valsize;
1039 sopt.sopt_p = p;
1040
1041 return (sosetopt((struct socket *)fp->f_data, &sopt));
1042 }
1043
1044 /* ARGSUSED */
1045 int
1046 getsockopt(p, uap)
1047 struct proc *p;
1048 register struct getsockopt_args /* {
1049 int s;
1050 int level;
1051 int name;
1052 caddr_t val;
1053 int *avalsize;
1054 } */ *uap;
1055 {
1056 int valsize, error;
1057 struct file *fp;
1058 struct sockopt sopt;
1059
1060 error = getsock(p->p_fd, uap->s, &fp);
1061 if (error)
1062 return (error);
1063 if (uap->val) {
1064 error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize,
1065 sizeof (valsize));
1066 if (error)
1067 return (error);
1068 if (valsize < 0)
1069 return (EINVAL);
1070 } else
1071 valsize = 0;
1072
1073 sopt.sopt_dir = SOPT_GET;
1074 sopt.sopt_level = uap->level;
1075 sopt.sopt_name = uap->name;
1076 sopt.sopt_val = uap->val;
1077 sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */
1078 sopt.sopt_p = p;
1079
1080 error = sogetopt((struct socket *)fp->f_data, &sopt);
1081 if (error == 0) {
1082 valsize = sopt.sopt_valsize;
1083 error = copyout((caddr_t)&valsize,
1084 (caddr_t)uap->avalsize, sizeof (valsize));
1085 }
1086 return (error);
1087 }
1088
1089 /*
1090 * Get socket name.
1091 */
1092 /* ARGSUSED */
1093 static int
1094 getsockname1(p, uap, compat)
1095 struct proc *p;
1096 register struct getsockname_args /* {
1097 int fdes;
1098 caddr_t asa;
1099 int *alen;
1100 } */ *uap;
1101 int compat;
1102 {
1103 struct file *fp;
1104 register struct socket *so;
1105 struct sockaddr *sa;
1106 int len, error;
1107
1108 error = getsock(p->p_fd, uap->fdes, &fp);
1109 if (error)
1110 return (error);
1111 error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1112 if (error)
1113 return (error);
1114 so = (struct socket *)fp->f_data;
1115 sa = 0;
1116 error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
1117 if (error)
1118 goto bad;
1119 if (sa == 0) {
1120 len = 0;
1121 goto gotnothing;
1122 }
1123
1124 len = MIN(len, sa->sa_len);
1125 #ifdef COMPAT_OLDSOCK
1126 if (compat)
1127 ((struct osockaddr *)sa)->sa_family = sa->sa_family;
1128 #endif
1129 error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1130 if (error == 0)
1131 gotnothing:
1132 error = copyout((caddr_t)&len, (caddr_t)uap->alen,
1133 sizeof (len));
1134 bad:
1135 if (sa)
1136 FREE(sa, M_SONAME);
1137 return (error);
1138 }
1139
/*
 * getsockname(2): current-format entry point; calls getsockname1() with
 * the compat flag cleared.
 */
int
getsockname(p, uap)
	struct proc *p;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(p, uap, 0);
	return (error);
}
1148
1149 #ifdef COMPAT_OLDSOCK
/*
 * Old-socket getsockname: calls getsockname1() with the compat flag set.
 */
int
ogetsockname(p, uap)
	struct proc *p;
	struct getsockname_args *uap;
{
	int error;

	error = getsockname1(p, uap, 1);
	return (error);
}
1158 #endif /* COMPAT_OLDSOCK */
1159
1160 /*
1161 * Get name of peer for connected socket.
1162 */
1163 /* ARGSUSED */
1164 static int
1165 getpeername1(p, uap, compat)
1166 struct proc *p;
1167 register struct getpeername_args /* {
1168 int fdes;
1169 caddr_t asa;
1170 int *alen;
1171 } */ *uap;
1172 int compat;
1173 {
1174 struct file *fp;
1175 register struct socket *so;
1176 struct sockaddr *sa;
1177 int len, error;
1178
1179 error = getsock(p->p_fd, uap->fdes, &fp);
1180 if (error)
1181 return (error);
1182 so = (struct socket *)fp->f_data;
1183 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0)
1184 return (ENOTCONN);
1185 error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
1186 if (error)
1187 return (error);
1188 sa = 0;
1189 error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
1190 if (error)
1191 goto bad;
1192 if (sa == 0) {
1193 len = 0;
1194 goto gotnothing;
1195 }
1196 len = MIN(len, sa->sa_len);
1197 #ifdef COMPAT_OLDSOCK
1198 if (compat)
1199 ((struct osockaddr *)sa)->sa_family =
1200 sa->sa_family;
1201 #endif
1202 error = copyout(sa, (caddr_t)uap->asa, (u_int)len);
1203 if (error)
1204 goto bad;
1205 gotnothing:
1206 error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len));
1207 bad:
1208 if (sa) FREE(sa, M_SONAME);
1209 return (error);
1210 }
1211
/*
 * getpeername(2): current-format entry point; calls getpeername1() with
 * the compat flag cleared.
 */
int
getpeername(p, uap)
	struct proc *p;
	struct getpeername_args *uap;
{
	int error;

	error = getpeername1(p, uap, 0);
	return (error);
}
1220
1221 #ifdef COMPAT_OLDSOCK
/*
 * Old-socket getpeername: calls getpeername1() with the compat flag set.
 */
int
ogetpeername(p, uap)
	struct proc *p;
	struct ogetpeername_args *uap;
{
	struct getpeername_args *args;

	/* XXX uap should have type `getpeername_args *' to begin with. */
	args = (struct getpeername_args *)uap;
	return (getpeername1(p, args, 1));
}
1231 #endif /* COMPAT_OLDSOCK */
1232
1233 int
1234 sockargs(mp, buf, buflen, type)
1235 struct mbuf **mp;
1236 caddr_t buf;
1237 int buflen, type;
1238 {
1239 register struct sockaddr *sa;
1240 register struct mbuf *m;
1241 int error;
1242
1243 if ((u_int)buflen > MLEN) {
1244 #ifdef COMPAT_OLDSOCK
1245 if (type == MT_SONAME && (u_int)buflen <= 112)
1246 buflen = MLEN; /* unix domain compat. hack */
1247 else
1248 #endif
1249 return (EINVAL);
1250 }
1251 m = m_get(M_WAIT, type);
1252 if (m == NULL)
1253 return (ENOBUFS);
1254 m->m_len = buflen;
1255 error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
1256 if (error)
1257 (void) m_free(m);
1258 else {
1259 *mp = m;
1260 if (type == MT_SONAME) {
1261 sa = mtod(m, struct sockaddr *);
1262
1263 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1264 if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1265 sa->sa_family = sa->sa_len;
1266 #endif
1267 sa->sa_len = buflen;
1268 }
1269 }
1270 return (error);
1271 }
1272
1273 int
1274 getsockaddr(namp, uaddr, len)
1275 struct sockaddr **namp;
1276 caddr_t uaddr;
1277 size_t len;
1278 {
1279 struct sockaddr *sa;
1280 int error;
1281
1282 if (len > SOCK_MAXADDRLEN)
1283 return ENAMETOOLONG;
1284 MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK);
1285 error = copyin(uaddr, sa, len);
1286 if (error) {
1287 FREE(sa, M_SONAME);
1288 } else {
1289 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
1290 if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
1291 sa->sa_family = sa->sa_len;
1292 #endif
1293 sa->sa_len = len;
1294 *namp = sa;
1295 }
1296 return error;
1297 }
1298
1299 int
1300 getsock(fdp, fdes, fpp)
1301 struct filedesc *fdp;
1302 int fdes;
1303 struct file **fpp;
1304 {
1305 register struct file *fp;
1306
1307 if ((unsigned)fdes >= fdp->fd_nfiles ||
1308 (fp = fdp->fd_ofiles[fdes]) == NULL)
1309 return (EBADF);
1310 if (fp->f_type != DTYPE_SOCKET)
1311 return (ENOTSOCK);
1312 *fpp = fp;
1313 return (0);
1314 }
1315
1316 /*
1317 * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-))
1318 * XXX - The sf_buf functions are currently private to sendfile(2), so have
1319 * been made static, but may be useful in the future for doing zero-copy in
1320 * other parts of the networking code.
1321 */
1322 static void
1323 sf_buf_init(void *arg)
1324 {
1325 int i;
1326
1327 SLIST_INIT(&sf_freelist);
1328 sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
1329 sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT);
1330 bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf));
1331 for (i = 0; i < nsfbufs; i++) {
1332 sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
1333 SLIST_INSERT_HEAD(&sf_freelist, &sf_bufs[i], free_list);
1334 }
1335 }
1336
1337 /*
1338 * Get an sf_buf from the freelist. Will block if none are available.
1339 */
1340 static struct sf_buf *
1341 sf_buf_alloc()
1342 {
1343 struct sf_buf *sf;
1344 int s;
1345
1346 s = splimp();
1347 while ((sf = SLIST_FIRST(&sf_freelist)) == NULL) {
1348 sf_buf_alloc_want = 1;
1349 tsleep(&sf_freelist, PVM, "sfbufa", 0);
1350 }
1351 SLIST_REMOVE_HEAD(&sf_freelist, free_list);
1352 splx(s);
1353 sf->refcnt = 1;
1354 return (sf);
1355 }
1356
1357 #define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])
1358 static void
1359 sf_buf_ref(caddr_t addr, u_int size)
1360 {
1361 struct sf_buf *sf;
1362
1363 sf = dtosf(addr);
1364 if (sf->refcnt == 0)
1365 panic("sf_buf_ref: referencing a free sf_buf");
1366 sf->refcnt++;
1367 }
1368
1369 /*
1370 * Lose a reference to an sf_buf. When none left, detach mapped page
1371 * and release resources back to the system.
1372 *
1373 * Must be called at splimp.
1374 */
1375 static void
1376 sf_buf_free(caddr_t addr, u_int size)
1377 {
1378 struct sf_buf *sf;
1379 struct vm_page *m;
1380 int s;
1381
1382 sf = dtosf(addr);
1383 if (sf->refcnt == 0)
1384 panic("sf_buf_free: freeing free sf_buf");
1385 sf->refcnt--;
1386 if (sf->refcnt == 0) {
1387 pmap_qremove((vm_offset_t)addr, 1);
1388 m = sf->m;
1389 s = splvm();
1390 vm_page_unwire(m, 0);
1391 /*
1392 * Check for the object going away on us. This can
1393 * happen since we don't hold a reference to it.
1394 * If so, we're responsible for freeing the page.
1395 */
1396 if (m->wire_count == 0 && m->object == NULL)
1397 vm_page_free(m);
1398 splx(s);
1399 sf->m = NULL;
1400 SLIST_INSERT_HEAD(&sf_freelist, sf, free_list);
1401 if (sf_buf_alloc_want) {
1402 sf_buf_alloc_want = 0;
1403 wakeup(&sf_freelist);
1404 }
1405 }
1406 }
1407
1408 /*
1409 * sendfile(2).
1410 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
1411 * struct sf_hdtr *hdtr, off_t *sbytes, int flags)
1412 *
1413 * Send a file specified by 'fd' and starting at 'offset' to a socket
1414 * specified by 's'. Send only 'nbytes' of the file or until EOF if
1415 * nbytes == 0. Optionally add a header and/or trailer to the socket
1416 * output. If specified, write the total number of bytes sent into *sbytes.
1417 */
1418 int
1419 sendfile(struct proc *p, struct sendfile_args *uap)
1420 {
1421 struct file *fp;
1422 struct filedesc *fdp = p->p_fd;
1423 struct vnode *vp;
1424 struct vm_object *obj;
1425 struct socket *so;
1426 struct mbuf *m;
1427 struct sf_buf *sf;
1428 struct vm_page *pg;
1429 struct writev_args nuap;
1430 struct sf_hdtr hdtr;
1431 off_t off, xfsize, sbytes = 0;
1432 int error = 0, s;
1433
1434 /*
1435 * Do argument checking. Must be a regular file in, stream
1436 * type and connected socket out, positive offset.
1437 */
1438 if (((u_int)uap->fd) >= fdp->fd_nfiles ||
1439 (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
1440 (fp->f_flag & FREAD) == 0) {
1441 error = EBADF;
1442 goto done;
1443 }
1444 if (fp->f_type != DTYPE_VNODE) {
1445 error = EINVAL;
1446 goto done;
1447 }
1448 vp = (struct vnode *)fp->f_data;
1449 obj = vp->v_object;
1450 if (vp->v_type != VREG || obj == NULL) {
1451 error = EINVAL;
1452 goto done;
1453 }
1454 error = getsock(p->p_fd, uap->s, &fp);
1455 if (error)
1456 goto done;
1457 so = (struct socket *)fp->f_data;
1458 if (so->so_type != SOCK_STREAM) {
1459 error = EINVAL;
1460 goto done;
1461 }
1462 if ((so->so_state & SS_ISCONNECTED) == 0) {
1463 error = ENOTCONN;
1464 goto done;
1465 }
1466 if (uap->offset < 0) {
1467 error = EINVAL;
1468 goto done;
1469 }
1470
1471 /*
1472 * If specified, get the pointer to the sf_hdtr struct for
1473 * any headers/trailers.
1474 */
1475 if (uap->hdtr != NULL) {
1476 error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
1477 if (error)
1478 goto done;
1479 /*
1480 * Send any headers. Wimp out and use writev(2).
1481 */
1482 if (hdtr.headers != NULL) {
1483 nuap.fd = uap->s;
1484 nuap.iovp = hdtr.headers;
1485 nuap.iovcnt = hdtr.hdr_cnt;
1486 error = writev(p, &nuap);
1487 if (error)
1488 goto done;
1489 sbytes += p->p_retval[0];
1490 }
1491 }
1492
1493 /*
1494 * Protect against multiple writers to the socket.
1495 */
1496 (void) sblock(&so->so_snd, M_WAITOK);
1497
1498 /*
1499 * Loop through the pages in the file, starting with the requested
1500 * offset. Get a file page (do I/O if necessary), map the file page
1501 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
1502 * it on the socket.
1503 */
1504 for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
1505 vm_pindex_t pindex;
1506 vm_offset_t pgoff;
1507
1508 pindex = OFF_TO_IDX(off);
1509 retry_lookup:
1510 /*
1511 * Calculate the amount to transfer. Not to exceed a page,
1512 * the EOF, or the passed in nbytes.
1513 */
1514 xfsize = obj->un_pager.vnp.vnp_size - off;
1515 if (xfsize > PAGE_SIZE)
1516 xfsize = PAGE_SIZE;
1517 pgoff = (vm_offset_t)(off & PAGE_MASK);
1518 if (PAGE_SIZE - pgoff < xfsize)
1519 xfsize = PAGE_SIZE - pgoff;
1520 if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
1521 xfsize = uap->nbytes - sbytes;
1522 if (xfsize <= 0)
1523 break;
1524 /*
1525 * Optimize the non-blocking case by looking at the socket space
1526 * before going to the extra work of constituting the sf_buf.
1527 */
1528 if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
1529 if (so->so_state & SS_CANTSENDMORE)
1530 error = EPIPE;
1531 else
1532 error = EAGAIN;
1533 sbunlock(&so->so_snd);
1534 goto done;
1535 }
1536 /*
1537 * Attempt to look up the page. If the page doesn't exist or the
1538 * part we're interested in isn't valid, then read it from disk.
1539 * If some other part of the kernel has this page (i.e. it's busy),
1540 * then disk I/O may be occuring on it, so wait and retry.
1541 */
1542 pg = vm_page_lookup(obj, pindex);
1543 if (pg == NULL || (!(pg->flags & PG_BUSY) && !pg->busy &&
1544 !vm_page_is_valid(pg, pgoff, xfsize))) {
1545 struct uio auio;
1546 struct iovec aiov;
1547 int bsize;
1548
1549 if (pg == NULL) {
1550 pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
1551 if (pg == NULL) {
1552 VM_WAIT;
1553 goto retry_lookup;
1554 }
1555 /*
1556 * don't just clear PG_BUSY manually -
1557 * vm_page_alloc() should be considered opaque,
1558 * use the VM routine provided to clear
1559 * PG_BUSY.
1560 */
1561 vm_page_wakeup(pg);
1562
1563 }
1564 /*
1565 * Ensure that our page is still around when the I/O completes.
1566 */
1567 vm_page_io_start(pg);
1568 vm_page_wire(pg);
1569 /*
1570 * Get the page from backing store.
1571 */
1572 bsize = vp->v_mount->mnt_stat.f_iosize;
1573 auio.uio_iov = &aiov;
1574 auio.uio_iovcnt = 1;
1575 aiov.iov_base = 0;
1576 aiov.iov_len = MAXBSIZE;
1577 auio.uio_resid = MAXBSIZE;
1578 auio.uio_offset = trunc_page(off);
1579 auio.uio_segflg = UIO_NOCOPY;
1580 auio.uio_rw = UIO_READ;
1581 auio.uio_procp = p;
1582 vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
1583 error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16),
1584 p->p_ucred);
1585 VOP_UNLOCK(vp, 0, p);
1586 vm_page_flag_clear(pg, PG_ZERO);
1587 vm_page_io_finish(pg);
1588 if (error) {
1589 vm_page_unwire(pg, 0);
1590 /*
1591 * See if anyone else might know about this page.
1592 * If not and it is not valid, then free it.
1593 */
1594 if (pg->wire_count == 0 && pg->valid == 0 &&
1595 pg->busy == 0 && !(pg->flags & PG_BUSY) &&
1596 pg->hold_count == 0)
1597 vm_page_free(pg);
1598 sbunlock(&so->so_snd);
1599 goto done;
1600 }
1601 } else {
1602 if ((pg->flags & PG_BUSY) || pg->busy) {
1603 s = splvm();
1604 if ((pg->flags & PG_BUSY) || pg->busy) {
1605 /*
1606 * Page is busy. Wait and retry.
1607 */
1608 vm_page_flag_set(pg, PG_WANTED);
1609 tsleep(pg, PVM, "sfpbsy", 0);
1610 splx(s);
1611 goto retry_lookup;
1612 }
1613 splx(s);
1614 }
1615 /*
1616 * Protect from having the page ripped out from beneath us.
1617 */
1618 vm_page_wire(pg);
1619 }
1620 /*
1621 * Allocate a kernel virtual page and insert the physical page
1622 * into it.
1623 */
1624 sf = sf_buf_alloc();
1625 sf->m = pg;
1626 pmap_qenter(sf->kva, &pg, 1);
1627 /*
1628 * Get an mbuf header and set it up as having external storage.
1629 */
1630 MGETHDR(m, M_WAIT, MT_DATA);
1631 if (m == NULL) {
1632 error = ENOBUFS;
1633 goto done;
1634 }
1635 m->m_ext.ext_free = sf_buf_free;
1636 m->m_ext.ext_ref = sf_buf_ref;
1637 m->m_ext.ext_buf = (void *)sf->kva;
1638 m->m_ext.ext_size = PAGE_SIZE;
1639 m->m_data = (char *) sf->kva + pgoff;
1640 m->m_flags |= M_EXT;
1641 m->m_pkthdr.len = m->m_len = xfsize;
1642 /*
1643 * Add the buffer to the socket buffer chain.
1644 */
1645 s = splnet();
1646 retry_space:
1647 /*
1648 * Make sure that the socket is still able to take more data.
1649 * CANTSENDMORE being true usually means that the connection
1650 * was closed. so_error is true when an error was sensed after
1651 * a previous send.
1652 * The state is checked after the page mapping and buffer
1653 * allocation above since those operations may block and make
1654 * any socket checks stale. From this point forward, nothing
1655 * blocks before the pru_send (or more accurately, any blocking
1656 * results in a loop back to here to re-check).
1657 */
1658 if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
1659 if (so->so_state & SS_CANTSENDMORE) {
1660 error = EPIPE;
1661 } else {
1662 error = so->so_error;
1663 so->so_error = 0;
1664 }
1665 m_freem(m);
1666 sbunlock(&so->so_snd);
1667 splx(s);
1668 goto done;
1669 }
1670 /*
1671 * Wait for socket space to become available. We do this just
1672 * after checking the connection state above in order to avoid
1673 * a race condition with sbwait().
1674 */
1675 if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
1676 if (so->so_state & SS_NBIO) {
1677 m_freem(m);
1678 sbunlock(&so->so_snd);
1679 splx(s);
1680 error = EAGAIN;
1681 goto done;
1682 }
1683 error = sbwait(&so->so_snd);
1684 /*
1685 * An error from sbwait usually indicates that we've
1686 * been interrupted by a signal. If we've sent anything
1687 * then return bytes sent, otherwise return the error.
1688 */
1689 if (error) {
1690 m_freem(m);
1691 sbunlock(&so->so_snd);
1692 splx(s);
1693 goto done;
1694 }
1695 goto retry_space;
1696 }
1697 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p);
1698 splx(s);
1699 if (error) {
1700 sbunlock(&so->so_snd);
1701 goto done;
1702 }
1703 }
1704 sbunlock(&so->so_snd);
1705
1706 /*
1707 * Send trailers. Wimp out and use writev(2).
1708 */
1709 if (uap->hdtr != NULL && hdtr.trailers != NULL) {
1710 nuap.fd = uap->s;
1711 nuap.iovp = hdtr.trailers;
1712 nuap.iovcnt = hdtr.trl_cnt;
1713 error = writev(p, &nuap);
1714 if (error)
1715 goto done;
1716 sbytes += p->p_retval[0];
1717 }
1718
1719 done:
1720 if (uap->sbytes != NULL) {
1721 copyout(&sbytes, uap->sbytes, sizeof(off_t));
1722 }
1723 return (error);
1724 }
Cache object: e2b4b0f328341b8f38a3e0eaf21b7f9b
|