/* * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
 */

/*
 * NOTE(review): apart from "opt_zero.h", every #include directive below has
 * no header name in this copy of the file, so it cannot be preprocessed
 * as-is.  The names must be restored from the upstream revision; they are
 * deliberately not guessed here.
 */
#include
__FBSDID("$FreeBSD$");

#include "opt_zero.h"

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#ifdef ZERO_COPY_SOCKETS
#include
#include
#endif

/*
 * Register a read-only (CTLFLAG_RD) integer sysctl named iov_max under the
 * _kern node.  No variable is attached (the pointer argument is NULL); the
 * value argument is the constant UIO_MAXIOV.
 */
SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, NULL, UIO_MAXIOV,
	"Maximum number of elements in an I/O vector; sysconf(_SC_IOV_MAX)");

#ifdef ZERO_COPY_SOCKETS
/* Declared in uipc_socket.c */
extern int so_zero_copy_receive;

/*
 * vm_pgmoveco: hand the kernel page backing kaddr over to the object that
 * backs the user address uaddr in map mapa.
 *
 * Steps, in order:
 *  1. Find the vm_page for kaddr via PHYS_TO_VM_PAGE(vtophys(kaddr)).
 *  2. Look up uaddr in the map, asking for VM_PROT_WRITE; this yields the
 *     backing object (uobject) and page index (upindex).
 *  3. If uobject already holds a page at upindex, wait until it is not
 *     busy, then busy it, remove all of its mappings and free it.
 *  4. Panic if the kernel page is busy, held, flagged PG_BUSY or on a free
 *     queue; otherwise rename it into uobject at upindex, clear PG_BUSY and
 *     mark it fully valid.
 *
 * Parameters:
 *  mapa   - map in which uaddr is looked up.
 *  srcobj - not referenced anywhere in this function.
 *  kaddr  - kernel virtual address of the page to give away.
 *  uaddr  - user virtual address that should receive the page.
 *
 * Returns EFAULT when vm_map_lookup() fails, KERN_SUCCESS otherwise.
 * NOTE(review): the two return values come from different error namespaces
 * (errno vs. KERN_*); the visible caller only tests the result against 0.
 */
static int
vm_pgmoveco(vm_map_t mapa, vm_object_t srcobj, vm_offset_t kaddr,
	vm_offset_t uaddr)
{
	vm_map_t map = mapa;
	vm_page_t kern_pg, user_pg;
	vm_object_t uobject;
	vm_map_entry_t entry;
	vm_pindex_t upindex, kpindex;
	vm_prot_t prot;
	boolean_t wired;

	/*
	 * First lookup the kernel page.
	 */
	kern_pg = PHYS_TO_VM_PAGE(vtophys(kaddr));
	/*
	 * XXX The vm object containing kern_pg needs locking.
	 */
	if ((vm_map_lookup(&map, uaddr, VM_PROT_WRITE, &entry, &uobject,
	    &upindex, &prot, &wired)) != KERN_SUCCESS) {
		return(EFAULT);
	}
	VM_OBJECT_LOCK(uobject);
	if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
		/*
		 * The page queues lock is (re)taken on every iteration.
		 * NOTE(review): presumably vm_page_sleep_if_busy() drops that
		 * lock whenever it sleeps and returns non-zero -- confirm.
		 */
		do
			vm_page_lock_queues();
		while (vm_page_sleep_if_busy(user_pg, 1, "vm_pgmoveco"));
		/* Discard the page currently occupying the user slot. */
		vm_page_busy(user_pg);
		pmap_remove_all(user_pg);
		vm_page_free(user_pg);
	} else
		vm_page_lock_queues();
	/*
	 * Both branches above leave the page queues locked.  Refuse to move
	 * a kernel page that is busy, held or already free: dump its state
	 * and panic.
	 */
	if (kern_pg->busy || ((kern_pg->queue - kern_pg->pc) == PQ_FREE) ||
	    (kern_pg->hold_count != 0)|| (kern_pg->flags & PG_BUSY)) {
		printf("vm_pgmoveco: pindex(%lu), busy(%d), PG_BUSY(%d), "
		    "hold(%d) paddr(0x%lx)\n", (u_long)kern_pg->pindex,
		    kern_pg->busy, (kern_pg->flags & PG_BUSY) ? 1 : 0,
		    kern_pg->hold_count, (u_long)kern_pg->phys_addr);
		if ((kern_pg->queue - kern_pg->pc) == PQ_FREE)
			panic("vm_pgmoveco: renaming free page");
		else
			panic("vm_pgmoveco: renaming busy page");
	}
	/* kpindex is assigned here but never read afterwards. */
	kpindex = kern_pg->pindex;
	/* Move the kernel page into the user object at the looked-up index. */
	vm_page_busy(kern_pg);
	vm_page_rename(kern_pg, uobject, upindex);
	vm_page_flag_clear(kern_pg, PG_BUSY);
	kern_pg->valid = VM_PAGE_BITS_ALL;
	/* Release everything in the reverse order of acquisition. */
	vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(uobject);
	vm_map_lookup_done(map, entry);
	return(KERN_SUCCESS);
}
#endif /* ZERO_COPY_SOCKETS */

/*
 * uiomove: move up to n bytes between the buffer cp and the I/O vectors
 * described by uio.
 *
 * Direction is chosen by uio->uio_rw: UIO_READ copies from cp into the
 * iovec, UIO_WRITE copies from the iovec into cp.  The copy primitive is
 * chosen by uio->uio_segflg: copyout()/copyin() for UIO_USERSPACE, bcopy()
 * for UIO_SYSSPACE, and nothing at all for UIO_NOCOPY (the bookkeeping below
 * still advances).
 *
 * After each chunk the current iovec's base/length, uio_resid, uio_offset,
 * cp and n are all advanced by the chunk size.  Empty iovecs are skipped.
 *
 * While copying, TDF_DEADLKTREAT is set on the current thread; on exit it is
 * cleared again only if it was not already set on entry.
 *
 * Returns 0, or the error reported by copyin()/copyout().  On such an error
 * the loop stops immediately and the failed chunk is not accounted for.
 */
int
uiomove(void *cp, int n, struct uio *uio)
{
	struct thread *td = curthread;
	struct iovec *iov;
	u_int cnt;
	int error = 0;
	int save = 0;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomove: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomove proc"));

	if (td) {
		/* Remember the previous flag state so it can be restored. */
		mtx_lock_spin(&sched_lock);
		save = td->td_flags & TDF_DEADLKTREAT;
		td->td_flags |= TDF_DEADLKTREAT;
		mtx_unlock_spin(&sched_lock);
	}

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			/* Current iovec is exhausted; step to the next one. */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		/* n is known to be positive here (loop condition). */
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			/*
			 * Yield once at least hogticks ticks have passed
			 * since this CPU's recorded switchticks value.
			 */
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			if (uio->uio_rw == UIO_READ)
				error = copyout(cp, iov->iov_base, cnt);
			else
				error = copyin(iov->iov_base, cp, cnt);
			if (error)
				goto out;
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
out:
	/* Only clear the flag if this call was the one that set it. */
	if (td && save == 0) {
		mtx_lock_spin(&sched_lock);
		td->td_flags &= ~TDF_DEADLKTREAT;
		mtx_unlock_spin(&sched_lock);
	}
	return (error);
}

/*
 * Wrapper for uiomove() that validates the arguments against a known-good
 * kernel buffer.  Currently, uiomove accepts a signed (n) argument, which
 * is almost definitely a bad thing, so we catch that here as well.  We
 * return a runtime failure, but it might be desirable to generate a runtime
 * assertion failure instead.
 *
 * The transfer starts uio->uio_offset bytes into buf and covers at most the
 * remaining buflen - offset bytes.
 *
 * Returns:
 *  EINVAL - uio_offset or uio_resid is negative, uio_offset does not fit in
 *           an unsigned int, or the remaining length exceeds INT_MAX.
 *  0      - buflen is not positive or the offset is at/after the end of the
 *           buffer (nothing is transferred).
 *  otherwise whatever uiomove() returns.
 */
int
uiomove_frombuf(void *buf, int buflen, struct uio *uio)
{
	unsigned int offset, n;

	/* The assignment-and-compare detects truncation of uio_offset. */
	if (uio->uio_offset < 0 || uio->uio_resid < 0 ||
	    (offset = uio->uio_offset) != uio->uio_offset)
		return (EINVAL);
	if (buflen <= 0 || offset >= buflen)
		return (0);
	if ((n = buflen - offset) > INT_MAX)
		return (EINVAL);
	return (uiomove((char *)buf + offset, n, uio));
}

#ifdef ZERO_COPY_SOCKETS
/*
 * Experimental support for zero-copy I/O
 *
 * userspaceco: transfer cnt bytes between cp and the current iovec of uio
 * when that iovec refers to user space.
 *
 * For UIO_WRITE this is a plain copyin().  For UIO_READ it is a plain
 * copyout() unless every one of the following holds, in which case the
 * kernel page is handed to the process with vm_pgmoveco() instead:
 *  - so_zero_copy_receive is non-zero,
 *  - obj is non-NULL and of type OBJT_DEFAULT,
 *  - disposable is non-zero,
 *  - cnt, cp, the iovec base and uio_offset all have no bits set in
 *    PAGE_MASK.
 * If vm_pgmoveco() fails the data is copied with copyout() after all.
 *
 * Returns 0 or the error from copyin()/copyout().  This function does not
 * advance the iovec or the uio counters.
 */
static int
userspaceco(void *cp, u_int cnt, struct uio *uio, struct vm_object *obj,
	int disposable)
{
	struct iovec *iov;
	int error;

	iov = uio->uio_iov;
	if (uio->uio_rw == UIO_READ) {
		if ((so_zero_copy_receive != 0) && (obj != NULL) &&
		    ((cnt & PAGE_MASK) == 0) &&
		    ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) &&
		    ((uio->uio_offset & PAGE_MASK) == 0) &&
		    ((((intptr_t) cp) & PAGE_MASK) == 0) &&
		    (obj->type == OBJT_DEFAULT) && (disposable != 0)) {
			/* SOCKET: use page-trading */
			/*
			 * We only want to call vm_pgmoveco() on
			 * disposable pages, since it gives the
			 * kernel page to the userland process.
			 */
			error = vm_pgmoveco(&curproc->p_vmspace->vm_map, obj,
			    (vm_offset_t)cp, (vm_offset_t)iov->iov_base);
			/*
			 * If we get an error back, attempt
			 * to use copyout() instead.  The
			 * disposable page should be freed
			 * automatically if we weren't able to move
			 * it into userland.
			 */
			if (error != 0)
				error = copyout(cp, iov->iov_base, cnt);
		} else {
			error = copyout(cp, iov->iov_base, cnt);
		}
	} else {
		error = copyin(iov->iov_base, cp, cnt);
	}
	return (error);
}

/*
 * uiomoveco: variant of uiomove() whose user-space transfers go through
 * userspaceco(), so that page-aligned reads of disposable pages may be done
 * by page trading instead of copying.
 *
 * obj and disposable are passed straight through to userspaceco().  The
 * UIO_SYSSPACE and UIO_NOCOPY cases, the skipping of empty iovecs and the
 * per-chunk bookkeeping are the same as in uiomove().  Unlike uiomove(),
 * this function does not touch TDF_DEADLKTREAT.
 *
 * Returns 0, or the first error reported by userspaceco(); in the latter
 * case the failed chunk is not accounted for.
 */
int
uiomoveco(void *cp, int n, struct uio *uio, struct vm_object *obj,
	int disposable)
{
	struct iovec *iov;
	u_int cnt;
	int error;

	KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE,
	    ("uiomoveco: mode"));
	KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_td == curthread,
	    ("uiomoveco proc"));

	while (n > 0 && uio->uio_resid) {
		iov = uio->uio_iov;
		cnt = iov->iov_len;
		if (cnt == 0) {
			/* Current iovec is exhausted; step to the next one. */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}
		if (cnt > n)
			cnt = n;

		switch (uio->uio_segflg) {

		case UIO_USERSPACE:
			if (ticks - PCPU_GET(switchticks) >= hogticks)
				uio_yield();
			error = userspaceco(cp, cnt, uio, obj, disposable);
			if (error)
				return (error);
			break;

		case UIO_SYSSPACE:
			if (uio->uio_rw == UIO_READ)
				bcopy(cp, iov->iov_base, cnt);
			else
				bcopy(iov->iov_base, cp, cnt);
			break;
		case UIO_NOCOPY:
			break;
		}
		iov->iov_base = (char *)iov->iov_base + cnt;
		iov->iov_len -= cnt;
		uio->uio_resid -= cnt;
		uio->uio_offset += cnt;
		cp = (char *)cp + cnt;
		n -= cnt;
	}
	return (0);
}
#endif /* ZERO_COPY_SOCKETS */

/*
 * Give next character to user as result of read.
*/ int ureadc(int c, struct uio *uio) { struct iovec *iov; char *iov_base; again: if (uio->uio_iovcnt == 0 || uio->uio_resid == 0) panic("ureadc"); iov = uio->uio_iov; if (iov->iov_len == 0) { uio->uio_iovcnt--; uio->uio_iov++; goto again; } switch (uio->uio_segflg) { case UIO_USERSPACE: if (subyte(iov->iov_base, c) < 0) return (EFAULT); break; case UIO_SYSSPACE: iov_base = iov->iov_base; *iov_base = c; iov->iov_base = iov_base; break; case UIO_NOCOPY: break; } iov->iov_base = (char *)iov->iov_base + 1; iov->iov_len--; uio->uio_resid--; uio->uio_offset++; return (0); } /* * General routine to allocate a hash table. */ void * hashinit(int elements, struct malloc_type *type, u_long *hashmask) { long hashsize; LIST_HEAD(generic, generic) *hashtbl; int i; if (elements <= 0) panic("hashinit: bad elements"); for (hashsize = 1; hashsize <= elements; hashsize <<= 1) continue; hashsize >>= 1; hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); for (i = 0; i < hashsize; i++) LIST_INIT(&hashtbl[i]); *hashmask = hashsize - 1; return (hashtbl); } void hashdestroy(void *vhashtbl, struct malloc_type *type, u_long hashmask) { LIST_HEAD(generic, generic) *hashtbl, *hp; hashtbl = vhashtbl; for (hp = hashtbl; hp <= &hashtbl[hashmask]; hp++) if (!LIST_EMPTY(hp)) panic("hashdestroy: hash not empty"); free(hashtbl, type); } static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039, 2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653, 7159, 7673, 8191, 12281, 16381, 24571, 32749 }; #define NPRIMES (sizeof(primes) / sizeof(primes[0])) /* * General routine to allocate a prime number sized hash table. 
*/ void * phashinit(int elements, struct malloc_type *type, u_long *nentries) { long hashsize; LIST_HEAD(generic, generic) *hashtbl; int i; if (elements <= 0) panic("phashinit: bad elements"); for (i = 1, hashsize = primes[1]; hashsize <= elements;) { i++; if (i == NPRIMES) break; hashsize = primes[i]; } hashsize = primes[i - 1]; hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); for (i = 0; i < hashsize; i++) LIST_INIT(&hashtbl[i]); *nentries = hashsize; return (hashtbl); } void uio_yield(void) { struct thread *td; td = curthread; mtx_lock_spin(&sched_lock); DROP_GIANT(); sched_prio(td, td->td_ksegrp->kg_user_pri); /* XXXKSE */ td->td_proc->p_stats->p_ru.ru_nivcsw++; mi_switch(); mtx_unlock_spin(&sched_lock); PICKUP_GIANT(); } int copyinfrom(const void * __restrict src, void * __restrict dst, size_t len, int seg) { int error = 0; switch (seg) { case UIO_USERSPACE: error = copyin(src, dst, len); break; case UIO_SYSSPACE: bcopy(src, dst, len); break; default: panic("copyinfrom: bad seg %d\n", seg); } return (error); } int copyinstrfrom(const void * __restrict src, void * __restrict dst, size_t len, size_t * __restrict copied, int seg) { int error = 0; switch (seg) { case UIO_USERSPACE: error = copyinstr(src, dst, len, copied); break; case UIO_SYSSPACE: error = copystr(src, dst, len, copied); break; default: panic("copyinstrfrom: bad seg %d\n", seg); } return (error); }