diff --git a/sys/amd64/linux32/linux32_dummy.c b/sys/amd64/linux32/linux32_dummy.c index 95bf3ec88e9c..1ae64bb0889b 100644 --- a/sys/amd64/linux32/linux32_dummy.c +++ b/sys/amd64/linux32/linux32_dummy.c @@ -70,9 +70,6 @@ DUMMY(pivot_root); DUMMY(mincore); DUMMY(ptrace); DUMMY(lookup_dcookie); -DUMMY(epoll_create); -DUMMY(epoll_ctl); -DUMMY(epoll_wait); DUMMY(remap_file_pages); DUMMY(timer_create); DUMMY(timer_settime); @@ -129,7 +126,6 @@ DUMMY(timerfd_gettime); /* linux 2.6.27: */ DUMMY(signalfd4); DUMMY(eventfd2); -DUMMY(epoll_create1); DUMMY(dup3); DUMMY(inotify_init1); /* linux 2.6.30: */ diff --git a/sys/amd64/linux32/syscalls.master b/sys/amd64/linux32/syscalls.master index c3a10afeb0a3..b9a082938777 100644 --- a/sys/amd64/linux32/syscalls.master +++ b/sys/amd64/linux32/syscalls.master @@ -430,9 +430,11 @@ 251 AUE_NULL UNIMPL 252 AUE_EXIT STD { int linux_exit_group(int error_code); } 253 AUE_NULL STD { int linux_lookup_dcookie(void); } -254 AUE_NULL STD { int linux_epoll_create(void); } -255 AUE_NULL STD { int linux_epoll_ctl(void); } -256 AUE_NULL STD { int linux_epoll_wait(void); } +254 AUE_NULL STD { int linux_epoll_create(l_int size); } +255 AUE_NULL STD { int linux_epoll_ctl(l_int epfd, l_int op, l_int fd, \ + struct linux_epoll_event *event); } +256 AUE_NULL STD { int linux_epoll_wait(l_int epfd, struct linux_epoll_event *events, \ + l_int maxevents, l_int timeout); } 257 AUE_NULL STD { int linux_remap_file_pages(void); } 258 AUE_NULL STD { int linux_set_tid_address(int *tidptr); } 259 AUE_NULL STD { int linux_timer_create(void); } @@ -534,7 +536,7 @@ ; linux 2.6.27: 327 AUE_NULL STD { int linux_signalfd4(void); } 328 AUE_NULL STD { int linux_eventfd2(void); } -329 AUE_NULL STD { int linux_epoll_create1(void); } +329 AUE_NULL STD { int linux_epoll_create1(l_int flags); } 330 AUE_NULL STD { int linux_dup3(void); } 331 AUE_NULL STD { int linux_pipe2(l_int *pipefds, l_int flags); } 332 AUE_NULL STD { int linux_inotify_init1(void); } diff --git 
a/sys/compat/linux/linux_epoll.c b/sys/compat/linux/linux_epoll.c new file mode 100644 index 000000000000..b9e1f2b0923c --- /dev/null +++ b/sys/compat/linux/linux_epoll.c @@ -0,0 +1,554 @@ +/*- + * Copyright (c) 2007 Roman Divacky + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_compat.h" +#include "opt_ktrace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef KTRACE +#include +#endif + +#ifdef COMPAT_LINUX32 +#include +#include +#else +#include +#include +#endif + +#define ktrepoll_events(evt, count) \ + ktrstruct("linux_epoll_event", (evt), count * sizeof(*evt)) + +/* + * epoll defines 'struct epoll_event' with the field 'data' as 64 bits + * on all architectures. But on 32 bit architectures BSD 'struct kevent' only + * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied + * data verbatuim. Therefore on 32 bit architectures we allocate 64-bit memory + * block to pass user supplied data for every file descriptor. + */ +typedef uint64_t epoll_udata_t; +#if defined(__i386__) +#define EPOLL_WIDE_USER_DATA 1 +#else +#define EPOLL_WIDE_USER_DATA 0 +#endif + +#if EPOLL_WIDE_USER_DATA + +/* + * Approach similar to epoll_user_data could also be used to + * keep track of event bits per file descriptor for all architectures. + * However, it isn't obvious that such tracking would be beneficial + * in practice. 
+ */
+
+/*
+ * Per-epoll-descriptor table of the 64-bit user data values, indexed by
+ * the number of the registered file descriptor.  `sz' is the number of
+ * slots in `data'; the structure is over-allocated with
+ * EPOLL_USER_DATA_SIZE().
+ */
+struct epoll_user_data {
+	unsigned sz;
+	epoll_udata_t data[1];
+};
+static MALLOC_DEFINE(M_LINUX_EPOLL, "epoll", "memory for epoll system");
+#define EPOLL_USER_DATA_SIZE(ndata) \
+	(sizeof(struct epoll_user_data)+((ndata)-1)*sizeof(epoll_udata_t))
+#define EPOLL_USER_DATA_MARGIN 16
+
+static void epoll_init_user_data(struct thread *td, struct file *epfp);
+static void epoll_set_user_data(struct thread *td, struct file *epfp, int fd, epoll_udata_t user_data);
+static epoll_udata_t epoll_get_user_data(struct thread *td, struct file *epfp, int fd);
+static fo_close_t epoll_close;
+
+/* overload kqueue fileops */
+static struct fileops epollops = {
+	.fo_read = kqueue_read,
+	.fo_write = kqueue_write,
+	.fo_truncate = kqueue_truncate,
+	.fo_ioctl = kqueue_ioctl,
+	.fo_poll = kqueue_poll,
+	.fo_kqfilter = kqueue_kqfilter,
+	.fo_stat = kqueue_stat,
+	.fo_close = epoll_close,
+	.fo_chmod = invfo_chmod,
+	.fo_chown = invfo_chown,
+	.fo_sendfile = invfo_sendfile,
+};
+#endif
+
+static struct file* epoll_fget(struct thread *td, int epfd);
+
+/* Argument of epoll_kev_copyin(): the next kernel-space change to hand out. */
+struct epoll_copyin_args {
+	struct kevent *changelist;
+};
+
+/*
+ * Argument of epoll_kev_copyout(): the user-space output cursor, the number
+ * of events copied out so far and the first copyout error that was seen.
+ */
+struct epoll_copyout_args {
+	struct linux_epoll_event *leventlist;
+	int count;
+	int error;
+	/* same guard as the code that reads and writes `td' */
+#if defined(KTRACE) || EPOLL_WIDE_USER_DATA
+	struct thread *td;
+#endif
+#if EPOLL_WIDE_USER_DATA
+	struct file *epfp;
+#endif
+};
+
+
+/*
+ * Create a new epoll file descriptor, which is a kqueue underneath.
+ * On success the descriptor number is left in td->td_retval[0].
+ */
+static int
+linux_epoll_create_common(struct thread *td)
+{
+	struct file *fp;
+	int error;
+
+	error = kern_kqueue_locked(td, &fp);
+	if (error != 0)
+		return (error);
+#if EPOLL_WIDE_USER_DATA
+	epoll_init_user_data(td, fp);
+#endif
+	/*
+	 * kern_kqueue_locked() hands the file back with an extra reference
+	 * held.  Release it in every configuration, not only in the wide
+	 * user data one, or the kqueue could never be freed.
+	 */
+	fdrop(fp, td);
+	return (0);
+}
+
+/* epoll_create(2): `size' only has to be positive. */
+int
+linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args)
+{
+	/*
+	 * args->size is otherwise unused. Linux just tests it
+	 * and then forgets it as well.
+	 */
+	if (args->size <= 0)
+		return (EINVAL);
+
+	return (linux_epoll_create_common(td));
+}
+
+/*
+ * epoll_create1(2): like epoll_create() but driven by flags.
+ * Flag bits other than LINUX_EPOLL_CLOEXEC and LINUX_EPOLL_NONBLOCK are
+ * rejected with EINVAL before anything is allocated.
+ */
+int
+linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args)
+{
+	struct filedesc *fdp;
+	int error;
+
+	if ((args->flags & ~(LINUX_EPOLL_CLOEXEC | LINUX_EPOLL_NONBLOCK)) != 0)
+		return (EINVAL);
+
+	error = linux_epoll_create_common(td);
+	if (error != 0)
+		return (error);
+
+	if (args->flags & LINUX_EPOLL_CLOEXEC) {
+		/* descriptor flags are only modified under the table lock */
+		fdp = td->td_proc->p_fd;
+		FILEDESC_XLOCK(fdp);
+		fdp->fd_ofiles[td->td_retval[0]].fde_flags |= UF_EXCLOSE;
+		FILEDESC_XUNLOCK(fdp);
+	}
+	if (args->flags & LINUX_EPOLL_NONBLOCK)
+		linux_msg(td, "epoll_create1 doesn't yet support EPOLL_NONBLOCK flag\n");
+
+	return (0);
+}
+
+/*
+ * Convert one epoll event description into kevent changes.
+ *
+ * Up to two changes (EVFILT_READ and/or EVFILT_WRITE) are stored in
+ * `kevent' and *nkevents is advanced by the number of entries written.
+ * Returns EINVAL, storing nothing, when an event bit without a
+ * translation is requested.
+ */
+static int
+linux_epoll_to_kevent(struct thread *td,
+#if EPOLL_WIDE_USER_DATA
+	struct file *epfp,
+#endif
+	int fd, struct linux_epoll_event *l_event, int kev_flags, struct kevent *kevent, int *nkevents)
+{
+	void *udata;
+
+	/* refuse what cannot be translated before touching any state */
+	if ((l_event->events & (LINUX_EPOLLRDBAND | LINUX_EPOLLWRBAND |
+	    LINUX_EPOLLHUP | LINUX_EPOLLMSG | LINUX_EPOLLWAKEUP |
+	    LINUX_EPOLLERR)) != 0) {
+		linux_msg(td, "epoll_ctl doesn't yet support some event flags supplied: 0x%x\n",
+		    l_event->events);
+		return (EINVAL);
+	}
+
+	/* flags related to how event is registered */
+	if (l_event->events & LINUX_EPOLLONESHOT)
+		kev_flags |= EV_ONESHOT;
+	if (l_event->events & LINUX_EPOLLET)
+		kev_flags |= EV_CLEAR;
+
+	/*
+	 * The user data travels in the knote only when it fits into a
+	 * pointer; otherwise it is kept in the side table below.  Go through
+	 * uintptr_t so that the 64-bit value is never cast to a pointer of
+	 * a different size directly.
+	 */
+#if EPOLL_WIDE_USER_DATA
+	udata = NULL;
+#else
+	udata = (void *)(uintptr_t)l_event->data;
+#endif
+
+	/* flags related to what event is registered */
+	if ((l_event->events & (LINUX_EPOLLIN | LINUX_EPOLLRDNORM |
+	    LINUX_EPOLLPRI | LINUX_EPOLLRDHUP)) != 0) {
+		EV_SET(kevent++, fd, EVFILT_READ, kev_flags, 0, 0, udata);
+		++*nkevents;
+	}
+	if ((l_event->events & (LINUX_EPOLLOUT | LINUX_EPOLLWRNORM)) != 0) {
+		EV_SET(kevent++, fd, EVFILT_WRITE, kev_flags, 0, 0, udata);
+		++*nkevents;
+	}
+
+#if EPOLL_WIDE_USER_DATA
+	epoll_set_user_data(td, epfp, fd, l_event->data);
+#endif
+	return (0);
+}
+
+/*
+ * Structure converting function from kevent to epoll.  A kevent that
+ * carries EV_ERROR is reported with no event bits set (the caller
+ * supplies a zeroed l_event); the user data is filled in either way.
+ */
+static void
+linux_kevent_to_epoll(
+#if EPOLL_WIDE_USER_DATA
+	struct thread *td, struct file *epfp,
+#endif
+	struct kevent *kevent, struct linux_epoll_event *l_event)
+{
+	if ((kevent->flags & EV_ERROR) == 0) {
+		switch (kevent->filter) {
+		case EVFILT_READ:
+			l_event->events = LINUX_EPOLLIN|LINUX_EPOLLRDNORM|LINUX_EPOLLPRI;
+			break;
+		case EVFILT_WRITE:
+			l_event->events = LINUX_EPOLLOUT|LINUX_EPOLLWRNORM;
+			break;
+		}
+	}
+#if EPOLL_WIDE_USER_DATA
+	l_event->data = epoll_get_user_data(td, epfp, kevent->ident);
+#else
+	l_event->data = (epoll_udata_t)(uintptr_t)kevent->udata;
+#endif
+}
+
+/*
+ * Copyout callback used by kevent. This converts kevent
+ * events to epoll events and copies them back to the
+ * userspace. This is also called on error on registering
+ * of the filter.
+ */
+static int
+epoll_kev_copyout(void *arg, struct kevent *kevp, int count)
+{
+	struct epoll_copyout_args *args;
+	struct linux_epoll_event *eep;
+	int error, i;
+
+	args = (struct epoll_copyout_args*) arg;
+	eep = malloc(sizeof(*eep) * count, M_TEMP, M_WAITOK | M_ZERO);	/* zeroed: unset fields are copied out as 0 */
+
+	for (i = 0; i < count; i++)
+		linux_kevent_to_epoll(
+#if EPOLL_WIDE_USER_DATA
+			args->td, args->epfp,
+#endif
+			&kevp[i], &eep[i]);
+
+	error = copyout(eep, args->leventlist, count * sizeof(*eep));
+	if (!error) {
+		args->leventlist += count;	/* advance the user-space cursor */
+		args->count += count;
+	} else if (!args->error)
+		args->error = error;		/* remember only the first failure */
+
+#ifdef KTRACE
+	if (KTRPOINT(args->td, KTR_STRUCT))
+		ktrepoll_events(eep, count);
+#endif
+
+	free(eep, M_TEMP);
+	return (error);
+}
+
+/*
+ * Copyin callback used by kevent. This copies already
+ * converted filters from kernel memory to the kevent
+ * internal kernel memory. Hence the memcpy instead of
+ * copyin.
+ *
+ * `arg' is a struct epoll_copyin_args whose changelist cursor is
+ * advanced by `count' on every call. Always returns 0.
+ */
+static int
+epoll_kev_copyin(void *arg, struct kevent *kevp, int count)
+{
+	struct epoll_copyin_args *args;
+
+	args = (struct epoll_copyin_args*) arg;
+
+	memcpy(kevp, args->changelist, count * sizeof(*kevp));
+	args->changelist += count;
+
+	return (0);
+}
+
+static int
+ignore_enoent(int error) {	/* map ENOENT to success, pass everything else through */
+	if (error == ENOENT)
+		error = 0;
+	return (error);
+}
+
+static int
+delete_event(struct thread *td, struct file *epfp, int fd, int filter)
+{
+	struct epoll_copyin_args ciargs;
+	struct kevent kev;
+	struct kevent_copyops k_ops = { &ciargs,
+					NULL,
+					epoll_kev_copyin};
+	ciargs.changelist = &kev;
+
+	EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0);	/* one change: drop `filter' for `fd' */
+	return (kern_kevent_locked(td, epfp, 1, 0, &k_ops, NULL));	/* 1 change, 0 events requested */
+}
+
+static int
+delete_all_events(struct thread *td, struct file *epfp, int fd)
+{
+	/*
+	 * Here we ignore ENOENT, because we don't keep track of which of
+	 * the two filters (read, write) were actually registered for `fd'.
+	 */
+	int error1, error2;
+
+	error1 = ignore_enoent(delete_event(td, epfp, fd, EVFILT_READ));
+	error2 = ignore_enoent(delete_event(td, epfp, fd, EVFILT_WRITE));
+
+	/* report any errors we 
got */
+	if (error1)
+		return (error1);
+	if (error2)
+		return (error2);
+	return (0);
+}
+
+/*
+ * Load epoll filter, convert it to kevent filter
+ * and load it into kevent subsystem.
+ *
+ * Returns EINVAL for epfd == fd or an unknown op, EBADF when epfd or fd
+ * is not usable, otherwise the result of the kevent registration.
+ */
+int
+linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args)
+{
+	struct file *epfp, *fp;
+	struct epoll_copyin_args ciargs;
+	struct kevent kev[2];
+	struct kevent_copyops k_ops = { &ciargs,
+					NULL,
+					epoll_kev_copyin};
+	struct linux_epoll_event le;
+	cap_rights_t rights;
+	int kev_flags;
+	int nchanges = 0;
+	int error;
+
+	if (args->epfd == args->fd)
+		return (EINVAL);
+
+	/* CTL_DEL takes no event; keep `le' defined for the tracing below */
+	memset(&le, 0, sizeof(le));
+	if (args->op != LINUX_EPOLL_CTL_DEL) {
+		error = copyin(args->event, &le, sizeof(le));
+		if (error)
+			return (error);
+	}
+#ifdef DEBUG
+	if (ldebug(epoll_ctl))
+		printf(ARGS(epoll_ctl,"%i, %i, %i, %u"), args->epfd, args->op,
+			args->fd, le.events);
+#endif
+#ifdef KTRACE
+	if (KTRPOINT(td, KTR_STRUCT) && args->op != LINUX_EPOLL_CTL_DEL)
+		ktrepoll_events(&le, 1);
+#endif
+	/*
+	 * Refuse descriptors that are not open before the number is used
+	 * as an index into the user data table.
+	 */
+	error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp);
+	if (error != 0)
+		return (error);
+	fdrop(fp, td);
+
+	epfp = epoll_fget(td, args->epfd);
+	if (epfp == NULL)
+		return (EBADF);
+
+	ciargs.changelist = kev;
+
+	switch (args->op) {
+	case LINUX_EPOLL_CTL_MOD:
+		/*
+		 * We don't memorize which events were set for this FD
+		 * on this level, so just delete all we could have set:
+		 * EVFILT_READ and EVFILT_WRITE.  Only "not registered"
+		 * (ENOENT) is ignored, any other error is reported.
+		 */
+		error = delete_all_events(td, epfp, args->fd);
+		if (error)
+			goto leave;
+		/* FALLTHROUGH */
+	case LINUX_EPOLL_CTL_ADD:
+		kev_flags = EV_ADD | EV_ENABLE;
+		break;
+	case LINUX_EPOLL_CTL_DEL:
+		/* CTL_DEL means unregister this fd with this epoll */
+		error = delete_all_events(td, epfp, args->fd);
+		goto leave;
+	default:
+		error = EINVAL;
+		goto leave;
+	}
+
+	error = linux_epoll_to_kevent(td,
+#if EPOLL_WIDE_USER_DATA
+		epfp,
+#endif
+		args->fd, &le, kev_flags, kev, &nchanges);
+	if (error)
+		goto leave;
+
+	error = kern_kevent_locked(td, epfp, nchanges, 0, &k_ops, NULL);
+leave:
+	fdrop(epfp, td);
+	return (error);
+}
+
+/*
+ * Wait for a filter to be triggered on the epoll file descriptor.
+ *
+ * On success the number of events copied out to args->events is left
+ * in td->td_retval[0].  A timeout of -1 waits without limit, any other
+ * negative timeout is refused with EINVAL.
+ */
+int
+linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args)
+{
+	struct file *epfp;
+	struct timespec ts, *tsp;
+	struct epoll_copyout_args coargs;
+	struct kevent_copyops k_ops = { &coargs,
+					epoll_kev_copyout,
+					NULL};
+	int error;
+
+	if (args->maxevents <= 0 || args->maxevents > LINUX_MAX_EVENTS)
+		return (EINVAL);
+
+	epfp = epoll_fget(td, args->epfd);
+	if (epfp == NULL)
+		return (EBADF);
+
+	coargs.leventlist = args->events;
+	coargs.count = 0;
+	coargs.error = 0;
+#if defined(KTRACE) || EPOLL_WIDE_USER_DATA
+	coargs.td = td;
+#endif
+#if EPOLL_WIDE_USER_DATA
+	coargs.epfp = epfp;
+#endif
+
+	if (args->timeout != -1) {
+		if (args->timeout < 0) {
+			error = EINVAL;
+			goto leave;
+		}
+		/* Convert from milliseconds to timespec. */
+		ts.tv_sec = args->timeout / 1000;
+		ts.tv_nsec = (args->timeout % 1000) * 1000000;
+		tsp = &ts;
+	} else {
+		tsp = NULL;
+	}
+
+	error = kern_kevent_locked(td, epfp, 0, args->maxevents, &k_ops, tsp);
+	if (!error && coargs.error)
+		error = coargs.error;
+
+	/*
+	 * kern_kevent might return ENOMEM which is not expected from epoll_wait.
+	 * Maybe we should translate that but I don't think it matters at all.
+	 */
+
+	if (!error)
+		td->td_retval[0] = coargs.count;
+leave:
+	fdrop(epfp, td);
+	return (error);
+}
+
+#if EPOLL_WIDE_USER_DATA
+/*
+ * We store the user_data vector in the fvn_epollpriv field of
+ * struct file, which is not used by kqueue descriptors.
+ *
+ * NOTE(review): no lock is visible here around the vector; concurrent
+ * epoll_ctl() calls on one epoll descriptor look racy -- confirm.
+ */
+#define EPOLL_USER_DATA_GET(epfp) \
+	((struct epoll_user_data*)(epfp)->f_vnun.fvn_epollpriv)
+#define EPOLL_USER_DATA_SET(epfp, udv) \
+	(epfp)->f_vnun.fvn_epollpriv = (udv)
+
+/* Turn a fresh kqueue file into an epoll file with an empty user data table. */
+static void
+epoll_init_user_data(struct thread *td, struct file *epfp)
+{
+	struct epoll_user_data *udv;
+
+	/* override file ops to have our close operation */
+	atomic_store_rel_ptr((volatile uintptr_t *)&epfp->f_ops, (uintptr_t)&epollops);
+
+	/*
+	 * Allocate epoll_user_data initially for up to 16 file descriptor
+	 * values.  Zeroed, so that a slot that was never set reads as 0
+	 * instead of stale kernel memory.
+	 */
+	udv = malloc(EPOLL_USER_DATA_SIZE(EPOLL_USER_DATA_MARGIN), M_LINUX_EPOLL, M_WAITOK | M_ZERO);
+	udv->sz = EPOLL_USER_DATA_MARGIN;
+	EPOLL_USER_DATA_SET(epfp, udv);
+}
+
+/* Remember `user_data' for descriptor `fd', growing the table on demand. */
+static void
+epoll_set_user_data(struct thread *td, struct file *epfp, int fd, epoll_udata_t user_data)
+{
+	struct epoll_user_data *udv = EPOLL_USER_DATA_GET(epfp);
+	unsigned newsz;
+
+	/* a negative descriptor can never be registered; don't index with it */
+	if (fd < 0)
+		return;
+
+	if ((unsigned)fd >= udv->sz) {
+		newsz = (unsigned)fd + EPOLL_USER_DATA_MARGIN;
+		udv = realloc(udv, EPOLL_USER_DATA_SIZE(newsz), M_LINUX_EPOLL, M_WAITOK);
+		/* the tail added by realloc() is not initialized */
+		bzero(&udv->data[udv->sz], (newsz - udv->sz) * sizeof(udv->data[0]));
+		udv->sz = newsz;
+		EPOLL_USER_DATA_SET(epfp, udv);
+	}
+	udv->data[fd] = user_data;
+}
+
+/* Return the user data stored for `fd', or 0 when none can exist. */
+static epoll_udata_t
+epoll_get_user_data(struct thread *td, struct file *epfp, int fd)
+{
+	struct epoll_user_data *udv = EPOLL_USER_DATA_GET(epfp);
+
+	/* an unexpected ident must not bring the whole system down */
+	if (fd < 0 || (unsigned)fd >= udv->sz)
+		return (0);
+
+	return (udv->data[fd]);
+}
+
+/*ARGSUSED*/
+static int
+epoll_close(struct file *epfp, struct thread *td)
+{
+	/* free user data vector */
+	free(EPOLL_USER_DATA_GET(epfp), M_LINUX_EPOLL);
+	EPOLL_USER_DATA_SET(epfp, NULL);
+	/* over to kqueue parent */
+	return (kqueue_close(epfp, td));
+}
+#endif
+
+/*
+ * Look up the file behind the epoll descriptor `epfd'.
+ *
+ * Returns the file with a reference held, which the caller releases with
+ * fdrop(), or NULL when `epfd' is not open or is not an epoll descriptor.
+ * The descriptor number comes straight from userland, so a bad value is
+ * an ordinary error and never a reason to panic.
+ */
+static struct file*
+epoll_fget(struct thread *td, int epfd)
+{
+	struct file *fp;
+	cap_rights_t rights;
+
+	if (fget(td, epfd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp) != 0)
+		return (NULL);
+
+	/* only files created by epoll_create carry what the callers rely on */
+	if (fp->f_type != DTYPE_KQUEUE
+#if EPOLL_WIDE_USER_DATA
+	    || fp->f_ops != &epollops
+#endif
+	    ) {
+		fdrop(fp, td);
+		return (NULL);
+	}
+
+	return (fp);
+}
+
diff --git a/sys/compat/linux/linux_epoll.h b/sys/compat/linux/linux_epoll.h
new file mode 100644
index 000000000000..aea4185341df
--- /dev/null
+++ 
b/sys/compat/linux/linux_epoll.h @@ -0,0 +1,68 @@ +/*- + * Copyright (c) 2007 Roman Divacky + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _LINUX_EPOLL_H_
+#define _LINUX_EPOLL_H_
+
+#ifdef __amd64__
+#define EPOLL_PACKED __packed	/* no padding between events and data */
+#else
+#define EPOLL_PACKED
+#endif
+
+struct linux_epoll_event {
+	uint32_t events;	/* mask of LINUX_EPOLL* event bits */
+	uint64_t data;		/* opaque user data, handed back by epoll_wait */
+} EPOLL_PACKED;
+
+#define LINUX_EPOLLIN 0x001
+#define LINUX_EPOLLPRI 0x002
+#define LINUX_EPOLLOUT 0x004
+#define LINUX_EPOLLRDNORM 0x040
+#define LINUX_EPOLLRDBAND 0x080
+#define LINUX_EPOLLWRNORM 0x100
+#define LINUX_EPOLLWRBAND 0x200
+#define LINUX_EPOLLMSG 0x400
+#define LINUX_EPOLLERR 0x008
+#define LINUX_EPOLLHUP 0x010
+#define LINUX_EPOLLRDHUP 0x2000
+#define LINUX_EPOLLWAKEUP (1u<<29)	/* parenthesized: safe in any expression */
+#define LINUX_EPOLLONESHOT (1u<<30)
+#define LINUX_EPOLLET (1u<<31)
+
+#define LINUX_EPOLL_CTL_ADD 1	/* epoll_ctl() operations */
+#define LINUX_EPOLL_CTL_DEL 2
+#define LINUX_EPOLL_CTL_MOD 3
+
+#define LINUX_EPOLL_CLOEXEC 02000000	/* epoll_create1() flags (octal) */
+#define LINUX_EPOLL_NONBLOCK 00004000
+
+#define LINUX_MAX_EVENTS (INT_MAX / sizeof(struct linux_epoll_event))
+
+#endif /* !_LINUX_EPOLL_H_ */
+
diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index c1647d3f61a9..babfcab4e0ab 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -467,6 +467,7 @@ amd64/linux32/linux32_support.s optional compat_linux32 \
 	dependency "linux32_assym.h"
 amd64/linux32/linux32_sysent.c optional compat_linux32
 amd64/linux32/linux32_sysvec.c optional compat_linux32
+compat/linux/linux_epoll.c optional compat_linux32
 compat/linux/linux_emul.c optional compat_linux32
 compat/linux/linux_file.c optional compat_linux32
 compat/linux/linux_fork.c optional compat_linux32
diff --git a/sys/conf/files.i386 b/sys/conf/files.i386
index 24dac5fb8772..17791a6efb21 100644
--- a/sys/conf/files.i386
+++ b/sys/conf/files.i386
@@ -80,6 +80,7 @@ hptrr_lib.o optional hptrr \
 cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S optional zfs compile-with "${ZFS_S}"
 compat/linprocfs/linprocfs.c optional linprocfs
 compat/linsysfs/linsysfs.c optional linsysfs
+compat/linux/linux_epoll.c optional compat_linux
compat/linux/linux_emul.c optional compat_linux compat/linux/linux_file.c optional compat_linux compat/linux/linux_fork.c optional compat_linux diff --git a/sys/conf/files.pc98 b/sys/conf/files.pc98 index a8e60b6f7290..ee915018e433 100644 --- a/sys/conf/files.pc98 +++ b/sys/conf/files.pc98 @@ -41,6 +41,7 @@ ukbdmap.h optional ukbd_dflt_keymap \ cddl/contrib/opensolaris/common/atomic/i386/opensolaris_atomic.S optional zfs compile-with "${ZFS_S}" compat/linprocfs/linprocfs.c optional linprocfs compat/linsysfs/linsysfs.c optional linsysfs +compat/linux/linux_epoll.c optional compat_linux compat/linux/linux_emul.c optional compat_linux compat/linux/linux_file.c optional compat_linux compat/linux/linux_fork.c optional compat_linux diff --git a/sys/i386/linux/linux_dummy.c b/sys/i386/linux/linux_dummy.c index ab77790c3e5b..f8526e116075 100644 --- a/sys/i386/linux/linux_dummy.c +++ b/sys/i386/linux/linux_dummy.c @@ -72,9 +72,6 @@ DUMMY(setfsgid); DUMMY(pivot_root); DUMMY(mincore); DUMMY(lookup_dcookie); -DUMMY(epoll_create); -DUMMY(epoll_ctl); -DUMMY(epoll_wait); DUMMY(remap_file_pages); DUMMY(fstatfs64); DUMMY(mbind); @@ -120,7 +117,6 @@ DUMMY(timerfd_gettime); /* linux 2.6.27: */ DUMMY(signalfd4); DUMMY(eventfd2); -DUMMY(epoll_create1); DUMMY(dup3); DUMMY(inotify_init1); /* linux 2.6.30: */ diff --git a/sys/i386/linux/syscalls.master b/sys/i386/linux/syscalls.master index bb1716638639..1f260bdb14d9 100644 --- a/sys/i386/linux/syscalls.master +++ b/sys/i386/linux/syscalls.master @@ -432,9 +432,11 @@ 251 AUE_NULL UNIMPL 252 AUE_EXIT STD { int linux_exit_group(int error_code); } 253 AUE_NULL STD { int linux_lookup_dcookie(void); } -254 AUE_NULL STD { int linux_epoll_create(void); } -255 AUE_NULL STD { int linux_epoll_ctl(void); } -256 AUE_NULL STD { int linux_epoll_wait(void); } +254 AUE_NULL STD { int linux_epoll_create(l_int size); } +255 AUE_NULL STD { int linux_epoll_ctl(l_int epfd, l_int op, l_int fd, \ + struct linux_epoll_event *event); } +256 AUE_NULL STD { int 
linux_epoll_wait(l_int epfd, struct linux_epoll_event *events, \ + l_int maxevents, l_int timeout); } 257 AUE_NULL STD { int linux_remap_file_pages(void); } 258 AUE_NULL STD { int linux_set_tid_address(int *tidptr); } 259 AUE_NULL STD { int linux_timer_create(clockid_t clock_id, \ @@ -544,7 +546,7 @@ ; linux 2.6.27: 327 AUE_NULL STD { int linux_signalfd4(void); } 328 AUE_NULL STD { int linux_eventfd2(void); } -329 AUE_NULL STD { int linux_epoll_create1(void); } +329 AUE_NULL STD { int linux_epoll_create1(l_int flags); } 330 AUE_NULL STD { int linux_dup3(void); } 331 AUE_NULL STD { int linux_pipe2(l_int *pipefds, l_int flags); } 332 AUE_NULL STD { int linux_inotify_init1(void); } diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c index 85ea78cd6da1..f4b6c1931364 100644 --- a/sys/kern/kern_event.c +++ b/sys/kern/kern_event.c @@ -107,16 +107,7 @@ static void kqueue_wakeup(struct kqueue *kq); static struct filterops *kqueue_fo_find(int filt); static void kqueue_fo_release(int filt); -static fo_rdwr_t kqueue_read; -static fo_rdwr_t kqueue_write; -static fo_truncate_t kqueue_truncate; -static fo_ioctl_t kqueue_ioctl; -static fo_poll_t kqueue_poll; -static fo_kqfilter_t kqueue_kqfilter; -static fo_stat_t kqueue_stat; -static fo_close_t kqueue_close; - -static struct fileops kqueueops = { +struct fileops kqueueops = { .fo_read = kqueue_read, .fo_write = kqueue_write, .fo_truncate = kqueue_truncate, @@ -303,7 +294,7 @@ filt_fileattach(struct knote *kn) } /*ARGSUSED*/ -static int +int kqueue_kqfilter(struct file *fp, struct knote *kn) { struct kqueue *kq = kn->kn_fp->f_data; @@ -688,34 +679,7 @@ filt_usertouch(struct knote *kn, struct kevent *kev, u_long type) int sys_kqueue(struct thread *td, struct kqueue_args *uap) { - struct filedesc *fdp; - struct kqueue *kq; - struct file *fp; - int fd, error; - - fdp = td->td_proc->p_fd; - error = falloc(td, &fp, &fd, 0); - if (error) - goto done2; - - /* An extra reference on `fp' has been held for us by falloc(). 
*/
-	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
-	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
-	TAILQ_INIT(&kq->kq_head);
-	kq->kq_fdp = fdp;
-	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
-	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
-
-	FILEDESC_XLOCK(fdp);
-	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
-	FILEDESC_XUNLOCK(fdp);
-
-	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
-	fdrop(fp, td);
-
-	td->td_retval[0] = fd;
-done2:
-	return (error);
+	return (kern_kqueue(td));
 }
 
 #ifndef _SYS_SYSPROTO_H_
@@ -816,20 +780,76 @@ kevent_copyin(void *arg, struct kevent *kevp, int count)
 	return (error);
 }
 
+int
+kern_kqueue(struct thread *td)	/* create a kqueue; the fd is left in td_retval[0] */
+{
+	struct file *fp;
+	int error;
+
+	error = kern_kqueue_locked(td, &fp);
+	if (error == 0)		/* fp is set, and referenced, only on success */
+		fdrop(fp, td);
+	return (error);
+}
+
+int
+kern_kqueue_locked(struct thread *td, struct file **fpp)	/* as kern_kqueue(), but the reference on *fpp is passed to the caller */
+{
+	struct filedesc *fdp;
+	struct kqueue *kq;
+	struct file *fp;
+	int fd, error;
+
+	fdp = td->td_proc->p_fd;
+	error = falloc(td, &fp, &fd, 0);
+	if (error)
+		return (error);		/* *fpp is left untouched */
+
+	/* An extra reference on `fp' has been held for us by falloc(). */
+	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
+	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
+	TAILQ_INIT(&kq->kq_head);
+	kq->kq_fdp = fdp;
+	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
+	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
+
+	FILEDESC_XLOCK(fdp);
+	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
+	FILEDESC_XUNLOCK(fdp);
+
+	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
+
+	td->td_retval[0] = fd;
+	*fpp = fp;		/* the caller must fdrop() this */
+	return (0);
+}
+
 int
 kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
     struct kevent_copyops *k_ops, const struct timespec *timeout)
+{
+	struct file *fp;
+	cap_rights_t rights;
+	int error;
+
+	if ((error = fget(td, fd, cap_rights_init(&rights, CAP_POST_EVENT), &fp)) != 0)
+		return (error);
+
+	error = kern_kevent_locked(td, fp, nchanges, nevents, k_ops, timeout);
+
+	fdrop(fp, td);		/* pairs with the fget() above */
+	return (error);
+}
+
+int
+kern_kevent_locked(struct thread *td, struct file *fp, int nchanges, int nevents,
+    struct kevent_copyops *k_ops, const struct timespec *timeout)
 {
 	struct kevent keva[KQ_NEVENTS];
 	struct kevent *kevp, *changes;
 	struct kqueue *kq;
-	struct file *fp;
-	cap_rights_t rights;
 	int i, n, nerrors, error;
 
-	error = fget(td, fd, cap_rights_init(&rights, CAP_POST_EVENT), &fp);
-	if (error != 0)
-		return (error);
 	if ((error = kqueue_acquire(fp, &kq)) != 0)
 		goto done_norel;
 
@@ -872,7 +892,6 @@ kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
 done:
 	kqueue_release(kq, 0);
 done_norel:
-	fdrop(fp, td);
 	return (error);
 }
 
@@ -1526,7 +1545,7 @@ kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
  * This could be expanded to call kqueue_scan, if desired.
*/ /*ARGSUSED*/ -static int +int kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { @@ -1534,7 +1553,7 @@ kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred, } /*ARGSUSED*/ -static int +int kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { @@ -1542,7 +1561,7 @@ kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred, } /*ARGSUSED*/ -static int +int kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td) { @@ -1551,7 +1570,7 @@ kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred, } /*ARGSUSED*/ -static int +int kqueue_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { @@ -1599,7 +1618,7 @@ kqueue_ioctl(struct file *fp, u_long cmd, void *data, } /*ARGSUSED*/ -static int +int kqueue_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { @@ -1626,7 +1645,7 @@ kqueue_poll(struct file *fp, int events, struct ucred *active_cred, } /*ARGSUSED*/ -static int +int kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { @@ -1644,7 +1663,7 @@ kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred, } /*ARGSUSED*/ -static int +int kqueue_close(struct file *fp, struct thread *td) { struct kqueue *kq = fp->f_data; diff --git a/sys/modules/linux/Makefile b/sys/modules/linux/Makefile index 7ed6e989388f..ce46aa879a93 100644 --- a/sys/modules/linux/Makefile +++ b/sys/modules/linux/Makefile @@ -9,7 +9,7 @@ CFLAGS+=-DCOMPAT_FREEBSD32 -DCOMPAT_LINUX32 KMOD= linux SRCS= linux_fork.c linux${SFX}_dummy.c linux_emul.c linux_file.c \ - linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c \ + linux_futex.c linux_getcwd.c linux_ioctl.c linux_ipc.c linux_epoll.c \ linux${SFX}_machdep.c linux_mib.c linux_misc.c linux_signal.c \ linux_socket.c linux_stats.c 
linux_sysctl.c linux${SFX}_sysent.c \ linux${SFX}_sysvec.c linux_uid16.c linux_util.c linux_time.c \ diff --git a/sys/sys/event.h b/sys/sys/event.h index 03bd7b90cfa5..60bced72d2eb 100644 --- a/sys/sys/event.h +++ b/sys/sys/event.h @@ -236,6 +236,9 @@ struct proc; struct knlist; struct mtx; struct rwlock; +struct uio; +struct stat; +struct ucred; extern void knote(struct knlist *list, long hint, int lockflags); extern void knote_fork(struct knlist *list, int pid); @@ -261,6 +264,21 @@ extern int kqfd_register(int fd, struct kevent *kev, struct thread *p, extern int kqueue_add_filteropts(int filt, struct filterops *filtops); extern int kqueue_del_filteropts(int filt); +int kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td); +int kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td); +int kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred, + struct thread *td); +int kqueue_ioctl(struct file *fp, u_long cmd, void *data, + struct ucred *active_cred, struct thread *td); +int kqueue_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td); +int kqueue_kqfilter(struct file *fp, struct knote *kn); +int kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred, + struct thread *td); +int kqueue_close(struct file *fp, struct thread *td); + #else /* !_KERNEL */ #include diff --git a/sys/sys/file.h b/sys/sys/file.h index 7b373f0d7091..b4c1ad4ead60 100644 --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -169,6 +169,8 @@ struct file { union { struct cdev_privdata *fvn_cdevpriv; /* (d) Private data for the cdev. */ + void *fvn_epollpriv; + /* (d) Private data for the epoll. 
*/ struct fadvise_info *fvn_advice; } f_vnun; /* diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h index 17f2b97db9ce..92dd8befb7a9 100644 --- a/sys/sys/syscallsubr.h +++ b/sys/sys/syscallsubr.h @@ -121,8 +121,13 @@ int kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data); int kern_jail(struct thread *td, struct jail *j); int kern_jail_get(struct thread *td, struct uio *options, int flags); int kern_jail_set(struct thread *td, struct uio *options, int flags); +int kern_kqueue(struct thread *td); +int kern_kqueue_locked(struct thread *td, struct file **fpp); int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, struct kevent_copyops *k_ops, const struct timespec *timeout); +int kern_kevent_locked(struct thread *td, struct file *fp, int nchanges, + int nevents, + struct kevent_copyops *k_ops, const struct timespec *timeout); int kern_kldload(struct thread *td, const char *file, int *fileid); int kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat); int kern_kldunload(struct thread *td, int fileid, int flags); @@ -248,6 +253,8 @@ int kern_utimes(struct thread *td, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg); int kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg, struct timeval *tptr, enum uio_seg tptrseg); +int kern_utimensat(struct thread *td, int fd, char *path, + enum uio_seg pathseg, struct timespec *tptr, enum uio_seg tptrseg); int kern_wait(struct thread *td, pid_t pid, int *status, int options, struct rusage *rup); int kern_wait6(struct thread *td, enum idtype idtype, id_t id, int *status,