diff --git a/UPDATING b/UPDATING index 85a005af6cd4..f60c8046072a 100644 --- a/UPDATING +++ b/UPDATING @@ -22,6 +22,10 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 8.x IS SLOW: to maximize performance. (To disable malloc debugging, run ln -s aj /etc/malloc.conf.) +20090527: + Add support for hierarchical jails. Remove global securelevel. + Bump __FreeBSD_version to 800091. + 20090523: The layout of struct vnet_net has changed, therefore modules need to be rebuilt. diff --git a/lib/libc/sys/jail.2 b/lib/libc/sys/jail.2 index 726aa789afd7..4b047bfd6f12 100644 --- a/lib/libc/sys/jail.2 +++ b/lib/libc/sys/jail.2 @@ -25,7 +25,7 @@ .\" .\" $FreeBSD$ .\" -.Dd April 29, 2009 +.Dd May 27, 2009 .Dt JAIL 2 .Os .Sh NAME @@ -283,7 +283,7 @@ of the jail for the given address family. It is possible to identify a process as jailed by examining .Dq Li /proc//status : it will show a field near the end of the line, either as -a single hyphen for a process at large, or the hostname currently +a single hyphen for a process at large, or the name currently set for the prison for jailed processes. .Sh ERRORS The @@ -292,7 +292,10 @@ system call will fail if: .Bl -tag -width Er .It Bq Er EPERM -This process is not allowed to create a jail. +This process is not allowed to create a jail, either because it is not +the super-user, or because it is in a jail where the +.Va allow.jails +parameter is not set. .It Bq Er EFAULT .Fa jail points to an address outside the allocated address space of the process. @@ -308,7 +311,10 @@ system call will fail if: .Bl -tag -width Er .It Bq Er EPERM -This process is not allowed to create a jail. +This process is not allowed to create a jail, either because it is not +the super-user, or because it is in a jail where the +.Va allow.jails +parameter is not set. .It Bq Er EPERM A jail parameter was set to a less restrictive value then the current environment. @@ -324,6 +330,11 @@ or parameter does not exist, and the .Dv JAIL_CREATE flag is not set. +.It Bq Er ENOENT +The jail referred to by a +.Va jid +is not accessible by the process, because the process is in a different +jail. .It Bq Er EEXIST The jail referred to by a .Va jid @@ -368,6 +379,11 @@ or .Va name parameter does not exist. .It Bq Er ENOENT +The jail referred to by a +.Va jid +is not accessible by the process, because the process is in a different +jail. +.It Bq Er ENOENT The .Va lastjid parameter is greater than the highest current jail ID. @@ -429,4 +445,4 @@ for R&D Associates who contributed it to .Fx . .An James Gritton -added the extensible jail parameters. +added the extensible jail parameters and hierarchical jails. diff --git a/sys/compat/freebsd32/freebsd32_misc.c b/sys/compat/freebsd32/freebsd32_misc.c index a8f9b55402fe..9301b8de62bc 100644 --- a/sys/compat/freebsd32/freebsd32_misc.c +++ b/sys/compat/freebsd32/freebsd32_misc.c @@ -112,8 +112,6 @@ CTASSERT(sizeof(struct msghdr32) == 28); CTASSERT(sizeof(struct stat32) == 96); CTASSERT(sizeof(struct sigaction32) == 24); -extern int jail_max_af_ips; - static int freebsd32_kevent_copyout(void *arg, struct kevent *kevp, int count); static int freebsd32_kevent_copyin(void *arg, struct kevent *kevp, int count); @@ -2044,17 +2042,9 @@ freebsd32_sysctl(struct thread *td, struct freebsd32_sysctl_args *uap) int freebsd32_jail(struct thread *td, struct freebsd32_jail_args *uap) { - struct iovec optiov[10]; - struct uio opt; - char *u_path, *u_hostname, *u_name; -#ifdef INET - struct in_addr *u_ip4; -#endif -#ifdef INET6 - struct in6_addr *u_ip6; -#endif uint32_t version; int error; + struct jail j; error = copyin(uap->jail, &version, sizeof(uint32_t)); if (error) @@ -2066,45 +2056,14 @@ freebsd32_jail(struct thread *td, struct freebsd32_jail_args *uap) /* FreeBSD single IPv4 jails. */ struct jail32_v0 j32_v0; + bzero(&j, sizeof(struct jail)); error = copyin(uap->jail, &j32_v0, sizeof(struct jail32_v0)); if (error) return (error); - u_path = malloc(MAXPATHLEN + MAXHOSTNAMELEN, M_TEMP, M_WAITOK); - u_hostname = u_path + MAXPATHLEN; - opt.uio_iov = optiov; - opt.uio_iovcnt = 4; - opt.uio_offset = -1; - opt.uio_resid = -1; - opt.uio_segflg = UIO_SYSSPACE; - opt.uio_rw = UIO_READ; - opt.uio_td = td; - optiov[0].iov_base = "path"; - optiov[0].iov_len = sizeof("path"); - optiov[1].iov_base = u_path; - error = copyinstr(PTRIN(j32_v0.path), u_path, MAXPATHLEN, - &optiov[1].iov_len); - if (error) { - free(u_path, M_TEMP); - return (error); - } - optiov[2].iov_base = "host.hostname"; - optiov[2].iov_len = sizeof("host.hostname"); - optiov[3].iov_base = u_hostname; - error = copyinstr(PTRIN(j32_v0.hostname), u_hostname, - MAXHOSTNAMELEN, &optiov[3].iov_len); - if (error) { - free(u_path, M_TEMP); - return (error); - } -#ifdef INET - optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; - optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); - opt.uio_iovcnt++; - optiov[opt.uio_iovcnt].iov_base = &j32_v0.ip_number; - j32_v0.ip_number = htonl(j32_v0.ip_number); - optiov[opt.uio_iovcnt].iov_len = sizeof(j32_v0.ip_number); - opt.uio_iovcnt++; -#endif + CP(j32_v0, j, version); + PTRIN_CP(j32_v0, j, path); + PTRIN_CP(j32_v0, j, hostname); + j.ip4s = j32_v0.ip_number; break; } @@ -2119,109 +2078,18 @@ freebsd32_jail(struct thread *td, struct freebsd32_jail_args *uap) { /* FreeBSD multi-IPv4/IPv6,noIP jails. */ struct jail32 j32; - size_t tmplen; error = copyin(uap->jail, &j32, sizeof(struct jail32)); if (error) return (error); - tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN; -#ifdef INET - if (j32.ip4s > jail_max_af_ips) - return (EINVAL); - tmplen += j32.ip4s * sizeof(struct in_addr); -#else - if (j32.ip4s > 0) - return (EINVAL); -#endif -#ifdef INET6 - if (j32.ip6s > jail_max_af_ips) - return (EINVAL); - tmplen += j32.ip6s * sizeof(struct in6_addr); -#else - if (j32.ip6s > 0) - return (EINVAL); -#endif - u_path = malloc(tmplen, M_TEMP, M_WAITOK); - u_hostname = u_path + MAXPATHLEN; - u_name = u_hostname + MAXHOSTNAMELEN; -#ifdef INET - u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN); -#endif -#ifdef INET6 -#ifdef INET - u_ip6 = (struct in6_addr *)(u_ip4 + j32.ip4s); -#else - u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN); -#endif -#endif - opt.uio_iov = optiov; - opt.uio_iovcnt = 4; - opt.uio_offset = -1; - opt.uio_resid = -1; - opt.uio_segflg = UIO_SYSSPACE; - opt.uio_rw = UIO_READ; - opt.uio_td = td; - optiov[0].iov_base = "path"; - optiov[0].iov_len = sizeof("path"); - optiov[1].iov_base = u_path; - error = copyinstr(PTRIN(j32.path), u_path, MAXPATHLEN, - &optiov[1].iov_len); - if (error) { - free(u_path, M_TEMP); - return (error); - } - optiov[2].iov_base = "host.hostname"; - optiov[2].iov_len = sizeof("host.hostname"); - optiov[3].iov_base = u_hostname; - error = copyinstr(PTRIN(j32.hostname), u_hostname, - MAXHOSTNAMELEN, &optiov[3].iov_len); - if (error) { - free(u_path, M_TEMP); - return (error); - } - if (PTRIN(j32.jailname) != NULL) { - optiov[opt.uio_iovcnt].iov_base = "name"; - optiov[opt.uio_iovcnt].iov_len = sizeof("name"); - opt.uio_iovcnt++; - optiov[opt.uio_iovcnt].iov_base = u_name; - error = copyinstr(PTRIN(j32.jailname), u_name, - MAXHOSTNAMELEN, &optiov[opt.uio_iovcnt].iov_len); - if (error) { - free(u_path, M_TEMP); - return (error); - } - opt.uio_iovcnt++; - } -#ifdef INET - optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; - optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); - opt.uio_iovcnt++; - optiov[opt.uio_iovcnt].iov_base = u_ip4; - optiov[opt.uio_iovcnt].iov_len = - j32.ip4s * sizeof(struct in_addr); - error = copyin(PTRIN(j32.ip4), u_ip4, - optiov[opt.uio_iovcnt].iov_len); - if (error) { - free(u_path, M_TEMP); - return (error); - } - opt.uio_iovcnt++; -#endif -#ifdef INET6 - optiov[opt.uio_iovcnt].iov_base = "ip6.addr"; - optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr"); - opt.uio_iovcnt++; - optiov[opt.uio_iovcnt].iov_base = u_ip6; - optiov[opt.uio_iovcnt].iov_len = - j32.ip6s * sizeof(struct in6_addr); - error = copyin(PTRIN(j32.ip6), u_ip6, - optiov[opt.uio_iovcnt].iov_len); - if (error) { - free(u_path, M_TEMP); - return (error); - } - opt.uio_iovcnt++; -#endif + CP(j32, j, version); + PTRIN_CP(j32, j, path); + PTRIN_CP(j32, j, hostname); + PTRIN_CP(j32, j, jailname); + CP(j32, j, ip4s); + CP(j32, j, ip6s); + PTRIN_CP(j32, j, ip4); + PTRIN_CP(j32, j, ip6); break; } @@ -2229,9 +2097,7 @@ freebsd32_jail(struct thread *td, struct freebsd32_jail_args *uap) /* Sci-Fi jails are not supported, sorry. */ return (EINVAL); } - error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH); - free(u_path, M_TEMP); - return (error); + return (kern_jail(td, &j)); } int diff --git a/sys/compat/linux/linux_mib.c b/sys/compat/linux/linux_mib.c index f3d5ec7a2ca5..58af9c5d62e7 100644 --- a/sys/compat/linux/linux_mib.c +++ b/sys/compat/linux/linux_mib.c @@ -57,16 +57,18 @@ struct linux_prison { int pr_osrel; }; +static struct linux_prison lprison0 = { + .pr_osname = "Linux", + .pr_osrelease = "2.6.16", + .pr_oss_version = 0x030600, + .pr_osrel = 2006016 +}; + static unsigned linux_osd_jail_slot; SYSCTL_NODE(_compat, OID_AUTO, linux, CTLFLAG_RW, 0, "Linux mode"); -static struct mtx osname_lock; -MTX_SYSINIT(linux_osname, &osname_lock, "linux osname", MTX_DEF); - -static char linux_osname[LINUX_MAX_UTSNAME] = "Linux"; - static int linux_sysctl_osname(SYSCTL_HANDLER_ARGS) { @@ -86,9 +88,6 @@ SYSCTL_PROC(_compat_linux, OID_AUTO, osname, 0, 0, linux_sysctl_osname, "A", "Linux kernel OS name"); -static char linux_osrelease[LINUX_MAX_UTSNAME] = "2.6.16"; -static int linux_osrel = 2006016; - static int linux_sysctl_osrelease(SYSCTL_HANDLER_ARGS) { @@ -108,8 +107,6 @@ SYSCTL_PROC(_compat_linux, OID_AUTO, osrelease, 0, 0, linux_sysctl_osrelease, "A", "Linux kernel OS release"); -static int linux_oss_version = 0x030600; - static int linux_sysctl_oss_version(SYSCTL_HANDLER_ARGS) { @@ -161,69 +158,74 @@ linux_map_osrel(char *osrelease, int *osrel) } /* - * Returns holding the prison mutex if return non-NULL. + * Find a prison with Linux info. + * Return the Linux info and the (locked) prison. */ static struct linux_prison * -linux_get_prison(struct thread *td, struct prison **prp) +linux_find_prison(struct prison *spr, struct prison **prp) { struct prison *pr; struct linux_prison *lpr; - KASSERT(td == curthread, ("linux_get_prison() called on !curthread")); - *prp = pr = td->td_ucred->cr_prison; - if (pr == NULL || !linux_osd_jail_slot) - return (NULL); - mtx_lock(&pr->pr_mtx); - lpr = osd_jail_get(pr, linux_osd_jail_slot); - if (lpr == NULL) + if (!linux_osd_jail_slot) + /* In case osd_register failed. */ + spr = &prison0; + for (pr = spr;; pr = pr->pr_parent) { + mtx_lock(&pr->pr_mtx); + lpr = (pr == &prison0) + ? &lprison0 + : osd_jail_get(pr, linux_osd_jail_slot); + if (lpr != NULL) + break; mtx_unlock(&pr->pr_mtx); + } + *prp = pr; return (lpr); } /* - * Ensure a prison has its own Linux info. The prison should be locked on - * entrance and will be locked on exit (though it may get unlocked in the - * interrim). + * Ensure a prison has its own Linux info. If lprp is non-null, point it to + * the Linux info and lock the prison. */ static int linux_alloc_prison(struct prison *pr, struct linux_prison **lprp) { + struct prison *ppr; struct linux_prison *lpr, *nlpr; int error; /* If this prison already has Linux info, return that. */ error = 0; - mtx_assert(&pr->pr_mtx, MA_OWNED); - lpr = osd_jail_get(pr, linux_osd_jail_slot); - if (lpr != NULL) + lpr = linux_find_prison(pr, &ppr); + if (ppr == pr) goto done; /* * Allocate a new info record. Then check again, in case something * changed during the allocation. */ - mtx_unlock(&pr->pr_mtx); + mtx_unlock(&ppr->pr_mtx); nlpr = malloc(sizeof(struct linux_prison), M_PRISON, M_WAITOK); - mtx_lock(&pr->pr_mtx); - lpr = osd_jail_get(pr, linux_osd_jail_slot); - if (lpr != NULL) { + lpr = linux_find_prison(pr, &ppr); + if (ppr == pr) { free(nlpr, M_PRISON); goto done; } + /* Inherit the initial values from the ancestor. */ + mtx_lock(&pr->pr_mtx); error = osd_jail_set(pr, linux_osd_jail_slot, nlpr); - if (error) - free(nlpr, M_PRISON); - else { + if (error == 0) { + bcopy(lpr, nlpr, sizeof(*lpr)); lpr = nlpr; - mtx_lock(&osname_lock); - strncpy(lpr->pr_osname, linux_osname, LINUX_MAX_UTSNAME); - strncpy(lpr->pr_osrelease, linux_osrelease, LINUX_MAX_UTSNAME); - lpr->pr_oss_version = linux_oss_version; - lpr->pr_osrel = linux_osrel; - mtx_unlock(&osname_lock); + } else { + free(nlpr, M_PRISON); + lpr = NULL; } -done: + mtx_unlock(&ppr->pr_mtx); + done: if (lprp != NULL) *lprp = lpr; + else + mtx_unlock(&pr->pr_mtx); return (error); } @@ -233,7 +235,6 @@ linux_alloc_prison(struct prison *pr, struct linux_prison **lprp) static int linux_prison_create(void *obj, void *data) { - int error; struct prison *pr = obj; struct vfsoptlist *opts = data; @@ -243,10 +244,7 @@ linux_prison_create(void *obj, void *data) * Inherit a prison's initial values from its parent * (different from NULL which also inherits changes). */ - mtx_lock(&pr->pr_mtx); - error = linux_alloc_prison(pr, NULL); - mtx_unlock(&pr->pr_mtx); - return (error); + return linux_alloc_prison(pr, NULL); } static int @@ -254,7 +252,7 @@ linux_prison_check(void *obj __unused, void *data) { struct vfsoptlist *opts = data; char *osname, *osrelease; - int error, len, oss_version; + int error, len, osrel, oss_version; /* Check that the parameters are correct. */ (void)vfs_flagopt(opts, "linux", NULL, 0); @@ -280,6 +278,11 @@ linux_prison_check(void *obj __unused, void *data) vfs_opterror(opts, "linux.osrelease too long"); return (ENAMETOOLONG); } + error = linux_map_osrel(osrelease, &osrel); + if (error != 0) { + vfs_opterror(opts, "linux.osrelease format error"); + return (error); + } } error = vfs_copyopt(opts, "linux.oss_version", &oss_version, sizeof(oss_version)); @@ -310,7 +313,7 @@ linux_prison_set(void *obj, void *data) yeslinux = 1; error = vfs_copyopt(opts, "linux.oss_version", &oss_version, sizeof(oss_version)); - gotversion = error == 0; + gotversion = (error == 0); yeslinux |= gotversion; if (nolinux) { /* "nolinux": inherit the parent's Linux info. */ @@ -322,7 +325,6 @@ linux_prison_set(void *obj, void *data) * "linux" or "linux.*": * the prison gets its own Linux info. */ - mtx_lock(&pr->pr_mtx); error = linux_alloc_prison(pr, &lpr); if (error) { mtx_unlock(&pr->pr_mtx); @@ -360,14 +362,16 @@ static int linux_prison_get(void *obj, void *data) { struct linux_prison *lpr; + struct prison *ppr; struct prison *pr = obj; struct vfsoptlist *opts = data; int error, i; - mtx_lock(&pr->pr_mtx); - /* Tell whether this prison has its own Linux info. */ - lpr = osd_jail_get(pr, linux_osd_jail_slot); - i = lpr != NULL; + static int version0; + + /* See if this prison is the one with the Linux info. */ + lpr = linux_find_prison(pr, &ppr); + i = (ppr == pr); error = vfs_setopt(opts, "linux", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; @@ -375,11 +379,22 @@ linux_prison_get(void *obj, void *data) error = vfs_setopt(opts, "nolinux", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done; - /* - * It's kind of bogus to give the root info, but leave it to the caller - * to check the above flag. - */ - if (lpr != NULL) { + if (i) { + /* + * If this prison is inheriting its Linux info, report + * empty/zero parameters. + */ + error = vfs_setopts(opts, "linux.osname", ""); + if (error != 0 && error != ENOENT) + goto done; + error = vfs_setopts(opts, "linux.osrelease", ""); + if (error != 0 && error != ENOENT) + goto done; + error = vfs_setopt(opts, "linux.oss_version", &version0, + sizeof(lpr->pr_oss_version)); + if (error != 0 && error != ENOENT) + goto done; + } else { error = vfs_setopts(opts, "linux.osname", lpr->pr_osname); if (error != 0 && error != ENOENT) goto done; @@ -390,24 +405,11 @@ linux_prison_get(void *obj, void *data) &lpr->pr_oss_version, sizeof(lpr->pr_oss_version)); if (error != 0 && error != ENOENT) goto done; - } else { - mtx_lock(&osname_lock); - error = vfs_setopts(opts, "linux.osname", linux_osname); - if (error != 0 && error != ENOENT) - goto done; - error = vfs_setopts(opts, "linux.osrelease", linux_osrelease); - if (error != 0 && error != ENOENT) - goto done; - error = vfs_setopt(opts, "linux.oss_version", - &linux_oss_version, sizeof(linux_oss_version)); - if (error != 0 && error != ENOENT) - goto done; - mtx_unlock(&osname_lock); } error = 0; done: - mtx_unlock(&pr->pr_mtx); + mtx_unlock(&ppr->pr_mtx); return (error); } @@ -434,11 +436,8 @@ linux_osd_jail_register(void) if (linux_osd_jail_slot > 0) { /* Copy the system linux info to any current prisons. */ sx_xlock(&allprison_lock); - TAILQ_FOREACH(pr, &allprison, pr_list) { - mtx_lock(&pr->pr_mtx); + TAILQ_FOREACH(pr, &allprison, pr_list) (void)linux_alloc_prison(pr, NULL); - mtx_unlock(&pr->pr_mtx); - } sx_xunlock(&allprison_lock); } } @@ -457,15 +456,9 @@ linux_get_osname(struct thread *td, char *dst) struct prison *pr; struct linux_prison *lpr; - lpr = linux_get_prison(td, &pr); - if (lpr != NULL) { - bcopy(lpr->pr_osname, dst, LINUX_MAX_UTSNAME); - mtx_unlock(&pr->pr_mtx); - } else { - mtx_lock(&osname_lock); - bcopy(linux_osname, dst, LINUX_MAX_UTSNAME); - mtx_unlock(&osname_lock); - } + lpr = linux_find_prison(td->td_ucred->cr_prison, &pr); + bcopy(lpr->pr_osname, dst, LINUX_MAX_UTSNAME); + mtx_unlock(&pr->pr_mtx); } int @@ -474,16 +467,9 @@ linux_set_osname(struct thread *td, char *osname) struct prison *pr; struct linux_prison *lpr; - lpr = linux_get_prison(td, &pr); - if (lpr != NULL) { - strlcpy(lpr->pr_osname, osname, LINUX_MAX_UTSNAME); - mtx_unlock(&pr->pr_mtx); - } else { - mtx_lock(&osname_lock); - strcpy(linux_osname, osname); - mtx_unlock(&osname_lock); - } - + lpr = linux_find_prison(td->td_ucred->cr_prison, &pr); + strlcpy(lpr->pr_osname, osname, LINUX_MAX_UTSNAME); + mtx_unlock(&pr->pr_mtx); return (0); } @@ -493,15 +479,9 @@ linux_get_osrelease(struct thread *td, char *dst) struct prison *pr; struct linux_prison *lpr; - lpr = linux_get_prison(td, &pr); - if (lpr != NULL) { - bcopy(lpr->pr_osrelease, dst, LINUX_MAX_UTSNAME); - mtx_unlock(&pr->pr_mtx); - } else { - mtx_lock(&osname_lock); - bcopy(linux_osrelease, dst, LINUX_MAX_UTSNAME); - mtx_unlock(&osname_lock); - } + lpr = linux_find_prison(td->td_ucred->cr_prison, &pr); + bcopy(lpr->pr_osrelease, dst, LINUX_MAX_UTSNAME); + mtx_unlock(&pr->pr_mtx); } int @@ -511,12 +491,9 @@ linux_kernver(struct thread *td) struct linux_prison *lpr; int osrel; - lpr = linux_get_prison(td, &pr); - if (lpr != NULL) { - osrel = lpr->pr_osrel; - mtx_unlock(&pr->pr_mtx); - } else - osrel = linux_osrel; + lpr = linux_find_prison(td->td_ucred->cr_prison, &pr); + osrel = lpr->pr_osrel; + mtx_unlock(&pr->pr_mtx); return (osrel); } @@ -527,27 +504,12 @@ linux_set_osrelease(struct thread *td, char *osrelease) struct linux_prison *lpr; int error; - lpr = linux_get_prison(td, &pr); - if (lpr != NULL) { - error = linux_map_osrel(osrelease, &lpr->pr_osrel); - if (error) { - mtx_unlock(&pr->pr_mtx); - return (error); - } + lpr = linux_find_prison(td->td_ucred->cr_prison, &pr); + error = linux_map_osrel(osrelease, &lpr->pr_osrel); + if (error == 0) strlcpy(lpr->pr_osrelease, osrelease, LINUX_MAX_UTSNAME); - mtx_unlock(&pr->pr_mtx); - } else { - mtx_lock(&osname_lock); - error = linux_map_osrel(osrelease, &linux_osrel); - if (error) { - mtx_unlock(&osname_lock); - return (error); - } - strcpy(linux_osrelease, osrelease); - mtx_unlock(&osname_lock); - } - - return (0); + mtx_unlock(&pr->pr_mtx); + return (error); } int @@ -557,12 +519,9 @@ linux_get_oss_version(struct thread *td) struct linux_prison *lpr; int version; - lpr = linux_get_prison(td, &pr); - if (lpr != NULL) { - version = lpr->pr_oss_version; - mtx_unlock(&pr->pr_mtx); - } else - version = linux_oss_version; + lpr = linux_find_prison(td->td_ucred->cr_prison, &pr); + version = lpr->pr_oss_version; + mtx_unlock(&pr->pr_mtx); return (version); } @@ -572,16 +531,9 @@ linux_set_oss_version(struct thread *td, int oss_version) struct prison *pr; struct linux_prison *lpr; - lpr = linux_get_prison(td, &pr); - if (lpr != NULL) { - lpr->pr_oss_version = oss_version; - mtx_unlock(&pr->pr_mtx); - } else { - mtx_lock(&osname_lock); - linux_oss_version = oss_version; - mtx_unlock(&osname_lock); - } - + lpr = linux_find_prison(td->td_ucred->cr_prison, &pr); + lpr->pr_oss_version = oss_version; + mtx_unlock(&pr->pr_mtx); return (0); } diff --git a/sys/contrib/ipfilter/netinet/ip_fil_freebsd.c b/sys/contrib/ipfilter/netinet/ip_fil_freebsd.c index fee97c581fcc..3e0e63321919 100644 --- a/sys/contrib/ipfilter/netinet/ip_fil_freebsd.c +++ b/sys/contrib/ipfilter/netinet/ip_fil_freebsd.c @@ -318,8 +318,10 @@ int iplioctl(dev, cmd, data, mode # if (__FreeBSD_version >= 500024) struct thread *p; # if (__FreeBSD_version >= 500043) +# define p_cred td_ucred # define p_uid td_ucred->cr_ruid # else +# define p_cred t_proc->p_cred # define p_uid t_proc->p_cred->p_ruid # endif # else @@ -342,7 +344,11 @@ int mode; SPL_INT(s); #if (BSD >= 199306) && defined(_KERNEL) +# if (__FreeBSD_version >= 500034) + if (securelevel_ge(p->p_cred, 3) && (mode & FWRITE)) +# else if ((securelevel >= 3) && (mode & FWRITE)) +# endif return EPERM; #endif diff --git a/sys/contrib/ipfilter/netinet/ip_nat.c b/sys/contrib/ipfilter/netinet/ip_nat.c index d6f0b55a187a..f790c7d1ec3b 100644 --- a/sys/contrib/ipfilter/netinet/ip_nat.c +++ b/sys/contrib/ipfilter/netinet/ip_nat.c @@ -662,7 +662,11 @@ void *ctx; return EPERM; } # else +# if defined(__FreeBSD_version) && (__FreeBSD_version >= 500034) + if (securelevel_ge(curthread->td_ucred, 3) && (mode & FWRITE)) { +# else if ((securelevel >= 3) && (mode & FWRITE)) { +# endif return EPERM; } # endif diff --git a/sys/fs/procfs/procfs_status.c b/sys/fs/procfs/procfs_status.c index a1eefb48bd56..80d739285600 100644 --- a/sys/fs/procfs/procfs_status.c +++ b/sys/fs/procfs/procfs_status.c @@ -151,10 +151,11 @@ procfs_doprocstatus(PFS_FILL_ARGS) sbuf_printf(sb, ",%lu", (u_long)cr->cr_groups[i]); } - if (jailed(p->p_ucred)) { - mtx_lock(&p->p_ucred->cr_prison->pr_mtx); - sbuf_printf(sb, " %s", p->p_ucred->cr_prison->pr_host); - mtx_unlock(&p->p_ucred->cr_prison->pr_mtx); + if (jailed(cr)) { + mtx_lock(&cr->cr_prison->pr_mtx); + sbuf_printf(sb, " %s", + prison_name(td->td_ucred->cr_prison, cr->cr_prison)); + mtx_unlock(&cr->cr_prison->pr_mtx); } else { sbuf_printf(sb, " -"); } diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index 97b7f0fcf578..1f412c6f2749 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -436,6 +437,7 @@ proc0_init(void *dummy __unused) td->td_oncpu = 0; td->td_flags = TDF_INMEM|TDP_KTHREAD; td->td_cpuset = cpuset_thread0(); + prison0.pr_cpuset = cpuset_ref(td->td_cpuset); p->p_peers = 0; p->p_leader = p; @@ -452,7 +454,7 @@ proc0_init(void *dummy __unused) p->p_ucred->cr_ngroups = 1; /* group 0 */ p->p_ucred->cr_uidinfo = uifind(0); p->p_ucred->cr_ruidinfo = uifind(0); - p->p_ucred->cr_prison = NULL; /* Don't jail it. */ + p->p_ucred->cr_prison = &prison0; #ifdef VIMAGE KASSERT(LIST_FIRST(&vimage_head) != NULL, ("vimage_head empty")); P_TO_VIMAGE(p) = LIST_FIRST(&vimage_head); /* set ucred->cr_vimage */ diff --git a/sys/kern/kern_cpuset.c b/sys/kern/kern_cpuset.c index fc2e51b246c8..ed8c311845e5 100644 --- a/sys/kern/kern_cpuset.c +++ b/sys/kern/kern_cpuset.c @@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -53,7 +54,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include /* Must come after sys/proc.h */ #include @@ -225,23 +225,16 @@ cpuset_lookup(cpusetid_t setid, struct thread *td) KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__)); if (set != NULL && jailed(td->td_ucred)) { - struct cpuset *rset, *jset; - struct prison *pr; + struct cpuset *jset, *tset; - rset = cpuset_refroot(set); - - pr = td->td_ucred->cr_prison; - mtx_lock(&pr->pr_mtx); - cpuset_ref(pr->pr_cpuset); - jset = pr->pr_cpuset; - mtx_unlock(&pr->pr_mtx); - - if (jset->cs_id != rset->cs_id) { + jset = td->td_ucred->cr_prison->pr_cpuset; + for (tset = set; tset != NULL; tset = tset->cs_parent) + if (tset == jset) + break; + if (tset == NULL) { cpuset_rel(set); set = NULL; } - cpuset_rel(jset); - cpuset_rel(rset); } return (set); @@ -456,25 +449,14 @@ cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp, struct prison *pr; sx_slock(&allprison_lock); - pr = prison_find(id); + pr = prison_find_child(curthread->td_ucred->cr_prison, id); sx_sunlock(&allprison_lock); if (pr == NULL) return (ESRCH); - if (jailed(curthread->td_ucred)) { - if (curthread->td_ucred->cr_prison == pr) { - cpuset_ref(pr->pr_cpuset); - set = pr->pr_cpuset; - } - } else { - cpuset_ref(pr->pr_cpuset); - set = pr->pr_cpuset; - } + cpuset_ref(pr->pr_cpuset); + *setp = pr->pr_cpuset; mtx_unlock(&pr->pr_mtx); - if (set) { - *setp = set; - return (0); - } - return (ESRCH); + return (0); } case CPU_WHICH_IRQ: return (0); @@ -731,21 +713,15 @@ cpuset_thread0(void) * In case of no error, returns the set in *setp locked with a reference. */ int -cpuset_create_root(struct thread *td, struct cpuset **setp) +cpuset_create_root(struct prison *pr, struct cpuset **setp) { - struct cpuset *root; struct cpuset *set; int error; - KASSERT(td != NULL, ("[%s:%d] invalid td", __func__, __LINE__)); + KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__)); KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__)); - thread_lock(td); - root = cpuset_refroot(td->td_cpuset); - thread_unlock(td); - - error = cpuset_create(setp, td->td_cpuset, &root->cs_mask); - cpuset_rel(root); + error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask); if (error) return (error); diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 4553bb40e469..c93d05a64d32 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -2416,24 +2416,25 @@ dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, } /* - * Scan all active processes to see if any of them have a current or root - * directory of `olddp'. If so, replace them with the new mount point. + * Scan all active processes and prisons to see if any of them have a current + * or root directory of `olddp'. If so, replace them with the new mount point. */ void mountcheckdirs(struct vnode *olddp, struct vnode *newdp) { struct filedesc *fdp; + struct prison *pr; struct proc *p; int nrele; if (vrefcnt(olddp) == 1) return; + nrele = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { fdp = fdhold(p); if (fdp == NULL) continue; - nrele = 0; FILEDESC_XLOCK(fdp); if (fdp->fd_cdir == olddp) { vref(newdp); @@ -2445,17 +2446,40 @@ mountcheckdirs(struct vnode *olddp, struct vnode *newdp) fdp->fd_rdir = newdp; nrele++; } + if (fdp->fd_jdir == olddp) { + vref(newdp); + fdp->fd_jdir = newdp; + nrele++; + } FILEDESC_XUNLOCK(fdp); fddrop(fdp); - while (nrele--) - vrele(olddp); } sx_sunlock(&allproc_lock); if (rootvnode == olddp) { - vrele(rootvnode); vref(newdp); rootvnode = newdp; + nrele++; } + mtx_lock(&prison0.pr_mtx); + if (prison0.pr_root == olddp) { + vref(newdp); + prison0.pr_root = newdp; + nrele++; + } + mtx_unlock(&prison0.pr_mtx); + sx_slock(&allprison_lock); + TAILQ_FOREACH(pr, &allprison, pr_list) { + mtx_lock(&pr->pr_mtx); + if (pr->pr_root == olddp) { + vref(newdp); + pr->pr_root = newdp; + nrele++; + } + mtx_unlock(&pr->pr_mtx); + } + sx_sunlock(&allprison_lock); + while (nrele--) + vrele(olddp); } struct filedesc_to_leader * diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index 89b92c61a1a1..575fe9b71e1b 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -455,9 +455,8 @@ exit1(struct thread *td, int rv) p->p_xstat = rv; p->p_xthread = td; - /* In case we are jailed tell the prison that we are gone. */ - if (jailed(p->p_ucred)) - prison_proc_free(p->p_ucred->cr_prison); + /* Tell the prison that we are gone. */ + prison_proc_free(p->p_ucred->cr_prison); #ifdef KDTRACE_HOOKS /* diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 4e8dbb19ee2f..0c16c3ffa7f5 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -46,6 +46,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -54,7 +55,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include #include #include @@ -458,9 +458,8 @@ fork1(td, flags, pages, procp) p2->p_ucred = crhold(td->td_ucred); - /* In case we are jailed tell the prison that we exist. */ - if (jailed(p2->p_ucred)) - prison_proc_hold(p2->p_ucred->cr_prison); + /* Tell the prison that we exist. */ + prison_proc_hold(p2->p_ucred->cr_prison); PROC_UNLOCK(p2); diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c index d0cc440dd7d2..b12a478e153f 100644 --- a/sys/kern/kern_jail.c +++ b/sys/kern/kern_jail.c @@ -41,6 +41,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -48,7 +49,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include #include #include @@ -71,66 +71,38 @@ __FBSDID("$FreeBSD$"); MALLOC_DEFINE(M_PRISON, "prison", "Prison structures"); -SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0, - "Jail rules"); +/* prison0 describes what is "real" about the system. */ +struct prison prison0 = { + .pr_id = 0, + .pr_name = "0", + .pr_ref = 1, + .pr_uref = 1, + .pr_path = "/", + .pr_securelevel = -1, + .pr_children = LIST_HEAD_INITIALIZER(&prison0.pr_children), + .pr_allow = PR_ALLOW_ALL, +}; +MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF); -int jail_set_hostname_allowed = 1; -SYSCTL_INT(_security_jail, OID_AUTO, set_hostname_allowed, CTLFLAG_RW, - &jail_set_hostname_allowed, 0, - "Processes in jail can set their hostnames"); - -int jail_socket_unixiproute_only = 1; -SYSCTL_INT(_security_jail, OID_AUTO, socket_unixiproute_only, CTLFLAG_RW, - &jail_socket_unixiproute_only, 0, - "Processes in jail are limited to creating UNIX/IP/route sockets only"); - -int jail_sysvipc_allowed = 0; -SYSCTL_INT(_security_jail, OID_AUTO, sysvipc_allowed, CTLFLAG_RW, - &jail_sysvipc_allowed, 0, - "Processes in jail can use System V IPC primitives"); - -static int jail_enforce_statfs = 2; -SYSCTL_INT(_security_jail, OID_AUTO, enforce_statfs, CTLFLAG_RW, - &jail_enforce_statfs, 0, - "Processes in jail cannot see all mounted file systems"); - -int jail_allow_raw_sockets = 0; -SYSCTL_INT(_security_jail, OID_AUTO, allow_raw_sockets, CTLFLAG_RW, - &jail_allow_raw_sockets, 0, - "Prison root can create raw sockets"); - -int jail_chflags_allowed = 0; -SYSCTL_INT(_security_jail, OID_AUTO, chflags_allowed, CTLFLAG_RW, - &jail_chflags_allowed, 0, - "Processes in jail can alter system file flags"); - -int jail_mount_allowed = 0; -SYSCTL_INT(_security_jail, OID_AUTO, mount_allowed, CTLFLAG_RW, - &jail_mount_allowed, 0, - "Processes in jail can mount/unmount jail-friendly file systems"); - -int jail_max_af_ips = 255; -SYSCTL_INT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, - &jail_max_af_ips, 0, - "Number of IP addresses a jail may have at most per address family"); - -/* allprison, lastprid, and prisoncount are protected by allprison_lock. */ +/* allprison and lastprid are protected by allprison_lock. */ struct sx allprison_lock; SX_SYSINIT(allprison_lock, &allprison_lock, "allprison"); struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison); int lastprid = 0; -int prisoncount = 0; static int do_jail_attach(struct thread *td, struct prison *pr); static void prison_complete(void *context, int pending); static void prison_deref(struct prison *pr, int flags); +static char *prison_path(struct prison *pr1, struct prison *pr2); +static void prison_remove_one(struct prison *pr); #ifdef INET static int _prison_check_ip4(struct prison *pr, struct in_addr *ia); +static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4); #endif #ifdef INET6 static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6); +static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6); #endif -static int sysctl_jail_list(SYSCTL_HANDLER_ARGS); /* Flags for prison_deref */ #define PD_DEREF 0x01 @@ -139,6 +111,58 @@ static int sysctl_jail_list(SYSCTL_HANDLER_ARGS); #define PD_LIST_SLOCKED 0x08 #define PD_LIST_XLOCKED 0x10 +/* + * Parameter names corresponding to PR_* flag values + */ +static char *pr_flag_names[] = { + [0] = "persist", +#ifdef INET + [2] = "ip4", +#endif +#ifdef INET6 + [3] = "ip6", +#endif +}; + +static char *pr_flag_nonames[] = { + [0] = "nopersist", +#ifdef INET + [2] = "noip4", +#endif +#ifdef INET6 + [3] = "noip6", +#endif +}; + +static char *pr_allow_names[] = { + "allow.set_hostname", + "allow.sysvipc", + "allow.raw_sockets", + "allow.chflags", + "allow.mount", + "allow.quotas", + "allow.jails", + "allow.socket_af", +}; + +static char *pr_allow_nonames[] = { + "allow.noset_hostname", + "allow.nosysvipc", + "allow.noraw_sockets", + "allow.nochflags", + "allow.nomount", + "allow.noquotas", + "allow.nojails", + "allow.nosocket_af", +}; + +#define JAIL_DEFAULT_ALLOW PR_ALLOW_SET_HOSTNAME +static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW; +static int jail_default_enforce_statfs = 2; +#if defined(INET) || defined(INET6) +static int jail_max_af_ips = 255; +#endif + #ifdef INET static int qcmp_v4(const void *ip1, const void *ip2) @@ -195,17 +219,9 @@ qcmp_v6(const void *ip1, const void *ip2) int jail(struct thread *td, struct jail_args *uap) { - struct iovec optiov[10]; - struct uio opt; - char *u_path, *u_hostname, *u_name; -#ifdef INET - struct in_addr *u_ip4; -#endif -#ifdef INET6 - struct in6_addr *u_ip6; -#endif uint32_t version; int error; + struct jail j; error = copyin(uap->jail, &version, sizeof(uint32_t)); if (error) @@ -214,48 +230,17 @@ jail(struct thread *td, struct jail_args *uap) switch (version) { case 0: { - /* FreeBSD single IPv4 jails. */ struct jail_v0 j0; + /* FreeBSD single IPv4 jails. */ + bzero(&j, sizeof(struct jail)); error = copyin(uap->jail, &j0, sizeof(struct jail_v0)); if (error) return (error); - u_path = malloc(MAXPATHLEN + MAXHOSTNAMELEN, M_TEMP, M_WAITOK); - u_hostname = u_path + MAXPATHLEN; - opt.uio_iov = optiov; - opt.uio_iovcnt = 4; - opt.uio_offset = -1; - opt.uio_resid = -1; - opt.uio_segflg = UIO_SYSSPACE; - opt.uio_rw = UIO_READ; - opt.uio_td = td; - optiov[0].iov_base = "path"; - optiov[0].iov_len = sizeof("path"); - optiov[1].iov_base = u_path; - error = - copyinstr(j0.path, u_path, MAXPATHLEN, &optiov[1].iov_len); - if (error) { - free(u_path, M_TEMP); - return (error); - } - optiov[2].iov_base = "host.hostname"; - optiov[2].iov_len = sizeof("host.hostname"); - optiov[3].iov_base = u_hostname; - error = copyinstr(j0.hostname, u_hostname, MAXHOSTNAMELEN, - &optiov[3].iov_len); - if (error) { - free(u_path, M_TEMP); - return (error); - } -#ifdef INET - optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; - optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); - opt.uio_iovcnt++; - optiov[opt.uio_iovcnt].iov_base = &j0.ip_number; - j0.ip_number = htonl(j0.ip_number); - optiov[opt.uio_iovcnt].iov_len = sizeof(j0.ip_number); - opt.uio_iovcnt++; -#endif + j.version = j0.version; + j.path = j0.path; + j.hostname = j0.hostname; + j.ip4s = j0.ip_number; break; } @@ -267,122 +252,168 @@ jail(struct thread *td, struct jail_args *uap) return (EINVAL); case 2: /* JAIL_API_VERSION */ - { /* FreeBSD multi-IPv4/IPv6,noIP jails. */ - struct jail j; - size_t tmplen; - error = copyin(uap->jail, &j, sizeof(struct jail)); if (error) return (error); - tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN; -#ifdef INET - if (j.ip4s > jail_max_af_ips) - return (EINVAL); - tmplen += j.ip4s * sizeof(struct in_addr); -#else - if (j.ip4s > 0) - return (EINVAL); -#endif -#ifdef INET6 - if (j.ip6s > jail_max_af_ips) - return (EINVAL); - tmplen += j.ip6s * sizeof(struct in6_addr); -#else - if (j.ip6s > 0) - return (EINVAL); -#endif - u_path = malloc(tmplen, M_TEMP, M_WAITOK); - u_hostname = u_path + MAXPATHLEN; - u_name = u_hostname + MAXHOSTNAMELEN; -#ifdef INET - u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN); -#endif -#ifdef INET6 -#ifdef INET - u_ip6 = (struct in6_addr *)(u_ip4 + j.ip4s); -#else - u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN); -#endif -#endif - opt.uio_iov = optiov; - opt.uio_iovcnt = 4; - opt.uio_offset = -1; - opt.uio_resid = -1; - opt.uio_segflg = UIO_SYSSPACE; - opt.uio_rw = UIO_READ; - opt.uio_td = td; - optiov[0].iov_base = "path"; - optiov[0].iov_len = sizeof("path"); - optiov[1].iov_base = u_path; - error = - copyinstr(j.path, u_path, MAXPATHLEN, &optiov[1].iov_len); - if (error) { - free(u_path, M_TEMP); - return (error); - } - optiov[2].iov_base = "host.hostname"; - optiov[2].iov_len = sizeof("host.hostname"); - optiov[3].iov_base = u_hostname; - error = copyinstr(j.hostname, u_hostname, MAXHOSTNAMELEN, - &optiov[3].iov_len); - if (error) { - free(u_path, M_TEMP); - return (error); - } - if (j.jailname != NULL) { - optiov[opt.uio_iovcnt].iov_base = "name"; - optiov[opt.uio_iovcnt].iov_len = sizeof("name"); - opt.uio_iovcnt++; - optiov[opt.uio_iovcnt].iov_base = u_name; - error = copyinstr(j.jailname, u_name, MAXHOSTNAMELEN, - &optiov[opt.uio_iovcnt].iov_len); - if (error) { - free(u_path, M_TEMP); - return (error); - } - opt.uio_iovcnt++; - } -#ifdef INET - optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; - optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); - opt.uio_iovcnt++; - optiov[opt.uio_iovcnt].iov_base = u_ip4; - optiov[opt.uio_iovcnt].iov_len = - j.ip4s * sizeof(struct in_addr); - error = copyin(j.ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len); - if (error) { - free(u_path, M_TEMP); - return (error); - } - opt.uio_iovcnt++; -#endif -#ifdef INET6 - optiov[opt.uio_iovcnt].iov_base = "ip6.addr"; - optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr"); - opt.uio_iovcnt++; - optiov[opt.uio_iovcnt].iov_base = u_ip6; - optiov[opt.uio_iovcnt].iov_len = - j.ip6s * sizeof(struct in6_addr); - error = copyin(j.ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len); - if (error) { - free(u_path, M_TEMP); - return (error); - } - opt.uio_iovcnt++; -#endif break; - } default: /* Sci-Fi jails are not supported, sorry. */ return (EINVAL); } + return (kern_jail(td, &j)); +} + +int +kern_jail(struct thread *td, struct jail *j) +{ + struct iovec optiov[24]; + struct uio opt; + char *u_path, *u_hostname, *u_name; +#ifdef INET + int ip4s; + struct in_addr *u_ip4; +#endif +#ifdef INET6 + struct in6_addr *u_ip6; +#endif + size_t tmplen; + int error, enforce_statfs, fi; + + bzero(&optiov, sizeof(optiov)); + opt.uio_iov = optiov; + opt.uio_iovcnt = 0; + opt.uio_offset = -1; + opt.uio_resid = -1; + opt.uio_segflg = UIO_SYSSPACE; + opt.uio_rw = UIO_READ; + opt.uio_td = td; + + /* Set permissions for top-level jails from sysctls. */ + if (!jailed(td->td_ucred)) { + for (fi = 0; fi < sizeof(pr_allow_names) / + sizeof(pr_allow_names[0]); fi++) { + optiov[opt.uio_iovcnt].iov_base = + (jail_default_allow & (1 << fi)) + ? pr_allow_names[fi] : pr_allow_nonames[fi]; + optiov[opt.uio_iovcnt].iov_len = + strlen(optiov[opt.uio_iovcnt].iov_base) + 1; + opt.uio_iovcnt += 2; + } + optiov[opt.uio_iovcnt].iov_base = "enforce_statfs"; + optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs"); + opt.uio_iovcnt++; + enforce_statfs = jail_default_enforce_statfs; + optiov[opt.uio_iovcnt].iov_base = &enforce_statfs; + optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs); + opt.uio_iovcnt++; + } + + tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN; +#ifdef INET + ip4s = (j->version == 0) ? 1 : j->ip4s; + if (ip4s > jail_max_af_ips) + return (EINVAL); + tmplen += ip4s * sizeof(struct in_addr); +#else + if (j->ip4s > 0) + return (EINVAL); +#endif +#ifdef INET6 + if (j->ip6s > jail_max_af_ips) + return (EINVAL); + tmplen += j->ip6s * sizeof(struct in6_addr); +#else + if (j->ip6s > 0) + return (EINVAL); +#endif + u_path = malloc(tmplen, M_TEMP, M_WAITOK); + u_hostname = u_path + MAXPATHLEN; + u_name = u_hostname + MAXHOSTNAMELEN; +#ifdef INET + u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN); +#endif +#ifdef INET6 +#ifdef INET + u_ip6 = (struct in6_addr *)(u_ip4 + ip4s); +#else + u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN); +#endif +#endif + optiov[opt.uio_iovcnt].iov_base = "path"; + optiov[opt.uio_iovcnt].iov_len = sizeof("path"); + opt.uio_iovcnt++; + optiov[opt.uio_iovcnt].iov_base = u_path; + error = copyinstr(j->path, u_path, MAXPATHLEN, + &optiov[opt.uio_iovcnt].iov_len); + if (error) { + free(u_path, M_TEMP); + return (error); + } + opt.uio_iovcnt++; + optiov[opt.uio_iovcnt].iov_base = "host.hostname"; + optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname"); + opt.uio_iovcnt++; + optiov[opt.uio_iovcnt].iov_base = u_hostname; + error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN, + &optiov[opt.uio_iovcnt].iov_len); + if (error) { + free(u_path, M_TEMP); + return (error); + } + opt.uio_iovcnt++; + if (j->jailname != NULL) { + optiov[opt.uio_iovcnt].iov_base = "name"; + optiov[opt.uio_iovcnt].iov_len = sizeof("name"); + opt.uio_iovcnt++; + optiov[opt.uio_iovcnt].iov_base = u_name; + error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN, + &optiov[opt.uio_iovcnt].iov_len); + if (error) { + free(u_path, M_TEMP); + return (error); + } + opt.uio_iovcnt++; + } +#ifdef INET + optiov[opt.uio_iovcnt].iov_base = "ip4.addr"; + optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr"); + opt.uio_iovcnt++; + optiov[opt.uio_iovcnt].iov_base = u_ip4; + optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr); + if (j->version == 0) + u_ip4->s_addr = j->ip4s; + else { + error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len); + if (error) { + free(u_path, M_TEMP); + return (error); + } + } + opt.uio_iovcnt++; +#endif +#ifdef INET6 + optiov[opt.uio_iovcnt].iov_base = "ip6.addr"; + optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr"); + opt.uio_iovcnt++; + optiov[opt.uio_iovcnt].iov_base = u_ip6; + optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr); + error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len); + if (error) { + free(u_path, M_TEMP); + return (error); + } + opt.uio_iovcnt++; +#endif + KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]), + ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt)); error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH); free(u_path, M_TEMP); return (error); } + /* * struct jail_set_args { * struct iovec *iovp; @@ -420,23 +451,27 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) #endif struct vfsopt *opt; struct vfsoptlist *opts; - struct prison *pr, *deadpr, *tpr; + struct prison *pr, *deadpr, *mypr, *ppr, *tpr; struct vnode *root; char *errmsg, *host, *name, *p, *path; +#if defined(INET) || defined(INET6) void *op; - int created, cuflags, error, errmsg_len, errmsg_pos; - int gotslevel, jid, len; +#endif + size_t namelen, onamelen; + int created, cuflags, descend, enforce, error, errmsg_len, errmsg_pos; + int gotenforce, gotslevel, fi, jid, len; int slevel, vfslocked; #if defined(INET) || defined(INET6) - int ii; + int ii, ij; #endif #ifdef INET - int ip4s; + int ip4s, ip4a, redo_ip4; #endif #ifdef INET6 - int ip6s; + int ip6s, ip6a, redo_ip6; #endif unsigned pr_flags, ch_flags; + unsigned pr_allow, ch_allow, tallow; char numbuf[12]; error = priv_check(td, PRIV_JAIL_SET); @@ -444,6 +479,9 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) error = priv_check(td, PRIV_JAIL_ATTACH); if (error) return (error); + mypr = ppr = td->td_ucred->cr_prison; + if ((flags & JAIL_CREATE) && !(mypr->pr_allow & PR_ALLOW_JAILS)) + return (EPERM); if (flags & ~JAIL_SET_MASK) return (EINVAL); @@ -461,12 +499,17 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) if (error) return (error); #ifdef INET + ip4a = 0; ip4 = NULL; #endif #ifdef INET6 + ip6a = 0; ip6 = NULL; #endif +#if defined(INET) || defined(INET6) + again: +#endif error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == ENOENT) jid = 0; @@ -481,9 +524,22 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) else gotslevel = 1; + error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce)); + gotenforce = (error == 0); + if (gotenforce) { + if (enforce < 0 || enforce > 2) + return (EINVAL); + } else if (error != ENOENT) + goto done_free; + pr_flags = ch_flags = 0; - vfs_flagopt(opts, "persist", &pr_flags, PR_PERSIST); - vfs_flagopt(opts, "nopersist", &ch_flags, PR_PERSIST); + for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]); + fi++) { + if (pr_flag_names[fi] == NULL) + continue; + vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi); + vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi); + } ch_flags |= pr_flags; if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE && !(pr_flags & PR_PERSIST)) { @@ -492,6 +548,14 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) goto done_errmsg; } + pr_allow = ch_allow = 0; + for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]); + fi++) { + vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi); + vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi); + } + ch_allow |= pr_allow; + error = vfs_getopt(opts, "name", (void **)&name, &len); if (error == ENOENT) name = NULL; @@ -524,6 +588,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) } } + /* This might be the second time around for this option. */ #ifdef INET error = vfs_getopt(opts, "ip4.addr", &op, &ip4s); if (error == ENOENT) @@ -533,42 +598,53 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) else if (ip4s & (sizeof(*ip4) - 1)) { error = EINVAL; goto done_free; - } else if (ip4s > 0) { - ip4s /= sizeof(*ip4); - if (ip4s > jail_max_af_ips) { - error = EINVAL; - vfs_opterror(opts, "too many IPv4 addresses"); - goto done_errmsg; - } - ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); - bcopy(op, ip4, ip4s * sizeof(*ip4)); - /* - * IP addresses are all sorted but ip[0] to preserve the - * primary IP address as given from userland. This special IP - * is used for unbound outgoing connections as well for - * "loopback" traffic. - */ - if (ip4s > 1) - qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4); - /* - * Check for duplicate addresses and do some simple zero and - * broadcast checks. If users give other bogus addresses it is - * their problem. - * - * We do not have to care about byte order for these checks so - * we will do them in NBO. - */ - for (ii = 0; ii < ip4s; ii++) { - if (ip4[ii].s_addr == INADDR_ANY || - ip4[ii].s_addr == INADDR_BROADCAST) { + } else { + ch_flags |= PR_IP4_USER; + pr_flags |= PR_IP4_USER; + if (ip4s > 0) { + ip4s /= sizeof(*ip4); + if (ip4s > jail_max_af_ips) { error = EINVAL; - goto done_free; + vfs_opterror(opts, "too many IPv4 addresses"); + goto done_errmsg; } - if ((ii+1) < ip4s && - (ip4[0].s_addr == ip4[ii+1].s_addr || - ip4[ii].s_addr == ip4[ii+1].s_addr)) { - error = EINVAL; - goto done_free; + if (ip4a < ip4s) { + ip4a = ip4s; + free(ip4, M_PRISON); + ip4 = NULL; + } + if (ip4 == NULL) + ip4 = malloc(ip4a * sizeof(*ip4), M_PRISON, + M_WAITOK); + bcopy(op, ip4, ip4s * sizeof(*ip4)); + /* + * IP addresses are all sorted but ip[0] to preserve + * the primary IP address as given from userland. + * This special IP is used for unbound outgoing + * connections as well for "loopback" traffic. + */ + if (ip4s > 1) + qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4); + /* + * Check for duplicate addresses and do some simple + * zero and broadcast checks. If users give other bogus + * addresses it is their problem. + * + * We do not have to care about byte order for these + * checks so we will do them in NBO. + */ + for (ii = 0; ii < ip4s; ii++) { + if (ip4[ii].s_addr == INADDR_ANY || + ip4[ii].s_addr == INADDR_BROADCAST) { + error = EINVAL; + goto done_free; + } + if ((ii+1) < ip4s && + (ip4[0].s_addr == ip4[ii+1].s_addr || + ip4[ii].s_addr == ip4[ii+1].s_addr)) { + error = EINVAL; + goto done_free; + } } } } @@ -583,28 +659,39 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) else if (ip6s & (sizeof(*ip6) - 1)) { error = EINVAL; goto done_free; - } else if (ip6s > 0) { - ip6s /= sizeof(*ip6); - if (ip6s > jail_max_af_ips) { - error = EINVAL; - vfs_opterror(opts, "too many IPv6 addresses"); - goto done_errmsg; - } - ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); - bcopy(op, ip6, ip6s * sizeof(*ip6)); - if (ip6s > 1) - qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6); - for (ii = 0; ii < ip6s; ii++) { - if (IN6_IS_ADDR_UNSPECIFIED(&ip6[0])) { + } else { + ch_flags |= PR_IP6_USER; + pr_flags |= PR_IP6_USER; + if (ip6s > 0) { + ip6s /= sizeof(*ip6); + if (ip6s > jail_max_af_ips) { error = EINVAL; - goto done_free; + vfs_opterror(opts, "too many IPv6 addresses"); + goto done_errmsg; } - if ((ii+1) < ip6s && - (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) || - IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1]))) - { - error = EINVAL; - goto done_free; + if (ip6a < ip6s) { + ip6a = ip6s; + free(ip6, M_PRISON); + ip6 = NULL; + } + if (ip6 == NULL) + ip6 = malloc(ip6a * sizeof(*ip6), M_PRISON, + M_WAITOK); + bcopy(op, ip6, ip6s * sizeof(*ip6)); + if (ip6s > 1) + qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6); + for (ii = 0; ii < ip6s; ii++) { + if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) { + error = EINVAL; + goto done_free; + } + if ((ii+1) < ip6s && + (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) || + IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1]))) + { + error = EINVAL; + goto done_free; + } } } } @@ -627,13 +714,15 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) error = EINVAL; goto done_free; } - if (len > MAXPATHLEN) { - error = ENAMETOOLONG; - goto done_free; - } if (len < 2 || (len == 2 && path[0] == '/')) path = NULL; else { + /* Leave room for a real-root full pathname. */ + if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/") + ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) { + error = ENAMETOOLONG; + goto done_free; + } NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_SYSSPACE, path, td); error = namei(&nd); @@ -683,7 +772,13 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) } pr = NULL; if (jid != 0) { - /* See if a requested jid already exists. */ + /* + * See if a requested jid already exists. There is an + * information leak here if the jid exists but is not within + * the caller's jail hierarchy. Jail creators will get EEXIST + * even though they cannot see the jail, and CREATE | UPDATE + * will return ENOENT which is not normally a valid error. + */ if (jid < 0) { error = EINVAL; vfs_opterror(opts, "negative jid"); @@ -691,6 +786,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) } pr = prison_find(jid); if (pr != NULL) { + ppr = pr->pr_parent; /* Create: jid must not exist. */ if (cuflags == JAIL_CREATE) { mtx_unlock(&pr->pr_mtx); @@ -699,7 +795,10 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) jid); goto done_unlock_list; } - if (pr->pr_uref == 0) { + if (!prison_ischild(mypr, pr)) { + mtx_unlock(&pr->pr_mtx); + pr = NULL; + } else if (pr->pr_uref == 0) { if (!(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); error = ENOENT; @@ -717,7 +816,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) * name. */ if (name == NULL) - name = pr->pr_name; + name = prison_name(mypr, pr); } } } @@ -738,12 +837,43 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) * because that is the jail being updated). */ if (name != NULL) { + p = strrchr(name, '.'); + if (p != NULL) { + /* + * This is a hierarchical name. Split it into the + * parent and child names, and make sure the parent + * exists or matches an already found jail. + */ + *p = '\0'; + if (pr != NULL) { + if (strncmp(name, ppr->pr_name, p - name) || + ppr->pr_name[p - name] != '\0') { + mtx_unlock(&pr->pr_mtx); + error = EINVAL; + vfs_opterror(opts, + "cannot change jail's parent"); + goto done_unlock_list; + } + } else { + ppr = prison_find_name(mypr, name); + if (ppr == NULL) { + error = ENOENT; + vfs_opterror(opts, + "jail \"%s\" not found", name); + goto done_unlock_list; + } + mtx_unlock(&ppr->pr_mtx); + } + name = p + 1; + } if (name[0] != '\0') { - deadpr = NULL; + namelen = + (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1; name_again: - TAILQ_FOREACH(tpr, &allprison, pr_list) { + deadpr = NULL; + FOREACH_PRISON_CHILD(ppr, tpr) { if (tpr != pr && tpr->pr_ref > 0 && - !strcmp(tpr->pr_name, name)) { + !strcmp(tpr->pr_name + namelen, name)) { if (pr == NULL && cuflags != JAIL_CREATE) { mtx_lock(&tpr->pr_mtx); @@ -763,7 +893,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) /* * Create, or update(jid): * name must not exist in an - * active jail. + * active sibling jail. */ error = EEXIST; if (pr != NULL) @@ -810,6 +940,16 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) /* If there's no prison to update, create a new one and link it in. */ if (pr == NULL) { created = 1; + mtx_lock(&ppr->pr_mtx); + if (ppr->pr_ref == 0 || (ppr->pr_flags & PR_REMOVE)) { + mtx_unlock(&ppr->pr_mtx); + error = ENOENT; + vfs_opterror(opts, "parent jail went away!"); + goto done_unlock_list; + } + ppr->pr_ref++; + ppr->pr_uref++; + mtx_unlock(&ppr->pr_mtx); pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO); if (jid == 0) { /* Find the next free jid. */ @@ -829,7 +969,9 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) vfs_opterror(opts, "no available jail IDs"); free(pr, M_PRISON); - goto done_unlock_list; + prison_deref(ppr, PD_DEREF | + PD_DEUREF | PD_LIST_XLOCKED); + goto done_releroot; } jid++; goto findnext; @@ -848,24 +990,53 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) } if (tpr == NULL) TAILQ_INSERT_TAIL(&allprison, pr, pr_list); - prisoncount++; + LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling); + for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) + tpr->pr_prisoncount++; + pr->pr_parent = ppr; pr->pr_id = jid; + + /* Set some default values, and inherit some from the parent. */ if (name == NULL) name = ""; if (path == NULL) { path = "/"; - root = rootvnode; + root = mypr->pr_root; vref(root); } +#ifdef INET + pr->pr_flags |= ppr->pr_flags & PR_IP4; + pr->pr_ip4s = ppr->pr_ip4s; + if (ppr->pr_ip4 != NULL) { + pr->pr_ip4 = malloc(pr->pr_ip4s * + sizeof(struct in_addr), M_PRISON, M_WAITOK); + bcopy(ppr->pr_ip4, pr->pr_ip4, + pr->pr_ip4s * sizeof(*pr->pr_ip4)); + } +#endif +#ifdef INET6 + pr->pr_flags |= ppr->pr_flags & PR_IP6; + pr->pr_ip6s = ppr->pr_ip6s; + if (ppr->pr_ip6 != NULL) { + pr->pr_ip6 = malloc(pr->pr_ip6s * + sizeof(struct in6_addr), M_PRISON, M_WAITOK); + bcopy(ppr->pr_ip6, pr->pr_ip6, + pr->pr_ip6s * sizeof(*pr->pr_ip6)); + } +#endif + pr->pr_securelevel = ppr->pr_securelevel; + pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow; + pr->pr_enforce_statfs = ppr->pr_enforce_statfs; - mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF); + LIST_INIT(&pr->pr_children); + mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK); /* * Allocate a dedicated cpuset for each jail. * Unlike other initial settings, this may return an erorr. */ - error = cpuset_create_root(td, &pr->pr_cpuset); + error = cpuset_create_root(ppr, &pr->pr_cpuset); if (error) { prison_deref(pr, PD_LIST_XLOCKED); goto done_releroot; @@ -887,103 +1058,392 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) } /* Do final error checking before setting anything. */ - error = 0; -#if defined(INET) || defined(INET6) - if ( + if (gotslevel) { + if (slevel < ppr->pr_securelevel) { + error = EPERM; + goto done_deref_locked; + } + } + if (gotenforce) { + if (enforce < ppr->pr_enforce_statfs) { + error = EPERM; + goto done_deref_locked; + } + } #ifdef INET - ip4s > 0 -#ifdef INET6 - || -#endif -#endif -#ifdef INET6 - ip6s > 0 -#endif - ) - /* - * Check for conflicting IP addresses. We permit them if there - * is no more than 1 IP on each jail. If there is a duplicate - * on a jail with more than one IP stop checking and return - * error. - */ - TAILQ_FOREACH(tpr, &allprison, pr_list) { - if (tpr == pr || tpr->pr_uref == 0) - continue; -#ifdef INET - if ((ip4s > 0 && tpr->pr_ip4s > 1) || - (ip4s > 1 && tpr->pr_ip4s > 0)) - for (ii = 0; ii < ip4s; ii++) + if (ch_flags & PR_IP4_USER) { + if (ppr->pr_flags & PR_IP4) { + if (!(pr_flags & PR_IP4_USER)) { + /* + * Silently ignore attempts to make the IP + * addresses unrestricted when the parent is + * restricted; in other words, interpret + * "unrestricted" as "as unrestricted as + * possible". + */ + ip4s = ppr->pr_ip4s; + if (ip4s == 0) { + free(ip4, M_PRISON); + ip4 = NULL; + } else if (ip4s <= ip4a) { + /* Inherit the parent's address(es). */ + bcopy(ppr->pr_ip4, ip4, + ip4s * sizeof(*ip4)); + } else { + /* + * There's no room for the parent's + * address list. Allocate some more. + */ + ip4a = ip4s; + free(ip4, M_PRISON); + ip4 = malloc(ip4a * sizeof(*ip4), + M_PRISON, M_NOWAIT); + if (ip4 != NULL) + bcopy(ppr->pr_ip4, ip4, + ip4s * sizeof(*ip4)); + else { + /* Allocation failed without + * sleeping. Unlocking the + * prison now will invalidate + * some checks and prematurely + * show an unfinished new jail. + * So let go of everything and + * start over. + */ + prison_deref(pr, created + ? PD_LOCKED | + PD_LIST_XLOCKED + : PD_DEREF | PD_LOCKED | + PD_LIST_XLOCKED); + if (root != NULL) { + vfslocked = + VFS_LOCK_GIANT( + root->v_mount); + vrele(root); + VFS_UNLOCK_GIANT( + vfslocked); + } + ip4 = malloc(ip4a * + sizeof(*ip4), M_PRISON, + M_WAITOK); + goto again; + } + } + } else if (ip4s > 0) { + /* + * Make sure the new set of IP addresses is a + * subset of the parent's list. Don't worry + * about the parent being unlocked, as any + * setting is done with allprison_lock held. + */ + for (ij = 0; ij < ppr->pr_ip4s; ij++) + if (ip4[0].s_addr == + ppr->pr_ip4[ij].s_addr) + break; + if (ij == ppr->pr_ip4s) { + error = EPERM; + goto done_deref_locked; + } + if (ip4s > 1) { + for (ii = ij = 1; ii < ip4s; ii++) { + if (ip4[ii].s_addr == + ppr->pr_ip4[0].s_addr) + continue; + for (; ij < ppr->pr_ip4s; ij++) + if (ip4[ii].s_addr == + ppr->pr_ip4[ij].s_addr) + break; + if (ij == ppr->pr_ip4s) + break; + } + if (ij == ppr->pr_ip4s) { + error = EPERM; + goto done_deref_locked; + } + } + } + } + if (ip4s > 0) { + /* + * Check for conflicting IP addresses. We permit them + * if there is no more than one IP on each jail. If + * there is a duplicate on a jail with more than one + * IP stop checking and return error. + */ + FOREACH_PRISON_DESCENDANT(&prison0, tpr, descend) { + if (tpr == pr || tpr->pr_uref == 0) { + descend = 0; + continue; + } + if (!(tpr->pr_flags & PR_IP4_USER)) + continue; + descend = 0; + if (tpr->pr_ip4 == NULL || + (ip4s == 1 && tpr->pr_ip4s == 1)) + continue; + for (ii = 0; ii < ip4s; ii++) { if (_prison_check_ip4(tpr, &ip4[ii]) == 0) { - error = EINVAL; + error = EADDRINUSE; vfs_opterror(opts, "IPv4 addresses clash"); goto done_deref_locked; } + } + } + } + } #endif #ifdef INET6 - if ((ip6s > 0 && tpr->pr_ip6s > 1) || - (ip6s > 1 && tpr->pr_ip6s > 0)) - for (ii = 0; ii < ip6s; ii++) + if (ch_flags & PR_IP6_USER) { + if (ppr->pr_flags & PR_IP6) { + if (!(pr_flags & PR_IP6_USER)) { + /* + * Silently ignore attempts to make the IP + * addresses unrestricted when the parent is + * restricted. + */ + ip6s = ppr->pr_ip6s; + if (ip6s == 0) { + free(ip6, M_PRISON); + ip6 = NULL; + } else if (ip6s <= ip6a) { + /* Inherit the parent's address(es). */ + bcopy(ppr->pr_ip6, ip6, + ip6s * sizeof(*ip6)); + } else { + /* + * There's no room for the parent's + * address list. + */ + ip6a = ip6s; + free(ip6, M_PRISON); + ip6 = malloc(ip6a * sizeof(*ip6), + M_PRISON, M_NOWAIT); + if (ip6 != NULL) + bcopy(ppr->pr_ip6, ip6, + ip6s * sizeof(*ip6)); + else { + prison_deref(pr, created + ? PD_LOCKED | + PD_LIST_XLOCKED + : PD_DEREF | PD_LOCKED | + PD_LIST_XLOCKED); + if (root != NULL) { + vfslocked = + VFS_LOCK_GIANT( + root->v_mount); + vrele(root); + VFS_UNLOCK_GIANT( + vfslocked); + } + ip6 = malloc(ip6a * + sizeof(*ip6), M_PRISON, + M_WAITOK); + goto again; + } + } + } else if (ip6s > 0) { + /* + * Make sure the new set of IP addresses is a + * subset of the parent's list. + */ + for (ij = 0; ij < ppr->pr_ip6s; ij++) + if (IN6_ARE_ADDR_EQUAL(&ip6[0], + &ppr->pr_ip6[ij])) + break; + if (ij == ppr->pr_ip6s) { + error = EPERM; + goto done_deref_locked; + } + if (ip6s > 1) { + for (ii = ij = 1; ii < ip6s; ii++) { + if (IN6_ARE_ADDR_EQUAL(&ip6[ii], + &ppr->pr_ip6[0])) + continue; + for (; ij < ppr->pr_ip6s; ij++) + if (IN6_ARE_ADDR_EQUAL( + &ip6[ii], + &ppr->pr_ip6[ij])) + break; + if (ij == ppr->pr_ip6s) + break; + } + if (ij == ppr->pr_ip6s) { + error = EPERM; + goto done_deref_locked; + } + } + } + } + if (ip6s > 0) { + /* Check for conflicting IP addresses. */ + FOREACH_PRISON_DESCENDANT(&prison0, tpr, descend) { + if (tpr == pr || tpr->pr_uref == 0) { + descend = 0; + continue; + } + if (!(tpr->pr_flags & PR_IP6_USER)) + continue; + descend = 0; + if (tpr->pr_ip6 == NULL || + (ip6s == 1 && tpr->pr_ip6s == 1)) + continue; + for (ii = 0; ii < ip6s; ii++) { if (_prison_check_ip6(tpr, &ip6[ii]) == 0) { - error = EINVAL; + error = EADDRINUSE; vfs_opterror(opts, "IPv6 addresses clash"); goto done_deref_locked; } -#endif + } + } } + } #endif - if (error == 0 && name != NULL) { + onamelen = namelen = 0; + if (name != NULL) { /* Give a default name of the jid. */ if (name[0] == '\0') snprintf(name = numbuf, sizeof(numbuf), "%d", jid); else if (strtoul(name, &p, 10) != jid && *p == '\0') { error = EINVAL; vfs_opterror(opts, "name cannot be numeric"); + goto done_deref_locked; + } + /* + * Make sure the name isn't too long for the prison or its + * children. + */ + onamelen = strlen(pr->pr_name); + namelen = strlen(name); + if (strlen(ppr->pr_name) + namelen + 2 > sizeof(pr->pr_name)) { + error = ENAMETOOLONG; + goto done_deref_locked; + } + FOREACH_PRISON_DESCENDANT(pr, tpr, descend) { + if (strlen(tpr->pr_name) + (namelen - onamelen) >= + sizeof(pr->pr_name)) { + error = ENAMETOOLONG; + goto done_deref_locked; + } } } - if (error) { - done_deref_locked: - /* - * Some parameter had an error so do not set anything. - * If this is a new jail, it will go away without ever - * having been seen. - */ - prison_deref(pr, created - ? PD_LOCKED | PD_LIST_XLOCKED - : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); - goto done_releroot; + if (pr_allow & ~ppr->pr_allow) { + error = EPERM; + goto done_deref_locked; } /* Set the parameters of the prison. */ #ifdef INET - if (ip4s >= 0) { - pr->pr_ip4s = ip4s; - free(pr->pr_ip4, M_PRISON); - pr->pr_ip4 = ip4; - ip4 = NULL; + redo_ip4 = 0; + if (ch_flags & PR_IP4_USER) { + if (pr_flags & PR_IP4_USER) { + /* Some restriction set. */ + pr->pr_flags |= PR_IP4; + if (ip4s >= 0) { + free(pr->pr_ip4, M_PRISON); + pr->pr_ip4s = ip4s; + pr->pr_ip4 = ip4; + ip4 = NULL; + } + } else if (ppr->pr_flags & PR_IP4) { + /* This restriction cleared, but keep inherited. */ + free(pr->pr_ip4, M_PRISON); + pr->pr_ip4s = ip4s; + pr->pr_ip4 = ip4; + ip4 = NULL; + } else { + /* Restriction cleared, now unrestricted. */ + pr->pr_flags &= ~PR_IP4; + free(pr->pr_ip4, M_PRISON); + pr->pr_ip4s = 0; + } + FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { + if (prison_restrict_ip4(tpr, NULL)) { + redo_ip4 = 1; + descend = 0; + } + } } #endif #ifdef INET6 - if (ip6s >= 0) { - pr->pr_ip6s = ip6s; - free(pr->pr_ip6, M_PRISON); - pr->pr_ip6 = ip6; - ip6 = NULL; + redo_ip6 = 0; + if (ch_flags & PR_IP6_USER) { + if (pr_flags & PR_IP6_USER) { + /* Some restriction set. */ + pr->pr_flags |= PR_IP6; + if (ip6s >= 0) { + free(pr->pr_ip6, M_PRISON); + pr->pr_ip6s = ip6s; + pr->pr_ip6 = ip6; + ip6 = NULL; + } + } else if (ppr->pr_flags & PR_IP6) { + /* This restriction cleared, but keep inherited. */ + free(pr->pr_ip6, M_PRISON); + pr->pr_ip6s = ip6s; + pr->pr_ip6 = ip6; + ip6 = NULL; + } else { + /* Restriction cleared, now unrestricted. */ + pr->pr_flags &= ~PR_IP6; + free(pr->pr_ip6, M_PRISON); + pr->pr_ip6s = 0; + } + FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { + if (prison_restrict_ip6(tpr, NULL)) { + redo_ip6 = 1; + descend = 0; + } + } } #endif - if (gotslevel) + if (gotslevel) { pr->pr_securelevel = slevel; - if (name != NULL) - strlcpy(pr->pr_name, name, sizeof(pr->pr_name)); + /* Set all child jails to be at least this level. */ + FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) + if (tpr->pr_securelevel < slevel) + tpr->pr_securelevel = slevel; + } + if (gotenforce) { + pr->pr_enforce_statfs = enforce; + /* Pass this restriction on to the children. */ + FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) + if (tpr->pr_enforce_statfs < enforce) + tpr->pr_enforce_statfs = enforce; + } + if (name != NULL) { + if (ppr == &prison0) + strlcpy(pr->pr_name, name, sizeof(pr->pr_name)); + else + snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s", + ppr->pr_name, name); + /* Change this component of child names. */ + FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { + bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen, + strlen(tpr->pr_name + onamelen) + 1); + bcopy(pr->pr_name, tpr->pr_name, namelen); + } + } if (path != NULL) { - strlcpy(pr->pr_path, path, sizeof(pr->pr_path)); + /* Try to keep a real-rooted full pathname. */ + if (path[0] == '/' && strcmp(mypr->pr_path, "/")) + snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s", + mypr->pr_path, path); + else + strlcpy(pr->pr_path, path, sizeof(pr->pr_path)); pr->pr_root = root; } if (host != NULL) strlcpy(pr->pr_host, host, sizeof(pr->pr_host)); + if ((tallow = ch_allow & ~pr_allow)) { + /* Clear allow bits in all children. */ + FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) + tpr->pr_allow &= ~tallow; + } + pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow; /* * Persistent prisons get an extra reference, and prisons losing their * persist flag lose that reference. Only do this for existing prisons @@ -1002,6 +1462,44 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags; mtx_unlock(&pr->pr_mtx); + /* Locks may have prevented a complete restriction of child IP + * addresses. If so, allocate some more memory and try again. + */ +#ifdef INET + while (redo_ip4) { + ip4s = pr->pr_ip4s; + ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK); + mtx_lock(&pr->pr_mtx); + redo_ip4 = 0; + FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { + if (prison_restrict_ip4(tpr, ip4)) { + if (ip4 != NULL) + ip4 = NULL; + else + redo_ip4 = 1; + } + } + mtx_unlock(&pr->pr_mtx); + } +#endif +#ifdef INET6 + while (redo_ip6) { + ip6s = pr->pr_ip6s; + ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK); + mtx_lock(&pr->pr_mtx); + redo_ip6 = 0; + FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) { + if (prison_restrict_ip6(tpr, ip6)) { + if (ip6 != NULL) + ip6 = NULL; + else + redo_ip6 = 1; + } + } + mtx_unlock(&pr->pr_mtx); + } +#endif + /* Let the modules do their work. */ sx_downgrade(&allprison_lock); if (created) { @@ -1054,6 +1552,11 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) td->td_retval[0] = pr->pr_id; goto done_errmsg; + done_deref_locked: + prison_deref(pr, created + ? PD_LOCKED | PD_LIST_XLOCKED + : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); + goto done_releroot; done_unlock_list: sx_xunlock(&allprison_lock); done_releroot: @@ -1090,72 +1593,6 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags) return (error); } -/* - * Sysctl nodes to describe jail parameters. Maximum length of string - * parameters is returned in the string itself, and the other parameters - * exist merely to make themselves and their types known. - */ -SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0, - "Jail parameters"); - -int -sysctl_jail_param(SYSCTL_HANDLER_ARGS) -{ - int i; - long l; - size_t s; - char numbuf[12]; - - switch (oidp->oid_kind & CTLTYPE) - { - case CTLTYPE_LONG: - case CTLTYPE_ULONG: - l = 0; -#ifdef SCTL_MASK32 - if (!(req->flags & SCTL_MASK32)) -#endif - return (SYSCTL_OUT(req, &l, sizeof(l))); - case CTLTYPE_INT: - case CTLTYPE_UINT: - i = 0; - return (SYSCTL_OUT(req, &i, sizeof(i))); - case CTLTYPE_STRING: - snprintf(numbuf, sizeof(numbuf), "%d", arg2); - return - (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req)); - case CTLTYPE_STRUCT: - s = (size_t)arg2; - return (SYSCTL_OUT(req, &s, sizeof(s))); - } - return (0); -} - -SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail ID"); -SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name"); -SYSCTL_JAIL_PARAM(, cpuset, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID"); -SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RD, MAXPATHLEN, "Jail root path"); -SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW, - "I", "Jail secure level"); -SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW, - "B", "Jail persistence"); -SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, - "B", "Jail is in the process of shutting down"); - -SYSCTL_JAIL_PARAM_NODE(host, "Jail host info"); -SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN, - "Jail hostname"); - -#ifdef INET -SYSCTL_JAIL_PARAM_NODE(ip4, "Jail IPv4 address virtualization"); -SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr), - "S,in_addr,a", "Jail IPv4 addresses"); -#endif -#ifdef INET6 -SYSCTL_JAIL_PARAM_NODE(ip6, "Jail IPv6 address virtualization"); -SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr), - "S,in6_addr,a", "Jail IPv6 addresses"); -#endif - /* * struct jail_get_args { @@ -1188,11 +1625,11 @@ jail_get(struct thread *td, struct jail_get_args *uap) int kern_jail_get(struct thread *td, struct uio *optuio, int flags) { - struct prison *pr; + struct prison *pr, *mypr; struct vfsopt *opt; struct vfsoptlist *opts; char *errmsg, *name; - int error, errmsg_len, errmsg_pos, i, jid, len, locked, pos; + int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos; if (flags & ~JAIL_GET_MASK) return (EINVAL); @@ -1202,12 +1639,7 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags) if (error) return (error); errmsg_pos = vfs_getopt_pos(opts, "errmsg"); - - /* Don't allow a jailed process to see any jails, not even its own. */ - if (jailed(td->td_ucred)) { - vfs_opterror(opts, "jail not found"); - return (ENOENT); - } + mypr = td->td_ucred->cr_prison; /* * Find the prison specified by one of: lastjid, jid, name. @@ -1216,7 +1648,7 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags) error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid)); if (error == 0) { TAILQ_FOREACH(pr, &allprison, pr_list) { - if (pr->pr_id > jid) { + if (pr->pr_id > jid && prison_ischild(mypr, pr)) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0 && (pr->pr_uref > 0 || (flags & JAIL_DYING))) @@ -1235,7 +1667,7 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags) error = vfs_copyopt(opts, "jid", &jid, sizeof(jid)); if (error == 0) { if (jid != 0) { - pr = prison_find(jid); + pr = prison_find_child(mypr, jid); if (pr != NULL) { if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); @@ -1259,7 +1691,7 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags) error = EINVAL; goto done_unlock_list; } - pr = prison_find_name(name); + pr = prison_find_name(mypr, name); if (pr != NULL) { if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) { mtx_unlock(&pr->pr_mtx); @@ -1288,14 +1720,18 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags) error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id)); if (error != 0 && error != ENOENT) goto done_deref; - error = vfs_setopts(opts, "name", pr->pr_name); + i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id; + error = vfs_setopt(opts, "parent", &i, sizeof(i)); if (error != 0 && error != ENOENT) goto done_deref; - error = vfs_setopt(opts, "cpuset", &pr->pr_cpuset->cs_id, + error = vfs_setopts(opts, "name", prison_name(mypr, pr)); + if (error != 0 && error != ENOENT) + goto done_deref; + error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id, sizeof(pr->pr_cpuset->cs_id)); if (error != 0 && error != ENOENT) goto done_deref; - error = vfs_setopts(opts, "path", pr->pr_path); + error = vfs_setopts(opts, "path", prison_path(mypr, pr)); if (error != 0 && error != ENOENT) goto done_deref; #ifdef INET @@ -1317,14 +1753,36 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags) error = vfs_setopts(opts, "host.hostname", pr->pr_host); if (error != 0 && error != ENOENT) goto done_deref; - i = pr->pr_flags & PR_PERSIST ? 1 : 0; - error = vfs_setopt(opts, "persist", &i, sizeof(i)); - if (error != 0 && error != ENOENT) - goto done_deref; - i = !i; - error = vfs_setopt(opts, "nopersist", &i, sizeof(i)); + error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs, + sizeof(pr->pr_enforce_statfs)); if (error != 0 && error != ENOENT) goto done_deref; + for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]); + fi++) { + if (pr_flag_names[fi] == NULL) + continue; + i = (pr->pr_flags & (1 << fi)) ? 1 : 0; + error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i)); + if (error != 0 && error != ENOENT) + goto done_deref; + i = !i; + error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i)); + if (error != 0 && error != ENOENT) + goto done_deref; + } + for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]); + fi++) { + if (pr_allow_names[fi] == NULL) + continue; + i = (pr->pr_allow & (1 << fi)) ? 1 : 0; + error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i)); + if (error != 0 && error != ENOENT) + goto done_deref; + i = !i; + error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i)); + if (error != 0 && error != ENOENT) + goto done_deref; + } i = (pr->pr_uref == 0); error = vfs_setopt(opts, "dying", &i, sizeof(i)); if (error != 0 && error != ENOENT) @@ -1399,6 +1857,7 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags) return (error); } + /* * struct jail_remove_args { * int jid; @@ -1407,21 +1866,61 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags) int jail_remove(struct thread *td, struct jail_remove_args *uap) { - struct prison *pr; - struct proc *p; - int deuref, error; + struct prison *pr, *cpr, *lpr, *tpr; + int descend, error; error = priv_check(td, PRIV_JAIL_REMOVE); if (error) return (error); sx_xlock(&allprison_lock); - pr = prison_find(uap->jid); + pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_xunlock(&allprison_lock); return (EINVAL); } + /* Remove all descendants of this prison, then remove this prison. */ + pr->pr_ref++; + pr->pr_flags |= PR_REMOVE; + if (!LIST_EMPTY(&pr->pr_children)) { + mtx_unlock(&pr->pr_mtx); + lpr = NULL; + FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { + mtx_lock(&cpr->pr_mtx); + if (cpr->pr_ref > 0) { + tpr = cpr; + cpr->pr_ref++; + cpr->pr_flags |= PR_REMOVE; + } else { + /* Already removed - do not do it again. */ + tpr = NULL; + } + mtx_unlock(&cpr->pr_mtx); + if (lpr != NULL) { + mtx_lock(&lpr->pr_mtx); + prison_remove_one(lpr); + sx_xlock(&allprison_lock); + } + lpr = tpr; + } + if (lpr != NULL) { + mtx_lock(&lpr->pr_mtx); + prison_remove_one(lpr); + sx_xlock(&allprison_lock); + } + mtx_lock(&pr->pr_mtx); + } + prison_remove_one(pr); + return (0); +} + +static void +prison_remove_one(struct prison *pr) +{ + struct proc *p; + int deuref; + /* If the prison was persistent, it is not anymore. */ deuref = 0; if (pr->pr_flags & PR_PERSIST) { @@ -1430,17 +1929,18 @@ jail_remove(struct thread *td, struct jail_remove_args *uap) pr->pr_flags &= ~PR_PERSIST; } - /* If there are no references left, remove the prison now. */ - if (pr->pr_ref == 0) { + /* + * jail_remove added a reference. If that's the only one, remove + * the prison now. + */ + KASSERT(pr->pr_ref > 0, + ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id)); + if (pr->pr_ref == 1) { prison_deref(pr, deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED); - return (0); + return; } - /* - * Keep a temporary reference to make sure this prison sticks around. - */ - pr->pr_ref++; mtx_unlock(&pr->pr_mtx); sx_xunlock(&allprison_lock); /* @@ -1455,9 +1955,8 @@ jail_remove(struct thread *td, struct jail_remove_args *uap) PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); - /* Remove the temporary reference. */ + /* Remove the temporary reference added by jail_remove. */ prison_deref(pr, deuref | PD_DEREF); - return (0); } @@ -1477,7 +1976,7 @@ jail_attach(struct thread *td, struct jail_attach_args *uap) return (error); sx_slock(&allprison_lock); - pr = prison_find(uap->jid); + pr = prison_find_child(td->td_ucred->cr_prison, uap->jid); if (pr == NULL) { sx_sunlock(&allprison_lock); return (EINVAL); @@ -1499,6 +1998,7 @@ jail_attach(struct thread *td, struct jail_attach_args *uap) static int do_jail_attach(struct thread *td, struct prison *pr) { + struct prison *ppr; struct proc *p; struct ucred *newcred, *oldcred; int vfslocked, error; @@ -1526,6 +2026,7 @@ do_jail_attach(struct thread *td, struct prison *pr) /* * Reparent the newly attached process to this jail. */ + ppr = td->td_ucred->cr_prison; p = td->td_proc; error = cpuset_setproc_update_set(p, pr->pr_cpuset); if (error) @@ -1553,6 +2054,7 @@ do_jail_attach(struct thread *td, struct prison *pr) p->p_ucred = newcred; PROC_UNLOCK(p); crfree(oldcred); + prison_deref(ppr, PD_DEREF | PD_DEUREF); return (0); e_unlock: VOP_UNLOCK(pr->pr_root, 0); @@ -1560,11 +2062,12 @@ do_jail_attach(struct thread *td, struct prison *pr) VFS_UNLOCK_GIANT(vfslocked); e_revert_osd: /* Tell modules this thread is still in its old jail after all. */ - (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td); + (void)osd_jail_call(ppr, PR_METHOD_ATTACH, td); prison_deref(pr, PD_DEREF | PD_DEUREF); return (error); } + /* * Returns a locked prison instance, or NULL on failure. */ @@ -1586,18 +2089,42 @@ prison_find(int prid) } /* - * Look for the named prison. Returns a locked prison or NULL. + * Find a prison that is a descendant of mypr. Returns a locked prison or NULL. */ struct prison * -prison_find_name(const char *name) +prison_find_child(struct prison *mypr, int prid) { - struct prison *pr, *deadpr; + struct prison *pr; + int descend; sx_assert(&allprison_lock, SX_LOCKED); + FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { + if (pr->pr_id == prid) { + mtx_lock(&pr->pr_mtx); + if (pr->pr_ref > 0) + return (pr); + mtx_unlock(&pr->pr_mtx); + } + } + return (NULL); +} + +/* + * Look for the name relative to mypr. Returns a locked prison or NULL. + */ +struct prison * +prison_find_name(struct prison *mypr, const char *name) +{ + struct prison *pr, *deadpr; + size_t mylen; + int descend; + + sx_assert(&allprison_lock, SX_LOCKED); + mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1; again: deadpr = NULL; - TAILQ_FOREACH(pr, &allprison, pr_list) { - if (!strcmp(pr->pr_name, name)) { + FOREACH_PRISON_DESCENDANT(mypr, pr, descend) { + if (!strcmp(pr->pr_name + mylen, name)) { mtx_lock(&pr->pr_mtx); if (pr->pr_ref > 0) { if (pr->pr_uref > 0) @@ -1607,7 +2134,7 @@ prison_find_name(const char *name) mtx_unlock(&pr->pr_mtx); } } - /* There was no valid prison - perhaps there was a dying one */ + /* There was no valid prison - perhaps there was a dying one. */ if (deadpr != NULL) { mtx_lock(&deadpr->pr_mtx); if (deadpr->pr_ref == 0) { @@ -1618,6 +2145,25 @@ prison_find_name(const char *name) return (deadpr); } +/* + * See if a prison has the specific flag set. + */ +int +prison_flag(struct ucred *cred, unsigned flag) +{ + + /* This is an atomic read, so no locking is necessary. */ + return (cred->cr_prison->pr_flags & flag); +} + +int +prison_allow(struct ucred *cred, unsigned flag) +{ + + /* This is an atomic read, so no locking is necessary. */ + return (cred->cr_prison->pr_allow & flag); +} + /* * Remove a prison reference. If that was the last reference, remove the * prison itself - but not in this context in case there are locks held. @@ -1661,14 +2207,41 @@ prison_complete(void *context, int pending) static void prison_deref(struct prison *pr, int flags) { + struct prison *ppr, *tpr; int vfslocked; if (!(flags & PD_LOCKED)) mtx_lock(&pr->pr_mtx); + /* Decrement the user references in a separate loop. */ if (flags & PD_DEUREF) { - pr->pr_uref--; + for (tpr = pr;; tpr = tpr->pr_parent) { + if (tpr != pr) + mtx_lock(&tpr->pr_mtx); + if (--tpr->pr_uref > 0) + break; + KASSERT(tpr != &prison0, ("prison0 pr_uref=0")); + mtx_unlock(&tpr->pr_mtx); + } /* Done if there were only user references to remove. */ if (!(flags & PD_DEREF)) { + mtx_unlock(&tpr->pr_mtx); + if (flags & PD_LIST_SLOCKED) + sx_sunlock(&allprison_lock); + else if (flags & PD_LIST_XLOCKED) + sx_xunlock(&allprison_lock); + return; + } + if (tpr != pr) { + mtx_unlock(&tpr->pr_mtx); + mtx_lock(&pr->pr_mtx); + } + } + + for (;;) { + if (flags & PD_DEREF) + pr->pr_ref--; + /* If the prison still has references, nothing else to do. */ + if (pr->pr_ref > 0) { mtx_unlock(&pr->pr_mtx); if (flags & PD_LIST_SLOCKED) sx_sunlock(&allprison_lock); @@ -1676,51 +2249,45 @@ prison_deref(struct prison *pr, int flags) sx_xunlock(&allprison_lock); return; } - } - if (flags & PD_DEREF) - pr->pr_ref--; - /* If the prison still has references, nothing else to do. */ - if (pr->pr_ref > 0) { + mtx_unlock(&pr->pr_mtx); - if (flags & PD_LIST_SLOCKED) - sx_sunlock(&allprison_lock); - else if (flags & PD_LIST_XLOCKED) - sx_xunlock(&allprison_lock); - return; - } - - KASSERT(pr->pr_uref == 0, - ("%s: Trying to remove an active prison (jid=%d).", __func__, - pr->pr_id)); - mtx_unlock(&pr->pr_mtx); - if (flags & PD_LIST_SLOCKED) { - if (!sx_try_upgrade(&allprison_lock)) { - sx_sunlock(&allprison_lock); + if (flags & PD_LIST_SLOCKED) { + if (!sx_try_upgrade(&allprison_lock)) { + sx_sunlock(&allprison_lock); + sx_xlock(&allprison_lock); + } + } else if (!(flags & PD_LIST_XLOCKED)) sx_xlock(&allprison_lock); + + TAILQ_REMOVE(&allprison, pr, pr_list); + LIST_REMOVE(pr, pr_sibling); + ppr = pr->pr_parent; + for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent) + tpr->pr_prisoncount--; + sx_downgrade(&allprison_lock); + + if (pr->pr_root != NULL) { + vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount); + vrele(pr->pr_root); + VFS_UNLOCK_GIANT(vfslocked); } - } else if (!(flags & PD_LIST_XLOCKED)) - sx_xlock(&allprison_lock); - - TAILQ_REMOVE(&allprison, pr, pr_list); - prisoncount--; - sx_xunlock(&allprison_lock); - - if (pr->pr_root != NULL) { - vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount); - vrele(pr->pr_root); - VFS_UNLOCK_GIANT(vfslocked); - } - mtx_destroy(&pr->pr_mtx); + mtx_destroy(&pr->pr_mtx); #ifdef INET - free(pr->pr_ip4, M_PRISON); + free(pr->pr_ip4, M_PRISON); #endif #ifdef INET6 - free(pr->pr_ip6, M_PRISON); + free(pr->pr_ip6, M_PRISON); #endif - if (pr->pr_cpuset != NULL) - cpuset_rel(pr->pr_cpuset); - osd_jail_exit(pr); - free(pr, M_PRISON); + if (pr->pr_cpuset != NULL) + cpuset_rel(pr->pr_cpuset); + osd_jail_exit(pr); + free(pr, M_PRISON); + + /* Removing a prison frees a reference on its parent. */ + pr = ppr; + mtx_lock(&pr->pr_mtx); + flags = PD_DEREF | PD_LIST_SLOCKED; + } } void @@ -1765,11 +2332,98 @@ prison_proc_free(struct prison *pr) #ifdef INET +/* + * Restrict a prison's IP address list with its parent's, possibly replacing + * it. Return true if the replacement buffer was used (or would have been). + */ +static int +prison_restrict_ip4(struct prison *pr, struct in_addr *newip4) +{ + int ii, ij, used; + struct prison *ppr; + + ppr = pr->pr_parent; + if (!(pr->pr_flags & PR_IP4_USER)) { + /* This has no user settings, so just copy the parent's list. */ + if (pr->pr_ip4s < ppr->pr_ip4s) { + /* + * There's no room for the parent's list. Use the + * new list buffer, which is assumed to be big enough + * (if it was passed). If there's no buffer, try to + * allocate one. + */ + used = 1; + if (newip4 == NULL) { + newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4), + M_PRISON, M_NOWAIT); + if (newip4 != NULL) + used = 0; + } + if (newip4 != NULL) { + bcopy(ppr->pr_ip4, newip4, + ppr->pr_ip4s * sizeof(*newip4)); + free(pr->pr_ip4, M_PRISON); + pr->pr_ip4 = newip4; + pr->pr_ip4s = ppr->pr_ip4s; + pr->pr_flags |= PR_IP4; + } + return (used); + } + pr->pr_ip4s = ppr->pr_ip4s; + if (pr->pr_ip4s > 0) + bcopy(ppr->pr_ip4, pr->pr_ip4, + pr->pr_ip4s * sizeof(*newip4)); + else if (pr->pr_ip4 != NULL) { + free(pr->pr_ip4, M_PRISON); + pr->pr_ip4 = NULL; + } + pr->pr_flags = + (pr->pr_flags & ~PR_IP4) | (ppr->pr_flags & PR_IP4); + } else if (pr->pr_ip4s > 0 && (ppr->pr_flags & PR_IP4)) { + /* Remove addresses that aren't in the parent. */ + for (ij = 0; ij < ppr->pr_ip4s; ij++) + if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr) + break; + if (ij < ppr->pr_ip4s) + ii = 1; + else { + bcopy(pr->pr_ip4 + 1, pr->pr_ip4, + --pr->pr_ip4s * sizeof(*pr->pr_ip4)); + ii = 0; + } + for (ij = 1; ii < pr->pr_ip4s; ) { + if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) { + ii++; + continue; + } + switch (ij >= ppr->pr_ip4s ? -1 : + qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) { + case -1: + bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii, + (--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4)); + break; + case 0: + ii++; + ij++; + break; + case 1: + ij++; + break; + } + } + if (pr->pr_ip4s == 0) { + free(pr->pr_ip4, M_PRISON); + pr->pr_ip4 = NULL; + } + } + return (0); +} + /* * Pass back primary IPv4 address of this jail. * - * If not jailed return success but do not alter the address. Caller has to - * make sure to initialize it correctly (e.g. INADDR_ANY). + * If not restricted return success but do not alter the address. Caller has + * to make sure to initialize it correctly (e.g. INADDR_ANY). * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4. * Address returned in NBO. @@ -1782,10 +2436,14 @@ prison_get_ip4(struct ucred *cred, struct in_addr *ia) KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); - if (!jailed(cred)) - return (0); pr = cred->cr_prison; + if (!(pr->pr_flags & PR_IP4)) + return (0); mtx_lock(&pr->pr_mtx); + if (!(pr->pr_flags & PR_IP4)) { + mtx_unlock(&pr->pr_mtx); + return (0); + } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); @@ -1796,13 +2454,38 @@ prison_get_ip4(struct ucred *cred, struct in_addr *ia) return (0); } +/* + * Return true if pr1 and pr2 have the same IPv4 address restrictions. + */ +int +prison_equal_ip4(struct prison *pr1, struct prison *pr2) +{ + + if (pr1 == pr2) + return (1); + + /* + * jail_set maintains an exclusive hold on allprison_lock while it + * changes the IP addresses, so only a shared hold is needed. This is + * easier than locking the two prisons which would require finding the + * proper locking order and end up needing allprison_lock anyway. + */ + sx_slock(&allprison_lock); + while (pr1 != &prison0 && !(pr1->pr_flags & PR_IP4_USER)) + pr1 = pr1->pr_parent; + while (pr2 != &prison0 && !(pr2->pr_flags & PR_IP4_USER)) + pr2 = pr2->pr_parent; + sx_sunlock(&allprison_lock); + return (pr1 == pr2); +} + /* * Make sure our (source) address is set to something meaningful to this * jail. * - * Returns 0 if not jailed or if address belongs to jail, EADDRNOTAVAIL if - * the address doesn't belong, or EAFNOSUPPORT if the jail doesn't allow IPv4. - * Address passed in in NBO and returned in NBO. + * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail, + * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail + * doesn't allow IPv4. Address passed in in NBO and returned in NBO. */ int prison_local_ip4(struct ucred *cred, struct in_addr *ia) @@ -1814,10 +2497,14 @@ prison_local_ip4(struct ucred *cred, struct in_addr *ia) KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); - if (!jailed(cred)) - return (0); pr = cred->cr_prison; + if (!(pr->pr_flags & PR_IP4)) + return (0); mtx_lock(&pr->pr_mtx); + if (!(pr->pr_flags & PR_IP4)) { + mtx_unlock(&pr->pr_mtx); + return (0); + } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); @@ -1859,10 +2546,14 @@ prison_remote_ip4(struct ucred *cred, struct in_addr *ia) KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); - if (!jailed(cred)) - return (0); pr = cred->cr_prison; + if (!(pr->pr_flags & PR_IP4)) + return (0); mtx_lock(&pr->pr_mtx); + if (!(pr->pr_flags & PR_IP4)) { + mtx_unlock(&pr->pr_mtx); + return (0); + } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); @@ -1884,9 +2575,9 @@ prison_remote_ip4(struct ucred *cred, struct in_addr *ia) /* * Check if given address belongs to the jail referenced by cred/prison. * - * Returns 0 if not jailed or if address belongs to jail, EADDRNOTAVAIL if - * the address doesn't belong, or EAFNOSUPPORT if the jail doesn't allow IPv4. - * Address passed in in NBO. + * Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail, + * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail + * doesn't allow IPv4. Address passed in in NBO. */ static int _prison_check_ip4(struct prison *pr, struct in_addr *ia) @@ -1927,10 +2618,14 @@ prison_check_ip4(struct ucred *cred, struct in_addr *ia) KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia != NULL, ("%s: ia is NULL", __func__)); - if (!jailed(cred)) - return (0); pr = cred->cr_prison; + if (!(pr->pr_flags & PR_IP4)) + return (0); mtx_lock(&pr->pr_mtx); + if (!(pr->pr_flags & PR_IP4)) { + mtx_unlock(&pr->pr_mtx); + return (0); + } if (pr->pr_ip4 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); @@ -1943,11 +2638,96 @@ prison_check_ip4(struct ucred *cred, struct in_addr *ia) #endif #ifdef INET6 +static int +prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6) +{ + int ii, ij, used; + struct prison *ppr; + + ppr = pr->pr_parent; + if (!(pr->pr_flags & PR_IP6_USER)) { + /* This has no user settings, so just copy the parent's list. */ + if (pr->pr_ip6s < ppr->pr_ip6s) { + /* + * There's no room for the parent's list. Use the + * new list buffer, which is assumed to be big enough + * (if it was passed). If there's no buffer, try to + * allocate one. + */ + used = 1; + if (newip6 == NULL) { + newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6), + M_PRISON, M_NOWAIT); + if (newip6 != NULL) + used = 0; + } + if (newip6 != NULL) { + bcopy(ppr->pr_ip6, newip6, + ppr->pr_ip6s * sizeof(*newip6)); + free(pr->pr_ip6, M_PRISON); + pr->pr_ip6 = newip6; + pr->pr_ip6s = ppr->pr_ip6s; + pr->pr_flags |= PR_IP6; + } + return (used); + } + pr->pr_ip6s = ppr->pr_ip6s; + if (pr->pr_ip6s > 0) + bcopy(ppr->pr_ip6, pr->pr_ip6, + pr->pr_ip6s * sizeof(*newip6)); + else if (pr->pr_ip6 != NULL) { + free(pr->pr_ip6, M_PRISON); + pr->pr_ip6 = NULL; + } + pr->pr_flags = + (pr->pr_flags & ~PR_IP6) | (ppr->pr_flags & PR_IP6); + } else if (pr->pr_ip6s > 0 && (ppr->pr_flags & PR_IP6)) { + /* Remove addresses that aren't in the parent. */ + for (ij = 0; ij < ppr->pr_ip6s; ij++) + if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], + &ppr->pr_ip6[ij])) + break; + if (ij < ppr->pr_ip6s) + ii = 1; + else { + bcopy(pr->pr_ip6 + 1, pr->pr_ip6, + --pr->pr_ip6s * sizeof(*pr->pr_ip6)); + ii = 0; + } + for (ij = 1; ii < pr->pr_ip6s; ) { + if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii], + &ppr->pr_ip6[0])) { + ii++; + continue; + } + switch (ij >= ppr->pr_ip4s ? -1 : + qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) { + case -1: + bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii, + (--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6)); + break; + case 0: + ii++; + ij++; + break; + case 1: + ij++; + break; + } + } + if (pr->pr_ip6s == 0) { + free(pr->pr_ip6, M_PRISON); + pr->pr_ip6 = NULL; + } + } + return 0; +} + /* * Pass back primary IPv6 address for this jail. * - * If not jailed return success but do not alter the address. Caller has to - * make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT). + * If not restricted return success but do not alter the address. Caller has + * to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT). * * Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6. */ @@ -1959,10 +2739,14 @@ prison_get_ip6(struct ucred *cred, struct in6_addr *ia6) KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); - if (!jailed(cred)) - return (0); pr = cred->cr_prison; + if (!(pr->pr_flags & PR_IP6)) + return (0); mtx_lock(&pr->pr_mtx); + if (!(pr->pr_flags & PR_IP6)) { + mtx_unlock(&pr->pr_mtx); + return (0); + } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); @@ -1973,14 +2757,34 @@ prison_get_ip6(struct ucred *cred, struct in6_addr *ia6) return (0); } +/* + * Return true if pr1 and pr2 have the same IPv6 address restrictions. + */ +int +prison_equal_ip6(struct prison *pr1, struct prison *pr2) +{ + + if (pr1 == pr2) + return (1); + + sx_slock(&allprison_lock); + while (pr1 != &prison0 && !(pr1->pr_flags & PR_IP6_USER)) + pr1 = pr1->pr_parent; + while (pr2 != &prison0 && !(pr2->pr_flags & PR_IP6_USER)) + pr2 = pr2->pr_parent; + sx_sunlock(&allprison_lock); + return (pr1 == pr2); +} + /* * Make sure our (source) address is set to something meaningful to this jail. * * v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0) * when needed while binding. * - * Returns 0 if not jailed or if address belongs to jail, EADDRNOTAVAIL if - * the address doesn't belong, or EAFNOSUPPORT if the jail doesn't allow IPv6. + * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail, + * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail + * doesn't allow IPv6. */ int prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only) @@ -1991,10 +2795,14 @@ prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only) KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); - if (!jailed(cred)) - return (0); pr = cred->cr_prison; + if (!(pr->pr_flags & PR_IP6)) + return (0); mtx_lock(&pr->pr_mtx); + if (!(pr->pr_flags & PR_IP6)) { + mtx_unlock(&pr->pr_mtx); + return (0); + } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); @@ -2035,10 +2843,14 @@ prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6) KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); - if (!jailed(cred)) - return (0); pr = cred->cr_prison; + if (!(pr->pr_flags & PR_IP6)) + return (0); mtx_lock(&pr->pr_mtx); + if (!(pr->pr_flags & PR_IP6)) { + mtx_unlock(&pr->pr_mtx); + return (0); + } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); @@ -2060,8 +2872,9 @@ prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6) /* * Check if given address belongs to the jail referenced by cred/prison. * - * Returns 0 if not jailed or if address belongs to jail, EADDRNOTAVAIL if - * the address doesn't belong, or EAFNOSUPPORT if the jail doesn't allow IPv6. + * Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail, + * EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail + * doesn't allow IPv6. */ static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6) @@ -2102,10 +2915,14 @@ prison_check_ip6(struct ucred *cred, struct in6_addr *ia6) KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__)); - if (!jailed(cred)) - return (0); pr = cred->cr_prison; + if (!(pr->pr_flags & PR_IP6)) + return (0); mtx_lock(&pr->pr_mtx); + if (!(pr->pr_flags & PR_IP6)) { + mtx_unlock(&pr->pr_mtx); + return (0); + } if (pr->pr_ip6 == NULL) { mtx_unlock(&pr->pr_mtx); return (EAFNOSUPPORT); @@ -2126,34 +2943,42 @@ prison_check_ip6(struct ucred *cred, struct in6_addr *ia6) int prison_check_af(struct ucred *cred, int af) { + struct prison *pr; int error; KASSERT(cred != NULL, ("%s: cred is NULL", __func__)); - - if (!jailed(cred)) - return (0); - + pr = cred->cr_prison; error = 0; switch (af) { #ifdef INET case AF_INET: - if (cred->cr_prison->pr_ip4 == NULL) - error = EAFNOSUPPORT; + if (pr->pr_flags & PR_IP4) + { + mtx_lock(&pr->pr_mtx); + if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL) + error = EAFNOSUPPORT; + mtx_unlock(&pr->pr_mtx); + } break; #endif #ifdef INET6 case AF_INET6: - if (cred->cr_prison->pr_ip6 == NULL) - error = EAFNOSUPPORT; + if (pr->pr_flags & PR_IP6) + { + mtx_lock(&pr->pr_mtx); + if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL) + error = EAFNOSUPPORT; + mtx_unlock(&pr->pr_mtx); + } break; #endif case AF_LOCAL: case AF_ROUTE: break; default: - if (jail_socket_unixiproute_only) + if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); @@ -2163,9 +2988,9 @@ prison_check_af(struct ucred *cred, int af) * Check if given address belongs to the jail referenced by cred (wrapper to * prison_check_ip[46]). * - * Returns 0 if not jailed or if address belongs to jail, EADDRNOTAVAIL if - * the address doesn't belong, or EAFNOSUPPORT if the jail doesn't allow - * the address family. IPv4 Address passed in in NBO. + * Returns 0 if jail doesn't restrict the address family or if address belongs + * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if + * the jail doesn't allow the address family. IPv4 Address passed in in NBO. */ int prison_if(struct ucred *cred, struct sockaddr *sa) @@ -2197,7 +3022,7 @@ prison_if(struct ucred *cred, struct sockaddr *sa) break; #endif default: - if (jailed(cred) && jail_socket_unixiproute_only) + if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF)) error = EAFNOSUPPORT; } return (error); @@ -2210,17 +3035,24 @@ int prison_check(struct ucred *cred1, struct ucred *cred2) { - if (jailed(cred1)) { - if (!jailed(cred2)) - return (ESRCH); - if (cred2->cr_prison != cred1->cr_prison) - return (ESRCH); - } #ifdef VIMAGE if (cred2->cr_vimage->v_procg != cred1->cr_vimage->v_procg) return (ESRCH); #endif + return ((cred1->cr_prison == cred2->cr_prison || + prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH); +} +/* + * Return 1 if p2 is a child of p1, otherwise 0. + */ +int +prison_ischild(struct prison *pr1, struct prison *pr2) +{ + + for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent) + if (pr1 == pr2) + return (1); return (0); } @@ -2231,7 +3063,7 @@ int jailed(struct ucred *cred) { - return (cred->cr_prison != NULL); + return (cred->cr_prison != &prison0); } /* @@ -2267,12 +3099,12 @@ prison_canseemount(struct ucred *cred, struct mount *mp) struct statfs *sp; size_t len; - if (!jailed(cred) || jail_enforce_statfs == 0) - return (0); pr = cred->cr_prison; + if (pr->pr_enforce_statfs == 0) + return (0); if (pr->pr_root->v_mount == mp) return (0); - if (jail_enforce_statfs == 2) + if (pr->pr_enforce_statfs == 2) return (ENOENT); /* * If jail's chroot directory is set to "/" we should be able to see @@ -2302,9 +3134,9 @@ prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp) struct prison *pr; size_t len; - if (!jailed(cred) || jail_enforce_statfs == 0) - return; pr = cred->cr_prison; + if (pr->pr_enforce_statfs == 0) + return; if (prison_canseemount(cred, mp) != 0) { bzero(sp->f_mntonname, sizeof(sp->f_mntonname)); strlcpy(sp->f_mntonname, "[restricted]", @@ -2417,6 +3249,13 @@ prison_priv_check(struct ucred *cred, int priv) case PRIV_IPC_MSGSIZE: case PRIV_MQ_ADMIN: + /* + * Jail operations within a jail work on child jails. + */ + case PRIV_JAIL_ATTACH: + case PRIV_JAIL_SET: + case PRIV_JAIL_REMOVE: + /* * Jail implements its own inter-process limits, so allow * root processes in jail to change scheduling on other @@ -2469,7 +3308,7 @@ prison_priv_check(struct ucred *cred, int priv) * setting system flags. */ case PRIV_VFS_SYSFLAGS: - if (jail_chflags_allowed) + if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS) return (0); else return (EPERM); @@ -2482,7 +3321,7 @@ prison_priv_check(struct ucred *cred, int priv) case PRIV_VFS_UNMOUNT: case PRIV_VFS_MOUNT_NONUSER: case PRIV_VFS_MOUNT_OWNER: - if (jail_mount_allowed) + if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT) return (0); else return (EPERM); @@ -2505,7 +3344,7 @@ prison_priv_check(struct ucred *cred, int priv) * Conditionally allow creating raw sockets in jail. */ case PRIV_NETINET_RAW: - if (jail_allow_raw_sockets) + if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS) return (0); else return (EPERM); @@ -2528,11 +3367,68 @@ prison_priv_check(struct ucred *cred, int priv) } } +/* + * Return the part of pr2's name that is relative to pr1, or the whole name + * if it does not directly follow. + */ + +char * +prison_name(struct prison *pr1, struct prison *pr2) +{ + char *name; + + /* Jails see themselves as "0" (if they see themselves at all). */ + if (pr1 == pr2) + return "0"; + name = pr2->pr_name; + if (prison_ischild(pr1, pr2)) { + /* + * pr1 isn't locked (and allprison_lock may not be either) + * so its length can't be counted on. But the number of dots + * can be counted on - and counted. + */ + for (; pr1 != &prison0; pr1 = pr1->pr_parent) + name = strchr(name, '.') + 1; + } + return (name); +} + +/* + * Return the part of pr2's path that is relative to pr1, or the whole path + * if it does not directly follow. + */ +static char * +prison_path(struct prison *pr1, struct prison *pr2) +{ + char *path1, *path2; + int len1; + + path1 = pr1->pr_path; + path2 = pr2->pr_path; + if (!strcmp(path1, "/")) + return (path2); + len1 = strlen(path1); + if (strncmp(path1, path2, len1)) + return (path2); + if (path2[len1] == '\0') + return "/"; + if (path2[len1] == '/') + return (path2 + len1); + return (path2); +} + + +/* + * Jail-related sysctls. + */ +SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0, + "Jails"); + static int sysctl_jail_list(SYSCTL_HANDLER_ARGS) { struct xprison *xp; - struct prison *pr; + struct prison *pr, *cpr; #ifdef INET struct in_addr *ip4 = NULL; int ip4s = 0; @@ -2541,62 +3437,62 @@ sysctl_jail_list(SYSCTL_HANDLER_ARGS) struct in_addr *ip6 = NULL; int ip6s = 0; #endif - int error; - - if (jailed(req->td->td_ucred)) - return (0); + int descend, error; xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK); + pr = req->td->td_ucred->cr_prison; error = 0; sx_slock(&allprison_lock); - TAILQ_FOREACH(pr, &allprison, pr_list) { + FOREACH_PRISON_DESCENDANT(pr, cpr, descend) { +#if defined(INET) || defined(INET6) again: - mtx_lock(&pr->pr_mtx); +#endif + mtx_lock(&cpr->pr_mtx); #ifdef INET - if (pr->pr_ip4s > 0) { - if (ip4s < pr->pr_ip4s) { - ip4s = pr->pr_ip4s; - mtx_unlock(&pr->pr_mtx); + if (cpr->pr_ip4s > 0) { + if (ip4s < cpr->pr_ip4s) { + ip4s = cpr->pr_ip4s; + mtx_unlock(&cpr->pr_mtx); ip4 = realloc(ip4, ip4s * sizeof(struct in_addr), M_TEMP, M_WAITOK); goto again; } - bcopy(pr->pr_ip4, ip4, - pr->pr_ip4s * sizeof(struct in_addr)); + bcopy(cpr->pr_ip4, ip4, + cpr->pr_ip4s * sizeof(struct in_addr)); } #endif #ifdef INET6 - if (pr->pr_ip6s > 0) { - if (ip6s < pr->pr_ip6s) { - ip6s = pr->pr_ip6s; - mtx_unlock(&pr->pr_mtx); + if (cpr->pr_ip6s > 0) { + if (ip6s < cpr->pr_ip6s) { + ip6s = cpr->pr_ip6s; + mtx_unlock(&cpr->pr_mtx); ip6 = realloc(ip6, ip6s * sizeof(struct in6_addr), M_TEMP, M_WAITOK); goto again; } - bcopy(pr->pr_ip6, ip6, - pr->pr_ip6s * sizeof(struct in6_addr)); + bcopy(cpr->pr_ip6, ip6, + cpr->pr_ip6s * sizeof(struct in6_addr)); } #endif - if (pr->pr_ref == 0) { - mtx_unlock(&pr->pr_mtx); + if (cpr->pr_ref == 0) { + mtx_unlock(&cpr->pr_mtx); continue; } bzero(xp, sizeof(*xp)); xp->pr_version = XPRISON_VERSION; - xp->pr_id = pr->pr_id; - xp->pr_state = pr->pr_uref > 0 + xp->pr_id = cpr->pr_id; + xp->pr_state = cpr->pr_uref > 0 ? PRISON_STATE_ALIVE : PRISON_STATE_DYING; - strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path)); - strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host)); - strlcpy(xp->pr_name, pr->pr_name, sizeof(xp->pr_name)); + strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path)); + strlcpy(xp->pr_host, cpr->pr_host, sizeof(xp->pr_host)); + strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name)); #ifdef INET - xp->pr_ip4s = pr->pr_ip4s; + xp->pr_ip4s = cpr->pr_ip4s; #endif #ifdef INET6 - xp->pr_ip6s = pr->pr_ip6s; + xp->pr_ip6s = cpr->pr_ip6s; #endif - mtx_unlock(&pr->pr_mtx); + mtx_unlock(&cpr->pr_mtx); error = SYSCTL_OUT(req, xp, sizeof(*xp)); if (error) break; @@ -2642,15 +3538,197 @@ sysctl_jail_jailed(SYSCTL_HANDLER_ARGS) return (error); } + SYSCTL_PROC(_security_jail, OID_AUTO, jailed, CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_jail_jailed, "I", "Process in jail?"); +#if defined(INET) || defined(INET6) +SYSCTL_INT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW, + &jail_max_af_ips, 0, + "Number of IP addresses a jail may have at most per address family"); +#endif + +/* + * Default parameters for jail(2) compatability. For historical reasons, + * the sysctl names have varying similarity to the parameter names. Prisons + * just see their own parameters, and can't change them. + */ +static int +sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS) +{ + struct prison *pr; + int allow, error, i; + + pr = req->td->td_ucred->cr_prison; + allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow; + + /* Get the current flag value, and convert it to a boolean. */ + i = (allow & arg2) ? 1 : 0; + if (arg1 != NULL) + i = !i; + error = sysctl_handle_int(oidp, &i, 0, req); + if (error || !req->newptr) + return (error); + i = i ? arg2 : 0; + if (arg1 != NULL) + i ^= arg2; + /* + * The sysctls don't have CTLFLAGS_PRISON, so assume prison0 + * for writing. + */ + mtx_lock(&prison0.pr_mtx); + jail_default_allow = (jail_default_allow & ~arg2) | i; + mtx_unlock(&prison0.pr_mtx); + return (0); +} + +SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I", + "Processes in jail can set their hostnames"); +SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I", + "Processes in jail are limited to creating UNIX/IP/route sockets only"); +SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I", + "Processes in jail can use System V IPC primitives"); +SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I", + "Prison root can create raw sockets"); +SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I", + "Processes in jail can alter system file flags"); +SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I", + "Processes in jail can mount/unmount jail-friendly file systems"); + +static int +sysctl_jail_default_level(SYSCTL_HANDLER_ARGS) +{ + struct prison *pr; + int level, error; + + pr = req->td->td_ucred->cr_prison; + level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2); + error = sysctl_handle_int(oidp, &level, 0, req); + if (error || !req->newptr) + return (error); + *(int *)arg1 = level; + return (0); +} + +SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs, + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs), + sysctl_jail_default_level, "I", + "Processes in jail cannot see all mounted file systems"); + +/* + * Nodes to describe jail parameters. Maximum length of string parameters + * is returned in the string itself, and the other parameters exist merely + * to make themselves and their types known. + */ +SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0, + "Jail parameters"); + +int +sysctl_jail_param(SYSCTL_HANDLER_ARGS) +{ + int i; + long l; + size_t s; + char numbuf[12]; + + switch (oidp->oid_kind & CTLTYPE) + { + case CTLTYPE_LONG: + case CTLTYPE_ULONG: + l = 0; +#ifdef SCTL_MASK32 + if (!(req->flags & SCTL_MASK32)) +#endif + return (SYSCTL_OUT(req, &l, sizeof(l))); + case CTLTYPE_INT: + case CTLTYPE_UINT: + i = 0; + return (SYSCTL_OUT(req, &i, sizeof(i))); + case CTLTYPE_STRING: + snprintf(numbuf, sizeof(numbuf), "%d", arg2); + return + (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req)); + case CTLTYPE_STRUCT: + s = (size_t)arg2; + return (SYSCTL_OUT(req, &s, sizeof(s))); + } + return (0); +} + +SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID"); +SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID"); +SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name"); +SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path"); +SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW, + "I", "Jail secure level"); +SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW, + "I", "Jail cannot see all mounted file systems"); +SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW, + "B", "Jail persistence"); +SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD, + "B", "Jail is in the process of shutting down"); + +SYSCTL_JAIL_PARAM_NODE(host, "Jail host info"); +SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN, + "Jail hostname"); + +SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset"); +SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID"); + +#ifdef INET +SYSCTL_JAIL_PARAM_NODE(ip4, "Jail IPv4 address virtualization"); +SYSCTL_JAIL_PARAM(, noip4, CTLTYPE_INT | CTLFLAG_RW, + "BN", "Jail w/ no IP address virtualization"); +SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr), + "S,in_addr,a", "Jail IPv4 addresses"); +#endif +#ifdef INET6 +SYSCTL_JAIL_PARAM_NODE(ip6, "Jail IPv6 address virtualization"); +SYSCTL_JAIL_PARAM(, noip6, CTLTYPE_INT | CTLFLAG_RW, + "BN", "Jail w/ no IP address virtualization"); +SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr), + "S,in6_addr,a", "Jail IPv6 addresses"); +#endif + +SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags"); +SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW, + "B", "Jail may set hostname"); +SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW, + "B", "Jail may use SYSV IPC"); +SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW, + "B", "Jail may create raw sockets"); +SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW, + "B", "Jail may alter system file flags"); +SYSCTL_JAIL_PARAM(_allow, mount, CTLTYPE_INT | CTLFLAG_RW, + "B", "Jail may mount/unmount jail-friendly file systems"); +SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW, + "B", "Jail may set file quotas"); +SYSCTL_JAIL_PARAM(_allow, jails, CTLTYPE_INT | CTLFLAG_RW, + "B", "Jail may create child jails"); +SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW, + "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route"); + + #ifdef DDB static void db_show_prison(struct prison *pr) { + int fi; #if defined(INET) || defined(INET6) int ii; #endif @@ -2661,6 +3739,7 @@ db_show_prison(struct prison *pr) db_printf("prison %p:\n", pr); db_printf(" jid = %d\n", pr->pr_id); db_printf(" name = %s\n", pr->pr_name); + db_printf(" parent = %p\n", pr->pr_parent); db_printf(" ref = %d\n", pr->pr_ref); db_printf(" uref = %d\n", pr->pr_uref); db_printf(" path = %s\n", pr->pr_path); @@ -2668,10 +3747,20 @@ db_show_prison(struct prison *pr) ? pr->pr_cpuset->cs_id : -1); db_printf(" root = %p\n", pr->pr_root); db_printf(" securelevel = %d\n", pr->pr_securelevel); + db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children)); + db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling)); db_printf(" flags = %x", pr->pr_flags); - if (pr->pr_flags & PR_PERSIST) - db_printf(" persist"); + for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]); + fi++) + if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi))) + db_printf(" %s", pr_flag_names[fi]); + db_printf(" allow = %x", pr->pr_allow); + for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]); + fi++) + if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi))) + db_printf(" %s", pr_allow_names[fi]); db_printf("\n"); + db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs); db_printf(" host.hostname = %s\n", pr->pr_host); #ifdef INET db_printf(" ip4s = %d\n", pr->pr_ip4s); @@ -2694,27 +3783,37 @@ DB_SHOW_COMMAND(prison, db_show_prison_command) struct prison *pr; if (!have_addr) { - /* Show all prisons in the list. */ - TAILQ_FOREACH(pr, &allprison, pr_list) { - db_show_prison(pr); - if (db_pager_quit) - break; + /* + * Show all prisons in the list, and prison0 which is not + * listed. + */ + db_show_prison(&prison0); + if (!db_pager_quit) { + TAILQ_FOREACH(pr, &allprison, pr_list) { + db_show_prison(pr); + if (db_pager_quit) + break; + } } return; } - /* Look for a prison with the ID and with references. */ - TAILQ_FOREACH(pr, &allprison, pr_list) - if (pr->pr_id == addr && pr->pr_ref > 0) - break; - if (pr == NULL) - /* Look again, without requiring a reference. */ + if (addr == 0) + pr = &prison0; + else { + /* Look for a prison with the ID and with references. */ TAILQ_FOREACH(pr, &allprison, pr_list) - if (pr->pr_id == addr) + if (pr->pr_id == addr && pr->pr_ref > 0) break; - if (pr == NULL) - /* Assume address points to a valid prison. */ - pr = (struct prison *)addr; + if (pr == NULL) + /* Look again, without requiring a reference. */ + TAILQ_FOREACH(pr, &allprison, pr_list) + if (pr->pr_id == addr) + break; + if (pr == NULL) + /* Assume address points to a valid prison. */ + pr = (struct prison *)addr; + } db_show_prison(pr); } diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c index 9f2a17ce4e26..f032d439d471 100644 --- a/sys/kern/kern_linker.c +++ b/sys/kern/kern_linker.c @@ -46,6 +46,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -375,7 +376,7 @@ linker_load_file(const char *filename, linker_file_t *result) int foundfile, error; /* Refuse to load modules if securelevel raised */ - if (securelevel > 0) + if (prison0.pr_securelevel > 0) return (EPERM); KLD_LOCK_ASSERT(); @@ -580,7 +581,7 @@ linker_file_unload(linker_file_t file, int flags) int error, i; /* Refuse to unload modules if securelevel raised. */ - if (securelevel > 0) + if (prison0.pr_securelevel > 0) return (EPERM); KLD_LOCK_ASSERT(); diff --git a/sys/kern/kern_mib.c b/sys/kern/kern_mib.c index 43fcfa4ae0e3..38116d531ec8 100644 --- a/sys/kern/kern_mib.c +++ b/sys/kern/kern_mib.c @@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -228,8 +229,8 @@ sysctl_hostname(SYSCTL_HANDLER_ARGS) int error; pr = req->td->td_ucred->cr_prison; - if (pr != NULL) { - if (!jail_set_hostname_allowed && req->newptr) + if (pr != &prison0) { + if (!(pr->pr_allow & PR_ALLOW_SET_HOSTNAME) && req->newptr) return (EPERM); /* * Process is in jail, so make a local copy of jail @@ -259,9 +260,12 @@ sysctl_hostname(SYSCTL_HANDLER_ARGS) error = sysctl_handle_string(oidp, tmphostname, sizeof tmphostname, req); if (req->newptr != NULL && error == 0) { + mtx_lock(&prison0.pr_mtx); mtx_lock(&hostname_mtx); + bcopy(tmphostname, prison0.pr_host, MAXHOSTNAMELEN); bcopy(tmphostname, V_hostname, MAXHOSTNAMELEN); mtx_unlock(&hostname_mtx); + mtx_unlock(&prison0.pr_mtx); } } return (error); @@ -278,55 +282,43 @@ SYSCTL_INT(_regression, OID_AUTO, securelevel_nonmonotonic, CTLFLAG_RW, ®ression_securelevel_nonmonotonic, 0, "securelevel may be lowered"); #endif -int securelevel = -1; -static struct mtx securelevel_mtx; - -MTX_SYSINIT(securelevel_lock, &securelevel_mtx, "securelevel mutex lock", - MTX_DEF); - static int sysctl_kern_securelvl(SYSCTL_HANDLER_ARGS) { - struct prison *pr; - int error, level; + struct prison *pr, *cpr; + int descend, error, level; pr = req->td->td_ucred->cr_prison; /* - * If the process is in jail, return the maximum of the global and - * local levels; otherwise, return the global level. Perform a - * lockless read since the securelevel is an integer. + * Reading the securelevel is easy, since the current jail's level + * is known to be at least as secure as any higher levels. Perform + * a lockless read since the securelevel is an integer. */ - if (pr != NULL) - level = imax(securelevel, pr->pr_securelevel); - else - level = securelevel; + level = pr->pr_securelevel; error = sysctl_handle_int(oidp, &level, 0, req); if (error || !req->newptr) return (error); - /* - * Permit update only if the new securelevel exceeds the - * global level, and local level if any. - */ - if (pr != NULL) { - mtx_lock(&pr->pr_mtx); - if (!regression_securelevel_nonmonotonic && - (level < imax(securelevel, pr->pr_securelevel))) { - mtx_unlock(&pr->pr_mtx); - return (EPERM); - } - pr->pr_securelevel = level; + /* Permit update only if the new securelevel exceeds the old. */ + sx_slock(&allprison_lock); + mtx_lock(&pr->pr_mtx); + if (!regression_securelevel_nonmonotonic && + level < pr->pr_securelevel) { mtx_unlock(&pr->pr_mtx); - } else { - mtx_lock(&securelevel_mtx); - if (!regression_securelevel_nonmonotonic && - (level < securelevel)) { - mtx_unlock(&securelevel_mtx); - return (EPERM); - } - securelevel = level; - mtx_unlock(&securelevel_mtx); + sx_sunlock(&allprison_lock); + return (EPERM); } + pr->pr_securelevel = level; + /* + * Set all child jails to be at least this level, but do not lower + * them (even if regression_securelevel_nonmonotonic). + */ + FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend) { + if (cpr->pr_securelevel < level) + cpr->pr_securelevel = level; + } + mtx_unlock(&pr->pr_mtx); + sx_sunlock(&allprison_lock); return (error); } diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c index 70dbb6098355..61aa1df5ee96 100644 --- a/sys/kern/kern_proc.c +++ b/sys/kern/kern_proc.c @@ -739,8 +739,8 @@ fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp) /* If jailed(cred), emulate the old P_JAILED flag. */ if (jailed(cred)) { kp->ki_flag |= P_JAILED; - /* If inside a jail, use 0 as a jail ID. */ - if (!jailed(curthread->td_ucred)) + /* If inside the jail, use 0 as a jail ID. */ + if (cred->cr_prison != curthread->td_ucred->cr_prison) kp->ki_jid = cred->cr_prison->pr_id; } } diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c index b35f81f5316e..1c5f68bd42cf 100644 --- a/sys/kern/kern_prot.c +++ b/sys/kern/kern_prot.c @@ -1263,33 +1263,25 @@ groupmember(gid_t gid, struct ucred *cred) * (securelevel >= level). Note that the logic is inverted -- these * functions return EPERM on "success" and 0 on "failure". * + * Due to care taken when setting the securelevel, we know that no jail will + * be less secure that its parent (or the physical system), so it is sufficient + * to test the current jail only. + * * XXXRW: Possibly since this has to do with privilege, it should move to * kern_priv.c. */ int securelevel_gt(struct ucred *cr, int level) { - int active_securelevel; - active_securelevel = securelevel; - KASSERT(cr != NULL, ("securelevel_gt: null cr")); - if (cr->cr_prison != NULL) - active_securelevel = imax(cr->cr_prison->pr_securelevel, - active_securelevel); - return (active_securelevel > level ? EPERM : 0); + return (cr->cr_prison->pr_securelevel > level ? EPERM : 0); } int securelevel_ge(struct ucred *cr, int level) { - int active_securelevel; - active_securelevel = securelevel; - KASSERT(cr != NULL, ("securelevel_ge: null cr")); - if (cr->cr_prison != NULL) - active_securelevel = imax(cr->cr_prison->pr_securelevel, - active_securelevel); - return (active_securelevel >= level ? EPERM : 0); + return (cr->cr_prison->pr_securelevel >= level ? EPERM : 0); } /* @@ -1823,7 +1815,7 @@ crfree(struct ucred *cr) /* * Free a prison, if any. */ - if (jailed(cr)) + if (cr->cr_prison != NULL) prison_free(cr->cr_prison); #ifdef VIMAGE /* XXX TODO: find out why and when cr_vimage can be NULL here! */ @@ -1863,8 +1855,7 @@ crcopy(struct ucred *dest, struct ucred *src) (caddr_t)&src->cr_startcopy)); uihold(dest->cr_uidinfo); uihold(dest->cr_ruidinfo); - if (jailed(dest)) - prison_hold(dest->cr_prison); + prison_hold(dest->cr_prison); #ifdef VIMAGE KASSERT(src->cr_vimage != NULL, ("cr_vimage == NULL")); refcount_acquire(&dest->cr_vimage->vi_ucredrefc); diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c index fe92186c27b7..2a2a3d02b182 100644 --- a/sys/kern/sysv_msg.c +++ b/sys/kern/sysv_msg.c @@ -337,7 +337,7 @@ msgsys(td, uap) { int error; - if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); if (uap->which < 0 || uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0])) @@ -410,7 +410,7 @@ kern_msgctl(td, msqid, cmd, msqbuf) int rval, error, msqix; register struct msqid_kernel *msqkptr; - if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); msqix = IPCID_TO_IX(msqid); @@ -564,7 +564,7 @@ msgget(td, uap) DPRINTF(("msgget(0x%x, 0%o)\n", key, msgflg)); - if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); mtx_lock(&msq_mtx); @@ -674,7 +674,7 @@ kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype) register struct msg *msghdr; short next; - if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); mtx_lock(&msq_mtx); @@ -1012,7 +1012,7 @@ kern_msgrcv(td, msqid, msgp, msgsz, msgtyp, msgflg, mtype) int msqix, error = 0; short next; - if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); msqix = IPCID_TO_IX(msqid); diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c index 12a2c4e44c54..a99cf4ead45e 100644 --- a/sys/kern/sysv_sem.c +++ b/sys/kern/sysv_sem.c @@ -344,7 +344,7 @@ semsys(td, uap) { int error; - if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); if (uap->which < 0 || uap->which >= sizeof(semcalls)/sizeof(semcalls[0])) @@ -583,7 +583,7 @@ kern_semctl(struct thread *td, int semid, int semnum, int cmd, DPRINTF(("call to semctl(%d, %d, %d, 0x%p)\n", semid, semnum, cmd, arg)); - if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); array = NULL; @@ -855,7 +855,7 @@ semget(struct thread *td, struct semget_args *uap) struct ucred *cred = td->td_ucred; DPRINTF(("semget(0x%x, %d, 0%o)\n", key, nsems, semflg)); - if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); mtx_lock(&sem_mtx); @@ -982,7 +982,7 @@ semop(struct thread *td, struct semop_args *uap) #endif DPRINTF(("call to semop(%d, %p, %u)\n", semid, sops, nsops)); - if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c index 5311e082b76d..b374b5c9743f 100644 --- a/sys/kern/sysv_shm.c +++ b/sys/kern/sysv_shm.c @@ -303,7 +303,7 @@ shmdt(td, uap) int i; int error = 0; - if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); mtx_lock(&Giant); shmmap_s = p->p_vmspace->vm_shm; @@ -357,7 +357,7 @@ kern_shmat(td, shmid, shmaddr, shmflg) int rv; int error = 0; - if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); mtx_lock(&Giant); shmmap_s = p->p_vmspace->vm_shm; @@ -480,7 +480,7 @@ oshmctl(td, uap) struct shmid_kernel *shmseg; struct oshmid_ds outbuf; - if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); mtx_lock(&Giant); shmseg = shm_find_segment_by_shmid(uap->shmid); @@ -542,7 +542,7 @@ kern_shmctl(td, shmid, cmd, buf, bufsz) int error = 0; struct shmid_kernel *shmseg; - if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); mtx_lock(&Giant); @@ -823,7 +823,7 @@ shmget(td, uap) int segnum, mode; int error; - if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); mtx_lock(&Giant); mode = uap->shmflg & ACCESSPERMS; @@ -861,7 +861,7 @@ shmsys(td, uap) #if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43)) int error; - if (!jail_sysvipc_allowed && jailed(td->td_ucred)) + if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); if (uap->which < 0 || uap->which >= sizeof(shmcalls)/sizeof(shmcalls[0])) diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c index a6e663a402a5..4a520e901a86 100644 --- a/sys/kern/vfs_lookup.c +++ b/sys/kern/vfs_lookup.c @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -446,6 +447,7 @@ lookup(struct nameidata *ndp) struct vnode *dp = 0; /* the directory we are searching */ struct vnode *tdp; /* saved dp */ struct mount *mp; /* mount table entry */ + struct prison *pr; int docache; /* == 0 do not cache last component */ int wantparent; /* 1 => wantparent or lockparent flag */ int rdonly; /* lookup read-only flag bit */ @@ -603,9 +605,14 @@ lookup(struct nameidata *ndp) goto bad; } for (;;) { + for (pr = cnp->cn_cred->cr_prison; pr != NULL; + pr = pr->pr_parent) + if (dp == pr->pr_root) + break; if (dp == ndp->ni_rootdir || dp == ndp->ni_topdir || dp == rootvnode || + pr != NULL || ((dp->v_vflag & VV_ROOT) != 0 && (cnp->cn_flags & NOCROSSMOUNT) != 0)) { ndp->ni_dvp = dp; diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index becc525a969b..fa00290c98fd 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -1420,6 +1420,11 @@ static void root_mount_done(void) { + /* Keep prison0's root in sync with the global rootvnode. */ + mtx_lock(&prison0.pr_mtx); + prison0.pr_root = rootvnode; + vref(prison0.pr_root); + mtx_unlock(&prison0.pr_mtx); /* * Use a mutex to prevent the wakeup being missed and waiting for * an extra 1 second sleep. diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 8c26b13139bd..05d9de54a758 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -467,21 +467,13 @@ vfs_suser(struct mount *mp, struct thread *td) return (EPERM); /* - * If the file system was mounted outside a jail and a jailed thread - * tries to access it, deny immediately. + * If the file system was mounted outside the jail of the calling + * thread, deny immediately. */ - if (!jailed(mp->mnt_cred) && jailed(td->td_ucred)) + if (mp->mnt_cred->cr_prison != td->td_ucred->cr_prison && + !prison_ischild(td->td_ucred->cr_prison, mp->mnt_cred->cr_prison)) return (EPERM); - /* - * If the file system was mounted inside different jail that the jail of - * the calling thread, deny immediately. - */ - if (jailed(mp->mnt_cred) && jailed(td->td_ucred) && - mp->mnt_cred->cr_prison != td->td_ucred->cr_prison) { - return (EPERM); - } - /* * If file system supports delegated administration, we don't check * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified @@ -2900,7 +2892,7 @@ DB_SHOW_COMMAND(mount, db_show_mount) db_printf(" mnt_cred = { uid=%u ruid=%u", (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); - if (mp->mnt_cred->cr_prison != NULL) + if (jailed(mp->mnt_cred)) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); db_printf(" mnt_ref = %d\n", mp->mnt_ref); diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index ae3904df22bb..a99d435612cb 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -164,12 +164,6 @@ sync(td, uap) return (0); } -/* XXX PRISON: could be per prison flag */ -static int prison_quotas; -#if 0 -SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, ""); -#endif - /* * Change filesystem quotas. */ @@ -198,7 +192,7 @@ quotactl(td, uap) AUDIT_ARG(cmd, uap->cmd); AUDIT_ARG(uid, uap->uid); - if (jailed(td->td_ucred) && !prison_quotas) + if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS)) return (EPERM); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1, UIO_USERSPACE, uap->path, td); diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c index 95faba760476..125db4000288 100644 --- a/sys/net/rtsock.c +++ b/sys/net/rtsock.c @@ -373,6 +373,8 @@ rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, /* * As a last resort return the 'default' jail address. */ + ia = ((struct sockaddr_in *)rt->rt_ifa->ifa_addr)-> + sin_addr; if (prison_get_ip4(cred, &ia) != 0) return (ESRCH); } @@ -414,6 +416,8 @@ rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp, /* * As a last resort return the 'default' jail address. */ + ia6 = ((struct sockaddr_in6 *)rt->rt_ifa->ifa_addr)-> + sin6_addr; if (prison_get_ip6(cred, &ia6) != 0) return (ESRCH); } diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 94460bb0fc14..b507fbf4b64c 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -602,7 +602,7 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, goto done; } - if (cred == NULL || !jailed(cred)) { + if (cred == NULL || !prison_flag(cred, PR_IP4)) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } @@ -646,7 +646,7 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct ifnet *ifp; /* If not jailed, use the default returned. */ - if (cred == NULL || !jailed(cred)) { + if (cred == NULL || !prison_flag(cred, PR_IP4)) { ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; @@ -711,7 +711,7 @@ in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, if (ia == NULL) ia = ifatoia(ifa_ifwithnet(sintosa(&sain))); - if (cred == NULL || !jailed(cred)) { + if (cred == NULL || !prison_flag(cred, PR_IP4)) { #if __FreeBSD_version < 800000 if (ia == NULL) ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; @@ -1222,7 +1222,8 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, * Found? */ if (cred == NULL || - inp->inp_cred->cr_prison == cred->cr_prison) + prison_equal_ip4(cred->cr_prison, + inp->inp_cred->cr_prison)) return (inp); } } @@ -1254,7 +1255,8 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (cred != NULL && - inp->inp_cred->cr_prison != cred->cr_prison) + !prison_equal_ip4(inp->inp_cred->cr_prison, + cred->cr_prison)) continue; #ifdef INET6 /* XXX inp locking */ @@ -1335,7 +1337,7 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ - if (jailed(inp->inp_cred)) + if (prison_flag(inp->inp_cred, PR_IP4)) return (inp); if (tmpinp == NULL) tmpinp = inp; @@ -1380,7 +1382,7 @@ in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, (inp->inp_flags & INP_FAITH) == 0) continue; - injail = jailed(inp->inp_cred); + injail = prison_flag(inp->inp_cred, PR_IP4); if (injail) { if (prison_check_ip4(inp->inp_cred, &laddr) != 0) diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index a49240c14b0c..caeb2ae7665a 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -1008,7 +1008,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, * Remember addr if jailed, to prevent * rebinding. */ - if (jailed(td->td_ucred)) + if (prison_flag(td->td_ucred, PR_IP4)) inp->inp_laddr = laddr; inp->inp_lport = lport; if (in_pcbinshash(inp) != 0) { diff --git a/sys/netinet6/in6.c b/sys/netinet6/in6.c index 60d56a8ae8e7..921618e888a6 100644 --- a/sys/netinet6/in6.c +++ b/sys/netinet6/in6.c @@ -660,7 +660,6 @@ in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { INIT_VNET_INET6(ifp->if_vnet); - INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX V_hostname needs this */ int error = 0, hostIsNew = 0, plen = -1; struct in6_ifaddr *oia; struct sockaddr_in6 dst6; @@ -1017,7 +1016,6 @@ in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, /* * join node information group address */ -#define hostnamelen strlen(V_hostname) delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* @@ -1027,10 +1025,7 @@ in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } - mtx_lock(&hostname_mtx); - if (in6_nigroup(ifp, V_hostname, hostnamelen, - &mltaddr.sin6_addr) == 0) { - mtx_unlock(&hostname_mtx); + if (in6_nigroup(ifp, NULL, -1, &mltaddr.sin6_addr) == 0) { imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, delay); /* XXX jinmei */ if (!imm) { @@ -1044,9 +1039,7 @@ in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } - } else - mtx_unlock(&hostname_mtx); -#undef hostnamelen + } /* * join interface-local all-nodes address. diff --git a/sys/netinet6/in6_ifattach.c b/sys/netinet6/in6_ifattach.c index 4738f9149637..c1f828ba253d 100644 --- a/sys/netinet6/in6_ifattach.c +++ b/sys/netinet6/in6_ifattach.c @@ -620,23 +620,44 @@ int in6_nigroup(struct ifnet *ifp, const char *name, int namelen, struct in6_addr *in6) { + INIT_VPROCG(TD_TO_VPROCG(curthread)); /* XXX V_hostname needs this */ const char *p; u_char *q; MD5_CTX ctxt; + int use_hostname; u_int8_t digest[16]; char l; char n[64]; /* a single label must not exceed 63 chars */ - if (!namelen || !name) + /* + * If no name is given and namelen is -1, + * we try to do the hostname lookup ourselves. + */ + if (!name && namelen == -1) { + use_hostname = 1; + mtx_lock(&hostname_mtx); + name = V_hostname; + namelen = strlen(name); + } else + use_hostname = 0; + if (!name || !namelen) { + if (use_hostname) + mtx_unlock(&hostname_mtx); return -1; + } p = name; while (p && *p && *p != '.' && p - name < namelen) p++; - if (p - name > sizeof(n) - 1) + if (p == name || p - name > sizeof(n) - 1) { + if (use_hostname) + mtx_unlock(&hostname_mtx); return -1; /* label too long */ + } l = p - name; strncpy(n, name, l); + if (use_hostname) + mtx_unlock(&hostname_mtx); n[(int)l] = '\0'; for (q = n; *q; q++) { if ('A' <= *q && *q <= 'Z') diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c index e446a059416f..e5727120c2c3 100644 --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -666,7 +666,8 @@ in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr, inp->inp_lport == lport) { /* Found. */ if (cred == NULL || - inp->inp_cred->cr_prison == cred->cr_prison) + prison_equal_ip6(cred->cr_prison, + inp->inp_cred->cr_prison)) return (inp); } } @@ -698,7 +699,8 @@ in6_pcblookup_local(struct inpcbinfo *pcbinfo, struct in6_addr *laddr, LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (cred != NULL && - inp->inp_cred->cr_prison != cred->cr_prison) + !prison_equal_ip6(cred->cr_prison, + inp->inp_cred->cr_prison)) continue; /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) @@ -838,7 +840,7 @@ in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, * the inp here, without any checks. * Well unless both bound with SO_REUSEPORT? */ - if (jailed(inp->inp_cred)) + if (prison_flag(inp->inp_cred, PR_IP6)) return (inp); if (tmpinp == NULL) tmpinp = inp; @@ -878,7 +880,7 @@ in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, if (faith && (inp->inp_flags & INP_FAITH) == 0) continue; - injail = jailed(inp->inp_cred); + injail = prison_flag(inp->inp_cred, PR_IP6); if (injail) { if (prison_check_ip6(inp->inp_cred, laddr) != 0) diff --git a/sys/nfsserver/nfs_srvsock.c b/sys/nfsserver/nfs_srvsock.c index 6f42e31939a6..639772b8a3b5 100644 --- a/sys/nfsserver/nfs_srvsock.c +++ b/sys/nfsserver/nfs_srvsock.c @@ -43,6 +43,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -699,6 +700,8 @@ nfsrv_dorec(struct nfssvc_sock *slp, struct nfsd *nfsd, nd = malloc(sizeof (struct nfsrv_descript), M_NFSRVDESC, M_WAITOK); nd->nd_cr = crget(); + prison_hold(&prison0); + nd->nd_cr->cr_prison = &prison0; NFSD_LOCK(); nd->nd_md = nd->nd_mrep = m; nd->nd_nam2 = nam; diff --git a/sys/security/mac_bsdextended/mac_bsdextended.c b/sys/security/mac_bsdextended/mac_bsdextended.c index cc3685156dea..c37d02adfb7f 100644 --- a/sys/security/mac_bsdextended/mac_bsdextended.c +++ b/sys/security/mac_bsdextended/mac_bsdextended.c @@ -271,8 +271,8 @@ ugidfw_rulecheck(struct mac_bsdextended_rule *rule, } if (rule->mbr_subject.mbs_flags & MBS_PRISON_DEFINED) { - match = (cred->cr_prison != NULL && - cred->cr_prison->pr_id == rule->mbr_subject.mbs_prison); + match = + (cred->cr_prison->pr_id == rule->mbr_subject.mbs_prison); if (rule->mbr_subject.mbs_neg & MBS_PRISON_DEFINED) match = !match; if (!match) diff --git a/sys/sys/cpuset.h b/sys/sys/cpuset.h index 326160d7f617..1b0bd319f8a3 100644 --- a/sys/sys/cpuset.h +++ b/sys/sys/cpuset.h @@ -169,14 +169,14 @@ struct cpuset { #define CPU_SET_RDONLY 0x0002 /* No modification allowed. */ extern cpuset_t *cpuset_root; +struct prison; struct proc; -struct thread; struct cpuset *cpuset_thread0(void); struct cpuset *cpuset_ref(struct cpuset *); void cpuset_rel(struct cpuset *); int cpuset_setthread(lwpid_t id, cpuset_t *); -int cpuset_create_root(struct thread *, struct cpuset **); +int cpuset_create_root(struct prison *, struct cpuset **); int cpuset_setproc_update_set(struct proc *, struct cpuset *); #else diff --git a/sys/sys/jail.h b/sys/sys/jail.h index 60e2d324b66e..fc466e9ea57e 100644 --- a/sys/sys/jail.h +++ b/sys/sys/jail.h @@ -122,8 +122,8 @@ int jail_remove(int); #include #include -#include -#include +#include +#include #include #define JAIL_MAX 999999 @@ -137,8 +137,6 @@ MALLOC_DECLARE(M_PRISON); #include -struct cpuset; - /* * This structure describes a prison. It is pointed to by all struct * ucreds's of the inmates. pr_ref keeps track of them and is used to @@ -162,7 +160,7 @@ struct prison { struct vnode *pr_root; /* (c) vnode to rdir */ char pr_host[MAXHOSTNAMELEN]; /* (p) jail hostname */ char pr_name[MAXHOSTNAMELEN]; /* (p) admin jail name */ - void *pr_spare; /* was pr_linux */ + struct prison *pr_parent; /* (c) containing jail */ int pr_securelevel; /* (p) securelevel */ struct task pr_task; /* (d) destroy task */ struct mtx pr_mtx; @@ -171,15 +169,37 @@ struct prison { struct in_addr *pr_ip4; /* (p) v4 IPs of jail */ int pr_ip6s; /* (p) number of v6 IPs */ struct in6_addr *pr_ip6; /* (p) v6 IPs of jail */ + LIST_HEAD(, prison) pr_children; /* (a) list of child jails */ + LIST_ENTRY(prison) pr_sibling; /* (a) next in parent's list */ + int pr_prisoncount; /* (a) number of child jails */ + unsigned pr_allow; /* (p) PR_ALLOW_* flags */ + int pr_enforce_statfs; /* (p) statfs permission */ }; #endif /* _KERNEL || _WANT_PRISON */ #ifdef _KERNEL -/* - * Flag bits set via options or internally - */ +/* Flag bits set via options */ #define PR_PERSIST 0x00000001 /* Can exist without processes */ +#define PR_IP4_USER 0x00000004 /* Virtualize IPv4 addresses */ +#define PR_IP6_USER 0x00000008 /* Virtualize IPv6 addresses */ + +/* Internal flag bits */ #define PR_REMOVE 0x01000000 /* In process of being removed */ +#define PR_IP4 0x02000000 /* IPv4 virtualized by this jail or */ + /* an ancestor */ +#define PR_IP6 0x04000000 /* IPv6 virtualized by this jail or */ + /* an ancestor */ + +/* Flags for pr_allow */ +#define PR_ALLOW_SET_HOSTNAME 0x0001 +#define PR_ALLOW_SYSVIPC 0x0002 +#define PR_ALLOW_RAW_SOCKETS 0x0004 +#define PR_ALLOW_CHFLAGS 0x0008 +#define PR_ALLOW_MOUNT 0x0010 +#define PR_ALLOW_QUOTAS 0x0020 +#define PR_ALLOW_JAILS 0x0040 +#define PR_ALLOW_SOCKET_AF 0x0080 +#define PR_ALLOW_ALL 0x00ff /* * OSD methods @@ -192,16 +212,68 @@ struct prison { #define PR_MAXMETHOD 5 /* - * Sysctl-set variables that determine global jail policy - * - * XXX MIB entries will need to be protected by a mutex. + * Lock/unlock a prison. + * XXX These exist not so much for general convenience, but to be useable in + * the FOREACH_PRISON_DESCENDANT_LOCKED macro which can't handle them in + * non-function form as currently defined. */ -extern int jail_set_hostname_allowed; -extern int jail_socket_unixiproute_only; -extern int jail_sysvipc_allowed; -extern int jail_getfsstat_jailrootonly; -extern int jail_allow_raw_sockets; -extern int jail_chflags_allowed; +static __inline void +prison_lock(struct prison *pr) +{ + + mtx_lock(&pr->pr_mtx); +} + +static __inline void +prison_unlock(struct prison *pr) +{ + + mtx_unlock(&pr->pr_mtx); +} + +/* Traverse a prison's immediate children. */ +#define FOREACH_PRISON_CHILD(ppr, cpr) \ + LIST_FOREACH(cpr, &(ppr)->pr_children, pr_sibling) + +/* + * Preorder traversal of all of a prison's descendants. + * This ugly loop allows the macro to be followed by a single block + * as expected in a looping primitive. + */ +#define FOREACH_PRISON_DESCENDANT(ppr, cpr, descend) \ + for ((cpr) = (ppr), (descend) = 1; \ + ((cpr) = (((descend) && !LIST_EMPTY(&(cpr)->pr_children)) \ + ? LIST_FIRST(&(cpr)->pr_children) \ + : ((cpr) == (ppr) \ + ? NULL \ + : (((descend) = LIST_NEXT(cpr, pr_sibling) != NULL) \ + ? LIST_NEXT(cpr, pr_sibling) \ + : (cpr)->pr_parent))));) \ + if (!(descend)) \ + ; \ + else + +/* + * As above, but lock descendants on the way down and unlock on the way up. + */ +#define FOREACH_PRISON_DESCENDANT_LOCKED(ppr, cpr, descend) \ + for ((cpr) = (ppr), (descend) = 1; \ + ((cpr) = (((descend) && !LIST_EMPTY(&(cpr)->pr_children)) \ + ? LIST_FIRST(&(cpr)->pr_children) \ + : ((cpr) == (ppr) \ + ? NULL \ + : ((prison_unlock(cpr), \ + (descend) = LIST_NEXT(cpr, pr_sibling) != NULL) \ + ? LIST_NEXT(cpr, pr_sibling) \ + : (cpr)->pr_parent))));) \ + if ((descend) ? (prison_lock(cpr), 0) : 1) \ + ; \ + else + +/* + * Attributes of the physical system, and the root of the jail tree. + */ +extern struct prison prison0; TAILQ_HEAD(prisonlist, prison); extern struct prisonlist allprison; @@ -235,23 +307,29 @@ struct sockaddr; struct statfs; int jailed(struct ucred *cred); void getcredhostname(struct ucred *cred, char *, size_t); +int prison_allow(struct ucred *, unsigned); int prison_check(struct ucred *cred1, struct ucred *cred2); int prison_canseemount(struct ucred *cred, struct mount *mp); void prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp); struct prison *prison_find(int prid); -struct prison *prison_find_name(const char *name); +struct prison *prison_find_child(struct prison *, int); +struct prison *prison_find_name(struct prison *, const char *); +int prison_flag(struct ucred *, unsigned); void prison_free(struct prison *pr); void prison_free_locked(struct prison *pr); void prison_hold(struct prison *pr); void prison_hold_locked(struct prison *pr); void prison_proc_hold(struct prison *); void prison_proc_free(struct prison *); +int prison_ischild(struct prison *, struct prison *); +int prison_equal_ip4(struct prison *, struct prison *); int prison_get_ip4(struct ucred *cred, struct in_addr *ia); int prison_local_ip4(struct ucred *cred, struct in_addr *ia); int prison_remote_ip4(struct ucred *cred, struct in_addr *ia); int prison_check_ip4(struct ucred *cred, struct in_addr *ia); #ifdef INET6 +int prison_equal_ip6(struct prison *, struct prison *); int prison_get_ip6(struct ucred *, struct in6_addr *); int prison_local_ip6(struct ucred *, struct in6_addr *, int); int prison_remote_ip6(struct ucred *, struct in6_addr *); @@ -259,6 +337,7 @@ int prison_check_ip6(struct ucred *, struct in6_addr *); #endif int prison_check_af(struct ucred *cred, int af); int prison_if(struct ucred *cred, struct sockaddr *sa); +char *prison_name(struct prison *, struct prison *); int prison_priv_check(struct ucred *cred, int priv); int sysctl_jail_param(struct sysctl_oid *, void *, int , struct sysctl_req *); diff --git a/sys/sys/param.h b/sys/sys/param.h index eb8c57f3c3a9..834dde3da4ef 100644 --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -57,7 +57,7 @@ * is created, otherwise 1. */ #undef __FreeBSD_version -#define __FreeBSD_version 800090 /* Master, propagated to newvers */ +#define __FreeBSD_version 800091 /* Master, propagated to newvers */ #ifndef LOCORE #include diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h index 1dcc8022cfe5..e814a001b834 100644 --- a/sys/sys/syscallsubr.h +++ b/sys/sys/syscallsubr.h @@ -37,6 +37,7 @@ struct file; struct itimerval; struct image_args; +struct jail; struct mbuf; struct msghdr; struct msqid_ds; @@ -105,6 +106,7 @@ int kern_getsockname(struct thread *td, int fd, struct sockaddr **sa, int kern_getsockopt(struct thread *td, int s, int level, int name, void *optval, enum uio_seg valseg, socklen_t *valsize); int kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data); +int kern_jail(struct thread *td, struct jail *j); int kern_jail_get(struct thread *td, struct uio *options, int flags); int kern_jail_set(struct thread *td, struct uio *options, int flags); int kern_kevent(struct thread *td, int fd, int nchanges, int nevents, diff --git a/sys/sys/systm.h b/sys/sys/systm.h index c20536a01a3c..1956a8f122f9 100644 --- a/sys/sys/systm.h +++ b/sys/sys/systm.h @@ -45,8 +45,6 @@ #include #include /* for people using printf mainly */ -extern int securelevel; /* system security level (see init(8)) */ - extern int cold; /* nonzero if we are doing a cold boot */ extern int rebooting; /* boot() has been called. */ extern const char *panicstr; /* panic message */ diff --git a/sys/ufs/ufs/ufs_vnops.c b/sys/ufs/ufs/ufs_vnops.c index 723cc890d867..49ced4373aad 100644 --- a/sys/ufs/ufs/ufs_vnops.c +++ b/sys/ufs/ufs/ufs_vnops.c @@ -61,7 +61,6 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include #include