1
0
mirror of https://git.FreeBSD.org/src.git synced 2024-12-18 10:35:55 +00:00
freebsd/sys/nfsclient/nfs_bio.c

1235 lines
32 KiB
C
Raw Normal View History

1994-05-24 10:09:53 +00:00
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Rick Macklem at The University of Guelph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
This mega-commit is meant to fix numerous interrelated problems. There has been some bitrot and incorrect assumptions in the vfs_bio code. These problems have manifest themselves worse on NFS type filesystems, but can still affect local filesystems under certain circumstances. Most of the problems have involved mmap consistancy, and as a side-effect broke the vfs.ioopt code. This code might have been committed seperately, but almost everything is interrelated. 1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that are fully valid. 2) Rather than deactivating erroneously read initial (header) pages in kern_exec, we now free them. 3) Fix the rundown of non-VMIO buffers that are in an inconsistent (missing vp) state. 4) Fix the disassociation of pages from buffers in brelse. The previous code had rotted and was faulty in a couple of important circumstances. 5) Remove a gratuitious buffer wakeup in vfs_vmio_release. 6) Remove a crufty and currently unused cluster mechanism for VBLK files in vfs_bio_awrite. When the code is functional, I'll add back a cleaner version. 7) The page busy count wakeups assocated with the buffer cache usage were incorrectly cleaned up in a previous commit by me. Revert to the original, correct version, but with a cleaner implementation. 8) The cluster read code now tries to keep data associated with buffers more aggressively (without breaking the heuristics) when it is presumed that the read data (buffers) will be soon needed. 9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The delay loop waiting is not useful for filesystem locks, due to the length of the time intervals. 10) Correct and clean-up spec_getpages. 11) Implement a fully functional nfs_getpages, nfs_putpages. 12) Fix nfs_write so that modifications are coherent with the NFS data on the server disk (at least as well as NFS seems to allow.) 13) Properly support MS_INVALIDATE on NFS. 14) Properly pass down MS_INVALIDATE to lower levels of the VM code from vm_map_clean. 15) Better support the notion of pages being busy but valid, so that fewer in-transit waits occur. (use p->busy more for pageouts instead of PG_BUSY.) Since the page is fully valid, it is still usable for reads. 16) It is possible (in error) for cached pages to be busy. Make the page allocation code handle that case correctly. (It should probably be a printf or panic, but I want the system to handle coding errors robustly. I'll probably add a printf.) 17) Correct the design and usage of vm_page_sleep. It didn't handle consistancy problems very well, so make the design a little less lofty. After vm_page_sleep, if it ever blocked, it is still important to relookup the page (if the object generation count changed), and verify it's status (always.) 18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up. 19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush. 20) Fix vm_pager_put_pages and it's descendents to support an int flag instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
* $Id: nfs_bio.c,v 1.51 1998/03/06 09:46:43 msmith Exp $
1994-05-24 10:09:53 +00:00
*/
1994-05-24 10:09:53 +00:00
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
1994-05-24 10:09:53 +00:00
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_prot.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
1994-05-24 10:09:53 +00:00
#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
1994-05-24 10:09:53 +00:00
#include <nfs/nfs.h>
#include <nfs/nfsmount.h>
#include <nfs/nqnfs.h>
#include <nfs/nfsnode.h>
1994-05-24 10:09:53 +00:00
1995-12-17 21:14:36 +00:00
static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
struct proc *p));
This mega-commit is meant to fix numerous interrelated problems. There has been some bitrot and incorrect assumptions in the vfs_bio code. These problems have manifest themselves worse on NFS type filesystems, but can still affect local filesystems under certain circumstances. Most of the problems have involved mmap consistancy, and as a side-effect broke the vfs.ioopt code. This code might have been committed seperately, but almost everything is interrelated. 1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that are fully valid. 2) Rather than deactivating erroneously read initial (header) pages in kern_exec, we now free them. 3) Fix the rundown of non-VMIO buffers that are in an inconsistent (missing vp) state. 4) Fix the disassociation of pages from buffers in brelse. The previous code had rotted and was faulty in a couple of important circumstances. 5) Remove a gratuitious buffer wakeup in vfs_vmio_release. 6) Remove a crufty and currently unused cluster mechanism for VBLK files in vfs_bio_awrite. When the code is functional, I'll add back a cleaner version. 7) The page busy count wakeups assocated with the buffer cache usage were incorrectly cleaned up in a previous commit by me. Revert to the original, correct version, but with a cleaner implementation. 8) The cluster read code now tries to keep data associated with buffers more aggressively (without breaking the heuristics) when it is presumed that the read data (buffers) will be soon needed. 9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The delay loop waiting is not useful for filesystem locks, due to the length of the time intervals. 10) Correct and clean-up spec_getpages. 11) Implement a fully functional nfs_getpages, nfs_putpages. 12) Fix nfs_write so that modifications are coherent with the NFS data on the server disk (at least as well as NFS seems to allow.) 13) Properly support MS_INVALIDATE on NFS. 14) Properly pass down MS_INVALIDATE to lower levels of the VM code from vm_map_clean. 15) Better support the notion of pages being busy but valid, so that fewer in-transit waits occur. (use p->busy more for pageouts instead of PG_BUSY.) Since the page is fully valid, it is still usable for reads. 16) It is possible (in error) for cached pages to be busy. Make the page allocation code handle that case correctly. (It should probably be a printf or panic, but I want the system to handle coding errors robustly. I'll probably add a printf.) 17) Correct the design and usage of vm_page_sleep. It didn't handle consistancy problems very well, so make the design a little less lofty. After vm_page_sleep, if it ever blocked, it is still important to relookup the page (if the object generation count changed), and verify it's status (always.) 18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up. 19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush. 20) Fix vm_pager_put_pages and it's descendents to support an int flag instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
static void nfs_prot_buf __P((struct buf *bp, int off, int n));
1994-05-24 10:09:53 +00:00
extern int nfs_numasync;
extern struct nfsstats nfsstats;
/*
* Vnode op for VM getpages.
*/
int
nfs_getpages(ap)
struct vop_getpages_args *ap;
{
This mega-commit is meant to fix numerous interrelated problems. There has been some bitrot and incorrect assumptions in the vfs_bio code. These problems have manifest themselves worse on NFS type filesystems, but can still affect local filesystems under certain circumstances. Most of the problems have involved mmap consistancy, and as a side-effect broke the vfs.ioopt code. This code might have been committed seperately, but almost everything is interrelated. 1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that are fully valid. 2) Rather than deactivating erroneously read initial (header) pages in kern_exec, we now free them. 3) Fix the rundown of non-VMIO buffers that are in an inconsistent (missing vp) state. 4) Fix the disassociation of pages from buffers in brelse. The previous code had rotted and was faulty in a couple of important circumstances. 5) Remove a gratuitious buffer wakeup in vfs_vmio_release. 6) Remove a crufty and currently unused cluster mechanism for VBLK files in vfs_bio_awrite. When the code is functional, I'll add back a cleaner version. 7) The page busy count wakeups assocated with the buffer cache usage were incorrectly cleaned up in a previous commit by me. Revert to the original, correct version, but with a cleaner implementation. 8) The cluster read code now tries to keep data associated with buffers more aggressively (without breaking the heuristics) when it is presumed that the read data (buffers) will be soon needed. 9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The delay loop waiting is not useful for filesystem locks, due to the length of the time intervals. 10) Correct and clean-up spec_getpages. 11) Implement a fully functional nfs_getpages, nfs_putpages. 12) Fix nfs_write so that modifications are coherent with the NFS data on the server disk (at least as well as NFS seems to allow.) 13) Properly support MS_INVALIDATE on NFS. 14) Properly pass down MS_INVALIDATE to lower levels of the VM code from vm_map_clean. 15) Better support the notion of pages being busy but valid, so that fewer in-transit waits occur. (use p->busy more for pageouts instead of PG_BUSY.) Since the page is fully valid, it is still usable for reads. 16) It is possible (in error) for cached pages to be busy. Make the page allocation code handle that case correctly. (It should probably be a printf or panic, but I want the system to handle coding errors robustly. I'll probably add a printf.) 17) Correct the design and usage of vm_page_sleep. It didn't handle consistancy problems very well, so make the design a little less lofty. After vm_page_sleep, if it ever blocked, it is still important to relookup the page (if the object generation count changed), and verify it's status (always.) 18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up. 19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush. 20) Fix vm_pager_put_pages and it's descendents to support an int flag instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
int i, error, nextoff, size, toff, npages;
struct uio uio;
struct iovec iov;
vm_page_t m;
vm_offset_t kva;
This mega-commit is meant to fix numerous interrelated problems. There has been some bitrot and incorrect assumptions in the vfs_bio code. These problems have manifest themselves worse on NFS type filesystems, but can still affect local filesystems under certain circumstances. Most of the problems have involved mmap consistancy, and as a side-effect broke the vfs.ioopt code. This code might have been committed seperately, but almost everything is interrelated. 1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that are fully valid. 2) Rather than deactivating erroneously read initial (header) pages in kern_exec, we now free them. 3) Fix the rundown of non-VMIO buffers that are in an inconsistent (missing vp) state. 4) Fix the disassociation of pages from buffers in brelse. The previous code had rotted and was faulty in a couple of important circumstances. 5) Remove a gratuitious buffer wakeup in vfs_vmio_release. 6) Remove a crufty and currently unused cluster mechanism for VBLK files in vfs_bio_awrite. When the code is functional, I'll add back a cleaner version. 7) The page busy count wakeups assocated with the buffer cache usage were incorrectly cleaned up in a previous commit by me. Revert to the original, correct version, but with a cleaner implementation. 8) The cluster read code now tries to keep data associated with buffers more aggressively (without breaking the heuristics) when it is presumed that the read data (buffers) will be soon needed. 9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The delay loop waiting is not useful for filesystem locks, due to the length of the time intervals. 10) Correct and clean-up spec_getpages. 11) Implement a fully functional nfs_getpages, nfs_putpages. 12) Fix nfs_write so that modifications are coherent with the NFS data on the server disk (at least as well as NFS seems to allow.) 13) Properly support MS_INVALIDATE on NFS. 14) Properly pass down MS_INVALIDATE to lower levels of the VM code from vm_map_clean. 15) Better support the notion of pages being busy but valid, so that fewer in-transit waits occur. (use p->busy more for pageouts instead of PG_BUSY.) Since the page is fully valid, it is still usable for reads. 16) It is possible (in error) for cached pages to be busy. Make the page allocation code handle that case correctly. (It should probably be a printf or panic, but I want the system to handle coding errors robustly. I'll probably add a printf.) 17) Correct the design and usage of vm_page_sleep. It didn't handle consistancy problems very well, so make the design a little less lofty. After vm_page_sleep, if it ever blocked, it is still important to relookup the page (if the object generation count changed), and verify it's status (always.) 18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up. 19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush. 20) Fix vm_pager_put_pages and it's descendents to support an int flag instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
struct buf *bp;
if ((ap->a_vp->v_object) == NULL) {
printf("nfs_getpages: called with non-merged cache vnode??\n");
return EOPNOTSUPP;
}
This mega-commit is meant to fix numerous interrelated problems. There has been some bitrot and incorrect assumptions in the vfs_bio code. These problems have manifest themselves worse on NFS type filesystems, but can still affect local filesystems under certain circumstances. Most of the problems have involved mmap consistancy, and as a side-effect broke the vfs.ioopt code. This code might have been committed seperately, but almost everything is interrelated. 1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that are fully valid. 2) Rather than deactivating erroneously read initial (header) pages in kern_exec, we now free them. 3) Fix the rundown of non-VMIO buffers that are in an inconsistent (missing vp) state. 4) Fix the disassociation of pages from buffers in brelse. The previous code had rotted and was faulty in a couple of important circumstances. 5) Remove a gratuitious buffer wakeup in vfs_vmio_release. 6) Remove a crufty and currently unused cluster mechanism for VBLK files in vfs_bio_awrite. When the code is functional, I'll add back a cleaner version. 7) The page busy count wakeups assocated with the buffer cache usage were incorrectly cleaned up in a previous commit by me. Revert to the original, correct version, but with a cleaner implementation. 8) The cluster read code now tries to keep data associated with buffers more aggressively (without breaking the heuristics) when it is presumed that the read data (buffers) will be soon needed. 9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The delay loop waiting is not useful for filesystem locks, due to the length of the time intervals. 10) Correct and clean-up spec_getpages. 11) Implement a fully functional nfs_getpages, nfs_putpages. 12) Fix nfs_write so that modifications are coherent with the NFS data on the server disk (at least as well as NFS seems to allow.) 13) Properly support MS_INVALIDATE on NFS. 14) Properly pass down MS_INVALIDATE to lower levels of the VM code from vm_map_clean. 15) Better support the notion of pages being busy but valid, so that fewer in-transit waits occur. (use p->busy more for pageouts instead of PG_BUSY.) Since the page is fully valid, it is still usable for reads. 16) It is possible (in error) for cached pages to be busy. Make the page allocation code handle that case correctly. (It should probably be a printf or panic, but I want the system to handle coding errors robustly. I'll probably add a printf.) 17) Correct the design and usage of vm_page_sleep. It didn't handle consistancy problems very well, so make the design a little less lofty. After vm_page_sleep, if it ever blocked, it is still important to relookup the page (if the object generation count changed), and verify it's status (always.) 18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up. 19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush. 20) Fix vm_pager_put_pages and it's descendents to support an int flag instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
/*
* We use only the kva address for the buffer, but this is extremely
* convienient and fast.
*/
bp = getpbuf();
npages = btoc(ap->a_count);
kva = (vm_offset_t) bp->b_data;
pmap_qenter(kva, ap->a_m, npages);
iov.iov_base = (caddr_t) kva;
This mega-commit is meant to fix numerous interrelated problems. There has been some bitrot and incorrect assumptions in the vfs_bio code. These problems have manifest themselves worse on NFS type filesystems, but can still affect local filesystems under certain circumstances. Most of the problems have involved mmap consistancy, and as a side-effect broke the vfs.ioopt code. This code might have been committed seperately, but almost everything is interrelated. 1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that are fully valid. 2) Rather than deactivating erroneously read initial (header) pages in kern_exec, we now free them. 3) Fix the rundown of non-VMIO buffers that are in an inconsistent (missing vp) state. 4) Fix the disassociation of pages from buffers in brelse. The previous code had rotted and was faulty in a couple of important circumstances. 5) Remove a gratuitious buffer wakeup in vfs_vmio_release. 6) Remove a crufty and currently unused cluster mechanism for VBLK files in vfs_bio_awrite. When the code is functional, I'll add back a cleaner version. 7) The page busy count wakeups assocated with the buffer cache usage were incorrectly cleaned up in a previous commit by me. Revert to the original, correct version, but with a cleaner implementation. 8) The cluster read code now tries to keep data associated with buffers more aggressively (without breaking the heuristics) when it is presumed that the read data (buffers) will be soon needed. 9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The delay loop waiting is not useful for filesystem locks, due to the length of the time intervals. 10) Correct and clean-up spec_getpages. 11) Implement a fully functional nfs_getpages, nfs_putpages. 12) Fix nfs_write so that modifications are coherent with the NFS data on the server disk (at least as well as NFS seems to allow.) 13) Properly support MS_INVALIDATE on NFS. 14) Properly pass down MS_INVALIDATE to lower levels of the VM code from vm_map_clean. 15) Better support the notion of pages being busy but valid, so that fewer in-transit waits occur. (use p->busy more for pageouts instead of PG_BUSY.) Since the page is fully valid, it is still usable for reads. 16) It is possible (in error) for cached pages to be busy. Make the page allocation code handle that case correctly. (It should probably be a printf or panic, but I want the system to handle coding errors robustly. I'll probably add a printf.) 17) Correct the design and usage of vm_page_sleep. It didn't handle consistancy problems very well, so make the design a little less lofty. After vm_page_sleep, if it ever blocked, it is still important to relookup the page (if the object generation count changed), and verify it's status (always.) 18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up. 19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush. 20) Fix vm_pager_put_pages and it's descendents to support an int flag instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
iov.iov_len = ap->a_count;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
This mega-commit is meant to fix numerous interrelated problems. There has been some bitrot and incorrect assumptions in the vfs_bio code. These problems have manifest themselves worse on NFS type filesystems, but can still affect local filesystems under certain circumstances. Most of the problems have involved mmap consistancy, and as a side-effect broke the vfs.ioopt code. This code might have been committed seperately, but almost everything is interrelated. 1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that are fully valid. 2) Rather than deactivating erroneously read initial (header) pages in kern_exec, we now free them. 3) Fix the rundown of non-VMIO buffers that are in an inconsistent (missing vp) state. 4) Fix the disassociation of pages from buffers in brelse. The previous code had rotted and was faulty in a couple of important circumstances. 5) Remove a gratuitious buffer wakeup in vfs_vmio_release. 6) Remove a crufty and currently unused cluster mechanism for VBLK files in vfs_bio_awrite. When the code is functional, I'll add back a cleaner version. 7) The page busy count wakeups assocated with the buffer cache usage were incorrectly cleaned up in a previous commit by me. Revert to the original, correct version, but with a cleaner implementation. 8) The cluster read code now tries to keep data associated with buffers more aggressively (without breaking the heuristics) when it is presumed that the read data (buffers) will be soon needed. 9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The delay loop waiting is not useful for filesystem locks, due to the length of the time intervals. 10) Correct and clean-up spec_getpages. 11) Implement a fully functional nfs_getpages, nfs_putpages. 12) Fix nfs_write so that modifications are coherent with the NFS data on the server disk (at least as well as NFS seems to allow.) 13) Properly support MS_INVALIDATE on NFS. 14) Properly pass down MS_INVALIDATE to lower levels of the VM code from vm_map_clean. 15) Better support the notion of pages being busy but valid, so that fewer in-transit waits occur. (use p->busy more for pageouts instead of PG_BUSY.) Since the page is fully valid, it is still usable for reads. 16) It is possible (in error) for cached pages to be busy. Make the page allocation code handle that case correctly. (It should probably be a printf or panic, but I want the system to handle coding errors robustly. I'll probably add a printf.) 17) Correct the design and usage of vm_page_sleep. It didn't handle consistancy problems very well, so make the design a little less lofty. After vm_page_sleep, if it ever blocked, it is still important to relookup the page (if the object generation count changed), and verify it's status (always.) 18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up. 19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush. 20) Fix vm_pager_put_pages and it's descendents to support an int flag instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
uio.uio_offset = IDX_TO_OFF(ap->a_m[0]->pindex);
uio.uio_resid = ap->a_count;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_READ;
uio.uio_procp = curproc;
error = nfs_readrpc(ap->a_vp, &uio, curproc->p_ucred);
This mega-commit is meant to fix numerous interrelated problems. There has been some bitrot and incorrect assumptions in the vfs_bio code. These problems have manifest themselves worse on NFS type filesystems, but can still affect local filesystems under certain circumstances. Most of the problems have involved mmap consistancy, and as a side-effect broke the vfs.ioopt code. This code might have been committed seperately, but almost everything is interrelated. 1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that are fully valid. 2) Rather than deactivating erroneously read initial (header) pages in kern_exec, we now free them. 3) Fix the rundown of non-VMIO buffers that are in an inconsistent (missing vp) state. 4) Fix the disassociation of pages from buffers in brelse. The previous code had rotted and was faulty in a couple of important circumstances. 5) Remove a gratuitious buffer wakeup in vfs_vmio_release. 6) Remove a crufty and currently unused cluster mechanism for VBLK files in vfs_bio_awrite. When the code is functional, I'll add back a cleaner version. 7) The page busy count wakeups assocated with the buffer cache usage were incorrectly cleaned up in a previous commit by me. Revert to the original, correct version, but with a cleaner implementation. 8) The cluster read code now tries to keep data associated with buffers more aggressively (without breaking the heuristics) when it is presumed that the read data (buffers) will be soon needed. 9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The delay loop waiting is not useful for filesystem locks, due to the length of the time intervals. 10) Correct and clean-up spec_getpages. 11) Implement a fully functional nfs_getpages, nfs_putpages. 12) Fix nfs_write so that modifications are coherent with the NFS data on the server disk (at least as well as NFS seems to allow.) 13) Properly support MS_INVALIDATE on NFS. 14) Properly pass down MS_INVALIDATE to lower levels of the VM code from vm_map_clean. 15) Better support the notion of pages being busy but valid, so that fewer in-transit waits occur. (use p->busy more for pageouts instead of PG_BUSY.) Since the page is fully valid, it is still usable for reads. 16) It is possible (in error) for cached pages to be busy. Make the page allocation code handle that case correctly. (It should probably be a printf or panic, but I want the system to handle coding errors robustly. I'll probably add a printf.) 17) Correct the design and usage of vm_page_sleep. It didn't handle consistancy problems very well, so make the design a little less lofty. After vm_page_sleep, if it ever blocked, it is still important to relookup the page (if the object generation count changed), and verify it's status (always.) 18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up. 19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush. 20) Fix vm_pager_put_pages and it's descendents to support an int flag instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
pmap_qremove(kva, npages);
relpbuf(bp);
if (error && (uio.uio_resid == ap->a_count))
return VM_PAGER_ERROR;
size = ap->a_count - uio.uio_resid;
This mega-commit is meant to fix numerous interrelated problems. There has been some bitrot and incorrect assumptions in the vfs_bio code. These problems have manifest themselves worse on NFS type filesystems, but can still affect local filesystems under certain circumstances. Most of the problems have involved mmap consistancy, and as a side-effect broke the vfs.ioopt code. This code might have been committed seperately, but almost everything is interrelated. 1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that are fully valid. 2) Rather than deactivating erroneously read initial (header) pages in kern_exec, we now free them. 3) Fix the rundown of non-VMIO buffers that are in an inconsistent (missing vp) state. 4) Fix the disassociation of pages from buffers in brelse. The previous code had rotted and was faulty in a couple of important circumstances. 5) Remove a gratuitious buffer wakeup in vfs_vmio_release. 6) Remove a crufty and currently unused cluster mechanism for VBLK files in vfs_bio_awrite. When the code is functional, I'll add back a cleaner version. 7) The page busy count wakeups assocated with the buffer cache usage were incorrectly cleaned up in a previous commit by me. Revert to the original, correct version, but with a cleaner implementation. 8) The cluster read code now tries to keep data associated with buffers more aggressively (without breaking the heuristics) when it is presumed that the read data (buffers) will be soon needed. 9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The delay loop waiting is not useful for filesystem locks, due to the length of the time intervals. 10) Correct and clean-up spec_getpages. 11) Implement a fully functional nfs_getpages, nfs_putpages. 12) Fix nfs_write so that modifications are coherent with the NFS data on the server disk (at least as well as NFS seems to allow.) 13) Properly support MS_INVALIDATE on NFS. 14) Properly pass down MS_INVALIDATE to lower levels of the VM code from vm_map_clean. 15) Better support the notion of pages being busy but valid, so that fewer in-transit waits occur. (use p->busy more for pageouts instead of PG_BUSY.) Since the page is fully valid, it is still usable for reads. 16) It is possible (in error) for cached pages to be busy. Make the page allocation code handle that case correctly. (It should probably be a printf or panic, but I want the system to handle coding errors robustly. I'll probably add a printf.) 17) Correct the design and usage of vm_page_sleep. It didn't handle consistancy problems very well, so make the design a little less lofty. After vm_page_sleep, if it ever blocked, it is still important to relookup the page (if the object generation count changed), and verify it's status (always.) 18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up. 19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush. 20) Fix vm_pager_put_pages and it's descendents to support an int flag instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
vm_page_t m;
nextoff = toff + PAGE_SIZE;
m = ap->a_m[i];
m->flags &= ~PG_ZERO;
if (nextoff <= size) {
m->valid = VM_PAGE_BITS_ALL;
m->dirty = 0;
} else {
int nvalid = ((size + DEV_BSIZE - 1) - toff) & ~(DEV_BSIZE - 1);
vm_page_set_validclean(m, 0, nvalid);
}
if (i != ap->a_reqpage) {
This mega-commit is meant to fix numerous interrelated problems. There has been some bitrot and incorrect assumptions in the vfs_bio code. These problems have manifest themselves worse on NFS type filesystems, but can still affect local filesystems under certain circumstances. Most of the problems have involved mmap consistancy, and as a side-effect broke the vfs.ioopt code. This code might have been committed seperately, but almost everything is interrelated. 1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that are fully valid. 2) Rather than deactivating erroneously read initial (header) pages in kern_exec, we now free them. 3) Fix the rundown of non-VMIO buffers that are in an inconsistent (missing vp) state. 4) Fix the disassociation of pages from buffers in brelse. The previous code had rotted and was faulty in a couple of important circumstances. 5) Remove a gratuitious buffer wakeup in vfs_vmio_release. 6) Remove a crufty and currently unused cluster mechanism for VBLK files in vfs_bio_awrite. When the code is functional, I'll add back a cleaner version. 7) The page busy count wakeups assocated with the buffer cache usage were incorrectly cleaned up in a previous commit by me. Revert to the original, correct version, but with a cleaner implementation. 8) The cluster read code now tries to keep data associated with buffers more aggressively (without breaking the heuristics) when it is presumed that the read data (buffers) will be soon needed. 9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The delay loop waiting is not useful for filesystem locks, due to the length of the time intervals. 10) Correct and clean-up spec_getpages. 11) Implement a fully functional nfs_getpages, nfs_putpages. 12) Fix nfs_write so that modifications are coherent with the NFS data on the server disk (at least as well as NFS seems to allow.) 13) Properly support MS_INVALIDATE on NFS. 14) Properly pass down MS_INVALIDATE to lower levels of the VM code from vm_map_clean. 15) Better support the notion of pages being busy but valid, so that fewer in-transit waits occur. (use p->busy more for pageouts instead of PG_BUSY.) Since the page is fully valid, it is still usable for reads. 16) It is possible (in error) for cached pages to be busy. Make the page allocation code handle that case correctly. (It should probably be a printf or panic, but I want the system to handle coding errors robustly. I'll probably add a printf.) 17) Correct the design and usage of vm_page_sleep. It didn't handle consistancy problems very well, so make the design a little less lofty. After vm_page_sleep, if it ever blocked, it is still important to relookup the page (if the object generation count changed), and verify it's status (always.) 18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up. 19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush. 20) Fix vm_pager_put_pages and it's descendents to support an int flag instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
/*
* Whether or not to leave the page activated is up in
* the air, but we should put the page on a page queue
* somewhere (it already is in the object). Result:
* It appears that emperical results show that
* deactivating pages is best.
*/
/*
* Just in case someone was asking for this page we
* now tell them that it is ok to use.
*/
if (!error) {
if (m->flags & PG_WANTED)
vm_page_activate(m);
else
vm_page_deactivate(m);
PAGE_WAKEUP(m);
} else {
vnode_pager_freepage(m);
}
}
}
return 0;
}
/*
This mega-commit is meant to fix numerous interrelated problems. There has been some bitrot and incorrect assumptions in the vfs_bio code. These problems have manifest themselves worse on NFS type filesystems, but can still affect local filesystems under certain circumstances. Most of the problems have involved mmap consistancy, and as a side-effect broke the vfs.ioopt code. This code might have been committed seperately, but almost everything is interrelated. 1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that are fully valid. 2) Rather than deactivating erroneously read initial (header) pages in kern_exec, we now free them. 3) Fix the rundown of non-VMIO buffers that are in an inconsistent (missing vp) state. 4) Fix the disassociation of pages from buffers in brelse. The previous code had rotted and was faulty in a couple of important circumstances. 5) Remove a gratuitious buffer wakeup in vfs_vmio_release. 6) Remove a crufty and currently unused cluster mechanism for VBLK files in vfs_bio_awrite. When the code is functional, I'll add back a cleaner version. 7) The page busy count wakeups assocated with the buffer cache usage were incorrectly cleaned up in a previous commit by me. Revert to the original, correct version, but with a cleaner implementation. 8) The cluster read code now tries to keep data associated with buffers more aggressively (without breaking the heuristics) when it is presumed that the read data (buffers) will be soon needed. 9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The delay loop waiting is not useful for filesystem locks, due to the length of the time intervals. 10) Correct and clean-up spec_getpages. 11) Implement a fully functional nfs_getpages, nfs_putpages. 12) Fix nfs_write so that modifications are coherent with the NFS data on the server disk (at least as well as NFS seems to allow.) 13) Properly support MS_INVALIDATE on NFS. 14) Properly pass down MS_INVALIDATE to lower levels of the VM code from vm_map_clean. 15) Better support the notion of pages being busy but valid, so that fewer in-transit waits occur. (use p->busy more for pageouts instead of PG_BUSY.) Since the page is fully valid, it is still usable for reads. 16) It is possible (in error) for cached pages to be busy. Make the page allocation code handle that case correctly. (It should probably be a printf or panic, but I want the system to handle coding errors robustly. I'll probably add a printf.) 17) Correct the design and usage of vm_page_sleep. It didn't handle consistancy problems very well, so make the design a little less lofty. After vm_page_sleep, if it ever blocked, it is still important to relookup the page (if the object generation count changed), and verify it's status (always.) 18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up. 19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush. 20) Fix vm_pager_put_pages and it's descendents to support an int flag instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
* Vnode op for VM putpages.
*/
int
nfs_putpages(ap)
struct vop_putpages_args *ap;
{
This mega-commit is meant to fix numerous interrelated problems. There has been some bitrot and incorrect assumptions in the vfs_bio code. These problems have manifest themselves worse on NFS type filesystems, but can still affect local filesystems under certain circumstances. Most of the problems have involved mmap consistancy, and as a side-effect broke the vfs.ioopt code. This code might have been committed seperately, but almost everything is interrelated. 1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that are fully valid. 2) Rather than deactivating erroneously read initial (header) pages in kern_exec, we now free them. 3) Fix the rundown of non-VMIO buffers that are in an inconsistent (missing vp) state. 4) Fix the disassociation of pages from buffers in brelse. The previous code had rotted and was faulty in a couple of important circumstances. 5) Remove a gratuitious buffer wakeup in vfs_vmio_release. 6) Remove a crufty and currently unused cluster mechanism for VBLK files in vfs_bio_awrite. When the code is functional, I'll add back a cleaner version. 7) The page busy count wakeups assocated with the buffer cache usage were incorrectly cleaned up in a previous commit by me. Revert to the original, correct version, but with a cleaner implementation. 8) The cluster read code now tries to keep data associated with buffers more aggressively (without breaking the heuristics) when it is presumed that the read data (buffers) will be soon needed. 9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The delay loop waiting is not useful for filesystem locks, due to the length of the time intervals. 10) Correct and clean-up spec_getpages. 11) Implement a fully functional nfs_getpages, nfs_putpages. 12) Fix nfs_write so that modifications are coherent with the NFS data on the server disk (at least as well as NFS seems to allow.) 13) Properly support MS_INVALIDATE on NFS. 14) Properly pass down MS_INVALIDATE to lower levels of the VM code from vm_map_clean. 15) Better support the notion of pages being busy but valid, so that fewer in-transit waits occur. (use p->busy more for pageouts instead of PG_BUSY.) Since the page is fully valid, it is still usable for reads. 16) It is possible (in error) for cached pages to be busy. Make the page allocation code handle that case correctly. (It should probably be a printf or panic, but I want the system to handle coding errors robustly. I'll probably add a printf.) 17) Correct the design and usage of vm_page_sleep. It didn't handle consistancy problems very well, so make the design a little less lofty. After vm_page_sleep, if it ever blocked, it is still important to relookup the page (if the object generation count changed), and verify it's status (always.) 18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up. 19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush. 20) Fix vm_pager_put_pages and it's descendents to support an int flag instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
struct uio uio;
struct iovec iov;
vm_page_t m;
vm_offset_t kva;
struct buf *bp;
int iomode, must_commit, i, error, npages;
int *rtvals;
rtvals = ap->a_rtvals;
npages = btoc(ap->a_count);
for (i = 0; i < npages; i++) {
rtvals[i] = VM_PAGER_AGAIN;
}
/*
* We use only the kva address for the buffer, but this is extremely
* convienient and fast.
*/
bp = getpbuf();
kva = (vm_offset_t) bp->b_data;
pmap_qenter(kva, ap->a_m, npages);
iov.iov_base = (caddr_t) kva;
iov.iov_len = ap->a_count;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = IDX_TO_OFF(ap->a_m[0]->pindex);
uio.uio_resid = ap->a_count;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_WRITE;
uio.uio_procp = curproc;
if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
iomode = NFSV3WRITE_UNSTABLE;
else
iomode = NFSV3WRITE_FILESYNC;
error = nfs_writerpc(ap->a_vp, &uio,
curproc->p_ucred, &iomode, &must_commit);
pmap_qremove(kva, npages);
relpbuf(bp);
if (!error) {
int nwritten = round_page(ap->a_count - uio.uio_resid) / PAGE_SIZE;
for (i = 0; i < nwritten; i++) {
rtvals[i] = VM_PAGER_OK;
ap->a_m[i]->dirty = 0;
}
if (must_commit)
nfs_clearcommit(ap->a_vp->v_mount);
}
return ap->a_rtvals[0];
}
1994-05-24 10:09:53 +00:00
/*
* Vnode op for read using bio
* Any similarity to readip() is purely coincidental
*/
int
nfs_bioread(vp, uio, ioflag, cred, getpages)
1994-05-24 10:09:53 +00:00
register struct vnode *vp;
register struct uio *uio;
int ioflag;
struct ucred *cred;
int getpages;
1994-05-24 10:09:53 +00:00
{
register struct nfsnode *np = VTONFS(vp);
register int biosize, diff, i;
struct buf *bp = 0, *rabp;
1994-05-24 10:09:53 +00:00
struct vattr vattr;
struct proc *p;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
daddr_t lbn, rabn;
int bufsize;
int nra, error = 0, n = 0, on = 0, not_readin;
1994-05-24 10:09:53 +00:00
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_READ)
panic("nfs_read mode");
#endif
if (uio->uio_resid == 0)
return (0);
if (uio->uio_offset < 0)
1994-05-24 10:09:53 +00:00
return (EINVAL);
p = uio->uio_procp;
if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
(void)nfs_fsinfo(nmp, vp, cred, p);
biosize = vp->v_mount->mnt_stat.f_iosize;
1994-05-24 10:09:53 +00:00
/*
* For nfs, cache consistency can only be maintained approximately.
* Although RFC1094 does not specify the criteria, the following is
* believed to be compatible with the reference port.
* For nqnfs, full cache consistency is maintained within the loop.
* For nfs:
* If the file's modify time on the server has changed since the
* last read rpc or you have written to the file,
* you may have lost data cache consistency with the
* server, so flush all of the file's data out of the cache.
* Then force a getattr rpc to ensure that you have up to date
* attributes.
* NB: This implies that cache data can be read when up to
* NFS_ATTRTIMEO seconds out of date. If you find that you need current
* attributes this could be forced by setting n_attrstamp to 0 before
* the VOP_GETATTR() call.
*/
if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
1994-05-24 10:09:53 +00:00
if (np->n_flag & NMODIFIED) {
if (vp->v_type != VREG) {
if (vp->v_type != VDIR)
panic("nfs: bioread, not dir");
nfs_invaldir(vp);
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
if (error)
1994-05-24 10:09:53 +00:00
return (error);
}
np->n_attrstamp = 0;
error = VOP_GETATTR(vp, &vattr, cred, p);
if (error)
1994-05-24 10:09:53 +00:00
return (error);
np->n_mtime = vattr.va_mtime.tv_sec;
1994-05-24 10:09:53 +00:00
} else {
error = VOP_GETATTR(vp, &vattr, cred, p);
if (error)
1994-05-24 10:09:53 +00:00
return (error);
if (np->n_mtime != vattr.va_mtime.tv_sec) {
if (vp->v_type == VDIR)
nfs_invaldir(vp);
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
if (error)
1994-05-24 10:09:53 +00:00
return (error);
np->n_mtime = vattr.va_mtime.tv_sec;
1994-05-24 10:09:53 +00:00
}
}
}
do {
/*
* Get a valid lease. If cached data is stale, flush it.
*/
if (nmp->nm_flag & NFSMNT_NQNFS) {
if (NQNFS_CKINVALID(vp, np, ND_READ)) {
1994-05-24 10:09:53 +00:00
do {
error = nqnfs_getlease(vp, ND_READ, cred, p);
1994-05-24 10:09:53 +00:00
} while (error == NQNFS_EXPIRED);
if (error)
return (error);
if (np->n_lrev != np->n_brev ||
(np->n_flag & NQNFSNONCACHE) ||
((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
if (vp->v_type == VDIR)
nfs_invaldir(vp);
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
if (error)
1994-05-24 10:09:53 +00:00
return (error);
np->n_brev = np->n_lrev;
}
} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
nfs_invaldir(vp);
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
if (error)
1994-05-24 10:09:53 +00:00
return (error);
}
}
if (np->n_flag & NQNFSNONCACHE) {
switch (vp->v_type) {
case VREG:
return (nfs_readrpc(vp, uio, cred));
1994-05-24 10:09:53 +00:00
case VLNK:
return (nfs_readlinkrpc(vp, uio, cred));
1994-05-24 10:09:53 +00:00
case VDIR:
break;
default:
printf(" NQNFSNONCACHE: type %x unexpected\n",
vp->v_type);
1994-05-24 10:09:53 +00:00
};
}
switch (vp->v_type) {
case VREG:
nfsstats.biocache_reads++;
lbn = uio->uio_offset / biosize;
on = uio->uio_offset & (biosize - 1);
1994-05-24 10:09:53 +00:00
not_readin = 1;
/*
* Start the read ahead(s), as required.
*/
if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
1994-05-24 10:09:53 +00:00
for (nra = 0; nra < nmp->nm_readahead &&
(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
rabn = lbn + 1 + nra;
1994-05-24 10:09:53 +00:00
if (!incore(vp, rabn)) {
rabp = nfs_getcacheblk(vp, rabn, biosize, p);
if (!rabp)
return (EINTR);
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
1994-05-24 10:09:53 +00:00
rabp->b_flags |= (B_READ | B_ASYNC);
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
vfs_busy_pages(rabp, 0);
1994-05-24 10:09:53 +00:00
if (nfs_asyncio(rabp, cred)) {
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
rabp->b_flags |= B_INVAL|B_ERROR;
vfs_unbusy_pages(rabp);
1994-05-24 10:09:53 +00:00
brelse(rabp);
}
} else
brelse(rabp);
1994-05-24 10:09:53 +00:00
}
}
}
/*
* If the block is in the cache and has the required data
* in a valid region, just copy it out.
* Otherwise, get the block and write back/read in,
* as required.
*/
again:
bufsize = biosize;
if ((off_t)(lbn + 1) * biosize > np->n_size &&
(off_t)(lbn + 1) * biosize - np->n_size < biosize) {
bufsize = np->n_size - lbn * biosize;
bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
}
bp = nfs_getcacheblk(vp, lbn, bufsize, p);
if (!bp)
return (EINTR);
/*
* If we are being called from nfs_getpages, we must
* make sure the buffer is a vmio buffer. The vp will
* already be setup for vmio but there may be some old
* non-vmio buffers attached to it.
*/
if (getpages && !(bp->b_flags & B_VMIO)) {
#ifdef DIAGNOSTIC
printf("nfs_bioread: non vmio buf found, discarding\n");
#endif
bp->b_flags |= B_NOCACHE;
bp->b_flags |= B_INVAFTERWRITE;
if (bp->b_dirtyend > 0) {
if ((bp->b_flags & B_DELWRI) == 0)
panic("nfsbioread");
if (VOP_BWRITE(bp) == EINTR)
return (EINTR);
} else
brelse(bp);
goto again;
}
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
not_readin = 0;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
brelse(bp);
return (error);
}
1994-05-24 10:09:53 +00:00
}
if (bufsize > on) {
n = min((unsigned)(bufsize - on), uio->uio_resid);
} else {
n = 0;
}
1994-05-24 10:09:53 +00:00
diff = np->n_size - uio->uio_offset;
if (diff < n)
n = diff;
if (not_readin && n > 0) {
if (on < bp->b_validoff || (on + n) > bp->b_validend) {
bp->b_flags |= B_NOCACHE;
bp->b_flags |= B_INVAFTERWRITE;
1994-05-24 10:09:53 +00:00
if (bp->b_dirtyend > 0) {
if ((bp->b_flags & B_DELWRI) == 0)
panic("nfsbioread");
if (VOP_BWRITE(bp) == EINTR)
return (EINTR);
} else
brelse(bp);
goto again;
}
}
vp->v_lastr = lbn;
diff = (on >= bp->b_validend) ? 0 : (bp->b_validend - on);
if (diff < n)
n = diff;
break;
case VLNK:
nfsstats.biocache_readlinks++;
bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
if (!bp)
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
bp->b_flags |= B_ERROR;
brelse(bp);
return (error);
}
1994-05-24 10:09:53 +00:00
}
n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
on = 0;
break;
case VDIR:
nfsstats.biocache_readdirs++;
if (np->n_direofoffset
&& uio->uio_offset >= np->n_direofoffset) {
return (0);
}
lbn = uio->uio_offset / NFS_DIRBLKSIZ;
on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
1994-05-24 10:09:53 +00:00
if (!bp)
return (EINTR);
if ((bp->b_flags & B_CACHE) == 0) {
bp->b_flags |= B_READ;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
brelse(bp);
}
while (error == NFSERR_BAD_COOKIE) {
nfs_invaldir(vp);
error = nfs_vinvalbuf(vp, 0, cred, p, 1);
/*
* Yuck! The directory has been modified on the
* server. The only way to get the block is by
* reading from the beginning to get all the
* offset cookies.
*/
for (i = 0; i <= lbn && !error; i++) {
if (np->n_direofoffset
&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
return (0);
bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
if (!bp)
return (EINTR);
if ((bp->b_flags & B_DONE) == 0) {
bp->b_flags |= B_READ;
vfs_busy_pages(bp, 0);
error = nfs_doio(bp, cred, p);
if (error) {
brelse(bp);
} else if (i < lbn) {
brelse(bp);
}
}
1994-05-24 10:09:53 +00:00
}
}
if (error)
return (error);
1994-05-24 10:09:53 +00:00
}
/*
* If not eof and read aheads are enabled, start one.
* (You need the current block first, so that you have the
* directory offset cookie of the next block.)
1994-05-24 10:09:53 +00:00
*/
if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
(np->n_direofoffset == 0 ||
(lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
!(np->n_flag & NQNFSNONCACHE) &&
!incore(vp, lbn + 1)) {
rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
1994-05-24 10:09:53 +00:00
if (rabp) {
if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
1994-05-24 10:09:53 +00:00
rabp->b_flags |= (B_READ | B_ASYNC);
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
vfs_busy_pages(rabp, 0);
1994-05-24 10:09:53 +00:00
if (nfs_asyncio(rabp, cred)) {
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
rabp->b_flags |= B_INVAL|B_ERROR;
vfs_unbusy_pages(rabp);
1994-05-24 10:09:53 +00:00
brelse(rabp);
}
} else {
brelse(rabp);
1994-05-24 10:09:53 +00:00
}
}
}
/*
* Make sure we use a signed variant of min() since
* the second term may be negative.
*/
n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
1994-05-24 10:09:53 +00:00
break;
default:
printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
break;
1994-05-24 10:09:53 +00:00
};
if (n > 0) {
This mega-commit is meant to fix numerous interrelated problems. There has been some bitrot and incorrect assumptions in the vfs_bio code. These problems have manifest themselves worse on NFS type filesystems, but can still affect local filesystems under certain circumstances. Most of the problems have involved mmap consistancy, and as a side-effect broke the vfs.ioopt code. This code might have been committed seperately, but almost everything is interrelated. 1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that are fully valid. 2) Rather than deactivating erroneously read initial (header) pages in kern_exec, we now free them. 3) Fix the rundown of non-VMIO buffers that are in an inconsistent (missing vp) state. 4) Fix the disassociation of pages from buffers in brelse. The previous code had rotted and was faulty in a couple of important circumstances. 5) Remove a gratuitious buffer wakeup in vfs_vmio_release. 6) Remove a crufty and currently unused cluster mechanism for VBLK files in vfs_bio_awrite. When the code is functional, I'll add back a cleaner version. 7) The page busy count wakeups assocated with the buffer cache usage were incorrectly cleaned up in a previous commit by me. Revert to the original, correct version, but with a cleaner implementation. 8) The cluster read code now tries to keep data associated with buffers more aggressively (without breaking the heuristics) when it is presumed that the read data (buffers) will be soon needed. 9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The delay loop waiting is not useful for filesystem locks, due to the length of the time intervals. 10) Correct and clean-up spec_getpages. 11) Implement a fully functional nfs_getpages, nfs_putpages. 12) Fix nfs_write so that modifications are coherent with the NFS data on the server disk (at least as well as NFS seems to allow.) 13) Properly support MS_INVALIDATE on NFS. 14) Properly pass down MS_INVALIDATE to lower levels of the VM code from vm_map_clean. 15) Better support the notion of pages being busy but valid, so that fewer in-transit waits occur. (use p->busy more for pageouts instead of PG_BUSY.) Since the page is fully valid, it is still usable for reads. 16) It is possible (in error) for cached pages to be busy. Make the page allocation code handle that case correctly. (It should probably be a printf or panic, but I want the system to handle coding errors robustly. I'll probably add a printf.) 17) Correct the design and usage of vm_page_sleep. It didn't handle consistancy problems very well, so make the design a little less lofty. After vm_page_sleep, if it ever blocked, it is still important to relookup the page (if the object generation count changed), and verify it's status (always.) 18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up. 19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush. 20) Fix vm_pager_put_pages and it's descendents to support an int flag instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
error = uiomove(bp->b_data + on, (int)n, uio);
1994-05-24 10:09:53 +00:00
}
switch (vp->v_type) {
case VREG:
break;
case VLNK:
n = 0;
break;
case VDIR:
if (np->n_flag & NQNFSNONCACHE)
bp->b_flags |= B_INVAL;
1994-05-24 10:09:53 +00:00
break;
default:
printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
}
brelse(bp);
1994-05-24 10:09:53 +00:00
} while (error == 0 && uio->uio_resid > 0 && n > 0);
return (error);
}
This mega-commit is meant to fix numerous interrelated problems. There has been some bitrot and incorrect assumptions in the vfs_bio code. These problems have manifest themselves worse on NFS type filesystems, but can still affect local filesystems under certain circumstances. Most of the problems have involved mmap consistancy, and as a side-effect broke the vfs.ioopt code. This code might have been committed seperately, but almost everything is interrelated. 1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that are fully valid. 2) Rather than deactivating erroneously read initial (header) pages in kern_exec, we now free them. 3) Fix the rundown of non-VMIO buffers that are in an inconsistent (missing vp) state. 4) Fix the disassociation of pages from buffers in brelse. The previous code had rotted and was faulty in a couple of important circumstances. 5) Remove a gratuitious buffer wakeup in vfs_vmio_release. 6) Remove a crufty and currently unused cluster mechanism for VBLK files in vfs_bio_awrite. When the code is functional, I'll add back a cleaner version. 7) The page busy count wakeups assocated with the buffer cache usage were incorrectly cleaned up in a previous commit by me. Revert to the original, correct version, but with a cleaner implementation. 8) The cluster read code now tries to keep data associated with buffers more aggressively (without breaking the heuristics) when it is presumed that the read data (buffers) will be soon needed. 9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The delay loop waiting is not useful for filesystem locks, due to the length of the time intervals. 10) Correct and clean-up spec_getpages. 11) Implement a fully functional nfs_getpages, nfs_putpages. 12) Fix nfs_write so that modifications are coherent with the NFS data on the server disk (at least as well as NFS seems to allow.) 13) Properly support MS_INVALIDATE on NFS. 14) Properly pass down MS_INVALIDATE to lower levels of the VM code from vm_map_clean. 15) Better support the notion of pages being busy but valid, so that fewer in-transit waits occur. (use p->busy more for pageouts instead of PG_BUSY.) Since the page is fully valid, it is still usable for reads. 16) It is possible (in error) for cached pages to be busy. Make the page allocation code handle that case correctly. (It should probably be a printf or panic, but I want the system to handle coding errors robustly. I'll probably add a printf.) 17) Correct the design and usage of vm_page_sleep. It didn't handle consistancy problems very well, so make the design a little less lofty. After vm_page_sleep, if it ever blocked, it is still important to relookup the page (if the object generation count changed), and verify it's status (always.) 18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up. 19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush. 20) Fix vm_pager_put_pages and it's descendents to support an int flag instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
static void
nfs_prot_buf(bp, off, n)
struct buf *bp;
int off;
int n;
{
int pindex, boff, end;
if ((bp->b_flags & B_VMIO) == 0)
return;
end = round_page(off + n);
for (boff = trunc_page(off); boff < end; boff += PAGE_SIZE) {
pindex = boff >> PAGE_SHIFT;
vm_page_protect(bp->b_pages[pindex], VM_PROT_NONE);
}
}
1994-05-24 10:09:53 +00:00
/*
* Vnode op for write using bio
*/
int
1994-05-24 10:09:53 +00:00
nfs_write(ap)
struct vop_write_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
struct ucred *a_cred;
} */ *ap;
{
register int biosize;
register struct uio *uio = ap->a_uio;
struct proc *p = uio->uio_procp;
register struct vnode *vp = ap->a_vp;
struct nfsnode *np = VTONFS(vp);
register struct ucred *cred = ap->a_cred;
int ioflag = ap->a_ioflag;
struct buf *bp;
struct vattr vattr;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
daddr_t lbn;
int bufsize;
int n, on, error = 0, iomode, must_commit;
1994-05-24 10:09:53 +00:00
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_WRITE)
panic("nfs_write mode");
if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
panic("nfs_write proc");
#endif
if (vp->v_type != VREG)
return (EIO);
if (np->n_flag & NWRITEERR) {
np->n_flag &= ~NWRITEERR;
return (np->n_error);
}
if ((nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_GOTFSINFO)) == NFSMNT_NFSV3)
(void)nfs_fsinfo(nmp, vp, cred, p);
1994-05-24 10:09:53 +00:00
if (ioflag & (IO_APPEND | IO_SYNC)) {
if (np->n_flag & NMODIFIED) {
np->n_attrstamp = 0;
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
if (error)
1994-05-24 10:09:53 +00:00
return (error);
}
if (ioflag & IO_APPEND) {
np->n_attrstamp = 0;
error = VOP_GETATTR(vp, &vattr, cred, p);
if (error)
1994-05-24 10:09:53 +00:00
return (error);
uio->uio_offset = np->n_size;
}
}
if (uio->uio_offset < 0)
return (EINVAL);
if (uio->uio_resid == 0)
return (0);
/*
* Maybe this should be above the vnode op call, but so long as
* file servers have no limits, i don't think it matters
*/
if (p && uio->uio_offset + uio->uio_resid >
p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
psignal(p, SIGXFSZ);
return (EFBIG);
}
/*
* I use nm_rsize, not nm_wsize so that all buffer cache blocks
* will be the same size within a filesystem. nfs_writerpc will
* still use nm_wsize when sizing the rpc's.
*/
biosize = vp->v_mount->mnt_stat.f_iosize;
1994-05-24 10:09:53 +00:00
do {
/*
* Check for a valid write lease.
*/
if ((nmp->nm_flag & NFSMNT_NQNFS) &&
NQNFS_CKINVALID(vp, np, ND_WRITE)) {
1994-05-24 10:09:53 +00:00
do {
error = nqnfs_getlease(vp, ND_WRITE, cred, p);
1994-05-24 10:09:53 +00:00
} while (error == NQNFS_EXPIRED);
if (error)
return (error);
if (np->n_lrev != np->n_brev ||
(np->n_flag & NQNFSNONCACHE)) {
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
if (error)
1994-05-24 10:09:53 +00:00
return (error);
np->n_brev = np->n_lrev;
}
}
if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
iomode = NFSV3WRITE_FILESYNC;
error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
if (must_commit)
nfs_clearcommit(vp->v_mount);
return (error);
}
1994-05-24 10:09:53 +00:00
nfsstats.biocache_writes++;
lbn = uio->uio_offset / biosize;
on = uio->uio_offset & (biosize-1);
n = min((unsigned)(biosize - on), uio->uio_resid);
again:
if (uio->uio_offset + n > np->n_size) {
np->n_size = uio->uio_offset + n;
np->n_flag |= NMODIFIED;
vnode_pager_setsize(vp, (u_long)np->n_size);
}
bufsize = biosize;
if ((lbn + 1) * biosize > np->n_size) {
bufsize = np->n_size - lbn * biosize;
bufsize = (bufsize + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
}
bp = nfs_getcacheblk(vp, lbn, bufsize, p);
1994-05-24 10:09:53 +00:00
if (!bp)
return (EINTR);
if (bp->b_wcred == NOCRED) {
crhold(cred);
bp->b_wcred = cred;
}
np->n_flag |= NMODIFIED;
if ((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend > np->n_size) {
bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
}
/*
* If the new write will leave a contiguous dirty
* area, just update the b_dirtyoff and b_dirtyend,
* otherwise force a write rpc of the old dirty area.
*/
if (bp->b_dirtyend > 0 &&
(on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
bp->b_proc = p;
if (VOP_BWRITE(bp) == EINTR)
return (EINTR);
goto again;
}
1994-05-24 10:09:53 +00:00
/*
* Check for valid write lease and get one as required.
* In case getblk() and/or bwrite() delayed us.
*/
if ((nmp->nm_flag & NFSMNT_NQNFS) &&
NQNFS_CKINVALID(vp, np, ND_WRITE)) {
1994-05-24 10:09:53 +00:00
do {
error = nqnfs_getlease(vp, ND_WRITE, cred, p);
1994-05-24 10:09:53 +00:00
} while (error == NQNFS_EXPIRED);
if (error) {
brelse(bp);
return (error);
}
if (np->n_lrev != np->n_brev ||
(np->n_flag & NQNFSNONCACHE)) {
brelse(bp);
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
if (error)
1994-05-24 10:09:53 +00:00
return (error);
np->n_brev = np->n_lrev;
goto again;
}
}
This mega-commit is meant to fix numerous interrelated problems. There has been some bitrot and incorrect assumptions in the vfs_bio code. These problems have manifest themselves worse on NFS type filesystems, but can still affect local filesystems under certain circumstances. Most of the problems have involved mmap consistancy, and as a side-effect broke the vfs.ioopt code. This code might have been committed seperately, but almost everything is interrelated. 1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that are fully valid. 2) Rather than deactivating erroneously read initial (header) pages in kern_exec, we now free them. 3) Fix the rundown of non-VMIO buffers that are in an inconsistent (missing vp) state. 4) Fix the disassociation of pages from buffers in brelse. The previous code had rotted and was faulty in a couple of important circumstances. 5) Remove a gratuitious buffer wakeup in vfs_vmio_release. 6) Remove a crufty and currently unused cluster mechanism for VBLK files in vfs_bio_awrite. When the code is functional, I'll add back a cleaner version. 7) The page busy count wakeups assocated with the buffer cache usage were incorrectly cleaned up in a previous commit by me. Revert to the original, correct version, but with a cleaner implementation. 8) The cluster read code now tries to keep data associated with buffers more aggressively (without breaking the heuristics) when it is presumed that the read data (buffers) will be soon needed. 9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The delay loop waiting is not useful for filesystem locks, due to the length of the time intervals. 10) Correct and clean-up spec_getpages. 11) Implement a fully functional nfs_getpages, nfs_putpages. 12) Fix nfs_write so that modifications are coherent with the NFS data on the server disk (at least as well as NFS seems to allow.) 13) Properly support MS_INVALIDATE on NFS. 14) Properly pass down MS_INVALIDATE to lower levels of the VM code from vm_map_clean. 15) Better support the notion of pages being busy but valid, so that fewer in-transit waits occur. (use p->busy more for pageouts instead of PG_BUSY.) Since the page is fully valid, it is still usable for reads. 16) It is possible (in error) for cached pages to be busy. Make the page allocation code handle that case correctly. (It should probably be a printf or panic, but I want the system to handle coding errors robustly. I'll probably add a printf.) 17) Correct the design and usage of vm_page_sleep. It didn't handle consistancy problems very well, so make the design a little less lofty. After vm_page_sleep, if it ever blocked, it is still important to relookup the page (if the object generation count changed), and verify it's status (always.) 18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up. 19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush. 20) Fix vm_pager_put_pages and it's descendents to support an int flag instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
error = uiomove((char *)bp->b_data + on, n, uio);
if (error) {
1994-05-24 10:09:53 +00:00
bp->b_flags |= B_ERROR;
brelse(bp);
return (error);
}
This mega-commit is meant to fix numerous interrelated problems. There has been some bitrot and incorrect assumptions in the vfs_bio code. These problems have manifest themselves worse on NFS type filesystems, but can still affect local filesystems under certain circumstances. Most of the problems have involved mmap consistancy, and as a side-effect broke the vfs.ioopt code. This code might have been committed seperately, but almost everything is interrelated. 1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that are fully valid. 2) Rather than deactivating erroneously read initial (header) pages in kern_exec, we now free them. 3) Fix the rundown of non-VMIO buffers that are in an inconsistent (missing vp) state. 4) Fix the disassociation of pages from buffers in brelse. The previous code had rotted and was faulty in a couple of important circumstances. 5) Remove a gratuitious buffer wakeup in vfs_vmio_release. 6) Remove a crufty and currently unused cluster mechanism for VBLK files in vfs_bio_awrite. When the code is functional, I'll add back a cleaner version. 7) The page busy count wakeups assocated with the buffer cache usage were incorrectly cleaned up in a previous commit by me. Revert to the original, correct version, but with a cleaner implementation. 8) The cluster read code now tries to keep data associated with buffers more aggressively (without breaking the heuristics) when it is presumed that the read data (buffers) will be soon needed. 9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The delay loop waiting is not useful for filesystem locks, due to the length of the time intervals. 10) Correct and clean-up spec_getpages. 11) Implement a fully functional nfs_getpages, nfs_putpages. 12) Fix nfs_write so that modifications are coherent with the NFS data on the server disk (at least as well as NFS seems to allow.) 13) Properly support MS_INVALIDATE on NFS. 14) Properly pass down MS_INVALIDATE to lower levels of the VM code from vm_map_clean. 15) Better support the notion of pages being busy but valid, so that fewer in-transit waits occur. (use p->busy more for pageouts instead of PG_BUSY.) Since the page is fully valid, it is still usable for reads. 16) It is possible (in error) for cached pages to be busy. Make the page allocation code handle that case correctly. (It should probably be a printf or panic, but I want the system to handle coding errors robustly. I'll probably add a printf.) 17) Correct the design and usage of vm_page_sleep. It didn't handle consistancy problems very well, so make the design a little less lofty. After vm_page_sleep, if it ever blocked, it is still important to relookup the page (if the object generation count changed), and verify it's status (always.) 18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up. 19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush. 20) Fix vm_pager_put_pages and it's descendents to support an int flag instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
/*
* This will keep the buffer and mmaped regions more coherent.
*/
nfs_prot_buf(bp, on, n);
1994-05-24 10:09:53 +00:00
if (bp->b_dirtyend > 0) {
bp->b_dirtyoff = min(on, bp->b_dirtyoff);
bp->b_dirtyend = max((on + n), bp->b_dirtyend);
} else {
bp->b_dirtyoff = on;
bp->b_dirtyend = on + n;
}
if (bp->b_validend == 0 || bp->b_validend < bp->b_dirtyoff ||
bp->b_validoff > bp->b_dirtyend) {
bp->b_validoff = bp->b_dirtyoff;
bp->b_validend = bp->b_dirtyend;
} else {
bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff);
bp->b_validend = max(bp->b_validend, bp->b_dirtyend);
}
/*
* Since this block is being modified, it must be written
* again and not just committed.
*/
bp->b_flags &= ~B_NEEDCOMMIT;
1994-05-24 10:09:53 +00:00
/*
* If the lease is non-cachable or IO_SYNC do bwrite().
*/
if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
bp->b_proc = p;
This mega-commit is meant to fix numerous interrelated problems. There has been some bitrot and incorrect assumptions in the vfs_bio code. These problems have manifest themselves worse on NFS type filesystems, but can still affect local filesystems under certain circumstances. Most of the problems have involved mmap consistancy, and as a side-effect broke the vfs.ioopt code. This code might have been committed seperately, but almost everything is interrelated. 1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that are fully valid. 2) Rather than deactivating erroneously read initial (header) pages in kern_exec, we now free them. 3) Fix the rundown of non-VMIO buffers that are in an inconsistent (missing vp) state. 4) Fix the disassociation of pages from buffers in brelse. The previous code had rotted and was faulty in a couple of important circumstances. 5) Remove a gratuitious buffer wakeup in vfs_vmio_release. 6) Remove a crufty and currently unused cluster mechanism for VBLK files in vfs_bio_awrite. When the code is functional, I'll add back a cleaner version. 7) The page busy count wakeups assocated with the buffer cache usage were incorrectly cleaned up in a previous commit by me. Revert to the original, correct version, but with a cleaner implementation. 8) The cluster read code now tries to keep data associated with buffers more aggressively (without breaking the heuristics) when it is presumed that the read data (buffers) will be soon needed. 9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The delay loop waiting is not useful for filesystem locks, due to the length of the time intervals. 10) Correct and clean-up spec_getpages. 11) Implement a fully functional nfs_getpages, nfs_putpages. 12) Fix nfs_write so that modifications are coherent with the NFS data on the server disk (at least as well as NFS seems to allow.) 13) Properly support MS_INVALIDATE on NFS. 14) Properly pass down MS_INVALIDATE to lower levels of the VM code from vm_map_clean. 15) Better support the notion of pages being busy but valid, so that fewer in-transit waits occur. (use p->busy more for pageouts instead of PG_BUSY.) Since the page is fully valid, it is still usable for reads. 16) It is possible (in error) for cached pages to be busy. Make the page allocation code handle that case correctly. (It should probably be a printf or panic, but I want the system to handle coding errors robustly. I'll probably add a printf.) 17) Correct the design and usage of vm_page_sleep. It didn't handle consistancy problems very well, so make the design a little less lofty. After vm_page_sleep, if it ever blocked, it is still important to relookup the page (if the object generation count changed), and verify it's status (always.) 18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up. 19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush. 20) Fix vm_pager_put_pages and it's descendents to support an int flag instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
if (ioflag & IO_INVAL)
bp->b_flags |= B_INVAL;
error = VOP_BWRITE(bp);
if (error)
1994-05-24 10:09:53 +00:00
return (error);
if (np->n_flag & NQNFSNONCACHE) {
error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
if (error)
return (error);
}
1994-05-24 10:09:53 +00:00
} else if ((n + on) == biosize &&
(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
bp->b_proc = (struct proc *)0;
bp->b_flags |= B_ASYNC;
(void)nfs_writebp(bp, 0);
1994-05-24 10:09:53 +00:00
} else
bdwrite(bp);
} while (uio->uio_resid > 0 && n > 0);
return (0);
}
/*
* Get an nfs cache block.
* Allocate a new one if the block isn't currently in the cache
* and return the block marked busy. If the calling process is
* interrupted by a signal for an interruptible mount point, return
* NULL.
*/
1995-12-17 21:14:36 +00:00
static struct buf *
1994-05-24 10:09:53 +00:00
nfs_getcacheblk(vp, bn, size, p)
struct vnode *vp;
daddr_t bn;
int size;
struct proc *p;
{
register struct buf *bp;
struct mount *mp;
struct nfsmount *nmp;
mp = vp->v_mount;
nmp = VFSTONFS(mp);
1994-05-24 10:09:53 +00:00
if (nmp->nm_flag & NFSMNT_INT) {
bp = getblk(vp, bn, size, PCATCH, 0);
while (bp == (struct buf *)0) {
if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
return ((struct buf *)0);
bp = getblk(vp, bn, size, 0, 2 * hz);
}
} else
bp = getblk(vp, bn, size, 0, 0);
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
if( vp->v_type == VREG) {
int biosize;
biosize = mp->mnt_stat.f_iosize;
bp->b_blkno = (bn * biosize) / DEV_BSIZE;
}
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
1994-05-24 10:09:53 +00:00
return (bp);
}
/*
* Flush and invalidate all dirty buffers. If another process is already
* doing the flush, just wait for completion.
*/
int
1994-05-24 10:09:53 +00:00
nfs_vinvalbuf(vp, flags, cred, p, intrflg)
struct vnode *vp;
int flags;
struct ucred *cred;
struct proc *p;
int intrflg;
{
register struct nfsnode *np = VTONFS(vp);
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
int error = 0, slpflag, slptimeo;
if (vp->v_flag & VXLOCK) {
return (0);
}
1994-05-24 10:09:53 +00:00
if ((nmp->nm_flag & NFSMNT_INT) == 0)
intrflg = 0;
if (intrflg) {
slpflag = PCATCH;
slptimeo = 2 * hz;
} else {
slpflag = 0;
slptimeo = 0;
}
/*
* First wait for any other process doing a flush to complete.
*/
while (np->n_flag & NFLUSHINPROG) {
np->n_flag |= NFLUSHWANT;
error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
slptimeo);
if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
return (EINTR);
}
/*
* Now, flush as required.
*/
np->n_flag |= NFLUSHINPROG;
error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
while (error) {
if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
np->n_flag &= ~NFLUSHINPROG;
if (np->n_flag & NFLUSHWANT) {
np->n_flag &= ~NFLUSHWANT;
wakeup((caddr_t)&np->n_flag);
}
return (EINTR);
}
error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
}
np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
if (np->n_flag & NFLUSHWANT) {
np->n_flag &= ~NFLUSHWANT;
wakeup((caddr_t)&np->n_flag);
}
return (0);
}
/*
* Initiate asynchronous I/O. Return an error if no nfsiods are available.
* This is mainly to avoid queueing async I/O requests when the nfsiods
* are all hung on a dead server.
*/
int
1994-05-24 10:09:53 +00:00
nfs_asyncio(bp, cred)
register struct buf *bp;
struct ucred *cred;
{
struct nfsmount *nmp;
int i;
int gotiod;
int slpflag = 0;
int slptimeo = 0;
int error;
1994-05-24 10:09:53 +00:00
if (nfs_numasync == 0)
return (EIO);
nmp = VFSTONFS(bp->b_vp->v_mount);
again:
if (nmp->nm_flag & NFSMNT_INT)
slpflag = PCATCH;
gotiod = FALSE;
/*
* Find a free iod to process this request.
*/
1994-05-24 10:09:53 +00:00
for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
if (nfs_iodwant[i]) {
/*
* Found one, so wake it up and tell it which
* mount to process.
*/
NFS_DPF(ASYNCIO,
("nfs_asyncio: waking iod %d for mount %p\n",
i, nmp));
nfs_iodwant[i] = (struct proc *)0;
nfs_iodmount[i] = nmp;
nmp->nm_bufqiods++;
wakeup((caddr_t)&nfs_iodwant[i]);
gotiod = TRUE;
break;
}
/*
* If none are free, we may already have an iod working on this mount
* point. If so, it will process our request.
*/
if (!gotiod) {
if (nmp->nm_bufqiods > 0) {
NFS_DPF(ASYNCIO,
("nfs_asyncio: %d iods are already processing mount %p\n",
nmp->nm_bufqiods, nmp));
gotiod = TRUE;
}
}
/*
* If we have an iod which can process the request, then queue
* the buffer.
*/
if (gotiod) {
/*
* Ensure that the queue never grows too large.
*/
while (nmp->nm_bufqlen >= 2*nfs_numasync) {
NFS_DPF(ASYNCIO,
("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
nmp->nm_bufqwant = TRUE;
error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
"nfsaio", slptimeo);
if (error) {
if (nfs_sigintr(nmp, NULL, bp->b_proc))
return (EINTR);
if (slpflag == PCATCH) {
slpflag = 0;
slptimeo = 2 * hz;
}
}
/*
* We might have lost our iod while sleeping,
* so check and loop if nescessary.
*/
if (nmp->nm_bufqiods == 0) {
NFS_DPF(ASYNCIO,
("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
goto again;
}
}
1994-05-24 10:09:53 +00:00
if (bp->b_flags & B_READ) {
if (bp->b_rcred == NOCRED && cred != NOCRED) {
crhold(cred);
bp->b_rcred = cred;
}
} else {
bp->b_flags |= B_WRITEINPROG;
1994-05-24 10:09:53 +00:00
if (bp->b_wcred == NOCRED && cred != NOCRED) {
crhold(cred);
bp->b_wcred = cred;
}
}
1995-05-30 08:16:23 +00:00
TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
nmp->nm_bufqlen++;
1994-05-24 10:09:53 +00:00
return (0);
}
/*
* All the iods are busy on other mounts, so return EIO to
* force the caller to process the i/o synchronously.
*/
NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
return (EIO);
1994-05-24 10:09:53 +00:00
}
/*
* Do an I/O operation to/from a cache block. This may be called
* synchronously or from an nfsiod.
*/
int
nfs_doio(bp, cr, p)
register struct buf *bp;
struct ucred *cr;
1994-05-24 10:09:53 +00:00
struct proc *p;
{
register struct uio *uiop;
register struct vnode *vp;
struct nfsnode *np;
struct nfsmount *nmp;
int error = 0, diff, len, iomode, must_commit = 0;
1994-05-24 10:09:53 +00:00
struct uio uio;
struct iovec io;
vp = bp->b_vp;
np = VTONFS(vp);
nmp = VFSTONFS(vp->v_mount);
uiop = &uio;
uiop->uio_iov = &io;
uiop->uio_iovcnt = 1;
uiop->uio_segflg = UIO_SYSSPACE;
uiop->uio_procp = p;
/*
* Historically, paging was done with physio, but no more.
*/
if (bp->b_flags & B_PHYS) {
/*
* ...though reading /dev/drum still gets us here.
*/
io.iov_len = uiop->uio_resid = bp->b_bcount;
/* mapping was done by vmapbuf() */
io.iov_base = bp->b_data;
uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
if (bp->b_flags & B_READ) {
uiop->uio_rw = UIO_READ;
nfsstats.read_physios++;
error = nfs_readrpc(vp, uiop, cr);
} else {
int com;
iomode = NFSV3WRITE_DATASYNC;
uiop->uio_rw = UIO_WRITE;
nfsstats.write_physios++;
error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
}
if (error) {
bp->b_flags |= B_ERROR;
bp->b_error = error;
}
} else if (bp->b_flags & B_READ) {
1994-05-24 10:09:53 +00:00
io.iov_len = uiop->uio_resid = bp->b_bcount;
io.iov_base = bp->b_data;
uiop->uio_rw = UIO_READ;
switch (vp->v_type) {
case VREG:
uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1994-05-24 10:09:53 +00:00
nfsstats.read_bios++;
error = nfs_readrpc(vp, uiop, cr);
if (!error) {
bp->b_validoff = 0;
if (uiop->uio_resid) {
/*
* If len > 0, there is a hole in the file and
* no writes after the hole have been pushed to
* the server yet.
* Just zero fill the rest of the valid area.
*/
diff = bp->b_bcount - uiop->uio_resid;
len = np->n_size - (((u_quad_t)bp->b_blkno) * DEV_BSIZE
1994-05-24 10:09:53 +00:00
+ diff);
if (len > 0) {
len = min(len, uiop->uio_resid);
bzero((char *)bp->b_data + diff, len);
bp->b_validend = diff + len;
} else
bp->b_validend = diff;
} else
bp->b_validend = bp->b_bcount;
}
if (p && (vp->v_flag & VTEXT) &&
(((nmp->nm_flag & NFSMNT_NQNFS) &&
NQNFS_CKINVALID(vp, np, ND_READ) &&
1994-05-24 10:09:53 +00:00
np->n_lrev != np->n_brev) ||
(!(nmp->nm_flag & NFSMNT_NQNFS) &&
np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
1994-05-24 10:09:53 +00:00
uprintf("Process killed due to text file modification\n");
psignal(p, SIGKILL);
p->p_flag |= P_NOSWAP;
}
break;
case VLNK:
uiop->uio_offset = (off_t)0;
1994-05-24 10:09:53 +00:00
nfsstats.readlink_bios++;
error = nfs_readlinkrpc(vp, uiop, cr);
break;
case VDIR:
nfsstats.readdir_bios++;
uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
error = nfs_readdirplusrpc(vp, uiop, cr);
if (error == NFSERR_NOTSUPP)
nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
}
if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
error = nfs_readdirrpc(vp, uiop, cr);
1994-05-24 10:09:53 +00:00
break;
default:
printf("nfs_doio: type %x unexpected\n",vp->v_type);
break;
1994-05-24 10:09:53 +00:00
};
if (error) {
bp->b_flags |= B_ERROR;
bp->b_error = error;
}
} else {
if (((bp->b_blkno * DEV_BSIZE) + bp->b_dirtyend) > np->n_size)
bp->b_dirtyend = np->n_size - (bp->b_blkno * DEV_BSIZE);
if (bp->b_dirtyend > bp->b_dirtyoff) {
io.iov_len = uiop->uio_resid = bp->b_dirtyend
- bp->b_dirtyoff;
uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE
+ bp->b_dirtyoff;
io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
uiop->uio_rw = UIO_WRITE;
nfsstats.write_bios++;
if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
iomode = NFSV3WRITE_UNSTABLE;
else
iomode = NFSV3WRITE_FILESYNC;
bp->b_flags |= B_WRITEINPROG;
error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
if (!error && iomode == NFSV3WRITE_UNSTABLE) {
bp->b_flags |= B_NEEDCOMMIT;
if (bp->b_dirtyoff == 0
&& bp->b_dirtyend == bp->b_bufsize)
bp->b_flags |= B_CLUSTEROK;
} else
bp->b_flags &= ~B_NEEDCOMMIT;
bp->b_flags &= ~B_WRITEINPROG;
1994-05-24 10:09:53 +00:00
/*
* For an interrupted write, the buffer is still valid
* and the write hasn't been pushed to the server yet,
* so we can't set B_ERROR and report the interruption
* by setting B_EINTR. For the B_ASYNC case, B_EINTR
* is not relevant, so the rpc attempt is essentially
* a noop. For the case of a V3 write rpc not being
* committed to stable storage, the block is still
* dirty and requires either a commit rpc or another
* write rpc with iomode == NFSV3WRITE_FILESYNC before
* the block is reused. This is indicated by setting
* the B_DELWRI and B_NEEDCOMMIT flags.
*/
if (error == EINTR
|| (!error && (bp->b_flags & B_NEEDCOMMIT))) {
bp->b_flags &= ~(B_INVAL|B_NOCACHE);
++numdirtybuffers;
bp->b_flags |= B_DELWRI;
reassignbuf(bp, vp);
if ((bp->b_flags & B_ASYNC) == 0)
bp->b_flags |= B_EINTR;
} else {
if (error) {
bp->b_flags |= B_ERROR;
bp->b_error = np->n_error = error;
np->n_flag |= NWRITEERR;
}
bp->b_dirtyoff = bp->b_dirtyend = 0;
1994-05-24 10:09:53 +00:00
}
} else {
bp->b_resid = 0;
biodone(bp);
return (0);
1994-05-24 10:09:53 +00:00
}
}
bp->b_resid = uiop->uio_resid;
if (must_commit)
nfs_clearcommit(vp->v_mount);
1994-05-24 10:09:53 +00:00
biodone(bp);
return (error);
}