diff -u --recursive --new-file linux-2.4.17/Documentation/Configure.help linux-2.4.17-rpc_tweaks/Documentation/Configure.help --- linux-2.4.17/Documentation/Configure.help Fri Dec 21 18:41:53 2001 +++ linux-2.4.17-rpc_tweaks/Documentation/Configure.help Sun Jan 13 17:17:57 2002 @@ -14332,6 +14332,30 @@ If unsure, say N. +Allow direct I/O on files in NFS +CONFIG_NFS_DIRECTIO + There are important applications whose performance or correctness + depends on uncached access to file data. Database clusters (multiple + copies of the same instance running on separate hosts) implement their + own cache coherency protocol that subsumes the NFS cache protocols. + Applications that process datasets considerably larger than the client's + memory do not always benefit from a local cache. A streaming video + server, for instance, has no need to cache the contents of a file. + + This option enables applications to perform direct I/O on files in NFS + file systems using the O_DIRECT open() flag. When O_DIRECT is set for + files, their data is not cached in the system's page cache. Direct + read and write operations are aligned to block boundaries. Data is + moved to and from user-level application buffers directly. + + Unless your program is designed to use O_DIRECT properly, you are much + better off allowing the NFS client to manage caching for you. Misusing + O_DIRECT can cause poor server performance or network storms. This + kernel build option defaults OFF to avoid exposing system administrators + unwittingly to a potentially hazardous feature. + + If unsure, say N. 
+ Root file system on NFS CONFIG_ROOT_NFS If you want your Linux box to mount its whole root file system (the diff -u --recursive --new-file linux-2.4.17/fs/Config.in linux-2.4.17-rpc_tweaks/fs/Config.in --- linux-2.4.17/fs/Config.in Mon Nov 12 18:34:16 2001 +++ linux-2.4.17-rpc_tweaks/fs/Config.in Sun Jan 13 17:22:01 2002 @@ -95,10 +95,12 @@ dep_tristate 'InterMezzo file system support (experimental, replicating fs)' CONFIG_INTERMEZZO_FS $CONFIG_INET $CONFIG_EXPERIMENTAL dep_tristate 'NFS file system support' CONFIG_NFS_FS $CONFIG_INET dep_mbool ' Provide NFSv3 client support' CONFIG_NFS_V3 $CONFIG_NFS_FS + dep_mbool ' Allow direct I/O on NFS files (EXPERIMENTAL)' CONFIG_NFS_DIRECTIO $CONFIG_NFS_FS $CONFIG_EXPERIMENTAL dep_bool ' Root file system on NFS' CONFIG_ROOT_NFS $CONFIG_NFS_FS $CONFIG_IP_PNP dep_tristate 'NFS server support' CONFIG_NFSD $CONFIG_INET dep_mbool ' Provide NFSv3 server support' CONFIG_NFSD_V3 $CONFIG_NFSD + dep_mbool ' Provide NFS server over TCP support (EXPERIMENTAL)' CONFIG_NFSD_TCP $CONFIG_NFSD $CONFIG_EXPERIMENTAL if [ "$CONFIG_NFS_FS" = "y" -o "$CONFIG_NFSD" = "y" ]; then define_tristate CONFIG_SUNRPC y diff -u --recursive --new-file linux-2.4.17/fs/block_dev.c linux-2.4.17-rpc_tweaks/fs/block_dev.c --- linux-2.4.17/fs/block_dev.c Fri Dec 21 18:41:55 2001 +++ linux-2.4.17-rpc_tweaks/fs/block_dev.c Sun Jan 13 17:17:57 2002 @@ -113,9 +113,9 @@ return 0; } -static int blkdev_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) +static int blkdev_direct_IO(int rw, struct file * file, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) { - return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, blkdev_get_block); + return generic_direct_IO(rw, file, iobuf, blocknr, blocksize, blkdev_get_block); } static int blkdev_writepage(struct page * page) diff -u --recursive --new-file linux-2.4.17/fs/buffer.c linux-2.4.17-rpc_tweaks/fs/buffer.c --- linux-2.4.17/fs/buffer.c Fri Dec 21 
18:41:55 2001 +++ linux-2.4.17-rpc_tweaks/fs/buffer.c Sun Jan 13 17:17:57 2002 @@ -1999,10 +1999,11 @@ return tmp.b_blocknr; } -int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block) +int generic_direct_IO(int rw, struct file * filp, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block) { int i, nr_blocks, retval; unsigned long * blocks = iobuf->blocks; + struct inode * inode = filp->f_dentry->d_inode; nr_blocks = iobuf->length / blocksize; /* build the blocklist */ diff -u --recursive --new-file linux-2.4.17/fs/ext2/inode.c linux-2.4.17-rpc_tweaks/fs/ext2/inode.c --- linux-2.4.17/fs/ext2/inode.c Wed Nov 21 23:07:25 2001 +++ linux-2.4.17-rpc_tweaks/fs/ext2/inode.c Sun Jan 13 17:17:57 2002 @@ -592,9 +592,9 @@ { return generic_block_bmap(mapping,block,ext2_get_block); } -static int ext2_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) +static int ext2_direct_IO(int rw, struct file * file, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) { - return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, ext2_get_block); + return generic_direct_IO(rw, file, iobuf, blocknr, blocksize, ext2_get_block); } struct address_space_operations ext2_aops = { readpage: ext2_readpage, diff -u --recursive --new-file linux-2.4.17/fs/lockd/clntproc.c linux-2.4.17-rpc_tweaks/fs/lockd/clntproc.c --- linux-2.4.17/fs/lockd/clntproc.c Thu Oct 11 16:52:18 2001 +++ linux-2.4.17-rpc_tweaks/fs/lockd/clntproc.c Sun Jan 13 17:21:27 2002 @@ -569,11 +569,15 @@ printk(KERN_WARNING "lockd: unexpected unlock status: %d\n", status); die: + lock_kernel(); nlm_release_host(req->a_host); + unlock_kernel(); kfree(req); return; retry_rebind: + lock_kernel(); nlm_rebind_host(req->a_host); + unlock_kernel(); retry_unlock: rpc_restart_call(task); } @@ -650,12 +654,16 @@ } die: + lock_kernel(); nlm_release_host(req->a_host); + 
unlock_kernel(); kfree(req); return; retry_cancel: + lock_kernel(); nlm_rebind_host(req->a_host); + unlock_kernel(); rpc_restart_call(task); rpc_delay(task, 30 * HZ); } diff -u --recursive --new-file linux-2.4.17/fs/lockd/svc.c linux-2.4.17-rpc_tweaks/fs/lockd/svc.c --- linux-2.4.17/fs/lockd/svc.c Sun Oct 21 19:32:33 2001 +++ linux-2.4.17-rpc_tweaks/fs/lockd/svc.c Sun Jan 13 17:22:01 2002 @@ -36,7 +36,7 @@ #include #define NLMDBG_FACILITY NLMDBG_SVC -#define LOCKD_BUFSIZE (1024 + NLMSSVC_XDRSIZE) +#define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) #define ALLOWED_SIGS (sigmask(SIGKILL)) extern struct svc_program nlmsvc_program; @@ -237,7 +237,7 @@ "lockd_up: no pid, %d users??\n", nlmsvc_users); error = -ENOMEM; - serv = svc_create(&nlmsvc_program, 0, NLMSVC_XDRSIZE); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NLMSVC_XDRSIZE); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); goto out; diff -u --recursive --new-file linux-2.4.17/fs/lockd/svc4proc.c linux-2.4.17-rpc_tweaks/fs/lockd/svc4proc.c --- linux-2.4.17/fs/lockd/svc4proc.c Mon Oct 1 22:45:47 2001 +++ linux-2.4.17-rpc_tweaks/fs/lockd/svc4proc.c Sun Jan 13 17:21:27 2002 @@ -17,6 +17,7 @@ #include #include #include +#include #define NLMDBG_FACILITY NLMDBG_CLIENT @@ -499,7 +500,9 @@ dprintk("lockd: %4d callback failed (errno = %d)\n", task->tk_pid, -task->tk_status); } + lock_kernel(); nlm_release_host(call->a_host); + unlock_kernel(); kfree(call); } diff -u --recursive --new-file linux-2.4.17/fs/lockd/svclock.c linux-2.4.17-rpc_tweaks/fs/lockd/svclock.c --- linux-2.4.17/fs/lockd/svclock.c Thu Oct 11 16:52:18 2001 +++ linux-2.4.17-rpc_tweaks/fs/lockd/svclock.c Sun Jan 13 17:21:27 2002 @@ -576,9 +576,10 @@ dprintk("lockd: GRANT_MSG RPC callback\n"); dprintk("callback: looking for cookie %x \n", *(unsigned int *)(call->a_args.cookie.data)); + lock_kernel(); if (!(block = nlmsvc_find_block(&call->a_args.cookie))) { dprintk("lockd: no block for cookie %x\n", *(u32 
*)(call->a_args.cookie.data)); - return; + goto out; } /* Technically, we should down the file semaphore here. Since we @@ -599,6 +600,8 @@ block->b_incall = 0; nlm_release_host(call->a_host); + out: + unlock_kernel(); } /* diff -u --recursive --new-file linux-2.4.17/fs/lockd/svcproc.c linux-2.4.17-rpc_tweaks/fs/lockd/svcproc.c --- linux-2.4.17/fs/lockd/svcproc.c Thu Oct 11 16:52:18 2001 +++ linux-2.4.17-rpc_tweaks/fs/lockd/svcproc.c Sun Jan 13 17:21:27 2002 @@ -18,6 +18,7 @@ #include #include #include +#include #define NLMDBG_FACILITY NLMDBG_CLIENT @@ -527,7 +528,9 @@ dprintk("lockd: %4d callback failed (errno = %d)\n", task->tk_pid, -task->tk_status); } + lock_kernel(); nlm_release_host(call->a_host); + unlock_kernel(); kfree(call); } diff -u --recursive --new-file linux-2.4.17/fs/lockd/xdr.c linux-2.4.17-rpc_tweaks/fs/lockd/xdr.c --- linux-2.4.17/fs/lockd/xdr.c Mon Oct 1 22:45:47 2001 +++ linux-2.4.17-rpc_tweaks/fs/lockd/xdr.c Sun Jan 13 17:22:01 2002 @@ -230,7 +230,7 @@ static inline int xdr_ressize_check(struct svc_rqst *rqstp, u32 *p) { - struct svc_buf *buf = &rqstp->rq_resbuf; + struct svc_buf *buf = rqstp->rq_resbuf; buf->len = p - buf->base; return (buf->len <= buf->buflen); diff -u --recursive --new-file linux-2.4.17/fs/lockd/xdr4.c linux-2.4.17-rpc_tweaks/fs/lockd/xdr4.c --- linux-2.4.17/fs/lockd/xdr4.c Mon Oct 1 22:45:47 2001 +++ linux-2.4.17-rpc_tweaks/fs/lockd/xdr4.c Sun Jan 13 17:22:01 2002 @@ -236,7 +236,7 @@ static int xdr_ressize_check(struct svc_rqst *rqstp, u32 *p) { - struct svc_buf *buf = &rqstp->rq_resbuf; + struct svc_buf *buf = rqstp->rq_resbuf; buf->len = p - buf->base; return (buf->len <= buf->buflen); diff -u --recursive --new-file linux-2.4.17/fs/namei.c linux-2.4.17-rpc_tweaks/fs/namei.c --- linux-2.4.17/fs/namei.c Wed Oct 17 23:46:29 2001 +++ linux-2.4.17-rpc_tweaks/fs/namei.c Sun Jan 13 17:19:02 2002 @@ -454,7 +454,7 @@ while (*name=='/') name++; if (!*name) - goto return_base; + goto return_reval; inode = nd->dentry->d_inode; if 
(current->link_count) @@ -573,7 +573,7 @@ inode = nd->dentry->d_inode; /* fallthrough */ case 1: - goto return_base; + goto return_reval; } if (nd->dentry->d_op && nd->dentry->d_op->d_hash) { err = nd->dentry->d_op->d_hash(nd->dentry, &this); @@ -624,6 +624,17 @@ nd->last_type = LAST_DOT; else if (this.len == 2 && this.name[1] == '.') nd->last_type = LAST_DOTDOT; +return_reval: + /* + * We bypassed the ordinary revalidation routines, so + * NFS wants to check the cached inode for staleness. + */ + inode = nd->dentry->d_inode; + if (inode && inode->i_op && inode->i_op->check_stale) { + err = inode->i_op->check_stale(inode); + if (err) + break; + } return_base: return 0; out_dput: diff -u --recursive --new-file linux-2.4.17/fs/nfs/Makefile linux-2.4.17-rpc_tweaks/fs/nfs/Makefile --- linux-2.4.17/fs/nfs/Makefile Fri Nov 9 23:28:15 2001 +++ linux-2.4.17-rpc_tweaks/fs/nfs/Makefile Sun Jan 13 17:17:57 2002 @@ -14,6 +14,7 @@ obj-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o obj-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o +obj-$(CONFIG_NFS_DIRECTIO) += direct.o obj-m := $(O_TARGET) diff -u --recursive --new-file linux-2.4.17/fs/nfs/dir.c linux-2.4.17-rpc_tweaks/fs/nfs/dir.c --- linux-2.4.17/fs/nfs/dir.c Tue Jun 12 20:15:08 2001 +++ linux-2.4.17-rpc_tweaks/fs/nfs/dir.c Tue Jan 15 11:44:15 2002 @@ -34,8 +34,11 @@ #define NFS_PARANOIA 1 /* #define NFS_DEBUG_VERBOSE 1 */ +static loff_t nfs_dir_llseek(struct file *, loff_t, int); static int nfs_readdir(struct file *, void *, filldir_t); static struct dentry *nfs_lookup(struct inode *, struct dentry *); +static int nfs_cached_lookup(struct inode *, struct dentry *, + struct nfs_fh *, struct nfs_fattr *); static int nfs_create(struct inode *, struct dentry *, int); static int nfs_mkdir(struct inode *, struct dentry *, int); static int nfs_rmdir(struct inode *, struct dentry *); @@ -47,6 +50,7 @@ struct inode *, struct dentry *); struct file_operations nfs_dir_operations = { + llseek: nfs_dir_llseek, read: generic_read_dir, readdir: 
nfs_readdir, open: nfs_open, @@ -66,8 +70,28 @@ permission: nfs_permission, revalidate: nfs_revalidate, setattr: nfs_notify_change, + check_stale: nfs_check_stale, }; +static loff_t nfs_dir_llseek(struct file *file, loff_t offset, int origin) +{ + switch (origin) { + case 1: + if (offset == 0) { + offset = file->f_pos; + break; + } + case 2: + return -EINVAL; + } + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_reada = 0; + file->f_version = ++event; + } + return (offset <= 0) ? 0 : offset; +} + typedef u32 * (*decode_dirent_t)(u32 *, struct nfs_entry *, int); typedef struct { struct file *file; @@ -108,13 +132,15 @@ error = NFS_PROTO(inode)->readdir(inode, cred, desc->entry->cookie, buffer, NFS_SERVER(inode)->dtsize, desc->plus); /* We requested READDIRPLUS, but the server doesn't grok it */ - if (desc->plus && error == -ENOTSUPP) { - NFS_FLAGS(inode) &= ~NFS_INO_ADVISE_RDPLUS; - desc->plus = 0; - goto again; - } - if (error < 0) + if (error < 0) { + if (error == -ENOTSUPP && desc->plus) { + NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; + NFS_FLAGS(inode) &= ~NFS_INO_ADVISE_RDPLUS; + desc->plus = 0; + goto again; + } goto error; + } SetPageUptodate(page); kunmap(page); /* Ensure consistent page alignment of the data. @@ -195,7 +221,6 @@ dfprintk(VFS, "NFS: find_dirent_page() searching directory page %ld\n", desc->page_index); - desc->plus = NFS_USE_READDIRPLUS(inode); page = read_cache_page(&inode->i_data, desc->page_index, (filler_t *)nfs_readdir_filler, desc); if (IS_ERR(page)) { @@ -247,6 +272,24 @@ return res; } +static unsigned int nfs_type2dtype[] = { + DT_UNKNOWN, + DT_REG, + DT_DIR, + DT_BLK, + DT_CHR, + DT_LNK, + DT_SOCK, + DT_UNKNOWN, + DT_FIFO +}; + +static inline +unsigned int nfs_type_to_d_type(enum nfs_ftype type) +{ + return nfs_type2dtype[type]; +} + /* * Once we've found the start of the dirent within a page: fill 'er up... 
*/ @@ -263,11 +306,17 @@ dfprintk(VFS, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n", (long long)desc->target); for(;;) { + unsigned d_type = DT_UNKNOWN; /* Note: entry->prev_cookie contains the cookie for * retrieving the current dirent on the server */ fileid = nfs_fileid_to_ino_t(entry->ino); + + /* Use readdirplus info */ + if (desc->plus && (entry->fattr->valid & NFS_ATTR_FATTR)) + d_type = nfs_type_to_d_type(entry->fattr->type); + res = filldir(dirent, entry->name, entry->len, - entry->prev_cookie, fileid, DT_UNKNOWN); + entry->prev_cookie, fileid, d_type); if (res < 0) break; file->f_pos = desc->target = entry->cookie; @@ -334,7 +383,8 @@ /* Reset read descriptor so it searches the page cache from * the start upon the next call to readdir_search_pagecache() */ desc->page_index = 0; - memset(desc->entry, 0, sizeof(*desc->entry)); + desc->entry->cookie = desc->entry->prev_cookie = 0; + desc->entry->eof = 0; out: dfprintk(VFS, "NFS: uncached_readdir() returns %d\n", status); return status; @@ -353,9 +403,11 @@ nfs_readdir_descriptor_t my_desc, *desc = &my_desc; struct nfs_entry my_entry; + struct nfs_fh fh; + struct nfs_fattr fattr; long res; - res = nfs_revalidate(dentry); + res = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (res < 0) return res; @@ -366,12 +418,16 @@ * itself. */ memset(desc, 0, sizeof(*desc)); - memset(&my_entry, 0, sizeof(my_entry)); - desc->file = filp; desc->target = filp->f_pos; - desc->entry = &my_entry; desc->decode = NFS_PROTO(inode)->decode_dirent; + desc->plus = NFS_USE_READDIRPLUS(inode); + + my_entry.cookie = my_entry.prev_cookie = 0; + my_entry.eof = 0; + my_entry.fh = &fh; + my_entry.fattr = &fattr; + desc->entry = &my_entry; while(!desc->entry->eof) { res = readdir_search_pagecache(desc); @@ -402,6 +458,21 @@ } /* + * A check for whether or not the parent directory has changed. + * In the case it has, we assume that the dentries are untrustworthy + * and may need to be looked up again. 
+ */ +static inline +int nfs_check_verifier(struct inode *dir, struct dentry *dentry) +{ + if (IS_ROOT(dentry)) + return 1; + if (nfs_revalidate_inode(NFS_SERVER(dir), dir)) + return 0; + return time_after(dentry->d_time, NFS_MTIME_UPDATE(dir)); +} + +/* * Whenever an NFS operation succeeds, we know that the dentry * is valid, so we update the revalidation timestamp. */ @@ -410,48 +481,31 @@ dentry->d_time = jiffies; } -static inline int nfs_dentry_force_reval(struct dentry *dentry, int flags) +static inline +int nfs_lookup_verify_inode(struct inode *inode, int flags) { - struct inode *inode = dentry->d_inode; - unsigned long timeout = NFS_ATTRTIMEO(inode); - + struct nfs_server *server = NFS_SERVER(inode); /* - * If it's the last lookup in a series, we use a stricter - * cache consistency check by looking at the parent mtime. - * - * If it's been modified in the last hour, be really strict. - * (This still means that we can avoid doing unnecessary - * work on directories like /usr/share/bin etc which basically - * never change). + * If we're interested in close-to-open cache consistency, + * then we revalidate the inode upon lookup. */ - if (!(flags & LOOKUP_CONTINUE)) { - long diff = CURRENT_TIME - dentry->d_parent->d_inode->i_mtime; - - if (diff < 15*60) - timeout = 0; - } - - return time_after(jiffies,dentry->d_time + timeout); + if (!(server->flags & NFS_MOUNT_NOCTO) && !(flags & LOOKUP_CONTINUE)) + NFS_CACHEINV(inode); + return nfs_revalidate_inode(server, inode); } /* * We judge how long we want to trust negative * dentries by looking at the parent inode mtime. * - * If mtime is close to present time, we revalidate - * more often. + * If parent mtime has changed, we revalidate, else we wait for a + * period corresponding to the parent's attribute cache timeout value. 
*/ -#define NFS_REVALIDATE_NEGATIVE (1 * HZ) -static inline int nfs_neg_need_reval(struct dentry *dentry) +static inline int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry) { - struct inode *dir = dentry->d_parent->d_inode; - unsigned long timeout = NFS_ATTRTIMEO(dir); - long diff = CURRENT_TIME - dir->i_mtime; - - if (diff < 5*60 && timeout > NFS_REVALIDATE_NEGATIVE) - timeout = NFS_REVALIDATE_NEGATIVE; - - return time_after(jiffies, dentry->d_time + timeout); + if (!nfs_check_verifier(dir, dentry)) + return 1; + return time_after(jiffies, dentry->d_time + NFS_ATTRTIMEO(dir)); } /* @@ -462,9 +516,8 @@ * NOTE! The hit can be a negative hit too, don't assume * we have an inode! * - * If the dentry is older than the revalidation interval, - * we do a new lookup and verify that the dentry is still - * correct. + * If the parent directory is seen to have changed, we throw out the + * cached dentry and do a new lookup. */ static int nfs_lookup_revalidate(struct dentry * dentry, int flags) { @@ -477,13 +530,9 @@ lock_kernel(); dir = dentry->d_parent->d_inode; inode = dentry->d_inode; - /* - * If we don't have an inode, let's look at the parent - * directory mtime to get a hint about how often we - * should validate things.. 
- */ + if (!inode) { - if (nfs_neg_need_reval(dentry)) + if (nfs_neg_need_reval(dir, dentry)) goto out_bad; goto out_valid; } @@ -494,48 +543,49 @@ goto out_bad; } - if (!nfs_dentry_force_reval(dentry, flags)) + /* Force a full look up iff the parent directory has changed */ + if (nfs_check_verifier(dir, dentry)) { + if (nfs_lookup_verify_inode(inode, flags)) + goto out_bad; goto out_valid; + } - if (IS_ROOT(dentry)) { - __nfs_revalidate_inode(NFS_SERVER(inode), inode); + error = nfs_cached_lookup(dir, dentry, &fhandle, &fattr); + if (!error) { + if (memcmp(NFS_FH(inode), &fhandle, sizeof(struct nfs_fh))!= 0) + goto out_bad; + if (nfs_lookup_verify_inode(inode, flags)) + goto out_bad; goto out_valid_renew; } - /* - * Do a new lookup and check the dentry attributes. - */ + if (NFS_STALE(inode)) + goto out_bad; + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); if (error) goto out_bad; - - /* Inode number matches? */ - if (!(fattr.valid & NFS_ATTR_FATTR) || - NFS_FSID(inode) != fattr.fsid || - NFS_FILEID(inode) != fattr.fileid) + if (memcmp(NFS_FH(inode), &fhandle, sizeof(struct nfs_fh))!= 0) goto out_bad; - - /* Ok, remember that we successfully checked it.. */ - nfs_refresh_inode(inode, &fattr); - - if (nfs_inode_is_stale(inode, &fhandle, &fattr)) + if ((error = nfs_refresh_inode(inode, &fattr)) != 0) goto out_bad; out_valid_renew: nfs_renew_times(dentry); -out_valid: + out_valid: unlock_kernel(); return 1; -out_bad: - shrink_dcache_parent(dentry); - /* If we have submounts, don't unhash ! */ - if (have_submounts(dentry)) - goto out_valid; - d_drop(dentry); - /* Purge readdir caches. */ - nfs_zap_caches(dir); - if (inode && S_ISDIR(inode->i_mode)) + out_bad: + NFS_CACHEINV(dir); + if (inode && S_ISDIR(inode->i_mode)) { + /* Purge readdir caches. */ nfs_zap_caches(inode); + /* If we have submounts, don't unhash ! 
*/ + if (have_submounts(dentry)) + goto out_valid; + shrink_dcache_parent(dentry); + } + d_drop(dentry); unlock_kernel(); return 0; } @@ -565,9 +615,12 @@ { if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { lock_kernel(); + inode->i_nlink--; nfs_complete_unlink(dentry); unlock_kernel(); } + if (is_bad_inode(inode)) + force_delete(inode); iput(inode); } @@ -594,6 +647,20 @@ error = -ENOMEM; dentry->d_op = &nfs_dentry_operations; + error = nfs_cached_lookup(dir, dentry, &fhandle, &fattr); + if (!error) { + error = -EACCES; + inode = nfs_fhget(dentry, &fhandle, &fattr); + if (inode) { + if (!(NFS_SERVER(dir)->flags & NFS_MOUNT_NOCTO)) + NFS_CACHEINV(inode); + d_add(dentry, inode); + nfs_renew_times(dentry); + error = 0; + } + goto out; + } + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); inode = NULL; if (error == -ENOENT) @@ -604,14 +671,87 @@ if (inode) { no_entry: d_add(dentry, inode); - nfs_renew_times(dentry); error = 0; } + nfs_renew_times(dentry); } out: return ERR_PTR(error); } +static inline +int find_dirent_name(nfs_readdir_descriptor_t *desc, struct page *page, struct dentry *dentry) +{ + struct nfs_entry *entry = desc->entry; + int status; + + while((status = dir_decode(desc)) == 0) { + if (entry->len != dentry->d_name.len) + continue; + if (memcmp(entry->name, dentry->d_name.name, entry->len)) + continue; + if (!(entry->fattr->valid & NFS_ATTR_FATTR)) + continue; + break; + } + return status; +} + +/* + * Use the cached Readdirplus results in order to avoid a LOOKUP call + * whenever we believe that the parent directory has not changed. + * + * We assume that any file creation/rename changes the directory mtime. + * As this results in a page cache invalidation whenever it occurs, + * we don't require any other tests for cache coherency. 
+ */ +static +int nfs_cached_lookup(struct inode *dir, struct dentry *dentry, + struct nfs_fh *fh, struct nfs_fattr *fattr) +{ + nfs_readdir_descriptor_t desc; + struct nfs_server *server; + struct nfs_entry entry; + struct page *page; + unsigned long timestamp = NFS_MTIME_UPDATE(dir); + int res; + + if (!NFS_USE_READDIRPLUS(dir)) + return -ENOENT; + server = NFS_SERVER(dir); + if (server->flags & NFS_MOUNT_NOAC) + return -ENOENT; + nfs_revalidate_inode(server, dir); + + entry.fh = fh; + entry.fattr = fattr; + + desc.decode = NFS_PROTO(dir)->decode_dirent; + desc.entry = &entry; + desc.page_index = 0; + desc.plus = 1; + + for(;(page = find_get_page(&dir->i_data, desc.page_index)); desc.page_index++) { + + res = -EIO; + if (Page_Uptodate(page)) { + desc.ptr = kmap(page); + res = find_dirent_name(&desc, page, dentry); + kunmap(page); + } + page_cache_release(page); + + if (res == 0) + goto out_found; + if (res != -EAGAIN) + break; + } + return -ENOENT; + out_found: + fattr->timestamp = timestamp; + return 0; +} + /* * Code common to create, mkdir, and mknod. */ diff -u --recursive --new-file linux-2.4.17/fs/nfs/direct.c linux-2.4.17-rpc_tweaks/fs/nfs/direct.c --- linux-2.4.17/fs/nfs/direct.c Thu Jan 1 01:00:00 1970 +++ linux-2.4.17-rpc_tweaks/fs/nfs/direct.c Wed Feb 20 17:17:26 2002 @@ -0,0 +1,390 @@ +/* + * linux/fs/nfs/direct.c + * + * High-performance direct I/O for the NFS client + * + * When an application requests uncached I/O, all read and write requests + * are made directly to the server; data stored or fetched via these + * requests is not cached in the Linux page cache. The client does not + * correct unaligned requests from applications. All requested bytes are + * held on permanent storage before a direct write system call returns to + * an application. Applications that manage their own data caching, such + * as databases, make very good use of direct I/O on local file systems. 
+ * + * Solaris implements an uncached I/O facility called directio() that + * is used for backups and sequential I/O to very large files. Solaris + * also supports uncaching whole NFS partitions with "-o forcedirectio," + * an undocumented mount option. + * + * Note that I/O to read in executables (e.g. kernel_read) cannot use + * direct (kiobuf) reads because there is no vma backing the passed-in + * data buffer. + * + * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust. + * + * Initial implementation: 12/2001 by Chuck Lever + * + * TODO: + * + * 1. Use concurrent asynchronous network requests rather than + * serialized synchronous network requests for normal (non-sync) + * direct I/O. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define NFSDBG_FACILITY (NFSDBG_PAGECACHE | NFSDBG_VFS) +#define VERF_SIZE (2 * sizeof(__u32)) + +static /* inline */ int +nfs_direct_read_rpc(struct file *file, struct nfs_readargs *arg) +{ + int result; + struct inode * inode = file->f_dentry->d_inode; + struct nfs_fattr fattr; + struct rpc_message msg; + struct nfs_readres res = { &fattr, arg->count, 0 }; + +#ifdef CONFIG_NFS_V3 + msg.rpc_proc = (NFS_PROTO(inode)->version == 3) ? + NFS3PROC_READ : NFSPROC_READ; +#else + msg.rpc_proc = NFSPROC_READ; +#endif + msg.rpc_argp = arg; + msg.rpc_resp = &res; + + lock_kernel(); + msg.rpc_cred = nfs_file_cred(file); + fattr.valid = 0; + result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_refresh_inode(inode, &fattr); + unlock_kernel(); + + return result; +} + +static /* inline */ int +nfs_direct_write_rpc(struct file *file, struct nfs_writeargs *arg, + struct nfs_writeverf *verf) +{ + int result; + struct inode *inode = file->f_dentry->d_inode; + struct nfs_fattr fattr; + struct rpc_message msg; + struct nfs_writeres res = { &fattr, verf, 0 }; + +#ifdef CONFIG_NFS_V3 + msg.rpc_proc = (NFS_PROTO(inode)->version == 3) ? 
+ NFS3PROC_WRITE : NFSPROC_WRITE; +#else + msg.rpc_proc = NFSPROC_WRITE; +#endif + msg.rpc_argp = arg; + msg.rpc_resp = &res; + + lock_kernel(); + msg.rpc_cred = get_rpccred(nfs_file_cred(file)); + fattr.valid = 0; + result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_write_attributes(inode, &fattr); + put_rpccred(msg.rpc_cred); + unlock_kernel(); + +#ifdef CONFIG_NFS_V3 + if (NFS_PROTO(inode)->version == 3) { + if (result > 0) { + if ((arg->stable == NFS_FILE_SYNC) && + (verf->committed != NFS_FILE_SYNC)) { + printk(KERN_ERR __FUNCTION__ + ": server didn't sync stable write request\n"); + return -EIO; + } + + if (result != arg->count) { + printk(KERN_INFO __FUNCTION__ + ": short write, count=%u, result=%d\n", + arg->count, result); + } + } + return result; + } else { +#endif + verf->committed = NFS_FILE_SYNC; /* NFSv2 always syncs data */ + if (result == 0) + return arg->count; + return result; +#ifdef CONFIG_NFS_V3 + } +#endif +} + +#ifdef CONFIG_NFS_V3 +static /* inline */ int +nfs_direct_commit_rpc(struct inode *inode, loff_t offset, size_t count, + struct nfs_writeverf *verf) +{ + int result; + struct nfs_fattr fattr; + struct nfs_writeargs arg = { NFS_FH(inode), offset, count, 0, 0, + {{0, 0}, {0,0}, {0,0}, {0,0}, + {0,0}, {0,0}, {0,0}, {0,0}} }; + struct nfs_writeres res = { &fattr, verf, 0 }; + struct rpc_message msg = { NFS3PROC_COMMIT, &arg, &res, NULL }; + + fattr.valid = 0; + + lock_kernel(); + result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_write_attributes(inode, &fattr); + unlock_kernel(); + + return result; +} +#else +static inline int +nfs_direct_commit_rpc(struct inode *inode, loff_t offset, size_t count, + struct nfs_writeverf *verf) +{ + return 0; +} +#endif + +/* + * Walk through the iobuf and create an iovec for each "rsize" bytes. 
+ */
+static int
+nfs_direct_read(struct file *file, struct kiobuf *iobuf, loff_t offset,
+		size_t count)
+{
+	int total = 0;
+	struct inode *inode = file->f_dentry->d_inode;
+	int rsize = NFS_SERVER(inode)->rsize;
+	struct nfs_readargs args = { NFS_FH(inode), 0, 0, 0 };
+	const unsigned int maxiov = sizeof(args.iov) / sizeof(args.iov[0]);
+
+	while (count) {
+		/* Resume right after the last byte read, even mid-page. */
+		unsigned long pos = iobuf->offset + total;
+		int curpage = pos / PAGE_SIZE;
+		int pgoff = pos % PAGE_SIZE;
+		int request, result, first, i;
+		struct iovec *iovec = args.iov;
+
+		request = count;
+		if (count > rsize)
+			request = rsize;
+		args.count = 0;		/* grows as pages are mapped */
+		args.offset = offset;
+		args.nriov = 0;		/* iovec list is rebuilt per RPC */
+		first = curpage;
+		while (request > 0 && curpage < iobuf->nr_pages &&
+		       args.nriov < maxiov) {
+			struct page *page = iobuf->maplist[curpage];
+			if (!page)
+				break;
+			iovec->iov_base = kmap(page) + pgoff;
+			iovec->iov_len = PAGE_SIZE - pgoff;
+			if (iovec->iov_len > request)
+				iovec->iov_len = request;
+			request -= iovec->iov_len;
+			args.count += iovec->iov_len;
+			pgoff = 0;	/* later pages start at byte zero */
+			curpage++;
+			iovec++;
+			args.nriov++;
+		}
+
+		/* -EFAULT when the iobuf has no page to map at this position */
+		result = -EFAULT;
+		if (args.nriov)
+			result = nfs_direct_read_rpc(file, &args);
+		/* Unmap every page mapped above, the last one included. */
+		for (i = first; i < curpage; i++) {
+			flush_dcache_page(iobuf->maplist[i]);
+			kunmap(iobuf->maplist[i]);
+		}
+
+		if (result < 0) {
+			total = (result == -EISDIR) ? -EINVAL : result;
+			break;
+		}
+
+		total += result;
+		count -= result;
+		offset += result;
+
+		if (result < args.count)	/* NFSv2ism */
+			break;
+	};
+	return total;
+}
+
+/*
+ * Walk through the iobuf and create an iovec for each "wsize" bytes.
+ * If only one network write is necessary, or if the O_SYNC flag or
+ * 'sync' mount option are present, or if this is a V2 inode, use
+ * FILE_SYNC. Otherwise, use UNSTABLE and finish with a COMMIT.
+ *
+ * The mechanics of this function are much the same as nfs_direct_read,
+ * with the added complexity of committing unstable writes.
+ */
+static int
+nfs_direct_write(struct file *file, struct kiobuf *iobuf,
+		loff_t offset, size_t count)
+{
+	int total, need_commit;
+	loff_t save_offset = offset;
+	struct inode *inode = file->f_dentry->d_inode;
+	int wsize = NFS_SERVER(inode)->wsize;
+	struct nfs_writeverf first_verf, ret_verf;
+	struct nfs_writeargs args = { NFS_FH(inode), 0, 0, NFS_FILE_SYNC, 0 };
+	const unsigned int maxiov = sizeof(args.iov) / sizeof(args.iov[0]);
+
+#ifdef CONFIG_NFS_V3
+	if ((NFS_PROTO(inode)->version == 3) && (count > wsize) &&
+	    (!IS_SYNC(inode)))
+		args.stable = NFS_UNSTABLE;
+#endif
+
+retry:
+	total = 0;
+	need_commit = 0;	/* each pass decides afresh whether to COMMIT */
+	while (count) {
+		/* Resume right after the last byte written, even mid-page. */
+		unsigned long pos = iobuf->offset + total;
+		int curpage = pos / PAGE_SIZE;
+		int pgoff = pos % PAGE_SIZE;
+		int request, result, first, i;
+		struct iovec *iovec = args.iov;
+
+		request = count;
+		if (count > wsize)
+			request = wsize;
+		args.count = 0;		/* grows as pages are mapped */
+		args.offset = offset;
+		args.nriov = 0;		/* iovec list is rebuilt per RPC */
+		first = curpage;
+		while (request > 0 && curpage < iobuf->nr_pages &&
+		       args.nriov < maxiov) {
+			struct page *page = iobuf->maplist[curpage];
+			if (!page)
+				break;
+			iovec->iov_base = kmap(page) + pgoff;
+			iovec->iov_len = PAGE_SIZE - pgoff;
+			if (iovec->iov_len > request)
+				iovec->iov_len = request;
+			request -= iovec->iov_len;
+			args.count += iovec->iov_len;
+			pgoff = 0;	/* later pages start at byte zero */
+			curpage++;
+			iovec++;
+			args.nriov++;
+		}
+
+		/* -EFAULT when the iobuf has no page to map at this position */
+		result = -EFAULT;
+		if (args.nriov)
+			result = nfs_direct_write_rpc(file, &args, &ret_verf);
+		/* Unmap every page mapped above, the last one included. */
+		for (i = first; i < curpage; i++)
+			kunmap(iobuf->maplist[i]);
+
+		if (result < 0) {
+			total = result;
+			break;
+		}
+		if (!total)
+			memcpy(&first_verf.verifier, &ret_verf.verifier,
+			       VERF_SIZE);
+		if (ret_verf.committed != NFS_FILE_SYNC) {
+			need_commit = 1;
+			if (memcmp(&first_verf.verifier, &ret_verf.verifier,
+				   VERF_SIZE))
+				goto print_retry;
+		}
+		total += result;
+		count -= result;
+		offset += result;
+	};
+
+	/* Commit data written so far, even in the event of an error */
+	if (need_commit) {
+		if (nfs_direct_commit_rpc(inode, save_offset,
+					  iobuf->length - count, &ret_verf))
+			goto print_retry;
+		if (memcmp(&first_verf.verifier, &ret_verf.verifier,
+			   VERF_SIZE))
+			goto print_retry;
+	}
+
+	return total;
+
+print_retry:
+	printk(KERN_INFO __FUNCTION__
+		": detected server restart; retrying with FILE_SYNC\n");
+	args.stable = NFS_FILE_SYNC;
+	offset = save_offset;
+	count = iobuf->length;
+	goto retry;
+}
+
+/*
+ * Read or write data, moving the data directly to/from the
+ * application's buffer without caching in the page cache.
+ * Returns the number of bytes moved, or a negative errno (-EINVAL
+ * when rw is neither READ nor WRITE).
+ *
+ * Rules for direct I/O
+ * 1. block size = 512 bytes or more
+ * 2. file byte offset is block aligned
+ * 3. byte count is a multiple of block size
+ * 4. user buffer is not aligned
+ * 5. user buffer is faulted in and pinned
+ * These are verified before we get here.
+ */
+int
+nfs_direct_IO(int rw, struct file *file, struct kiobuf *iobuf,
+		unsigned long blocknr, int blocksize)
+{
+	int result = -EINVAL;
+	size_t count = iobuf->length;
+	struct dentry *dentry = file->f_dentry;
+	struct inode *inode = dentry->d_inode;
+	loff_t offset = (loff_t) blocknr << inode->i_blkbits; /* shift in 64 bits */
+
+	switch (rw) {
+	case READ:
+		dfprintk(VFS,
+			"NFS: direct_IO(READ) (%s/%s) off/cnt(%Lu/%lu)\n",
+			dentry->d_parent->d_name.name, dentry->d_name.name,
+			(unsigned long long) offset, (unsigned long) count);
+
+		result = nfs_direct_read(file, iobuf, offset, count);
+		break;
+	case WRITE:
+		dfprintk(VFS,
+			"NFS: direct_IO(WRITE) (%s/%s) off/cnt(%Lu/%lu)\n",
+			dentry->d_parent->d_name.name, dentry->d_name.name,
+			(unsigned long long) offset, (unsigned long) count);
+
+		result = nfs_direct_write(file, iobuf, offset, count);
+		break;
+	default:
+		break;
+	}
+
+	dfprintk(VFS, "NFS: direct_IO result = %d\n", result);
+	return result;
+}
diff -u --recursive --new-file linux-2.4.17/fs/nfs/file.c linux-2.4.17-rpc_tweaks/fs/nfs/file.c --- linux-2.4.17/fs/nfs/file.c Sun Sep 23 18:48:01 2001 +++ linux-2.4.17-rpc_tweaks/fs/nfs/file.c Sun Jan 13 17:21:27 2002 @@ -16,6 +16,7 @@ * nfs regular file handling functions */ +#include #include #include #include @@ -99,7 +100,9 @@ dentry->d_parent->d_name.name, dentry->d_name.name, (unsigned long) count, 
(unsigned long) *ppos); + lock_kernel(); result = nfs_revalidate_inode(NFS_SERVER(inode), inode); + unlock_kernel(); if (!result) result = generic_file_read(file, buf, count, ppos); return result; @@ -115,7 +118,9 @@ dfprintk(VFS, "nfs: mmap(%s/%s)\n", dentry->d_parent->d_name.name, dentry->d_name.name); + lock_kernel(); status = nfs_revalidate_inode(NFS_SERVER(inode), inode); + unlock_kernel(); if (!status) status = generic_file_mmap(file, vma); return status; @@ -134,13 +139,11 @@ dfprintk(VFS, "nfs: fsync(%x/%ld)\n", inode->i_dev, inode->i_ino); - lock_kernel(); status = nfs_wb_file(inode, file); if (!status) { status = file->f_error; file->f_error = 0; } - unlock_kernel(); return status; } @@ -160,17 +163,7 @@ static int nfs_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to) { - long status; - loff_t pos = ((loff_t)page->index<mapping->host; - - lock_kernel(); - status = nfs_updatepage(file, page, offset, to-offset); - unlock_kernel(); - /* most likely it's already done. 
CHECKME */ - if (pos > inode->i_size) - inode->i_size = pos; - return status; + return nfs_updatepage(file, page, offset, to-offset); } /* @@ -204,7 +197,10 @@ sync_page: nfs_sync_page, writepage: nfs_writepage, prepare_write: nfs_prepare_write, - commit_write: nfs_commit_write + commit_write: nfs_commit_write, +#ifdef CONFIG_NFS_DIRECTIO + direct_IO: nfs_direct_IO, +#endif }; /* @@ -224,7 +220,9 @@ result = -EBUSY; if (IS_SWAPFILE(inode)) goto out_swapfile; + lock_kernel(); result = nfs_revalidate_inode(NFS_SERVER(inode), inode); + unlock_kernel(); if (result) goto out; diff -u --recursive --new-file linux-2.4.17/fs/nfs/flushd.c linux-2.4.17-rpc_tweaks/fs/nfs/flushd.c --- linux-2.4.17/fs/nfs/flushd.c Fri Nov 9 23:28:15 2001 +++ linux-2.4.17-rpc_tweaks/fs/nfs/flushd.c Tue Feb 19 15:51:15 2002 @@ -51,6 +51,19 @@ * This is the wait queue all cluster daemons sleep on */ static struct rpc_wait_queue flushd_queue = RPC_INIT_WAITQ("nfs_flushd"); +static spinlock_t nfs_flushd_lock = SPIN_LOCK_UNLOCKED; + +static inline void +nfs_lock_flushd(void) +{ + spin_lock(&nfs_flushd_lock); +} + +static inline void +nfs_unlock_flushd(void) +{ + spin_unlock(&nfs_flushd_lock); +} /* * Local function declarations. 
@@ -59,7 +72,7 @@ static void nfs_flushd_exit(struct rpc_task *); -int nfs_reqlist_init(struct nfs_server *server) +static int nfs_reqlist_init(struct nfs_server *server) { struct nfs_reqlist *cache; struct rpc_task *task; @@ -67,12 +80,11 @@ dprintk("NFS: writecache_init\n"); - lock_kernel(); - status = -ENOMEM; /* Create the RPC task */ if (!(task = rpc_new_task(server->client, NULL, RPC_TASK_ASYNC))) - goto out_unlock; + return -ENOMEM; + nfs_lock_flushd(); cache = server->rw_requests; status = 0; @@ -89,22 +101,21 @@ cache->auth = server->client->cl_auth; task->tk_action = nfs_flushd; task->tk_exit = nfs_flushd_exit; + nfs_unlock_flushd(); rpc_execute(task); - unlock_kernel(); return 0; out_unlock: - if (task) - rpc_release_task(task); - unlock_kernel(); - return status; + nfs_unlock_flushd(); + rpc_release_task(task); + return 0; } void nfs_reqlist_exit(struct nfs_server *server) { struct nfs_reqlist *cache; - lock_kernel(); + nfs_lock_flushd(); cache = server->rw_requests; if (!cache) goto out; @@ -114,11 +125,13 @@ while (cache->task) { rpc_exit(cache->task, 0); rpc_wake_up_task(cache->task); + nfs_unlock_flushd(); interruptible_sleep_on_timeout(&cache->request_wait, 1 * HZ); + nfs_lock_flushd(); } out: - unlock_kernel(); + nfs_unlock_flushd(); } int nfs_reqlist_alloc(struct nfs_server *server) @@ -136,7 +149,7 @@ init_waitqueue_head(&cache->request_wait); server->rw_requests = cache; - return 0; + return nfs_reqlist_init(server); } void nfs_reqlist_free(struct nfs_server *server) @@ -183,11 +196,13 @@ } dprintk("NFS: %4d flushd back to sleep\n", task->tk_pid); + nfs_lock_flushd(); if (task->tk_action) { task->tk_timeout = NFS_FLUSHD_TIMEOUT; cache->runat = jiffies + task->tk_timeout; rpc_sleep_on(&flushd_queue, task, NULL, NULL); } + nfs_unlock_flushd(); } static void @@ -196,10 +211,13 @@ struct nfs_server *server; struct nfs_reqlist *cache; server = (struct nfs_server *) task->tk_calldata; + + nfs_lock_flushd(); cache = server->rw_requests; if 
(cache->task == task) cache->task = NULL; wake_up(&cache->request_wait); + nfs_unlock_flushd(); } diff -u --recursive --new-file linux-2.4.17/fs/nfs/inode.c linux-2.4.17-rpc_tweaks/fs/nfs/inode.c --- linux-2.4.17/fs/nfs/inode.c Fri Dec 21 18:41:55 2001 +++ linux-2.4.17-rpc_tweaks/fs/nfs/inode.c Mon Jan 28 11:56:39 2002 @@ -86,6 +86,9 @@ &nfs_rpcstat, }; +/* Spinlock to protect the NFS inode update */ +static spinlock_t nfs_inode_lock = SPIN_LOCK_UNLOCKED; + static inline unsigned long nfs_fattr_to_ino_t(struct nfs_fattr *fattr) { @@ -107,17 +110,10 @@ inode->i_rdev = 0; /* We can't support UPDATE_ATIME(), since the server will reset it */ inode->i_flags |= S_NOATIME; - NFS_FILEID(inode) = 0; - NFS_FSID(inode) = 0; - NFS_FLAGS(inode) = 0; INIT_LIST_HEAD(&inode->u.nfs_i.read); INIT_LIST_HEAD(&inode->u.nfs_i.dirty); INIT_LIST_HEAD(&inode->u.nfs_i.commit); INIT_LIST_HEAD(&inode->u.nfs_i.writeback); - inode->u.nfs_i.nread = 0; - inode->u.nfs_i.ndirty = 0; - inode->u.nfs_i.ncommit = 0; - inode->u.nfs_i.npages = 0; NFS_CACHEINV(inode); NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); NFS_ATTRTIMEO_UPDATE(inode) = jiffies; @@ -258,6 +254,69 @@ } /* + * Set up the NFS superblock private area using probed values + */ +static int +nfs_setup_superblock(struct super_block *sb, struct nfs_fh *rootfh) +{ + struct nfs_server *server = &sb->u.nfs_sb.s_server; + struct nfs_fattr fattr; + struct nfs_fsinfo fsinfo = { &fattr, }; + struct nfs_pathconf pathinfo = { &fattr, }; + int maxlen, res; + + res = server->rpc_ops->fsinfo(server, rootfh, &fsinfo); + if (res < 0) + return res; + + /* Work out a lot of parameters */ + if (!server->rsize) + server->rsize = nfs_block_size(fsinfo.rtpref, NULL); + if (!server->wsize) + server->wsize = nfs_block_size(fsinfo.wtpref, NULL); + + /* NFSv3: we don't have bsize, but rather rtmult and wtmult... 
*/ + if (!fsinfo.wtmult) + fsinfo.wtmult = 512; + sb->s_blocksize = nfs_block_bits(fsinfo.wtmult, &sb->s_blocksize_bits); + + if (server->rsize > fsinfo.rtmax) + server->rsize = fsinfo.rtmax; + if (server->wsize > fsinfo.wtmax) + server->wsize = fsinfo.wtmax; + + server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (server->rpages > NFS_READ_MAXIOV) { + server->rpages = NFS_READ_MAXIOV; + server->rsize = server->rpages << PAGE_CACHE_SHIFT; + } + + server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (server->wpages > NFS_WRITE_MAXIOV) { + server->wpages = NFS_WRITE_MAXIOV; + server->wsize = server->wpages << PAGE_CACHE_SHIFT; + } + + server->dtsize = nfs_block_size(fsinfo.dtpref, NULL); + if (server->dtsize > PAGE_CACHE_SIZE) + server->dtsize = PAGE_CACHE_SIZE; + if (server->dtsize > server->rsize) + server->dtsize = server->rsize; + + maxlen = (server->rpc_ops->version == 2) ? NFS2_MAXNAMLEN : NFS3_MAXNAMLEN; + if (!server->namelen) { + res = server->rpc_ops->pathconf(server, rootfh, &pathinfo); + if (!res) + server->namelen = pathinfo.name_max; + } + if (!server->namelen || server->namelen > maxlen) + server->namelen = maxlen; + + sb->s_maxbytes = fsinfo.maxfilesize; + return 0; +} + +/* * The way this works is that the mount process passes a structure * in the data argument which contains the server's IP address * and the root file handle obtained from the server's mount @@ -275,8 +334,7 @@ unsigned int authflavor; struct sockaddr_in srvaddr; struct rpc_timeout timeparms; - struct nfs_fsinfo fsinfo; - int tcp, version, maxlen; + int tcp, version; memset(&sb->u.nfs_sb, 0, sizeof(sb->u.nfs_sb)); if (!data) @@ -305,11 +363,11 @@ sb->s_magic = NFS_SUPER_MAGIC; sb->s_op = &nfs_sops; - sb->s_blocksize_bits = 0; - sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits); server = &sb->u.nfs_sb.s_server; - server->rsize = nfs_block_size(data->rsize, NULL); - server->wsize = nfs_block_size(data->wsize, 
NULL); + if (data->rsize) + server->rsize = nfs_block_size(data->rsize, NULL); + if (data->wsize) + server->wsize = nfs_block_size(data->wsize, NULL); server->flags = data->flags & NFS_MOUNT_FLAGMASK; if (data->flags & NFS_MOUNT_NOAC) { @@ -339,6 +397,7 @@ #ifdef CONFIG_NFS_V3 server->rpc_ops = &nfs_v3_clientops; version = 3; + server->caps |= NFS_CAP_READDIRPLUS; if (data->version < 4) { printk(KERN_NOTICE "NFS: NFSv3 not supported by mount program.\n"); goto out_unlock; @@ -416,68 +475,19 @@ sb->s_root->d_op = &nfs_dentry_operations; /* Get some general file system info */ - if (server->rpc_ops->statfs(server, root, &fsinfo) >= 0) { - if (server->namelen == 0) - server->namelen = fsinfo.namelen; - } else { + if (nfs_setup_superblock(sb, root) < 0) { printk(KERN_NOTICE "NFS: cannot retrieve file system info.\n"); goto out_no_root; } - /* Work out a lot of parameters */ - if (data->rsize == 0) - server->rsize = nfs_block_size(fsinfo.rtpref, NULL); - if (data->wsize == 0) - server->wsize = nfs_block_size(fsinfo.wtpref, NULL); - /* NFSv3: we don't have bsize, but rather rtmult and wtmult... */ - if (!fsinfo.bsize) - fsinfo.bsize = (fsinfo.rtmult>fsinfo.wtmult) ? 
fsinfo.rtmult : fsinfo.wtmult; - /* Also make sure we don't go below rsize/wsize since - * RPC calls are expensive */ - if (fsinfo.bsize < server->rsize) - fsinfo.bsize = server->rsize; - if (fsinfo.bsize < server->wsize) - fsinfo.bsize = server->wsize; - - if (data->bsize == 0) - sb->s_blocksize = nfs_block_bits(fsinfo.bsize, &sb->s_blocksize_bits); - if (server->rsize > fsinfo.rtmax) - server->rsize = fsinfo.rtmax; - if (server->wsize > fsinfo.wtmax) - server->wsize = fsinfo.wtmax; - - server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (server->rpages > NFS_READ_MAXIOV) { - server->rpages = NFS_READ_MAXIOV; - server->rsize = server->rpages << PAGE_CACHE_SHIFT; - } - - server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (server->wpages > NFS_WRITE_MAXIOV) { - server->wpages = NFS_WRITE_MAXIOV; - server->wsize = server->wpages << PAGE_CACHE_SHIFT; - } - - server->dtsize = nfs_block_size(fsinfo.dtpref, NULL); - if (server->dtsize > PAGE_CACHE_SIZE) - server->dtsize = PAGE_CACHE_SIZE; - if (server->dtsize > server->rsize) - server->dtsize = server->rsize; - - maxlen = (version == 2) ? NFS2_MAXNAMLEN : NFS3_MAXNAMLEN; - - if (server->namelen == 0 || server->namelen > maxlen) - server->namelen = maxlen; - - sb->s_maxbytes = fsinfo.maxfilesize; - /* Fire up the writeback cache */ if (nfs_reqlist_alloc(server) < 0) { printk(KERN_NOTICE "NFS: cannot initialize writeback cache.\n"); goto failure_kill_reqlist; } - /* We're airborne */ + /* We're airborne. 
Set socket buffersize */ + xprt_setbufsize(xprt, server->wsize + 1024, server->rsize + 1024); /* Check whether to start the lockd process */ if (!(server->flags & NFS_MOUNT_NONLM)) @@ -530,7 +540,8 @@ struct nfs_server *server = &sb->u.nfs_sb.s_server; unsigned char blockbits; unsigned long blockres; - struct nfs_fsinfo res; + struct nfs_fattr attr; + struct nfs_fsstat res = { &attr, }; int error; error = server->rpc_ops->statfs(server, NFS_FH(sb->s_root->d_inode), &res); @@ -538,18 +549,15 @@ if (error < 0) goto out_err; - if (res.bsize == 0) - res.bsize = sb->s_blocksize; - buf->f_bsize = nfs_block_bits(res.bsize, &blockbits); + buf->f_bsize = sb->s_blocksize; + blockbits = sb->s_blocksize_bits; blockres = (1 << blockbits) - 1; buf->f_blocks = (res.tbytes + blockres) >> blockbits; buf->f_bfree = (res.fbytes + blockres) >> blockbits; buf->f_bavail = (res.abytes + blockres) >> blockbits; buf->f_files = res.tfiles; buf->f_ffree = res.afiles; - if (res.namelen == 0 || res.namelen > server->namelen) - res.namelen = server->namelen; - buf->f_namelen = res.namelen; + buf->f_namelen = server->namelen; return 0; out_err: printk("nfs_statfs: statfs error = %d\n", -error); @@ -600,18 +608,30 @@ } /* + * Reset the read time on the local caches + */ +void +nfs_invalidate_caches(struct inode *inode) +{ + spin_lock(&nfs_inode_lock); + NFS_READTIME(inode) = jiffies - NFS_MAXATTRTIMEO(inode) - 1; + spin_unlock(&nfs_inode_lock); +} + +/* * Invalidate the local caches */ void nfs_zap_caches(struct inode *inode) { - NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); - NFS_ATTRTIMEO_UPDATE(inode) = jiffies; - invalidate_inode_pages(inode); + spin_lock(&nfs_inode_lock); + NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); + NFS_ATTRTIMEO_UPDATE(inode) = jiffies; memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); - NFS_CACHEINV(inode); + NFS_READTIME(inode) = jiffies - NFS_MAXATTRTIMEO(inode) - 1; + spin_unlock(&nfs_inode_lock); } /* @@ -627,50 +647,35 @@ nfs_zap_caches(inode); 
} +/* Don't use READDIRPLUS on directories that we believe are too large */ +#define NFS_LIMIT_READDIRPLUS (8*PAGE_SIZE) + /* * Fill in inode information from the fattr. */ static void nfs_fill_inode(struct inode *inode, struct nfs_fh *fh, struct nfs_fattr *fattr) { - /* - * Check whether the mode has been set, as we only want to - * do this once. (We don't allow inodes to change types.) + NFS_FILEID(inode) = fattr->fileid; + inode->i_mode = fattr->mode; + /* Why so? Because we want revalidate for devices/FIFOs, and + * that's precisely what we have in nfs_file_inode_operations. */ - if (inode->i_mode == 0) { - NFS_FILEID(inode) = fattr->fileid; - NFS_FSID(inode) = fattr->fsid; - inode->i_mode = fattr->mode; - /* Why so? Because we want revalidate for devices/FIFOs, and - * that's precisely what we have in nfs_file_inode_operations. - */ - inode->i_op = &nfs_file_inode_operations; - if (S_ISREG(inode->i_mode)) { - inode->i_fop = &nfs_file_operations; - inode->i_data.a_ops = &nfs_file_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &nfs_dir_inode_operations; - inode->i_fop = &nfs_dir_operations; - } else if (S_ISLNK(inode->i_mode)) - inode->i_op = &nfs_symlink_inode_operations; - else - init_special_inode(inode, inode->i_mode, fattr->rdev); - /* - * Preset the size and mtime, as there's no need - * to invalidate the caches. 
- */ - inode->i_size = nfs_size_to_loff_t(fattr->size); - inode->i_mtime = nfs_time_to_secs(fattr->mtime); - inode->i_atime = nfs_time_to_secs(fattr->atime); - inode->i_ctime = nfs_time_to_secs(fattr->ctime); - NFS_CACHE_CTIME(inode) = fattr->ctime; - NFS_CACHE_MTIME(inode) = fattr->mtime; - NFS_CACHE_ISIZE(inode) = fattr->size; - NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); - NFS_ATTRTIMEO_UPDATE(inode) = jiffies; - memcpy(&inode->u.nfs_i.fh, fh, sizeof(inode->u.nfs_i.fh)); - } - nfs_refresh_inode(inode, fattr); + inode->i_op = &nfs_file_inode_operations; + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &nfs_file_operations; + inode->i_data.a_ops = &nfs_file_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &nfs_dir_inode_operations; + inode->i_fop = &nfs_dir_operations; + if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) + && fattr->size <= NFS_LIMIT_READDIRPLUS) + NFS_FLAGS(inode) |= NFS_INO_ADVISE_RDPLUS; + } else if (S_ISLNK(inode->i_mode)) + inode->i_op = &nfs_symlink_inode_operations; + else + init_special_inode(inode, inode->i_mode, fattr->rdev); + memcpy(&inode->u.nfs_i.fh, fh, sizeof(inode->u.nfs_i.fh)); } struct nfs_find_desc { @@ -691,34 +696,16 @@ struct nfs_fh *fh = desc->fh; struct nfs_fattr *fattr = desc->fattr; - if (NFS_FSID(inode) != fattr->fsid) - return 0; if (NFS_FILEID(inode) != fattr->fileid) return 0; if (memcmp(&inode->u.nfs_i.fh, fh, sizeof(inode->u.nfs_i.fh)) != 0) return 0; - return 1; -} - -int -nfs_inode_is_stale(struct inode *inode, struct nfs_fh *fh, struct nfs_fattr *fattr) -{ - /* Empty inodes are not stale */ - if (!inode->i_mode) + if (is_bad_inode(inode)) return 0; - - if ((fattr->mode & S_IFMT) != (inode->i_mode & S_IFMT)) - return 1; - - if (is_bad_inode(inode) || NFS_STALE(inode)) - return 1; - - /* Has the filehandle changed? If so is the old one stale? 
*/ - if (memcmp(&inode->u.nfs_i.fh, fh, sizeof(inode->u.nfs_i.fh)) != 0 && - __nfs_revalidate_inode(NFS_SERVER(inode),inode) == -ESTALE) - return 1; - - return 0; + /* Force an attribute cache update if inode->i_count == 0 */ + if (!atomic_read(&inode->i_count)) + NFS_CACHEINV(inode); + return 1; } /* @@ -763,7 +750,14 @@ if (!(inode = iget4(sb, ino, nfs_find_actor, &desc))) goto out_no_inode; - nfs_fill_inode(inode, fh, fattr); + /* + * Check whether the mode has been set, as we only want to + * do this once. (We don't allow inodes to change types.) + */ + if (inode->i_mode == 0) + nfs_fill_inode(inode, fh, fattr); + + nfs_refresh_inode(inode, fattr); dprintk("NFS: __nfs_fhget(%x/%Ld ct=%d)\n", inode->i_dev, (long long)NFS_FILEID(inode), atomic_read(&inode->i_count)); @@ -786,7 +780,7 @@ /* * Make sure the inode is up-to-date. */ - error = nfs_revalidate(dentry); + error = nfs_revalidate_inode(NFS_SERVER(inode),inode); if (error) { #ifdef NFS_PARANOIA printk("nfs_notify_change: revalidate failed, error=%d\n", error); @@ -797,7 +791,9 @@ if (!S_ISREG(inode->i_mode)) attr->ia_valid &= ~ATTR_SIZE; + filemap_fdatasync(inode->i_mapping); error = nfs_wb_all(inode); + filemap_fdatawait(inode->i_mapping); if (error) goto out; @@ -825,6 +821,8 @@ fattr.pre_ctime = NFS_CACHE_CTIME(inode); fattr.valid |= NFS_ATTR_WCC; } + /* Force an attribute cache update */ + NFS_CACHEINV(inode); error = nfs_refresh_inode(inode, &fattr); out: return error; @@ -854,7 +852,26 @@ nfs_revalidate(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - return nfs_revalidate_inode(NFS_SERVER(inode), inode); + int status; + lock_kernel(); + status = nfs_revalidate_inode(NFS_SERVER(inode), inode); + unlock_kernel(); + return status; +} + +/* + * Another revalidation function: This one checks inodes for staleness + * when we've bypassed the ordinary dcache revalidation routines. + * e.g. 
open(".") + */ +int +nfs_check_stale(struct inode *inode) +{ + if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NOCTO)) + NFS_CACHEINV(inode); + if (NFS_STALE(inode)) + return -ESTALE; + return 0; } /* @@ -883,13 +900,11 @@ struct rpc_auth *auth; struct rpc_cred *cred; - lock_kernel(); auth = NFS_CLIENT(inode)->cl_auth; cred = rpcauth_lookupcred(auth, 0); filp->private_data = cred; if (filp->f_mode & FMODE_WRITE) nfs_set_mmcred(inode, cred); - unlock_kernel(); return 0; } @@ -897,11 +912,9 @@ { struct rpc_cred *cred; - lock_kernel(); cred = nfs_file_cred(filp); if (cred) put_rpccred(cred); - unlock_kernel(); return 0; } @@ -918,7 +931,6 @@ dfprintk(PAGECACHE, "NFS: revalidating (%x/%Ld)\n", inode->i_dev, (long long)NFS_FILEID(inode)); - lock_kernel(); if (!inode || is_bad_inode(inode)) goto out_nowait; if (NFS_STALE(inode) && inode != inode->i_sb->s_root->d_inode) @@ -961,10 +973,50 @@ NFS_FLAGS(inode) &= ~NFS_INO_REVALIDATING; wake_up(&inode->i_wait); out_nowait: - unlock_kernel(); return status; } +/** + * nfs_grow_isize - Extend inode->i_size + * @inode: inode + * @size: new file size + */ +void nfs_grow_isize(struct inode *inode, loff_t size) +{ + spin_lock(&nfs_inode_lock); + if (inode->i_size < size) + inode->i_size = size; + spin_unlock(&nfs_inode_lock); +} + +/* + * nfs_fattr_obsolete - Test if attribute data is newer than cached data + * @inode: inode + * @fattr: attributes to test + * + * Avoid stuffing the attribute cache with obsolete information. + * We always accept updates if the attribute cache timed out, or if + * fattr->ctime is newer than our cached value. + * If fattr->ctime matches the cached value, we still accept the update + * if it increases the file size. + */ +static inline +int nfs_fattr_obsolete(struct inode *inode, struct nfs_fattr *fattr) +{ + s64 cdif; + + if (time_after_eq(jiffies, NFS_READTIME(inode)+NFS_ATTRTIMEO(inode))) + goto out_valid; + if ((cdif = (s64)fattr->ctime - (s64)NFS_CACHE_CTIME(inode)) > 0) + goto out_valid; + /* Ugh... 
*/ + if (cdif == 0 && fattr->size > NFS_CACHE_ISIZE(inode)) + goto out_valid; + return -1; + out_valid: + return 0; +} + /* * Many nfs protocol calls return the new file attributes after * an operation. Here we update the inode to reflect the state @@ -982,21 +1034,16 @@ { __u64 new_size, new_mtime; loff_t new_isize; + time_t new_atime; int invalid = 0; dfprintk(VFS, "NFS: refresh_inode(%x/%ld ct=%d info=0x%x)\n", inode->i_dev, inode->i_ino, atomic_read(&inode->i_count), fattr->valid); - if (NFS_FSID(inode) != fattr->fsid || - NFS_FILEID(inode) != fattr->fileid) { - printk(KERN_ERR "nfs_refresh_inode: inode number mismatch\n" - "expected (0x%Lx/0x%Lx), got (0x%Lx/0x%Lx)\n", - (long long)NFS_FSID(inode), (long long)NFS_FILEID(inode), - (long long)fattr->fsid, (long long)fattr->fileid); - goto out_err; - } - + /* Throw out obsolete READDIRPLUS attributes */ + if (time_after(NFS_READTIME(inode), fattr->timestamp)) + return 0; /* * Make sure the inode's type hasn't changed. */ @@ -1007,10 +1054,16 @@ new_size = fattr->size; new_isize = nfs_size_to_loff_t(fattr->size); + new_atime = nfs_time_to_secs(fattr->atime); + /* Avoid races */ + spin_lock(&nfs_inode_lock); + if (nfs_fattr_obsolete(inode, fattr)) + goto out_nochange; + /* * Update the read time so we don't revalidate too often. */ - NFS_READTIME(inode) = jiffies; + NFS_READTIME(inode) = fattr->timestamp; /* * Note: NFS_CACHE_ISIZE(inode) reflects the state of the cache. 
@@ -1056,10 +1109,13 @@ NFS_CACHE_CTIME(inode) = fattr->ctime; inode->i_ctime = nfs_time_to_secs(fattr->ctime); - inode->i_atime = nfs_time_to_secs(fattr->atime); + inode->i_atime = new_atime; - NFS_CACHE_MTIME(inode) = new_mtime; - inode->i_mtime = nfs_time_to_secs(new_mtime); + if (NFS_CACHE_MTIME(inode) != new_mtime) { + NFS_MTIME_UPDATE(inode) = fattr->timestamp; + NFS_CACHE_MTIME(inode) = new_mtime; + inode->i_mtime = nfs_time_to_secs(new_mtime); + } NFS_CACHE_ISIZE(inode) = new_size; inode->i_size = new_isize; @@ -1089,11 +1145,16 @@ NFS_ATTRTIMEO(inode) = NFS_MAXATTRTIMEO(inode); NFS_ATTRTIMEO_UPDATE(inode) = jiffies; } + spin_unlock(&nfs_inode_lock); if (invalid) nfs_zap_caches(inode); return 0; - + out_nochange: + if (new_atime - inode->i_atime > 0) + inode->i_atime = new_atime; + spin_unlock(&nfs_inode_lock); + return 0; out_changed: /* * Big trouble! The inode has become a different object. @@ -1108,7 +1169,6 @@ * (But we fall through to invalidate the caches.) */ nfs_invalidate_inode(inode); - out_err: return -EIO; } diff -u --recursive --new-file linux-2.4.17/fs/nfs/nfs2xdr.c linux-2.4.17-rpc_tweaks/fs/nfs/nfs2xdr.c --- linux-2.4.17/fs/nfs/nfs2xdr.c Sat Nov 3 02:40:09 2001 +++ linux-2.4.17-rpc_tweaks/fs/nfs/nfs2xdr.c Sun Jan 13 17:19:31 2002 @@ -132,6 +132,7 @@ fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; fattr->rdev = 0; } + fattr->timestamp = jiffies; return p; } @@ -270,14 +271,12 @@ count = ntohl(*p++); hdrlen = (u8 *) p - (u8 *) iov->iov_base; - recvd = req->rq_rlen - hdrlen; - if (p != iov[req->rq_rnr-1].iov_base) { - /* Unexpected reply header size. Punt. - * XXX: Move iovec contents to align data on page - * boundary and adjust RPC header size guess */ - printk(KERN_WARNING "NFS: Odd RPC header size in read reply: %d\n", hdrlen); - return -errno_NFSERR_IO; + if (iov->iov_len > hdrlen) { + dprintk("NFS: READ header is short. 
iovec will be shifted.\n"); + xdr_shift_iovec(iov, req->rq_rnr, iov->iov_len - hdrlen); } + + recvd = req->rq_rlen - hdrlen; if (count > recvd) { printk(KERN_WARNING "NFS: server cheating in read reply: " "count %d > recvd %d\n", count, recvd); @@ -419,7 +418,7 @@ bufsiz = bufsiz >> 2; p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(args->cookie); + *p++ = htonl(args->cookie & 0xFFFFFFFF); *p++ = htonl(bufsiz); /* see above */ req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); @@ -448,27 +447,23 @@ nfs_xdr_readdirres(struct rpc_rqst *req, u32 *p, struct nfs_readdirres *res) { struct iovec *iov = req->rq_rvec; + int hdrlen; int status, nr; u32 *end, *entry, len; if ((status = ntohl(*p++))) return -nfs_stat_to_errno(status); - if ((void *) p != ((u8 *) iov->iov_base+iov->iov_len)) { - /* Unexpected reply header size. Punt. */ - printk(KERN_WARNING "NFS: Odd RPC header size in readdirres reply\n"); - return -errno_NFSERR_IO; + + hdrlen = (u8 *) p - (u8 *) iov->iov_base; + if (iov->iov_len > hdrlen) { + dprintk("NFS: READDIR header is short. 
iovec will be shifted.\n"); + xdr_shift_iovec(iov, req->rq_rnr, iov->iov_len - hdrlen); } + /* Get start and end address of XDR data */ p = (u32 *) iov[1].iov_base; end = (u32 *) ((u8 *) p + iov[1].iov_len); - - /* Get start and end of dirent buffer */ - if (res->buffer != p) { - printk(KERN_ERR "NFS: Bad result buffer in readdir\n"); - return -errno_NFSERR_IO; - } - for (nr = 0; *p++; nr++) { entry = p - 1; if (p + 2 > end) @@ -506,7 +501,7 @@ entry->name = (const char *) p; p += XDR_QUADLEN(entry->len); entry->prev_cookie = entry->cookie; - entry->cookie = ntohl(*p++); + entry->cookie = (s64)((off_t)ntohl(*p++)); entry->eof = !p[0] && p[1]; return p; @@ -598,13 +593,21 @@ static int nfs_xdr_readlinkres(struct rpc_rqst *req, u32 *p, struct nfs_readlinkres *res) { + struct iovec *iov = req->rq_rvec; u32 *strlen; char *string; + int hdrlen; int status; unsigned int len; if ((status = ntohl(*p++))) return -nfs_stat_to_errno(status); + hdrlen = (u8 *) p - (u8 *) iov->iov_base; + if (iov->iov_len > hdrlen) { + dprintk("NFS: READLINK header is short. iovec will be shifted.\n"); + xdr_shift_iovec(iov, req->rq_rnr, iov->iov_len - hdrlen); + } + strlen = (u32*)res->buffer; /* Convert length of symlink */ len = ntohl(*strlen); @@ -631,36 +634,18 @@ * Decode STATFS reply */ static int -nfs_xdr_statfsres(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res) +nfs_xdr_statfsres(struct rpc_rqst *req, u32 *p, struct nfs2_statfs *res) { int status; - u32 xfer_size; if ((status = ntohl(*p++))) return -nfs_stat_to_errno(status); - /* For NFSv2, we more or less have to guess the preferred - * read/write/readdir sizes from the single 'transfer size' - * value. 
- */ - xfer_size = ntohl(*p++); /* tsize */ - res->rtmax = 8 * 1024; - res->rtpref = xfer_size; - res->rtmult = xfer_size; - res->wtmax = 8 * 1024; - res->wtpref = xfer_size; - res->wtmult = xfer_size; - res->dtpref = PAGE_CACHE_SIZE; - res->maxfilesize = 0x7FFFFFFF; /* just a guess */ + res->tsize = ntohl(*p++); res->bsize = ntohl(*p++); - - res->tbytes = ntohl(*p++) * res->bsize; - res->fbytes = ntohl(*p++) * res->bsize; - res->abytes = ntohl(*p++) * res->bsize; - res->tfiles = 0; - res->ffiles = 0; - res->afiles = 0; - res->namelen = 0; + res->blocks = ntohl(*p++); + res->bfree = ntohl(*p++); + res->bavail = ntohl(*p++); return 0; } diff -u --recursive --new-file linux-2.4.17/fs/nfs/nfs3proc.c linux-2.4.17-rpc_tweaks/fs/nfs/nfs3proc.c --- linux-2.4.17/fs/nfs/nfs3proc.c Fri Dec 21 18:41:55 2001 +++ linux-2.4.17-rpc_tweaks/fs/nfs/nfs3proc.c Sun Jan 13 17:19:02 2002 @@ -111,7 +111,8 @@ status = rpc_call(NFS_CLIENT(dir), NFS3PROC_GETATTR, fhandle, fattr, 0); dprintk("NFS reply lookup: %d\n", status); - nfs_refresh_inode(dir, &dir_attr); + if (status >= 0) + status = nfs_refresh_inode(dir, &dir_attr); return status; } @@ -493,24 +494,42 @@ return status; } -/* - * This is a combo call of fsstat and fsinfo - */ static int nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) + struct nfs_fsstat *stat) { int status; - dprintk("NFS call fsstat\n"); - memset((char *)info, 0, sizeof(*info)); - status = rpc_call(server->client, NFS3PROC_FSSTAT, fhandle, info, 0); - if (status < 0) - goto error; + stat->fattr->valid = 0; + dprintk("NFS call statfs\n"); + status = rpc_call(server->client, NFS3PROC_FSSTAT, fhandle, stat, 0); + dprintk("NFS reply statfs: %d\n", status); + return status; +} + +static int +nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + + info->fattr->valid = 0; + dprintk("NFS call fsinfo\n"); status = rpc_call(server->client, NFS3PROC_FSINFO, fhandle, 
info, 0); + dprintk("NFS reply fsinfo: %d\n", status); + return status; +} -error: - dprintk("NFS reply statfs: %d\n", status); +static int +nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_pathconf *info) +{ + int status; + + info->fattr->valid = 0; + dprintk("NFS call pathconf\n"); + status = rpc_call(server->client, NFS3PROC_PATHCONF, fhandle, info, 0); + dprintk("NFS reply pathconf: %d\n", status); return status; } @@ -539,5 +558,7 @@ nfs3_proc_readdir, nfs3_proc_mknod, nfs3_proc_statfs, + nfs3_proc_fsinfo, + nfs3_proc_pathconf, nfs3_decode_dirent, }; diff -u --recursive --new-file linux-2.4.17/fs/nfs/nfs3xdr.c linux-2.4.17-rpc_tweaks/fs/nfs/nfs3xdr.c --- linux-2.4.17/fs/nfs/nfs3xdr.c Sat Nov 3 02:40:09 2001 +++ linux-2.4.17-rpc_tweaks/fs/nfs/nfs3xdr.c Sun Jan 13 17:19:31 2002 @@ -195,6 +195,7 @@ /* Update the mode bits */ fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3); + fattr->timestamp = jiffies; return p; } @@ -523,6 +524,13 @@ return 0; } +/* Hack to sign-extending 32-bit cookies */ +static inline +u64 nfs_transform_cookie64(u64 cookie) +{ + return (cookie & 0x80000000) ? 
(cookie ^ 0xFFFFFFFF00000000) : cookie; +} + /* * Encode arguments to readdir call */ @@ -533,7 +541,7 @@ int buflen, replen; p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_hyper(p, args->cookie); + p = xdr_encode_hyper(p, nfs_transform_cookie64(args->cookie)); *p++ = args->verf[0]; *p++ = args->verf[1]; if (args->plus) { @@ -644,6 +652,7 @@ nfs3_decode_dirent(u32 *p, struct nfs_entry *entry, int plus) { struct nfs_entry old = *entry; + u64 cookie; if (!*p++) { if (!*p) @@ -657,24 +666,23 @@ entry->name = (const char *) p; p += XDR_QUADLEN(entry->len); entry->prev_cookie = entry->cookie; - p = xdr_decode_hyper(p, &entry->cookie); + p = xdr_decode_hyper(p, &cookie); + entry->cookie = nfs_transform_cookie64(cookie); if (plus) { - p = xdr_decode_post_op_attr(p, &entry->fattr); + entry->fattr->valid = 0; + p = xdr_decode_post_op_attr(p, entry->fattr); /* In fact, a post_op_fh3: */ if (*p++) { - p = xdr_decode_fhandle(p, &entry->fh); + p = xdr_decode_fhandle(p, entry->fh); /* Ugh -- server reply was truncated */ if (p == NULL) { dprintk("NFS: FH truncated\n"); *entry = old; return ERR_PTR(-EAGAIN); } - } else { - /* If we don't get a file handle, the attrs - * aren't worth a lot. 
*/ - entry->fattr.valid = 0; - } + } else + memset((u8*)(entry->fh), 0, sizeof(*entry->fh)); } entry->eof = !p[0] && p[1]; @@ -958,14 +966,13 @@ * Decode FSSTAT reply */ static int -nfs3_xdr_fsstatres(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res) +nfs3_xdr_fsstatres(struct rpc_rqst *req, u32 *p, struct nfs_fsstat *res) { - struct nfs_fattr dummy; int status; status = ntohl(*p++); - p = xdr_decode_post_op_attr(p, &dummy); + p = xdr_decode_post_op_attr(p, res->fattr); if (status != 0) return -nfs_stat_to_errno(status); @@ -975,8 +982,7 @@ p = xdr_decode_hyper(p, &res->tfiles); p = xdr_decode_hyper(p, &res->ffiles); p = xdr_decode_hyper(p, &res->afiles); - - /* ignore invarsec */ + res->invarsec = ntohl(*p++); return 0; } @@ -986,12 +992,11 @@ static int nfs3_xdr_fsinfores(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res) { - struct nfs_fattr dummy; int status; status = ntohl(*p++); - p = xdr_decode_post_op_attr(p, &dummy); + p = xdr_decode_post_op_attr(p, res->fattr); if (status != 0) return -nfs_stat_to_errno(status); @@ -1003,8 +1008,8 @@ res->wtmult = ntohl(*p++); res->dtpref = ntohl(*p++); p = xdr_decode_hyper(p, &res->maxfilesize); - - /* ignore time_delta and properties */ + p = xdr_decode_time3(p, &res->time_delta); + res->properties = ntohl(*p++); return 0; } @@ -1012,20 +1017,21 @@ * Decode PATHCONF reply */ static int -nfs3_xdr_pathconfres(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res) +nfs3_xdr_pathconfres(struct rpc_rqst *req, u32 *p, struct nfs_pathconf *res) { - struct nfs_fattr dummy; int status; status = ntohl(*p++); - p = xdr_decode_post_op_attr(p, &dummy); + p = xdr_decode_post_op_attr(p, res->fattr); if (status != 0) return -nfs_stat_to_errno(status); res->linkmax = ntohl(*p++); - res->namelen = ntohl(*p++); - - /* ignore remaining fields */ + res->name_max = ntohl(*p++); + res->no_trunc = ntohl(*p++) != 0; + res->chown_restricted = ntohl(*p++) != 0; + res->case_insensitive = ntohl(*p++) != 0; + res->case_preserving = ntohl(*p++) 
!= 0; return 0; } diff -u --recursive --new-file linux-2.4.17/fs/nfs/pagelist.c linux-2.4.17-rpc_tweaks/fs/nfs/pagelist.c --- linux-2.4.17/fs/nfs/pagelist.c Fri Dec 21 18:41:55 2001 +++ linux-2.4.17-rpc_tweaks/fs/nfs/pagelist.c Tue Feb 19 15:44:37 2002 @@ -53,7 +53,7 @@ /** * nfs_create_request - Create an NFS read/write request. - * @file: file that owns this request + * @cred: RPC credential to use * @inode: inode to which the request is attached * @page: page to write * @offset: starting offset within the page for the write @@ -66,7 +66,7 @@ * User should ensure it is safe to sleep in this function. */ struct nfs_page * -nfs_create_request(struct file *file, struct inode *inode, +nfs_create_request(struct rpc_cred *cred, struct inode *inode, struct page *page, unsigned int offset, unsigned int count) { @@ -108,34 +108,49 @@ req->wb_offset = offset; req->wb_bytes = count; - /* If we have a struct file, use its cached credentials */ - if (file) { - req->wb_file = file; - get_file(file); - req->wb_cred = nfs_file_cred(file); - } + if (cred) + req->wb_cred = get_rpccred(cred); req->wb_inode = inode; req->wb_count = 1; return req; } +/** + * nfs_clear_request - Free up all resources allocated to the request + * @req: + * + * Release all resources associated with a write request after it + * has completed. + */ +void nfs_clear_request(struct nfs_page *req) +{ + /* Release struct file or cached credential */ + if (req->wb_file) { + fput(req->wb_file); + req->wb_file = NULL; + } + if (req->wb_cred) { + put_rpccred(req->wb_cred); + req->wb_cred = NULL; + } + if (req->wb_page) { + page_cache_release(req->wb_page); + req->wb_page = NULL; + atomic_dec(&NFS_REQUESTLIST(req->wb_inode)->nr_requests); + } +} + /** * nfs_release_request - Release the count on an NFS read/write request * @req: request to release * - * Release all resources associated with a write request after it - * has been committed to stable storage - * * Note: Should never be called with the spinlock held! 
*/ void nfs_release_request(struct nfs_page *req) { - struct inode *inode = req->wb_inode; - struct nfs_reqlist *cache = NFS_REQUESTLIST(inode); - spin_lock(&nfs_wreq_lock); if (--req->wb_count) { spin_unlock(&nfs_wreq_lock); @@ -143,7 +158,6 @@ } __nfs_del_lru(req); spin_unlock(&nfs_wreq_lock); - atomic_dec(&cache->nr_requests); #ifdef NFS_PARANOIA if (!list_empty(&req->wb_list)) @@ -152,16 +166,12 @@ BUG(); if (NFS_WBACK_BUSY(req)) BUG(); - if (atomic_read(&cache->nr_requests) < 0) + if (atomic_read(&NFS_REQUESTLIST(req->wb_inode)->nr_requests) < 0) BUG(); #endif /* Release struct file or cached credential */ - if (req->wb_file) - fput(req->wb_file); - else if (req->wb_cred) - put_rpccred(req->wb_cred); - page_cache_release(req->wb_page); + nfs_clear_request(req); nfs_page_free(req); } @@ -236,7 +246,7 @@ req = nfs_list_entry(head->next); if (prev) { - if (req->wb_file != prev->wb_file) + if (req->wb_cred != prev->wb_cred) break; if (page_index(req->wb_page) != page_index(prev->wb_page)+1) break; @@ -270,7 +280,7 @@ { struct nfs_server *server = NFS_SERVER(req->wb_inode); struct list_head *pos, *head = req->wb_list_head; - struct file *file = req->wb_file; + struct rpc_cred *cred = req->wb_cred; unsigned long idx = page_index(req->wb_page) + 1; int npages = 0; @@ -291,7 +301,7 @@ break; if (req->wb_offset != 0) break; - if (req->wb_file != file) + if (req->wb_cred != cred) break; } return npages; diff -u --recursive --new-file linux-2.4.17/fs/nfs/proc.c linux-2.4.17-rpc_tweaks/fs/nfs/proc.c --- linux-2.4.17/fs/nfs/proc.c Fri Feb 9 20:29:44 2001 +++ linux-2.4.17-rpc_tweaks/fs/nfs/proc.c Sun Jan 13 17:17:14 2002 @@ -361,17 +361,62 @@ static int nfs_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) + struct nfs_fsstat *stat) { int status; + struct nfs2_statfs fsinfo; - dprintk("NFS call statfs\n"); - memset((char *)info, 0, sizeof(*info)); - status = rpc_call(server->client, NFSPROC_STATFS, fhandle, info, 0); + 
stat->fattr->valid = 0; + dprintk("NFS call statfs\n"); + status = rpc_call(server->client, NFSPROC_STATFS, fhandle, &fsinfo, 0); dprintk("NFS reply statfs: %d\n", status); + if (status) + goto out; + stat->tbytes = (u64)fsinfo.blocks * fsinfo.bsize; + stat->fbytes = (u64)fsinfo.bfree * fsinfo.bsize; + stat->abytes = (u64)fsinfo.bavail * fsinfo.bsize; + stat->tfiles = 0; + stat->ffiles = 0; + stat->afiles = 0; + stat->invarsec = 0; + out: return status; } +static int +nfs_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + struct nfs2_statfs fsinfo; + + info->fattr->valid = 0; + dprintk("NFS call fsinfo\n"); + status = rpc_call(server->client, NFSPROC_STATFS, fhandle, &fsinfo, 0); + dprintk("NFS reply fsinfo: %d\n", status); + if (status) + goto out; + info->rtmax = NFS_MAXDATA; + info->rtpref = fsinfo.tsize; + info->rtmult = fsinfo.bsize; + info->wtmax = NFS_MAXDATA; + info->wtpref = fsinfo.tsize; + info->wtmult = fsinfo.bsize; + info->dtpref = fsinfo.tsize; + info->maxfilesize = 0x7FFFFFFF; + info->time_delta = 0; + info->properties = 0x1b; + out: + return status; +} + +static int +nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_pathconf *info) +{ + return -ENOTSUPP; +} + extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int); struct nfs_rpc_ops nfs_v2_clientops = { @@ -397,5 +442,7 @@ nfs_proc_readdir, nfs_proc_mknod, nfs_proc_statfs, + nfs_proc_fsinfo, + nfs_proc_pathconf, nfs_decode_dirent, }; diff -u --recursive --new-file linux-2.4.17/fs/nfs/read.c linux-2.4.17-rpc_tweaks/fs/nfs/read.c --- linux-2.4.17/fs/nfs/read.c Fri Dec 21 18:41:55 2001 +++ linux-2.4.17-rpc_tweaks/fs/nfs/read.c Sun Jan 20 00:40:42 2002 @@ -113,11 +113,9 @@ inode->i_dev, (long long)NFS_FILEID(inode), (long long)offset, rsize, buffer); - lock_kernel(); result = NFS_PROTO(inode)->read(inode, cred, &fattr, flags, offset, rsize, buffer, &eof); nfs_refresh_inode(inode, &fattr); - unlock_kernel(); 
/* * Even if we had a partial success we can't mark the page @@ -168,7 +166,7 @@ { struct nfs_page *new; - new = nfs_create_request(file, inode, page, 0, PAGE_CACHE_SIZE); + new = nfs_create_request(nfs_file_cred(file), inode, page, 0, PAGE_CACHE_SIZE); if (IS_ERR(new)) return PTR_ERR(new); nfs_mark_request_read(new); @@ -224,8 +222,9 @@ nfs_list_remove_request(req); SetPageError(page); UnlockPage(page); - nfs_unlock_request(req); + nfs_clear_request(req); nfs_release_request(req); + nfs_unlock_request(req); } } @@ -272,9 +271,7 @@ rpc_clnt_sigmask(clnt, &oldset); rpc_call_setup(task, &msg, 0); - lock_kernel(); rpc_execute(task); - unlock_kernel(); rpc_clnt_sigunmask(clnt, &oldset); return 0; out_bad: @@ -397,7 +394,7 @@ { struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata; struct inode *inode = data->inode; - int count = data->res.count; + unsigned int count = data->res.count; dprintk("NFS: %4d nfs_readpage_result, (status %d)\n", task->tk_pid, task->tk_status); @@ -411,9 +408,15 @@ struct page *page = req->wb_page; nfs_list_remove_request(req); - if (task->tk_status >= 0 && count >= 0) { + if (task->tk_status >= 0) { + if (count < PAGE_CACHE_SIZE) { + char *p = kmap(page); + memset(p + count, 0, PAGE_CACHE_SIZE - count); + kunmap(page); + count = 0; + } else + count -= PAGE_CACHE_SIZE; SetPageUptodate(page); - count -= PAGE_CACHE_SIZE; } else SetPageError(page); flush_dcache_page(page); @@ -425,6 +428,7 @@ (long long)NFS_FILEID(req->wb_inode), req->wb_bytes, (long long)(page_offset(page) + req->wb_offset)); + nfs_clear_request(req); nfs_release_request(req); nfs_unlock_request(req); } @@ -442,19 +446,9 @@ int nfs_readpage(struct file *file, struct page *page) { - struct inode *inode; + struct inode *inode = page->mapping->host; int error; - if (!file) { - struct address_space *mapping = page->mapping; - if (!mapping) - BUG(); - inode = mapping->host; - } else - inode = file->f_dentry->d_inode; - if (!inode) - BUG(); - dprintk("NFS: 
nfs_readpage (%p %ld@%lu)\n", page, PAGE_CACHE_SIZE, page->index); /* diff -u --recursive --new-file linux-2.4.17/fs/nfs/write.c linux-2.4.17-rpc_tweaks/fs/nfs/write.c --- linux-2.4.17/fs/nfs/write.c Fri Dec 21 18:41:55 2001 +++ linux-2.4.17-rpc_tweaks/fs/nfs/write.c Wed Feb 20 09:31:17 2002 @@ -121,23 +121,6 @@ } /* - * This function will be used to simulate weak cache consistency - * under NFSv2 when the NFSv3 attribute patch is included. - * For the moment, we just call nfs_refresh_inode(). - */ -static __inline__ int -nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr) -{ - if ((fattr->valid & NFS_ATTR_FATTR) && !(fattr->valid & NFS_ATTR_WCC)) { - fattr->pre_size = NFS_CACHE_ISIZE(inode); - fattr->pre_mtime = NFS_CACHE_MTIME(inode); - fattr->pre_ctime = NFS_CACHE_CTIME(inode); - fattr->valid |= NFS_ATTR_WCC; - } - return nfs_refresh_inode(inode, fattr); -} - -/* * Write a page synchronously. * Offset is the data offset within the page. */ @@ -193,8 +176,7 @@ * If we've extended the file, update the inode * now so we don't invalidate the cache. 
*/ - if (base > inode->i_size) - inode->i_size = base; + nfs_grow_isize(inode, base); } while (count); if (PageError(page)) @@ -213,6 +195,7 @@ unsigned int offset, unsigned int count) { struct nfs_page *req; + loff_t end; int status; req = nfs_update_request(file, inode, page, offset, count); @@ -223,6 +206,8 @@ req->wb_cred = get_rpccred(NFS_I(inode)->mm_cred); nfs_unlock_request(req); nfs_strategy(inode); + end = ((loff_t)page->index<mapping->host; unsigned long end_index; unsigned offset = PAGE_CACHE_SIZE; int err; - struct address_space *mapping = page->mapping; - if (!mapping) - BUG(); - inode = mapping->host; - if (!inode) - BUG(); end_index = inode->i_size >> PAGE_CACHE_SHIFT; /* Ensure we've flushed out any previous writes */ @@ -260,7 +239,6 @@ if (page->index >= end_index+1 || !offset) goto out; do_it: - lock_kernel(); if (NFS_SERVER(inode)->wsize >= PAGE_CACHE_SIZE && !IS_SYNC(inode)) { err = nfs_writepage_async(NULL, inode, page, 0, offset); if (err >= 0) @@ -270,7 +248,6 @@ if (err == offset) err = 0; } - unlock_kernel(); out: UnlockPage(page); return err; @@ -357,6 +334,7 @@ iput(inode); } else spin_unlock(&nfs_wreq_lock); + nfs_clear_request(req); nfs_release_request(req); } @@ -683,9 +661,13 @@ } spin_unlock(&nfs_wreq_lock); - new = nfs_create_request(file, inode, page, offset, bytes); + new = nfs_create_request(nfs_file_cred(file), inode, page, offset, bytes); if (IS_ERR(new)) return new; + if (file) { + new->wb_file = file; + get_file(file); + } /* If the region is locked, adjust the timeout */ if (region_locked(inode, new)) new->wb_timeout = jiffies + NFS_WRITEBACK_LOCKDELAY; @@ -758,12 +740,15 @@ if (dirty >= NFS_STRATEGY_PAGES * wpages) nfs_flush_file(inode, NULL, 0, 0, 0); #endif + if (current->need_resched) + schedule(); } int nfs_flush_incompatible(struct file *file, struct page *page) { - struct inode *inode = file->f_dentry->d_inode; + struct rpc_cred *cred = nfs_file_cred(file); + struct inode *inode = page->mapping->host; struct 
nfs_page *req; int status = 0; /* @@ -776,7 +761,7 @@ */ req = nfs_find_request(inode,page); if (req) { - if (req->wb_file != file || req->wb_page != page) + if (req->wb_file != file || req->wb_cred != cred || req->wb_page != page) status = nfs_wb_page(inode, page); nfs_release_request(req); } @@ -793,8 +778,9 @@ nfs_updatepage(struct file *file, struct page *page, unsigned int offset, unsigned int count) { struct dentry *dentry = file->f_dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = page->mapping->host; struct nfs_page *req; + loff_t end; int status = 0; dprintk("NFS: nfs_updatepage(%s/%s %d@%Ld)\n", @@ -826,6 +812,9 @@ goto done; status = 0; + end = ((loff_t)page->index<tk_pid); rpc_clnt_sigmask(clnt, &oldset); rpc_call_setup(task, &msg, 0); - lock_kernel(); rpc_execute(task); - unlock_kernel(); rpc_clnt_sigunmask(clnt, &oldset); return 0; out_bad: diff -u --recursive --new-file linux-2.4.17/fs/nfsd/nfs3proc.c linux-2.4.17-rpc_tweaks/fs/nfsd/nfs3proc.c --- linux-2.4.17/fs/nfsd/nfs3proc.c Fri Sep 21 06:02:01 2001 +++ linux-2.4.17-rpc_tweaks/fs/nfsd/nfs3proc.c Sun Jan 13 17:22:01 2002 @@ -152,7 +152,7 @@ dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh)); /* Reserve room for status, post_op_attr, and path length */ - svcbuf_reserve(&rqstp->rq_resbuf, &path, &dummy, + svcbuf_reserve(rqstp->rq_resbuf, &path, &dummy, 1 + NFS3_POST_OP_ATTR_WORDS + 1); /* Read the symlink. 
*/ @@ -181,7 +181,7 @@ * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) * + 1 (xdr opaque byte count) = 26 */ - svcbuf_reserve(&rqstp->rq_resbuf, &buffer, &avail, + svcbuf_reserve(rqstp->rq_resbuf, &buffer, &avail, 1 + NFS3_POST_OP_ATTR_WORDS + 3); resp->count = argp->count; @@ -448,7 +448,7 @@ argp->count, (u32) argp->cookie); /* Reserve buffer space for status, attributes and verifier */ - svcbuf_reserve(&rqstp->rq_resbuf, &buffer, &count, + svcbuf_reserve(rqstp->rq_resbuf, &buffer, &count, 1 + NFS3_POST_OP_ATTR_WORDS + 2); /* Make sure we've room for the NULL ptr & eof flag, and shrink to @@ -483,7 +483,7 @@ argp->count, (u32) argp->cookie); /* Reserve buffer space for status, attributes and verifier */ - svcbuf_reserve(&rqstp->rq_resbuf, &buffer, &count, + svcbuf_reserve(rqstp->rq_resbuf, &buffer, &count, 1 + NFS3_POST_OP_ATTR_WORDS + 2); /* Make sure we've room for the NULL ptr & eof flag, and shrink to diff -u --recursive --new-file linux-2.4.17/fs/nfsd/nfs3xdr.c linux-2.4.17-rpc_tweaks/fs/nfsd/nfs3xdr.c --- linux-2.4.17/fs/nfsd/nfs3xdr.c Thu Oct 4 07:27:48 2001 +++ linux-2.4.17-rpc_tweaks/fs/nfsd/nfs3xdr.c Sun Jan 13 17:22:01 2002 @@ -268,7 +268,7 @@ static inline int xdr_ressize_check(struct svc_rqst *rqstp, u32 *p) { - struct svc_buf *buf = &rqstp->rq_resbuf; + struct svc_buf *buf = rqstp->rq_resbuf; buf->len = p - buf->base; dprintk("nfsd: ressize_check p %p base %p len %d\n", diff -u --recursive --new-file linux-2.4.17/fs/nfsd/nfscache.c linux-2.4.17-rpc_tweaks/fs/nfsd/nfscache.c --- linux-2.4.17/fs/nfsd/nfscache.c Thu Feb 15 19:56:29 2001 +++ linux-2.4.17-rpc_tweaks/fs/nfsd/nfscache.c Sun Jan 13 17:22:01 2002 @@ -265,7 +265,7 @@ case RC_NOCACHE: return RC_DOIT; case RC_REPLSTAT: - svc_putlong(&rqstp->rq_resbuf, rp->c_replstat); + svc_putlong(rqstp->rq_resbuf, rp->c_replstat); break; case RC_REPLBUFF: if (!nfsd_cache_append(rqstp, &rp->c_replbuf)) @@ -300,7 +300,7 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, u32 *statp) { struct 
svc_cacherep *rp; - struct svc_buf *resp = &rqstp->rq_resbuf, *cachp; + struct svc_buf *resp = rqstp->rq_resbuf, *cachp; int len; if (!(rp = rqstp->rq_cacherep) || cache_disabled) @@ -347,7 +347,7 @@ static int nfsd_cache_append(struct svc_rqst *rqstp, struct svc_buf *data) { - struct svc_buf *resp = &rqstp->rq_resbuf; + struct svc_buf *resp = rqstp->rq_resbuf; if (resp->len + data->len > resp->buflen) { printk(KERN_WARNING "nfsd: cached reply too large (%d).\n", diff -u --recursive --new-file linux-2.4.17/fs/nfsd/nfsproc.c linux-2.4.17-rpc_tweaks/fs/nfsd/nfsproc.c --- linux-2.4.17/fs/nfsd/nfsproc.c Sun Oct 21 19:40:36 2001 +++ linux-2.4.17-rpc_tweaks/fs/nfsd/nfsproc.c Sun Jan 13 17:22:01 2002 @@ -110,7 +110,7 @@ dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh)); /* Reserve room for status and path length */ - svcbuf_reserve(&rqstp->rq_resbuf, &path, &dummy, 2); + svcbuf_reserve(rqstp->rq_resbuf, &path, &dummy, 2); /* Read the symlink. */ resp->len = NFS_MAXPATHLEN; @@ -138,7 +138,7 @@ /* Obtain buffer pointer for payload. 19 is 1 word for * status, 17 words for fattr, and 1 word for the byte count. */ - svcbuf_reserve(&rqstp->rq_resbuf, &buffer, &avail, 19); + svcbuf_reserve(rqstp->rq_resbuf, &buffer, &avail, 19); if ((avail << 2) < argp->count) { printk(KERN_NOTICE @@ -477,7 +477,7 @@ argp->count, argp->cookie); /* Reserve buffer space for status */ - svcbuf_reserve(&rqstp->rq_resbuf, &buffer, &count, 1); + svcbuf_reserve(rqstp->rq_resbuf, &buffer, &count, 1); /* Shrink to the client read size */ if (count > (argp->count >> 2)) diff -u --recursive --new-file linux-2.4.17/fs/nfsd/nfssvc.c linux-2.4.17-rpc_tweaks/fs/nfsd/nfssvc.c --- linux-2.4.17/fs/nfsd/nfssvc.c Wed Oct 17 23:16:34 2001 +++ linux-2.4.17-rpc_tweaks/fs/nfsd/nfssvc.c Sun Jan 13 17:22:01 2002 @@ -94,7 +94,8 @@ if (error < 0) goto failure; -#if 0 /* Don't even pretend that TCP works. It doesn't. */ +#if CONFIG_NFSD_TCP + /* Don't even pretend that TCP works. It doesn't. 
*/ error = svc_makesock(nfsd_serv, IPPROTO_TCP, port); if (error < 0) goto failure; @@ -292,14 +293,14 @@ } if (rqstp->rq_proc != 0) - svc_putlong(&rqstp->rq_resbuf, nfserr); + svc_putlong(rqstp->rq_resbuf, nfserr); /* Encode result. * For NFSv2, additional info is never returned in case of an error. */ if (!(nfserr && rqstp->rq_vers == 2)) { xdr = proc->pc_encode; - if (xdr && !xdr(rqstp, rqstp->rq_resbuf.buf, rqstp->rq_resp)) { + if (xdr && !xdr(rqstp, rqstp->rq_resbuf->buf, rqstp->rq_resp)) { /* Failed to encode result. Release cache entry */ dprintk("nfsd: failed to encode result!\n"); nfsd_cache_update(rqstp, RC_NOCACHE, NULL); diff -u --recursive --new-file linux-2.4.17/fs/nfsd/nfsxdr.c linux-2.4.17-rpc_tweaks/fs/nfsd/nfsxdr.c --- linux-2.4.17/fs/nfsd/nfsxdr.c Wed Oct 17 23:16:34 2001 +++ linux-2.4.17-rpc_tweaks/fs/nfsd/nfsxdr.c Sun Jan 13 17:22:01 2002 @@ -179,7 +179,7 @@ static inline int xdr_ressize_check(struct svc_rqst *rqstp, u32 *p) { - struct svc_buf *buf = &rqstp->rq_resbuf; + struct svc_buf *buf = rqstp->rq_resbuf; buf->len = p - buf->base; dprintk("nfsd: ressize_check p %p base %p len %d\n", diff -u --recursive --new-file linux-2.4.17/include/linux/fs.h linux-2.4.17-rpc_tweaks/include/linux/fs.h --- linux-2.4.17/include/linux/fs.h Fri Dec 21 18:42:03 2001 +++ linux-2.4.17-rpc_tweaks/include/linux/fs.h Sun Jan 13 17:27:24 2002 @@ -390,7 +390,7 @@ int (*flushpage) (struct page *, unsigned long); int (*releasepage) (struct page *, int); #define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */ - int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int); + int (*direct_IO)(int, struct file *, struct kiobuf *, unsigned long, int); }; struct address_space { @@ -852,6 +852,7 @@ int (*revalidate) (struct dentry *); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct dentry *, struct iattr *); + int (*check_stale) (struct inode *); }; struct seq_file; @@ -1390,7 +1391,7 @@ int generic_block_bmap(struct 
address_space *, long, get_block_t *); int generic_commit_write(struct file *, struct page *, unsigned, unsigned); int block_truncate_page(struct address_space *, loff_t, get_block_t *); -extern int generic_direct_IO(int, struct inode *, struct kiobuf *, unsigned long, int, get_block_t *); +extern int generic_direct_IO(int, struct file *, struct kiobuf *, unsigned long, int, get_block_t *); extern int waitfor_one_page(struct page*); extern int generic_file_mmap(struct file *, struct vm_area_struct *); diff -u --recursive --new-file linux-2.4.17/include/linux/nfs_flushd.h linux-2.4.17-rpc_tweaks/include/linux/nfs_flushd.h --- linux-2.4.17/include/linux/nfs_flushd.h Sun Jan 13 17:49:06 2002 +++ linux-2.4.17-rpc_tweaks/include/linux/nfs_flushd.h Tue Feb 19 15:44:37 2002 @@ -32,7 +32,6 @@ */ extern int nfs_reqlist_alloc(struct nfs_server *); extern void nfs_reqlist_free(struct nfs_server *); -extern int nfs_reqlist_init(struct nfs_server *); extern void nfs_reqlist_exit(struct nfs_server *); extern void nfs_wake_flushd(void); diff -u --recursive --new-file linux-2.4.17/include/linux/nfs_fs.h linux-2.4.17-rpc_tweaks/include/linux/nfs_fs.h --- linux-2.4.17/include/linux/nfs_fs.h Fri Dec 21 18:42:03 2001 +++ linux-2.4.17-rpc_tweaks/include/linux/nfs_fs.h Mon Jan 28 12:33:22 2002 @@ -78,14 +78,12 @@ #define NFS_CONGESTED(inode) (RPC_CONGESTED(NFS_CLIENT(inode))) #define NFS_COOKIEVERF(inode) ((inode)->u.nfs_i.cookieverf) #define NFS_READTIME(inode) ((inode)->u.nfs_i.read_cache_jiffies) +#define NFS_MTIME_UPDATE(inode) ((inode)->u.nfs_i.cache_mtime_jiffies) #define NFS_CACHE_CTIME(inode) ((inode)->u.nfs_i.read_cache_ctime) #define NFS_CACHE_MTIME(inode) ((inode)->u.nfs_i.read_cache_mtime) #define NFS_CACHE_ISIZE(inode) ((inode)->u.nfs_i.read_cache_isize) #define NFS_NEXTSCAN(inode) ((inode)->u.nfs_i.nextscan) -#define NFS_CACHEINV(inode) \ -do { \ - NFS_READTIME(inode) = jiffies - NFS_MAXATTRTIMEO(inode) - 1; \ -} while (0) +#define NFS_CACHEINV(inode) 
nfs_invalidate_caches(inode) #define NFS_ATTRTIMEO(inode) ((inode)->u.nfs_i.attrtimeo) #define NFS_MINATTRTIMEO(inode) \ (S_ISDIR(inode->i_mode)? NFS_SERVER(inode)->acdirmin \ @@ -100,10 +98,16 @@ #define NFS_STALE(inode) (NFS_FLAGS(inode) & NFS_INO_STALE) #define NFS_FILEID(inode) ((inode)->u.nfs_i.fileid) -#define NFS_FSID(inode) ((inode)->u.nfs_i.fsid) -/* Inode Flags */ -#define NFS_USE_READDIRPLUS(inode) ((NFS_FLAGS(inode) & NFS_INO_ADVISE_RDPLUS) ? 1 : 0) +static inline int nfs_server_capable(struct inode *inode, int cap) +{ + return NFS_SERVER(inode)->caps & cap; +} + +static inline int NFS_USE_READDIRPLUS(struct inode *inode) +{ + return NFS_FLAGS(inode) & NFS_INO_ADVISE_RDPLUS; +} /* * These are the default flags for swap requests @@ -142,17 +146,20 @@ * linux/fs/nfs/inode.c */ extern struct super_block *nfs_read_super(struct super_block *, void *, int); +extern void nfs_invalidate_caches(struct inode *); extern void nfs_zap_caches(struct inode *); extern int nfs_inode_is_stale(struct inode *, struct nfs_fh *, struct nfs_fattr *); extern struct inode *nfs_fhget(struct dentry *, struct nfs_fh *, struct nfs_fattr *); extern int __nfs_refresh_inode(struct inode *, struct nfs_fattr *); +extern void nfs_grow_isize(struct inode *, loff_t); extern int nfs_revalidate(struct dentry *); extern int nfs_permission(struct inode *, int); extern int nfs_open(struct inode *, struct file *); extern int nfs_release(struct inode *, struct file *); extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); +extern int nfs_check_stale(struct inode *); extern int nfs_notify_change(struct dentry *, struct iattr *); /* @@ -165,7 +172,9 @@ static __inline__ struct rpc_cred * nfs_file_cred(struct file *file) { - struct rpc_cred *cred = (struct rpc_cred *)(file->private_data); + struct rpc_cred *cred = NULL; + if (file) + cred = (struct rpc_cred *)file->private_data; #ifdef RPC_DEBUG if (cred && cred->cr_magic != RPCAUTH_CRED_MAGIC) BUG(); @@ -267,6 +276,11 @@ extern 
int nfs_scan_lru_read_timeout(struct nfs_server *, struct list_head *); /* + * linux/fs/nfs/direct.c + */ +extern int nfs_direct_IO(int, struct file *, struct kiobuf *, unsigned long, int); + +/* * linux/fs/mount_clnt.c * (Used only by nfsroot module) */ @@ -292,6 +306,23 @@ return __nfs_refresh_inode(inode,fattr); } +/* + * This function will be used to simulate weak cache consistency + * under NFSv2 when the NFSv3 attribute patch is included. + * For the moment, we just call nfs_refresh_inode(). + */ +static __inline__ int +nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr) +{ + if ((fattr->valid & NFS_ATTR_FATTR) && !(fattr->valid & NFS_ATTR_WCC)) { + fattr->pre_size = NFS_CACHE_ISIZE(inode); + fattr->pre_mtime = NFS_CACHE_MTIME(inode); + fattr->pre_ctime = NFS_CACHE_CTIME(inode); + fattr->valid |= NFS_ATTR_WCC; + } + return nfs_refresh_inode(inode, fattr); +} + static inline loff_t nfs_size_to_loff_t(__u64 size) { diff -u --recursive --new-file linux-2.4.17/include/linux/nfs_fs_i.h linux-2.4.17-rpc_tweaks/include/linux/nfs_fs_i.h --- linux-2.4.17/include/linux/nfs_fs_i.h Sat Dec 22 01:59:36 2001 +++ linux-2.4.17-rpc_tweaks/include/linux/nfs_fs_i.h Sun Jan 13 17:27:24 2002 @@ -12,7 +12,6 @@ /* * The 64bit 'inode number' */ - __u64 fsid; __u64 fileid; /* @@ -50,6 +49,12 @@ unsigned long attrtimeo_timestamp; /* + * Timestamp that dates the change made to read_cache_mtime. 
+ * This is of use for dentry revalidation + */ + unsigned long cache_mtime_jiffies; + + /* * This is the cookie verifier used for NFSv3 readdir * operations */ @@ -68,11 +73,6 @@ ncommit, npages; - /* Flush daemon info */ - struct inode *hash_next, - *hash_prev; - unsigned long nextscan; - /* Credentials for shared mmap */ struct rpc_cred *mm_cred; }; diff -u --recursive --new-file linux-2.4.17/include/linux/nfs_fs_sb.h linux-2.4.17-rpc_tweaks/include/linux/nfs_fs_sb.h --- linux-2.4.17/include/linux/nfs_fs_sb.h Thu Nov 22 20:46:19 2001 +++ linux-2.4.17-rpc_tweaks/include/linux/nfs_fs_sb.h Sun Jan 13 17:27:24 2002 @@ -10,6 +10,7 @@ struct rpc_clnt * client; /* RPC client handle */ struct nfs_rpc_ops * rpc_ops; /* NFS protocol vector */ int flags; /* various flags */ + unsigned int caps; /* server capabilities */ unsigned int rsize; /* read size */ unsigned int rpages; /* read size (in pages) */ unsigned int wsize; /* write size */ @@ -36,4 +37,8 @@ struct nfs_server s_server; }; +/* Server capabilities */ +#define NFS_CAP_READDIRPLUS 1 + + #endif diff -u --recursive --new-file linux-2.4.17/include/linux/nfs_page.h linux-2.4.17-rpc_tweaks/include/linux/nfs_page.h --- linux-2.4.17/include/linux/nfs_page.h Sat Dec 22 02:21:21 2001 +++ linux-2.4.17-rpc_tweaks/include/linux/nfs_page.h Sun Jan 13 17:49:06 2002 @@ -41,9 +41,10 @@ #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) -extern struct nfs_page *nfs_create_request(struct file *, struct inode *, +extern struct nfs_page *nfs_create_request(struct rpc_cred *, struct inode *, struct page *, unsigned int, unsigned int); +extern void nfs_clear_request(struct nfs_page *req); extern void nfs_release_request(struct nfs_page *req); diff -u --recursive --new-file linux-2.4.17/include/linux/nfs_xdr.h linux-2.4.17-rpc_tweaks/include/linux/nfs_xdr.h --- linux-2.4.17/include/linux/nfs_xdr.h Mon Jan 29 21:07:43 2001 +++ linux-2.4.17-rpc_tweaks/include/linux/nfs_xdr.h Sun Jan 13 17:19:31 2002 @@ -30,6 +30,7 @@ __u64 
atime; __u64 mtime; __u64 ctime; + unsigned long timestamp; }; #define NFS_ATTR_WCC 0x0001 /* pre-op WCC data */ @@ -40,6 +41,7 @@ * Info on the file system */ struct nfs_fsinfo { + struct nfs_fattr *fattr; __u32 rtmax; /* max. read transfer size */ __u32 rtpref; /* pref. read transfer size */ __u32 rtmult; /* reads should be multiple of this */ @@ -48,15 +50,37 @@ __u32 wtmult; /* writes should be multiple of this */ __u32 dtpref; /* pref. readdir transfer size */ __u64 maxfilesize; - __u64 bsize; /* block size */ + __u64 time_delta; + __u32 properties; +}; + +struct nfs_fsstat { + struct nfs_fattr *fattr; __u64 tbytes; /* total size in bytes */ __u64 fbytes; /* # of free bytes */ __u64 abytes; /* # of bytes available to user */ __u64 tfiles; /* # of files */ __u64 ffiles; /* # of free files */ __u64 afiles; /* # of files available to user */ + __u32 invarsec; +}; + +struct nfs_pathconf { + struct nfs_fattr *fattr; /* Post-op attributes */ __u32 linkmax;/* max # of hard links */ - __u32 namelen;/* max name length */ + __u32 name_max;/* max name length */ + int no_trunc : 1, + chown_restricted : 1, + case_insensitive : 1, + case_preserving : 1; +}; + +struct nfs2_statfs { + __u32 tsize; /* Server transfer size */ + __u32 bsize; /* Filesystem block size */ + __u32 blocks; /* No. of "bsize" blocks on filesystem */ + __u32 bfree; /* No. of free "bsize" blocks */ + __u32 bavail; /* No. of available "bsize" blocks */ }; /* Arguments to the read call. 
@@ -112,8 +136,8 @@ const char * name; unsigned int len; int eof; - struct nfs_fh fh; - struct nfs_fattr fattr; + struct nfs_fh *fh; + struct nfs_fattr *fattr; }; /* @@ -353,7 +377,11 @@ int (*mknod) (struct inode *, struct qstr *, struct iattr *, dev_t, struct nfs_fh *, struct nfs_fattr *); int (*statfs) (struct nfs_server *, struct nfs_fh *, + struct nfs_fsstat *); + int (*fsinfo) (struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); + int (*pathconf) (struct nfs_server *, struct nfs_fh *, + struct nfs_pathconf *); u32 * (*decode_dirent)(u32 *, struct nfs_entry *, int plus); }; diff -u --recursive --new-file linux-2.4.17/include/linux/nfsd/const.h linux-2.4.17-rpc_tweaks/include/linux/nfsd/const.h --- linux-2.4.17/include/linux/nfsd/const.h Sat Apr 1 18:04:27 2000 +++ linux-2.4.17-rpc_tweaks/include/linux/nfsd/const.h Sun Jan 13 17:22:26 2002 @@ -21,7 +21,7 @@ /* * Maximum blocksize supported by daemon currently at 8K */ -#define NFSSVC_MAXBLKSIZE 8192 +#define NFSSVC_MAXBLKSIZE (32*1024) #ifdef __KERNEL__ diff -u --recursive --new-file linux-2.4.17/include/linux/sunrpc/clnt.h linux-2.4.17-rpc_tweaks/include/linux/sunrpc/clnt.h --- linux-2.4.17/include/linux/sunrpc/clnt.h Thu Nov 22 20:47:20 2001 +++ linux-2.4.17-rpc_tweaks/include/linux/sunrpc/clnt.h Mon Jan 28 12:33:22 2002 @@ -111,6 +111,8 @@ void rpc_release_client(struct rpc_clnt *); void rpc_getport(struct rpc_task *, struct rpc_clnt *); int rpc_register(u32, u32, int, unsigned short, int *); +u32 * rpc_call_header(struct rpc_task *task); +u32 * rpc_call_verify(struct rpc_task *task); void rpc_call_setup(struct rpc_task *, struct rpc_message *, int); @@ -136,7 +138,6 @@ xprt_set_timeout(&clnt->cl_timeout, retr, incr); } -extern void rpciod_tcp_dispatcher(void); extern void rpciod_wake_up(void); /* @@ -144,5 +145,10 @@ */ int rpc_getport_external(struct sockaddr_in *, __u32, __u32, int); +/* + * Ping function + */ +void rpc_ping(struct rpc_task *task); + #endif /* __KERNEL__ */ #endif /* 
_LINUX_SUNRPC_CLNT_H */ diff -u --recursive --new-file linux-2.4.17/include/linux/sunrpc/svc.h linux-2.4.17-rpc_tweaks/include/linux/sunrpc/svc.h --- linux-2.4.17/include/linux/sunrpc/svc.h Sat Dec 22 02:20:36 2001 +++ linux-2.4.17-rpc_tweaks/include/linux/sunrpc/svc.h Sun Jan 13 17:48:20 2002 @@ -69,11 +69,18 @@ */ #define RPCSVC_MAXIOV ((RPCSVC_MAXPAYLOAD+PAGE_SIZE-1)/PAGE_SIZE + 1) struct svc_buf { + struct svc_buf * prev; /* svc_sock send queue */ + struct svc_buf * next; u32 * area; /* allocated memory */ u32 * base; /* base of RPC datagram */ - int buflen; /* total length of buffer */ + unsigned int buflen; /* total length of buffer */ u32 * buf; /* read/write pointer */ - int len; /* current end of buffer */ + unsigned int len; /* current end of buffer */ + + unsigned int sent; /* number of bytes sent */ + + /* UDP responses should have peer addresses */ + struct sockaddr_in raddr; /* peer address */ /* iovec for zero-copy NFS READs */ struct iovec iov[RPCSVC_MAXIOV]; @@ -100,7 +107,7 @@ struct sk_buff * rq_skbuff; /* fast recv inet buffer */ struct svc_buf rq_defbuf; /* default buffer */ struct svc_buf rq_argbuf; /* argument buffer */ - struct svc_buf rq_resbuf; /* result buffer */ + struct svc_buf * rq_resbuf; /* result buffer */ u32 rq_xid; /* transmission id */ u32 rq_prog; /* program number */ u32 rq_vers; /* program version */ @@ -179,5 +186,7 @@ int svc_process(struct svc_serv *, struct svc_rqst *); int svc_register(struct svc_serv *, int, unsigned short); void svc_wake_up(struct svc_serv *); +struct svc_buf * svc_resbuf_alloc(struct svc_serv *); +int svc_resbuf_free(struct svc_buf *); #endif /* SUNRPC_SVC_H */ diff -u --recursive --new-file linux-2.4.17/include/linux/sunrpc/svcsock.h linux-2.4.17-rpc_tweaks/include/linux/sunrpc/svcsock.h --- linux-2.4.17/include/linux/sunrpc/svcsock.h Sat Dec 22 02:20:41 2001 +++ linux-2.4.17-rpc_tweaks/include/linux/sunrpc/svcsock.h Sun Jan 13 17:48:25 2002 @@ -11,6 +11,9 @@ #include <linux/sunrpc/svc.h> +#define SK_SENDING 0 +#define
SK_WSPACE 1 +#define SK_INSEND 2 /* * RPC server socket. * NOTE: First two items must be prev/next. @@ -32,16 +35,22 @@ unsigned int sk_temp : 1, /* temp socket */ sk_qued : 1, /* on serv->sk_sockets */ sk_dead : 1; /* socket closed */ - int (*sk_recvfrom)(struct svc_rqst *rqstp); - int (*sk_sendto)(struct svc_rqst *rqstp); + int (*sk_recvfrom)(struct svc_rqst *); + int (*sk_sendto)(struct svc_sock *, struct svc_buf *); /* We keep the old state_change and data_ready CB's here */ void (*sk_ostate)(struct sock *); - void (*sk_odata)(struct sock *, int bytes); + void (*sk_odata)(struct sock *, int); + void (*sk_owspace)(struct sock *); + /* send stuff */ + spinlock_t sk_sendlk; + struct svc_buf * sk_sendq; /* send-queue of resbuf's */ + unsigned int sk_sendstate; /* private TCP part */ - int sk_reclen; /* length of record */ - int sk_tcplen; /* current read length */ + unsigned int sk_reclen; /* length of record */ + unsigned int sk_tcplen; /* current read length */ + struct sockaddr_in sk_raddr; /* peer address */ /* Debugging */ struct svc_rqst * sk_rqstp; diff -u --recursive --new-file linux-2.4.17/include/linux/sunrpc/xprt.h linux-2.4.17-rpc_tweaks/include/linux/sunrpc/xprt.h --- linux-2.4.17/include/linux/sunrpc/xprt.h Thu Nov 22 20:47:20 2001 +++ linux-2.4.17-rpc_tweaks/include/linux/sunrpc/xprt.h Mon Jan 28 12:05:22 2002 @@ -39,12 +39,14 @@ * Come Linux 2.3, we'll handle fragments directly. 
*/ #define RPC_MAXCONG 16 -#define RPC_MAXREQS (RPC_MAXCONG + 1) +#define RPC_MAXREQS (RPC_MAXCONG + 2) #define RPC_CWNDSCALE 256 #define RPC_MAXCWND (RPC_MAXCONG * RPC_CWNDSCALE) #define RPC_INITCWND RPC_CWNDSCALE #define RPCXPRT_CONGESTED(xprt) \ ((xprt)->cong >= (xprt)->cwnd) +#define RPCXPRT_SUPERCONGESTED(xprt) \ + ((xprt)->cwnd < 2*RPC_CWNDSCALE) /* Default timeout values */ #define RPC_MAX_UDP_TIMEOUT (60*HZ) @@ -130,11 +132,13 @@ unsigned long cong; /* current congestion */ unsigned long cwnd; /* congestion window */ - unsigned long congtime; /* hold cwnd until then */ + int sndsize; /* length send buffer */ + int rcvsize; /* length receive buffer */ struct rpc_wait_queue sending; /* requests waiting to send */ struct rpc_wait_queue pending; /* requests in flight */ struct rpc_wait_queue backlog; /* waiting for slot */ + struct rpc_wait_queue pingwait; /* waiting on ping() */ struct rpc_rqst * free; /* free slots */ struct rpc_rqst slot[RPC_MAXREQS]; unsigned long sockstate; /* Socket state */ @@ -172,6 +176,7 @@ struct rpc_xprt * xprt_create_proto(int proto, struct sockaddr_in *addr, struct rpc_timeout *toparms); +void xprt_setbufsize(struct rpc_xprt *, int, int); int xprt_destroy(struct rpc_xprt *); void xprt_shutdown(struct rpc_xprt *); void xprt_default_timeout(struct rpc_timeout *, int); @@ -179,10 +184,12 @@ unsigned long); int xprt_reserve(struct rpc_task *); +int xprt_ping_reserve(struct rpc_task *); void xprt_transmit(struct rpc_task *); void xprt_receive(struct rpc_task *); int xprt_adjust_timeout(struct rpc_timeout *); void xprt_release(struct rpc_task *); +void xprt_ping_release(struct rpc_task *); void xprt_reconnect(struct rpc_task *); int xprt_clear_backlog(struct rpc_xprt *); int xprt_tcp_pending(void); @@ -190,6 +197,8 @@ #define XPRT_WSPACE 0 #define XPRT_CONNECT 1 +#define XPRT_PING 2 +#define XPRT_NORESPOND 3 #define xprt_wspace(xp) (test_bit(XPRT_WSPACE, &(xp)->sockstate)) #define xprt_test_and_set_wspace(xp) 
(test_and_set_bit(XPRT_WSPACE, &(xp)->sockstate)) @@ -200,6 +209,32 @@ #define xprt_test_and_set_connected(xp) (test_and_set_bit(XPRT_CONNECT, &(xp)->sockstate)) #define xprt_clear_connected(xp) (clear_bit(XPRT_CONNECT, &(xp)->sockstate)) +static inline int xprt_pinging(struct rpc_xprt *xprt) +{ + return test_bit(XPRT_PING, &xprt->sockstate); +} +static inline int xprt_test_and_set_pinging(struct rpc_xprt *xprt) +{ + return test_and_set_bit(XPRT_PING, &xprt->sockstate); +} +static inline void xprt_clear_pinging(struct rpc_xprt *xprt) +{ + clear_bit(XPRT_PING, &xprt->sockstate); +} + +static inline int xprt_norespond(struct rpc_xprt *xprt) +{ + return test_bit(XPRT_NORESPOND, &xprt->sockstate); +} +static inline int xprt_test_and_set_norespond(struct rpc_xprt *xprt) +{ + return test_and_set_bit(XPRT_NORESPOND, &xprt->sockstate); +} +static inline void xprt_clear_norespond(struct rpc_xprt *xprt) +{ + clear_bit(XPRT_NORESPOND, &xprt->sockstate); +} + static inline void rpciod_tcp_dispatcher(void) { diff -u --recursive --new-file linux-2.4.17/mm/filemap.c linux-2.4.17-rpc_tweaks/mm/filemap.c --- linux-2.4.17/mm/filemap.c Fri Dec 21 18:42:04 2001 +++ linux-2.4.17-rpc_tweaks/mm/filemap.c Sun Jan 13 17:17:58 2002 @@ -1539,7 +1539,7 @@ if (retval) break; - retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize); + retval = mapping->a_ops->direct_IO(rw, filp, iobuf, (offset+progress) >> blocksize_bits, blocksize); if (rw == READ && retval > 0) mark_dirty_kiobuf(iobuf, retval); diff -u --recursive --new-file linux-2.4.17/net/sunrpc/Makefile linux-2.4.17-rpc_tweaks/net/sunrpc/Makefile --- linux-2.4.17/net/sunrpc/Makefile Fri Dec 29 23:07:24 2000 +++ linux-2.4.17-rpc_tweaks/net/sunrpc/Makefile Sun Jan 13 17:19:52 2002 @@ -14,7 +14,7 @@ obj-y := clnt.o xprt.o sched.o \ auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o \ - pmap_clnt.o xdr.o sunrpc_syms.o + ping.o pmap_clnt.o xdr.o sunrpc_syms.o obj-$(CONFIG_PROC_FS) += 
stats.o obj-$(CONFIG_SYSCTL) += sysctl.o diff -u --recursive --new-file linux-2.4.17/net/sunrpc/clnt.c linux-2.4.17-rpc_tweaks/net/sunrpc/clnt.c --- linux-2.4.17/net/sunrpc/clnt.c Fri Sep 21 20:24:50 2001 +++ linux-2.4.17-rpc_tweaks/net/sunrpc/clnt.c Mon Jan 28 01:14:54 2002 @@ -57,8 +57,8 @@ static void call_reconnect(struct rpc_task *task); static void child_reconnect(struct rpc_task *); static void child_reconnect_status(struct rpc_task *); -static u32 * call_header(struct rpc_task *task); -static u32 * call_verify(struct rpc_task *task); +static void call_ping(struct rpc_task *task); +static void call_pingresult(struct rpc_task *task); /* @@ -78,10 +78,6 @@ dprintk("RPC: creating %s client for %s (xprt %p)\n", program->name, servname, xprt); -#ifdef RPC_DEBUG - rpc_register_sysctl(); -#endif - if (!xprt) goto out; if (vers >= program->nrvers || !(version = program->version[vers])) @@ -375,7 +371,6 @@ task->tk_status = 0; task->tk_action = call_reserveresult; task->tk_timeout = clnt->cl_timeout.to_resrvval; - clnt->cl_stats->rpccnt++; xprt_reserve(task); } @@ -399,21 +394,20 @@ task->tk_status, task->tk_rqstp); if (task->tk_status >= 0) { + task->tk_client->cl_stats->rpccnt++; task->tk_action = call_allocate; return; } task->tk_status = 0; switch (status) { + case -ETIMEDOUT: + dprintk("RPC: task timed out\n"); case -EAGAIN: case -ENOBUFS: task->tk_timeout = task->tk_client->cl_timeout.to_resrvval; task->tk_action = call_reserve; break; - case -ETIMEDOUT: - dprintk("RPC: task timed out\n"); - task->tk_action = call_timeout; - break; default: if (!task->tk_rqstp) { printk(KERN_INFO "RPC: task has no request, exit EIO\n"); @@ -448,8 +442,7 @@ printk(KERN_INFO "RPC: buffer allocation failed for task %p\n", task); if (RPC_IS_ASYNC(task) || !(task->tk_client->cl_intr && signalled())) { - xprt_release(task); - task->tk_action = call_reserve; + task->tk_action = call_allocate; rpc_delay(task, HZ>>4); return; } @@ -491,7 +484,7 @@ /* Encode header and provided arguments 
*/ encode = rpcproc_encode(clnt, task->tk_msg.rpc_proc); - if (!(p = call_header(task))) { + if (!(p = rpc_call_header(task))) { printk(KERN_INFO "RPC: call_header failed, exit EIO\n"); rpc_exit(task, -EIO); } else @@ -618,13 +611,13 @@ task->tk_action = call_reconnect; break; } - /* - * Sleep and dream of an open connection - */ - task->tk_timeout = 5 * HZ; - rpc_sleep_on(&xprt->sending, task, NULL, NULL); + if (RPCXPRT_SUPERCONGESTED(clnt->cl_xprt)) { + task->tk_action = call_ping; + break; + } case -ENOMEM: case -EAGAIN: + case -ENOBUFS: task->tk_action = call_transmit; clnt->cl_stats->rpcretrans++; break; @@ -646,6 +639,7 @@ { struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; + int major = 0; if (req) { struct rpc_timeout *to = &req->rq_timeout; @@ -666,17 +660,7 @@ rpc_exit(task, -EIO); return; } - if (clnt->cl_chatty && !(task->tk_flags & RPC_CALL_MAJORSEEN)) { - task->tk_flags |= RPC_CALL_MAJORSEEN; - if (req) - printk(KERN_NOTICE "%s: server %s not responding, still trying\n", - clnt->cl_protname, clnt->cl_server); -#ifdef RPC_DEBUG - else - printk(KERN_NOTICE "%s: task %d can't get a request slot\n", - clnt->cl_protname, task->tk_pid); -#endif - } + major = 1; if (clnt->cl_autobind) clnt->cl_port = 0; @@ -689,6 +673,8 @@ } else if (!xprt_connected(clnt->cl_xprt)) { task->tk_action = call_reconnect; clnt->cl_stats->rpcretrans++; + } else if (major && RPCXPRT_SUPERCONGESTED(clnt->cl_xprt)) { + task->tk_action = call_ping; } else { task->tk_action = call_transmit; clnt->cl_stats->rpcretrans++; @@ -710,12 +696,6 @@ dprintk("RPC: %4d call_decode (status %d)\n", task->tk_pid, task->tk_status); - if (clnt->cl_chatty && (task->tk_flags & RPC_CALL_MAJORSEEN)) { - printk(KERN_NOTICE "%s: server %s OK\n", - clnt->cl_protname, clnt->cl_server); - task->tk_flags &= ~RPC_CALL_MAJORSEEN; - } - if (task->tk_status < 12) { if (!clnt->cl_softrtry) { task->tk_action = call_transmit; @@ -729,7 +709,7 @@ } /* Verify the RPC header */ - if (!(p = 
call_verify(task))) + if (!(p = rpc_call_verify(task))) return; /* @@ -788,8 +768,8 @@ /* * Call header serialization */ -static u32 * -call_header(struct rpc_task *task) +u32 * +rpc_call_header(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; struct rpc_xprt *xprt = clnt->cl_xprt; @@ -809,10 +789,63 @@ } /* + * Ping a non-responding server + */ +static void +call_ping(struct rpc_task *task) +{ + task->tk_action = call_pingresult; + rpc_ping(task); +} + +/* + * Interpret the result from ping + */ +static void +call_pingresult(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_xprt *xprt = clnt->cl_xprt; + int status = task->tk_status; + + task->tk_status = 0; + if (status >= 0) { + task->tk_action = call_transmit; + return; + } + + switch(status) { + case -ECONNREFUSED: + case -ENOTCONN: + if (clnt->cl_autobind || !clnt->cl_port) { + clnt->cl_port = 0; + task->tk_action = call_bind; + break; + } + if (xprt->stream) { + task->tk_action = call_reconnect; + break; + } + case -ENOMEM: + case -ENOBUFS: + rpc_delay(task, HZ >> 4); + case -ETIMEDOUT: + task->tk_action = call_ping; + break; + default: + if (clnt->cl_chatty) + printk("%s: RPC call returned error %d\n", + clnt->cl_protname, -status); + rpc_exit(task,status); + return; + } +} + +/* * Reply header verification */ -static u32 * -call_verify(struct rpc_task *task) +u32 * +rpc_call_verify(struct rpc_task *task) { u32 *p = task->tk_rqstp->rq_rvec[0].iov_base, n; diff -u --recursive --new-file linux-2.4.17/net/sunrpc/ping.c linux-2.4.17-rpc_tweaks/net/sunrpc/ping.c --- linux-2.4.17/net/sunrpc/ping.c Thu Jan 1 01:00:00 1970 +++ linux-2.4.17-rpc_tweaks/net/sunrpc/ping.c Sun Jan 13 17:19:52 2002 @@ -0,0 +1,218 @@ +/* + * linux/net/sunrpc/ping.c + * + * Ping routing. 
+ * + * Copyright (C) 2000, Trond Myklebust + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define RPC_SLACK_SPACE 512 /* total overkill */ +#define RPC_PING_DELAY (15*HZ) + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_XPRT +#endif + +static void ping_call_reserve(struct rpc_task *); +static void ping_call_allocate(struct rpc_task *); +static void ping_call_encode(struct rpc_task *); +static void ping_call_transmit(struct rpc_task *); +static void ping_call_receive(struct rpc_task *); +static void ping_call_exit(struct rpc_task *); + + +static void +ping_call_reserve(struct rpc_task *task) +{ + dprintk("RPC: %4d, ping_call_reserve\n", task->tk_pid); + task->tk_status = 0; + task->tk_action = ping_call_allocate; + task->tk_timeout = task->tk_client->cl_timeout.to_resrvval; + xprt_ping_reserve(task); +} + +static void +ping_call_allocate(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_rqst *req = task->tk_rqstp; + unsigned int bufsiz; + + dprintk("RPC: %4d, ping_call_allocate (status %d)\n", + task->tk_pid, task->tk_status); + + task->tk_action = ping_call_exit; + if (task->tk_status < 0) + return; + + bufsiz = rpcproc_bufsiz(clnt, task->tk_msg.rpc_proc) + RPC_SLACK_SPACE; + if (!(task->tk_buffer = rpc_malloc(task, bufsiz << 1))) { + task->tk_status = -ENOMEM; + return; + } + req->rq_svec[0].iov_base = (void *)task->tk_buffer; + req->rq_svec[0].iov_len = bufsiz; + req->rq_slen = 0; + req->rq_snr = 1; + req->rq_rvec[0].iov_base = (void *)((char *)task->tk_buffer + bufsiz); + req->rq_rvec[0].iov_len = bufsiz; + req->rq_rlen = bufsiz; + req->rq_rnr = 1; + task->tk_action = ping_call_encode; +} + +static void +ping_call_encode(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + u32 *p; + + dprintk("RPC: %4d, ping_call_encode (status %d)\n", + task->tk_pid, task->tk_status); + + if (task->tk_status < 0) { + task->tk_action = ping_call_exit; + return; + 
} + p = rpc_call_header(task); + req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + task->tk_action = ping_call_transmit; +} + +static void +ping_call_transmit(struct rpc_task *task) +{ + dprintk("RPC: %4d, ping_call_transmit\n", task->tk_pid); + task->tk_action = ping_call_receive; + xprt_transmit(task); +} + +static void +ping_call_receive(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_xprt *xprt = clnt->cl_xprt; + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_timeout *to = &req->rq_timeout; + u32 *p; + + dprintk("RPC: %4d, ping_call_receive (status %d)\n", + task->tk_pid, task->tk_status); + + if (task->tk_status >= 0) + p = rpc_call_verify(task); + + task->tk_action = ping_call_exit; + + if (task->tk_status >= 0 || task->tk_status == -EACCES) { + task->tk_status = 0; + if (xprt_norespond(xprt)) { + if (clnt->cl_chatty) + printk(KERN_NOTICE "%s: server %s OK\n", + clnt->cl_protname, clnt->cl_server); + xprt_clear_norespond(xprt); + } + return; + } + + switch (task->tk_status) { + case -ENOTCONN: + break; + case -ENOMEM: + case -EAGAIN: + case -ECONNREFUSED: + case -ETIMEDOUT: + if (!xprt_adjust_timeout(to)) { + task->tk_status = 0; + task->tk_action = ping_call_transmit; + break; + } + default: + if (clnt->cl_softrtry) { + task->tk_status = -EIO; + break; + } + if (clnt->cl_chatty) { + if (!xprt_test_and_set_norespond(xprt)) { + printk(KERN_NOTICE + "%s: server %s is not responding\n", + clnt->cl_protname, clnt->cl_server); + } else { + printk(KERN_NOTICE + "%s: server %s still not responding\n", + clnt->cl_protname, clnt->cl_server); + } + } + rpc_delay(task, RPC_PING_DELAY); + } +} + +static void +ping_call_exit(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + dprintk("RPC: %4d, ping_call_exit (status %d)\n", + task->tk_pid, task->tk_status); + + task->tk_action = NULL; + xprt_ping_release(task); + + /* Sigh. 
rpc_delay() clears task->tk_status */ + if (task->tk_status == 0 && xprt_norespond(xprt)) + task->tk_status = -ETIMEDOUT; + + xprt_clear_pinging(xprt); + rpc_wake_up_status(&xprt->pingwait, task->tk_status); +} + +void +rpc_ping(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_xprt *xprt = clnt->cl_xprt; + struct rpc_task *child; + struct rpc_message msg = {0, NULL, NULL, NULL}; + + dprintk("RPC: %4d, rpc_ping\n", task->tk_pid); + + again: + if (xprt_test_and_set_pinging(xprt)) { + rpc_sleep_on(&xprt->pingwait, task, NULL, 0); + if (!xprt_pinging(xprt)) { + rpc_wake_up_task(task); + goto again; + } + dprintk("RPC: %4d, rpc_ping, waiting on completion\n", + task->tk_pid); + return; + } + + child = rpc_new_child(clnt, task); + if (!child) { + dprintk("RPC: %4d, rpc_ping, failed to create child process\n", + task->tk_pid); + xprt_clear_pinging(xprt); + rpc_wake_up_status(&xprt->pingwait, -ENOMEM); + task->tk_status = -ENOMEM; + return; + } + rpc_call_setup(child, &msg, 0); + child->tk_action = ping_call_reserve; + + dprintk("RPC: %4d, rpc_ping, running child process %4d\n", + task->tk_pid, child->tk_pid); + rpc_run_child(task, child, NULL); +} diff -u --recursive --new-file linux-2.4.17/net/sunrpc/sched.c linux-2.4.17-rpc_tweaks/net/sunrpc/sched.c --- linux-2.4.17/net/sunrpc/sched.c Thu Oct 11 17:12:52 2001 +++ linux-2.4.17-rpc_tweaks/net/sunrpc/sched.c Sun Jan 20 18:34:44 2002 @@ -104,7 +104,11 @@ static inline void __rpc_disable_timer(struct rpc_task *task) { + struct timer_list *timer = &task->tk_timer; + dprintk("RPC: %4d disabling timer\n", task->tk_pid); + if (timer_pending(timer)) + del_timer(timer); task->tk_timeout_fn = NULL; task->tk_timeout = 0; } @@ -1052,7 +1056,6 @@ int rounds = 0; MOD_INC_USE_COUNT; - lock_kernel(); /* * Let our maker know we're running ... 
*/ @@ -1076,7 +1079,7 @@ } __rpc_schedule(); - if (++rounds >= 64) { /* safeguard */ + if (++rounds >= 64 || current->need_resched) { /* safeguard */ schedule(); rounds = 0; } diff -u --recursive --new-file linux-2.4.17/net/sunrpc/stats.c linux-2.4.17-rpc_tweaks/net/sunrpc/stats.c --- linux-2.4.17/net/sunrpc/stats.c Thu Oct 11 20:17:22 2001 +++ linux-2.4.17-rpc_tweaks/net/sunrpc/stats.c Sun Jan 13 17:22:01 2002 @@ -14,10 +14,11 @@ #define __NO_VERSION__ #include - +#include #include #include #include +#include #include #include @@ -25,6 +26,8 @@ static struct proc_dir_entry *proc_net_rpc = NULL; +kmem_cache_t *rpc_rbcachep = NULL; + /* * Get RPC client stats */ @@ -181,25 +184,53 @@ } } -#ifdef MODULE +static int +svc_rbcache_create(void) +{ + if (!(rpc_rbcachep = kmem_cache_create("rpc_rbcache", + sizeof (struct svc_buf), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL))) + return -ENOMEM; + return 0; +} -int -init_module(void) +static int +svc_rbcache_destroy(void) { + if (rpc_rbcachep && kmem_cache_destroy(rpc_rbcachep)) { + printk(KERN_WARNING "RPC: Unable to destroy rpc_rbcache (%p).\n" + "This module cannot be reloaded.\n", + rpc_rbcachep); + return -EBUSY; + } + return 0; +} + +/* + * Initialize sunrpc + */ +static int __init init_sunrpc(void) +{ + int err = 0; + #ifdef RPC_DEBUG rpc_register_sysctl(); #endif rpc_proc_init(); - return 0; + + err = svc_rbcache_create(); + return err; } -void -cleanup_module(void) +static void __exit exit_sunrpc(void) { #ifdef RPC_DEBUG rpc_unregister_sysctl(); #endif rpc_proc_exit(); + svc_rbcache_destroy(); } -#endif + MODULE_LICENSE("GPL"); +module_init(init_sunrpc) +module_exit(exit_sunrpc) diff -u --recursive --new-file linux-2.4.17/net/sunrpc/sunrpc_syms.c linux-2.4.17-rpc_tweaks/net/sunrpc/sunrpc_syms.c --- linux-2.4.17/net/sunrpc/sunrpc_syms.c Fri Sep 21 06:02:01 2001 +++ linux-2.4.17-rpc_tweaks/net/sunrpc/sunrpc_syms.c Mon Jan 28 11:56:12 2002 @@ -55,6 +55,7 @@ EXPORT_SYMBOL(xprt_create_proto); EXPORT_SYMBOL(xprt_destroy); 
EXPORT_SYMBOL(xprt_set_timeout); +EXPORT_SYMBOL(xprt_setbufsize); /* Client credential cache */ EXPORT_SYMBOL(rpcauth_register); diff -u --recursive --new-file linux-2.4.17/net/sunrpc/svc.c linux-2.4.17-rpc_tweaks/net/sunrpc/svc.c --- linux-2.4.17/net/sunrpc/svc.c Fri Sep 7 19:48:39 2001 +++ linux-2.4.17-rpc_tweaks/net/sunrpc/svc.c Sun Jan 13 17:22:01 2002 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -23,6 +24,8 @@ #define RPCDBG_FACILITY RPCDBG_SVCDSP #define RPC_PARANOIA 1 +extern kmem_cache_t *rpc_rbcachep; + /* * Create an RPC service */ @@ -31,10 +34,6 @@ { struct svc_serv *serv; -#ifdef RPC_DEBUG - rpc_register_sysctl(); -#endif - if (!(serv = (struct svc_serv *) kmalloc(sizeof(*serv), GFP_KERNEL))) return NULL; @@ -112,6 +111,36 @@ bufp->area = 0; } +struct svc_buf * +svc_resbuf_alloc(struct svc_serv *serv) +{ + struct svc_buf *resbufp; + + if (!(resbufp = kmem_cache_alloc(rpc_rbcachep, SLAB_NFS))) + return NULL; + + memset(resbufp, 0, sizeof (struct svc_buf)); + + if (!svc_init_buffer(resbufp, serv->sv_bufsz)) { + kmem_cache_free(rpc_rbcachep, resbufp); + return NULL; + } + + dprintk("rpc: allocated resbuf %p for %s\n", resbufp, serv->sv_name); + return resbufp; +} + +int +svc_resbuf_free(struct svc_buf *bufp) +{ + if (bufp) { + svc_release_buffer(bufp); + kmem_cache_free(rpc_rbcachep, bufp); + dprintk("rpc: freed resbuf %p\n", bufp); + } + return 0; +} + /* * Create a server thread */ @@ -218,7 +247,7 @@ struct svc_version *versp = NULL; /* compiler food */ struct svc_procedure *procp = NULL; struct svc_buf * argp = &rqstp->rq_argbuf; - struct svc_buf * resp = &rqstp->rq_resbuf; + struct svc_buf * resp = rqstp->rq_resbuf; kxdrproc_t xdr; u32 *bufp, *statp; u32 dir, prog, vers, proc, @@ -307,7 +336,7 @@ /* Encode reply */ if (*statp == rpc_success && (xdr = procp->pc_encode) - && !xdr(rqstp, rqstp->rq_resbuf.buf, rqstp->rq_resp)) { + && !xdr(rqstp, rqstp->rq_resbuf->buf, rqstp->rq_resp)) { dprintk("svc: failed to encode 
reply\n"); /* serv->sv_stats->rpcsystemerr++; */ *statp = rpc_system_err; diff -u --recursive --new-file linux-2.4.17/net/sunrpc/svcauth.c linux-2.4.17-rpc_tweaks/net/sunrpc/svcauth.c --- linux-2.4.17/net/sunrpc/svcauth.c Sat Apr 29 07:50:39 2000 +++ linux-2.4.17-rpc_tweaks/net/sunrpc/svcauth.c Sun Jan 13 17:22:01 2002 @@ -85,7 +85,7 @@ svcauth_null(struct svc_rqst *rqstp, u32 *statp, u32 *authp) { struct svc_buf *argp = &rqstp->rq_argbuf; - struct svc_buf *resp = &rqstp->rq_resbuf; + struct svc_buf *resp = rqstp->rq_resbuf; if ((argp->len -= 3) < 0) { *statp = rpc_garbage_args; @@ -117,7 +117,7 @@ svcauth_unix(struct svc_rqst *rqstp, u32 *statp, u32 *authp) { struct svc_buf *argp = &rqstp->rq_argbuf; - struct svc_buf *resp = &rqstp->rq_resbuf; + struct svc_buf *resp = rqstp->rq_resbuf; struct svc_cred *cred = &rqstp->rq_cred; u32 *bufp = argp->buf, slen, i; int len = argp->len; diff -u --recursive --new-file linux-2.4.17/net/sunrpc/svcauth_des.c linux-2.4.17-rpc_tweaks/net/sunrpc/svcauth_des.c --- linux-2.4.17/net/sunrpc/svcauth_des.c Mon Apr 7 20:35:33 1997 +++ linux-2.4.17-rpc_tweaks/net/sunrpc/svcauth_des.c Sun Jan 13 17:22:01 2002 @@ -57,7 +57,7 @@ svcauth_des(struct svc_rqst *rqstp, u32 *statp, u32 *authp) { struct svc_buf *argp = &rqstp->rq_argbuf; - struct svc_buf *resp = &rqstp->rq_resbuf; + struct svc_buf *resp = rqstp->rq_resbuf; struct svc_cred *cred = &rqstp->rq_cred; struct des_cred *data = NULL; u32 cryptkey[2]; diff -u --recursive --new-file linux-2.4.17/net/sunrpc/svcsock.c linux-2.4.17-rpc_tweaks/net/sunrpc/svcsock.c --- linux-2.4.17/net/sunrpc/svcsock.c Wed Jul 4 20:50:38 2001 +++ linux-2.4.17-rpc_tweaks/net/sunrpc/svcsock.c Sun Jan 13 17:22:26 2002 @@ -52,13 +52,16 @@ #define RPCDBG_FACILITY RPCDBG_SVCSOCK +#define SVC_TCP_DEFAULT_SOCKSIZE (64*1024) +#define SVC_UDP_DEFAULT_SOCKSIZE (128*1024) +#define SVC_MIN_WRITE_SPACE (35000) static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, int *errp, int pmap_reg); static void 
svc_udp_data_ready(struct sock *, int); static int svc_udp_recvfrom(struct svc_rqst *); -static int svc_udp_sendto(struct svc_rqst *); - +static int svc_udp_sendto(struct svc_sock *, struct svc_buf *); +static int svc_empty_sendq(struct svc_sock *); /* * Queue up an idle server thread. Must have serv->sv_lock held. @@ -205,10 +208,28 @@ spin_unlock_bh(&svsk->sk_lock); } +static void +svc_sock_close(struct svc_sock *svsk) +{ + struct svc_buf *resbufp; + + dprintk("svc: releasing dead socket %p\n", svsk); + /* Flush out the send queue first */ + spin_lock(&svsk->sk_sendlk); + while ((resbufp = svsk->sk_sendq) != NULL) { + rpc_remove_list(&svsk->sk_sendq, resbufp); + svc_resbuf_free(resbufp); + } + spin_unlock(&svsk->sk_sendlk); + + sock_release(svsk->sk_sock); + kfree(svsk); +} + /* * Release a socket after use. */ -static inline void +static void svc_sock_release(struct svc_rqst *rqstp) { struct svc_sock *svsk = rqstp->rq_sock; @@ -216,13 +237,13 @@ svc_release_skb(rqstp); rqstp->rq_sock = NULL; + if (svsk->sk_sendq) + svc_empty_sendq(svsk); spin_lock_bh(&serv->sv_lock); if (!--(svsk->sk_inuse) && svsk->sk_dead) { spin_unlock_bh(&serv->sv_lock); - dprintk("svc: releasing dead socket\n"); - sock_release(svsk->sk_sock); - kfree(svsk); + svc_sock_close(svsk); } else spin_unlock_bh(&serv->sv_lock); @@ -252,10 +273,9 @@ * Generic sendto routine */ static int -svc_sendto(struct svc_rqst *rqstp, struct iovec *iov, int nr) +svc_sendto(struct svc_sock *svsk, struct svc_buf *bufp, struct iovec *iov, int nr) { mm_segment_t oldfs; - struct svc_sock *svsk = rqstp->rq_sock; struct socket *sock = svsk->sk_sock; struct msghdr msg; int i, buflen, len; @@ -263,8 +283,14 @@ for (i = buflen = 0; i < nr; i++) buflen += iov[i].iov_len; - msg.msg_name = &rqstp->rq_addr; - msg.msg_namelen = sizeof(rqstp->rq_addr); + if (sock->type == SOCK_STREAM) { + msg.msg_name = &svsk->sk_raddr; + msg.msg_namelen = sizeof (svsk->sk_raddr); + } else { + msg.msg_name = &bufp->raddr; + msg.msg_namelen = 
sizeof(bufp->raddr); + } + msg.msg_iov = iov; msg.msg_iovlen = nr; msg.msg_control = NULL; @@ -277,7 +303,7 @@ set_fs(oldfs); dprintk("svc: socket %p sendto([%p %Zu... ], %d, %d) = %d\n", - rqstp->rq_sock, iov[0].iov_base, iov[0].iov_len, nr, buflen, len); + svsk, iov[0].iov_base, iov[0].iov_len, nr, buflen, len); return len; } @@ -340,6 +366,20 @@ } /* + * Set socket buffer length + */ +static inline void +svc_sock_setbufsize(struct socket *sock, unsigned int size) +{ + mm_segment_t oldfs; + + oldfs = get_fs(); set_fs(KERNEL_DS); + sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, (char *)&size, sizeof(size)); + sock_setsockopt(sock, SOL_SOCKET, SO_RCVBUF, (char *)&size, sizeof(size)); + set_fs(oldfs); +} + +/* * INET callback when data has been received on the socket. */ static void @@ -360,6 +400,38 @@ wake_up_interruptible(sk->sleep); } +static void +svc_udp_write_space(struct sock *sk) +{ + struct svc_sock *svsk; + struct socket *sock; + + dprintk("svc: socket %p TCP write space (svsk %p)\n", + sk, sk->user_data); + + if (!(sock = sk->socket)) + return; + + if (!(svsk = (struct svc_sock *) sk->user_data)) + goto out; + + if (sock_wspace(sk) < mi