diff -u --recursive --new-file linux-2.4.16/Documentation/Configure.help linux-2.4.16-NFS_ALL/Documentation/Configure.help --- linux-2.4.16/Documentation/Configure.help Thu Nov 22 19:52:44 2001 +++ linux-2.4.16-NFS_ALL/Documentation/Configure.help Fri Feb 1 11:22:34 2002 @@ -14173,6 +14173,30 @@ If unsure, say N. +Allow direct I/O on files in NFS +CONFIG_NFS_DIRECTIO + There are important applications whose performance or correctness + depends on uncached access to file data. Database clusters (multiple + copies of the same instance running on separate hosts) implement their + own cache coherency protocol that subsumes the NFS cache protocols. + Applications that process datasets considerably larger than the client's + memory do not always benefit from a local cache. A streaming video + server, for instance, has no need to cache the contents of a file. + + This option enables applications to perform direct I/O on files in NFS + file systems using the O_DIRECT open() flag. When O_DIRECT is set for + files, their data is not cached in the system's page cache. Direct + read and write operations are aligned to block boundaries. Data is + moved to and from user-level application buffers directly. + + Unless your program is designed to use O_DIRECT properly, you are much + better off allowing the NFS client to manage caching for you. Misusing + O_DIRECT can cause poor server performance or network storms. This + kernel build option defaults OFF to avoid exposing system administrators + unwittingly to a potentially hazardous feature. + + If unsure, say N. 
+ Root file system on NFS CONFIG_ROOT_NFS If you want your Linux box to mount its whole root file system (the diff -u --recursive --new-file linux-2.4.16/fs/Config.in linux-2.4.16-NFS_ALL/fs/Config.in --- linux-2.4.16/fs/Config.in Mon Nov 12 18:34:16 2001 +++ linux-2.4.16-NFS_ALL/fs/Config.in Fri Feb 1 11:22:35 2002 @@ -95,6 +95,7 @@ dep_tristate 'InterMezzo file system support (experimental, replicating fs)' CONFIG_INTERMEZZO_FS $CONFIG_INET $CONFIG_EXPERIMENTAL dep_tristate 'NFS file system support' CONFIG_NFS_FS $CONFIG_INET dep_mbool ' Provide NFSv3 client support' CONFIG_NFS_V3 $CONFIG_NFS_FS + dep_mbool ' Allow direct I/O on NFS files (EXPERIMENTAL)' CONFIG_NFS_DIRECTIO $CONFIG_NFS_FS $CONFIG_EXPERIMENTAL dep_bool ' Root file system on NFS' CONFIG_ROOT_NFS $CONFIG_NFS_FS $CONFIG_IP_PNP dep_tristate 'NFS server support' CONFIG_NFSD $CONFIG_INET diff -u --recursive --new-file linux-2.4.16/fs/block_dev.c linux-2.4.16-NFS_ALL/fs/block_dev.c --- linux-2.4.16/fs/block_dev.c Wed Nov 21 23:07:25 2001 +++ linux-2.4.16-NFS_ALL/fs/block_dev.c Fri Feb 1 11:22:35 2002 @@ -113,9 +113,9 @@ return 0; } -static int blkdev_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) +static int blkdev_direct_IO(int rw, struct file * file, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) { - return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, blkdev_get_block); + return generic_direct_IO(rw, file, iobuf, blocknr, blocksize, blkdev_get_block); } static int blkdev_writepage(struct page * page) diff -u --recursive --new-file linux-2.4.16/fs/buffer.c linux-2.4.16-NFS_ALL/fs/buffer.c --- linux-2.4.16/fs/buffer.c Wed Nov 21 23:40:17 2001 +++ linux-2.4.16-NFS_ALL/fs/buffer.c Fri Feb 1 11:22:35 2002 @@ -1998,10 +1998,11 @@ return tmp.b_blocknr; } -int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block) +int generic_direct_IO(int rw, struct 
file * filp, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block) { int i, nr_blocks, retval; unsigned long * blocks = iobuf->blocks; + struct inode * inode = filp->f_dentry->d_inode; nr_blocks = iobuf->length / blocksize; /* build the blocklist */ diff -u --recursive --new-file linux-2.4.16/fs/ext2/inode.c linux-2.4.16-NFS_ALL/fs/ext2/inode.c --- linux-2.4.16/fs/ext2/inode.c Wed Nov 21 23:07:25 2001 +++ linux-2.4.16-NFS_ALL/fs/ext2/inode.c Fri Feb 1 11:22:35 2002 @@ -592,9 +592,9 @@ { return generic_block_bmap(mapping,block,ext2_get_block); } -static int ext2_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) +static int ext2_direct_IO(int rw, struct file * file, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) { - return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, ext2_get_block); + return generic_direct_IO(rw, file, iobuf, blocknr, blocksize, ext2_get_block); } struct address_space_operations ext2_aops = { readpage: ext2_readpage, diff -u --recursive --new-file linux-2.4.16/fs/lockd/clntproc.c linux-2.4.16-NFS_ALL/fs/lockd/clntproc.c --- linux-2.4.16/fs/lockd/clntproc.c Thu Oct 11 16:52:18 2001 +++ linux-2.4.16-NFS_ALL/fs/lockd/clntproc.c Fri Feb 1 11:22:18 2002 @@ -569,11 +569,15 @@ printk(KERN_WARNING "lockd: unexpected unlock status: %d\n", status); die: + lock_kernel(); nlm_release_host(req->a_host); + unlock_kernel(); kfree(req); return; retry_rebind: + lock_kernel(); nlm_rebind_host(req->a_host); + unlock_kernel(); retry_unlock: rpc_restart_call(task); } @@ -650,12 +654,16 @@ } die: + lock_kernel(); nlm_release_host(req->a_host); + unlock_kernel(); kfree(req); return; retry_cancel: + lock_kernel(); nlm_rebind_host(req->a_host); + unlock_kernel(); rpc_restart_call(task); rpc_delay(task, 30 * HZ); } diff -u --recursive --new-file linux-2.4.16/fs/lockd/svc4proc.c linux-2.4.16-NFS_ALL/fs/lockd/svc4proc.c --- linux-2.4.16/fs/lockd/svc4proc.c Mon Oct 1 
22:45:47 2001 +++ linux-2.4.16-NFS_ALL/fs/lockd/svc4proc.c Fri Feb 1 11:22:18 2002 @@ -17,6 +17,7 @@ #include #include #include +#include #define NLMDBG_FACILITY NLMDBG_CLIENT @@ -499,7 +500,9 @@ dprintk("lockd: %4d callback failed (errno = %d)\n", task->tk_pid, -task->tk_status); } + lock_kernel(); nlm_release_host(call->a_host); + unlock_kernel(); kfree(call); } diff -u --recursive --new-file linux-2.4.16/fs/lockd/svclock.c linux-2.4.16-NFS_ALL/fs/lockd/svclock.c --- linux-2.4.16/fs/lockd/svclock.c Thu Oct 11 16:52:18 2001 +++ linux-2.4.16-NFS_ALL/fs/lockd/svclock.c Fri Feb 1 11:22:18 2002 @@ -576,9 +576,10 @@ dprintk("lockd: GRANT_MSG RPC callback\n"); dprintk("callback: looking for cookie %x \n", *(unsigned int *)(call->a_args.cookie.data)); + lock_kernel(); if (!(block = nlmsvc_find_block(&call->a_args.cookie))) { dprintk("lockd: no block for cookie %x\n", *(u32 *)(call->a_args.cookie.data)); - return; + goto out; } /* Technically, we should down the file semaphore here. Since we @@ -599,6 +600,8 @@ block->b_incall = 0; nlm_release_host(call->a_host); + out: + unlock_kernel(); } /* diff -u --recursive --new-file linux-2.4.16/fs/lockd/svcproc.c linux-2.4.16-NFS_ALL/fs/lockd/svcproc.c --- linux-2.4.16/fs/lockd/svcproc.c Thu Oct 11 16:52:18 2001 +++ linux-2.4.16-NFS_ALL/fs/lockd/svcproc.c Fri Feb 1 11:22:18 2002 @@ -18,6 +18,7 @@ #include #include #include +#include #define NLMDBG_FACILITY NLMDBG_CLIENT @@ -527,7 +528,9 @@ dprintk("lockd: %4d callback failed (errno = %d)\n", task->tk_pid, -task->tk_status); } + lock_kernel(); nlm_release_host(call->a_host); + unlock_kernel(); kfree(call); } diff -u --recursive --new-file linux-2.4.16/fs/namei.c linux-2.4.16-NFS_ALL/fs/namei.c --- linux-2.4.16/fs/namei.c Wed Oct 17 23:46:29 2001 +++ linux-2.4.16-NFS_ALL/fs/namei.c Fri Feb 1 11:22:18 2002 @@ -454,7 +454,7 @@ while (*name=='/') name++; if (!*name) - goto return_base; + goto return_reval; inode = nd->dentry->d_inode; if (current->link_count) @@ -573,7 +573,7 @@ inode 
= nd->dentry->d_inode; /* fallthrough */ case 1: - goto return_base; + goto return_reval; } if (nd->dentry->d_op && nd->dentry->d_op->d_hash) { err = nd->dentry->d_op->d_hash(nd->dentry, &this); @@ -624,6 +624,17 @@ nd->last_type = LAST_DOT; else if (this.len == 2 && this.name[1] == '.') nd->last_type = LAST_DOTDOT; +return_reval: + /* + * We bypassed the ordinary revalidation routines, so + * NFS wants to check the cached inode for staleness. + */ + inode = nd->dentry->d_inode; + if (inode && inode->i_op && inode->i_op->check_stale) { + err = inode->i_op->check_stale(inode); + if (err) + break; + } return_base: return 0; out_dput: diff -u --recursive --new-file linux-2.4.16/fs/nfs/Makefile linux-2.4.16-NFS_ALL/fs/nfs/Makefile --- linux-2.4.16/fs/nfs/Makefile Fri Nov 9 23:28:15 2001 +++ linux-2.4.16-NFS_ALL/fs/nfs/Makefile Fri Feb 1 11:22:35 2002 @@ -14,6 +14,7 @@ obj-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o obj-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o +obj-$(CONFIG_NFS_DIRECTIO) += direct.o obj-m := $(O_TARGET) diff -u --recursive --new-file linux-2.4.16/fs/nfs/dir.c linux-2.4.16-NFS_ALL/fs/nfs/dir.c --- linux-2.4.16/fs/nfs/dir.c Tue Jun 12 20:15:08 2001 +++ linux-2.4.16-NFS_ALL/fs/nfs/dir.c Fri Feb 1 11:47:32 2002 @@ -34,8 +34,11 @@ #define NFS_PARANOIA 1 /* #define NFS_DEBUG_VERBOSE 1 */ +static loff_t nfs_dir_llseek(struct file *, loff_t, int); static int nfs_readdir(struct file *, void *, filldir_t); static struct dentry *nfs_lookup(struct inode *, struct dentry *); +static int nfs_cached_lookup(struct inode *, struct dentry *, + struct nfs_fh *, struct nfs_fattr *); static int nfs_create(struct inode *, struct dentry *, int); static int nfs_mkdir(struct inode *, struct dentry *, int); static int nfs_rmdir(struct inode *, struct dentry *); @@ -47,6 +50,7 @@ struct inode *, struct dentry *); struct file_operations nfs_dir_operations = { + llseek: nfs_dir_llseek, read: generic_read_dir, readdir: nfs_readdir, open: nfs_open, @@ -66,8 +70,28 @@ permission: 
nfs_permission, revalidate: nfs_revalidate, setattr: nfs_notify_change, + check_stale: nfs_check_stale, }; +static loff_t nfs_dir_llseek(struct file *file, loff_t offset, int origin) +{ + switch (origin) { + case 1: + if (offset == 0) { + offset = file->f_pos; + break; + } + case 2: + return -EINVAL; + } + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_reada = 0; + file->f_version = ++event; + } + return (offset <= 0) ? 0 : offset; +} + typedef u32 * (*decode_dirent_t)(u32 *, struct nfs_entry *, int); typedef struct { struct file *file; @@ -108,13 +132,15 @@ error = NFS_PROTO(inode)->readdir(inode, cred, desc->entry->cookie, buffer, NFS_SERVER(inode)->dtsize, desc->plus); /* We requested READDIRPLUS, but the server doesn't grok it */ - if (desc->plus && error == -ENOTSUPP) { - NFS_FLAGS(inode) &= ~NFS_INO_ADVISE_RDPLUS; - desc->plus = 0; - goto again; - } - if (error < 0) + if (error < 0) { + if (error == -ENOTSUPP && desc->plus) { + NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; + NFS_FLAGS(inode) &= ~NFS_INO_ADVISE_RDPLUS; + desc->plus = 0; + goto again; + } goto error; + } SetPageUptodate(page); kunmap(page); /* Ensure consistent page alignment of the data. @@ -195,7 +221,6 @@ dfprintk(VFS, "NFS: find_dirent_page() searching directory page %ld\n", desc->page_index); - desc->plus = NFS_USE_READDIRPLUS(inode); page = read_cache_page(&inode->i_data, desc->page_index, (filler_t *)nfs_readdir_filler, desc); if (IS_ERR(page)) { @@ -247,6 +272,24 @@ return res; } +static unsigned int nfs_type2dtype[] = { + DT_UNKNOWN, + DT_REG, + DT_DIR, + DT_BLK, + DT_CHR, + DT_LNK, + DT_SOCK, + DT_UNKNOWN, + DT_FIFO +}; + +static inline +unsigned int nfs_type_to_d_type(enum nfs_ftype type) +{ + return nfs_type2dtype[type]; +} + /* * Once we've found the start of the dirent within a page: fill 'er up... 
*/ @@ -263,11 +306,17 @@ dfprintk(VFS, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n", (long long)desc->target); for(;;) { + unsigned d_type = DT_UNKNOWN; /* Note: entry->prev_cookie contains the cookie for * retrieving the current dirent on the server */ fileid = nfs_fileid_to_ino_t(entry->ino); + + /* Use readdirplus info */ + if (desc->plus && (entry->fattr->valid & NFS_ATTR_FATTR)) + d_type = nfs_type_to_d_type(entry->fattr->type); + res = filldir(dirent, entry->name, entry->len, - entry->prev_cookie, fileid, DT_UNKNOWN); + entry->prev_cookie, fileid, d_type); if (res < 0) break; file->f_pos = desc->target = entry->cookie; @@ -334,7 +383,8 @@ /* Reset read descriptor so it searches the page cache from * the start upon the next call to readdir_search_pagecache() */ desc->page_index = 0; - memset(desc->entry, 0, sizeof(*desc->entry)); + desc->entry->cookie = desc->entry->prev_cookie = 0; + desc->entry->eof = 0; out: dfprintk(VFS, "NFS: uncached_readdir() returns %d\n", status); return status; @@ -353,9 +403,11 @@ nfs_readdir_descriptor_t my_desc, *desc = &my_desc; struct nfs_entry my_entry; + struct nfs_fh fh; + struct nfs_fattr fattr; long res; - res = nfs_revalidate(dentry); + res = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (res < 0) return res; @@ -366,12 +418,16 @@ * itself. 
*/ memset(desc, 0, sizeof(*desc)); - memset(&my_entry, 0, sizeof(my_entry)); - desc->file = filp; desc->target = filp->f_pos; - desc->entry = &my_entry; desc->decode = NFS_PROTO(inode)->decode_dirent; + desc->plus = NFS_USE_READDIRPLUS(inode); + + my_entry.cookie = my_entry.prev_cookie = 0; + my_entry.eof = 0; + my_entry.fh = &fh; + my_entry.fattr = &fattr; + desc->entry = &my_entry; while(!desc->entry->eof) { res = readdir_search_pagecache(desc); @@ -401,6 +457,32 @@ return 0; } +static inline +void nfs_renew_verifier(struct inode *dir, struct dentry *dentry) +{ + u64 mtime = NFS_CACHE_MTIME(dir); + dentry->d_rtime_sec = mtime >> 32; + dentry->d_rtime_nsec = mtime & 0xffffffffUL; +} + +/* + * A check for whether or not the parent directory has changed. + * In the case it has, we assume that the dentries are untrustworthy + * and may need to be looked up again. + */ +static inline +int nfs_check_verifier(struct inode *dir, struct dentry *dentry) +{ + u64 mtime; + if (IS_ROOT(dentry)) + return 1; + if (nfs_revalidate_inode(NFS_SERVER(dir), dir)) + return 0; + mtime = NFS_CACHE_MTIME(dir); + return (dentry->d_rtime_sec == (mtime >> 32)) && + (dentry->d_rtime_nsec == (mtime & 0xffffffffUL)); +} + /* * Whenever an NFS operation succeeds, we know that the dentry * is valid, so we update the revalidation timestamp. @@ -408,50 +490,34 @@ static inline void nfs_renew_times(struct dentry * dentry) { dentry->d_time = jiffies; + nfs_renew_verifier(dentry->d_parent->d_inode, dentry); } -static inline int nfs_dentry_force_reval(struct dentry *dentry, int flags) +static inline +int nfs_lookup_verify_inode(struct inode *inode, int flags) { - struct inode *inode = dentry->d_inode; - unsigned long timeout = NFS_ATTRTIMEO(inode); - + struct nfs_server *server = NFS_SERVER(inode); /* - * If it's the last lookup in a series, we use a stricter - * cache consistency check by looking at the parent mtime. - * - * If it's been modified in the last hour, be really strict. 
- * (This still means that we can avoid doing unnecessary - * work on directories like /usr/share/bin etc which basically - * never change). + * If we're interested in close-to-open cache consistency, + * then we revalidate the inode upon lookup. */ - if (!(flags & LOOKUP_CONTINUE)) { - long diff = CURRENT_TIME - dentry->d_parent->d_inode->i_mtime; - - if (diff < 15*60) - timeout = 0; - } - - return time_after(jiffies,dentry->d_time + timeout); + if (!(server->flags & NFS_MOUNT_NOCTO) && !(flags & LOOKUP_CONTINUE)) + NFS_CACHEINV(inode); + return nfs_revalidate_inode(server, inode); } /* * We judge how long we want to trust negative * dentries by looking at the parent inode mtime. * - * If mtime is close to present time, we revalidate - * more often. + * If parent mtime has changed, we revalidate, else we wait for a + * period corresponding to the parent's attribute cache timeout value. */ -#define NFS_REVALIDATE_NEGATIVE (1 * HZ) -static inline int nfs_neg_need_reval(struct dentry *dentry) +static inline int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry) { - struct inode *dir = dentry->d_parent->d_inode; - unsigned long timeout = NFS_ATTRTIMEO(dir); - long diff = CURRENT_TIME - dir->i_mtime; - - if (diff < 5*60 && timeout > NFS_REVALIDATE_NEGATIVE) - timeout = NFS_REVALIDATE_NEGATIVE; - - return time_after(jiffies, dentry->d_time + timeout); + if (!nfs_check_verifier(dir, dentry)) + return 1; + return time_after(jiffies, dentry->d_time + NFS_ATTRTIMEO(dir)); } /* @@ -462,9 +528,8 @@ * NOTE! The hit can be a negative hit too, don't assume * we have an inode! * - * If the dentry is older than the revalidation interval, - * we do a new lookup and verify that the dentry is still - * correct. + * If the parent directory is seen to have changed, we throw out the + * cached dentry and do a new lookup. 
*/ static int nfs_lookup_revalidate(struct dentry * dentry, int flags) { @@ -477,13 +542,9 @@ lock_kernel(); dir = dentry->d_parent->d_inode; inode = dentry->d_inode; - /* - * If we don't have an inode, let's look at the parent - * directory mtime to get a hint about how often we - * should validate things.. - */ + if (!inode) { - if (nfs_neg_need_reval(dentry)) + if (nfs_neg_need_reval(dir, dentry)) goto out_bad; goto out_valid; } @@ -494,48 +555,49 @@ goto out_bad; } - if (!nfs_dentry_force_reval(dentry, flags)) + /* Force a full look up iff the parent directory has changed */ + if (nfs_check_verifier(dir, dentry)) { + if (nfs_lookup_verify_inode(inode, flags)) + goto out_bad; goto out_valid; + } - if (IS_ROOT(dentry)) { - __nfs_revalidate_inode(NFS_SERVER(inode), inode); + error = nfs_cached_lookup(dir, dentry, &fhandle, &fattr); + if (!error) { + if (memcmp(NFS_FH(inode), &fhandle, sizeof(struct nfs_fh))!= 0) + goto out_bad; + if (nfs_lookup_verify_inode(inode, flags)) + goto out_bad; goto out_valid_renew; } - /* - * Do a new lookup and check the dentry attributes. - */ + if (NFS_STALE(inode)) + goto out_bad; + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); if (error) goto out_bad; - - /* Inode number matches? */ - if (!(fattr.valid & NFS_ATTR_FATTR) || - NFS_FSID(inode) != fattr.fsid || - NFS_FILEID(inode) != fattr.fileid) + if (memcmp(NFS_FH(inode), &fhandle, sizeof(struct nfs_fh))!= 0) goto out_bad; - - /* Ok, remember that we successfully checked it.. */ - nfs_refresh_inode(inode, &fattr); - - if (nfs_inode_is_stale(inode, &fhandle, &fattr)) + if ((error = nfs_refresh_inode(inode, &fattr)) != 0) goto out_bad; out_valid_renew: nfs_renew_times(dentry); -out_valid: + out_valid: unlock_kernel(); return 1; -out_bad: - shrink_dcache_parent(dentry); - /* If we have submounts, don't unhash ! */ - if (have_submounts(dentry)) - goto out_valid; - d_drop(dentry); - /* Purge readdir caches. 
*/ - nfs_zap_caches(dir); - if (inode && S_ISDIR(inode->i_mode)) + out_bad: + NFS_CACHEINV(dir); + if (inode && S_ISDIR(inode->i_mode)) { + /* Purge readdir caches. */ nfs_zap_caches(inode); + /* If we have submounts, don't unhash ! */ + if (have_submounts(dentry)) + goto out_valid; + shrink_dcache_parent(dentry); + } + d_drop(dentry); unlock_kernel(); return 0; } @@ -565,9 +627,12 @@ { if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { lock_kernel(); + inode->i_nlink--; nfs_complete_unlink(dentry); unlock_kernel(); } + if (is_bad_inode(inode)) + force_delete(inode); iput(inode); } @@ -594,6 +659,20 @@ error = -ENOMEM; dentry->d_op = &nfs_dentry_operations; + error = nfs_cached_lookup(dir, dentry, &fhandle, &fattr); + if (!error) { + error = -EACCES; + inode = nfs_fhget(dentry, &fhandle, &fattr); + if (inode) { + if (!(NFS_SERVER(dir)->flags & NFS_MOUNT_NOCTO)) + NFS_CACHEINV(inode); + d_add(dentry, inode); + nfs_renew_times(dentry); + error = 0; + } + goto out; + } + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); inode = NULL; if (error == -ENOENT) @@ -604,14 +683,85 @@ if (inode) { no_entry: d_add(dentry, inode); - nfs_renew_times(dentry); error = 0; } + nfs_renew_times(dentry); } out: return ERR_PTR(error); } +static inline +int find_dirent_name(nfs_readdir_descriptor_t *desc, struct page *page, struct dentry *dentry) +{ + struct nfs_entry *entry = desc->entry; + int status; + + while((status = dir_decode(desc)) == 0) { + if (entry->len != dentry->d_name.len) + continue; + if (memcmp(entry->name, dentry->d_name.name, entry->len)) + continue; + if (!(entry->fattr->valid & NFS_ATTR_FATTR)) + continue; + break; + } + return status; +} + +/* + * Use the cached Readdirplus results in order to avoid a LOOKUP call + * whenever we believe that the parent directory has not changed. + * + * We assume that any file creation/rename changes the directory mtime. 
+ * As this results in a page cache invalidation whenever it occurs, + * we don't require any other tests for cache coherency. + */ +static +int nfs_cached_lookup(struct inode *dir, struct dentry *dentry, + struct nfs_fh *fh, struct nfs_fattr *fattr) +{ + nfs_readdir_descriptor_t desc; + struct nfs_server *server; + struct nfs_entry entry; + struct page *page; + int res; + + if (!NFS_USE_READDIRPLUS(dir)) + return -ENOENT; + server = NFS_SERVER(dir); + if (server->flags & NFS_MOUNT_NOAC) + return -ENOENT; + nfs_revalidate_inode(server, dir); + + entry.fh = fh; + entry.fattr = fattr; + + desc.decode = NFS_PROTO(dir)->decode_dirent; + desc.entry = &entry; + desc.page_index = 0; + desc.plus = 1; + + for(;(page = find_get_page(&dir->i_data, desc.page_index)); desc.page_index++) { + + res = -EIO; + if (Page_Uptodate(page)) { + desc.ptr = kmap(page); + res = find_dirent_name(&desc, page, dentry); + kunmap(page); + } + page_cache_release(page); + + if (res == 0) + goto out_found; + if (res != -EAGAIN) + break; + } + return -ENOENT; + out_found: + return 0; +} + /* * Code common to create, mkdir, and mknod. */ diff -u --recursive --new-file linux-2.4.16/fs/nfs/direct.c linux-2.4.16-NFS_ALL/fs/nfs/direct.c --- linux-2.4.16/fs/nfs/direct.c Thu Jan 1 01:00:00 1970 +++ linux-2.4.16-NFS_ALL/fs/nfs/direct.c Fri Feb 1 11:22:35 2002 @@ -0,0 +1,378 @@ +/* + * linux/fs/nfs/direct.c + * + * High-performance direct I/O for the NFS client + * + * When an application requests uncached I/O, all read and write requests + * are made directly to the server; data stored or fetched via these + * requests is not cached in the Linux page cache. The client does not + * correct unaligned requests from applications. All requested bytes are + * held on permanent storage before a direct write system call returns to + * an application. Applications that manage their own data caching, such + * as databases, make very good use of direct I/O on local file systems. 
+ * + * Solaris implements an uncached I/O facility called directio() that + * is used for backups and sequential I/O to very large files. Solaris + * also supports uncaching whole NFS partitions with "-o forcedirectio," + * an undocumented mount option. + * + * Note that I/O to read in executables (e.g. kernel_read) cannot use + * direct (kiobuf) reads because there is no vma backing the passed-in + * data buffer. + * + * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust. + * + * Initial implementation: 12/2001 by Chuck Lever + * + * TODO: + * + * 1. Use concurrent asynchronous network requests rather than + * serialized synchronous network requests for normal (non-sync) + * direct I/O. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define NFSDBG_FACILITY (NFSDBG_PAGECACHE | NFSDBG_VFS) +#define VERF_SIZE (2 * sizeof(__u32)) + +static /* inline */ int +nfs_direct_read_rpc(struct file *file, struct nfs_readargs *arg) +{ + int result; + struct inode * inode = file->f_dentry->d_inode; + struct nfs_fattr fattr; + struct rpc_message msg; + struct nfs_readres res = { &fattr, arg->count, 0 }; + +#ifdef CONFIG_NFS_V3 + msg.rpc_proc = (NFS_PROTO(inode)->version == 3) ? + NFS3PROC_READ : NFSPROC_READ; +#else + msg.rpc_proc = NFSPROC_READ; +#endif + msg.rpc_argp = arg; + msg.rpc_resp = &res; + + lock_kernel(); + msg.rpc_cred = nfs_file_cred(file); + fattr.valid = 0; + result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_refresh_inode(inode, &fattr); + unlock_kernel(); + + return result; +} + +static /* inline */ int +nfs_direct_write_rpc(struct file *file, struct nfs_writeargs *arg, + struct nfs_writeverf *verf) +{ + int result; + struct inode *inode = file->f_dentry->d_inode; + struct nfs_fattr fattr; + struct rpc_message msg; + struct nfs_writeres res = { &fattr, verf, 0 }; + +#ifdef CONFIG_NFS_V3 + msg.rpc_proc = (NFS_PROTO(inode)->version == 3) ? 
+ NFS3PROC_WRITE : NFSPROC_WRITE; +#else + msg.rpc_proc = NFSPROC_WRITE; +#endif + msg.rpc_argp = arg; + msg.rpc_resp = &res; + + lock_kernel(); + msg.rpc_cred = get_rpccred(nfs_file_cred(file)); + fattr.valid = 0; + result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_write_attributes(inode, &fattr); + put_rpccred(msg.rpc_cred); + unlock_kernel(); + + if (result > 0) { + if ((arg->stable == NFS_FILE_SYNC) && + (verf->committed != NFS_FILE_SYNC)) { + printk(KERN_ERR __FUNCTION__ + ": server didn't sync stable write request\n"); + return -EIO; + } + + if (result != arg->count) + printk(KERN_INFO __FUNCTION__ + ": short write, count=%u, result=%d\n", + arg->count, result); + } + + return result; +} + +#ifdef CONFIG_NFS_V3 +static /* inline */ int +nfs_direct_commit_rpc(struct inode *inode, loff_t offset, size_t count, + struct nfs_writeverf *verf) +{ + int result; + struct nfs_fattr fattr; + struct nfs_writeargs arg = { NFS_FH(inode), offset, count, 0, 0, + {{0, 0}, {0,0}, {0,0}, {0,0}, + {0,0}, {0,0}, {0,0}, {0,0}} }; + struct nfs_writeres res = { &fattr, verf, 0 }; + struct rpc_message msg = { NFS3PROC_COMMIT, &arg, &res, NULL }; + + fattr.valid = 0; + + lock_kernel(); + result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_write_attributes(inode, &fattr); + unlock_kernel(); + + return result; +} +#else +static inline int +nfs_direct_commit_rpc(struct inode *inode, loff_t offset, size_t count, + struct nfs_writeverf *verf) +{ + return 0; +} +#endif + +/* + * Walk through the iobuf and create an iovec for each "rsize" bytes. 
+ */
+static int nfs_direct_read(struct file *, struct kiobuf *, loff_t, size_t);
+
+/*
+ * Release the kernel mappings of the "nr" kiobuf pages starting at
+ * iobuf->maplist[first].  When "flush" is non-zero the pages have just
+ * been filled by a read, so flush_dcache_page() is called on each page
+ * before it is unmapped.
+ */
+static void
+nfs_direct_unmap_iov(struct kiobuf *iobuf, int first, int nr, int flush)
+{
+	int i;
+
+	for (i = first; i < first + nr; i++) {
+		if (flush)
+			flush_dcache_page(iobuf->maplist[i]);
+		kunmap(iobuf->maplist[i]);
+	}
+}
+
+/*
+ * Build the iovec for one RPC.
+ *
+ * Maps the kiobuf pages that back up to "request" bytes of the user
+ * buffer, starting "done" bytes into it, and describes them in iov[].
+ * No more than "maxiov" entries are used, so the result may cover fewer
+ * bytes than requested.
+ *
+ * On success returns the number of bytes described; *first is the index
+ * of the first mapped page and *nriov the number of mapped pages (one
+ * iovec entry each).  The caller must release them with
+ * nfs_direct_unmap_iov().
+ *
+ * Returns -EFAULT, with nothing left mapped, if a page is missing or if
+ * the kiobuf has no pages left for the request.
+ */
+static int
+nfs_direct_map_iov(struct kiobuf *iobuf, size_t done, int request,
+		   struct iovec *iov, int maxiov, int *first, int *nriov)
+{
+	size_t pos = iobuf->offset + done;
+	int pgoff = pos % PAGE_SIZE;
+	int page_nr = pos / PAGE_SIZE;
+	int mapped = 0;
+	int nr = 0;
+
+	*first = page_nr;
+	*nriov = 0;
+	while (request > 0 && nr < maxiov && page_nr < iobuf->nr_pages) {
+		struct page *page = iobuf->maplist[page_nr];
+		int len = PAGE_SIZE - pgoff;
+
+		if (!page) {
+			/* don't leak the mappings set up so far */
+			nfs_direct_unmap_iov(iobuf, *first, nr, 0);
+			return -EFAULT;
+		}
+		if (len > request)
+			len = request;
+
+		iov[nr].iov_base = (char *) kmap(page) + pgoff;
+		iov[nr].iov_len = len;
+
+		mapped += len;
+		request -= len;
+		pgoff = 0;	/* zero after the first page */
+		page_nr++;
+		nr++;
+	}
+	if (!nr)
+		return -EFAULT;
+
+	*nriov = nr;
+	return mapped;
+}
+
+/*
+ * Read "count" bytes at file position "offset" straight into the pages
+ * described by "iobuf", using one synchronous READ per (at most)
+ * "rsize" bytes.
+ *
+ * Each request maps only the pages it needs, starting at the buffer
+ * position reached by the bytes already read, and unmaps every one of
+ * them again once the RPC has completed.
+ *
+ * Returns the number of bytes read (short if the server returned less
+ * than was asked for) or a negative errno.  -EISDIR from the server is
+ * reported as -EINVAL.
+ */
+static int
+nfs_direct_read(struct file *file, struct kiobuf *iobuf, loff_t offset,
+		size_t count)
+{
+	int total = 0;
+	struct inode *inode = file->f_dentry->d_inode;
+	int rsize = NFS_SERVER(inode)->rsize;
+	struct nfs_readargs args = { NFS_FH(inode), 0, 0, 0 };
+
+	while (count) {
+		int request, result, first, nriov;
+
+		request = count;
+		if (count > rsize)
+			request = rsize;
+
+		/* may shrink the request if args.iov[] fills up first */
+		request = nfs_direct_map_iov(iobuf, total, request, args.iov,
+				sizeof(args.iov) / sizeof(args.iov[0]),
+				&first, &nriov);
+		if (request < 0)
+			return request;
+
+		args.count = request;
+		args.offset = offset;
+		args.nriov = nriov;
+
+		result = nfs_direct_read_rpc(file, &args);
+
+		nfs_direct_unmap_iov(iobuf, first, nriov, 1);
+
+		if (result < 0) {
+			if (result == -EISDIR)
+				total = -EINVAL;
+			else
+				total = result;
+			break;
+		}
+
+		total += result;
+		count -= result;
+		offset += result;
+
+		if (result < args.count)	/* NFSv2ism */
+			break;
+	}
+	return total;
+}
+
+/*
+ * Walk through the iobuf and create an iovec for each "wsize" bytes.
+ * If only one network write is necessary, or if the O_SYNC flag or
+ * 'sync' mount option are present, or if this is a V2 inode, use
+ * FILE_SYNC.  Otherwise, use UNSTABLE and finish with a COMMIT.
+ *
+ * The mechanics of this function are much the same as nfs_direct_read,
+ * with the added complexity of committing unstable writes.
+ *
+ * If the write verifier changes between replies, or the COMMIT fails,
+ * the whole request is sent again with FILE_SYNC.  That second pass
+ * never asks for a COMMIT, so there is at most one retry.
+ *
+ * Returns the number of bytes written or a negative errno.
+ */
+static int
+nfs_direct_write(struct file *file, struct kiobuf *iobuf,
+		loff_t offset, size_t count)
+{
+	int total;
+	int need_commit;
+	loff_t save_offset = offset;
+	size_t save_count = count;
+	struct inode *inode = file->f_dentry->d_inode;
+	int wsize = NFS_SERVER(inode)->wsize;
+	struct nfs_writeverf first_verf, ret_verf;
+	struct nfs_writeargs args = { NFS_FH(inode), 0, 0, NFS_FILE_SYNC, 0 };
+
+#ifdef CONFIG_NFS_V3
+	if ((NFS_PROTO(inode)->version == 3) && (count > wsize) &&
+	    (!IS_SYNC(inode)))
+		args.stable = NFS_UNSTABLE;
+#endif
+
+retry:
+	total = 0;
+	need_commit = 0;	/* a retry starts from a clean slate */
+	while (count) {
+		int request, result, first, nriov;
+
+		request = count;
+		if (count > wsize)
+			request = wsize;
+
+		/* may shrink the request if args.iov[] fills up first */
+		request = nfs_direct_map_iov(iobuf, total, request, args.iov,
+				sizeof(args.iov) / sizeof(args.iov[0]),
+				&first, &nriov);
+		if (request < 0)
+			return request;
+
+		args.count = request;
+		args.offset = offset;
+		args.nriov = nriov;
+
+		result = nfs_direct_write_rpc(file, &args, &ret_verf);
+
+		nfs_direct_unmap_iov(iobuf, first, nriov, 0);
+
+		if (result < 0) {
+			total = result;
+			break;
+		}
+
+		if (!total)
+			memcpy(&first_verf.verifier, &ret_verf.verifier,
+			       VERF_SIZE);
+		/* only an UNSTABLE pass can leave data to be committed */
+		if (args.stable != NFS_FILE_SYNC &&
+		    ret_verf.committed != NFS_FILE_SYNC) {
+			need_commit = 1;
+			if (memcmp(&first_verf.verifier, &ret_verf.verifier,
+				   VERF_SIZE))
+				goto print_retry;
+		}
+
+		total += result;
+		count -= result;
+		offset += result;
+
+		/* a zero-length reply makes no progress: don't spin */
+		if (!result)
+			break;
+	}
+
+	/*
+	 * Commit data written so far, even in the event of an error
+	 */
+	if (need_commit) {
+		if (nfs_direct_commit_rpc(inode, save_offset,
+					  save_count - count, &ret_verf))
+			goto print_retry;
+		if (memcmp(&first_verf.verifier, &ret_verf.verifier,
+			   VERF_SIZE))
+			goto print_retry;
+	}
+
+	return total;
+
+print_retry:
+	printk(KERN_INFO __FUNCTION__
+		": detected server restart; retrying with FILE_SYNC\n");
+	args.stable = NFS_FILE_SYNC;
+	offset = save_offset;
+	count = save_count;
+	goto retry;
+}
+
+/*
+ * Read or write data, moving the data directly to/from the
+ * application's buffer without caching in the page cache.
+ *
+ * Rules for direct I/O
+ *
+ * 1.  block size = 512 bytes or more
+ * 2.  file byte offset is block aligned
+ * 3.  byte count is a multiple of block size
+ * 4.  user buffer need not be page aligned
+ * 5.  user buffer is faulted in and pinned
+ *
+ * These are verified before we get here.
+ *
+ * "blocknr" is the starting block in units of 1 << inode->i_blkbits
+ * bytes; "blocksize" is not used here.  Returns the number of bytes
+ * transferred, a negative errno, or -EINVAL if "rw" is neither READ
+ * nor WRITE.
+ */
+int
+nfs_direct_IO(int rw, struct file *file, struct kiobuf *iobuf,
+		unsigned long blocknr, int blocksize)
+{
+	int result = -EINVAL;
+	size_t count = iobuf->length;
+	struct dentry *dentry = file->f_dentry;
+	struct inode *inode = dentry->d_inode;
+	/* widen before shifting so large offsets survive on 32-bit hosts */
+	loff_t offset = (loff_t) blocknr << inode->i_blkbits;
+
+	switch (rw) {
+	case READ:
+		dfprintk(VFS,
+			"NFS: direct_IO(READ) (%s/%s) off/cnt(%Lu/%d)\n",
+			dentry->d_parent->d_name.name,
+			dentry->d_name.name,
+			(unsigned long long) offset, (int) count);
+
+		result = nfs_direct_read(file, iobuf, offset, count);
+		break;
+	case WRITE:
+		dfprintk(VFS,
+			"NFS: direct_IO(WRITE) (%s/%s) off/cnt(%Lu/%d)\n",
+			dentry->d_parent->d_name.name,
+			dentry->d_name.name,
+			(unsigned long long) offset, (int) count);
+
+		result = nfs_direct_write(file, iobuf, offset, count);
+		break;
+	default:
+		break;
+	}
+
+	dfprintk(VFS, "NFS: direct_IO result = %d\n", result);
+	return result;
+}
diff -u --recursive --new-file linux-2.4.16/fs/nfs/file.c linux-2.4.16-NFS_ALL/fs/nfs/file.c --- linux-2.4.16/fs/nfs/file.c Sun Sep 23 18:48:01 2001 +++ linux-2.4.16-NFS_ALL/fs/nfs/file.c Fri Feb 1 11:57:09 2002 @@ -16,6 +16,7 @@ * nfs regular file handling functions */ +#include #include #include #include @@ -99,7 +100,9 @@ dentry->d_parent->d_name.name, dentry->d_name.name, (unsigned long) count, (unsigned
long) *ppos); + lock_kernel(); result = nfs_revalidate_inode(NFS_SERVER(inode), inode); + unlock_kernel(); if (!result) result = generic_file_read(file, buf, count, ppos); return result; @@ -115,7 +118,9 @@ dfprintk(VFS, "nfs: mmap(%s/%s)\n", dentry->d_parent->d_name.name, dentry->d_name.name); + lock_kernel(); status = nfs_revalidate_inode(NFS_SERVER(inode), inode); + unlock_kernel(); if (!status) status = generic_file_mmap(file, vma); return status; @@ -134,13 +139,11 @@ dfprintk(VFS, "nfs: fsync(%x/%ld)\n", inode->i_dev, inode->i_ino); - lock_kernel(); status = nfs_wb_file(inode, file); if (!status) { status = file->f_error; file->f_error = 0; } - unlock_kernel(); return status; } @@ -160,17 +163,7 @@ static int nfs_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to) { - long status; - loff_t pos = ((loff_t)page->index<mapping->host; - - lock_kernel(); - status = nfs_updatepage(file, page, offset, to-offset); - unlock_kernel(); - /* most likely it's already done. 
CHECKME */ - if (pos > inode->i_size) - inode->i_size = pos; - return status; + return nfs_updatepage(file, page, offset, to-offset); } /* @@ -204,7 +197,10 @@ sync_page: nfs_sync_page, writepage: nfs_writepage, prepare_write: nfs_prepare_write, - commit_write: nfs_commit_write + commit_write: nfs_commit_write, +#ifdef CONFIG_NFS_DIRECTIO + direct_IO: nfs_direct_IO, +#endif }; /* @@ -224,7 +220,9 @@ result = -EBUSY; if (IS_SWAPFILE(inode)) goto out_swapfile; + lock_kernel(); result = nfs_revalidate_inode(NFS_SERVER(inode), inode); + unlock_kernel(); if (result) goto out; diff -u --recursive --new-file linux-2.4.16/fs/nfs/flushd.c linux-2.4.16-NFS_ALL/fs/nfs/flushd.c --- linux-2.4.16/fs/nfs/flushd.c Fri Nov 9 23:28:15 2001 +++ linux-2.4.16-NFS_ALL/fs/nfs/flushd.c Fri Feb 1 11:22:18 2002 @@ -51,6 +51,19 @@ * This is the wait queue all cluster daemons sleep on */ static struct rpc_wait_queue flushd_queue = RPC_INIT_WAITQ("nfs_flushd"); +static spinlock_t nfs_flushd_lock = SPIN_LOCK_UNLOCKED; + +static inline void +nfs_lock_flushd(void) +{ + spin_lock(&nfs_flushd_lock); +} + +static inline void +nfs_unlock_flushd(void) +{ + spin_unlock(&nfs_flushd_lock); +} /* * Local function declarations. 
@@ -67,12 +80,11 @@ dprintk("NFS: writecache_init\n"); - lock_kernel(); - status = -ENOMEM; /* Create the RPC task */ if (!(task = rpc_new_task(server->client, NULL, RPC_TASK_ASYNC))) - goto out_unlock; + return -ENOMEM; + nfs_lock_flushd(); cache = server->rw_requests; status = 0; @@ -89,22 +101,21 @@ cache->auth = server->client->cl_auth; task->tk_action = nfs_flushd; task->tk_exit = nfs_flushd_exit; + nfs_unlock_flushd(); rpc_execute(task); - unlock_kernel(); return 0; out_unlock: - if (task) - rpc_release_task(task); - unlock_kernel(); - return status; + nfs_unlock_flushd(); + rpc_release_task(task); + return 0; } void nfs_reqlist_exit(struct nfs_server *server) { struct nfs_reqlist *cache; - lock_kernel(); + nfs_lock_flushd(); cache = server->rw_requests; if (!cache) goto out; @@ -114,11 +125,13 @@ while (cache->task) { rpc_exit(cache->task, 0); rpc_wake_up_task(cache->task); + nfs_unlock_flushd(); interruptible_sleep_on_timeout(&cache->request_wait, 1 * HZ); + nfs_lock_flushd(); } out: - unlock_kernel(); + nfs_unlock_flushd(); } int nfs_reqlist_alloc(struct nfs_server *server) @@ -183,11 +196,13 @@ } dprintk("NFS: %4d flushd back to sleep\n", task->tk_pid); + nfs_lock_flushd(); if (task->tk_action) { task->tk_timeout = NFS_FLUSHD_TIMEOUT; cache->runat = jiffies + task->tk_timeout; rpc_sleep_on(&flushd_queue, task, NULL, NULL); } + nfs_unlock_flushd(); } static void @@ -196,10 +211,13 @@ struct nfs_server *server; struct nfs_reqlist *cache; server = (struct nfs_server *) task->tk_calldata; + + nfs_lock_flushd(); cache = server->rw_requests; if (cache->task == task) cache->task = NULL; wake_up(&cache->request_wait); + nfs_unlock_flushd(); } diff -u --recursive --new-file linux-2.4.16/fs/nfs/inode.c linux-2.4.16-NFS_ALL/fs/nfs/inode.c --- linux-2.4.16/fs/nfs/inode.c Fri Nov 9 23:28:15 2001 +++ linux-2.4.16-NFS_ALL/fs/nfs/inode.c Fri Feb 1 11:57:52 2002 @@ -83,6 +83,9 @@ &nfs_rpcstat, }; +/* Spinlock to protect the NFS inode update */ +static spinlock_t 
nfs_inode_lock = SPIN_LOCK_UNLOCKED; + static inline unsigned long nfs_fattr_to_ino_t(struct nfs_fattr *fattr) { @@ -255,6 +258,69 @@ } /* + * Set up the NFS superblock private area using probed values + */ +static int +nfs_setup_superblock(struct super_block *sb, struct nfs_fh *rootfh) +{ + struct nfs_server *server = &sb->u.nfs_sb.s_server; + struct nfs_fattr fattr; + struct nfs_fsinfo fsinfo = { &fattr, }; + struct nfs_pathconf pathinfo = { &fattr, }; + int maxlen, res; + + res = server->rpc_ops->fsinfo(server, rootfh, &fsinfo); + if (res < 0) + return res; + + /* Work out a lot of parameters */ + if (!server->rsize) + server->rsize = nfs_block_size(fsinfo.rtpref, NULL); + if (!server->wsize) + server->wsize = nfs_block_size(fsinfo.wtpref, NULL); + + /* NFSv3: we don't have bsize, but rather rtmult and wtmult... */ + if (!fsinfo.wtmult) + fsinfo.wtmult = 512; + sb->s_blocksize = nfs_block_bits(fsinfo.wtmult, &sb->s_blocksize_bits); + + if (server->rsize > fsinfo.rtmax) + server->rsize = fsinfo.rtmax; + if (server->wsize > fsinfo.wtmax) + server->wsize = fsinfo.wtmax; + + server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (server->rpages > NFS_READ_MAXIOV) { + server->rpages = NFS_READ_MAXIOV; + server->rsize = server->rpages << PAGE_CACHE_SHIFT; + } + + server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (server->wpages > NFS_WRITE_MAXIOV) { + server->wpages = NFS_WRITE_MAXIOV; + server->wsize = server->wpages << PAGE_CACHE_SHIFT; + } + + server->dtsize = nfs_block_size(fsinfo.dtpref, NULL); + if (server->dtsize > PAGE_CACHE_SIZE) + server->dtsize = PAGE_CACHE_SIZE; + if (server->dtsize > server->rsize) + server->dtsize = server->rsize; + + maxlen = (server->rpc_ops->version == 2) ? 
NFS2_MAXNAMLEN : NFS3_MAXNAMLEN; + if (!server->namelen) { + res = server->rpc_ops->pathconf(server, rootfh, &pathinfo); + if (!res) + server->namelen = pathinfo.name_max; + } + if (!server->namelen || server->namelen > maxlen) + server->namelen = maxlen; + + sb->s_maxbytes = fsinfo.maxfilesize; + return 0; +} + +/* * The way this works is that the mount process passes a structure * in the data argument which contains the server's IP address * and the root file handle obtained from the server's mount @@ -272,8 +338,7 @@ unsigned int authflavor; struct sockaddr_in srvaddr; struct rpc_timeout timeparms; - struct nfs_fsinfo fsinfo; - int tcp, version, maxlen; + int tcp, version; memset(&sb->u.nfs_sb, 0, sizeof(sb->u.nfs_sb)); if (!data) @@ -302,11 +367,11 @@ sb->s_magic = NFS_SUPER_MAGIC; sb->s_op = &nfs_sops; - sb->s_blocksize_bits = 0; - sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits); server = &sb->u.nfs_sb.s_server; - server->rsize = nfs_block_size(data->rsize, NULL); - server->wsize = nfs_block_size(data->wsize, NULL); + if (data->rsize) + server->rsize = nfs_block_size(data->rsize, NULL); + if (data->wsize) + server->wsize = nfs_block_size(data->wsize, NULL); server->flags = data->flags & NFS_MOUNT_FLAGMASK; if (data->flags & NFS_MOUNT_NOAC) { @@ -336,6 +401,7 @@ #ifdef CONFIG_NFS_V3 server->rpc_ops = &nfs_v3_clientops; version = 3; + server->caps |= NFS_CAP_READDIRPLUS; if (data->version < 4) { printk(KERN_NOTICE "NFS: NFSv3 not supported by mount program.\n"); goto out_unlock; @@ -413,61 +479,11 @@ sb->s_root->d_op = &nfs_dentry_operations; /* Get some general file system info */ - if (server->rpc_ops->statfs(server, root, &fsinfo) >= 0) { - if (server->namelen == 0) - server->namelen = fsinfo.namelen; - } else { + if (nfs_setup_superblock(sb, root) < 0) { printk(KERN_NOTICE "NFS: cannot retrieve file system info.\n"); goto out_no_root; } - /* Work out a lot of parameters */ - if (data->rsize == 0) - server->rsize = 
nfs_block_size(fsinfo.rtpref, NULL); - if (data->wsize == 0) - server->wsize = nfs_block_size(fsinfo.wtpref, NULL); - /* NFSv3: we don't have bsize, but rather rtmult and wtmult... */ - if (!fsinfo.bsize) - fsinfo.bsize = (fsinfo.rtmult>fsinfo.wtmult) ? fsinfo.rtmult : fsinfo.wtmult; - /* Also make sure we don't go below rsize/wsize since - * RPC calls are expensive */ - if (fsinfo.bsize < server->rsize) - fsinfo.bsize = server->rsize; - if (fsinfo.bsize < server->wsize) - fsinfo.bsize = server->wsize; - - if (data->bsize == 0) - sb->s_blocksize = nfs_block_bits(fsinfo.bsize, &sb->s_blocksize_bits); - if (server->rsize > fsinfo.rtmax) - server->rsize = fsinfo.rtmax; - if (server->wsize > fsinfo.wtmax) - server->wsize = fsinfo.wtmax; - - server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (server->rpages > NFS_READ_MAXIOV) { - server->rpages = NFS_READ_MAXIOV; - server->rsize = server->rpages << PAGE_CACHE_SHIFT; - } - - server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (server->wpages > NFS_WRITE_MAXIOV) { - server->wpages = NFS_WRITE_MAXIOV; - server->wsize = server->wpages << PAGE_CACHE_SHIFT; - } - - server->dtsize = nfs_block_size(fsinfo.dtpref, NULL); - if (server->dtsize > PAGE_CACHE_SIZE) - server->dtsize = PAGE_CACHE_SIZE; - if (server->dtsize > server->rsize) - server->dtsize = server->rsize; - - maxlen = (version == 2) ? 
NFS2_MAXNAMLEN : NFS3_MAXNAMLEN; - - if (server->namelen == 0 || server->namelen > maxlen) - server->namelen = maxlen; - - sb->s_maxbytes = fsinfo.maxfilesize; - /* Fire up the writeback cache */ if (nfs_reqlist_alloc(server) < 0) { printk(KERN_NOTICE "NFS: cannot initialize writeback cache.\n"); @@ -527,7 +543,8 @@ struct nfs_server *server = &sb->u.nfs_sb.s_server; unsigned char blockbits; unsigned long blockres; - struct nfs_fsinfo res; + struct nfs_fattr attr; + struct nfs_fsstat res = { &attr, }; int error; error = server->rpc_ops->statfs(server, NFS_FH(sb->s_root->d_inode), &res); @@ -535,18 +552,15 @@ if (error < 0) goto out_err; - if (res.bsize == 0) - res.bsize = sb->s_blocksize; - buf->f_bsize = nfs_block_bits(res.bsize, &blockbits); + buf->f_bsize = sb->s_blocksize; + blockbits = sb->s_blocksize_bits; blockres = (1 << blockbits) - 1; buf->f_blocks = (res.tbytes + blockres) >> blockbits; buf->f_bfree = (res.fbytes + blockres) >> blockbits; buf->f_bavail = (res.abytes + blockres) >> blockbits; buf->f_files = res.tfiles; buf->f_ffree = res.afiles; - if (res.namelen == 0 || res.namelen > server->namelen) - res.namelen = server->namelen; - buf->f_namelen = res.namelen; + buf->f_namelen = server->namelen; return 0; out_err: printk("nfs_statfs: statfs error = %d\n", -error); @@ -555,18 +569,30 @@ } /* + * Reset the read time on the local caches + */ +void +nfs_invalidate_caches(struct inode *inode) +{ + spin_lock(&nfs_inode_lock); + NFS_READTIME(inode) = jiffies - NFS_MAXATTRTIMEO(inode) - 1; + spin_unlock(&nfs_inode_lock); +} + +/* * Invalidate the local caches */ void nfs_zap_caches(struct inode *inode) { - NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); - NFS_ATTRTIMEO_UPDATE(inode) = jiffies; - invalidate_inode_pages(inode); + spin_lock(&nfs_inode_lock); + NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); + NFS_ATTRTIMEO_UPDATE(inode) = jiffies; memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); - NFS_CACHEINV(inode); + NFS_READTIME(inode) = 
jiffies - NFS_MAXATTRTIMEO(inode) - 1; + spin_unlock(&nfs_inode_lock); } /* @@ -582,50 +608,36 @@ nfs_zap_caches(inode); } +/* Don't use READDIRPLUS on directories that we believe are too large */ +#define NFS_LIMIT_READDIRPLUS (8*PAGE_SIZE) + /* * Fill in inode information from the fattr. */ static void nfs_fill_inode(struct inode *inode, struct nfs_fh *fh, struct nfs_fattr *fattr) { - /* - * Check whether the mode has been set, as we only want to - * do this once. (We don't allow inodes to change types.) + NFS_FILEID(inode) = fattr->fileid; + NFS_FSID(inode) = fattr->fsid; + inode->i_mode = fattr->mode; + /* Why so? Because we want revalidate for devices/FIFOs, and + * that's precisely what we have in nfs_file_inode_operations. */ - if (inode->i_mode == 0) { - NFS_FILEID(inode) = fattr->fileid; - NFS_FSID(inode) = fattr->fsid; - inode->i_mode = fattr->mode; - /* Why so? Because we want revalidate for devices/FIFOs, and - * that's precisely what we have in nfs_file_inode_operations. - */ - inode->i_op = &nfs_file_inode_operations; - if (S_ISREG(inode->i_mode)) { - inode->i_fop = &nfs_file_operations; - inode->i_data.a_ops = &nfs_file_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &nfs_dir_inode_operations; - inode->i_fop = &nfs_dir_operations; - } else if (S_ISLNK(inode->i_mode)) - inode->i_op = &nfs_symlink_inode_operations; - else - init_special_inode(inode, inode->i_mode, fattr->rdev); - /* - * Preset the size and mtime, as there's no need - * to invalidate the caches. 
- */ - inode->i_size = nfs_size_to_loff_t(fattr->size); - inode->i_mtime = nfs_time_to_secs(fattr->mtime); - inode->i_atime = nfs_time_to_secs(fattr->atime); - inode->i_ctime = nfs_time_to_secs(fattr->ctime); - NFS_CACHE_CTIME(inode) = fattr->ctime; - NFS_CACHE_MTIME(inode) = fattr->mtime; - NFS_CACHE_ISIZE(inode) = fattr->size; - NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); - NFS_ATTRTIMEO_UPDATE(inode) = jiffies; - memcpy(&inode->u.nfs_i.fh, fh, sizeof(inode->u.nfs_i.fh)); - } - nfs_refresh_inode(inode, fattr); + inode->i_op = &nfs_file_inode_operations; + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &nfs_file_operations; + inode->i_data.a_ops = &nfs_file_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &nfs_dir_inode_operations; + inode->i_fop = &nfs_dir_operations; + if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) + && fattr->size <= NFS_LIMIT_READDIRPLUS) + NFS_FLAGS(inode) |= NFS_INO_ADVISE_RDPLUS; + } else if (S_ISLNK(inode->i_mode)) + inode->i_op = &nfs_symlink_inode_operations; + else + init_special_inode(inode, inode->i_mode, fattr->rdev); + memcpy(&inode->u.nfs_i.fh, fh, sizeof(inode->u.nfs_i.fh)); } struct nfs_find_desc { @@ -652,28 +664,12 @@ return 0; if (memcmp(&inode->u.nfs_i.fh, fh, sizeof(inode->u.nfs_i.fh)) != 0) return 0; - return 1; -} - -int -nfs_inode_is_stale(struct inode *inode, struct nfs_fh *fh, struct nfs_fattr *fattr) -{ - /* Empty inodes are not stale */ - if (!inode->i_mode) + if (is_bad_inode(inode)) return 0; - - if ((fattr->mode & S_IFMT) != (inode->i_mode & S_IFMT)) - return 1; - - if (is_bad_inode(inode) || NFS_STALE(inode)) - return 1; - - /* Has the filehandle changed? If so is the old one stale? 
*/ - if (memcmp(&inode->u.nfs_i.fh, fh, sizeof(inode->u.nfs_i.fh)) != 0 && - __nfs_revalidate_inode(NFS_SERVER(inode),inode) == -ESTALE) - return 1; - - return 0; + /* Force an attribute cache update if inode->i_count == 0 */ + if (!atomic_read(&inode->i_count)) + NFS_CACHEINV(inode); + return 1; } /* @@ -718,7 +714,19 @@ if (!(inode = iget4(sb, ino, nfs_find_actor, &desc))) goto out_no_inode; - nfs_fill_inode(inode, fh, fattr); + /* + * Check whether the mode has been set, as we only want to + * do this once. (We don't allow inodes to change types.) + */ + if (inode->i_mode == 0) { + nfs_fill_inode(inode, fh, fattr); + nfs_refresh_inode(inode, fattr); + + /* We don't trust READDIRPLUS attributes */ + if (fattr->valid & NFS_ATTR_RDPLUS) + NFS_CACHEINV(inode); + } else if (!(fattr->valid & NFS_ATTR_RDPLUS)) + nfs_refresh_inode(inode, fattr); dprintk("NFS: __nfs_fhget(%x/%Ld ct=%d)\n", inode->i_dev, (long long)NFS_FILEID(inode), atomic_read(&inode->i_count)); @@ -741,7 +749,7 @@ /* * Make sure the inode is up-to-date. 
*/ - error = nfs_revalidate(dentry); + error = nfs_revalidate_inode(NFS_SERVER(inode),inode); if (error) { #ifdef NFS_PARANOIA printk("nfs_notify_change: revalidate failed, error=%d\n", error); @@ -752,7 +760,9 @@ if (!S_ISREG(inode->i_mode)) attr->ia_valid &= ~ATTR_SIZE; + filemap_fdatasync(inode->i_mapping); error = nfs_wb_all(inode); + filemap_fdatawait(inode->i_mapping); if (error) goto out; @@ -780,6 +790,8 @@ fattr.pre_ctime = NFS_CACHE_CTIME(inode); fattr.valid |= NFS_ATTR_WCC; } + /* Force an attribute cache update */ + NFS_CACHEINV(inode); error = nfs_refresh_inode(inode, &fattr); out: return error; @@ -809,7 +821,26 @@ nfs_revalidate(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - return nfs_revalidate_inode(NFS_SERVER(inode), inode); + int status; + lock_kernel(); + status = nfs_revalidate_inode(NFS_SERVER(inode), inode); + unlock_kernel(); + return status; +} + +/* + * Another revalidation function: This one checks inodes for staleness + * when we've bypassed the ordinary dcache revalidation routines. + * e.g. 
open(".") + */ +int +nfs_check_stale(struct inode *inode) +{ + if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NOCTO)) + NFS_CACHEINV(inode); + if (NFS_STALE(inode)) + return -ESTALE; + return 0; } /* @@ -838,13 +869,11 @@ struct rpc_auth *auth; struct rpc_cred *cred; - lock_kernel(); auth = NFS_CLIENT(inode)->cl_auth; cred = rpcauth_lookupcred(auth, 0); filp->private_data = cred; if (filp->f_mode & FMODE_WRITE) nfs_set_mmcred(inode, cred); - unlock_kernel(); return 0; } @@ -852,11 +881,9 @@ { struct rpc_cred *cred; - lock_kernel(); cred = nfs_file_cred(filp); if (cred) put_rpccred(cred); - unlock_kernel(); return 0; } @@ -873,7 +900,6 @@ dfprintk(PAGECACHE, "NFS: revalidating (%x/%Ld)\n", inode->i_dev, (long long)NFS_FILEID(inode)); - lock_kernel(); if (!inode || is_bad_inode(inode)) goto out_nowait; if (NFS_STALE(inode) && inode != inode->i_sb->s_root->d_inode) @@ -916,10 +942,50 @@ NFS_FLAGS(inode) &= ~NFS_INO_REVALIDATING; wake_up(&inode->i_wait); out_nowait: - unlock_kernel(); return status; } +/** + * nfs_grow_isize - Extend inode->i_size + * @inode: inode + * @size: new file size + */ +void nfs_grow_isize(struct inode *inode, loff_t size) +{ + spin_lock(&nfs_inode_lock); + if (inode->i_size < size) + inode->i_size = size; + spin_unlock(&nfs_inode_lock); +} + +/* + * nfs_fattr_obsolete - Test if attribute data is newer than cached data + * @inode: inode + * @fattr: attributes to test + * + * Avoid stuffing the attribute cache with obsolete information. + * We always accept updates if the attribute cache timed out, or if + * fattr->ctime is newer than our cached value. + * If fattr->ctime matches the cached value, we still accept the update + * if it increases the file size. + */ +static inline +int nfs_fattr_obsolete(struct inode *inode, struct nfs_fattr *fattr) +{ + s64 cdif; + + if (time_after_eq(jiffies, NFS_READTIME(inode)+NFS_ATTRTIMEO(inode))) + goto out_valid; + if ((cdif = (s64)fattr->ctime - (s64)NFS_CACHE_CTIME(inode)) > 0) + goto out_valid; + /* Ugh... 
*/ + if (cdif == 0 && fattr->size > NFS_CACHE_ISIZE(inode)) + goto out_valid; + return -1; + out_valid: + return 0; +} + /* * Many nfs protocol calls return the new file attributes after * an operation. Here we update the inode to reflect the state @@ -937,6 +1003,7 @@ { __u64 new_size, new_mtime; loff_t new_isize; + time_t new_atime; int invalid = 0; dfprintk(VFS, "NFS: refresh_inode(%x/%ld ct=%d info=0x%x)\n", @@ -962,6 +1029,12 @@ new_size = fattr->size; new_isize = nfs_size_to_loff_t(fattr->size); + new_atime = nfs_time_to_secs(fattr->atime); + /* Avoid races */ + spin_lock(&nfs_inode_lock); + if (nfs_fattr_obsolete(inode, fattr)) + goto out_nochange; + /* * Update the read time so we don't revalidate too often. */ @@ -1011,7 +1084,7 @@ NFS_CACHE_CTIME(inode) = fattr->ctime; inode->i_ctime = nfs_time_to_secs(fattr->ctime); - inode->i_atime = nfs_time_to_secs(fattr->atime); + inode->i_atime = new_atime; NFS_CACHE_MTIME(inode) = new_mtime; inode->i_mtime = nfs_time_to_secs(new_mtime); @@ -1044,11 +1117,16 @@ NFS_ATTRTIMEO(inode) = NFS_MAXATTRTIMEO(inode); NFS_ATTRTIMEO_UPDATE(inode) = jiffies; } + spin_unlock(&nfs_inode_lock); if (invalid) nfs_zap_caches(inode); return 0; - + out_nochange: + if (new_atime - inode->i_atime > 0) + inode->i_atime = new_atime; + spin_unlock(&nfs_inode_lock); + return 0; out_changed: /* * Big trouble! The inode has become a different object. diff -u --recursive --new-file linux-2.4.16/fs/nfs/nfs2xdr.c linux-2.4.16-NFS_ALL/fs/nfs/nfs2xdr.c --- linux-2.4.16/fs/nfs/nfs2xdr.c Sat Nov 3 02:40:09 2001 +++ linux-2.4.16-NFS_ALL/fs/nfs/nfs2xdr.c Fri Feb 1 12:01:31 2002 @@ -270,14 +270,12 @@ count = ntohl(*p++); hdrlen = (u8 *) p - (u8 *) iov->iov_base; - recvd = req->rq_rlen - hdrlen; - if (p != iov[req->rq_rnr-1].iov_base) { - /* Unexpected reply header size. Punt. 
- * XXX: Move iovec contents to align data on page - * boundary and adjust RPC header size guess */ - printk(KERN_WARNING "NFS: Odd RPC header size in read reply: %d\n", hdrlen); - return -errno_NFSERR_IO; + if (iov->iov_len > hdrlen) { + dprintk("NFS: READ header is short. iovec will be shifted.\n"); + xdr_shift_iovec(iov, req->rq_rnr, iov->iov_len - hdrlen); } + + recvd = req->rq_rlen - hdrlen; if (count > recvd) { printk(KERN_WARNING "NFS: server cheating in read reply: " "count %d > recvd %d\n", count, recvd); @@ -419,7 +417,7 @@ bufsiz = bufsiz >> 2; p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(args->cookie); + *p++ = htonl(args->cookie & 0xFFFFFFFF); *p++ = htonl(bufsiz); /* see above */ req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); @@ -448,27 +446,23 @@ nfs_xdr_readdirres(struct rpc_rqst *req, u32 *p, struct nfs_readdirres *res) { struct iovec *iov = req->rq_rvec; + int hdrlen; int status, nr; u32 *end, *entry, len; if ((status = ntohl(*p++))) return -nfs_stat_to_errno(status); - if ((void *) p != ((u8 *) iov->iov_base+iov->iov_len)) { - /* Unexpected reply header size. Punt. */ - printk(KERN_WARNING "NFS: Odd RPC header size in readdirres reply\n"); - return -errno_NFSERR_IO; + + hdrlen = (u8 *) p - (u8 *) iov->iov_base; + if (iov->iov_len > hdrlen) { + dprintk("NFS: READDIR header is short. 
iovec will be shifted.\n"); + xdr_shift_iovec(iov, req->rq_rnr, iov->iov_len - hdrlen); } + /* Get start and end address of XDR data */ p = (u32 *) iov[1].iov_base; end = (u32 *) ((u8 *) p + iov[1].iov_len); - - /* Get start and end of dirent buffer */ - if (res->buffer != p) { - printk(KERN_ERR "NFS: Bad result buffer in readdir\n"); - return -errno_NFSERR_IO; - } - for (nr = 0; *p++; nr++) { entry = p - 1; if (p + 2 > end) @@ -506,7 +500,7 @@ entry->name = (const char *) p; p += XDR_QUADLEN(entry->len); entry->prev_cookie = entry->cookie; - entry->cookie = ntohl(*p++); + entry->cookie = (s64)((off_t)ntohl(*p++)); entry->eof = !p[0] && p[1]; return p; @@ -598,13 +592,21 @@ static int nfs_xdr_readlinkres(struct rpc_rqst *req, u32 *p, struct nfs_readlinkres *res) { + struct iovec *iov = req->rq_rvec; u32 *strlen; char *string; + int hdrlen; int status; unsigned int len; if ((status = ntohl(*p++))) return -nfs_stat_to_errno(status); + hdrlen = (u8 *) p - (u8 *) iov->iov_base; + if (iov->iov_len > hdrlen) { + dprintk("NFS: READLINK header is short. iovec will be shifted.\n"); + xdr_shift_iovec(iov, req->rq_rnr, iov->iov_len - hdrlen); + } + strlen = (u32*)res->buffer; /* Convert length of symlink */ len = ntohl(*strlen); @@ -631,36 +633,18 @@ * Decode STATFS reply */ static int -nfs_xdr_statfsres(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res) +nfs_xdr_statfsres(struct rpc_rqst *req, u32 *p, struct nfs2_statfs *res) { int status; - u32 xfer_size; if ((status = ntohl(*p++))) return -nfs_stat_to_errno(status); - /* For NFSv2, we more or less have to guess the preferred - * read/write/readdir sizes from the single 'transfer size' - * value. 
- */ - xfer_size = ntohl(*p++); /* tsize */ - res->rtmax = 8 * 1024; - res->rtpref = xfer_size; - res->rtmult = xfer_size; - res->wtmax = 8 * 1024; - res->wtpref = xfer_size; - res->wtmult = xfer_size; - res->dtpref = PAGE_CACHE_SIZE; - res->maxfilesize = 0x7FFFFFFF; /* just a guess */ + res->tsize = ntohl(*p++); res->bsize = ntohl(*p++); - - res->tbytes = ntohl(*p++) * res->bsize; - res->fbytes = ntohl(*p++) * res->bsize; - res->abytes = ntohl(*p++) * res->bsize; - res->tfiles = 0; - res->ffiles = 0; - res->afiles = 0; - res->namelen = 0; + res->blocks = ntohl(*p++); + res->bfree = ntohl(*p++); + res->bavail = ntohl(*p++); return 0; } diff -u --recursive --new-file linux-2.4.16/fs/nfs/nfs3proc.c linux-2.4.16-NFS_ALL/fs/nfs/nfs3proc.c --- linux-2.4.16/fs/nfs/nfs3proc.c Mon Oct 1 22:45:37 2001 +++ linux-2.4.16-NFS_ALL/fs/nfs/nfs3proc.c Fri Feb 1 11:34:22 2002 @@ -17,6 +17,37 @@ #define NFSDBG_FACILITY NFSDBG_PROC +/* A wrapper to handle the EJUKEBOX error message */ +static int +nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) +{ + sigset_t oldset; + int res; + rpc_clnt_sigmask(clnt, &oldset); + do { + res = rpc_call_sync(clnt, msg, flags); + if (res != -EJUKEBOX) + break; + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(NFS_JUKEBOX_RETRY_TIME); + res = -ERESTARTSYS; + } while (!signalled()); + rpc_clnt_sigunmask(clnt, &oldset); + return res; +} + +static inline int +nfs3_rpc_call_wrapper(struct rpc_clnt *clnt, u32 proc, void *argp, void *resp, int flags) +{ + struct rpc_message msg = { proc, argp, resp, NULL }; + return nfs3_rpc_wrapper(clnt, &msg, flags); +} + +#define rpc_call(clnt, proc, argp, resp, flags) \ + nfs3_rpc_call_wrapper(clnt, proc, argp, resp, flags) +#define rpc_call_sync(clnt, msg, flags) \ + nfs3_rpc_wrapper(clnt, msg, flags) + /* * Bare-bones access to getattr: this is for nfs_read_super. 
*/ @@ -80,7 +111,8 @@ status = rpc_call(NFS_CLIENT(dir), NFS3PROC_GETATTR, fhandle, fattr, 0); dprintk("NFS reply lookup: %d\n", status); - nfs_refresh_inode(dir, &dir_attr); + if (status >= 0) + status = nfs_refresh_inode(dir, &dir_attr); return status; } @@ -462,24 +494,42 @@ return status; } -/* - * This is a combo call of fsstat and fsinfo - */ static int nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) + struct nfs_fsstat *stat) { int status; - dprintk("NFS call fsstat\n"); - memset((char *)info, 0, sizeof(*info)); - status = rpc_call(server->client, NFS3PROC_FSSTAT, fhandle, info, 0); - if (status < 0) - goto error; + stat->fattr->valid = 0; + dprintk("NFS call statfs\n"); + status = rpc_call(server->client, NFS3PROC_FSSTAT, fhandle, stat, 0); + dprintk("NFS reply statfs: %d\n", status); + return status; +} + +static int +nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + + info->fattr->valid = 0; + dprintk("NFS call fsinfo\n"); status = rpc_call(server->client, NFS3PROC_FSINFO, fhandle, info, 0); + dprintk("NFS reply fsinfo: %d\n", status); + return status; +} -error: - dprintk("NFS reply statfs: %d\n", status); +static int +nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_pathconf *info) +{ + int status; + + info->fattr->valid = 0; + dprintk("NFS call pathconf\n"); + status = rpc_call(server->client, NFS3PROC_PATHCONF, fhandle, info, 0); + dprintk("NFS reply pathconf: %d\n", status); return status; } @@ -508,5 +558,7 @@ nfs3_proc_readdir, nfs3_proc_mknod, nfs3_proc_statfs, + nfs3_proc_fsinfo, + nfs3_proc_pathconf, nfs3_decode_dirent, }; diff -u --recursive --new-file linux-2.4.16/fs/nfs/nfs3xdr.c linux-2.4.16-NFS_ALL/fs/nfs/nfs3xdr.c --- linux-2.4.16/fs/nfs/nfs3xdr.c Sat Nov 3 02:40:09 2001 +++ linux-2.4.16-NFS_ALL/fs/nfs/nfs3xdr.c Fri Feb 1 11:22:18 2002 @@ -523,6 +523,13 @@ return 0; } +/* Hack to sign-extend 
32-bit cookies */ +static inline +u64 nfs_transform_cookie64(u64 cookie) +{ + return (cookie & 0x80000000) ? (cookie ^ 0xFFFFFFFF00000000) : cookie; +} + /* * Encode arguments to readdir call */ @@ -533,7 +540,7 @@ int buflen, replen; p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_hyper(p, args->cookie); + p = xdr_encode_hyper(p, nfs_transform_cookie64(args->cookie)); *p++ = args->verf[0]; *p++ = args->verf[1]; if (args->plus) { @@ -644,6 +651,7 @@ nfs3_decode_dirent(u32 *p, struct nfs_entry *entry, int plus) { struct nfs_entry old = *entry; + u64 cookie; if (!*p++) { if (!*p) @@ -657,24 +665,25 @@ entry->name = (const char *) p; p += XDR_QUADLEN(entry->len); entry->prev_cookie = entry->cookie; - p = xdr_decode_hyper(p, &entry->cookie); + p = xdr_decode_hyper(p, &cookie); + entry->cookie = nfs_transform_cookie64(cookie); if (plus) { - p = xdr_decode_post_op_attr(p, &entry->fattr); + entry->fattr->valid = 0; + p = xdr_decode_post_op_attr(p, entry->fattr); + if (entry->fattr->valid != 0) + entry->fattr->valid |= NFS_ATTR_RDPLUS; /* In fact, a post_op_fh3: */ if (*p++) { - p = xdr_decode_fhandle(p, &entry->fh); + p = xdr_decode_fhandle(p, entry->fh); /* Ugh -- server reply was truncated */ if (p == NULL) { dprintk("NFS: FH truncated\n"); *entry = old; return ERR_PTR(-EAGAIN); } - } else { - /* If we don't get a file handle, the attrs - * aren't worth a lot. 
*/ - entry->fattr.valid = 0; - } + } else + memset((u8*)(entry->fh), 0, sizeof(*entry->fh)); } entry->eof = !p[0] && p[1]; @@ -958,14 +967,13 @@ * Decode FSSTAT reply */ static int -nfs3_xdr_fsstatres(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res) +nfs3_xdr_fsstatres(struct rpc_rqst *req, u32 *p, struct nfs_fsstat *res) { - struct nfs_fattr dummy; int status; status = ntohl(*p++); - p = xdr_decode_post_op_attr(p, &dummy); + p = xdr_decode_post_op_attr(p, res->fattr); if (status != 0) return -nfs_stat_to_errno(status); @@ -975,8 +983,7 @@ p = xdr_decode_hyper(p, &res->tfiles); p = xdr_decode_hyper(p, &res->ffiles); p = xdr_decode_hyper(p, &res->afiles); - - /* ignore invarsec */ + res->invarsec = ntohl(*p++); return 0; } @@ -986,12 +993,11 @@ static int nfs3_xdr_fsinfores(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res) { - struct nfs_fattr dummy; int status; status = ntohl(*p++); - p = xdr_decode_post_op_attr(p, &dummy); + p = xdr_decode_post_op_attr(p, res->fattr); if (status != 0) return -nfs_stat_to_errno(status); @@ -1003,8 +1009,8 @@ res->wtmult = ntohl(*p++); res->dtpref = ntohl(*p++); p = xdr_decode_hyper(p, &res->maxfilesize); - - /* ignore time_delta and properties */ + p = xdr_decode_time3(p, &res->time_delta); + res->properties = ntohl(*p++); return 0; } @@ -1012,20 +1018,21 @@ * Decode PATHCONF reply */ static int -nfs3_xdr_pathconfres(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res) +nfs3_xdr_pathconfres(struct rpc_rqst *req, u32 *p, struct nfs_pathconf *res) { - struct nfs_fattr dummy; int status; status = ntohl(*p++); - p = xdr_decode_post_op_attr(p, &dummy); + p = xdr_decode_post_op_attr(p, res->fattr); if (status != 0) return -nfs_stat_to_errno(status); res->linkmax = ntohl(*p++); - res->namelen = ntohl(*p++); - - /* ignore remaining fields */ + res->name_max = ntohl(*p++); + res->no_trunc = ntohl(*p++) != 0; + res->chown_restricted = ntohl(*p++) != 0; + res->case_insensitive = ntohl(*p++) != 0; + res->case_preserving = ntohl(*p++) 
!= 0; return 0; } diff -u --recursive --new-file linux-2.4.16/fs/nfs/pagelist.c linux-2.4.16-NFS_ALL/fs/nfs/pagelist.c --- linux-2.4.16/fs/nfs/pagelist.c Fri Nov 9 23:28:15 2001 +++ linux-2.4.16-NFS_ALL/fs/nfs/pagelist.c Fri Feb 1 11:22:18 2002 @@ -187,7 +187,7 @@ BUG(); } #endif - for (pos = head->prev; pos != head; pos = pos->prev) { + list_for_each_prev(pos, head) { struct nfs_page *p = nfs_list_entry(pos); if (page_index(p->wb_page) < pg_idx) break; diff -u --recursive --new-file linux-2.4.16/fs/nfs/proc.c linux-2.4.16-NFS_ALL/fs/nfs/proc.c --- linux-2.4.16/fs/nfs/proc.c Fri Feb 9 20:29:44 2001 +++ linux-2.4.16-NFS_ALL/fs/nfs/proc.c Fri Feb 1 11:22:18 2002 @@ -361,17 +361,62 @@ static int nfs_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) + struct nfs_fsstat *stat) { int status; + struct nfs2_statfs fsinfo; - dprintk("NFS call statfs\n"); - memset((char *)info, 0, sizeof(*info)); - status = rpc_call(server->client, NFSPROC_STATFS, fhandle, info, 0); + stat->fattr->valid = 0; + dprintk("NFS call statfs\n"); + status = rpc_call(server->client, NFSPROC_STATFS, fhandle, &fsinfo, 0); dprintk("NFS reply statfs: %d\n", status); + if (status) + goto out; + stat->tbytes = (u64)fsinfo.blocks * fsinfo.bsize; + stat->fbytes = (u64)fsinfo.bfree * fsinfo.bsize; + stat->abytes = (u64)fsinfo.bavail * fsinfo.bsize; + stat->tfiles = 0; + stat->ffiles = 0; + stat->afiles = 0; + stat->invarsec = 0; + out: return status; } +static int +nfs_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + struct nfs2_statfs fsinfo; + + info->fattr->valid = 0; + dprintk("NFS call fsinfo\n"); + status = rpc_call(server->client, NFSPROC_STATFS, fhandle, &fsinfo, 0); + dprintk("NFS reply fsinfo: %d\n", status); + if (status) + goto out; + info->rtmax = NFS_MAXDATA; + info->rtpref = fsinfo.tsize; + info->rtmult = fsinfo.bsize; + info->wtmax = NFS_MAXDATA; + info->wtpref = fsinfo.tsize; + info->wtmult = 
fsinfo.bsize; + info->dtpref = fsinfo.tsize; + info->maxfilesize = 0x7FFFFFFF; + info->time_delta = 0; + info->properties = 0x1b; + out: + return status; +} + +static int +nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_pathconf *info) +{ + return -ENOTSUPP; +} + extern u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int); struct nfs_rpc_ops nfs_v2_clientops = { @@ -397,5 +442,7 @@ nfs_proc_readdir, nfs_proc_mknod, nfs_proc_statfs, + nfs_proc_fsinfo, + nfs_proc_pathconf, nfs_decode_dirent, }; diff -u --recursive --new-file linux-2.4.16/fs/nfs/read.c linux-2.4.16-NFS_ALL/fs/nfs/read.c --- linux-2.4.16/fs/nfs/read.c Fri Nov 9 23:28:15 2001 +++ linux-2.4.16-NFS_ALL/fs/nfs/read.c Fri Feb 1 11:52:21 2002 @@ -113,11 +113,9 @@ inode->i_dev, (long long)NFS_FILEID(inode), (long long)offset, rsize, buffer); - lock_kernel(); result = NFS_PROTO(inode)->read(inode, cred, &fattr, flags, offset, rsize, buffer, &eof); nfs_refresh_inode(inode, &fattr); - unlock_kernel(); /* * Even if we had a partial success we can't mark the page @@ -272,9 +270,7 @@ rpc_clnt_sigmask(clnt, &oldset); rpc_call_setup(task, &msg, 0); - lock_kernel(); rpc_execute(task); - unlock_kernel(); rpc_clnt_sigunmask(clnt, &oldset); return 0; out_bad: @@ -397,20 +393,29 @@ { struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata; struct inode *inode = data->inode; - int count = data->res.count; + unsigned int count = data->res.count; dprintk("NFS: %4d nfs_readpage_result, (status %d)\n", task->tk_pid, task->tk_status); + if (nfs_async_handle_jukebox(task)) + return; + nfs_refresh_inode(inode, &data->fattr); while (!list_empty(&data->pages)) { struct nfs_page *req = nfs_list_entry(data->pages.next); struct page *page = req->wb_page; nfs_list_remove_request(req); - if (task->tk_status >= 0 && count >= 0) { + if (task->tk_status >= 0) { + if (count < PAGE_CACHE_SIZE) { + char *p = kmap(page); + memset(p + count, 0, PAGE_CACHE_SIZE - count); + kunmap(page); + 
count = 0; + } else + count -= PAGE_CACHE_SIZE; SetPageUptodate(page); - count -= PAGE_CACHE_SIZE; } else SetPageError(page); flush_dcache_page(page); diff -u --recursive --new-file linux-2.4.16/fs/nfs/unlink.c linux-2.4.16-NFS_ALL/fs/nfs/unlink.c --- linux-2.4.16/fs/nfs/unlink.c Thu Aug 16 18:39:37 2001 +++ linux-2.4.16-NFS_ALL/fs/nfs/unlink.c Fri Feb 1 11:34:22 2002 @@ -123,6 +123,8 @@ struct dentry *dir = data->dir; struct inode *dir_i; + if (nfs_async_handle_jukebox(task)) + return; if (!dir) return; dir_i = dir->d_inode; diff -u --recursive --new-file linux-2.4.16/fs/nfs/write.c linux-2.4.16-NFS_ALL/fs/nfs/write.c --- linux-2.4.16/fs/nfs/write.c Tue Nov 20 23:18:50 2001 +++ linux-2.4.16-NFS_ALL/fs/nfs/write.c Fri Feb 1 12:02:12 2002 @@ -121,23 +121,6 @@ } /* - * This function will be used to simulate weak cache consistency - * under NFSv2 when the NFSv3 attribute patch is included. - * For the moment, we just call nfs_refresh_inode(). - */ -static __inline__ int -nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr) -{ - if ((fattr->valid & NFS_ATTR_FATTR) && !(fattr->valid & NFS_ATTR_WCC)) { - fattr->pre_size = NFS_CACHE_ISIZE(inode); - fattr->pre_mtime = NFS_CACHE_MTIME(inode); - fattr->pre_ctime = NFS_CACHE_CTIME(inode); - fattr->valid |= NFS_ATTR_WCC; - } - return nfs_refresh_inode(inode, fattr); -} - -/* * Write a page synchronously. * Offset is the data offset within the page. */ @@ -193,8 +176,7 @@ * If we've extended the file, update the inode * now so we don't invalidate the cache. 
*/ - if (base > inode->i_size) - inode->i_size = base; + nfs_grow_isize(inode, base); } while (count); if (PageError(page)) @@ -213,6 +195,7 @@ unsigned int offset, unsigned int count) { struct nfs_page *req; + loff_t end; int status; req = nfs_update_request(file, inode, page, offset, count); @@ -223,6 +206,8 @@ req->wb_cred = get_rpccred(NFS_I(inode)->mm_cred); nfs_unlock_request(req); nfs_strategy(inode); + end = ((loff_t)page->index<index >= end_index+1 || !offset) goto out; do_it: - lock_kernel(); if (NFS_SERVER(inode)->wsize >= PAGE_CACHE_SIZE && !IS_SYNC(inode)) { err = nfs_writepage_async(NULL, inode, page, 0, offset); if (err >= 0) @@ -270,7 +254,6 @@ if (err == offset) err = 0; } - unlock_kernel(); out: UnlockPage(page); return err; @@ -305,18 +288,30 @@ /* * Insert a write request into an inode + * Note: we sort the list in order to be able to optimize nfs_find_request() + * & co. for the 'write append' case. For 2.5 we may want to consider + * some form of hashing so as to perform well on random writes. 
*/ static inline void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) { + struct list_head *pos, *head; + unsigned long pg_idx = page_index(req->wb_page); + if (!list_empty(&req->wb_hash)) return; if (!NFS_WBACK_BUSY(req)) printk(KERN_ERR "NFS: unlocked request attempted hashed!\n"); - if (list_empty(&inode->u.nfs_i.writeback)) + head = &inode->u.nfs_i.writeback; + if (list_empty(head)) igrab(inode); + list_for_each_prev(pos, head) { + struct nfs_page *entry = nfs_inode_wb_entry(pos); + if (page_index(entry->wb_page) < pg_idx) + break; + } inode->u.nfs_i.npages++; - list_add(&req->wb_hash, &inode->u.nfs_i.writeback); + list_add(&req->wb_hash, pos); req->wb_count++; } @@ -354,15 +349,18 @@ static inline struct nfs_page * _nfs_find_request(struct inode *inode, struct page *page) { - struct list_head *head, *next; + struct list_head *head, *pos; + unsigned long pg_idx = page_index(page); head = &inode->u.nfs_i.writeback; - next = head->next; - while (next != head) { - struct nfs_page *req = nfs_inode_wb_entry(next); - next = next->next; - if (page_index(req->wb_page) != page_index(page)) + list_for_each_prev(pos, head) { + struct nfs_page *req = nfs_inode_wb_entry(pos); + unsigned long found_idx = page_index(req->wb_page); + + if (pg_idx < found_idx) continue; + if (pg_idx != found_idx) + break; req->wb_count++; return req; } @@ -444,20 +442,20 @@ else idx_end = idx_start + npages - 1; - spin_lock(&nfs_wreq_lock); head = &inode->u.nfs_i.writeback; - p = head->next; - while (p != head) { + restart: + spin_lock(&nfs_wreq_lock); + list_for_each_prev(p, head) { unsigned long pg_idx; struct nfs_page *req = nfs_inode_wb_entry(p); - p = p->next; - if (file && req->wb_file != file) continue; pg_idx = page_index(req->wb_page); - if (pg_idx < idx_start || pg_idx > idx_end) + if (pg_idx < idx_start) + break; + if (pg_idx > idx_end) continue; if (!NFS_WBACK_BUSY(req)) @@ -468,9 +466,8 @@ nfs_release_request(req); if (error < 0) return error; - 
spin_lock(&nfs_wreq_lock); - p = head->next; res++; + goto restart; } spin_unlock(&nfs_wreq_lock); return res; @@ -781,6 +778,7 @@ struct dentry *dentry = file->f_dentry; struct inode *inode = dentry->d_inode; struct nfs_page *req; + loff_t end; int status = 0; dprintk("NFS: nfs_updatepage(%s/%s %d@%Ld)\n", @@ -812,6 +810,9 @@ goto done; status = 0; + end = ((loff_t)page->index<tk_pid, task->tk_status); + if (nfs_async_handle_jukebox(task)) + return; + /* We can't handle that yet but we check for it nevertheless */ if (resp->count < argp->count && task->tk_status >= 0) { static unsigned long complain; @@ -1155,9 +1157,7 @@ dprintk("NFS: %4d initiated commit call\n", task->tk_pid); rpc_clnt_sigmask(clnt, &oldset); rpc_call_setup(task, &msg, 0); - lock_kernel(); rpc_execute(task); - unlock_kernel(); rpc_clnt_sigunmask(clnt, &oldset); return 0; out_bad: @@ -1184,6 +1184,9 @@ dprintk("NFS: %4d nfs_commit_done (status %d)\n", task->tk_pid, task->tk_status); + if (nfs_async_handle_jukebox(task)) + return; + nfs_write_attributes(inode, resp->fattr); while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); Binary files linux-2.4.16/include/linux/.nfs_fs.h.swp and linux-2.4.16-NFS_ALL/include/linux/.nfs_fs.h.swp differ diff -u --recursive --new-file linux-2.4.16/include/linux/dcache.h linux-2.4.16-NFS_ALL/include/linux/dcache.h --- linux-2.4.16/include/linux/dcache.h Thu Nov 22 20:46:18 2001 +++ linux-2.4.16-NFS_ALL/include/linux/dcache.h Fri Feb 1 11:22:19 2002 @@ -80,6 +80,8 @@ struct super_block * d_sb; /* The root of the dentry tree */ unsigned long d_vfs_flags; void * d_fsdata; /* fs-specific data */ + unsigned long d_rtime_sec; /* used by nfs d_revalidate */ + unsigned long d_rtime_nsec; /* used by nfs d_revalidate */ unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ }; diff -u --recursive --new-file linux-2.4.16/include/linux/fs.h linux-2.4.16-NFS_ALL/include/linux/fs.h --- linux-2.4.16/include/linux/fs.h Mon Nov 26 14:29:17 2001 +++ 
linux-2.4.16-NFS_ALL/include/linux/fs.h Fri Feb 1 11:22:41 2002 @@ -390,7 +390,7 @@ int (*flushpage) (struct page *, unsigned long); int (*releasepage) (struct page *, int); #define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */ - int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int); + int (*direct_IO)(int, struct file *, struct kiobuf *, unsigned long, int); }; struct address_space { @@ -852,6 +852,7 @@ int (*revalidate) (struct dentry *); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct dentry *, struct iattr *); + int (*check_stale) (struct inode *); }; /* @@ -1387,7 +1388,7 @@ int generic_block_bmap(struct address_space *, long, get_block_t *); int generic_commit_write(struct file *, struct page *, unsigned, unsigned); int block_truncate_page(struct address_space *, loff_t, get_block_t *); -extern int generic_direct_IO(int, struct inode *, struct kiobuf *, unsigned long, int, get_block_t *); +extern int generic_direct_IO(int, struct file *, struct kiobuf *, unsigned long, int, get_block_t *); extern int waitfor_one_page(struct page*); extern int generic_file_mmap(struct file *, struct vm_area_struct *); diff -u --recursive --new-file linux-2.4.16/include/linux/list.h linux-2.4.16-NFS_ALL/include/linux/list.h --- linux-2.4.16/include/linux/list.h Thu Nov 22 20:46:19 2001 +++ linux-2.4.16-NFS_ALL/include/linux/list.h Fri Feb 1 11:22:19 2002 @@ -162,6 +162,16 @@ for (pos = (head)->next, n = pos->next; pos != (head); \ pos = n, n = pos->next) +/** + * list_for_each_prev - iterate over a list in reverse order + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. 
+ */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \ + pos = pos->prev, prefetch(pos->prev)) + + #endif /* __KERNEL__ || _LVM_H_INCLUDE */ #endif diff -u --recursive --new-file linux-2.4.16/include/linux/nfs_fs.h linux-2.4.16-NFS_ALL/include/linux/nfs_fs.h --- linux-2.4.16/include/linux/nfs_fs.h Thu Nov 22 20:47:00 2001 +++ linux-2.4.16-NFS_ALL/include/linux/nfs_fs.h Fri Feb 1 12:03:10 2002 @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -81,10 +82,7 @@ #define NFS_CACHE_MTIME(inode) ((inode)->u.nfs_i.read_cache_mtime) #define NFS_CACHE_ISIZE(inode) ((inode)->u.nfs_i.read_cache_isize) #define NFS_NEXTSCAN(inode) ((inode)->u.nfs_i.nextscan) -#define NFS_CACHEINV(inode) \ -do { \ - NFS_READTIME(inode) = jiffies - NFS_MAXATTRTIMEO(inode) - 1; \ -} while (0) +#define NFS_CACHEINV(inode) nfs_invalidate_caches(inode) #define NFS_ATTRTIMEO(inode) ((inode)->u.nfs_i.attrtimeo) #define NFS_MINATTRTIMEO(inode) \ (S_ISDIR(inode->i_mode)? NFS_SERVER(inode)->acdirmin \ @@ -101,8 +99,15 @@ #define NFS_FILEID(inode) ((inode)->u.nfs_i.fileid) #define NFS_FSID(inode) ((inode)->u.nfs_i.fsid) -/* Inode Flags */ -#define NFS_USE_READDIRPLUS(inode) ((NFS_FLAGS(inode) & NFS_INO_ADVISE_RDPLUS) ? 
1 : 0) +static inline int nfs_server_capable(struct inode *inode, int cap) +{ + return NFS_SERVER(inode)->caps & cap; +} + +static inline int NFS_USE_READDIRPLUS(struct inode *inode) +{ + return NFS_FLAGS(inode) & NFS_INO_ADVISE_RDPLUS; +} /* * These are the default flags for swap requests @@ -141,17 +146,20 @@ * linux/fs/nfs/inode.c */ extern struct super_block *nfs_read_super(struct super_block *, void *, int); +extern void nfs_invalidate_caches(struct inode *); extern void nfs_zap_caches(struct inode *); extern int nfs_inode_is_stale(struct inode *, struct nfs_fh *, struct nfs_fattr *); extern struct inode *nfs_fhget(struct dentry *, struct nfs_fh *, struct nfs_fattr *); extern int __nfs_refresh_inode(struct inode *, struct nfs_fattr *); +extern void nfs_grow_isize(struct inode *, loff_t); extern int nfs_revalidate(struct dentry *); extern int nfs_permission(struct inode *, int); extern int nfs_open(struct inode *, struct file *); extern int nfs_release(struct inode *, struct file *); extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); +extern int nfs_check_stale(struct inode *); extern int nfs_notify_change(struct dentry *, struct iattr *); /* @@ -266,6 +274,11 @@ extern int nfs_scan_lru_read_timeout(struct nfs_server *, struct list_head *); /* + * linux/fs/nfs/direct.c + */ +extern int nfs_direct_IO(int, struct file *, struct kiobuf *, unsigned long, int); + +/* * linux/fs/mount_clnt.c * (Used only by nfsroot module) */ @@ -291,6 +304,23 @@ return __nfs_refresh_inode(inode,fattr); } +/* + * This function will be used to simulate weak cache consistency + * under NFSv2 when the NFSv3 attribute patch is included. + * For the moment, we just call nfs_refresh_inode(). 
+ */ +static __inline__ int +nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr) +{ + if ((fattr->valid & NFS_ATTR_FATTR) && !(fattr->valid & NFS_ATTR_WCC)) { + fattr->pre_size = NFS_CACHE_ISIZE(inode); + fattr->pre_mtime = NFS_CACHE_MTIME(inode); + fattr->pre_ctime = NFS_CACHE_CTIME(inode); + fattr->valid |= NFS_ATTR_WCC; + } + return nfs_refresh_inode(inode, fattr); +} + static inline loff_t nfs_size_to_loff_t(__u64 size) { @@ -332,6 +362,29 @@ __retval; \ }) +#ifdef CONFIG_NFS_V3 + +#define NFS_JUKEBOX_RETRY_TIME (5 * HZ) +static inline int +nfs_async_handle_jukebox(struct rpc_task *task) +{ + if (task->tk_status != -EJUKEBOX) + return 0; + task->tk_status = 0; + rpc_restart_call(task); + rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); + return 1; +} + +#else + +static inline int +nfs_async_handle_jukebox(struct rpc_task *task) +{ + return 0; +} +#endif /* CONFIG_NFS_V3 */ + #endif /* __KERNEL__ */ /* diff -u --recursive --new-file linux-2.4.16/include/linux/nfs_fs_sb.h linux-2.4.16-NFS_ALL/include/linux/nfs_fs_sb.h --- linux-2.4.16/include/linux/nfs_fs_sb.h Thu Nov 22 20:46:19 2001 +++ linux-2.4.16-NFS_ALL/include/linux/nfs_fs_sb.h Fri Feb 1 11:22:19 2002 @@ -10,6 +10,7 @@ struct rpc_clnt * client; /* RPC client handle */ struct nfs_rpc_ops * rpc_ops; /* NFS protocol vector */ int flags; /* various flags */ + unsigned int caps; /* server capabilities */ unsigned int rsize; /* read size */ unsigned int rpages; /* read size (in pages) */ unsigned int wsize; /* write size */ @@ -36,4 +37,8 @@ struct nfs_server s_server; }; +/* Server capabilities */ +#define NFS_CAP_READDIRPLUS 1 + + #endif diff -u --recursive --new-file linux-2.4.16/include/linux/nfs_xdr.h linux-2.4.16-NFS_ALL/include/linux/nfs_xdr.h --- linux-2.4.16/include/linux/nfs_xdr.h Mon Jan 29 21:07:43 2001 +++ linux-2.4.16-NFS_ALL/include/linux/nfs_xdr.h Fri Feb 1 11:22:19 2002 @@ -35,11 +35,13 @@ #define NFS_ATTR_WCC 0x0001 /* pre-op WCC data */ #define NFS_ATTR_FATTR 0x0002 /* post-op 
attributes */ #define NFS_ATTR_FATTR_V3 0x0004 /* NFSv3 attributes */ +#define NFS_ATTR_RDPLUS 0x0008 /* Made in readdirplus */ /* * Info on the file system */ struct nfs_fsinfo { + struct nfs_fattr *fattr; __u32 rtmax; /* max. read transfer size */ __u32 rtpref; /* pref. read transfer size */ __u32 rtmult; /* reads should be multiple of this */ @@ -48,15 +50,37 @@ __u32 wtmult; /* writes should be multiple of this */ __u32 dtpref; /* pref. readdir transfer size */ __u64 maxfilesize; - __u64 bsize; /* block size */ + __u64 time_delta; + __u32 properties; +}; + +struct nfs_fsstat { + struct nfs_fattr *fattr; __u64 tbytes; /* total size in bytes */ __u64 fbytes; /* # of free bytes */ __u64 abytes; /* # of bytes available to user */ __u64 tfiles; /* # of files */ __u64 ffiles; /* # of free files */ __u64 afiles; /* # of files available to user */ + __u32 invarsec; +}; + +struct nfs_pathconf { + struct nfs_fattr *fattr; /* Post-op attributes */ __u32 linkmax;/* max # of hard links */ - __u32 namelen;/* max name length */ + __u32 name_max;/* max name length */ + int no_trunc : 1, + chown_restricted : 1, + case_insensitive : 1, + case_preserving : 1; +}; + +struct nfs2_statfs { + __u32 tsize; /* Server transfer size */ + __u32 bsize; /* Filesystem block size */ + __u32 blocks; /* No. of "bsize" blocks on filesystem */ + __u32 bfree; /* No. of free "bsize" blocks */ + __u32 bavail; /* No. of available "bsize" blocks */ }; /* Arguments to the read call. 
@@ -112,8 +136,8 @@ const char * name; unsigned int len; int eof; - struct nfs_fh fh; - struct nfs_fattr fattr; + struct nfs_fh *fh; + struct nfs_fattr *fattr; }; /* @@ -353,7 +377,11 @@ int (*mknod) (struct inode *, struct qstr *, struct iattr *, dev_t, struct nfs_fh *, struct nfs_fattr *); int (*statfs) (struct nfs_server *, struct nfs_fh *, + struct nfs_fsstat *); + int (*fsinfo) (struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); + int (*pathconf) (struct nfs_server *, struct nfs_fh *, + struct nfs_pathconf *); u32 * (*decode_dirent)(u32 *, struct nfs_entry *, int plus); }; diff -u --recursive --new-file linux-2.4.16/include/linux/sunrpc/clnt.h linux-2.4.16-NFS_ALL/include/linux/sunrpc/clnt.h --- linux-2.4.16/include/linux/sunrpc/clnt.h Thu Nov 22 20:47:20 2001 +++ linux-2.4.16-NFS_ALL/include/linux/sunrpc/clnt.h Fri Feb 1 11:22:19 2002 @@ -111,6 +111,8 @@ void rpc_release_client(struct rpc_clnt *); void rpc_getport(struct rpc_task *, struct rpc_clnt *); int rpc_register(u32, u32, int, unsigned short, int *); +u32 * rpc_call_header(struct rpc_task *task); +u32 * rpc_call_verify(struct rpc_task *task); void rpc_call_setup(struct rpc_task *, struct rpc_message *, int); @@ -144,5 +146,10 @@ */ int rpc_getport_external(struct sockaddr_in *, __u32, __u32, int); +/* + * Ping function + */ +void rpc_ping(struct rpc_task *task); + #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_CLNT_H */ diff -u --recursive --new-file linux-2.4.16/include/linux/sunrpc/xprt.h linux-2.4.16-NFS_ALL/include/linux/sunrpc/xprt.h --- linux-2.4.16/include/linux/sunrpc/xprt.h Thu Nov 22 20:47:20 2001 +++ linux-2.4.16-NFS_ALL/include/linux/sunrpc/xprt.h Fri Feb 1 11:22:19 2002 @@ -39,12 +39,14 @@ * Come Linux 2.3, we'll handle fragments directly. 
*/ #define RPC_MAXCONG 16 -#define RPC_MAXREQS (RPC_MAXCONG + 1) +#define RPC_MAXREQS (RPC_MAXCONG + 2) #define RPC_CWNDSCALE 256 #define RPC_MAXCWND (RPC_MAXCONG * RPC_CWNDSCALE) #define RPC_INITCWND RPC_CWNDSCALE #define RPCXPRT_CONGESTED(xprt) \ ((xprt)->cong >= (xprt)->cwnd) +#define RPCXPRT_SUPERCONGESTED(xprt) \ + ((xprt)->cwnd < 2*RPC_CWNDSCALE) /* Default timeout values */ #define RPC_MAX_UDP_TIMEOUT (60*HZ) @@ -135,6 +137,7 @@ struct rpc_wait_queue sending; /* requests waiting to send */ struct rpc_wait_queue pending; /* requests in flight */ struct rpc_wait_queue backlog; /* waiting for slot */ + struct rpc_wait_queue pingwait; /* waiting on ping() */ struct rpc_rqst * free; /* free slots */ struct rpc_rqst slot[RPC_MAXREQS]; unsigned long sockstate; /* Socket state */ @@ -179,10 +182,12 @@ unsigned long); int xprt_reserve(struct rpc_task *); +int xprt_ping_reserve(struct rpc_task *); void xprt_transmit(struct rpc_task *); void xprt_receive(struct rpc_task *); int xprt_adjust_timeout(struct rpc_timeout *); void xprt_release(struct rpc_task *); +void xprt_ping_release(struct rpc_task *); void xprt_reconnect(struct rpc_task *); int xprt_clear_backlog(struct rpc_xprt *); int xprt_tcp_pending(void); @@ -190,6 +195,8 @@ #define XPRT_WSPACE 0 #define XPRT_CONNECT 1 +#define XPRT_PING 2 +#define XPRT_NORESPOND 3 #define xprt_wspace(xp) (test_bit(XPRT_WSPACE, &(xp)->sockstate)) #define xprt_test_and_set_wspace(xp) (test_and_set_bit(XPRT_WSPACE, &(xp)->sockstate)) @@ -200,6 +207,32 @@ #define xprt_test_and_set_connected(xp) (test_and_set_bit(XPRT_CONNECT, &(xp)->sockstate)) #define xprt_clear_connected(xp) (clear_bit(XPRT_CONNECT, &(xp)->sockstate)) +static inline int xprt_pinging(struct rpc_xprt *xprt) +{ + return test_bit(XPRT_PING, &xprt->sockstate); +} +static inline int xprt_test_and_set_pinging(struct rpc_xprt *xprt) +{ + return test_and_set_bit(XPRT_PING, &xprt->sockstate); +} +static inline void xprt_clear_pinging(struct rpc_xprt *xprt) +{ + 
clear_bit(XPRT_PING, &xprt->sockstate); +} + +static inline int xprt_norespond(struct rpc_xprt *xprt) +{ + return test_bit(XPRT_NORESPOND, &xprt->sockstate); +} +static inline int xprt_test_and_set_norespond(struct rpc_xprt *xprt) +{ + return test_and_set_bit(XPRT_NORESPOND, &xprt->sockstate); +} +static inline void xprt_clear_norespond(struct rpc_xprt *xprt) +{ + clear_bit(XPRT_NORESPOND, &xprt->sockstate); +} + static inline void rpciod_tcp_dispatcher(void) { diff -u --recursive --new-file linux-2.4.16/mm/filemap.c linux-2.4.16-NFS_ALL/mm/filemap.c --- linux-2.4.16/mm/filemap.c Mon Nov 26 14:29:17 2001 +++ linux-2.4.16-NFS_ALL/mm/filemap.c Fri Feb 1 11:22:42 2002 @@ -1539,7 +1539,7 @@ if (retval) break; - retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize); + retval = mapping->a_ops->direct_IO(rw, filp, iobuf, (offset+progress) >> blocksize_bits, blocksize); if (rw == READ && retval > 0) mark_dirty_kiobuf(iobuf, retval); diff -u --recursive --new-file linux-2.4.16/net/sunrpc/Makefile linux-2.4.16-NFS_ALL/net/sunrpc/Makefile --- linux-2.4.16/net/sunrpc/Makefile Fri Dec 29 23:07:24 2000 +++ linux-2.4.16-NFS_ALL/net/sunrpc/Makefile Fri Feb 1 11:22:19 2002 @@ -14,7 +14,7 @@ obj-y := clnt.o xprt.o sched.o \ auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o \ - pmap_clnt.o xdr.o sunrpc_syms.o + ping.o pmap_clnt.o xdr.o sunrpc_syms.o obj-$(CONFIG_PROC_FS) += stats.o obj-$(CONFIG_SYSCTL) += sysctl.o diff -u --recursive --new-file linux-2.4.16/net/sunrpc/clnt.c linux-2.4.16-NFS_ALL/net/sunrpc/clnt.c --- linux-2.4.16/net/sunrpc/clnt.c Fri Sep 21 20:24:50 2001 +++ linux-2.4.16-NFS_ALL/net/sunrpc/clnt.c Fri Feb 1 11:22:19 2002 @@ -57,8 +57,8 @@ static void call_reconnect(struct rpc_task *task); static void child_reconnect(struct rpc_task *); static void child_reconnect_status(struct rpc_task *); -static u32 * call_header(struct rpc_task *task); -static u32 * call_verify(struct rpc_task *task); +static void 
call_ping(struct rpc_task *task); +static void call_pingresult(struct rpc_task *task); /* @@ -491,7 +491,7 @@ /* Encode header and provided arguments */ encode = rpcproc_encode(clnt, task->tk_msg.rpc_proc); - if (!(p = call_header(task))) { + if (!(p = rpc_call_header(task))) { printk(KERN_INFO "RPC: call_header failed, exit EIO\n"); rpc_exit(task, -EIO); } else @@ -618,11 +618,10 @@ task->tk_action = call_reconnect; break; } - /* - * Sleep and dream of an open connection - */ - task->tk_timeout = 5 * HZ; - rpc_sleep_on(&xprt->sending, task, NULL, NULL); + if (RPCXPRT_SUPERCONGESTED(clnt->cl_xprt)) { + task->tk_action = call_ping; + break; + } case -ENOMEM: case -EAGAIN: task->tk_action = call_transmit; @@ -646,6 +645,7 @@ { struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; + int major = 0; if (req) { struct rpc_timeout *to = &req->rq_timeout; @@ -666,17 +666,7 @@ rpc_exit(task, -EIO); return; } - if (clnt->cl_chatty && !(task->tk_flags & RPC_CALL_MAJORSEEN)) { - task->tk_flags |= RPC_CALL_MAJORSEEN; - if (req) - printk(KERN_NOTICE "%s: server %s not responding, still trying\n", - clnt->cl_protname, clnt->cl_server); -#ifdef RPC_DEBUG - else - printk(KERN_NOTICE "%s: task %d can't get a request slot\n", - clnt->cl_protname, task->tk_pid); -#endif - } + major = 1; if (clnt->cl_autobind) clnt->cl_port = 0; @@ -689,6 +679,8 @@ } else if (!xprt_connected(clnt->cl_xprt)) { task->tk_action = call_reconnect; clnt->cl_stats->rpcretrans++; + } else if (major && RPCXPRT_SUPERCONGESTED(clnt->cl_xprt)) { + task->tk_action = call_ping; } else { task->tk_action = call_transmit; clnt->cl_stats->rpcretrans++; @@ -710,12 +702,6 @@ dprintk("RPC: %4d call_decode (status %d)\n", task->tk_pid, task->tk_status); - if (clnt->cl_chatty && (task->tk_flags & RPC_CALL_MAJORSEEN)) { - printk(KERN_NOTICE "%s: server %s OK\n", - clnt->cl_protname, clnt->cl_server); - task->tk_flags &= ~RPC_CALL_MAJORSEEN; - } - if (task->tk_status < 12) { if (!clnt->cl_softrtry) { 
task->tk_action = call_transmit; @@ -729,7 +715,7 @@ } /* Verify the RPC header */ - if (!(p = call_verify(task))) + if (!(p = rpc_call_verify(task))) return; /* @@ -788,8 +774,8 @@ /* * Call header serialization */ -static u32 * -call_header(struct rpc_task *task) +u32 * +rpc_call_header(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; struct rpc_xprt *xprt = clnt->cl_xprt; @@ -809,10 +795,63 @@ } /* + * Ping a non-responding server + */ +static void +call_ping(struct rpc_task *task) +{ + task->tk_action = call_pingresult; + rpc_ping(task); +} + +/* + * Interpret the result from ping + */ +static void +call_pingresult(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_xprt *xprt = clnt->cl_xprt; + int status = task->tk_status; + + task->tk_status = 0; + if (status >= 0) { + task->tk_action = call_transmit; + return; + } + + switch(status) { + case -ECONNREFUSED: + case -ENOTCONN: + if (clnt->cl_autobind || !clnt->cl_port) { + clnt->cl_port = 0; + task->tk_action = call_bind; + break; + } + if (xprt->stream) { + task->tk_action = call_reconnect; + break; + } + case -ENOMEM: + case -ENOBUFS: + rpc_delay(task, HZ >> 4); + case -ETIMEDOUT: + task->tk_action = call_ping; + break; + default: + if (clnt->cl_chatty) + printk("%s: RPC call returned error %d\n", + clnt->cl_protname, -status); + rpc_exit(task,status); + return; + } +} + +/* * Reply header verification */ -static u32 * -call_verify(struct rpc_task *task) +u32 * +rpc_call_verify(struct rpc_task *task) { u32 *p = task->tk_rqstp->rq_rvec[0].iov_base, n; diff -u --recursive --new-file linux-2.4.16/net/sunrpc/ping.c linux-2.4.16-NFS_ALL/net/sunrpc/ping.c --- linux-2.4.16/net/sunrpc/ping.c Thu Jan 1 01:00:00 1970 +++ linux-2.4.16-NFS_ALL/net/sunrpc/ping.c Fri Feb 1 11:22:19 2002 @@ -0,0 +1,218 @@ +/* + * linux/net/sunrpc/ping.c + * + * Ping routing. 
+ * + * Copyright (C) 2000, Trond Myklebust + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define RPC_SLACK_SPACE 512 /* total overkill */ +#define RPC_PING_DELAY (15*HZ) + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_XPRT +#endif + +static void ping_call_reserve(struct rpc_task *); +static void ping_call_allocate(struct rpc_task *); +static void ping_call_encode(struct rpc_task *); +static void ping_call_transmit(struct rpc_task *); +static void ping_call_receive(struct rpc_task *); +static void ping_call_exit(struct rpc_task *); + + +static void +ping_call_reserve(struct rpc_task *task) +{ + dprintk("RPC: %4d, ping_call_reserve\n", task->tk_pid); + task->tk_status = 0; + task->tk_action = ping_call_allocate; + task->tk_timeout = task->tk_client->cl_timeout.to_resrvval; + xprt_ping_reserve(task); +} + +static void +ping_call_allocate(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_rqst *req = task->tk_rqstp; + unsigned int bufsiz; + + dprintk("RPC: %4d, ping_call_allocate (status %d)\n", + task->tk_pid, task->tk_status); + + task->tk_action = ping_call_exit; + if (task->tk_status < 0) + return; + + bufsiz = rpcproc_bufsiz(clnt, task->tk_msg.rpc_proc) + RPC_SLACK_SPACE; + if (!(task->tk_buffer = rpc_malloc(task, bufsiz << 1))) { + task->tk_status = -ENOMEM; + return; + } + req->rq_svec[0].iov_base = (void *)task->tk_buffer; + req->rq_svec[0].iov_len = bufsiz; + req->rq_slen = 0; + req->rq_snr = 1; + req->rq_rvec[0].iov_base = (void *)((char *)task->tk_buffer + bufsiz); + req->rq_rvec[0].iov_len = bufsiz; + req->rq_rlen = bufsiz; + req->rq_rnr = 1; + task->tk_action = ping_call_encode; +} + +static void +ping_call_encode(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + u32 *p; + + dprintk("RPC: %4d, ping_call_encode (status %d)\n", + task->tk_pid, task->tk_status); + + if (task->tk_status < 0) { + task->tk_action = ping_call_exit; + return; + 
} + p = rpc_call_header(task); + req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + task->tk_action = ping_call_transmit; +} + +static void +ping_call_transmit(struct rpc_task *task) +{ + dprintk("RPC: %4d, ping_call_transmit\n", task->tk_pid); + task->tk_action = ping_call_receive; + xprt_transmit(task); +} + +static void +ping_call_receive(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_xprt *xprt = clnt->cl_xprt; + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_timeout *to = &req->rq_timeout; + u32 *p; + + dprintk("RPC: %4d, ping_call_receive (status %d)\n", + task->tk_pid, task->tk_status); + + if (task->tk_status >= 0) + p = rpc_call_verify(task); + + task->tk_action = ping_call_exit; + + if (task->tk_status >= 0 || task->tk_status == -EACCES) { + task->tk_status = 0; + if (xprt_norespond(xprt)) { + if (clnt->cl_chatty) + printk(KERN_NOTICE "%s: server %s OK\n", + clnt->cl_protname, clnt->cl_server); + xprt_clear_norespond(xprt); + } + return; + } + + switch (task->tk_status) { + case -ENOTCONN: + break; + case -ENOMEM: + case -EAGAIN: + case -ECONNREFUSED: + case -ETIMEDOUT: + if (!xprt_adjust_timeout(to)) { + task->tk_status = 0; + task->tk_action = ping_call_transmit; + break; + } + default: + if (clnt->cl_softrtry) { + task->tk_status = -EIO; + break; + } + if (clnt->cl_chatty) { + if (!xprt_test_and_set_norespond(xprt)) { + printk(KERN_NOTICE + "%s: server %s is not responding\n", + clnt->cl_protname, clnt->cl_server); + } else { + printk(KERN_NOTICE + "%s: server %s still not responding\n", + clnt->cl_protname, clnt->cl_server); + } + } + rpc_delay(task, RPC_PING_DELAY); + } +} + +static void +ping_call_exit(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + dprintk("RPC: %4d, ping_call_exit (status %d)\n", + task->tk_pid, task->tk_status); + + task->tk_action = NULL; + xprt_ping_release(task); + + /* Sigh. 
rpc_delay() clears task->tk_status */ + if (task->tk_status == 0 && xprt_norespond(xprt)) + task->tk_status = -ETIMEDOUT; + + xprt_clear_pinging(xprt); + rpc_wake_up_status(&xprt->pingwait, task->tk_status); +} + +void +rpc_ping(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_xprt *xprt = clnt->cl_xprt; + struct rpc_task *child; + struct rpc_message msg = {0, NULL, NULL, NULL}; + + dprintk("RPC: %4d, rpc_ping\n", task->tk_pid); + + again: + if (xprt_test_and_set_pinging(xprt)) { + rpc_sleep_on(&xprt->pingwait, task, NULL, 0); + if (!xprt_pinging(xprt)) { + rpc_wake_up_task(task); + goto again; + } + dprintk("RPC: %4d, rpc_ping, waiting on completion\n", + task->tk_pid); + return; + } + + child = rpc_new_child(clnt, task); + if (!child) { + dprintk("RPC: %4d, rpc_ping, failed to create child process\n", + task->tk_pid); + xprt_clear_pinging(xprt); + rpc_wake_up_status(&xprt->pingwait, -ENOMEM); + task->tk_status = -ENOMEM; + return; + } + rpc_call_setup(child, &msg, 0); + child->tk_action = ping_call_reserve; + + dprintk("RPC: %4d, rpc_ping, running child process %4d\n", + task->tk_pid, child->tk_pid); + rpc_run_child(task, child, NULL); +} diff -u --recursive --new-file linux-2.4.16/net/sunrpc/sched.c linux-2.4.16-NFS_ALL/net/sunrpc/sched.c --- linux-2.4.16/net/sunrpc/sched.c Thu Oct 11 17:12:52 2001 +++ linux-2.4.16-NFS_ALL/net/sunrpc/sched.c Fri Feb 1 11:22:19 2002 @@ -1052,7 +1052,6 @@ int rounds = 0; MOD_INC_USE_COUNT; - lock_kernel(); /* * Let our maker know we're running ... 
*/ diff -u --recursive --new-file linux-2.4.16/net/sunrpc/xprt.c linux-2.4.16-NFS_ALL/net/sunrpc/xprt.c --- linux-2.4.16/net/sunrpc/xprt.c Mon Oct 8 21:36:07 2001 +++ linux-2.4.16-NFS_ALL/net/sunrpc/xprt.c Fri Feb 1 11:22:19 2002 @@ -85,7 +85,7 @@ */ static void xprt_request_init(struct rpc_task *, struct rpc_xprt *); static void do_xprt_transmit(struct rpc_task *); -static void xprt_reserve_status(struct rpc_task *task); +static void xprt_alloc_slot(struct rpc_xprt *, struct rpc_task *); static void xprt_disconnect(struct rpc_xprt *); static void xprt_reconn_status(struct rpc_task *task); static struct socket *xprt_create_socket(int, struct rpc_timeout *); @@ -1247,15 +1247,8 @@ rpc_sleep_on(&xprt->sending, task, NULL, NULL); } spin_unlock_bh(&xprt->sock_lock); - return; case -EAGAIN: - /* Keep holding the socket if it is blocked */ - rpc_delay(task, HZ>>4); return; - case -ECONNREFUSED: - case -ENOTCONN: - if (!xprt->stream) - return; default: if (xprt->stream) xprt_disconnect(xprt); @@ -1306,9 +1299,11 @@ dprintk("RPC: %4d xprt_reserve cong = %ld cwnd = %ld\n", task->tk_pid, xprt->cong, xprt->cwnd); spin_lock_bh(&xprt->xprt_lock); - xprt_reserve_status(task); + if (!RPCXPRT_CONGESTED(xprt)) + xprt_alloc_slot(xprt, task); if (task->tk_rqstp) { task->tk_timeout = 0; + xprt->cong += RPC_CWNDSCALE; } else if (!task->tk_timeout) { task->tk_status = -ENOBUFS; } else { @@ -1323,35 +1318,48 @@ } /* - * Reservation callback + * Reserve a ping RPC call slot. */ -static void -xprt_reserve_status(struct rpc_task *task) +int +xprt_ping_reserve(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; - struct rpc_rqst *req; - if (xprt->shutdown) { - task->tk_status = -EIO; - } else if (task->tk_status < 0) { - /* NOP */ - } else if (task->tk_rqstp) { - /* We've already been given a request slot: NOP */ - } else { - if (RPCXPRT_CONGESTED(xprt) || !(req = xprt->free)) - goto out_nofree; - /* OK: There's room for us. 
Grab a free slot and bump - * congestion value */ - xprt->free = req->rq_next; - req->rq_next = NULL; - xprt->cong += RPC_CWNDSCALE; - task->tk_rqstp = req; - xprt_request_init(task, xprt); + /* We already have an initialized request. */ + if (task->tk_rqstp) + return 0; - if (xprt->free) - xprt_clear_backlog(xprt); - } + dprintk("RPC: %4d xprt_ping_reserve cong = %ld cwnd = %ld\n", + task->tk_pid, xprt->cong, xprt->cwnd); + spin_lock_bh(&xprt->xprt_lock); + xprt_alloc_slot(xprt, task); + if (!task->tk_rqstp) + task->tk_status = -ENOBUFS; + spin_unlock_bh(&xprt->xprt_lock); + dprintk("RPC: %4d xprt_ping_reserve returns %d\n", + task->tk_pid, task->tk_status); + return task->tk_status; +} +/* + * Reserve a slot + */ +static void +xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) +{ + struct rpc_rqst *req; + + if (!(req = xprt->free)) + goto out_nofree; + /* OK: There's room for us. Grab a free slot and bump + * congestion value */ + xprt->free = req->rq_next; + req->rq_next = NULL; + task->tk_rqstp = req; + xprt_request_init(task, xprt); + + if (xprt->free) + xprt_clear_backlog(xprt); return; out_nofree: @@ -1383,8 +1391,8 @@ /* * Release an RPC call slot */ -void -xprt_release(struct rpc_task *task) +static void +__xprt_release(struct rpc_task *task, int congvalue) { struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req; @@ -1405,13 +1413,26 @@ req->rq_next = xprt->free; xprt->free = req; - /* Decrease congestion value. */ - xprt->cong -= RPC_CWNDSCALE; - - xprt_clear_backlog(xprt); + if (congvalue) { + /* Decrease congestion value. 
*/ + xprt->cong -= congvalue; + xprt_clear_backlog(xprt); + } spin_unlock_bh(&xprt->xprt_lock); } +void +xprt_release(struct rpc_task *task) +{ + __xprt_release(task, RPC_CWNDSCALE); +} + +void +xprt_ping_release(struct rpc_task *task) +{ + __xprt_release(task, 0); +} + /* * Set default timeout parameters */ @@ -1481,6 +1502,7 @@ xprt->pending = RPC_INIT_WAITQ("xprt_pending"); xprt->sending = RPC_INIT_WAITQ("xprt_sending"); xprt->backlog = RPC_INIT_WAITQ("xprt_backlog"); + xprt->pingwait= RPC_INIT_WAITQ("xprt_pingwait"); /* initialize free list */ for (i = 0, req = xprt->slot; i < RPC_MAXREQS-1; i++, req++) @@ -1616,6 +1638,7 @@ rpc_wake_up(&xprt->sending); rpc_wake_up(&xprt->pending); rpc_wake_up(&xprt->backlog); + rpc_wake_up(&xprt->pingwait); if (waitqueue_active(&xprt->cong_wait)) wake_up(&xprt->cong_wait); }