diff -u --recursive --new-file linux-2.4.14-ext3/fs/lockd/clntproc.c linux-2.4.14-jukebox/fs/lockd/clntproc.c --- linux-2.4.14-ext3/fs/lockd/clntproc.c Thu Oct 11 16:52:18 2001 +++ linux-2.4.14-jukebox/fs/lockd/clntproc.c Tue Nov 6 13:07:24 2001 @@ -569,11 +569,15 @@ printk(KERN_WARNING "lockd: unexpected unlock status: %d\n", status); die: + lock_kernel(); nlm_release_host(req->a_host); + unlock_kernel(); kfree(req); return; retry_rebind: + lock_kernel(); nlm_rebind_host(req->a_host); + unlock_kernel(); retry_unlock: rpc_restart_call(task); } @@ -650,12 +654,16 @@ } die: + lock_kernel(); nlm_release_host(req->a_host); + unlock_kernel(); kfree(req); return; retry_cancel: + lock_kernel(); nlm_rebind_host(req->a_host); + unlock_kernel(); rpc_restart_call(task); rpc_delay(task, 30 * HZ); } diff -u --recursive --new-file linux-2.4.14-ext3/fs/lockd/svc4proc.c linux-2.4.14-jukebox/fs/lockd/svc4proc.c --- linux-2.4.14-ext3/fs/lockd/svc4proc.c Mon Oct 1 22:45:47 2001 +++ linux-2.4.14-jukebox/fs/lockd/svc4proc.c Tue Nov 6 13:07:24 2001 @@ -17,6 +17,7 @@ #include #include #include +#include #define NLMDBG_FACILITY NLMDBG_CLIENT @@ -499,7 +500,9 @@ dprintk("lockd: %4d callback failed (errno = %d)\n", task->tk_pid, -task->tk_status); } + lock_kernel(); nlm_release_host(call->a_host); + unlock_kernel(); kfree(call); } diff -u --recursive --new-file linux-2.4.14-ext3/fs/lockd/svclock.c linux-2.4.14-jukebox/fs/lockd/svclock.c --- linux-2.4.14-ext3/fs/lockd/svclock.c Thu Oct 11 16:52:18 2001 +++ linux-2.4.14-jukebox/fs/lockd/svclock.c Tue Nov 6 13:07:24 2001 @@ -576,9 +576,10 @@ dprintk("lockd: GRANT_MSG RPC callback\n"); dprintk("callback: looking for cookie %x \n", *(unsigned int *)(call->a_args.cookie.data)); + lock_kernel(); if (!(block = nlmsvc_find_block(&call->a_args.cookie))) { dprintk("lockd: no block for cookie %x\n", *(u32 *)(call->a_args.cookie.data)); - return; + goto out; } /* Technically, we should down the file semaphore here. 
Since we @@ -599,6 +600,8 @@ block->b_incall = 0; nlm_release_host(call->a_host); + out: + unlock_kernel(); } /* diff -u --recursive --new-file linux-2.4.14-ext3/fs/lockd/svcproc.c linux-2.4.14-jukebox/fs/lockd/svcproc.c --- linux-2.4.14-ext3/fs/lockd/svcproc.c Thu Oct 11 16:52:18 2001 +++ linux-2.4.14-jukebox/fs/lockd/svcproc.c Tue Nov 6 13:07:24 2001 @@ -18,6 +18,7 @@ #include #include #include +#include #define NLMDBG_FACILITY NLMDBG_CLIENT @@ -527,7 +528,9 @@ dprintk("lockd: %4d callback failed (errno = %d)\n", task->tk_pid, -task->tk_status); } + lock_kernel(); nlm_release_host(call->a_host); + unlock_kernel(); kfree(call); } diff -u --recursive --new-file linux-2.4.14-ext3/fs/namei.c linux-2.4.14-jukebox/fs/namei.c --- linux-2.4.14-ext3/fs/namei.c Wed Oct 17 23:46:29 2001 +++ linux-2.4.14-jukebox/fs/namei.c Tue Nov 6 13:03:45 2001 @@ -454,7 +454,7 @@ while (*name=='/') name++; if (!*name) - goto return_base; + goto return_reval; inode = nd->dentry->d_inode; if (current->link_count) @@ -573,7 +573,7 @@ inode = nd->dentry->d_inode; /* fallthrough */ case 1: - goto return_base; + goto return_reval; } if (nd->dentry->d_op && nd->dentry->d_op->d_hash) { err = nd->dentry->d_op->d_hash(nd->dentry, &this); @@ -624,6 +624,17 @@ nd->last_type = LAST_DOT; else if (this.len == 2 && this.name[1] == '.') nd->last_type = LAST_DOTDOT; +return_reval: + /* + * We bypassed the ordinary revalidation routines, so + * NFS wants to check the cached inode for staleness. 
+ */ + inode = nd->dentry->d_inode; + if (inode && inode->i_op && inode->i_op->check_stale) { + err = inode->i_op->check_stale(inode); + if (err) + break; + } return_base: return 0; out_dput: diff -u --recursive --new-file linux-2.4.14-ext3/fs/nfs/Makefile linux-2.4.14-jukebox/fs/nfs/Makefile --- linux-2.4.14-ext3/fs/nfs/Makefile Fri Dec 29 23:07:23 2000 +++ linux-2.4.14-jukebox/fs/nfs/Makefile Tue Nov 6 13:05:49 2001 @@ -9,8 +9,8 @@ O_TARGET := nfs.o -obj-y := inode.o file.o read.o write.o dir.o symlink.o proc.o \ - nfs2xdr.o flushd.o unlink.o +obj-y := dir.o file.o flushd.o inode.o nfs2xdr.o pagelist.o proc.o \ + read.o symlink.o unlink.o write.o obj-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o obj-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o diff -u --recursive --new-file linux-2.4.14-ext3/fs/nfs/dir.c linux-2.4.14-jukebox/fs/nfs/dir.c --- linux-2.4.14-ext3/fs/nfs/dir.c Tue Jun 12 20:15:08 2001 +++ linux-2.4.14-jukebox/fs/nfs/dir.c Tue Nov 6 13:04:28 2001 @@ -34,8 +34,11 @@ #define NFS_PARANOIA 1 /* #define NFS_DEBUG_VERBOSE 1 */ +static loff_t nfs_dir_llseek(struct file *, loff_t, int); static int nfs_readdir(struct file *, void *, filldir_t); static struct dentry *nfs_lookup(struct inode *, struct dentry *); +static int nfs_cached_lookup(struct inode *, struct dentry *, + struct nfs_fh *, struct nfs_fattr *); static int nfs_create(struct inode *, struct dentry *, int); static int nfs_mkdir(struct inode *, struct dentry *, int); static int nfs_rmdir(struct inode *, struct dentry *); @@ -47,6 +50,7 @@ struct inode *, struct dentry *); struct file_operations nfs_dir_operations = { + llseek: nfs_dir_llseek, read: generic_read_dir, readdir: nfs_readdir, open: nfs_open, @@ -66,8 +70,28 @@ permission: nfs_permission, revalidate: nfs_revalidate, setattr: nfs_notify_change, + check_stale: nfs_check_stale, }; +static loff_t nfs_dir_llseek(struct file *file, loff_t offset, int origin) +{ + switch (origin) { + case 1: + if (offset == 0) { + offset = file->f_pos; + break; + } 
+ case 2: + return -EINVAL; + } + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_reada = 0; + file->f_version = ++event; + } + return (offset <= 0) ? 0 : offset; +} + typedef u32 * (*decode_dirent_t)(u32 *, struct nfs_entry *, int); typedef struct { struct file *file; @@ -108,13 +132,15 @@ error = NFS_PROTO(inode)->readdir(inode, cred, desc->entry->cookie, buffer, NFS_SERVER(inode)->dtsize, desc->plus); /* We requested READDIRPLUS, but the server doesn't grok it */ - if (desc->plus && error == -ENOTSUPP) { - NFS_FLAGS(inode) &= ~NFS_INO_ADVISE_RDPLUS; - desc->plus = 0; - goto again; - } - if (error < 0) + if (error < 0) { + if (error == -ENOTSUPP && desc->plus) { + NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; + NFS_FLAGS(inode) &= ~NFS_INO_ADVISE_RDPLUS; + desc->plus = 0; + goto again; + } goto error; + } SetPageUptodate(page); kunmap(page); /* Ensure consistent page alignment of the data. @@ -195,7 +221,6 @@ dfprintk(VFS, "NFS: find_dirent_page() searching directory page %ld\n", desc->page_index); - desc->plus = NFS_USE_READDIRPLUS(inode); page = read_cache_page(&inode->i_data, desc->page_index, (filler_t *)nfs_readdir_filler, desc); if (IS_ERR(page)) { @@ -247,6 +272,24 @@ return res; } +static unsigned int nfs_type2dtype[] = { + DT_UNKNOWN, + DT_REG, + DT_DIR, + DT_BLK, + DT_CHR, + DT_LNK, + DT_SOCK, + DT_UNKNOWN, + DT_FIFO +}; + +static inline +unsigned int nfs_type_to_d_type(enum nfs_ftype type) +{ + return nfs_type2dtype[type]; +} + /* * Once we've found the start of the dirent within a page: fill 'er up... 
*/ @@ -263,11 +306,17 @@ dfprintk(VFS, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n", (long long)desc->target); for(;;) { + unsigned d_type = DT_UNKNOWN; /* Note: entry->prev_cookie contains the cookie for * retrieving the current dirent on the server */ fileid = nfs_fileid_to_ino_t(entry->ino); + + /* Use readdirplus info */ + if (desc->plus && (entry->fattr->valid & NFS_ATTR_FATTR)) + d_type = nfs_type_to_d_type(entry->fattr->type); + res = filldir(dirent, entry->name, entry->len, - entry->prev_cookie, fileid, DT_UNKNOWN); + entry->prev_cookie, fileid, d_type); if (res < 0) break; file->f_pos = desc->target = entry->cookie; @@ -334,7 +383,8 @@ /* Reset read descriptor so it searches the page cache from * the start upon the next call to readdir_search_pagecache() */ desc->page_index = 0; - memset(desc->entry, 0, sizeof(*desc->entry)); + desc->entry->cookie = desc->entry->prev_cookie = 0; + desc->entry->eof = 0; out: dfprintk(VFS, "NFS: uncached_readdir() returns %d\n", status); return status; @@ -353,9 +403,11 @@ nfs_readdir_descriptor_t my_desc, *desc = &my_desc; struct nfs_entry my_entry; + struct nfs_fh fh; + struct nfs_fattr fattr; long res; - res = nfs_revalidate(dentry); + res = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (res < 0) return res; @@ -366,12 +418,16 @@ * itself. 
*/ memset(desc, 0, sizeof(*desc)); - memset(&my_entry, 0, sizeof(my_entry)); - desc->file = filp; desc->target = filp->f_pos; - desc->entry = &my_entry; desc->decode = NFS_PROTO(inode)->decode_dirent; + desc->plus = NFS_USE_READDIRPLUS(inode); + + my_entry.cookie = my_entry.prev_cookie = 0; + my_entry.eof = 0; + my_entry.fh = &fh; + my_entry.fattr = &fattr; + desc->entry = &my_entry; while(!desc->entry->eof) { res = readdir_search_pagecache(desc); @@ -401,6 +457,32 @@ return 0; } +static inline +void nfs_renew_verifier(struct inode *dir, struct dentry *dentry) +{ + u64 mtime = NFS_CACHE_MTIME(dir); + dentry->d_rtime_sec = mtime >> 32; + dentry->d_rtime_nsec = mtime & 0xffffffffUL; +} + +/* + * A check for whether or not the parent directory has changed. + * In the case it has, we assume that the dentries are untrustworthy + * and may need to be looked up again. + */ +static inline +int nfs_check_verifier(struct inode *dir, struct dentry *dentry) +{ + u64 mtime; + if (IS_ROOT(dentry)) + return 1; + if (nfs_revalidate_inode(NFS_SERVER(dir), dir)) + return 0; + mtime = NFS_CACHE_MTIME(dir); + return (dentry->d_rtime_sec == (mtime >> 32)) && + (dentry->d_rtime_nsec == (mtime & 0xffffffffUL)); +} + /* * Whenever an NFS operation succeeds, we know that the dentry * is valid, so we update the revalidation timestamp. @@ -408,50 +490,34 @@ static inline void nfs_renew_times(struct dentry * dentry) { dentry->d_time = jiffies; + nfs_renew_verifier(dentry->d_parent->d_inode, dentry); } -static inline int nfs_dentry_force_reval(struct dentry *dentry, int flags) +static inline +int nfs_lookup_verify_inode(struct inode *inode, int flags) { - struct inode *inode = dentry->d_inode; - unsigned long timeout = NFS_ATTRTIMEO(inode); - + struct nfs_server *server = NFS_SERVER(inode); /* - * If it's the last lookup in a series, we use a stricter - * cache consistency check by looking at the parent mtime. - * - * If it's been modified in the last hour, be really strict. 
- * (This still means that we can avoid doing unnecessary - * work on directories like /usr/share/bin etc which basically - * never change). + * If we're interested in close-to-open cache consistency, + * then we revalidate the inode upon lookup. */ - if (!(flags & LOOKUP_CONTINUE)) { - long diff = CURRENT_TIME - dentry->d_parent->d_inode->i_mtime; - - if (diff < 15*60) - timeout = 0; - } - - return time_after(jiffies,dentry->d_time + timeout); + if (!(server->flags & NFS_MOUNT_NOCTO) && !(flags & LOOKUP_CONTINUE)) + NFS_CACHEINV(inode); + return nfs_revalidate_inode(server, inode); } /* * We judge how long we want to trust negative * dentries by looking at the parent inode mtime. * - * If mtime is close to present time, we revalidate - * more often. + * If parent mtime has changed, we revalidate, else we wait for a + * period corresponding to the parent's attribute cache timeout value. */ -#define NFS_REVALIDATE_NEGATIVE (1 * HZ) -static inline int nfs_neg_need_reval(struct dentry *dentry) +static inline int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry) { - struct inode *dir = dentry->d_parent->d_inode; - unsigned long timeout = NFS_ATTRTIMEO(dir); - long diff = CURRENT_TIME - dir->i_mtime; - - if (diff < 5*60 && timeout > NFS_REVALIDATE_NEGATIVE) - timeout = NFS_REVALIDATE_NEGATIVE; - - return time_after(jiffies, dentry->d_time + timeout); + if (!nfs_check_verifier(dir, dentry)) + return 1; + return time_after(jiffies, dentry->d_time + NFS_ATTRTIMEO(dir)); } /* @@ -462,9 +528,8 @@ * NOTE! The hit can be a negative hit too, don't assume * we have an inode! * - * If the dentry is older than the revalidation interval, - * we do a new lookup and verify that the dentry is still - * correct. + * If the parent directory is seen to have changed, we throw out the + * cached dentry and do a new lookup. 
*/ static int nfs_lookup_revalidate(struct dentry * dentry, int flags) { @@ -477,13 +542,9 @@ lock_kernel(); dir = dentry->d_parent->d_inode; inode = dentry->d_inode; - /* - * If we don't have an inode, let's look at the parent - * directory mtime to get a hint about how often we - * should validate things.. - */ + if (!inode) { - if (nfs_neg_need_reval(dentry)) + if (nfs_neg_need_reval(dir, dentry)) goto out_bad; goto out_valid; } @@ -494,48 +555,49 @@ goto out_bad; } - if (!nfs_dentry_force_reval(dentry, flags)) + /* Force a full look up iff the parent directory has changed */ + if (nfs_check_verifier(dir, dentry)) { + if (nfs_lookup_verify_inode(inode, flags)) + goto out_bad; goto out_valid; + } - if (IS_ROOT(dentry)) { - __nfs_revalidate_inode(NFS_SERVER(inode), inode); + error = nfs_cached_lookup(dir, dentry, &fhandle, &fattr); + if (!error) { + if (memcmp(NFS_FH(inode), &fhandle, sizeof(struct nfs_fh))!= 0) + goto out_bad; + if (nfs_lookup_verify_inode(inode, flags)) + goto out_bad; goto out_valid_renew; } - /* - * Do a new lookup and check the dentry attributes. - */ + if (NFS_STALE(inode)) + goto out_bad; + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); if (error) goto out_bad; - - /* Inode number matches? */ - if (!(fattr.valid & NFS_ATTR_FATTR) || - NFS_FSID(inode) != fattr.fsid || - NFS_FILEID(inode) != fattr.fileid) + if (memcmp(NFS_FH(inode), &fhandle, sizeof(struct nfs_fh))!= 0) goto out_bad; - - /* Ok, remember that we successfully checked it.. */ - nfs_refresh_inode(inode, &fattr); - - if (nfs_inode_is_stale(inode, &fhandle, &fattr)) + if ((error = nfs_refresh_inode(inode, &fattr)) != 0) goto out_bad; out_valid_renew: nfs_renew_times(dentry); -out_valid: + out_valid: unlock_kernel(); return 1; -out_bad: - shrink_dcache_parent(dentry); - /* If we have submounts, don't unhash ! */ - if (have_submounts(dentry)) - goto out_valid; - d_drop(dentry); - /* Purge readdir caches. 
*/ - nfs_zap_caches(dir); - if (inode && S_ISDIR(inode->i_mode)) + out_bad: + NFS_CACHEINV(dir); + if (inode && S_ISDIR(inode->i_mode)) { + /* Purge readdir caches. */ nfs_zap_caches(inode); + /* If we have submounts, don't unhash ! */ + if (have_submounts(dentry)) + goto out_valid; + shrink_dcache_parent(dentry); + } + d_drop(dentry); unlock_kernel(); return 0; } @@ -594,6 +656,20 @@ error = -ENOMEM; dentry->d_op = &nfs_dentry_operations; + error = nfs_cached_lookup(dir, dentry, &fhandle, &fattr); + if (!error) { + error = -EACCES; + inode = nfs_fhget(dentry, &fhandle, &fattr); + if (inode) { + if (!(NFS_SERVER(dir)->flags & NFS_MOUNT_NOCTO)) + NFS_CACHEINV(inode); + d_add(dentry, inode); + nfs_renew_times(dentry); + error = 0; + } + goto out; + } + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); inode = NULL; if (error == -ENOENT) @@ -604,14 +680,85 @@ if (inode) { no_entry: d_add(dentry, inode); - nfs_renew_times(dentry); error = 0; } + nfs_renew_times(dentry); } out: return ERR_PTR(error); } +static inline +int find_dirent_name(nfs_readdir_descriptor_t *desc, struct page *page, struct dentry *dentry) +{ + struct nfs_entry *entry = desc->entry; + int status; + + while((status = dir_decode(desc)) == 0) { + if (entry->len != dentry->d_name.len) + continue; + if (memcmp(entry->name, dentry->d_name.name, entry->len)) + continue; + if (!(entry->fattr->valid & NFS_ATTR_FATTR)) + continue; + break; + } + return status; +} + +/* + * Use the cached Readdirplus results in order to avoid a LOOKUP call + * whenever we believe that the parent directory has not changed. + * + * We assume that any file creation/rename changes the directory mtime. + * As this results in a page cache invalidation whenever it occurs, + * we don't require any other tests for cache coherency. 
+ */ +static +int nfs_cached_lookup(struct inode *dir, struct dentry *dentry, + struct nfs_fh *fh, struct nfs_fattr *fattr) +{ + nfs_readdir_descriptor_t desc; + struct nfs_server *server; + struct nfs_entry entry; + struct page *page; + int res; + + if (!NFS_USE_READDIRPLUS(dir)) + return -ENOENT; + server = NFS_SERVER(dir); + if (server->flags & NFS_MOUNT_NOAC) + return -ENOENT; + nfs_revalidate_inode(server, dir); + + entry.fh = fh; + entry.fattr = fattr; + + desc.decode = NFS_PROTO(dir)->decode_dirent; + desc.entry = &entry; + desc.page_index = 0; + desc.plus = 1; + + for(;(page = find_get_page(&dir->i_data, desc.page_index)); desc.page_index++) { + + res = -EIO; + if (Page_Uptodate(page)) { + desc.ptr = kmap(page); + res = find_dirent_name(&desc, page, dentry); + kunmap(page); + } + page_cache_release(page); + + if (res == 0) + goto out_found; + if (res != -EAGAIN) + break; + } + return -ENOENT; + out_found: + return 0; +} + /* * Code common to create, mkdir, and mknod. */ diff -u --recursive --new-file linux-2.4.14-ext3/fs/nfs/file.c linux-2.4.14-jukebox/fs/nfs/file.c --- linux-2.4.14-ext3/fs/nfs/file.c Sun Sep 23 18:48:01 2001 +++ linux-2.4.14-jukebox/fs/nfs/file.c Tue Nov 6 13:07:24 2001 @@ -99,7 +99,9 @@ dentry->d_parent->d_name.name, dentry->d_name.name, (unsigned long) count, (unsigned long) *ppos); + lock_kernel(); result = nfs_revalidate_inode(NFS_SERVER(inode), inode); + unlock_kernel(); if (!result) result = generic_file_read(file, buf, count, ppos); return result; @@ -115,7 +117,9 @@ dfprintk(VFS, "nfs: mmap(%s/%s)\n", dentry->d_parent->d_name.name, dentry->d_name.name); + lock_kernel(); status = nfs_revalidate_inode(NFS_SERVER(inode), inode); + unlock_kernel(); if (!status) status = generic_file_mmap(file, vma); return status; @@ -134,13 +138,11 @@ dfprintk(VFS, "nfs: fsync(%x/%ld)\n", inode->i_dev, inode->i_ino); - lock_kernel(); status = nfs_wb_file(inode, file); if (!status) { status = file->f_error; file->f_error = 0; } - unlock_kernel(); 
return status; } @@ -164,9 +166,7 @@ loff_t pos = ((loff_t)page->index<mapping->host; - lock_kernel(); status = nfs_updatepage(file, page, offset, to-offset); - unlock_kernel(); /* most likely it's already done. CHECKME */ if (pos > inode->i_size) inode->i_size = pos; @@ -224,7 +224,9 @@ result = -EBUSY; if (IS_SWAPFILE(inode)) goto out_swapfile; + lock_kernel(); result = nfs_revalidate_inode(NFS_SERVER(inode), inode); + unlock_kernel(); if (result) goto out; diff -u --recursive --new-file linux-2.4.14-ext3/fs/nfs/flushd.c linux-2.4.14-jukebox/fs/nfs/flushd.c --- linux-2.4.14-ext3/fs/nfs/flushd.c Wed Jun 27 23:02:29 2001 +++ linux-2.4.14-jukebox/fs/nfs/flushd.c Tue Nov 6 13:07:24 2001 @@ -38,9 +38,9 @@ #include #include +#include #include #include -#include /* * Various constants @@ -51,6 +51,19 @@ * This is the wait queue all cluster daemons sleep on */ static struct rpc_wait_queue flushd_queue = RPC_INIT_WAITQ("nfs_flushd"); +static spinlock_t nfs_flushd_lock = SPIN_LOCK_UNLOCKED; + +static inline void +nfs_lock_flushd(void) +{ + spin_lock(&nfs_flushd_lock); +} + +static inline void +nfs_unlock_flushd(void) +{ + spin_unlock(&nfs_flushd_lock); +} /* * Local function declarations. 
@@ -67,12 +80,11 @@ dprintk("NFS: writecache_init\n"); - lock_kernel(); - status = -ENOMEM; /* Create the RPC task */ if (!(task = rpc_new_task(server->client, NULL, RPC_TASK_ASYNC))) - goto out_unlock; + return -ENOMEM; + nfs_lock_flushd(); cache = server->rw_requests; status = 0; @@ -89,39 +101,37 @@ cache->auth = server->client->cl_auth; task->tk_action = nfs_flushd; task->tk_exit = nfs_flushd_exit; + nfs_unlock_flushd(); rpc_execute(task); - unlock_kernel(); return 0; out_unlock: - if (task) - rpc_release_task(task); - unlock_kernel(); - return status; + nfs_unlock_flushd(); + rpc_release_task(task); + return 0; } void nfs_reqlist_exit(struct nfs_server *server) { struct nfs_reqlist *cache; - lock_kernel(); + nfs_lock_flushd(); cache = server->rw_requests; if (!cache) goto out; dprintk("NFS: reqlist_exit (ptr %p rpc %p)\n", cache, cache->task); - while (cache->task || cache->inodes) { - if (!cache->task) { - nfs_reqlist_init(server); - } else { - cache->task->tk_status = -ENOMEM; - rpc_wake_up_task(cache->task); - } + while (cache->task) { + rpc_exit(cache->task, 0); + rpc_wake_up_task(cache->task); + nfs_unlock_flushd(); + interruptible_sleep_on_timeout(&cache->request_wait, 1 * HZ); + nfs_lock_flushd(); } out: - unlock_kernel(); + nfs_unlock_flushd(); } int nfs_reqlist_alloc(struct nfs_server *server) @@ -150,133 +160,49 @@ } } -void nfs_wake_flushd() -{ - rpc_wake_up_status(&flushd_queue, -ENOMEM); -} - -static void inode_append_flushd(struct inode *inode) -{ - struct nfs_reqlist *cache = NFS_REQUESTLIST(inode); - struct inode **q; - - if (NFS_FLAGS(inode) & NFS_INO_FLUSH) - goto out; - inode->u.nfs_i.hash_next = NULL; - - q = &cache->inodes; - while (*q) - q = &(*q)->u.nfs_i.hash_next; - *q = inode; - - /* Note: we increase the inode i_count in order to prevent - * it from disappearing when on the flush list - */ - NFS_FLAGS(inode) |= NFS_INO_FLUSH; - atomic_inc(&inode->i_count); -out:; -} - -/* Protect me using the BKL */ -void inode_remove_flushd(struct 
inode *inode) -{ - struct nfs_reqlist *cache = NFS_REQUESTLIST(inode); - struct inode **q; - - if (!(NFS_FLAGS(inode) & NFS_INO_FLUSH)) - return; - - q = &cache->inodes; - while (*q && *q != inode) - q = &(*q)->u.nfs_i.hash_next; - if (*q) { - *q = inode->u.nfs_i.hash_next; - NFS_FLAGS(inode) &= ~NFS_INO_FLUSH; - iput(inode); - } -} - -void inode_schedule_scan(struct inode *inode, unsigned long time) -{ - struct nfs_reqlist *cache = NFS_REQUESTLIST(inode); - struct rpc_task *task; - unsigned long mintimeout; - - lock_kernel(); - if (time_after(NFS_NEXTSCAN(inode), time)) - NFS_NEXTSCAN(inode) = time; - mintimeout = jiffies + 1 * HZ; - if (time_before(mintimeout, NFS_NEXTSCAN(inode))) - mintimeout = NFS_NEXTSCAN(inode); - inode_append_flushd(inode); - - task = cache->task; - if (!task) { - nfs_reqlist_init(NFS_SERVER(inode)); - } else { - if (time_after(cache->runat, mintimeout)) - rpc_wake_up_task(task); - } - unlock_kernel(); -} - - +#define NFS_FLUSHD_TIMEOUT (30*HZ) static void nfs_flushd(struct rpc_task *task) { struct nfs_server *server; struct nfs_reqlist *cache; - struct inode *inode, *next; - unsigned long delay = jiffies + NFS_WRITEBACK_LOCKDELAY; - int flush = (task->tk_status == -ENOMEM); + LIST_HEAD(head); dprintk("NFS: %4d flushd starting\n", task->tk_pid); server = (struct nfs_server *) task->tk_calldata; cache = server->rw_requests; - next = cache->inodes; - cache->inodes = NULL; - - while ((inode = next) != NULL) { - next = next->u.nfs_i.hash_next; - inode->u.nfs_i.hash_next = NULL; - NFS_FLAGS(inode) &= ~NFS_INO_FLUSH; - - if (flush) { - nfs_pagein_inode(inode, 0, 0); - nfs_sync_file(inode, NULL, 0, 0, FLUSH_AGING); - } else if (time_after(jiffies, NFS_NEXTSCAN(inode))) { - NFS_NEXTSCAN(inode) = jiffies + NFS_WRITEBACK_LOCKDELAY; - nfs_pagein_timeout(inode); - nfs_flush_timeout(inode, FLUSH_AGING); -#ifdef CONFIG_NFS_V3 - nfs_commit_timeout(inode, FLUSH_AGING); -#endif + for(;;) { + spin_lock(&nfs_wreq_lock); + if 
(nfs_scan_lru_dirty_timeout(server, &head)) { + spin_unlock(&nfs_wreq_lock); + nfs_flush_list(&head, server->wpages, FLUSH_AGING); + continue; } - - if (nfs_have_writebacks(inode) || nfs_have_read(inode)) { - inode_append_flushd(inode); - if (time_after(delay, NFS_NEXTSCAN(inode))) - delay = NFS_NEXTSCAN(inode); + if (nfs_scan_lru_read_timeout(server, &head)) { + spin_unlock(&nfs_wreq_lock); + nfs_pagein_list(&head, server->rpages); + continue; + } +#ifdef CONFIG_NFS_V3 + if (nfs_scan_lru_commit_timeout(server, &head)) { + spin_unlock(&nfs_wreq_lock); + nfs_commit_list(&head, FLUSH_AGING); + continue; } - iput(inode); +#endif + spin_unlock(&nfs_wreq_lock); + break; } dprintk("NFS: %4d flushd back to sleep\n", task->tk_pid); - if (time_after(jiffies + 1 * HZ, delay)) - delay = 1 * HZ; - else - delay = delay - jiffies; - task->tk_status = 0; - task->tk_action = nfs_flushd; - task->tk_timeout = delay; - cache->runat = jiffies + task->tk_timeout; - - if (!atomic_read(&cache->nr_requests) && !cache->inodes) { - cache->task = NULL; - task->tk_action = NULL; - } else + nfs_lock_flushd(); + if (task->tk_action) { + task->tk_timeout = NFS_FLUSHD_TIMEOUT; + cache->runat = jiffies + task->tk_timeout; rpc_sleep_on(&flushd_queue, task, NULL, NULL); + } + nfs_unlock_flushd(); } static void @@ -285,10 +211,13 @@ struct nfs_server *server; struct nfs_reqlist *cache; server = (struct nfs_server *) task->tk_calldata; + + nfs_lock_flushd(); cache = server->rw_requests; if (cache->task == task) cache->task = NULL; wake_up(&cache->request_wait); + nfs_unlock_flushd(); } diff -u --recursive --new-file linux-2.4.14-ext3/fs/nfs/inode.c linux-2.4.14-jukebox/fs/nfs/inode.c --- linux-2.4.14-ext3/fs/nfs/inode.c Mon Oct 1 22:45:47 2001 +++ linux-2.4.14-jukebox/fs/nfs/inode.c Tue Nov 6 13:07:24 2001 @@ -83,6 +83,9 @@ &nfs_rpcstat, }; +/* Spinlock to protect the NFS inode update */ +static spinlock_t nfs_inode_lock = SPIN_LOCK_UNLOCKED; + static inline unsigned long nfs_fattr_to_ino_t(struct 
nfs_fattr *fattr) { @@ -324,6 +327,10 @@ if (!server->hostname) goto out_unlock; strcpy(server->hostname, data->hostname); + INIT_LIST_HEAD(&server->lru_read); + INIT_LIST_HEAD(&server->lru_dirty); + INIT_LIST_HEAD(&server->lru_commit); + INIT_LIST_HEAD(&server->lru_busy); nfsv3_try_again: /* Check NFS protocol revision and initialize RPC op vector @@ -332,6 +339,7 @@ #ifdef CONFIG_NFS_V3 server->rpc_ops = &nfs_v3_clientops; version = 3; + server->caps |= NFS_CAP_READDIRPLUS; if (data->version < 4) { printk(KERN_NOTICE "NFS: NFSv3 not supported by mount program.\n"); goto out_unlock; @@ -551,18 +559,30 @@ } /* + * Reset the read time on the local caches + */ +void +nfs_invalidate_caches(struct inode *inode) +{ + spin_lock(&nfs_inode_lock); + NFS_READTIME(inode) = jiffies - NFS_MAXATTRTIMEO(inode) - 1; + spin_unlock(&nfs_inode_lock); +} + +/* * Invalidate the local caches */ void nfs_zap_caches(struct inode *inode) { - NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); - NFS_ATTRTIMEO_UPDATE(inode) = jiffies; - invalidate_inode_pages(inode); + spin_lock(&nfs_inode_lock); + NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); + NFS_ATTRTIMEO_UPDATE(inode) = jiffies; memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); - NFS_CACHEINV(inode); + NFS_READTIME(inode) = jiffies - NFS_MAXATTRTIMEO(inode) - 1; + spin_unlock(&nfs_inode_lock); } /* @@ -578,50 +598,49 @@ nfs_zap_caches(inode); } +/* Don't use READDIRPLUS on directories that we believe are too large */ +#define NFS_LIMIT_READDIRPLUS (8*PAGE_SIZE) + /* * Fill in inode information from the fattr. */ static void nfs_fill_inode(struct inode *inode, struct nfs_fh *fh, struct nfs_fattr *fattr) { - /* - * Check whether the mode has been set, as we only want to - * do this once. (We don't allow inodes to change types.) + NFS_FILEID(inode) = fattr->fileid; + NFS_FSID(inode) = fattr->fsid; + inode->i_mode = fattr->mode; + /* Why so? 
Because we want revalidate for devices/FIFOs, and + * that's precisely what we have in nfs_file_inode_operations. */ - if (inode->i_mode == 0) { - NFS_FILEID(inode) = fattr->fileid; - NFS_FSID(inode) = fattr->fsid; - inode->i_mode = fattr->mode; - /* Why so? Because we want revalidate for devices/FIFOs, and - * that's precisely what we have in nfs_file_inode_operations. - */ - inode->i_op = &nfs_file_inode_operations; - if (S_ISREG(inode->i_mode)) { - inode->i_fop = &nfs_file_operations; - inode->i_data.a_ops = &nfs_file_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &nfs_dir_inode_operations; - inode->i_fop = &nfs_dir_operations; - } else if (S_ISLNK(inode->i_mode)) - inode->i_op = &nfs_symlink_inode_operations; - else - init_special_inode(inode, inode->i_mode, fattr->rdev); - /* - * Preset the size and mtime, as there's no need - * to invalidate the caches. - */ - inode->i_size = nfs_size_to_loff_t(fattr->size); - inode->i_mtime = nfs_time_to_secs(fattr->mtime); - inode->i_atime = nfs_time_to_secs(fattr->atime); - inode->i_ctime = nfs_time_to_secs(fattr->ctime); - NFS_CACHE_CTIME(inode) = fattr->ctime; - NFS_CACHE_MTIME(inode) = fattr->mtime; - NFS_CACHE_ISIZE(inode) = fattr->size; - NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); - NFS_ATTRTIMEO_UPDATE(inode) = jiffies; - memcpy(&inode->u.nfs_i.fh, fh, sizeof(inode->u.nfs_i.fh)); - } - nfs_refresh_inode(inode, fattr); + inode->i_op = &nfs_file_inode_operations; + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &nfs_file_operations; + inode->i_data.a_ops = &nfs_file_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &nfs_dir_inode_operations; + inode->i_fop = &nfs_dir_operations; + if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) + && fattr->size <= NFS_LIMIT_READDIRPLUS) + NFS_FLAGS(inode) |= NFS_INO_ADVISE_RDPLUS; + } else if (S_ISLNK(inode->i_mode)) + inode->i_op = &nfs_symlink_inode_operations; + else + init_special_inode(inode, inode->i_mode, fattr->rdev); + /* + * Preset the size 
and mtime, as there's no need + * to invalidate the caches. + */ + inode->i_size = nfs_size_to_loff_t(fattr->size); + inode->i_mtime = nfs_time_to_secs(fattr->mtime); + inode->i_atime = nfs_time_to_secs(fattr->atime); + inode->i_ctime = nfs_time_to_secs(fattr->ctime); + NFS_CACHE_CTIME(inode) = fattr->ctime; + NFS_CACHE_MTIME(inode) = fattr->mtime; + NFS_CACHE_ISIZE(inode) = fattr->size; + NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); + NFS_ATTRTIMEO_UPDATE(inode) = jiffies; + memcpy(&inode->u.nfs_i.fh, fh, sizeof(inode->u.nfs_i.fh)); } struct nfs_find_desc { @@ -651,27 +670,6 @@ return 1; } -int -nfs_inode_is_stale(struct inode *inode, struct nfs_fh *fh, struct nfs_fattr *fattr) -{ - /* Empty inodes are not stale */ - if (!inode->i_mode) - return 0; - - if ((fattr->mode & S_IFMT) != (inode->i_mode & S_IFMT)) - return 1; - - if (is_bad_inode(inode) || NFS_STALE(inode)) - return 1; - - /* Has the filehandle changed? If so is the old one stale? */ - if (memcmp(&inode->u.nfs_i.fh, fh, sizeof(inode->u.nfs_i.fh)) != 0 && - __nfs_revalidate_inode(NFS_SERVER(inode),inode) == -ESTALE) - return 1; - - return 0; -} - /* * This is our own version of iget that looks up inodes by file handle * instead of inode number. We use this technique instead of using @@ -714,7 +712,19 @@ if (!(inode = iget4(sb, ino, nfs_find_actor, &desc))) goto out_no_inode; - nfs_fill_inode(inode, fh, fattr); + /* + * Check whether the mode has been set, as we only want to + * do this once. (We don't allow inodes to change types.) + */ + if (inode->i_mode == 0) { + nfs_fill_inode(inode, fh, fattr); + nfs_refresh_inode(inode, fattr); + + /* We don't trust READDIRPLUS attributes */ + if (fattr->valid & NFS_ATTR_RDPLUS) + NFS_CACHEINV(inode); + } else if (!(fattr->valid & NFS_ATTR_RDPLUS)) + nfs_refresh_inode(inode, fattr); dprintk("NFS: __nfs_fhget(%x/%Ld ct=%d)\n", inode->i_dev, (long long)NFS_FILEID(inode), atomic_read(&inode->i_count)); @@ -737,7 +747,7 @@ /* * Make sure the inode is up-to-date. 
*/ - error = nfs_revalidate(dentry); + error = nfs_revalidate_inode(NFS_SERVER(inode),inode); if (error) { #ifdef NFS_PARANOIA printk("nfs_notify_change: revalidate failed, error=%d\n", error); @@ -805,7 +815,26 @@ nfs_revalidate(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - return nfs_revalidate_inode(NFS_SERVER(inode), inode); + int status; + lock_kernel(); + status = nfs_revalidate_inode(NFS_SERVER(inode), inode); + unlock_kernel(); + return status; +} + +/* + * Another revalidation function: This one checks inodes for staleness + * when we've bypassed the ordinary dcache revalidation routines. + * e.g. open(".") + */ +int +nfs_check_stale(struct inode *inode) +{ + if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NOCTO)) + NFS_CACHEINV(inode); + if (NFS_STALE(inode)) + return -ESTALE; + return 0; } /* @@ -834,13 +863,11 @@ struct rpc_auth *auth; struct rpc_cred *cred; - lock_kernel(); auth = NFS_CLIENT(inode)->cl_auth; cred = rpcauth_lookupcred(auth, 0); filp->private_data = cred; if (filp->f_mode & FMODE_WRITE) nfs_set_mmcred(inode, cred); - unlock_kernel(); return 0; } @@ -848,11 +875,9 @@ { struct rpc_cred *cred; - lock_kernel(); cred = nfs_file_cred(filp); if (cred) put_rpccred(cred); - unlock_kernel(); return 0; } @@ -869,7 +894,6 @@ dfprintk(PAGECACHE, "NFS: revalidating (%x/%Ld)\n", inode->i_dev, (long long)NFS_FILEID(inode)); - lock_kernel(); if (!inode || is_bad_inode(inode)) goto out_nowait; if (NFS_STALE(inode) && inode != inode->i_sb->s_root->d_inode) @@ -912,7 +936,6 @@ NFS_FLAGS(inode) &= ~NFS_INO_REVALIDATING; wake_up(&inode->i_wait); out_nowait: - unlock_kernel(); return status; } @@ -958,6 +981,8 @@ new_size = fattr->size; new_isize = nfs_size_to_loff_t(fattr->size); + spin_lock(&nfs_inode_lock); + /* * Update the read time so we don't revalidate too often. 
*/ @@ -1040,6 +1065,7 @@ NFS_ATTRTIMEO(inode) = NFS_MAXATTRTIMEO(inode); NFS_ATTRTIMEO_UPDATE(inode) = jiffies; } + spin_unlock(&nfs_inode_lock); if (invalid) nfs_zap_caches(inode); @@ -1072,6 +1098,8 @@ extern void nfs_destroy_nfspagecache(void); extern int nfs_init_readpagecache(void); extern int nfs_destroy_readpagecache(void); +extern int nfs_init_writepagecache(void); +extern int nfs_destroy_writepagecache(void); /* * Initialize NFS @@ -1088,6 +1116,10 @@ if (err) return err; + err = nfs_init_writepagecache(); + if (err) + return err; + #ifdef CONFIG_PROC_FS rpc_proc_register(&nfs_rpcstat); #endif @@ -1096,6 +1128,7 @@ static void __exit exit_nfs_fs(void) { + nfs_destroy_writepagecache(); nfs_destroy_readpagecache(); nfs_destroy_nfspagecache(); #ifdef CONFIG_PROC_FS @@ -1107,6 +1140,7 @@ EXPORT_NO_SYMBOLS; /* Not quite true; I just maintain it */ MODULE_AUTHOR("Olaf Kirch "); +MODULE_LICENSE("GPL"); module_init(init_nfs_fs) module_exit(exit_nfs_fs) diff -u --recursive --new-file linux-2.4.14-ext3/fs/nfs/nfs2xdr.c linux-2.4.14-jukebox/fs/nfs/nfs2xdr.c --- linux-2.4.14-ext3/fs/nfs/nfs2xdr.c Tue Nov 6 12:59:12 2001 +++ linux-2.4.14-jukebox/fs/nfs/nfs2xdr.c Tue Nov 6 13:03:10 2001 @@ -419,7 +419,7 @@ bufsiz = bufsiz >> 2; p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(args->cookie); + *p++ = htonl(args->cookie & 0xFFFFFFFF); *p++ = htonl(bufsiz); /* see above */ req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); @@ -506,7 +506,7 @@ entry->name = (const char *) p; p += XDR_QUADLEN(entry->len); entry->prev_cookie = entry->cookie; - entry->cookie = ntohl(*p++); + entry->cookie = (s64)((off_t)ntohl(*p++)); entry->eof = !p[0] && p[1]; return p; diff -u --recursive --new-file linux-2.4.14-ext3/fs/nfs/nfs3proc.c linux-2.4.14-jukebox/fs/nfs/nfs3proc.c --- linux-2.4.14-ext3/fs/nfs/nfs3proc.c Mon Oct 1 22:45:37 2001 +++ linux-2.4.14-jukebox/fs/nfs/nfs3proc.c Wed Nov 7 00:20:06 2001 @@ -17,6 +17,37 @@ #define NFSDBG_FACILITY NFSDBG_PROC +/* A wrapper to handle the 
EJUKEBOX error message */ +static int +nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) +{ + sigset_t oldset; + int res; + rpc_clnt_sigmask(clnt, &oldset); + do { + res = rpc_call_sync(clnt, msg, flags); + if (res != -EJUKEBOX) + break; + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(NFS_JUKEBOX_RETRY_TIME); + res = -ERESTARTSYS; + } while (!signalled()); + rpc_clnt_sigunmask(clnt, &oldset); + return res; +} + +static inline int +nfs3_rpc_call_wrapper(struct rpc_clnt *clnt, u32 proc, void *argp, void *resp, int flags) +{ + struct rpc_message msg = { proc, argp, resp, NULL }; + return nfs3_rpc_wrapper(clnt, &msg, flags); +} + +#define rpc_call(clnt, proc, argp, resp, flags) \ + nfs3_rpc_call_wrapper(clnt, proc, argp, resp, flags) +#define rpc_call_sync(clnt, msg, flags) \ + nfs3_rpc_wrapper(clnt, msg, flags) + /* * Bare-bones access to getattr: this is for nfs_read_super. */ @@ -80,7 +111,8 @@ status = rpc_call(NFS_CLIENT(dir), NFS3PROC_GETATTR, fhandle, fattr, 0); dprintk("NFS reply lookup: %d\n", status); - nfs_refresh_inode(dir, &dir_attr); + if (status >= 0) + status = nfs_refresh_inode(dir, &dir_attr); return status; } @@ -477,6 +509,9 @@ if (status < 0) goto error; status = rpc_call(server->client, NFS3PROC_FSINFO, fhandle, info, 0); + if (status < 0) + goto error; + status = rpc_call(server->client, NFS3PROC_PATHCONF, fhandle, info, 0); error: dprintk("NFS reply statfs: %d\n", status); diff -u --recursive --new-file linux-2.4.14-ext3/fs/nfs/nfs3xdr.c linux-2.4.14-jukebox/fs/nfs/nfs3xdr.c --- linux-2.4.14-ext3/fs/nfs/nfs3xdr.c Tue Nov 6 12:59:12 2001 +++ linux-2.4.14-jukebox/fs/nfs/nfs3xdr.c Tue Nov 6 13:04:28 2001 @@ -523,6 +523,13 @@ return 0; } +/* Hack to sign-extend 32-bit cookies */ +static inline +u64 nfs_transform_cookie64(u64 cookie) +{ + return (cookie & 0x80000000) ?
(cookie ^ 0xFFFFFFFF00000000) : cookie; +} + /* * Encode arguments to readdir call */ @@ -533,7 +540,7 @@ int buflen, replen; p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_hyper(p, args->cookie); + p = xdr_encode_hyper(p, nfs_transform_cookie64(args->cookie)); *p++ = args->verf[0]; *p++ = args->verf[1]; if (args->plus) { @@ -644,6 +651,7 @@ nfs3_decode_dirent(u32 *p, struct nfs_entry *entry, int plus) { struct nfs_entry old = *entry; + u64 cookie; if (!*p++) { if (!*p) @@ -657,24 +665,25 @@ entry->name = (const char *) p; p += XDR_QUADLEN(entry->len); entry->prev_cookie = entry->cookie; - p = xdr_decode_hyper(p, &entry->cookie); + p = xdr_decode_hyper(p, &cookie); + entry->cookie = nfs_transform_cookie64(cookie); if (plus) { - p = xdr_decode_post_op_attr(p, &entry->fattr); + entry->fattr->valid = 0; + p = xdr_decode_post_op_attr(p, entry->fattr); + if (entry->fattr->valid != 0) + entry->fattr->valid |= NFS_ATTR_RDPLUS; /* In fact, a post_op_fh3: */ if (*p++) { - p = xdr_decode_fhandle(p, &entry->fh); + p = xdr_decode_fhandle(p, entry->fh); /* Ugh -- server reply was truncated */ if (p == NULL) { dprintk("NFS: FH truncated\n"); *entry = old; return ERR_PTR(-EAGAIN); } - } else { - /* If we don't get a file handle, the attrs - * aren't worth a lot. */ - entry->fattr.valid = 0; - } + } else + memset((u8*)(entry->fh), 0, sizeof(*entry->fh)); } entry->eof = !p[0] && p[1]; diff -u --recursive --new-file linux-2.4.14-ext3/fs/nfs/pagelist.c linux-2.4.14-jukebox/fs/nfs/pagelist.c --- linux-2.4.14-ext3/fs/nfs/pagelist.c Thu Jan 1 01:00:00 1970 +++ linux-2.4.14-jukebox/fs/nfs/pagelist.c Tue Nov 6 13:07:01 2001 @@ -0,0 +1,498 @@ +/* + * linux/fs/nfs/pagelist.c + * + * A set of helper functions for managing NFS read and write requests. + * The main purpose of these routines is to provide support for the + * coalescing of several requests into a single RPC call. 
+ * + * Copyright 2000, 2001 (c) Trond Myklebust + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NFS_PARANOIA 1 + +/* + * Spinlock + */ +spinlock_t nfs_wreq_lock = SPIN_LOCK_UNLOCKED; + +static kmem_cache_t *nfs_page_cachep; + +static inline struct nfs_page * +nfs_page_alloc(void) +{ + struct nfs_page *p; + p = kmem_cache_alloc(nfs_page_cachep, SLAB_NOFS); + if (p) { + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->wb_hash); + INIT_LIST_HEAD(&p->wb_list); + INIT_LIST_HEAD(&p->wb_lru); + init_waitqueue_head(&p->wb_wait); + } + return p; +} + +static inline void +nfs_page_free(struct nfs_page *p) +{ + kmem_cache_free(nfs_page_cachep, p); +} + +static int nfs_try_to_free_pages(struct nfs_server *); + +/** + * nfs_create_request - Create an NFS read/write request. + * @file: file that owns this request + * @inode: inode to which the request is attached + * @page: page to write + * @offset: starting offset within the page for the write + * @count: number of bytes to read/write + * + * The page must be locked by the caller. This makes sure we never + * create two different requests for the same page, and avoids + * a possible deadlock when we reach the hard limit on the number + * of dirty pages. + * User should ensure it is safe to sleep in this function. + */ +struct nfs_page * +nfs_create_request(struct file *file, struct inode *inode, + struct page *page, + unsigned int offset, unsigned int count) +{ + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_reqlist *cache = NFS_REQUESTLIST(inode); + struct nfs_page *req; + + /* Deal with hard limits. 
*/ + for (;;) { + /* Prevent races by incrementing *before* we test */ + atomic_inc(&cache->nr_requests); + + /* If we haven't reached the local hard limit yet, + * try to allocate the request struct */ + if (atomic_read(&cache->nr_requests) <= MAX_REQUEST_HARD) { + req = nfs_page_alloc(); + if (req != NULL) + break; + } + + atomic_dec(&cache->nr_requests); + + /* Try to free up at least one request in order to stay + * below the hard limit + */ + if (nfs_try_to_free_pages(server)) + continue; + if (signalled() && (server->flags & NFS_MOUNT_INTR)) + return ERR_PTR(-ERESTARTSYS); + current->policy = SCHED_YIELD; + schedule(); + } + + /* Initialize the request struct. Initially, we assume a + * long write-back delay. This will be adjusted in + * update_nfs_request below if the region is not locked. */ + req->wb_page = page; + page_cache_get(page); + req->wb_offset = offset; + req->wb_bytes = count; + + /* If we have a struct file, use its cached credentials */ + if (file) { + req->wb_file = file; + get_file(file); + req->wb_cred = nfs_file_cred(file); + } + req->wb_inode = inode; + req->wb_count = 1; + + return req; +} + + +/** + * nfs_release_request - Release the count on an NFS read/write request + * @req: request to release + * + * Release all resources associated with a write request after it + * has been committed to stable storage + * + * Note: Should never be called with the spinlock held! 
+ */ +void +nfs_release_request(struct nfs_page *req) +{ + struct inode *inode = req->wb_inode; + struct nfs_reqlist *cache = NFS_REQUESTLIST(inode); + + spin_lock(&nfs_wreq_lock); + if (--req->wb_count) { + spin_unlock(&nfs_wreq_lock); + return; + } + __nfs_del_lru(req); + spin_unlock(&nfs_wreq_lock); + atomic_dec(&cache->nr_requests); + +#ifdef NFS_PARANOIA + if (!list_empty(&req->wb_list)) + BUG(); + if (!list_empty(&req->wb_hash)) + BUG(); + if (NFS_WBACK_BUSY(req)) + BUG(); + if (atomic_read(&cache->nr_requests) < 0) + BUG(); +#endif + + /* Release struct file or cached credential */ + if (req->wb_file) + fput(req->wb_file); + else if (req->wb_cred) + put_rpccred(req->wb_cred); + page_cache_release(req->wb_page); + nfs_page_free(req); +} + +/** + * nfs_list_add_request - Insert a request into a sorted list + * @req: request + * @head: head of list into which to insert the request. + * + * Note that the wb_list is sorted by page index in order to facilitate + * coalescing of requests. + * We use an insertion sort that is optimized for the case of appended + * writes. + */ +void +nfs_list_add_request(struct nfs_page *req, struct list_head *head) +{ + struct list_head *pos; + unsigned long pg_idx = page_index(req->wb_page); + +#ifdef NFS_PARANOIA + if (!list_empty(&req->wb_list)) { + printk(KERN_ERR "NFS: Add to list failed!\n"); + BUG(); + } +#endif + list_for_each_prev(pos, head) { + struct nfs_page *p = nfs_list_entry(pos); + if (page_index(p->wb_page) < pg_idx) + break; + } + list_add(&req->wb_list, pos); + req->wb_list_head = head; +} + +/** + * nfs_wait_on_request - Wait for a request to complete. + * @req: request to wait upon. + * + * Interruptible by signals only if mounted with intr flag. + * The user is responsible for holding a count on the request. 
+ */ +int +nfs_wait_on_request(struct nfs_page *req) +{ + struct inode *inode = req->wb_inode; + struct rpc_clnt *clnt = NFS_CLIENT(inode); + + if (!NFS_WBACK_BUSY(req)) + return 0; + return nfs_wait_event(clnt, req->wb_wait, !NFS_WBACK_BUSY(req)); +} + +/** + * nfs_coalesce_requests - Split coalesced requests out from a list. + * @head: source list + * @dst: destination list + * @nmax: maximum number of requests to coalesce + * + * Moves a maximum of 'nmax' elements from one list to another. + * The elements are checked to ensure that they form a contiguous set + * of pages, and that they originated from the same file. + */ +int +nfs_coalesce_requests(struct list_head *head, struct list_head *dst, + unsigned int nmax) +{ + struct nfs_page *req = NULL; + unsigned int npages = 0; + + while (!list_empty(head)) { + struct nfs_page *prev = req; + + req = nfs_list_entry(head->next); + if (prev) { + if (req->wb_file != prev->wb_file) + break; + if (page_index(req->wb_page) != page_index(prev->wb_page)+1) + break; + + if (req->wb_offset != 0) + break; + } + nfs_list_remove_request(req); + nfs_list_add_request(req, dst); + npages++; + if (req->wb_offset + req->wb_bytes != PAGE_CACHE_SIZE) + break; + if (npages >= nmax) + break; + } + return npages; +} + +/* + * nfs_scan_forward - Coalesce more requests + * @req: First request to add + * @dst: destination list + * @nmax: maximum number of requests to coalesce + * + * Tries to coalesce more requests by traversing the request's wb_list. + * Moves the resulting list into dst. Requests are guaranteed to be + * contiguous, and to originate from the same file. 
+ */ +static int +nfs_scan_forward(struct nfs_page *req, struct list_head *dst, int nmax) +{ + struct nfs_server *server = NFS_SERVER(req->wb_inode); + struct list_head *pos, *head = req->wb_list_head; + struct file *file = req->wb_file; + unsigned long idx = page_index(req->wb_page) + 1; + int npages = 0; + + for (pos = req->wb_list.next; nfs_lock_request(req); pos = pos->next) { + nfs_list_remove_request(req); + nfs_list_add_request(req, dst); + __nfs_del_lru(req); + __nfs_add_lru(&server->lru_busy, req); + npages++; + if (npages == nmax) + break; + if (pos == head) + break; + if (req->wb_offset + req->wb_bytes != PAGE_CACHE_SIZE) + break; + req = nfs_list_entry(pos); + if (page_index(req->wb_page) != idx++) + break; + if (req->wb_offset != 0) + break; + if (req->wb_file != file) + break; + } + return npages; +} + +/** + * nfs_scan_lru - Scan one of the least recently used lists + * @head: One of the NFS superblock lru lists + * @dst: Destination list + * @nmax: maximum number of requests to coalesce + * + * Scans one of the NFS superblock lru lists for up to nmax requests + * and returns them on a list. The requests are all guaranteed to be + * contiguous, originating from the same inode and the same file. + */ +int +nfs_scan_lru(struct list_head *head, struct list_head *dst, int nmax) +{ + struct list_head *pos; + struct nfs_page *req; + int npages = 0; + + list_for_each(pos, head) { + req = nfs_lru_entry(pos); + npages = nfs_scan_forward(req, dst, nmax); + if (npages) + break; + } + return npages; +} + +/** + * nfs_scan_lru_timeout - Scan one of the superblock lru lists for timed out requests + * @head: One of the NFS superblock lru lists + * @dst: Destination list + * @nmax: maximum number of requests to coalesce + * + * Scans one of the NFS superblock lru lists for up to nmax requests + * and returns them on a list. The requests are all guaranteed to be + * contiguous, originating from the same inode and the same file.
+ * The first request on the destination list will be timed out, the + * others are not guaranteed to be so. + */ +int +nfs_scan_lru_timeout(struct list_head *head, struct list_head *dst, int nmax) +{ + struct list_head *pos; + struct nfs_page *req; + int npages = 0; + + list_for_each(pos, head) { + req = nfs_lru_entry(pos); + if (time_after(req->wb_timeout, jiffies)) + break; + npages = nfs_scan_forward(req, dst, nmax); + if (npages) + break; + } + return npages; +} + +/** + * nfs_scan_list - Scan a list for matching requests + * @head: One of the NFS inode request lists + * @dst: Destination list + * @file: if set, ensure we match requests from this file + * @idx_start: lower bound of page->index to scan + * @npages: idx_start + npages sets the upper bound to scan. + * + * Moves elements from one of the inode request lists. + * If the number of requests is set to 0, the entire address_space + * starting at index idx_start, is scanned. + * The requests are *not* checked to ensure that they form a contiguous set. 
+ * You must be holding the nfs_wreq_lock when calling this function + */ +int +nfs_scan_list(struct list_head *head, struct list_head *dst, + struct file *file, + unsigned long idx_start, unsigned int npages) +{ + struct list_head *pos, *tmp; + struct nfs_page *req; + unsigned long idx_end; + int res; + + res = 0; + if (npages == 0) + idx_end = ~0; + else + idx_end = idx_start + npages - 1; + + list_for_each_safe(pos, tmp, head) { + unsigned long pg_idx; + + req = nfs_list_entry(pos); + + if (file && req->wb_file != file) + continue; + + pg_idx = page_index(req->wb_page); + if (pg_idx < idx_start) + continue; + if (pg_idx > idx_end) + break; + + if (!nfs_lock_request(req)) + continue; + nfs_list_remove_request(req); + nfs_list_add_request(req, dst); + __nfs_del_lru(req); + __nfs_add_lru(&NFS_SERVER(req->wb_inode)->lru_busy, req); + res++; + } + return res; +} + +/* + * nfs_try_to_free_pages - Free up NFS read/write requests + * @server: The NFS superblock + * + * This function attempts to flush out NFS reads and writes in order + * to keep the hard limit on the total number of pending requests + * on a given NFS partition. + * Note: we first try to commit unstable writes, then flush out pending + * reads, then finally the dirty pages. + * The assumption is that this reflects the ordering from the fastest + * to the slowest method for reclaiming requests. + */ +static int +nfs_try_to_free_pages(struct nfs_server *server) +{ + LIST_HEAD(head); + struct nfs_page *req = NULL; + int nreq; + + for (;;) { + if (req) { + int status = nfs_wait_on_request(req); + nfs_release_request(req); + if (status) + break; + req = NULL; + } + nreq = atomic_read(&server->rw_requests->nr_requests); + if (nreq < MAX_REQUEST_HARD) + return 1; + spin_lock(&nfs_wreq_lock); + /* Are there any busy RPC calls that might free up requests? 
*/ + if (!list_empty(&server->lru_busy)) { + req = nfs_lru_entry(server->lru_busy.next); + req->wb_count++; + __nfs_del_lru(req); + spin_unlock(&nfs_wreq_lock); + continue; + } + +#ifdef CONFIG_NFS_V3 + /* Let's try to free up some completed NFSv3 unstable writes */ + nfs_scan_lru_commit(server, &head); + if (!list_empty(&head)) { + spin_unlock(&nfs_wreq_lock); + nfs_commit_list(&head, 0); + continue; + } +#endif + /* OK, so we try to free up some pending readaheads */ + nfs_scan_lru_read(server, &head); + if (!list_empty(&head)) { + spin_unlock(&nfs_wreq_lock); + nfs_pagein_list(&head, server->rpages); + continue; + } + /* Last resort: we try to flush out single requests */ + nfs_scan_lru_dirty(server, &head); + if (!list_empty(&head)) { + spin_unlock(&nfs_wreq_lock); + nfs_flush_list(&head, server->wpages, FLUSH_STABLE); + continue; + } + spin_unlock(&nfs_wreq_lock); + break; + } + /* We failed to free up requests */ + return 0; +} + +int nfs_init_nfspagecache(void) +{ + nfs_page_cachep = kmem_cache_create("nfs_page", + sizeof(struct nfs_page), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (nfs_page_cachep == NULL) + return -ENOMEM; + + return 0; +} + +void nfs_destroy_nfspagecache(void) +{ + if (kmem_cache_destroy(nfs_page_cachep)) + printk(KERN_INFO "nfs_page: not all structures were freed\n"); +} + diff -u --recursive --new-file linux-2.4.14-ext3/fs/nfs/read.c linux-2.4.14-jukebox/fs/nfs/read.c --- linux-2.4.14-ext3/fs/nfs/read.c Thu Oct 11 17:12:52 2001 +++ linux-2.4.14-jukebox/fs/nfs/read.c Wed Nov 7 00:12:44 2001 @@ -113,11 +113,9 @@ inode->i_dev, (long long)NFS_FILEID(inode), (long long)offset, rsize, buffer); - lock_kernel(); result = NFS_PROTO(inode)->read(inode, cred, &fattr, flags, offset, rsize, buffer, &eof); nfs_refresh_inode(inode, &fattr); - unlock_kernel(); /* * Even if we had a partial success we can't mark the page @@ -148,34 +146,6 @@ return result; } -static inline struct nfs_page * -_nfs_find_read(struct inode *inode, struct page *page) -{ - 
struct list_head *head, *next; - - head = &inode->u.nfs_i.read; - next = head->next; - while (next != head) { - struct nfs_page *req = nfs_list_entry(next); - next = next->next; - if (page_index(req->wb_page) != page_index(page)) - continue; - req->wb_count++; - return req; - } - return NULL; -} - -static struct nfs_page * -nfs_find_read(struct inode *inode, struct page *page) -{ - struct nfs_page *req; - spin_lock(&nfs_wreq_lock); - req = _nfs_find_read(inode, page); - spin_unlock(&nfs_wreq_lock); - return req; -} - /* * Add a request to the inode's asynchronous read list. */ @@ -185,61 +155,26 @@ struct inode *inode = req->wb_inode; spin_lock(&nfs_wreq_lock); - if (list_empty(&req->wb_list)) { - nfs_list_add_request(req, &inode->u.nfs_i.read); - inode->u.nfs_i.nread++; - } + nfs_list_add_request(req, &inode->u.nfs_i.read); + inode->u.nfs_i.nread++; + __nfs_add_lru(&NFS_SERVER(inode)->lru_read, req); spin_unlock(&nfs_wreq_lock); - /* - * NB: the call to inode_schedule_scan() must lie outside the - * spinlock since it can run flushd(). 
- */ - inode_schedule_scan(inode, req->wb_timeout); } static int nfs_readpage_async(struct file *file, struct inode *inode, struct page *page) { - struct nfs_page *req, *new = NULL; - int result; - - for (;;) { - result = 0; - if (Page_Uptodate(page)) - break; - - req = nfs_find_read(inode, page); - if (req) { - if (page != req->wb_page) { - nfs_release_request(req); - nfs_pagein_inode(inode, page_index(page), 0); - continue; - } - nfs_release_request(req); - break; - } - - if (new) { - nfs_lock_request(new); - new->wb_timeout = jiffies + NFS_READ_DELAY; - nfs_mark_request_read(new); - nfs_unlock_request(new); - new = NULL; - break; - } + struct nfs_page *new; - result = -ENOMEM; - new = nfs_create_request(file, inode, page, 0, PAGE_CACHE_SIZE); - if (!new) - break; - } + new = nfs_create_request(file, inode, page, 0, PAGE_CACHE_SIZE); + if (IS_ERR(new)) + return PTR_ERR(new); + nfs_mark_request_read(new); if (inode->u.nfs_i.nread >= NFS_SERVER(inode)->rpages || page_index(page) == (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) nfs_pagein_inode(inode, 0, 0); - if (new) - nfs_release_request(new); - return result; + return 0; } /* @@ -335,9 +270,7 @@ rpc_clnt_sigmask(clnt, &oldset); rpc_call_setup(task, &msg, 0); - lock_kernel(); rpc_execute(task); - unlock_kernel(); rpc_clnt_sigunmask(clnt, &oldset); return 0; out_bad: @@ -345,14 +278,13 @@ return -ENOMEM; } -static int -nfs_pagein_list(struct inode *inode, struct list_head *head) +int +nfs_pagein_list(struct list_head *head, int rpages) { LIST_HEAD(one_request); struct nfs_page *req; int error = 0; - unsigned int pages = 0, - rpages = NFS_SERVER(inode)->rpages; + unsigned int pages = 0; while (!list_empty(head)) { pages += nfs_coalesce_requests(head, &one_request, rpages); @@ -368,29 +300,70 @@ return error; } -static int -nfs_scan_read_timeout(struct inode *inode, struct list_head *dst) +/** + * nfs_scan_lru_read_timeout - Scan LRU list for timed out read requests + * @server: NFS superblock data + * 
@dst: destination list + * + * Moves a maximum of 'rpages' timed out requests from the NFS read LRU list. + * The elements are checked to ensure that they form a contiguous set + * of pages, and that they originated from the same file. + */ +int +nfs_scan_lru_read_timeout(struct nfs_server *server, struct list_head *dst) { - int pages; - spin_lock(&nfs_wreq_lock); - pages = nfs_scan_list_timeout(&inode->u.nfs_i.read, dst, inode); - inode->u.nfs_i.nread -= pages; - if ((inode->u.nfs_i.nread == 0) != list_empty(&inode->u.nfs_i.read)) - printk(KERN_ERR "NFS: desynchronized value of nfs_i.nread.\n"); - spin_unlock(&nfs_wreq_lock); - return pages; + struct inode *inode; + int npages; + + npages = nfs_scan_lru_timeout(&server->lru_read, dst, server->rpages); + if (npages) { + inode = nfs_list_entry(dst->next)->wb_inode; + inode->u.nfs_i.nread -= npages; + } + return npages; +} + +/** + * nfs_scan_lru_read - Scan LRU list for read requests + * @server: NFS superblock data + * @dst: destination list + * + * Moves a maximum of 'rpages' requests from the NFS read LRU list. + * The elements are checked to ensure that they form a contiguous set + * of pages, and that they originated from the same file. + */ +int +nfs_scan_lru_read(struct nfs_server *server, struct list_head *dst) +{ + struct inode *inode; + int npages; + + npages = nfs_scan_lru(&server->lru_read, dst, server->rpages); + if (npages) { + inode = nfs_list_entry(dst->next)->wb_inode; + inode->u.nfs_i.nread -= npages; + } + return npages; } +/* + * nfs_scan_read - Scan an inode for read requests + * @inode: NFS inode to scan + * @dst: destination list + * @idx_start: lower bound of page->index to scan + * @npages: idx_start + npages sets the upper bound to scan + * + * Moves requests from the inode's read list. + * The requests are *not* checked to ensure that they form a contiguous set. 
+ */ static int nfs_scan_read(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages) { int res; - spin_lock(&nfs_wreq_lock); res = nfs_scan_list(&inode->u.nfs_i.read, dst, NULL, idx_start, npages); inode->u.nfs_i.nread -= res; if ((inode->u.nfs_i.nread == 0) != list_empty(&inode->u.nfs_i.read)) printk(KERN_ERR "NFS: desynchronized value of nfs_i.nread.\n"); - spin_unlock(&nfs_wreq_lock); return res; } @@ -401,28 +374,16 @@ int res, error = 0; + spin_lock(&nfs_wreq_lock); res = nfs_scan_read(inode, &head, idx_start, npages); + spin_unlock(&nfs_wreq_lock); if (res) - error = nfs_pagein_list(inode, &head); + error = nfs_pagein_list(&head, NFS_SERVER(inode)->rpages); if (error < 0) return error; return res; } -int nfs_pagein_timeout(struct inode *inode) -{ - LIST_HEAD(head); - int pages, - error = 0; - - pages = nfs_scan_read_timeout(inode, &head); - if (pages) - error = nfs_pagein_list(inode, &head); - if (error < 0) - return error; - return pages; -} - /* * This is the callback from RPC telling us whether a reply was * received or some error occurred (timeout or socket shutdown). 
@@ -437,6 +398,9 @@ dprintk("NFS: %4d nfs_readpage_result, (status %d)\n", task->tk_pid, task->tk_status); + if (nfs_async_handle_jukebox(task)) + return; + nfs_refresh_inode(inode, &data->fattr); while (!list_empty(&data->pages)) { struct nfs_page *req = nfs_list_entry(data->pages.next); @@ -457,8 +421,8 @@ (long long)NFS_FILEID(req->wb_inode), req->wb_bytes, (long long)(page_offset(page) + req->wb_offset)); - nfs_unlock_request(req); nfs_release_request(req); + nfs_unlock_request(req); } } @@ -500,11 +464,10 @@ if (error) goto out_error; - error = -1; - if (!PageError(page) && NFS_SERVER(inode)->rsize >= PAGE_CACHE_SIZE) + if (!PageError(page) && NFS_SERVER(inode)->rsize >= PAGE_CACHE_SIZE) { error = nfs_readpage_async(file, inode, page); - if (error >= 0) goto out; + } error = nfs_readpage_sync(file, inode, page); if (error < 0 && IS_SWAPFILE(inode)) diff -u --recursive --new-file linux-2.4.14-ext3/fs/nfs/unlink.c linux-2.4.14-jukebox/fs/nfs/unlink.c --- linux-2.4.14-ext3/fs/nfs/unlink.c Thu Aug 16 18:39:37 2001 +++ linux-2.4.14-jukebox/fs/nfs/unlink.c Wed Nov 7 00:12:44 2001 @@ -123,6 +123,8 @@ struct dentry *dir = data->dir; struct inode *dir_i; + if (nfs_async_handle_jukebox(task)) + return; if (!dir) return; dir_i = dir->d_inode; diff -u --recursive --new-file linux-2.4.14-ext3/fs/nfs/write.c linux-2.4.14-jukebox/fs/nfs/write.c --- linux-2.4.14-ext3/fs/nfs/write.c Thu Oct 11 17:12:52 2001 +++ linux-2.4.14-jukebox/fs/nfs/write.c Wed Nov 7 00:12:44 2001 @@ -61,16 +61,9 @@ #include #include -#define NFS_PARANOIA 1 #define NFSDBG_FACILITY NFSDBG_PAGECACHE /* - * Spinlock - */ -spinlock_t nfs_wreq_lock = SPIN_LOCK_UNLOCKED; -static atomic_t nfs_nr_requests = ATOMIC_INIT(0); - -/* * Local structures * * This is the struct where the WRITE/COMMIT arguments go. 
@@ -103,27 +96,8 @@ # define IS_SWAPFILE(inode) (0) #endif -static kmem_cache_t *nfs_page_cachep; static kmem_cache_t *nfs_wdata_cachep; -static __inline__ struct nfs_page *nfs_page_alloc(void) -{ - struct nfs_page *p; - p = kmem_cache_alloc(nfs_page_cachep, SLAB_NOFS); - if (p) { - memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&p->wb_hash); - INIT_LIST_HEAD(&p->wb_list); - init_waitqueue_head(&p->wb_wait); - } - return p; -} - -static __inline__ void nfs_page_free(struct nfs_page *p) -{ - kmem_cache_free(nfs_page_cachep, p); -} - static __inline__ struct nfs_write_data *nfs_writedata_alloc(void) { struct nfs_write_data *p; @@ -151,16 +125,18 @@ * under NFSv2 when the NFSv3 attribute patch is included. * For the moment, we just call nfs_refresh_inode(). */ -static __inline__ int +static inline int nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr) { + int status; if ((fattr->valid & NFS_ATTR_FATTR) && !(fattr->valid & NFS_ATTR_WCC)) { fattr->pre_size = NFS_CACHE_ISIZE(inode); fattr->pre_mtime = NFS_CACHE_MTIME(inode); fattr->pre_ctime = NFS_CACHE_CTIME(inode); fattr->valid |= NFS_ATTR_WCC; } - return nfs_refresh_inode(inode, fattr); + status = nfs_refresh_inode(inode, fattr); + return status; } /* @@ -248,7 +224,6 @@ if (!req->wb_cred) req->wb_cred = get_rpccred(NFS_I(inode)->mm_cred); nfs_unlock_request(req); - nfs_release_request(req); nfs_strategy(inode); out: return status; @@ -287,7 +262,6 @@ if (page->index >= end_index+1 || !offset) goto out; do_it: - lock_kernel(); if (NFS_SERVER(inode)->wsize >= PAGE_CACHE_SIZE && !IS_SYNC(inode)) { err = nfs_writepage_async(NULL, inode, page, 0, offset); if (err >= 0) @@ -297,7 +271,6 @@ if (err == offset) err = 0; } - unlock_kernel(); out: UnlockPage(page); return err; @@ -332,18 +305,30 @@ /* * Insert a write request into an inode + * Note: we sort the list in order to be able to optimize nfs_find_request() + * & co. for the 'write append' case. 
For 2.5 we may want to consider + * some form of hashing so as to perform well on random writes. */ static inline void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) { + struct list_head *pos, *head; + unsigned long pg_idx = page_index(req->wb_page); + if (!list_empty(&req->wb_hash)) return; if (!NFS_WBACK_BUSY(req)) printk(KERN_ERR "NFS: unlocked request attempted hashed!\n"); - if (list_empty(&inode->u.nfs_i.writeback)) + head = &inode->u.nfs_i.writeback; + if (list_empty(head)) atomic_inc(&inode->i_count); + list_for_each_prev(pos, head) { + struct nfs_page *entry = nfs_inode_wb_entry(pos); + if (page_index(entry->wb_page) < pg_idx) + break; + } inode->u.nfs_i.npages++; - list_add(&req->wb_hash, &inode->u.nfs_i.writeback); + list_add(&req->wb_hash, pos); req->wb_count++; } @@ -367,11 +352,11 @@ inode->u.nfs_i.npages--; if ((inode->u.nfs_i.npages == 0) != list_empty(&inode->u.nfs_i.writeback)) printk(KERN_ERR "NFS: desynchronized value of nfs_i.npages.\n"); - if (list_empty(&inode->u.nfs_i.writeback)) + if (list_empty(&inode->u.nfs_i.writeback)) { + spin_unlock(&nfs_wreq_lock); iput(inode); - if (!nfs_have_writebacks(inode) && !nfs_have_read(inode)) - inode_remove_flushd(inode); - spin_unlock(&nfs_wreq_lock); + } else + spin_unlock(&nfs_wreq_lock); nfs_release_request(req); } @@ -381,15 +366,18 @@ static inline struct nfs_page * _nfs_find_request(struct inode *inode, struct page *page) { - struct list_head *head, *next; + struct list_head *head, *pos; + unsigned long pg_idx = page_index(page); head = &inode->u.nfs_i.writeback; - next = head->next; - while (next != head) { - struct nfs_page *req = nfs_inode_wb_entry(next); - next = next->next; - if (page_index(req->wb_page) != page_index(page)) + list_for_each_prev(pos, head) { + struct nfs_page *req = nfs_inode_wb_entry(pos); + unsigned long found_idx = page_index(req->wb_page); + + if (pg_idx < found_idx) continue; + if (pg_idx != found_idx) + break; req->wb_count++; return req; } @@ -408,44 
+396,6 @@ } /* - * Insert a write request into a sorted list - */ -void nfs_list_add_request(struct nfs_page *req, struct list_head *head) -{ - struct list_head *prev; - - if (!list_empty(&req->wb_list)) { - printk(KERN_ERR "NFS: Add to list failed!\n"); - return; - } - if (!NFS_WBACK_BUSY(req)) - printk(KERN_ERR "NFS: unlocked request attempted added to list!\n"); - prev = head->prev; - while (prev != head) { - struct nfs_page *p = nfs_list_entry(prev); - if (page_index(p->wb_page) < page_index(req->wb_page)) - break; - prev = prev->prev; - } - list_add(&req->wb_list, prev); - req->wb_list_head = head; -} - -/* - * Insert a write request into an inode - */ -void nfs_list_remove_request(struct nfs_page *req) -{ - if (list_empty(&req->wb_list)) - return; - if (!NFS_WBACK_BUSY(req)) - printk(KERN_ERR "NFS: unlocked request attempted removed from list!\n"); - list_del(&req->wb_list); - INIT_LIST_HEAD(&req->wb_list); - req->wb_list_head = NULL; -} - -/* * Add a request to the inode's dirty list. */ static inline void @@ -454,16 +404,11 @@ struct inode *inode = req->wb_inode; spin_lock(&nfs_wreq_lock); - if (list_empty(&req->wb_list)) { - nfs_list_add_request(req, &inode->u.nfs_i.dirty); - inode->u.nfs_i.ndirty++; - } + nfs_list_add_request(req, &inode->u.nfs_i.dirty); + inode->u.nfs_i.ndirty++; + __nfs_del_lru(req); + __nfs_add_lru(&NFS_SERVER(inode)->lru_dirty, req); spin_unlock(&nfs_wreq_lock); - /* - * NB: the call to inode_schedule_scan() must lie outside the - * spinlock since it can run flushd(). 
- */ - inode_schedule_scan(inode, req->wb_timeout); mark_inode_dirty(inode); } @@ -487,165 +432,16 @@ struct inode *inode = req->wb_inode; spin_lock(&nfs_wreq_lock); - if (list_empty(&req->wb_list)) { - nfs_list_add_request(req, &inode->u.nfs_i.commit); - inode->u.nfs_i.ncommit++; - } + nfs_list_add_request(req, &inode->u.nfs_i.commit); + inode->u.nfs_i.ncommit++; + __nfs_del_lru(req); + __nfs_add_lru(&NFS_SERVER(inode)->lru_commit, req); spin_unlock(&nfs_wreq_lock); - /* - * NB: the call to inode_schedule_scan() must lie outside the - * spinlock since it can run flushd(). - */ - inode_schedule_scan(inode, req->wb_timeout); mark_inode_dirty(inode); } #endif /* - * Create a write request. - * Page must be locked by the caller. This makes sure we never create - * two different requests for the same page, and avoids possible deadlock - * when we reach the hard limit on the number of dirty pages. - * It should be safe to sleep here. - */ -struct nfs_page *nfs_create_request(struct file *file, struct inode *inode, - struct page *page, - unsigned int offset, unsigned int count) -{ - struct nfs_reqlist *cache = NFS_REQUESTLIST(inode); - struct nfs_page *req = NULL; - long timeout; - - /* Deal with hard/soft limits. - */ - do { - /* If we're over the global soft limit, wake up all requests */ - if (atomic_read(&nfs_nr_requests) >= MAX_REQUEST_SOFT) { - dprintk("NFS: hit soft limit (%d requests)\n", - atomic_read(&nfs_nr_requests)); - if (!cache->task) - nfs_reqlist_init(NFS_SERVER(inode)); - nfs_wake_flushd(); - } - - /* If we haven't reached the local hard limit yet, - * try to allocate the request struct */ - if (atomic_read(&cache->nr_requests) < MAX_REQUEST_HARD) { - req = nfs_page_alloc(); - if (req != NULL) - break; - } - - /* We're over the hard limit. 
Wait for better times */ - dprintk("NFS: create_request sleeping (total %d pid %d)\n", - atomic_read(&cache->nr_requests), current->pid); - - timeout = 1 * HZ; - if (NFS_SERVER(inode)->flags & NFS_MOUNT_INTR) { - interruptible_sleep_on_timeout(&cache->request_wait, - timeout); - if (signalled()) - break; - } else - sleep_on_timeout(&cache->request_wait, timeout); - - dprintk("NFS: create_request waking up (tot %d pid %d)\n", - atomic_read(&cache->nr_requests), current->pid); - } while (!req); - if (!req) - return NULL; - - /* Initialize the request struct. Initially, we assume a - * long write-back delay. This will be adjusted in - * update_nfs_request below if the region is not locked. */ - req->wb_page = page; - page_cache_get(page); - req->wb_offset = offset; - req->wb_bytes = count; - req->wb_file = file; - - /* If we have a struct file, use its cached credentials */ - if (file) { - get_file(file); - req->wb_cred = nfs_file_cred(file); - } - req->wb_inode = inode; - req->wb_count = 1; - - /* register request's existence */ - atomic_inc(&cache->nr_requests); - atomic_inc(&nfs_nr_requests); - return req; -} - - -/* - * Release all resources associated with a write request after it - * has been committed to stable storage - * - * Note: Should always be called with the spinlock held! 
- */ -void -nfs_release_request(struct nfs_page *req) -{ - struct inode *inode = req->wb_inode; - struct nfs_reqlist *cache = NFS_REQUESTLIST(inode); - struct page *page = req->wb_page; - - spin_lock(&nfs_wreq_lock); - if (--req->wb_count) { - spin_unlock(&nfs_wreq_lock); - return; - } - spin_unlock(&nfs_wreq_lock); - - if (!list_empty(&req->wb_list)) { - printk(KERN_ERR "NFS: Request released while still on a list!\n"); - nfs_list_remove_request(req); - } - if (!list_empty(&req->wb_hash)) { - printk(KERN_ERR "NFS: Request released while still hashed!\n"); - nfs_inode_remove_request(req); - } - if (NFS_WBACK_BUSY(req)) - printk(KERN_ERR "NFS: Request released while still locked!\n"); - - /* Release struct file or cached credential */ - if (req->wb_file) - fput(req->wb_file); - else if (req->wb_cred) - put_rpccred(req->wb_cred); - page_cache_release(page); - nfs_page_free(req); - /* wake up anyone waiting to allocate a request */ - atomic_dec(&cache->nr_requests); - atomic_dec(&nfs_nr_requests); - wake_up(&cache->request_wait); -#ifdef NFS_PARANOIA - if (atomic_read(&cache->nr_requests) < 0) - BUG(); - if (atomic_read(&nfs_nr_requests) < 0) - BUG(); -#endif -} - -/* - * Wait for a request to complete. - * - * Interruptible by signals only if mounted with intr flag. - */ -static int -nfs_wait_on_request(struct nfs_page *req) -{ - struct inode *inode = req->wb_inode; - struct rpc_clnt *clnt = NFS_CLIENT(inode); - - if (!NFS_WBACK_BUSY(req)) - return 0; - return nfs_wait_event(clnt, req->wb_wait, !NFS_WBACK_BUSY(req)); -} - -/* * Wait for a request to complete. * * Interruptible by signals only if mounted with intr flag. 
@@ -663,20 +459,20 @@ else idx_end = idx_start + npages - 1; - spin_lock(&nfs_wreq_lock); head = &inode->u.nfs_i.writeback; - p = head->next; - while (p != head) { + restart: + spin_lock(&nfs_wreq_lock); + list_for_each_prev(p, head) { unsigned long pg_idx; struct nfs_page *req = nfs_inode_wb_entry(p); - p = p->next; - if (file && req->wb_file != file) continue; pg_idx = page_index(req->wb_page); - if (pg_idx < idx_start || pg_idx > idx_end) + if (pg_idx < idx_start) + break; + if (pg_idx > idx_end) continue; if (!NFS_WBACK_BUSY(req)) @@ -687,163 +483,159 @@ nfs_release_request(req); if (error < 0) return error; - spin_lock(&nfs_wreq_lock); - p = head->next; res++; + goto restart; } spin_unlock(&nfs_wreq_lock); return res; } -/* - * Scan cluster for dirty pages and send as many of them to the - * server as possible. +/** + * nfs_scan_lru_dirty_timeout - Scan LRU list for timed out dirty requests + * @server: NFS superblock data + * @dst: destination list + * + * Moves a maximum of 'wpages' requests from the NFS dirty page LRU list. + * The elements are checked to ensure that they form a contiguous set + * of pages, and that they originated from the same file. 
*/ -int nfs_scan_list_timeout(struct list_head *head, struct list_head *dst, struct inode *inode) -{ - struct list_head *p; - struct nfs_page *req; - int pages = 0; - - p = head->next; - while (p != head) { - req = nfs_list_entry(p); - p = p->next; - if (time_after(req->wb_timeout, jiffies)) { - if (time_after(NFS_NEXTSCAN(inode), req->wb_timeout)) - NFS_NEXTSCAN(inode) = req->wb_timeout; - continue; - } - if (!nfs_lock_request(req)) - continue; - nfs_list_remove_request(req); - nfs_list_add_request(req, dst); - pages++; - } - return pages; -} - -static int -nfs_scan_dirty_timeout(struct inode *inode, struct list_head *dst) +int +nfs_scan_lru_dirty_timeout(struct nfs_server *server, struct list_head *dst) { - int pages; - spin_lock(&nfs_wreq_lock); - pages = nfs_scan_list_timeout(&inode->u.nfs_i.dirty, dst, inode); - inode->u.nfs_i.ndirty -= pages; - if ((inode->u.nfs_i.ndirty == 0) != list_empty(&inode->u.nfs_i.dirty)) - printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n"); - spin_unlock(&nfs_wreq_lock); - return pages; -} + struct inode *inode; + int npages; -#ifdef CONFIG_NFS_V3 -static int -nfs_scan_commit_timeout(struct inode *inode, struct list_head *dst) -{ - int pages; - spin_lock(&nfs_wreq_lock); - pages = nfs_scan_list_timeout(&inode->u.nfs_i.commit, dst, inode); - inode->u.nfs_i.ncommit -= pages; - if ((inode->u.nfs_i.ncommit == 0) != list_empty(&inode->u.nfs_i.commit)) - printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n"); - spin_unlock(&nfs_wreq_lock); - return pages; + npages = nfs_scan_lru_timeout(&server->lru_dirty, dst, server->wpages); + if (npages) { + inode = nfs_list_entry(dst->next)->wb_inode; + inode->u.nfs_i.ndirty -= npages; + } + return npages; } -#endif -int nfs_scan_list(struct list_head *src, struct list_head *dst, struct file *file, unsigned long idx_start, unsigned int npages) +/** + * nfs_scan_lru_dirty - Scan LRU list for dirty requests + * @server: NFS superblock data + * @dst: destination list + * + * Moves 
a maximum of 'wpages' requests from the NFS dirty page LRU list. + * The elements are checked to ensure that they form a contiguous set + * of pages, and that they originated from the same file. + */ +int +nfs_scan_lru_dirty(struct nfs_server *server, struct list_head *dst) { - struct list_head *p; - struct nfs_page *req; - unsigned long idx_end; - int res; - - res = 0; - if (npages == 0) - idx_end = ~0; - else - idx_end = idx_start + npages - 1; - p = src->next; - while (p != src) { - unsigned long pg_idx; - - req = nfs_list_entry(p); - p = p->next; - - if (file && req->wb_file != file) - continue; - - pg_idx = page_index(req->wb_page); - if (pg_idx < idx_start || pg_idx > idx_end) - continue; + struct inode *inode; + int npages; - if (!nfs_lock_request(req)) - continue; - nfs_list_remove_request(req); - nfs_list_add_request(req, dst); - res++; + npages = nfs_scan_lru(&server->lru_dirty, dst, server->wpages); + if (npages) { + inode = nfs_list_entry(dst->next)->wb_inode; + inode->u.nfs_i.ndirty -= npages; } - return res; + return npages; } +/* + * nfs_scan_dirty - Scan an inode for dirty requests + * @inode: NFS inode to scan + * @dst: destination list + * @file: if set, ensure we match requests from this file + * @idx_start: lower bound of page->index to scan. + * @npages: idx_start + npages sets the upper bound to scan. + * + * Moves requests from the inode's dirty page list. + * The requests are *not* checked to ensure that they form a contiguous set. 
+ */ static int nfs_scan_dirty(struct inode *inode, struct list_head *dst, struct file *file, unsigned long idx_start, unsigned int npages) { int res; - spin_lock(&nfs_wreq_lock); res = nfs_scan_list(&inode->u.nfs_i.dirty, dst, file, idx_start, npages); inode->u.nfs_i.ndirty -= res; if ((inode->u.nfs_i.ndirty == 0) != list_empty(&inode->u.nfs_i.dirty)) printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n"); - spin_unlock(&nfs_wreq_lock); return res; } #ifdef CONFIG_NFS_V3 +/** + * nfs_scan_lru_commit_timeout - Scan LRU list for timed out commit requests + * @server: NFS superblock data + * @dst: destination list + * + * Finds the first timed out request in the NFS commit LRU list and moves it + * to the list dst. If such an element is found, we move all other commit + * requests that apply to the same inode. + * The assumption is that doing everything in a single commit-to-disk is + * the cheaper alternative. + */ +int +nfs_scan_lru_commit_timeout(struct nfs_server *server, struct list_head *dst) +{ + struct inode *inode; + int npages; + + npages = nfs_scan_lru_timeout(&server->lru_commit, dst, 1); + if (npages) { + inode = nfs_list_entry(dst->next)->wb_inode; + npages += nfs_scan_list(&inode->u.nfs_i.commit, dst, NULL, 0, 0); + inode->u.nfs_i.ncommit -= npages; + } + return npages; +} + + +/** + * nfs_scan_lru_commit - Scan LRU list for commit requests + * @server: NFS superblock data + * @dst: destination list + * + * Finds the first request in the NFS commit LRU list and moves it + * to the list dst. If such an element is found, we move all other commit + * requests that apply to the same inode. + * The assumption is that doing everything in a single commit-to-disk is + * the cheaper alternative.
+ */ +int +nfs_scan_lru_commit(struct nfs_server *server, struct list_head *dst) +{ + struct inode *inode; + int npages; + + npages = nfs_scan_lru(&server->lru_commit, dst, 1); + if (npages) { + inode = nfs_list_entry(dst->next)->wb_inode; + npages += nfs_scan_list(&inode->u.nfs_i.commit, dst, NULL, 0, 0); + inode->u.nfs_i.ncommit -= npages; + } + return npages; +} + +/* + * nfs_scan_commit - Scan an inode for commit requests + * @inode: NFS inode to scan + * @dst: destination list + * @file: if set, ensure we collect requests from this file only. + * @idx_start: lower bound of page->index to scan. + * @npages: idx_start + npages sets the upper bound to scan. + * + * Moves requests from the inode's 'commit' request list. + * The requests are *not* checked to ensure that they form a contiguous set. + */ static int nfs_scan_commit(struct inode *inode, struct list_head *dst, struct file *file, unsigned long idx_start, unsigned int npages) { int res; - spin_lock(&nfs_wreq_lock); res = nfs_scan_list(&inode->u.nfs_i.commit, dst, file, idx_start, npages); inode->u.nfs_i.ncommit -= res; if ((inode->u.nfs_i.ncommit == 0) != list_empty(&inode->u.nfs_i.commit)) printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n"); - spin_unlock(&nfs_wreq_lock); return res; } #endif -int nfs_coalesce_requests(struct list_head *src, struct list_head *dst, unsigned int maxpages) -{ - struct nfs_page *req = NULL; - unsigned int pages = 0; - - while (!list_empty(src)) { - struct nfs_page *prev = req; - - req = nfs_list_entry(src->next); - if (prev) { - if (req->wb_file != prev->wb_file) - break; - if (page_index(req->wb_page) != page_index(prev->wb_page)+1) - break; - - if (req->wb_offset != 0) - break; - } - nfs_list_remove_request(req); - nfs_list_add_request(req, dst); - pages++; - if (req->wb_offset + req->wb_bytes != PAGE_CACHE_SIZE) - break; - if (pages >= maxpages) - break; - } - return pages; -} - /* * Try to update any existing write request, or create one if there is none. 
* In order to match, the request's credentials must match those of @@ -867,7 +659,7 @@ spin_lock(&nfs_wreq_lock); req = _nfs_find_request(inode, page); if (req) { - if (!nfs_lock_request(req)) { + if (!nfs_lock_request_dontget(req)) { int error; spin_unlock(&nfs_wreq_lock); error = nfs_wait_on_request(req); @@ -882,24 +674,18 @@ break; } - req = new; - if (req) { - nfs_lock_request(req); - nfs_inode_add_request(inode, req); + if (new) { + nfs_lock_request_dontget(new); + nfs_inode_add_request(inode, new); spin_unlock(&nfs_wreq_lock); - nfs_mark_request_dirty(req); - break; + nfs_mark_request_dirty(new); + return new; } spin_unlock(&nfs_wreq_lock); - /* - * If we're over the soft limit, flush out old requests - */ - if (inode->u.nfs_i.npages >= MAX_REQUEST_SOFT) - nfs_wb_file(inode, file); new = nfs_create_request(file, inode, page, offset, bytes); - if (!new) - return ERR_PTR(-ENOMEM); + if (IS_ERR(new)) + return new; /* If the region is locked, adjust the timeout */ if (region_locked(inode, new)) new->wb_timeout = jiffies + NFS_WRITEBACK_LOCKDELAY; @@ -919,7 +705,6 @@ || !nfs_dirty_request(req) || offset > rqend || end < req->wb_offset) { nfs_unlock_request(req); - nfs_release_request(req); return ERR_PTR(-EBUSY); } @@ -967,23 +752,12 @@ if (NFS_PROTO(inode)->version == 2) { if (dirty >= NFS_STRATEGY_PAGES * wpages) nfs_flush_file(inode, NULL, 0, 0, 0); - } else { - if (dirty >= wpages) - nfs_flush_file(inode, NULL, 0, 0, 0); - if (inode->u.nfs_i.ncommit > NFS_STRATEGY_PAGES * wpages && - atomic_read(&nfs_nr_requests) > MAX_REQUEST_SOFT) - nfs_commit_file(inode, NULL, 0, 0, 0); - } + } else if (dirty >= wpages) + nfs_flush_file(inode, NULL, 0, 0, 0); #else if (dirty >= NFS_STRATEGY_PAGES * wpages) nfs_flush_file(inode, NULL, 0, 0, 0); #endif - /* - * If we're running out of free requests, flush out everything - * in order to reduce memory useage... 
- */ - if (inode->u.nfs_i.npages > MAX_REQUEST_SOFT) - nfs_wb_all(inode); } int @@ -1052,16 +826,16 @@ goto done; status = 0; - nfs_unlock_request(req); /* If we wrote past the end of the page. * Call the strategy routine so it can send out a bunch * of requests. */ if (req->wb_offset == 0 && req->wb_bytes == PAGE_CACHE_SIZE) { SetPageUptodate(page); + nfs_unlock_request(req); nfs_strategy(inode); - } - nfs_release_request(req); + } else + nfs_unlock_request(req); done: dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n", status, (long long)inode->i_size); @@ -1123,6 +897,7 @@ struct rpc_task *task; struct rpc_message msg; int flags, + nfsvers = NFS_PROTO(inode)->version, async = !(how & FLUSH_SYNC), stable = (how & FLUSH_STABLE); sigset_t oldset; @@ -1138,7 +913,9 @@ /* Set up the argument struct */ nfs_write_rpcsetup(head, data); - if (stable) { + if (nfsvers < 3) + data->args.stable = NFS_FILE_SYNC; + else if (stable) { if (!inode->u.nfs_i.ncommit) data->args.stable = NFS_FILE_SYNC; else @@ -1153,7 +930,7 @@ task->tk_release = nfs_writedata_release; #ifdef CONFIG_NFS_V3 - msg.rpc_proc = (NFS_PROTO(inode)->version == 3) ? NFS3PROC_WRITE : NFSPROC_WRITE; + msg.rpc_proc = (nfsvers == 3) ? 
NFS3PROC_WRITE : NFSPROC_WRITE; #else msg.rpc_proc = NFSPROC_WRITE; #endif @@ -1169,9 +946,7 @@ rpc_clnt_sigmask(clnt, &oldset); rpc_call_setup(task, &msg, 0); - lock_kernel(); rpc_execute(task); - unlock_kernel(); rpc_clnt_sigunmask(clnt, &oldset); return 0; out_bad: @@ -1184,14 +959,13 @@ return -ENOMEM; } -static int -nfs_flush_list(struct inode *inode, struct list_head *head, int how) +int +nfs_flush_list(struct list_head *head, int wpages, int how) { LIST_HEAD(one_request); struct nfs_page *req; int error = 0; - unsigned int pages = 0, - wpages = NFS_SERVER(inode)->wpages; + unsigned int pages = 0; while (!list_empty(head)) { pages += nfs_coalesce_requests(head, &one_request, wpages); @@ -1229,6 +1003,9 @@ dprintk("NFS: %4d nfs_writeback_done (status %d)\n", task->tk_pid, task->tk_status); + if (nfs_async_handle_jukebox(task)) + return; + /* We can't handle that yet but we check for it nevertheless */ if (resp->count < argp->count && task->tk_status >= 0) { static unsigned long complain; @@ -1294,7 +1071,7 @@ } #ifdef CONFIG_NFS_V3 - if (resp->verf->committed != NFS_UNSTABLE) { + if (argp->stable != NFS_UNSTABLE || resp->verf->committed == NFS_FILE_SYNC) { nfs_inode_remove_request(req); dprintk(" OK\n"); goto next; @@ -1355,7 +1132,7 @@ /* * Commit dirty pages */ -static int +int nfs_commit_list(struct list_head *head, int how) { struct rpc_message msg; @@ -1393,9 +1170,7 @@ dprintk("NFS: %4d initiated commit call\n", task->tk_pid); rpc_clnt_sigmask(clnt, &oldset); rpc_call_setup(task, &msg, 0); - lock_kernel(); rpc_execute(task); - unlock_kernel(); rpc_clnt_sigunmask(clnt, &oldset); return 0; out_bad: @@ -1422,6 +1197,9 @@ dprintk("NFS: %4d nfs_commit_done (status %d)\n", task->tk_pid, task->tk_status); + if (nfs_async_handle_jukebox(task)) + return; + nfs_write_attributes(inode, resp->fattr); while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); @@ -1464,28 +1242,16 @@ int res, error = 0; + spin_lock(&nfs_wreq_lock); res = 
nfs_scan_dirty(inode, &head, file, idx_start, npages); + spin_unlock(&nfs_wreq_lock); if (res) - error = nfs_flush_list(inode, &head, how); + error = nfs_flush_list(&head, NFS_SERVER(inode)->wpages, how); if (error < 0) return error; return res; } -int nfs_flush_timeout(struct inode *inode, int how) -{ - LIST_HEAD(head); - int pages, - error = 0; - - pages = nfs_scan_dirty_timeout(inode, &head); - if (pages) - error = nfs_flush_list(inode, &head, how); - if (error < 0) - return error; - return pages; -} - #ifdef CONFIG_NFS_V3 int nfs_commit_file(struct inode *inode, struct file *file, unsigned long idx_start, unsigned int npages, int how) @@ -1494,29 +1260,15 @@ int res, error = 0; + spin_lock(&nfs_wreq_lock); res = nfs_scan_commit(inode, &head, file, idx_start, npages); + spin_unlock(&nfs_wreq_lock); if (res) error = nfs_commit_list(&head, how); if (error < 0) return error; return res; } - -int nfs_commit_timeout(struct inode *inode, int how) -{ - LIST_HEAD(head); - int pages, - error = 0; - - pages = nfs_scan_commit_timeout(inode, &head); - if (pages) { - pages += nfs_scan_commit(inode, &head, NULL, 0, 0); - error = nfs_commit_list(&head, how); - } - if (error < 0) - return error; - return pages; -} #endif int nfs_sync_file(struct inode *inode, struct file *file, unsigned long idx_start, @@ -1545,15 +1297,8 @@ return error; } -int nfs_init_nfspagecache(void) +int nfs_init_writepagecache(void) { - nfs_page_cachep = kmem_cache_create("nfs_page", - sizeof(struct nfs_page), - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (nfs_page_cachep == NULL) - return -ENOMEM; - nfs_wdata_cachep = kmem_cache_create("nfs_write_data", sizeof(struct nfs_write_data), 0, SLAB_HWCACHE_ALIGN, @@ -1564,10 +1309,8 @@ return 0; } -void nfs_destroy_nfspagecache(void) +void nfs_destroy_writepagecache(void) { - if (kmem_cache_destroy(nfs_page_cachep)) - printk(KERN_INFO "nfs_page: not all structures were freed\n"); if (kmem_cache_destroy(nfs_wdata_cachep)) printk(KERN_INFO "nfs_write_data: not 
all structures were freed\n"); } diff -u --recursive --new-file linux-2.4.14-ext3/include/linux/dcache.h linux-2.4.14-jukebox/include/linux/dcache.h --- linux-2.4.14-ext3/include/linux/dcache.h Wed Oct 24 06:59:05 2001 +++ linux-2.4.14-jukebox/include/linux/dcache.h Tue Nov 6 13:20:33 2001 @@ -80,6 +80,8 @@ struct super_block * d_sb; /* The root of the dentry tree */ unsigned long d_vfs_flags; void * d_fsdata; /* fs-specific data */ + unsigned long d_rtime_sec; /* used by nfs d_revalidate */ + unsigned long d_rtime_nsec; /* used by nfs d_revalidate */ unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ }; diff -u --recursive --new-file linux-2.4.14-ext3/include/linux/fs.h linux-2.4.14-jukebox/include/linux/fs.h --- linux-2.4.14-ext3/include/linux/fs.h Tue Nov 6 13:00:28 2001 +++ linux-2.4.14-jukebox/include/linux/fs.h Tue Nov 6 13:20:33 2001 @@ -851,6 +851,7 @@ int (*revalidate) (struct dentry *); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct dentry *, struct iattr *); + int (*check_stale) (struct inode *); }; /* diff -u --recursive --new-file linux-2.4.14-ext3/include/linux/list.h linux-2.4.14-jukebox/include/linux/list.h --- linux-2.4.14-ext3/include/linux/list.h Wed Oct 24 06:59:06 2001 +++ linux-2.4.14-jukebox/include/linux/list.h Tue Nov 6 13:20:33 2001 @@ -162,6 +162,16 @@ for (pos = (head)->next, n = pos->next; pos != (head); \ pos = n, n = pos->next) +/** + * list_for_each_prev - iterate over a list in reverse order + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. 
+ */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \ + pos = pos->prev, prefetch(pos->prev)) + + #endif /* __KERNEL__ || _LVM_H_INCLUDE */ #endif diff -u --recursive --new-file linux-2.4.14-ext3/include/linux/nfs_flushd.h linux-2.4.14-jukebox/include/linux/nfs_flushd.h --- linux-2.4.14-ext3/include/linux/nfs_flushd.h Wed Oct 24 07:00:40 2001 +++ linux-2.4.14-jukebox/include/linux/nfs_flushd.h Tue Nov 6 13:44:26 2001 @@ -9,11 +9,9 @@ /* * Counters of total number and pending number of requests. - * When the total number of requests exceeds the soft limit, we start - * flushing out requests. If it exceeds the hard limit, we stall until - * it drops again. + * When the total number of requests exceeds the hard limit, we stall + * until it drops again. */ -#define MAX_REQUEST_SOFT 192 #define MAX_REQUEST_HARD 256 /* @@ -36,8 +34,6 @@ extern void nfs_reqlist_free(struct nfs_server *); extern int nfs_reqlist_init(struct nfs_server *); extern void nfs_reqlist_exit(struct nfs_server *); -extern void inode_schedule_scan(struct inode *, unsigned long); -extern void inode_remove_flushd(struct inode *); extern void nfs_wake_flushd(void); /* diff -u --recursive --new-file linux-2.4.14-ext3/include/linux/nfs_fs.h linux-2.4.14-jukebox/include/linux/nfs_fs.h --- linux-2.4.14-ext3/include/linux/nfs_fs.h Wed Oct 24 07:00:07 2001 +++ linux-2.4.14-jukebox/include/linux/nfs_fs.h Wed Nov 7 00:18:20 2001 @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -81,10 +82,7 @@ #define NFS_CACHE_MTIME(inode) ((inode)->u.nfs_i.read_cache_mtime) #define NFS_CACHE_ISIZE(inode) ((inode)->u.nfs_i.read_cache_isize) #define NFS_NEXTSCAN(inode) ((inode)->u.nfs_i.nextscan) -#define NFS_CACHEINV(inode) \ -do { \ - NFS_READTIME(inode) = jiffies - NFS_MAXATTRTIMEO(inode) - 1; \ -} while (0) +#define NFS_CACHEINV(inode) nfs_invalidate_caches(inode) #define NFS_ATTRTIMEO(inode) ((inode)->u.nfs_i.attrtimeo) #define 
NFS_MINATTRTIMEO(inode) \ (S_ISDIR(inode->i_mode)? NFS_SERVER(inode)->acdirmin \ @@ -101,8 +99,15 @@ #define NFS_FILEID(inode) ((inode)->u.nfs_i.fileid) #define NFS_FSID(inode) ((inode)->u.nfs_i.fsid) -/* Inode Flags */ -#define NFS_USE_READDIRPLUS(inode) ((NFS_FLAGS(inode) & NFS_INO_ADVISE_RDPLUS) ? 1 : 0) +static inline int nfs_server_capable(struct inode *inode, int cap) +{ + return NFS_SERVER(inode)->caps & cap; +} + +static inline int NFS_USE_READDIRPLUS(struct inode *inode) +{ + return NFS_FLAGS(inode) & NFS_INO_ADVISE_RDPLUS; +} /* * These are the default flags for swap requests @@ -141,6 +146,7 @@ * linux/fs/nfs/inode.c */ extern struct super_block *nfs_read_super(struct super_block *, void *, int); +extern void nfs_invalidate_caches(struct inode *); extern void nfs_zap_caches(struct inode *); extern int nfs_inode_is_stale(struct inode *, struct nfs_fh *, struct nfs_fattr *); @@ -152,6 +158,7 @@ extern int nfs_open(struct inode *, struct file *); extern int nfs_release(struct inode *, struct file *); extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); +extern int nfs_check_stale(struct inode *); extern int nfs_notify_change(struct dentry *, struct iattr *); /* @@ -207,10 +214,14 @@ */ extern int nfs_sync_file(struct inode *, struct file *, unsigned long, unsigned int, int); extern int nfs_flush_file(struct inode *, struct file *, unsigned long, unsigned int, int); -extern int nfs_flush_timeout(struct inode *, int); +extern int nfs_flush_list(struct list_head *, int, int); +extern int nfs_scan_lru_dirty(struct nfs_server *, struct list_head *); +extern int nfs_scan_lru_dirty_timeout(struct nfs_server *, struct list_head *); #ifdef CONFIG_NFS_V3 extern int nfs_commit_file(struct inode *, struct file *, unsigned long, unsigned int, int); -extern int nfs_commit_timeout(struct inode *, int); +extern int nfs_commit_list(struct list_head *, int); +extern int nfs_scan_lru_commit(struct nfs_server *, struct list_head *); +extern int 
nfs_scan_lru_commit_timeout(struct nfs_server *, struct list_head *); #endif static inline int @@ -257,7 +268,9 @@ */ extern int nfs_readpage(struct file *, struct page *); extern int nfs_pagein_inode(struct inode *, unsigned long, unsigned int); -extern int nfs_pagein_timeout(struct inode *); +extern int nfs_pagein_list(struct list_head *, int); +extern int nfs_scan_lru_read(struct nfs_server *, struct list_head *); +extern int nfs_scan_lru_read_timeout(struct nfs_server *, struct list_head *); /* * linux/fs/mount_clnt.c @@ -326,6 +339,29 @@ __retval; \ }) +#ifdef CONFIG_NFS_V3 + +#define NFS_JUKEBOX_RETRY_TIME (5 * HZ) +static inline int +nfs_async_handle_jukebox(struct rpc_task *task) +{ + if (task->tk_status != -EJUKEBOX) + return 0; + task->tk_status = 0; + rpc_restart_call(task); + rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); + return 1; +} + +#else + +static inline int +nfs_async_handle_jukebox(struct rpc_task *task) +{ + return 0; +} +#endif /* CONFIG_NFS_V3 */ + #endif /* __KERNEL__ */ /* diff -u --recursive --new-file linux-2.4.14-ext3/include/linux/nfs_fs_sb.h linux-2.4.14-jukebox/include/linux/nfs_fs_sb.h --- linux-2.4.14-ext3/include/linux/nfs_fs_sb.h Wed Apr 26 02:28:56 2000 +++ linux-2.4.14-jukebox/include/linux/nfs_fs_sb.h Tue Nov 6 13:20:33 2001 @@ -1,6 +1,8 @@ #ifndef _NFS_FS_SB #define _NFS_FS_SB +#include + /* * NFS client parameters stored in the superblock. 
*/ @@ -8,6 +10,7 @@ struct rpc_clnt * client; /* RPC client handle */ struct nfs_rpc_ops * rpc_ops; /* NFS protocol vector */ int flags; /* various flags */ + unsigned int caps; /* server capabilities */ unsigned int rsize; /* read size */ unsigned int rpages; /* read size (in pages) */ unsigned int wsize; /* write size */ @@ -21,6 +24,10 @@ unsigned int namelen; char * hostname; /* remote hostname */ struct nfs_reqlist * rw_requests; /* async read/write requests */ + struct list_head lru_read, + lru_dirty, + lru_commit, + lru_busy; }; /* @@ -30,4 +37,8 @@ struct nfs_server s_server; }; +/* Server capabilities */ +#define NFS_CAP_READDIRPLUS 1 + + #endif diff -u --recursive --new-file linux-2.4.14-ext3/include/linux/nfs_page.h linux-2.4.14-jukebox/include/linux/nfs_page.h --- linux-2.4.14-ext3/include/linux/nfs_page.h Wed Oct 24 07:00:45 2001 +++ linux-2.4.14-jukebox/include/linux/nfs_page.h Tue Nov 6 13:44:26 2001 @@ -23,6 +23,7 @@ struct nfs_page { struct list_head wb_hash, /* Inode */ + wb_lru, /* superblock lru list */ wb_list, /* Defines state of page: */ *wb_list_head; /* read/write/commit */ struct file *wb_file; @@ -40,33 +41,39 @@ #define NFS_WBACK_BUSY(req) (test_bit(PG_BUSY,&(req)->wb_flags)) -extern struct nfs_page *nfs_create_request(struct file *file, - struct inode *inode, - struct page *page, - unsigned int offset, - unsigned int count); +extern struct nfs_page *nfs_create_request(struct file *, struct inode *, + struct page *, + unsigned int, unsigned int); extern void nfs_release_request(struct nfs_page *req); -extern void nfs_list_add_request(struct nfs_page *req, - struct list_head *head); -extern void nfs_list_remove_request(struct nfs_page *req); - -extern int nfs_scan_list_timeout(struct list_head *head, - struct list_head *dst, - struct inode *inode); -extern int nfs_scan_list(struct list_head *src, struct list_head *dst, - struct file *file, unsigned long idx_start, - unsigned int npages); -extern int nfs_coalesce_requests(struct list_head 
*src, struct list_head *dst, - unsigned int maxpages); +extern void nfs_list_add_request(struct nfs_page *, struct list_head *); + +extern int nfs_scan_lru(struct list_head *, struct list_head *, int); +extern int nfs_scan_lru_timeout(struct list_head *, struct list_head *, int); +extern int nfs_scan_list(struct list_head *, struct list_head *, + struct file *, unsigned long, unsigned int); +extern int nfs_coalesce_requests(struct list_head *, struct list_head *, + unsigned int); +extern int nfs_wait_on_request(struct nfs_page *); extern spinlock_t nfs_wreq_lock; /* + * Lock the page of an asynchronous request without incrementing the wb_count + */ +static inline int +nfs_lock_request_dontget(struct nfs_page *req) +{ + if (test_and_set_bit(PG_BUSY, &req->wb_flags)) + return 0; + return 1; +} + +/* * Lock the page of an asynchronous request */ -static __inline__ int +static inline int nfs_lock_request(struct nfs_page *req) { if (test_and_set_bit(PG_BUSY, &req->wb_flags)) @@ -75,7 +82,7 @@ return 1; } -static __inline__ void +static inline void nfs_unlock_request(struct nfs_page *req) { if (!NFS_WBACK_BUSY(req)) { @@ -86,20 +93,57 @@ clear_bit(PG_BUSY, &req->wb_flags); smp_mb__after_clear_bit(); if (waitqueue_active(&req->wb_wait)) - wake_up(&req->wb_wait); + wake_up_all(&req->wb_wait); nfs_release_request(req); } -static __inline__ struct nfs_page * +/** + * nfs_list_remove_request - Remove a request from its wb_list + * @req: request + */ +static inline void +nfs_list_remove_request(struct nfs_page *req) +{ + if (list_empty(&req->wb_list)) + return; + if (!NFS_WBACK_BUSY(req)) { + printk(KERN_ERR "NFS: unlocked request attempted removed from list!\n"); + BUG(); + } + list_del_init(&req->wb_list); + req->wb_list_head = NULL; +} + +static inline struct nfs_page * nfs_list_entry(struct list_head *head) { return list_entry(head, struct nfs_page, wb_list); } -static __inline__ struct nfs_page * +static inline struct nfs_page * nfs_inode_wb_entry(struct list_head *head) 
{ return list_entry(head, struct nfs_page, wb_hash); } +static inline void +__nfs_add_lru(struct list_head *head, struct nfs_page *req) +{ + list_add_tail(&req->wb_lru, head); +} + +static inline void +__nfs_del_lru(struct nfs_page *req) +{ + if (list_empty(&req->wb_lru)) + return; + list_del_init(&req->wb_lru); +} + +static inline struct nfs_page * +nfs_lru_entry(struct list_head *head) +{ + return list_entry(head, struct nfs_page, wb_lru); +} + #endif /* _LINUX_NFS_PAGE_H */ diff -u --recursive --new-file linux-2.4.14-ext3/include/linux/nfs_xdr.h linux-2.4.14-jukebox/include/linux/nfs_xdr.h --- linux-2.4.14-ext3/include/linux/nfs_xdr.h Mon Jan 29 21:07:43 2001 +++ linux-2.4.14-jukebox/include/linux/nfs_xdr.h Tue Nov 6 13:04:28 2001 @@ -35,6 +35,7 @@ #define NFS_ATTR_WCC 0x0001 /* pre-op WCC data */ #define NFS_ATTR_FATTR 0x0002 /* post-op attributes */ #define NFS_ATTR_FATTR_V3 0x0004 /* NFSv3 attributes */ +#define NFS_ATTR_RDPLUS 0x0008 /* Made in readdirplus */ /* * Info on the file system @@ -112,8 +113,8 @@ const char * name; unsigned int len; int eof; - struct nfs_fh fh; - struct nfs_fattr fattr; + struct nfs_fh *fh; + struct nfs_fattr *fattr; }; /* diff -u --recursive --new-file linux-2.4.14-ext3/include/linux/sunrpc/clnt.h linux-2.4.14-jukebox/include/linux/sunrpc/clnt.h --- linux-2.4.14-ext3/include/linux/sunrpc/clnt.h Wed Oct 24 06:59:54 2001 +++ linux-2.4.14-jukebox/include/linux/sunrpc/clnt.h Tue Nov 6 13:43:43 2001 @@ -111,6 +111,8 @@ void rpc_release_client(struct rpc_clnt *); void rpc_getport(struct rpc_task *, struct rpc_clnt *); int rpc_register(u32, u32, int, unsigned short, int *); +u32 * rpc_call_header(struct rpc_task *task); +u32 * rpc_call_verify(struct rpc_task *task); void rpc_call_setup(struct rpc_task *, struct rpc_message *, int); @@ -144,5 +146,10 @@ */ int rpc_getport_external(struct sockaddr_in *, __u32, __u32, int); +/* + * Ping function + */ +void rpc_ping(struct rpc_task *task); + #endif /* __KERNEL__ */ #endif /* 
_LINUX_SUNRPC_CLNT_H */ diff -u --recursive --new-file linux-2.4.14-ext3/include/linux/sunrpc/xprt.h linux-2.4.14-jukebox/include/linux/sunrpc/xprt.h --- linux-2.4.14-ext3/include/linux/sunrpc/xprt.h Wed Oct 24 06:59:54 2001 +++ linux-2.4.14-jukebox/include/linux/sunrpc/xprt.h Tue Nov 6 13:43:43 2001 @@ -39,12 +39,14 @@ * Come Linux 2.3, we'll handle fragments directly. */ #define RPC_MAXCONG 16 -#define RPC_MAXREQS (RPC_MAXCONG + 1) +#define RPC_MAXREQS (RPC_MAXCONG + 2) #define RPC_CWNDSCALE 256 #define RPC_MAXCWND (RPC_MAXCONG * RPC_CWNDSCALE) #define RPC_INITCWND RPC_CWNDSCALE #define RPCXPRT_CONGESTED(xprt) \ ((xprt)->cong >= (xprt)->cwnd) +#define RPCXPRT_SUPERCONGESTED(xprt) \ + ((xprt)->cwnd < 2*RPC_CWNDSCALE) /* Default timeout values */ #define RPC_MAX_UDP_TIMEOUT (60*HZ) @@ -135,6 +137,7 @@ struct rpc_wait_queue sending; /* requests waiting to send */ struct rpc_wait_queue pending; /* requests in flight */ struct rpc_wait_queue backlog; /* waiting for slot */ + struct rpc_wait_queue pingwait; /* waiting on ping() */ struct rpc_rqst * free; /* free slots */ struct rpc_rqst slot[RPC_MAXREQS]; unsigned long sockstate; /* Socket state */ @@ -179,10 +182,12 @@ unsigned long); int xprt_reserve(struct rpc_task *); +int xprt_ping_reserve(struct rpc_task *); void xprt_transmit(struct rpc_task *); void xprt_receive(struct rpc_task *); int xprt_adjust_timeout(struct rpc_timeout *); void xprt_release(struct rpc_task *); +void xprt_ping_release(struct rpc_task *); void xprt_reconnect(struct rpc_task *); int xprt_clear_backlog(struct rpc_xprt *); int xprt_tcp_pending(void); @@ -190,6 +195,8 @@ #define XPRT_WSPACE 0 #define XPRT_CONNECT 1 +#define XPRT_PING 2 +#define XPRT_NORESPOND 3 #define xprt_wspace(xp) (test_bit(XPRT_WSPACE, &(xp)->sockstate)) #define xprt_test_and_set_wspace(xp) (test_and_set_bit(XPRT_WSPACE, &(xp)->sockstate)) @@ -200,6 +207,32 @@ #define xprt_test_and_set_connected(xp) (test_and_set_bit(XPRT_CONNECT, &(xp)->sockstate)) #define 
xprt_clear_connected(xp) (clear_bit(XPRT_CONNECT, &(xp)->sockstate)) +static inline int xprt_pinging(struct rpc_xprt *xprt) +{ + return test_bit(XPRT_PING, &xprt->sockstate); +} +static inline int xprt_test_and_set_pinging(struct rpc_xprt *xprt) +{ + return test_and_set_bit(XPRT_PING, &xprt->sockstate); +} +static inline void xprt_clear_pinging(struct rpc_xprt *xprt) +{ + clear_bit(XPRT_PING, &xprt->sockstate); +} + +static inline int xprt_norespond(struct rpc_xprt *xprt) +{ + return test_bit(XPRT_NORESPOND, &xprt->sockstate); +} +static inline int xprt_test_and_set_norespond(struct rpc_xprt *xprt) +{ + return test_and_set_bit(XPRT_NORESPOND, &xprt->sockstate); +} +static inline void xprt_clear_norespond(struct rpc_xprt *xprt) +{ + clear_bit(XPRT_NORESPOND, &xprt->sockstate); +} + static inline void rpciod_tcp_dispatcher(void) { diff -u --recursive --new-file linux-2.4.14-ext3/net/sunrpc/Makefile linux-2.4.14-jukebox/net/sunrpc/Makefile --- linux-2.4.14-ext3/net/sunrpc/Makefile Fri Dec 29 23:07:24 2000 +++ linux-2.4.14-jukebox/net/sunrpc/Makefile Tue Nov 6 13:04:54 2001 @@ -14,7 +14,7 @@ obj-y := clnt.o xprt.o sched.o \ auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o \ - pmap_clnt.o xdr.o sunrpc_syms.o + ping.o pmap_clnt.o xdr.o sunrpc_syms.o obj-$(CONFIG_PROC_FS) += stats.o obj-$(CONFIG_SYSCTL) += sysctl.o diff -u --recursive --new-file linux-2.4.14-ext3/net/sunrpc/clnt.c linux-2.4.14-jukebox/net/sunrpc/clnt.c --- linux-2.4.14-ext3/net/sunrpc/clnt.c Fri Sep 21 20:24:50 2001 +++ linux-2.4.14-jukebox/net/sunrpc/clnt.c Tue Nov 6 13:04:54 2001 @@ -57,8 +57,8 @@ static void call_reconnect(struct rpc_task *task); static void child_reconnect(struct rpc_task *); static void child_reconnect_status(struct rpc_task *); -static u32 * call_header(struct rpc_task *task); -static u32 * call_verify(struct rpc_task *task); +static void call_ping(struct rpc_task *task); +static void call_pingresult(struct rpc_task *task); /* @@ -491,7 +491,7 @@ /* Encode header and 
provided arguments */ encode = rpcproc_encode(clnt, task->tk_msg.rpc_proc); - if (!(p = call_header(task))) { + if (!(p = rpc_call_header(task))) { printk(KERN_INFO "RPC: call_header failed, exit EIO\n"); rpc_exit(task, -EIO); } else @@ -618,11 +618,10 @@ task->tk_action = call_reconnect; break; } - /* - * Sleep and dream of an open connection - */ - task->tk_timeout = 5 * HZ; - rpc_sleep_on(&xprt->sending, task, NULL, NULL); + if (RPCXPRT_SUPERCONGESTED(clnt->cl_xprt)) { + task->tk_action = call_ping; + break; + } case -ENOMEM: case -EAGAIN: task->tk_action = call_transmit; @@ -646,6 +645,7 @@ { struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; + int major = 0; if (req) { struct rpc_timeout *to = &req->rq_timeout; @@ -666,17 +666,7 @@ rpc_exit(task, -EIO); return; } - if (clnt->cl_chatty && !(task->tk_flags & RPC_CALL_MAJORSEEN)) { - task->tk_flags |= RPC_CALL_MAJORSEEN; - if (req) - printk(KERN_NOTICE "%s: server %s not responding, still trying\n", - clnt->cl_protname, clnt->cl_server); -#ifdef RPC_DEBUG - else - printk(KERN_NOTICE "%s: task %d can't get a request slot\n", - clnt->cl_protname, task->tk_pid); -#endif - } + major = 1; if (clnt->cl_autobind) clnt->cl_port = 0; @@ -689,6 +679,8 @@ } else if (!xprt_connected(clnt->cl_xprt)) { task->tk_action = call_reconnect; clnt->cl_stats->rpcretrans++; + } else if (major && RPCXPRT_SUPERCONGESTED(clnt->cl_xprt)) { + task->tk_action = call_ping; } else { task->tk_action = call_transmit; clnt->cl_stats->rpcretrans++; @@ -710,12 +702,6 @@ dprintk("RPC: %4d call_decode (status %d)\n", task->tk_pid, task->tk_status); - if (clnt->cl_chatty && (task->tk_flags & RPC_CALL_MAJORSEEN)) { - printk(KERN_NOTICE "%s: server %s OK\n", - clnt->cl_protname, clnt->cl_server); - task->tk_flags &= ~RPC_CALL_MAJORSEEN; - } - if (task->tk_status < 12) { if (!clnt->cl_softrtry) { task->tk_action = call_transmit; @@ -729,7 +715,7 @@ } /* Verify the RPC header */ - if (!(p = call_verify(task))) + if (!(p = 
rpc_call_verify(task))) return; /* @@ -788,8 +774,8 @@ /* * Call header serialization */ -static u32 * -call_header(struct rpc_task *task) +u32 * +rpc_call_header(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; struct rpc_xprt *xprt = clnt->cl_xprt; @@ -809,10 +795,63 @@ } /* + * Ping a non-responding server + */ +static void +call_ping(struct rpc_task *task) +{ + task->tk_action = call_pingresult; + rpc_ping(task); +} + +/* + * Interpret the result from ping + */ +static void +call_pingresult(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_xprt *xprt = clnt->cl_xprt; + int status = task->tk_status; + + task->tk_status = 0; + if (status >= 0) { + task->tk_action = call_transmit; + return; + } + + switch(status) { + case -ECONNREFUSED: + case -ENOTCONN: + if (clnt->cl_autobind || !clnt->cl_port) { + clnt->cl_port = 0; + task->tk_action = call_bind; + break; + } + if (xprt->stream) { + task->tk_action = call_reconnect; + break; + } + case -ENOMEM: + case -ENOBUFS: + rpc_delay(task, HZ >> 4); + case -ETIMEDOUT: + task->tk_action = call_ping; + break; + default: + if (clnt->cl_chatty) + printk("%s: RPC call returned error %d\n", + clnt->cl_protname, -status); + rpc_exit(task,status); + return; + } +} + +/* * Reply header verification */ -static u32 * -call_verify(struct rpc_task *task) +u32 * +rpc_call_verify(struct rpc_task *task) { u32 *p = task->tk_rqstp->rq_rvec[0].iov_base, n; diff -u --recursive --new-file linux-2.4.14-ext3/net/sunrpc/ping.c linux-2.4.14-jukebox/net/sunrpc/ping.c --- linux-2.4.14-ext3/net/sunrpc/ping.c Thu Jan 1 01:00:00 1970 +++ linux-2.4.14-jukebox/net/sunrpc/ping.c Tue Nov 6 13:04:54 2001 @@ -0,0 +1,218 @@ +/* + * linux/net/sunrpc/ping.c + * + * Ping routing. 
+ * + * Copyright (C) 2000, Trond Myklebust + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define RPC_SLACK_SPACE 512 /* total overkill */ +#define RPC_PING_DELAY (15*HZ) + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_XPRT +#endif + +static void ping_call_reserve(struct rpc_task *); +static void ping_call_allocate(struct rpc_task *); +static void ping_call_encode(struct rpc_task *); +static void ping_call_transmit(struct rpc_task *); +static void ping_call_receive(struct rpc_task *); +static void ping_call_exit(struct rpc_task *); + + +static void +ping_call_reserve(struct rpc_task *task) +{ + dprintk("RPC: %4d, ping_call_reserve\n", task->tk_pid); + task->tk_status = 0; + task->tk_action = ping_call_allocate; + task->tk_timeout = task->tk_client->cl_timeout.to_resrvval; + xprt_ping_reserve(task); +} + +static void +ping_call_allocate(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_rqst *req = task->tk_rqstp; + unsigned int bufsiz; + + dprintk("RPC: %4d, ping_call_allocate (status %d)\n", + task->tk_pid, task->tk_status); + + task->tk_action = ping_call_exit; + if (task->tk_status < 0) + return; + + bufsiz = rpcproc_bufsiz(clnt, task->tk_msg.rpc_proc) + RPC_SLACK_SPACE; + if (!(task->tk_buffer = rpc_malloc(task, bufsiz << 1))) { + task->tk_status = -ENOMEM; + return; + } + req->rq_svec[0].iov_base = (void *)task->tk_buffer; + req->rq_svec[0].iov_len = bufsiz; + req->rq_slen = 0; + req->rq_snr = 1; + req->rq_rvec[0].iov_base = (void *)((char *)task->tk_buffer + bufsiz); + req->rq_rvec[0].iov_len = bufsiz; + req->rq_rlen = bufsiz; + req->rq_rnr = 1; + task->tk_action = ping_call_encode; +} + +static void +ping_call_encode(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + u32 *p; + + dprintk("RPC: %4d, ping_call_encode (status %d)\n", + task->tk_pid, task->tk_status); + + if (task->tk_status < 0) { + task->tk_action = ping_call_exit; + return; + 
} + p = rpc_call_header(task); + req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + task->tk_action = ping_call_transmit; +} + +static void +ping_call_transmit(struct rpc_task *task) +{ + dprintk("RPC: %4d, ping_call_transmit\n", task->tk_pid); + task->tk_action = ping_call_receive; + xprt_transmit(task); +} + +static void +ping_call_receive(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_xprt *xprt = clnt->cl_xprt; + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_timeout *to = &req->rq_timeout; + u32 *p; + + dprintk("RPC: %4d, ping_call_receive (status %d)\n", + task->tk_pid, task->tk_status); + + if (task->tk_status >= 0) + p = rpc_call_verify(task); + + task->tk_action = ping_call_exit; + + if (task->tk_status >= 0 || task->tk_status == -EACCES) { + task->tk_status = 0; + if (xprt_norespond(xprt)) { + if (clnt->cl_chatty) + printk(KERN_NOTICE "%s: server %s OK\n", + clnt->cl_protname, clnt->cl_server); + xprt_clear_norespond(xprt); + } + return; + } + + switch (task->tk_status) { + case -ENOTCONN: + break; + case -ENOMEM: + case -EAGAIN: + case -ECONNREFUSED: + case -ETIMEDOUT: + if (!xprt_adjust_timeout(to)) { + task->tk_status = 0; + task->tk_action = ping_call_transmit; + break; + } + default: + if (clnt->cl_softrtry) { + task->tk_status = -EIO; + break; + } + if (clnt->cl_chatty) { + if (!xprt_test_and_set_norespond(xprt)) { + printk(KERN_NOTICE + "%s: server %s is not responding\n", + clnt->cl_protname, clnt->cl_server); + } else { + printk(KERN_NOTICE + "%s: server %s still not responding\n", + clnt->cl_protname, clnt->cl_server); + } + } + rpc_delay(task, RPC_PING_DELAY); + } +} + +static void +ping_call_exit(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + dprintk("RPC: %4d, ping_call_exit (status %d)\n", + task->tk_pid, task->tk_status); + + task->tk_action = NULL; + xprt_ping_release(task); + + /* Sigh. 
rpc_delay() clears task->tk_status */ + if (task->tk_status == 0 && xprt_norespond(xprt)) + task->tk_status = -ETIMEDOUT; + + xprt_clear_pinging(xprt); + rpc_wake_up_status(&xprt->pingwait, task->tk_status); +} + +void +rpc_ping(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_xprt *xprt = clnt->cl_xprt; + struct rpc_task *child; + struct rpc_message msg = {0, NULL, NULL, NULL}; + + dprintk("RPC: %4d, rpc_ping\n", task->tk_pid); + + again: + if (xprt_test_and_set_pinging(xprt)) { + rpc_sleep_on(&xprt->pingwait, task, NULL, 0); + if (!xprt_pinging(xprt)) { + rpc_wake_up_task(task); + goto again; + } + dprintk("RPC: %4d, rpc_ping, waiting on completion\n", + task->tk_pid); + return; + } + + child = rpc_new_child(clnt, task); + if (!child) { + dprintk("RPC: %4d, rpc_ping, failed to create child process\n", + task->tk_pid); + xprt_clear_pinging(xprt); + rpc_wake_up_status(&xprt->pingwait, -ENOMEM); + task->tk_status = -ENOMEM; + return; + } + rpc_call_setup(child, &msg, 0); + child->tk_action = ping_call_reserve; + + dprintk("RPC: %4d, rpc_ping, running child process %4d\n", + task->tk_pid, child->tk_pid); + rpc_run_child(task, child, NULL); +} diff -u --recursive --new-file linux-2.4.14-ext3/net/sunrpc/sched.c linux-2.4.14-jukebox/net/sunrpc/sched.c --- linux-2.4.14-ext3/net/sunrpc/sched.c Thu Oct 11 17:12:52 2001 +++ linux-2.4.14-jukebox/net/sunrpc/sched.c Tue Nov 6 13:07:24 2001 @@ -1052,7 +1052,6 @@ int rounds = 0; MOD_INC_USE_COUNT; - lock_kernel(); /* * Let our maker know we're running ... 
*/ diff -u --recursive --new-file linux-2.4.14-ext3/net/sunrpc/xprt.c linux-2.4.14-jukebox/net/sunrpc/xprt.c --- linux-2.4.14-ext3/net/sunrpc/xprt.c Mon Oct 8 21:36:07 2001 +++ linux-2.4.14-jukebox/net/sunrpc/xprt.c Tue Nov 6 13:04:54 2001 @@ -85,7 +85,7 @@ */ static void xprt_request_init(struct rpc_task *, struct rpc_xprt *); static void do_xprt_transmit(struct rpc_task *); -static void xprt_reserve_status(struct rpc_task *task); +static void xprt_alloc_slot(struct rpc_xprt *, struct rpc_task *); static void xprt_disconnect(struct rpc_xprt *); static void xprt_reconn_status(struct rpc_task *task); static struct socket *xprt_create_socket(int, struct rpc_timeout *); @@ -1247,15 +1247,8 @@ rpc_sleep_on(&xprt->sending, task, NULL, NULL); } spin_unlock_bh(&xprt->sock_lock); - return; case -EAGAIN: - /* Keep holding the socket if it is blocked */ - rpc_delay(task, HZ>>4); return; - case -ECONNREFUSED: - case -ENOTCONN: - if (!xprt->stream) - return; default: if (xprt->stream) xprt_disconnect(xprt); @@ -1306,9 +1299,11 @@ dprintk("RPC: %4d xprt_reserve cong = %ld cwnd = %ld\n", task->tk_pid, xprt->cong, xprt->cwnd); spin_lock_bh(&xprt->xprt_lock); - xprt_reserve_status(task); + if (!RPCXPRT_CONGESTED(xprt)) + xprt_alloc_slot(xprt, task); if (task->tk_rqstp) { task->tk_timeout = 0; + xprt->cong += RPC_CWNDSCALE; } else if (!task->tk_timeout) { task->tk_status = -ENOBUFS; } else { @@ -1323,35 +1318,48 @@ } /* - * Reservation callback + * Reserve a ping RPC call slot. */ -static void -xprt_reserve_status(struct rpc_task *task) +int +xprt_ping_reserve(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; - struct rpc_rqst *req; - if (xprt->shutdown) { - task->tk_status = -EIO; - } else if (task->tk_status < 0) { - /* NOP */ - } else if (task->tk_rqstp) { - /* We've already been given a request slot: NOP */ - } else { - if (RPCXPRT_CONGESTED(xprt) || !(req = xprt->free)) - goto out_nofree; - /* OK: There's room for us. 
Grab a free slot and bump - * congestion value */ - xprt->free = req->rq_next; - req->rq_next = NULL; - xprt->cong += RPC_CWNDSCALE; - task->tk_rqstp = req; - xprt_request_init(task, xprt); + /* We already have an initialized request. */ + if (task->tk_rqstp) + return 0; - if (xprt->free) - xprt_clear_backlog(xprt); - } + dprintk("RPC: %4d xprt_ping_reserve cong = %ld cwnd = %ld\n", + task->tk_pid, xprt->cong, xprt->cwnd); + spin_lock_bh(&xprt->xprt_lock); + xprt_alloc_slot(xprt, task); + if (!task->tk_rqstp) + task->tk_status = -ENOBUFS; + spin_unlock_bh(&xprt->xprt_lock); + dprintk("RPC: %4d xprt_ping_reserve returns %d\n", + task->tk_pid, task->tk_status); + return task->tk_status; +} +/* + * Reserve a slot + */ +static void +xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) +{ + struct rpc_rqst *req; + + if (!(req = xprt->free)) + goto out_nofree; + /* OK: There's room for us. Grab a free slot and bump + * congestion value */ + xprt->free = req->rq_next; + req->rq_next = NULL; + task->tk_rqstp = req; + xprt_request_init(task, xprt); + + if (xprt->free) + xprt_clear_backlog(xprt); return; out_nofree: @@ -1383,8 +1391,8 @@ /* * Release an RPC call slot */ -void -xprt_release(struct rpc_task *task) +static void +__xprt_release(struct rpc_task *task, int congvalue) { struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req; @@ -1405,13 +1413,26 @@ req->rq_next = xprt->free; xprt->free = req; - /* Decrease congestion value. */ - xprt->cong -= RPC_CWNDSCALE; - - xprt_clear_backlog(xprt); + if (congvalue) { + /* Decrease congestion value. 
*/ + xprt->cong -= congvalue; + xprt_clear_backlog(xprt); + } spin_unlock_bh(&xprt->xprt_lock); } +void +xprt_release(struct rpc_task *task) +{ + __xprt_release(task, RPC_CWNDSCALE); +} + +void +xprt_ping_release(struct rpc_task *task) +{ + __xprt_release(task, 0); +} + /* * Set default timeout parameters */ @@ -1481,6 +1502,7 @@ xprt->pending = RPC_INIT_WAITQ("xprt_pending"); xprt->sending = RPC_INIT_WAITQ("xprt_sending"); xprt->backlog = RPC_INIT_WAITQ("xprt_backlog"); + xprt->pingwait= RPC_INIT_WAITQ("xprt_pingwait"); /* initialize free list */ for (i = 0, req = xprt->slot; i < RPC_MAXREQS-1; i++, req++) @@ -1616,6 +1638,7 @@ rpc_wake_up(&xprt->sending); rpc_wake_up(&xprt->pending); rpc_wake_up(&xprt->backlog); + rpc_wake_up(&xprt->pingwait); if (waitqueue_active(&xprt->cong_wait)) wake_up(&xprt->cong_wait); }