diff -u --recursive --new-file linux-2.4.22-pre2/Documentation/Configure.help linux-2.4.22-22-soft2/Documentation/Configure.help --- linux-2.4.22-pre2/Documentation/Configure.help 2003-06-26 23:18:55.000000000 +0200 +++ linux-2.4.22-22-soft2/Documentation/Configure.help 2003-06-27 00:50:12.000000000 +0200 @@ -15925,6 +15925,30 @@ If unsure, say N. +Allow direct I/O on files in NFS +CONFIG_NFS_DIRECTIO + There are important applications whose performance or correctness + depends on uncached access to file data. Database clusters (multiple + copies of the same instance running on separate hosts) implement their + own cache coherency protocol that subsumes the NFS cache protocols. + Applications that process datasets considerably larger than the client's + memory do not always benefit from a local cache. A streaming video + server, for instance, has no need to cache the contents of a file. + + This option enables applications to perform direct I/O on files in NFS + file systems using the O_DIRECT open() flag. When O_DIRECT is set for + files, their data is not cached in the system's page cache. Direct + read and write operations are aligned to block boundaries. Data is + moved to and from user-level application buffers directly. + + Unless your program is designed to use O_DIRECT properly, you are much + better off allowing the NFS client to manage caching for you. Misusing + O_DIRECT can cause poor server performance or network storms. This + kernel build option defaults OFF to avoid exposing system administrators + unwittingly to a potentially hazardous feature. + + If unsure, say N. 
+ Root file system on NFS CONFIG_ROOT_NFS If you want your Linux box to mount its whole root file system (the diff -u --recursive --new-file linux-2.4.22-pre2/fs/block_dev.c linux-2.4.22-22-soft2/fs/block_dev.c --- linux-2.4.22-pre2/fs/block_dev.c 2003-05-28 01:36:30.000000000 +0200 +++ linux-2.4.22-22-soft2/fs/block_dev.c 2003-06-27 00:50:12.000000000 +0200 @@ -131,8 +131,9 @@ return 0; } -static int blkdev_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) +static int blkdev_direct_IO(int rw, struct file * filp, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) { + struct inode * inode = filp->f_dentry->d_inode->i_mapping->host; return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, blkdev_get_block); } diff -u --recursive --new-file linux-2.4.22-pre2/fs/Config.in linux-2.4.22-22-soft2/fs/Config.in --- linux-2.4.22-pre2/fs/Config.in 2002-08-14 15:18:24.000000000 +0200 +++ linux-2.4.22-22-soft2/fs/Config.in 2003-06-27 00:50:12.000000000 +0200 @@ -102,6 +102,7 @@ dep_tristate 'InterMezzo file system support (replicating fs) (EXPERIMENTAL)' CONFIG_INTERMEZZO_FS $CONFIG_INET $CONFIG_EXPERIMENTAL dep_tristate 'NFS file system support' CONFIG_NFS_FS $CONFIG_INET dep_mbool ' Provide NFSv3 client support' CONFIG_NFS_V3 $CONFIG_NFS_FS + dep_mbool ' Allow direct I/O on NFS files (EXPERIMENTAL)' CONFIG_NFS_DIRECTIO $CONFIG_NFS_FS $CONFIG_EXPERIMENTAL dep_bool ' Root file system on NFS' CONFIG_ROOT_NFS $CONFIG_NFS_FS $CONFIG_IP_PNP dep_tristate 'NFS server support' CONFIG_NFSD $CONFIG_INET diff -u --recursive --new-file linux-2.4.22-pre2/fs/ext2/inode.c linux-2.4.22-22-soft2/fs/ext2/inode.c --- linux-2.4.22-pre2/fs/ext2/inode.c 2003-03-13 01:37:19.000000000 +0100 +++ linux-2.4.22-22-soft2/fs/ext2/inode.c 2003-06-27 00:50:12.000000000 +0200 @@ -592,8 +592,9 @@ { return generic_block_bmap(mapping,block,ext2_get_block); } -static int ext2_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned 
long blocknr, int blocksize) +static int ext2_direct_IO(int rw, struct file * filp, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) { + struct inode * inode = filp->f_dentry->d_inode->i_mapping->host; return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, ext2_get_block); } struct address_space_operations ext2_aops = { diff -u --recursive --new-file linux-2.4.22-pre2/fs/lockd/clntproc.c linux-2.4.22-22-soft2/fs/lockd/clntproc.c --- linux-2.4.22-pre2/fs/lockd/clntproc.c 2002-02-05 08:52:37.000000000 +0100 +++ linux-2.4.22-22-soft2/fs/lockd/clntproc.c 2003-06-27 00:46:25.000000000 +0200 @@ -460,7 +460,7 @@ } if (status < 0) return status; - } while (resp->status == NLM_LCK_BLOCKED); + } while (resp->status == NLM_LCK_BLOCKED && req->a_args.block); if (resp->status == NLM_LCK_GRANTED) { fl->fl_u.nfs_fl.state = host->h_state; diff -u --recursive --new-file linux-2.4.22-pre2/fs/lockd/host.c linux-2.4.22-22-soft2/fs/lockd/host.c --- linux-2.4.22-pre2/fs/lockd/host.c 2002-02-05 08:49:27.000000000 +0100 +++ linux-2.4.22-22-soft2/fs/lockd/host.c 2003-06-27 00:48:37.000000000 +0200 @@ -187,15 +187,7 @@ host->h_nextrebind - jiffies); } } else { - uid_t saved_fsuid = current->fsuid; - kernel_cap_t saved_cap = current->cap_effective; - - /* Create RPC socket as root user so we get a priv port */ - current->fsuid = 0; - cap_raise (current->cap_effective, CAP_NET_BIND_SERVICE); xprt = xprt_create_proto(host->h_proto, &host->h_addr, NULL); - current->fsuid = saved_fsuid; - current->cap_effective = saved_cap; if (xprt == NULL) goto forgetit; @@ -209,6 +201,7 @@ } clnt->cl_autobind = 1; /* turn on pmap queries */ xprt->nocong = 1; /* No congestion control for NLM */ + xprt->resvport = 1; /* NLM requires a reserved port */ host->h_rpcclnt = clnt; } diff -u --recursive --new-file linux-2.4.22-pre2/fs/lockd/mon.c linux-2.4.22-22-soft2/fs/lockd/mon.c --- linux-2.4.22-pre2/fs/lockd/mon.c 2002-02-05 08:49:27.000000000 +0100 +++ linux-2.4.22-22-soft2/fs/lockd/mon.c 
2003-06-27 00:48:37.000000000 +0200 @@ -122,6 +122,7 @@ clnt->cl_softrtry = 1; clnt->cl_chatty = 1; clnt->cl_oneshot = 1; + xprt->resvport = 1; /* NSM requires a reserved port */ out: return clnt; diff -u --recursive --new-file linux-2.4.22-pre2/fs/lockd/svc4proc.c linux-2.4.22-22-soft2/fs/lockd/svc4proc.c --- linux-2.4.22-pre2/fs/lockd/svc4proc.c 2002-08-11 13:55:07.000000000 +0200 +++ linux-2.4.22-22-soft2/fs/lockd/svc4proc.c 2003-06-27 00:46:38.000000000 +0200 @@ -462,6 +462,24 @@ } /* + * client sent a GRANTED_RES, let's remove the associated block + */ +static int +nlm4svc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp, + void *resp) +{ + if (!nlmsvc_ops) + return rpc_success; + + dprintk("lockd: GRANTED_RES called\n"); + + nlmsvc_grant_reply(rqstp, &argp->cookie, argp->status); + return rpc_success; +} + + + +/* * This is the generic lockd callback for async RPC calls */ static u32 @@ -524,7 +542,6 @@ #define nlm4svc_proc_lock_res nlm4svc_proc_null #define nlm4svc_proc_cancel_res nlm4svc_proc_null #define nlm4svc_proc_unlock_res nlm4svc_proc_null -#define nlm4svc_proc_granted_res nlm4svc_proc_null struct nlm_void { int dummy; }; @@ -559,7 +576,7 @@ PROC(lock_res, lockres, norep, res, void, 1), PROC(cancel_res, cancelres, norep, res, void, 1), PROC(unlock_res, unlockres, norep, res, void, 1), - PROC(granted_res, grantedres, norep, res, void, 1), + PROC(granted_res, res, norep, res, void, 1), /* statd callback */ PROC(sm_notify, reboot, void, reboot, void, 1), PROC(none, void, void, void, void, 0), diff -u --recursive --new-file linux-2.4.22-pre2/fs/lockd/svclock.c linux-2.4.22-22-soft2/fs/lockd/svclock.c --- linux-2.4.22-pre2/fs/lockd/svclock.c 2002-02-05 08:52:37.000000000 +0100 +++ linux-2.4.22-22-soft2/fs/lockd/svclock.c 2003-06-27 00:46:55.000000000 +0200 @@ -64,7 +64,7 @@ if (when != NLM_NEVER) { if ((when += jiffies) == NLM_NEVER) when ++; - while ((b = *bp) && time_before_eq(b->b_when,when)) + while ((b = *bp) && 
time_before_eq(b->b_when,when) && b->b_when != NLM_NEVER) bp = &b->b_next; } else while ((b = *bp)) @@ -143,14 +143,15 @@ * Find a block with a given NLM cookie. */ static inline struct nlm_block * -nlmsvc_find_block(struct nlm_cookie *cookie) +nlmsvc_find_block(struct nlm_cookie *cookie, struct sockaddr_in *sin) { struct nlm_block *block; for (block = nlm_blocked; block; block = block->b_next) { dprintk("cookie: head of blocked queue %p, block %p\n", nlm_blocked, block); - if (nlm_cookie_match(&block->b_call.a_args.cookie,cookie)) + if (nlm_cookie_match(&block->b_call.a_args.cookie,cookie) + && nlm_cmp_addr(sin, &block->b_host->h_addr)) break; } @@ -572,12 +573,16 @@ struct nlm_rqst *call = (struct nlm_rqst *) task->tk_calldata; struct nlm_block *block; unsigned long timeout; + struct sockaddr_in *peer_addr = RPC_PEERADDR(task->tk_client); dprintk("lockd: GRANT_MSG RPC callback\n"); - dprintk("callback: looking for cookie %x \n", - *(unsigned int *)(call->a_args.cookie.data)); - if (!(block = nlmsvc_find_block(&call->a_args.cookie))) { - dprintk("lockd: no block for cookie %x\n", *(u32 *)(call->a_args.cookie.data)); + dprintk("callback: looking for cookie %x, host (%08x)\n", + *(unsigned int *)(call->a_args.cookie.data), + ntohl(peer_addr->sin_addr.s_addr)); + if (!(block = nlmsvc_find_block(&call->a_args.cookie, peer_addr))) { + dprintk("lockd: no block for cookie %x, host (%08x)\n", + *(u32 *)(call->a_args.cookie.data), + ntohl(peer_addr->sin_addr.s_addr)); return; } @@ -606,18 +611,21 @@ * block. 
*/ void -nlmsvc_grant_reply(struct nlm_cookie *cookie, u32 status) +nlmsvc_grant_reply(struct svc_rqst *rqstp, struct nlm_cookie *cookie, u32 status) { struct nlm_block *block; struct nlm_file *file; - if (!(block = nlmsvc_find_block(cookie))) + dprintk("grant_reply: looking for cookie %x, host (%08x), s=%d \n", + *(unsigned int *)(cookie->data), + ntohl(rqstp->rq_addr.sin_addr.s_addr), status); + if (!(block = nlmsvc_find_block(cookie, &rqstp->rq_addr))) return; file = block->b_file; file->f_count++; down(&file->f_sema); - if ((block = nlmsvc_find_block(cookie)) != NULL) { + if ((block = nlmsvc_find_block(cookie,&rqstp->rq_addr)) != NULL) { if (status == NLM_LCK_DENIED_GRACE_PERIOD) { /* Try again in a couple of seconds */ nlmsvc_insert_block(block, 10 * HZ); diff -u --recursive --new-file linux-2.4.22-pre2/fs/lockd/svcproc.c linux-2.4.22-22-soft2/fs/lockd/svcproc.c --- linux-2.4.22-pre2/fs/lockd/svcproc.c 2002-08-11 13:55:07.000000000 +0200 +++ linux-2.4.22-22-soft2/fs/lockd/svcproc.c 2003-06-27 00:46:38.000000000 +0200 @@ -490,6 +490,22 @@ } /* + * client sent a GRANTED_RES, let's remove the associated block + */ +static int +nlmsvc_proc_granted_res(struct svc_rqst *rqstp, struct nlm_res *argp, + void *resp) +{ + if (!nlmsvc_ops) + return rpc_success; + + dprintk("lockd: GRANTED_RES called\n"); + + nlmsvc_grant_reply(rqstp, &argp->cookie, argp->status); + return rpc_success; +} + +/* * This is the generic lockd callback for async RPC calls */ static u32 @@ -552,7 +568,6 @@ #define nlmsvc_proc_lock_res nlmsvc_proc_null #define nlmsvc_proc_cancel_res nlmsvc_proc_null #define nlmsvc_proc_unlock_res nlmsvc_proc_null -#define nlmsvc_proc_granted_res nlmsvc_proc_null struct nlm_void { int dummy; }; @@ -589,7 +604,7 @@ PROC(lock_res, lockres, norep, res, void, 1), PROC(cancel_res, cancelres, norep, res, void, 1), PROC(unlock_res, unlockres, norep, res, void, 1), - PROC(granted_res, grantedres, norep, res, void, 1), + PROC(granted_res, res, norep, res, void, 1), /* 
statd callback */ PROC(sm_notify, reboot, void, reboot, void, 1), PROC(none, void, void, void, void, 1), diff -u --recursive --new-file linux-2.4.22-pre2/fs/nfs/dir.c linux-2.4.22-22-soft2/fs/nfs/dir.c --- linux-2.4.22-pre2/fs/nfs/dir.c 2002-10-15 06:59:27.000000000 +0200 +++ linux-2.4.22-22-soft2/fs/nfs/dir.c 2003-06-30 16:52:52.000000000 +0200 @@ -34,8 +34,11 @@ #define NFS_PARANOIA 1 /* #define NFS_DEBUG_VERBOSE 1 */ +static loff_t nfs_dir_llseek(struct file *, loff_t, int); static int nfs_readdir(struct file *, void *, filldir_t); static struct dentry *nfs_lookup(struct inode *, struct dentry *); +static int nfs_cached_lookup(struct inode *, struct dentry *, + struct nfs_fh *, struct nfs_fattr *); static int nfs_create(struct inode *, struct dentry *, int); static int nfs_mkdir(struct inode *, struct dentry *, int); static int nfs_rmdir(struct inode *, struct dentry *); @@ -48,6 +51,7 @@ static int nfs_fsync_dir(struct file *, struct dentry *, int); struct file_operations nfs_dir_operations = { + llseek: nfs_dir_llseek, read: generic_read_dir, readdir: nfs_readdir, open: nfs_open, @@ -70,6 +74,25 @@ setattr: nfs_notify_change, }; +static loff_t nfs_dir_llseek(struct file *file, loff_t offset, int origin) +{ + switch (origin) { + case 1: + if (offset == 0) { + offset = file->f_pos; + break; + } + case 2: + return -EINVAL; + } + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_reada = 0; + file->f_version = ++event; + } + return (offset <= 0) ? 
0 : offset; +} + typedef u32 * (*decode_dirent_t)(u32 *, struct nfs_entry *, int); typedef struct { struct file *file; @@ -109,13 +132,15 @@ error = NFS_PROTO(inode)->readdir(inode, cred, desc->entry->cookie, page, NFS_SERVER(inode)->dtsize, desc->plus); /* We requested READDIRPLUS, but the server doesn't grok it */ - if (desc->plus && error == -ENOTSUPP) { - NFS_FLAGS(inode) &= ~NFS_INO_ADVISE_RDPLUS; - desc->plus = 0; - goto again; - } - if (error < 0) + if (error < 0) { + if (error == -ENOTSUPP && desc->plus) { + NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS; + NFS_FLAGS(inode) &= ~NFS_INO_ADVISE_RDPLUS; + desc->plus = 0; + goto again; + } goto error; + } SetPageUptodate(page); /* Ensure consistent page alignment of the data. * Note: assumes we have exclusive access to this mapping either @@ -194,8 +219,7 @@ dfprintk(VFS, "NFS: find_dirent_page() searching directory page %ld\n", desc->page_index); - desc->plus = NFS_USE_READDIRPLUS(inode); - page = read_cache_page(&inode->i_data, desc->page_index, + page = read_cache_page(inode->i_mapping, desc->page_index, (filler_t *)nfs_readdir_filler, desc); if (IS_ERR(page)) { status = PTR_ERR(page); @@ -246,6 +270,24 @@ return res; } +static unsigned int nfs_type2dtype[] = { + DT_UNKNOWN, + DT_REG, + DT_DIR, + DT_BLK, + DT_CHR, + DT_LNK, + DT_SOCK, + DT_UNKNOWN, + DT_FIFO +}; + +static inline +unsigned int nfs_type_to_d_type(enum nfs_ftype type) +{ + return nfs_type2dtype[type]; +} + /* * Once we've found the start of the dirent within a page: fill 'er up... 
*/ @@ -262,11 +304,17 @@ dfprintk(VFS, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n", (long long)desc->target); for(;;) { + unsigned d_type = DT_UNKNOWN; /* Note: entry->prev_cookie contains the cookie for * retrieving the current dirent on the server */ fileid = nfs_fileid_to_ino_t(entry->ino); + + /* Use readdirplus info */ + if (desc->plus && (entry->fattr->valid & NFS_ATTR_FATTR)) + d_type = nfs_type_to_d_type(entry->fattr->type); + res = filldir(dirent, entry->name, entry->len, - entry->prev_cookie, fileid, DT_UNKNOWN); + entry->prev_cookie, fileid, d_type); if (res < 0) break; file->f_pos = desc->target = entry->cookie; @@ -333,7 +381,8 @@ /* Reset read descriptor so it searches the page cache from * the start upon the next call to readdir_search_pagecache() */ desc->page_index = 0; - memset(desc->entry, 0, sizeof(*desc->entry)); + desc->entry->cookie = desc->entry->prev_cookie = 0; + desc->entry->eof = 0; out: dfprintk(VFS, "NFS: uncached_readdir() returns %d\n", status); return status; @@ -352,9 +401,11 @@ nfs_readdir_descriptor_t my_desc, *desc = &my_desc; struct nfs_entry my_entry; + struct nfs_fh fh; + struct nfs_fattr fattr; long res; - res = nfs_revalidate(dentry); + res = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (res < 0) return res; @@ -365,12 +416,16 @@ * itself. 
*/ memset(desc, 0, sizeof(*desc)); - memset(&my_entry, 0, sizeof(my_entry)); - desc->file = filp; desc->target = filp->f_pos; - desc->entry = &my_entry; desc->decode = NFS_PROTO(inode)->decode_dirent; + desc->plus = NFS_USE_READDIRPLUS(inode); + + my_entry.cookie = my_entry.prev_cookie = 0; + my_entry.eof = 0; + my_entry.fh = &fh; + my_entry.fattr = &fattr; + desc->entry = &my_entry; while(!desc->entry->eof) { res = readdir_search_pagecache(desc); @@ -434,16 +489,9 @@ } static inline -int nfs_lookup_verify_inode(struct inode *inode, int flags) +int nfs_lookup_verify_inode(struct inode *inode) { - struct nfs_server *server = NFS_SERVER(inode); - /* - * If we're interested in close-to-open cache consistency, - * then we revalidate the inode upon lookup. - */ - if (!(server->flags & NFS_MOUNT_NOCTO) && !(flags & LOOKUP_CONTINUE)) - NFS_CACHEINV(inode); - return nfs_revalidate_inode(server, inode); + return nfs_revalidate_inode(NFS_SERVER(inode), inode); } /* @@ -497,11 +545,20 @@ /* Force a full look up iff the parent directory has changed */ if (nfs_check_verifier(dir, dentry)) { - if (nfs_lookup_verify_inode(inode, flags)) - goto out_bad; + if (nfs_lookup_verify_inode(inode)) + goto out_zap_parent; goto out_valid; } + error = nfs_cached_lookup(dir, dentry, &fhandle, &fattr); + if (!error) { + if (memcmp(NFS_FH(inode), &fhandle, sizeof(struct nfs_fh))!= 0) + goto out_bad; + if (nfs_lookup_verify_inode(inode)) + goto out_zap_parent; + goto out_valid_renew; + } + if (NFS_STALE(inode)) goto out_bad; @@ -513,10 +570,13 @@ if ((error = nfs_refresh_inode(inode, &fattr)) != 0) goto out_bad; + out_valid_renew: nfs_renew_times(dentry); out_valid: unlock_kernel(); return 1; +out_zap_parent: + nfs_zap_caches(dir); out_bad: NFS_CACHEINV(dir); if (inode && S_ISDIR(inode->i_mode)) { @@ -588,6 +648,18 @@ error = -ENOMEM; dentry->d_op = &nfs_dentry_operations; + error = nfs_cached_lookup(dir, dentry, &fhandle, &fattr); + if (!error) { + error = -EACCES; + inode = nfs_fhget(dentry, 
&fhandle, &fattr); + if (inode) { + d_add(dentry, inode); + nfs_renew_times(dentry); + error = 0; + } + goto out; + } + error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, &fhandle, &fattr); inode = NULL; if (error == -ENOENT) @@ -606,6 +678,79 @@ return ERR_PTR(error); } +static inline +int find_dirent_name(nfs_readdir_descriptor_t *desc, struct page *page, struct dentry *dentry) +{ + struct nfs_entry *entry = desc->entry; + int status; + + while((status = dir_decode(desc)) == 0) { + if (entry->len != dentry->d_name.len) + continue; + if (memcmp(entry->name, dentry->d_name.name, entry->len)) + continue; + if (!(entry->fattr->valid & NFS_ATTR_FATTR)) + continue; + break; + } + return status; +} + +/* + * Use the cached Readdirplus results in order to avoid a LOOKUP call + * whenever we believe that the parent directory has not changed. + * + * We assume that any file creation/rename changes the directory mtime. + * As this results in a page cache invalidation whenever it occurs, + * we don't require any other tests for cache coherency. 
+ */ +static +int nfs_cached_lookup(struct inode *dir, struct dentry *dentry, + struct nfs_fh *fh, struct nfs_fattr *fattr) +{ + nfs_readdir_descriptor_t desc; + struct nfs_server *server; + struct nfs_entry entry; + struct page *page; + unsigned long timestamp = NFS_MTIME_UPDATE(dir); + int res; + + if (!NFS_USE_READDIRPLUS(dir)) + return -ENOENT; + server = NFS_SERVER(dir); + if (server->flags & NFS_MOUNT_NOAC) + return -ENOENT; + nfs_revalidate_inode(server, dir); + + entry.fh = fh; + entry.fattr = fattr; + + desc.decode = NFS_PROTO(dir)->decode_dirent; + desc.entry = &entry; + desc.page_index = 0; + desc.plus = 1; + + for(;(page = find_get_page(dir->i_mapping, desc.page_index)); desc.page_index++) { + + res = -EIO; + if (Page_Uptodate(page)) { + desc.ptr = kmap(page); + res = find_dirent_name(&desc, page, dentry); + kunmap(page); + } + page_cache_release(page); + + if (res == 0) + goto out_found; + if (res != -EAGAIN) + break; + } + return -ENOENT; + out_found: + fattr->timestamp = timestamp; + return 0; +} + /* * Code common to create, mkdir, and mknod. 
*/ @@ -994,7 +1139,7 @@ struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct dentry *dentry = NULL, *rehash = NULL; - int error = -EBUSY; + int error; /* * To prevent any new references to the target during the rename, @@ -1020,6 +1165,12 @@ */ if (!new_inode) goto go_ahead; + /* If target is a hard link to the source, then noop */ + error = 0; + if (NFS_FILEID(new_inode) == NFS_FILEID(old_inode)) + goto out; + + error = -EBUSY; if (S_ISDIR(new_inode->i_mode)) goto out; else if (atomic_read(&new_dentry->d_count) > 1) { @@ -1082,34 +1233,62 @@ int nfs_permission(struct inode *inode, int mask) { - int error = vfs_permission(inode, mask); - - if (!NFS_PROTO(inode)->access) - goto out; - - if (error == -EROFS) - goto out; - - /* - * Trust UNIX mode bits except: - * - * 1) When override capabilities may have been invoked - * 2) When root squashing may be involved - * 3) When ACLs may overturn a negative answer */ - if (!capable(CAP_DAC_OVERRIDE) && !capable(CAP_DAC_READ_SEARCH) - && (current->fsuid != 0) && (current->fsgid != 0) - && error != -EACCES) - goto out; + struct nfs_access_cache *cache = &NFS_I(inode)->cache_access; + struct rpc_cred *cred; + int mode = inode->i_mode; + int error; - error = NFS_PROTO(inode)->access(inode, mask, 0); + if (mask & MAY_WRITE) { + /* + * + * Nobody gets write access to a read-only fs. + * + */ + if (IS_RDONLY(inode) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + return -EROFS; - if (error == -EACCES && NFS_CLIENT(inode)->cl_droppriv && - current->uid != 0 && current->gid != 0 && - (current->fsuid != current->uid || current->fsgid != current->gid)) - error = NFS_PROTO(inode)->access(inode, mask, 1); + /* + * + * Nobody gets write access to an immutable file. 
+ * + */ + if (IS_IMMUTABLE(inode)) + return -EACCES; + } - out: - return error; + if (!NFS_PROTO(inode)->access) + goto out_notsup; + cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0); + if (cache->cred == cred + && time_before(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode))) { + if (!cache->err) { + /* Is the mask a subset of an accepted mask? */ + if ((cache->mask & mask) == mask) + goto out_cached; + } else { + /* ...or is it a superset of a rejected mask? */ + if ((cache->mask & mask) == cache->mask) + goto out_cached; + } + } + error = NFS_PROTO(inode)->access(inode, cred, mask); + if (!error || error == -EACCES) { + cache->jiffies = jiffies; + if (cache->cred) + put_rpccred(cache->cred); + cache->cred = cred; + cache->mask = mask; + cache->err = error; + return error; + } + put_rpccred(cred); +out_notsup: + nfs_revalidate_inode(NFS_SERVER(inode), inode); + return vfs_permission(inode, mask); +out_cached: + put_rpccred(cred); + return cache->err; } /* diff -u --recursive --new-file linux-2.4.22-pre2/fs/nfs/direct.c linux-2.4.22-22-soft2/fs/nfs/direct.c --- linux-2.4.22-pre2/fs/nfs/direct.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.4.22-22-soft2/fs/nfs/direct.c 2003-06-27 00:50:12.000000000 +0200 @@ -0,0 +1,382 @@ +/* + * linux/fs/nfs/direct.c + * + * High-performance direct I/O for the NFS client + * + * When an application requests uncached I/O, all read and write requests + * are made directly to the server; data stored or fetched via these + * requests is not cached in the Linux page cache. The client does not + * correct unaligned requests from applications. All requested bytes are + * held on permanent storage before a direct write system call returns to + * an application. Applications that manage their own data caching, such + * as databases, make very good use of direct I/O on local file systems. + * + * Solaris implements an uncached I/O facility called directio() that + * is used for backups and sequential I/O to very large files. 
Solaris + * also supports uncaching whole NFS partitions with "-o forcedirectio," + * an undocumented mount option. + * + * Note that I/O to read in executables (e.g. kernel_read) cannot use + * direct (kiobuf) reads because there is no vma backing the passed-in + * data buffer. + * + * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust. + * + * Initial implementation: 12/2001 by Chuck Lever + * + * TODO: + * + * 1. Use concurrent asynchronous network requests rather than + * serialized synchronous network requests for normal (non-sync) + * direct I/O. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define NFSDBG_FACILITY (NFSDBG_PAGECACHE | NFSDBG_VFS) +#define VERF_SIZE (2 * sizeof(__u32)) + +static inline int +nfs_direct_read_rpc(struct file *file, struct nfs_readargs *arg) +{ + int result; + struct inode * inode = file->f_dentry->d_inode; + struct nfs_fattr fattr; + struct rpc_message msg; + struct nfs_readres res = { &fattr, arg->count, 0 }; + +#ifdef CONFIG_NFS_V3 + msg.rpc_proc = (NFS_PROTO(inode)->version == 3) ? + NFS3PROC_READ : NFSPROC_READ; +#else + msg.rpc_proc = NFSPROC_READ; +#endif + msg.rpc_argp = arg; + msg.rpc_resp = &res; + + lock_kernel(); + msg.rpc_cred = nfs_file_cred(file); + fattr.valid = 0; + result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_refresh_inode(inode, &fattr); + unlock_kernel(); + + return result; +} + +static inline int +nfs_direct_write_rpc(struct file *file, struct nfs_writeargs *arg, + struct nfs_writeverf *verf) +{ + int result; + struct inode *inode = file->f_dentry->d_inode; + struct nfs_fattr fattr; + struct rpc_message msg; + struct nfs_writeres res = { &fattr, verf, 0 }; + +#ifdef CONFIG_NFS_V3 + msg.rpc_proc = (NFS_PROTO(inode)->version == 3) ? 
+ NFS3PROC_WRITE : NFSPROC_WRITE; +#else + msg.rpc_proc = NFSPROC_WRITE; +#endif + msg.rpc_argp = arg; + msg.rpc_resp = &res; + + lock_kernel(); + msg.rpc_cred = get_rpccred(nfs_file_cred(file)); + fattr.valid = 0; + result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_write_attributes(inode, &fattr); + put_rpccred(msg.rpc_cred); + unlock_kernel(); + +#ifdef CONFIG_NFS_V3 + if (NFS_PROTO(inode)->version == 3) { + if (result > 0) { + if ((arg->stable == NFS_FILE_SYNC) && + (verf->committed != NFS_FILE_SYNC)) { + printk(KERN_ERR + "%s: server didn't sync stable write request\n", + __FUNCTION__); + return -EIO; + } + + if (result != arg->count) { + printk(KERN_INFO + "%s: short write, count=%u, result=%d\n", + __FUNCTION__, arg->count, result); + } + } + return result; + } else { +#endif + verf->committed = NFS_FILE_SYNC; /* NFSv2 always syncs data */ + if (result == 0) + return arg->count; + return result; +#ifdef CONFIG_NFS_V3 + } +#endif +} + +#ifdef CONFIG_NFS_V3 +static inline int +nfs_direct_commit_rpc(struct inode *inode, loff_t offset, size_t count, + struct nfs_writeverf *verf) +{ + int result; + struct nfs_fattr fattr; + struct nfs_writeargs arg = { NFS_FH(inode), offset, count, 0, 0, + NULL }; + struct nfs_writeres res = { &fattr, verf, 0 }; + struct rpc_message msg = { NFS3PROC_COMMIT, &arg, &res, NULL }; + + fattr.valid = 0; + + lock_kernel(); + result = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + nfs_write_attributes(inode, &fattr); + unlock_kernel(); + + return result; +} +#else +static inline int +nfs_direct_commit_rpc(struct inode *inode, loff_t offset, size_t count, + struct nfs_writeverf *verf) +{ + return 0; +} +#endif + +/* + * Walk through the iobuf and create an iovec for each "rsize" bytes. 
+ */ +static int +nfs_direct_read(struct file *file, struct kiobuf *iobuf, loff_t offset, + size_t count) +{ + int curpage, total; + int result = 0; + struct inode *inode = file->f_dentry->d_inode; + int rsize = NFS_SERVER(inode)->rsize; + struct page *pages[NFS_READ_MAXIOV]; + struct nfs_readargs args = { NFS_FH(inode), offset, 0, iobuf->offset, + pages }; + + total = 0; + curpage = 0; + while (count) { + int len, request; + struct page **dest = pages; + + request = count; + if (count > rsize) + request = rsize; + args.count = request; + args.offset = offset; + args.pgbase = (iobuf->offset + total) & ~PAGE_MASK; + len = PAGE_SIZE - args.pgbase; + + do { + /* validate curpage before it is used to index maplist[] */ + struct page *page = (curpage < iobuf->nr_pages) ? iobuf->maplist[curpage] : NULL; + if (!page) { + result = -EFAULT; + goto out_err; + } + + *dest++ = page; + /* zero after the first iov */ + if (request < len) + break; + request -= len; + len = PAGE_SIZE; + curpage++; + } while (request != 0); + + result = nfs_direct_read_rpc(file, &args); + + if (result < 0) + break; + + total += result; + if (result < args.count) /* NFSv2ism */ + break; + count -= result; + offset += result; + }; +out_err: + if (!total) + return result; + return total; +} + +/* + * Walk through the iobuf and create an iovec for each "wsize" bytes. + * If only one network write is necessary, or if the O_SYNC flag or + * 'sync' mount option are present, or if this is a V2 inode, use + * FILE_SYNC. Otherwise, use UNSTABLE and finish with a COMMIT. + * + * The mechanics of this function are much the same as nfs_direct_read, + * with the added complexity of committing unstable writes. 
+ */ +static int +nfs_direct_write(struct file *file, struct kiobuf *iobuf, + loff_t offset, size_t count) +{ + int curpage, total, need_commit; + int result = 0; + loff_t save_offset = offset; + struct inode *inode = file->f_dentry->d_inode; + int wsize = NFS_SERVER(inode)->wsize; + struct nfs_writeverf first_verf, ret_verf; + struct page *pages[NFS_WRITE_MAXIOV]; + struct nfs_writeargs args = { NFS_FH(inode), 0, 0, NFS_FILE_SYNC, 0, + pages }; + +#ifdef CONFIG_NFS_V3 + if ((NFS_PROTO(inode)->version == 3) && (count > wsize) && + (!IS_SYNC(inode))) + args.stable = NFS_UNSTABLE; +#endif + +retry: + need_commit = 0; /* a FILE_SYNC retry must not inherit the UNSTABLE pass's commit state */ + total = 0; + curpage = 0; + while (count) { + int len, request; + struct page **dest = pages; + + request = count; + if (count > wsize) + request = wsize; + args.count = request; + args.offset = offset; + args.pgbase = (iobuf->offset + total) & ~PAGE_MASK; + len = PAGE_SIZE - args.pgbase; + + do { + /* validate curpage before it is used to index maplist[] */ + struct page *page = (curpage < iobuf->nr_pages) ? iobuf->maplist[curpage] : NULL; + if (!page) { + result = -EFAULT; + goto out_err; + } + + *dest++ = page; + /* zero after the first iov */ + if (request < len) + break; + request -= len; + len = PAGE_SIZE; + curpage++; + } while (request != 0); + + result = nfs_direct_write_rpc(file, &args, &ret_verf); + + if (result < 0) + break; + + if (!total) + memcpy(&first_verf.verifier, &ret_verf.verifier, + VERF_SIZE); + if (ret_verf.committed != NFS_FILE_SYNC) { + need_commit = 1; + if (memcmp(&first_verf.verifier, &ret_verf.verifier, + VERF_SIZE)) + goto print_retry; + } + + total += result; + count -= result; + offset += result; + }; + +out_err: + /* + * Commit data written so far, even in the event of an error + */ + if (need_commit) { + if (nfs_direct_commit_rpc(inode, save_offset, + iobuf->length - count, &ret_verf)) + goto print_retry; + if (memcmp(&first_verf.verifier, &ret_verf.verifier, + VERF_SIZE)) + goto print_retry; + } + + if (!total) + return result; + return total; + +print_retry: + printk(KERN_INFO "%s: detected 
server restart; retrying with FILE_SYNC\n", + __FUNCTION__); + args.stable = NFS_FILE_SYNC; + offset = save_offset; + count = iobuf->length; + goto retry; +} + +/* + * Read or write data, moving the data directly to/from the + * application's buffer without caching in the page cache. + * + * Rules for direct I/O + * + * 1. block size = 512 bytes or more + * 2. file byte offset is block aligned + * 3. byte count is a multiple of block size + * 4. user buffer is not aligned + * 5. user buffer is faulted in and pinned + * + * These are verified before we get here. + */ +int +nfs_direct_IO(int rw, struct file *file, struct kiobuf *iobuf, + unsigned long blocknr, int blocksize) +{ + int result = -EINVAL; + size_t count = iobuf->length; + struct dentry *dentry = file->f_dentry; + struct inode *inode = dentry->d_inode; + loff_t offset = (loff_t) blocknr << inode->i_blkbits; /* widen first: a 32-bit unsigned long shift wraps at 4GB */ + + switch (rw) { + case READ: + dfprintk(VFS, + "NFS: direct_IO(READ) (%s/%s) off/cnt(%Lu/%d)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name, offset, count); + + result = nfs_direct_read(file, iobuf, offset, count); + break; + case WRITE: + dfprintk(VFS, + "NFS: direct_IO(WRITE) (%s/%s) off/cnt(%Lu/%d)\n", + dentry->d_parent->d_name.name, + dentry->d_name.name, offset, count); + + result = nfs_direct_write(file, iobuf, offset, count); + break; + default: + break; + } + + dfprintk(VFS, "NFS: direct_IO result = %d\n", result); + return result; +} diff -u --recursive --new-file linux-2.4.22-pre2/fs/nfs/file.c linux-2.4.22-22-soft2/fs/nfs/file.c --- linux-2.4.22-pre2/fs/nfs/file.c 2002-12-12 11:23:09.000000000 +0100 +++ linux-2.4.22-22-soft2/fs/nfs/file.c 2003-06-27 00:50:12.000000000 +0200 @@ -16,6 +16,7 @@ * nfs regular file handling functions */ +#include #include #include #include @@ -200,6 +201,9 @@ sync_page: nfs_sync_page, writepage: nfs_writepage, prepare_write: nfs_prepare_write, +#ifdef CONFIG_NFS_DIRECTIO + direct_IO: nfs_direct_IO, +#endif commit_write: nfs_commit_write }; diff -u 
--recursive --new-file linux-2.4.22-pre2/fs/nfs/inode.c linux-2.4.22-22-soft2/fs/nfs/inode.c --- linux-2.4.22-pre2/fs/nfs/inode.c 2002-08-15 03:05:32.000000000 +0200 +++ linux-2.4.22-22-soft2/fs/nfs/inode.c 2003-06-27 00:50:51.000000000 +0200 @@ -146,10 +146,14 @@ static void nfs_clear_inode(struct inode *inode) { - struct rpc_cred *cred = NFS_I(inode)->mm_cred; + struct nfs_inode_info *nfsi = NFS_I(inode); + struct rpc_cred *cred = nfsi->mm_cred; if (cred) put_rpccred(cred); + cred = nfsi->cache_access.cred; + if (cred) + put_rpccred(cred); } void @@ -251,6 +255,72 @@ } /* + * Set up the NFS superblock private area using probed values. Returns 0 on success or the negative result of the fsinfo call. + */ +static int +nfs_setup_superblock(struct super_block *sb, struct nfs_fh *rootfh) +{ + struct nfs_server *server = &sb->u.nfs_sb.s_server; + struct nfs_fattr fattr; + struct nfs_fsinfo fsinfo = { &fattr, }; + struct nfs_pathconf pathinfo = { &fattr, }; /* fsinfo and pathinfo both point at the same scratch fattr */ + int maxlen, res; + + res = server->rpc_ops->fsinfo(server, rootfh, &fsinfo); + if (res < 0) + return res; + + /* Work out a lot of parameters; rsize/wsize are taken from the server's preferred sizes only while they are still zero */ + if (!server->rsize) + server->rsize = nfs_block_size(fsinfo.rtpref, NULL); + if (!server->wsize) + server->wsize = nfs_block_size(fsinfo.wtpref, NULL); + + /* NFSv3: we don't have bsize, but rather rtmult and wtmult... 
*/ + if (!fsinfo.wtmult) + fsinfo.wtmult = 512; /* default when the reply carries no write multiple */ + sb->s_blocksize = nfs_block_bits(fsinfo.wtmult, &sb->s_blocksize_bits); + + if (server->rsize > fsinfo.rtmax) + server->rsize = fsinfo.rtmax; + if (server->wsize > fsinfo.wtmax) + server->wsize = fsinfo.wtmax; + + server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (server->rpages > NFS_READ_MAXIOV) { + server->rpages = NFS_READ_MAXIOV; + server->rsize = server->rpages << PAGE_CACHE_SHIFT; + } + + server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (server->wpages > NFS_WRITE_MAXIOV) { + server->wpages = NFS_WRITE_MAXIOV; + server->wsize = server->wpages << PAGE_CACHE_SHIFT; + } + + server->dtsize = nfs_block_size(fsinfo.dtpref, NULL); + if (server->dtsize > PAGE_CACHE_SIZE) + server->dtsize = PAGE_CACHE_SIZE; + if (server->dtsize > server->rsize) + server->dtsize = server->rsize; + + maxlen = (server->rpc_ops->version == 2) ? NFS2_MAXNAMLEN : NFS3_MAXNAMLEN; + if (!server->namelen) { + res = server->rpc_ops->pathconf(server, rootfh, &pathinfo); /* failure is tolerated: namelen then falls back to maxlen below */ + if (!res) + server->namelen = pathinfo.name_max; + } + if (!server->namelen || server->namelen > maxlen) + server->namelen = maxlen; + + sb->s_maxbytes = fsinfo.maxfilesize; + if (sb->s_maxbytes > MAX_LFS_FILESIZE) + sb->s_maxbytes = MAX_LFS_FILESIZE; + + return 0; +} + +/* * The way this works is that the mount process passes a structure * in the data argument which contains the server's IP address * and the root file handle obtained from the server's mount @@ -268,8 +338,7 @@ unsigned int authflavor; struct sockaddr_in srvaddr; struct rpc_timeout timeparms; - struct nfs_fsinfo fsinfo; - int tcp, version, maxlen; + int tcp, version; memset(&sb->u.nfs_sb, 0, sizeof(sb->u.nfs_sb)); if (!data) @@ -298,11 +367,11 @@ sb->s_magic = NFS_SUPER_MAGIC; sb->s_op = &nfs_sops; - sb->s_blocksize_bits = 0; - sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits); server = &sb->u.nfs_sb.s_server; - server->rsize = 
nfs_block_size(data->rsize, NULL); - server->wsize = nfs_block_size(data->wsize, NULL); + if (data->rsize) + server->rsize = nfs_block_size(data->rsize, NULL); + if (data->wsize) + server->wsize = nfs_block_size(data->wsize, NULL); server->flags = data->flags & NFS_MOUNT_FLAGMASK; if (data->flags & NFS_MOUNT_NOAC) { @@ -326,12 +395,14 @@ INIT_LIST_HEAD(&server->lru_busy); nfsv3_try_again: + server->caps = 0; /* Check NFS protocol revision and initialize RPC op vector * and file handle pool. */ if (data->flags & NFS_MOUNT_VER3) { #ifdef CONFIG_NFS_V3 server->rpc_ops = &nfs_v3_clientops; version = 3; + server->caps |= NFS_CAP_READDIRPLUS; if (data->version < 4) { printk(KERN_NOTICE "NFS: NFSv3 not supported by mount program.\n"); goto out_unlock; @@ -409,63 +480,11 @@ sb->s_root->d_op = &nfs_dentry_operations; /* Get some general file system info */ - if (server->rpc_ops->statfs(server, root, &fsinfo) >= 0) { - if (server->namelen == 0) - server->namelen = fsinfo.namelen; - } else { + if (nfs_setup_superblock(sb, root) < 0) { printk(KERN_NOTICE "NFS: cannot retrieve file system info.\n"); goto out_no_root; } - /* Work out a lot of parameters */ - if (data->rsize == 0) - server->rsize = nfs_block_size(fsinfo.rtpref, NULL); - if (data->wsize == 0) - server->wsize = nfs_block_size(fsinfo.wtpref, NULL); - /* NFSv3: we don't have bsize, but rather rtmult and wtmult... */ - if (!fsinfo.bsize) - fsinfo.bsize = (fsinfo.rtmult>fsinfo.wtmult) ? 
fsinfo.rtmult : fsinfo.wtmult; - /* Also make sure we don't go below rsize/wsize since - * RPC calls are expensive */ - if (fsinfo.bsize < server->rsize) - fsinfo.bsize = server->rsize; - if (fsinfo.bsize < server->wsize) - fsinfo.bsize = server->wsize; - - if (data->bsize == 0) - sb->s_blocksize = nfs_block_bits(fsinfo.bsize, &sb->s_blocksize_bits); - if (server->rsize > fsinfo.rtmax) - server->rsize = fsinfo.rtmax; - if (server->wsize > fsinfo.wtmax) - server->wsize = fsinfo.wtmax; - - server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (server->rpages > NFS_READ_MAXIOV) { - server->rpages = NFS_READ_MAXIOV; - server->rsize = server->rpages << PAGE_CACHE_SHIFT; - } - - server->wpages = (server->wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (server->wpages > NFS_WRITE_MAXIOV) { - server->wpages = NFS_WRITE_MAXIOV; - server->wsize = server->wpages << PAGE_CACHE_SHIFT; - } - - server->dtsize = nfs_block_size(fsinfo.dtpref, NULL); - if (server->dtsize > PAGE_CACHE_SIZE) - server->dtsize = PAGE_CACHE_SIZE; - if (server->dtsize > server->rsize) - server->dtsize = server->rsize; - - maxlen = (version == 2) ? 
NFS2_MAXNAMLEN : NFS3_MAXNAMLEN; - - if (server->namelen == 0 || server->namelen > maxlen) - server->namelen = maxlen; - - sb->s_maxbytes = fsinfo.maxfilesize; - if (sb->s_maxbytes > MAX_LFS_FILESIZE) - sb->s_maxbytes = MAX_LFS_FILESIZE; - /* Fire up the writeback cache */ if (nfs_reqlist_alloc(server) < 0) { printk(KERN_NOTICE "NFS: cannot initialize writeback cache.\n"); @@ -526,7 +545,8 @@ struct nfs_server *server = &sb->u.nfs_sb.s_server; unsigned char blockbits; unsigned long blockres; - struct nfs_fsinfo res; + struct nfs_fattr attr; + struct nfs_fsstat res = { &attr, }; int error; error = server->rpc_ops->statfs(server, NFS_FH(sb->s_root->d_inode), &res); @@ -534,18 +554,15 @@ if (error < 0) goto out_err; - if (res.bsize == 0) - res.bsize = sb->s_blocksize; - buf->f_bsize = nfs_block_bits(res.bsize, &blockbits); + buf->f_bsize = sb->s_blocksize; + blockbits = sb->s_blocksize_bits; blockres = (1 << blockbits) - 1; buf->f_blocks = (res.tbytes + blockres) >> blockbits; buf->f_bfree = (res.fbytes + blockres) >> blockbits; buf->f_bavail = (res.abytes + blockres) >> blockbits; buf->f_files = res.tfiles; buf->f_ffree = res.afiles; - if (res.namelen == 0 || res.namelen > server->namelen) - res.namelen = server->namelen; - buf->f_namelen = res.namelen; + buf->f_namelen = server->namelen; return 0; out_err: printk("nfs_statfs: statfs error = %d\n", -error); @@ -623,36 +640,35 @@ nfs_zap_caches(inode); } +/* Don't use READDIRPLUS on directories that we believe are too large */ +#define NFS_LIMIT_READDIRPLUS (8*PAGE_SIZE) + /* * Fill in inode information from the fattr. */ static void nfs_fill_inode(struct inode *inode, struct nfs_fh *fh, struct nfs_fattr *fattr) { - /* - * Check whether the mode has been set, as we only want to - * do this once. (We don't allow inodes to change types.) + NFS_FILEID(inode) = fattr->fileid; + inode->i_mode = fattr->mode; + /* Why so? 
Because we want revalidate for devices/FIFOs, and + * that's precisely what we have in nfs_file_inode_operations. */ - if (inode->i_mode == 0) { - NFS_FILEID(inode) = fattr->fileid; - inode->i_mode = fattr->mode; - /* Why so? Because we want revalidate for devices/FIFOs, and - * that's precisely what we have in nfs_file_inode_operations. - */ - inode->i_op = &nfs_file_inode_operations; - if (S_ISREG(inode->i_mode)) { - inode->i_fop = &nfs_file_operations; - inode->i_data.a_ops = &nfs_file_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &nfs_dir_inode_operations; - inode->i_fop = &nfs_dir_operations; - } else if (S_ISLNK(inode->i_mode)) - inode->i_op = &nfs_symlink_inode_operations; - else - init_special_inode(inode, inode->i_mode, fattr->rdev); - memcpy(&inode->u.nfs_i.fh, fh, sizeof(inode->u.nfs_i.fh)); - } - nfs_refresh_inode(inode, fattr); + inode->i_op = &nfs_file_inode_operations; + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &nfs_file_operations; + inode->i_data.a_ops = &nfs_file_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &nfs_dir_inode_operations; + inode->i_fop = &nfs_dir_operations; + if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS) + && fattr->size <= NFS_LIMIT_READDIRPLUS) + NFS_FLAGS(inode) |= NFS_INO_ADVISE_RDPLUS; + } else if (S_ISLNK(inode->i_mode)) + inode->i_op = &nfs_symlink_inode_operations; + else + init_special_inode(inode, inode->i_mode, fattr->rdev); + memcpy(&inode->u.nfs_i.fh, fh, sizeof(inode->u.nfs_i.fh)); } struct nfs_find_desc { @@ -727,7 +743,14 @@ if (!(inode = iget4(sb, ino, nfs_find_actor, &desc))) goto out_no_inode; - nfs_fill_inode(inode, fh, fattr); + /* + * Check whether the mode has been set, as we only want to + * do this once. (We don't allow inodes to change types.) 
+ */ + if (inode->i_mode == 0) + nfs_fill_inode(inode, fh, fattr); + + nfs_refresh_inode(inode, fattr); dprintk("NFS: __nfs_fhget(%x/%Ld ct=%d)\n", inode->i_dev, (long long)NFS_FILEID(inode), atomic_read(&inode->i_count)); @@ -850,15 +873,23 @@ { struct rpc_auth *auth; struct rpc_cred *cred; + int err = 0; lock_kernel(); + /* Ensure that we revalidate the data cache */ + if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NOCTO)) { + err = __nfs_revalidate_inode(NFS_SERVER(inode),inode); + if (err) + goto out; + } auth = NFS_CLIENT(inode)->cl_auth; cred = rpcauth_lookupcred(auth, 0); filp->private_data = cred; if (filp->f_mode & FMODE_WRITE) nfs_set_mmcred(inode, cred); +out: unlock_kernel(); - return 0; + return err; } int nfs_release(struct inode *inode, struct file *filp) @@ -993,6 +1024,9 @@ goto out_err; } + /* Throw out obsolete READDIRPLUS attributes */ + if (time_before(fattr->timestamp, NFS_READTIME(inode))) + return 0; /* * Make sure the inode's type hasn't changed. */ @@ -1011,7 +1045,7 @@ /* * Update the read time so we don't revalidate too often. */ - NFS_READTIME(inode) = jiffies; + NFS_READTIME(inode) = fattr->timestamp; /* * Note: NFS_CACHE_ISIZE(inode) reflects the state of the cache. 
@@ -1060,7 +1094,8 @@ inode->i_atime = new_atime; if (NFS_CACHE_MTIME(inode) != new_mtime) { - NFS_MTIME_UPDATE(inode) = jiffies; + if (invalid) + NFS_MTIME_UPDATE(inode) = fattr->timestamp; NFS_CACHE_MTIME(inode) = new_mtime; inode->i_mtime = nfs_time_to_secs(new_mtime); } @@ -1068,6 +1103,16 @@ NFS_CACHE_ISIZE(inode) = new_size; inode->i_size = new_isize; + if (inode->i_mode != fattr->mode || + inode->i_uid != fattr->uid || + inode->i_gid != fattr->gid) { + struct rpc_cred **cred = &NFS_I(inode)->cache_access.cred; + if (*cred) { + put_rpccred(*cred); + *cred = NULL; + } + } + inode->i_mode = fattr->mode; inode->i_nlink = fattr->nlink; inode->i_uid = fattr->uid; diff -u --recursive --new-file linux-2.4.22-pre2/fs/nfs/Makefile linux-2.4.22-22-soft2/fs/nfs/Makefile --- linux-2.4.22-pre2/fs/nfs/Makefile 2002-02-05 08:55:11.000000000 +0100 +++ linux-2.4.22-22-soft2/fs/nfs/Makefile 2003-06-27 00:50:12.000000000 +0200 @@ -14,6 +14,7 @@ obj-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o obj-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o +obj-$(CONFIG_NFS_DIRECTIO) += direct.o obj-m := $(O_TARGET) diff -u --recursive --new-file linux-2.4.22-pre2/fs/nfs/nfs2xdr.c linux-2.4.22-22-soft2/fs/nfs/nfs2xdr.c --- linux-2.4.22-pre2/fs/nfs/nfs2xdr.c 2002-10-06 01:55:26.000000000 +0200 +++ linux-2.4.22-22-soft2/fs/nfs/nfs2xdr.c 2003-06-27 00:50:51.000000000 +0200 @@ -118,6 +118,7 @@ fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO; fattr->rdev = 0; } + fattr->timestamp = jiffies; return p; } @@ -369,7 +370,7 @@ count = count >> 2; p = xdr_encode_fhandle(p, args->fh); - *p++ = htonl(args->cookie); + *p++ = htonl(args->cookie & 0xFFFFFFFF); *p++ = htonl(count); /* see above */ req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); @@ -466,7 +467,7 @@ entry->name = (const char *) p; p += XDR_QUADLEN(entry->len); entry->prev_cookie = entry->cookie; - entry->cookie = ntohl(*p++); + entry->cookie = (s64)((off_t)ntohl(*p++)); entry->eof = !p[0] && p[1]; return p; @@ -595,36 +596,18 @@ * Decode STATFS 
reply */ static int -nfs_xdr_statfsres(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res) +nfs_xdr_statfsres(struct rpc_rqst *req, u32 *p, struct nfs2_statfs *res) { int status; - u32 xfer_size; if ((status = ntohl(*p++))) return -nfs_stat_to_errno(status); - /* For NFSv2, we more or less have to guess the preferred - * read/write/readdir sizes from the single 'transfer size' - * value. - */ - xfer_size = ntohl(*p++); /* tsize */ - res->rtmax = 8 * 1024; - res->rtpref = xfer_size; - res->rtmult = xfer_size; - res->wtmax = 8 * 1024; - res->wtpref = xfer_size; - res->wtmult = xfer_size; - res->dtpref = PAGE_CACHE_SIZE; - res->maxfilesize = 0x7FFFFFFF; /* just a guess */ + res->tsize = ntohl(*p++); res->bsize = ntohl(*p++); - - res->tbytes = ntohl(*p++) * res->bsize; - res->fbytes = ntohl(*p++) * res->bsize; - res->abytes = ntohl(*p++) * res->bsize; - res->tfiles = 0; - res->ffiles = 0; - res->afiles = 0; - res->namelen = 0; + res->blocks = ntohl(*p++); + res->bfree = ntohl(*p++); + res->bavail = ntohl(*p++); return 0; } diff -u --recursive --new-file linux-2.4.22-pre2/fs/nfs/nfs3proc.c linux-2.4.22-22-soft2/fs/nfs/nfs3proc.c --- linux-2.4.22-pre2/fs/nfs/nfs3proc.c 2002-08-14 14:59:37.000000000 +0200 +++ linux-2.4.22-22-soft2/fs/nfs/nfs3proc.c 2003-06-27 00:49:54.000000000 +0200 @@ -117,12 +117,13 @@ } static int -nfs3_proc_access(struct inode *inode, int mode, int ruid) +nfs3_proc_access(struct inode *inode, struct rpc_cred *cred, int mode) { struct nfs_fattr fattr; struct nfs3_accessargs arg = { NFS_FH(inode), 0 }; struct nfs3_accessres res = { &fattr, 0 }; - int status, flags; + struct rpc_message msg = { NFS3PROC_ACCESS, &arg, &res, cred }; + int status; dprintk("NFS call access\n"); fattr.valid = 0; @@ -140,8 +141,7 @@ if (mode & MAY_EXEC) arg.access |= NFS3_ACCESS_EXECUTE; } - flags = (ruid) ? 
RPC_CALL_REALUID : 0; - status = rpc_call(NFS_CLIENT(inode), NFS3PROC_ACCESS, &arg, &res, flags); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); nfs_refresh_inode(inode, &fattr); dprintk("NFS reply access\n"); @@ -298,16 +298,21 @@ static int nfs3_proc_unlink_setup(struct rpc_message *msg, struct dentry *dir, struct qstr *name) { + struct { + struct nfs3_diropargs arg; + struct nfs_fattr res; + } *data; struct nfs3_diropargs *arg; struct nfs_fattr *res; - arg = (struct nfs3_diropargs *)kmalloc(sizeof(*arg)+sizeof(*res), GFP_KERNEL); - if (!arg) + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (!data) return -ENOMEM; - res = (struct nfs_fattr*)(arg + 1); + arg = &data->arg; arg->fh = NFS_FH(dir->d_inode); arg->name = name->name; arg->len = name->len; + res = &data->res; res->valid = 0; msg->rpc_proc = NFS3PROC_REMOVE; msg->rpc_argp = arg; @@ -483,24 +488,42 @@ return status; } -/* - * This is a combo call of fsstat and fsinfo - */ static int nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) + struct nfs_fsstat *stat) { int status; - dprintk("NFS call fsstat\n"); - memset((char *)info, 0, sizeof(*info)); - status = rpc_call(server->client, NFS3PROC_FSSTAT, fhandle, info, 0); - if (status < 0) - goto error; + stat->fattr->valid = 0; + dprintk("NFS call statfs\n"); + status = rpc_call(server->client, NFS3PROC_FSSTAT, fhandle, stat, 0); + dprintk("NFS reply statfs: %d\n", status); + return status; +} + +static int +nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + + info->fattr->valid = 0; + dprintk("NFS call fsinfo\n"); status = rpc_call(server->client, NFS3PROC_FSINFO, fhandle, info, 0); + dprintk("NFS reply fsinfo: %d\n", status); + return status; +} -error: - dprintk("NFS reply statfs: %d\n", status); +static int +nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_pathconf *info) +{ + int status; + + info->fattr->valid 
= 0; + dprintk("NFS call pathconf\n"); + status = rpc_call(server->client, NFS3PROC_PATHCONF, fhandle, info, 0); + dprintk("NFS reply pathconf: %d\n", status); return status; } @@ -529,5 +552,7 @@ nfs3_proc_readdir, nfs3_proc_mknod, nfs3_proc_statfs, + nfs3_proc_fsinfo, + nfs3_proc_pathconf, nfs3_decode_dirent, }; diff -u --recursive --new-file linux-2.4.22-pre2/fs/nfs/nfs3xdr.c linux-2.4.22-22-soft2/fs/nfs/nfs3xdr.c --- linux-2.4.22-pre2/fs/nfs/nfs3xdr.c 2002-10-06 01:03:40.000000000 +0200 +++ linux-2.4.22-22-soft2/fs/nfs/nfs3xdr.c 2003-06-27 00:50:51.000000000 +0200 @@ -181,6 +181,7 @@ /* Update the mode bits */ fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3); + fattr->timestamp = jiffies; return p; } @@ -465,6 +466,13 @@ return 0; } +/* Hack to sign-extend 32-bit cookies: when bit 31 is set, flip the upper 32 bits (applying it twice restores the original value) */ +static inline +u64 nfs_transform_cookie64(u64 cookie) +{ + return (cookie & 0x80000000) ? (cookie ^ 0xFFFFFFFF00000000ULL) : cookie; +} + /* * Encode arguments to readdir call */ @@ -476,7 +484,7 @@ u32 count = args->count; p = xdr_encode_fhandle(p, args->fh); - p = xdr_encode_hyper(p, args->cookie); + p = xdr_encode_hyper(p, nfs_transform_cookie64(args->cookie)); *p++ = args->verf[0]; *p++ = args->verf[1]; if (args->plus) { @@ -600,6 +608,7 @@ nfs3_decode_dirent(u32 *p, struct nfs_entry *entry, int plus) { struct nfs_entry old = *entry; + u64 cookie; if (!*p++) { if (!*p) @@ -613,24 +622,23 @@ entry->name = (const char *) p; p += XDR_QUADLEN(entry->len); entry->prev_cookie = entry->cookie; - p = xdr_decode_hyper(p, &entry->cookie); + p = xdr_decode_hyper(p, &cookie); + entry->cookie = nfs_transform_cookie64(cookie); if (plus) { - p = xdr_decode_post_op_attr(p, &entry->fattr); + entry->fattr->valid = 0; + p = xdr_decode_post_op_attr(p, entry->fattr); /* In fact, a post_op_fh3: */ if (*p++) { - p = xdr_decode_fhandle(p, &entry->fh); + p = xdr_decode_fhandle(p, entry->fh); /* Ugh -- server reply was truncated */ if (p == NULL) { dprintk("NFS: FH truncated\n"); *entry = old; return 
ERR_PTR(-EAGAIN); } - } else { - /* If we don't get a file handle, the attrs - * aren't worth a lot. */ - entry->fattr.valid = 0; - } + } else + memset((u8*)(entry->fh), 0, sizeof(*entry->fh)); } entry->eof = !p[0] && p[1]; @@ -913,14 +921,13 @@ * Decode FSSTAT reply */ static int -nfs3_xdr_fsstatres(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res) +nfs3_xdr_fsstatres(struct rpc_rqst *req, u32 *p, struct nfs_fsstat *res) { - struct nfs_fattr dummy; int status; status = ntohl(*p++); - p = xdr_decode_post_op_attr(p, &dummy); + p = xdr_decode_post_op_attr(p, res->fattr); if (status != 0) return -nfs_stat_to_errno(status); @@ -930,8 +937,7 @@ p = xdr_decode_hyper(p, &res->tfiles); p = xdr_decode_hyper(p, &res->ffiles); p = xdr_decode_hyper(p, &res->afiles); - - /* ignore invarsec */ + res->invarsec = ntohl(*p++); return 0; } @@ -941,12 +947,11 @@ static int nfs3_xdr_fsinfores(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res) { - struct nfs_fattr dummy; int status; status = ntohl(*p++); - p = xdr_decode_post_op_attr(p, &dummy); + p = xdr_decode_post_op_attr(p, res->fattr); if (status != 0) return -nfs_stat_to_errno(status); @@ -958,8 +963,8 @@ res->wtmult = ntohl(*p++); res->dtpref = ntohl(*p++); p = xdr_decode_hyper(p, &res->maxfilesize); - - /* ignore time_delta and properties */ + p = xdr_decode_time3(p, &res->time_delta); + res->properties = ntohl(*p++); return 0; } @@ -967,20 +972,21 @@ * Decode PATHCONF reply */ static int -nfs3_xdr_pathconfres(struct rpc_rqst *req, u32 *p, struct nfs_fsinfo *res) +nfs3_xdr_pathconfres(struct rpc_rqst *req, u32 *p, struct nfs_pathconf *res) { - struct nfs_fattr dummy; int status; status = ntohl(*p++); - p = xdr_decode_post_op_attr(p, &dummy); + p = xdr_decode_post_op_attr(p, res->fattr); if (status != 0) return -nfs_stat_to_errno(status); res->linkmax = ntohl(*p++); - res->namelen = ntohl(*p++); - - /* ignore remaining fields */ + res->name_max = ntohl(*p++); + res->no_trunc = ntohl(*p++) != 0; + res->chown_restricted = 
ntohl(*p++) != 0; + res->case_insensitive = ntohl(*p++) != 0; + res->case_preserving = ntohl(*p++) != 0; return 0; } diff -u --recursive --new-file linux-2.4.22-pre2/fs/nfs/proc.c linux-2.4.22-22-soft2/fs/nfs/proc.c --- linux-2.4.22-pre2/fs/nfs/proc.c 2002-08-14 14:59:37.000000000 +0200 +++ linux-2.4.22-22-soft2/fs/nfs/proc.c 2003-06-27 00:49:54.000000000 +0200 @@ -351,17 +351,62 @@ static int nfs_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, - struct nfs_fsinfo *info) + struct nfs_fsstat *stat) { int status; + struct nfs2_statfs fsinfo; - dprintk("NFS call statfs\n"); - memset((char *)info, 0, sizeof(*info)); - status = rpc_call(server->client, NFSPROC_STATFS, fhandle, info, 0); + stat->fattr->valid = 0; + dprintk("NFS call statfs\n"); + status = rpc_call(server->client, NFSPROC_STATFS, fhandle, &fsinfo, 0); dprintk("NFS reply statfs: %d\n", status); + if (status) + goto out; + stat->tbytes = (u64)fsinfo.blocks * fsinfo.bsize; + stat->fbytes = (u64)fsinfo.bfree * fsinfo.bsize; + stat->abytes = (u64)fsinfo.bavail * fsinfo.bsize; + stat->tfiles = 0; + stat->ffiles = 0; + stat->afiles = 0; + stat->invarsec = 0; + out: return status; } +static int +nfs_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + struct nfs2_statfs fsinfo; + + info->fattr->valid = 0; + dprintk("NFS call fsinfo\n"); + status = rpc_call(server->client, NFSPROC_STATFS, fhandle, &fsinfo, 0); + dprintk("NFS reply fsinfo: %d\n", status); + if (status) + goto out; + info->rtmax = NFS_MAXDATA; + info->rtpref = fsinfo.tsize; + info->rtmult = fsinfo.bsize; + info->wtmax = NFS_MAXDATA; + info->wtpref = fsinfo.tsize; + info->wtmult = fsinfo.bsize; + info->dtpref = fsinfo.tsize; + info->maxfilesize = 0x7FFFFFFF; + info->time_delta = 0; + info->properties = 0x1b; + out: + return status; +} + +static int +nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_pathconf *info) +{ + return -ENOTSUPP; +} + extern 
u32 * nfs_decode_dirent(u32 *, struct nfs_entry *, int); struct nfs_rpc_ops nfs_v2_clientops = { @@ -387,5 +432,7 @@ nfs_proc_readdir, nfs_proc_mknod, nfs_proc_statfs, + nfs_proc_fsinfo, + nfs_proc_pathconf, nfs_decode_dirent, }; diff -u --recursive --new-file linux-2.4.22-pre2/fs/nfs/unlink.c linux-2.4.22-22-soft2/fs/nfs/unlink.c --- linux-2.4.22-pre2/fs/nfs/unlink.c 2002-08-11 13:34:02.000000000 +0200 +++ linux-2.4.22-22-soft2/fs/nfs/unlink.c 2003-06-27 00:48:56.000000000 +0200 @@ -12,6 +12,7 @@ #include #include #include +#include struct nfs_unlinkdata { @@ -21,6 +22,9 @@ struct rpc_task task; struct rpc_cred *cred; unsigned int count; + + wait_queue_head_t waitq; + int completed; }; static struct nfs_unlinkdata *nfs_deletes; @@ -133,6 +137,8 @@ put_rpccred(data->cred); data->cred = NULL; dput(dir); + data->completed = 1; + wake_up(&data->waitq); } /** @@ -175,6 +181,8 @@ nfs_deletes = data; data->count = 1; + init_waitqueue_head(&data->waitq); + task = &data->task; rpc_init_task(task, clnt, nfs_async_unlink_done , RPC_TASK_ASYNC); task->tk_calldata = data; @@ -212,7 +220,10 @@ data->count++; nfs_copy_dname(dentry, data); dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; - if (data->task.tk_rpcwait == &nfs_delete_queue) + if (data->task.tk_rpcwait == &nfs_delete_queue) { + struct rpc_clnt *clnt = data->task.tk_client; rpc_wake_up_task(&data->task); + nfs_wait_event(clnt, data->waitq, data->completed == 1); + } nfs_put_unlinkdata(data); } diff -u --recursive --new-file linux-2.4.22-pre2/fs/nfs/write.c linux-2.4.22-22-soft2/fs/nfs/write.c --- linux-2.4.22-pre2/fs/nfs/write.c 2002-08-14 14:58:39.000000000 +0200 +++ linux-2.4.22-22-soft2/fs/nfs/write.c 2003-06-27 00:50:12.000000000 +0200 @@ -123,23 +123,6 @@ } /* - * This function will be used to simulate weak cache consistency - * under NFSv2 when the NFSv3 attribute patch is included. - * For the moment, we just call nfs_refresh_inode(). 
- */ -static __inline__ int -nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr) -{ - if ((fattr->valid & NFS_ATTR_FATTR) && !(fattr->valid & NFS_ATTR_WCC)) { - fattr->pre_size = NFS_CACHE_ISIZE(inode); - fattr->pre_mtime = NFS_CACHE_MTIME(inode); - fattr->pre_ctime = NFS_CACHE_CTIME(inode); - fattr->valid |= NFS_ATTR_WCC; - } - return nfs_refresh_inode(inode, fattr); -} - -/* * Write a page synchronously. * Offset is the data offset within the page. */ @@ -812,8 +795,15 @@ * If wsize is smaller than page size, update and write * page synchronously. */ - if (NFS_SERVER(inode)->wsize < PAGE_CACHE_SIZE || IS_SYNC(inode)) - return nfs_writepage_sync(file, inode, page, offset, count); + if (NFS_SERVER(inode)->wsize < PAGE_CACHE_SIZE || IS_SYNC(inode)) { + status = nfs_writepage_sync(file, inode, page, offset, count); + if (status > 0) { + if (offset == 0 && status == PAGE_CACHE_SIZE) + SetPageUptodate(page); + return 0; + } + return status; + } /* * Try to find an NFS request corresponding to this page diff -u --recursive --new-file linux-2.4.22-pre2/include/linux/fs.h linux-2.4.22-22-soft2/include/linux/fs.h --- linux-2.4.22-pre2/include/linux/fs.h 2003-06-20 02:00:00.000000000 +0200 +++ linux-2.4.22-22-soft2/include/linux/fs.h 2003-06-27 00:57:38.000000000 +0200 @@ -395,7 +395,7 @@ int (*flushpage) (struct page *, unsigned long); int (*releasepage) (struct page *, int); #define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */ - int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int); + int (*direct_IO)(int, struct file *, struct kiobuf *, unsigned long, int); void (*removepage)(struct page *); /* called when page gets removed from the inode */ }; diff -u --recursive --new-file linux-2.4.22-pre2/include/linux/lockd/lockd.h linux-2.4.22-22-soft2/include/linux/lockd/lockd.h --- linux-2.4.22-pre2/include/linux/lockd/lockd.h 2002-02-05 08:49:27.000000000 +0100 +++ linux-2.4.22-22-soft2/include/linux/lockd/lockd.h 2003-06-27 
01:38:57.000000000 +0200 @@ -164,6 +164,7 @@ unsigned long nlmsvc_retry_blocked(void); int nlmsvc_traverse_blocks(struct nlm_host *, struct nlm_file *, int action); +void nlmsvc_grant_reply(struct svc_rqst *, struct nlm_cookie *, u32); /* * File handling for the server personality diff -u --recursive --new-file linux-2.4.22-pre2/include/linux/nfs_fs.h linux-2.4.22-22-soft2/include/linux/nfs_fs.h --- linux-2.4.22-pre2/include/linux/nfs_fs.h 2002-12-12 11:23:09.000000000 +0100 +++ linux-2.4.22-22-soft2/include/linux/nfs_fs.h 2003-06-27 00:58:15.000000000 +0200 @@ -102,8 +102,15 @@ #define NFS_FILEID(inode) ((inode)->u.nfs_i.fileid) -/* Inode Flags */ -#define NFS_USE_READDIRPLUS(inode) ((NFS_FLAGS(inode) & NFS_INO_ADVISE_RDPLUS) ? 1 : 0) +static inline int nfs_server_capable(struct inode *inode, int cap) +{ + return NFS_SERVER(inode)->caps & cap; +} + +static inline int NFS_USE_READDIRPLUS(struct inode *inode) +{ + return NFS_FLAGS(inode) & NFS_INO_ADVISE_RDPLUS; +} /* * These are the default flags for swap requests @@ -274,6 +281,11 @@ #define NFS_TestClearPageSync(page) test_and_clear_bit(PG_fs_1, &(page)->flags) /* + * linux/fs/nfs/direct.c + */ +extern int nfs_direct_IO(int, struct file *, struct kiobuf *, unsigned long, int); + +/* * linux/fs/mount_clnt.c * (Used only by nfsroot module) */ @@ -302,6 +314,23 @@ return __nfs_refresh_inode(inode,fattr); } +/* + * This function will be used to simulate weak cache consistency + * under NFSv2 when the NFSv3 attribute patch is included. + * For the moment, we just call nfs_refresh_inode(). 
+ */ +static __inline__ int +nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr) +{ + if ((fattr->valid & NFS_ATTR_FATTR) && !(fattr->valid & NFS_ATTR_WCC)) { + fattr->pre_size = NFS_CACHE_ISIZE(inode); + fattr->pre_mtime = NFS_CACHE_MTIME(inode); + fattr->pre_ctime = NFS_CACHE_CTIME(inode); + fattr->valid |= NFS_ATTR_WCC; + } + return nfs_refresh_inode(inode, fattr); +} + static inline loff_t nfs_size_to_loff_t(__u64 size) { diff -u --recursive --new-file linux-2.4.22-pre2/include/linux/nfs_fs_i.h linux-2.4.22-22-soft2/include/linux/nfs_fs_i.h --- linux-2.4.22-pre2/include/linux/nfs_fs_i.h 2002-03-12 16:35:02.000000000 +0100 +++ linux-2.4.22-22-soft2/include/linux/nfs_fs_i.h 2003-06-27 00:57:38.000000000 +0200 @@ -6,6 +6,16 @@ #include /* + * NFSv3 Access mode cache + */ +struct nfs_access_cache { + unsigned long jiffies; + struct rpc_cred * cred; + int mask; + int err; +}; + +/* * nfs fs inode data in memory */ struct nfs_inode_info { @@ -54,6 +64,8 @@ */ unsigned long cache_mtime_jiffies; + struct nfs_access_cache cache_access; + /* * This is the cookie verifier used for NFSv3 readdir * operations diff -u --recursive --new-file linux-2.4.22-pre2/include/linux/nfs_fs_sb.h linux-2.4.22-22-soft2/include/linux/nfs_fs_sb.h --- linux-2.4.22-pre2/include/linux/nfs_fs_sb.h 2002-02-05 08:55:11.000000000 +0100 +++ linux-2.4.22-22-soft2/include/linux/nfs_fs_sb.h 2003-06-27 00:57:38.000000000 +0200 @@ -10,6 +10,7 @@ struct rpc_clnt * client; /* RPC client handle */ struct nfs_rpc_ops * rpc_ops; /* NFS protocol vector */ int flags; /* various flags */ + unsigned int caps; /* server capabilities */ unsigned int rsize; /* read size */ unsigned int rpages; /* read size (in pages) */ unsigned int wsize; /* write size */ @@ -36,4 +37,8 @@ struct nfs_server s_server; }; +/* Server capabilities */ +#define NFS_CAP_READDIRPLUS 1 + + #endif diff -u --recursive --new-file linux-2.4.22-pre2/include/linux/nfs_xdr.h linux-2.4.22-22-soft2/include/linux/nfs_xdr.h --- 
linux-2.4.22-pre2/include/linux/nfs_xdr.h 2002-08-14 14:59:37.000000000 +0200 +++ linux-2.4.22-22-soft2/include/linux/nfs_xdr.h 2003-06-27 00:50:51.000000000 +0200 @@ -27,6 +27,7 @@ __u64 atime; __u64 mtime; __u64 ctime; + unsigned long timestamp; }; #define NFS_ATTR_WCC 0x0001 /* pre-op WCC data */ @@ -37,6 +38,7 @@ * Info on the file system */ struct nfs_fsinfo { + struct nfs_fattr *fattr; __u32 rtmax; /* max. read transfer size */ __u32 rtpref; /* pref. read transfer size */ __u32 rtmult; /* reads should be multiple of this */ @@ -45,21 +47,43 @@ __u32 wtmult; /* writes should be multiple of this */ __u32 dtpref; /* pref. readdir transfer size */ __u64 maxfilesize; - __u64 bsize; /* block size */ + __u64 time_delta; + __u32 properties; +}; + +struct nfs_fsstat { + struct nfs_fattr *fattr; __u64 tbytes; /* total size in bytes */ __u64 fbytes; /* # of free bytes */ __u64 abytes; /* # of bytes available to user */ __u64 tfiles; /* # of files */ __u64 ffiles; /* # of free files */ __u64 afiles; /* # of files available to user */ + __u32 invarsec; +}; + +struct nfs_pathconf { + struct nfs_fattr *fattr; /* Post-op attributes */ __u32 linkmax;/* max # of hard links */ - __u32 namelen;/* max name length */ + __u32 name_max;/* max name length */ + unsigned int no_trunc : 1, /* unsigned: a signed one-bit field cannot represent 1 */ + chown_restricted : 1, + case_insensitive : 1, + case_preserving : 1; +}; + +struct nfs2_statfs { + __u32 tsize; /* Server transfer size */ + __u32 bsize; /* Filesystem block size */ + __u32 blocks; /* No. of "bsize" blocks on filesystem */ + __u32 bfree; /* No. of free "bsize" blocks */ + __u32 bavail; /* No. of available "bsize" blocks */ }; /* Arguments to the read call. * Note that NFS_READ_MAXIOV must be <= (MAX_IOVEC-2) from sunrpc/xprt.h */ -#define NFS_READ_MAXIOV 8 +#define NFS_READ_MAXIOV (9) struct nfs_readargs { struct nfs_fh * fh; @@ -78,7 +102,7 @@ /* Arguments to the write call. 
* Note that NFS_WRITE_MAXIOV must be <= (MAX_IOVEC-2) from sunrpc/xprt.h */ -#define NFS_WRITE_MAXIOV 8 +#define NFS_WRITE_MAXIOV (9) struct nfs_writeargs { struct nfs_fh * fh; __u64 offset; @@ -109,8 +133,8 @@ const char * name; unsigned int len; int eof; - struct nfs_fh fh; - struct nfs_fattr fattr; + struct nfs_fh *fh; + struct nfs_fattr *fattr; }; /* @@ -300,7 +324,7 @@ struct iattr *); int (*lookup) (struct inode *, struct qstr *, struct nfs_fh *, struct nfs_fattr *); - int (*access) (struct inode *, int , int); + int (*access) (struct inode *, struct rpc_cred *, int); int (*readlink)(struct inode *, struct page *); int (*read) (struct inode *, struct rpc_cred *, struct nfs_fattr *, @@ -332,7 +356,11 @@ int (*mknod) (struct inode *, struct qstr *, struct iattr *, dev_t, struct nfs_fh *, struct nfs_fattr *); int (*statfs) (struct nfs_server *, struct nfs_fh *, + struct nfs_fsstat *); + int (*fsinfo) (struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); + int (*pathconf) (struct nfs_server *, struct nfs_fh *, + struct nfs_pathconf *); u32 * (*decode_dirent)(u32 *, struct nfs_entry *, int plus); }; diff -u --recursive --new-file linux-2.4.22-pre2/include/linux/sunrpc/clnt.h linux-2.4.22-22-soft2/include/linux/sunrpc/clnt.h --- linux-2.4.22-pre2/include/linux/sunrpc/clnt.h 2002-08-15 03:05:32.000000000 +0200 +++ linux-2.4.22-22-soft2/include/linux/sunrpc/clnt.h 2003-06-27 00:58:15.000000000 +0200 @@ -49,8 +49,8 @@ cl_droppriv : 1,/* enable NFS suid hack */ cl_oneshot : 1,/* dispose after use */ cl_dead : 1;/* abandoned */ - unsigned int cl_flags; /* misc client flags */ - unsigned long cl_hardmax; /* max hard timeout */ + unsigned long cl_flags; /* misc client flags */ + unsigned long cl_timeo; /* last timeout message */ struct rpc_rtt cl_rtt; /* RTO estimator data */ @@ -66,6 +66,8 @@ #define cl_port cl_pmap.pm_port #define cl_prot cl_pmap.pm_prot +#define RPC_CLNT_NORESPONSE 1 + /* * General RPC program info */ diff -u --recursive --new-file 
linux-2.4.22-pre2/include/linux/sunrpc/sched.h linux-2.4.22-22-soft2/include/linux/sunrpc/sched.h --- linux-2.4.22-pre2/include/linux/sunrpc/sched.h 2002-08-14 15:27:16.000000000 +0200 +++ linux-2.4.22-22-soft2/include/linux/sunrpc/sched.h 2003-06-27 00:58:15.000000000 +0200 @@ -108,10 +108,10 @@ #define RPC_TASK_SETUID 0x0004 /* is setuid process */ #define RPC_TASK_CHILD 0x0008 /* is child of other task */ #define RPC_CALL_REALUID 0x0010 /* try using real uid */ -#define RPC_CALL_MAJORSEEN 0x0020 /* major timeout seen */ -#define RPC_TASK_ROOTCREDS 0x0040 /* force root creds */ -#define RPC_TASK_DYNAMIC 0x0080 /* task was kmalloc'ed */ -#define RPC_TASK_KILLED 0x0100 /* task was killed */ +#define RPC_TASK_ROOTCREDS 0x0020 /* force root creds */ +#define RPC_TASK_KILLED 0x0040 /* task was killed */ +#define RPC_TASK_SOFT 0x0080 /* soft time out */ +#define RPC_TASK_RTT 0x0100 /* use round trip timer */ #define RPC_IS_ASYNC(t) ((t)->tk_flags & RPC_TASK_ASYNC) #define RPC_IS_SETUID(t) ((t)->tk_flags & RPC_TASK_SETUID) @@ -121,6 +121,8 @@ #define RPC_ASSASSINATED(t) ((t)->tk_flags & RPC_TASK_KILLED) #define RPC_IS_ACTIVATED(t) ((t)->tk_active) #define RPC_DO_CALLBACK(t) ((t)->tk_callback != NULL) +#define RPC_IS_SOFT(t) ((t)->tk_flags & RPC_TASK_SOFT) +#define RPC_USE_RTT(t) ((t)->tk_flags & RPC_TASK_RTT) #define RPC_TASK_SLEEPING 0 #define RPC_TASK_RUNNING 1 diff -u --recursive --new-file linux-2.4.22-pre2/include/linux/sunrpc/xprt.h linux-2.4.22-22-soft2/include/linux/sunrpc/xprt.h --- linux-2.4.22-pre2/include/linux/sunrpc/xprt.h 2002-08-15 03:05:32.000000000 +0200 +++ linux-2.4.22-22-soft2/include/linux/sunrpc/xprt.h 2003-06-27 00:58:15.000000000 +0200 @@ -44,6 +44,19 @@ #define RPC_MAX_UDP_TIMEOUT (60*HZ) #define RPC_MAX_TCP_TIMEOUT (600*HZ) +/* + * * Wait duration for an RPC TCP connection to be established. Solaris + * * NFS over TCP uses 60 seconds, for example, which is in line with how + * * long a server takes to reboot. 
+ * */ +#define RPC_CONNECT_TIMEOUT (60*HZ) + +/* + * * Delay an arbitrary number of seconds before attempting to reconnect + * * after an error. + * */ +#define RPC_REESTABLISH_TIMEOUT (15*HZ) + /* RPC call and reply header size as number of 32bit words (verifier * size computed separately) */ @@ -57,8 +70,7 @@ unsigned long to_current, /* current timeout */ to_initval, /* initial timeout */ to_maxval, /* max timeout */ - to_increment, /* if !exponential */ - to_resrvval; /* reserve timeout */ + to_increment; /* if !exponential */ short to_retries; /* max # of retries */ unsigned char to_exponential; }; @@ -134,6 +146,7 @@ unsigned long sockstate; /* Socket state */ unsigned char shutdown : 1, /* being shut down */ nocong : 1, /* no congestion control */ + resvport : 1, /* use a reserved port */ stream : 1; /* TCP */ /* @@ -173,18 +186,18 @@ void xprt_set_timeout(struct rpc_timeout *, unsigned int, unsigned long); -int xprt_reserve(struct rpc_task *); +void xprt_reserve(struct rpc_task *); void xprt_transmit(struct rpc_task *); void xprt_receive(struct rpc_task *); int xprt_adjust_timeout(struct rpc_timeout *); void xprt_release(struct rpc_task *); -void xprt_reconnect(struct rpc_task *); +void xprt_connect(struct rpc_task *); int xprt_clear_backlog(struct rpc_xprt *); void xprt_sock_setbufsize(struct rpc_xprt *); #define XPRT_CONNECT 0 -#define xprt_connected(xp) (!(xp)->stream || test_bit(XPRT_CONNECT, &(xp)->sockstate)) +#define xprt_connected(xp) (test_bit(XPRT_CONNECT, &(xp)->sockstate)) #define xprt_set_connected(xp) (set_bit(XPRT_CONNECT, &(xp)->sockstate)) #define xprt_test_and_set_connected(xp) (test_and_set_bit(XPRT_CONNECT, &(xp)->sockstate)) #define xprt_clear_connected(xp) (clear_bit(XPRT_CONNECT, &(xp)->sockstate)) diff -u --recursive --new-file linux-2.4.22-pre2/mm/filemap.c linux-2.4.22-22-soft2/mm/filemap.c --- linux-2.4.22-pre2/mm/filemap.c 2003-06-03 13:23:57.000000000 +0200 +++ linux-2.4.22-22-soft2/mm/filemap.c 2003-06-27 00:50:13.000000000 
+0200 @@ -1607,7 +1607,7 @@ if (retval) break; - retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize); + retval = mapping->a_ops->direct_IO(rw, filp, iobuf, (offset+progress) >> blocksize_bits, blocksize); if (rw == READ && retval > 0) mark_dirty_kiobuf(iobuf, retval); diff -u --recursive --new-file linux-2.4.22-pre2/net/sunrpc/clnt.c linux-2.4.22-22-soft2/net/sunrpc/clnt.c --- linux-2.4.22-pre2/net/sunrpc/clnt.c 2002-08-21 20:48:11.000000000 +0200 +++ linux-2.4.22-22-soft2/net/sunrpc/clnt.c 2003-06-27 00:51:27.000000000 +0200 @@ -55,9 +55,8 @@ static void call_refresh(struct rpc_task *task); static void call_refreshresult(struct rpc_task *task); static void call_timeout(struct rpc_task *task); -static void call_reconnect(struct rpc_task *task); -static void child_reconnect(struct rpc_task *); -static void child_reconnect_status(struct rpc_task *); +static void call_connect(struct rpc_task *task); +static void call_connect_status(struct rpc_task *); static u32 * call_header(struct rpc_task *task); static u32 * call_verify(struct rpc_task *task); @@ -394,8 +393,6 @@ static void call_reserve(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; - dprintk("RPC: %4d call_reserve\n", task->tk_pid); if (!rpcauth_uptodatecred(task)) { @@ -405,7 +402,6 @@ task->tk_status = 0; task->tk_action = call_reserveresult; - task->tk_timeout = clnt->cl_timeout.to_resrvval; xprt_reserve(task); } @@ -419,38 +415,46 @@ dprintk("RPC: %4d call_reserveresult (status %d)\n", task->tk_pid, task->tk_status); + /* * After a call to xprt_reserve(), we must have either * a request slot or else an error status. 
*/ - if ((task->tk_status >= 0 && !task->tk_rqstp) || - (task->tk_status < 0 && task->tk_rqstp)) - printk(KERN_ERR "call_reserveresult: status=%d, request=%p??\n", - task->tk_status, task->tk_rqstp); + task->tk_status = 0; + if (status >= 0) { + if (task->tk_rqstp) { + task->tk_action = call_allocate; + return; + } - if (task->tk_status >= 0) { - task->tk_action = call_allocate; + printk(KERN_ERR "%s: status=%d, but no request slot, exiting\n", + __FUNCTION__, status); + rpc_exit(task, -EIO); return; } - task->tk_status = 0; + /* + * Even though there was an error, we may have acquired + * a request slot somehow. Make sure not to leak it. + */ + if (task->tk_rqstp) { + printk(KERN_ERR "%s: status=%d, request allocated anyway\n", + __FUNCTION__, status); + xprt_release(task); + } + switch (status) { - case -EAGAIN: - case -ENOBUFS: - task->tk_timeout = task->tk_client->cl_timeout.to_resrvval; + case -EAGAIN: /* woken up; retry */ task->tk_action = call_reserve; - break; - case -ETIMEDOUT: - dprintk("RPC: task timed out\n"); - task->tk_action = call_timeout; + return; + case -EIO: /* probably a shutdown */ break; default: - if (!task->tk_rqstp) { - printk(KERN_INFO "RPC: task has no request, exit EIO\n"); - rpc_exit(task, -EIO); - } else - rpc_exit(task, status); + printk(KERN_ERR "%s: unrecognized error %d, exiting\n", + __FUNCTION__, status); + break; } + rpc_exit(task, status); } /* @@ -545,53 +549,69 @@ struct rpc_clnt *clnt = task->tk_client; struct rpc_xprt *xprt = clnt->cl_xprt; - task->tk_action = (xprt_connected(xprt)) ? call_transmit : call_reconnect; + dprintk("RPC: %4d call_bind xprt %p %s connected\n", task->tk_pid, + xprt, (xprt_connected(xprt) ? "is" : "is not")); + + task->tk_action = (xprt_connected(xprt)) ? call_transmit : call_connect; if (!clnt->cl_port) { - task->tk_action = call_reconnect; + task->tk_action = call_connect; task->tk_timeout = clnt->cl_timeout.to_maxval; rpc_getport(task, clnt); } } /* - * 4a. 
Reconnect to the RPC server (TCP case) + * 4a. Establish socket + * Connect to the RPC server (TCP case) */ static void -call_reconnect(struct rpc_task *task) +call_connect(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - struct rpc_task *child; - dprintk("RPC: %4d call_reconnect status %d\n", + dprintk("RPC: %4d call_connect status %d\n", task->tk_pid, task->tk_status); - task->tk_action = call_transmit; - if (task->tk_status < 0 || !clnt->cl_xprt->stream) + if (xprt_connected(clnt->cl_xprt)) { + task->tk_action = call_transmit; return; - - /* Run as a child to ensure it runs as an rpciod task */ - child = rpc_new_child(clnt, task); - if (child) { - child->tk_action = child_reconnect; - rpc_run_child(task, child, NULL); } + task->tk_action = call_connect_status; + if (task->tk_status < 0) + return; + xprt_connect(task); } -static void child_reconnect(struct rpc_task *task) +/* + * 4b. Sort out reconnection result + */ +static void call_connect_status(struct rpc_task *task) { - task->tk_client->cl_stats->netreconn++; + struct rpc_clnt *clnt = task->tk_client; + int status = task->tk_status; + task->tk_status = 0; - task->tk_action = child_reconnect_status; - xprt_reconnect(task); -} + if (status >= 0) { + clnt->cl_stats->netreconn++; + task->tk_action = call_transmit; + return; + } -static void child_reconnect_status(struct rpc_task *task) -{ - if (task->tk_status == -EAGAIN) - task->tk_action = child_reconnect; - else - task->tk_action = NULL; + /* Something failed: we may have to rebind */ + if (clnt->cl_autobind) + clnt->cl_port = 0; + switch (status) { + case -ECONNREFUSED: + case -ECONNRESET: + case -ENOTCONN: + case -ETIMEDOUT: + case -EAGAIN: + task->tk_action = (clnt->cl_port == 0) ? 
call_bind : call_connect; + break; + default: + rpc_exit(task, status); + } } /* @@ -651,10 +671,8 @@ task->tk_action = call_bind; break; } - if (xprt->stream) { - task->tk_action = call_reconnect; - break; - } + task->tk_action = call_connect; + break; /* * Sleep and dream of an open connection */ @@ -681,54 +699,46 @@ call_timeout(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - struct rpc_rqst *req = task->tk_rqstp; - - if (req) { - struct rpc_timeout *to = &req->rq_timeout; + struct rpc_timeout *to = &task->tk_rqstp->rq_timeout; - if (xprt_adjust_timeout(to)) { - dprintk("RPC: %4d call_timeout (minor timeo)\n", - task->tk_pid); - goto minor_timeout; + if (xprt_adjust_timeout(to)) { + dprintk("RPC: %4d call_timeout (minor)\n", task->tk_pid); + goto retry; + } + to->to_retries = clnt->cl_timeout.to_retries; + + dprintk("RPC: %4d call_timeout (major)\n", task->tk_pid); + if (RPC_IS_SOFT(task)) { + if (clnt->cl_chatty) { + if (!test_and_set_bit(RPC_CLNT_NORESPONSE, &clnt->cl_flags) + || time_after(jiffies, clnt->cl_timeo + 20*HZ)) { + printk(KERN_NOTICE "%s: server %s is not responding, timed out\n", + clnt->cl_protname, clnt->cl_server); + clnt->cl_timeo = jiffies; + } } - to->to_retries = clnt->cl_timeout.to_retries; - } - - dprintk("RPC: %4d call_timeout (major timeo)\n", task->tk_pid); - if (clnt->cl_softrtry) { - if (clnt->cl_chatty && !task->tk_exit) - printk(KERN_NOTICE "%s: server %s not responding, timed out\n", - clnt->cl_protname, clnt->cl_server); rpc_exit(task, -EIO); return; } - if (clnt->cl_chatty && !(task->tk_flags & RPC_CALL_MAJORSEEN) && rpc_ntimeo(&clnt->cl_rtt) > 7) { - task->tk_flags |= RPC_CALL_MAJORSEEN; - if (req) - printk(KERN_NOTICE "%s: server %s not responding, still trying\n", - clnt->cl_protname, clnt->cl_server); -#ifdef RPC_DEBUG - else - printk(KERN_NOTICE "%s: task %d can't get a request slot\n", - clnt->cl_protname, task->tk_pid); -#endif - } + + if (clnt->cl_chatty) { + if 
(!test_and_set_bit(RPC_CLNT_NORESPONSE, &clnt->cl_flags)) { + clnt->cl_flags |= RPC_CLNT_NORESPONSE; + printk(KERN_NOTICE "%s: server %s is not responding\n", + clnt->cl_protname, clnt->cl_server); + clnt->cl_timeo = jiffies; + } else if (time_after(jiffies, clnt->cl_timeo + 20*HZ)) { + printk(KERN_NOTICE "%s: server %s is not responding, still trying\n", + clnt->cl_protname, clnt->cl_server); + clnt->cl_timeo = jiffies; + } + } else if (clnt->cl_autobind) clnt->cl_port = 0; -minor_timeout: - if (!req) - task->tk_action = call_reserve; - else if (!clnt->cl_port) { - task->tk_action = call_bind; - clnt->cl_stats->rpcretrans++; - } else if (!xprt_connected(clnt->cl_xprt)) { - task->tk_action = call_reconnect; - clnt->cl_stats->rpcretrans++; - } else { - task->tk_action = call_transmit; - clnt->cl_stats->rpcretrans++; - } +retry: + clnt->cl_stats->rpcretrans++; + task->tk_action = call_bind; task->tk_status = 0; } @@ -746,14 +756,13 @@ dprintk("RPC: %4d call_decode (status %d)\n", task->tk_pid, task->tk_status); - if (clnt->cl_chatty && (task->tk_flags & RPC_CALL_MAJORSEEN)) { + if (clnt->cl_chatty && test_and_clear_bit(RPC_CLNT_NORESPONSE, &clnt->cl_flags)) { printk(KERN_NOTICE "%s: server %s OK\n", clnt->cl_protname, clnt->cl_server); - task->tk_flags &= ~RPC_CALL_MAJORSEEN; } if (task->tk_status < 12) { - if (!clnt->cl_softrtry) { + if (!RPC_IS_SOFT(task)) { task->tk_action = call_transmit; clnt->cl_stats->rpcretrans++; } else { diff -u --recursive --new-file linux-2.4.22-pre2/net/sunrpc/sched.c linux-2.4.22-22-soft2/net/sunrpc/sched.c --- linux-2.4.22-pre2/net/sunrpc/sched.c 2003-03-19 22:55:06.000000000 +0100 +++ linux-2.4.22-22-soft2/net/sunrpc/sched.c 2003-06-27 00:51:27.000000000 +0200 @@ -761,8 +761,13 @@ list_add(&task->tk_task, &all_tasks); spin_unlock(&rpc_sched_lock); - if (clnt) + if (clnt) { atomic_inc(&clnt->cl_users); + if (clnt->cl_softrtry) + task->tk_flags |= RPC_TASK_SOFT; + if (clnt->cl_prot == IPPROTO_UDP) + task->tk_flags |= RPC_TASK_RTT; + } 
#ifdef RPC_DEBUG task->tk_magic = 0xf00baa; @@ -799,7 +804,6 @@ task->tk_release = rpc_default_free_task; dprintk("RPC: %4d allocated task\n", task->tk_pid); - task->tk_flags |= RPC_TASK_DYNAMIC; out: return task; diff -u --recursive --new-file linux-2.4.22-pre2/net/sunrpc/xprt.c linux-2.4.22-22-soft2/net/sunrpc/xprt.c --- linux-2.4.22-pre2/net/sunrpc/xprt.c 2003-02-20 21:39:51.000000000 +0100 +++ linux-2.4.22-22-soft2/net/sunrpc/xprt.c 2003-06-27 00:51:27.000000000 +0200 @@ -83,10 +83,10 @@ */ static void xprt_request_init(struct rpc_task *, struct rpc_xprt *); static void do_xprt_transmit(struct rpc_task *); -static void xprt_reserve_status(struct rpc_task *task); +static inline void do_xprt_reserve(struct rpc_task *); static void xprt_disconnect(struct rpc_xprt *); -static void xprt_reconn_status(struct rpc_task *task); -static struct socket *xprt_create_socket(int, struct rpc_timeout *); +static void xprt_connect_status(struct rpc_task *task); +static struct socket *xprt_create_socket(int, struct rpc_timeout *, int); static int xprt_bind_socket(struct rpc_xprt *, struct socket *); static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); @@ -133,14 +133,17 @@ /* * Serialize write access to sockets, in order to prevent different * requests from interfering with each other. - * Also prevents TCP socket reconnections from colliding with writes. + * Also prevents TCP socket connections from colliding with writes. 
*/ static int __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { if (!xprt->snd_task) { - if (xprt->nocong || __xprt_get_cong(xprt, task)) + if (xprt->nocong || __xprt_get_cong(xprt, task)) { xprt->snd_task = task; + if (task->tk_rqstp) + task->tk_rqstp->rq_bytes_sent = 0; + } } if (xprt->snd_task != task) { dprintk("RPC: %4d TCP write queue full\n", task->tk_pid); @@ -179,8 +182,11 @@ if (!task) return; } - if (xprt->nocong || __xprt_get_cong(xprt, task)) + if (xprt->nocong || __xprt_get_cong(xprt, task)) { xprt->snd_task = task; + if (task->tk_rqstp) + task->tk_rqstp->rq_bytes_sent = 0; + } } /* @@ -266,6 +272,7 @@ */ case -EAGAIN: break; + case -ECONNRESET: case -ENOTCONN: case -EPIPE: /* connection broken */ @@ -383,6 +390,7 @@ if (!sk) return; + write_lock_bh(&sk->callback_lock); xprt->inet = NULL; xprt->sock = NULL; @@ -390,17 +398,12 @@ sk->data_ready = xprt->old_data_ready; sk->state_change = xprt->old_state_change; sk->write_space = xprt->old_write_space; + write_unlock_bh(&sk->callback_lock); xprt_disconnect(xprt); sk->no_check = 0; sock_release(sock); - /* - * TCP doesn't require the rpciod now - other things may - * but rpciod handles that not us. - */ - if(xprt->stream) - rpciod_down(); } /* @@ -410,31 +413,29 @@ xprt_disconnect(struct rpc_xprt *xprt) { dprintk("RPC: disconnected transport %p\n", xprt); + spin_lock_bh(&xprt->sock_lock); xprt_clear_connected(xprt); rpc_wake_up_status(&xprt->pending, -ENOTCONN); + spin_unlock_bh(&xprt->sock_lock); } /* * Reconnect a broken TCP connection. 
* - * Note: This cannot collide with the TCP reads, as both run from rpciod */ void -xprt_reconnect(struct rpc_task *task) +xprt_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; struct socket *sock = xprt->sock; struct sock *inet; int status; - dprintk("RPC: %4d xprt_reconnect %p connected %d\n", + dprintk("RPC: %4d xprt_connect %p connected %d\n", task->tk_pid, xprt, xprt_connected(xprt)); if (xprt->shutdown) return; - if (!xprt->stream) - return; - if (!xprt->addr.sin_port) { task->tk_status = -EIO; return; @@ -445,76 +446,112 @@ if (xprt_connected(xprt)) goto out_write; - if (sock && sock->state != SS_UNCONNECTED) - xprt_close(xprt); - status = -ENOTCONN; - if (!(inet = xprt->inet)) { - /* Create an unconnected socket */ - if (!(sock = xprt_create_socket(xprt->prot, &xprt->timeout))) - goto defer; - xprt_bind_socket(xprt, sock); - inet = sock->sk; + if (task->tk_rqstp) + task->tk_rqstp->rq_bytes_sent = 0; + + xprt_close(xprt); + /* Create an unconnected socket */ + sock = xprt_create_socket(xprt->prot, &xprt->timeout, xprt->resvport); + if (!sock) { + /* couldn't create socket or bind to reserved port; + * this is likely a permanent error, so cause an abort */ + task->tk_status = -EIO; + goto out_write; } + xprt_bind_socket(xprt, sock); + + if (!xprt->stream) + goto out_write; + + inet = sock->sk; /* Now connect it asynchronously. 
*/ dprintk("RPC: %4d connecting new socket\n", task->tk_pid); status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr, sizeof(xprt->addr), O_NONBLOCK); + dprintk("RPC: %4d connect status %d connected %d\n", + task->tk_pid, status, xprt_connected(xprt)); - if (status < 0) { - switch (status) { - case -EALREADY: - case -EINPROGRESS: - status = 0; - break; - case -EISCONN: - case -EPIPE: - status = 0; - xprt_close(xprt); - goto defer; - default: - printk("RPC: TCP connect error %d!\n", -status); - xprt_close(xprt); - goto defer; - } + if (status >= 0) + return; + switch (status) { + case -EALREADY: + case -EINPROGRESS: /* Protect against TCP socket state changes */ lock_sock(inet); - dprintk("RPC: %4d connect status %d connected %d\n", - task->tk_pid, status, xprt_connected(xprt)); if (inet->state != TCP_ESTABLISHED) { - task->tk_timeout = xprt->timeout.to_maxval; - /* if the socket is already closing, delay 5 secs */ + dprintk("RPC: %4d waiting for connection\n", + task->tk_pid); + task->tk_timeout = RPC_CONNECT_TIMEOUT; + /* if the socket is already closing, delay briefly */ if ((1<<inet->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) - task->tk_timeout = 5*HZ; - rpc_sleep_on(&xprt->pending, task, xprt_reconn_status, NULL); - release_sock(inet); - return; + task->tk_timeout = RPC_REESTABLISH_TIMEOUT; + rpc_sleep_on(&xprt->pending, task, xprt_connect_status, + NULL); } release_sock(inet); + break; + case -ECONNREFUSED: + case -ECONNRESET: + case -ENOTCONN: + if (!RPC_IS_SOFT(task)) { + rpc_delay(task, RPC_REESTABLISH_TIMEOUT); + task->tk_status = -ENOTCONN; + break; + } + default: + /* Report myriad other possible returns. If this file + * system is soft mounted, just error out, like Solaris.
*/ + if (RPC_IS_SOFT(task)) { + printk(KERN_WARNING + "RPC: error %d connecting to server %s, exiting\n", + -status, task->tk_client->cl_server); + task->tk_status = -EIO; + goto out_write; + } + printk(KERN_WARNING "RPC: error %d connecting to server %s\n", + -status, task->tk_client->cl_server); + /* This will prevent anybody else from connecting */ + rpc_delay(task, RPC_REESTABLISH_TIMEOUT); + task->tk_status = status; + break; } -defer: - if (status < 0) { - rpc_delay(task, 5*HZ); - task->tk_status = -ENOTCONN; - } + return; out_write: xprt_release_write(xprt, task); } /* - * Reconnect timeout. We just mark the transport as not being in the - * process of reconnecting, and leave the rest to the upper layers. + * We arrive here when awoken from waiting on connection establishment. */ static void -xprt_reconn_status(struct rpc_task *task) +xprt_connect_status(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; - dprintk("RPC: %4d xprt_reconn_timeout %d\n", - task->tk_pid, task->tk_status); + if (task->tk_status >= 0) { + dprintk("RPC: %4d xprt_connect_status: connection established\n", + task->tk_pid); + return; + } + + /* if soft mounted, cause this RPC to fail */ + if (RPC_IS_SOFT(task)) + task->tk_status = -EIO; + switch (task->tk_status) { + case -ENOTCONN: + rpc_delay(task, RPC_REESTABLISH_TIMEOUT); + return; + case -ETIMEDOUT: + dprintk("RPC: %4d xprt_connect_status: timed out\n", + task->tk_pid); + break; + default: + printk(KERN_ERR "RPC: error %d connecting to server %s\n", + -task->tk_status, task->tk_client->cl_server); + } xprt_release_write(xprt, task); } @@ -657,8 +694,9 @@ struct sk_buff *skb; int err, repsize, copied; + read_lock(&sk->callback_lock); dprintk("RPC: udp_data_ready...\n"); - if (!(xprt = xprt_from_sock(sk))) { + if (sk->dead || !(xprt = xprt_from_sock(sk))) { printk("RPC: udp_data_ready request not found!\n"); goto out; } @@ -707,6 +745,7 @@ out: if (sk->sleep && waitqueue_active(sk->sleep)) 
wake_up_interruptible(sk->sleep); + read_unlock(&sk->callback_lock); } /* @@ -895,7 +934,7 @@ } /* Skip over any trailing bytes on short reads */ tcp_read_discard(xprt, &desc); - } while (desc.count && xprt_connected(xprt)); + } while (desc.count); dprintk("RPC: tcp_data_recv done\n"); return len - desc.count; } @@ -905,18 +944,21 @@ struct rpc_xprt *xprt; read_descriptor_t rd_desc; + read_lock(&sk->callback_lock); dprintk("RPC: tcp_data_ready...\n"); if (!(xprt = xprt_from_sock(sk))) { printk("RPC: tcp_data_ready socket info not found!\n"); - return; + goto out; } if (xprt->shutdown) - return; + goto out; /* We use rd_desc to pass struct xprt to tcp_data_recv */ rd_desc.buf = (char *)xprt; rd_desc.count = 65536; tcp_read_sock(sk, &rd_desc, tcp_data_recv); +out: + read_unlock(&sk->callback_lock); } static void @@ -924,6 +966,7 @@ { struct rpc_xprt *xprt; + read_lock(&sk->callback_lock); if (!(xprt = xprt_from_sock(sk))) goto out; dprintk("RPC: tcp_state_change client %p...\n", xprt); @@ -942,10 +985,10 @@ xprt->tcp_copied = 0; xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; - spin_lock(&xprt->sock_lock); + spin_lock_bh(&xprt->sock_lock); if (xprt->snd_task && xprt->snd_task->tk_rpcwait == &xprt->pending) rpc_wake_up_task(xprt->snd_task); - spin_unlock(&xprt->sock_lock); + spin_unlock_bh(&xprt->sock_lock); break; case TCP_SYN_SENT: case TCP_SYN_RECV: @@ -957,6 +1000,7 @@ out: if (sk->sleep && waitqueue_active(sk->sleep)) wake_up_interruptible_all(sk->sleep); + read_unlock(&sk->callback_lock); } /* @@ -971,24 +1015,25 @@ struct rpc_xprt *xprt; struct socket *sock; + read_lock(&sk->callback_lock); if (!(xprt = xprt_from_sock(sk)) || !(sock = sk->socket)) - return; + goto out; if (xprt->shutdown) - return; + goto out; /* Wait until we have enough socket memory */ if (xprt->stream) { /* from net/ipv4/tcp.c:tcp_write_space */ if (tcp_wspace(sk) < tcp_min_write_space(sk)) - return; + goto out; } else { /* from net/core/sock.c:sock_def_write_space */ if 
(!sock_writeable(sk)) - return; + goto out; } if (!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)) - return; + goto out; spin_lock_bh(&xprt->sock_lock); if (xprt->snd_task && xprt->snd_task->tk_rpcwait == &xprt->pending) @@ -996,21 +1041,8 @@ spin_unlock_bh(&xprt->sock_lock); if (sk->sleep && waitqueue_active(sk->sleep)) wake_up_interruptible(sk->sleep); -} - -/* - * Exponential backoff for UDP retries - */ -static inline int -xprt_expbackoff(struct rpc_task *task, struct rpc_rqst *req) -{ - int backoff; - - req->rq_ntimeo++; - backoff = min(rpc_ntimeo(&task->tk_client->cl_rtt), XPRT_MAX_BACKOFF); - if (req->rq_ntimeo < (1 << backoff)) - return 1; - return 0; +out: + read_unlock(&sk->callback_lock); } /* @@ -1026,14 +1058,7 @@ if (req->rq_received) goto out; - if (!xprt->nocong) { - if (xprt_expbackoff(task, req)) { - rpc_add_timer(task, xprt_timer); - goto out_unlock; - } - rpc_inc_timeo(&task->tk_client->cl_rtt); - xprt_adjust_cwnd(req->rq_xprt, -ETIMEDOUT); - } + xprt_adjust_cwnd(req->rq_xprt, -ETIMEDOUT); req->rq_nresend++; dprintk("RPC: %4d xprt_timer (%s request)\n", @@ -1043,7 +1068,6 @@ out: task->tk_timeout = 0; rpc_wake_up_task(task); -out_unlock: spin_unlock(&xprt->sock_lock); } @@ -1063,9 +1087,6 @@ if (xprt->shutdown) task->tk_status = -EIO; - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - if (task->tk_status < 0) return; @@ -1081,10 +1102,14 @@ } spin_lock_bh(&xprt->sock_lock); - if (!__xprt_lock_write(xprt, task)) { - spin_unlock_bh(&xprt->sock_lock); - return; + if (!__xprt_lock_write(xprt, task)) + goto out_notrans; + + if (!xprt_connected(xprt)) { + task->tk_status = -ENOTCONN; + goto out_notrans; } + if (list_empty(&req->rq_list)) { list_add_tail(&req->rq_list, &xprt->recv); req->rq_received = 0; @@ -1092,6 +1117,9 @@ spin_unlock_bh(&xprt->sock_lock); do_xprt_transmit(task); + return; +out_notrans: + spin_unlock_bh(&xprt->sock_lock); } static void @@ -1135,11 +1163,12 @@ break; } - /* Note: at this point, task->tk_sleeping has not 
yet been set, - * hence there is no danger of the waking up task being put on - * schedq, and being picked up by a parallel run of rpciod(). + /* If we're doing a resend and have received a reply already, + * then exit early. + * Note, though, that we can't do this if we've already started + * resending down a TCP stream. */ - if (req->rq_received) + if (req->rq_received && !req->rq_bytes_sent) goto out_release; task->tk_status = status; @@ -1149,7 +1178,10 @@ if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { /* Protect against races with xprt_write_space */ spin_lock_bh(&xprt->sock_lock); - if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) { + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) { task->tk_timeout = req->rq_timeout.to_current; rpc_sleep_on(&xprt->pending, task, NULL, NULL); } @@ -1160,30 +1192,34 @@ rpc_delay(task, HZ>>4); return; case -ECONNREFUSED: + task->tk_timeout = RPC_REESTABLISH_TIMEOUT; + rpc_sleep_on(&xprt->sending, task, NULL, NULL); case -ENOTCONN: - if (!xprt->stream) - return; + return; default: if (xprt->stream) xprt_disconnect(xprt); - req->rq_bytes_sent = 0; } out_release: xprt_release_write(xprt, task); return; out_receive: dprintk("RPC: %4d xmit complete\n", task->tk_pid); + spin_lock_bh(&xprt->sock_lock); /* Set the task's receive timeout value */ - if (!xprt->nocong) { + if (RPC_USE_RTT(task)) { task->tk_timeout = rpc_calc_rto(&clnt->cl_rtt, rpcproc_timer(clnt, task->tk_msg.rpc_proc)); - req->rq_ntimeo = 0; + task->tk_timeout <<= clnt->cl_timeout.to_retries + - req->rq_timeout.to_retries; if (task->tk_timeout > req->rq_timeout.to_maxval) task->tk_timeout = req->rq_timeout.to_maxval; } else task->tk_timeout = req->rq_timeout.to_current; - spin_lock_bh(&xprt->sock_lock); - if (!req->rq_received) + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (!req->rq_received) 
rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); __xprt_release_write(xprt, task); spin_unlock_bh(&xprt->sock_lock); @@ -1192,61 +1228,39 @@ /* * Reserve an RPC call slot. */ -int +void xprt_reserve(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; - /* We already have an initialized request. */ - if (task->tk_rqstp) - return 0; - - spin_lock(&xprt->xprt_lock); - xprt_reserve_status(task); - if (task->tk_rqstp) { - task->tk_timeout = 0; - } else if (!task->tk_timeout) { - task->tk_status = -ENOBUFS; - } else { - dprintk("RPC: xprt_reserve waiting on backlog\n"); - task->tk_status = -EAGAIN; - rpc_sleep_on(&xprt->backlog, task, NULL, NULL); + task->tk_status = -EIO; + if (!xprt->shutdown) { + spin_lock(&xprt->xprt_lock); + do_xprt_reserve(task); + spin_unlock(&xprt->xprt_lock); } - spin_unlock(&xprt->xprt_lock); - dprintk("RPC: %4d xprt_reserve returns %d\n", - task->tk_pid, task->tk_status); - return task->tk_status; } -/* - * Reservation callback - */ -static void -xprt_reserve_status(struct rpc_task *task) +static inline void +do_xprt_reserve(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; - struct rpc_rqst *req; - if (xprt->shutdown) { - task->tk_status = -EIO; - } else if (task->tk_status < 0) { - /* NOP */ - } else if (task->tk_rqstp) { - /* We've already been given a request slot: NOP */ - } else { - if (!(req = xprt->free)) - goto out_nofree; - /* OK: There's room for us. 
Grab a free slot */ - xprt->free = req->rq_next; - req->rq_next = NULL; + task->tk_status = 0; + if (task->tk_rqstp) + return; + if (xprt->free) { + struct rpc_rqst *req = xprt->free; + xprt->free = req->rq_next; + req->rq_next = NULL; task->tk_rqstp = req; xprt_request_init(task, xprt); + return; } - - return; - -out_nofree: + dprintk("RPC: waiting for request slot\n"); task->tk_status = -EAGAIN; + task->tk_timeout = 0; + rpc_sleep_on(&xprt->backlog, task, NULL, NULL); } /* @@ -1339,7 +1353,6 @@ to->to_initval = to->to_increment = incr; to->to_maxval = incr * retr; - to->to_resrvval = incr * retr; to->to_retries = retr; to->to_exponential = 0; } @@ -1348,8 +1361,7 @@ * Initialize an RPC client */ static struct rpc_xprt * -xprt_setup(struct socket *sock, int proto, - struct sockaddr_in *ap, struct rpc_timeout *to) +xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to) { struct rpc_xprt *xprt; struct rpc_rqst *req; @@ -1380,7 +1392,6 @@ if (to) { xprt->timeout = *to; xprt->timeout.to_current = to->to_initval; - xprt->timeout.to_resrvval = to->to_maxval << 1; } else xprt_default_timeout(&xprt->timeout, xprt->prot); @@ -1395,9 +1406,11 @@ req->rq_next = NULL; xprt->free = xprt->slot; + /* Check whether we want to use a reserved port */ + xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; + dprintk("RPC: created transport %p\n", xprt); - xprt_bind_socket(xprt, sock); return xprt; } @@ -1409,6 +1422,12 @@ { struct sockaddr_in myaddr; int err, port; + kernel_cap_t saved_cap = current->cap_effective; + + /* Override capabilities. + * They were checked in xprt_create_proto i.e. 
at mount time + */ + cap_raise (current->cap_effective, CAP_NET_BIND_SERVICE); memset(&myaddr, 0, sizeof(myaddr)); myaddr.sin_family = AF_INET; @@ -1418,6 +1437,7 @@ err = sock->ops->bind(sock, (struct sockaddr *) &myaddr, sizeof(myaddr)); } while (err == -EADDRINUSE && --port > 0); + current->cap_effective = saved_cap; if (err < 0) printk("RPC: Can't bind to reserved port (%d).\n", -err); @@ -1433,6 +1453,7 @@ if (xprt->inet) return -EBUSY; + write_lock_bh(&sk->callback_lock); sk->user_data = xprt; xprt->old_data_ready = sk->data_ready; xprt->old_state_change = sk->state_change; @@ -1453,11 +1474,7 @@ /* Reset to new socket */ xprt->sock = sock; xprt->inet = sk; - /* - * TCP requires the rpc I/O daemon is present - */ - if(xprt->stream) - rpciod_up(); + write_unlock_bh(&sk->callback_lock); return 0; } @@ -1487,7 +1504,7 @@ * Create a client socket given the protocol and peer address. */ static struct socket * -xprt_create_socket(int proto, struct rpc_timeout *to) +xprt_create_socket(int proto, struct rpc_timeout *to, int resvport) { struct socket *sock; int type, err; @@ -1502,8 +1519,8 @@ goto failed; } - /* If the caller has the capability, bind to a reserved port */ - if (capable(CAP_NET_BIND_SERVICE) && xprt_bindresvport(sock) < 0) + /* bind to a reserved port */ + if (resvport && xprt_bindresvport(sock) < 0) goto failed; return sock; @@ -1519,18 +1536,19 @@ struct rpc_xprt * xprt_create_proto(int proto, struct sockaddr_in *sap, struct rpc_timeout *to) { - struct socket *sock; struct rpc_xprt *xprt; - dprintk("RPC: xprt_create_proto called\n"); - - if (!(sock = xprt_create_socket(proto, to))) - return NULL; - - if (!(xprt = xprt_setup(sock, proto, sap, to))) - sock_release(sock); + xprt = xprt_setup(proto, sap, to); + if (!xprt) + goto out_bad; + dprintk("RPC: xprt_create_proto created xprt %p\n", xprt); return xprt; +out_bad: + dprintk("RPC: xprt_create_proto failed\n"); + if (xprt) + kfree(xprt); + return NULL; } /*