X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=mm%2Ffilemap.c;h=f4d0cded0e10aa21b02707fcaf99c4cbcafa4f06;hb=cf03613e9662c28372b8c83538fb402df37c53f5;hp=67a03a0a9aee0f2d39db9f92a3429355a449fb2b;hpb=2f718ffc16c43a435d12919c75dbfad518abd056;p=linux-2.6 diff --git a/mm/filemap.c b/mm/filemap.c index 67a03a0a9a..f4d0cded0e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -25,8 +25,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -63,6 +65,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock + * ->zone.lock * * ->i_mutex * ->i_mmap_lock (truncate->unmap_mapping_range) @@ -121,6 +124,18 @@ void __remove_from_page_cache(struct page *page) mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); BUG_ON(page_mapped(page)); + + /* + * Some filesystems seem to re-dirty the page even after + * the VM has canceled the dirty bit (eg ext3 journaling). + * + * Fix it up by doing a final dirty accounting check after + * having removed the page entirely. + */ + if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { + dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + } } void remove_from_page_cache(struct page *page) @@ -839,7 +854,7 @@ static void shrink_readahead_size_eio(struct file *filp, /** * do_generic_mapping_read - generic file read routine * @mapping: address_space to be read - * @_ra: file's readahead state + * @ra: file's readahead state * @filp: the file to read * @ppos: current file position * @desc: read_descriptor @@ -1297,7 +1312,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (vmf->pgoff >= size) - goto outside_data_content; + return VM_FAULT_SIGBUS; /* If we don't want any read-ahead, don't bother */ if (VM_RandomReadHint(vma)) @@ -1374,7 +1389,7 @@ retry_find: if (unlikely(vmf->pgoff >= size)) { unlock_page(page); page_cache_release(page); - goto outside_data_content; + return VM_FAULT_SIGBUS; } /* @@ -1385,15 +1400,6 @@ retry_find: vmf->page = page; return ret | VM_FAULT_LOCKED; -outside_data_content: - /* - * An external ptracer can access pages that normally aren't - * accessible.. 
- */ - if (vma->vm_mm == current->mm) - return VM_FAULT_SIGBUS; - - /* Fall through to the non-read-ahead case */ no_cached_page: /* * We're only likely to ever get here if MADV_RANDOM is in @@ -1626,12 +1632,18 @@ int __remove_suid(struct dentry *dentry, int kill) int remove_suid(struct dentry *dentry) { - int kill = should_remove_suid(dentry); + int killsuid = should_remove_suid(dentry); + int killpriv = security_inode_need_killpriv(dentry); + int error = 0; - if (unlikely(kill)) - return __remove_suid(dentry, kill); + if (killpriv < 0) + return killpriv; + if (killpriv) + error = security_inode_killpriv(dentry); + if (!error && killsuid) + error = __remove_suid(dentry, killsuid); - return 0; + return error; } EXPORT_SYMBOL(remove_suid); @@ -1684,6 +1696,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, return copied; } +EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); /* * This has the same sideeffects and return value as @@ -1710,6 +1723,7 @@ size_t iov_iter_copy_from_user(struct page *page, kunmap(page); return copied; } +EXPORT_SYMBOL(iov_iter_copy_from_user); static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes) { @@ -1741,13 +1755,24 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes) __iov_iter_advance_iov(i, bytes); i->count -= bytes; } +EXPORT_SYMBOL(iov_iter_advance); -int iov_iter_fault_in_readable(struct iov_iter *i) +/* + * Fault in the first iovec of the given iov_iter, to a maximum length + * of bytes. Returns 0 on success, or non-zero if the memory could not be + * accessed (ie. because it is an invalid address). + * + * writev-intensive code may want this to prefault several iovecs -- that + * would be possible (callers must not rely on the fact that _only_ the + * first iovec will be faulted with the current implementation). + */ +int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) { - size_t seglen = min(i->iov->iov_len - i->iov_offset, i->count); char __user *buf = i->iov->iov_base + i->iov_offset; - return fault_in_pages_readable(buf, seglen); + bytes = min(bytes, i->iov->iov_len - i->iov_offset); + return fault_in_pages_readable(buf, bytes); } +EXPORT_SYMBOL(iov_iter_fault_in_readable); /* * Return the count of just the current iov_iter segment. @@ -1760,6 +1785,7 @@ size_t iov_iter_single_seg_count(struct iov_iter *i) else return min(i->count, iov->iov_len - i->iov_offset); } +EXPORT_SYMBOL(iov_iter_single_seg_count); /* * Performs necessary checks before doing a write @@ -1843,6 +1869,91 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i } EXPORT_SYMBOL(generic_write_checks); +int pagecache_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + + if (aops->write_begin) { + return aops->write_begin(file, mapping, pos, len, flags, + pagep, fsdata); + } else { + int ret; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + struct page *page; +again: + page = __grab_cache_page(mapping, index); + *pagep = page; + if (!page) + return -ENOMEM; + + if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) { + /* + * There is no way to resolve a short write situation + * for a !Uptodate page (except by double copying in + * the caller done by generic_perform_write_2copy). + * + * Instead, we have to bring it uptodate here. 
+ */ + ret = aops->readpage(file, page); + page_cache_release(page); + if (ret) { + if (ret == AOP_TRUNCATED_PAGE) + goto again; + return ret; + } + goto again; + } + + ret = aops->prepare_write(file, page, offset, offset+len); + if (ret) { + unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + } + return ret; + } +} +EXPORT_SYMBOL(pagecache_write_begin); + +int pagecache_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + int ret; + + if (aops->write_end) { + mark_page_accessed(page); + ret = aops->write_end(file, mapping, pos, len, copied, + page, fsdata); + } else { + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + + flush_dcache_page(page); + ret = aops->commit_write(file, page, offset, offset+len); + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + + if (ret < 0) { + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + } else if (ret > 0) + ret = min_t(size_t, copied, ret); + else + ret = copied; + } + + return ret; +} +EXPORT_SYMBOL(pagecache_write_end); + ssize_t generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long *nr_segs, loff_t pos, loff_t *ppos, @@ -1886,8 +1997,7 @@ EXPORT_SYMBOL(generic_file_direct_write); * Find or create a page at the given pagecache position. Return the locked * page. This function is specifically for buffered writes. */ -static struct page *__grab_cache_page(struct address_space *mapping, - pgoff_t index) +struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) { int status; struct page *page; @@ -1908,20 +2018,16 @@ repeat: } return page; } +EXPORT_SYMBOL(__grab_cache_page); -ssize_t -generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, loff_t *ppos, - size_t count, ssize_t written) +static ssize_t generic_perform_write_2copy(struct file *file, + struct iov_iter *i, loff_t pos) { - struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; - long status = 0; - struct iov_iter i; - - iov_iter_init(&i, iov, nr_segs, count, written); + struct inode *inode = mapping->host; + long status = 0; + ssize_t written = 0; do { struct page *src_page; @@ -1934,7 +2040,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, offset = (pos & (PAGE_CACHE_SIZE - 1)); index = pos >> PAGE_CACHE_SHIFT; bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, - iov_iter_count(&i)); + iov_iter_count(i)); /* * a non-NULL src_page indicates that we're doing the @@ -1952,7 +2058,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, * to check that the address is actually valid, when atomic * usercopies are used, below. */ - if (unlikely(iov_iter_fault_in_readable(&i))) { + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; break; } @@ -1968,7 +2074,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, * cannot take a pagefault with the destination page locked. * So pin the source page to copy it. 
*/ - if (!PageUptodate(page)) { + if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) { unlock_page(page); src_page = alloc_page(GFP_KERNEL); @@ -1983,7 +2089,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, * same reason as we can't take a page fault with a * page locked (as explained below). */ - copied = iov_iter_copy_from_user(src_page, &i, + copied = iov_iter_copy_from_user(src_page, i, offset, bytes); if (unlikely(copied == 0)) { status = -EFAULT; @@ -2008,7 +2114,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, page_cache_release(src_page); continue; } - } status = a_ops->prepare_write(file, page, offset, offset+bytes); @@ -2030,7 +2135,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, * really matter. */ pagefault_disable(); - copied = iov_iter_copy_from_user_atomic(page, &i, + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); pagefault_enable(); } else { @@ -2045,7 +2150,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, flush_dcache_page(page); status = a_ops->commit_write(file, page, offset, offset+bytes); - if (unlikely(status < 0 || status == AOP_TRUNCATED_PAGE)) + if (unlikely(status < 0)) goto fs_write_aop_error; if (unlikely(status > 0)) /* filesystem did partial write */ copied = min_t(size_t, copied, status); @@ -2056,17 +2161,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, if (src_page) page_cache_release(src_page); - iov_iter_advance(&i, copied); - written += copied; + iov_iter_advance(i, copied); pos += copied; + written += copied; balance_dirty_pages_ratelimited(mapping); cond_resched(); continue; fs_write_aop_error: - if (status != AOP_TRUNCATED_PAGE) - unlock_page(page); + unlock_page(page); page_cache_release(page); if (src_page) page_cache_release(src_page); @@ -2078,17 +2182,125 @@ fs_write_aop_error: */ if (pos + bytes > inode->i_size) vmtruncate(inode, inode->i_size); - if (status == AOP_TRUNCATED_PAGE) - continue; - else - break; - } while (iov_iter_count(&i)); - *ppos = pos; + break; + } while (iov_iter_count(i)); + + return written ? written : status; +} + +static ssize_t generic_perform_write(struct file *file, + struct iov_iter *i, loff_t pos) +{ + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + long status = 0; + ssize_t written = 0; + unsigned int flags = 0; /* - * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC + * Copies from kernel address space cannot fail (NFSD is a big user). */ + if (segment_eq(get_fs(), KERNEL_DS)) + flags |= AOP_FLAG_UNINTERRUPTIBLE; + + do { + struct page *page; + pgoff_t index; /* Pagecache index for current page */ + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ + void *fsdata; + + offset = (pos & (PAGE_CACHE_SIZE - 1)); + index = pos >> PAGE_CACHE_SHIFT; + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_count(i)); + +again: + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. 
+ */ + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + status = -EFAULT; + break; + } + + status = a_ops->write_begin(file, mapping, pos, bytes, flags, + &page, &fsdata); + if (unlikely(status)) + break; + + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + pagefault_enable(); + flush_dcache_page(page); + + status = a_ops->write_end(file, mapping, pos, bytes, copied, + page, fsdata); + if (unlikely(status < 0)) + break; + copied = status; + + cond_resched(); + + if (unlikely(copied == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + iov_iter_advance(i, copied); + pos += copied; + written += copied; + + balance_dirty_pages_ratelimited(mapping); + + } while (iov_iter_count(i)); + + return written ? written : status; +} + +ssize_t +generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, loff_t *ppos, + size_t count, ssize_t written) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + struct inode *inode = mapping->host; + ssize_t status; + struct iov_iter i; + + iov_iter_init(&i, iov, nr_segs, count, written); + if (a_ops->write_begin) + status = generic_perform_write(file, &i, pos); + else + status = generic_perform_write_2copy(file, &i, pos); + if (likely(status >= 0)) { + written += status; + *ppos = pos + status; + + /* + * For now, when the user asks for O_SYNC, we'll actually give + * O_DSYNC + */ if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { if (!a_ops->writepage || !is_sync_kiocb(iocb)) status = generic_osync_inode(inode, mapping, @@ -2302,21 +2514,17 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, } retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); - if (retval) - goto out; /* * Finally, try again to invalidate clean pages which might have been - * faulted in by get_user_pages() if the source of the write was an - * mmap()ed region of the file we're writing. That's a pretty crazy - * thing to do, so we don't support it 100%. If this invalidation - * fails and we have -EIOCBQUEUED we ignore the failure. + * cached by non-direct readahead, or faulted in by get_user_pages() + * if the source of the write was an mmap'ed region of the file + * we're writing. Either one is a pretty crazy thing to do, + * so we don't support it 100%. If this invalidation + * fails, tough, the write still worked... */ if (rw == WRITE && mapping->nrpages) { - int err = invalidate_inode_pages2_range(mapping, - offset >> PAGE_CACHE_SHIFT, end); - if (err && retval >= 0) - retval = err; + invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end); } out: return retval;
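A note on the new API (an illustration, not part of the patch above): the hunks replace the prepare_write/commit_write buffered-write path with the write_begin/write_end convention and export the iov_iter helpers so filesystems can drive the copy themselves. The sketch below shows the caller side of that convention. write_one_chunk() is a hypothetical helper that copies a kernel buffer into the page cache at 'pos' via pagecache_write_begin()/pagecache_write_end(), following the same begin/copy/commit ordering as generic_perform_write() in this patch; because the source is kernel memory it passes AOP_FLAG_UNINTERRUPTIBLE and needs no user-memory fault-in step. Partial-copy retries and i_size handling are simplified.

/*
 * Illustrative sketch only -- not from this patch.  Writes up to one
 * page-cache page worth of data from a kernel buffer at 'pos' using the
 * write_begin/write_end convention introduced above.
 */
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/pagemap.h>
#include <linux/string.h>
#include <linux/writeback.h>

static ssize_t write_one_chunk(struct file *file, loff_t pos,
			       const void *buf, size_t len)
{
	struct address_space *mapping = file->f_mapping;
	unsigned long offset = pos & (PAGE_CACHE_SIZE - 1);
	unsigned long bytes = min_t(unsigned long,
				    PAGE_CACHE_SIZE - offset, len);
	struct page *page;
	void *fsdata;
	char *kaddr;
	int status;

	/*
	 * Ask the filesystem to prepare 'bytes' at 'pos'.  The source is
	 * kernel memory, so the copy cannot fault and we may ask for a
	 * fully prepared (uninterruptible) page up front.
	 */
	status = pagecache_write_begin(file, mapping, pos, bytes,
				       AOP_FLAG_UNINTERRUPTIBLE,
				       &page, &fsdata);
	if (status)
		return status;

	/* The page comes back locked; copy the data and flush the dcache. */
	kaddr = kmap_atomic(page, KM_USER0);
	memcpy(kaddr + offset, buf, bytes);
	kunmap_atomic(kaddr, KM_USER0);
	flush_dcache_page(page);

	/*
	 * write_end unlocks and releases the page and returns how many
	 * bytes the filesystem accepted (or a negative error).
	 */
	status = pagecache_write_end(file, mapping, pos, bytes, bytes,
				     page, fsdata);
	if (status < 0)
		return status;

	balance_dirty_pages_ratelimited(mapping);
	return status;
}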
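On the filesystem side (again an illustration, not part of this patch): a simple block-based filesystem adopts the new operations by pointing ->write_begin at block_write_begin() and ->write_end at generic_write_end(), the fs/buffer.c helpers added alongside this conversion, mirroring how ext2 and similar filesystems were switched over in the same series. In the sketch, "myfs" and myfs_get_block() are hypothetical, the trivial 1:1 block mapping merely stands in for a real allocator, a real filesystem would also provide ->readpage/->writepage, and these helper signatures changed in later kernels.

/*
 * Illustrative sketch only -- hypothetical "myfs" glue for the new
 * write_begin/write_end address_space operations.
 */
#include <linux/buffer_head.h>
#include <linux/fs.h>

/* Hypothetical 1:1 block mapping, standing in for a real block allocator. */
static int myfs_get_block(struct inode *inode, sector_t block,
			  struct buffer_head *bh_result, int create)
{
	map_bh(bh_result, inode->i_sb, block);
	return 0;
}

static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	/* Let the generic helper grab the page and prepare its buffers. */
	*pagep = NULL;
	return block_write_begin(file, mapping, pos, len, flags,
				 pagep, fsdata, myfs_get_block);
}

static const struct address_space_operations myfs_aops = {
	/* a real filesystem also sets .readpage, .writepage, etc. */
	.write_begin	= myfs_write_begin,
	.write_end	= generic_write_end,
};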