X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=mm%2Ffilemap.c;h=f4d0cded0e10aa21b02707fcaf99c4cbcafa4f06;hb=cf03613e9662c28372b8c83538fb402df37c53f5;hp=67a03a0a9aee0f2d39db9f92a3429355a449fb2b;hpb=2f718ffc16c43a435d12919c75dbfad518abd056;p=linux-2.6 diff --git a/mm/filemap.c b/mm/filemap.c index 67a03a0a9a..f4d0cded0e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -25,8 +25,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -63,6 +65,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock + * ->zone.lock * * ->i_mutex * ->i_mmap_lock (truncate->unmap_mapping_range) @@ -121,6 +124,18 @@ void __remove_from_page_cache(struct page *page) mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); BUG_ON(page_mapped(page)); + + /* + * Some filesystems seem to re-dirty the page even after + * the VM has canceled the dirty bit (eg ext3 journaling). + * + * Fix it up by doing a final dirty accounting check after + * having removed the page entirely. + */ + if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { + dec_zone_page_state(page, NR_FILE_DIRTY); + dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + } } void remove_from_page_cache(struct page *page) @@ -839,7 +854,7 @@ static void shrink_readahead_size_eio(struct file *filp, /** * do_generic_mapping_read - generic file read routine * @mapping: address_space to be read - * @_ra: file's readahead state + * @ra: file's readahead state * @filp: the file to read * @ppos: current file position * @desc: read_descriptor @@ -1297,7 +1312,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (vmf->pgoff >= size) - goto outside_data_content; + return VM_FAULT_SIGBUS; /* If we don't want any read-ahead, don't bother */ if (VM_RandomReadHint(vma)) @@ -1374,7 +1389,7 @@ retry_find: if (unlikely(vmf->pgoff >= size)) { unlock_page(page); page_cache_release(page); - goto outside_data_content; + return VM_FAULT_SIGBUS; } /* @@ -1385,15 +1400,6 @@ retry_find: vmf->page = page; return ret | VM_FAULT_LOCKED; -outside_data_content: - /* - * An external ptracer can access pages that normally aren't - * accessible.. 
- */ - if (vma->vm_mm == current->mm) - return VM_FAULT_SIGBUS; - - /* Fall through to the non-read-ahead case */ no_cached_page: /* * We're only likely to ever get here if MADV_RANDOM is in @@ -1626,12 +1632,18 @@ int __remove_suid(struct dentry *dentry, int kill) int remove_suid(struct dentry *dentry) { - int kill = should_remove_suid(dentry); + int killsuid = should_remove_suid(dentry); + int killpriv = security_inode_need_killpriv(dentry); + int error = 0; - if (unlikely(kill)) - return __remove_suid(dentry, kill); + if (killpriv < 0) + return killpriv; + if (killpriv) + error = security_inode_killpriv(dentry); + if (!error && killsuid) + error = __remove_suid(dentry, killsuid); - return 0; + return error; } EXPORT_SYMBOL(remove_suid); @@ -1684,6 +1696,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, return copied; } +EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); /* * This has the same sideeffects and return value as @@ -1710,6 +1723,7 @@ size_t iov_iter_copy_from_user(struct page *page, kunmap(page); return copied; } +EXPORT_SYMBOL(iov_iter_copy_from_user); static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes) { @@ -1741,13 +1755,24 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes) __iov_iter_advance_iov(i, bytes); i->count -= bytes; } +EXPORT_SYMBOL(iov_iter_advance); -int iov_iter_fault_in_readable(struct iov_iter *i) +/* + * Fault in the first iovec of the given iov_iter, to a maximum length + * of bytes. Returns 0 on success, or non-zero if the memory could not be + * accessed (ie. because it is an invalid address). + * + * writev-intensive code may want this to prefault several iovecs -- that + * would be possible (callers must not rely on the fact that _only_ the + * first iovec will be faulted with the current implementation). + */ +int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) { - size_t seglen = min(i->iov->iov_len - i->iov_offset, i->count); char __user *buf = i->iov->iov_base + i->iov_offset; - return fault_in_pages_readable(buf, seglen); + bytes = min(bytes, i->iov->iov_len - i->iov_offset); + return fault_in_pages_readable(buf, bytes); } +EXPORT_SYMBOL(iov_iter_fault_in_readable); /* * Return the count of just the current iov_iter segment. @@ -1760,6 +1785,7 @@ size_t iov_iter_single_seg_count(struct iov_iter *i) else return min(i->count, iov->iov_len - i->iov_offset); } +EXPORT_SYMBOL(iov_iter_single_seg_count); /* * Performs necessary checks before doing a write @@ -1843,6 +1869,91 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i } EXPORT_SYMBOL(generic_write_checks); +int pagecache_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + + if (aops->write_begin) { + return aops->write_begin(file, mapping, pos, len, flags, + pagep, fsdata); + } else { + int ret; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + struct page *page; +again: + page = __grab_cache_page(mapping, index); + *pagep = page; + if (!page) + return -ENOMEM; + + if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) { + /* + * There is no way to resolve a short write situation + * for a !Uptodate page (except by double copying in + * the caller done by generic_perform_write_2copy). + * + * Instead, we have to bring it uptodate here. 
+ */ + ret = aops->readpage(file, page); + page_cache_release(page); + if (ret) { + if (ret == AOP_TRUNCATED_PAGE) + goto again; + return ret; + } + goto again; + } + + ret = aops->prepare_write(file, page, offset, offset+len); + if (ret) { + unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + } + return ret; + } +} +EXPORT_SYMBOL(pagecache_write_begin); + +int pagecache_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + int ret; + + if (aops->write_end) { + mark_page_accessed(page); + ret = aops->write_end(file, mapping, pos, len, copied, + page, fsdata); + } else { + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + + flush_dcache_page(page); + ret = aops->commit_write(file, page, offset, offset+len); + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + + if (ret < 0) { + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + } else if (ret > 0) + ret = min_t(size_t, copied, ret); + else + ret = copied; + } + + return ret; +} +EXPORT_SYMBOL(pagecache_write_end); + ssize_t generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long *nr_segs, loff_t pos, loff_t *ppos, @@ -1886,8 +1997,7 @@ EXPORT_SYMBOL(generic_file_direct_write); * Find or create a page at the given pagecache position. Return the locked * page. This function is specifically for buffered writes. */ -static struct page *__grab_cache_page(struct address_space *mapping, - pgoff_t index) +struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) { int status; struct page *page; @@ -1908,20 +2018,16 @@ repeat: } return page; } +EXPORT_SYMBOL(__grab_cache_page); -ssize_t -generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, loff_t *ppos, - size_t count, ssize_t written) +static ssize_t generic_perform_write_2copy(struct file *file, + struct iov_iter *i, loff_t pos) { - struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; - long status = 0; - struct iov_iter i; - - iov_iter_init(&i, iov, nr_segs, count, written); + struct inode *inode = mapping->host; + long status = 0; + ssize_t written = 0; do { struct page *src_page; @@ -1934,7 +2040,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, offset = (pos & (PAGE_CACHE_SIZE - 1)); index = pos >> PAGE_CACHE_SHIFT; bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, - iov_iter_count(&i)); + iov_iter_count(i)); /* * a non-NULL src_page indicates that we're doing the @@ -1952,7 +2058,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, * to check that the address is actually valid, when atomic * usercopies are used, below. */ - if (unlikely(iov_iter_fault_in_readable(&i))) { + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; break; } @@ -1968,7 +2074,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, * cannot take a pagefault with the destination page locked. * So pin the source page to copy it. 
*/ - if (!PageUptodate(page)) { + if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) { unlock_page(page); src_page = alloc_page(GFP_KERNEL); @@ -1983,7 +2089,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, * same reason as we can't take a page fault with a * page locked (as explained below). */ - copied = iov_iter_copy_from_user(src_page, &i, + copied = iov_iter_copy_from_user(src_page, i, offset, bytes); if (unlikely(copied == 0)) { status = -EFAULT; @@ -2008,7 +2114,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, page_cache_release(src_page); continue; } - } status = a_ops->prepare_write(file, page, offset, offset+bytes); @@ -2030,7 +2135,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, * really matter. */ pagefault_disable(); - copied = iov_iter_copy_from_user_atomic(page, &i, + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); pagefault_enable(); } else { @@ -2045,7 +2150,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, flush_dcache_page(page); status = a_ops->commit_write(file, page, offset, offset+bytes); - if (unlikely(status < 0 || status == AOP_TRUNCATED_PAGE)) + if (unlikely(status < 0)) goto fs_write_aop_error; if (unlikely(status > 0)) /* filesystem did partial write */ copied = min_t(size_t, copied, status); @@ -2056,17 +2161,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, if (src_page) page_cache_release(src_page); - iov_iter_advance(&i, copied); - written += copied; + iov_iter_advance(i, copied); pos += copied; + written += copied; balance_dirty_pages_ratelimited(mapping); cond_resched(); continue; fs_write_aop_error: - if (status != AOP_TRUNCATED_PAGE) - unlock_page(page); + unlock_page(page); page_cache_release(page); if (src_page) page_cache_release(src_page); @@ -2078,17 +2182,125 @@ fs_write_aop_error: */ if (pos + bytes > inode->i_size) vmtruncate(inode, inode->i_size); - if (status == AOP_TRUNCATED_PAGE) - continue; - else - break; - } while (iov_iter_count(&i)); - *ppos = pos; + break; + } while (iov_iter_count(i)); + + return written ? written : status; +} + +static ssize_t generic_perform_write(struct file *file, + struct iov_iter *i, loff_t pos) +{ + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + long status = 0; + ssize_t written = 0; + unsigned int flags = 0; /* - * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC + * Copies from kernel address space cannot fail (NFSD is a big user). */ + if (segment_eq(get_fs(), KERNEL_DS)) + flags |= AOP_FLAG_UNINTERRUPTIBLE; + + do { + struct page *page; + pgoff_t index; /* Pagecache index for current page */ + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ + void *fsdata; + + offset = (pos & (PAGE_CACHE_SIZE - 1)); + index = pos >> PAGE_CACHE_SHIFT; + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_count(i)); + +again: + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. 
+ */ + if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + status = -EFAULT; + break; + } + + status = a_ops->write_begin(file, mapping, pos, bytes, flags, + &page, &fsdata); + if (unlikely(status)) + break; + + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + pagefault_enable(); + flush_dcache_page(page); + + status = a_ops->write_end(file, mapping, pos, bytes, copied, + page, fsdata); + if (unlikely(status < 0)) + break; + copied = status; + + cond_resched(); + + if (unlikely(copied == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + iov_iter_advance(i, copied); + pos += copied; + written += copied; + + balance_dirty_pages_ratelimited(mapping); + + } while (iov_iter_count(i)); + + return written ? written : status; +} + +ssize_t +generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, loff_t *ppos, + size_t count, ssize_t written) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + struct inode *inode = mapping->host; + ssize_t status; + struct iov_iter i; + + iov_iter_init(&i, iov, nr_segs, count, written); + if (a_ops->write_begin) + status = generic_perform_write(file, &i, pos); + else + status = generic_perform_write_2copy(file, &i, pos); + if (likely(status >= 0)) { + written += status; + *ppos = pos + status; + + /* + * For now, when the user asks for O_SYNC, we'll actually give + * O_DSYNC + */ if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { if (!a_ops->writepage || !is_sync_kiocb(iocb)) status = generic_osync_inode(inode, mapping, @@ -2302,21 +2514,17 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, } retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); - if (retval) - goto out; /* * Finally, try again to invalidate clean pages which might have been - * faulted in by get_user_pages() if the source of the write was an - * mmap()ed region of the file we're writing. That's a pretty crazy - * thing to do, so we don't support it 100%. If this invalidation - * fails and we have -EIOCBQUEUED we ignore the failure. + * cached by non-direct readahead, or faulted in by get_user_pages() + * if the source of the write was an mmap'ed region of the file + * we're writing. Either one is a pretty crazy thing to do, + * so we don't support it 100%. If this invalidation + * fails, tough, the write still worked... */ if (rw == WRITE && mapping->nrpages) { - int err = invalidate_inode_pages2_range(mapping, - offset >> PAGE_CACHE_SHIFT, end); - if (err && retval >= 0) - retval = err; + invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end); } out: return retval;
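A note on the new API (an illustration, not part of the patch above): the hunks replace the prepare_write/commit_write buffered-write path with the write_begin/write_end convention and export the iov_iter helpers so filesystems can drive the copy themselves. The sketch below shows the caller side of that convention. write_one_chunk() is a hypothetical helper that copies a kernel buffer into the page cache at 'pos' via pagecache_write_begin()/pagecache_write_end(), following the same begin/copy/commit ordering as generic_perform_write() in this patch; because the source is kernel memory it passes AOP_FLAG_UNINTERRUPTIBLE and needs no user-memory fault-in step. Partial-copy retries and i_size handling are simplified.

/*
 * Illustrative sketch only -- not from this patch.  Writes up to one
 * page-cache page worth of data from a kernel buffer at 'pos' using the
 * write_begin/write_end convention introduced above.
 */
#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/pagemap.h>
#include <linux/string.h>
#include <linux/writeback.h>

static ssize_t write_one_chunk(struct file *file, loff_t pos,
			       const void *buf, size_t len)
{
	struct address_space *mapping = file->f_mapping;
	unsigned long offset = pos & (PAGE_CACHE_SIZE - 1);
	unsigned long bytes = min_t(unsigned long,
				    PAGE_CACHE_SIZE - offset, len);
	struct page *page;
	void *fsdata;
	char *kaddr;
	int status;

	/*
	 * Ask the filesystem to prepare 'bytes' at 'pos'.  The source is
	 * kernel memory, so the copy cannot fault and we may ask for a
	 * fully prepared (uninterruptible) page up front.
	 */
	status = pagecache_write_begin(file, mapping, pos, bytes,
				       AOP_FLAG_UNINTERRUPTIBLE,
				       &page, &fsdata);
	if (status)
		return status;

	/* The page comes back locked; copy the data and flush the dcache. */
	kaddr = kmap_atomic(page, KM_USER0);
	memcpy(kaddr + offset, buf, bytes);
	kunmap_atomic(kaddr, KM_USER0);
	flush_dcache_page(page);

	/*
	 * write_end unlocks and releases the page and returns how many
	 * bytes the filesystem accepted (or a negative error).
	 */
	status = pagecache_write_end(file, mapping, pos, bytes, bytes,
				     page, fsdata);
	if (status < 0)
		return status;

	balance_dirty_pages_ratelimited(mapping);
	return status;
}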
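On the filesystem side (again an illustration, not part of this patch): a simple block-based filesystem adopts the new operations by pointing ->write_begin at block_write_begin() and ->write_end at generic_write_end(), the fs/buffer.c helpers added alongside this conversion, mirroring how ext2 and similar filesystems were switched over in the same series. In the sketch, "myfs" and myfs_get_block() are hypothetical, the trivial 1:1 block mapping merely stands in for a real allocator, a real filesystem would also provide ->readpage/->writepage, and these helper signatures changed in later kernels.

/*
 * Illustrative sketch only -- hypothetical "myfs" glue for the new
 * write_begin/write_end address_space operations.
 */
#include <linux/buffer_head.h>
#include <linux/fs.h>

/* Hypothetical 1:1 block mapping, standing in for a real block allocator. */
static int myfs_get_block(struct inode *inode, sector_t block,
			  struct buffer_head *bh_result, int create)
{
	map_bh(bh_result, inode->i_sb, block);
	return 0;
}

static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	/* Let the generic helper grab the page and prepare its buffers. */
	*pagep = NULL;
	return block_write_begin(file, mapping, pos, len, flags,
				 pagep, fsdata, myfs_get_block);
}

static const struct address_space_operations myfs_aops = {
	/* a real filesystem also sets .readpage, .writepage, etc. */
	.write_begin	= myfs_write_begin,
	.write_end	= generic_write_end,
};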