#include "inode.h"
#include "ioctl.h"
#include "journal.h"
+#include "locks.h"
#include "mmap.h"
#include "suballoc.h"
#include "super.h"
return sync_mapping_buffers(inode->i_mapping);
}
+/*
+ * Allocate the per-open ocfs2_file_private for @file and attach it to
+ * file->private_data.
+ *
+ * The structure is zero-allocated, points back at @file through fp_file,
+ * and has its fp_mutex and its fp_flock lock resource initialized here.
+ * @inode is not used by this helper.
+ *
+ * Returns 0 on success, or -ENOMEM if the allocation fails (in which
+ * case file->private_data is not written).
+ */
+static int ocfs2_init_file_private(struct inode *inode, struct file *file)
+{
+ struct ocfs2_file_private *fp;
+
+ fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL);
+ if (!fp)
+ return -ENOMEM;
+
+ fp->fp_file = file;
+ mutex_init(&fp->fp_mutex);
+ ocfs2_file_lock_res_init(&fp->fp_flock, fp);
+ /* Attached last, once every field above has been set up. */
+ file->private_data = fp;
+
+ return 0;
+}
+
+/*
+ * Tear down the ocfs2_file_private attached to @file, if there is one.
+ *
+ * The fp_flock lock resource is dropped and then freed before the
+ * structure itself is released; file->private_data is reset to NULL
+ * afterwards. When file->private_data is already NULL this is a no-op.
+ * @inode is only used to look up the ocfs2 superblock.
+ */
+static void ocfs2_free_file_private(struct inode *inode, struct file *file)
+{
+ struct ocfs2_file_private *fp = file->private_data;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (fp) {
+ /*
+ * NOTE(review): presumably this releases any flock still held
+ * through this open file - confirm against
+ * ocfs2_simple_drop_lockres().
+ */
+ ocfs2_simple_drop_lockres(osb, &fp->fp_flock);
+ ocfs2_lock_res_free(&fp->fp_flock);
+ kfree(fp);
+ file->private_data = NULL;
+ }
+}
+
static int ocfs2_file_open(struct inode *inode, struct file *file)
{
int status;
oi->ip_open_count++;
spin_unlock(&oi->ip_lock);
- status = 0;
+
+ status = ocfs2_init_file_private(inode, file);
+ if (status) {
+ /*
+ * We want to set open count back if we're failing the
+ * open.
+ */
+ spin_lock(&oi->ip_lock);
+ oi->ip_open_count--;
+ spin_unlock(&oi->ip_lock);
+ }
+
leave:
mlog_exit(status);
return status;
oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
spin_unlock(&oi->ip_lock);
+ ocfs2_free_file_private(inode, file);
+
mlog_exit(0);
return 0;
}
+/*
+ * ->open handler used by ocfs2_dops: sets up the per-open private data
+ * via ocfs2_init_file_private() and returns its result (0 or -ENOMEM).
+ */
+static int ocfs2_dir_open(struct inode *inode, struct file *file)
+{
+ return ocfs2_init_file_private(inode, file);
+}
+
+/*
+ * ->release handler used by ocfs2_dops: frees the per-open private data
+ * set up at open time. Always returns 0.
+ */
+static int ocfs2_dir_release(struct inode *inode, struct file *file)
+{
+ ocfs2_free_file_private(inode, file);
+ return 0;
+}
+
static int ocfs2_sync_file(struct file *file,
struct dentry *dentry,
int datasync)
down_write(&OCFS2_I(inode)->ip_alloc_sem);
- /* This forces other nodes to sync and drop their pages. Do
- * this even if we have a truncate without allocation change -
- * ocfs2 cluster sizes can be much greater than page size, so
- * we have to truncate them anyway. */
- status = ocfs2_data_lock(inode, 1);
- if (status < 0) {
- up_write(&OCFS2_I(inode)->ip_alloc_sem);
-
- mlog_errno(status);
- goto bail;
- }
-
+ /*
+ * The inode lock forced other nodes to sync and drop their
+ * pages, which (correctly) happens even if we have a truncate
+ * without allocation change - ocfs2 cluster sizes can be much
+ * greater than page size, so we have to truncate them
+ * anyway.
+ */
unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
truncate_inode_pages(inode->i_mapping, new_i_size);
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
status = ocfs2_truncate_inline(inode, di_bh, new_i_size,
- i_size_read(inode), 0);
+ i_size_read(inode), 1);
if (status)
mlog_errno(status);
- goto bail_unlock_data;
+ goto bail_unlock_sem;
}
/* alright, we're going to need to do a full blown alloc size
status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
if (status < 0) {
mlog_errno(status);
- goto bail_unlock_data;
+ goto bail_unlock_sem;
}
status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
if (status < 0) {
mlog_errno(status);
- goto bail_unlock_data;
+ goto bail_unlock_sem;
}
status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
if (status < 0) {
mlog_errno(status);
- goto bail_unlock_data;
+ goto bail_unlock_sem;
}
/* TODO: orphan dir cleanup here. */
-bail_unlock_data:
- ocfs2_data_unlock(inode, 1);
-
+bail_unlock_sem:
up_write(&OCFS2_I(inode)->ip_alloc_sem);
bail:
mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
"clusters_to_add = %u, extents_to_split = %u\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
+ (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode),
le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
num_free_extents = ocfs2_num_free_extents(osb, inode, di);
le32_to_cpu(fe->i_clusters),
(unsigned long long)le64_to_cpu(fe->i_size));
mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
- OCFS2_I(inode)->ip_clusters, i_size_read(inode));
+ OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
leave:
if (handle) {
struct buffer_head *di_bh,
u64 new_i_size)
{
- int ret = 0, data_locked = 0;
+ int ret = 0;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
BUG_ON(!di_bh);
&& ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
goto out_update_size;
- /*
- * protect the pages that ocfs2_zero_extend is going to be
- * pulling into the page cache.. we do this before the
- * metadata extend so that we don't get into the situation
- * where we've extended the metadata but can't get the data
- * lock to zero.
- */
- ret = ocfs2_data_lock(inode, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- data_locked = 1;
-
/*
* The alloc sem blocks people in read/write from reading our
* allocation until we're done changing it. We depend on
up_write(&oi->ip_alloc_sem);
mlog_errno(ret);
- goto out_unlock;
+ goto out;
}
}
if (ret < 0) {
mlog_errno(ret);
- goto out_unlock;
+ goto out;
}
out_update_size:
if (ret < 0)
mlog_errno(ret);
-out_unlock:
- if (data_locked)
- ocfs2_data_unlock(inode, 1);
-
out:
return ret;
}
}
}
- status = ocfs2_meta_lock(inode, &bh, 1);
+ status = ocfs2_inode_lock(inode, &bh, 1);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
bail_unlock_rw:
if (size_change)
ocfs2_rw_unlock(inode, 1);
mlog_entry_void();
- ret = ocfs2_meta_lock(inode, NULL, 0);
+ ret = ocfs2_inode_lock(inode, NULL, 0);
if (ret) {
if (ret != -ENOENT)
mlog_errno(ret);
ret = generic_permission(inode, mask, NULL);
- ocfs2_meta_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, 0);
out:
mlog_exit(ret);
return ret;
u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_cached_dealloc_ctxt dealloc;
+ struct address_space *mapping = inode->i_mapping;
ocfs2_init_dealloc_ctxt(&dealloc);
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
- byte_start + byte_len, 1);
- if (ret)
+ byte_start + byte_len, 0);
+ if (ret) {
mlog_errno(ret);
- return ret;
+ goto out;
+ }
+ /*
+ * There's no need to get fancy with the page cache
+ * truncate of an inline-data inode. We're talking
+ * about less than a page here, which will be cached
+ * in the dinode buffer anyway.
+ */
+ unmap_mapping_range(mapping, 0, 0, 0);
+ truncate_inode_pages(mapping, 0);
+ goto out;
}
trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
goto out;
}
- ret = ocfs2_meta_lock(inode, &di_bh, 1);
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret) {
mlog_errno(ret);
goto out_rw_unlock;
if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
ret = -EPERM;
- goto out_meta_unlock;
+ goto out_inode_unlock;
}
switch (sr->l_whence) {
break;
default:
ret = -EINVAL;
- goto out_meta_unlock;
+ goto out_inode_unlock;
}
sr->l_whence = 0;
|| (sr->l_start + llen) < 0
|| (sr->l_start + llen) > max_off) {
ret = -EINVAL;
- goto out_meta_unlock;
+ goto out_inode_unlock;
}
size = sr->l_start + sr->l_len;
if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
if (sr->l_len <= 0) {
ret = -EINVAL;
- goto out_meta_unlock;
+ goto out_inode_unlock;
}
}
ret = __ocfs2_write_remove_suid(inode, di_bh);
if (ret) {
mlog_errno(ret);
- goto out_meta_unlock;
+ goto out_inode_unlock;
}
}
up_write(&OCFS2_I(inode)->ip_alloc_sem);
if (ret) {
mlog_errno(ret);
- goto out_meta_unlock;
+ goto out_inode_unlock;
}
/*
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
- goto out_meta_unlock;
+ goto out_inode_unlock;
}
if (change_size && i_size_read(inode) < size)
ocfs2_commit_trans(osb, handle);
-out_meta_unlock:
+out_inode_unlock:
brelse(di_bh);
- ocfs2_meta_unlock(inode, 1);
+ ocfs2_inode_unlock(inode, 1);
out_rw_unlock:
ocfs2_rw_unlock(inode, 1);
* if we need to make modifications here.
*/
for(;;) {
- ret = ocfs2_meta_lock(inode, NULL, meta_level);
+ ret = ocfs2_inode_lock(inode, NULL, meta_level);
if (ret < 0) {
meta_level = -1;
mlog_errno(ret);
* set inode->i_size at the end of a write. */
if (should_remove_suid(dentry)) {
if (meta_level == 0) {
- ocfs2_meta_unlock(inode, meta_level);
+ ocfs2_inode_unlock(inode, meta_level);
meta_level = 1;
continue;
}
*ppos = saved_pos;
out_unlock:
- ocfs2_meta_unlock(inode, meta_level);
+ ocfs2_inode_unlock(inode, meta_level);
out:
return ret;
}
-static inline void
-ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
-{
- const struct iovec *iov = *iovp;
- size_t base = *basep;
-
- do {
- int copy = min(bytes, iov->iov_len - base);
-
- bytes -= copy;
- base += copy;
- if (iov->iov_len == base) {
- iov++;
- base = 0;
- }
- } while (bytes);
- *iovp = iov;
- *basep = base;
-}
-
-static struct page * ocfs2_get_write_source(char **ret_src_buf,
- const struct iovec *cur_iov,
- size_t iov_offset)
-{
- int ret;
- char *buf = cur_iov->iov_base + iov_offset;
- struct page *src_page = NULL;
- unsigned long off;
-
- off = (unsigned long)(buf) & ~PAGE_CACHE_MASK;
-
- if (!segment_eq(get_fs(), KERNEL_DS)) {
- /*
- * Pull in the user page. We want to do this outside
- * of the meta data locks in order to preserve locking
- * order in case of page fault.
- */
- ret = get_user_pages(current, current->mm,
- (unsigned long)buf & PAGE_CACHE_MASK, 1,
- 0, 0, &src_page, NULL);
- if (ret == 1)
- *ret_src_buf = kmap(src_page) + off;
- else
- src_page = ERR_PTR(-EFAULT);
- } else {
- *ret_src_buf = buf;
- }
-
- return src_page;
-}
-
-static void ocfs2_put_write_source(struct page *page)
-{
- if (page) {
- kunmap(page);
- page_cache_release(page);
- }
-}
-
-static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
- const struct iovec *iov,
- unsigned long nr_segs,
- size_t count,
- ssize_t o_direct_written)
-{
- int ret = 0;
- ssize_t copied, total = 0;
- size_t iov_offset = 0, bytes;
- loff_t pos;
- const struct iovec *cur_iov = iov;
- struct page *user_page, *page;
- char * uninitialized_var(buf);
- char *dst;
- void *fsdata;
-
- /*
- * handle partial DIO write. Adjust cur_iov if needed.
- */
- ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
-
- do {
- pos = *ppos;
-
- user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset);
- if (IS_ERR(user_page)) {
- ret = PTR_ERR(user_page);
- goto out;
- }
-
- /* Stay within our page boundaries */
- bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)),
- (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK)));
- /* Stay within the vector boundary */
- bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset);
- /* Stay within count */
- bytes = min(bytes, count);
-
- page = NULL;
- ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0,
- &page, &fsdata);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- dst = kmap_atomic(page, KM_USER0);
- memcpy(dst + (pos & (loff_t)(PAGE_CACHE_SIZE - 1)), buf, bytes);
- kunmap_atomic(dst, KM_USER0);
- flush_dcache_page(page);
- ocfs2_put_write_source(user_page);
-
- copied = ocfs2_write_end(file, file->f_mapping, pos, bytes,
- bytes, page, fsdata);
- if (copied < 0) {
- mlog_errno(copied);
- ret = copied;
- goto out;
- }
-
- total += copied;
- *ppos = pos + copied;
- count -= copied;
-
- ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
- } while(count);
-
-out:
- return total ? total : ret;
-}
-
static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
const struct iovec *iov,
unsigned long nr_segs,
loff_t pos)
{
int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
- int can_do_direct, sync = 0;
+ int can_do_direct;
ssize_t written = 0;
size_t ocount; /* original count */
size_t count; /* after file limit checks */
- loff_t *ppos = &iocb->ki_pos;
+ loff_t old_size, *ppos = &iocb->ki_pos;
+ u32 old_clusters;
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_path.dentry->d_inode;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
mlog_entry("(0x%p, %u, '%.*s')\n", file,
(unsigned int)nr_segs,
if (iocb->ki_left == 0)
return 0;
- ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
- if (ret)
- return ret;
-
- count = ocount;
-
vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
appending = file->f_flags & O_APPEND ? 1 : 0;
rw_level = -1;
direct_io = 0;
- sync = 1;
goto relock;
}
- if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
- sync = 1;
-
- /*
- * XXX: Is it ok to execute these checks a second time?
- */
- ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
- if (ret)
- goto out;
-
/*
- * Set pos so that sync_page_range_nolock() below understands
- * where to start from. We might've moved it around via the
- * calls above. The range we want to actually sync starts from
- * *ppos here.
- *
+ * To later detect whether a journal commit for sync writes is
+ * necessary, we sample i_size and the cluster count here.
*/
- pos = *ppos;
+ old_size = i_size_read(inode);
+ old_clusters = OCFS2_I(inode)->ip_clusters;
/* communicate with ocfs2_dio_end_io */
ocfs2_iocb_set_rw_locked(iocb, rw_level);
if (direct_io) {
+ ret = generic_segment_checks(iov, &nr_segs, &ocount,
+ VERIFY_READ);
+ if (ret)
+ goto out_dio;
+
+ ret = generic_write_checks(file, ppos, &count,
+ S_ISBLK(inode->i_mode));
+ if (ret)
+ goto out_dio;
+
written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
ppos, count, ocount);
if (written < 0) {
goto out_dio;
}
} else {
- written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
- count, written);
- if (written < 0) {
- ret = written;
- if (ret != -EFAULT || ret != -ENOSPC)
- mlog_errno(ret);
- goto out;
- }
+ written = generic_file_aio_write_nolock(iocb, iov, nr_segs,
+ *ppos);
}
out_dio:
/* buffered aio wouldn't have proper lock coverage today */
BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
+ if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) {
+ /*
+ * The generic write paths have handled getting data
+ * to disk, but since we don't make use of the dirty
+ * inode list, a manual journal commit is necessary
+ * here.
+ */
+ if (old_size != i_size_read(inode) ||
+ old_clusters != OCFS2_I(inode)->ip_clusters) {
+ ret = journal_force_commit(osb->journal->j_journal);
+ if (ret < 0)
+ written = ret;
+ }
+ }
+
/*
* deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
* function pointer which is called when o_direct io completes so that
if (have_alloc_sem)
up_read(&inode->i_alloc_sem);
- if (written > 0 && sync) {
- ssize_t err;
-
- err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
- if (err < 0)
- written = err;
- }
-
mutex_unlock(&inode->i_mutex);
mlog_exit(ret);
return written ? written : ret;
}
-static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
- struct pipe_buffer *buf,
- struct splice_desc *sd)
-{
- int ret, count;
- ssize_t copied = 0;
- struct file *file = sd->u.file;
- unsigned int offset;
- struct page *page = NULL;
- void *fsdata;
- char *src, *dst;
-
- ret = buf->ops->confirm(pipe, buf);
- if (ret)
- goto out;
-
- offset = sd->pos & ~PAGE_CACHE_MASK;
- count = sd->len;
- if (count + offset > PAGE_CACHE_SIZE)
- count = PAGE_CACHE_SIZE - offset;
-
- ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0,
- &page, &fsdata);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- src = buf->ops->map(pipe, buf, 1);
- dst = kmap_atomic(page, KM_USER1);
- memcpy(dst + offset, src + buf->offset, count);
- kunmap_atomic(dst, KM_USER1);
- buf->ops->unmap(pipe, buf, src);
-
- copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count,
- page, fsdata);
- if (copied < 0) {
- mlog_errno(copied);
- ret = copied;
- goto out;
- }
-out:
-
- return copied ? copied : ret;
-}
-
-static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
- struct file *out,
- loff_t *ppos,
- size_t len,
- unsigned int flags)
-{
- int ret, err;
- struct address_space *mapping = out->f_mapping;
- struct inode *inode = mapping->host;
- struct splice_desc sd = {
- .total_len = len,
- .flags = flags,
- .pos = *ppos,
- .u.file = out,
- };
-
- ret = __splice_from_pipe(pipe, &sd, ocfs2_splice_write_actor);
- if (ret > 0) {
- *ppos += ret;
-
- if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
- err = generic_osync_inode(inode, mapping,
- OSYNC_METADATA|OSYNC_DATA);
- if (err)
- ret = err;
- }
- }
-
- return ret;
-}
-
static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
struct file *out,
loff_t *ppos,
goto out_unlock;
}
- /* ok, we're done with i_size and alloc work */
- ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags);
+ ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
out_unlock:
ocfs2_rw_unlock(inode, 1);
/*
* See the comment in ocfs2_file_aio_read()
*/
- ret = ocfs2_meta_lock(inode, NULL, 0);
+ ret = ocfs2_inode_lock(inode, NULL, 0);
if (ret < 0) {
mlog_errno(ret);
goto bail;
}
- ocfs2_meta_unlock(inode, 0);
+ ocfs2_inode_unlock(inode, 0);
ret = generic_file_splice_read(in, ppos, pipe, len, flags);
* like i_size. This allows the checks down below
* generic_file_aio_read() a chance of actually working.
*/
- ret = ocfs2_meta_lock_atime(inode, filp->f_vfsmnt, &lock_level);
+ ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
if (ret < 0) {
mlog_errno(ret);
goto bail;
}
- ocfs2_meta_unlock(inode, lock_level);
+ ocfs2_inode_unlock(inode, lock_level);
ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
if (ret == -EINVAL)
};
const struct file_operations ocfs2_fops = {
+ .llseek = generic_file_llseek,
.read = do_sync_read,
.write = do_sync_write,
.mmap = ocfs2_mmap,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
+ .flock = ocfs2_flock,
.splice_read = ocfs2_file_splice_read,
.splice_write = ocfs2_file_splice_write,
};
+/*
+ * File operations table whose handlers include ocfs2_readdir and the
+ * ocfs2_dir_open/ocfs2_dir_release pair. The open/release pair manages
+ * the per-open ocfs2_file_private.
+ * NOTE(review): presumably ocfs2_flock relies on that private data -
+ * confirm in locks.c.
+ */
const struct file_operations ocfs2_dops = {
+ .llseek = generic_file_llseek,
.read = generic_read_dir,
.readdir = ocfs2_readdir,
.fsync = ocfs2_sync_file,
+ .release = ocfs2_dir_release,
+ .open = ocfs2_dir_open,
.ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ocfs2_compat_ioctl,
#endif
+ .flock = ocfs2_flock,
};