* locking semantics of the file system using the protocol. It should
* be somewhere else, I'm sure, but right now it isn't.
*
+ * New in version 8:
+ * - Replace delete inode votes with a cluster lock
+ *
* New in version 7:
* - DLM join domain includes the live nodemap
*
* - full 64 bit i_size in the metadata lock lvbs
* - introduction of "rw" lock and pushing meta/data locking down
*/
-#define O2NET_PROTOCOL_VERSION 7ULL
+#define O2NET_PROTOCOL_VERSION 8ULL
struct o2net_handshake {
__be64 protocol_version;
__be64 connector_id;
.flags = 0,
};
+static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
+ .get_osb = ocfs2_get_inode_osb,
+ .flags = 0,
+};
+
static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
{
return lockres->l_type == OCFS2_LOCK_TYPE_META ||
lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
- lockres->l_type == OCFS2_LOCK_TYPE_RW;
+ lockres->l_type == OCFS2_LOCK_TYPE_RW ||
+ lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
}
static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
case OCFS2_LOCK_TYPE_DATA:
ops = &ocfs2_inode_data_lops;
break;
+ case OCFS2_LOCK_TYPE_OPEN:
+ ops = &ocfs2_inode_open_lops;
+ break;
default:
mlog_bug_on_msg(1, "type: %d\n", type);
ops = NULL; /* thanks, gcc */
goto bail;
}
+ ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
bail:
mlog_exit(ret);
return ret;
mlog_exit_void();
}
+/*
+ * ocfs2_open_lock always get PR mode lock.
+ */
+int ocfs2_open_lock(struct inode *inode)
+{
+ int status = 0;
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ BUG_ON(!inode);
+
+ mlog_entry_void();
+
+ mlog(0, "inode %llu take PRMODE open lock\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+ if (ocfs2_mount_local(osb))
+ goto out;
+
+ lockres = &OCFS2_I(inode)->ip_open_lockres;
+
+ status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
+ LKM_PRMODE, 0, 0);
+ if (status < 0)
+ mlog_errno(status);
+
+out:
+ mlog_exit(status);
+ return status;
+}
+
+int ocfs2_try_open_lock(struct inode *inode, int write)
+{
+ int status = 0, level;
+ struct ocfs2_lock_res *lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ BUG_ON(!inode);
+
+ mlog_entry_void();
+
+ mlog(0, "inode %llu try to take %s open lock\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ write ? "EXMODE" : "PRMODE");
+
+ if (ocfs2_mount_local(osb))
+ goto out;
+
+ lockres = &OCFS2_I(inode)->ip_open_lockres;
+
+ level = write ? LKM_EXMODE : LKM_PRMODE;
+
+ /*
+ * The file system may already holding a PRMODE/EXMODE open lock.
+ * Since we pass LKM_NOQUEUE, the request won't block waiting on
+ * other nodes and the -EAGAIN will indicate to the caller that
+ * this inode is still in use.
+ */
+ status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
+ level, LKM_NOQUEUE, 0);
+
+out:
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * ocfs2_open_unlock unlock PR and EX mode open locks.
+ */
+void ocfs2_open_unlock(struct inode *inode)
+{
+ struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ mlog_entry_void();
+
+ mlog(0, "inode %llu drop open lock\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+ if (ocfs2_mount_local(osb))
+ goto out;
+
+ if(lockres->l_ro_holders)
+ ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
+ LKM_PRMODE);
+ if(lockres->l_ex_holders)
+ ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
+ LKM_EXMODE);
+
+out:
+ mlog_exit_void();
+}
+
int ocfs2_data_lock_full(struct inode *inode,
int write,
int arg_flags)
* ocfs2_clear_inode has done it for us. */
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
- &OCFS2_I(inode)->ip_data_lockres);
+ &OCFS2_I(inode)->ip_open_lockres);
if (err < 0)
mlog_errno(err);
status = err;
+ err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+ &OCFS2_I(inode)->ip_data_lockres);
+ if (err < 0)
+ mlog_errno(err);
+ if (err < 0 && !status)
+ status = err;
+
err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
&OCFS2_I(inode)->ip_meta_lockres);
if (err < 0)
int write);
int ocfs2_rw_lock(struct inode *inode, int write);
void ocfs2_rw_unlock(struct inode *inode, int write);
+int ocfs2_open_lock(struct inode *inode);
+int ocfs2_try_open_lock(struct inode *inode, int write);
+void ocfs2_open_unlock(struct inode *inode);
int ocfs2_meta_lock_atime(struct inode *inode,
struct vfsmount *vfsmnt,
int *level);
(unsigned long long)fe->i_blkno);
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
- OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
inode->i_nlink = le16_to_cpu(fe->i_links_count);
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
OCFS2_LOCK_TYPE_META, 0, inode);
+
+ ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+ OCFS2_LOCK_TYPE_OPEN, 0, inode);
}
ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
* cluster lock before trusting anything anyway.
*/
can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
- && !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK)
+ && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
&& !ocfs2_mount_local(osb);
/*
OCFS2_LOCK_TYPE_META,
generation, inode);
+ ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+ OCFS2_LOCK_TYPE_OPEN,
+ 0, inode);
+
if (can_lock) {
+ status = ocfs2_open_lock(inode);
+ if (status) {
+ make_bad_inode(inode);
+ mlog_errno(status);
+ return status;
+ }
status = ocfs2_meta_lock(inode, NULL, 0);
if (status) {
make_bad_inode(inode);
}
}
+ if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
+ status = ocfs2_try_open_lock(inode, 0);
+ if (status) {
+ make_bad_inode(inode);
+ return status;
+ }
+ }
+
status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
can_lock ? inode : NULL);
if (status < 0) {
struct inode *orphan_dir_inode = NULL;
struct buffer_head *orphan_dir_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_dinode *di;
- /* We've already voted on this so it should be readonly - no
- * spinlock needed. */
- orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+ di = (struct ocfs2_dinode *) di_bh->b_data;
+ orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
if (status)
return ret;
}
+static int ocfs2_request_delete(struct inode *inode)
+{
+ int status = 0;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+ if (ocfs2_inode_is_new(inode))
+ return 0;
+
+ if (ocfs2_node_map_is_only(osb, &osb->mounted_map,
+ osb->node_num))
+ return 0;
+ /*
+ * This is how ocfs2 determines whether an inode is still live
+ * within the cluster. Every node takes a shared read lock on
+ * the inode open lock in ocfs2_read_locked_inode(). When we
+ * get to ->delete_inode(), each node tries to convert it's
+ * lock to an exclusive. Trylocks are serialized by the inode
+ * meta data lock. If the upconvert suceeds, we know the inode
+ * is no longer live and can be deleted.
+ *
+ * Though we call this with the meta data lock held, the
+ * trylock keeps us from ABBA deadlock.
+ */
+ status = ocfs2_try_open_lock(inode, 1);
+ if (status < 0 && status != -EAGAIN)
+ mlog_errno(status);
+ return status;
+}
+
/* Query the cluster to determine whether we should wipe an inode from
* disk or not.
*
goto bail;
}
- status = ocfs2_request_delete_vote(inode);
- /* -EBUSY means that other nodes are still using the
+ status = ocfs2_request_delete(inode);
+ /* -EAGAIN means that other nodes are still using the
* inode. We're done here though, so avoid doing anything on
* disk and let them worry about deleting it. */
- if (status == -EBUSY) {
+ if (status == -EAGAIN) {
status = 0;
mlog(0, "Skipping delete of %llu because it is in use on"
"other nodes\n", (unsigned long long)oi->ip_blkno);
goto bail;
}
- spin_lock(&oi->ip_lock);
- if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
- /* Nobody knew which slot this inode was orphaned
- * into. This may happen during node death and
- * recovery knows how to clean it up so we can safely
- * ignore this inode for now on. */
- mlog(0, "Nobody knew where inode %llu was orphaned!\n",
- (unsigned long long)oi->ip_blkno);
- } else {
- *wipe = 1;
-
- mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n",
- (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot);
- }
- spin_unlock(&oi->ip_lock);
+ *wipe = 1;
+ mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
+ (unsigned long long)oi->ip_blkno,
+ le16_to_cpu(di->i_orphaned_slot));
bail:
return status;
mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
"Inode=%lu\n", inode->i_ino);
+ /* For remove delete_inode vote, we hold open lock before,
+ * now it is time to unlock PR and EX open locks. */
+ ocfs2_open_unlock(inode);
+
/* Do these before all the other work so that we don't bounce
* the vote thread while waiting to destroy the locks. */
ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
+ ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
/* We very well may get a clear_inode before all an inodes
* metadata has hit disk. Of course, we can't drop any cluster
ocfs2_lock_res_free(&oi->ip_rw_lockres);
ocfs2_lock_res_free(&oi->ip_meta_lockres);
ocfs2_lock_res_free(&oi->ip_data_lockres);
+ ocfs2_lock_res_free(&oi->ip_open_lockres);
ocfs2_metadata_cache_purge(inode);
mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
(unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
- /* Testing ip_orphaned_slot here wouldn't work because we may
- * not have gotten a delete_inode vote from any other nodes
- * yet. */
if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
generic_delete_inode(inode);
else
struct ocfs2_lock_res ip_rw_lockres;
struct ocfs2_lock_res ip_meta_lockres;
struct ocfs2_lock_res ip_data_lockres;
+ struct ocfs2_lock_res ip_open_lockres;
/* protects allocation changes on this inode. */
struct rw_semaphore ip_alloc_sem;
/* Flags for ocfs2_iget() */
#define OCFS2_FI_FLAG_NOWAIT 0x1
#define OCFS2_FI_FLAG_DELETE 0x2
-#define OCFS2_FI_FLAG_SYSFILE 0x4
-#define OCFS2_FI_FLAG_NOLOCK 0x8
+#define OCFS2_FI_FLAG_SYSFILE 0x4
+#define OCFS2_FI_FLAG_ORPHAN_RECOVERY 0x8
struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
u64 blkno,
continue;
iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
- OCFS2_FI_FLAG_NOLOCK);
+ OCFS2_FI_FLAG_ORPHAN_RECOVERY);
if (IS_ERR(iter))
continue;
/* Set the proper information to get us going into
* ocfs2_delete_inode. */
oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
- oi->ip_orphaned_slot = slot;
spin_unlock(&oi->ip_lock);
iput(inode);
* unlink. */
spin_lock(&oi->ip_lock);
oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
- oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
spin_unlock(&oi->ip_lock);
bail_add:
/* Record which orphan dir our inode now resides
* in. delete_inode will use this to determine which orphan
* dir to lock. */
- spin_lock(&OCFS2_I(inode)->ip_lock);
- OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
- spin_unlock(&OCFS2_I(inode)->ip_lock);
+ fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
mlog(0, "Inode %llu orphaned in slot %d\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
__le32 i_ctime_nsec;
__le32 i_mtime_nsec;
__le32 i_attr;
- __le32 i_reserved1;
+ __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL
+ was set in i_flags */
+ __le16 i_reserved1;
/*70*/ __le64 i_reserved2[8];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this
OCFS2_LOCK_TYPE_RENAME,
OCFS2_LOCK_TYPE_RW,
OCFS2_LOCK_TYPE_DENTRY,
+ OCFS2_LOCK_TYPE_OPEN,
OCFS2_NUM_LOCK_TYPES
};
case OCFS2_LOCK_TYPE_DENTRY:
c = 'N';
break;
+ case OCFS2_LOCK_TYPE_OPEN:
+ c = 'O';
+ break;
default:
c = '\0';
}
* important job it does, anyway. */
[OCFS2_LOCK_TYPE_RW] = "Write/Read",
[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
+ [OCFS2_LOCK_TYPE_OPEN] = "Open",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
ocfs2_lock_res_init_once(&oi->ip_data_lockres);
+ ocfs2_lock_res_init_once(&oi->ip_open_lockres);
ocfs2_metadata_cache_init(&oi->vfs_inode);